// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "acconfig.h"

#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>

#include <unistd.h>
#include <sys/stat.h>
#include <signal.h>
#include <time.h>
#include <boost/scoped_ptr.hpp>
#include <boost/range/adaptor/reversed.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"
#include "osd/scrub_machine.h"
#include "osd/pg_scrubber.h"

#include "include/types.h"
#include "include/compat.h"
#include "include/random.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_releases.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"
#include "common/pick_address.h"
#include "common/blkdev.h"
#include "common/numa.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"

#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDMarkMeDead.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"
#include "messages/MOSDPGReadyToMerge.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGCreate2.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDPeeringOp.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrub2.h"
#include "messages/MOSDRepScrub.h"

#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"

#include "messages/MWatchNotify.h"
#include "messages/MOSDPGPush.h"
#include "messages/MOSDPGPushReply.h"
#include "messages/MOSDPGPull.h"

#include "messages/MMonGetPurgedSnaps.h"
#include "messages/MMonGetPurgedSnapsReply.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/ClassHandler.h"
#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/ceph_assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_writer.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif
#ifdef HAVE_JAEGER
#include "common/tracer.h"
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())

using std::deque;
using std::list;
using std::lock_guard;
using std::make_pair;
using std::make_tuple;
using std::make_unique;
using std::map;
using std::ostream;
using std::ostringstream;
using std::pair;
using std::set;
using std::string;
using std::stringstream;
using std::to_string;
using std::unique_ptr;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::decode;
using ceph::encode;
using ceph::fixed_u_to_string;
using ceph::Formatter;
using ceph::heartbeat_handle_d;
using ceph::make_mutex;

using namespace ceph::osd::scheduler;
using TOPNSPC::common::cmd_getval;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}

// Initial features in new superblock.
// Features here are also automatically upgraded
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}
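
// Note on the three CompatSet feature classes (the usual Ceph convention,
// not specific to this file): 'compat' features may be safely ignored by
// older code, 'ro_compat' features restrict older code to read-only
// access, and 'incompat' features make the data unusable by code that
// does not know them -- which is why everything above is incompat.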

// Features that this OSD supports are added here.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in the initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}

OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  scrubs_local(0),
  scrubs_remote(0),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  poolctx(poolctx),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
                                 osd->objecter_messenger,
                                 osd->monc, poolctx)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

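  // Shard Objecter completion callbacks across osd_objecter_finishers
  // Finisher threads, presumably so one slow completion cannot serialize
  // all other Objecter callbacks behind it.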
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}

#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif


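// get_mnow() yields a monotonic duration since this OSD started (not wall
// time); the peering/lease machinery (e.g. RenewLease below) works in this
// clock domain.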
ceph::signedspan OSDService::get_mnow()
{
  return ceph::mono_clock::now() - osd->startup_time;
}

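// Example of what the walk below computes (assumed numbers, standard PG
// split rule): if a pool's pg_num grows from 4 to 8 somewhere between
// old_map and new_map, pg 3.2 acquires the split child 3.6 (ps 2 + old
// pg_num 4); if pg_num later shrinks from 8 back to 4, 3.6 becomes a merge
// source and 3.2 its merge target, and both are reported in merge_pgs.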
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
           << " to e" << new_map->get_epoch()
           << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    for (auto q = p->second.lower_bound(old_map->get_epoch());
         q != p->second.end() &&
           q->first <= new_map->get_epoch();
         ++q) {
      if (pgnum < q->second) {
        // split?
        if (cur.ps() < pgnum) {
          set<spg_t> children;
          if (cur.is_split(pgnum, q->second, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " children " << children << dendl;
            for (auto i : children) {
              split_children->insert(make_pair(i, q->first));
              if (!did.count(i))
                queue.push_back(i);
            }
          }
        } else if (cur.ps() < q->second) {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is a child" << dendl;
          // normally we'd capture this from the parent, but it's
          // possible the parent doesn't exist yet (it will be
          // fabricated to allow an intervening merge).  note this PG
          // as a split child here to be sure we catch it.
          split_children->insert(make_pair(cur, q->first));
        } else {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is post-split, skipping" << dendl;
        }
      } else if (merge_pgs) {
        // merge?
        if (cur.ps() >= q->second) {
          if (cur.ps() < pgnum) {
            spg_t parent;
            if (cur.is_merge_source(pgnum, q->second, &parent)) {
              set<spg_t> children;
              parent.is_split(q->second, pgnum, &children);
              dout(20) << __func__ << " " << cur << " e" << q->first
                       << " pg_num " << pgnum << " -> " << q->second
                       << " is merge source, target " << parent
                       << ", source(s) " << children << dendl;
              merge_pgs->insert(make_pair(parent, q->first));
              if (!did.count(parent)) {
                // queue (and re-scan) parent in case it might not exist yet
                // and there are some future splits pending on it
                queue.push_back(parent);
              }
              for (auto c : children) {
                merge_pgs->insert(make_pair(c, q->first));
                if (!did.count(c))
                  queue.push_back(c);
              }
            }
          } else {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is beyond old pgnum, skipping" << dendl;
          }
        } else {
          set<spg_t> children;
          if (cur.is_split(q->second, pgnum, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is merge target, source " << children << dendl;
            for (auto c : children) {
              merge_pgs->insert(make_pair(c, q->first));
              if (!did.count(c))
                queue.push_back(c);
            }
            merge_pgs->insert(make_pair(cur, q->first));
          }
        }
      }
      pgnum = q->second;
    }
  }
}

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

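// Heartbeat stamp objects are allocated lazily, one per peer osd id; the
// vector grows on demand and hands out refcounted pointers shared with the
// heartbeat code.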
HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
{
  std::lock_guard l(hb_stamp_lock);
  if (peer >= hb_stamps.size()) {
    hb_stamps.resize(peer + 1);
  }
  if (!hb_stamps[peer]) {
    hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
  }
  return hb_stamps[peer];
}

void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
        epoch, epoch,
        RenewLease())));
}

void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  std::lock_guard l{agent_lock};
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.notify_all();
}

void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}


class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
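    // Throttle agent work: 'max' caps in-flight agent ops against
    // osd_agent_max_ops, and while no PG is in high-speed flush mode the
    // flush budget drops to the smaller osd_agent_max_low_ops.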
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time (5 seconds by default)
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}

void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  agent_thread.join();
}

// -------------------------------------

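// Worked example for the recalibration below (assumed numbers): with
// osd_tier_promote_max_objects_sec = 100, dur = 10 s, and 10000 promotion
// attempts in that window, po = 100 * 10 * 1000 / 10000 = 100, i.e. a
// promote probability of 100/1000 = 10%, which admits roughly the target
// 100 obj/sec out of the observed ~1000 attempts/sec.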
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << byte_u_t(bytes) << "; target "
           << target_obj_sec << " obj/sec or "
           << byte_u_t(target_bytes_sec) << "/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precedence.  So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too.  (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)  The std::max chain
  // below keeps nearfull <= backfillfull <= full <= failsafe.
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag.  ick.  the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}


void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
           << ", physical ratio " << pratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
                    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to report the
    // injected full state, or -1 to report it indefinitely
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
                       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
                       << dendl;
    return true;
  }
  return false;
}

bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
                       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}

bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}

bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

void OSDService::set_statfs(const struct store_statfs_t &stbuf,
                            osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing, fake the statfs values so it doesn't matter whether all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
            << " adjust available " << avail
            << dendl;
    used = bytes - avail;
  }

  logger->set(l_osd_stat_bytes, bytes);
  logger->set(l_osd_stat_bytes_used, used);
  logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing we don't want 'used' to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}

osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
                                    int num_pgs)
{
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated
  // This is called often enough that we can just remove 1 at a time
  for (auto i: osd_stat.hb_pingtime) {
    if (i.second.last_update == 0)
      continue;
    if (stale_time && now.sec() - i.second.last_update > stale_time) {
      dout(20) << __func__ << " time out heartbeat for osd " << i.first
               << " last_update " << i.second.last_update << dendl;
      osd_stat.hb_pingtime.erase(i.first);
      break;
    }
  }
  return osd_stat;
}

void OSDService::inc_osd_stat_repaired()
{
  std::lock_guard l(stat_lock);
  osd_stat.num_shards_repaired++;
  return;
}

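// compute_adjusted_ratio(): *pratio reports the raw (physical) usage of the
// underlying store, while the returned ratio additionally charges pending
// backfill data (via each PG's pg_stat_adjust) against the available space.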
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                         uint64_t adjust_used)
{
  *pratio =
    ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
}

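// The cluster-messaging helpers below share one pattern: reserve the
// in-flight ("next") osdmap so it can't be pruned while we look up the
// peer, drop the message if the peer is down or was restarted after
// from_epoch, and pair every get_nextmap_reserved() with release_map().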
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
        next_map->get_info(iter.first).up_from > from_epoch) {
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
          next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
      next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
      next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}

entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}

void OSDService::queue_want_pg_temp(pg_t pgid,
                                    const vector<int>& want,
                                    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending.  note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

std::ostream& operator<<(std::ostream& out,
                         const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    out << " (forced)";
  }
  return out;
}

void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
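  // Build at most two messages: ms[0] gathers normal pg_temp requests and
  // ms[1] gathers forced ones, indexed by the 'forced' flag.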
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}

void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}

void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}

void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}


// --------------------------------------
// dispatch

bool OSDService::can_inc_scrubs()
{
  bool can_inc = false;
  std::lock_guard l(sched_scrub_lock);

  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
             << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
    can_inc = true;
  } else {
    dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
             << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }

  return can_inc;
}

bool OSDService::inc_scrubs_local()
{
  bool result = false;
  std::lock_guard l{sched_scrub_lock};
  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
    result = true;
    ++scrubs_local;
  } else {
    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  return result;
}

void OSDService::dec_scrubs_local()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
  --scrubs_local;
  ceph_assert(scrubs_local >= 0);
}

bool OSDService::inc_scrubs_remote()
{
  bool result = false;
  std::lock_guard l{sched_scrub_lock};
  if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
    result = true;
    ++scrubs_remote;
  } else {
    dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  return result;
}

void OSDService::dec_scrubs_remote()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
  --scrubs_remote;
  ceph_assert(scrubs_remote >= 0);
}

void OSDService::dump_scrub_reservations(Formatter *f)
{
  std::lock_guard l{sched_scrub_lock};
  f->dump_int("scrubs_local", scrubs_local);
  f->dump_int("scrubs_remote", scrubs_remote);
  f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

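// Shutdown handshake: ask the mon to mark us down first so peers stop
// directing work at us, then wait (up to osd_mon_shutdown_timeout) for the
// ack, which arrives via got_stop_ack() and flips us to STOPPING.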
bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true  // request ack
        ));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
                              [this] { return get_state() == STOPPING; });
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  std::scoped_lock l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.notify_all();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
             << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since] = std::move(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e] = std::move(bl);
    } else {
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
        derr << __func__ << " also missing full map " << e << dendl;
        goto panic;
      }
      m->maps[e] = std::move(bl);
    }
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map] = std::move(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
           << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map] = std::move(bl);
  }
  return m;
}

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      const OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid(),
                               osdmap->get_encoding_features());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}

OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    logger->inc(l_osd_map_cache_hit);
    return retval;
  }
  {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}

// ops


void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}

void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
                                version_t uv,
                                vector<pg_log_op_return_item_t> op_returns)
{
  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
                                       !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
  reply->set_reply_versions(v, uv);
  reply->set_op_returns(op_returns);
  m->get_connection()->send_message(reply);
}

void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->pg_id.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->get_acting()
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}

void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}

void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}

void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c)
{
  epoch_t e = get_osdmap_epoch();
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGRecoveryContext(pg->get_pgid(), c, e)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}

void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}

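// The queue_scrub_event_msg() templates below wrap a scrub event message
// (PGScrub, PGRepScrub, ...) in an OpSchedulerItem and queue it; the
// one-line comments on the helpers that follow name the scrub
// state-machine event each queued message ultimately produces.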
1747 template <class MSG_TYPE>
1748 void OSDService::queue_scrub_event_msg(PG* pg,
1749 Scrub::scrub_prio_t with_priority,
1750 unsigned int qu_priority)
1751 {
1752 const auto epoch = pg->get_osdmap_epoch();
1753 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1754 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
1755
1756 enqueue_back(OpSchedulerItem(
1757 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1758 pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
1759 }
1760
1761 template <class MSG_TYPE>
1762 void OSDService::queue_scrub_event_msg(PG* pg, Scrub::scrub_prio_t with_priority)
1763 {
1764 const auto epoch = pg->get_osdmap_epoch();
1765 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1766 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
1767
1768 enqueue_back(OpSchedulerItem(
1769 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1770 pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
1771 }
1772
1773 void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
1774 {
1775 queue_scrub_event_msg<PGScrub>(pg, with_priority);
1776 }
1777
1778 void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
1779 {
1780 queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
1781 }
1782
1783 void OSDService::queue_for_rep_scrub(PG* pg,
1784 Scrub::scrub_prio_t with_priority,
1785 unsigned int qu_priority)
1786 {
1787 queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority);
1788 }
1789
1790 void OSDService::queue_for_rep_scrub_resched(PG* pg,
1791 Scrub::scrub_prio_t with_priority,
1792 unsigned int qu_priority)
1793 {
1794 // Resulting scrub event: 'SchedReplica'
1795 queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority);
1796 }
1797
1798 void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
1799 {
1800 // Resulting scrub event: 'RemotesReserved'
1801 queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
1802 }
1803
1804 void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
1805 {
1806 // Resulting scrub event: 'ReservationFailure'
1807 queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
1808 }
1809
1810 void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
1811 {
1812 // Resulting scrub event: 'InternalSchedScrub'
1813 queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
1814 }
1815
1816 void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
1817 {
1818 // Resulting scrub event: 'ActivePushesUpd'
1819 queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
1820 }
1821
1822 void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
1823 {
1824 queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
1825 }
1826
1827 void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
1828 {
1829 // Resulting scrub event: 'Unblocked'
1830 queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
1831 }
1832
1833 void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
1834 {
1835 // Resulting scrub event: 'DigestUpdate'
1836 queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
1837 }
1838
1839 void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
1840 {
1841 // Resulting scrub event: 'GotReplicas'
1842 queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
1843 }
1844
1845 void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
1846 {
1847 // Resulting scrub event: 'ReplicaPushesUpd'
1848 queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
1849 }
1850
1851 void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1852 {
1853 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1854 enqueue_back(
1855 OpSchedulerItem(
1856 unique_ptr<OpSchedulerItem::OpQueueable>(
1857 new PGDelete(pgid, e)),
1858 cct->_conf->osd_pg_delete_cost,
1859 cct->_conf->osd_pg_delete_priority,
1860 ceph_clock_now(),
1861 0,
1862 e));
1863 }
1864
1865 bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1866 {
1867 return osd->try_finish_pg_delete(pg, old_pg_num);
1868 }
1869
1870 // ---
1871
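// PG merge readiness tracking. The monitor is sent MOSDPGReadyToMerge with
// ready=true only once both the merge source and its parent target have
// registered as ready; a not-ready report from either side goes out
// immediately with ready=false so the merge can be cancelled. All five
// tracking containers are protected by merge_lock.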
1872 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1873 {
1874 std::lock_guard l(merge_lock);
1875 dout(10) << __func__ << " " << pg->pg_id << dendl;
1876 ready_to_merge_source[pg->pg_id.pgid] = version;
1877 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1878 _send_ready_to_merge();
1879 }
1880
1881 void OSDService::set_ready_to_merge_target(PG *pg,
1882 eversion_t version,
1883 epoch_t last_epoch_started,
1884 epoch_t last_epoch_clean)
1885 {
1886 std::lock_guard l(merge_lock);
1887 dout(10) << __func__ << " " << pg->pg_id << dendl;
1888 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1889 make_tuple(version,
1890 last_epoch_started,
1891 last_epoch_clean)));
1892 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1893 _send_ready_to_merge();
1894 }
1895
1896 void OSDService::set_not_ready_to_merge_source(pg_t source)
1897 {
1898 std::lock_guard l(merge_lock);
1899 dout(10) << __func__ << " " << source << dendl;
1900 not_ready_to_merge_source.insert(source);
1901 assert(ready_to_merge_source.count(source) == 0);
1902 _send_ready_to_merge();
1903 }
1904
1905 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1906 {
1907 std::lock_guard l(merge_lock);
1908 dout(10) << __func__ << " " << target << " source " << source << dendl;
1909 not_ready_to_merge_target[target] = source;
1910 assert(ready_to_merge_target.count(target) == 0);
1911 _send_ready_to_merge();
1912 }
1913
1914 void OSDService::send_ready_to_merge()
1915 {
1916 std::lock_guard l(merge_lock);
1917 _send_ready_to_merge();
1918 }
1919
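// Caller must hold merge_lock. Flushes any pending not-ready notifications
// first (at most once per source pgid, tracked via
// sent_ready_to_merge_source), then sends ready=true for each source whose
// parent target is also ready and which has not been reported yet.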
1920 void OSDService::_send_ready_to_merge()
1921 {
1922 dout(20) << __func__
1923 << " ready_to_merge_source " << ready_to_merge_source
1924 << " not_ready_to_merge_source " << not_ready_to_merge_source
1925 << " ready_to_merge_target " << ready_to_merge_target
1926 << " not_ready_to_merge_target " << not_ready_to_merge_target
1927 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1928 << dendl;
1929 for (auto src : not_ready_to_merge_source) {
1930 if (sent_ready_to_merge_source.count(src) == 0) {
1931 monc->send_mon_message(new MOSDPGReadyToMerge(
1932 src,
1933 {}, {}, 0, 0,
1934 false,
1935 osdmap->get_epoch()));
1936 sent_ready_to_merge_source.insert(src);
1937 }
1938 }
1939 for (auto p : not_ready_to_merge_target) {
1940 if (sent_ready_to_merge_source.count(p.second) == 0) {
1941 monc->send_mon_message(new MOSDPGReadyToMerge(
1942 p.second,
1943 {}, {}, 0, 0,
1944 false,
1945 osdmap->get_epoch()));
1946 sent_ready_to_merge_source.insert(p.second);
1947 }
1948 }
1949 for (auto src : ready_to_merge_source) {
1950 if (not_ready_to_merge_source.count(src.first) ||
1951 not_ready_to_merge_target.count(src.first.get_parent())) {
1952 continue;
1953 }
1954 auto p = ready_to_merge_target.find(src.first.get_parent());
1955 if (p != ready_to_merge_target.end() &&
1956 sent_ready_to_merge_source.count(src.first) == 0) {
1957 monc->send_mon_message(new MOSDPGReadyToMerge(
1958 src.first, // source pgid
1959 src.second, // src version
1960 std::get<0>(p->second), // target version
1961 std::get<1>(p->second), // PG's last_epoch_started
1962 std::get<2>(p->second), // PG's last_epoch_clean
1963 true,
1964 osdmap->get_epoch()));
1965 sent_ready_to_merge_source.insert(src.first);
1966 }
1967 }
1968 }
1969
1970 void OSDService::clear_ready_to_merge(PG *pg)
1971 {
1972 std::lock_guard l(merge_lock);
1973 dout(10) << __func__ << " " << pg->pg_id << dendl;
1974 ready_to_merge_source.erase(pg->pg_id.pgid);
1975 ready_to_merge_target.erase(pg->pg_id.pgid);
1976 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1977 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1978 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1979 }
1980
1981 void OSDService::clear_sent_ready_to_merge()
1982 {
1983 std::lock_guard l(merge_lock);
1984 sent_ready_to_merge_source.clear();
1985 }
1986
1987 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1988 {
1989 std::lock_guard l(merge_lock);
1990 auto i = sent_ready_to_merge_source.begin();
1991 while (i != sent_ready_to_merge_source.end()) {
1992 if (!osdmap->pg_exists(*i)) {
1993 dout(10) << __func__ << " " << *i << dendl;
1994 i = sent_ready_to_merge_source.erase(i);
1995 } else {
1996 ++i;
1997 }
1998 }
1999 }
2000
2001 // ---
2002
2003 void OSDService::_queue_for_recovery(
2004 std::pair<epoch_t, PGRef> p,
2005 uint64_t reserved_pushes)
2006 {
2007 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
2008 enqueue_back(
2009 OpSchedulerItem(
2010 unique_ptr<OpSchedulerItem::OpQueueable>(
2011 new PGRecovery(
2012 p.second->get_pgid(), p.first, reserved_pushes)),
2013 cct->_conf->osd_recovery_cost,
2014 cct->_conf->osd_recovery_priority,
2015 ceph_clock_now(),
2016 0,
2017 p.first));
2018 }
2019
2020 // ====================================================================
2021 // OSD
2022
2023 #undef dout_prefix
2024 #define dout_prefix *_dout
2025
2026 // Commands shared between OSD's console and admin console:
2027 namespace ceph::osd_cmds {
2028
2029 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
2030
2031 } // namespace ceph::osd_cmds
2032
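// Format a fresh OSD data store, or validate an existing one against the
// provided id/fsid. Note that mkfs() takes ownership of 'store' and deletes
// it on every path. A minimal call sketch (setup names are illustrative):
//   ObjectStore *store = /* create backend for osd_data */;
//   int r = OSD::mkfs(cct, store, cluster_fsid, whoami, "");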
2033 int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
2034 {
2035 int ret;
2036
2037 OSDSuperblock sb;
2038 bufferlist sbbl;
2039 ObjectStore::CollectionHandle ch;
2040
2041 // if we are fed a uuid for this osd, use it.
2042 store->set_fsid(cct->_conf->osd_uuid);
2043
2044 ret = store->mkfs();
2045 if (ret) {
2046 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2047 << cpp_strerror(ret) << dendl;
2048 goto free_store;
2049 }
2050
2051 store->set_cache_shards(1); // doesn't matter for mkfs!
2052
2053 ret = store->mount();
2054 if (ret) {
2055 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2056 << cpp_strerror(ret) << dendl;
2057 goto free_store;
2058 }
2059
2060 ch = store->open_collection(coll_t::meta());
2061 if (ch) {
2062 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2063 if (ret < 0) {
2064 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2065 goto free_store;
2066 }
2067 /* if we already have a superblock, check its content */
2068 dout(0) << " have superblock" << dendl;
2069 auto p = sbbl.cbegin();
2070 decode(sb, p);
2071 if (whoami != sb.whoami) {
2072 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2073 << dendl;
2074 ret = -EINVAL;
2075 goto umount_store;
2076 }
2077 if (fsid != sb.cluster_fsid) {
2078 derr << "provided cluster fsid " << fsid
2079 << " != superblock's " << sb.cluster_fsid << dendl;
2080 ret = -EINVAL;
2081 goto umount_store;
2082 }
2083 } else {
2084 // create superblock
2085 sb.cluster_fsid = fsid;
2086 sb.osd_fsid = store->get_fsid();
2087 sb.whoami = whoami;
2088 sb.compat_features = get_osd_initial_compat_set();
2089
2090 bufferlist bl;
2091 encode(sb, bl);
2092
2093 ObjectStore::CollectionHandle ch = store->create_new_collection(
2094 coll_t::meta());
2095 ObjectStore::Transaction t;
2096 t.create_collection(coll_t::meta(), 0);
2097 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2098 ret = store->queue_transaction(ch, std::move(t));
2099 if (ret) {
2100 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2101 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2102 goto umount_store;
2103 }
2104 ch->flush();
2105 }
2106
2107 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
2108 if (ret) {
2109 derr << "OSD::mkfs: failed to write fsid file: error "
2110 << cpp_strerror(ret) << dendl;
2111 goto umount_store;
2112 }
2113
2114 umount_store:
2115 if (ch) {
2116 ch.reset();
2117 }
2118 store->umount();
2119 free_store:
2120 delete store;
2121 return ret;
2122 }
2123
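// Persist the small key/value metadata that identifies this OSD to tools
// and to init: magic, whoami, ceph_fsid, optionally osd_key (taken from the
// 'key' or 'keyfile' config option) and osdspec_affinity, and finally the
// 'ready' marker. peek_meta() below reads the identity keys back.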
2124 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
2125 {
2126 char val[80];
2127 int r;
2128
2129 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2130 r = store->write_meta("magic", val);
2131 if (r < 0)
2132 return r;
2133
2134 snprintf(val, sizeof(val), "%d", whoami);
2135 r = store->write_meta("whoami", val);
2136 if (r < 0)
2137 return r;
2138
2139 cluster_fsid.print(val);
2140 r = store->write_meta("ceph_fsid", val);
2141 if (r < 0)
2142 return r;
2143
2144 string key = cct->_conf.get_val<string>("key");
2145 if (key.size()) {
2146 r = store->write_meta("osd_key", key);
2147 if (r < 0)
2148 return r;
2149 } else {
2150 string keyfile = cct->_conf.get_val<string>("keyfile");
2151 if (!keyfile.empty()) {
2152 bufferlist keybl;
2153 string err;
2154 r = keybl.read_file(keyfile.c_str(), &err);
2155 if (r < 0) {
2156 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2157 << err << ": " << cpp_strerror(r) << dendl;
2158 return r;
2159 }
2160 r = store->write_meta("osd_key", keybl.to_str());
2161 if (r < 0)
2162 return r;
2163 }
2164 }
2165 if (!osdspec_affinity.empty()) {
2166 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2167 if (r < 0)
2168 return r;
2169 }
2170
2171 r = store->write_meta("ready", "ready");
2172 if (r < 0)
2173 return r;
2174
2175 return 0;
2176 }
2177
2178 int OSD::peek_meta(ObjectStore *store,
2179 std::string *magic,
2180 uuid_d *cluster_fsid,
2181 uuid_d *osd_fsid,
2182 int *whoami,
2183 ceph_release_t *require_osd_release)
2184 {
2185 string val;
2186
2187 int r = store->read_meta("magic", &val);
2188 if (r < 0)
2189 return r;
2190 *magic = val;
2191
2192 r = store->read_meta("whoami", &val);
2193 if (r < 0)
2194 return r;
2195 *whoami = atoi(val.c_str());
2196
2197 r = store->read_meta("ceph_fsid", &val);
2198 if (r < 0)
2199 return r;
2200 r = cluster_fsid->parse(val.c_str());
2201 if (!r)
2202 return -EINVAL;
2203
2204 r = store->read_meta("fsid", &val);
2205 if (r < 0) {
2206 *osd_fsid = uuid_d();
2207 } else {
2208 r = osd_fsid->parse(val.c_str());
2209 if (!r)
2210 return -EINVAL;
2211 }
2212
2213 r = store->read_meta("require_osd_release", &val);
2214 if (r >= 0) {
2215 *require_osd_release = ceph_release_from_name(val);
2216 }
2217
2218 return 0;
2219 }
2220
2221
2222 #undef dout_prefix
2223 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2224
2225 // cons/des
2226
2227 OSD::OSD(CephContext *cct_, ObjectStore *store_,
2228 int id,
2229 Messenger *internal_messenger,
2230 Messenger *external_messenger,
2231 Messenger *hb_client_front,
2232 Messenger *hb_client_back,
2233 Messenger *hb_front_serverm,
2234 Messenger *hb_back_serverm,
2235 Messenger *osdc_messenger,
2236 MonClient *mc,
2237 const std::string &dev, const std::string &jdev,
2238 ceph::async::io_context_pool& poolctx) :
2239 Dispatcher(cct_),
2240 tick_timer(cct, osd_lock),
2241 tick_timer_without_osd_lock(cct, tick_timer_lock),
2242 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
2243 cluster_messenger(internal_messenger),
2244 client_messenger(external_messenger),
2245 objecter_messenger(osdc_messenger),
2246 monc(mc),
2247 mgrc(cct_, client_messenger, &mc->monmap),
2248 logger(create_logger()),
2249 recoverystate_perf(create_recoverystate_perf()),
2250 store(store_),
2251 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2252 clog(log_client.create_channel()),
2253 whoami(id),
2254 dev_path(dev), journal_path(jdev),
2255 store_is_rotational(store->is_rotational()),
2256 trace_endpoint("0.0.0.0", 0, "osd"),
2257 asok_hook(NULL),
2258 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2259 "osd_pg_epoch_max_lag_factor")),
2260 osd_compat(get_osd_compat_set()),
2261 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
2262 get_num_op_threads()),
2263 heartbeat_stop(false),
2264 heartbeat_need_update(true),
2265 hb_front_client_messenger(hb_client_front),
2266 hb_back_client_messenger(hb_client_back),
2267 hb_front_server_messenger(hb_front_serverm),
2268 hb_back_server_messenger(hb_back_serverm),
2269 daily_loadavg(0.0),
2270 heartbeat_thread(this),
2271 heartbeat_dispatcher(this),
2272 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2273 cct->_conf->osd_num_op_tracker_shard),
2274 test_ops_hook(NULL),
2275 op_shardedwq(
2276 this,
2277 ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
2278 ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
2279 &osd_op_tp),
2280 last_pg_create_epoch(0),
2281 boot_finisher(cct),
2282 up_thru_wanted(0),
2283 requested_full_first(0),
2284 requested_full_last(0),
2285 service(this, poolctx)
2286 {
2287
2288 if (!gss_ktfile_client.empty()) {
2289 // Assert that we can export the environment variable below.
2290 /*
2291 The default client keytab is used, if it is present and readable,
2292 to automatically obtain initial credentials for GSSAPI client
2293 applications. The principal name of its first entry is used by
2294 default. The keytab's location is determined by, in order:
2295 1. The KRB5_CLIENT_KTNAME environment variable.
2296 2. The default_client_keytab_name profile variable in [libdefaults].
2297 3. The hardcoded default, DEFCKTNAME.
2298 */
2299 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2300 gss_ktfile_client.c_str(), 1));
2301 ceph_assert(set_result == 0);
2302 }
2303
2304 monc->set_messenger(client_messenger);
2305 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2306 cct->_conf->osd_op_log_threshold);
2307 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2308 cct->_conf->osd_op_history_duration);
2309 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2310 cct->_conf->osd_op_history_slow_op_threshold);
2311 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
2312 #ifdef WITH_BLKIN
2313 std::stringstream ss;
2314 ss << "osd." << whoami;
2315 trace_endpoint.copy_name(ss.str());
2316 #endif
2317
2318 // initialize shards
2319 num_shards = get_num_op_shards();
2320 for (uint32_t i = 0; i < num_shards; i++) {
2321 OSDShard *one_shard = new OSDShard(
2322 i,
2323 cct,
2324 this);
2325 shards.push_back(one_shard);
2326 }
2327 }
2328
2329 OSD::~OSD()
2330 {
2331 while (!shards.empty()) {
2332 delete shards.back();
2333 shards.pop_back();
2334 }
2335 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2336 cct->get_perfcounters_collection()->remove(logger);
2337 delete recoverystate_perf;
2338 delete logger;
2339 delete store;
2340 }
2341
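// Jitter each tick by +/-5% so periodic work on different OSDs drifts apart
// instead of livelocking; e.g. if OSD_TICK_INTERVAL is 1.0s, this yields a
// uniform random interval in [0.95s, 1.05s].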
2342 double OSD::get_tick_interval() const
2343 {
2344 // vary +/- 5% to avoid scrub scheduling livelocks
2345 constexpr auto delta = 0.05;
2346 return (OSD_TICK_INTERVAL *
2347 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2348 }
2349
2350 void OSD::handle_signal(int signum)
2351 {
2352 ceph_assert(signum == SIGINT || signum == SIGTERM);
2353 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2354 shutdown();
2355 }
2356
2357 int OSD::pre_init()
2358 {
2359 std::lock_guard lock(osd_lock);
2360 if (is_stopping())
2361 return 0;
2362
2363 if (store->test_mount_in_use()) {
2364 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2365 << "currently in use. (Is ceph-osd already running?)" << dendl;
2366 return -EBUSY;
2367 }
2368
2369 cct->_conf.add_observer(this);
2370 return 0;
2371 }
2372
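// Pin all OSD threads to a single NUMA node when the objectstore and both
// the public and cluster network interfaces report the same node (and
// osd_numa_auto_affinity is enabled), or when osd_numa_node is set
// explicitly. A node value of -2 from get_iface_numa_node() means the
// interface's ports span multiple nodes, so no affinity is inferred.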
2373 int OSD::set_numa_affinity()
2374 {
2375 // storage numa node
2376 int store_node = -1;
2377 store->get_numa_node(&store_node, nullptr, nullptr);
2378 if (store_node >= 0) {
2379 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2380 }
2381
2382 // check network numa node(s)
2383 int front_node = -1, back_node = -1;
2384 string front_iface = pick_iface(
2385 cct,
2386 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2387 string back_iface = pick_iface(
2388 cct,
2389 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2390 int r = get_iface_numa_node(front_iface, &front_node);
2391 if (r >= 0 && front_node >= 0) {
2392 dout(1) << __func__ << " public network " << front_iface << " numa node "
2393 << front_node << dendl;
2394 r = get_iface_numa_node(back_iface, &back_node);
2395 if (r >= 0 && back_node >= 0) {
2396 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2397 << back_node << dendl;
2398 if (front_node == back_node &&
2399 front_node == store_node) {
2400 dout(1) << " objectstore and network numa nodes all match" << dendl;
2401 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2402 numa_node = front_node;
2403 }
2404 } else if (front_node != back_node) {
2405 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2406 << dendl;
2407 } else {
2408 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2409 << dendl;
2410 }
2411 } else if (back_node == -2) {
2412 dout(1) << __func__ << " cluster network " << back_iface
2413 << " ports numa nodes do not match" << dendl;
2414 } else {
2415 derr << __func__ << " unable to identify cluster interface '" << back_iface
2416 << "' numa node: " << cpp_strerror(r) << dendl;
2417 }
2418 } else if (front_node == -2) {
2419 dout(1) << __func__ << " public network " << front_iface
2420 << " ports numa nodes do not match" << dendl;
2421 } else {
2422 derr << __func__ << " unable to identify public interface '" << front_iface
2423 << "' numa node: " << cpp_strerror(r) << dendl;
2424 }
2425 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2426 // this takes precedence over the automagic logic above
2427 numa_node = node;
2428 }
2429 if (numa_node >= 0) {
2430 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2431 if (r < 0) {
2432 dout(1) << __func__ << " unable to determine numa node " << numa_node
2433 << " CPUs" << dendl;
2434 numa_node = -1;
2435 } else {
2436 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2437 << " cpus "
2438 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2439 << dendl;
2440 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
2441 if (r < 0) {
2442 r = -errno;
2443 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2444 << dendl;
2445 numa_node = -1;
2446 }
2447 }
2448 } else {
2449 dout(1) << __func__ << " not setting numa affinity" << dendl;
2450 }
2451 return 0;
2452 }
2453
2454 // asok
2455
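// Admin socket ("asok") entry point. The synchronous call() is never used
// and aborts; commands are dispatched via call_async() so long-running
// handlers (e.g. PG commands) can complete later through the on_finish
// callback instead of blocking the admin socket thread.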
2456 class OSDSocketHook : public AdminSocketHook {
2457 OSD *osd;
2458 public:
2459 explicit OSDSocketHook(OSD *o) : osd(o) {}
2460 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2461 Formatter *f,
2462 std::ostream& ss,
2463 bufferlist& out) override {
2464 ceph_abort("should use async hook");
2465 }
2466 void call_async(
2467 std::string_view prefix,
2468 const cmdmap_t& cmdmap,
2469 Formatter *f,
2470 const bufferlist& inbl,
2471 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
2472 try {
2473 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2474 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2475 bufferlist empty;
2476 on_finish(-EINVAL, e.what(), empty);
2477 }
2478 }
2479 };
2480
2481 std::set<int64_t> OSD::get_mapped_pools()
2482 {
2483 std::set<int64_t> pools;
2484 std::vector<spg_t> pgids;
2485 _get_pgids(&pgids);
2486 for (const auto &pgid : pgids) {
2487 pools.insert(pgid.pool());
2488 }
2489 return pools;
2490 }
2491
2492 void OSD::asok_command(
2493 std::string_view prefix, const cmdmap_t& cmdmap,
2494 Formatter *f,
2495 const bufferlist& inbl,
2496 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2497 {
2498 int ret = 0;
2499 stringstream ss; // stderr error message stream
2500 bufferlist outbl; // if empty at end, we'll dump formatter as output
2501
2502 // --- PG commands are routed here to PG::do_command ---
2503 if (prefix == "pg" ||
2504 prefix == "query" ||
2505 prefix == "mark_unfound_lost" ||
2506 prefix == "list_unfound" ||
2507 prefix == "scrub" ||
2508 prefix == "deep_scrub"
2509 ) {
2510 string pgidstr;
2511 pg_t pgid;
2512 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2513 ss << "no pgid specified";
2514 ret = -EINVAL;
2515 goto out;
2516 }
2517 if (!pgid.parse(pgidstr.c_str())) {
2518 ss << "couldn't parse pgid '" << pgidstr << "'";
2519 ret = -EINVAL;
2520 goto out;
2521 }
2522 spg_t pcand;
2523 PGRef pg;
2524 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2525 (pg = _lookup_lock_pg(pcand))) {
2526 if (pg->is_primary()) {
2527 cmdmap_t new_cmdmap = cmdmap;
2528 try {
2529 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2530 pg->unlock();
2531 return; // the pg handler calls on_finish directly
2532 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2533 pg->unlock();
2534 ss << e.what();
2535 ret = -EINVAL;
2536 goto out;
2537 }
2538 } else {
2539 ss << "not primary for pgid " << pgid;
2540 // the caller will get newer maps and realize it
2541 // needs to resend; reply with -EAGAIN.
2542 pg->unlock();
2543 ret = -EAGAIN;
2544 goto out;
2545 }
2546 } else {
2547 ss << "i don't have pgid " << pgid;
2548 ret = -ENOENT;
2549 }
2550 }
2551
2552 // --- OSD commands follow ---
2553
2554 else if (prefix == "status") {
2555 lock_guard l(osd_lock);
2556 f->open_object_section("status");
2557 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2558 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2559 f->dump_unsigned("whoami", superblock.whoami);
2560 f->dump_string("state", get_state_name(get_state()));
2561 f->dump_unsigned("oldest_map", superblock.oldest_map);
2562 f->dump_unsigned("newest_map", superblock.newest_map);
2563 f->dump_unsigned("num_pgs", num_pgs);
2564 f->close_section();
2565 } else if (prefix == "flush_journal") {
2566 store->flush_journal();
2567 } else if (prefix == "dump_ops_in_flight" ||
2568 prefix == "ops" ||
2569 prefix == "dump_blocked_ops" ||
2570 prefix == "dump_historic_ops" ||
2571 prefix == "dump_historic_ops_by_duration" ||
2572 prefix == "dump_historic_slow_ops") {
2573
2574 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2575 even those that are stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2576 will start to track new ops received afterwards.";
2577
2578 set<string> filters;
2579 vector<string> filter_str;
2580 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2581 copy(filter_str.begin(), filter_str.end(),
2582 inserter(filters, filters.end()));
2583 }
2584
2585 if (prefix == "dump_ops_in_flight" ||
2586 prefix == "ops") {
2587 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2588 ss << error_str;
2589 ret = -EINVAL;
2590 goto out;
2591 }
2592 }
2593 if (prefix == "dump_blocked_ops") {
2594 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2595 ss << error_str;
2596 ret = -EINVAL;
2597 goto out;
2598 }
2599 }
2600 if (prefix == "dump_historic_ops") {
2601 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2602 ss << error_str;
2603 ret = -EINVAL;
2604 goto out;
2605 }
2606 }
2607 if (prefix == "dump_historic_ops_by_duration") {
2608 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2609 ss << error_str;
2610 ret = -EINVAL;
2611 goto out;
2612 }
2613 }
2614 if (prefix == "dump_historic_slow_ops") {
2615 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2616 ss << error_str;
2617 ret = -EINVAL;
2618 goto out;
2619 }
2620 }
2621 } else if (prefix == "dump_op_pq_state") {
2622 f->open_object_section("pq");
2623 op_shardedwq.dump(f);
2624 f->close_section();
2625 } else if (prefix == "dump_blocklist") {
2626 list<pair<entity_addr_t,utime_t> > bl;
2627 OSDMapRef curmap = service.get_osdmap();
2628
2629 f->open_array_section("blocklist");
2630 curmap->get_blocklist(&bl);
2631 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2632 it != bl.end(); ++it) {
2633 f->open_object_section("entry");
2634 f->open_object_section("entity_addr_t");
2635 it->first.dump(f);
2636 f->close_section(); //entity_addr_t
2637 it->second.localtime(f->dump_stream("expire_time"));
2638 f->close_section(); //entry
2639 }
2640 f->close_section(); //blocklist
2641 } else if (prefix == "dump_watchers") {
2642 list<obj_watch_item_t> watchers;
2643 // scan pg's
2644 vector<PGRef> pgs;
2645 _get_pgs(&pgs);
2646 for (auto& pg : pgs) {
2647 list<obj_watch_item_t> pg_watchers;
2648 pg->get_watchers(&pg_watchers);
2649 watchers.splice(watchers.end(), pg_watchers);
2650 }
2651
2652 f->open_array_section("watchers");
2653 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2654 it != watchers.end(); ++it) {
2655
2656 f->open_object_section("watch");
2657
2658 f->dump_string("namespace", it->obj.nspace);
2659 f->dump_string("object", it->obj.oid.name);
2660
2661 f->open_object_section("entity_name");
2662 it->wi.name.dump(f);
2663 f->close_section(); //entity_name_t
2664
2665 f->dump_unsigned("cookie", it->wi.cookie);
2666 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2667
2668 f->open_object_section("entity_addr_t");
2669 it->wi.addr.dump(f);
2670 f->close_section(); //entity_addr_t
2671
2672 f->close_section(); //watch
2673 }
2674
2675 f->close_section(); //watchers
2676 } else if (prefix == "dump_recovery_reservations") {
2677 f->open_object_section("reservations");
2678 f->open_object_section("local_reservations");
2679 service.local_reserver.dump(f);
2680 f->close_section();
2681 f->open_object_section("remote_reservations");
2682 service.remote_reserver.dump(f);
2683 f->close_section();
2684 f->close_section();
2685 } else if (prefix == "dump_scrub_reservations") {
2686 f->open_object_section("scrub_reservations");
2687 service.dump_scrub_reservations(f);
2688 f->close_section();
2689 } else if (prefix == "get_latest_osdmap") {
2690 get_latest_osdmap();
2691 } else if (prefix == "set_heap_property") {
2692 string property;
2693 int64_t value = 0;
2694 string error;
2695 bool success = false;
2696 if (!cmd_getval(cmdmap, "property", property)) {
2697 error = "unable to get property";
2698 success = false;
2699 } else if (!cmd_getval(cmdmap, "value", value)) {
2700 error = "unable to get value";
2701 success = false;
2702 } else if (value < 0) {
2703 error = "negative value not allowed";
2704 success = false;
2705 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2706 error = "invalid property";
2707 success = false;
2708 } else {
2709 success = true;
2710 }
2711 f->open_object_section("result");
2712 f->dump_string("error", error);
2713 f->dump_bool("success", success);
2714 f->close_section();
2715 } else if (prefix == "get_heap_property") {
2716 string property;
2717 size_t value = 0;
2718 string error;
2719 bool success = false;
2720 if (!cmd_getval(cmdmap, "property", property)) {
2721 error = "unable to get property";
2722 success = false;
2723 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2724 error = "invalid property";
2725 success = false;
2726 } else {
2727 success = true;
2728 }
2729 f->open_object_section("result");
2730 f->dump_string("error", error);
2731 f->dump_bool("success", success);
2732 f->dump_int("value", value);
2733 f->close_section();
2734 } else if (prefix == "dump_objectstore_kv_stats") {
2735 store->get_db_statistics(f);
2736 } else if (prefix == "dump_scrubs") {
2737 service.dumps_scrub(f);
2738 } else if (prefix == "calc_objectstore_db_histogram") {
2739 store->generate_db_histogram(f);
2740 } else if (prefix == "flush_store_cache") {
2741 store->flush_cache(&ss);
2742 } else if (prefix == "dump_pgstate_history") {
2743 f->open_object_section("pgstate_history");
2744 f->open_array_section("pgs");
2745 vector<PGRef> pgs;
2746 _get_pgs(&pgs);
2747 for (auto& pg : pgs) {
2748 f->open_object_section("pg");
2749 f->dump_stream("pg") << pg->pg_id;
2750 f->dump_string("currently", pg->get_current_state());
2751 pg->dump_pgstate_history(f);
2752 f->close_section();
2753 }
2754 f->close_section();
2755 f->close_section();
2756 } else if (prefix == "compact") {
2757 dout(1) << "triggering manual compaction" << dendl;
2758 auto start = ceph::coarse_mono_clock::now();
2759 store->compact();
2760 auto end = ceph::coarse_mono_clock::now();
2761 double duration = std::chrono::duration<double>(end-start).count();
2762 dout(1) << "finished manual compaction in "
2763 << duration
2764 << " seconds" << dendl;
2765 f->open_object_section("compact_result");
2766 f->dump_float("elapsed_time", duration);
2767 f->close_section();
2768 } else if (prefix == "get_mapped_pools") {
2769 f->open_array_section("mapped_pools");
2770 set<int64_t> poollist = get_mapped_pools();
2771 for (auto pool : poollist) {
2772 f->dump_int("pool_id", pool);
2773 }
2774 f->close_section();
2775 } else if (prefix == "smart") {
2776 string devid;
2777 cmd_getval(cmdmap, "devid", devid);
2778 ostringstream out;
2779 probe_smart(devid, out);
2780 outbl.append(out.str());
2781 } else if (prefix == "list_devices") {
2782 set<string> devnames;
2783 store->get_devices(&devnames);
2784 f->open_array_section("list_devices");
2785 for (auto dev : devnames) {
2786 if (dev.find("dm-") == 0) {
2787 continue;
2788 }
2789 string err;
2790 f->open_object_section("device");
2791 f->dump_string("device", "/dev/" + dev);
2792 f->dump_string("device_id", get_device_id(dev, &err));
2793 f->close_section();
2794 }
2795 f->close_section();
2796 } else if (prefix == "send_beacon") {
2797 lock_guard l(osd_lock);
2798 if (is_active()) {
2799 send_beacon(ceph::coarse_mono_clock::now());
2800 }
2801 }
2802
2803 else if (prefix == "cluster_log") {
2804 vector<string> msg;
2805 cmd_getval(cmdmap, "message", msg);
2806 if (msg.empty()) {
2807 ret = -EINVAL;
2808 ss << "ignoring empty log message";
2809 goto out;
2810 }
2811 string message = msg.front();
2812 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2813 message += " " + *a;
2814 string lvl;
2815 cmd_getval(cmdmap, "level", lvl);
2816 clog_type level = string_to_clog_type(lvl);
2817 if (level < 0) {
2818 ret = -EINVAL;
2819 ss << "unknown level '" << lvl << "'";
2820 goto out;
2821 }
2822 clog->do_log(level, message);
2823 }
2824
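// osd bench: write 'count' bytes in 'size'-byte chunks, optionally cycling
// through 'object_num' preallocated objects of 'object_size' bytes each.
// Illustrative invocation (1 GiB in 4 MiB writes):
//   ceph tell osd.0 bench 1073741824 4194304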
2825 else if (prefix == "bench") {
2826 int64_t count;
2827 int64_t bsize;
2828 int64_t osize, onum;
2829 // default count 1G, size 4MB
2830 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2831 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2832 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2833 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2834 double elapsed = 0.0;
2835
2836 ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
2837 if (ret != 0) {
2838 goto out;
2839 }
2840
2841 double rate = count / elapsed;
2842 double iops = rate / bsize;
2843 f->open_object_section("osd_bench_results");
2844 f->dump_int("bytes_written", count);
2845 f->dump_int("blocksize", bsize);
2846 f->dump_float("elapsed_sec", elapsed);
2847 f->dump_float("bytes_per_sec", rate);
2848 f->dump_float("iops", iops);
2849 f->close_section();
2850 }
2851
2852 else if (prefix == "flush_pg_stats") {
2853 mgrc.send_pgstats();
2854 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2855 }
2856
2857 else if (prefix == "heap") {
2858 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2859 }
2860
2861 else if (prefix == "debug dump_missing") {
2862 f->open_array_section("pgs");
2863 vector<PGRef> pgs;
2864 _get_pgs(&pgs);
2865 for (auto& pg : pgs) {
2866 string s = stringify(pg->pg_id);
2867 f->open_array_section(s.c_str());
2868 pg->lock();
2869 pg->dump_missing(f);
2870 pg->unlock();
2871 f->close_section();
2872 }
2873 f->close_section();
2874 }
2875
2876 else if (prefix == "debug kick_recovery_wq") {
2877 int64_t delay;
2878 cmd_getval(cmdmap, "delay", delay);
2879 ostringstream oss;
2880 oss << delay;
2881 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2882 if (ret != 0) {
2883 ss << "kick_recovery_wq: error setting "
2884 << "osd_recovery_delay_start to '" << delay << "': error "
2885 << ret;
2886 goto out;
2887 }
2888 cct->_conf.apply_changes(nullptr);
2889 ss << "kicking recovery queue. set osd_recovery_delay_start "
2890 << "to " << cct->_conf->osd_recovery_delay_start;
2891 }
2892
2893 else if (prefix == "cpu_profiler") {
2894 ostringstream ds;
2895 string arg;
2896 cmd_getval(cmdmap, "arg", arg);
2897 vector<string> argvec;
2898 get_str_vec(arg, argvec);
2899 cpu_profiler_handle_command(argvec, ds);
2900 outbl.append(ds.str());
2901 }
2902
2903 else if (prefix == "dump_pg_recovery_stats") {
2904 lock_guard l(osd_lock);
2905 pg_recovery_stats.dump_formatted(f);
2906 }
2907
2908 else if (prefix == "reset_pg_recovery_stats") {
2909 lock_guard l(osd_lock);
2910 pg_recovery_stats.reset();
2911 }
2912
2913 else if (prefix == "perf histogram dump") {
2914 std::string logger;
2915 std::string counter;
2916 cmd_getval(cmdmap, "logger", logger);
2917 cmd_getval(cmdmap, "counter", counter);
2918 cct->get_perfcounters_collection()->dump_formatted_histograms(
2919 f, false, logger, counter);
2920 }
2921
2922 else if (prefix == "cache drop") {
2923 lock_guard l(osd_lock);
2924 dout(20) << "clearing all caches" << dendl;
2925 // Clear the objectstore's cache - onode and buffer for Bluestore,
2926 // system's pagecache for Filestore
2927 ret = store->flush_cache(&ss);
2928 if (ret < 0) {
2929 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2930 goto out;
2931 }
2932 // Clear the objectcontext cache (per PG)
2933 vector<PGRef> pgs;
2934 _get_pgs(&pgs);
2935 for (auto& pg: pgs) {
2936 pg->clear_cache();
2937 }
2938 }
2939
2940 else if (prefix == "cache status") {
2941 lock_guard l(osd_lock);
2942 int obj_ctx_count = 0;
2943 vector<PGRef> pgs;
2944 _get_pgs(&pgs);
2945 for (auto& pg: pgs) {
2946 obj_ctx_count += pg->get_cache_obj_count();
2947 }
2948 f->open_object_section("cache_status");
2949 f->dump_int("object_ctx", obj_ctx_count);
2950 store->dump_cache_stats(f);
2951 f->close_section();
2952 }
2953
2954 else if (prefix == "scrub_purged_snaps") {
2955 lock_guard l(osd_lock);
2956 scrub_purged_snaps();
2957 }
2958
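// dump_osd_network: report heartbeat ping times above a threshold. With no
// explicit 'value' argument the threshold defaults to
// mon_warn_on_slow_ping_time (ms), or, if that is zero, to
// osd_heartbeat_grace * mon_warn_on_slow_ping_ratio seconds; internally all
// comparisons are done in microseconds.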
2959 else if (prefix == "dump_osd_network") {
2960 lock_guard l(osd_lock);
2961 int64_t value = 0;
2962 if (!(cmd_getval(cmdmap, "value", value))) {
2963 // Convert milliseconds to microseconds
2964 value = static_cast<double>(g_conf().get_val<double>(
2965 "mon_warn_on_slow_ping_time")) * 1000;
2966 if (value == 0) {
2967 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2968 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2969 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2970 }
2971 } else {
2972 // Convert user input to microseconds
2973 value *= 1000;
2974 }
2975 if (value < 0) value = 0;
2976
2977 struct osd_ping_time_t {
2978 uint32_t pingtime;
2979 int to;
2980 bool back;
2981 std::array<uint32_t,3> times;
2982 std::array<uint32_t,3> min;
2983 std::array<uint32_t,3> max;
2984 uint32_t last;
2985 uint32_t last_update;
2986
2987 bool operator<(const osd_ping_time_t& rhs) const {
2988 if (pingtime < rhs.pingtime)
2989 return true;
2990 if (pingtime > rhs.pingtime)
2991 return false;
2992 if (to < rhs.to)
2993 return true;
2994 if (to > rhs.to)
2995 return false;
2996 return back;
2997 }
2998 };
2999
3000 set<osd_ping_time_t> sorted;
3001 // Get pingtimes under lock and not on the stack
3002 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3003 service.get_hb_pingtime(pingtimes);
3004 for (auto j : *pingtimes) {
3005 if (j.second.last_update == 0)
3006 continue;
3007 osd_ping_time_t item;
3008 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3009 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3010 if (item.pingtime >= value) {
3011 item.to = j.first;
3012 item.times[0] = j.second.back_pingtime[0];
3013 item.times[1] = j.second.back_pingtime[1];
3014 item.times[2] = j.second.back_pingtime[2];
3015 item.min[0] = j.second.back_min[0];
3016 item.min[1] = j.second.back_min[1];
3017 item.min[2] = j.second.back_min[2];
3018 item.max[0] = j.second.back_max[0];
3019 item.max[1] = j.second.back_max[1];
3020 item.max[2] = j.second.back_max[2];
3021 item.last = j.second.back_last;
3022 item.back = true;
3023 item.last_update = j.second.last_update;
3024 sorted.emplace(item);
3025 }
3026 if (j.second.front_last == 0)
3027 continue;
3028 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3029 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3030 if (item.pingtime >= value) {
3031 item.to = j.first;
3032 item.times[0] = j.second.front_pingtime[0];
3033 item.times[1] = j.second.front_pingtime[1];
3034 item.times[2] = j.second.front_pingtime[2];
3035 item.min[0] = j.second.front_min[0];
3036 item.min[1] = j.second.front_min[1];
3037 item.min[2] = j.second.front_min[2];
3038 item.max[0] = j.second.front_max[0];
3039 item.max[1] = j.second.front_max[1];
3040 item.max[2] = j.second.front_max[2];
3041 item.last = j.second.front_last;
3042 item.last_update = j.second.last_update;
3043 item.back = false;
3044 sorted.emplace(item);
3045 }
3046 }
3047 delete pingtimes;
3048 //
3049 // Network ping times (1min 5min 15min)
3050 f->open_object_section("network_ping_times");
3051 f->dump_int("threshold", value / 1000);
3052 f->open_array_section("entries");
3053 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3054 ceph_assert(sitem.pingtime >= value);
3055 f->open_object_section("entry");
3056
3057 const time_t lu(sitem.last_update);
3058 char buffer[26];
3059 string lustr(ctime_r(&lu, buffer));
3060 lustr.pop_back(); // Remove trailing \n
3061 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3062 f->dump_string("last update", lustr);
3063 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3064 f->dump_int("from osd", whoami);
3065 f->dump_int("to osd", sitem.to);
3066 f->dump_string("interface", (sitem.back ? "back" : "front"));
3067 f->open_object_section("average");
3068 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3069 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3070 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3071 f->close_section(); // average
3072 f->open_object_section("min");
3073 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str());
3074 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str());
3075 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str());
3076 f->close_section(); // min
3077 f->open_object_section("max");
3078 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3079 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3080 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3081 f->close_section(); // max
3082 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3083 f->close_section(); // entry
3084 }
3085 f->close_section(); // entries
3086 f->close_section(); // network_ping_times
3087 } else {
3088 ceph_abort_msg("broken asok registration");
3089 }
3090
3091 out:
3092 on_finish(ret, ss.str(), outbl);
3093 }
3094
3095 int OSD::run_osd_bench_test(
3096 int64_t count,
3097 int64_t bsize,
3098 int64_t osize,
3099 int64_t onum,
3100 double *elapsed,
3101 ostream &ss)
3102 {
3103 int ret = 0;
3104 uint32_t duration = cct->_conf->osd_bench_duration;
3105
3106 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
3107 // limit the block size, because the checks below rely on it
3108 // having a sane value; if any block size were allowed, things
3109 // could still go sideways.
3110 ss << "block 'size' values are capped at "
3111 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
3112 << " a higher value, please adjust 'osd_bench_max_block_size'";
3113 ret = -EINVAL;
3114 return ret;
3115 } else if (bsize < (int64_t) (1 << 20)) {
3116 // entering the realm of small block sizes.
3117 // limit the count to a sane value, assuming a configurable amount of
3118 // IOPS and duration, so that the OSD doesn't get hung up on this
3119 // and op-thread timeouts don't fire
3120 int64_t max_count =
3121 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
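// Illustrative numbers (both knobs are configurable): with bsize = 64KB,
// duration = 30s and osd_bench_small_size_max_iops = 100, max_count comes
// to 64KB * 30 * 100, i.e. about 196 MB of total writes.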
3122 if (count > max_count) {
3123 ss << "'count' values greater than " << max_count
3124 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3125 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
3126 << " for " << duration << " seconds,"
3127 << " can cause ill effects on osd. "
3128 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
3129 << " value if you wish to use a higher 'count'.";
3130 ret = -EINVAL;
3131 return ret;
3132 }
3133 } else {
3134 // 1MB block sizes are big enough so that we get more stuff done.
3135 // However, to keep the osd from getting hung up on this and from
3136 // triggering timers, we are going to limit the count assuming
3137 // a configurable throughput and duration.
3138 // NOTE: max_count is the total amount of bytes that we believe we
3139 // will be able to write during 'duration' for the given
3140 // throughput. The block size hardly impacts this unless it's
3141 // way too big. Given we already check how big the block size
3142 // is, it's safe to assume everything will check out.
3143 int64_t max_count =
3144 cct->_conf->osd_bench_large_size_max_throughput * duration;
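// E.g. (illustrative) with osd_bench_large_size_max_throughput = 100 MB/s
// and a 30s duration, max_count caps out at ~3 GB regardless of block size.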
3145 if (count > max_count) {
3146 ss << "'count' values greater than " << max_count
3147 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3148 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
3149 << " for " << duration << " seconds,"
3150 << " can cause ill effects on osd. "
3151 << " Please adjust 'osd_bench_large_size_max_throughput'"
3152 << " with a higher value if you wish to use a higher 'count'.";
3153 ret = -EINVAL;
3154 return ret;
3155 }
3156 }
3157
3158 if (osize && bsize > osize) {
3159 bsize = osize;
3160 }
3161
3162 dout(1) << " bench count " << count
3163 << " bsize " << byte_u_t(bsize) << dendl;
3164
3165 ObjectStore::Transaction cleanupt;
3166
3167 if (osize && onum) {
3168 bufferlist bl;
3169 bufferptr bp(osize);
3170 bp.zero();
3171 bl.push_back(std::move(bp));
3172 bl.rebuild_page_aligned();
3173 for (int i=0; i<onum; ++i) {
3174 char nm[30];
3175 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
3176 object_t oid(nm);
3177 hobject_t soid(sobject_t(oid, 0));
3178 ObjectStore::Transaction t;
3179 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
3180 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3181 cleanupt.remove(coll_t(), ghobject_t(soid));
3182 }
3183 }
3184
3185 bufferlist bl;
3186 bufferptr bp(bsize);
3187 bp.zero();
3188 bl.push_back(std::move(bp));
3189 bl.rebuild_page_aligned();
3190
3191 {
3192 C_SaferCond waiter;
3193 if (!service.meta_ch->flush_commit(&waiter)) {
3194 waiter.wait();
3195 }
3196 }
3197
3198 utime_t start = ceph_clock_now();
3199 for (int64_t pos = 0; pos < count; pos += bsize) {
3200 char nm[30];
3201 unsigned offset = 0;
3202 if (onum && osize) {
3203 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
3204 offset = rand() % (osize / bsize) * bsize;
3205 } else {
3206 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
3207 }
3208 object_t oid(nm);
3209 hobject_t soid(sobject_t(oid, 0));
3210 ObjectStore::Transaction t;
3211 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
3212 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3213 if (!onum || !osize) {
3214 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
3215 }
3216 }
3217
3218 {
3219 C_SaferCond waiter;
3220 if (!service.meta_ch->flush_commit(&waiter)) {
3221 waiter.wait();
3222 }
3223 }
3224 utime_t end = ceph_clock_now();
3225 *elapsed = end - start;
3226
3227 // clean up
3228 store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
3229 {
3230 C_SaferCond waiter;
3231 if (!service.meta_ch->flush_commit(&waiter)) {
3232 waiter.wait();
3233 }
3234 }
3235
3236 return ret;
3237 }
3238
3239 class TestOpsSocketHook : public AdminSocketHook {
3240 OSDService *service;
3241 ObjectStore *store;
3242 public:
3243 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3244 int call(std::string_view command, const cmdmap_t& cmdmap,
3245 Formatter *f,
3246 std::ostream& errss,
3247 bufferlist& out) override {
3248 int r = 0;
3249 stringstream outss;
3250 try {
3251 test_ops(service, store, command, cmdmap, outss);
3252 out.append(outss);
3253 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3254 errss << e.what();
3255 r = -EINVAL;
3256 }
3257 return r;
3258 }
3259 void test_ops(OSDService *service, ObjectStore *store,
3260 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3261
3262 };
3263
3264 class OSD::C_Tick : public Context {
3265 OSD *osd;
3266 public:
3267 explicit C_Tick(OSD *o) : osd(o) {}
3268 void finish(int r) override {
3269 osd->tick();
3270 }
3271 };
3272
3273 class OSD::C_Tick_WithoutOSDLock : public Context {
3274 OSD *osd;
3275 public:
3276 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3277 void finish(int r) override {
3278 osd->tick_without_osd_lock();
3279 }
3280 };
3281
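// Mount or tear down the objectstore's debug FUSE view at $osd_data/fuse.
// Normally driven by the osd_objectstore_fuse option; stop==true forces
// teardown regardless (e.g. on shutdown).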
3282 int OSD::enable_disable_fuse(bool stop)
3283 {
3284 #ifdef HAVE_LIBFUSE
3285 int r;
3286 string mntpath = cct->_conf->osd_data + "/fuse";
3287 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3288 dout(1) << __func__ << " disabling" << dendl;
3289 fuse_store->stop();
3290 delete fuse_store;
3291 fuse_store = NULL;
3292 r = ::rmdir(mntpath.c_str());
3293 if (r < 0) {
3294 r = -errno;
3295 derr << __func__ << " failed to rmdir " << mntpath << ": "
3296 << cpp_strerror(r) << dendl;
3297 return r;
3298 }
3299 return 0;
3300 }
3301 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3302 dout(1) << __func__ << " enabling" << dendl;
3303 r = ::mkdir(mntpath.c_str(), 0700);
3304 if (r < 0)
3305 r = -errno;
3306 if (r < 0 && r != -EEXIST) {
3307 derr << __func__ << " unable to create " << mntpath << ": "
3308 << cpp_strerror(r) << dendl;
3309 return r;
3310 }
3311 fuse_store = new FuseStore(store, mntpath);
3312 r = fuse_store->start();
3313 if (r < 0) {
3314 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3315 delete fuse_store;
3316 fuse_store = NULL;
3317 return r;
3318 }
3319 }
3320 #endif // HAVE_LIBFUSE
3321 return 0;
3322 }
3323
3324 size_t OSD::get_num_cache_shards()
3325 {
3326 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3327 }
3328
3329 int OSD::get_num_op_shards()
3330 {
3331 if (cct->_conf->osd_op_num_shards)
3332 return cct->_conf->osd_op_num_shards;
3333 if (store_is_rotational)
3334 return cct->_conf->osd_op_num_shards_hdd;
3335 else
3336 return cct->_conf->osd_op_num_shards_ssd;
3337 }
3338
3339 int OSD::get_num_op_threads()
3340 {
3341 if (cct->_conf->osd_op_num_threads_per_shard)
3342 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3343 if (store_is_rotational)
3344 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3345 else
3346 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3347 }
3348
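// The sleep/throttle getters below share one pattern: an explicit non-zero
// base option wins, otherwise a device-class default is chosen, where
// 'hybrid' means a rotational data store paired with a non-rotational
// journal.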
3349 float OSD::get_osd_recovery_sleep()
3350 {
3351 if (cct->_conf->osd_recovery_sleep)
3352 return cct->_conf->osd_recovery_sleep;
3353 if (!store_is_rotational && !journal_is_rotational)
3354 return cct->_conf->osd_recovery_sleep_ssd;
3355 else if (store_is_rotational && !journal_is_rotational)
3356 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3357 else
3358 return cct->_conf->osd_recovery_sleep_hdd;
3359 }
3360
3361 float OSD::get_osd_delete_sleep()
3362 {
3363 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3364 if (osd_delete_sleep > 0)
3365 return osd_delete_sleep;
3366 if (!store_is_rotational && !journal_is_rotational)
3367 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3368 if (store_is_rotational && !journal_is_rotational)
3369 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3370 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3371 }
3372
3373 int OSD::get_recovery_max_active()
3374 {
3375 if (cct->_conf->osd_recovery_max_active)
3376 return cct->_conf->osd_recovery_max_active;
3377 if (store_is_rotational)
3378 return cct->_conf->osd_recovery_max_active_hdd;
3379 else
3380 return cct->_conf->osd_recovery_max_active_ssd;
3381 }
3382
3383 float OSD::get_osd_snap_trim_sleep()
3384 {
3385 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3386 if (osd_snap_trim_sleep > 0)
3387 return osd_snap_trim_sleep;
3388 if (!store_is_rotational && !journal_is_rotational)
3389 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3390 if (store_is_rotational && !journal_is_rotational)
3391 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3392 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3393 }
3394
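// Bring the OSD up: mount the objectstore, sanity-check long object name
// support, validate the superblock and compat set, load the current OSDMap
// and the PGs, then wire up the messengers and the mon/mgr clients. Errors
// bail out through the function's 'out' label.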
3395 int OSD::init()
3396 {
3397 OSDMapRef osdmap;
3398 CompatSet initial, diff;
3399 std::lock_guard lock(osd_lock);
3400 if (is_stopping())
3401 return 0;
3402
3403 tick_timer.init();
3404 tick_timer_without_osd_lock.init();
3405 service.recovery_request_timer.init();
3406 service.sleep_timer.init();
3407
3408 boot_finisher.start();
3409
3410 {
3411 string val;
3412 store->read_meta("require_osd_release", &val);
3413 last_require_osd_release = ceph_release_from_name(val);
3414 }
3415
3416 // mount.
3417 dout(2) << "init " << dev_path
3418 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3419 << dendl;
3420 dout(2) << "journal " << journal_path << dendl;
3421 ceph_assert(store); // call pre_init() first!
3422
3423 store->set_cache_shards(get_num_cache_shards());
3424
3425 int r = store->mount();
3426 if (r < 0) {
3427 derr << "OSD::init: unable to mount object store" << dendl;
3428 return r;
3429 }
3430 journal_is_rotational = store->is_journal_rotational();
3431 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3432 << dendl;
3433
3434 enable_disable_fuse(false);
3435
3436 dout(2) << "boot" << dendl;
3437
3438 service.meta_ch = store->open_collection(coll_t::meta());
3439
3440 // initialize the daily loadavg with current 15min loadavg
3441 double loadavgs[3];
3442 if (getloadavg(loadavgs, 3) == 3) {
3443 daily_loadavg = loadavgs[2];
3444 } else {
3445 derr << "OSD::init(): couldn't read loadavgs" << dendl;
3446 daily_loadavg = 1.0;
3447 }
3448
3449 int rotating_auth_attempts = 0;
3450 auto rotating_auth_timeout =
3451 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
3452
3453 // sanity check long object name handling
3454 {
3455 hobject_t l;
3456 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3457 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3458 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3459 r = store->validate_hobject_key(l);
3460 if (r < 0) {
3461 derr << "backend (" << store->get_type() << ") is unable to support max "
3462 << "object name[space] len" << dendl;
3463 derr << " osd max object name len = "
3464 << cct->_conf->osd_max_object_name_len << dendl;
3465 derr << " osd max object namespace len = "
3466 << cct->_conf->osd_max_object_namespace_len << dendl;
3467 derr << cpp_strerror(r) << dendl;
3468 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3469 goto out;
3470 }
3471 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3472 << dendl;
3473 } else {
3474 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3475 }
3476 }
3477
3478 // read superblock
3479 r = read_superblock();
3480 if (r < 0) {
3481 derr << "OSD::init() : unable to read osd superblock" << dendl;
3482 r = -EINVAL;
3483 goto out;
3484 }
3485
3486 if (osd_compat.compare(superblock.compat_features) < 0) {
3487 derr << "The disk uses features unsupported by the executable." << dendl;
3488 derr << " ondisk features " << superblock.compat_features << dendl;
3489 derr << " daemon features " << osd_compat << dendl;
3490
3491 if (osd_compat.writeable(superblock.compat_features)) {
3492 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3493 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3494 r = -EOPNOTSUPP;
3495 goto out;
3496 }
3497 else {
3498 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3499 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3500 r = -EOPNOTSUPP;
3501 goto out;
3502 }
3503 }
3504
3505 assert_warn(whoami == superblock.whoami);
3506 if (whoami != superblock.whoami) {
3507 derr << "OSD::init: superblock says osd"
3508 << superblock.whoami << " but I am osd." << whoami << dendl;
3509 r = -EINVAL;
3510 goto out;
3511 }
3512
3513 startup_time = ceph::mono_clock::now();
3514
3515 // load up "current" osdmap
3516 assert_warn(!get_osdmap());
3517 if (get_osdmap()) {
3518 derr << "OSD::init: unable to read current osdmap" << dendl;
3519 r = -EINVAL;
3520 goto out;
3521 }
3522 osdmap = get_map(superblock.current_epoch);
3523 set_osdmap(osdmap);
3524
3525 // make sure we don't have legacy pgs deleting
3526 {
3527 vector<coll_t> ls;
3528 int r = store->list_collections(ls);
3529 ceph_assert(r >= 0);
3530 for (auto c : ls) {
3531 spg_t pgid;
3532 if (c.is_pg(&pgid) &&
3533 !osdmap->have_pg_pool(pgid.pool())) {
3534 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3535 if (!store->exists(service.meta_ch, oid)) {
3536 derr << __func__ << " missing pg_pool_t for deleted pool "
3537 << pgid.pool() << " for pg " << pgid
3538 << "; please downgrade to luminous and allow "
3539 << "pg deletion to complete before upgrading" << dendl;
3540 ceph_abort();
3541 }
3542 }
3543 }
3544 }
3545
3546 initial = get_osd_initial_compat_set();
3547 diff = superblock.compat_features.unsupported(initial);
3548 if (superblock.compat_features.merge(initial)) {
3549 // Are we adding SNAPMAPPER2?
3550 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3551 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3552 << dendl;
3553 auto ch = service.meta_ch;
3554 auto hoid = make_snapmapper_oid();
3555 unsigned max = cct->_conf->osd_target_transaction_size;
3556 r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
3557 if (r < 0)
3558 goto out;
3559 }
3560 // We need to persist the new compat_set before we
3561 // do anything else
3562 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3563 ObjectStore::Transaction t;
3564 write_superblock(t);
3565 r = store->queue_transaction(service.meta_ch, std::move(t));
3566 if (r < 0)
3567 goto out;
3568 }
3569
3570 // make sure snap mapper object exists
3571 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
3572 dout(10) << "init creating/touching snapmapper object" << dendl;
3573 ObjectStore::Transaction t;
3574 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3575 r = store->queue_transaction(service.meta_ch, std::move(t));
3576 if (r < 0)
3577 goto out;
3578 }
3579 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3580 dout(10) << "init creating/touching purged_snaps object" << dendl;
3581 ObjectStore::Transaction t;
3582 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3583 r = store->queue_transaction(service.meta_ch, std::move(t));
3584 if (r < 0)
3585 goto out;
3586 }
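// The two blocks above use a touch-if-missing idiom: create the metadata
// object only when it does not already exist, so repeated daemon starts are
// idempotent. A minimal sketch of the same pattern (with a hypothetical
// object id 'oid'):
//
//   if (!store->exists(service.meta_ch, oid)) {
//     ObjectStore::Transaction t;
//     t.touch(coll_t::meta(), oid);
//     if (store->queue_transaction(service.meta_ch, std::move(t)) < 0)
//       goto out;  // init() bails out on any store error
//   }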
3587
3588 if (cct->_conf->osd_open_classes_on_start) {
3589 int r = ClassHandler::get_instance().open_all_classes();
3590 if (r)
3591 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3592 }
3593
3594 check_osdmap_features();
3595
3596 {
3597 epoch_t bind_epoch = osdmap->get_epoch();
3598 service.set_epochs(NULL, NULL, &bind_epoch);
3599 }
3600
3601 clear_temp_objects();
3602
3603 // initialize osdmap references in sharded wq
3604 for (auto& shard : shards) {
3605 std::lock_guard l(shard->osdmap_lock);
3606 shard->shard_osdmap = osdmap;
3607 }
3608
3609 // load up pgs (as they previously existed)
3610 load_pgs();
3611
3612 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
3613
3614 if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
3615 dout(2) << "compacting object store's omap" << dendl;
3616 store->compact();
3617 }
3618
3619 // prime osd stats
3620 {
3621 struct store_statfs_t stbuf;
3622 osd_alert_list_t alerts;
3623 int r = store->statfs(&stbuf, &alerts);
3624 ceph_assert(r == 0);
3625 service.set_statfs(stbuf, alerts);
3626 }
3627
3628 // client_messenger's auth_client will be set up by monc->init() later.
3629 for (auto m : { cluster_messenger,
3630 objecter_messenger,
3631 hb_front_client_messenger,
3632 hb_back_client_messenger,
3633 hb_front_server_messenger,
3634 hb_back_server_messenger } ) {
3635 m->set_auth_client(monc);
3636 }
3637 for (auto m : { client_messenger,
3638 cluster_messenger,
3639 hb_front_server_messenger,
3640 hb_back_server_messenger }) {
3641 m->set_auth_server(monc);
3642 }
3643 monc->set_handle_authentication_dispatcher(this);
3644
3645 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3646 | CEPH_ENTITY_TYPE_MGR);
3647 r = monc->init();
3648 if (r < 0)
3649 goto out;
3650
3651 mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
3652 mgrc.set_perf_metric_query_cb(
3653 [this](const ConfigPayload &config_payload) {
3654 set_perf_queries(config_payload);
3655 },
3656 [this] {
3657 return get_perf_reports();
3658 });
3659 mgrc.init();
3660
3661 // tell monc about log_client so it will know about mon session resets
3662 monc->set_log_client(&log_client);
3663 update_log_config();
3664
3665 // i'm ready!
3666 client_messenger->add_dispatcher_tail(&mgrc);
3667 client_messenger->add_dispatcher_tail(this);
3668 cluster_messenger->add_dispatcher_head(this);
3669
3670 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3671 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3672 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3673 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3674
3675 objecter_messenger->add_dispatcher_head(service.objecter.get());
3676
3677 service.init();
3678 service.publish_map(osdmap);
3679 service.publish_superblock(superblock);
3680 service.max_oldest_map = superblock.oldest_map;
3681
3682 for (auto& shard : shards) {
3683 // put PGs in a temporary set because we may modify pg_slots
3684 // unordered_map below.
3685 set<PGRef> pgs;
3686 for (auto& i : shard->pg_slots) {
3687 PGRef pg = i.second->pg;
3688 if (!pg) {
3689 continue;
3690 }
3691 pgs.insert(pg);
3692 }
3693 for (auto pg : pgs) {
3694 std::scoped_lock l{*pg};
3695 set<pair<spg_t,epoch_t>> new_children;
3696 set<pair<spg_t,epoch_t>> merge_pgs;
3697 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3698 &new_children, &merge_pgs);
3699 if (!new_children.empty()) {
3700 for (auto shard : shards) {
3701 shard->prime_splits(osdmap, &new_children);
3702 }
3703 assert(new_children.empty());
3704 }
3705 if (!merge_pgs.empty()) {
3706 for (auto shard : shards) {
3707 shard->prime_merges(osdmap, &merge_pgs);
3708 }
3709 assert(merge_pgs.empty());
3710 }
3711 }
3712 }
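// Note: prime_splits()/prime_merges() are expected to consume the entries
// they own from the sets passed in; after every shard has been offered the
// set, the asserts above require that nothing was left unclaimed.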
3713
3714 osd_op_tp.start();
3715
3716 // start the heartbeat
3717 heartbeat_thread.create("osd_srv_heartbt");
3718
3719 // tick
3720 tick_timer.add_event_after(get_tick_interval(),
3721 new C_Tick(this));
3722 {
3723 std::lock_guard l(tick_timer_lock);
3724 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3725 new C_Tick_WithoutOSDLock(this));
3726 }
3727
3728 osd_lock.unlock();
3729
3730 r = monc->authenticate();
3731 if (r < 0) {
3732 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3733 << dendl;
3734 exit(1);
3735 }
3736
3737 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
3738 derr << "unable to obtain rotating service keys; retrying" << dendl;
3739 ++rotating_auth_attempts;
3740 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
3741 derr << __func__ << " wait_auth_rotating timed out" << dendl;
3742 exit(1);
3743 }
3744 }
3745
3746 r = update_crush_device_class();
3747 if (r < 0) {
3748 derr << __func__ << " unable to update_crush_device_class: "
3749 << cpp_strerror(r) << dendl;
3750 exit(1);
3751 }
3752
3753 r = update_crush_location();
3754 if (r < 0) {
3755 derr << __func__ << " unable to update_crush_location: "
3756 << cpp_strerror(r) << dendl;
3757 exit(1);
3758 }
3759
3760 osd_lock.lock();
3761 if (is_stopping())
3762 return 0;
3763
3764 // start objecter *after* we have authenticated, so that we don't ignore
3765 // the OSDMaps it requests.
3766 service.final_init();
3767
3768 check_config();
3769
3770 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3771 consume_map();
3772
3773 dout(0) << "done with init, starting boot process" << dendl;
3774
3775 // subscribe to any pg creations
3776 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3777
3778 // MgrClient needs this (it doesn't have MonClient reference itself)
3779 monc->sub_want("mgrmap", 0, 0);
3780
3781 // we don't need to ask for an osdmap here; objecter will
3782 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3783
3784 monc->renew_subs();
3785
3786 start_boot();
3787
3788 // Override a few options if mclock scheduler is enabled.
3789 maybe_override_max_osd_capacity_for_qos();
3790 maybe_override_options_for_qos();
3791
3792 return 0;
3793
3794 out:
3795 enable_disable_fuse(true);
3796 store->umount();
3797 delete store;
3798 store = NULL;
3799 return r;
3800 }
3801
3802 void OSD::final_init()
3803 {
3804 AdminSocket *admin_socket = cct->get_admin_socket();
3805 asok_hook = new OSDSocketHook(this);
3806 int r = admin_socket->register_command("status", asok_hook,
3807 "high-level status of OSD");
3808 ceph_assert(r == 0);
3809 r = admin_socket->register_command("flush_journal",
3810 asok_hook,
3811 "flush the journal to permanent store");
3812 ceph_assert(r == 0);
3813 r = admin_socket->register_command("dump_ops_in_flight " \
3814 "name=filterstr,type=CephString,n=N,req=false",
3815 asok_hook,
3816 "show the ops currently in flight");
3817 ceph_assert(r == 0);
3818 r = admin_socket->register_command("ops " \
3819 "name=filterstr,type=CephString,n=N,req=false",
3820 asok_hook,
3821 "show the ops currently in flight");
3822 ceph_assert(r == 0);
3823 r = admin_socket->register_command("dump_blocked_ops " \
3824 "name=filterstr,type=CephString,n=N,req=false",
3825 asok_hook,
3826 "show the blocked ops currently in flight");
3827 ceph_assert(r == 0);
3828 r = admin_socket->register_command("dump_historic_ops " \
3829 "name=filterstr,type=CephString,n=N,req=false",
3830 asok_hook,
3831 "show recent ops");
3832 ceph_assert(r == 0);
3833 r = admin_socket->register_command("dump_historic_slow_ops " \
3834 "name=filterstr,type=CephString,n=N,req=false",
3835 asok_hook,
3836 "show slowest recent ops");
3837 ceph_assert(r == 0);
3838 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3839 "name=filterstr,type=CephString,n=N,req=false",
3840 asok_hook,
3841 "show slowest recent ops, sorted by duration");
3842 ceph_assert(r == 0);
3843 r = admin_socket->register_command("dump_op_pq_state",
3844 asok_hook,
3845 "dump op priority queue state");
3846 ceph_assert(r == 0);
3847 r = admin_socket->register_command("dump_blocklist",
3848 asok_hook,
3849 "dump blocklisted clients and times");
3850 ceph_assert(r == 0);
3851 r = admin_socket->register_command("dump_watchers",
3852 asok_hook,
3853 "show clients which have active watches,"
3854 " and on which objects");
3855 ceph_assert(r == 0);
3856 r = admin_socket->register_command("dump_recovery_reservations",
3857 asok_hook,
3858 "show recovery reservations");
3859 ceph_assert(r == 0);
3860 r = admin_socket->register_command("dump_scrub_reservations",
3861 asok_hook,
3862 "show scrub reservations");
3863 ceph_assert(r == 0);
3864 r = admin_socket->register_command("get_latest_osdmap",
3865 asok_hook,
3866 "force osd to update the latest map from "
3867 "the mon");
3868 ceph_assert(r == 0);
3869
3870 r = admin_socket->register_command("set_heap_property " \
3871 "name=property,type=CephString " \
3872 "name=value,type=CephInt",
3873 asok_hook,
3874 "update malloc extension heap property");
3875 ceph_assert(r == 0);
3876
3877 r = admin_socket->register_command("get_heap_property " \
3878 "name=property,type=CephString",
3879 asok_hook,
3880 "get malloc extension heap property");
3881 ceph_assert(r == 0);
3882
3883 r = admin_socket->register_command("dump_objectstore_kv_stats",
3884 asok_hook,
3885 "print statistics of kvdb which used by bluestore");
3886 ceph_assert(r == 0);
3887
3888 r = admin_socket->register_command("dump_scrubs",
3889 asok_hook,
3890 "print scheduled scrubs");
3891 ceph_assert(r == 0);
3892
3893 r = admin_socket->register_command("calc_objectstore_db_histogram",
3894 asok_hook,
3895 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3896 ceph_assert(r == 0);
3897
3898 r = admin_socket->register_command("flush_store_cache",
3899 asok_hook,
3900 "Flush bluestore internal cache");
3901 ceph_assert(r == 0);
3902 r = admin_socket->register_command("dump_pgstate_history",
3903 asok_hook,
3904 "show recent state history");
3905 ceph_assert(r == 0);
3906
3907 r = admin_socket->register_command("compact",
3908 asok_hook,
3909 "Commpact object store's omap."
3910 " WARNING: Compaction probably slows your requests");
3911 ceph_assert(r == 0);
3912
3913 r = admin_socket->register_command("get_mapped_pools",
3914 asok_hook,
3915 "dump pools whose PG(s) are mapped to this OSD.");
3916
3917 ceph_assert(r == 0);
3918
3919 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3920 asok_hook,
3921 "probe OSD devices for SMART data.");
3922
3923 ceph_assert(r == 0);
3924
3925 r = admin_socket->register_command("list_devices",
3926 asok_hook,
3927 "list OSD devices.");
3928 r = admin_socket->register_command("send_beacon",
3929 asok_hook,
3930 "send OSD beacon to mon immediately");
3931
3932 r = admin_socket->register_command(
3933 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3934 "Dump osd heartbeat network ping times");
3935 ceph_assert(r == 0);
3936
3937 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3938 // Note: pools are CephString instead of CephPoolname because
3939 // these commands traditionally support both pool names and numbers
3940 r = admin_socket->register_command(
3941 "setomapval " \
3942 "name=pool,type=CephString " \
3943 "name=objname,type=CephObjectname " \
3944 "name=key,type=CephString "\
3945 "name=val,type=CephString",
3946 test_ops_hook,
3947 "set omap key");
3948 ceph_assert(r == 0);
3949 r = admin_socket->register_command(
3950 "rmomapkey " \
3951 "name=pool,type=CephString " \
3952 "name=objname,type=CephObjectname " \
3953 "name=key,type=CephString",
3954 test_ops_hook,
3955 "remove omap key");
3956 ceph_assert(r == 0);
3957 r = admin_socket->register_command(
3958 "setomapheader " \
3959 "name=pool,type=CephString " \
3960 "name=objname,type=CephObjectname " \
3961 "name=header,type=CephString",
3962 test_ops_hook,
3963 "set omap header");
3964 ceph_assert(r == 0);
3965
3966 r = admin_socket->register_command(
3967 "getomap " \
3968 "name=pool,type=CephString " \
3969 "name=objname,type=CephObjectname",
3970 test_ops_hook,
3971 "output entire object map");
3972 ceph_assert(r == 0);
3973
3974 r = admin_socket->register_command(
3975 "truncobj " \
3976 "name=pool,type=CephString " \
3977 "name=objname,type=CephObjectname " \
3978 "name=len,type=CephInt",
3979 test_ops_hook,
3980 "truncate object to length");
3981 ceph_assert(r == 0);
3982
3983 r = admin_socket->register_command(
3984 "injectdataerr " \
3985 "name=pool,type=CephString " \
3986 "name=objname,type=CephObjectname " \
3987 "name=shardid,type=CephInt,req=false,range=0|255",
3988 test_ops_hook,
3989 "inject data error to an object");
3990 ceph_assert(r == 0);
3991
3992 r = admin_socket->register_command(
3993 "injectmdataerr " \
3994 "name=pool,type=CephString " \
3995 "name=objname,type=CephObjectname " \
3996 "name=shardid,type=CephInt,req=false,range=0|255",
3997 test_ops_hook,
3998 "inject metadata error to an object");
3999 ceph_assert(r == 0);
4000 r = admin_socket->register_command(
4001 "set_recovery_delay " \
4002 "name=utime,type=CephInt,req=false",
4003 test_ops_hook,
4004 "Delay osd recovery by specified seconds");
4005 ceph_assert(r == 0);
4006 r = admin_socket->register_command(
4007 "injectfull " \
4008 "name=type,type=CephString,req=false " \
4009 "name=count,type=CephInt,req=false ",
4010 test_ops_hook,
4011 "Inject a full disk (optional count times)");
4012 ceph_assert(r == 0);
4013 r = admin_socket->register_command(
4014 "bench " \
4015 "name=count,type=CephInt,req=false " \
4016 "name=size,type=CephInt,req=false " \
4017 "name=object_size,type=CephInt,req=false " \
4018 "name=object_num,type=CephInt,req=false ",
4019 asok_hook,
4020 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4021 "(default count=1G default size=4MB). Results in log.");
4022 ceph_assert(r == 0);
4023 r = admin_socket->register_command(
4024 "cluster_log " \
4025 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4026 "name=message,type=CephString,n=N",
4027 asok_hook,
4028 "log a message to the cluster log");
4029 ceph_assert(r == 0);
4030 r = admin_socket->register_command(
4031 "flush_pg_stats",
4032 asok_hook,
4033 "flush pg stats");
4034 ceph_assert(r == 0);
4035 r = admin_socket->register_command(
4036 "heap " \
4037 "name=heapcmd,type=CephChoices,strings=" \
4038 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4039 "name=value,type=CephString,req=false",
4040 asok_hook,
4041 "show heap usage info (available only if compiled with tcmalloc)");
4042 ceph_assert(r == 0);
4043 r = admin_socket->register_command(
4044 "debug dump_missing " \
4045 "name=filename,type=CephFilepath",
4046 asok_hook,
4047 "dump missing objects to a named file");
4048 ceph_assert(r == 0);
4049 r = admin_socket->register_command(
4050 "debug kick_recovery_wq " \
4051 "name=delay,type=CephInt,range=0",
4052 asok_hook,
4053 "set osd_recovery_delay_start to <val>");
4054 ceph_assert(r == 0);
4055 r = admin_socket->register_command(
4056 "cpu_profiler " \
4057 "name=arg,type=CephChoices,strings=status|flush",
4058 asok_hook,
4059 "run cpu profiling on daemon");
4060 ceph_assert(r == 0);
4061 r = admin_socket->register_command(
4062 "dump_pg_recovery_stats",
4063 asok_hook,
4064 "dump pg recovery statistics");
4065 ceph_assert(r == 0);
4066 r = admin_socket->register_command(
4067 "reset_pg_recovery_stats",
4068 asok_hook,
4069 "reset pg recovery statistics");
4070 ceph_assert(r == 0);
4071 r = admin_socket->register_command(
4072 "cache drop",
4073 asok_hook,
4074 "Drop all OSD caches");
4075 ceph_assert(r == 0);
4076 r = admin_socket->register_command(
4077 "cache status",
4078 asok_hook,
4079 "Get OSD caches statistics");
4080 ceph_assert(r == 0);
4081 r = admin_socket->register_command(
4082 "scrub_purged_snaps",
4083 asok_hook,
4084 "Scrub purged_snaps vs snapmapper index");
4085 ceph_assert(r == 0);
4086
4087 // -- pg commands --
4088 // old form: ceph pg <pgid> command ...
4089 r = admin_socket->register_command(
4090 "pg " \
4091 "name=pgid,type=CephPgid " \
4092 "name=cmd,type=CephChoices,strings=query",
4093 asok_hook,
4094 "");
4095 ceph_assert(r == 0);
4096 r = admin_socket->register_command(
4097 "pg " \
4098 "name=pgid,type=CephPgid " \
4099 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4100 "name=mulcmd,type=CephChoices,strings=revert|delete",
4101 asok_hook,
4102 "");
4103 ceph_assert(r == 0);
4104 r = admin_socket->register_command(
4105 "pg " \
4106 "name=pgid,type=CephPgid " \
4107 "name=cmd,type=CephChoices,strings=list_unfound " \
4108 "name=offset,type=CephString,req=false",
4109 asok_hook,
4110 "");
4111 ceph_assert(r == 0);
4112 r = admin_socket->register_command(
4113 "pg " \
4114 "name=pgid,type=CephPgid " \
4115 "name=cmd,type=CephChoices,strings=scrub " \
4116 "name=time,type=CephInt,req=false",
4117 asok_hook,
4118 "");
4119 ceph_assert(r == 0);
4120 r = admin_socket->register_command(
4121 "pg " \
4122 "name=pgid,type=CephPgid " \
4123 "name=cmd,type=CephChoices,strings=deep_scrub " \
4124 "name=time,type=CephInt,req=false",
4125 asok_hook,
4126 "");
4127 ceph_assert(r == 0);
4128 // new form: tell <pgid> <cmd> for both cli and rest
4129 r = admin_socket->register_command(
4130 "query",
4131 asok_hook,
4132 "show details of a specific pg");
4133 ceph_assert(r == 0);
4134 r = admin_socket->register_command(
4135 "mark_unfound_lost " \
4136 "name=pgid,type=CephPgid,req=false " \
4137 "name=mulcmd,type=CephChoices,strings=revert|delete",
4138 asok_hook,
4139 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4140 ceph_assert(r == 0);
4141 r = admin_socket->register_command(
4142 "list_unfound " \
4143 "name=pgid,type=CephPgid,req=false " \
4144 "name=offset,type=CephString,req=false",
4145 asok_hook,
4146 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4147 ceph_assert(r == 0);
4148 r = admin_socket->register_command(
4149 "scrub " \
4150 "name=pgid,type=CephPgid,req=false " \
4151 "name=time,type=CephInt,req=false",
4152 asok_hook,
4153 "Trigger a scheduled scrub ");
4154 ceph_assert(r == 0);
4155 r = admin_socket->register_command(
4156 "deep_scrub " \
4157 "name=pgid,type=CephPgid,req=false " \
4158 "name=time,type=CephInt,req=false",
4159 asok_hook,
4160 "Trigger a scheduled deep scrub ");
4161 ceph_assert(r == 0);
4162 }
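// Every command registered above is served over the daemon's admin socket
// once final_init() returns. Typical invocations from the host running this
// OSD (the id 0 is illustrative):
//
//   ceph daemon osd.0 dump_ops_in_flight
//   ceph daemon osd.0 dump_historic_slow_ops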
4163
4164 PerfCounters* OSD::create_logger()
4165 {
4166 PerfCounters* logger = build_osd_logger(cct);
4167 cct->get_perfcounters_collection()->add(logger);
4168 return logger;
4169 }
4170
4171 PerfCounters* OSD::create_recoverystate_perf()
4172 {
4173 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
4174 cct->get_perfcounters_collection()->add(recoverystate_perf);
4175 return recoverystate_perf;
4176 }
4177
4178 int OSD::shutdown()
4179 {
4180 if (cct->_conf->osd_fast_shutdown) {
4181 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
4182 if (cct->_conf->osd_fast_shutdown_notify_mon)
4183 service.prepare_to_stop();
4184 cct->_log->flush();
4185 _exit(0);
4186 }
4187
4188 if (!service.prepare_to_stop())
4189 return 0; // already shutting down
4190 osd_lock.lock();
4191 if (is_stopping()) {
4192 osd_lock.unlock();
4193 return 0;
4194 }
4195 dout(0) << "shutdown" << dendl;
4196
4197 set_state(STATE_STOPPING);
4198
4199 // Debugging
4200 if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4201 cct->_conf.set_val("debug_osd", "100");
4202 cct->_conf.set_val("debug_journal", "100");
4203 cct->_conf.set_val("debug_filestore", "100");
4204 cct->_conf.set_val("debug_bluestore", "100");
4205 cct->_conf.set_val("debug_ms", "100");
4206 cct->_conf.apply_changes(nullptr);
4207 }
4208
4209 // stop MgrClient earlier as it's more like an internal consumer of OSD
4210 mgrc.shutdown();
4211
4212 service.start_shutdown();
4213
4214 // stop sending work to pgs. this just prevents any new work in _process
4215 // from racing with on_shutdown and potentially entering the pg after.
4216 op_shardedwq.drain();
4217
4218 // Shutdown PGs
4219 {
4220 vector<PGRef> pgs;
4221 _get_pgs(&pgs);
4222 for (auto pg : pgs) {
4223 pg->shutdown();
4224 }
4225 }
4226
4227 // drain op queue again (in case PGs requeued something)
4228 op_shardedwq.drain();
4229 {
4230 finished.clear(); // zap waiters (bleh, this is messy)
4231 waiting_for_osdmap.clear();
4232 }
4233
4234 // unregister commands
4235 cct->get_admin_socket()->unregister_commands(asok_hook);
4236 delete asok_hook;
4237 asok_hook = NULL;
4238
4239 cct->get_admin_socket()->unregister_commands(test_ops_hook);
4240 delete test_ops_hook;
4241 test_ops_hook = NULL;
4242
4243 osd_lock.unlock();
4244
4245 {
4246 std::lock_guard l{heartbeat_lock};
4247 heartbeat_stop = true;
4248 heartbeat_cond.notify_all();
4249 heartbeat_peers.clear();
4250 }
4251 heartbeat_thread.join();
4252
4253 hb_back_server_messenger->mark_down_all();
4254 hb_front_server_messenger->mark_down_all();
4255 hb_front_client_messenger->mark_down_all();
4256 hb_back_client_messenger->mark_down_all();
4257
4258 osd_op_tp.drain();
4259 osd_op_tp.stop();
4260 dout(10) << "op sharded tp stopped" << dendl;
4261
4262 dout(10) << "stopping agent" << dendl;
4263 service.agent_stop();
4264
4265 boot_finisher.wait_for_empty();
4266
4267 osd_lock.lock();
4268
4269 boot_finisher.stop();
4270 reset_heartbeat_peers(true);
4271
4272 tick_timer.shutdown();
4273
4274 {
4275 std::lock_guard l(tick_timer_lock);
4276 tick_timer_without_osd_lock.shutdown();
4277 }
4278
4279 // note unmount epoch
4280 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
4281 superblock.mounted = service.get_boot_epoch();
4282 superblock.clean_thru = get_osdmap_epoch();
4283 ObjectStore::Transaction t;
4284 write_superblock(t);
4285 int r = store->queue_transaction(service.meta_ch, std::move(t));
4286 if (r) {
4287 derr << "OSD::shutdown: error writing superblock: "
4288 << cpp_strerror(r) << dendl;
4289 }
4290
4291
4292 service.shutdown_reserver();
4293
4294 // Remove PGs
4295 #ifdef PG_DEBUG_REFS
4296 service.dump_live_pgids();
4297 #endif
4298 while (true) {
4299 vector<PGRef> pgs;
4300 _get_pgs(&pgs, true);
4301 if (pgs.empty()) {
4302 break;
4303 }
4304 for (auto& pg : pgs) {
4305 if (pg->is_deleted()) {
4306 continue;
4307 }
4308 dout(20) << " kicking pg " << pg << dendl;
4309 pg->lock();
4310 if (pg->get_num_ref() != 1) {
4311 derr << "pgid " << pg->get_pgid() << " has ref count of "
4312 << pg->get_num_ref() << dendl;
4313 #ifdef PG_DEBUG_REFS
4314 pg->dump_live_ids();
4315 #endif
4316 if (cct->_conf->osd_shutdown_pgref_assert) {
4317 ceph_abort();
4318 }
4319 }
4320 pg->ch.reset();
4321 pg->unlock();
4322 }
4323 }
4324 #ifdef PG_DEBUG_REFS
4325 service.dump_live_pgids();
4326 #endif
4327
4328 osd_lock.unlock();
4329 cct->_conf.remove_observer(this);
4330 osd_lock.lock();
4331
4332 service.meta_ch.reset();
4333
4334 dout(10) << "syncing store" << dendl;
4335 enable_disable_fuse(true);
4336
4337 if (cct->_conf->osd_journal_flush_on_shutdown) {
4338 dout(10) << "flushing journal" << dendl;
4339 store->flush_journal();
4340 }
4341
4342 monc->shutdown();
4343 osd_lock.unlock();
4344 {
4345 std::unique_lock l{map_lock};
4346 set_osdmap(OSDMapRef());
4347 }
4348 for (auto s : shards) {
4349 std::lock_guard l(s->osdmap_lock);
4350 s->shard_osdmap = OSDMapRef();
4351 }
4352 service.shutdown();
4353
4354 std::lock_guard lock(osd_lock);
4355 store->umount();
4356 delete store;
4357 store = nullptr;
4358 dout(10) << "Store synced" << dendl;
4359
4360 op_tracker.on_shutdown();
4361
4362 ClassHandler::get_instance().shutdown();
4363 client_messenger->shutdown();
4364 cluster_messenger->shutdown();
4365 hb_front_client_messenger->shutdown();
4366 hb_back_client_messenger->shutdown();
4367 objecter_messenger->shutdown();
4368 hb_front_server_messenger->shutdown();
4369 hb_back_server_messenger->shutdown();
4370
4371 return r;
4372 }
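// In short: osd_fast_shutdown skips all of the teardown above and calls
// _exit(0) immediately, while the clean path drains the op queue twice
// (before and after PG shutdown) so no new work can race into a dying PG,
// and only then stops timers, unmounts the store, and shuts down the
// messengers.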
4373
4374 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4375 {
4376 bool created = false;
4377 while (true) {
4378 dout(10) << __func__ << " cmd: " << cmd << dendl;
4379 vector<string> vcmd{cmd};
4380 bufferlist inbl;
4381 C_SaferCond w;
4382 string outs;
4383 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4384 int r = w.wait();
4385 if (r < 0) {
4386 if (r == -ENOENT && !created) {
4387 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4388 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4389 vector<string> vnewcmd{newcmd};
4390 bufferlist inbl;
4391 C_SaferCond w;
4392 string outs;
4393 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4394 int r = w.wait();
4395 if (r < 0) {
4396 derr << __func__ << " fail: osd does not exist and created failed: "
4397 << cpp_strerror(r) << dendl;
4398 return r;
4399 }
4400 created = true;
4401 continue;
4402 }
4403 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4404 return r;
4405 }
4406 break;
4407 }
4408
4409 return 0;
4410 }
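// On -ENOENT the loop above first registers this daemon with the cluster
// and then retries the original command exactly once. The injected create
// command is plain mon-command JSON; with illustrative values it looks
// like:
//
//   {"prefix": "osd create", "id": 2,
//    "uuid": "40a28d99-32cc-44a1-a622-fc2a50f4fbc3"}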
4411
4412 int OSD::update_crush_location()
4413 {
4414 if (!cct->_conf->osd_crush_update_on_start) {
4415 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4416 return 0;
4417 }
4418
4419 char weight[32];
4420 if (cct->_conf->osd_crush_initial_weight >= 0) {
4421 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4422 } else {
4423 struct store_statfs_t st;
4424 osd_alert_list_t alerts;
4425 int r = store->statfs(&st, &alerts);
4426 if (r < 0) {
4427 derr << "statfs: " << cpp_strerror(r) << dendl;
4428 return r;
4429 }
4430 snprintf(weight, sizeof(weight), "%.4lf",
4431 std::max(.00001,
4432 double(st.total) /
4433 double(1ull << 40 /* TiB */)));
4434 }
4435
4436 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4437
4438 string cmd =
4439 string("{\"prefix\": \"osd crush create-or-move\", ") +
4440 string("\"id\": ") + stringify(whoami) + ", " +
4441 string("\"weight\":") + weight + ", " +
4442 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4443 return mon_cmd_maybe_osd_create(cmd);
4444 }
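// With osd_crush_initial_weight unset, the weight defaults to the store
// size in TiB (floored at 0.00001). A sketch of the resulting mon command
// for a 4 TiB device (id and location are illustrative):
//
//   {"prefix": "osd crush create-or-move", "id": 2, "weight":4.0000,
//    "args": [<this OSD's crush_location>]}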
4445
4446 int OSD::update_crush_device_class()
4447 {
4448 if (!cct->_conf->osd_class_update_on_start) {
4449 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4450 return 0;
4451 }
4452
4453 string device_class;
4454 int r = store->read_meta("crush_device_class", &device_class);
4455 if (r < 0 || device_class.empty()) {
4456 device_class = store->get_default_device_class();
4457 }
4458
4459 if (device_class.empty()) {
4460 dout(20) << __func__ << " no device class stored locally" << dendl;
4461 return 0;
4462 }
4463
4464 string cmd =
4465 string("{\"prefix\": \"osd crush set-device-class\", ") +
4466 string("\"class\": \"") + device_class + string("\", ") +
4467 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4468
4469 r = mon_cmd_maybe_osd_create(cmd);
4470 if (r == -EBUSY) {
4471 // good, already bound to a device-class
4472 return 0;
4473 } else {
4474 return r;
4475 }
4476 }
4477
4478 void OSD::write_superblock(ObjectStore::Transaction& t)
4479 {
4480 dout(10) << "write_superblock " << superblock << dendl;
4481
4482 //hack: at minimum it's using the baseline feature set
4483 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4484 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4485
4486 bufferlist bl;
4487 encode(superblock, bl);
4488 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4489 }
4490
4491 int OSD::read_superblock()
4492 {
4493 bufferlist bl;
4494 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4495 if (r < 0)
4496 return r;
4497
4498 auto p = bl.cbegin();
4499 decode(superblock, p);
4500
4501 dout(10) << "read_superblock " << superblock << dendl;
4502
4503 return 0;
4504 }
4505
4506 void OSD::clear_temp_objects()
4507 {
4508 dout(10) << __func__ << dendl;
4509 vector<coll_t> ls;
4510 store->list_collections(ls);
4511 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4512 spg_t pgid;
4513 if (!p->is_pg(&pgid))
4514 continue;
4515
4516 // list temp objects
4517 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4518
4519 vector<ghobject_t> temps;
4520 ghobject_t next;
4521 while (1) {
4522 vector<ghobject_t> objects;
4523 auto ch = store->open_collection(*p);
4524 ceph_assert(ch);
4525 store->collection_list(ch, next, ghobject_t::get_max(),
4526 store->get_ideal_list_max(),
4527 &objects, &next);
4528 if (objects.empty())
4529 break;
4530 vector<ghobject_t>::iterator q;
4531 for (q = objects.begin(); q != objects.end(); ++q) {
4532 // Hammer set pool for temps to -1, so check for clean-up
4533 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4534 temps.push_back(*q);
4535 } else {
4536 break;
4537 }
4538 }
4539 // If we saw a non-temp object and hit the break above we can
4540 // break out of the while loop too.
4541 if (q != objects.end())
4542 break;
4543 }
4544 if (!temps.empty()) {
4545 ObjectStore::Transaction t;
4546 int removed = 0;
4547 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4548 dout(20) << " removing " << *p << " object " << *q << dendl;
4549 t.remove(*p, *q);
4550 if (++removed > cct->_conf->osd_target_transaction_size) {
4551 store->queue_transaction(service.meta_ch, std::move(t));
4552 t = ObjectStore::Transaction();
4553 removed = 0;
4554 }
4555 }
4556 if (removed) {
4557 store->queue_transaction(service.meta_ch, std::move(t));
4558 }
4559 }
4560 }
4561 }
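// The early break above works because temp objects (and the pool -1
// leftovers from Hammer) sort before regular objects in a collection
// listing, so the first non-temp entry ends the temp range. Removals are
// batched into transactions of at most osd_target_transaction_size ops to
// keep individual transactions bounded.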
4562
4563 void OSD::recursive_remove_collection(CephContext* cct,
4564 ObjectStore *store, spg_t pgid,
4565 coll_t tmp)
4566 {
4567 OSDriver driver(
4568 store,
4569 coll_t(),
4570 make_snapmapper_oid());
4571
4572 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
4573 ObjectStore::Transaction t;
4574 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4575
4576 ghobject_t next;
4577 int max = cct->_conf->osd_target_transaction_size;
4578 vector<ghobject_t> objects;
4579 objects.reserve(max);
4580 while (true) {
4581 objects.clear();
4582 store->collection_list(ch, next, ghobject_t::get_max(),
4583 max, &objects, &next);
4584 generic_dout(10) << __func__ << " " << objects << dendl;
4585 if (objects.empty())
4586 break;
4587 for (auto& p: objects) {
4588 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4589 int r = mapper.remove_oid(p.hobj, &_t);
4590 if (r != 0 && r != -ENOENT)
4591 ceph_abort();
4592 t.remove(tmp, p);
4593 }
4594 int r = store->queue_transaction(ch, std::move(t));
4595 ceph_assert(r == 0);
4596 t = ObjectStore::Transaction();
4597 }
4598 t.remove_collection(tmp);
4599 int r = store->queue_transaction(ch, std::move(t));
4600 ceph_assert(r == 0);
4601
4602 C_SaferCond waiter;
4603 if (!ch->flush_commit(&waiter)) {
4604 waiter.wait();
4605 }
4606 }
4607
4608
4609 // ======================================================
4610 // PG's
4611
4612 PG* OSD::_make_pg(
4613 OSDMapRef createmap,
4614 spg_t pgid)
4615 {
4616 dout(10) << __func__ << " " << pgid << dendl;
4617 pg_pool_t pi;
4618 map<string,string> ec_profile;
4619 string name;
4620 if (createmap->have_pg_pool(pgid.pool())) {
4621 pi = *createmap->get_pg_pool(pgid.pool());
4622 name = createmap->get_pool_name(pgid.pool());
4623 if (pi.is_erasure()) {
4624 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4625 }
4626 } else {
4627 // pool was deleted; grab final pg_pool_t off disk.
4628 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4629 bufferlist bl;
4630 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4631 if (r < 0) {
4632 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4633 << dendl;
4634 return nullptr;
4635 }
4636 ceph_assert(r >= 0);
4637 auto p = bl.cbegin();
4638 decode(pi, p);
4639 decode(name, p);
4640 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4641 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4642 << " tombstone" << dendl;
4643 return nullptr;
4644 }
4645 decode(ec_profile, p);
4646 }
4647 PGPool pool(createmap, pgid.pool(), pi, name);
4648 PG *pg;
4649 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4650 pi.type == pg_pool_t::TYPE_ERASURE)
4651 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4652 else
4653 ceph_abort();
4654 return pg;
4655 }
4656
4657 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4658 {
4659 v->clear();
4660 v->reserve(get_num_pgs());
4661 for (auto& s : shards) {
4662 std::lock_guard l(s->shard_lock);
4663 for (auto& j : s->pg_slots) {
4664 if (j.second->pg &&
4665 !j.second->pg->is_deleted()) {
4666 v->push_back(j.second->pg);
4667 if (clear_too) {
4668 s->_detach_pg(j.second.get());
4669 }
4670 }
4671 }
4672 }
4673 }
4674
4675 void OSD::_get_pgids(vector<spg_t> *v)
4676 {
4677 v->clear();
4678 v->reserve(get_num_pgs());
4679 for (auto& s : shards) {
4680 std::lock_guard l(s->shard_lock);
4681 for (auto& j : s->pg_slots) {
4682 if (j.second->pg &&
4683 !j.second->pg->is_deleted()) {
4684 v->push_back(j.first);
4685 }
4686 }
4687 }
4688 }
4689
4690 void OSD::register_pg(PGRef pg)
4691 {
4692 spg_t pgid = pg->get_pgid();
4693 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4694 auto sdata = shards[shard_index];
4695 std::lock_guard l(sdata->shard_lock);
4696 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4697 ceph_assert(r.second);
4698 auto *slot = r.first->second.get();
4699 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4700 sdata->_attach_pg(slot, pg.get());
4701 }
4702
4703 bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4704 {
4705 auto sdata = pg->osd_shard;
4706 ceph_assert(sdata);
4707 {
4708 std::lock_guard l(sdata->shard_lock);
4709 auto p = sdata->pg_slots.find(pg->pg_id);
4710 if (p == sdata->pg_slots.end() ||
4711 !p->second->pg) {
4712 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4713 return false;
4714 }
4715 if (p->second->waiting_for_merge_epoch) {
4716 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4717 return false;
4718 }
4719 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4720 sdata->_detach_pg(p->second.get());
4721 }
4722
4723 for (auto shard : shards) {
4724 shard->unprime_split_children(pg->pg_id, old_pg_num);
4725 }
4726
4727 // update pg count now since we might not get an osdmap any time soon.
4728 if (pg->is_primary())
4729 service.logger->dec(l_osd_pg_primary);
4730 else if (pg->is_nonprimary())
4731 service.logger->dec(l_osd_pg_replica); // misnomer
4732 else
4733 service.logger->dec(l_osd_pg_stray);
4734
4735 return true;
4736 }
4737
4738 PGRef OSD::_lookup_pg(spg_t pgid)
4739 {
4740 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4741 auto sdata = shards[shard_index];
4742 std::lock_guard l(sdata->shard_lock);
4743 auto p = sdata->pg_slots.find(pgid);
4744 if (p == sdata->pg_slots.end()) {
4745 return nullptr;
4746 }
4747 return p->second->pg;
4748 }
4749
4750 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4751 {
4752 PGRef pg = _lookup_pg(pgid);
4753 if (!pg) {
4754 return nullptr;
4755 }
4756 pg->lock();
4757 if (!pg->is_deleted()) {
4758 return pg;
4759 }
4760 pg->unlock();
4761 return nullptr;
4762 }
4763
4764 PGRef OSD::lookup_lock_pg(spg_t pgid)
4765 {
4766 return _lookup_lock_pg(pgid);
4767 }
4768
4769 void OSD::load_pgs()
4770 {
4771 ceph_assert(ceph_mutex_is_locked(osd_lock));
4772 dout(0) << "load_pgs" << dendl;
4773
4774 {
4775 auto pghist = make_pg_num_history_oid();
4776 bufferlist bl;
4777 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4778 if (r >= 0 && bl.length() > 0) {
4779 auto p = bl.cbegin();
4780 decode(pg_num_history, p);
4781 }
4782 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
4783 }
4784
4785 vector<coll_t> ls;
4786 int r = store->list_collections(ls);
4787 if (r < 0) {
4788 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4789 }
4790
4791 int num = 0;
4792 for (vector<coll_t>::iterator it = ls.begin();
4793 it != ls.end();
4794 ++it) {
4795 spg_t pgid;
4796 if (it->is_temp(&pgid) ||
4797 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
4798 dout(10) << "load_pgs " << *it
4799 << " removing, legacy or flagged for removal pg" << dendl;
4800 recursive_remove_collection(cct, store, pgid, *it);
4801 continue;
4802 }
4803
4804 if (!it->is_pg(&pgid)) {
4805 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4806 continue;
4807 }
4808
4809 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4810 epoch_t map_epoch = 0;
4811 int r = PG::peek_map_epoch(store, pgid, &map_epoch);
4812 if (r < 0) {
4813 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4814 << dendl;
4815 continue;
4816 }
4817
4818 PGRef pg;
4819 if (map_epoch > 0) {
4820 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4821 if (!pgosdmap) {
4822 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
4823 derr << __func__ << ": could not find map for epoch " << map_epoch
4824 << " on pg " << pgid << ", but the pool is not present in the "
4825 << "current map, so this is probably a result of bug 10617. "
4826 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4827 << "to clean it up later." << dendl;
4828 continue;
4829 } else {
4830 derr << __func__ << ": have pgid " << pgid << " at epoch "
4831 << map_epoch << ", but missing map. Crashing."
4832 << dendl;
4833 ceph_abort_msg("Missing map in load_pgs");
4834 }
4835 }
4836 pg = _make_pg(pgosdmap, pgid);
4837 } else {
4838 pg = _make_pg(get_osdmap(), pgid);
4839 }
4840 if (!pg) {
4841 recursive_remove_collection(cct, store, pgid, *it);
4842 continue;
4843 }
4844
4845 // there can be no waiters here, so we don't call _wake_pg_slot
4846
4847 pg->lock();
4848 pg->ch = store->open_collection(pg->coll);
4849
4850 // read pg state, log
4851 pg->read_state(store);
4852
4853 if (pg->dne()) {
4854 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4855 pg->ch = nullptr;
4856 pg->unlock();
4857 recursive_remove_collection(cct, store, pgid, *it);
4858 continue;
4859 }
4860 {
4861 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4862 assert(NULL != shards[shard_index]);
4863 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4864 }
4865
4866 pg->reg_next_scrub();
4867
4868 dout(10) << __func__ << " loaded " << *pg << dendl;
4869 pg->unlock();
4870
4871 register_pg(pg);
4872 ++num;
4873 }
4874 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
4875 }
4876
4877
4878 PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
4879 const PGCreateInfo *info)
4880 {
4881 spg_t pgid = info->pgid;
4882
4883 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
4884 dout(10) << __func__ << " hit max pg, dropping" << dendl;
4885 return nullptr;
4886 }
4887
4888 PeeringCtx rctx = create_context();
4889
4890 OSDMapRef startmap = get_map(info->epoch);
4891
4892 if (info->by_mon) {
4893 int64_t pool_id = pgid.pgid.pool();
4894 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
4895 if (!pool) {
4896 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
4897 return nullptr;
4898 }
4899 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
4900 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
4901 // this ensures we do not process old creating messages after the
4902 // pool's initial pgs have been created (and pgs are subsequently
4903 // allowed to split or merge).
4904 dout(20) << __func__ << " dropping " << pgid
4905 << "create, pool does not have CREATING flag set" << dendl;
4906 return nullptr;
4907 }
4908 }
4909
4910 int up_primary, acting_primary;
4911 vector<int> up, acting;
4912 startmap->pg_to_up_acting_osds(
4913 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4914
4915 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
4916 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4917 store->get_type() != "bluestore") {
4918 clog->warn() << "pg " << pgid
4919 << " is at risk of silent data corruption: "
4920 << "the pool allows ec overwrites but is not stored in "
4921 << "bluestore, so deep scrubbing will not detect bitrot";
4922 }
4923 create_pg_collection(
4924 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4925 init_pg_ondisk(rctx.transaction, pgid, pp);
4926
4927 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
4928
4929 PGRef pg = _make_pg(startmap, pgid);
4930 pg->ch = store->create_new_collection(pg->coll);
4931
4932 {
4933 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4934 assert(NULL != shards[shard_index]);
4935 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4936 }
4937
4938 pg->lock(true);
4939
4940 // we are holding the shard lock
4941 ceph_assert(!pg->is_deleted());
4942
4943 pg->init(
4944 role,
4945 up,
4946 up_primary,
4947 acting,
4948 acting_primary,
4949 info->history,
4950 info->past_intervals,
4951 false,
4952 rctx.transaction);
4953
4954 pg->init_collection_pool_opts();
4955
4956 if (pg->is_primary()) {
4957 std::lock_guard locker{m_perf_queries_lock};
4958 pg->set_dynamic_perf_stats_queries(m_perf_queries);
4959 }
4960
4961 pg->handle_initialize(rctx);
4962 pg->handle_activate_map(rctx);
4963
4964 dispatch_context(rctx, pg.get(), osdmap, nullptr);
4965
4966 dout(10) << __func__ << " new pg " << *pg << dendl;
4967 return pg;
4968 }
4969
4970 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4971 spg_t pgid,
4972 bool is_mon_create)
4973 {
4974 const auto max_pgs_per_osd =
4975 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4976 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4977
4978 if (num_pgs < max_pgs_per_osd) {
4979 return false;
4980 }
4981
4982 std::lock_guard l(pending_creates_lock);
4983 if (is_mon_create) {
4984 pending_creates_from_mon++;
4985 } else {
4986 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4987 pending_creates_from_osd.emplace(pgid, is_primary);
4988 }
4989 dout(1) << __func__ << " withhold creation of pg " << pgid
4990 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4991 return true;
4992 }
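// With the current defaults (mon_max_pg_per_osd = 250,
// osd_max_pg_per_osd_hard_ratio = 3.0) creation is withheld once this OSD
// already hosts 250 * 3.0 = 750 PGs; the parked create is retried from
// resume_creating_pg() as PGs are removed.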
4993
4994 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4995 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4996 // to up set if pg_temp is empty. so an empty pg_temp won't work.
4997 static vector<int32_t> twiddle(const vector<int>& acting) {
4998 if (acting.size() > 1) {
4999 return {acting[0]};
5000 } else {
5001 vector<int32_t> twiddled(acting.begin(), acting.end());
5002 twiddled.push_back(-1);
5003 return twiddled;
5004 }
5005 }
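// Worked examples of twiddle():
//   twiddle({3, 5}) -> {3}      // shrink the mapping
//   twiddle({3})    -> {3, -1}  // pad with a nonexistent osd
// Either result differs from the plain up set, which is enough for
// PG::should_restart_peering() to notice once the pg_temp is applied.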
5006
5007 void OSD::resume_creating_pg()
5008 {
5009 bool do_sub_pg_creates = false;
5010 bool have_pending_creates = false;
5011 {
5012 const auto max_pgs_per_osd =
5013 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5014 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
5015 if (max_pgs_per_osd <= num_pgs) {
5016 // this could happen if admin decreases this setting before a PG is removed
5017 return;
5018 }
5019 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
5020 std::lock_guard l(pending_creates_lock);
5021 if (pending_creates_from_mon > 0) {
5022 dout(20) << __func__ << " pending_creates_from_mon "
5023 << pending_creates_from_mon << dendl;
5024 do_sub_pg_creates = true;
5025 if (pending_creates_from_mon >= spare_pgs) {
5026 spare_pgs = pending_creates_from_mon = 0;
5027 } else {
5028 spare_pgs -= pending_creates_from_mon;
5029 pending_creates_from_mon = 0;
5030 }
5031 }
5032 auto pg = pending_creates_from_osd.cbegin();
5033 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
5034 dout(20) << __func__ << " pg " << pg->first << dendl;
5035 vector<int> acting;
5036 get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
5037 service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
5038 pg = pending_creates_from_osd.erase(pg);
5039 do_sub_pg_creates = true;
5040 spare_pgs--;
5041 }
5042 have_pending_creates = (pending_creates_from_mon > 0 ||
5043 !pending_creates_from_osd.empty());
5044 }
5045
5046 bool do_renew_subs = false;
5047 if (do_sub_pg_creates) {
5048 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
5049 dout(4) << __func__ << ": resolicit pg creates from mon since "
5050 << last_pg_create_epoch << dendl;
5051 do_renew_subs = true;
5052 }
5053 }
5054 version_t start = get_osdmap_epoch() + 1;
5055 if (have_pending_creates) {
5056 // don't miss any new osdmap deleting PGs
5057 if (monc->sub_want("osdmap", start, 0)) {
5058 dout(4) << __func__ << ": resolicit osdmap from mon since "
5059 << start << dendl;
5060 do_renew_subs = true;
5061 }
5062 } else if (do_sub_pg_creates) {
5063 // no need to subscribe the osdmap continuously anymore
5064 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
5065 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
5066 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
5067 << start << dendl;
5068 do_renew_subs = true;
5069 }
5070 }
5071
5072 if (do_renew_subs) {
5073 monc->renew_subs();
5074 }
5075
5076 service.send_pg_temp();
5077 }
5078
5079 void OSD::build_initial_pg_history(
5080 spg_t pgid,
5081 epoch_t created,
5082 utime_t created_stamp,
5083 pg_history_t *h,
5084 PastIntervals *pi)
5085 {
5086 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
5087 *h = pg_history_t(created, created_stamp);
5088
5089 OSDMapRef lastmap = service.get_map(created);
5090 int up_primary, acting_primary;
5091 vector<int> up, acting;
5092 lastmap->pg_to_up_acting_osds(
5093 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5094
5095 ostringstream debug;
5096 for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
5097 OSDMapRef osdmap = service.get_map(e);
5098 int new_up_primary, new_acting_primary;
5099 vector<int> new_up, new_acting;
5100 osdmap->pg_to_up_acting_osds(
5101 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
5102
5103 // this is a bit imprecise, but sufficient?
5104 struct min_size_predicate_t : public IsPGRecoverablePredicate {
5105 const pg_pool_t *pi;
5106 bool operator()(const set<pg_shard_t> &have) const {
5107 return have.size() >= pi->min_size;
5108 }
5109 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
5110 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
5111
5112 bool new_interval = PastIntervals::check_new_interval(
5113 acting_primary,
5114 new_acting_primary,
5115 acting, new_acting,
5116 up_primary,
5117 new_up_primary,
5118 up, new_up,
5119 h->same_interval_since,
5120 h->last_epoch_clean,
5121 osdmap.get(),
5122 lastmap.get(),
5123 pgid.pgid,
5124 min_size_predicate,
5125 pi,
5126 &debug);
5127 if (new_interval) {
5128 h->same_interval_since = e;
5129 if (up != new_up) {
5130 h->same_up_since = e;
5131 }
5132 if (acting_primary != new_acting_primary) {
5133 h->same_primary_since = e;
5134 }
5135 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
5136 osdmap->get_pg_num(pgid.pgid.pool()),
5137 nullptr)) {
5138 h->last_epoch_split = e;
5139 }
5140 up = new_up;
5141 acting = new_acting;
5142 up_primary = new_up_primary;
5143 acting_primary = new_acting_primary;
5144 }
5145 lastmap = osdmap;
5146 }
5147 dout(20) << __func__ << " " << debug.str() << dendl;
5148 dout(10) << __func__ << " " << *h << " " << *pi
5149 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
5150 pi->get_bounds()) << ")"
5151 << dendl;
5152 }
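// build_initial_pg_history() replays every osdmap epoch from the pg's
// creation up to the present, appending a past interval whenever
// check_new_interval() reports an up/acting change that matters for
// peering.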
5153
5154 void OSD::_add_heartbeat_peer(int p)
5155 {
5156 if (p == whoami)
5157 return;
5158 HeartbeatInfo *hi;
5159
5160 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5161 if (i == heartbeat_peers.end()) {
5162 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
5163 if (!cons.first)
5164 return;
5165 assert(cons.second);
5166
5167 hi = &heartbeat_peers[p];
5168 hi->peer = p;
5169
5170 auto stamps = service.get_hb_stamps(p);
5171
5172 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5173 sb->peer = p;
5174 sb->stamps = stamps;
5175 hi->hb_interval_start = ceph_clock_now();
5176 hi->con_back = cons.first.get();
5177 hi->con_back->set_priv(sb);
5178
5179 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5180 sf->peer = p;
5181 sf->stamps = stamps;
5182 hi->con_front = cons.second.get();
5183 hi->con_front->set_priv(sf);
5184
5185 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5186 << " " << hi->con_back->get_peer_addr()
5187 << " " << hi->con_front->get_peer_addr()
5188 << dendl;
5189 } else {
5190 hi = &i->second;
5191 }
5192 hi->epoch = get_osdmap_epoch();
5193 }
5194
5195 void OSD::_remove_heartbeat_peer(int n)
5196 {
5197 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5198 ceph_assert(q != heartbeat_peers.end());
5199 dout(20) << " removing heartbeat peer osd." << n
5200 << " " << q->second.con_back->get_peer_addr()
5201 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5202 << dendl;
5203 q->second.clear_mark_down();
5204 heartbeat_peers.erase(q);
5205 }
5206
5207 void OSD::need_heartbeat_peer_update()
5208 {
5209 if (is_stopping())
5210 return;
5211 dout(20) << "need_heartbeat_peer_update" << dendl;
5212 heartbeat_set_peers_need_update();
5213 }
5214
5215 void OSD::maybe_update_heartbeat_peers()
5216 {
5217 ceph_assert(ceph_mutex_is_locked(osd_lock));
5218
5219 if (is_waiting_for_healthy() || is_active()) {
5220 utime_t now = ceph_clock_now();
5221 if (last_heartbeat_resample == utime_t()) {
5222 last_heartbeat_resample = now;
5223 heartbeat_set_peers_need_update();
5224 } else if (!heartbeat_peers_need_update()) {
5225 utime_t dur = now - last_heartbeat_resample;
5226 if (dur > cct->_conf->osd_heartbeat_grace) {
5227 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5228 heartbeat_set_peers_need_update();
5229 last_heartbeat_resample = now;
5230 // automatically clean up any stale heartbeat peers
5231 // if we are unhealthy, then clean all
5232 reset_heartbeat_peers(is_waiting_for_healthy());
5233 }
5234 }
5235 }
5236
5237 if (!heartbeat_peers_need_update())
5238 return;
5239 heartbeat_clear_peers_need_update();
5240
5241 std::lock_guard l(heartbeat_lock);
5242
5243 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5244
5245
5246 // build heartbeat from set
5247 if (is_active()) {
5248 vector<PGRef> pgs;
5249 _get_pgs(&pgs);
5250 for (auto& pg : pgs) {
5251 pg->with_heartbeat_peers([&](int peer) {
5252 if (get_osdmap()->is_up(peer)) {
5253 _add_heartbeat_peer(peer);
5254 }
5255 });
5256 }
5257 }
5258
5259 // include next and previous up osds to ensure we have a fully-connected set
5260 set<int> want, extras;
5261 const int next = get_osdmap()->get_next_up_osd_after(whoami);
5262 if (next >= 0)
5263 want.insert(next);
5264 int prev = get_osdmap()->get_previous_up_osd_before(whoami);
5265 if (prev >= 0 && prev != next)
5266 want.insert(prev);
5267
5268 // make sure we have at least **min_down** osds coming from different
5269 // subtree level (e.g., hosts) for fast failure detection.
5270 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5271 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
5272 auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
5273 get_osdmap()->get_random_up_osds_by_subtree(
5274 whoami, subtree, limit, want, &want);
5275
5276 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5277 dout(10) << " adding neighbor peer osd." << *p << dendl;
5278 extras.insert(*p);
5279 _add_heartbeat_peer(*p);
5280 }
5281
5282 // remove down peers; enumerate extras
5283 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5284 while (p != heartbeat_peers.end()) {
5285 if (!get_osdmap()->is_up(p->first)) {
5286 int o = p->first;
5287 ++p;
5288 _remove_heartbeat_peer(o);
5289 continue;
5290 }
5291 if (p->second.epoch < get_osdmap_epoch()) {
5292 extras.insert(p->first);
5293 }
5294 ++p;
5295 }
5296
5297 // too few?
5298 for (int n = next; n >= 0; ) {
5299 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5300 break;
5301 if (!extras.count(n) && !want.count(n) && n != whoami) {
5302 dout(10) << " adding random peer osd." << n << dendl;
5303 extras.insert(n);
5304 _add_heartbeat_peer(n);
5305 }
5306 n = get_osdmap()->get_next_up_osd_after(n);
5307 if (n == next)
5308 break; // came full circle; stop
5309 }
5310
5311 // too many?
5312 for (set<int>::iterator p = extras.begin();
5313 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5314 ++p) {
5315 if (want.count(*p))
5316 continue;
5317 _remove_heartbeat_peer(*p);
5318 }
5319
5320 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
5321
5322 // clean up stale failure pending
5323 for (auto it = failure_pending.begin(); it != failure_pending.end();) {
5324 if (heartbeat_peers.count(it->first) == 0) {
5325 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
5326 failure_pending.erase(it++);
5327 } else {
5328 it++;
5329 }
5330 }
5331 }
5332
5333 void OSD::reset_heartbeat_peers(bool all)
5334 {
5335 ceph_assert(ceph_mutex_is_locked(osd_lock));
5336 dout(10) << "reset_heartbeat_peers" << dendl;
5337 utime_t stale = ceph_clock_now();
5338 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5339 std::lock_guard l(heartbeat_lock);
5340 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5341 auto& [peer, hi] = *it;
5342 if (all || hi.is_stale(stale)) {
5343 hi.clear_mark_down();
5344 // stop sending failure_report to mon too
5345 failure_queue.erase(peer);
5346 failure_pending.erase(peer);
5347 it = heartbeat_peers.erase(it);
5348 } else {
5349 ++it;
5350 }
5351 }
5352 }
5353
5354 void OSD::handle_osd_ping(MOSDPing *m)
5355 {
5356 if (superblock.cluster_fsid != m->fsid) {
5357 dout(20) << "handle_osd_ping from " << m->get_source_inst()
5358 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5359 << dendl;
5360 m->put();
5361 return;
5362 }
5363
5364 int from = m->get_source().num();
5365
5366 heartbeat_lock.lock();
5367 if (is_stopping()) {
5368 heartbeat_lock.unlock();
5369 m->put();
5370 return;
5371 }
5372
5373 utime_t now = ceph_clock_now();
5374 auto mnow = service.get_mnow();
5375 ConnectionRef con(m->get_connection());
5376 OSDMapRef curmap = service.get_osdmap();
5377 if (!curmap) {
5378 heartbeat_lock.unlock();
5379 m->put();
5380 return;
5381 }
5382
5383 auto sref = con->get_priv();
5384 Session *s = static_cast<Session*>(sref.get());
5385 if (!s) {
5386 heartbeat_lock.unlock();
5387 m->put();
5388 return;
5389 }
5390 if (!s->stamps) {
5391 s->peer = from;
5392 s->stamps = service.get_hb_stamps(from);
5393 }
5394
5395 switch (m->op) {
5396
5397 case MOSDPing::PING:
5398 {
5399 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5400 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5401 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5402 if (heartbeat_drop->second == 0) {
5403 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5404 } else {
5405 --heartbeat_drop->second;
5406 dout(5) << "Dropping heartbeat from " << from
5407 << ", " << heartbeat_drop->second
5408 << " remaining to drop" << dendl;
5409 break;
5410 }
5411 } else if (cct->_conf->osd_debug_drop_ping_probability >
5412 ((((double)(rand()%100))/100.0))) {
5413 heartbeat_drop =
5414 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5415 cct->_conf->osd_debug_drop_ping_duration)).first;
5416 dout(5) << "Dropping heartbeat from " << from
5417 << ", " << heartbeat_drop->second
5418 << " remaining to drop" << dendl;
5419 break;
5420 }
5421 }
5422
5423 ceph::signedspan sender_delta_ub{};
5424 s->stamps->got_ping(
5425 m->up_from,
5426 mnow,
5427 m->mono_send_stamp,
5428 m->delta_ub,
5429 &sender_delta_ub);
5430 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5431
5432 if (!cct->get_heartbeat_map()->is_healthy()) {
5433 dout(10) << "internal heartbeat not healthy, dropping ping request"
5434 << dendl;
5435 break;
5436 }
5437
5438 Message *r = new MOSDPing(monc->get_fsid(),
5439 curmap->get_epoch(),
5440 MOSDPing::PING_REPLY,
5441 m->ping_stamp,
5442 m->mono_ping_stamp,
5443 mnow,
5444 service.get_up_epoch(),
5445 cct->_conf->osd_heartbeat_min_size,
5446 sender_delta_ub);
5447 con->send_message(r);
5448
5449 if (curmap->is_up(from)) {
5450 if (is_active()) {
5451 ConnectionRef cluster_con = service.get_con_osd_cluster(
5452 from, curmap->get_epoch());
5453 if (cluster_con) {
5454 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5455 }
5456 }
5457 } else if (!curmap->exists(from) ||
5458 curmap->get_down_at(from) > m->map_epoch) {
5459 // tell them they have died
5460 Message *r = new MOSDPing(monc->get_fsid(),
5461 curmap->get_epoch(),
5462 MOSDPing::YOU_DIED,
5463 m->ping_stamp,
5464 m->mono_ping_stamp,
5465 mnow,
5466 service.get_up_epoch(),
5467 cct->_conf->osd_heartbeat_min_size);
5468 con->send_message(r);
5469 }
5470 }
5471 break;
5472
5473 case MOSDPing::PING_REPLY:
5474 {
5475 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5476 if (i != heartbeat_peers.end()) {
5477 auto acked = i->second.ping_history.find(m->ping_stamp);
5478 if (acked != i->second.ping_history.end()) {
5479 int &unacknowledged = acked->second.second;
5480 if (con == i->second.con_back) {
5481 dout(25) << "handle_osd_ping got reply from osd." << from
5482 << " first_tx " << i->second.first_tx
5483 << " last_tx " << i->second.last_tx
5484 << " last_rx_back " << i->second.last_rx_back
5485 << " -> " << now
5486 << " last_rx_front " << i->second.last_rx_front
5487 << dendl;
5488 i->second.last_rx_back = now;
5489 ceph_assert(unacknowledged > 0);
5490 --unacknowledged;
5491 // if there is no front con, set both stamps.
5492 if (i->second.con_front == NULL) {
5493 i->second.last_rx_front = now;
5494 ceph_assert(unacknowledged > 0);
5495 --unacknowledged;
5496 }
5497 } else if (con == i->second.con_front) {
5498 dout(25) << "handle_osd_ping got reply from osd." << from
5499 << " first_tx " << i->second.first_tx
5500 << " last_tx " << i->second.last_tx
5501 << " last_rx_back " << i->second.last_rx_back
5502 << " last_rx_front " << i->second.last_rx_front
5503 << " -> " << now
5504 << dendl;
5505 i->second.last_rx_front = now;
5506 ceph_assert(unacknowledged > 0);
5507 --unacknowledged;
5508 }
5509
5510 if (unacknowledged == 0) {
5511 // succeeded in getting all replies
5512 dout(25) << "handle_osd_ping got all replies from osd." << from
5513                          << ", erasing pending ping (sent at " << m->ping_stamp << ")"
5514                          << " and older pending ping(s)"
5515 << dendl;
5516
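            // All replies for this ping (and any older ones) are in: fold the
            // measured round-trip times, in microseconds, into this interval's
            // count/total/min/max accumulators for the back and front links.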
5517 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5518 ++i->second.hb_average_count;
5519 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
5520 i->second.hb_total_back += back_pingtime;
5521 if (back_pingtime < i->second.hb_min_back)
5522 i->second.hb_min_back = back_pingtime;
5523 if (back_pingtime > i->second.hb_max_back)
5524 i->second.hb_max_back = back_pingtime;
5525 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
5526 i->second.hb_total_front += front_pingtime;
5527 if (front_pingtime < i->second.hb_min_front)
5528 i->second.hb_min_front = front_pingtime;
5529 if (front_pingtime > i->second.hb_max_front)
5530 i->second.hb_max_front = front_pingtime;
5531
5532             // hb_interval_start is set when the interval's first ping is
5533             // sent (see OSD::heartbeat()), so it can never be zero here.
5534             ceph_assert(i->second.hb_interval_start != utime_t());
5535 int64_t hb_avg_time_period = 60;
5536 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5537 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5538 }
5539 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5540 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5541 uint32_t back_min = i->second.hb_min_back;
5542 uint32_t back_max = i->second.hb_max_back;
5543 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5544 uint32_t front_min = i->second.hb_min_front;
5545 uint32_t front_max = i->second.hb_max_front;
5546
5547 // Reset for new interval
5548 i->second.hb_average_count = 0;
5549 i->second.hb_interval_start = now;
5550 i->second.hb_total_back = i->second.hb_max_back = 0;
5551 i->second.hb_min_back = UINT_MAX;
5552 i->second.hb_total_front = i->second.hb_max_front = 0;
5553 i->second.hb_min_front = UINT_MAX;
5554
5555             // Record per-OSD, per-interface ping times.
5556             // Based on osd_heartbeat_interval, ignoring that the actual interval is randomly shorter.
5557 if (i->second.hb_back_pingtime.size() == 0) {
5558 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5559 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5560 i->second.hb_back_pingtime.push_back(back_avg);
5561 i->second.hb_back_min.push_back(back_min);
5562 i->second.hb_back_max.push_back(back_max);
5563 i->second.hb_front_pingtime.push_back(front_avg);
5564 i->second.hb_front_min.push_back(front_min);
5565 i->second.hb_front_max.push_back(front_max);
5566 ++i->second.hb_index;
5567 }
5568 } else {
5569 int index = i->second.hb_index & (hb_vector_size - 1);
5570 i->second.hb_back_pingtime[index] = back_avg;
5571 i->second.hb_back_min[index] = back_min;
5572 i->second.hb_back_max[index] = back_max;
5573 i->second.hb_front_pingtime[index] = front_avg;
5574 i->second.hb_front_min[index] = front_min;
5575 i->second.hb_front_max[index] = front_max;
5576 ++i->second.hb_index;
5577 }
5578
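                // Fold the per-interval samples into 1/5/15-interval summaries
                // (analogous to load averages) and publish them in osd_stat so
                // they are reported along with the OSD's other stats.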
5579 {
5580 std::lock_guard l(service.stat_lock);
5581 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5582 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5583
5584 uint32_t total = 0;
5585 uint32_t min = UINT_MAX;
5586 uint32_t max = 0;
5587 uint32_t count = 0;
5588 uint32_t which = 0;
5589 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5590 for (int32_t k = size - 1 ; k >= 0; --k) {
5591 ++count;
5592 int index = (i->second.hb_index + k) % size;
5593 total += i->second.hb_back_pingtime[index];
5594 if (i->second.hb_back_min[index] < min)
5595 min = i->second.hb_back_min[index];
5596 if (i->second.hb_back_max[index] > max)
5597 max = i->second.hb_back_max[index];
5598 if (count == 1 || count == 5 || count == 15) {
5599 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5600 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5601 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5602 which++;
5603 if (count == 15)
5604 break;
5605 }
5606 }
5607
5608 if (i->second.con_front != NULL) {
5609 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5610
5611 total = 0;
5612 min = UINT_MAX;
5613 max = 0;
5614 count = 0;
5615 which = 0;
5616 for (int32_t k = size - 1 ; k >= 0; --k) {
5617 ++count;
5618 int index = (i->second.hb_index + k) % size;
5619 total += i->second.hb_front_pingtime[index];
5620 if (i->second.hb_front_min[index] < min)
5621 min = i->second.hb_front_min[index];
5622 if (i->second.hb_front_max[index] > max)
5623 max = i->second.hb_front_max[index];
5624 if (count == 1 || count == 5 || count == 15) {
5625 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5626 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5627 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5628 which++;
5629 if (count == 15)
5630 break;
5631 }
5632 }
5633 }
5634 }
5635 } else {
5636 std::lock_guard l(service.stat_lock);
5637 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5638 if (i->second.con_front != NULL)
5639 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5640 }
5641 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
5642 }
5643
5644 if (i->second.is_healthy(now)) {
5645 // Cancel false reports
5646 auto failure_queue_entry = failure_queue.find(from);
5647 if (failure_queue_entry != failure_queue.end()) {
5648 dout(10) << "handle_osd_ping canceling queued "
5649 << "failure report for osd." << from << dendl;
5650 failure_queue.erase(failure_queue_entry);
5651 }
5652
5653 auto failure_pending_entry = failure_pending.find(from);
5654 if (failure_pending_entry != failure_pending.end()) {
5655 dout(10) << "handle_osd_ping canceling in-flight "
5656 << "failure report for osd." << from << dendl;
5657 send_still_alive(curmap->get_epoch(),
5658 from,
5659 failure_pending_entry->second.second);
5660 failure_pending.erase(failure_pending_entry);
5661 }
5662 }
5663 } else {
5664       // old reply, superseded by newly sent pings.
5665       dout(10) << "handle_osd_ping no pending ping (sent at " << m->ping_stamp
5666                << ") found; treating it as covered by newer pings "
5667                << "and ignoring it"
5668 << dendl;
5669 }
5670 }
5671
5672 if (m->map_epoch &&
5673 curmap->is_up(from)) {
5674 if (is_active()) {
5675 ConnectionRef cluster_con = service.get_con_osd_cluster(
5676 from, curmap->get_epoch());
5677 if (cluster_con) {
5678 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5679 }
5680 }
5681 }
5682
5683 s->stamps->got_ping_reply(
5684 mnow,
5685 m->mono_send_stamp,
5686 m->delta_ub);
5687 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5688 }
5689 break;
5690
5691 case MOSDPing::YOU_DIED:
5692 dout(10) << "handle_osd_ping " << m->get_source_inst()
5693 << " says i am down in " << m->map_epoch << dendl;
5694 osdmap_subscribe(curmap->get_epoch()+1, false);
5695 break;
5696 }
5697
5698 heartbeat_lock.unlock();
5699 m->put();
5700 }
5701
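// heartbeat_entry: body of the heartbeat thread. Sends one round of pings,
// then sleeps; unless debug_disable_randomized_ping is set, the sleep is
// randomized as wait = 0.5 + ((rand() % 10) / 10.0) * osd_heartbeat_interval
// seconds, so peers do not ping in lockstep.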
5702 void OSD::heartbeat_entry()
5703 {
5704 std::unique_lock l(heartbeat_lock);
5705 if (is_stopping())
5706 return;
5707 while (!heartbeat_stop) {
5708 heartbeat();
5709
5710 double wait;
5711 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5712 wait = (float)cct->_conf->osd_heartbeat_interval;
5713 } else {
5714 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5715 }
5716 auto w = ceph::make_timespan(wait);
5717 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5718 heartbeat_cond.wait_for(l, w);
5719 if (is_stopping())
5720 return;
5721 dout(30) << "heartbeat_entry woke up" << dendl;
5722 }
5723 }
5724
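// heartbeat_check: scan heartbeat_peers for peers whose oldest outstanding
// ping has passed its deadline and queue them in failure_queue; the actual
// failure reports are sent to the mon later by send_failures().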
5725 void OSD::heartbeat_check()
5726 {
5727 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5728 utime_t now = ceph_clock_now();
5729
5730 // check for incoming heartbeats (move me elsewhere?)
5731 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5732 p != heartbeat_peers.end();
5733 ++p) {
5734
5735 if (p->second.first_tx == utime_t()) {
5736 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5737 << " yet, skipping" << dendl;
5738 continue;
5739 }
5740
5741 dout(25) << "heartbeat_check osd." << p->first
5742 << " first_tx " << p->second.first_tx
5743 << " last_tx " << p->second.last_tx
5744 << " last_rx_back " << p->second.last_rx_back
5745 << " last_rx_front " << p->second.last_rx_front
5746 << dendl;
5747 if (p->second.is_unhealthy(now)) {
5748 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5749 if (p->second.last_rx_back == utime_t() ||
5750 p->second.last_rx_front == utime_t()) {
5751 derr << "heartbeat_check: no reply from "
5752 << p->second.con_front->get_peer_addr().get_sockaddr()
5753 << " osd." << p->first
5754 << " ever on either front or back, first ping sent "
5755 << p->second.first_tx
5756 << " (oldest deadline " << oldest_deadline << ")"
5757 << dendl;
5758 // fail
5759 failure_queue[p->first] = p->second.first_tx;
5760 } else {
5761 derr << "heartbeat_check: no reply from "
5762 << p->second.con_front->get_peer_addr().get_sockaddr()
5763 << " osd." << p->first << " since back " << p->second.last_rx_back
5764 << " front " << p->second.last_rx_front
5765 << " (oldest deadline " << oldest_deadline << ")"
5766 << dendl;
5767 // fail
5768 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5769 }
5770 }
5771 }
5772 }
5773
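// heartbeat: one round of the heartbeat thread's work: sample the CPU load
// average, refresh osd_stat and our fullness state, then send a timestamped
// PING (with a deadline recorded in ping_history) to every peer on its back
// connection and, when present, its front connection.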
5774 void OSD::heartbeat()
5775 {
5776 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
5777 dout(30) << "heartbeat" << dendl;
5778
5779 // get CPU load avg
5780 double loadavgs[1];
5781 int hb_interval = cct->_conf->osd_heartbeat_interval;
5782 int n_samples = 86400;
5783 if (hb_interval > 1) {
5784 n_samples /= hb_interval;
5785 if (n_samples < 1)
5786 n_samples = 1;
5787 }
5788
5789 if (getloadavg(loadavgs, 1) == 1) {
5790 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5791 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5792 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5793 }
5794
5795 dout(30) << "heartbeat checking stats" << dendl;
5796
5797 // refresh peer list and osd stats
5798 vector<int> hb_peers;
5799 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5800 p != heartbeat_peers.end();
5801 ++p)
5802 hb_peers.push_back(p->first);
5803
5804 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5805 dout(5) << __func__ << " " << new_stat << dendl;
5806 ceph_assert(new_stat.statfs.total);
5807
5808 float pratio;
5809 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5810
5811 service.check_full_status(ratio, pratio);
5812
5813 utime_t now = ceph_clock_now();
5814 auto mnow = service.get_mnow();
5815 utime_t deadline = now;
5816 deadline += cct->_conf->osd_heartbeat_grace;
5817
5818 // send heartbeats
5819 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5820 i != heartbeat_peers.end();
5821 ++i) {
5822 int peer = i->first;
5823 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5824 if (!s) {
5825 dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
5826 continue;
5827 }
5828 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5829
5830 i->second.last_tx = now;
5831 if (i->second.first_tx == utime_t())
5832 i->second.first_tx = now;
5833 i->second.ping_history[now] = make_pair(deadline,
5834 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5835 if (i->second.hb_interval_start == utime_t())
5836 i->second.hb_interval_start = now;
5837
5838 std::optional<ceph::signedspan> delta_ub;
5839 s->stamps->sent_ping(&delta_ub);
5840
5841 i->second.con_back->send_message(
5842 new MOSDPing(monc->get_fsid(),
5843 service.get_osdmap_epoch(),
5844 MOSDPing::PING,
5845 now,
5846 mnow,
5847 mnow,
5848 service.get_up_epoch(),
5849 cct->_conf->osd_heartbeat_min_size,
5850 delta_ub));
5851
5852 if (i->second.con_front)
5853 i->second.con_front->send_message(
5854 new MOSDPing(monc->get_fsid(),
5855 service.get_osdmap_epoch(),
5856 MOSDPing::PING,
5857 now,
5858 mnow,
5859 mnow,
5860 service.get_up_epoch(),
5861 cct->_conf->osd_heartbeat_min_size,
5862 delta_ub));
5863 }
5864
5865 logger->set(l_osd_hb_to, heartbeat_peers.size());
5866
5867 // hmm.. am i all alone?
5868 dout(30) << "heartbeat lonely?" << dendl;
5869 if (heartbeat_peers.empty()) {
5870 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5871 last_mon_heartbeat = now;
5872 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5873 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5874 }
5875 }
5876
5877 dout(30) << "heartbeat done" << dendl;
5878 }
5879
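// heartbeat_reset: called when a heartbeat connection drops. If the failed
// con still belongs to a known peer, we open fresh back/front connections
// (carrying the Session over); if the peer vanished from the osdmap in the
// meantime, we drop it from heartbeat_peers instead.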
5880 bool OSD::heartbeat_reset(Connection *con)
5881 {
5882 std::lock_guard l(heartbeat_lock);
5883 auto s = con->get_priv();
5884 dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
5885 con->set_priv(nullptr);
5886 if (s) {
5887 if (is_stopping()) {
5888 return true;
5889 }
5890 auto session = static_cast<Session*>(s.get());
5891 auto p = heartbeat_peers.find(session->peer);
5892 if (p != heartbeat_peers.end() &&
5893 (p->second.con_back == con ||
5894 p->second.con_front == con)) {
5895 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5896 << ", reopening" << dendl;
5897 p->second.clear_mark_down(con);
5898 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5899 if (newcon.first) {
5900 p->second.con_back = newcon.first.get();
5901 p->second.con_back->set_priv(s);
5902 if (newcon.second) {
5903 p->second.con_front = newcon.second.get();
5904 p->second.con_front->set_priv(s);
5905 }
5906 p->second.ping_history.clear();
5907 } else {
5908 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5909 << ", raced with osdmap update, closing out peer" << dendl;
5910 heartbeat_peers.erase(p);
5911 }
5912 } else {
5913 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5914 }
5915 }
5916 return true;
5917 }
5918
5919
5920
5921 // =========================================
5922
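// tick: periodic housekeeping that needs osd_lock: trim the markdown log,
// refresh heartbeat peers, retry boot while waiting to become healthy, and
// decide whether purged_snaps is due for a scrub. Re-arms itself on
// tick_timer.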
5923 void OSD::tick()
5924 {
5925 ceph_assert(ceph_mutex_is_locked(osd_lock));
5926 dout(10) << "tick" << dendl;
5927
5928 utime_t now = ceph_clock_now();
5929 // throw out any obsolete markdown log
5930 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5931 while (!osd_markdown_log.empty() &&
5932 osd_markdown_log.front() + grace < now)
5933 osd_markdown_log.pop_front();
5934
5935 if (is_active() || is_waiting_for_healthy()) {
5936 maybe_update_heartbeat_peers();
5937 }
5938
5939 if (is_waiting_for_healthy()) {
5940 start_boot();
5941 }
5942
5943 if (is_waiting_for_healthy() || is_booting()) {
5944 std::lock_guard l(heartbeat_lock);
5945 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5946 last_mon_heartbeat = now;
5947 dout(1) << __func__ << " checking mon for new map" << dendl;
5948 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5949 }
5950 }
5951
5952 do_waiters();
5953
5954 // scrub purged_snaps every deep scrub interval
5955 {
5956 const utime_t last = superblock.last_purged_snaps_scrub;
5957 utime_t next = last;
5958 next += cct->_conf->osd_scrub_min_interval;
5959 std::mt19937 rng;
5960 // use a seed that is stable for each scrub interval, but varies
5961 // by OSD to avoid any herds.
5962 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
5963     double r = (rng() % 1024) / 1024.0;  // uniform in [0, 1); plain integer division here would always yield 0
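    // e.g. with osd_scrub_min_interval = 86400 s and
    // osd_scrub_interval_randomize_ratio = 0.5, the next purged_snaps scrub
    // lands uniformly between 1.0 and ~1.5 days after the previous one.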
5964 next +=
5965 cct->_conf->osd_scrub_min_interval *
5966 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5967 if (next < ceph_clock_now()) {
5968 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5969 << " next " << next << " ... now" << dendl;
5970 scrub_purged_snaps();
5971 } else {
5972 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5973 << " next " << next << dendl;
5974 }
5975 }
5976
5977 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5978 }
5979
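// tick_without_osd_lock: periodic work that deliberately avoids osd_lock:
// CRC cache counters, statfs refresh, heartbeat checks, mon fullness/failure
// reports, scrub scheduling, and the mgr beacon.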
5980 void OSD::tick_without_osd_lock()
5981 {
5982 ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
5983 dout(10) << "tick_without_osd_lock" << dendl;
5984
5985 logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
5986 logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
5987 logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());
5988
5989 // refresh osd stats
5990 struct store_statfs_t stbuf;
5991 osd_alert_list_t alerts;
5992 int r = store->statfs(&stbuf, &alerts);
5993 ceph_assert(r == 0);
5994 service.set_statfs(stbuf, alerts);
5995
5996 // osd_lock is not being held, which means the OSD state
5997 // might change when doing the monitor report
5998 if (is_active() || is_waiting_for_healthy()) {
5999 {
6000 std::lock_guard l{heartbeat_lock};
6001 heartbeat_check();
6002 }
6003 map_lock.lock_shared();
6004 std::lock_guard l(mon_report_lock);
6005
6006 // mon report?
6007 utime_t now = ceph_clock_now();
6008 if (service.need_fullness_update() ||
6009 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
6010 last_mon_report = now;
6011 send_full_update();
6012 send_failures();
6013 }
6014 map_lock.unlock_shared();
6015
6016 epoch_t max_waiting_epoch = 0;
6017 for (auto s : shards) {
6018 max_waiting_epoch = std::max(max_waiting_epoch,
6019 s->get_max_waiting_epoch());
6020 }
6021 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
6022 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
6023 << ", requesting new map" << dendl;
6024 osdmap_subscribe(superblock.newest_map + 1, false);
6025 }
6026 }
6027
6028 if (is_active()) {
6029 if (!scrub_random_backoff()) {
6030 sched_scrub();
6031 }
6032 service.promote_throttle_recalibrate();
6033 resume_creating_pg();
6034 bool need_send_beacon = false;
6035 const auto now = ceph::coarse_mono_clock::now();
6036 {
6037 // borrow lec lock to pretect last_sent_beacon from changing
6038 std::lock_guard l{min_last_epoch_clean_lock};
6039 const auto elapsed = now - last_sent_beacon;
6040 if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
6041 cct->_conf->osd_beacon_report_interval) {
6042 need_send_beacon = true;
6043 }
6044 }
6045 if (need_send_beacon) {
6046 send_beacon(now);
6047 }
6048 }
6049
6050 mgrc.update_daemon_health(get_health_metrics());
6051 service.kick_recovery_queue();
6052 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
6053 new C_Tick_WithoutOSDLock(this));
6054 }
6055
6056 // Usage:
6057 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6058 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
6059 // setomapheader <pool-id> [namespace/]<obj-name> <header>
6060 //   getomap <pool-id> [namespace/]<obj-name>
6061 //   truncobj <pool-id> [namespace/]<obj-name> <newlen>
6062 //   injectmdataerr <pool-id> [namespace/]<obj-name> [shardid]
6063 //   injectdataerr <pool-id> [namespace/]<obj-name> [shardid]
6064 //
6065 // set_recovery_delay [utime]
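//   injectfull [type] [count]
//
// These are test commands dispatched through this socket hook; a hypothetical
// invocation (the command names come from the dispatch below, while the
// pool/object arguments and the exact transport -- tell vs. the daemon admin
// socket -- are only illustrative) would look like:
//   ceph daemon osd.0 setomapval 1 myobject mykey myvalue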
6066 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
6067 std::string_view command,
6068 const cmdmap_t& cmdmap, ostream &ss)
6069 {
6070   // Test support:
6071   // support changing the omap on a single OSD by using the admin socket to
6072   // directly request the OSD make a change.
6073 if (command == "setomapval" || command == "rmomapkey" ||
6074 command == "setomapheader" || command == "getomap" ||
6075 command == "truncobj" || command == "injectmdataerr" ||
6076 command == "injectdataerr"
6077 ) {
6078 pg_t rawpg;
6079 int64_t pool;
6080 OSDMapRef curmap = service->get_osdmap();
6081 int r = -1;
6082
6083 string poolstr;
6084
6085 cmd_getval(cmdmap, "pool", poolstr);
6086 pool = curmap->lookup_pg_pool_name(poolstr);
6087     // if we can't find the pool by name, an id may have been specified
6088 if (pool < 0 && isdigit(poolstr[0]))
6089 pool = atoll(poolstr.c_str());
6090 if (pool < 0) {
6091       ss << "Invalid pool '" << poolstr << "'";
6092 return;
6093 }
6094
6095 string objname, nspace;
6096 cmd_getval(cmdmap, "objname", objname);
6097 std::size_t found = objname.find_first_of('/');
6098 if (found != string::npos) {
6099 nspace = objname.substr(0, found);
6100 objname = objname.substr(found+1);
6101 }
6102 object_locator_t oloc(pool, nspace);
6103 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
6104
6105 if (r < 0) {
6106 ss << "Invalid namespace/objname";
6107 return;
6108 }
6109
6110 int64_t shardid;
6111 cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
6112 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
6113 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
6114 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
6115 if (curmap->pg_is_ec(rawpg)) {
6116 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
6117       ss << "Must not call on an EC pool, except for injectdataerr or injectmdataerr";
6118 return;
6119 }
6120 }
6121
6122 ObjectStore::Transaction t;
6123
6124 if (command == "setomapval") {
6125 map<string, bufferlist> newattrs;
6126 bufferlist val;
6127 string key, valstr;
6128 cmd_getval(cmdmap, "key", key);
6129 cmd_getval(cmdmap, "val", valstr);
6130
6131 val.append(valstr);
6132 newattrs[key] = val;
6133 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
6134 r = store->queue_transaction(service->meta_ch, std::move(t));
6135 if (r < 0)
6136 ss << "error=" << r;
6137 else
6138 ss << "ok";
6139 } else if (command == "rmomapkey") {
6140 string key;
6141 cmd_getval(cmdmap, "key", key);
6142
6143 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
6144 r = store->queue_transaction(service->meta_ch, std::move(t));
6145 if (r < 0)
6146 ss << "error=" << r;
6147 else
6148 ss << "ok";
6149 } else if (command == "setomapheader") {
6150 bufferlist newheader;
6151 string headerstr;
6152
6153 cmd_getval(cmdmap, "header", headerstr);
6154 newheader.append(headerstr);
6155 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6156 r = store->queue_transaction(service->meta_ch, std::move(t));
6157 if (r < 0)
6158 ss << "error=" << r;
6159 else
6160 ss << "ok";
6161 } else if (command == "getomap") {
6162       // Debug: output the entire omap
6163 bufferlist hdrbl;
6164 map<string, bufferlist> keyvals;
6165 auto ch = store->open_collection(coll_t(pgid));
6166 if (!ch) {
6167 ss << "unable to open collection for " << pgid;
6168 r = -ENOENT;
6169 } else {
6170 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6171 if (r >= 0) {
6172 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6173 for (map<string, bufferlist>::iterator it = keyvals.begin();
6174 it != keyvals.end(); ++it)
6175 ss << " key=" << (*it).first << " val="
6176 << string((*it).second.c_str(), (*it).second.length());
6177 } else {
6178 ss << "error=" << r;
6179 }
6180 }
6181 } else if (command == "truncobj") {
6182 int64_t trunclen;
6183 cmd_getval(cmdmap, "len", trunclen);
6184 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6185 r = store->queue_transaction(service->meta_ch, std::move(t));
6186 if (r < 0)
6187 ss << "error=" << r;
6188 else
6189 ss << "ok";
6190 } else if (command == "injectdataerr") {
6191 store->inject_data_error(gobj);
6192 ss << "ok";
6193 } else if (command == "injectmdataerr") {
6194 store->inject_mdata_error(gobj);
6195 ss << "ok";
6196 }
6197 return;
6198 }
6199 if (command == "set_recovery_delay") {
6200 int64_t delay;
6201 cmd_getval(cmdmap, "utime", delay, (int64_t)0);
6202 ostringstream oss;
6203 oss << delay;
6204 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6205 oss.str().c_str());
6206 if (r != 0) {
6207 ss << "set_recovery_delay: error setting "
6208 << "osd_recovery_delay_start to '" << delay << "': error "
6209 << r;
6210 return;
6211 }
6212 service->cct->_conf.apply_changes(nullptr);
6213 ss << "set_recovery_delay: set osd_recovery_delay_start "
6214 << "to " << service->cct->_conf->osd_recovery_delay_start;
6215 return;
6216 }
6217 if (command == "injectfull") {
6218 int64_t count;
6219 string type;
6220 OSDService::s_names state;
6221 cmd_getval(cmdmap, "type", type, string("full"));
6222 cmd_getval(cmdmap, "count", count, (int64_t)-1);
6223 if (type == "none" || count == 0) {
6224 type = "none";
6225 count = 0;
6226 }
6227 state = service->get_full_state(type);
6228 if (state == OSDService::s_names::INVALID) {
6229     ss << "Invalid type; use one of (none, nearfull, backfillfull, full, failsafe)";
6230 return;
6231 }
6232 service->set_injectfull(state, count);
6233 return;
6234 }
6235 ss << "Internal error - command=" << command;
6236 }
6237
6238 // =========================================
6239
6240 void OSD::ms_handle_connect(Connection *con)
6241 {
6242 dout(10) << __func__ << " con " << con << dendl;
6243 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
6244 std::lock_guard l(osd_lock);
6245 if (is_stopping())
6246 return;
6247 dout(10) << __func__ << " on mon" << dendl;
6248
6249 if (is_preboot()) {
6250 start_boot();
6251 } else if (is_booting()) {
6252 _send_boot(); // resend boot message
6253 } else {
6254 map_lock.lock_shared();
6255 std::lock_guard l2(mon_report_lock);
6256
6257 utime_t now = ceph_clock_now();
6258 last_mon_report = now;
6259
6260 // resend everything, it's a new session
6261 send_full_update();
6262 send_alive();
6263 service.requeue_pg_temp();
6264 service.clear_sent_ready_to_merge();
6265 service.send_pg_temp();
6266 service.send_ready_to_merge();
6267 service.send_pg_created();
6268 requeue_failures();
6269 send_failures();
6270
6271 map_lock.unlock_shared();
6272 if (is_active()) {
6273 send_beacon(ceph::coarse_mono_clock::now());
6274 }
6275 }
6276
6277 // full map requests may happen while active or pre-boot
6278 if (requested_full_first) {
6279 rerequest_full_maps();
6280 }
6281 }
6282 }
6283
6284 void OSD::ms_handle_fast_connect(Connection *con)
6285 {
6286 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6287 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6288 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6289 s = ceph::make_ref<Session>(cct, con);
6290 con->set_priv(s);
6291 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6292 << " addr=" << s->con->get_peer_addr() << dendl;
6293 // we don't connect to clients
6294 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6295 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6296 }
6297 }
6298 }
6299
6300 void OSD::ms_handle_fast_accept(Connection *con)
6301 {
6302 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6303 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6304 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6305 s = ceph::make_ref<Session>(cct, con);
6306 con->set_priv(s);
6307       dout(10) << " new session (incoming) " << s << " con=" << con
6308 << " addr=" << con->get_peer_addr()
6309 << " must have raced with connect" << dendl;
6310 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6311 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6312 }
6313 }
6314 }
6315
6316 bool OSD::ms_handle_reset(Connection *con)
6317 {
6318 auto session = ceph::ref_cast<Session>(con->get_priv());
6319 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
6320 if (!session)
6321 return false;
6322 session->wstate.reset(con);
6323 session->con->set_priv(nullptr);
6324 session->con.reset(); // break con <-> session ref cycle
6325 // note that we break session->con *before* the session_handle_reset
6326 // cleanup below. this avoids a race between us and
6327 // PG::add_backoff, Session::check_backoff, etc.
6328 session_handle_reset(session);
6329 return true;
6330 }
6331
6332 bool OSD::ms_handle_refused(Connection *con)
6333 {
6334 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6335 return false;
6336
6337 auto session = ceph::ref_cast<Session>(con->get_priv());
6338 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
6339 if (!session)
6340 return false;
6341 int type = con->get_peer_type();
6342 // handle only OSD failures here
6343 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6344 OSDMapRef osdmap = get_osdmap();
6345 if (osdmap) {
6346 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6347 if (id >= 0 && osdmap->is_up(id)) {
6348         // We bypass the mon's heartbeat grace logic here, because we know the
6349         // peer is not going to respawn on its own. +1 so we won't hit any boundary case.
6350 monc->send_mon_message(
6351 new MOSDFailure(
6352 monc->get_fsid(),
6353 id,
6354 osdmap->get_addrs(id),
6355 cct->_conf->osd_heartbeat_grace + 1,
6356 osdmap->get_epoch(),
6357 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6358 ));
6359 }
6360 }
6361 }
6362 return true;
6363 }
6364
6365 struct CB_OSD_GetVersion {
6366 OSD *osd;
6367 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6368 void operator ()(boost::system::error_code ec, version_t newest,
6369 version_t oldest) {
6370 if (!ec)
6371 osd->_got_mon_epochs(oldest, newest);
6372 }
6373 };
6374
6375 void OSD::start_boot()
6376 {
6377 if (!_is_healthy()) {
6378 // if we are not healthy, do not mark ourselves up (yet)
6379 dout(1) << "not healthy; waiting to boot" << dendl;
6380 if (!is_waiting_for_healthy())
6381 start_waiting_for_healthy();
6382 // send pings sooner rather than later
6383 heartbeat_kick();
6384 return;
6385 }
6386 dout(1) << __func__ << dendl;
6387 set_state(STATE_PREBOOT);
6388 dout(10) << "start_boot - have maps " << superblock.oldest_map
6389 << ".." << superblock.newest_map << dendl;
6390 monc->get_version("osdmap", CB_OSD_GetVersion(this));
6391 }
6392
6393 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6394 {
6395 std::lock_guard l(osd_lock);
6396 if (is_preboot()) {
6397 _preboot(oldest, newest);
6398 }
6399 }
6400
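// _preboot: decide whether we may try to mark ourselves up. We stay in
// preboot while the osdmap is missing or too old, we are flagged destroyed
// or noup, our fullness state needs an update, or purged_snaps needs a
// catch-up; otherwise we wait for PG epochs to drain on a separate thread
// and then _send_boot().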
6401 void OSD::_preboot(epoch_t oldest, epoch_t newest)
6402 {
6403 ceph_assert(is_preboot());
6404   dout(10) << __func__ << " mon has osdmaps "
6405 << oldest << ".." << newest << dendl;
6406
6407 // ensure our local fullness awareness is accurate
6408 {
6409 std::lock_guard l(heartbeat_lock);
6410 heartbeat();
6411 }
6412
6413 const auto& monmap = monc->monmap;
6414 const auto osdmap = get_osdmap();
6415   // if our map is within recent history, try to add ourselves to the osdmap.
6416 if (osdmap->get_epoch() == 0) {
6417 derr << "waiting for initial osdmap" << dendl;
6418 } else if (osdmap->is_destroyed(whoami)) {
6419 derr << "osdmap says I am destroyed" << dendl;
6420 // provide a small margin so we don't livelock seeing if we
6421 // un-destroyed ourselves.
6422 if (osdmap->get_epoch() > newest - 1) {
6423 exit(0);
6424 }
6425 } else if (osdmap->is_noup(whoami)) {
6426 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
6427 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6428 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6429 << dendl;
6430 } else if (service.need_fullness_update()) {
6431 derr << "osdmap fullness state needs update" << dendl;
6432 send_full_update();
6433 } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
6434 superblock.purged_snaps_last < superblock.current_epoch) {
6435 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6436              << " < current_epoch " << superblock.current_epoch << dendl;
6437 _get_purged_snaps();
6438 } else if (osdmap->get_epoch() >= oldest - 1 &&
6439 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
6440
6441 // wait for pgs to fully catch up in a different thread, since
6442 // this thread might be required for splitting and merging PGs to
6443 // make progress.
6444 boot_finisher.queue(
6445 new LambdaContext(
6446 [this](int r) {
6447 std::unique_lock l(osd_lock);
6448 if (is_preboot()) {
6449 dout(10) << __func__ << " waiting for peering work to drain"
6450 << dendl;
6451 l.unlock();
6452 for (auto shard : shards) {
6453 shard->wait_min_pg_epoch(get_osdmap_epoch());
6454 }
6455 l.lock();
6456 }
6457 if (is_preboot()) {
6458 _send_boot();
6459 }
6460 }));
6461 return;
6462 }
6463
6464 // get all the latest maps
6465 if (osdmap->get_epoch() + 1 >= oldest)
6466 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6467 else
6468 osdmap_subscribe(oldest - 1, true);
6469 }
6470
6471 void OSD::_get_purged_snaps()
6472 {
6473   // NOTE: this is a naive, stateless implementation. it may send multiple
6474 // overlapping requests to the mon, which will be somewhat inefficient, but
6475 // it should be reliable.
6476 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6477            << ", current_epoch " << superblock.current_epoch << dendl;
6478 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6479 superblock.purged_snaps_last + 1,
6480 superblock.current_epoch + 1);
6481 monc->send_mon_message(m);
6482 }
6483
6484 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6485 {
6486 dout(10) << __func__ << " " << *m << dendl;
6487 ObjectStore::Transaction t;
6488 if (!is_preboot() ||
6489 m->last < superblock.purged_snaps_last) {
6490 goto out;
6491 }
6492 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
6493 make_purged_snaps_oid(), &t,
6494 m->purged_snaps);
6495 superblock.purged_snaps_last = m->last;
6496 write_superblock(t);
6497 store->queue_transaction(
6498 service.meta_ch,
6499 std::move(t));
6500 service.publish_superblock(superblock);
6501 if (m->last < superblock.current_epoch) {
6502 _get_purged_snaps();
6503 } else {
6504 start_boot();
6505 }
6506 out:
6507 m->put();
6508 }
6509
6510 void OSD::send_full_update()
6511 {
6512 if (!service.need_fullness_update())
6513 return;
6514 unsigned state = 0;
6515 if (service.is_full()) {
6516 state = CEPH_OSD_FULL;
6517 } else if (service.is_backfillfull()) {
6518 state = CEPH_OSD_BACKFILLFULL;
6519 } else if (service.is_nearfull()) {
6520 state = CEPH_OSD_NEARFULL;
6521 }
6522 set<string> s;
6523 OSDMap::calc_state_set(state, s);
6524 dout(10) << __func__ << " want state " << s << dendl;
6525 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6526 }
6527
6528 void OSD::start_waiting_for_healthy()
6529 {
6530 dout(1) << "start_waiting_for_healthy" << dendl;
6531 set_state(STATE_WAITING_FOR_HEALTHY);
6532 last_heartbeat_resample = utime_t();
6533
6534 // subscribe to osdmap updates, in case our peers really are known to be dead
6535 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6536 }
6537
6538 bool OSD::_is_healthy()
6539 {
6540 if (!cct->get_heartbeat_map()->is_healthy()) {
6541 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6542 return false;
6543 }
6544
6545 if (is_waiting_for_healthy()) {
6546 utime_t now = ceph_clock_now();
6547 if (osd_markdown_log.empty()) {
6548       dout(5) << __func__ << " force returning true since the last markdown"
6549               << " was more than " << cct->_conf->osd_max_markdown_period
6550               << "s ago (markdown log is empty)" << dendl;
6551 return true;
6552 }
6553 std::lock_guard l(heartbeat_lock);
6554 int num = 0, up = 0;
6555 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6556 p != heartbeat_peers.end();
6557 ++p) {
6558 if (p->second.is_healthy(now))
6559 ++up;
6560 ++num;
6561 }
6562 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6563 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6564 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6565 return false;
6566 }
6567 }
6568
6569 return true;
6570 }
6571
6572 void OSD::_send_boot()
6573 {
6574 dout(10) << "_send_boot" << dendl;
6575 Connection *local_connection =
6576 cluster_messenger->get_loopback_connection().get();
6577 entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
6578 entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
6579 entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6580 entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6581
6582 dout(20) << " initial client_addrs " << client_addrs
6583 << ", cluster_addrs " << cluster_addrs
6584 << ", hb_back_addrs " << hb_back_addrs
6585 << ", hb_front_addrs " << hb_front_addrs
6586 << dendl;
6587 if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6588 dout(10) << " assuming cluster_addrs match client_addrs "
6589 << client_addrs << dendl;
6590 cluster_addrs = cluster_messenger->get_myaddrs();
6591 }
6592 if (auto session = local_connection->get_priv(); !session) {
6593 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
6594 }
6595
6596 local_connection = hb_back_server_messenger->get_loopback_connection().get();
6597 if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6598 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6599 << cluster_addrs << dendl;
6600 hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6601 }
6602 if (auto session = local_connection->get_priv(); !session) {
6603 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6604 }
6605
6606 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6607 if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6608 dout(10) << " assuming hb_front_addrs match client_addrs "
6609 << client_addrs << dendl;
6610 hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6611 }
6612 if (auto session = local_connection->get_priv(); !session) {
6613 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6614 }
6615
6616 // we now know what our front and back addrs will be, and we are
6617 // about to tell the mon what our metadata (including numa bindings)
6618 // are, so now is a good time!
6619 set_numa_affinity();
6620
6621 MOSDBoot *mboot = new MOSDBoot(
6622 superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6623 hb_back_addrs, hb_front_addrs, cluster_addrs,
6624 CEPH_FEATURES_ALL);
6625 dout(10) << " final client_addrs " << client_addrs
6626 << ", cluster_addrs " << cluster_addrs
6627 << ", hb_back_addrs " << hb_back_addrs
6628 << ", hb_front_addrs " << hb_front_addrs
6629 << dendl;
6630 _collect_metadata(&mboot->metadata);
6631 monc->send_mon_message(mboot);
6632 set_state(STATE_BOOTING);
6633 }
6634
6635 void OSD::_collect_metadata(map<string,string> *pm)
6636 {
6637 // config info
6638 (*pm)["osd_data"] = dev_path;
6639 if (store->get_type() == "filestore") {
6640 // not applicable for bluestore
6641 (*pm)["osd_journal"] = journal_path;
6642 }
6643 (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6644 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6645 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6646 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
6647
6648 // backend
6649 (*pm)["osd_objectstore"] = store->get_type();
6650 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
6651 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
6652 (*pm)["default_device_class"] = store->get_default_device_class();
6653 string osdspec_affinity;
6654 int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
6655 if (r < 0 || osdspec_affinity.empty()) {
6656 osdspec_affinity = "";
6657 }
6658 (*pm)["osdspec_affinity"] = osdspec_affinity;
6659 store->collect_metadata(pm);
6660
6661 collect_sys_info(pm, cct);
6662
6663 (*pm)["front_iface"] = pick_iface(
6664 cct,
6665 client_messenger->get_myaddrs().front().get_sockaddr_storage());
6666 (*pm)["back_iface"] = pick_iface(
6667 cct,
6668 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6669
6670 // network numa
6671 {
6672 int node = -1;
6673 set<int> nodes;
6674 set<string> unknown;
6675 for (auto nm : { "front_iface", "back_iface" }) {
6676 if (!(*pm)[nm].size()) {
6677 unknown.insert(nm);
6678 continue;
6679 }
6680 int n = -1;
6681 int r = get_iface_numa_node((*pm)[nm], &n);
6682 if (r < 0) {
6683 unknown.insert((*pm)[nm]);
6684 continue;
6685 }
6686 nodes.insert(n);
6687 if (node < 0) {
6688 node = n;
6689 }
6690 }
6691 if (unknown.size()) {
6692 (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6693 }
6694 if (!nodes.empty()) {
6695 (*pm)["network_numa_nodes"] = stringify(nodes);
6696 }
6697 if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6698 (*pm)["network_numa_node"] = stringify(node);
6699 }
6700 }
6701
6702 if (numa_node >= 0) {
6703 (*pm)["numa_node"] = stringify(numa_node);
6704 (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6705 &numa_cpu_set);
6706 }
6707
6708 set<string> devnames;
6709 store->get_devices(&devnames);
6710 map<string,string> errs;
6711 get_device_metadata(devnames, pm, &errs);
6712 for (auto& i : errs) {
6713 dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
6714 }
6715 dout(10) << __func__ << " " << *pm << dendl;
6716 }
6717
6718 void OSD::queue_want_up_thru(epoch_t want)
6719 {
6720 std::shared_lock map_locker{map_lock};
6721 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6722 std::lock_guard report_locker(mon_report_lock);
6723 if (want > up_thru_wanted) {
6724 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6725 << ", currently " << cur
6726 << dendl;
6727 up_thru_wanted = want;
6728 send_alive();
6729 } else {
6730 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6731 << ", currently " << cur
6732 << dendl;
6733 }
6734 }
6735
6736 void OSD::send_alive()
6737 {
6738 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6739 const auto osdmap = get_osdmap();
6740 if (!osdmap->exists(whoami))
6741 return;
6742 epoch_t up_thru = osdmap->get_up_thru(whoami);
6743 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6744 if (up_thru_wanted > up_thru) {
6745 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6746 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6747 }
6748 }
6749
6750 void OSD::request_full_map(epoch_t first, epoch_t last)
6751 {
6752 dout(10) << __func__ << " " << first << ".." << last
6753 << ", previously requested "
6754 << requested_full_first << ".." << requested_full_last << dendl;
6755 ceph_assert(ceph_mutex_is_locked(osd_lock));
6756 ceph_assert(first > 0 && last > 0);
6757 ceph_assert(first <= last);
6758 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6759 if (requested_full_first == 0) {
6760 // first request
6761 requested_full_first = first;
6762 requested_full_last = last;
6763 } else if (last <= requested_full_last) {
6764 // dup
6765 return;
6766 } else {
6767 // additional request
6768 first = requested_full_last + 1;
6769 requested_full_last = last;
6770 }
6771 MMonGetOSDMap *req = new MMonGetOSDMap;
6772 req->request_full(first, last);
6773 monc->send_mon_message(req);
6774 }
6775
6776 void OSD::got_full_map(epoch_t e)
6777 {
6778 ceph_assert(requested_full_first <= requested_full_last);
6779 ceph_assert(ceph_mutex_is_locked(osd_lock));
6780 if (requested_full_first == 0) {
6781 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6782 return;
6783 }
6784 if (e < requested_full_first) {
6785 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6786 << ".." << requested_full_last
6787 << ", ignoring" << dendl;
6788 return;
6789 }
6790 if (e >= requested_full_last) {
6791 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6792 << ".." << requested_full_last << ", resetting" << dendl;
6793 requested_full_first = requested_full_last = 0;
6794 return;
6795 }
6796
6797 requested_full_first = e + 1;
6798
6799 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6800 << ".." << requested_full_last
6801 << ", still need more" << dendl;
6802 }
6803
6804 void OSD::requeue_failures()
6805 {
6806 std::lock_guard l(heartbeat_lock);
6807 unsigned old_queue = failure_queue.size();
6808 unsigned old_pending = failure_pending.size();
6809 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6810 failure_queue[p->first] = p->second.first;
6811 failure_pending.erase(p++);
6812 }
6813 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6814 << failure_queue.size() << dendl;
6815 }
6816
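// send_failures: drain failure_queue into MOSDFailure reports to the mon,
// remembering each report in failure_pending so it can later be cancelled
// with send_still_alive() if the peer turns out to be healthy.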
6817 void OSD::send_failures()
6818 {
6819 ceph_assert(ceph_mutex_is_locked(map_lock));
6820 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6821 std::lock_guard l(heartbeat_lock);
6822 utime_t now = ceph_clock_now();
6823 const auto osdmap = get_osdmap();
6824 while (!failure_queue.empty()) {
6825 int osd = failure_queue.begin()->first;
6826 if (!failure_pending.count(osd)) {
6827 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6828 monc->send_mon_message(
6829 new MOSDFailure(
6830 monc->get_fsid(),
6831 osd,
6832 osdmap->get_addrs(osd),
6833 failed_for,
6834 osdmap->get_epoch()));
6835 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6836 osdmap->get_addrs(osd));
6837 }
6838 failure_queue.erase(osd);
6839 }
6840 }
6841
6842 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6843 {
6844 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6845 MOSDFailure::FLAG_ALIVE);
6846 monc->send_mon_message(m);
6847 }
6848
6849 void OSD::cancel_pending_failures()
6850 {
6851 std::lock_guard l(heartbeat_lock);
6852 auto it = failure_pending.begin();
6853 while (it != failure_pending.end()) {
6854 dout(10) << __func__ << " canceling in-flight failure report for osd."
6855 << it->first << dendl;
6856 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6857 failure_pending.erase(it++);
6858 }
6859 }
6860
6861 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6862 {
6863 const auto& monmap = monc->monmap;
6864   // if we have only just connected, the monmap may not be initialized yet
6865   // (epoch == 0); in that case skip the feature check and don't send.
6866 if (monmap.epoch > 0 &&
6867 monmap.get_required_features().contains_all(
6868 ceph::features::mon::FEATURE_LUMINOUS)) {
6869 dout(20) << __func__ << " sending" << dendl;
6870 MOSDBeacon* beacon = nullptr;
6871 {
6872 std::lock_guard l{min_last_epoch_clean_lock};
6873 beacon = new MOSDBeacon(get_osdmap_epoch(),
6874 min_last_epoch_clean,
6875 superblock.last_purged_snaps_scrub,
6876 cct->_conf->osd_beacon_report_interval);
6877 beacon->pgs = min_last_epoch_clean_pgs;
6878 last_sent_beacon = now;
6879 }
6880 monc->send_mon_message(beacon);
6881 } else {
6882 dout(20) << __func__ << " not sending" << dendl;
6883 }
6884 }
6885
6886 void OSD::handle_command(MCommand *m)
6887 {
6888 ConnectionRef con = m->get_connection();
6889 auto session = ceph::ref_cast<Session>(con->get_priv());
6890 if (!session) {
6891 con->send_message(new MCommandReply(m, -EACCES));
6892 m->put();
6893 return;
6894 }
6895 if (!session->caps.allow_all()) {
6896 con->send_message(new MCommandReply(m, -EACCES));
6897 m->put();
6898 return;
6899 }
6900 cct->get_admin_socket()->queue_tell_command(m);
6901 m->put();
6902 }
6903
6904 namespace {
6905 class unlock_guard {
6906 ceph::mutex& m;
6907 public:
6908 explicit unlock_guard(ceph::mutex& mutex)
6909 : m(mutex)
6910 {
6911 m.unlock();
6912 }
6913 unlock_guard(unlock_guard&) = delete;
6914 ~unlock_guard() {
6915 m.lock();
6916 }
6917 };
6918 }
6919
6920 void OSD::scrub_purged_snaps()
6921 {
6922 dout(10) << __func__ << dendl;
6923 ceph_assert(ceph_mutex_is_locked(osd_lock));
6924 SnapMapper::Scrubber s(cct, store, service.meta_ch,
6925 make_snapmapper_oid(),
6926 make_purged_snaps_oid());
6927 clog->debug() << "purged_snaps scrub starts";
6928 osd_lock.unlock();
6929 s.run();
6930 if (s.stray.size()) {
6931 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
6932 } else {
6933 clog->debug() << "purged_snaps scrub ok";
6934 }
6935 set<pair<spg_t,snapid_t>> queued;
6936 for (auto& [pool, snap, hash, shard] : s.stray) {
6937 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
6938 if (!pi) {
6939 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
6940 continue;
6941 }
6942 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
6943 spg_t spgid(pgid, shard);
6944 pair<spg_t,snapid_t> p(spgid, snap);
6945 if (queued.count(p)) {
6946 dout(20) << __func__ << " pg " << spgid << " snap " << snap
6947 << " already queued" << dendl;
6948 continue;
6949 }
6950 PGRef pg = lookup_lock_pg(spgid);
6951 if (!pg) {
6952 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
6953 continue;
6954 }
6955 queued.insert(p);
6956 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
6957 << snap << dendl;
6958 pg->queue_snap_retrim(snap);
6959 pg->unlock();
6960 }
6961 osd_lock.lock();
6962 if (is_stopping()) {
6963 return;
6964 }
6965 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
6966 ObjectStore::Transaction t;
6967 superblock.last_purged_snaps_scrub = ceph_clock_now();
6968 write_superblock(t);
6969 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
6970 ceph_assert(tr == 0);
6971 if (is_active()) {
6972 send_beacon(ceph::coarse_mono_clock::now());
6973 }
6974 dout(10) << __func__ << " done" << dendl;
6975 }
6976
6977 void OSD::probe_smart(const string& only_devid, ostream& ss)
6978 {
6979 set<string> devnames;
6980 store->get_devices(&devnames);
6981 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6982 "osd_smart_report_timeout");
6983
6984   // json_spirit::mObject == std::map<std::string, mValue>
6985 json_spirit::mObject json_map;
6986
6987 for (auto dev : devnames) {
6988 // smartctl works only on physical devices; filter out any logical device
6989 if (dev.find("dm-") == 0) {
6990 continue;
6991 }
6992
6993 string err;
6994 string devid = get_device_id(dev, &err);
6995 if (devid.size() == 0) {
6996 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6997 << err << "), skipping" << dendl;
6998 continue;
6999 }
7000 if (only_devid.size() && devid != only_devid) {
7001 continue;
7002 }
7003
7004 json_spirit::mValue smart_json;
7005 if (block_device_get_metrics(dev, smart_timeout,
7006 &smart_json)) {
7007 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7008 continue;
7009 }
7010 json_map[devid] = smart_json;
7011 }
7012 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7013 }
7014
7015 bool OSD::heartbeat_dispatch(Message *m)
7016 {
7017 dout(30) << "heartbeat_dispatch " << m << dendl;
7018 switch (m->get_type()) {
7019
7020 case CEPH_MSG_PING:
7021 dout(10) << "ping from " << m->get_source_inst() << dendl;
7022 m->put();
7023 break;
7024
7025 case MSG_OSD_PING:
7026 handle_osd_ping(static_cast<MOSDPing*>(m));
7027 break;
7028
7029 default:
7030 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7031 m->put();
7032 }
7033
7034 return true;
7035 }
7036
7037 bool OSD::ms_dispatch(Message *m)
7038 {
7039 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7040 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7041 service.got_stop_ack();
7042 m->put();
7043 return true;
7044 }
7045
7046 // lock!
7047
7048 osd_lock.lock();
7049 if (is_stopping()) {
7050 osd_lock.unlock();
7051 m->put();
7052 return true;
7053 }
7054
7055 do_waiters();
7056 _dispatch(m);
7057
7058 osd_lock.unlock();
7059
7060 return true;
7061 }
7062
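// maybe_share_map: send the peer an incremental map if ours is newer than
// what we believe it has. last_sent_epoch is re-checked under
// sent_epoch_lock after the (unlocked) send, so concurrent sharers only
// ever move it forward.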
7063 void OSDService::maybe_share_map(
7064 Connection *con,
7065 const OSDMapRef& osdmap,
7066 epoch_t peer_epoch_lb)
7067 {
7068   // NOTE: we assume the caller holds something that keeps the Connection itself
7069 // pinned (e.g., an OpRequest's MessageRef).
7070 auto session = ceph::ref_cast<Session>(con->get_priv());
7071 if (!session) {
7072 return;
7073 }
7074
7075 // assume the peer has the newer of the op's sent_epoch and what
7076 // we think we sent them.
7077 session->sent_epoch_lock.lock();
7078 if (peer_epoch_lb > session->last_sent_epoch) {
7079 dout(10) << __func__ << " con " << con
7080 << " " << con->get_peer_addr()
7081 << " map epoch " << session->last_sent_epoch
7082 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
7083 session->last_sent_epoch = peer_epoch_lb;
7084 }
7085 epoch_t last_sent_epoch = session->last_sent_epoch;
7086 session->sent_epoch_lock.unlock();
7087
7088 if (osdmap->get_epoch() <= last_sent_epoch) {
7089 return;
7090 }
7091
7092 send_incremental_map(last_sent_epoch, con, osdmap);
7093 last_sent_epoch = osdmap->get_epoch();
7094
7095 session->sent_epoch_lock.lock();
7096 if (session->last_sent_epoch < last_sent_epoch) {
7097 dout(10) << __func__ << " con " << con
7098 << " " << con->get_peer_addr()
7099 << " map epoch " << session->last_sent_epoch
7100 << " -> " << last_sent_epoch << " (shared)" << dendl;
7101 session->last_sent_epoch = last_sent_epoch;
7102 }
7103 session->sent_epoch_lock.unlock();
7104 }
7105
7106 void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7107 {
7108 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7109
7110 auto i = session->waiting_on_map.begin();
7111 while (i != session->waiting_on_map.end()) {
7112 OpRequestRef op = &(*i);
7113 ceph_assert(ms_can_fast_dispatch(op->get_req()));
7114 auto m = op->get_req<MOSDFastDispatchOp>();
7115 if (m->get_min_epoch() > osdmap->get_epoch()) {
7116 break;
7117 }
7118 session->waiting_on_map.erase(i++);
7119 op->put();
7120
7121 spg_t pgid;
7122 if (m->get_type() == CEPH_MSG_OSD_OP) {
7123 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7124 static_cast<const MOSDOp*>(m)->get_pg());
7125 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7126 continue;
7127 }
7128 } else {
7129 pgid = m->get_spg();
7130 }
7131 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7132 }
7133
7134 if (session->waiting_on_map.empty()) {
7135 clear_session_waiting_on_map(session);
7136 } else {
7137 register_session_waiting_on_map(session);
7138 }
7139 }
7140
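// ms_fast_dispatch: handle messages on the messenger's fast path, without
// taking osd_lock. Peering messages become PGPeeringEvents; client ops are
// wrapped in an OpRequest and queued by spg_t, with a per-session ordering
// path for legacy clients whose MOSDOps lack an explicit spg_t.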
7141 void OSD::ms_fast_dispatch(Message *m)
7142 {
7143
7144 #ifdef HAVE_JAEGER
7145 jaeger_tracing::init_tracer("osd-services-reinit");
7146 dout(10) << "jaeger tracer after " << opentracing::Tracer::Global() << dendl;
7147 auto dispatch_span = jaeger_tracing::new_span(__func__);
7148 #endif
7149 FUNCTRACE(cct);
7150 if (service.is_stopping()) {
7151 m->put();
7152 return;
7153 }
7154
7155 // peering event?
7156 switch (m->get_type()) {
7157 case CEPH_MSG_PING:
7158 dout(10) << "ping from " << m->get_source() << dendl;
7159 m->put();
7160 return;
7161 case MSG_OSD_FORCE_RECOVERY:
7162 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7163 return;
7164 case MSG_OSD_SCRUB2:
7165 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7166 return;
7167
7168 case MSG_OSD_PG_CREATE2:
7169 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
7170 case MSG_OSD_PG_QUERY:
7171 return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
7172 case MSG_OSD_PG_NOTIFY:
7173 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7174 case MSG_OSD_PG_INFO:
7175 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7176 case MSG_OSD_PG_REMOVE:
7177 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
7178
7179 // these are single-pg messages that handle themselves
7180 case MSG_OSD_PG_LOG:
7181 case MSG_OSD_PG_TRIM:
7182 case MSG_OSD_PG_NOTIFY2:
7183 case MSG_OSD_PG_QUERY2:
7184 case MSG_OSD_PG_INFO2:
7185 case MSG_OSD_BACKFILL_RESERVE:
7186 case MSG_OSD_RECOVERY_RESERVE:
7187 case MSG_OSD_PG_LEASE:
7188 case MSG_OSD_PG_LEASE_ACK:
7189 {
7190 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7191 if (require_osd_peer(pm)) {
7192 enqueue_peering_evt(
7193 pm->get_spg(),
7194 PGPeeringEventRef(pm->get_event()));
7195 }
7196 pm->put();
7197 return;
7198 }
7199 }
7200
7201 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7202 {
7203 #ifdef WITH_LTTNG
7204 osd_reqid_t reqid = op->get_reqid();
7205 #endif
7206 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7207 reqid.name._num, reqid.tid, reqid.inc);
7208 }
7209 #ifdef HAVE_JAEGER
7210 op->set_osd_parent_span(dispatch_span);
7211 if (op->osd_parent_span) {
7212 auto op_req_span = jaeger_tracing::child_span("op-request-created", op->osd_parent_span);
7213 op->set_osd_parent_span(op_req_span);
7214 }
7215 #endif
7216 if (m->trace)
7217 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7218
7219 // note sender epoch, min req's epoch
7220 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7221 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
7222 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7223
7224 service.maybe_inject_dispatch_delay();
7225
7226 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7227 m->get_type() != CEPH_MSG_OSD_OP) {
7228 // queue it directly
7229 enqueue_op(
7230 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
7231 std::move(op),
7232 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7233 } else {
7234 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7235 // message that didn't have an explicit spg_t); we need to map
7236 // them to an spg_t while preserving delivery order.
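// (ordering holds because everything below runs under the session's
// session_dispatch_lock: the op is appended to waiting_on_map and
// drained FIFO by dispatch_session_waiting.)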
7237 auto priv = m->get_connection()->get_priv();
7238 if (auto session = static_cast<Session*>(priv.get()); session) {
7239 std::lock_guard l{session->session_dispatch_lock};
7240 op->get();
7241 session->waiting_on_map.push_back(*op);
7242 OSDMapRef nextmap = service.get_nextmap_reserved();
7243 dispatch_session_waiting(session, nextmap);
7244 service.release_map(nextmap);
7245 }
7246 }
7247 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7248 }
7249
7250 int OSD::ms_handle_authentication(Connection *con)
7251 {
7252 int ret = 0;
7253 auto s = ceph::ref_cast<Session>(con->get_priv());
7254 if (!s) {
7255 s = ceph::make_ref<Session>(cct, con);
7256 con->set_priv(s);
7257 s->entity_name = con->get_peer_entity_name();
7258 dout(10) << __func__ << " new session " << s << " con " << s->con
7259 << " entity " << s->entity_name
7260 << " addr " << con->get_peer_addrs() << dendl;
7261 } else {
7262 dout(10) << __func__ << " existing session " << s << " con " << s->con
7263 << " entity " << s->entity_name
7264 << " addr " << con->get_peer_addrs() << dendl;
7265 }
7266
7267 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7268 if (caps_info.allow_all) {
7269 s->caps.set_allow_all();
7270 } else if (caps_info.caps.length() > 0) {
7271 bufferlist::const_iterator p = caps_info.caps.cbegin();
7272 string str;
7273 try {
7274 decode(str, p);
7275 }
7276 catch (ceph::buffer::error& e) {
7277 dout(10) << __func__ << " session " << s << " " << s->entity_name
7278 << " failed to decode caps string" << dendl;
7279 ret = -EACCES;
7280 }
7281 if (!ret) {
7282 bool success = s->caps.parse(str);
7283 if (success) {
7284 dout(10) << __func__ << " session " << s
7285 << " " << s->entity_name
7286 << " has caps " << s->caps << " '" << str << "'" << dendl;
7287 ret = 1;
7288 } else {
7289 dout(10) << __func__ << " session " << s << " " << s->entity_name
7290 << " failed to parse caps '" << str << "'" << dendl;
7291 ret = -EACCES;
7292 }
7293 }
7294 }
7295 return ret;
7296 }
7297
7298 void OSD::do_waiters()
7299 {
7300 ceph_assert(ceph_mutex_is_locked(osd_lock));
7301
7302 dout(10) << "do_waiters -- start" << dendl;
7303 while (!finished.empty()) {
7304 OpRequestRef next = finished.front();
7305 finished.pop_front();
7306 dispatch_op(next);
7307 }
7308 dout(10) << "do_waiters -- finish" << dendl;
7309 }
7310
7311 void OSD::dispatch_op(OpRequestRef op)
7312 {
7313 switch (op->get_req()->get_type()) {
7314
7315 case MSG_OSD_PG_CREATE:
7316 handle_pg_create(op);
7317 break;
7318 }
7319 }
7320
7321 void OSD::_dispatch(Message *m)
7322 {
7323 ceph_assert(ceph_mutex_is_locked(osd_lock));
7324 dout(20) << "_dispatch " << m << " " << *m << dendl;
7325
7326 switch (m->get_type()) {
7327 // -- don't need OSDMap --
7328
7329 // map and replication
7330 case CEPH_MSG_OSD_MAP:
7331 handle_osd_map(static_cast<MOSDMap*>(m));
7332 break;
7333 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7334 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7335 break;
7336
7337 // osd
7338 case MSG_OSD_SCRUB:
7339 handle_scrub(static_cast<MOSDScrub*>(m));
7340 break;
7341
7342 case MSG_COMMAND:
7343 handle_command(static_cast<MCommand*>(m));
7344 return;
7345
7346 // -- need OSDMap --
7347
7348 case MSG_OSD_PG_CREATE:
7349 {
7350 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7351 if (m->trace)
7352 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7353 // no map? starting up?
7354 if (!get_osdmap()) {
7355 dout(7) << "no OSDMap, not booted" << dendl;
7356 logger->inc(l_osd_waiting_for_map);
7357 waiting_for_osdmap.push_back(op);
7358 op->mark_delayed("no osdmap");
7359 break;
7360 }
7361
7362 // need OSDMap
7363 dispatch_op(op);
7364 }
7365 }
7366 }
7367
7368 // remove me post-nautilus
7369 void OSD::handle_scrub(MOSDScrub *m)
7370 {
7371 dout(10) << "handle_scrub " << *m << dendl;
7372 if (!require_mon_or_mgr_peer(m)) {
7373 m->put();
7374 return;
7375 }
7376 if (m->fsid != monc->get_fsid()) {
7377 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7378 << dendl;
7379 m->put();
7380 return;
7381 }
7382
7383 vector<spg_t> spgs;
7384 _get_pgids(&spgs);
7385
7386 if (!m->scrub_pgs.empty()) {
7387 vector<spg_t> v;
7388 for (auto pgid : m->scrub_pgs) {
7389 spg_t pcand;
7390 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
7391 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7392 v.push_back(pcand);
7393 }
7394 }
7395 spgs.swap(v);
7396 }
7397
7398 for (auto pgid : spgs) {
7399 enqueue_peering_evt(
7400 pgid,
7401 PGPeeringEventRef(
7402 std::make_shared<PGPeeringEvent>(
7403 get_osdmap_epoch(),
7404 get_osdmap_epoch(),
7405 PeeringState::RequestScrub(m->deep, m->repair))));
7406 }
7407
7408 m->put();
7409 }
7410
7411 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7412 {
7413 dout(10) << __func__ << " " << *m << dendl;
7414 if (!require_mon_or_mgr_peer(m)) {
7415 m->put();
7416 return;
7417 }
7418 if (m->fsid != monc->get_fsid()) {
7419 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7420 << dendl;
7421 m->put();
7422 return;
7423 }
7424 for (auto pgid : m->scrub_pgs) {
7425 enqueue_peering_evt(
7426 pgid,
7427 PGPeeringEventRef(
7428 std::make_shared<PGPeeringEvent>(
7429 m->epoch,
7430 m->epoch,
7431 PeeringState::RequestScrub(m->deep, m->repair))));
7432 }
7433 m->put();
7434 }
7435
7436 bool OSD::scrub_random_backoff()
7437 {
7438 bool coin_flip = (rand() / (double)RAND_MAX >=
7439 cct->_conf->osd_scrub_backoff_ratio);
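// e.g., with a hypothetical osd_scrub_backoff_ratio of 0.66, roughly
// two out of three ticks would randomly back off.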
7440 if (!coin_flip) {
7441 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7442 return true;
7443 }
7444 return false;
7445 }
7446
7447 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7448 const spg_t& pg, const utime_t& timestamp,
7449 double pool_scrub_min_interval,
7450 double pool_scrub_max_interval, bool must)
7451 : cct(cct),
7452 pgid(pg),
7453 sched_time(timestamp),
7454 deadline(timestamp)
7455 {
7456 // if not explicitly requested, postpone the scrub with a random delay
7457 if (!must) {
7458 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7459 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7460 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7461 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7462
7463 sched_time += scrub_min_interval;
7464 double r = rand() / (double)RAND_MAX;
7465 sched_time +=
7466 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7467 if (scrub_max_interval == 0) {
7468 deadline = utime_t();
7469 } else {
7470 deadline += scrub_max_interval;
7471 }
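// worked example (hypothetical values): scrub_min_interval = 24h,
// osd_scrub_interval_randomize_ratio = 0.5 and r = 0.4 put sched_time at
// stamp + 24h + 0.5 * 0.4 * 24h = stamp + ~28.8h; with
// scrub_max_interval = 7d the deadline becomes stamp + 7d.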
7472
7473 }
7474 }
7475
7476 bool OSDService::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7477 if (sched_time < rhs.sched_time)
7478 return true;
7479 if (sched_time > rhs.sched_time)
7480 return false;
7481 return pgid < rhs.pgid;
7482 }
7483
7484 void OSDService::dumps_scrub(ceph::Formatter *f)
7485 {
7486 ceph_assert(f != nullptr);
7487 std::lock_guard l(sched_scrub_lock);
7488
7489 f->open_array_section("scrubs");
7490 for (const auto &i: sched_scrub_pg) {
7491 f->open_object_section("scrub");
7492 f->dump_stream("pgid") << i.pgid;
7493 f->dump_stream("sched_time") << i.sched_time;
7494 f->dump_stream("deadline") << i.deadline;
7495 f->dump_bool("forced", i.sched_time == PgScrubber::scrub_must_stamp());
7496 f->close_section();
7497 }
7498 f->close_section();
7499 }
7500
7501 double OSD::scrub_sleep_time(bool must_scrub)
7502 {
7503 if (must_scrub) {
7504 return cct->_conf->osd_scrub_sleep;
7505 }
7506 utime_t now = ceph_clock_now();
7507 if (scrub_time_permit(now)) {
7508 return cct->_conf->osd_scrub_sleep;
7509 }
7510 double normal_sleep = cct->_conf->osd_scrub_sleep;
7511 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7512 return std::max(extended_sleep, normal_sleep);
7513 }
7514
7515 bool OSD::scrub_time_permit(utime_t now)
7516 {
7517 struct tm bdt;
7518 time_t tt = now.sec();
7519 localtime_r(&tt, &bdt);
7520
7521 bool day_permit = false;
7522 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7523 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7524 day_permit = true;
7525 }
7526 } else {
7527 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7528 day_permit = true;
7529 }
7530 }
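// the comparison wraps across the week: e.g., a hypothetical
// begin_week_day=5 with end_week_day=2 permits Fri(5), Sat(6), Sun(0)
// and Mon(1), since tm_wday counts Sunday as 0.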
7531
7532 if (!day_permit) {
7533 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7534 << " - " << cct->_conf->osd_scrub_end_week_day
7535 << " now " << bdt.tm_wday << " = no" << dendl;
7536 return false;
7537 }
7538
7539 bool time_permit = false;
7540 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7541 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7542 time_permit = true;
7543 }
7544 } else {
7545 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7546 time_permit = true;
7547 }
7548 }
7549 if (time_permit) {
7550 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7551 << " - " << cct->_conf->osd_scrub_end_hour
7552 << " now " << bdt.tm_hour << " = yes" << dendl;
7553 } else {
7554 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7555 << " - " << cct->_conf->osd_scrub_end_hour
7556 << " now " << bdt.tm_hour << " = no" << dendl;
7557 }
7558 return time_permit;
7559 }
7560
7561 bool OSD::scrub_load_below_threshold()
7562 {
7563 double loadavgs[3];
7564 if (getloadavg(loadavgs, 3) != 3) {
7565 dout(10) << __func__ << " couldn't read loadavgs" << dendl;
7566 return false;
7567 }
7568
7569 // allow scrub if below configured threshold
7570 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7571 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
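// e.g., a 1-minute loadavg of 4.2 on 8 online CPUs is 0.525 per CPU,
// which would pass a hypothetical osd_scrub_load_threshold of 0.6.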
7572 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7573 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7574 << " < max " << cct->_conf->osd_scrub_load_threshold
7575 << " = yes" << dendl;
7576 return true;
7577 }
7578
7579 // allow scrub if below daily avg and currently decreasing
7580 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7581 dout(20) << __func__ << " loadavg " << loadavgs[0]
7582 << " < daily_loadavg " << daily_loadavg
7583 << " and < 15m avg " << loadavgs[2]
7584 << " = yes" << dendl;
7585 return true;
7586 }
7587
7588 dout(20) << __func__ << " loadavg " << loadavgs[0]
7589 << " >= max " << cct->_conf->osd_scrub_load_threshold
7590 << " and ( >= daily_loadavg " << daily_loadavg
7591 << " or >= 15m avg " << loadavgs[2]
7592 << ") = no" << dendl;
7593 return false;
7594 }
7595
7596 void OSD::sched_scrub()
7597 {
7598 dout(20) << __func__ << " sched_scrub starts" << dendl;
7599
7600 // if not permitted, fail fast
7601 if (!service.can_inc_scrubs()) {
7602 dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
7603 return;
7604 }
7605 bool allow_requested_repair_only = false;
7606 if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
7607 if (!cct->_conf->osd_repair_during_recovery) {
7608 dout(15) << __func__ << ": not scheduling scrubs due to active recovery" << dendl;
7609 return;
7610 }
7611 dout(10) << __func__
7612 << " will only schedule explicitly requested repair due to active recovery"
7613 << dendl;
7614 allow_requested_repair_only = true;
7615 }
7616
7617 utime_t now = ceph_clock_now();
7618 bool time_permit = scrub_time_permit(now);
7619 bool load_is_low = scrub_load_below_threshold();
7620 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7621
7622 OSDService::ScrubJob scrub_job;
7623 if (service.first_scrub_stamp(&scrub_job)) {
7624 do {
7625 dout(30) << "sched_scrub examine " << scrub_job.pgid << " at " << scrub_job.sched_time << dendl;
7626
7627 if (scrub_job.sched_time > now) {
7628 // save ourselves some effort
7629 dout(20) << "sched_scrub " << scrub_job.pgid << " scheduled at " << scrub_job.sched_time
7630 << " > " << now << dendl;
7631 break;
7632 }
7633
7634 if ((scrub_job.deadline.is_zero() || scrub_job.deadline >= now) && !(time_permit && load_is_low)) {
7635 dout(15) << __func__ << " not scheduling scrub for " << scrub_job.pgid << " due to "
7636 << (!time_permit ? "time not permitted" : "high load") << dendl;
7637 continue;
7638 }
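// i.e., a pg with a non-zero deadline already in the past is scrubbed
// even outside the permitted window or under high load; everything
// else waits for both time and load to permit.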
7639
7640 PGRef pg = _lookup_lock_pg(scrub_job.pgid);
7641 if (!pg) {
7642 dout(20) << __func__ << " pg " << scrub_job.pgid << " not found" << dendl;
7643 continue;
7644 }
7645
7646 // This has already started, so go on to the next scrub job
7647 if (pg->is_scrub_active()) {
7648 pg->unlock();
7649 dout(20) << __func__ << ": already in progress pgid " << scrub_job.pgid << dendl;
7650 continue;
7651 }
7652 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7653 if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
7654 pg->unlock();
7655 dout(10) << __func__ << " skip " << scrub_job.pgid
7656 << " because repairing is not explicitly requested on it"
7657 << dendl;
7658 continue;
7659 }
7660
7661 // If it is reserving, let it resolve before going to the next scrub job
7662 if (pg->m_scrubber->is_reserving()) {
7663 pg->unlock();
7664 dout(10) << __func__ << ": reserve in progress pgid " << scrub_job.pgid << dendl;
7665 break;
7666 }
7667 dout(15) << "sched_scrub scrubbing " << scrub_job.pgid << " at " << scrub_job.sched_time
7668 << (pg->get_must_scrub() ? ", explicitly requested" :
7669 (load_is_low ? ", load_is_low" : ", deadline < now"))
7670 << dendl;
7671 if (pg->sched_scrub()) {
7672 pg->unlock();
7673 dout(10) << __func__ << " scheduled a scrub!" << " (~" << scrub_job.pgid << "~)" << dendl;
7674 break;
7675 }
7676 pg->unlock();
7677 } while (service.next_scrub_stamp(scrub_job, &scrub_job));
7678 }
7679 dout(20) << "sched_scrub done" << dendl;
7680 }
7681
7682 void OSD::resched_all_scrubs()
7683 {
7684 dout(10) << __func__ << ": start" << dendl;
7685 const vector<spg_t> pgs = [this] {
7686 vector<spg_t> pgs;
7687 OSDService::ScrubJob job;
7688 if (service.first_scrub_stamp(&job)) {
7689 do {
7690 pgs.push_back(job.pgid);
7691 } while (service.next_scrub_stamp(job, &job));
7692 }
7693 return pgs;
7694 }();
7695 for (auto& pgid : pgs) {
7696 dout(20) << __func__ << ": examine " << pgid << dendl;
7697 PGRef pg = _lookup_lock_pg(pgid);
7698 if (!pg)
7699 continue;
7700 if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
7701 dout(15) << __func__ << ": reschedule " << pgid << dendl;
7702 pg->on_info_history_change();
7703 }
7704 pg->unlock();
7705 }
7706 dout(10) << __func__ << ": done" << dendl;
7707 }
7708
7709 MPGStats* OSD::collect_pg_stats()
7710 {
7711 // This implementation unconditionally sends every is_primary PG's
7712 // stats every time we're called. This has equivalent cost to the
7713 // previous implementation's worst case where all PGs are busy and
7714 // their stats are always enqueued for sending.
7715 std::shared_lock l{map_lock};
7716
7717 osd_stat_t cur_stat = service.get_osd_stat();
7718 cur_stat.os_perf_stat = store->get_cur_stats();
7719
7720 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
7721 m->osd_stat = cur_stat;
7722
7723 std::lock_guard lec{min_last_epoch_clean_lock};
7724 min_last_epoch_clean = get_osdmap_epoch();
7725 min_last_epoch_clean_pgs.clear();
7726
7727 std::set<int64_t> pool_set;
7728 vector<PGRef> pgs;
7729 _get_pgs(&pgs);
7730 for (auto& pg : pgs) {
7731 auto pool = pg->pg_id.pgid.pool();
7732 pool_set.emplace((int64_t)pool);
7733 if (!pg->is_primary()) {
7734 continue;
7735 }
7736 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7737 m->pg_stat[pg->pg_id.pgid] = s;
7738 min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
7739 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7740 });
7741 }
7742 store_statfs_t st;
7743 bool per_pool_stats = false;
7744 bool per_pool_omap_stats = false;
7745 for (auto p : pool_set) {
7746 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
7747 if (r == -ENOTSUP) {
7748 break;
7749 } else {
7750 assert(r >= 0);
7751 m->pool_stat[p] = st;
7752 per_pool_stats = true;
7753 }
7754 }
7755
7756 // indicate whether we are reporting per-pool stats
7757 m->osd_stat.num_osds = 1;
7758 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7759 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
7760
7761 return m;
7762 }
7763
7764 vector<DaemonHealthMetric> OSD::get_health_metrics()
7765 {
7766 vector<DaemonHealthMetric> metrics;
7767 {
7768 utime_t oldest_secs;
7769 const utime_t now = ceph_clock_now();
7770 auto too_old = now;
7771 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7772 int slow = 0;
7773 TrackedOpRef oldest_op;
7774 auto count_slow_ops = [&](TrackedOp& op) {
7775 if (op.get_initiated() < too_old) {
7776 stringstream ss;
7777 ss << "slow request " << op.get_desc()
7778 << " initiated "
7779 << op.get_initiated()
7780 << " currently "
7781 << op.state_string();
7782 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7783 clog->warn() << ss.str();
7784 slow++;
7785 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7786 oldest_op = &op;
7787 }
7788 return true;
7789 } else {
7790 return false;
7791 }
7792 };
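// an op counts as slow once in flight longer than osd_op_complaint_time
// (30s by default, if memory serves); each one is also warned to the
// cluster log above.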
7793 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7794 if (slow) {
7795 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7796 << oldest_op->get_desc() << dendl;
7797 }
7798 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7799 } else {
7800 // no news is not good news.
7801 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7802 }
7803 }
7804 {
7805 std::lock_guard l(pending_creates_lock);
7806 auto n_primaries = pending_creates_from_mon;
7807 for (const auto& create : pending_creates_from_osd) {
7808 if (create.second) {
7809 n_primaries++;
7810 }
7811 }
7812 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
7813 }
7814 return metrics;
7815 }
7816
7817 // =====================================================
7818 // MAP
7819
7820 void OSD::wait_for_new_map(OpRequestRef op)
7821 {
7822 // ask?
7823 if (waiting_for_osdmap.empty()) {
7824 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7825 }
7826
7827 logger->inc(l_osd_waiting_for_map);
7828 waiting_for_osdmap.push_back(op);
7829 op->mark_delayed("wait for new map");
7830 }
7831
7832
7833 /** update_map
7834 * assimilate new OSDMap(s). scan pgs, etc.
7835 */
7836
7837 void OSD::note_down_osd(int peer)
7838 {
7839 ceph_assert(ceph_mutex_is_locked(osd_lock));
7840 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7841
7842 std::lock_guard l{heartbeat_lock};
7843 failure_queue.erase(peer);
7844 failure_pending.erase(peer);
7845 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7846 if (p != heartbeat_peers.end()) {
7847 p->second.clear_mark_down();
7848 heartbeat_peers.erase(p);
7849 }
7850 }
7851
7852 void OSD::note_up_osd(int peer)
7853 {
7854 heartbeat_set_peers_need_update();
7855 }
7856
7857 struct C_OnMapCommit : public Context {
7858 OSD *osd;
7859 epoch_t first, last;
7860 MOSDMap *msg;
7861 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7862 : osd(o), first(f), last(l), msg(m) {}
7863 void finish(int r) override {
7864 osd->_committed_osd_maps(first, last, msg);
7865 msg->put();
7866 }
7867 };
7868
7869 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7870 {
7871 std::lock_guard l(osdmap_subscribe_lock);
7872 if (latest_subscribed_epoch >= epoch && !force_request)
7873 return;
7874
7875 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7876
7877 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7878 force_request) {
7879 monc->renew_subs();
7880 }
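// sub_want_increment, as I read it, only moves the subscription start
// forward and returns true when it actually changed, so we renew only
// for genuinely newer epochs unless force_request is set.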
7881 }
7882
7883 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7884 {
7885 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7886 if (min <= superblock.oldest_map)
7887 return;
7888
7889 int num = 0;
7890 ObjectStore::Transaction t;
7891 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7892 dout(20) << " removing old osdmap epoch " << e << dendl;
7893 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7894 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7895 superblock.oldest_map = e + 1;
7896 num++;
7897 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7898 service.publish_superblock(superblock);
7899 write_superblock(t);
7900 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7901 ceph_assert(tr == 0);
7902 num = 0;
7903 if (!skip_maps) {
7904 // skip_maps leaves us with a range of old maps if we fail to remove all
7905 // of them before moving superblock.oldest_map forward to the first map
7906 // in the incoming MOSDMap msg. so we should continue removing them in
7907 // this case, even though it could mean a huge series of delete
7908 // transactions all at once.
7909 break;
7910 }
7911 }
7912 }
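// e.g., with a hypothetical osd_target_transaction_size of 30 and
// nreceived of 50, deletions commit in batches of 50 epochs; whatever
// is left below the threshold is flushed here.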
7913 if (num > 0) {
7914 service.publish_superblock(superblock);
7915 write_superblock(t);
7916 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7917 ceph_assert(tr == 0);
7918 }
7919 // we should not remove the cached maps
7920 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7921 }
7922
7923 void OSD::handle_osd_map(MOSDMap *m)
7924 {
7925 // wait for pgs to catch up
7926 {
7927 // we extend the map cache pins to accommodate pgs slow to consume maps
7928 // for some period, until we hit the max_lag_factor bound, at which point
7929 // we block here to stop ingesting more maps than they are able to keep
7930 // up with.
7931 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7932 m_osd_pg_epoch_max_lag_factor;
7933 ceph_assert(max_lag > 0);
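// e.g., hypothetical osd_map_cache_size = 50 with max_lag_factor = 2
// gives max_lag = 100: at osdmap epoch 1100 we would block below until
// every shard's slowest pg reaches epoch 1000.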
7934 epoch_t osd_min = 0;
7935 for (auto shard : shards) {
7936 epoch_t min = shard->get_min_pg_epoch();
7937 if (osd_min == 0 || min < osd_min) {
7938 osd_min = min;
7939 }
7940 }
7941 epoch_t osdmap_epoch = get_osdmap_epoch();
7942 if (osd_min > 0 &&
7943 osdmap_epoch > max_lag &&
7944 osdmap_epoch - max_lag > osd_min) {
7945 epoch_t need = osdmap_epoch - max_lag;
7946 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7947 << " max_lag " << max_lag << ")" << dendl;
7948 for (auto shard : shards) {
7949 epoch_t min = shard->get_min_pg_epoch();
7950 if (need > min) {
7951 dout(10) << __func__ << " waiting for pgs to consume " << need
7952 << " (shard " << shard->shard_id << " min " << min
7953 << ", map cache is " << cct->_conf->osd_map_cache_size
7954 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7955 << ")" << dendl;
7956 unlock_guard unlock{osd_lock};
7957 shard->wait_min_pg_epoch(need);
7958 }
7959 }
7960 }
7961 }
7962
7963 ceph_assert(ceph_mutex_is_locked(osd_lock));
7964 map<epoch_t,OSDMapRef> added_maps;
7965 map<epoch_t,bufferlist> added_maps_bl;
7966 if (m->fsid != monc->get_fsid()) {
7967 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7968 << monc->get_fsid() << dendl;
7969 m->put();
7970 return;
7971 }
7972 if (is_initializing()) {
7973 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7974 m->put();
7975 return;
7976 }
7977
7978 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7979 if (session && !(session->entity_name.is_mon() ||
7980 session->entity_name.is_osd())) {
7981 //not enough perms!
7982 dout(10) << "got osd map from Session " << session
7983 << " which we can't take maps from (not a mon or osd)" << dendl;
7984 m->put();
7985 return;
7986 }
7987
7988 // share with the objecter
7989 if (!is_preboot())
7990 service.objecter->handle_osd_map(m);
7991
7992 epoch_t first = m->get_first();
7993 epoch_t last = m->get_last();
7994 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7995 << superblock.newest_map
7996 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7997 << dendl;
7998
7999 logger->inc(l_osd_map);
8000 logger->inc(l_osd_mape, last - first + 1);
8001 if (first <= superblock.newest_map)
8002 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
8003 if (service.max_oldest_map < m->oldest_map) {
8004 service.max_oldest_map = m->oldest_map;
8005 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
8006 }
8007
8008 // make sure there is something new, here, before we bother flushing
8009 // the queues and such
8010 if (last <= superblock.newest_map) {
8011 dout(10) << " no new maps here, dropping" << dendl;
8012 m->put();
8013 return;
8014 }
8015
8016 // missing some?
8017 bool skip_maps = false;
8018 if (first > superblock.newest_map + 1) {
8019 dout(10) << "handle_osd_map message skips epochs "
8020 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
8021 if (m->oldest_map <= superblock.newest_map + 1) {
8022 osdmap_subscribe(superblock.newest_map + 1, false);
8023 m->put();
8024 return;
8025 }
8026 // always try to get the full range of maps--as many as we can. this
8027 // 1- is good to have
8028 // 2- is at present the only way to ensure that we get a *full* map as
8029 // the first map!
8030 if (m->oldest_map < first) {
8031 osdmap_subscribe(m->oldest_map - 1, true);
8032 m->put();
8033 return;
8034 }
8035 skip_maps = true;
8036 }
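// e.g., if we have newest_map = 100 and the message carries [150,160]
// with the mon's oldest_map also at 150, epochs 101..149 are gone for
// good: we jump ahead (skip_maps) and count on epoch 150 being a full map.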
8037
8038 ObjectStore::Transaction t;
8039 uint64_t txn_size = 0;
8040
8041 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
8042
8043 // store new maps: queue for disk and put in the osdmap cache
8044 epoch_t start = std::max(superblock.newest_map + 1, first);
8045 for (epoch_t e = start; e <= last; e++) {
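// every stored epoch must grow the transaction; if the running byte
// count ever stops increasing, the counter wrapped -- log and abort.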
8046 if (txn_size >= t.get_num_bytes()) {
8047 derr << __func__ << " transaction size overflowed" << dendl;
8048 ceph_assert(txn_size < t.get_num_bytes());
8049 }
8050 txn_size = t.get_num_bytes();
8051 map<epoch_t,bufferlist>::iterator p;
8052 p = m->maps.find(e);
8053 if (p != m->maps.end()) {
8054 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8055 OSDMap *o = new OSDMap;
8056 bufferlist& bl = p->second;
8057
8058 o->decode(bl);
8059
8060 purged_snaps[e] = o->get_new_purged_snaps();
8061
8062 ghobject_t fulloid = get_osdmap_pobject_name(e);
8063 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
8064 added_maps[e] = add_map(o);
8065 added_maps_bl[e] = bl;
8066 got_full_map(e);
8067 continue;
8068 }
8069
8070 p = m->incremental_maps.find(e);
8071 if (p != m->incremental_maps.end()) {
8072 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8073 bufferlist& bl = p->second;
8074 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8075 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
8076
8077 OSDMap *o = new OSDMap;
8078 if (e > 1) {
8079 bufferlist obl;
8080 bool got = get_map_bl(e - 1, obl);
8081 if (!got) {
8082 auto p = added_maps_bl.find(e - 1);
8083 ceph_assert(p != added_maps_bl.end());
8084 obl = p->second;
8085 }
8086 o->decode(obl);
8087 }
8088
8089 OSDMap::Incremental inc;
8090 auto p = bl.cbegin();
8091 inc.decode(p);
8092
8093 if (o->apply_incremental(inc) < 0) {
8094 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
8095 ceph_abort_msg("bad fsid");
8096 }
8097
8098 bufferlist fbl;
8099 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8100
8101 bool injected_failure = false;
8102 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8103 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8104 derr << __func__ << " injecting map crc failure" << dendl;
8105 injected_failure = true;
8106 }
8107
8108 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8109 dout(2) << "got incremental " << e
8110 << " but failed to encode full with correct crc; requesting"
8111 << dendl;
8112 clog->warn() << "failed to encode map e" << e << " with expected crc";
8113 dout(20) << "my encoded map was:\n";
8114 fbl.hexdump(*_dout);
8115 *_dout << dendl;
8116 delete o;
8117 request_full_map(e, last);
8118 last = e - 1;
8119
8120 // don't continue committing if we failed to enc the first inc map
8121 if (last < start) {
8122 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
8123 m->put();
8124 return;
8125 }
8126 break;
8127 }
8128 got_full_map(e);
8129 purged_snaps[e] = o->get_new_purged_snaps();
8130
8131 ghobject_t fulloid = get_osdmap_pobject_name(e);
8132 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
8133 added_maps[e] = add_map(o);
8134 added_maps_bl[e] = fbl;
8135 continue;
8136 }
8137
8138 ceph_abort_msg("MOSDMap lied about what maps it had?");
8139 }
8140
8141 // even if this map isn't from a mon, we may have satisfied our subscription
8142 monc->sub_got("osdmap", last);
8143
8144 if (!m->maps.empty() && requested_full_first) {
8145 dout(10) << __func__ << " still missing full maps " << requested_full_first
8146 << ".." << requested_full_last << dendl;
8147 rerequest_full_maps();
8148 }
8149
8150 if (superblock.oldest_map) {
8151 // make sure we at least keep pace with incoming maps
8152 trim_maps(m->oldest_map, last - first + 1, skip_maps);
8153 pg_num_history.prune(superblock.oldest_map);
8154 }
8155
8156 if (!superblock.oldest_map || skip_maps)
8157 superblock.oldest_map = first;
8158 superblock.newest_map = last;
8159 superblock.current_epoch = last;
8160
8161 // note in the superblock that we were clean thru the prior epoch
8162 epoch_t boot_epoch = service.get_boot_epoch();
8163 if (boot_epoch && boot_epoch >= superblock.mounted) {
8164 superblock.mounted = boot_epoch;
8165 superblock.clean_thru = last;
8166 }
8167
8168 // check for pg_num changes and deleted pools
8169 OSDMapRef lastmap;
8170 for (auto& i : added_maps) {
8171 if (!lastmap) {
8172 if (!(lastmap = service.try_get_map(i.first - 1))) {
8173 dout(10) << __func__ << " can't get previous map " << i.first - 1
8174 << " probably first start of this osd" << dendl;
8175 continue;
8176 }
8177 }
8178 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8179 for (auto& j : lastmap->get_pools()) {
8180 if (!i.second->have_pg_pool(j.first)) {
8181 pg_num_history.log_pool_delete(i.first, j.first);
8182 dout(10) << __func__ << " recording final pg_pool_t for pool "
8183 << j.first << dendl;
8184 // this information is needed by _make_pg() if we have to restart before
8185 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8186 ghobject_t obj = make_final_pool_info_oid(j.first);
8187 bufferlist bl;
8188 encode(j.second, bl, CEPH_FEATURES_ALL);
8189 string name = lastmap->get_pool_name(j.first);
8190 encode(name, bl);
8191 map<string,string> profile;
8192 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8193 profile = lastmap->get_erasure_code_profile(
8194 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8195 }
8196 encode(profile, bl);
8197 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
8198 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8199 new_pg_num != j.second.get_pg_num()) {
8200 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8201 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8202 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8203 }
8204 }
8205 for (auto& j : i.second->get_pools()) {
8206 if (!lastmap->have_pg_pool(j.first)) {
8207 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8208 << j.second.get_pg_num() << dendl;
8209 pg_num_history.log_pg_num_change(i.first, j.first,
8210 j.second.get_pg_num());
8211 }
8212 }
8213 lastmap = i.second;
8214 }
8215 pg_num_history.epoch = last;
8216 {
8217 bufferlist bl;
8218 ::encode(pg_num_history, bl);
8219 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8220 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8221 }
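// pg_num_history records pool pg_num changes keyed by epoch so that,
// as far as I can tell, startup code can still locate split/merge
// points after the corresponding osdmaps have been trimmed.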
8222
8223 // record new purged_snaps
8224 if (superblock.purged_snaps_last == start - 1) {
8225 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
8226 make_purged_snaps_oid(), &t,
8227 purged_snaps);
8228 superblock.purged_snaps_last = last;
8229 } else {
8230 dout(10) << __func__ << " superblock purged_snaps_last is "
8231 << superblock.purged_snaps_last
8232 << ", not recording new purged_snaps" << dendl;
8233 }
8234
8235 // superblock and commit
8236 write_superblock(t);
8237 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
8238 store->queue_transaction(
8239 service.meta_ch,
8240 std::move(t));
8241 service.publish_superblock(superblock);
8242 }
8243
8244 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8245 {
8246 dout(10) << __func__ << " " << first << ".." << last << dendl;
8247 if (is_stopping()) {
8248 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8249 return;
8250 }
8251 std::lock_guard l(osd_lock);
8252 if (is_stopping()) {
8253 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8254 return;
8255 }
8256 map_lock.lock();
8257
8258 ceph_assert(first <= last);
8259
8260 bool do_shutdown = false;
8261 bool do_restart = false;
8262 bool network_error = false;
8263 OSDMapRef osdmap = get_osdmap();
8264
8265 // advance through the new maps
8266 for (epoch_t cur = first; cur <= last; cur++) {
8267 dout(10) << " advance to epoch " << cur
8268 << " (<= last " << last
8269 << " <= newest_map " << superblock.newest_map
8270 << ")" << dendl;
8271
8272 OSDMapRef newmap = get_map(cur);
8273 ceph_assert(newmap); // we just cached it above!
8274
8275 // start blocklisting messages sent to peers that go down.
8276 service.pre_publish_map(newmap);
8277
8278 // kill connections to newly down osds
8279 bool waited_for_reservations = false;
8280 set<int> old;
8281 osdmap = get_osdmap();
8282 osdmap->get_all_osds(old);
8283 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8284 if (*p != whoami &&
8285 osdmap->is_up(*p) && // in old map
8286 newmap->is_down(*p)) { // but not the new one
8287 if (!waited_for_reservations) {
8288 service.await_reserved_maps();
8289 waited_for_reservations = true;
8290 }
8291 note_down_osd(*p);
8292 } else if (*p != whoami &&
8293 osdmap->is_down(*p) &&
8294 newmap->is_up(*p)) {
8295 note_up_osd(*p);
8296 }
8297 }
8298
8299 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
8300 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8301 << dendl;
8302 if (is_booting()) {
8303 // this captures the case where we sent the boot message while
8304 // NOUP was being set on the mon and our boot request was
8305 // dropped, and then later it is cleared. it imperfectly
8306 // handles the case where our original boot message was not
8307 // dropped and we restart even though we might have booted, but
8308 // that is harmless (boot will just take slightly longer).
8309 do_restart = true;
8310 }
8311 }
8312
8313 osdmap = std::move(newmap);
8314 set_osdmap(osdmap);
8315 epoch_t up_epoch;
8316 epoch_t boot_epoch;
8317 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8318 if (!up_epoch &&
8319 osdmap->is_up(whoami) &&
8320 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
8321 up_epoch = osdmap->get_epoch();
8322 dout(10) << "up_epoch is " << up_epoch << dendl;
8323 if (!boot_epoch) {
8324 boot_epoch = osdmap->get_epoch();
8325 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8326 }
8327 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8328 }
8329 }
8330
8331 epoch_t _bind_epoch = service.get_bind_epoch();
8332 if (osdmap->is_up(whoami) &&
8333 osdmap->get_addrs(whoami).legacy_equals(
8334 client_messenger->get_myaddrs()) &&
8335 _bind_epoch < osdmap->get_up_from(whoami)) {
8336
8337 if (is_booting()) {
8338 dout(1) << "state: booting -> active" << dendl;
8339 set_state(STATE_ACTIVE);
8340 do_restart = false;
8341
8342 // set incarnation so that osd_reqid_t's we generate for our
8343 // objecter requests are unique across restarts.
8344 service.objecter->set_client_incarnation(osdmap->get_epoch());
8345 cancel_pending_failures();
8346 }
8347 }
8348
8349 if (osdmap->get_epoch() > 0 &&
8350 is_active()) {
8351 if (!osdmap->exists(whoami)) {
8352 derr << "map says i do not exist. shutting down." << dendl;
8353 do_shutdown = true; // don't call shutdown() while we have
8354 // everything paused
8355 } else if (osdmap->is_stop(whoami)) {
8356 derr << "map says i am stopped by admin. shutting down." << dendl;
8357 do_shutdown = true;
8358 } else if (!osdmap->is_up(whoami) ||
8359 !osdmap->get_addrs(whoami).legacy_equals(
8360 client_messenger->get_myaddrs()) ||
8361 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8362 cluster_messenger->get_myaddrs()) ||
8363 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8364 hb_back_server_messenger->get_myaddrs()) ||
8365 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8366 hb_front_server_messenger->get_myaddrs())) {
8367 if (!osdmap->is_up(whoami)) {
8368 if (service.is_preparing_to_stop() || service.is_stopping()) {
8369 service.got_stop_ack();
8370 } else {
8371 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8372 "but it is still running";
8373 clog->debug() << "map e" << osdmap->get_epoch()
8374 << " wrongly marked me down at e"
8375 << osdmap->get_down_at(whoami);
8376 }
8377 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8378 // note that this is best-effort...
8379 monc->send_mon_message(
8380 new MOSDMarkMeDead(
8381 monc->get_fsid(),
8382 whoami,
8383 osdmap->get_epoch()));
8384 }
8385 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8386 client_messenger->get_myaddrs())) {
8387 clog->error() << "map e" << osdmap->get_epoch()
8388 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8389 << " != my " << client_messenger->get_myaddrs() << ")";
8390 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8391 cluster_messenger->get_myaddrs())) {
8392 clog->error() << "map e" << osdmap->get_epoch()
8393 << " had wrong cluster addr ("
8394 << osdmap->get_cluster_addrs(whoami)
8395 << " != my " << cluster_messenger->get_myaddrs() << ")";
8396 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8397 hb_back_server_messenger->get_myaddrs())) {
8398 clog->error() << "map e" << osdmap->get_epoch()
8399 << " had wrong heartbeat back addr ("
8400 << osdmap->get_hb_back_addrs(whoami)
8401 << " != my " << hb_back_server_messenger->get_myaddrs()
8402 << ")";
8403 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8404 hb_front_server_messenger->get_myaddrs())) {
8405 clog->error() << "map e" << osdmap->get_epoch()
8406 << " had wrong heartbeat front addr ("
8407 << osdmap->get_hb_front_addrs(whoami)
8408 << " != my " << hb_front_server_messenger->get_myaddrs()
8409 << ")";
8410 }
8411
8412 if (!service.is_stopping()) {
8413 epoch_t up_epoch = 0;
8414 epoch_t bind_epoch = osdmap->get_epoch();
8415 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8416 do_restart = true;
8417
8418 //add markdown log
8419 utime_t now = ceph_clock_now();
8420 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8421 osd_markdown_log.push_back(now);
8422 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8423 derr << __func__ << " marked down "
8424 << osd_markdown_log.size()
8425 << " > osd_max_markdown_count "
8426 << cct->_conf->osd_max_markdown_count
8427 << " in last " << grace << " seconds, shutting down"
8428 << dendl;
8429 do_restart = false;
8430 do_shutdown = true;
8431 }
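// flap guard: once the markdown log exceeds osd_max_markdown_count,
// the usual rebind-and-restart becomes a clean shutdown instead.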
8432
8433 start_waiting_for_healthy();
8434
8435 set<int> avoid_ports;
8436 #if defined(__FreeBSD__)
8437 // prevent FreeBSD from grabbing the client_messenger port during
8438 // rebinding, in which case the cluster_messenger would also connect
8439 // to the same port
8440 client_messenger->get_myaddrs().get_ports(&avoid_ports);
8441 #endif
8442 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
8443
8444 int r = cluster_messenger->rebind(avoid_ports);
8445 if (r != 0) {
8446 do_shutdown = true; // FIXME: do_restart?
8447 network_error = true;
8448 derr << __func__ << " marked down:"
8449 << " rebind cluster_messenger failed" << dendl;
8450 }
8451
8452 hb_back_server_messenger->mark_down_all();
8453 hb_front_server_messenger->mark_down_all();
8454 hb_front_client_messenger->mark_down_all();
8455 hb_back_client_messenger->mark_down_all();
8456
8457 reset_heartbeat_peers(true);
8458 }
8459 }
8460 }
8461
8462 map_lock.unlock();
8463
8464 check_osdmap_features();
8465
8466 // yay!
8467 consume_map();
8468
8469 if (is_active() || is_waiting_for_healthy())
8470 maybe_update_heartbeat_peers();
8471
8472 if (is_active()) {
8473 activate_map();
8474 }
8475
8476 if (do_shutdown) {
8477 if (network_error) {
8478 cancel_pending_failures();
8479 }
8480 // trigger shutdown in a different thread
8481 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8482 queue_async_signal(SIGINT);
8483 }
8484 else if (m->newest_map && m->newest_map > last) {
8485 dout(10) << " msg says newest map is " << m->newest_map
8486 << ", requesting more" << dendl;
8487 osdmap_subscribe(osdmap->get_epoch()+1, false);
8488 }
8489 else if (is_preboot()) {
8490 if (m->get_source().is_mon())
8491 _preboot(m->oldest_map, m->newest_map);
8492 else
8493 start_boot();
8494 }
8495 else if (do_restart)
8496 start_boot();
8497
8498 }
8499
8500 void OSD::check_osdmap_features()
8501 {
8502 // adjust required feature bits?
8503
8504 // we have to be a bit careful here, because we are accessing the
8505 // Policy structures without taking any lock. in particular, only
8506 // modify integer values that can safely be read by a racing CPU.
8507 // since we are only accessing existing Policy structures at their
8508 // current memory location, and setting or clearing bits in integer
8509 // fields, and we are the only writer, this is not a problem.
8510
8511 const auto osdmap = get_osdmap();
8512 {
8513 Messenger::Policy p = client_messenger->get_default_policy();
8514 uint64_t mask;
8515 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8516 if ((p.features_required & mask) != features) {
8517 dout(0) << "crush map has features " << features
8518 << ", adjusting msgr requires for clients" << dendl;
8519 p.features_required = (p.features_required & ~mask) | features;
8520 client_messenger->set_default_policy(p);
8521 }
8522 }
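// the update keeps bits outside the mask and replaces the masked ones:
// required' = (required & ~mask) | features
// e.g., mask 0x0f, features 0x05 over required 0xf3 yields 0xf5.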
8523 {
8524 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8525 uint64_t mask;
8526 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8527 if ((p.features_required & mask) != features) {
8528 dout(0) << "crush map has features " << features
8529 << " was " << p.features_required
8530 << ", adjusting msgr requires for mons" << dendl;
8531 p.features_required = (p.features_required & ~mask) | features;
8532 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8533 }
8534 }
8535 {
8536 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8537 uint64_t mask;
8538 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8539
8540 if ((p.features_required & mask) != features) {
8541 dout(0) << "crush map has features " << features
8542 << ", adjusting msgr requires for osds" << dendl;
8543 p.features_required = (p.features_required & ~mask) | features;
8544 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8545 }
8546
8547 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8548 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8549 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8550 ObjectStore::Transaction t;
8551 write_superblock(t);
8552 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8553 ceph_assert(err == 0);
8554 }
8555 }
8556
8557 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8558 hb_front_server_messenger->set_require_authorizer(false);
8559 hb_back_server_messenger->set_require_authorizer(false);
8560 } else {
8561 hb_front_server_messenger->set_require_authorizer(true);
8562 hb_back_server_messenger->set_require_authorizer(true);
8563 }
8564
8565 if (osdmap->require_osd_release != last_require_osd_release) {
8566 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8567 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8568 store->write_meta("require_osd_release",
8569 stringify((int)osdmap->require_osd_release));
8570 last_require_osd_release = osdmap->require_osd_release;
8571 }
8572 }
8573
8574 struct C_FinishSplits : public Context {
8575 OSD *osd;
8576 set<PGRef> pgs;
8577 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8578 : osd(osd), pgs(in) {}
8579 void finish(int r) override {
8580 osd->_finish_splits(pgs);
8581 }
8582 };
8583
8584 void OSD::_finish_splits(set<PGRef>& pgs)
8585 {
8586 dout(10) << __func__ << " " << pgs << dendl;
8587 if (is_stopping())
8588 return;
8589 for (set<PGRef>::iterator i = pgs.begin();
8590 i != pgs.end();
8591 ++i) {
8592 PG *pg = i->get();
8593
8594 PeeringCtx rctx = create_context();
8595 pg->lock();
8596 dout(10) << __func__ << " " << *pg << dendl;
8597 epoch_t e = pg->get_osdmap_epoch();
8598 pg->handle_initialize(rctx);
8599 pg->queue_null(e, e);
8600 dispatch_context(rctx, pg, service.get_osdmap());
8601 pg->unlock();
8602
8603 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8604 shards[shard_index]->register_and_wake_split_child(pg);
8605 }
8606 }
8607
8608 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8609 unsigned need)
8610 {
8611 std::lock_guard l(merge_lock);
8612 auto& p = merge_waiters[nextmap->get_epoch()][target];
8613 p[src->pg_id] = src;
8614 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8615 << " for " << target << ", have " << p.size() << "/" << need
8616 << dendl;
8617 return p.size() == need;
8618 }
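// add_merge_waiter is a rendezvous: each merge source parks itself under
// (epoch, target) and only the caller completing the set of `need`
// sources sees true, so exactly one NullEvt wakes the merge target.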
8619
8620 bool OSD::advance_pg(
8621 epoch_t osd_epoch,
8622 PG *pg,
8623 ThreadPool::TPHandle &handle,
8624 PeeringCtx &rctx)
8625 {
8626 if (osd_epoch <= pg->get_osdmap_epoch()) {
8627 return true;
8628 }
8629 ceph_assert(pg->is_locked());
8630 OSDMapRef lastmap = pg->get_osdmap();
8631 set<PGRef> new_pgs; // any split children
8632 bool ret = true;
8633
8634 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8635 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8636 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8637 next_epoch <= osd_epoch;
8638 ++next_epoch) {
8639 OSDMapRef nextmap = service.try_get_map(next_epoch);
8640 if (!nextmap) {
8641 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8642 continue;
8643 }
8644
8645 unsigned new_pg_num =
8646 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8647 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8648 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8649 // check for merge
8650 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8651 spg_t parent;
8652 if (pg->pg_id.is_merge_source(
8653 old_pg_num,
8654 new_pg_num,
8655 &parent)) {
8656 // we are merge source
8657 PGRef spg = pg; // carry a ref
8658 dout(1) << __func__ << " " << pg->pg_id
8659 << " is merge source, target is " << parent
8660 << dendl;
8661 pg->write_if_dirty(rctx);
8662 if (!new_pgs.empty()) {
8663 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8664 new_pgs));
8665 new_pgs.clear();
8666 }
8667 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8668 pg->ch->flush();
8669 // release backoffs explicitly, since the on_shutdown path
8670 // aggressively tears down backoff state.
8671 if (pg->is_primary()) {
8672 pg->release_pg_backoffs();
8673 }
8674 pg->on_shutdown();
8675 OSDShard *sdata = pg->osd_shard;
8676 {
8677 std::lock_guard l(sdata->shard_lock);
8678 if (pg->pg_slot) {
8679 sdata->_detach_pg(pg->pg_slot);
8680 // update pg count now since we might not get an osdmap
8681 // any time soon.
8682 if (pg->is_primary())
8683 logger->dec(l_osd_pg_primary);
8684 else if (pg->is_nonprimary())
8685 logger->dec(l_osd_pg_replica); // misnomer
8686 else
8687 logger->dec(l_osd_pg_stray);
8688 }
8689 }
8690 pg->unlock();
8691
8692 set<spg_t> children;
8693 parent.is_split(new_pg_num, old_pg_num, &children);
8694 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8695 enqueue_peering_evt(
8696 parent,
8697 PGPeeringEventRef(
8698 std::make_shared<PGPeeringEvent>(
8699 nextmap->get_epoch(),
8700 nextmap->get_epoch(),
8701 NullEvt())));
8702 }
8703 ret = false;
8704 goto out;
8705 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
8706 // we are merge target
8707 set<spg_t> children;
8708 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
8709 dout(20) << __func__ << " " << pg->pg_id
8710 << " is merge target, sources are " << children
8711 << dendl;
8712 map<spg_t,PGRef> sources;
8713 {
8714 std::lock_guard l(merge_lock);
8715 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
8716 unsigned need = children.size();
8717 dout(20) << __func__ << " have " << s.size() << "/"
8718 << need << dendl;
8719 if (s.size() == need) {
8720 sources.swap(s);
8721 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
8722 if (merge_waiters[nextmap->get_epoch()].empty()) {
8723 merge_waiters.erase(nextmap->get_epoch());
8724 }
8725 }
8726 }
8727 if (!sources.empty()) {
8728 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
8729 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
8730 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
8731 pg->merge_from(
8732 sources, rctx, split_bits,
8733 nextmap->get_pg_pool(
8734 pg->pg_id.pool())->last_pg_merge_meta);
8735 pg->pg_slot->waiting_for_merge_epoch = 0;
8736 } else {
8737 dout(20) << __func__ << " not ready to merge yet" << dendl;
8738 pg->write_if_dirty(rctx);
8739 if (!new_pgs.empty()) {
8740 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8741 new_pgs));
8742 new_pgs.clear();
8743 }
8744 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8745 pg->unlock();
8746 // kick source(s) to get them ready
8747 for (auto& i : children) {
8748 dout(20) << __func__ << " kicking source " << i << dendl;
8749 enqueue_peering_evt(
8750 i,
8751 PGPeeringEventRef(
8752 std::make_shared<PGPeeringEvent>(
8753 nextmap->get_epoch(),
8754 nextmap->get_epoch(),
8755 NullEvt())));
8756 }
8757 ret = false;
8758 goto out;
8759 }
8760 }
8761 }
8762 }
8763
8764 vector<int> newup, newacting;
8765 int up_primary, acting_primary;
8766 nextmap->pg_to_up_acting_osds(
8767 pg->pg_id.pgid,
8768 &newup, &up_primary,
8769 &newacting, &acting_primary);
8770 pg->handle_advance_map(
8771 nextmap, lastmap, newup, up_primary,
8772 newacting, acting_primary, rctx);
8773
8774 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
8775 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
8776 if (oldpool != lastmap->get_pools().end()
8777 && newpool != nextmap->get_pools().end()) {
8778 dout(20) << __func__
8779 << " new pool opts " << newpool->second.opts
8780 << " old pool opts " << oldpool->second.opts
8781 << dendl;
8782
8783 double old_min_interval = 0, new_min_interval = 0;
8784 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
8785 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
8786
8787 double old_max_interval = 0, new_max_interval = 0;
8788 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
8789 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
8790
8791 // Assume that if an interval changes from set to unset or vice versa the
8792 // actual config is different. Keep it simple even if it is possible to
8793 // call resched_all_scrubs() unnecessarily.
8794 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
8795 pg->on_info_history_change();
8796 }
8797 }
8798
8799 if (new_pg_num && old_pg_num != new_pg_num) {
8800 // check for split
8801 set<spg_t> children;
8802 if (pg->pg_id.is_split(
8803 old_pg_num,
8804 new_pg_num,
8805 &children)) {
8806 split_pgs(
8807 pg, children, &new_pgs, lastmap, nextmap,
8808 rctx);
8809 }
8810 }
8811
8812 lastmap = nextmap;
8813 old_pg_num = new_pg_num;
8814 handle.reset_tp_timeout();
8815 }
8816 pg->handle_activate_map(rctx);
8817
8818 ret = true;
8819 out:
8820 if (!new_pgs.empty()) {
8821 rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
8822 }
8823 return ret;
8824 }
8825
8826 void OSD::consume_map()
8827 {
8828 ceph_assert(ceph_mutex_is_locked(osd_lock));
8829 auto osdmap = get_osdmap();
8830 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8831
8832 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8833 * speak the older sorting version any more. Be careful not to force
8834 * a shutdown if we are merely processing old maps, though.
8835 */
8836 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8837 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8838 ceph_abort();
8839 }
8840
8841 service.pre_publish_map(osdmap);
8842 service.await_reserved_maps();
8843 service.publish_map(osdmap);
8844
8845 // prime splits and merges
8846 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
8847 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
8848 for (auto& shard : shards) {
8849 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
8850 }
8851 if (!newly_split.empty()) {
8852 for (auto& shard : shards) {
8853 shard->prime_splits(osdmap, &newly_split);
8854 }
8855 ceph_assert(newly_split.empty());
8856 }
8857
8858 // prune sent_ready_to_merge
8859 service.prune_sent_ready_to_merge(osdmap);
8860
8861 // FIXME, maybe: We could race against an incoming peering message
8862 // that instantiates a merge PG after identify_splits_and_merges() and
8863 // never set up its peer to complete the merge. An OSD restart
8864 // would clear it up. This is a hard race to resolve,
8865 // extraordinarily rare (we only merge PGs that are stable and
8866 // clean, so it'd have to be a PG imported to an OSD with a
8867 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8868 // replace all of this with seastar-based code soon anyway.
8869 if (!merge_pgs.empty()) {
8870 // mark the pgs we already have, or create new and empty merge
8871 // participants for those we are missing. do this all under the
8872 // shard lock so we don't have to worry about racing pg creates
8873 // via _process.
8874 for (auto& shard : shards) {
8875 shard->prime_merges(osdmap, &merge_pgs);
8876 }
8877 ceph_assert(merge_pgs.empty());
8878 }
8879
8880 service.prune_pg_created();
8881
8882 unsigned pushes_to_free = 0;
8883 for (auto& shard : shards) {
8884 shard->consume_map(osdmap, &pushes_to_free);
8885 }
8886
8887 vector<spg_t> pgids;
8888 _get_pgids(&pgids);
8889
8890 // count (FIXME, probably during seastar rewrite)
8891 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8892 vector<PGRef> pgs;
8893 _get_pgs(&pgs);
8894 for (auto& pg : pgs) {
8895 // FIXME (probably during seastar rewrite): this is lockless and
8896 // racy, but we don't want to take pg lock here.
8897 if (pg->is_primary())
8898 num_pg_primary++;
8899 else if (pg->is_nonprimary())
8900 num_pg_replica++; // misnomer
8901 else
8902 num_pg_stray++;
8903 }
8904
8905 {
8906 // FIXME (as part of seastar rewrite): move to OSDShard
8907 std::lock_guard l(pending_creates_lock);
8908 for (auto pg = pending_creates_from_osd.begin();
8909 pg != pending_creates_from_osd.end();) {
8910 if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
8911 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
8912 << "discarding pending_create_from_osd" << dendl;
8913 pg = pending_creates_from_osd.erase(pg);
8914 } else {
8915 ++pg;
8916 }
8917 }
8918 }
8919
8920 service.maybe_inject_dispatch_delay();
8921
8922 dispatch_sessions_waiting_on_map();
8923
8924 service.maybe_inject_dispatch_delay();
8925
8926 service.release_reserved_pushes(pushes_to_free);
8927
8928 // queue null events to push maps down to individual PGs
8929 for (auto pgid : pgids) {
8930 enqueue_peering_evt(
8931 pgid,
8932 PGPeeringEventRef(
8933 std::make_shared<PGPeeringEvent>(
8934 osdmap->get_epoch(),
8935 osdmap->get_epoch(),
8936 NullEvt())));
8937 }
8938 logger->set(l_osd_pg, pgids.size());
8939 logger->set(l_osd_pg_primary, num_pg_primary);
8940 logger->set(l_osd_pg_replica, num_pg_replica);
8941 logger->set(l_osd_pg_stray, num_pg_stray);
8942 }
8943
8944 void OSD::activate_map()
8945 {
8946 ceph_assert(ceph_mutex_is_locked(osd_lock));
8947 auto osdmap = get_osdmap();
8948
8949 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8950
8951 // norecover?
8952 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8953 if (!service.recovery_is_paused()) {
8954 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8955 service.pause_recovery();
8956 }
8957 } else {
8958 if (service.recovery_is_paused()) {
8959 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8960 service.unpause_recovery();
8961 }
8962 }
8963
8964 service.activate_map();
8965
8966 // process waiters
8967 take_waiters(waiting_for_osdmap);
8968 }
8969
8970 bool OSD::require_mon_peer(const Message *m)
8971 {
8972 if (!m->get_connection()->peer_is_mon()) {
8973 dout(0) << "require_mon_peer received from non-mon "
8974 << m->get_connection()->get_peer_addr()
8975 << " " << *m << dendl;
8976 return false;
8977 }
8978 return true;
8979 }
8980
8981 bool OSD::require_mon_or_mgr_peer(const Message *m)
8982 {
8983 if (!m->get_connection()->peer_is_mon() &&
8984 !m->get_connection()->peer_is_mgr()) {
8985 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8986 << m->get_connection()->get_peer_addr()
8987 << " " << *m << dendl;
8988 return false;
8989 }
8990 return true;
8991 }
8992
8993 bool OSD::require_osd_peer(const Message *m)
8994 {
8995 if (!m->get_connection()->peer_is_osd()) {
8996 dout(0) << "require_osd_peer received from non-osd "
8997 << m->get_connection()->get_peer_addr()
8998 << " " << *m << dendl;
8999 return false;
9000 }
9001 return true;
9002 }
9003
9004 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
9005 {
9006 epoch_t up_epoch = service.get_up_epoch();
9007 if (epoch < up_epoch) {
9008 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
9009 return false;
9010 }
9011
9012 if (!is_active()) {
9013 dout(7) << "still in boot state, dropping message " << *m << dendl;
9014 return false;
9015 }
9016
9017 return true;
9018 }
9019
9020 bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
9021 bool is_fast_dispatch)
9022 {
9023 int from = m->get_source().num();
9024
9025 if (map->is_down(from) ||
9026 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
9027 dout(5) << "from dead osd." << from << ", marking down, "
9028 << " msg was " << m->get_source_inst().addr
9029 << " expected "
9030 << (map->is_up(from) ?
9031 map->get_cluster_addrs(from) : entity_addrvec_t())
9032 << dendl;
9033 ConnectionRef con = m->get_connection();
9034 con->mark_down();
9035 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
9036 if (!is_fast_dispatch)
9037 s->session_dispatch_lock.lock();
9038 clear_session_waiting_on_map(s);
9039 con->set_priv(nullptr); // break ref <-> session cycle, if any
9040 s->con.reset();
9041 if (!is_fast_dispatch)
9042 s->session_dispatch_lock.unlock();
9043 }
9044 return false;
9045 }
9046 return true;
9047 }
9048
9049
9050 /*
9051 * require that we have the same (or a newer) map, and that the
9052 * sending peer instance is still valid.
9053 */
9054 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9055 bool is_fast_dispatch)
9056 {
9057 const Message *m = op->get_req();
9058 const auto osdmap = get_osdmap();
9059 dout(15) << "require_same_or_newer_map " << epoch
9060 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9061
9062 ceph_assert(ceph_mutex_is_locked(osd_lock));
9063
9064 // do they have a newer map?
9065 if (epoch > osdmap->get_epoch()) {
9066 dout(7) << "waiting for newer map epoch " << epoch
9067 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9068 wait_for_new_map(op);
9069 return false;
9070 }
9071
9072 if (!require_self_aliveness(op->get_req(), epoch)) {
9073 return false;
9074 }
9075
9076 // ok, our map is same or newer.. do they still exist?
9077 if (m->get_connection()->get_messenger() == cluster_messenger &&
9078 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9079 return false;
9080 }
9081
9082 return true;
9083 }
9084
9085
9086
9087
9088
9089 // ----------------------------------------
9090 // pg creation
9091
9092 void OSD::split_pgs(
9093 PG *parent,
9094 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
9095 OSDMapRef curmap,
9096 OSDMapRef nextmap,
9097 PeeringCtx &rctx)
9098 {
9099 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
9100 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
9101
9102 vector<object_stat_sum_t> updated_stats;
9103 parent->start_split_stats(childpgids, &updated_stats);
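  // updated_stats comes back with one entry per child plus a trailing
  // entry for the parent: the loop below consumes the child entries, and
  // the final one is applied to the parent after the loop.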
9104
9105 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
9106 for (set<spg_t>::const_iterator i = childpgids.begin();
9107 i != childpgids.end();
9108 ++i, ++stat_iter) {
9109 ceph_assert(stat_iter != updated_stats.end());
9110 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
9111 PG* child = _make_pg(nextmap, *i);
9112 child->lock(true);
9113 out_pgs->insert(child);
9114 child->ch = store->create_new_collection(child->coll);
9115
9116 {
9117 uint32_t shard_index = i->hash_to_shard(shards.size());
9118 ceph_assert(nullptr != shards[shard_index]);
9119 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
9120 }
9121
9122 unsigned split_bits = i->get_split_bits(pg_num);
9123 dout(10) << " pg_num is " << pg_num
9124 << ", m_seed " << i->ps()
9125 << ", split_bits is " << split_bits << dendl;
9126 parent->split_colls(
9127 *i,
9128 split_bits,
9129 i->ps(),
9130 &child->get_pool().info,
9131 rctx.transaction);
9132 parent->split_into(
9133 i->pgid,
9134 child,
9135 split_bits);
9136
9137 child->init_collection_pool_opts();
9138
9139 child->finish_split_stats(*stat_iter, rctx.transaction);
9140 child->unlock();
9141 }
9142 ceph_assert(stat_iter != updated_stats.end());
9143 parent->finish_split_stats(*stat_iter, rctx.transaction);
9144 }
9145
9146 /*
9147 * holding osd_lock
9148 */
9149 void OSD::handle_pg_create(OpRequestRef op)
9150 {
9151 // NOTE: this can be removed in P release (mimic is the last version to
9152 // send MOSDPGCreate messages).
9153
9154 auto m = op->get_req<MOSDPGCreate>();
9155 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
9156
9157 dout(10) << "handle_pg_create " << *m << dendl;
9158
9159 if (!require_mon_peer(op->get_req())) {
9160 return;
9161 }
9162
9163 if (!require_same_or_newer_map(op, m->epoch, false))
9164 return;
9165
9166 op->mark_started();
9167
9168 const auto osdmap = get_osdmap();
9169 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9170 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9171 p != m->mkpg.end();
9172 ++p, ++ci) {
9173 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
9174 epoch_t created = p->second.created;
9175 if (p->second.split_bits) // Skip split pgs
9176 continue;
9177 pg_t on = p->first;
9178
9179 if (!osdmap->have_pg_pool(on.pool())) {
9180 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9181 continue;
9182 }
9183
9184 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9185
9186 spg_t pgid;
9187 bool mapped = osdmap->get_primary_shard(on, &pgid);
9188 ceph_assert(mapped);
9189
9190 // is it still ours?
9191 vector<int> up, acting;
9192 int up_primary = -1;
9193 int acting_primary = -1;
9194 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9195 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
9196
9197 if (acting_primary != whoami) {
9198 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9199 << "), my role=" << role << ", skipping" << dendl;
9200 continue;
9201 }
9202
9203
9204 PastIntervals pi;
9205 pg_history_t history;
9206 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9207
9208 // The mon won't resend unless the primary changed, so we ignore
9209 // same_interval_since. We'll pass this history with the current
9210 // epoch as the event.
9211 if (history.same_primary_since > m->epoch) {
9212 dout(10) << __func__ << ": got obsolete pg create on pgid "
9213 << pgid << " from epoch " << m->epoch
9214 << ", primary changed in " << history.same_primary_since
9215 << dendl;
9216 continue;
9217 }
9218 enqueue_peering_evt(
9219 pgid,
9220 PGPeeringEventRef(
9221 std::make_shared<PGPeeringEvent>(
9222 osdmap->get_epoch(),
9223 osdmap->get_epoch(),
9224 NullEvt(),
9225 true,
9226 new PGCreateInfo(
9227 pgid,
9228 osdmap->get_epoch(),
9229 history,
9230 pi,
9231 true)
9232 )));
9233 }
9234
9235 {
9236 std::lock_guard l(pending_creates_lock);
9237 if (pending_creates_from_mon == 0) {
9238 last_pg_create_epoch = m->epoch;
9239 }
9240 }
9241
9242 maybe_update_heartbeat_peers();
9243 }
9244
9245
9246 // ----------------------------------------
9247 // peering and recovery
9248
9249 PeeringCtx OSD::create_context()
9250 {
9251 return PeeringCtx(get_osdmap()->require_osd_release);
9252 }
9253
9254 void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
9255 ThreadPool::TPHandle *handle)
9256 {
9257 if (!service.get_osdmap()->is_up(whoami)) {
9258 dout(20) << __func__ << " not up in osdmap" << dendl;
9259 } else if (!is_active()) {
9260 dout(20) << __func__ << " not active" << dendl;
9261 } else {
9262 for (auto& [osd, ls] : ctx.message_map) {
9263 if (!curmap->is_up(osd)) {
9264 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9265 continue;
9266 }
9267 ConnectionRef con = service.get_con_osd_cluster(
9268 osd, curmap->get_epoch());
9269 if (!con) {
9270 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9271 << dendl;
9272 continue;
9273 }
9274 service.maybe_share_map(con.get(), curmap);
9275 for (auto m : ls) {
9276 con->send_message2(m);
9277 }
9278 ls.clear();
9279 }
9280 }
9281 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
9282 int tr = store->queue_transaction(
9283 pg->ch,
9284 std::move(ctx.transaction), TrackedOpRef(),
9285 handle);
9286 ceph_assert(tr == 0);
9287 }
9288 }
9289
9290 void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
9291 {
9292 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9293 if (!require_mon_peer(m)) {
9294 m->put();
9295 return;
9296 }
9297 for (auto& p : m->pgs) {
9298 spg_t pgid = p.first;
9299 epoch_t created = p.second.first;
9300 utime_t created_stamp = p.second.second;
9301 auto q = m->pg_extra.find(pgid);
9302 if (q == m->pg_extra.end()) {
9303 dout(20) << __func__ << " " << pgid << " e" << created
9304 << "@" << created_stamp
9305 << " (no history or past_intervals)" << dendl;
9306 // pre-octopus ... no pg history. this can be removed in Q release.
9307 enqueue_peering_evt(
9308 pgid,
9309 PGPeeringEventRef(
9310 std::make_shared<PGPeeringEvent>(
9311 m->epoch,
9312 m->epoch,
9313 NullEvt(),
9314 true,
9315 new PGCreateInfo(
9316 pgid,
9317 created,
9318 pg_history_t(created, created_stamp),
9319 PastIntervals(),
9320 true)
9321 )));
9322 } else {
9323 dout(20) << __func__ << " " << pgid << " e" << created
9324 << "@" << created_stamp
9325 << " history " << q->second.first
9326 << " pi " << q->second.second << dendl;
9327 if (!q->second.second.empty() &&
9328 m->epoch < q->second.second.get_bounds().second) {
9329 clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
9330 << " and unmatched past_intervals " << q->second.second
9331 << " (history " << q->second.first << ")";
9332 } else {
9333 enqueue_peering_evt(
9334 pgid,
9335 PGPeeringEventRef(
9336 std::make_shared<PGPeeringEvent>(
9337 m->epoch,
9338 m->epoch,
9339 NullEvt(),
9340 true,
9341 new PGCreateInfo(
9342 pgid,
9343 m->epoch,
9344 q->second.first,
9345 q->second.second,
9346 true)
9347 )));
9348 }
9349 }
9350 }
9351
9352 {
9353 std::lock_guard l(pending_creates_lock);
9354 if (pending_creates_from_mon == 0) {
9355 last_pg_create_epoch = m->epoch;
9356 }
9357 }
9358
9359 m->put();
9360 }
9361
9362 void OSD::handle_fast_pg_query(MOSDPGQuery *m)
9363 {
9364 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9365 if (!require_osd_peer(m)) {
9366 m->put();
9367 return;
9368 }
9369 int from = m->get_source().num();
9370 for (auto& p : m->pg_list) {
9371 enqueue_peering_evt(
9372 p.first,
9373 PGPeeringEventRef(
9374 std::make_shared<PGPeeringEvent>(
9375 p.second.epoch_sent, p.second.epoch_sent,
9376 MQuery(
9377 p.first,
9378 pg_shard_t(from, p.second.from),
9379 p.second,
9380 p.second.epoch_sent),
9381 false))
9382 );
9383 }
9384 m->put();
9385 }
9386
9387 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9388 {
9389 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9390 if (!require_osd_peer(m)) {
9391 m->put();
9392 return;
9393 }
9394 int from = m->get_source().num();
9395 for (auto& p : m->get_pg_list()) {
9396 spg_t pgid(p.info.pgid.pgid, p.to);
9397 enqueue_peering_evt(
9398 pgid,
9399 PGPeeringEventRef(
9400 std::make_shared<PGPeeringEvent>(
9401 p.epoch_sent,
9402 p.query_epoch,
9403 MNotifyRec(
9404 pgid, pg_shard_t(from, p.from),
9405 p,
9406 m->get_connection()->get_features()),
9407 true,
9408 new PGCreateInfo(
9409 pgid,
9410 p.query_epoch,
9411 p.info.history,
9412 p.past_intervals,
9413 false)
9414 )));
9415 }
9416 m->put();
9417 }
9418
9419 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9420 {
9421 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9422 if (!require_osd_peer(m)) {
9423 m->put();
9424 return;
9425 }
9426 int from = m->get_source().num();
9427 for (auto& p : m->pg_list) {
9428 enqueue_peering_evt(
9429 spg_t(p.info.pgid.pgid, p.to),
9430 PGPeeringEventRef(
9431 std::make_shared<PGPeeringEvent>(
9432 p.epoch_sent, p.query_epoch,
9433 MInfoRec(
9434 pg_shard_t(from, p.from),
9435 p.info,
9436 p.epoch_sent)))
9437 );
9438 }
9439 m->put();
9440 }
9441
9442 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9443 {
9444 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9445 if (!require_osd_peer(m)) {
9446 m->put();
9447 return;
9448 }
9449 for (auto& pgid : m->pg_list) {
9450 enqueue_peering_evt(
9451 pgid,
9452 PGPeeringEventRef(
9453 std::make_shared<PGPeeringEvent>(
9454 m->get_epoch(), m->get_epoch(),
9455 PeeringState::DeleteStart())));
9456 }
9457 m->put();
9458 }
9459
9460 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9461 {
9462 dout(10) << __func__ << " " << *m << dendl;
9463 if (!require_mon_or_mgr_peer(m)) {
9464 m->put();
9465 return;
9466 }
9467 epoch_t epoch = get_osdmap_epoch();
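  // m->options encodes a small matrix: OFR_BACKFILL vs OFR_RECOVERY picks
  // which force flag is targeted, and OFR_CANCEL picks whether that flag
  // is set or unset, as the four branches below spell out.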
9468 for (auto pgid : m->forced_pgs) {
9469 if (m->options & OFR_BACKFILL) {
9470 if (m->options & OFR_CANCEL) {
9471 enqueue_peering_evt(
9472 pgid,
9473 PGPeeringEventRef(
9474 std::make_shared<PGPeeringEvent>(
9475 epoch, epoch,
9476 PeeringState::UnsetForceBackfill())));
9477 } else {
9478 enqueue_peering_evt(
9479 pgid,
9480 PGPeeringEventRef(
9481 std::make_shared<PGPeeringEvent>(
9482 epoch, epoch,
9483 PeeringState::SetForceBackfill())));
9484 }
9485 } else if (m->options & OFR_RECOVERY) {
9486 if (m->options & OFR_CANCEL) {
9487 enqueue_peering_evt(
9488 pgid,
9489 PGPeeringEventRef(
9490 std::make_shared<PGPeeringEvent>(
9491 epoch, epoch,
9492 PeeringState::UnsetForceRecovery())));
9493 } else {
9494 enqueue_peering_evt(
9495 pgid,
9496 PGPeeringEventRef(
9497 std::make_shared<PGPeeringEvent>(
9498 epoch, epoch,
9499 PeeringState::SetForceRecovery())));
9500 }
9501 }
9502 }
9503 m->put();
9504 }
9505
9506 void OSD::handle_pg_query_nopg(const MQuery& q)
9507 {
9508 spg_t pgid = q.pgid;
9509 dout(10) << __func__ << " " << pgid << dendl;
9510
9511 OSDMapRef osdmap = get_osdmap();
9512 if (!osdmap->have_pg_pool(pgid.pool()))
9513 return;
9514
9515 dout(10) << " pg " << pgid << " dne" << dendl;
9516 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9517 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9518 if (con) {
9519 Message *m;
9520 if (q.query.type == pg_query_t::LOG ||
9521 q.query.type == pg_query_t::FULLLOG) {
9522 m = new MOSDPGLog(
9523 q.query.from, q.query.to,
9524 osdmap->get_epoch(), empty,
9525 q.query.epoch_sent);
9526 } else {
9527 vector<pg_notify_t> ls;
9528 ls.push_back(
9529 pg_notify_t(
9530 q.query.from, q.query.to,
9531 q.query.epoch_sent,
9532 osdmap->get_epoch(),
9533 empty,
9534 PastIntervals()));
9535 m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
9536 }
9537 service.maybe_share_map(con.get(), osdmap);
9538 con->send_message(m);
9539 }
9540 }
9541
9542 void OSDService::queue_check_readable(spg_t spgid,
9543 epoch_t lpr,
9544 ceph::signedspan delay)
9545 {
9546 if (delay == ceph::signedspan::zero()) {
9547 osd->enqueue_peering_evt(
9548 spgid,
9549 PGPeeringEventRef(
9550 std::make_shared<PGPeeringEvent>(
9551 lpr, lpr,
9552 PeeringState::CheckReadable())));
9553 } else {
9554 mono_timer.add_event(
9555 delay,
9556 [this, spgid, lpr]() {
9557 queue_check_readable(spgid, lpr);
9558 });
9559 }
9560 }
9561
9562
9563 // =========================================================
9564 // RECOVERY
9565
9566 void OSDService::_maybe_queue_recovery() {
9567 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
9568 uint64_t available_pushes;
9569 while (!awaiting_throttle.empty() &&
9570 _recover_now(&available_pushes)) {
9571 uint64_t to_start = std::min(
9572 available_pushes,
9573 cct->_conf->osd_recovery_max_single_start);
9574 _queue_for_recovery(awaiting_throttle.front(), to_start);
9575 awaiting_throttle.pop_front();
9576 dout(10) << __func__ << " starting " << to_start
9577 << ", recovery_ops_reserved " << recovery_ops_reserved
9578 << " -> " << (recovery_ops_reserved + to_start) << dendl;
9579 recovery_ops_reserved += to_start;
9580 }
9581 }
9582
9583 bool OSDService::_recover_now(uint64_t *available_pushes)
9584 {
9585 if (available_pushes)
9586 *available_pushes = 0;
9587
9588 if (ceph_clock_now() < defer_recovery_until) {
9589 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9590 return false;
9591 }
9592
9593 if (recovery_paused) {
9594 dout(15) << __func__ << " paused" << dendl;
9595 return false;
9596 }
9597
9598 uint64_t max = osd->get_recovery_max_active();
9599 if (max <= recovery_ops_active + recovery_ops_reserved) {
9600 dout(15) << __func__ << " active " << recovery_ops_active
9601 << " + reserved " << recovery_ops_reserved
9602 << " >= max " << max << dendl;
9603 return false;
9604 }
9605
9606 if (available_pushes)
9607 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
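  // Worked example (illustrative numbers): with a max of 3, one active op
  // and one reserved op, available_pushes comes out as 3 - 1 - 1 = 1, so a
  // single additional push may start.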
9608
9609 return true;
9610 }
9611
9612 unsigned OSDService::get_target_pg_log_entries() const
9613 {
9614 auto num_pgs = osd->get_num_pgs();
9615 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9616 if (num_pgs > 0 && target > 0) {
9617 // target an even spread of our budgeted log entries across all
9618 // PGs. note that while we only get to control the entry count
9619 // for primary PGs, we'll normally be responsible for a mix of
9620 // primary and replica PGs (for the same pool(s) even), so this
9621 // will work out.
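    // worked example (illustrative numbers): with
    // osd_target_pg_log_entries_per_osd = 300000 and 100 PGs on this OSD,
    // the per-PG budget is 300000 / 100 = 3000 entries, which is then
    // clamped into [osd_min_pg_log_entries, osd_max_pg_log_entries].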
9622 return std::max<unsigned>(
9623 std::min<unsigned>(target / num_pgs,
9624 cct->_conf->osd_max_pg_log_entries),
9625 cct->_conf->osd_min_pg_log_entries);
9626 } else {
9627 // fall back to a per-pg value.
9628 return cct->_conf->osd_min_pg_log_entries;
9629 }
9630 }
9631
9632 void OSD::do_recovery(
9633 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9634 ThreadPool::TPHandle &handle)
9635 {
9636 uint64_t started = 0;
9637
9638 /*
9639 * When the value of osd_recovery_sleep is set greater than zero, recovery
9640 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9641 * recovery event's schedule time. This is done by adding a
9642 * recovery_requeue_callback event, which re-queues the recovery op using
9643 * queue_recovery_after_sleep.
9644 */
9645 float recovery_sleep = get_osd_recovery_sleep();
9646 {
9647 std::lock_guard l(service.sleep_lock);
9648 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9649 PGRef pgref(pg);
9650 auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
9651 dout(20) << "do_recovery wake up at "
9652 << ceph_clock_now()
9653 << ", re-queuing recovery" << dendl;
9654 std::lock_guard l(service.sleep_lock);
9655 service.recovery_needs_sleep = false;
9656 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9657 });
9658
9659 // The comparison below is true for the first recovery op, and whenever
9660 // the previous recovery op was scheduled in the past; in either case the
9661 // next recovery op is scheduled one sleep interval from now.
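    // Illustrative timing, assuming osd_recovery_sleep = 0.1s: if the last
    // event was scheduled in the past, the next one fires at now + 0.1s;
    // if requeues outpace the sleep, successive events are spaced 0.1s
    // apart from the previous schedule time.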
9662
9663 if (auto now = ceph::real_clock::now();
9664 service.recovery_schedule_time < now) {
9665 service.recovery_schedule_time = now;
9666 }
9667 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
9668 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9669 recovery_requeue_callback);
9670 dout(20) << "Recovery event scheduled at "
9671 << service.recovery_schedule_time << dendl;
9672 return;
9673 }
9674 }
9675
9676 {
9677 {
9678 std::lock_guard l(service.sleep_lock);
9679 service.recovery_needs_sleep = true;
9680 }
9681
9682 if (pg->pg_has_reset_since(queued)) {
9683 goto out;
9684 }
9685
9686 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9687 #ifdef DEBUG_RECOVERY_OIDS
9688 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
9689 #endif
9690
9691 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
9692 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9693 << " on " << *pg << dendl;
9694
9695 if (do_unfound) {
9696 PeeringCtx rctx = create_context();
9697 rctx.handle = &handle;
9698 pg->find_unfound(queued, rctx);
9699 dispatch_context(rctx, pg, pg->get_osdmap());
9700 }
9701 }
9702
9703 out:
9704 ceph_assert(started <= reserved_pushes);
9705 service.release_reserved_pushes(reserved_pushes);
9706 }
9707
9708 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9709 {
9710 std::lock_guard l(recovery_lock);
9711 dout(10) << "start_recovery_op " << *pg << " " << soid
9712 << " (" << recovery_ops_active << "/"
9713 << osd->get_recovery_max_active() << " rops)"
9714 << dendl;
9715 recovery_ops_active++;
9716
9717 #ifdef DEBUG_RECOVERY_OIDS
9718 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9719 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9720 recovery_oids[pg->pg_id].insert(soid);
9721 #endif
9722 }
9723
9724 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9725 {
9726 std::lock_guard l(recovery_lock);
9727 dout(10) << "finish_recovery_op " << *pg << " " << soid
9728 << " dequeue=" << dequeue
9729 << " (" << recovery_ops_active << "/"
9730 << osd->get_recovery_max_active() << " rops)"
9731 << dendl;
9732
9733 // adjust count
9734 ceph_assert(recovery_ops_active > 0);
9735 recovery_ops_active--;
9736
9737 #ifdef DEBUG_RECOVERY_OIDS
9738 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9739 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9740 recovery_oids[pg->pg_id].erase(soid);
9741 #endif
9742
9743 _maybe_queue_recovery();
9744 }
9745
9746 bool OSDService::is_recovery_active()
9747 {
9748 if (cct->_conf->osd_debug_pretend_recovery_active) {
9749 return true;
9750 }
9751 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9752 }
9753
9754 void OSDService::release_reserved_pushes(uint64_t pushes)
9755 {
9756 std::lock_guard l(recovery_lock);
9757 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9758 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9759 << dendl;
9760 ceph_assert(recovery_ops_reserved >= pushes);
9761 recovery_ops_reserved -= pushes;
9762 _maybe_queue_recovery();
9763 }
9764
9765 // =========================================================
9766 // OPS
9767
9768 bool OSD::op_is_discardable(const MOSDOp *op)
9769 {
9770 // drop the client request if the client is no longer connected and
9771 // can't get the reply anyway.
9772 if (!op->get_connection()->is_connected()) {
9773 return true;
9774 }
9775 return false;
9776 }
9777
9778 void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
9779 {
9780 const utime_t stamp = op->get_req()->get_recv_stamp();
9781 const utime_t latency = ceph_clock_now() - stamp;
9782 const unsigned priority = op->get_req()->get_priority();
9783 const int cost = op->get_req()->get_cost();
9784 const uint64_t owner = op->get_req()->get_source().num();
9785 const int type = op->get_req()->get_type();
9786
9787 dout(15) << "enqueue_op " << op << " prio " << priority
9788 << " type " << type
9789 << " cost " << cost
9790 << " latency " << latency
9791 << " epoch " << epoch
9792 << " " << *(op->get_req()) << dendl;
9793 op->osd_trace.event("enqueue op");
9794 op->osd_trace.keyval("priority", priority);
9795 op->osd_trace.keyval("cost", cost);
9796 #ifdef HAVE_JAEGER
9797 if (op->osd_parent_span) {
9798 auto enqueue_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
9799 enqueue_span->Log({
9800 {"priority", priority},
9801 {"cost", cost},
9802 {"epoch", epoch},
9803 {"owner", owner},
9804 {"type", type}
9805 });
9806 }
9807 #endif
9808 op->mark_queued_for_pg();
9809 logger->tinc(l_osd_op_before_queue_op_lat, latency);
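  // Recovery pushes are wrapped in PGRecoveryMsg so the op scheduler can
  // treat them as background recovery work rather than client ops;
  // everything else is queued as a regular PGOpItem.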
9810 if (type == MSG_OSD_PG_PUSH ||
9811 type == MSG_OSD_PG_PUSH_REPLY) {
9812 op_shardedwq.queue(
9813 OpSchedulerItem(
9814 unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
9815 cost, priority, stamp, owner, epoch));
9816 } else {
9817 op_shardedwq.queue(
9818 OpSchedulerItem(
9819 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9820 cost, priority, stamp, owner, epoch));
9821 }
9822 }
9823
9824 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9825 {
9826 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9827 op_shardedwq.queue(
9828 OpSchedulerItem(
9829 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9830 10,
9831 cct->_conf->osd_peering_op_priority,
9832 utime_t(),
9833 0,
9834 evt->get_epoch_sent()));
9835 }
9836
9837 /*
9838 * NOTE: dequeue called in worker thread, with pg lock
9839 */
9840 void OSD::dequeue_op(
9841 PGRef pg, OpRequestRef op,
9842 ThreadPool::TPHandle &handle)
9843 {
9844 const Message *m = op->get_req();
9845
9846 FUNCTRACE(cct);
9847 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
9848
9849 utime_t now = ceph_clock_now();
9850 op->set_dequeued_time(now);
9851
9852 utime_t latency = now - m->get_recv_stamp();
9853 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9854 << " cost " << m->get_cost()
9855 << " latency " << latency
9856 << " " << *m
9857 << " pg " << *pg << dendl;
9858
9859 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9860
9861 service.maybe_share_map(m->get_connection().get(),
9862 pg->get_osdmap(),
9863 op->sent_epoch);
9864
9865 if (pg->is_deleting())
9866 return;
9867
9868 op->mark_reached_pg();
9869 op->osd_trace.event("dequeue_op");
9870
9871 pg->do_request(op, handle);
9872
9873 // finish
9874 dout(10) << "dequeue_op " << op << " finish" << dendl;
9875 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
9876 }
9877
9878
9879 void OSD::dequeue_peering_evt(
9880 OSDShard *sdata,
9881 PG *pg,
9882 PGPeeringEventRef evt,
9883 ThreadPool::TPHandle& handle)
9884 {
9885 PeeringCtx rctx = create_context();
9886 auto curmap = sdata->get_osdmap();
9887 bool need_up_thru = false;
9888 epoch_t same_interval_since = 0;
9889 if (!pg) {
9890 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9891 handle_pg_query_nopg(*q);
9892 } else {
9893 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9894 ceph_abort();
9895 }
9896 } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9897 pg->do_peering_event(evt, rctx);
9898 if (pg->is_deleted()) {
9899 pg->unlock();
9900 return;
9901 }
9902 dispatch_context(rctx, pg, curmap, &handle);
9903 need_up_thru = pg->get_need_up_thru();
9904 same_interval_since = pg->get_same_interval_since();
9905 pg->unlock();
9906 }
9907
9908 if (need_up_thru) {
9909 queue_want_up_thru(same_interval_since);
9910 }
9911
9912 service.send_pg_temp();
9913 }
9914
9915 void OSD::dequeue_delete(
9916 OSDShard *sdata,
9917 PG *pg,
9918 epoch_t e,
9919 ThreadPool::TPHandle& handle)
9920 {
9921 dequeue_peering_evt(
9922 sdata,
9923 pg,
9924 PGPeeringEventRef(
9925 std::make_shared<PGPeeringEvent>(
9926 e, e,
9927 PeeringState::DeleteSome())),
9928 handle);
9929 }
9930
9931
9932
9933 // --------------------------------
9934
9935 const char** OSD::get_tracked_conf_keys() const
9936 {
9937 static const char* KEYS[] = {
9938 "osd_max_backfills",
9939 "osd_min_recovery_priority",
9940 "osd_max_trimming_pgs",
9941 "osd_op_complaint_time",
9942 "osd_op_log_threshold",
9943 "osd_op_history_size",
9944 "osd_op_history_duration",
9945 "osd_op_history_slow_op_size",
9946 "osd_op_history_slow_op_threshold",
9947 "osd_enable_op_tracker",
9948 "osd_map_cache_size",
9949 "osd_pg_epoch_max_lag_factor",
9950 "osd_pg_epoch_persisted_max_stale",
9951 "osd_recovery_sleep",
9952 "osd_recovery_sleep_hdd",
9953 "osd_recovery_sleep_ssd",
9954 "osd_recovery_sleep_hybrid",
9955 "osd_delete_sleep",
9956 "osd_delete_sleep_hdd",
9957 "osd_delete_sleep_ssd",
9958 "osd_delete_sleep_hybrid",
9959 "osd_snap_trim_sleep",
9960 "osd_snap_trim_sleep_hdd",
9961 "osd_snap_trim_sleep_ssd",
9962 "osd_snap_trim_sleep_hybrid"
9963 "osd_scrub_sleep",
9964 "osd_recovery_max_active",
9965 "osd_recovery_max_active_hdd",
9966 "osd_recovery_max_active_ssd",
9967 // clog & admin clog
9968 "clog_to_monitors",
9969 "clog_to_syslog",
9970 "clog_to_syslog_facility",
9971 "clog_to_syslog_level",
9972 "osd_objectstore_fuse",
9973 "clog_to_graylog",
9974 "clog_to_graylog_host",
9975 "clog_to_graylog_port",
9976 "host",
9977 "fsid",
9978 "osd_recovery_delay_start",
9979 "osd_client_message_size_cap",
9980 "osd_client_message_cap",
9981 "osd_heartbeat_min_size",
9982 "osd_heartbeat_interval",
9983 "osd_object_clean_region_max_num_intervals",
9984 "osd_scrub_min_interval",
9985 "osd_scrub_max_interval",
9986 NULL
9987 };
9988 return KEYS;
9989 }
9990
9991 void OSD::handle_conf_change(const ConfigProxy& conf,
9992 const std::set <std::string> &changed)
9993 {
9994 std::lock_guard l{osd_lock};
9995
9996 if (changed.count("osd_max_backfills") ||
9997 changed.count("osd_delete_sleep") ||
9998 changed.count("osd_delete_sleep_hdd") ||
9999 changed.count("osd_delete_sleep_ssd") ||
10000 changed.count("osd_delete_sleep_hybrid") ||
10001 changed.count("osd_snap_trim_sleep") ||
10002 changed.count("osd_snap_trim_sleep_hdd") ||
10003 changed.count("osd_snap_trim_sleep_ssd") ||
10004 changed.count("osd_snap_trim_sleep_hybrid") ||
10005 changed.count("osd_scrub_sleep") ||
10006 changed.count("osd_recovery_sleep") ||
10007 changed.count("osd_recovery_sleep_hdd") ||
10008 changed.count("osd_recovery_sleep_ssd") ||
10009 changed.count("osd_recovery_sleep_hybrid") ||
10010 changed.count("osd_recovery_max_active") ||
10011 changed.count("osd_recovery_max_active_hdd") ||
10012 changed.count("osd_recovery_max_active_ssd")) {
10013 if (!maybe_override_options_for_qos() &&
10014 changed.count("osd_max_backfills")) {
10015 // Scheduler is not "mclock". Fall back to the earlier behavior.
10016 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
10017 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
10018 }
10019 }
10020 if (changed.count("osd_min_recovery_priority")) {
10021 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
10022 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
10023 }
10024 if (changed.count("osd_max_trimming_pgs")) {
10025 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
10026 }
10027 if (changed.count("osd_op_complaint_time") ||
10028 changed.count("osd_op_log_threshold")) {
10029 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
10030 cct->_conf->osd_op_log_threshold);
10031 }
10032 if (changed.count("osd_op_history_size") ||
10033 changed.count("osd_op_history_duration")) {
10034 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
10035 cct->_conf->osd_op_history_duration);
10036 }
10037 if (changed.count("osd_op_history_slow_op_size") ||
10038 changed.count("osd_op_history_slow_op_threshold")) {
10039 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
10040 cct->_conf->osd_op_history_slow_op_threshold);
10041 }
10042 if (changed.count("osd_enable_op_tracker")) {
10043 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
10044 }
10045 if (changed.count("osd_map_cache_size")) {
10046 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
10047 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
10048 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
10049 }
10050 if (changed.count("clog_to_monitors") ||
10051 changed.count("clog_to_syslog") ||
10052 changed.count("clog_to_syslog_level") ||
10053 changed.count("clog_to_syslog_facility") ||
10054 changed.count("clog_to_graylog") ||
10055 changed.count("clog_to_graylog_host") ||
10056 changed.count("clog_to_graylog_port") ||
10057 changed.count("host") ||
10058 changed.count("fsid")) {
10059 update_log_config();
10060 }
10061 if (changed.count("osd_pg_epoch_max_lag_factor")) {
10062 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
10063 "osd_pg_epoch_max_lag_factor");
10064 }
10065
10066 #ifdef HAVE_LIBFUSE
10067 if (changed.count("osd_objectstore_fuse")) {
10068 if (store) {
10069 enable_disable_fuse(false);
10070 }
10071 }
10072 #endif
10073
10074 if (changed.count("osd_recovery_delay_start")) {
10075 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
10076 service.kick_recovery_queue();
10077 }
10078
10079 if (changed.count("osd_client_message_cap")) {
10080 uint64_t newval = cct->_conf->osd_client_message_cap;
10081 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10082 if (pol.throttler_messages && newval > 0) {
10083 pol.throttler_messages->reset_max(newval);
10084 }
10085 }
10086 if (changed.count("osd_client_message_size_cap")) {
10087 uint64_t newval = cct->_conf->osd_client_message_size_cap;
10088 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10089 if (pol.throttler_bytes && newval > 0) {
10090 pol.throttler_bytes->reset_max(newval);
10091 }
10092 }
10093 if (changed.count("osd_object_clean_region_max_num_intervals")) {
10094 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
10095 }
10096
10097 if (changed.count("osd_scrub_min_interval") ||
10098 changed.count("osd_scrub_max_interval")) {
10099 resched_all_scrubs();
10100 dout(0) << __func__ << ": scrub interval change" << dendl;
10101 }
10102 check_config();
10103 if (changed.count("osd_asio_thread_count")) {
10104 service.poolctx.stop();
10105 service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
10106 }
10107 }
10108
10109 void OSD::maybe_override_max_osd_capacity_for_qos()
10110 {
10111 // If the scheduler enabled is mclock, override the default
10112 // osd capacity with the value obtained from running the
10113 // osd bench test. This is later used to setup mclock.
10114 if ((cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") &&
10115 (cct->_conf.get_val<bool>("osd_mclock_skip_benchmark") == false)) {
10116 std::string max_capacity_iops_config;
10117 bool force_run_benchmark =
10118 cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
10119
10120 if (store_is_rotational) {
10121 max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd";
10122 } else {
10123 max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd";
10124 }
10125
10126 if (!force_run_benchmark) {
10127 double default_iops = 0.0;
10128
10129 // Get the current osd iops capacity
10130 double cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
10131
10132 // Get the default max iops capacity
10133 auto val = cct->_conf.get_val_default(max_capacity_iops_config);
10134 if (!val.has_value()) {
10135 derr << __func__ << " Unable to determine default value of "
10136 << max_capacity_iops_config << dendl;
10137 // Cannot determine default iops. Force a run of the OSD benchmark.
10138 force_run_benchmark = true;
10139 } else {
10140 // Default iops
10141 default_iops = std::stod(val.value());
10142 }
10143
10144 // If the admin overrode the capacity (the current value differs from the default), trust it and skip the benchmark
10145 if (!force_run_benchmark && (default_iops != cur_iops)) {
10146 dout(1) << __func__ << std::fixed << std::setprecision(2)
10147 << " default_iops: " << default_iops
10148 << " cur_iops: " << cur_iops
10149 << ". Skip OSD benchmark test." << dendl;
10150 return;
10151 }
10152 }
10153
10154 // Run osd bench: write ~12 MB (3000 x 4 KiB blocks) spread across 100 4 MiB objects
10155 int64_t count = 12288000; // Count of bytes to write
10156 int64_t bsize = 4096; // Block size
10157 int64_t osize = 4194304; // Object size
10158 int64_t onum = 100; // Count of objects to write
10159 double elapsed = 0.0; // Time taken to complete the test
10160 double iops = 0.0;
10161 stringstream ss;
10162 int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
10163 if (ret != 0) {
10164 derr << __func__
10165 << " osd bench err: " << ret
10166 << " osd bench errstr: " << ss.str()
10167 << dendl;
10168 return;
10169 }
10170
10171 double rate = count / elapsed;
10172 iops = rate / bsize;
10173 dout(1) << __func__
10174 << " osd bench result -"
10175 << std::fixed << std::setprecision(3)
10176 << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
10177 << " iops: " << iops
10178 << " elapsed_sec: " << elapsed
10179 << dendl;
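    // Units sanity check (illustrative numbers): if count = 12288000 bytes
    // complete in elapsed = 2.0 sec, then rate = 6144000 bytes/sec
    // (~5.86 MiB/sec) and iops = rate / bsize = 6144000 / 4096 = 1500.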
10180
10181 // Persist iops to the MON store
10182 ret = mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
10183 if (ret < 0) {
10184 // Fallback to setting the config within the in-memory "values" map.
10185 cct->_conf.set_val(max_capacity_iops_config, std::to_string(iops));
10186 }
10187
10188 // Override the max osd capacity for all shards
10189 for (auto& shard : shards) {
10190 shard->update_scheduler_config();
10191 }
10192 }
10193 }
10194
10195 bool OSD::maybe_override_options_for_qos()
10196 {
10197 // If the scheduler enabled is mclock, override the recovery, backfill
10198 // and sleep options so that mclock can meet the QoS goals.
10199 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") {
10200 dout(1) << __func__
10201 << ": Changing recovery/backfill/sleep settings for QoS" << dendl;
10202
10203 // Set high value for recovery max active
10204 uint32_t rec_max_active = 1000;
10205 cct->_conf.set_val(
10206 "osd_recovery_max_active", std::to_string(rec_max_active));
10207 cct->_conf.set_val(
10208 "osd_recovery_max_active_hdd", std::to_string(rec_max_active));
10209 cct->_conf.set_val(
10210 "osd_recovery_max_active_ssd", std::to_string(rec_max_active));
10211
10212 // Set high value for osd_max_backfill
10213 uint32_t max_backfills = 1000;
10214 cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
10215 service.local_reserver.set_max(max_backfills);
10216 service.remote_reserver.set_max(max_backfills);
10217
10218 // Disable recovery sleep
10219 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
10220 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10221 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10222 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10223
10224 // Disable delete sleep
10225 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
10226 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
10227 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
10228 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10229
10230 // Disable snap trim sleep
10231 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
10232 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10233 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10234 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10235
10236 // Disable scrub sleep
10237 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
10238 return true;
10239 }
10240 return false;
10241 }
10242
10243 int OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
10244 {
10245 std::string cmd =
10246 "{"
10247 "\"prefix\": \"config set\", "
10248 "\"who\": \"osd." + std::to_string(whoami) + "\", "
10249 "\"name\": \"" + key + "\", "
10250 "\"value\": \"" + val + "\""
10251 "}";
10252
10253 vector<std::string> vcmd{cmd};
10254 bufferlist inbl;
10255 std::string outs;
10256 C_SaferCond cond;
10257 monc->start_mon_command(vcmd, inbl, nullptr, &outs, &cond);
10258 int r = cond.wait();
10259 if (r < 0) {
10260 derr << __func__ << " Failed to set config key " << key
10261 << " err: " << cpp_strerror(r)
10262 << " errstr: " << outs << dendl;
10263 return r;
10264 }
10265
10266 return 0;
10267 }
10268
10269 void OSD::update_log_config()
10270 {
10271 map<string,string> log_to_monitors;
10272 map<string,string> log_to_syslog;
10273 map<string,string> log_channel;
10274 map<string,string> log_prio;
10275 map<string,string> log_to_graylog;
10276 map<string,string> log_to_graylog_host;
10277 map<string,string> log_to_graylog_port;
10278 uuid_d fsid;
10279 string host;
10280
10281 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
10282 log_channel, log_prio, log_to_graylog,
10283 log_to_graylog_host, log_to_graylog_port,
10284 fsid, host) == 0)
10285 clog->update_config(log_to_monitors, log_to_syslog,
10286 log_channel, log_prio, log_to_graylog,
10287 log_to_graylog_host, log_to_graylog_port,
10288 fsid, host);
10289 derr << "log_to_monitors " << log_to_monitors << dendl;
10290 }
10291
10292 void OSD::check_config()
10293 {
10294 // some sanity checks
10295 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10296 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10297 << " is not > osd_pg_epoch_persisted_max_stale ("
10298 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10299 }
10300 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
10301 clog->warn() << "osd_object_clean_region_max_num_intervals ("
10302 << cct->_conf->osd_object_clean_region_max_num_intervals
10303 << ") is < 0";
10304 }
10305 }
10306
10307 // --------------------------------
10308
10309 void OSD::get_latest_osdmap()
10310 {
10311 dout(10) << __func__ << " -- start" << dendl;
10312
10313 boost::system::error_code ec;
10314 service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);
10315
10316 dout(10) << __func__ << " -- finish" << dendl;
10317 }
10318
10319 // --------------------------------
10320
10321 void OSD::set_perf_queries(const ConfigPayload &config_payload) {
10322 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
10323 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
10324 dout(10) << "setting " << queries.size() << " queries" << dendl;
10325
10326 std::list<OSDPerfMetricQuery> supported_queries;
10327 for (auto &it : queries) {
10328 auto &query = it.first;
10329 if (!query.key_descriptor.empty()) {
10330 supported_queries.push_back(query);
10331 }
10332 }
10333 if (supported_queries.size() < queries.size()) {
10334 dout(1) << queries.size() - supported_queries.size()
10335 << " unsupported queries" << dendl;
10336 }
10337 {
10338 std::lock_guard locker{m_perf_queries_lock};
10339 m_perf_queries = supported_queries;
10340 m_perf_limits = queries;
10341 }
10342 std::vector<PGRef> pgs;
10343 _get_pgs(&pgs);
10344 for (auto& pg : pgs) {
10345 std::scoped_lock l{*pg};
10346 pg->set_dynamic_perf_stats_queries(supported_queries);
10347 }
10348 }
10349
10350 MetricPayload OSD::get_perf_reports() {
10351 OSDMetricPayload payload;
10352 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
10353
10354 std::vector<PGRef> pgs;
10355 _get_pgs(&pgs);
10356 DynamicPerfStats dps;
10357 for (auto& pg : pgs) {
10358 // m_perf_queries can be modified only in set_perf_queries by mgr client
10359 // request, and it is protected by the mgr client's lock, which is held
10360 // when set_perf_queries/get_perf_reports are called, so we need not hold
10361 // m_perf_queries_lock here.
10362 DynamicPerfStats pg_dps(m_perf_queries);
10363 pg->lock();
10364 pg->get_dynamic_perf_stats(&pg_dps);
10365 pg->unlock();
10366 dps.merge(pg_dps);
10367 }
10368 dps.add_to_reports(m_perf_limits, &reports);
10369 dout(20) << "reports for " << reports.size() << " queries" << dendl;
10370
10371 return payload;
10372 }
10373
10374 // =============================================================
10375
10376 #undef dout_context
10377 #define dout_context cct
10378 #undef dout_prefix
10379 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10380
10381 void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
10382 {
10383 dout(10) << pg->pg_id << " " << pg << dendl;
10384 slot->pg = pg;
10385 pg->osd_shard = this;
10386 pg->pg_slot = slot;
10387 osd->inc_num_pgs();
10388
10389 slot->epoch = pg->get_osdmap_epoch();
10390 pg_slots_by_epoch.insert(*slot);
10391 }
10392
10393 void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10394 {
10395 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10396 slot->pg->osd_shard = nullptr;
10397 slot->pg->pg_slot = nullptr;
10398 slot->pg = nullptr;
10399 osd->dec_num_pgs();
10400
10401 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10402 slot->epoch = 0;
10403 if (waiting_for_min_pg_epoch) {
10404 min_pg_epoch_cond.notify_all();
10405 }
10406 }
10407
10408 void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
10409 {
10410 std::lock_guard l(shard_lock);
10411 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
10412 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10413 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10414 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
10415 slot->epoch = e;
10416 pg_slots_by_epoch.insert(*slot);
10417 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10418 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10419 if (waiting_for_min_pg_epoch) {
10420 min_pg_epoch_cond.notify_all();
10421 }
10422 }
10423
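// pg_slots_by_epoch is an intrusive set ordered by each slot's epoch, so the
// oldest map epoch any PG on this shard has advanced to is available in O(1)
// via begin(); wait_min_pg_epoch() below blocks until that minimum reaches
// the epoch a caller needs.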
10424 epoch_t OSDShard::get_min_pg_epoch()
10425 {
10426 std::lock_guard l(shard_lock);
10427 auto p = pg_slots_by_epoch.begin();
10428 if (p == pg_slots_by_epoch.end()) {
10429 return 0;
10430 }
10431 return p->epoch;
10432 }
10433
10434 void OSDShard::wait_min_pg_epoch(epoch_t need)
10435 {
10436 std::unique_lock l{shard_lock};
10437 ++waiting_for_min_pg_epoch;
10438 min_pg_epoch_cond.wait(l, [need, this] {
10439 if (pg_slots_by_epoch.empty()) {
10440 return true;
10441 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10442 return true;
10443 } else {
10444 dout(10) << need << " waiting on "
10445 << pg_slots_by_epoch.begin()->epoch << dendl;
10446 return false;
10447 }
10448 });
10449 --waiting_for_min_pg_epoch;
10450 }
10451
10452 epoch_t OSDShard::get_max_waiting_epoch()
10453 {
10454 std::lock_guard l(shard_lock);
10455 epoch_t r = 0;
10456 for (auto& i : pg_slots) {
10457 if (!i.second->waiting_peering.empty()) {
10458 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10459 }
10460 }
10461 return r;
10462 }
10463
10464 void OSDShard::consume_map(
10465 const OSDMapRef& new_osdmap,
10466 unsigned *pushes_to_free)
10467 {
10468 std::lock_guard l(shard_lock);
10469 OSDMapRef old_osdmap;
10470 {
10471 std::lock_guard l(osdmap_lock);
10472 old_osdmap = std::move(shard_osdmap);
10473 shard_osdmap = new_osdmap;
10474 }
10475 dout(10) << new_osdmap->get_epoch()
10476 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10477 << dendl;
10478 bool queued = false;
10479
10480 // check slots
10481 auto p = pg_slots.begin();
10482 while (p != pg_slots.end()) {
10483 OSDShardPGSlot *slot = p->second.get();
10484 const spg_t& pgid = p->first;
10485 dout(20) << __func__ << " " << pgid << dendl;
10486 if (!slot->waiting_for_split.empty()) {
10487 dout(20) << __func__ << " " << pgid
10488 << " waiting for split " << slot->waiting_for_split << dendl;
10489 ++p;
10490 continue;
10491 }
10492 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10493 dout(20) << __func__ << " " << pgid
10494 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10495 << dendl;
10496 ++p;
10497 continue;
10498 }
10499 if (!slot->waiting_peering.empty()) {
10500 epoch_t first = slot->waiting_peering.begin()->first;
10501 if (first <= new_osdmap->get_epoch()) {
10502 dout(20) << __func__ << " " << pgid
10503 << " pending_peering first epoch " << first
10504 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
10505 _wake_pg_slot(pgid, slot);
10506 queued = true;
10507 }
10508 ++p;
10509 continue;
10510 }
10511 if (!slot->waiting.empty()) {
10512 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10513 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10514 << dendl;
10515 ++p;
10516 continue;
10517 }
10518 while (!slot->waiting.empty() &&
10519 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10520 auto& qi = slot->waiting.front();
10521 dout(20) << __func__ << " " << pgid
10522 << " waiting item " << qi
10523 << " epoch " << qi.get_map_epoch()
10524 << " <= " << new_osdmap->get_epoch()
10525 << ", "
10526 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10527 "misdirected")
10528 << ", dropping" << dendl;
10529 *pushes_to_free += qi.get_reserved_pushes();
10530 slot->waiting.pop_front();
10531 }
10532 }
10533 if (slot->waiting.empty() &&
10534 slot->num_running == 0 &&
10535 slot->waiting_for_split.empty() &&
10536 !slot->pg) {
10537 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10538 p = pg_slots.erase(p);
10539 continue;
10540 }
10541
10542 ++p;
10543 }
10544 if (queued) {
10545 std::lock_guard l{sdata_wait_lock};
10546 sdata_cond.notify_one();
10547 }
10548 }
10549
10550 void OSDShard::_wake_pg_slot(
10551 spg_t pgid,
10552 OSDShardPGSlot *slot)
10553 {
10554 dout(20) << __func__ << " " << pgid
10555 << " to_process " << slot->to_process
10556 << " waiting " << slot->waiting
10557 << " waiting_peering " << slot->waiting_peering << dendl;
10558 for (auto i = slot->to_process.rbegin();
10559 i != slot->to_process.rend();
10560 ++i) {
10561 scheduler->enqueue_front(std::move(*i));
10562 }
10563 slot->to_process.clear();
10564 for (auto i = slot->waiting.rbegin();
10565 i != slot->waiting.rend();
10566 ++i) {
10567 scheduler->enqueue_front(std::move(*i));
10568 }
10569 slot->waiting.clear();
10570 for (auto i = slot->waiting_peering.rbegin();
10571 i != slot->waiting_peering.rend();
10572 ++i) {
10573 // this is overkill; we requeue everything, even if some of these
10574 // items are waiting for maps we don't have yet. FIXME, maybe,
10575 // someday, if we decide this inefficiency matters
10576 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
10577 scheduler->enqueue_front(std::move(*j));
10578 }
10579 }
10580 slot->waiting_peering.clear();
10581 ++slot->requeue_seq;
10582 }
10583
10584 void OSDShard::identify_splits_and_merges(
10585 const OSDMapRef& as_of_osdmap,
10586 set<pair<spg_t,epoch_t>> *split_pgs,
10587 set<pair<spg_t,epoch_t>> *merge_pgs)
10588 {
10589 std::lock_guard l(shard_lock);
10590 if (shard_osdmap) {
10591 for (auto& i : pg_slots) {
10592 const spg_t& pgid = i.first;
10593 auto *slot = i.second.get();
10594 if (slot->pg) {
10595 osd->service.identify_splits_and_merges(
10596 shard_osdmap, as_of_osdmap, pgid,
10597 split_pgs, merge_pgs);
10598 } else if (!slot->waiting_for_split.empty()) {
10599 osd->service.identify_splits_and_merges(
10600 shard_osdmap, as_of_osdmap, pgid,
10601 split_pgs, nullptr);
10602 } else {
10603 dout(20) << __func__ << " slot " << pgid
10604 << " has no pg and waiting_for_split " << dendl;
10605 }
10606 }
10607 }
10608 }
10609
10610 void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10611 set<pair<spg_t,epoch_t>> *pgids)
10612 {
10613 std::lock_guard l(shard_lock);
10614 _prime_splits(pgids);
10615 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10616 set<pair<spg_t,epoch_t>> newer_children;
10617 for (auto i : *pgids) {
10618 osd->service.identify_splits_and_merges(
10619 as_of_osdmap, shard_osdmap, i.first,
10620 &newer_children, nullptr);
10621 }
10622 newer_children.insert(pgids->begin(), pgids->end());
10623 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10624 << shard_osdmap->get_epoch() << ", new children " << newer_children
10625 << dendl;
10626 _prime_splits(&newer_children);
10627 // note: we don't care what is left over here for other shards.
10628 // if this shard is ahead of another shard, e.g., one thread is
10629 // calling into prime_splits via _process (due to a newly created
10630 // pg) and this shard has a newer map due to a racing consume_map,
10631 // then any grandchildren left here will be identified (or were
10632 // identified) when the slower shard's osdmap is advanced.
10633 // _prime_splits() will tolerate the case where the pgid is
10634 // already primed.
10635 }
10636 }
10637
10638 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10639 {
10640 dout(10) << *pgids << dendl;
10641 auto p = pgids->begin();
10642 while (p != pgids->end()) {
10643 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10644 if (shard_index == shard_id) {
10645 auto r = pg_slots.emplace(p->first, nullptr);
10646 if (r.second) {
10647 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10648 r.first->second = make_unique<OSDShardPGSlot>();
10649 r.first->second->waiting_for_split.insert(p->second);
10650 } else {
10651 auto q = r.first;
10652 ceph_assert(q != pg_slots.end());
10653 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10654 << dendl;
10655 q->second->waiting_for_split.insert(p->second);
10656 }
10657 p = pgids->erase(p);
10658 } else {
10659 ++p;
10660 }
10661 }
10662 }
10663
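// For each merge participant in *merge_pgs that hashes to this shard,
// ensure a slot exists and mark it with the merge epoch.  If the slot has
// neither a pg nor a pending pre-merge split, instantiate an empty
// placeholder PG so the merge has something to consume.  Consumed entries
// are erased from *merge_pgs.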
10664 void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10665 set<pair<spg_t,epoch_t>> *merge_pgs)
10666 {
10667 std::lock_guard l(shard_lock);
10668 dout(20) << __func__ << " checking shard " << shard_id
10669 << " for remaining merge pgs " << *merge_pgs << dendl;
10670 auto p = merge_pgs->begin();
10671 while (p != merge_pgs->end()) {
10672 spg_t pgid = p->first;
10673 epoch_t epoch = p->second;
10674 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10675 if (shard_index != shard_id) {
10676 ++p;
10677 continue;
10678 }
10679 OSDShardPGSlot *slot;
10680 auto r = pg_slots.emplace(pgid, nullptr);
10681 if (r.second) {
10682 r.first->second = make_unique<OSDShardPGSlot>();
10683 }
10684 slot = r.first->second.get();
10685 if (slot->pg) {
10686 // already have pg
10687 dout(20) << __func__ << " have merge participant pg " << pgid
10688 << " " << slot->pg << dendl;
10689 } else if (!slot->waiting_for_split.empty() &&
10690 *slot->waiting_for_split.begin() < epoch) {
10691 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10692 << " " << slot->waiting_for_split << dendl;
10693 } else {
10694 dout(20) << __func__ << " creating empty merge participant " << pgid
10695 << " for merge in " << epoch << dendl;
10696 // leave history zeroed; PG::merge_from() will fill it in.
10697 pg_history_t history;
10698 PGCreateInfo cinfo(pgid, epoch - 1,
10699 history, PastIntervals(), false);
10700 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10701 _attach_pg(r.first->second.get(), pg.get());
10702 _wake_pg_slot(pgid, slot);
10703 pg->unlock();
10704 }
10705 // mark slot for merge
10706 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10707 slot->waiting_for_merge_epoch = epoch;
10708 p = merge_pgs->erase(p);
10709 }
10710 }
10711
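// Attach a freshly constructed split child to the slot primed for it and
// clear the waiting_for_split marker for the child's epoch.  Once no
// split epochs remain pending, parked work is requeued; a null peering
// event then kicks the child up to the latest osdmap.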
10712 void OSDShard::register_and_wake_split_child(PG *pg)
10713 {
10714 epoch_t epoch;
10715 {
10716 std::lock_guard l(shard_lock);
10717 dout(10) << pg->pg_id << " " << pg << dendl;
10718 auto p = pg_slots.find(pg->pg_id);
10719 ceph_assert(p != pg_slots.end());
10720 auto *slot = p->second.get();
10721 dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
10722 << dendl;
10723 ceph_assert(!slot->pg);
10724 ceph_assert(!slot->waiting_for_split.empty());
10725 _attach_pg(slot, pg);
10726
10727 epoch = pg->get_osdmap_epoch();
10728 ceph_assert(slot->waiting_for_split.count(epoch));
10729 slot->waiting_for_split.erase(epoch);
10730 if (slot->waiting_for_split.empty()) {
10731 _wake_pg_slot(pg->pg_id, slot);
10732 } else {
10733 dout(10) << __func__ << " still waiting for split on "
10734 << slot->waiting_for_split << dendl;
10735 }
10736 }
10737
10738 // kick child to ensure it pulls up to the latest osdmap
10739 osd->enqueue_peering_evt(
10740 pg->pg_id,
10741 PGPeeringEventRef(
10742 std::make_shared<PGPeeringEvent>(
10743 epoch,
10744 epoch,
10745 NullEvt())));
10746
10747 std::lock_guard l{sdata_wait_lock};
10748 sdata_cond.notify_one();
10749 }
10750
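// Drop primed slots for children of `parent` under the pre-split
// old_pg_num, requeueing anything parked on them; used when a pending
// split is no longer going to happen.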
10751 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10752 {
10753 std::lock_guard l(shard_lock);
10754 vector<spg_t> to_delete;
10755 for (auto& i : pg_slots) {
10756 if (i.first != parent &&
10757 i.first.get_ancestor(old_pg_num) == parent) {
10758 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10759 << dendl;
10760 _wake_pg_slot(i.first, i.second.get());
10761 to_delete.push_back(i.first);
10762 }
10763 }
10764 for (auto pgid : to_delete) {
10765 pg_slots.erase(pgid);
10766 }
10767 }
10768
10769 void OSDShard::update_scheduler_config()
10770 {
10771 std::lock_guard l(shard_lock);
10772 scheduler->update_configuration();
10773 }
10774
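// Each OSDShard gets its own op scheduler instance; the concrete type
// (e.g. mClock vs. WPQ) is chosen by make_scheduler() from configuration,
// with the store's rotational-ness available to inform its defaults.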
10775 OSDShard::OSDShard(
10776 int id,
10777 CephContext *cct,
10778 OSD *osd)
10779 : shard_id(id),
10780 cct(cct),
10781 osd(osd),
10782 shard_name(string("OSDShard.") + stringify(id)),
10783 sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
10784 sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
10785 osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
10786 shard_lock_name(shard_name + "::shard_lock"),
10787 shard_lock{make_mutex(shard_lock_name)},
10788 scheduler(ceph::osd::scheduler::make_scheduler(
10789 cct, osd->num_shards, osd->store->is_rotational())),
10790 context_queue(sdata_wait_lock, sdata_cond)
10791 {
10792 dout(0) << "using op scheduler " << *scheduler << dendl;
10793 }
10794
10795
10796 // =============================================================
10797
10798 #undef dout_context
10799 #define dout_context osd->cct
10800 #undef dout_prefix
10801 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10802
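// Park an item on its pg slot until the slot can make progress: peering
// items wait keyed by the epoch they require, everything else goes on
// the generic waiting list.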
10803 void OSD::ShardedOpWQ::_add_slot_waiter(
10804 spg_t pgid,
10805 OSDShardPGSlot *slot,
10806 OpSchedulerItem&& qi)
10807 {
10808 if (qi.is_peering()) {
10809 dout(20) << __func__ << " " << pgid
10810 << " peering, item epoch is "
10811 << qi.get_map_epoch()
10812 << ", will wait on " << qi << dendl;
10813 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10814 } else {
10815 dout(20) << __func__ << " " << pgid
10816 << " item epoch is "
10817 << qi.get_map_epoch()
10818 << ", will wait on " << qi << dendl;
10819 slot->waiting.push_back(std::move(qi));
10820 }
10821 }
10822
10823 #undef dout_prefix
10824 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10825
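// Worker thread entry point.  In outline: (1) if the shard is idle,
// block on sdata_cond (the lowest-indexed thread of each shard also
// drains the context_queue of oncommit callbacks); (2) dequeue one
// OpSchedulerItem -- a future-scheduled item puts us back to sleep
// until its deadline; (3) stage the item on its pg slot, then take the
// pg lock and recheck that the slot was not requeued or detached while
// we were blocked; (4) if the slot has no pg yet, create it, park the
// item, or drop it, depending on the item type and the shard's osdmap;
// (5) run the item, then any collected oncommit contexts.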
10826 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10827 {
10828 uint32_t shard_index = thread_index % osd->num_shards;
10829 auto& sdata = osd->shards[shard_index];
10830 ceph_assert(sdata);
10831
10832 // If all the threads in a shard ran oncommit contexts, completions
10833 // could execute out of order.  So only the thread with the smallest
10834 // thread_index (thread_index < num_shards), i.e. the first thread of
10835 // each shard, runs the oncommit callbacks.
10836 bool is_smallest_thread_index = thread_index < osd->num_shards;
10837
10838 // peek at spg_t
10839 sdata->shard_lock.lock();
10840 if (sdata->scheduler->empty() &&
10841 (!is_smallest_thread_index || sdata->context_queue.empty())) {
10842 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10843 if (is_smallest_thread_index && !sdata->context_queue.empty()) {
10844 // we raced with a context_queue addition, don't wait
10845 wait_lock.unlock();
10846 } else if (!sdata->stop_waiting) {
10847 dout(20) << __func__ << " empty q, waiting" << dendl;
10848 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10849 sdata->shard_lock.unlock();
10850 sdata->sdata_cond.wait(wait_lock);
10851 wait_lock.unlock();
10852 sdata->shard_lock.lock();
10853 if (sdata->scheduler->empty() &&
10854 !(is_smallest_thread_index && !sdata->context_queue.empty())) {
10855 sdata->shard_lock.unlock();
10856 return;
10857 }
10858 // found a work item; reapply default wq timeouts
10859 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10860 timeout_interval, suicide_interval);
10861 } else {
10862 dout(20) << __func__ << " need to return immediately" << dendl;
10863 wait_lock.unlock();
10864 sdata->shard_lock.unlock();
10865 return;
10866 }
10867 }
10868
10869 list<Context *> oncommits;
10870 if (is_smallest_thread_index) {
10871 sdata->context_queue.move_to(oncommits);
10872 }
10873
10874 WorkItem work_item;
10875 while (!std::get_if<OpSchedulerItem>(&work_item)) {
10876 if (sdata->scheduler->empty()) {
10877 if (osd->is_stopping()) {
10878 sdata->shard_lock.unlock();
10879 for (auto c : oncommits) {
10880 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10881 delete c;
10882 }
10883 return; // OSD shutdown, discard.
10884 }
10885 sdata->shard_lock.unlock();
10886 handle_oncommits(oncommits);
10887 return;
10888 }
10889
10890 work_item = sdata->scheduler->dequeue();
10891 if (osd->is_stopping()) {
10892 sdata->shard_lock.unlock();
10893 for (auto c : oncommits) {
10894 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10895 delete c;
10896 }
10897 return; // OSD shutdown, discard.
10898 }
10899
10900 // If the work item is scheduled in the future, wait until
10901 // the time returned in the dequeue response before retrying.
10902 if (auto when_ready = std::get_if<double>(&work_item)) {
10903 if (is_smallest_thread_index) {
10904 sdata->shard_lock.unlock();
10905 handle_oncommits(oncommits);
10906 return;
10907 }
10908 std::unique_lock wait_lock{sdata->sdata_wait_lock};
10909 auto future_time = ceph::real_clock::from_double(*when_ready);
10910 dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
10911 // Disable heartbeat timeout until we find a non-future work item to process.
10912 osd->cct->get_heartbeat_map()->clear_timeout(hb);
10913 sdata->shard_lock.unlock();
10914 ++sdata->waiting_threads;
10915 sdata->sdata_cond.wait_until(wait_lock, future_time);
10916 --sdata->waiting_threads;
10917 wait_lock.unlock();
10918 sdata->shard_lock.lock();
10919 // Reapply default wq timeouts
10920 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10921 timeout_interval, suicide_interval);
10922 }
10923 } // while
10924
10925 // Take ownership of the dequeued item
10926 auto item = std::move(std::get<OpSchedulerItem>(work_item));
10927 if (osd->is_stopping()) {
10928 sdata->shard_lock.unlock();
10929 for (auto c : oncommits) {
10930 dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
10931 delete c;
10932 }
10933 return; // OSD shutdown, discard.
10934 }
10935
10936 const auto token = item.get_ordering_token();
10937 auto r = sdata->pg_slots.emplace(token, nullptr);
10938 if (r.second) {
10939 r.first->second = make_unique<OSDShardPGSlot>();
10940 }
10941 OSDShardPGSlot *slot = r.first->second.get();
10942 dout(20) << __func__ << " " << token
10943 << (r.second ? " (new)" : "")
10944 << " to_process " << slot->to_process
10945 << " waiting " << slot->waiting
10946 << " waiting_peering " << slot->waiting_peering
10947 << dendl;
10948 slot->to_process.push_back(std::move(item));
10949 dout(20) << __func__ << " " << slot->to_process.back()
10950 << " queued" << dendl;
10951
10952 retry_pg:
10953 PGRef pg = slot->pg;
10954
10955 // lock pg (if we have it)
10956 if (pg) {
10957 // note the requeue seq now...
10958 uint64_t requeue_seq = slot->requeue_seq;
10959 ++slot->num_running;
10960
10961 sdata->shard_lock.unlock();
10962 osd->service.maybe_inject_dispatch_delay();
10963 pg->lock();
10964 osd->service.maybe_inject_dispatch_delay();
10965 sdata->shard_lock.lock();
10966
10967 auto q = sdata->pg_slots.find(token);
10968 if (q == sdata->pg_slots.end()) {
10969 // this can happen if we race with pg removal.
10970 dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
10971 pg->unlock();
10972 sdata->shard_lock.unlock();
10973 handle_oncommits(oncommits);
10974 return;
10975 }
10976 slot = q->second.get();
10977 --slot->num_running;
10978
10979 if (slot->to_process.empty()) {
10980 // raced with _wake_pg_slot or consume_map
10981 dout(20) << __func__ << " " << token
10982 << " nothing queued" << dendl;
10983 pg->unlock();
10984 sdata->shard_lock.unlock();
10985 handle_oncommits(oncommits);
10986 return;
10987 }
10988 if (requeue_seq != slot->requeue_seq) {
10989 dout(20) << __func__ << " " << token
10990 << " requeue_seq " << slot->requeue_seq << " > our "
10991 << requeue_seq << ", we raced with _wake_pg_slot"
10992 << dendl;
10993 pg->unlock();
10994 sdata->shard_lock.unlock();
10995 handle_oncommits(oncommits);
10996 return;
10997 }
10998 if (slot->pg != pg) {
10999 // this can happen if we race with pg removal.
11000 dout(20) << __func__ << " slot " << token << " no longer attached to "
11001 << pg << dendl;
11002 pg->unlock();
11003 goto retry_pg;
11004 }
11005 }
11006
11007 dout(20) << __func__ << " " << token
11008 << " to_process " << slot->to_process
11009 << " waiting " << slot->waiting
11010 << " waiting_peering " << slot->waiting_peering << dendl;
11011
11012 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
11013 suicide_interval);
11014
11015 // take next item
11016 auto qi = std::move(slot->to_process.front());
11017 slot->to_process.pop_front();
11018 dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
11019 set<pair<spg_t,epoch_t>> new_children;
11020 OSDMapRef osdmap;
11021
11022 while (!pg) {
11023 // should this pg shard exist on this osd in this (or a later) epoch?
11024 osdmap = sdata->shard_osdmap;
11025 const PGCreateInfo *create_info = qi.creates_pg();
11026 if (!slot->waiting_for_split.empty()) {
11027 dout(20) << __func__ << " " << token
11028 << " splitting " << slot->waiting_for_split << dendl;
11029 _add_slot_waiter(token, slot, std::move(qi));
11030 } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
11031 dout(20) << __func__ << " " << token
11032 << " map " << qi.get_map_epoch() << " > "
11033 << osdmap->get_epoch() << dendl;
11034 _add_slot_waiter(token, slot, std::move(qi));
11035 } else if (qi.is_peering()) {
11036 if (!qi.peering_requires_pg()) {
11037 // for pg-less events, we run them under the ordering lock, since
11038 // we don't have the pg lock to keep them ordered.
11039 qi.run(osd, sdata, pg, tp_handle);
11040 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11041 if (create_info) {
11042 if (create_info->by_mon &&
11043 osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
11044 dout(20) << __func__ << " " << token
11045 << " no pg, no longer primary, ignoring mon create on "
11046 << qi << dendl;
11047 } else {
11048 dout(20) << __func__ << " " << token
11049 << " no pg, should create on " << qi << dendl;
11050 pg = osd->handle_pg_create_info(osdmap, create_info);
11051 if (pg) {
11052 // we created the pg! drop out and continue "normally"!
11053 sdata->_attach_pg(slot, pg.get());
11054 sdata->_wake_pg_slot(token, slot);
11055
11056 // identify split children between create epoch and shard epoch.
11057 osd->service.identify_splits_and_merges(
11058 pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
11059 sdata->_prime_splits(&new_children);
11060 // distribute remaining split children to other shards below!
11061 break;
11062 }
11063 dout(20) << __func__ << " ignored create on " << qi << dendl;
11064 }
11065 } else {
11066 dout(20) << __func__ << " " << token
11067 << " no pg, peering, !create, discarding " << qi << dendl;
11068 }
11069 } else {
11070 dout(20) << __func__ << " " << token
11071 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
11072 << ", discarding " << qi
11073 << dendl;
11074 }
11075 } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
11076 dout(20) << __func__ << " " << token
11077 << " no pg, should exist e" << osdmap->get_epoch()
11078 << ", will wait on " << qi << dendl;
11079 _add_slot_waiter(token, slot, std::move(qi));
11080 } else {
11081 dout(20) << __func__ << " " << token
11082 << " no pg, shouldn't exist e" << osdmap->get_epoch()
11083 << ", dropping " << qi << dendl;
11084 // share map with client?
11085 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11086 osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
11087 sdata->shard_osdmap,
11088 (*_op)->sent_epoch);
11089 }
11090 unsigned pushes_to_free = qi.get_reserved_pushes();
11091 if (pushes_to_free > 0) {
11092 sdata->shard_lock.unlock();
11093 osd->service.release_reserved_pushes(pushes_to_free);
11094 handle_oncommits(oncommits);
11095 return;
11096 }
11097 }
11098 sdata->shard_lock.unlock();
11099 handle_oncommits(oncommits);
11100 return;
11101 }
11102 if (qi.is_peering()) {
11103 OSDMapRef osdmap = sdata->shard_osdmap;
11104 if (qi.get_map_epoch() > osdmap->get_epoch()) {
11105 _add_slot_waiter(token, slot, std::move(qi));
11106 sdata->shard_lock.unlock();
11107 pg->unlock();
11108 handle_oncommits(oncommits);
11109 return;
11110 }
11111 }
11112 sdata->shard_lock.unlock();
11113
11114 if (!new_children.empty()) {
11115 for (auto shard : osd->shards) {
11116 shard->prime_splits(osdmap, &new_children);
11117 }
11118 ceph_assert(new_children.empty());
11119 }
11120
11121 // osd_opwq_process marks the point at which an operation has been dequeued
11122 // and will begin to be handled by a worker thread.
11123 {
11124 #ifdef WITH_LTTNG
11125 osd_reqid_t reqid;
11126 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11127 reqid = (*_op)->get_reqid();
11128 }
11129 #endif
11130 tracepoint(osd, opwq_process_start, reqid.name._type,
11131 reqid.name._num, reqid.tid, reqid.inc);
11132 }
11133
11134 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
11135 Formatter *f = Formatter::create("json");
11136 f->open_object_section("q");
11137 dump(f);
11138 f->close_section();
11139 f->flush(*_dout);
11140 delete f;
11141 *_dout << dendl;
11142
11143 qi.run(osd, sdata, pg, tp_handle);
11144
11145 {
11146 #ifdef WITH_LTTNG
11147 osd_reqid_t reqid;
11148 if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
11149 reqid = (*_op)->get_reqid();
11150 }
11151 #endif
11152 tracepoint(osd, opwq_process_finish, reqid.name._type,
11153 reqid.name._num, reqid.tid, reqid.inc);
11154 }
11155
11156 handle_oncommits(oncommits);
11157 }
11158
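// Queue an item on the shard selected by its ordering token.  If the
// scheduler was empty we wake all waiters (the shard may have gone
// fully idle); otherwise a single waiting thread is enough.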
11159 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
11160 uint32_t shard_index =
11161 item.get_ordering_token().hash_to_shard(osd->shards.size());
11162
11163 dout(20) << __func__ << " " << item << dendl;
11164
11165 OSDShard* sdata = osd->shards[shard_index];
11166 ceph_assert(sdata);
11167
11168 bool empty = true;
11169 {
11170 std::lock_guard l{sdata->shard_lock};
11171 empty = sdata->scheduler->empty();
11172 sdata->scheduler->enqueue(std::move(item));
11173 }
11174
11175 {
11176 std::lock_guard l{sdata->sdata_wait_lock};
11177 if (empty) {
11178 sdata->sdata_cond.notify_all();
11179 } else if (sdata->waiting_threads) {
11180 sdata->sdata_cond.notify_one();
11181 }
11182 }
11183 }
11184
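// Requeue an item at the head of its shard's queue.  If _process has
// already staged newer work on the slot, the shuffle below keeps this
// older item ordered ahead of it while still pushing exactly one item
// back into the scheduler.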
11185 void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
11186 {
11187 auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
11188 auto& sdata = osd->shards[shard_index];
11189 ceph_assert(sdata);
11190 sdata->shard_lock.lock();
11191 auto p = sdata->pg_slots.find(item.get_ordering_token());
11192 if (p != sdata->pg_slots.end() &&
11193 !p->second->to_process.empty()) {
11194 // we may be racing with _process, which has dequeued a new item
11195 // from scheduler, put it on to_process, and is now busy taking the
11196 // pg lock. ensure this old requeued item is ordered before any
11197 // such newer item in to_process.
11198 p->second->to_process.push_front(std::move(item));
11199 item = std::move(p->second->to_process.back());
11200 p->second->to_process.pop_back();
11201 dout(20) << __func__
11202 << " " << p->second->to_process.front()
11203 << " shuffled w/ " << item << dendl;
11204 } else {
11205 dout(20) << __func__ << " " << item << dendl;
11206 }
11207 sdata->scheduler->enqueue_front(std::move(item));
11208 sdata->shard_lock.unlock();
11209 std::lock_guard l{sdata->sdata_wait_lock};
11210 sdata->sdata_cond.notify_one();
11211 }
11212
11213 namespace ceph::osd_cmds {
11214
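// Handler for the OSD "heap" admin command, which drives the tcmalloc
// heap profiler.  It is typically reached via the tell interface, e.g.
// (assuming a tcmalloc build):
//
//   ceph tell osd.0 heap stats
//   ceph tell osd.0 heap start_profiler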
11215 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
11216 std::ostream& os)
11217 {
11218 if (!ceph_using_tcmalloc()) {
11219 os << "could not issue heap profiler command -- not using tcmalloc!";
11220 return -EOPNOTSUPP;
11221 }
11222
11223 string cmd;
11224 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
11225 os << "unable to get value for command \"heapcmd\"";
11226 return -EINVAL;
11227 }
11228
11229 std::vector<std::string> cmd_vec;
11230 get_str_vec(cmd, cmd_vec);
11231
11232 string val;
11233 if (cmd_getval(cmdmap, "value", val)) {
11234 cmd_vec.push_back(val);
11235 }
11236
11237 ceph_heap_profiler_handle_command(cmd_vec, os);
11238
11239 return 0;
11240 }
11241
11242 } // namespace ceph::osd_cmds