]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSD.cc
import 15.2.4
[ceph.git] / ceph / src / osd / OSD.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16 #include "acconfig.h"
17
18 #include <cctype>
19 #include <fstream>
20 #include <iostream>
21 #include <iterator>
22
23 #include <unistd.h>
24 #include <sys/stat.h>
25 #include <signal.h>
26 #include <time.h>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
29
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
32 #endif
33
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
36 #endif
37
38 #include "osd/PG.h"
39
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
43
44 #include "OSD.h"
45 #include "OSDMap.h"
46 #include "Watch.h"
47 #include "osdc/Objecter.h"
48
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_releases.h"
52 #include "common/ceph_time.h"
53 #include "common/version.h"
54 #include "common/pick_address.h"
55 #include "common/blkdev.h"
56 #include "common/numa.h"
57
58 #include "os/ObjectStore.h"
59 #ifdef HAVE_LIBFUSE
60 #include "os/FuseStore.h"
61 #endif
62
63 #include "PrimaryLogPG.h"
64
65 #include "msg/Messenger.h"
66 #include "msg/Message.h"
67
68 #include "mon/MonClient.h"
69
70 #include "messages/MLog.h"
71
72 #include "messages/MGenericMessage.h"
73 #include "messages/MOSDPing.h"
74 #include "messages/MOSDFailure.h"
75 #include "messages/MOSDMarkMeDown.h"
76 #include "messages/MOSDMarkMeDead.h"
77 #include "messages/MOSDFull.h"
78 #include "messages/MOSDOp.h"
79 #include "messages/MOSDOpReply.h"
80 #include "messages/MOSDBackoff.h"
81 #include "messages/MOSDBeacon.h"
82 #include "messages/MOSDRepOp.h"
83 #include "messages/MOSDRepOpReply.h"
84 #include "messages/MOSDBoot.h"
85 #include "messages/MOSDPGTemp.h"
86 #include "messages/MOSDPGReadyToMerge.h"
87
88 #include "messages/MOSDMap.h"
89 #include "messages/MMonGetOSDMap.h"
90 #include "messages/MOSDPGNotify.h"
91 #include "messages/MOSDPGNotify2.h"
92 #include "messages/MOSDPGQuery.h"
93 #include "messages/MOSDPGQuery2.h"
94 #include "messages/MOSDPGLog.h"
95 #include "messages/MOSDPGRemove.h"
96 #include "messages/MOSDPGInfo.h"
97 #include "messages/MOSDPGInfo2.h"
98 #include "messages/MOSDPGCreate.h"
99 #include "messages/MOSDPGCreate2.h"
100 #include "messages/MOSDPGScan.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
111
112 #include "messages/MOSDPeeringOp.h"
113
114 #include "messages/MOSDAlive.h"
115
116 #include "messages/MOSDScrub.h"
117 #include "messages/MOSDScrub2.h"
118 #include "messages/MOSDRepScrub.h"
119
120 #include "messages/MCommand.h"
121 #include "messages/MCommandReply.h"
122
123 #include "messages/MPGStats.h"
124 #include "messages/MPGStatsAck.h"
125
126 #include "messages/MWatchNotify.h"
127 #include "messages/MOSDPGPush.h"
128 #include "messages/MOSDPGPushReply.h"
129 #include "messages/MOSDPGPull.h"
130
131 #include "messages/MMonGetPurgedSnaps.h"
132 #include "messages/MMonGetPurgedSnapsReply.h"
133
134 #include "common/perf_counters.h"
135 #include "common/Timer.h"
136 #include "common/LogClient.h"
137 #include "common/AsyncReserver.h"
138 #include "common/HeartbeatMap.h"
139 #include "common/admin_socket.h"
140 #include "common/ceph_context.h"
141
142 #include "global/signal_handler.h"
143 #include "global/pidfile.h"
144
145 #include "include/color.h"
146 #include "perfglue/cpu_profiler.h"
147 #include "perfglue/heap_profiler.h"
148
149 #include "osd/OpRequest.h"
150
151 #include "auth/AuthAuthorizeHandler.h"
152 #include "auth/RotatingKeyRing.h"
153
154 #include "objclass/objclass.h"
155
156 #include "common/cmdparse.h"
157 #include "include/str_list.h"
158 #include "include/util.h"
159
160 #include "include/ceph_assert.h"
161 #include "common/config.h"
162 #include "common/EventTrace.h"
163
164 #include "json_spirit/json_spirit_reader.h"
165 #include "json_spirit/json_spirit_writer.h"
166
167 #ifdef WITH_LTTNG
168 #define TRACEPOINT_DEFINE
169 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170 #include "tracing/osd.h"
171 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #undef TRACEPOINT_DEFINE
173 #else
174 #define tracepoint(...)
175 #endif
176
177 #define dout_context cct
178 #define dout_subsys ceph_subsys_osd
179 #undef dout_prefix
180 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
181
182 using namespace ceph::osd::scheduler;
183 using TOPNSPC::common::cmd_getval;
184
185 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
186 return *_dout << "osd." << whoami << " " << epoch << " ";
187 }
188
189 //Initial features in new superblock.
190 //Features here are also automatically upgraded
191 CompatSet OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
194 CompatSet::FeatureSet ceph_osd_feature_incompat;
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
202 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
203 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
204 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
205 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
206 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
207 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
208 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
209 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
210 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
211 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
212 ceph_osd_feature_incompat);
213 }
214
215 //Features are added here that this OSD supports.
216 CompatSet OSD::get_osd_compat_set() {
217 CompatSet compat = get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
220 return compat;
221 }
222
// OSDService aggregates state shared across the OSD's subsystems.  Most
// members are references into the owning OSD; the initializer list wires
// them up and seeds timers, reservers and caches from configuration.
// NOTE(review): member-initializer order must match the declaration order
// in OSD.h — do not reorder entries here.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  scrubs_local(0),
  scrubs_remote(0),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
                                 osd->objecter_messenger,
                                 osd->monc, nullptr, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  // One Finisher per objecter "shard"; the count comes from the
  // osd_objecter_finishers config option.
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}
285
#ifdef PG_DEBUG_REFS
// Debug-only bookkeeping: track per-pgid PG reference counts so leaked
// references can be dumped at shutdown.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  auto [it, inserted] = pgid_tracker.emplace(pgid, 0);
  if (inserted) {
    // first reference for this pgid: remember the PG object
    live_pgs[pgid] = pg;
  }
  ++it->second;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  auto it = pgid_tracker.find(pgid);
  ceph_assert(it != pgid_tracker.end());
  ceph_assert(it->second > 0);
  if (--it->second == 0) {
    pgid_tracker.erase(it);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (const auto& entry : pgid_tracker) {
    derr << "\t" << entry << dendl;
    live_pgs[entry.first]->dump_live_ids();
  }
}
#endif
317
318
319 ceph::signedspan OSDService::get_mnow()
320 {
321 return ceph::mono_clock::now() - osd->startup_time;
322 }
323
// Walk the recorded pg_num history of pgid's pool across the epoch range
// (old_map, new_map] and report:
//  - split_children: (child pgid, epoch) pairs created by pg_num increases
//  - merge_pgs: (pgid, epoch) pairs participating in merges — both sources
//    and targets — only when merge_pgs is non-null.
// A worklist (queue/did) is used because split children and merge parents
// discovered along the way must themselves be re-scanned from old_map's
// epoch forward.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    // no recorded pg_num changes for this pool
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
           << " to e" << new_map->get_epoch()
           << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // replay the pool's pg_num changes in epoch order over our window
    for (auto q = p->second.lower_bound(old_map->get_epoch());
         q != p->second.end() &&
           q->first <= new_map->get_epoch();
         ++q) {
      if (pgnum < q->second) {
        // split?
        if (cur.ps() < pgnum) {
          set<spg_t> children;
          if (cur.is_split(pgnum, q->second, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " children " << children << dendl;
            for (auto i : children) {
              split_children->insert(make_pair(i, q->first));
              if (!did.count(i))
                queue.push_back(i);
            }
          }
        } else if (cur.ps() < q->second) {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is a child" << dendl;
          // normally we'd capture this from the parent, but it's
          // possible the parent doesn't exist yet (it will be
          // fabricated to allow an intervening merge). note this PG
          // as a split child here to be sure we catch it.
          split_children->insert(make_pair(cur, q->first));
        } else {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is post-split, skipping" << dendl;
        }
      } else if (merge_pgs) {
        // merge?
        if (cur.ps() >= q->second) {
          if (cur.ps() < pgnum) {
            // cur disappears in this merge: record the target and all
            // of its sources
            spg_t parent;
            if (cur.is_merge_source(pgnum, q->second, &parent)) {
              set<spg_t> children;
              parent.is_split(q->second, pgnum, &children);
              dout(20) << __func__ << " " << cur << " e" << q->first
                       << " pg_num " << pgnum << " -> " << q->second
                       << " is merge source, target " << parent
                       << ", source(s) " << children << dendl;
              merge_pgs->insert(make_pair(parent, q->first));
              if (!did.count(parent)) {
                // queue (and re-scan) parent in case it might not exist yet
                // and there are some future splits pending on it
                queue.push_back(parent);
              }
              for (auto c : children) {
                merge_pgs->insert(make_pair(c, q->first));
                if (!did.count(c))
                  queue.push_back(c);
              }
            }
          } else {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is beyond old pgnum, skipping" << dendl;
          }
        } else {
          // cur survives this merge as the target: record it and its
          // merge sources
          set<spg_t> children;
          if (cur.is_split(q->second, pgnum, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is merge target, source " << children << dendl;
            for (auto c : children) {
              merge_pgs->insert(make_pair(c, q->first));
              if (!did.count(c))
                queue.push_back(c);
            }
            merge_pgs->insert(make_pair(cur, q->first));
          }
        }
      }
      pgnum = q->second;
    }
  }
}
430
// Forward to the owning OSD: the heartbeat peer set should be recomputed.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
435
436 HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
437 {
438 std::lock_guard l(hb_stamp_lock);
439 if (peer >= hb_stamps.size()) {
440 hb_stamps.resize(peer + 1);
441 }
442 if (!hb_stamps[peer]) {
443 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
444 }
445 return hb_stamps[peer];
446 }
447
448 void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
449 {
450 osd->enqueue_peering_evt(
451 spgid,
452 PGPeeringEventRef(
453 std::make_shared<PGPeeringEvent>(
454 epoch, epoch,
455 RenewLease())));
456 }
457
458 void OSDService::start_shutdown()
459 {
460 {
461 std::lock_guard l(agent_timer_lock);
462 agent_timer.shutdown();
463 }
464
465 {
466 std::lock_guard l(sleep_lock);
467 sleep_timer.shutdown();
468 }
469
470 {
471 std::lock_guard l(recovery_request_lock);
472 recovery_request_timer.shutdown();
473 }
474 }
475
// Drain and stop the reserver finisher (the local/remote/snap reservers
// all queue their grant/cancel callbacks on it).
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
481
// Final shutdown of OSDService: stop remaining timers, drain the objecter
// and its finishers, then drop map references so OSDMap memory can be
// released.  Order matters: the objecter is stopped before its finishers
// are drained.
void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  // drop the published map and the reserved "next" map
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
500
// Start OSDService subsystems: finishers, objecter, timers and the
// tiering-agent thread.  Invoked from OSD initialization before the OSD
// goes active.
void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  // optionally hold off recovery at startup (config: seconds)
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
521
// Late initialization: start the objecter once an initial osdmap is
// available.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
526
527 void OSDService::activate_map()
528 {
529 // wake/unwake the tiering agent
530 std::lock_guard l{agent_lock};
531 agent_active =
532 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
533 osd->is_active();
534 agent_cond.notify_all();
535 }
536
// Subscribe to osdmaps from the monitor starting at epoch e (non-forced).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
541
542
543 class AgentTimeoutCB : public Context {
544 PGRef pg;
545 public:
546 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
547 void finish(int) override {
548 pg->agent_choose_mode_restart();
549 }
550 };
551
// Body of the cache-tiering agent thread: repeatedly pick the
// highest-priority level in agent_queue and ask one of its PGs to do
// flush/evict work; sleep on agent_cond when idle or out of budget.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    // service the highest level present (ordered map: rbegin = max key)
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // op budget: low-rate flush quota applies unless some PG is in
    // high-speed flush mode (flush_mode_high_count > 0)
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    // round-robin across PGs at this level; agent_valid_iterator is
    // cleared elsewhere when the set changes and the iterator may dangle
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    // drop the lock while the PG performs (potentially slow) agent work
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
        << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
        << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
607
// Stop the tiering-agent thread.  All agent ops must already be cancelled
// and all PGs dequeued by this point; otherwise we abort.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  // join outside the lock so the thread can finish its loop
  agent_thread.join();
}
627
628 // -------------------------------------
629
// Periodically (from the OSD tick) re-derive promote_probability_millis —
// the per-mille probability of promoting an object into the cache tier —
// so the realized promote rate tracks the configured objects/sec and
// bytes/sec targets.  Also recomputes the per-interval hard caps
// promote_max_objects / promote_max_bytes.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << byte_u_t(bytes) << "; target "
           << target_obj_sec << " obj/sec or "
           << byte_u_t(target_bytes_sec) << "/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb: per-mille probabilities that would exactly hit the
    // object-rate and byte-rate targets respectively
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;  // no target configured: promote unconditionally
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  // NOTE(review): divides by prob below; relies on
  // promote_probability_millis never being 0 (it is clamped to
  // >= min_prob at the end of every call) — confirm its initial value.
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move half-way toward the computed probability, clamped to
  // [min_prob, 1000]
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
700
701 // -------------------------------------
702
703 float OSDService::get_failsafe_full_ratio()
704 {
705 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
706 if (full_ratio > 1.0) full_ratio /= 100.0;
707 return full_ratio;
708 }
709
// Translate usage ratios into a fullness state using thresholds from the
// current OSDMap.  `ratio` is the (backfill-)adjusted usage and `pratio`
// the raw physical usage; failsafe and nearfull compare against pratio
// while full and backfillfull compare against ratio.  Sets `inject` when
// the result came from the injectfull test hook.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  // clamp so that nearfull <= backfillfull <= full <= failsafe
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
756
757 void OSDService::check_full_status(float ratio, float pratio)
758 {
759 std::lock_guard l(full_status_lock);
760
761 cur_ratio = ratio;
762 physical_ratio = pratio;
763
764 string inject;
765 s_names new_state;
766 new_state = recalc_full_state(ratio, pratio, inject);
767
768 dout(20) << __func__ << " cur ratio " << ratio
769 << ", physical ratio " << pratio
770 << ", new state " << get_full_state_name(new_state)
771 << " " << inject
772 << dendl;
773
774 // warn
775 if (cur_state != new_state) {
776 dout(10) << __func__ << " " << get_full_state_name(cur_state)
777 << " -> " << get_full_state_name(new_state) << dendl;
778 if (new_state == FAILSAFE) {
779 clog->error() << "full status failsafe engaged, dropping updates, now "
780 << (int)roundf(ratio * 100) << "% full";
781 } else if (cur_state == FAILSAFE) {
782 clog->error() << "full status failsafe disengaged, no longer dropping "
783 << "updates, now " << (int)roundf(ratio * 100) << "% full";
784 }
785 cur_state = new_state;
786 }
787 }
788
789 bool OSDService::need_fullness_update()
790 {
791 OSDMapRef osdmap = get_osdmap();
792 s_names cur = NONE;
793 if (osdmap->exists(whoami)) {
794 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
795 cur = FULL;
796 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
797 cur = BACKFILLFULL;
798 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
799 cur = NEARFULL;
800 }
801 }
802 s_names want = NONE;
803 if (is_full())
804 want = FULL;
805 else if (is_backfillfull())
806 want = BACKFILLFULL;
807 else if (is_nearfull())
808 want = NEARFULL;
809 return want != cur;
810 }
811
// Test hook: report an injected fullness state of at least `type` if one
// is armed via set_injectfull().  Caller must hold full_status_lock.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    // NOTE(review): decremented inside a const member function, so
    // injectfull is presumably declared mutable in the header — confirm.
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
                       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
                       << dendl;
    return true;
  }
  return false;
}
826
827 bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
828 {
829 std::lock_guard l(full_status_lock);
830
831 if (_check_inject_full(dpp, type))
832 return true;
833
834 if (cur_state >= type)
835 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
836 << " physical " << physical_ratio << dendl;
837
838 return cur_state >= type;
839 }
840
// Would we be at least `type` full if `adjust_used` additional bytes were
// consumed on top of `adjusted_stat`?  Used to pre-check operations such
// as backfill before committing to them.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // injected-full test hook takes precedence
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
862
// Thin per-threshold wrappers over _check_full() / _tentative_full().

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

// Would adding adjust_used bytes push us past the backfillfull threshold?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
887
// Lock-protected accessors for the cached fullness state (cur_state is
// maintained by check_full_status()).  Note is_failsafe_full() tests
// equality while the others test "at or above".

bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
911
// Test hook: force fullness state `type` for the next `count` checks
// (count == -1 means "always").
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
918
919 void OSDService::set_statfs(const struct store_statfs_t &stbuf,
920 osd_alert_list_t& alerts)
921 {
922 uint64_t bytes = stbuf.total;
923 uint64_t avail = stbuf.available;
924 uint64_t used = stbuf.get_used_raw();
925
926 // For testing fake statfs values so it doesn't matter if all
927 // OSDs are using the same partition.
928 if (cct->_conf->fake_statfs_for_testing) {
929 uint64_t total_num_bytes = 0;
930 vector<PGRef> pgs;
931 osd->_get_pgs(&pgs);
932 for (auto p : pgs) {
933 total_num_bytes += p->get_stats_num_bytes();
934 }
935 bytes = cct->_conf->fake_statfs_for_testing;
936 if (total_num_bytes < bytes)
937 avail = bytes - total_num_bytes;
938 else
939 avail = 0;
940 dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
941 << " adjust available " << avail
942 << dendl;
943 used = bytes - avail;
944 }
945
946 osd->logger->set(l_osd_stat_bytes, bytes);
947 osd->logger->set(l_osd_stat_bytes_used, used);
948 osd->logger->set(l_osd_stat_bytes_avail, avail);
949
950 std::lock_guard l(stat_lock);
951 osd_stat.statfs = stbuf;
952 osd_stat.os_alerts.clear();
953 osd_stat.os_alerts[whoami].swap(alerts);
954 if (cct->_conf->fake_statfs_for_testing) {
955 osd_stat.statfs.total = bytes;
956 osd_stat.statfs.available = avail;
957 // For testing don't want used to go negative, so clear reserved
958 osd_stat.statfs.internally_reserved = 0;
959 }
960 }
961
962 osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
963 int num_pgs)
964 {
965 utime_t now = ceph_clock_now();
966 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
967 std::lock_guard l(stat_lock);
968 osd_stat.hb_peers.swap(hb_peers);
969 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
970 osd_stat.num_pgs = num_pgs;
971 // Clean entries that aren't updated
972 // This is called often enough that we can just remove 1 at a time
973 for (auto i: osd_stat.hb_pingtime) {
974 if (i.second.last_update == 0)
975 continue;
976 if (stale_time && now.sec() - i.second.last_update > stale_time) {
977 dout(20) << __func__ << " time out heartbeat for osd " << i.first
978 << " last_update " << i.second.last_update << dendl;
979 osd_stat.hb_pingtime.erase(i.first);
980 break;
981 }
982 }
983 return osd_stat;
984 }
985
986 void OSDService::inc_osd_stat_repaired()
987 {
988 std::lock_guard l(stat_lock);
989 osd_stat.num_shards_repaired++;
990 return;
991 }
992
993 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
994 uint64_t adjust_used)
995 {
996 *pratio =
997 ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
998
999 if (adjust_used) {
1000 dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
1001 if (new_stat.statfs.available > adjust_used)
1002 new_stat.statfs.available -= adjust_used;
1003 else
1004 new_stat.statfs.available = 0;
1005 dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
1006 }
1007
1008 // Check all pgs and adjust kb_used to include all pending backfill data
1009 int backfill_adjusted = 0;
1010 vector<PGRef> pgs;
1011 osd->_get_pgs(&pgs);
1012 for (auto p : pgs) {
1013 backfill_adjusted += p->pg_stat_adjust(&new_stat);
1014 }
1015 if (backfill_adjusted) {
1016 dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
1017 }
1018 return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
1019 }
1020
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  // Send m to 'peer' over the cluster messenger, but only if the peer is
  // still up and has not been restarted since from_epoch (per the reserved
  // "next" map).  Otherwise the message is dropped via put().
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    // peer is down, or came back up after from_epoch: stale target, drop.
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    // message to ourselves: use the loopback connection
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  // opportunistically share our map with the peer before sending
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
1044
void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  // Batch variant of send_message_osd_cluster(): reserve the next map once
  // and send each (peer, message) pair, dropping messages whose target is
  // down or restarted since from_epoch.
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
	next_map->get_info(iter.first).up_from > from_epoch) {
      // stale target: drop this message, keep going with the rest
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
	  next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  // Return a cluster-messenger connection to 'peer', or NULL if the peer
  // is down or was restarted since from_epoch.
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    // connection to ourselves: loopback
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}
1090
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  // Return the (back, front) heartbeat connections to 'peer', or a pair of
  // null refs if the peer is down or was restarted since from_epoch.
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;  // both refs null
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}
1110
entity_name_t OSDService::get_cluster_msgr_name() const
{
  // Entity name this OSD uses on the cluster (internal) messenger.
  return cluster_messenger->get_myname();
}
1115
void OSDService::queue_want_pg_temp(pg_t pgid,
				    const vector<int>& want,
				    bool forced)
{
  // Record a pg_temp mapping we want the mon to install.  Skip queueing if
  // an identical request is already pending (i.e. sent but not yet
  // reflected in a map), unless 'forced' is set.
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}
1128
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  // Forget any pg_temp request for pgid, whether still queued or already
  // sent (pending).
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
1135
void OSDService::_sent_pg_temp()
{
  // Move everything just sent from 'wanted' to 'pending'.  Caller must hold
  // pg_temp_lock (leading underscore convention).
#ifdef HAVE_STDLIB_MAP_SPLICING
  // splice nodes directly when the stdlib supports map::merge
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1146
void OSDService::requeue_pg_temp()
{
  // Re-queue everything (pending and wanted) so it gets re-sent to the mon,
  // e.g. after a reconnect.
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  // merge wanted into pending (wanted wins on conflicts), then swap the
  // combined set back into wanted so send_pg_temp() will resend it all.
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
	   << pg_temp_wanted.size() << dendl;
}
1159
1160 std::ostream& operator<<(std::ostream& out,
1161 const OSDService::pg_temp_t& pg_temp)
1162 {
1163 out << pg_temp.acting;
1164 if (pg_temp.forced) {
1165 out << " (forced)";
1166 }
1167 return out;
1168 }
1169
1170 void OSDService::send_pg_temp()
1171 {
1172 std::lock_guard l(pg_temp_lock);
1173 if (pg_temp_wanted.empty())
1174 return;
1175 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
1176 MOSDPGTemp *ms[2] = {nullptr, nullptr};
1177 for (auto& [pgid, pg_temp] : pg_temp_wanted) {
1178 auto& m = ms[pg_temp.forced];
1179 if (!m) {
1180 m = new MOSDPGTemp(osdmap->get_epoch());
1181 m->forced = pg_temp.forced;
1182 }
1183 m->pg_temp.emplace(pgid, pg_temp.acting);
1184 }
1185 for (auto m : ms) {
1186 if (m) {
1187 monc->send_mon_message(m);
1188 }
1189 }
1190 _sent_pg_temp();
1191 }
1192
void OSDService::send_pg_created(pg_t pgid)
{
  // Tell the mon this PG finished creating; remember it locally so it can
  // be re-sent (see the no-arg overload) until pruned.
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  // MOSDPGCreated only exists for luminous+ mons
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}
1203
void OSDService::send_pg_created()
{
  // Re-send MOSDPGCreated for every PG still tracked in pg_created, e.g.
  // after a mon session reset.
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}
1215
1216 void OSDService::prune_pg_created()
1217 {
1218 std::lock_guard l(pg_created_lock);
1219 dout(20) << __func__ << dendl;
1220 auto o = get_osdmap();
1221 auto i = pg_created.begin();
1222 while (i != pg_created.end()) {
1223 auto p = o->get_pg_pool(i->pool());
1224 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1225 dout(20) << __func__ << " pruning " << *i << dendl;
1226 i = pg_created.erase(i);
1227 } else {
1228 dout(20) << __func__ << " keeping " << *i << dendl;
1229 ++i;
1230 }
1231 }
1232 }
1233
1234
1235 // --------------------------------------
1236 // dispatch
1237
1238 bool OSDService::can_inc_scrubs()
1239 {
1240 bool can_inc = false;
1241 std::lock_guard l(sched_scrub_lock);
1242
1243 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1244 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1245 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
1246 can_inc = true;
1247 } else {
1248 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1249 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1250 }
1251
1252 return can_inc;
1253 }
1254
1255 bool OSDService::inc_scrubs_local()
1256 {
1257 bool result = false;
1258 std::lock_guard l{sched_scrub_lock};
1259 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1260 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1261 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1262 result = true;
1263 ++scrubs_local;
1264 } else {
1265 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1266 }
1267 return result;
1268 }
1269
void OSDService::dec_scrubs_local()
{
  // Release a locally-reserved scrub slot.  The counter must be positive;
  // the assert catches unbalanced inc/dec pairs.
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
	   << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
  --scrubs_local;
  ceph_assert(scrubs_local >= 0);
}
1278
1279 bool OSDService::inc_scrubs_remote()
1280 {
1281 bool result = false;
1282 std::lock_guard l{sched_scrub_lock};
1283 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1284 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1285 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1286 result = true;
1287 ++scrubs_remote;
1288 } else {
1289 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1290 }
1291 return result;
1292 }
1293
void OSDService::dec_scrubs_remote()
{
  // Release a remotely-reserved scrub slot; assert guards against
  // unbalanced inc/dec pairs.
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
	   << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
  --scrubs_remote;
  ceph_assert(scrubs_remote >= 0);
}
1302
void OSDService::dump_scrub_reservations(Formatter *f)
{
  // Emit current scrub reservation counters (for admin-socket style output).
  std::lock_guard l{sched_scrub_lock};
  f->dump_int("scrubs_local", scrubs_local);
  f->dump_int("scrubs_remote", scrubs_remote);
  f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}
1310
void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  // Atomically read any subset of the boot/up/bind epochs; each out-param
  // may be null if the caller does not need that value.
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}
1322
void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  // Atomically update any subset of the boot/up/bind epochs.  Each value
  // must be monotonically non-decreasing (0 means "reset").
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}
1340
bool OSDService::prepare_to_stop()
{
  // Begin a clean shutdown.  If we are up in the map, ask the mon to mark
  // us down first and wait (bounded by osd_mon_shutdown_timeout) for the
  // ack to flip our state to STOPPING via got_stop_ack().  Returns false
  // if a shutdown is already in progress.
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
	monc->get_fsid(),
	whoami,
	osdmap->get_addrs(whoami),
	osdmap->get_epoch(),
	true  // request ack
	));
    // wait for got_stop_ack(), or time out and shut down anyway
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1367
void OSDService::got_stop_ack()
{
  // Mon acknowledged our mark-me-down request: move to STOPPING and wake
  // the waiter in prepare_to_stop().  Ignored unless a stop is in flight.
  std::scoped_lock l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.notify_all();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}
1379
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  // Build an MOSDMap carrying maps (since, to], bounded by
  // osd_map_message_max entries and osd_map_message_max_bytes.  If 'since'
  // predates our oldest stored map, start with a full map instead of an
  // incremental.  On unreadable maps, fall through to 'panic' and send
  // whatever we can so the peer still makes progress.
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
			   osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  // message size limits (count of maps and total bytes)
  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
	     << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since].claim(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else {
      // no incremental on disk for this epoch; substitute the full map
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
	derr << __func__ << " also missing full map " << e << dendl;
	goto panic;
      }
      m->maps[e].claim(bl);
    }
    max--;
    max_bytes -= bl.length();
    // stop once either budget is exhausted; peer will ask for the rest
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map].claim(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
	   << dendl;
      // cannot even produce our newest map: unrecoverable
      ceph_abort();
    }
    m->maps[m->newest_map].claim(bl);
  }
  return m;
}
1447
void OSDService::send_map(MOSDMap *m, Connection *con)
{
  // Send the prepared map message over the given connection.
  con->send_message(m);
}
1452
1453 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1454 const OSDMapRef& osdmap)
1455 {
1456 epoch_t to = osdmap->get_epoch();
1457 dout(10) << "send_incremental_map " << since << " -> " << to
1458 << " to " << con << " " << con->get_peer_addr() << dendl;
1459
1460 MOSDMap *m = NULL;
1461 while (!m) {
1462 OSDSuperblock sblock(get_superblock());
1463 if (since < sblock.oldest_map) {
1464 // just send latest full map
1465 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1466 osdmap->get_encoding_features());
1467 m->oldest_map = max_oldest_map;
1468 m->newest_map = sblock.newest_map;
1469 get_map_bl(to, m->maps[to]);
1470 send_map(m, con);
1471 return;
1472 }
1473
1474 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1475 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl;
1477 since = to - cct->_conf->osd_map_share_max_epochs;
1478 }
1479
1480 m = build_incremental_map_msg(since, to, sblock);
1481 }
1482 send_map(m, con);
1483 }
1484
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  // Fetch the encoded full map for epoch e, first from the in-memory cache
  // and then from the object store (caching on success).  Caller must hold
  // map_cache_lock (see try_get_map(), which locks before calling us).
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}
1503
bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  // Fetch the encoded incremental map for epoch e (cache first, then the
  // object store, caching on success).  Takes map_cache_lock itself.
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}
1523
void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  // Insert an encoded full map into the cache.  Caller holds map_cache_lock.
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  // account the buffer to the osd_mapbl mempool for observability
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}
1534
void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  // Insert an encoded incremental map into the cache.  Caller holds
  // map_cache_lock.
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  // account the buffer to the osd_mapbl mempool for observability
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}
1545
OSDMapRef OSDService::_add_map(OSDMap *o)
{
  // Insert a decoded OSDMap into the map cache, taking ownership of o.
  // Returns the cached ref (which may be a pre-existing map for the same
  // epoch, in which case o is deleted).
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    // someone else cached this epoch first; drop our copy
    delete o;
  }
  return l;
}
1564
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  // Return the OSDMap for 'epoch', from cache or by loading and decoding
  // it from the store.  Returns a null ref if the map cannot be loaded.
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      // track misses below the cache window separately; they suggest the
      // cache is too small for the access pattern
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    // epoch 0: an empty, initial map
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}
1601
1602 // ops
1603
1604
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  // Convenience wrapper: error reply with zero versions and no op returns.
  reply_op_error(op, err, eversion_t(), 0, {});
}
1609
1610 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1611 version_t uv,
1612 vector<pg_log_op_return_item_t> op_returns)
1613 {
1614 auto m = op->get_req<MOSDOp>();
1615 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1616 int flags;
1617 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1618
1619 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1620 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
1621 reply->set_reply_versions(v, uv);
1622 reply->set_op_returns(op_returns);
1623 m->get_connection()->send_message(reply);
1624 }
1625
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  // Debug aid: log a cluster warning when an op arrives at a PG that is not
  // its primary.  For EC pools, first rule out the benign shard-remap race
  // described below.  Only active when osd_debug_misdirected_ops is set.
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
	pgid.shard != pg->pg_id.shard) {
      // shard changed between the client's epoch and now: benign race,
      // drop silently
      dout(7) << __func__ << ": " << *pg << " primary changed since "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
	       << " pg " << m->get_raw_pg()
	       << " to osd." << whoami
	       << " not " << pg->get_acting()
	       << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1680
void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  // Push a work item onto the tail of the sharded op queue.
  osd->op_shardedwq.queue(std::move(qi));
}
1685
void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  // Push a work item onto the front of the sharded op queue.
  osd->op_shardedwq.queue_front(std::move(qi));
}
1690
void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c)
{
  // Queue an arbitrary recovery continuation for this PG at recovery
  // cost/priority, stamped with the current map epoch.
  epoch_t e = get_osdmap_epoch();
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGRecoveryContext(pg->get_pgid(), c, e)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}
1706
1707 void OSDService::queue_for_snap_trim(PG *pg)
1708 {
1709 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1710 enqueue_back(
1711 OpSchedulerItem(
1712 unique_ptr<OpSchedulerItem::OpQueueable>(
1713 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1714 cct->_conf->osd_snap_trim_cost,
1715 cct->_conf->osd_snap_trim_priority,
1716 ceph_clock_now(),
1717 0,
1718 pg->get_osdmap_epoch()));
1719 }
1720
void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
{
  // Queue this PG for scrubbing.  When with_high_priority is set, boost the
  // queue priority to at least client-op priority so the scrub isn't
  // starved behind client work.
  unsigned scrub_queue_priority = pg->scrubber.priority;
  if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
    scrub_queue_priority = cct->_conf->osd_client_op_priority;
  }
  const auto epoch = pg->get_osdmap_epoch();
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
      cct->_conf->osd_scrub_cost,
      scrub_queue_priority,
      ceph_clock_now(),
      0,
      epoch));
}
1737
void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
{
  // Queue deletion work for a PG at pg-delete cost/priority, stamped with
  // the epoch at which the deletion was decided.
  dout(10) << __func__ << " on " << pgid << " e " << e  << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGDelete(pgid, e)),
      cct->_conf->osd_pg_delete_cost,
      cct->_conf->osd_pg_delete_priority,
      ceph_clock_now(),
      0,
      e));
}
1751
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  // Forward to the OSD, which owns the PG registry.
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1756
1757 // ---
1758
1759 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1760 {
1761 std::lock_guard l(merge_lock);
1762 dout(10) << __func__ << " " << pg->pg_id << dendl;
1763 ready_to_merge_source[pg->pg_id.pgid] = version;
1764 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1765 _send_ready_to_merge();
1766 }
1767
1768 void OSDService::set_ready_to_merge_target(PG *pg,
1769 eversion_t version,
1770 epoch_t last_epoch_started,
1771 epoch_t last_epoch_clean)
1772 {
1773 std::lock_guard l(merge_lock);
1774 dout(10) << __func__ << " " << pg->pg_id << dendl;
1775 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1776 make_tuple(version,
1777 last_epoch_started,
1778 last_epoch_clean)));
1779 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1780 _send_ready_to_merge();
1781 }
1782
1783 void OSDService::set_not_ready_to_merge_source(pg_t source)
1784 {
1785 std::lock_guard l(merge_lock);
1786 dout(10) << __func__ << " " << source << dendl;
1787 not_ready_to_merge_source.insert(source);
1788 assert(ready_to_merge_source.count(source) == 0);
1789 _send_ready_to_merge();
1790 }
1791
1792 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1793 {
1794 std::lock_guard l(merge_lock);
1795 dout(10) << __func__ << " " << target << " source " << source << dendl;
1796 not_ready_to_merge_target[target] = source;
1797 assert(ready_to_merge_target.count(target) == 0);
1798 _send_ready_to_merge();
1799 }
1800
void OSDService::send_ready_to_merge()
{
  // Public entry point: take merge_lock and flush pending notifications.
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1806
1807 void OSDService::_send_ready_to_merge()
1808 {
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1815 << dendl;
1816 for (auto src : not_ready_to_merge_source) {
1817 if (sent_ready_to_merge_source.count(src) == 0) {
1818 monc->send_mon_message(new MOSDPGReadyToMerge(
1819 src,
1820 {}, {}, 0, 0,
1821 false,
1822 osdmap->get_epoch()));
1823 sent_ready_to_merge_source.insert(src);
1824 }
1825 }
1826 for (auto p : not_ready_to_merge_target) {
1827 if (sent_ready_to_merge_source.count(p.second) == 0) {
1828 monc->send_mon_message(new MOSDPGReadyToMerge(
1829 p.second,
1830 {}, {}, 0, 0,
1831 false,
1832 osdmap->get_epoch()));
1833 sent_ready_to_merge_source.insert(p.second);
1834 }
1835 }
1836 for (auto src : ready_to_merge_source) {
1837 if (not_ready_to_merge_source.count(src.first) ||
1838 not_ready_to_merge_target.count(src.first.get_parent())) {
1839 continue;
1840 }
1841 auto p = ready_to_merge_target.find(src.first.get_parent());
1842 if (p != ready_to_merge_target.end() &&
1843 sent_ready_to_merge_source.count(src.first) == 0) {
1844 monc->send_mon_message(new MOSDPGReadyToMerge(
1845 src.first, // source pgid
1846 src.second, // src version
1847 std::get<0>(p->second), // target version
1848 std::get<1>(p->second), // PG's last_epoch_started
1849 std::get<2>(p->second), // PG's last_epoch_clean
1850 true,
1851 osdmap->get_epoch()));
1852 sent_ready_to_merge_source.insert(src.first);
1853 }
1854 }
1855 }
1856
void OSDService::clear_ready_to_merge(PG *pg)
{
  // Drop all merge bookkeeping for this PG (both ready and not-ready, as
  // source or target, plus the sent-notification record).
  std::lock_guard l(merge_lock);
  dout(10) << __func__ << " " << pg->pg_id << dendl;
  ready_to_merge_source.erase(pg->pg_id.pgid);
  ready_to_merge_target.erase(pg->pg_id.pgid);
  not_ready_to_merge_source.erase(pg->pg_id.pgid);
  not_ready_to_merge_target.erase(pg->pg_id.pgid);
  sent_ready_to_merge_source.erase(pg->pg_id.pgid);
}
1867
void OSDService::clear_sent_ready_to_merge()
{
  // Forget which notifications were already sent (so they will be re-sent),
  // e.g. after a mon session reset.
  std::lock_guard l(merge_lock);
  sent_ready_to_merge_source.clear();
}
1873
1874 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1875 {
1876 std::lock_guard l(merge_lock);
1877 auto i = sent_ready_to_merge_source.begin();
1878 while (i != sent_ready_to_merge_source.end()) {
1879 if (!osdmap->pg_exists(*i)) {
1880 dout(10) << __func__ << " " << *i << dendl;
1881 i = sent_ready_to_merge_source.erase(i);
1882 } else {
1883 ++i;
1884 }
1885 }
1886 }
1887
1888 // ---
1889
void OSDService::_queue_for_recovery(
  std::pair<epoch_t, PGRef> p,
  uint64_t reserved_pushes)
{
  // Queue recovery work for PG p.second, queued-at-epoch p.first, carrying
  // the number of pushes already reserved for it.  Caller must hold
  // recovery_lock (enforced below).
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGRecovery(
	  p.second->get_pgid(), p.first, reserved_pushes)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      p.first));
}
1906
1907 // ====================================================================
1908 // OSD
1909
1910 #undef dout_prefix
1911 #define dout_prefix *_dout
1912
1913 // Commands shared between OSD's console and admin console:
1914 namespace ceph {
1915 namespace osd_cmds {
1916
1917 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
1918
1919 }} // namespace ceph::osd_cmds
1920
int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
{
  // Initialize a fresh object store for this OSD: mkfs + mount the store,
  // then either validate an existing superblock (id and cluster fsid must
  // match) or create a new one, and finally write the meta files.  Takes
  // ownership of 'store' and deletes it on all paths (via goto cleanup).
  // Returns 0 on success or a negative errno.
  int ret;

  OSDSuperblock sb;
  bufferlist sbbl;
  ObjectStore::CollectionHandle ch;

  // if we are fed a uuid for this osd, use it.
  store->set_fsid(cct->_conf->osd_uuid);

  ret = store->mkfs();
  if (ret) {
    derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  store->set_cache_shards(1);  // doesn't matter for mkfs!

  ret = store->mount();
  if (ret) {
    derr << "OSD::mkfs: couldn't mount ObjectStore: error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  ch = store->open_collection(coll_t::meta());
  if (ch) {
    // meta collection already exists: this store was mkfs'd before
    ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
    if (ret < 0) {
      derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
      goto free_store;
    }
    /* if we already have superblock, check content of superblock */
    dout(0) << " have superblock" << dendl;
    auto p = sbbl.cbegin();
    decode(sb, p);
    if (whoami != sb.whoami) {
      derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
	   << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
    if (fsid != sb.cluster_fsid) {
      derr << "provided cluster fsid " << fsid
	   << " != superblock's " << sb.cluster_fsid << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
  } else {
    // create superblock
    sb.cluster_fsid = fsid;
    sb.osd_fsid = store->get_fsid();
    sb.whoami = whoami;
    sb.compat_features = get_osd_initial_compat_set();

    bufferlist bl;
    encode(sb, bl);

    // NOTE(review): this inner 'ch' shadows the outer handle; it is
    // released at the end of this block, before umount — confirm intended.
    ObjectStore::CollectionHandle ch = store->create_new_collection(
      coll_t::meta());
    ObjectStore::Transaction t;
    t.create_collection(coll_t::meta(), 0);
    t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
    ret = store->queue_transaction(ch, std::move(t));
    if (ret) {
      derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
	   << "queue_transaction returned " << cpp_strerror(ret) << dendl;
      goto umount_store;
    }
  }

  ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
  if (ret) {
    derr << "OSD::mkfs: failed to write fsid file: error "
         << cpp_strerror(ret) << dendl;
    goto umount_store;
  }

umount_store:
  // release the collection handle before unmounting
  if (ch) {
    ch.reset();
  }
  store->umount();
free_store:
  delete store;
  return ret;
}
2010
int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
{
  // Persist the OSD's identity meta files (magic, whoami, ceph_fsid, key,
  // osdspec_affinity) via the store's write_meta interface, finishing with
  // the "ready" marker.  Returns 0 on success or the first negative errno.
  char val[80];
  int r;

  snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
  r = store->write_meta("magic", val);
  if (r < 0)
    return r;

  snprintf(val, sizeof(val), "%d", whoami);
  r = store->write_meta("whoami", val);
  if (r < 0)
    return r;

  cluster_fsid.print(val);
  r = store->write_meta("ceph_fsid", val);
  if (r < 0)
    return r;

  // prefer an explicit 'key' config; otherwise fall back to 'keyfile'
  string key = cct->_conf.get_val<string>("key");
  if (key.size()) {
    r = store->write_meta("osd_key", key);
    if (r < 0)
      return r;
  } else {
    string keyfile = cct->_conf.get_val<string>("keyfile");
    if (!keyfile.empty()) {
      bufferlist keybl;
      string err;
      r = keybl.read_file(keyfile.c_str(), &err);
      if (r < 0) {
	derr << __func__ << " failed to read keyfile " << keyfile << ": "
	     << err << ": " << cpp_strerror(r) << dendl;
	return r;
      }
      r = store->write_meta("osd_key", keybl.to_str());
      if (r < 0)
	return r;
    }
  }
  if (!osdspec_affinity.empty()) {
    r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
    if (r < 0)
      return r;
  }

  // written last: marks the meta as complete
  r = store->write_meta("ready", "ready");
  if (r < 0)
    return r;

  return 0;
}
2064
2065 int OSD::peek_meta(ObjectStore *store,
2066 std::string *magic,
2067 uuid_d *cluster_fsid,
2068 uuid_d *osd_fsid,
2069 int *whoami,
2070 ceph_release_t *require_osd_release)
2071 {
2072 string val;
2073
2074 int r = store->read_meta("magic", &val);
2075 if (r < 0)
2076 return r;
2077 *magic = val;
2078
2079 r = store->read_meta("whoami", &val);
2080 if (r < 0)
2081 return r;
2082 *whoami = atoi(val.c_str());
2083
2084 r = store->read_meta("ceph_fsid", &val);
2085 if (r < 0)
2086 return r;
2087 r = cluster_fsid->parse(val.c_str());
2088 if (!r)
2089 return -EINVAL;
2090
2091 r = store->read_meta("fsid", &val);
2092 if (r < 0) {
2093 *osd_fsid = uuid_d();
2094 } else {
2095 r = osd_fsid->parse(val.c_str());
2096 if (!r)
2097 return -EINVAL;
2098 }
2099
2100 r = store->read_meta("require_osd_release", &val);
2101 if (r >= 0) {
2102 *require_osd_release = ceph_release_from_name(val);
2103 }
2104
2105 return 0;
2106 }
2107
2108
2109 #undef dout_prefix
2110 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2111
2112 // cons/des
2113
// OSD constructor: wires together the messengers, the mon client, the
// object store handle, and the per-OSD infrastructure (op thread pool,
// sharded op work queue, op tracker, heartbeat machinery).  Heavy
// startup work -- mounting the store, loading maps -- happens later in
// init(), not here.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  tick_timer(cct, osd_lock),
  // second timer for callbacks that must not contend on osd_lock
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger, &mc->monmap),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
             cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  last_pg_create_epoch(0),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  service(this)
{

  // Export the GSSAPI client keytab location, when configured, so that
  // the Kerberos library can pick it up via its standard lookup chain.
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
     The default client keytab is used, if it is present and readable,
     to automatically obtain initial credentials for GSSAPI client
     applications. The principal name of the first entry in the client
     keytab is used by default when obtaining initial credentials.
     1. The KRB5_CLIENT_KTNAME environment variable.
     2. The default_client_keytab_name profile variable in [libdefaults].
     3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
                                    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // seed the op tracker's thresholds from config; these can also change
  // at runtime through the config-observer path
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
  ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
#ifdef WITH_BLKIN
  // blkin tracing: label this endpoint "osd.<id>"
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this);
    shards.push_back(one_shard);
  }
}
2214
2215 OSD::~OSD()
2216 {
2217 while (!shards.empty()) {
2218 delete shards.back();
2219 shards.pop_back();
2220 }
2221 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2222 cct->get_perfcounters_collection()->remove(logger);
2223 delete recoverystate_perf;
2224 delete logger;
2225 delete store;
2226 }
2227
2228 double OSD::get_tick_interval() const
2229 {
2230 // vary +/- 5% to avoid scrub scheduling livelocks
2231 constexpr auto delta = 0.05;
2232 return (OSD_TICK_INTERVAL *
2233 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2234 }
2235
// SIGINT/SIGTERM handler: log which signal arrived and begin an orderly
// shutdown of the OSD.  Only these two signals are expected here.
void OSD::handle_signal(int signum)
{
  ceph_assert(signum == SIGINT || signum == SIGTERM);
  derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
  shutdown();
}
2242
2243 int OSD::pre_init()
2244 {
2245 std::lock_guard lock(osd_lock);
2246 if (is_stopping())
2247 return 0;
2248
2249 if (store->test_mount_in_use()) {
2250 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2251 << "currently in use. (Is ceph-osd already running?)" << dendl;
2252 return -EBUSY;
2253 }
2254
2255 cct->_conf.add_observer(this);
2256 return 0;
2257 }
2258
// Detect the NUMA node used by the object store and by the public and
// cluster network interfaces, and optionally pin all OSD threads to
// that node's CPUs.
//
// Selection logic:
//  - automatic affinity is chosen only when storage, public-network,
//    and cluster-network nodes all agree AND "osd_numa_auto_affinity"
//    is enabled;
//  - a configured "osd_numa_node" >= 0 overrides the detection above.
// All failures are logged but non-fatal: the function always returns 0.
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
            << front_node << dendl;
    // the cluster interface is only probed when the public one resolved
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
              << back_node << dendl;
      if (front_node == back_node &&
          front_node == store_node) {
        dout(1) << " objectstore and network numa nodes all match" << dendl;
        if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
          numa_node = front_node;
        }
      } else if (front_node != back_node) {
        dout(1) << __func__ << " public and cluster network numa nodes do not match"
                << dendl;
      } else {
        dout(1) << __func__ << " objectstore and network numa nodes do not match"
                << dendl;
      }
    } else if (back_node == -2) {
      // node == -2: the interface's ports report different numa nodes
      dout(1) << __func__ << " cluster network " << back_iface
              << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
           << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    // node == -2: the interface's ports report different numa nodes
    dout(1) << __func__ << " public network " << front_iface
            << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
         << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
              << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
              << " cpus "
              << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
              << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
        // NOTE(review): this discards the helper's negative return and
        // substitutes -errno, which assumes the helper left errno set
        // -- confirm that assumption holds.
        r = -errno;
        derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
             << dendl;
        numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2339
2340 // asok
2341
// Admin-socket hook that forwards asok commands to OSD::asok_command().
// Only the asynchronous entry point is supported; the synchronous one
// must never be reached.
class OSDSocketHook : public AdminSocketHook {
  OSD *osd;
public:
  explicit OSDSocketHook(OSD *o) : osd(o) {}
  // Synchronous path is unsupported for OSD commands: everything
  // completes through the on_finish callback, so reaching here is a
  // programming error and aborts.
  int call(std::string_view prefix, const cmdmap_t& cmdmap,
	   Formatter *f,
	   std::ostream& ss,
	   bufferlist& out) override {
    ceph_abort("should use async hook");
  }
  // Dispatch the command to OSD::asok_command(); a malformed command
  // map (bad_cmd_get) is reported to the caller as -EINVAL rather than
  // letting the exception escape the admin-socket thread.
  void call_async(
    std::string_view prefix,
    const cmdmap_t& cmdmap,
    Formatter *f,
    const bufferlist& inbl,
    std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
    try {
      osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
    } catch (const TOPNSPC::common::bad_cmd_get& e) {
      bufferlist empty;
      on_finish(-EINVAL, e.what(), empty);
    }
  }
};
2366
2367 std::set<int64_t> OSD::get_mapped_pools()
2368 {
2369 std::set<int64_t> pools;
2370 std::vector<spg_t> pgids;
2371 _get_pgids(&pgids);
2372 for (const auto &pgid : pgids) {
2373 pools.insert(pgid.pool());
2374 }
2375 return pools;
2376 }
2377
2378 void OSD::asok_command(
2379 std::string_view prefix, const cmdmap_t& cmdmap,
2380 Formatter *f,
2381 const bufferlist& inbl,
2382 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2383 {
2384 int ret = 0;
2385 stringstream ss; // stderr error message stream
2386 bufferlist outbl; // if empty at end, we'll dump formatter as output
2387
2388 // --- PG commands are routed here to PG::do_command ---
2389 if (prefix == "pg" ||
2390 prefix == "query" ||
2391 prefix == "mark_unfound_lost" ||
2392 prefix == "list_unfound" ||
2393 prefix == "scrub" ||
2394 prefix == "deep_scrub"
2395 ) {
2396 string pgidstr;
2397 pg_t pgid;
2398 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2399 ss << "no pgid specified";
2400 ret = -EINVAL;
2401 goto out;
2402 }
2403 if (!pgid.parse(pgidstr.c_str())) {
2404 ss << "couldn't parse pgid '" << pgidstr << "'";
2405 ret = -EINVAL;
2406 goto out;
2407 }
2408 spg_t pcand;
2409 PGRef pg;
2410 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2411 (pg = _lookup_lock_pg(pcand))) {
2412 if (pg->is_primary()) {
2413 cmdmap_t new_cmdmap = cmdmap;
2414 try {
2415 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2416 pg->unlock();
2417 return; // the pg handler calls on_finish directly
2418 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2419 pg->unlock();
2420 ss << e.what();
2421 ret = -EINVAL;
2422 goto out;
2423 }
2424 } else {
2425 ss << "not primary for pgid " << pgid;
2426 // do not reply; they will get newer maps and realize they
2427 // need to resend.
2428 pg->unlock();
2429 ret = -EAGAIN;
2430 goto out;
2431 }
2432 } else {
2433 ss << "i don't have pgid " << pgid;
2434 ret = -ENOENT;
2435 }
2436 }
2437
2438 // --- OSD commands follow ---
2439
2440 else if (prefix == "status") {
2441 lock_guard l(osd_lock);
2442 f->open_object_section("status");
2443 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2444 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2445 f->dump_unsigned("whoami", superblock.whoami);
2446 f->dump_string("state", get_state_name(get_state()));
2447 f->dump_unsigned("oldest_map", superblock.oldest_map);
2448 f->dump_unsigned("newest_map", superblock.newest_map);
2449 f->dump_unsigned("num_pgs", num_pgs);
2450 f->close_section();
2451 } else if (prefix == "flush_journal") {
2452 store->flush_journal();
2453 } else if (prefix == "dump_ops_in_flight" ||
2454 prefix == "ops" ||
2455 prefix == "dump_blocked_ops" ||
2456 prefix == "dump_historic_ops" ||
2457 prefix == "dump_historic_ops_by_duration" ||
2458 prefix == "dump_historic_slow_ops") {
2459
2460 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2461 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2462 will start to track new ops received afterwards.";
2463
2464 set<string> filters;
2465 vector<string> filter_str;
2466 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2467 copy(filter_str.begin(), filter_str.end(),
2468 inserter(filters, filters.end()));
2469 }
2470
2471 if (prefix == "dump_ops_in_flight" ||
2472 prefix == "ops") {
2473 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2474 ss << error_str;
2475 ret = -EINVAL;
2476 goto out;
2477 }
2478 }
2479 if (prefix == "dump_blocked_ops") {
2480 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2481 ss << error_str;
2482 ret = -EINVAL;
2483 goto out;
2484 }
2485 }
2486 if (prefix == "dump_historic_ops") {
2487 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2488 ss << error_str;
2489 ret = -EINVAL;
2490 goto out;
2491 }
2492 }
2493 if (prefix == "dump_historic_ops_by_duration") {
2494 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2495 ss << error_str;
2496 ret = -EINVAL;
2497 goto out;
2498 }
2499 }
2500 if (prefix == "dump_historic_slow_ops") {
2501 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2502 ss << error_str;
2503 ret = -EINVAL;
2504 goto out;
2505 }
2506 }
2507 } else if (prefix == "dump_op_pq_state") {
2508 f->open_object_section("pq");
2509 op_shardedwq.dump(f);
2510 f->close_section();
2511 } else if (prefix == "dump_blacklist") {
2512 list<pair<entity_addr_t,utime_t> > bl;
2513 OSDMapRef curmap = service.get_osdmap();
2514
2515 f->open_array_section("blacklist");
2516 curmap->get_blacklist(&bl);
2517 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2518 it != bl.end(); ++it) {
2519 f->open_object_section("entry");
2520 f->open_object_section("entity_addr_t");
2521 it->first.dump(f);
2522 f->close_section(); //entity_addr_t
2523 it->second.localtime(f->dump_stream("expire_time"));
2524 f->close_section(); //entry
2525 }
2526 f->close_section(); //blacklist
2527 } else if (prefix == "dump_watchers") {
2528 list<obj_watch_item_t> watchers;
2529 // scan pg's
2530 vector<PGRef> pgs;
2531 _get_pgs(&pgs);
2532 for (auto& pg : pgs) {
2533 list<obj_watch_item_t> pg_watchers;
2534 pg->get_watchers(&pg_watchers);
2535 watchers.splice(watchers.end(), pg_watchers);
2536 }
2537
2538 f->open_array_section("watchers");
2539 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2540 it != watchers.end(); ++it) {
2541
2542 f->open_object_section("watch");
2543
2544 f->dump_string("namespace", it->obj.nspace);
2545 f->dump_string("object", it->obj.oid.name);
2546
2547 f->open_object_section("entity_name");
2548 it->wi.name.dump(f);
2549 f->close_section(); //entity_name_t
2550
2551 f->dump_unsigned("cookie", it->wi.cookie);
2552 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2553
2554 f->open_object_section("entity_addr_t");
2555 it->wi.addr.dump(f);
2556 f->close_section(); //entity_addr_t
2557
2558 f->close_section(); //watch
2559 }
2560
2561 f->close_section(); //watchers
2562 } else if (prefix == "dump_recovery_reservations") {
2563 f->open_object_section("reservations");
2564 f->open_object_section("local_reservations");
2565 service.local_reserver.dump(f);
2566 f->close_section();
2567 f->open_object_section("remote_reservations");
2568 service.remote_reserver.dump(f);
2569 f->close_section();
2570 f->close_section();
2571 } else if (prefix == "dump_scrub_reservations") {
2572 f->open_object_section("scrub_reservations");
2573 service.dump_scrub_reservations(f);
2574 f->close_section();
2575 } else if (prefix == "get_latest_osdmap") {
2576 get_latest_osdmap();
2577 } else if (prefix == "set_heap_property") {
2578 string property;
2579 int64_t value = 0;
2580 string error;
2581 bool success = false;
2582 if (!cmd_getval(cmdmap, "property", property)) {
2583 error = "unable to get property";
2584 success = false;
2585 } else if (!cmd_getval(cmdmap, "value", value)) {
2586 error = "unable to get value";
2587 success = false;
2588 } else if (value < 0) {
2589 error = "negative value not allowed";
2590 success = false;
2591 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2592 error = "invalid property";
2593 success = false;
2594 } else {
2595 success = true;
2596 }
2597 f->open_object_section("result");
2598 f->dump_string("error", error);
2599 f->dump_bool("success", success);
2600 f->close_section();
2601 } else if (prefix == "get_heap_property") {
2602 string property;
2603 size_t value = 0;
2604 string error;
2605 bool success = false;
2606 if (!cmd_getval(cmdmap, "property", property)) {
2607 error = "unable to get property";
2608 success = false;
2609 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2610 error = "invalid property";
2611 success = false;
2612 } else {
2613 success = true;
2614 }
2615 f->open_object_section("result");
2616 f->dump_string("error", error);
2617 f->dump_bool("success", success);
2618 f->dump_int("value", value);
2619 f->close_section();
2620 } else if (prefix == "dump_objectstore_kv_stats") {
2621 store->get_db_statistics(f);
2622 } else if (prefix == "dump_scrubs") {
2623 service.dumps_scrub(f);
2624 } else if (prefix == "calc_objectstore_db_histogram") {
2625 store->generate_db_histogram(f);
2626 } else if (prefix == "flush_store_cache") {
2627 store->flush_cache(&ss);
2628 } else if (prefix == "dump_pgstate_history") {
2629 f->open_object_section("pgstate_history");
2630 f->open_array_section("pgs");
2631 vector<PGRef> pgs;
2632 _get_pgs(&pgs);
2633 for (auto& pg : pgs) {
2634 f->open_object_section("pg");
2635 f->dump_stream("pg") << pg->pg_id;
2636 f->dump_string("currently", pg->get_current_state());
2637 pg->dump_pgstate_history(f);
2638 f->close_section();
2639 }
2640 f->close_section();
2641 f->close_section();
2642 } else if (prefix == "compact") {
2643 dout(1) << "triggering manual compaction" << dendl;
2644 auto start = ceph::coarse_mono_clock::now();
2645 store->compact();
2646 auto end = ceph::coarse_mono_clock::now();
2647 double duration = std::chrono::duration<double>(end-start).count();
2648 dout(1) << "finished manual compaction in "
2649 << duration
2650 << " seconds" << dendl;
2651 f->open_object_section("compact_result");
2652 f->dump_float("elapsed_time", duration);
2653 f->close_section();
2654 } else if (prefix == "get_mapped_pools") {
2655 f->open_array_section("mapped_pools");
2656 set<int64_t> poollist = get_mapped_pools();
2657 for (auto pool : poollist) {
2658 f->dump_int("pool_id", pool);
2659 }
2660 f->close_section();
2661 } else if (prefix == "smart") {
2662 string devid;
2663 cmd_getval(cmdmap, "devid", devid);
2664 ostringstream out;
2665 probe_smart(devid, out);
2666 outbl.append(out.str());
2667 } else if (prefix == "list_devices") {
2668 set<string> devnames;
2669 store->get_devices(&devnames);
2670 f->open_array_section("list_devices");
2671 for (auto dev : devnames) {
2672 if (dev.find("dm-") == 0) {
2673 continue;
2674 }
2675 string err;
2676 f->open_object_section("device");
2677 f->dump_string("device", "/dev/" + dev);
2678 f->dump_string("device_id", get_device_id(dev, &err));
2679 f->close_section();
2680 }
2681 f->close_section();
2682 } else if (prefix == "send_beacon") {
2683 lock_guard l(osd_lock);
2684 if (is_active()) {
2685 send_beacon(ceph::coarse_mono_clock::now());
2686 }
2687 }
2688
2689 else if (prefix == "cluster_log") {
2690 vector<string> msg;
2691 cmd_getval(cmdmap, "message", msg);
2692 if (msg.empty()) {
2693 ret = -EINVAL;
2694 ss << "ignoring empty log message";
2695 goto out;
2696 }
2697 string message = msg.front();
2698 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2699 message += " " + *a;
2700 string lvl;
2701 cmd_getval(cmdmap, "level", lvl);
2702 clog_type level = string_to_clog_type(lvl);
2703 if (level < 0) {
2704 ret = -EINVAL;
2705 ss << "unknown level '" << lvl << "'";
2706 goto out;
2707 }
2708 clog->do_log(level, message);
2709 }
2710
2711 else if (prefix == "bench") {
2712 lock_guard l(osd_lock);
2713 int64_t count;
2714 int64_t bsize;
2715 int64_t osize, onum;
2716 // default count 1G, size 4MB
2717 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2718 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2719 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2720 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2721
2722 uint32_t duration = cct->_conf->osd_bench_duration;
2723
2724 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2725 // let us limit the block size because the next checks rely on it
2726 // having a sane value. If we allow any block size to be set things
2727 // can still go sideways.
2728 ss << "block 'size' values are capped at "
2729 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2730 << " a higher value, please adjust 'osd_bench_max_block_size'";
2731 ret = -EINVAL;
2732 goto out;
2733 } else if (bsize < (int64_t) (1 << 20)) {
2734 // entering the realm of small block sizes.
2735 // limit the count to a sane value, assuming a configurable amount of
2736 // IOPS and duration, so that the OSD doesn't get hung up on this,
2737 // preventing timeouts from going off
2738 int64_t max_count =
2739 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2740 if (count > max_count) {
2741 ss << "'count' values greater than " << max_count
2742 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2743 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2744 << " for " << duration << " seconds,"
2745 << " can cause ill effects on osd. "
2746 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2747 << " value if you wish to use a higher 'count'.";
2748 ret = -EINVAL;
2749 goto out;
2750 }
2751 } else {
2752 // 1MB block sizes are big enough so that we get more stuff done.
2753 // However, to avoid the osd from getting hung on this and having
2754 // timers being triggered, we are going to limit the count assuming
2755 // a configurable throughput and duration.
2756 // NOTE: max_count is the total amount of bytes that we believe we
2757 // will be able to write during 'duration' for the given
2758 // throughput. The block size hardly impacts this unless it's
2759 // way too big. Given we already check how big the block size
2760 // is, it's safe to assume everything will check out.
2761 int64_t max_count =
2762 cct->_conf->osd_bench_large_size_max_throughput * duration;
2763 if (count > max_count) {
2764 ss << "'count' values greater than " << max_count
2765 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2766 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2767 << " for " << duration << " seconds,"
2768 << " can cause ill effects on osd. "
2769 << " Please adjust 'osd_bench_large_size_max_throughput'"
2770 << " with a higher value if you wish to use a higher 'count'.";
2771 ret = -EINVAL;
2772 goto out;
2773 }
2774 }
2775
2776 if (osize && bsize > osize)
2777 bsize = osize;
2778
2779 dout(1) << " bench count " << count
2780 << " bsize " << byte_u_t(bsize) << dendl;
2781
2782 ObjectStore::Transaction cleanupt;
2783
2784 if (osize && onum) {
2785 bufferlist bl;
2786 bufferptr bp(osize);
2787 bp.zero();
2788 bl.push_back(std::move(bp));
2789 bl.rebuild_page_aligned();
2790 for (int i=0; i<onum; ++i) {
2791 char nm[30];
2792 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2793 object_t oid(nm);
2794 hobject_t soid(sobject_t(oid, 0));
2795 ObjectStore::Transaction t;
2796 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
2797 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2798 cleanupt.remove(coll_t(), ghobject_t(soid));
2799 }
2800 }
2801
2802 bufferlist bl;
2803 bufferptr bp(bsize);
2804 bp.zero();
2805 bl.push_back(std::move(bp));
2806 bl.rebuild_page_aligned();
2807
2808 {
2809 C_SaferCond waiter;
2810 if (!service.meta_ch->flush_commit(&waiter)) {
2811 waiter.wait();
2812 }
2813 }
2814
2815 utime_t start = ceph_clock_now();
2816 for (int64_t pos = 0; pos < count; pos += bsize) {
2817 char nm[30];
2818 unsigned offset = 0;
2819 if (onum && osize) {
2820 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2821 offset = rand() % (osize / bsize) * bsize;
2822 } else {
2823 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2824 }
2825 object_t oid(nm);
2826 hobject_t soid(sobject_t(oid, 0));
2827 ObjectStore::Transaction t;
2828 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2829 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2830 if (!onum || !osize)
2831 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2832 }
2833
2834 {
2835 C_SaferCond waiter;
2836 if (!service.meta_ch->flush_commit(&waiter)) {
2837 waiter.wait();
2838 }
2839 }
2840 utime_t end = ceph_clock_now();
2841
2842 // clean up
2843 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2844 {
2845 C_SaferCond waiter;
2846 if (!service.meta_ch->flush_commit(&waiter)) {
2847 waiter.wait();
2848 }
2849 }
2850
2851 double elapsed = end - start;
2852 double rate = count / elapsed;
2853 double iops = rate / bsize;
2854 f->open_object_section("osd_bench_results");
2855 f->dump_int("bytes_written", count);
2856 f->dump_int("blocksize", bsize);
2857 f->dump_float("elapsed_sec", elapsed);
2858 f->dump_float("bytes_per_sec", rate);
2859 f->dump_float("iops", iops);
2860 f->close_section();
2861 }
2862
2863 else if (prefix == "flush_pg_stats") {
2864 mgrc.send_pgstats();
2865 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2866 }
2867
2868 else if (prefix == "heap") {
2869 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2870 }
2871
2872 else if (prefix == "debug dump_missing") {
2873 f->open_array_section("pgs");
2874 vector<PGRef> pgs;
2875 _get_pgs(&pgs);
2876 for (auto& pg : pgs) {
2877 string s = stringify(pg->pg_id);
2878 f->open_array_section(s.c_str());
2879 pg->lock();
2880 pg->dump_missing(f);
2881 pg->unlock();
2882 f->close_section();
2883 }
2884 f->close_section();
2885 }
2886
2887 else if (prefix == "debug kick_recovery_wq") {
2888 int64_t delay;
2889 cmd_getval(cmdmap, "delay", delay);
2890 ostringstream oss;
2891 oss << delay;
2892 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2893 if (ret != 0) {
2894 ss << "kick_recovery_wq: error setting "
2895 << "osd_recovery_delay_start to '" << delay << "': error "
2896 << ret;
2897 goto out;
2898 }
2899 cct->_conf.apply_changes(nullptr);
2900 ss << "kicking recovery queue. set osd_recovery_delay_start "
2901 << "to " << cct->_conf->osd_recovery_delay_start;
2902 }
2903
2904 else if (prefix == "cpu_profiler") {
2905 ostringstream ds;
2906 string arg;
2907 cmd_getval(cmdmap, "arg", arg);
2908 vector<string> argvec;
2909 get_str_vec(arg, argvec);
2910 cpu_profiler_handle_command(argvec, ds);
2911 outbl.append(ds.str());
2912 }
2913
2914 else if (prefix == "dump_pg_recovery_stats") {
2915 lock_guard l(osd_lock);
2916 pg_recovery_stats.dump_formatted(f);
2917 }
2918
2919 else if (prefix == "reset_pg_recovery_stats") {
2920 lock_guard l(osd_lock);
2921 pg_recovery_stats.reset();
2922 }
2923
2924 else if (prefix == "perf histogram dump") {
2925 std::string logger;
2926 std::string counter;
2927 cmd_getval(cmdmap, "logger", logger);
2928 cmd_getval(cmdmap, "counter", counter);
2929 cct->get_perfcounters_collection()->dump_formatted_histograms(
2930 f, false, logger, counter);
2931 }
2932
2933 else if (prefix == "cache drop") {
2934 lock_guard l(osd_lock);
2935 dout(20) << "clearing all caches" << dendl;
2936 // Clear the objectstore's cache - onode and buffer for Bluestore,
2937 // system's pagecache for Filestore
2938 ret = store->flush_cache(&ss);
2939 if (ret < 0) {
2940 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2941 goto out;
2942 }
2943 // Clear the objectcontext cache (per PG)
2944 vector<PGRef> pgs;
2945 _get_pgs(&pgs);
2946 for (auto& pg: pgs) {
2947 pg->clear_cache();
2948 }
2949 }
2950
2951 else if (prefix == "cache status") {
2952 lock_guard l(osd_lock);
2953 int obj_ctx_count = 0;
2954 vector<PGRef> pgs;
2955 _get_pgs(&pgs);
2956 for (auto& pg: pgs) {
2957 obj_ctx_count += pg->get_cache_obj_count();
2958 }
2959 f->open_object_section("cache_status");
2960 f->dump_int("object_ctx", obj_ctx_count);
2961 store->dump_cache_stats(f);
2962 f->close_section();
2963 }
2964
2965 else if (prefix == "scrub_purged_snaps") {
2966 lock_guard l(osd_lock);
2967 scrub_purged_snaps();
2968 }
2969
2970 else if (prefix == "dump_osd_network") {
2971 lock_guard l(osd_lock);
2972 int64_t value = 0;
2973 if (!(cmd_getval(cmdmap, "value", value))) {
2974 // Convert milliseconds to microseconds
2975 value = static_cast<double>(g_conf().get_val<double>(
2976 "mon_warn_on_slow_ping_time")) * 1000;
2977 if (value == 0) {
2978 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2979 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2980 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2981 }
2982 } else {
2983 // Convert user input to microseconds
2984 value *= 1000;
2985 }
2986 if (value < 0) value = 0;
2987
2988 struct osd_ping_time_t {
2989 uint32_t pingtime;
2990 int to;
2991 bool back;
2992 std::array<uint32_t,3> times;
2993 std::array<uint32_t,3> min;
2994 std::array<uint32_t,3> max;
2995 uint32_t last;
2996 uint32_t last_update;
2997
2998 bool operator<(const osd_ping_time_t& rhs) const {
2999 if (pingtime < rhs.pingtime)
3000 return true;
3001 if (pingtime > rhs.pingtime)
3002 return false;
3003 if (to < rhs.to)
3004 return true;
3005 if (to > rhs.to)
3006 return false;
3007 return back;
3008 }
3009 };
3010
3011 set<osd_ping_time_t> sorted;
3012 // Get pingtimes under lock and not on the stack
3013 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3014 service.get_hb_pingtime(pingtimes);
3015 for (auto j : *pingtimes) {
3016 if (j.second.last_update == 0)
3017 continue;
3018 osd_ping_time_t item;
3019 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3020 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3021 if (item.pingtime >= value) {
3022 item.to = j.first;
3023 item.times[0] = j.second.back_pingtime[0];
3024 item.times[1] = j.second.back_pingtime[1];
3025 item.times[2] = j.second.back_pingtime[2];
3026 item.min[0] = j.second.back_min[0];
3027 item.min[1] = j.second.back_min[1];
3028 item.min[2] = j.second.back_min[2];
3029 item.max[0] = j.second.back_max[0];
3030 item.max[1] = j.second.back_max[1];
3031 item.max[2] = j.second.back_max[2];
3032 item.last = j.second.back_last;
3033 item.back = true;
3034 item.last_update = j.second.last_update;
3035 sorted.emplace(item);
3036 }
3037 if (j.second.front_last == 0)
3038 continue;
3039 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3040 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3041 if (item.pingtime >= value) {
3042 item.to = j.first;
3043 item.times[0] = j.second.front_pingtime[0];
3044 item.times[1] = j.second.front_pingtime[1];
3045 item.times[2] = j.second.front_pingtime[2];
3046 item.min[0] = j.second.front_min[0];
3047 item.min[1] = j.second.front_min[1];
3048 item.min[2] = j.second.front_min[2];
3049 item.max[0] = j.second.front_max[0];
3050 item.max[1] = j.second.front_max[1];
3051 item.max[2] = j.second.front_max[2];
3052 item.last = j.second.front_last;
3053 item.last_update = j.second.last_update;
3054 item.back = false;
3055 sorted.emplace(item);
3056 }
3057 }
3058 delete pingtimes;
3059 //
3060 // Network ping times (1min 5min 15min)
3061 f->open_object_section("network_ping_times");
3062 f->dump_int("threshold", value / 1000);
3063 f->open_array_section("entries");
3064 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3065 ceph_assert(sitem.pingtime >= value);
3066 f->open_object_section("entry");
3067
3068 const time_t lu(sitem.last_update);
3069 char buffer[26];
3070 string lustr(ctime_r(&lu, buffer));
3071 lustr.pop_back(); // Remove trailing \n
3072 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3073 f->dump_string("last update", lustr);
3074 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3075 f->dump_int("from osd", whoami);
3076 f->dump_int("to osd", sitem.to);
3077 f->dump_string("interface", (sitem.back ? "back" : "front"));
3078 f->open_object_section("average");
3079 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3080 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3081 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3082 f->close_section(); // average
3083 f->open_object_section("min");
3084 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3085 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3086 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3087 f->close_section(); // min
3088 f->open_object_section("max");
3089 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3090 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3091 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3092 f->close_section(); // max
3093 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3094 f->close_section(); // entry
3095 }
3096 f->close_section(); // entries
3097 f->close_section(); // network_ping_times
3098 } else {
3099 ceph_abort_msg("broken asok registration");
3100 }
3101
3102 out:
3103 on_finish(ret, ss.str(), outbl);
3104 }
3105
3106 class TestOpsSocketHook : public AdminSocketHook {
3107 OSDService *service;
3108 ObjectStore *store;
3109 public:
3110 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3111 int call(std::string_view command, const cmdmap_t& cmdmap,
3112 Formatter *f,
3113 std::ostream& errss,
3114 bufferlist& out) override {
3115 int r = 0;
3116 stringstream outss;
3117 try {
3118 test_ops(service, store, command, cmdmap, outss);
3119 out.append(outss);
3120 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3121 errss << e.what();
3122 r = -EINVAL;
3123 }
3124 return r;
3125 }
3126 void test_ops(OSDService *service, ObjectStore *store,
3127 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3128
3129 };
3130
3131 class OSD::C_Tick : public Context {
3132 OSD *osd;
3133 public:
3134 explicit C_Tick(OSD *o) : osd(o) {}
3135 void finish(int r) override {
3136 osd->tick();
3137 }
3138 };
3139
3140 class OSD::C_Tick_WithoutOSDLock : public Context {
3141 OSD *osd;
3142 public:
3143 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3144 void finish(int r) override {
3145 osd->tick_without_osd_lock();
3146 }
3147 };
3148
3149 int OSD::enable_disable_fuse(bool stop)
3150 {
3151 #ifdef HAVE_LIBFUSE
3152 int r;
3153 string mntpath = cct->_conf->osd_data + "/fuse";
3154 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3155 dout(1) << __func__ << " disabling" << dendl;
3156 fuse_store->stop();
3157 delete fuse_store;
3158 fuse_store = NULL;
3159 r = ::rmdir(mntpath.c_str());
3160 if (r < 0) {
3161 r = -errno;
3162 derr << __func__ << " failed to rmdir " << mntpath << ": "
3163 << cpp_strerror(r) << dendl;
3164 return r;
3165 }
3166 return 0;
3167 }
3168 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3169 dout(1) << __func__ << " enabling" << dendl;
3170 r = ::mkdir(mntpath.c_str(), 0700);
3171 if (r < 0)
3172 r = -errno;
3173 if (r < 0 && r != -EEXIST) {
3174 derr << __func__ << " unable to create " << mntpath << ": "
3175 << cpp_strerror(r) << dendl;
3176 return r;
3177 }
3178 fuse_store = new FuseStore(store, mntpath);
3179 r = fuse_store->start();
3180 if (r < 0) {
3181 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3182 delete fuse_store;
3183 fuse_store = NULL;
3184 return r;
3185 }
3186 }
3187 #endif // HAVE_LIBFUSE
3188 return 0;
3189 }
3190
3191 size_t OSD::get_num_cache_shards()
3192 {
3193 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3194 }
3195
3196 int OSD::get_num_op_shards()
3197 {
3198 if (cct->_conf->osd_op_num_shards)
3199 return cct->_conf->osd_op_num_shards;
3200 if (store_is_rotational)
3201 return cct->_conf->osd_op_num_shards_hdd;
3202 else
3203 return cct->_conf->osd_op_num_shards_ssd;
3204 }
3205
3206 int OSD::get_num_op_threads()
3207 {
3208 if (cct->_conf->osd_op_num_threads_per_shard)
3209 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3210 if (store_is_rotational)
3211 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3212 else
3213 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3214 }
3215
3216 float OSD::get_osd_recovery_sleep()
3217 {
3218 if (cct->_conf->osd_recovery_sleep)
3219 return cct->_conf->osd_recovery_sleep;
3220 if (!store_is_rotational && !journal_is_rotational)
3221 return cct->_conf->osd_recovery_sleep_ssd;
3222 else if (store_is_rotational && !journal_is_rotational)
3223 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3224 else
3225 return cct->_conf->osd_recovery_sleep_hdd;
3226 }
3227
3228 float OSD::get_osd_delete_sleep()
3229 {
3230 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3231 if (osd_delete_sleep > 0)
3232 return osd_delete_sleep;
3233 if (!store_is_rotational && !journal_is_rotational)
3234 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3235 if (store_is_rotational && !journal_is_rotational)
3236 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3237 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3238 }
3239
3240 int OSD::get_recovery_max_active()
3241 {
3242 if (cct->_conf->osd_recovery_max_active)
3243 return cct->_conf->osd_recovery_max_active;
3244 if (store_is_rotational)
3245 return cct->_conf->osd_recovery_max_active_hdd;
3246 else
3247 return cct->_conf->osd_recovery_max_active_ssd;
3248 }
3249
3250 float OSD::get_osd_snap_trim_sleep()
3251 {
3252 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3253 if (osd_snap_trim_sleep > 0)
3254 return osd_snap_trim_sleep;
3255 if (!store_is_rotational && !journal_is_rotational)
3256 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3257 if (store_is_rotational && !journal_is_rotational)
3258 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3259 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3260 }
3261
// Bring the OSD fully online: mount the object store, validate the
// superblock and on-disk compat features, load PGs, wire the messengers
// to the monitor/mgr clients, prime split/merge state, start worker
// threads and timers, and kick off the boot sequence.
//
// Runs under osd_lock except for the authentication window (see below).
// Returns 0 on success (or if already stopping); on failure jumps to the
// `out:` label, which unmounts and deletes the store before returning a
// negative errno.
int OSD::init()
{
  OSDMapRef osdmap;
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  // Timers must be initialized before any events can be scheduled on them.
  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  {
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = ceph_release_from_name(val);
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store); // call pre_init() first!

  store->set_cache_shards(get_num_cache_shards());

  int r = store->mount();
  if (r < 0) {
    // Store never mounted, so return directly rather than via `out:`
    // (which would try to umount).
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
	  << dendl;

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  // sanity check long object name handling
  {
    // Build a worst-case hobject key (max name, key and namespace lengths)
    // and ask the backend whether it can store it.
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << " osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << " osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // Refuse to run if the on-disk data uses incompat features this binary
  // doesn't know about.
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  // The store's superblock must belong to this OSD id.
  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  startup_time = ceph::mono_clock::now();

  // load up "current" osdmap
  assert_warn(!get_osdmap());
  if (get_osdmap()) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);
  set_osdmap(osdmap);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      // A PG collection whose pool no longer exists must have a saved
      // pg_pool_t; otherwise it is a pre-mimic deletion in progress and we
      // cannot proceed.
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  // Merge any compat features this binary introduces into the superblock.
  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // Are we adding SNAPMAPPER2?
    if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
      dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
	      << dendl;
      auto ch = service.meta_ch;
      auto hoid = make_snapmapper_oid();
      unsigned max = cct->_conf->osd_target_transaction_size;
      r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
      if (r < 0)
	goto out;
    }
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }
  // ... and the purged_snaps object.
  if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
    dout(10) << "init creating/touching purged_snaps object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  if (cct->_conf->osd_open_classes_on_start) {
    // Best-effort preload of rados classes; failure is only a warning.
    int r = ClassHandler::get_instance().open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;

  create_logger();

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger auth_client is already set up by monc.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
		      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  // Hook up mgr stat/perf-query callbacks before starting the mgr client.
  mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const ConfigPayload &config_payload) {
      set_perf_queries(config_payload);
    },
    [this] {
      return get_perf_reports();
    });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter.get());

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // Prime split/merge bookkeeping for every loaded PG whose stored map is
  // older than the current one.
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      std::scoped_lock l{*pg};
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	// prime_splits consumes entries; all must have been claimed.
	assert(new_children.empty());
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	assert(merge_pgs.empty());
      }
    }
  }

  osd_op_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // Drop osd_lock while blocking on monitor authentication and crush
  // updates so other osd_lock users are not stalled; re-taken below.
  osd_lock.unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
	 << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  osd_lock.lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;

 out:
  // Error path: undo the mount done above before reporting failure.
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
3663
3664 void OSD::final_init()
3665 {
3666 AdminSocket *admin_socket = cct->get_admin_socket();
3667 asok_hook = new OSDSocketHook(this);
3668 int r = admin_socket->register_command("status", asok_hook,
3669 "high-level status of OSD");
3670 ceph_assert(r == 0);
3671 r = admin_socket->register_command("flush_journal",
3672 asok_hook,
3673 "flush the journal to permanent store");
3674 ceph_assert(r == 0);
3675 r = admin_socket->register_command("dump_ops_in_flight " \
3676 "name=filterstr,type=CephString,n=N,req=false",
3677 asok_hook,
3678 "show the ops currently in flight");
3679 ceph_assert(r == 0);
3680 r = admin_socket->register_command("ops " \
3681 "name=filterstr,type=CephString,n=N,req=false",
3682 asok_hook,
3683 "show the ops currently in flight");
3684 ceph_assert(r == 0);
3685 r = admin_socket->register_command("dump_blocked_ops " \
3686 "name=filterstr,type=CephString,n=N,req=false",
3687 asok_hook,
3688 "show the blocked ops currently in flight");
3689 ceph_assert(r == 0);
3690 r = admin_socket->register_command("dump_historic_ops " \
3691 "name=filterstr,type=CephString,n=N,req=false",
3692 asok_hook,
3693 "show recent ops");
3694 ceph_assert(r == 0);
3695 r = admin_socket->register_command("dump_historic_slow_ops " \
3696 "name=filterstr,type=CephString,n=N,req=false",
3697 asok_hook,
3698 "show slowest recent ops");
3699 ceph_assert(r == 0);
3700 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3701 "name=filterstr,type=CephString,n=N,req=false",
3702 asok_hook,
3703 "show slowest recent ops, sorted by duration");
3704 ceph_assert(r == 0);
3705 r = admin_socket->register_command("dump_op_pq_state",
3706 asok_hook,
3707 "dump op priority queue state");
3708 ceph_assert(r == 0);
3709 r = admin_socket->register_command("dump_blacklist",
3710 asok_hook,
3711 "dump blacklisted clients and times");
3712 ceph_assert(r == 0);
3713 r = admin_socket->register_command("dump_watchers",
3714 asok_hook,
3715 "show clients which have active watches,"
3716 " and on which objects");
3717 ceph_assert(r == 0);
3718 r = admin_socket->register_command("dump_recovery_reservations",
3719 asok_hook,
3720 "show recovery reservations");
3721 ceph_assert(r == 0);
3722 r = admin_socket->register_command("dump_scrub_reservations",
3723 asok_hook,
3724 "show recovery reservations");
3725 ceph_assert(r == 0);
3726 r = admin_socket->register_command("get_latest_osdmap",
3727 asok_hook,
3728 "force osd to update the latest map from "
3729 "the mon");
3730 ceph_assert(r == 0);
3731
3732 r = admin_socket->register_command("set_heap_property " \
3733 "name=property,type=CephString " \
3734 "name=value,type=CephInt",
3735 asok_hook,
3736 "update malloc extension heap property");
3737 ceph_assert(r == 0);
3738
3739 r = admin_socket->register_command("get_heap_property " \
3740 "name=property,type=CephString",
3741 asok_hook,
3742 "get malloc extension heap property");
3743 ceph_assert(r == 0);
3744
3745 r = admin_socket->register_command("dump_objectstore_kv_stats",
3746 asok_hook,
3747 "print statistics of kvdb which used by bluestore");
3748 ceph_assert(r == 0);
3749
3750 r = admin_socket->register_command("dump_scrubs",
3751 asok_hook,
3752 "print scheduled scrubs");
3753 ceph_assert(r == 0);
3754
3755 r = admin_socket->register_command("calc_objectstore_db_histogram",
3756 asok_hook,
3757 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3758 ceph_assert(r == 0);
3759
3760 r = admin_socket->register_command("flush_store_cache",
3761 asok_hook,
3762 "Flush bluestore internal cache");
3763 ceph_assert(r == 0);
3764 r = admin_socket->register_command("dump_pgstate_history",
3765 asok_hook,
3766 "show recent state history");
3767 ceph_assert(r == 0);
3768
3769 r = admin_socket->register_command("compact",
3770 asok_hook,
3771 "Commpact object store's omap."
3772 " WARNING: Compaction probably slows your requests");
3773 ceph_assert(r == 0);
3774
3775 r = admin_socket->register_command("get_mapped_pools",
3776 asok_hook,
3777 "dump pools whose PG(s) are mapped to this OSD.");
3778
3779 ceph_assert(r == 0);
3780
3781 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3782 asok_hook,
3783 "probe OSD devices for SMART data.");
3784
3785 ceph_assert(r == 0);
3786
3787 r = admin_socket->register_command("list_devices",
3788 asok_hook,
3789 "list OSD devices.");
3790 r = admin_socket->register_command("send_beacon",
3791 asok_hook,
3792 "send OSD beacon to mon immediately");
3793
3794 r = admin_socket->register_command(
3795 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3796 "Dump osd heartbeat network ping times");
3797 ceph_assert(r == 0);
3798
3799 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3800 // Note: pools are CephString instead of CephPoolname because
3801 // these commands traditionally support both pool names and numbers
3802 r = admin_socket->register_command(
3803 "setomapval " \
3804 "name=pool,type=CephString " \
3805 "name=objname,type=CephObjectname " \
3806 "name=key,type=CephString "\
3807 "name=val,type=CephString",
3808 test_ops_hook,
3809 "set omap key");
3810 ceph_assert(r == 0);
3811 r = admin_socket->register_command(
3812 "rmomapkey " \
3813 "name=pool,type=CephString " \
3814 "name=objname,type=CephObjectname " \
3815 "name=key,type=CephString",
3816 test_ops_hook,
3817 "remove omap key");
3818 ceph_assert(r == 0);
3819 r = admin_socket->register_command(
3820 "setomapheader " \
3821 "name=pool,type=CephString " \
3822 "name=objname,type=CephObjectname " \
3823 "name=header,type=CephString",
3824 test_ops_hook,
3825 "set omap header");
3826 ceph_assert(r == 0);
3827
3828 r = admin_socket->register_command(
3829 "getomap " \
3830 "name=pool,type=CephString " \
3831 "name=objname,type=CephObjectname",
3832 test_ops_hook,
3833 "output entire object map");
3834 ceph_assert(r == 0);
3835
3836 r = admin_socket->register_command(
3837 "truncobj " \
3838 "name=pool,type=CephString " \
3839 "name=objname,type=CephObjectname " \
3840 "name=len,type=CephInt",
3841 test_ops_hook,
3842 "truncate object to length");
3843 ceph_assert(r == 0);
3844
3845 r = admin_socket->register_command(
3846 "injectdataerr " \
3847 "name=pool,type=CephString " \
3848 "name=objname,type=CephObjectname " \
3849 "name=shardid,type=CephInt,req=false,range=0|255",
3850 test_ops_hook,
3851 "inject data error to an object");
3852 ceph_assert(r == 0);
3853
3854 r = admin_socket->register_command(
3855 "injectmdataerr " \
3856 "name=pool,type=CephString " \
3857 "name=objname,type=CephObjectname " \
3858 "name=shardid,type=CephInt,req=false,range=0|255",
3859 test_ops_hook,
3860 "inject metadata error to an object");
3861 ceph_assert(r == 0);
3862 r = admin_socket->register_command(
3863 "set_recovery_delay " \
3864 "name=utime,type=CephInt,req=false",
3865 test_ops_hook,
3866 "Delay osd recovery by specified seconds");
3867 ceph_assert(r == 0);
3868 r = admin_socket->register_command(
3869 "injectfull " \
3870 "name=type,type=CephString,req=false " \
3871 "name=count,type=CephInt,req=false ",
3872 test_ops_hook,
3873 "Inject a full disk (optional count times)");
3874 ceph_assert(r == 0);
3875 r = admin_socket->register_command(
3876 "bench " \
3877 "name=count,type=CephInt,req=false " \
3878 "name=size,type=CephInt,req=false " \
3879 "name=object_size,type=CephInt,req=false " \
3880 "name=object_num,type=CephInt,req=false ",
3881 asok_hook,
3882 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3883 "(default count=1G default size=4MB). Results in log.");
3884 ceph_assert(r == 0);
3885 r = admin_socket->register_command(
3886 "cluster_log " \
3887 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3888 "name=message,type=CephString,n=N",
3889 asok_hook,
3890 "log a message to the cluster log");
3891 ceph_assert(r == 0);
3892 r = admin_socket->register_command(
3893 "flush_pg_stats",
3894 asok_hook,
3895 "flush pg stats");
3896 ceph_assert(r == 0);
3897 r = admin_socket->register_command(
3898 "heap " \
3899 "name=heapcmd,type=CephChoices,strings=" \
3900 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3901 "name=value,type=CephString,req=false",
3902 asok_hook,
3903 "show heap usage info (available only if compiled with tcmalloc)");
3904 ceph_assert(r == 0);
3905 r = admin_socket->register_command(
3906 "debug dump_missing " \
3907 "name=filename,type=CephFilepath",
3908 asok_hook,
3909 "dump missing objects to a named file");
3910 ceph_assert(r == 0);
3911 r = admin_socket->register_command(
3912 "debug kick_recovery_wq " \
3913 "name=delay,type=CephInt,range=0",
3914 asok_hook,
3915 "set osd_recovery_delay_start to <val>");
3916 ceph_assert(r == 0);
3917 r = admin_socket->register_command(
3918 "cpu_profiler " \
3919 "name=arg,type=CephChoices,strings=status|flush",
3920 asok_hook,
3921 "run cpu profiling on daemon");
3922 ceph_assert(r == 0);
3923 r = admin_socket->register_command(
3924 "dump_pg_recovery_stats",
3925 asok_hook,
3926 "dump pg recovery statistics");
3927 ceph_assert(r == 0);
3928 r = admin_socket->register_command(
3929 "reset_pg_recovery_stats",
3930 asok_hook,
3931 "reset pg recovery statistics");
3932 ceph_assert(r == 0);
3933 r = admin_socket->register_command(
3934 "cache drop",
3935 asok_hook,
3936 "Drop all OSD caches");
3937 ceph_assert(r == 0);
3938 r = admin_socket->register_command(
3939 "cache status",
3940 asok_hook,
3941 "Get OSD caches statistics");
3942 ceph_assert(r == 0);
3943 r = admin_socket->register_command(
3944 "scrub_purged_snaps",
3945 asok_hook,
3946 "Scrub purged_snaps vs snapmapper index");
3947 ceph_assert(r == 0);
3948
3949 // -- pg commands --
3950 // old form: ceph pg <pgid> command ...
3951 r = admin_socket->register_command(
3952 "pg " \
3953 "name=pgid,type=CephPgid " \
3954 "name=cmd,type=CephChoices,strings=query",
3955 asok_hook,
3956 "");
3957 ceph_assert(r == 0);
3958 r = admin_socket->register_command(
3959 "pg " \
3960 "name=pgid,type=CephPgid " \
3961 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3962 "name=mulcmd,type=CephChoices,strings=revert|delete",
3963 asok_hook,
3964 "");
3965 ceph_assert(r == 0);
3966 r = admin_socket->register_command(
3967 "pg " \
3968 "name=pgid,type=CephPgid " \
3969 "name=cmd,type=CephChoices,strings=list_unfound " \
3970 "name=offset,type=CephString,req=false",
3971 asok_hook,
3972 "");
3973 ceph_assert(r == 0);
3974 r = admin_socket->register_command(
3975 "pg " \
3976 "name=pgid,type=CephPgid " \
3977 "name=cmd,type=CephChoices,strings=scrub " \
3978 "name=time,type=CephInt,req=false",
3979 asok_hook,
3980 "");
3981 ceph_assert(r == 0);
3982 r = admin_socket->register_command(
3983 "pg " \
3984 "name=pgid,type=CephPgid " \
3985 "name=cmd,type=CephChoices,strings=deep_scrub " \
3986 "name=time,type=CephInt,req=false",
3987 asok_hook,
3988 "");
3989 ceph_assert(r == 0);
3990 // new form: tell <pgid> <cmd> for both cli and rest
3991 r = admin_socket->register_command(
3992 "query",
3993 asok_hook,
3994 "show details of a specific pg");
3995 ceph_assert(r == 0);
3996 r = admin_socket->register_command(
3997 "mark_unfound_lost " \
3998 "name=pgid,type=CephPgid,req=false " \
3999 "name=mulcmd,type=CephChoices,strings=revert|delete",
4000 asok_hook,
4001 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4002 ceph_assert(r == 0);
4003 r = admin_socket->register_command(
4004 "list_unfound " \
4005 "name=pgid,type=CephPgid,req=false " \
4006 "name=offset,type=CephString,req=false",
4007 asok_hook,
4008 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4009 ceph_assert(r == 0);
4010 r = admin_socket->register_command(
4011 "scrub " \
4012 "name=pgid,type=CephPgid,req=false " \
4013 "name=time,type=CephInt,req=false",
4014 asok_hook,
4015 "Trigger a scheduled scrub ");
4016 ceph_assert(r == 0);
4017 r = admin_socket->register_command(
4018 "deep_scrub " \
4019 "name=pgid,type=CephPgid,req=false " \
4020 "name=time,type=CephInt,req=false",
4021 asok_hook,
4022 "Trigger a scheduled deep scrub ");
4023 ceph_assert(r == 0);
4024 }
4025
4026 void OSD::create_logger()
4027 {
4028 dout(10) << "create_logger" << dendl;
4029
4030 logger = build_osd_logger(cct);
4031 cct->get_perfcounters_collection()->add(logger);
4032 }
4033
4034 void OSD::create_recoverystate_perf()
4035 {
4036 dout(10) << "create_recoverystate_perf" << dendl;
4037
4038 recoverystate_perf = build_recoverystate_perf(cct);
4039 cct->get_perfcounters_collection()->add(recoverystate_perf);
4040 }
4041
// Orderly teardown of the OSD.  The sequence below is order-sensitive:
// stop feeding work in (mgrc, op queue), shut down PGs, stop heartbeat
// machinery and thread pools, persist a clean-unmount superblock, drop
// all PG references, release osdmaps, unmount the store, and finally
// shut down the messengers.  Returns the superblock-write result.
int OSD::shutdown()
{
  // Fast path: skip the orderly teardown entirely and exit the process.
  if (cct->_conf->osd_fast_shutdown) {
    derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
    cct->_log->flush();
    _exit(0);
  }

  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.lock();
  if (is_stopping()) {
    osd_lock.unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging: crank subsystem debug levels for the rest of shutdown.
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs.  this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  osd_lock.unlock();

  // Stop the heartbeat thread before marking down its messengers.
  {
    std::lock_guard l{heartbeat_lock};
    heartbeat_stop = true;
    heartbeat_cond.notify_all();
    heartbeat_peers.clear();
  }
  heartbeat_thread.join();

  hb_back_server_messenger->mark_down_all();
  hb_front_server_messenger->mark_down_all();
  hb_front_client_messenger->mark_down_all();
  hb_back_client_messenger->mark_down_all();

  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch so the next boot knows the shutdown was clean
  dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = get_osdmap_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs: detach every PG from its shard and drop our references.
  // A PG whose refcount is not 1 here is being leaked by someone.
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  osd_lock.unlock();
  cct->_conf.remove_observer(this);
  osd_lock.lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.unlock();
  // release osdmap references held globally and per shard
  {
    std::unique_lock l{map_lock};
    set_osdmap(OSDMapRef());
  }
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  ClassHandler::get_instance().shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  return r;
}
4235
4236 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4237 {
4238 bool created = false;
4239 while (true) {
4240 dout(10) << __func__ << " cmd: " << cmd << dendl;
4241 vector<string> vcmd{cmd};
4242 bufferlist inbl;
4243 C_SaferCond w;
4244 string outs;
4245 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4246 int r = w.wait();
4247 if (r < 0) {
4248 if (r == -ENOENT && !created) {
4249 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4250 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4251 vector<string> vnewcmd{newcmd};
4252 bufferlist inbl;
4253 C_SaferCond w;
4254 string outs;
4255 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4256 int r = w.wait();
4257 if (r < 0) {
4258 derr << __func__ << " fail: osd does not exist and created failed: "
4259 << cpp_strerror(r) << dendl;
4260 return r;
4261 }
4262 created = true;
4263 continue;
4264 }
4265 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4266 return r;
4267 }
4268 break;
4269 }
4270
4271 return 0;
4272 }
4273
4274 int OSD::update_crush_location()
4275 {
4276 if (!cct->_conf->osd_crush_update_on_start) {
4277 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4278 return 0;
4279 }
4280
4281 char weight[32];
4282 if (cct->_conf->osd_crush_initial_weight >= 0) {
4283 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4284 } else {
4285 struct store_statfs_t st;
4286 osd_alert_list_t alerts;
4287 int r = store->statfs(&st, &alerts);
4288 if (r < 0) {
4289 derr << "statfs: " << cpp_strerror(r) << dendl;
4290 return r;
4291 }
4292 snprintf(weight, sizeof(weight), "%.4lf",
4293 std::max(.00001,
4294 double(st.total) /
4295 double(1ull << 40 /* TB */)));
4296 }
4297
4298 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4299
4300 string cmd =
4301 string("{\"prefix\": \"osd crush create-or-move\", ") +
4302 string("\"id\": ") + stringify(whoami) + ", " +
4303 string("\"weight\":") + weight + ", " +
4304 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4305 return mon_cmd_maybe_osd_create(cmd);
4306 }
4307
4308 int OSD::update_crush_device_class()
4309 {
4310 if (!cct->_conf->osd_class_update_on_start) {
4311 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4312 return 0;
4313 }
4314
4315 string device_class;
4316 int r = store->read_meta("crush_device_class", &device_class);
4317 if (r < 0 || device_class.empty()) {
4318 device_class = store->get_default_device_class();
4319 }
4320
4321 if (device_class.empty()) {
4322 dout(20) << __func__ << " no device class stored locally" << dendl;
4323 return 0;
4324 }
4325
4326 string cmd =
4327 string("{\"prefix\": \"osd crush set-device-class\", ") +
4328 string("\"class\": \"") + device_class + string("\", ") +
4329 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4330
4331 r = mon_cmd_maybe_osd_create(cmd);
4332 if (r == -EBUSY) {
4333 // good, already bound to a device-class
4334 return 0;
4335 } else {
4336 return r;
4337 }
4338 }
4339
4340 void OSD::write_superblock(ObjectStore::Transaction& t)
4341 {
4342 dout(10) << "write_superblock " << superblock << dendl;
4343
4344 //hack: at minimum it's using the baseline feature set
4345 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4346 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4347
4348 bufferlist bl;
4349 encode(superblock, bl);
4350 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4351 }
4352
4353 int OSD::read_superblock()
4354 {
4355 bufferlist bl;
4356 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4357 if (r < 0)
4358 return r;
4359
4360 auto p = bl.cbegin();
4361 decode(superblock, p);
4362
4363 dout(10) << "read_superblock " << superblock << dendl;
4364
4365 return 0;
4366 }
4367
// Scan every PG collection and delete any leftover temp objects
// (including legacy pool==-1 temps written by Hammer).  Deletions are
// batched into transactions of at most osd_target_transaction_size ops.
void OSD::clear_temp_objects()
{
  dout(10) << __func__ << dendl;
  vector<coll_t> ls;
  store->list_collections(ls);
  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
    spg_t pgid;
    if (!p->is_pg(&pgid))
      continue;

    // list temp objects
    dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;

    vector<ghobject_t> temps;
    ghobject_t next;
    while (1) {
      vector<ghobject_t> objects;
      auto ch = store->open_collection(*p);
      ceph_assert(ch);
      store->collection_list(ch, next, ghobject_t::get_max(),
			     store->get_ideal_list_max(),
			     &objects, &next);
      if (objects.empty())
	break;
      vector<ghobject_t>::iterator q;
      for (q = objects.begin(); q != objects.end(); ++q) {
	// Hammer set pool for temps to -1, so check for clean-up
	if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
	  temps.push_back(*q);
	} else {
	  break;
	}
      }
      // If we saw a non-temp object and hit the break above we can
      // break out of the while loop too.
      // NOTE(review): this relies on temp objects sorting before all
      // non-temp objects in the collection listing — confirm.
      if (q != objects.end())
	break;
    }
    if (!temps.empty()) {
      ObjectStore::Transaction t;
      int removed = 0;
      for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
	dout(20) << " removing " << *p << " object " << *q << dendl;
	t.remove(*p, *q);
	// flush a batch once it exceeds the configured transaction size
	if (++removed > cct->_conf->osd_target_transaction_size) {
	  store->queue_transaction(service.meta_ch, std::move(t));
	  t = ObjectStore::Transaction();
	  removed = 0;
	}
      }
      if (removed) {
	store->queue_transaction(service.meta_ch, std::move(t));
      }
    }
  }
}
4424
// Delete every object in collection @tmp — also unmapping each from the
// snap mapper — in batches of osd_target_transaction_size, then remove
// the collection itself and wait for the removal to commit.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      // -ENOENT is tolerated: the object may have no snap mapping
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // make sure the removal has committed before we return
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4469
4470
4471 // ======================================================
4472 // PG's
4473
4474 PG* OSD::_make_pg(
4475 OSDMapRef createmap,
4476 spg_t pgid)
4477 {
4478 dout(10) << __func__ << " " << pgid << dendl;
4479 pg_pool_t pi;
4480 map<string,string> ec_profile;
4481 string name;
4482 if (createmap->have_pg_pool(pgid.pool())) {
4483 pi = *createmap->get_pg_pool(pgid.pool());
4484 name = createmap->get_pool_name(pgid.pool());
4485 if (pi.is_erasure()) {
4486 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4487 }
4488 } else {
4489 // pool was deleted; grab final pg_pool_t off disk.
4490 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4491 bufferlist bl;
4492 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4493 if (r < 0) {
4494 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4495 << dendl;
4496 return nullptr;
4497 }
4498 ceph_assert(r >= 0);
4499 auto p = bl.cbegin();
4500 decode(pi, p);
4501 decode(name, p);
4502 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4503 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4504 << " tombstone" << dendl;
4505 return nullptr;
4506 }
4507 decode(ec_profile, p);
4508 }
4509 PGPool pool(cct, createmap, pgid.pool(), pi, name);
4510 PG *pg;
4511 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4512 pi.type == pg_pool_t::TYPE_ERASURE)
4513 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4514 else
4515 ceph_abort();
4516 return pg;
4517 }
4518
4519 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4520 {
4521 v->clear();
4522 v->reserve(get_num_pgs());
4523 for (auto& s : shards) {
4524 std::lock_guard l(s->shard_lock);
4525 for (auto& j : s->pg_slots) {
4526 if (j.second->pg &&
4527 !j.second->pg->is_deleted()) {
4528 v->push_back(j.second->pg);
4529 if (clear_too) {
4530 s->_detach_pg(j.second.get());
4531 }
4532 }
4533 }
4534 }
4535 }
4536
4537 void OSD::_get_pgids(vector<spg_t> *v)
4538 {
4539 v->clear();
4540 v->reserve(get_num_pgs());
4541 for (auto& s : shards) {
4542 std::lock_guard l(s->shard_lock);
4543 for (auto& j : s->pg_slots) {
4544 if (j.second->pg &&
4545 !j.second->pg->is_deleted()) {
4546 v->push_back(j.first);
4547 }
4548 }
4549 }
4550 }
4551
4552 void OSD::register_pg(PGRef pg)
4553 {
4554 spg_t pgid = pg->get_pgid();
4555 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4556 auto sdata = shards[shard_index];
4557 std::lock_guard l(sdata->shard_lock);
4558 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4559 ceph_assert(r.second);
4560 auto *slot = r.first->second.get();
4561 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4562 sdata->_attach_pg(slot, pg.get());
4563 }
4564
// Detach @pg from its shard slot as part of PG deletion.  Returns false
// (and does nothing) if the slot is already gone/empty or is waiting on
// a merge epoch; returns true once the PG has been detached, its primed
// split children dropped, and the per-state PG counters decremented.
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
	!p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // drop any pre-primed split children of this pg on every shard
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_nonprimary())
    service.logger->dec(l_osd_pg_replica); // misnomer
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
4599
4600 PGRef OSD::_lookup_pg(spg_t pgid)
4601 {
4602 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4603 auto sdata = shards[shard_index];
4604 std::lock_guard l(sdata->shard_lock);
4605 auto p = sdata->pg_slots.find(pgid);
4606 if (p == sdata->pg_slots.end()) {
4607 return nullptr;
4608 }
4609 return p->second->pg;
4610 }
4611
4612 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4613 {
4614 PGRef pg = _lookup_pg(pgid);
4615 if (!pg) {
4616 return nullptr;
4617 }
4618 pg->lock();
4619 if (!pg->is_deleted()) {
4620 return pg;
4621 }
4622 pg->unlock();
4623 return nullptr;
4624 }
4625
// Public wrapper for _lookup_lock_pg(): returns the PG with its lock
// held, or nullptr if it does not exist or has been deleted.
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
4630
// Called at startup (with osd_lock held) to instantiate every PG found
// on disk: loads the pg_num history, removes leftover temp/flagged
// collections, reads each PG's state, and registers each surviving PG
// with its shard.
void OSD::load_pgs()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(0) << "load_pgs" << dendl;

  // load the persisted pg_num history (used for split/merge tracking)
  {
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // temp collections and pgs flagged for removal are deleted now
    if (it->is_temp(&pgid) ||
	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it
	       << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
	   << dendl;
      continue;
    }

    // construct the PG against the osdmap it was last persisted with,
    // falling back to the current map when no epoch was recorded
    PGRef pg;
    if (map_epoch > 0) {
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
	if (!get_osdmap()->have_pg_pool(pgid.pool())) {
	  derr << __func__ << ": could not find map for epoch " << map_epoch
	       << " on pg " << pgid << ", but the pool is not present in the "
	       << "current map, so this is probably a result of bug 10617. "
	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
	       << "to clean it up later." << dendl;
	  continue;
	} else {
	  derr << __func__ << ": have pgid " << pgid << " at epoch "
	       << map_epoch << ", but missing map. Crashing."
	       << dendl;
	  ceph_abort_msg("Missing map in load_pgs");
	}
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(get_osdmap(), pgid);
    }
    if (!pg) {
      // _make_pg failed (e.g. missing pool tombstone); drop the collection
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store);

    if (pg->dne()) {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }
    // route this pg's commit callbacks to its shard's context queue
    {
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    pg->reg_next_scrub();

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
4738
4739
// Create a brand-new PG described by @info: builds the on-disk
// collection, constructs and initializes the PG object, and dispatches
// the resulting peering events.  Returns nullptr when creation is
// deferred (PG limit) or dropped (pool gone / stale mon create).
PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
				 const PGCreateInfo *info)
{
  spg_t pgid = info->pgid;

  // defer creation if we are at the per-osd PG hard limit
  if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
    dout(10) << __func__ << " hit max pg, dropping" << dendl;
    return nullptr;
  }

  PeeringCtx rctx = create_context();

  OSDMapRef startmap = get_map(info->epoch);

  if (info->by_mon) {
    int64_t pool_id = pgid.pgid.pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    if (!pool) {
      dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
      return nullptr;
    }
    if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
	!pool->has_flag(pg_pool_t::FLAG_CREATING)) {
      // this ensures we do not process old creating messages after the
      // pool's initial pgs have been created (and pg are subsequently
      // allowed to split or merge).
      dout(20) << __func__ << " dropping " << pgid
	       << "create, pool does not have CREATING flag set" << dendl;
      return nullptr;
    }
  }

  int up_primary, acting_primary;
  vector<int> up, acting;
  startmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
  if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
      store->get_type() != "bluestore") {
    clog->warn() << "pg " << pgid
		 << " is at risk of silent data corruption: "
		 << "the pool allows ec overwrites but is not stored in "
		 << "bluestore, so deep scrubbing will not detect bitrot";
  }
  create_pg_collection(
    rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
  init_pg_ondisk(rctx.transaction, pgid, pp);

  int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

  PGRef pg = _make_pg(startmap, pgid);
  pg->ch = store->create_new_collection(pg->coll);

  // route this pg's commit callbacks to its shard's context queue
  {
    uint32_t shard_index = pgid.hash_to_shard(shards.size());
    assert(NULL != shards[shard_index]);
    store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
  }

  pg->lock(true);

  // we are holding the shard lock
  ceph_assert(!pg->is_deleted());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    info->history,
    info->past_intervals,
    false,
    rctx.transaction);

  pg->init_collection_pool_opts();

  if (pg->is_primary()) {
    std::lock_guard locker{m_perf_queries_lock};
    pg->set_dynamic_perf_stats_queries(m_perf_queries);
  }

  pg->handle_initialize(rctx);
  pg->handle_activate_map(rctx);

  dispatch_context(rctx, pg.get(), osdmap, nullptr);

  dout(10) << __func__ << " new pg " << *pg << dendl;
  return pg;
}
4831
4832 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4833 spg_t pgid,
4834 bool is_mon_create)
4835 {
4836 const auto max_pgs_per_osd =
4837 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4838 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4839
4840 if (num_pgs < max_pgs_per_osd) {
4841 return false;
4842 }
4843
4844 std::lock_guard l(pending_creates_lock);
4845 if (is_mon_create) {
4846 pending_creates_from_mon++;
4847 } else {
4848 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4849 pending_creates_from_osd.emplace(pgid, is_primary);
4850 }
4851 dout(1) << __func__ << " withhold creation of pg " << pgid
4852 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4853 return true;
4854 }
4855
4856 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4857 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4858 // to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  // Shrink a multi-osd mapping to just its first entry; otherwise pad
  // the (empty or single-entry) mapping with -1.  Either way the result
  // differs from `acting`, which is what forces a repeer.
  if (acting.size() > 1) {
    return {acting[0]};
  }
  std::vector<int32_t> padded(acting.begin(), acting.end());
  padded.push_back(-1);
  return padded;
}
4868
// Replay PG creations that were withheld by maybe_wait_for_max_pg() now
// that there may be spare capacity under the per-osd PG cap, and adjust
// our mon subscriptions so we learn about the results.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    // mon-initiated creates are resumed by re-subscribing below
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
	       << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      if (pending_creates_from_mon >= spare_pgs) {
	spare_pgs = pending_creates_from_mon = 0;
      } else {
	spare_pgs -= pending_creates_from_mon;
	pending_creates_from_mon = 0;
      }
    }
    // osd-initiated creates are resumed by twiddling pg_temp to force a
    // repeer (see twiddle() above)
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
			    !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
	      << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = get_osdmap_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
	      << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
	      << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  service.send_pg_temp();
}
4940
// Reconstruct a newly-created pg's pg_history_t (*h) and PastIntervals
// (*pi) by walking every osdmap from its creation epoch up to the
// current epoch and recording each interval change (mapping, primary,
// or split changes).
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  *h = pg_history_t(created, created_stamp);

  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
	return have.size() >= pi->min_size;
      }
      explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      pgid.pgid,
      min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      // interval boundary: roll the "same ... since" markers forward
      h->same_interval_since = e;
      if (up != new_up) {
	h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
	h->same_primary_since = e;
      }
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
			     osdmap->get_pg_num(pgid.pgid.pool()),
			     nullptr)) {
	h->last_epoch_split = e;
      }
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
	   << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
		       pi->get_bounds()) << ")"
	   << dendl;
}
5015
5016 void OSD::_add_heartbeat_peer(int p)
5017 {
5018 if (p == whoami)
5019 return;
5020 HeartbeatInfo *hi;
5021
5022 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5023 if (i == heartbeat_peers.end()) {
5024 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
5025 if (!cons.first)
5026 return;
5027 assert(cons.second);
5028
5029 hi = &heartbeat_peers[p];
5030 hi->peer = p;
5031
5032 auto stamps = service.get_hb_stamps(p);
5033
5034 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5035 sb->peer = p;
5036 sb->stamps = stamps;
5037 hi->hb_interval_start = ceph_clock_now();
5038 hi->con_back = cons.first.get();
5039 hi->con_back->set_priv(sb);
5040
5041 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5042 sf->peer = p;
5043 sf->stamps = stamps;
5044 hi->con_front = cons.second.get();
5045 hi->con_front->set_priv(sf);
5046
5047 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5048 << " " << hi->con_back->get_peer_addr()
5049 << " " << hi->con_front->get_peer_addr()
5050 << dendl;
5051 } else {
5052 hi = &i->second;
5053 }
5054 hi->epoch = get_osdmap_epoch();
5055 }
5056
5057 void OSD::_remove_heartbeat_peer(int n)
5058 {
5059 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5060 ceph_assert(q != heartbeat_peers.end());
5061 dout(20) << " removing heartbeat peer osd." << n
5062 << " " << q->second.con_back->get_peer_addr()
5063 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5064 << dendl;
5065 q->second.clear_mark_down();
5066 heartbeat_peers.erase(q);
5067 }
5068
5069 void OSD::need_heartbeat_peer_update()
5070 {
5071 if (is_stopping())
5072 return;
5073 dout(20) << "need_heartbeat_peer_update" << dendl;
5074 heartbeat_set_peers_need_update();
5075 }
5076
// Rebuild the heartbeat peer set when it has been flagged as stale.
//
// Caller must hold osd_lock (asserted).  heartbeat_lock is taken before
// heartbeat_peers is mutated.  Peer selection combines:
//  - all PG peers (while active),
//  - the next/previous up OSDs (so the up set stays fully connected),
//  - a random per-subtree sample for fast failure detection,
//  - random extra peers up to osd_heartbeat_min_peers,
// then trims surplus "extras" and retires failure reports for peers we
// no longer track.
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      // first pass: force an initial (re)sample of peers
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      // periodically force a resample even without PG changes
      if (dur > cct->_conf->osd_heartbeat_grace) {
	dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
	heartbeat_set_peers_need_update();
	last_heartbeat_resample = now;
	// automatically clean up any stale heartbeat peers
	// if we are unhealthy, then clean all
	reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
	if (get_osdmap()->is_up(peer)) {
	  _add_heartbeat_peer(peer);
	}
      });
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = get_osdmap()->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = get_osdmap()->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
  get_osdmap()->get_random_up_osds_by_subtree(
    whoami, subtree, limit, want, &want);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!get_osdmap()->is_up(p->first)) {
      // advance before _remove_heartbeat_peer erases the entry
      int o = p->first;
      ++p;
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < get_osdmap_epoch()) {
      // not re-added this epoch -> candidate for trimming below
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = get_osdmap()->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;

  // clean up stale failure pending
  for (auto it = failure_pending.begin(); it != failure_pending.end();) {
    if (heartbeat_peers.count(it->first) == 0) {
      // no longer a heartbeat peer: retract any in-flight failure report
      send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
      failure_pending.erase(it++);
    } else {
      it++;
    }
  }
}
5194
5195 void OSD::reset_heartbeat_peers(bool all)
5196 {
5197 ceph_assert(ceph_mutex_is_locked(osd_lock));
5198 dout(10) << "reset_heartbeat_peers" << dendl;
5199 utime_t stale = ceph_clock_now();
5200 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5201 std::lock_guard l(heartbeat_lock);
5202 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5203 HeartbeatInfo& hi = it->second;
5204 if (all || hi.is_stale(stale)) {
5205 hi.clear_mark_down();
5206 // stop sending failure_report to mon too
5207 failure_queue.erase(it->first);
5208 heartbeat_peers.erase(it++);
5209 } else {
5210 it++;
5211 }
5212 }
5213 }
5214
5215 void OSD::handle_osd_ping(MOSDPing *m)
5216 {
5217 if (superblock.cluster_fsid != m->fsid) {
5218 dout(20) << "handle_osd_ping from " << m->get_source_inst()
5219 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5220 << dendl;
5221 m->put();
5222 return;
5223 }
5224
5225 int from = m->get_source().num();
5226
5227 heartbeat_lock.lock();
5228 if (is_stopping()) {
5229 heartbeat_lock.unlock();
5230 m->put();
5231 return;
5232 }
5233
5234 utime_t now = ceph_clock_now();
5235 auto mnow = service.get_mnow();
5236 ConnectionRef con(m->get_connection());
5237 OSDMapRef curmap = service.get_osdmap();
5238 if (!curmap) {
5239 heartbeat_lock.unlock();
5240 m->put();
5241 return;
5242 }
5243
5244 auto sref = con->get_priv();
5245 Session *s = static_cast<Session*>(sref.get());
5246 if (!s) {
5247 heartbeat_lock.unlock();
5248 m->put();
5249 return;
5250 }
5251 if (!s->stamps) {
5252 s->peer = from;
5253 s->stamps = service.get_hb_stamps(from);
5254 }
5255
5256 switch (m->op) {
5257
5258 case MOSDPing::PING:
5259 {
5260 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5261 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5262 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5263 if (heartbeat_drop->second == 0) {
5264 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5265 } else {
5266 --heartbeat_drop->second;
5267 dout(5) << "Dropping heartbeat from " << from
5268 << ", " << heartbeat_drop->second
5269 << " remaining to drop" << dendl;
5270 break;
5271 }
5272 } else if (cct->_conf->osd_debug_drop_ping_probability >
5273 ((((double)(rand()%100))/100.0))) {
5274 heartbeat_drop =
5275 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5276 cct->_conf->osd_debug_drop_ping_duration)).first;
5277 dout(5) << "Dropping heartbeat from " << from
5278 << ", " << heartbeat_drop->second
5279 << " remaining to drop" << dendl;
5280 break;
5281 }
5282 }
5283
5284 ceph::signedspan sender_delta_ub{};
5285 s->stamps->got_ping(
5286 m->up_from,
5287 mnow,
5288 m->mono_send_stamp,
5289 m->delta_ub,
5290 &sender_delta_ub);
5291 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5292
5293 if (!cct->get_heartbeat_map()->is_healthy()) {
5294 dout(10) << "internal heartbeat not healthy, dropping ping request"
5295 << dendl;
5296 break;
5297 }
5298
5299 Message *r = new MOSDPing(monc->get_fsid(),
5300 curmap->get_epoch(),
5301 MOSDPing::PING_REPLY,
5302 m->ping_stamp,
5303 m->mono_ping_stamp,
5304 mnow,
5305 service.get_up_epoch(),
5306 cct->_conf->osd_heartbeat_min_size,
5307 sender_delta_ub);
5308 con->send_message(r);
5309
5310 if (curmap->is_up(from)) {
5311 if (is_active()) {
5312 ConnectionRef cluster_con = service.get_con_osd_cluster(
5313 from, curmap->get_epoch());
5314 if (cluster_con) {
5315 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5316 }
5317 }
5318 } else if (!curmap->exists(from) ||
5319 curmap->get_down_at(from) > m->map_epoch) {
5320 // tell them they have died
5321 Message *r = new MOSDPing(monc->get_fsid(),
5322 curmap->get_epoch(),
5323 MOSDPing::YOU_DIED,
5324 m->ping_stamp,
5325 m->mono_ping_stamp,
5326 mnow,
5327 service.get_up_epoch(),
5328 cct->_conf->osd_heartbeat_min_size);
5329 con->send_message(r);
5330 }
5331 }
5332 break;
5333
5334 case MOSDPing::PING_REPLY:
5335 {
5336 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5337 if (i != heartbeat_peers.end()) {
5338 auto acked = i->second.ping_history.find(m->ping_stamp);
5339 if (acked != i->second.ping_history.end()) {
5340 int &unacknowledged = acked->second.second;
5341 if (con == i->second.con_back) {
5342 dout(25) << "handle_osd_ping got reply from osd." << from
5343 << " first_tx " << i->second.first_tx
5344 << " last_tx " << i->second.last_tx
5345 << " last_rx_back " << i->second.last_rx_back
5346 << " -> " << now
5347 << " last_rx_front " << i->second.last_rx_front
5348 << dendl;
5349 i->second.last_rx_back = now;
5350 ceph_assert(unacknowledged > 0);
5351 --unacknowledged;
5352 // if there is no front con, set both stamps.
5353 if (i->second.con_front == NULL) {
5354 i->second.last_rx_front = now;
5355 ceph_assert(unacknowledged > 0);
5356 --unacknowledged;
5357 }
5358 } else if (con == i->second.con_front) {
5359 dout(25) << "handle_osd_ping got reply from osd." << from
5360 << " first_tx " << i->second.first_tx
5361 << " last_tx " << i->second.last_tx
5362 << " last_rx_back " << i->second.last_rx_back
5363 << " last_rx_front " << i->second.last_rx_front
5364 << " -> " << now
5365 << dendl;
5366 i->second.last_rx_front = now;
5367 ceph_assert(unacknowledged > 0);
5368 --unacknowledged;
5369 }
5370
5371 if (unacknowledged == 0) {
5372 // succeeded in getting all replies
5373 dout(25) << "handle_osd_ping got all replies from osd." << from
5374 << " , erase pending ping(sent at " << m->ping_stamp << ")"
5375 << " and older pending ping(s)"
5376 << dendl;
5377
5378 #define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
5379 ++i->second.hb_average_count;
5380 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
5381 i->second.hb_total_back += back_pingtime;
5382 if (back_pingtime < i->second.hb_min_back)
5383 i->second.hb_min_back = back_pingtime;
5384 if (back_pingtime > i->second.hb_max_back)
5385 i->second.hb_max_back = back_pingtime;
5386 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
5387 i->second.hb_total_front += front_pingtime;
5388 if (front_pingtime < i->second.hb_min_front)
5389 i->second.hb_min_front = front_pingtime;
5390 if (front_pingtime > i->second.hb_max_front)
5391 i->second.hb_max_front = front_pingtime;
5392
5393 ceph_assert(i->second.hb_interval_start != utime_t());
5394 if (i->second.hb_interval_start == utime_t())
5395 i->second.hb_interval_start = now;
5396 int64_t hb_avg_time_period = 60;
5397 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5398 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5399 }
5400 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5401 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5402 uint32_t back_min = i->second.hb_min_back;
5403 uint32_t back_max = i->second.hb_max_back;
5404 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5405 uint32_t front_min = i->second.hb_min_front;
5406 uint32_t front_max = i->second.hb_max_front;
5407
5408 // Reset for new interval
5409 i->second.hb_average_count = 0;
5410 i->second.hb_interval_start = now;
5411 i->second.hb_total_back = i->second.hb_max_back = 0;
5412 i->second.hb_min_back = UINT_MAX;
5413 i->second.hb_total_front = i->second.hb_max_front = 0;
5414 i->second.hb_min_front = UINT_MAX;
5415
5416 // Record per osd interace ping times
5417 // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
5418 if (i->second.hb_back_pingtime.size() == 0) {
5419 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5420 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5421 i->second.hb_back_pingtime.push_back(back_avg);
5422 i->second.hb_back_min.push_back(back_min);
5423 i->second.hb_back_max.push_back(back_max);
5424 i->second.hb_front_pingtime.push_back(front_avg);
5425 i->second.hb_front_min.push_back(front_min);
5426 i->second.hb_front_max.push_back(front_max);
5427 ++i->second.hb_index;
5428 }
5429 } else {
5430 int index = i->second.hb_index & (hb_vector_size - 1);
5431 i->second.hb_back_pingtime[index] = back_avg;
5432 i->second.hb_back_min[index] = back_min;
5433 i->second.hb_back_max[index] = back_max;
5434 i->second.hb_front_pingtime[index] = front_avg;
5435 i->second.hb_front_min[index] = front_min;
5436 i->second.hb_front_max[index] = front_max;
5437 ++i->second.hb_index;
5438 }
5439
5440 {
5441 std::lock_guard l(service.stat_lock);
5442 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5443 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5444
5445 uint32_t total = 0;
5446 uint32_t min = UINT_MAX;
5447 uint32_t max = 0;
5448 uint32_t count = 0;
5449 uint32_t which = 0;
5450 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5451 for (int32_t k = size - 1 ; k >= 0; --k) {
5452 ++count;
5453 int index = (i->second.hb_index + k) % size;
5454 total += i->second.hb_back_pingtime[index];
5455 if (i->second.hb_back_min[index] < min)
5456 min = i->second.hb_back_min[index];
5457 if (i->second.hb_back_max[index] > max)
5458 max = i->second.hb_back_max[index];
5459 if (count == 1 || count == 5 || count == 15) {
5460 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5461 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5462 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5463 which++;
5464 if (count == 15)
5465 break;
5466 }
5467 }
5468
5469 if (i->second.con_front != NULL) {
5470 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5471
5472 total = 0;
5473 min = UINT_MAX;
5474 max = 0;
5475 count = 0;
5476 which = 0;
5477 for (int32_t k = size - 1 ; k >= 0; --k) {
5478 ++count;
5479 int index = (i->second.hb_index + k) % size;
5480 total += i->second.hb_front_pingtime[index];
5481 if (i->second.hb_front_min[index] < min)
5482 min = i->second.hb_front_min[index];
5483 if (i->second.hb_front_max[index] > max)
5484 max = i->second.hb_front_max[index];
5485 if (count == 1 || count == 5 || count == 15) {
5486 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5487 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5488 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5489 which++;
5490 if (count == 15)
5491 break;
5492 }
5493 }
5494 }
5495 }
5496 } else {
5497 std::lock_guard l(service.stat_lock);
5498 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5499 if (i->second.con_front != NULL)
5500 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5501 }
5502 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
5503 }
5504
5505 if (i->second.is_healthy(now)) {
5506 // Cancel false reports
5507 auto failure_queue_entry = failure_queue.find(from);
5508 if (failure_queue_entry != failure_queue.end()) {
5509 dout(10) << "handle_osd_ping canceling queued "
5510 << "failure report for osd." << from << dendl;
5511 failure_queue.erase(failure_queue_entry);
5512 }
5513
5514 auto failure_pending_entry = failure_pending.find(from);
5515 if (failure_pending_entry != failure_pending.end()) {
5516 dout(10) << "handle_osd_ping canceling in-flight "
5517 << "failure report for osd." << from << dendl;
5518 send_still_alive(curmap->get_epoch(),
5519 from,
5520 failure_pending_entry->second.second);
5521 failure_pending.erase(failure_pending_entry);
5522 }
5523 }
5524 } else {
5525 // old replies, deprecated by newly sent pings.
5526 dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
5527 << ") is found, treat as covered by newly sent pings "
5528 << "and ignore"
5529 << dendl;
5530 }
5531 }
5532
5533 if (m->map_epoch &&
5534 curmap->is_up(from)) {
5535 if (is_active()) {
5536 ConnectionRef cluster_con = service.get_con_osd_cluster(
5537 from, curmap->get_epoch());
5538 if (cluster_con) {
5539 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5540 }
5541 }
5542 }
5543
5544 s->stamps->got_ping_reply(
5545 mnow,
5546 m->mono_send_stamp,
5547 m->delta_ub);
5548 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5549 }
5550 break;
5551
5552 case MOSDPing::YOU_DIED:
5553 dout(10) << "handle_osd_ping " << m->get_source_inst()
5554 << " says i am down in " << m->map_epoch << dendl;
5555 osdmap_subscribe(curmap->get_epoch()+1, false);
5556 break;
5557 }
5558
5559 heartbeat_lock.unlock();
5560 m->put();
5561 }
5562
5563 void OSD::heartbeat_entry()
5564 {
5565 std::unique_lock l(heartbeat_lock);
5566 if (is_stopping())
5567 return;
5568 while (!heartbeat_stop) {
5569 heartbeat();
5570
5571 double wait;
5572 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5573 wait = (float)cct->_conf->osd_heartbeat_interval;
5574 } else {
5575 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5576 }
5577 auto w = ceph::make_timespan(wait);
5578 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5579 heartbeat_cond.wait_for(l, w);
5580 if (is_stopping())
5581 return;
5582 dout(30) << "heartbeat_entry woke up" << dendl;
5583 }
5584 }
5585
5586 void OSD::heartbeat_check()
5587 {
5588 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5589 utime_t now = ceph_clock_now();
5590
5591 // check for incoming heartbeats (move me elsewhere?)
5592 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5593 p != heartbeat_peers.end();
5594 ++p) {
5595
5596 if (p->second.first_tx == utime_t()) {
5597 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5598 << " yet, skipping" << dendl;
5599 continue;
5600 }
5601
5602 dout(25) << "heartbeat_check osd." << p->first
5603 << " first_tx " << p->second.first_tx
5604 << " last_tx " << p->second.last_tx
5605 << " last_rx_back " << p->second.last_rx_back
5606 << " last_rx_front " << p->second.last_rx_front
5607 << dendl;
5608 if (p->second.is_unhealthy(now)) {
5609 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5610 if (p->second.last_rx_back == utime_t() ||
5611 p->second.last_rx_front == utime_t()) {
5612 derr << "heartbeat_check: no reply from "
5613 << p->second.con_front->get_peer_addr().get_sockaddr()
5614 << " osd." << p->first
5615 << " ever on either front or back, first ping sent "
5616 << p->second.first_tx
5617 << " (oldest deadline " << oldest_deadline << ")"
5618 << dendl;
5619 // fail
5620 failure_queue[p->first] = p->second.first_tx;
5621 } else {
5622 derr << "heartbeat_check: no reply from "
5623 << p->second.con_front->get_peer_addr().get_sockaddr()
5624 << " osd." << p->first << " since back " << p->second.last_rx_back
5625 << " front " << p->second.last_rx_front
5626 << " (oldest deadline " << oldest_deadline << ")"
5627 << dendl;
5628 // fail
5629 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5630 }
5631 }
5632 }
5633 }
5634
// Send one round of heartbeat pings to every peer and refresh local
// stats (load average, fullness, osd_stat).  Caller must hold
// heartbeat_lock (asserted); invoked from heartbeat_entry().
void OSD::heartbeat()
{
  ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
  dout(30) << "heartbeat" << dendl;

  // get CPU load avg
  double loadavgs[1];
  int hb_interval = cct->_conf->osd_heartbeat_interval;
  // number of heartbeat samples per day, for the daily moving average
  int n_samples = 86400;
  if (hb_interval > 1) {
    n_samples /= hb_interval;
    if (n_samples < 1)
      n_samples = 1;
  }

  if (getloadavg(loadavgs, 1) == 1) {
    logger->set(l_osd_loadavg, 100 * loadavgs[0]);
    // exponential-style daily average over n_samples heartbeats
    daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
    dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
  }

  dout(30) << "heartbeat checking stats" << dendl;

  // refresh peer list and osd stats
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);

  auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
  dout(5) << __func__ << " " << new_stat << dendl;
  ceph_assert(new_stat.statfs.total);

  float pratio;
  float ratio = service.compute_adjusted_ratio(new_stat, &pratio);

  service.check_full_status(ratio, pratio);

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  // each ping must be answered within osd_heartbeat_grace
  utime_t deadline = now;
  deadline += cct->_conf->osd_heartbeat_grace;

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;

    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    // record the pending ping with the number of replies expected;
    // handle_osd_ping decrements the count as replies arrive
    i->second.ping_history[now] = make_pair(deadline,
      HeartbeatInfo::HEARTBEAT_MAX_CONN);
    if (i->second.hb_interval_start == utime_t())
      i->second.hb_interval_start = now;

    // priv is a Session set when the peer was added (_add_heartbeat_peer)
    Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
    std::optional<ceph::signedspan> delta_ub;
    s->stamps->sent_ping(&delta_ub);

    i->second.con_back->send_message(
      new MOSDPing(monc->get_fsid(),
		   service.get_osdmap_epoch(),
		   MOSDPing::PING,
		   now,
		   mnow,
		   mnow,
		   service.get_up_epoch(),
		   cct->_conf->osd_heartbeat_min_size,
		   delta_ub));

    // front connection may be absent; ping it only if present
    if (i->second.con_front)
      i->second.con_front->send_message(
	new MOSDPing(monc->get_fsid(),
		     service.get_osdmap_epoch(),
		     MOSDPing::PING,
		     now,
		     mnow,
		     mnow,
		     service.get_up_epoch(),
		     cct->_conf->osd_heartbeat_min_size,
		     delta_ub));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    // with no peers we cannot learn of map changes via heartbeats, so
    // periodically poll the monitor instead
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(get_osdmap_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
5736
// Messenger callback: a heartbeat connection was reset.  If the failed
// connection still backs a tracked peer, reopen fresh back/front
// connections (carrying the old Session over via priv) and clear the
// peer's pending ping history; if reopening races with an osdmap
// update, drop the peer entirely.  Always returns true (event handled).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  auto s = con->get_priv();
  dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      return true;
    }
    auto session = static_cast<Session*>(s.get());
    auto p = heartbeat_peers.find(session->peer);
    // only act if this con is still one of the peer's live connections
    if (p != heartbeat_peers.end() &&
	(p->second.con_back == con ||
	 p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
	       << ", reopening" << dendl;
      p->second.clear_mark_down(con);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
	p->second.con_back = newcon.first.get();
	// reuse the existing Session (peer id + stamps) on the new cons
	p->second.con_back->set_priv(s);
	if (newcon.second) {
	  p->second.con_front = newcon.second.get();
	  p->second.con_front->set_priv(s);
	}
	// outstanding pings belonged to the dead connection(s)
	p->second.ping_history.clear();
      } else {
	dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
		 << ", raced with osdmap update, closing out peer" << dendl;
	heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
5775
5776
5777
5778 // =========================================
5779
5780 void OSD::tick()
5781 {
5782 ceph_assert(ceph_mutex_is_locked(osd_lock));
5783 dout(10) << "tick" << dendl;
5784
5785 utime_t now = ceph_clock_now();
5786 // throw out any obsolete markdown log
5787 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5788 while (!osd_markdown_log.empty() &&
5789 osd_markdown_log.front() + grace < now)
5790 osd_markdown_log.pop_front();
5791
5792 if (is_active() || is_waiting_for_healthy()) {
5793 maybe_update_heartbeat_peers();
5794 }
5795
5796 if (is_waiting_for_healthy()) {
5797 start_boot();
5798 }
5799
5800 if (is_waiting_for_healthy() || is_booting()) {
5801 std::lock_guard l(heartbeat_lock);
5802 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5803 last_mon_heartbeat = now;
5804 dout(1) << __func__ << " checking mon for new map" << dendl;
5805 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5806 }
5807 }
5808
5809 do_waiters();
5810
5811 // scrub purged_snaps every deep scrub interval
5812 {
5813 const utime_t last = superblock.last_purged_snaps_scrub;
5814 utime_t next = last;
5815 next += cct->_conf->osd_scrub_min_interval;
5816 std::mt19937 rng;
5817 // use a seed that is stable for each scrub interval, but varies
5818 // by OSD to avoid any herds.
5819 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
5820 double r = (rng() % 1024) / 1024;
5821 next +=
5822 cct->_conf->osd_scrub_min_interval *
5823 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5824 if (next < ceph_clock_now()) {
5825 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5826 << " next " << next << " ... now" << dendl;
5827 scrub_purged_snaps();
5828 } else {
5829 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5830 << " next " << next << dendl;
5831 }
5832 }
5833
5834 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5835 }
5836
// Periodic work that deliberately runs WITHOUT osd_lock: perf counters,
// statfs refresh, heartbeat deadline checks, mon reports, scrub
// scheduling, beacons and recovery kicks.  Holds tick_timer_lock
// (asserted) and takes finer-grained locks as needed.  Re-arms itself
// via tick_timer_without_osd_lock.
void OSD::tick_without_osd_lock()
{
  ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    {
      std::lock_guard l{heartbeat_lock};
      heartbeat_check();
    }
    map_lock.lock_shared();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
	now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.unlock_shared();

    // if any shard is waiting on a newer map than we have, request it
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
				   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
	       << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
	cct->_conf->osd_beacon_report_interval) {
	need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      // sent outside min_last_epoch_clean_lock to avoid holding it
      // across the message send
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
					      new C_Tick_WithoutOSDLock(this));
}
5912
// Usage:
//  setomapval <pool-id> [namespace/]<obj-name> <key> <val>
//  rmomapkey <pool-id> [namespace/]<obj-name> <key>
//  setomapheader <pool-id> [namespace/]<obj-name> <header>
//  getomap <pool> [namespace/]<obj-name>
//  truncobj <pool-id> [namespace/]<obj-name> <newlen>
//  injectmdataerr [namespace/]<obj-name> [shardid]
//  injectdataerr [namespace/]<obj-name> [shardid]
//
//  set_recovery_delay [utime]
//  injectfull [type] [count]
5923 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5924 std::string_view command,
5925 const cmdmap_t& cmdmap, ostream &ss)
5926 {
5927 //Test support
5928 //Support changing the omap on a single osd by using the Admin Socket to
5929 //directly request the osd make a change.
5930 if (command == "setomapval" || command == "rmomapkey" ||
5931 command == "setomapheader" || command == "getomap" ||
5932 command == "truncobj" || command == "injectmdataerr" ||
5933 command == "injectdataerr"
5934 ) {
5935 pg_t rawpg;
5936 int64_t pool;
5937 OSDMapRef curmap = service->get_osdmap();
5938 int r = -1;
5939
5940 string poolstr;
5941
5942 cmd_getval(cmdmap, "pool", poolstr);
5943 pool = curmap->lookup_pg_pool_name(poolstr);
5944 //If we can't find it by name then maybe id specified
5945 if (pool < 0 && isdigit(poolstr[0]))
5946 pool = atoll(poolstr.c_str());
5947 if (pool < 0) {
5948 ss << "Invalid pool '" << poolstr << "''";
5949 return;
5950 }
5951
5952 string objname, nspace;
5953 cmd_getval(cmdmap, "objname", objname);
5954 std::size_t found = objname.find_first_of('/');
5955 if (found != string::npos) {
5956 nspace = objname.substr(0, found);
5957 objname = objname.substr(found+1);
5958 }
5959 object_locator_t oloc(pool, nspace);
5960 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5961
5962 if (r < 0) {
5963 ss << "Invalid namespace/objname";
5964 return;
5965 }
5966
5967 int64_t shardid;
5968 cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5969 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5970 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5971 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5972 if (curmap->pg_is_ec(rawpg)) {
5973 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5974 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5975 return;
5976 }
5977 }
5978
5979 ObjectStore::Transaction t;
5980
5981 if (command == "setomapval") {
5982 map<string, bufferlist> newattrs;
5983 bufferlist val;
5984 string key, valstr;
5985 cmd_getval(cmdmap, "key", key);
5986 cmd_getval(cmdmap, "val", valstr);
5987
5988 val.append(valstr);
5989 newattrs[key] = val;
5990 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5991 r = store->queue_transaction(service->meta_ch, std::move(t));
5992 if (r < 0)
5993 ss << "error=" << r;
5994 else
5995 ss << "ok";
5996 } else if (command == "rmomapkey") {
5997 string key;
5998 cmd_getval(cmdmap, "key", key);
5999
6000 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
6001 r = store->queue_transaction(service->meta_ch, std::move(t));
6002 if (r < 0)
6003 ss << "error=" << r;
6004 else
6005 ss << "ok";
6006 } else if (command == "setomapheader") {
6007 bufferlist newheader;
6008 string headerstr;
6009
6010 cmd_getval(cmdmap, "header", headerstr);
6011 newheader.append(headerstr);
6012 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6013 r = store->queue_transaction(service->meta_ch, std::move(t));
6014 if (r < 0)
6015 ss << "error=" << r;
6016 else
6017 ss << "ok";
6018 } else if (command == "getomap") {
6019 //Debug: Output entire omap
6020 bufferlist hdrbl;
6021 map<string, bufferlist> keyvals;
6022 auto ch = store->open_collection(coll_t(pgid));
6023 if (!ch) {
6024 ss << "unable to open collection for " << pgid;
6025 r = -ENOENT;
6026 } else {
6027 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6028 if (r >= 0) {
6029 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6030 for (map<string, bufferlist>::iterator it = keyvals.begin();
6031 it != keyvals.end(); ++it)
6032 ss << " key=" << (*it).first << " val="
6033 << string((*it).second.c_str(), (*it).second.length());
6034 } else {
6035 ss << "error=" << r;
6036 }
6037 }
6038 } else if (command == "truncobj") {
6039 int64_t trunclen;
6040 cmd_getval(cmdmap, "len", trunclen);
6041 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6042 r = store->queue_transaction(service->meta_ch, std::move(t));
6043 if (r < 0)
6044 ss << "error=" << r;
6045 else
6046 ss << "ok";
6047 } else if (command == "injectdataerr") {
6048 store->inject_data_error(gobj);
6049 ss << "ok";
6050 } else if (command == "injectmdataerr") {
6051 store->inject_mdata_error(gobj);
6052 ss << "ok";
6053 }
6054 return;
6055 }
6056 if (command == "set_recovery_delay") {
6057 int64_t delay;
6058 cmd_getval(cmdmap, "utime", delay, (int64_t)0);
6059 ostringstream oss;
6060 oss << delay;
6061 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6062 oss.str().c_str());
6063 if (r != 0) {
6064 ss << "set_recovery_delay: error setting "
6065 << "osd_recovery_delay_start to '" << delay << "': error "
6066 << r;
6067 return;
6068 }
6069 service->cct->_conf.apply_changes(nullptr);
6070 ss << "set_recovery_delay: set osd_recovery_delay_start "
6071 << "to " << service->cct->_conf->osd_recovery_delay_start;
6072 return;
6073 }
6074 if (command == "injectfull") {
6075 int64_t count;
6076 string type;
6077 OSDService::s_names state;
6078 cmd_getval(cmdmap, "type", type, string("full"));
6079 cmd_getval(cmdmap, "count", count, (int64_t)-1);
6080 if (type == "none" || count == 0) {
6081 type = "none";
6082 count = 0;
6083 }
6084 state = service->get_full_state(type);
6085 if (state == OSDService::s_names::INVALID) {
6086 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6087 return;
6088 }
6089 service->set_injectfull(state, count);
6090 return;
6091 }
6092 ss << "Internal error - command=" << command;
6093 }
6094
6095 // =========================================
6096
// Messenger callback: a connection we initiated has been (re)established.
// Only the mon connection matters here: a reconnect implies a brand-new mon
// session, so everything the mon tracks for us must be resent.
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      start_boot();
    } else if (is_booting()) {
      _send_boot();  // resend boot message
    } else {
      // lock order: map_lock (shared) is taken before mon_report_lock
      map_lock.lock_shared();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.unlock_shared();
      if (is_active()) {
        send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6140
6141 void OSD::ms_handle_fast_connect(Connection *con)
6142 {
6143 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6144 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6145 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6146 s = ceph::make_ref<Session>(cct, con);
6147 con->set_priv(s);
6148 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6149 << " addr=" << s->con->get_peer_addr() << dendl;
6150 // we don't connect to clients
6151 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6152 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6153 }
6154 }
6155 }
6156
6157 void OSD::ms_handle_fast_accept(Connection *con)
6158 {
6159 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6160 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6161 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6162 s = ceph::make_ref<Session>(cct, con);
6163 con->set_priv(s);
6164 dout(10) << "new session (incoming)" << s << " con=" << con
6165 << " addr=" << con->get_peer_addr()
6166 << " must have raced with connect" << dendl;
6167 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6168 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6169 }
6170 }
6171 }
6172
// Messenger callback: the peer reset the session on this connection.
// Tear down the associated Session, if any.  Returns true if we owned a
// session for the connection.
bool OSD::ms_handle_reset(Connection *con)
{
  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset();  // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(session);
  return true;
}
6188
// Messenger callback: the peer actively refused our connection.  When
// osd_fast_fail_on_connection_refused is enabled, report a still-up peer
// OSD as failed to the mon immediately instead of waiting out the
// heartbeat grace period.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      // resolve the peer address to an osd id across all channels
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
        // I'm cheating mon heartbeat grace logic, because we know it's not going
        // to respawn alone. +1 so we won't hit any boundary case.
        monc->send_mon_message(
          new MOSDFailure(
            monc->get_fsid(),
            id,
            osdmap->get_addrs(id),
            cct->_conf->osd_heartbeat_grace + 1,
            osdmap->get_epoch(),
            MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
            ));
      }
    }
  }
  return true;
}
6221
6222 struct C_OSD_GetVersion : public Context {
6223 OSD *osd;
6224 uint64_t oldest, newest;
6225 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6226 void finish(int r) override {
6227 if (r >= 0)
6228 osd->_got_mon_epochs(oldest, newest);
6229 }
6230 };
6231
// Begin the boot sequence.  If the OSD is healthy, enter PREBOOT and ask
// the mon which osdmap epochs it holds; the async reply continues in
// _got_mon_epochs() -> _preboot().  If unhealthy, wait and keep pinging.
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
           << ".." << superblock.newest_map << dendl;
  // the mon fills in c->newest/c->oldest before completing c
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
6250
6251 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6252 {
6253 std::lock_guard l(osd_lock);
6254 if (is_preboot()) {
6255 _preboot(oldest, newest);
6256 }
6257 }
6258
// Decide whether we are ready to mark ourselves up, given the mon's
// [oldest, newest] osdmap range.  Either queues the boot message on the
// boot_finisher, or falls through to subscribe for newer maps and waits
// to be called again.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
           << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  const auto& monmap = monc->monmap;
  const auto osdmap = get_osdmap();
  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
         << dendl;
  } else if (osdmap->require_osd_release < ceph_release_t::luminous) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
         << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
             superblock.purged_snaps_last < superblock.current_epoch) {
    // catch up on purged-snaps records before booting (octopus+ mons)
    dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
             << " < newest_map " << superblock.current_epoch << dendl;
    _get_purged_snaps();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
             osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new LambdaContext(
        [this](int r) {
          std::unique_lock l(osd_lock);
          if (is_preboot()) {
            dout(10) << __func__ << " waiting for peering work to drain"
                     << dendl;
            // drop osd_lock while waiting; the shards may need it
            l.unlock();
            for (auto shard : shards) {
              shard->wait_min_pg_epoch(get_osdmap_epoch());
            }
            l.lock();
          }
          // re-check under osd_lock: state may have changed while waiting
          if (is_preboot()) {
            _send_boot();
          }
        }));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6331
6332 void OSD::_get_purged_snaps()
6333 {
6334 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6335 // overlapping requests to the mon, which will be somewhat inefficient, but
6336 // it should be reliable.
6337 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6338 << ", newest_map " << superblock.current_epoch << dendl;
6339 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6340 superblock.purged_snaps_last + 1,
6341 superblock.current_epoch + 1);
6342 monc->send_mon_message(m);
6343 }
6344
6345 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6346 {
6347 dout(10) << __func__ << " " << *m << dendl;
6348 ObjectStore::Transaction t;
6349 if (!is_preboot() ||
6350 m->last < superblock.purged_snaps_last) {
6351 goto out;
6352 }
6353 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
6354 make_purged_snaps_oid(), &t,
6355 m->purged_snaps);
6356 superblock.purged_snaps_last = m->last;
6357 write_superblock(t);
6358 store->queue_transaction(
6359 service.meta_ch,
6360 std::move(t));
6361 service.publish_superblock(superblock);
6362 if (m->last < superblock.current_epoch) {
6363 _get_purged_snaps();
6364 } else {
6365 start_boot();
6366 }
6367 out:
6368 m->put();
6369 }
6370
6371 void OSD::send_full_update()
6372 {
6373 if (!service.need_fullness_update())
6374 return;
6375 unsigned state = 0;
6376 if (service.is_full()) {
6377 state = CEPH_OSD_FULL;
6378 } else if (service.is_backfillfull()) {
6379 state = CEPH_OSD_BACKFILLFULL;
6380 } else if (service.is_nearfull()) {
6381 state = CEPH_OSD_NEARFULL;
6382 }
6383 set<string> s;
6384 OSDMap::calc_state_set(state, s);
6385 dout(10) << __func__ << " want state " << s << dendl;
6386 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6387 }
6388
6389 void OSD::start_waiting_for_healthy()
6390 {
6391 dout(1) << "start_waiting_for_healthy" << dendl;
6392 set_state(STATE_WAITING_FOR_HEALTHY);
6393 last_heartbeat_resample = utime_t();
6394
6395 // subscribe to osdmap updates, in case our peers really are known to be dead
6396 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6397 }
6398
6399 bool OSD::_is_healthy()
6400 {
6401 if (!cct->get_heartbeat_map()->is_healthy()) {
6402 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6403 return false;
6404 }
6405
6406 if (is_waiting_for_healthy()) {
6407 utime_t now = ceph_clock_now();
6408 if (osd_markdown_log.empty()) {
6409 dout(5) << __func__ << " force returning true since last markdown"
6410 << " was " << cct->_conf->osd_max_markdown_period
6411 << "s ago" << dendl;
6412 return true;
6413 }
6414 std::lock_guard l(heartbeat_lock);
6415 int num = 0, up = 0;
6416 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6417 p != heartbeat_peers.end();
6418 ++p) {
6419 if (p->second.is_healthy(now))
6420 ++up;
6421 ++num;
6422 }
6423 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6424 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6425 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6426 return false;
6427 }
6428 }
6429
6430 return true;
6431 }
6432
// Compose and send the MOSDBoot message.  Before sending, resolve any
// still-unknown bind addresses on the cluster and heartbeat messengers by
// borrowing from addresses we do know, and make sure each messenger's
// loopback connection has a session attached.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  // fill in unknown cluster addrs from the client (public) addrs
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
             << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // hb_back: fill in unknowns from the (possibly just-updated) cluster addrs
  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
             << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // hb_front: fill in unknowns from the client addrs
  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
             << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6495
// Fill *pm with this OSD's metadata key/value pairs: config paths,
// messenger addresses, objectstore/backend properties, system info,
// network interfaces and their NUMA placement, and per-device metadata.
// Sent to the mon as part of MOSDBoot (see _send_boot).
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    // determine which NUMA node(s) our network interfaces resolve to;
    // ifaces that can't be resolved are collected in `unknown`
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
        unknown.insert(nm);
        continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
        unknown.insert((*pm)[nm]);
        continue;
      }
      nodes.insert(n);
      if (node < 0) {
        node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    // report a single node only when everything resolved unambiguously
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
                                                  &numa_cpu_set);
  }

  set<string> devnames;
  store->get_devices(&devnames);
  map<string,string> errs;
  get_device_metadata(devnames, pm, &errs);
  for (auto& i : errs) {
    dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
  }
  dout(10) << __func__ << " " << *pm << dendl;
}
6572
// Record that we want the mon to advance our up_thru to at least `want`,
// and nudge the mon (send_alive) if this raises the wanted value.
void OSD::queue_want_up_thru(epoch_t want)
{
  // lock order: map_lock (shared) before mon_report_lock
  std::shared_lock map_locker{map_lock};
  epoch_t cur = get_osdmap()->get_up_thru(whoami);
  std::lock_guard report_locker(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
             << ", currently " << cur
             << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    // already asked for an epoch at least this new; nothing to do
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
             << ", currently " << cur
             << dendl;
  }
}
6590
6591 void OSD::send_alive()
6592 {
6593 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6594 const auto osdmap = get_osdmap();
6595 if (!osdmap->exists(whoami))
6596 return;
6597 epoch_t up_thru = osdmap->get_up_thru(whoami);
6598 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6599 if (up_thru_wanted > up_thru) {
6600 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6601 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6602 }
6603 }
6604
// Ask the mon for full (non-incremental) osdmaps covering [first, last],
// merging the request with any range already outstanding (tracked in
// requested_full_first/requested_full_last).
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  dout(10) << __func__ << " " << first << ".." << last
           << ", previously requested "
           << requested_full_first << ".." << requested_full_last << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  ceph_assert(first > 0 && last > 0);
  ceph_assert(first <= last);
  ceph_assert(first >= requested_full_first);  // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request: only ask for the epochs beyond what is already
    // in flight
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
6630
// Note receipt of full map epoch `e`, advancing or clearing the
// outstanding requested_full_first..requested_full_last window.
void OSD::got_full_map(epoch_t e)
{
  ceph_assert(requested_full_first <= requested_full_last);
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  if (requested_full_first == 0) {
    // no request outstanding
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // stale map; predates the outstanding window
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
             << ".." << requested_full_last
             << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    // whole requested range satisfied; clear the window
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
             << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // partial progress: still waiting for (e, requested_full_last]
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
           << ".." << requested_full_last
           << ", still need more" << dendl;
}
6658
6659 void OSD::requeue_failures()
6660 {
6661 std::lock_guard l(heartbeat_lock);
6662 unsigned old_queue = failure_queue.size();
6663 unsigned old_pending = failure_pending.size();
6664 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6665 failure_queue[p->first] = p->second.first;
6666 failure_pending.erase(p++);
6667 }
6668 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6669 << failure_queue.size() << dendl;
6670 }
6671
// Drain failure_queue: send an MOSDFailure to the mon for each queued OSD
// (unless a report for it is already in flight), and record each report in
// failure_pending so it can later be canceled (send_still_alive) or
// requeued (requeue_failures).
void OSD::send_failures()
{
  ceph_assert(ceph_mutex_is_locked(map_lock));
  ceph_assert(ceph_mutex_is_locked(mon_report_lock));
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  const auto osdmap = get_osdmap();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    if (!failure_pending.count(osd)) {
      // seconds elapsed since we first noticed the failure
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
        new MOSDFailure(
          monc->get_fsid(),
          osd,
          osdmap->get_addrs(osd),
          failed_for,
          osdmap->get_epoch()));
      // remember the failure time and the addrs we reported
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
                                       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
6696
6697 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6698 {
6699 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6700 MOSDFailure::FLAG_ALIVE);
6701 monc->send_mon_message(m);
6702 }
6703
6704 void OSD::cancel_pending_failures()
6705 {
6706 std::lock_guard l(heartbeat_lock);
6707 auto it = failure_pending.begin();
6708 while (it != failure_pending.end()) {
6709 dout(10) << __func__ << " canceling in-flight failure report for osd."
6710 << it->first << dendl;
6711 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6712 failure_pending.erase(it++);
6713 }
6714 }
6715
// Send an MOSDBeacon to the mon (only when the monmap requires luminous+
// features), carrying min_last_epoch_clean and its PG list.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // snapshot beacon contents under min_last_epoch_clean_lock
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(get_osdmap_epoch(),
                              min_last_epoch_clean,
                              superblock.last_purged_snaps_scrub);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    // send outside the lock
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
6739
6740 void OSD::handle_command(MCommand *m)
6741 {
6742 ConnectionRef con = m->get_connection();
6743 auto session = ceph::ref_cast<Session>(con->get_priv());
6744 if (!session) {
6745 con->send_message(new MCommandReply(m, -EACCES));
6746 m->put();
6747 return;
6748 }
6749 if (!session->caps.allow_all()) {
6750 con->send_message(new MCommandReply(m, -EACCES));
6751 m->put();
6752 return;
6753 }
6754 cct->get_admin_socket()->queue_tell_command(m);
6755 m->put();
6756 }
6757
6758 namespace {
6759 class unlock_guard {
6760 ceph::mutex& m;
6761 public:
6762 explicit unlock_guard(ceph::mutex& mutex)
6763 : m(mutex)
6764 {
6765 m.unlock();
6766 }
6767 unlock_guard(unlock_guard&) = delete;
6768 ~unlock_guard() {
6769 m.lock();
6770 }
6771 };
6772 }
6773
// Run the SnapMapper scrubber over the purged-snaps metadata, requeue any
// stray (pg, snap) pairs for retrim on their PGs, and record the scrub
// time in the superblock.  osd_lock is dropped while the scrub runs.
void OSD::scrub_purged_snaps()
{
  dout(10) << __func__ << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  SnapMapper::Scrubber s(cct, store, service.meta_ch,
                         make_snapmapper_oid(),
                         make_purged_snaps_oid());
  clog->debug() << "purged_snaps scrub starts";
  // the scrub can be long-running; do not hold osd_lock across it
  osd_lock.unlock();
  s.run();
  if (s.stray.size()) {
    clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
  } else {
    clog->debug() << "purged_snaps scrub ok";
  }
  set<pair<spg_t,snapid_t>> queued;
  for (auto& [pool, snap, hash, shard] : s.stray) {
    const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
    if (!pi) {
      dout(20) << __func__ << " pool " << pool << " dne" << dendl;
      continue;
    }
    pg_t pgid(pi->raw_hash_to_pg(hash), pool);
    spg_t spgid(pgid, shard);
    pair<spg_t,snapid_t> p(spgid, snap);
    // queue each (pg, snap) pair at most once
    if (queued.count(p)) {
      dout(20) << __func__ << " pg " << spgid << " snap " << snap
               << " already queued" << dendl;
      continue;
    }
    PGRef pg = lookup_lock_pg(spgid);
    if (!pg) {
      dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
      continue;
    }
    queued.insert(p);
    dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
             << snap << dendl;
    pg->queue_snap_retrim(snap);
    pg->unlock();
  }
  // re-take osd_lock before touching the superblock (caller expects it held)
  osd_lock.lock();
  if (is_stopping()) {
    return;
  }
  dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
  ObjectStore::Transaction t;
  superblock.last_purged_snaps_scrub = ceph_clock_now();
  write_superblock(t);
  int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
  ceph_assert(tr == 0);
  if (is_active()) {
    send_beacon(ceph::coarse_mono_clock::now());
  }
  dout(10) << __func__ << " done" << dendl;
}
6830
6831 void OSD::probe_smart(const string& only_devid, ostream& ss)
6832 {
6833 set<string> devnames;
6834 store->get_devices(&devnames);
6835 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6836 "osd_smart_report_timeout");
6837
6838 // == typedef std::map<std::string, mValue> mObject;
6839 json_spirit::mObject json_map;
6840
6841 for (auto dev : devnames) {
6842 // smartctl works only on physical devices; filter out any logical device
6843 if (dev.find("dm-") == 0) {
6844 continue;
6845 }
6846
6847 string err;
6848 string devid = get_device_id(dev, &err);
6849 if (devid.size() == 0) {
6850 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6851 << err << "), skipping" << dendl;
6852 continue;
6853 }
6854 if (only_devid.size() && devid != only_devid) {
6855 continue;
6856 }
6857
6858 json_spirit::mValue smart_json;
6859 if (block_device_get_metrics(dev, smart_timeout,
6860 &smart_json)) {
6861 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6862 continue;
6863 }
6864 json_map[devid] = smart_json;
6865 }
6866 json_spirit::write(json_map, ss, json_spirit::pretty_print);
6867 }
6868
6869 bool OSD::heartbeat_dispatch(Message *m)
6870 {
6871 dout(30) << "heartbeat_dispatch " << m << dendl;
6872 switch (m->get_type()) {
6873
6874 case CEPH_MSG_PING:
6875 dout(10) << "ping from " << m->get_source_inst() << dendl;
6876 m->put();
6877 break;
6878
6879 case MSG_OSD_PING:
6880 handle_osd_ping(static_cast<MOSDPing*>(m));
6881 break;
6882
6883 default:
6884 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6885 m->put();
6886 }
6887
6888 return true;
6889 }
6890
// Slow-path message dispatch: runs under osd_lock.  MOSDMarkMeDown acks
// are acknowledged without taking the lock.  Always returns true (message
// consumed).
bool OSD::ms_dispatch(Message *m)
{
  dout(20) << "OSD::ms_dispatch: " << *m << dendl;
  if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
    service.got_stop_ack();
    m->put();
    return true;
  }

  // lock!

  osd_lock.lock();
  if (is_stopping()) {
    osd_lock.unlock();
    m->put();
    return true;
  }

  // process previously deferred messages first, then this one
  do_waiters();
  _dispatch(m);

  osd_lock.unlock();

  return true;
}
6916
// Send the peer on `con` any osdmap incrementals it appears to be missing,
// given peer_epoch_lb (a lower bound on the epoch the peer already has).
// The session's last_sent_epoch tracks what we believe the peer has seen;
// it is read and updated under sent_epoch_lock, which is dropped while the
// maps are actually sent.
void OSDService::maybe_share_map(
  Connection *con,
  const OSDMapRef& osdmap,
  epoch_t peer_epoch_lb)
{
  // NOTE: we assume caller hold something that keeps the Connection itself
  // pinned (e.g., an OpRequest's MessageRef).
  auto session = ceph::ref_cast<Session>(con->get_priv());
  if (!session) {
    return;
  }

  // assume the peer has the newer of the op's sent_epoch and what
  // we think we sent them.
  session->sent_epoch_lock.lock();
  if (peer_epoch_lb > session->last_sent_epoch) {
    dout(10) << __func__ << " con " << con
             << " " << con->get_peer_addr()
             << " map epoch " << session->last_sent_epoch
             << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
    session->last_sent_epoch = peer_epoch_lb;
  }
  epoch_t last_sent_epoch = session->last_sent_epoch;
  session->sent_epoch_lock.unlock();

  if (osdmap->get_epoch() <= last_sent_epoch) {
    // peer already has everything we could send
    return;
  }

  // send (last_sent_epoch, osdmap->get_epoch()] without holding the lock
  send_incremental_map(last_sent_epoch, con, osdmap);
  last_sent_epoch = osdmap->get_epoch();

  // re-take the lock and advance last_sent_epoch, unless another thread
  // advanced it even further while we were sending
  session->sent_epoch_lock.lock();
  if (session->last_sent_epoch < last_sent_epoch) {
    dout(10) << __func__ << " con " << con
             << " " << con->get_peer_addr()
             << " map epoch " << session->last_sent_epoch
             << " -> " << last_sent_epoch << " (shared)" << dendl;
    session->last_sent_epoch = last_sent_epoch;
  }
  session->sent_epoch_lock.unlock();
}
6959
// Drain the session's waiting_on_map queue, enqueueing each op whose
// min_epoch is satisfied by `osdmap`; stop at the first op that still
// needs a newer map, to preserve per-session delivery order.
// Caller must hold session->session_dispatch_lock.
void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
{
  ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    auto m = op->get_req<MOSDFastDispatchOp>();
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // not dispatchable yet; stop so later ops don't jump the queue
      break;
    }
    // remove from the intrusive list and drop the list's ref
    // (op still holds its own ref)
    session->waiting_on_map.erase(i++);
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // map the client's raw pg to the actual pg, then its primary shard
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
        static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
        // no primary shard in this map; drop the op
        continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
6994
// Fast-path dispatch, called directly from messenger threads (no osd_lock).
// Control and peering messages are routed to dedicated handlers; anything
// else becomes an OpRequest and is queued to the op shards, either directly
// (when the message carries an spg_t or the client can resend on split) or
// via the session's waiting_on_map queue for legacy MOSDOp clients.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_PG_NOTIFY2:
  case MSG_OSD_PG_QUERY2:
  case MSG_OSD_PG_INFO2:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
  case MSG_OSD_PG_LEASE:
  case MSG_OSD_PG_LEASE_ACK:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
        enqueue_peering_evt(
          pm->get_spg(),
          PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch);  // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // extra ref for the intrusive waiting_on_map list
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7091
7092 int OSD::ms_handle_authentication(Connection *con)
7093 {
7094 int ret = 0;
7095 auto s = ceph::ref_cast<Session>(con->get_priv());
7096 if (!s) {
7097 s = ceph::make_ref<Session>(cct, con);
7098 con->set_priv(s);
7099 s->entity_name = con->get_peer_entity_name();
7100 dout(10) << __func__ << " new session " << s << " con " << s->con
7101 << " entity " << s->entity_name
7102 << " addr " << con->get_peer_addrs() << dendl;
7103 } else {
7104 dout(10) << __func__ << " existing session " << s << " con " << s->con
7105 << " entity " << s->entity_name
7106 << " addr " << con->get_peer_addrs() << dendl;
7107 }
7108
7109 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7110 if (caps_info.allow_all) {
7111 s->caps.set_allow_all();
7112 } else if (caps_info.caps.length() > 0) {
7113 bufferlist::const_iterator p = caps_info.caps.cbegin();
7114 string str;
7115 try {
7116 decode(str, p);
7117 }
7118 catch (buffer::error& e) {
7119 dout(10) << __func__ << " session " << s << " " << s->entity_name
7120 << " failed to decode caps string" << dendl;
7121 ret = -EACCES;
7122 }
7123 if (!ret) {
7124 bool success = s->caps.parse(str);
7125 if (success) {
7126 dout(10) << __func__ << " session " << s
7127 << " " << s->entity_name
7128 << " has caps " << s->caps << " '" << str << "'" << dendl;
7129 ret = 1;
7130 } else {
7131 dout(10) << __func__ << " session " << s << " " << s->entity_name
7132 << " failed to parse caps '" << str << "'" << dendl;
7133 ret = -EACCES;
7134 }
7135 }
7136 }
7137 return ret;
7138 }
7139
7140 void OSD::do_waiters()
7141 {
7142 ceph_assert(ceph_mutex_is_locked(osd_lock));
7143
7144 dout(10) << "do_waiters -- start" << dendl;
7145 while (!finished.empty()) {
7146 OpRequestRef next = finished.front();
7147 finished.pop_front();
7148 dispatch_op(next);
7149 }
7150 dout(10) << "do_waiters -- finish" << dendl;
7151 }
7152
7153 void OSD::dispatch_op(OpRequestRef op)
7154 {
7155 switch (op->get_req()->get_type()) {
7156
7157 case MSG_OSD_PG_CREATE:
7158 handle_pg_create(op);
7159 break;
7160 }
7161 }
7162
// Slow-path (osd_lock held) dispatch for messages that were not consumed by
// fast dispatch.  Each case takes ownership of the message reference.
void OSD::_dispatch(Message *m)
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;
  case MSG_MON_GET_PURGED_SNAPS_REPLY:
    handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
    break;

  // osd
  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;

  // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      // wrap in an OpRequest; the OpRequest now owns the message ref
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
        op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map? starting up?
      if (!get_osdmap()) {
        dout(7) << "no OSDMap, not booted" << dendl;
        logger->inc(l_osd_waiting_for_map);
        // park the op until a map arrives; do_waiters() will replay it
        waiting_for_osdmap.push_back(op);
        op->mark_delayed("no osdmap");
        break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7209
7210 // remove me post-nautilus
7211 void OSD::handle_scrub(MOSDScrub *m)
7212 {
7213 dout(10) << "handle_scrub " << *m << dendl;
7214 if (!require_mon_or_mgr_peer(m)) {
7215 m->put();
7216 return;
7217 }
7218 if (m->fsid != monc->get_fsid()) {
7219 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7220 << dendl;
7221 m->put();
7222 return;
7223 }
7224
7225 vector<spg_t> spgs;
7226 _get_pgids(&spgs);
7227
7228 if (!m->scrub_pgs.empty()) {
7229 vector<spg_t> v;
7230 for (auto pgid : m->scrub_pgs) {
7231 spg_t pcand;
7232 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
7233 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7234 v.push_back(pcand);
7235 }
7236 }
7237 spgs.swap(v);
7238 }
7239
7240 for (auto pgid : spgs) {
7241 enqueue_peering_evt(
7242 pgid,
7243 PGPeeringEventRef(
7244 std::make_shared<PGPeeringEvent>(
7245 get_osdmap_epoch(),
7246 get_osdmap_epoch(),
7247 PeeringState::RequestScrub(m->deep, m->repair))));
7248 }
7249
7250 m->put();
7251 }
7252
7253 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7254 {
7255 dout(10) << __func__ << " " << *m << dendl;
7256 if (!require_mon_or_mgr_peer(m)) {
7257 m->put();
7258 return;
7259 }
7260 if (m->fsid != monc->get_fsid()) {
7261 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7262 << dendl;
7263 m->put();
7264 return;
7265 }
7266 for (auto pgid : m->scrub_pgs) {
7267 enqueue_peering_evt(
7268 pgid,
7269 PGPeeringEventRef(
7270 std::make_shared<PGPeeringEvent>(
7271 m->epoch,
7272 m->epoch,
7273 PeeringState::RequestScrub(m->deep, m->repair))));
7274 }
7275 m->put();
7276 }
7277
7278 bool OSD::scrub_random_backoff()
7279 {
7280 bool coin_flip = (rand() / (double)RAND_MAX >=
7281 cct->_conf->osd_scrub_backoff_ratio);
7282 if (!coin_flip) {
7283 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7284 return true;
7285 }
7286 return false;
7287 }
7288
7289 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7290 const spg_t& pg, const utime_t& timestamp,
7291 double pool_scrub_min_interval,
7292 double pool_scrub_max_interval, bool must)
7293 : cct(cct),
7294 pgid(pg),
7295 sched_time(timestamp),
7296 deadline(timestamp)
7297 {
7298 // if not explicitly requested, postpone the scrub with a random delay
7299 if (!must) {
7300 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7301 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7302 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7303 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7304
7305 sched_time += scrub_min_interval;
7306 double r = rand() / (double)RAND_MAX;
7307 sched_time +=
7308 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7309 if (scrub_max_interval == 0) {
7310 deadline = utime_t();
7311 } else {
7312 deadline += scrub_max_interval;
7313 }
7314
7315 }
7316 }
7317
7318 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7319 if (sched_time < rhs.sched_time)
7320 return true;
7321 if (sched_time > rhs.sched_time)
7322 return false;
7323 return pgid < rhs.pgid;
7324 }
7325
7326 double OSD::scrub_sleep_time(bool must_scrub)
7327 {
7328 if (must_scrub) {
7329 return cct->_conf->osd_scrub_sleep;
7330 }
7331 utime_t now = ceph_clock_now();
7332 if (scrub_time_permit(now)) {
7333 return cct->_conf->osd_scrub_sleep;
7334 }
7335 double normal_sleep = cct->_conf->osd_scrub_sleep;
7336 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7337 return std::max(extended_sleep, normal_sleep);
7338 }
7339
7340 bool OSD::scrub_time_permit(utime_t now)
7341 {
7342 struct tm bdt;
7343 time_t tt = now.sec();
7344 localtime_r(&tt, &bdt);
7345
7346 bool day_permit = false;
7347 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7348 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7349 day_permit = true;
7350 }
7351 } else {
7352 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7353 day_permit = true;
7354 }
7355 }
7356
7357 if (!day_permit) {
7358 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7359 << " - " << cct->_conf->osd_scrub_end_week_day
7360 << " now " << bdt.tm_wday << " = no" << dendl;
7361 return false;
7362 }
7363
7364 bool time_permit = false;
7365 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7366 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7367 time_permit = true;
7368 }
7369 } else {
7370 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7371 time_permit = true;
7372 }
7373 }
7374 if (!time_permit) {
7375 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7376 << " - " << cct->_conf->osd_scrub_end_hour
7377 << " now " << bdt.tm_hour << " = no" << dendl;
7378 } else {
7379 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7380 << " - " << cct->_conf->osd_scrub_end_hour
7381 << " now " << bdt.tm_hour << " = yes" << dendl;
7382 }
7383 return time_permit;
7384 }
7385
7386 bool OSD::scrub_load_below_threshold()
7387 {
7388 double loadavgs[3];
7389 if (getloadavg(loadavgs, 3) != 3) {
7390 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7391 return false;
7392 }
7393
7394 // allow scrub if below configured threshold
7395 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7396 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7397 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7398 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7399 << " < max " << cct->_conf->osd_scrub_load_threshold
7400 << " = yes" << dendl;
7401 return true;
7402 }
7403
7404 // allow scrub if below daily avg and currently decreasing
7405 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7406 dout(20) << __func__ << " loadavg " << loadavgs[0]
7407 << " < daily_loadavg " << daily_loadavg
7408 << " and < 15m avg " << loadavgs[2]
7409 << " = yes" << dendl;
7410 return true;
7411 }
7412
7413 dout(20) << __func__ << " loadavg " << loadavgs[0]
7414 << " >= max " << cct->_conf->osd_scrub_load_threshold
7415 << " and ( >= daily_loadavg " << daily_loadavg
7416 << " or >= 15m avg " << loadavgs[2]
7417 << ") = no" << dendl;
7418 return false;
7419 }
7420
// Walk the scrub-job queue (ordered by sched_time) and try to start the
// first eligible scrub.  Scheduling stops (break) on the first job in the
// future, on a job mid-reservation, or once a scrub is launched; jobs that
// are individually ineligible are skipped (continue).
void OSD::sched_scrub()
{
  // if not permitted, fail fast
  if (!service.can_inc_scrubs()) {
    return;
  }
  bool allow_requested_repair_only = false;
  if (service.is_recovery_active()) {
    // during recovery: repairs may be allowed even when scrubs are not
    if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
      dout(10) << __func__
               << " will only schedule explicitly requested repair due to active recovery"
               << dendl;
      allow_requested_repair_only = true;
    } else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
      return;
    }
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      if (scrub.sched_time > now) {
        // save ourselves some effort
        // the queue is time-ordered, so nothing later is due either
        dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
                 << " > " << now << dendl;
        break;
      }

      // before the deadline, a scrub needs both the time window and low load;
      // past the deadline (non-zero and < now) it runs regardless
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
        dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
                 << (!time_permit ? "time not permit" : "high load") << dendl;
        continue;
      }

      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
        continue;
      // This has already started, so go on to the next scrub job
      if (pg->scrubber.active) {
        pg->unlock();
        dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
        continue;
      }
      // Skip other kinds of scrubing if only explicitly requested repairing is allowed
      if (allow_requested_repair_only && !pg->scrubber.must_repair) {
        pg->unlock();
        dout(10) << __func__ << " skip " << scrub.pgid
                 << " because repairing is not explicitly requested on it"
                 << dendl;
        continue;
      }
      // If it is reserving, let it resolve before going to the next scrub job
      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
        pg->unlock();
        dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
        break;
      }
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
               << (pg->get_must_scrub() ? ", explicitly requested" :
                   (load_is_low ? ", load_is_low" : " deadline < now"))
               << dendl;
      // pg->sched_scrub() returning true means a scrub was kicked off;
      // we only start one per call
      if (pg->sched_scrub()) {
        pg->unlock();
        break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
7499
7500 void OSD::resched_all_scrubs()
7501 {
7502 dout(10) << __func__ << ": start" << dendl;
7503 OSDService::ScrubJob scrub;
7504 if (service.first_scrub_stamp(&scrub)) {
7505 do {
7506 dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
7507
7508 PGRef pg = _lookup_lock_pg(scrub.pgid);
7509 if (!pg)
7510 continue;
7511 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
7512 dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
7513 pg->on_info_history_change();
7514 }
7515 pg->unlock();
7516 } while (service.next_scrub_stamp(scrub, &scrub));
7517 }
7518 dout(10) << __func__ << ": done" << dendl;
7519 }
7520
7521 MPGStats* OSD::collect_pg_stats()
7522 {
7523 // This implementation unconditionally sends every is_primary PG's
7524 // stats every time we're called. This has equivalent cost to the
7525 // previous implementation's worst case where all PGs are busy and
7526 // their stats are always enqueued for sending.
7527 std::shared_lock l{map_lock};
7528
7529 osd_stat_t cur_stat = service.get_osd_stat();
7530 cur_stat.os_perf_stat = store->get_cur_stats();
7531
7532 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
7533 m->osd_stat = cur_stat;
7534
7535 std::lock_guard lec{min_last_epoch_clean_lock};
7536 min_last_epoch_clean = get_osdmap_epoch();
7537 min_last_epoch_clean_pgs.clear();
7538
7539 std::set<int64_t> pool_set;
7540 vector<PGRef> pgs;
7541 _get_pgs(&pgs);
7542 for (auto& pg : pgs) {
7543 auto pool = pg->pg_id.pgid.pool();
7544 pool_set.emplace((int64_t)pool);
7545 if (!pg->is_primary()) {
7546 continue;
7547 }
7548 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7549 m->pg_stat[pg->pg_id.pgid] = s;
7550 min_last_epoch_clean = min(min_last_epoch_clean, lec);
7551 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7552 });
7553 }
7554 store_statfs_t st;
7555 bool per_pool_stats = false;
7556 bool per_pool_omap_stats = false;
7557 for (auto p : pool_set) {
7558 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
7559 if (r == -ENOTSUP) {
7560 break;
7561 } else {
7562 assert(r >= 0);
7563 m->pool_stat[p] = st;
7564 per_pool_stats = true;
7565 }
7566 }
7567
7568 // indicate whether we are reporting per-pool stats
7569 m->osd_stat.num_osds = 1;
7570 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7571 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
7572
7573 return m;
7574 }
7575
// Produce the daemon health metrics reported to the mgr: the number of
// slow (older than osd_op_complaint_time) in-flight ops, and the number
// of pending pg creations where we would be primary.
vector<DaemonHealthMetric> OSD::get_health_metrics()
{
  vector<DaemonHealthMetric> metrics;
  {
    utime_t oldest_secs;
    const utime_t now = ceph_clock_now();
    auto too_old = now;
    too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
    int slow = 0;
    TrackedOpRef oldest_op;
    // visitor: count ops initiated before the complaint cutoff, log each,
    // and remember the oldest one
    auto count_slow_ops = [&](TrackedOp& op) {
      if (op.get_initiated() < too_old) {
        stringstream ss;
        ss << "slow request " << op.get_desc()
           << " initiated "
           << op.get_initiated()
           << " currently "
           << op.state_string();
        lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
        clog->warn() << ss.str();
        slow++;
        if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
          oldest_op = &op;
        }
        return true;
      } else {
        return false;
      }
    };
    if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
      if (slow) {
        derr << __func__ << " reporting " << slow << " slow ops, oldest is "
             << oldest_op->get_desc() << dendl;
      }
      metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
    } else {
      // no news is not good news.
      // visit failed (op tracking disabled); report zeros rather than nothing
      metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
    }
  }
  {
    std::lock_guard l(pending_creates_lock);
    auto n_primaries = pending_creates_from_mon;
    // count osd-sourced pending creates where .second marks us as primary
    for (const auto& create : pending_creates_from_osd) {
      if (create.second) {
        n_primaries++;
      }
    }
    metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
  }
  return metrics;
}
7628
7629 // =====================================================
7630 // MAP
7631
7632 void OSD::wait_for_new_map(OpRequestRef op)
7633 {
7634 // ask?
7635 if (waiting_for_osdmap.empty()) {
7636 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7637 }
7638
7639 logger->inc(l_osd_waiting_for_map);
7640 waiting_for_osdmap.push_back(op);
7641 op->mark_delayed("wait for new map");
7642 }
7643
7644
7645 /** update_map
7646 * assimilate new OSDMap(s). scan pgs, etc.
7647 */
7648
7649 void OSD::note_down_osd(int peer)
7650 {
7651 ceph_assert(ceph_mutex_is_locked(osd_lock));
7652 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7653
7654 std::lock_guard l{heartbeat_lock};
7655 failure_queue.erase(peer);
7656 failure_pending.erase(peer);
7657 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7658 if (p != heartbeat_peers.end()) {
7659 p->second.clear_mark_down();
7660 heartbeat_peers.erase(p);
7661 }
7662 }
7663
// React to a peer OSD coming up: flag the heartbeat peer set for a refresh
// (the peer id itself is not needed; the set is recomputed from the map).
void OSD::note_up_osd(int peer)
{
  heartbeat_set_peers_need_update();
}
7668
// Completion context queued on the map-store transaction: once the new
// maps [first,last] are durable, finish their activation and release the
// MOSDMap message reference held for the duration.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive range of epochs written by the txn
  MOSDMap *msg;         // message ref held until commit completes
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
7680
7681 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7682 {
7683 std::lock_guard l(osdmap_subscribe_lock);
7684 if (latest_subscribed_epoch >= epoch && !force_request)
7685 return;
7686
7687 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7688
7689 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7690 force_request) {
7691 monc->renew_subs();
7692 }
7693 }
7694
// Delete stored osdmaps older than both `oldest` and the lowest epoch still
// pinned in the map cache, advancing superblock.oldest_map as we go.
// Deletions are batched into transactions of at most
// osd_target_transaction_size removals (and at least `nreceived`, so we keep
// pace with incoming maps).  `skip_maps` means the incoming MOSDMap skipped
// epochs, so we keep trimming past the first batch to close the gap.
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      // flush this batch: persist the advanced oldest_map with the removals
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg. so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  if (num > 0) {
    // flush the final partial batch
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
7734
// Ingest a batch of new OSDMaps from a mon (or peer OSD): validate the
// sender and epoch range, decode/verify each full or incremental map,
// persist them (plus pg_num history, final pool info for deleted pools,
// and newly purged snaps) in one transaction, and arrange for
// _committed_osd_maps() to run once the transaction commits.  Consumes m
// on every early-return path; otherwise the ref is handed to C_OnMapCommit.
void OSD::handle_osd_map(MOSDMap *m)
{
  // wait for pgs to catch up
  {
    // we extend the map cache pins to accomodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop injesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    // lowest pg epoch across all shards
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
        osd_min = min;
      }
    }
    epoch_t osdmap_epoch = get_osdmap_epoch();
    if (osd_min > 0 &&
        osdmap_epoch > max_lag &&
        osdmap_epoch - max_lag > osd_min) {
      epoch_t need = osdmap_epoch - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
               << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
        epoch_t min = shard->get_min_pg_epoch();
        if (need > min) {
          dout(10) << __func__ << " waiting for pgs to consume " << need
                   << " (shard " << shard->shard_id << " min " << min
                   << ", map cache is " << cct->_conf->osd_map_cache_size
                   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
                   << ")" << dendl;
          // drop osd_lock while blocking so pg work can make progress
          unlock_guard unlock{osd_lock};
          shard->wait_min_pg_epoch(need);
        }
      }
    }
  }

  ceph_assert(ceph_mutex_is_locked(osd_lock));
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
            << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only accept maps from mons and osds
  auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
  if (session && !(session->entity_name.is_mon() ||
                   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
             << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
          << superblock.newest_map
          << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
          << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
             << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    // if the sender still has the epochs we're missing, re-request them
    if (m->oldest_map <= superblock.newest_map + 1) {
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can. this
    // 1- is good to have
    // 2- is at present the only way to ensure that we get a *full* map as
    // the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // guard against transaction byte-count wraparound
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // a full map was supplied for this epoch: persist and cache it as-is
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // only an incremental: apply it to the previous full map and
      // persist both the incremental and the reconstructed full map
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
        // base map e-1 comes from the store or from this same batch
        bufferlist obl;
        bool got = get_map_bl(e - 1, obl);
        if (!got) {
          auto p = added_maps_bl.find(e - 1);
          ceph_assert(p != added_maps_bl.end());
          obl = p->second;
        }
        o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);

      if (o->apply_incremental(inc) < 0) {
        derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
        ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // optionally simulate a crc mismatch for testing
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
          (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
        derr << __func__ << " injecting map crc failure" << dendl;
        injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
        // our reconstructed full map doesn't match the expected crc;
        // fall back to requesting full maps from e onward and stop here
        dout(2) << "got incremental " << e
                << " but failed to encode full with correct crc; requesting"
                << dendl;
        clog->warn() << "failed to encode map e" << e << " with expected crc";
        dout(20) << "my encoded map was:\n";
        fbl.hexdump(*_dout);
        *_dout << dendl;
        delete o;
        request_full_map(e, last);
        last = e - 1;
        break;
      }
      got_full_map(e);
      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
             << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
        dout(10) << __func__ << " can't get previous map " << i.first - 1
                 << " probably first start of this osd" << dendl;
        continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
        // pool deleted in this epoch
        pg_num_history.log_pool_delete(i.first, j.first);
        dout(10) << __func__ << " recording final pg_pool_t for pool "
                 << j.first << dendl;
        // this information is needed by _make_pg() if have to restart before
        // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
        ghobject_t obj = make_final_pool_info_oid(j.first);
        bufferlist bl;
        encode(j.second, bl, CEPH_FEATURES_ALL);
        string name = lastmap->get_pool_name(j.first);
        encode(name, bl);
        map<string,string> profile;
        if (lastmap->get_pg_pool(j.first)->is_erasure()) {
          profile = lastmap->get_erasure_code_profile(
            lastmap->get_pg_pool(j.first)->erasure_code_profile);
        }
        encode(profile, bl);
        t.write(coll_t::meta(), obj, 0, bl.length(), bl);
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
                 new_pg_num != j.second.get_pg_num()) {
        // pg_num changed for an existing pool
        dout(10) << __func__ << " recording pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
        // brand-new pool in this epoch
        dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first,
                                         j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // record new purged_snaps
  // only if our purged_snaps record is contiguous up to this batch
  if (superblock.purged_snaps_last == start - 1) {
    SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
                                    make_purged_snaps_oid(), &t,
                                    purged_snaps);
    superblock.purged_snaps_last = last;
  } else {
    dout(10) << __func__ << " superblock purged_snaps_last is "
             << superblock.purged_snaps_last
             << ", not recording new purged_snaps" << dendl;
  }

  // superblock and commit
  write_superblock(t);
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8048
// Called once a batch of new osdmaps [first..last] has been committed to
// disk.  Advances the in-memory map epoch by epoch, reacting to peer
// up/down transitions and to changes in our own status (marked down,
// wrong addrs, NOUP flag, deleted from the map).  May schedule a restart
// (rebind + reboot) or a full shutdown as a side effect.
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock: shutdown may have raced with the check above
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.lock();

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  OSDMapRef osdmap;

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
	     << " (<= last " << last
	     << " <= newest_map " << superblock.newest_map
	     << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap = get_osdmap();
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
	  osdmap->is_up(*p) && // in old map
	  newmap->is_down(*p)) {    // but not the new one
	if (!waited_for_reservations) {
	  // wait at most once per epoch transition, before the first
	  // note_down_osd() call
	  service.await_reserved_maps();
	  waited_for_reservations = true;
	}
	note_down_osd(*p);
      } else if (*p != whoami &&
		 osdmap->is_down(*p) &&
		 newmap->is_up(*p)) {
	note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
	       << dendl;
      if (is_booting()) {
	// this captures the case where we sent the boot message while
	// NOUP was being set on the mon and our boot request was
	// dropped, and then later it is cleared.  it imperfectly
	// handles the case where our original boot message was not
	// dropped and we restart even though we might have booted, but
	// that is harmless (boot will just take slightly longer).
	do_restart = true;
      }
    }

    osdmap = std::move(newmap);
    set_osdmap(osdmap);
    // if this map shows us as up for the first time, record the up
    // (and possibly boot) epochs in the service
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    if (!up_epoch &&
	osdmap->is_up(whoami) &&
	osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
	boot_epoch = osdmap->get_epoch();
	dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  // if the final map marks us up (at our current addrs) since we last
  // bound, complete the boot sequence
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
	client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      derr << "map says i do not exist.  shutting down." << dendl;
      do_shutdown = true;   // don't call shutdown() while we have
			    // everything paused
    } else if (osdmap->is_stop(whoami)) {
      derr << "map says i am stopped by admin. shutting down." << dendl;
      do_shutdown = true;
    } else if (!osdmap->is_up(whoami) ||
	       !osdmap->get_addrs(whoami).legacy_equals(
		 client_messenger->get_myaddrs()) ||
	       !osdmap->get_cluster_addrs(whoami).legacy_equals(
		 cluster_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_back_addrs(whoami).legacy_equals(
		 hb_back_server_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_front_addrs(whoami).legacy_equals(
		 hb_front_server_messenger->get_myaddrs())) {
      // we are marked down, or one of our addrs no longer matches the
      // map -- either way our instance in this map is not us
      if (!osdmap->is_up(whoami)) {
	if (service.is_preparing_to_stop() || service.is_stopping()) {
	  service.got_stop_ack();
	} else {
	  clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
			  "but it is still running";
	  clog->debug() << "map e" << osdmap->get_epoch()
			<< " wrongly marked me down at e"
			<< osdmap->get_down_at(whoami);
	}
	if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
	  // note that this is best-effort...
	  monc->send_mon_message(
	    new MOSDMarkMeDead(
	      monc->get_fsid(),
	      whoami,
	      osdmap->get_epoch()));
	}
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
		   client_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong client addr (" << osdmap->get_addrs(whoami)
		      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
		   cluster_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong cluster addr ("
		      << osdmap->get_cluster_addrs(whoami)
		      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
		   hb_back_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat back addr ("
		      << osdmap->get_hb_back_addrs(whoami)
		      << " != my " << hb_back_server_messenger->get_myaddrs()
		      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
		   hb_front_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat front addr ("
		      << osdmap->get_hb_front_addrs(whoami)
		      << " != my " << hb_front_server_messenger->get_myaddrs()
		      << ")";
      }

      if (!service.is_stopping()) {
	// prepare to rebind and reboot
	epoch_t up_epoch = 0;
	epoch_t bind_epoch = osdmap->get_epoch();
	service.set_epochs(NULL,&up_epoch, &bind_epoch);
	do_restart = true;

	//add markdown log
	utime_t now = ceph_clock_now();
	utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
	osd_markdown_log.push_back(now);
	// too many markdowns within the grace window: give up and shut
	// down instead of flapping
	if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
	  derr << __func__ << " marked down "
	       << osd_markdown_log.size()
	       << " > osd_max_markdown_count "
	       << cct->_conf->osd_max_markdown_count
	       << " in last " << grace << " seconds, shutting down"
	       << dendl;
	  do_restart = false;
	  do_shutdown = true;
	}

	start_waiting_for_healthy();

	set<int> avoid_ports;
#if defined(__FreeBSD__)
	// prevent FreeBSD from grabbing the client_messenger port during
	// rebinding. In which case a cluster_messenger will connect also
	// to the same port
	client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
	cluster_messenger->get_myaddrs().get_ports(&avoid_ports);

	int r = cluster_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  derr << __func__ << " marked down:"
	       << " rebind cluster_messenger failed" << dendl;
	}

	hb_back_server_messenger->mark_down_all();
	hb_front_server_messenger->mark_down_all();
	hb_front_client_messenger->mark_down_all();
	hb_back_client_messenger->mark_down_all();

	reset_heartbeat_peers(true);
      }
    }
  }

  map_lock.unlock();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
	     << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8302
// Align messenger feature requirements and on-disk compat flags with the
// current osdmap: adjusts the required feature bits of the client, mon
// and osd messenger policies, ensures the SHARDS incompat feature is
// recorded in the superblock, toggles heartbeat authorizer requirements
// by release, and persists require_osd_release when it changes.
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures at their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  const auto osdmap = get_osdmap();
  {
    // default policy covers clients
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for clients" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    // monitors connect via the client messenger's TYPE_MON policy
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << " was " << p.features_required
	      << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // make sure the SHARDS incompat feature is persisted in the
    // superblock (one-time upgrade step)
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // heartbeat authorizers are only required from nautilus onward
  if (osdmap->require_osd_release < ceph_release_t::nautilus) {
    hb_front_server_messenger->set_require_authorizer(false);
    hb_back_server_messenger->set_require_authorizer(false);
  } else {
    hb_front_server_messenger->set_require_authorizer(true);
    hb_back_server_messenger->set_require_authorizer(true);
  }

  // persist the release requirement so it survives restarts
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
	    << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
		      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8376
8377 struct C_FinishSplits : public Context {
8378 OSD *osd;
8379 set<PGRef> pgs;
8380 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8381 : osd(osd), pgs(in) {}
8382 void finish(int r) override {
8383 osd->_finish_splits(pgs);
8384 }
8385 };
8386
8387 void OSD::_finish_splits(set<PGRef>& pgs)
8388 {
8389 dout(10) << __func__ << " " << pgs << dendl;
8390 if (is_stopping())
8391 return;
8392 for (set<PGRef>::iterator i = pgs.begin();
8393 i != pgs.end();
8394 ++i) {
8395 PG *pg = i->get();
8396
8397 PeeringCtx rctx = create_context();
8398 pg->lock();
8399 dout(10) << __func__ << " " << *pg << dendl;
8400 epoch_t e = pg->get_osdmap_epoch();
8401 pg->handle_initialize(rctx);
8402 pg->queue_null(e, e);
8403 dispatch_context(rctx, pg, service.get_osdmap());
8404 pg->unlock();
8405
8406 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8407 shards[shard_index]->register_and_wake_split_child(pg);
8408 }
8409 };
8410
8411 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8412 unsigned need)
8413 {
8414 std::lock_guard l(merge_lock);
8415 auto& p = merge_waiters[nextmap->get_epoch()][target];
8416 p[src->pg_id] = src;
8417 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8418 << " for " << target << ", have " << p.size() << "/" << need
8419 << dendl;
8420 return p.size() == need;
8421 }
8422
// Advance `pg` through osdmaps up to `osd_epoch`, one epoch at a time,
// handling pool pg_num changes along the way: a PG that becomes a merge
// source is shut down and parked as a merge waiter (returns false); a
// merge target either consumes its ready sources or waits for them
// (returns false while waiting); a split produces new child PGs which
// are registered via C_FinishSplits.  Returns true when the PG has been
// fully advanced and activated.  Caller must hold the PG lock.
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  // pg_num of our pool in the PG's current map (0 if pool is gone)
  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // gaps are fine; skip epochs we no longer have cached
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source
	  PGRef spg = pg; // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  if (!new_pgs.empty()) {
	    // flush any pending split children before tearing down
	    rctx.transaction.register_on_applied(new C_FinishSplits(this,
								    new_pgs));
	    new_pgs.clear();
	  }
	  dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	  pg->ch->flush();
	  // release backoffs explicitly, since the on_shutdown path
	  // aggressively tears down backoff state.
	  if (pg->is_primary()) {
	    pg->release_pg_backoffs();
	  }
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_nonprimary())
		logger->dec(l_osd_pg_replica); // misnomer
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  // the merge target needs children.size() sources; if we are
	  // the last one, wake the target with a null event
	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    // claim the waiters if (and only if) all sources are ready
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    if (!new_pgs.empty()) {
	      rctx.transaction.register_on_applied(new C_FinishSplits(this,
								      new_pgs));
	      new_pgs.clear();
	    }
	    dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    // if the pool's scrub interval options changed, poke the PG so it
    // reschedules scrubs
    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
        && newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is change from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      // unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8629
// Publish the current osdmap to the rest of the OSD: primes pending
// splits and merges on each shard, prunes state tied to older maps,
// updates PG counters, discards stale pending creates, wakes sessions
// waiting on the map, and queues null peering events so every PG
// advances to the new epoch.  Caller must hold osd_lock.
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   *  speak the older sorting version any more. Be careful not to force
   *  a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      shard->prime_splits(osdmap, &newly_split);
    }
    // prime_splits consumes the entries it handles; all must be claimed
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      // drop pending creates for PGs that no longer map to us
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8747
8748 void OSD::activate_map()
8749 {
8750 ceph_assert(ceph_mutex_is_locked(osd_lock));
8751 auto osdmap = get_osdmap();
8752
8753 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8754
8755 // norecover?
8756 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8757 if (!service.recovery_is_paused()) {
8758 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8759 service.pause_recovery();
8760 }
8761 } else {
8762 if (service.recovery_is_paused()) {
8763 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8764 service.unpause_recovery();
8765 }
8766 }
8767
8768 service.activate_map();
8769
8770 // process waiters
8771 take_waiters(waiting_for_osdmap);
8772 }
8773
8774 bool OSD::require_mon_peer(const Message *m)
8775 {
8776 if (!m->get_connection()->peer_is_mon()) {
8777 dout(0) << "require_mon_peer received from non-mon "
8778 << m->get_connection()->get_peer_addr()
8779 << " " << *m << dendl;
8780 return false;
8781 }
8782 return true;
8783 }
8784
8785 bool OSD::require_mon_or_mgr_peer(const Message *m)
8786 {
8787 if (!m->get_connection()->peer_is_mon() &&
8788 !m->get_connection()->peer_is_mgr()) {
8789 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8790 << m->get_connection()->get_peer_addr()
8791 << " " << *m << dendl;
8792 return false;
8793 }
8794 return true;
8795 }
8796
8797 bool OSD::require_osd_peer(const Message *m)
8798 {
8799 if (!m->get_connection()->peer_is_osd()) {
8800 dout(0) << "require_osd_peer received from non-osd "
8801 << m->get_connection()->get_peer_addr()
8802 << " " << *m << dendl;
8803 return false;
8804 }
8805 return true;
8806 }
8807
8808 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8809 {
8810 epoch_t up_epoch = service.get_up_epoch();
8811 if (epoch < up_epoch) {
8812 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8813 return false;
8814 }
8815
8816 if (!is_active()) {
8817 dout(7) << "still in boot state, dropping message " << *m << dendl;
8818 return false;
8819 }
8820
8821 return true;
8822 }
8823
// Verify that the sending OSD is still the same instance our map knows
// about (up, and at the same cluster addrs).  If not, mark the
// connection down, detach any Session from it, and return false so the
// caller drops the message.
bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
				     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addrs(from) != m->get_source_addrs())) {
    dout(5) << "from dead osd." << from << ", marking down, "
	    << " msg was " << m->get_source_inst().addr
	    << " expected "
	    << (map->is_up(from) ?
		map->get_cluster_addrs(from) : entity_addrvec_t())
	    << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
      // fast dispatch already runs with the session dispatch lock
      // held, so only take it ourselves on the slow path
      if (!is_fast_dispatch)
	s->session_dispatch_lock.lock();
      clear_session_waiting_on_map(s);
      con->set_priv(nullptr);   // break ref <-> session cycle, if any
      s->con.reset();
      if (!is_fast_dispatch)
	s->session_dispatch_lock.unlock();
    }
    return false;
  }
  return true;
}
8852
8853
8854 /*
8855 * require that we have same (or newer) map, and that
8856 * the source is the pg primary.
8857 */
8858 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8859 bool is_fast_dispatch)
8860 {
8861 const Message *m = op->get_req();
8862 const auto osdmap = get_osdmap();
8863 dout(15) << "require_same_or_newer_map " << epoch
8864 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8865
8866 ceph_assert(ceph_mutex_is_locked(osd_lock));
8867
8868 // do they have a newer map?
8869 if (epoch > osdmap->get_epoch()) {
8870 dout(7) << "waiting for newer map epoch " << epoch
8871 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8872 wait_for_new_map(op);
8873 return false;
8874 }
8875
8876 if (!require_self_aliveness(op->get_req(), epoch)) {
8877 return false;
8878 }
8879
8880 // ok, our map is same or newer.. do they still exist?
8881 if (m->get_connection()->get_messenger() == cluster_messenger &&
8882 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8883 return false;
8884 }
8885
8886 return true;
8887 }
8888
8889
8890
8891
8892
8893 // ----------------------------------------
8894 // pg creation
8895
// Split `parent` into the given child PGs under `nextmap`'s pg_num:
// creates each child PG and its collection, splits the parent's
// collections and in-memory state into it, and distributes the
// parent's stats.  New children are appended to *out_pgs; the caller is
// responsible for registering them (see C_FinishSplits).
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // updated_stats has one entry per child plus a final one for the
  // parent; stat_iter advances in lockstep with the child iterator
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the new collection's commit completions through the
      // child's shard context queue
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx.transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // the last stats entry belongs to the parent itself
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
8949
8950 /*
8951 * holding osd_lock
8952 */
// Handle a legacy MOSDPGCreate from the monitor: for each requested PG
// that still maps to us as acting primary, build an initial history and
// queue a peering event that will instantiate it.  Caller holds
// osd_lock (required by require_same_or_newer_map).
void OSD::handle_pg_create(OpRequestRef op)
{
  // NOTE: this can be removed in P release (mimic is the last version to
  // send MOSDPGCreate messages).

  auto m = op->get_req<MOSDPGCreate>();
  ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  // only the monitor may ask us to create PGs
  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  const auto osdmap = get_osdmap();
  // mkpg and ctimes are parallel maps keyed by pg_t; walk them together
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    ceph_assert(mapped);

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
	       << "), my role=" << role << ", skipping" << dendl;
      continue;
    }


    PastIntervals pi;
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so we ignore
    // same_interval_since.  We'll pass this history with the current
    // epoch as the event.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
	       << pgid << " from epoch " << m->epoch
	       << ", primary changed in " << history.same_primary_since
	       << dendl;
      continue;
    }
    // queue a null peering event carrying a PGCreateInfo so the PG gets
    // instantiated when the event is processed
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt(),
	  true,
	  new PGCreateInfo(
	    pgid,
	    osdmap->get_epoch(),
	    history,
	    pi,
	    true)
	  )));
  }

  {
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
9048
9049
9050 // ----------------------------------------
9051 // peering and recovery
9052
9053 PeeringCtx OSD::create_context()
9054 {
9055 return PeeringCtx(get_osdmap()->require_osd_release);
9056 }
9057
9058 void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
9059 ThreadPool::TPHandle *handle)
9060 {
9061 if (!service.get_osdmap()->is_up(whoami)) {
9062 dout(20) << __func__ << " not up in osdmap" << dendl;
9063 } else if (!is_active()) {
9064 dout(20) << __func__ << " not active" << dendl;
9065 } else {
9066 for (auto& [osd, ls] : ctx.message_map) {
9067 if (!curmap->is_up(osd)) {
9068 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9069 continue;
9070 }
9071 ConnectionRef con = service.get_con_osd_cluster(
9072 osd, curmap->get_epoch());
9073 if (!con) {
9074 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9075 << dendl;
9076 continue;
9077 }
9078 service.maybe_share_map(con.get(), curmap);
9079 for (auto m : ls) {
9080 con->send_message2(m);
9081 }
9082 ls.clear();
9083 }
9084 }
9085 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
9086 int tr = store->queue_transaction(
9087 pg->ch,
9088 std::move(ctx.transaction), TrackedOpRef(),
9089 handle);
9090 ceph_assert(tr == 0);
9091 }
9092 }
9093
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  // Modern PG-creation path.  Each entry carries the creation epoch/stamp
  // and (octopus and later) optionally the pg history and past_intervals.
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    auto q = m->pg_extra.find(pgid);
    if (q == m->pg_extra.end()) {
      dout(20) << __func__ << " " << pgid << " e" << created
	       << "@" << created_stamp
	       << " (no history or past_intervals)" << dendl;
      // pre-octopus ... no pg history.  this can be removed in Q release.
      enqueue_peering_evt(
	pgid,
	PGPeeringEventRef(
	  std::make_shared<PGPeeringEvent>(
	    m->epoch,
	    m->epoch,
	    NullEvt(),
	    true,
	    new PGCreateInfo(
	      pgid,
	      created,
	      pg_history_t(created, created_stamp),
	      PastIntervals(),
	      true)
	    )));
    } else {
      dout(20) << __func__ << " " << pgid << " e" << created
	       << "@" << created_stamp
	       << " history " << q->second.first
	       << " pi " << q->second.second << dendl;
      // reject creates whose supplied past_intervals extend beyond the
      // message epoch -- the create is stale relative to the intervals
      if (!q->second.second.empty() &&
	  m->epoch < q->second.second.get_bounds().second) {
	clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
		      << " and unmatched past_intervals " << q->second.second
		      << " (history " << q->second.first << ")";
      } else {
	enqueue_peering_evt(
	  pgid,
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      m->epoch,
	      m->epoch,
	      NullEvt(),
	      true,
	      new PGCreateInfo(
		pgid,
		m->epoch,
		q->second.first,
		q->second.second,
		true)
	      )));
      }
    }
  }

  {
    std::lock_guard l(pending_creates_lock);
    // only advance last_pg_create_epoch once no mon-initiated creates
    // remain pending
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9165
9166 void OSD::handle_fast_pg_query(MOSDPGQuery *m)
9167 {
9168 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9169 if (!require_osd_peer(m)) {
9170 m->put();
9171 return;
9172 }
9173 int from = m->get_source().num();
9174 for (auto& p : m->pg_list) {
9175 enqueue_peering_evt(
9176 p.first,
9177 PGPeeringEventRef(
9178 std::make_shared<PGPeeringEvent>(
9179 p.second.epoch_sent, p.second.epoch_sent,
9180 MQuery(
9181 p.first,
9182 pg_shard_t(from, p.second.from),
9183 p.second,
9184 p.second.epoch_sent),
9185 false))
9186 );
9187 }
9188 m->put();
9189 }
9190
9191 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9192 {
9193 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9194 if (!require_osd_peer(m)) {
9195 m->put();
9196 return;
9197 }
9198 int from = m->get_source().num();
9199 for (auto& p : m->get_pg_list()) {
9200 spg_t pgid(p.info.pgid.pgid, p.to);
9201 enqueue_peering_evt(
9202 pgid,
9203 PGPeeringEventRef(
9204 std::make_shared<PGPeeringEvent>(
9205 p.epoch_sent,
9206 p.query_epoch,
9207 MNotifyRec(
9208 pgid, pg_shard_t(from, p.from),
9209 p,
9210 m->get_connection()->get_features()),
9211 true,
9212 new PGCreateInfo(
9213 pgid,
9214 p.query_epoch,
9215 p.info.history,
9216 p.past_intervals,
9217 false)
9218 )));
9219 }
9220 m->put();
9221 }
9222
9223 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9224 {
9225 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9226 if (!require_osd_peer(m)) {
9227 m->put();
9228 return;
9229 }
9230 int from = m->get_source().num();
9231 for (auto& p : m->pg_list) {
9232 enqueue_peering_evt(
9233 spg_t(p.info.pgid.pgid, p.to),
9234 PGPeeringEventRef(
9235 std::make_shared<PGPeeringEvent>(
9236 p.epoch_sent, p.query_epoch,
9237 MInfoRec(
9238 pg_shard_t(from, p.from),
9239 p.info,
9240 p.epoch_sent)))
9241 );
9242 }
9243 m->put();
9244 }
9245
9246 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9247 {
9248 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9249 if (!require_osd_peer(m)) {
9250 m->put();
9251 return;
9252 }
9253 for (auto& pgid : m->pg_list) {
9254 enqueue_peering_evt(
9255 pgid,
9256 PGPeeringEventRef(
9257 std::make_shared<PGPeeringEvent>(
9258 m->get_epoch(), m->get_epoch(),
9259 PeeringState::DeleteStart())));
9260 }
9261 m->put();
9262 }
9263
9264 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9265 {
9266 dout(10) << __func__ << " " << *m << dendl;
9267 if (!require_mon_or_mgr_peer(m)) {
9268 m->put();
9269 return;
9270 }
9271 epoch_t epoch = get_osdmap_epoch();
9272 for (auto pgid : m->forced_pgs) {
9273 if (m->options & OFR_BACKFILL) {
9274 if (m->options & OFR_CANCEL) {
9275 enqueue_peering_evt(
9276 pgid,
9277 PGPeeringEventRef(
9278 std::make_shared<PGPeeringEvent>(
9279 epoch, epoch,
9280 PeeringState::UnsetForceBackfill())));
9281 } else {
9282 enqueue_peering_evt(
9283 pgid,
9284 PGPeeringEventRef(
9285 std::make_shared<PGPeeringEvent>(
9286 epoch, epoch,
9287 PeeringState::SetForceBackfill())));
9288 }
9289 } else if (m->options & OFR_RECOVERY) {
9290 if (m->options & OFR_CANCEL) {
9291 enqueue_peering_evt(
9292 pgid,
9293 PGPeeringEventRef(
9294 std::make_shared<PGPeeringEvent>(
9295 epoch, epoch,
9296 PeeringState::UnsetForceRecovery())));
9297 } else {
9298 enqueue_peering_evt(
9299 pgid,
9300 PGPeeringEventRef(
9301 std::make_shared<PGPeeringEvent>(
9302 epoch, epoch,
9303 PeeringState::SetForceRecovery())));
9304 }
9305 }
9306 }
9307 m->put();
9308 }
9309
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  // Answer a pg query for a pg we do not have instantiated locally: reply
  // with an empty log or an empty notify so the querier can make progress.
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  // pool is gone; the querier will learn that from the osdmap, no reply
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;

  dout(10) << " pg " << pgid << " dne" << dendl;
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
	q.query.type == pg_query_t::FULLLOG) {
      // log queries get an (empty) log reply
      m = new MOSDPGLog(
	q.query.from, q.query.to,
	osdmap->get_epoch(), empty,
	q.query.epoch_sent);
    } else {
      // everything else gets an empty notify
      vector<pg_notify_t> ls;
      ls.push_back(
	pg_notify_t(
	  q.query.from, q.query.to,
	  q.query.epoch_sent,
	  osdmap->get_epoch(),
	  empty,
	  PastIntervals()));
      m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
    }
    service.maybe_share_map(con.get(), osdmap);
    con->send_message(m);
  }
}
9345
9346 void OSDService::queue_check_readable(spg_t spgid,
9347 epoch_t lpr,
9348 ceph::signedspan delay)
9349 {
9350 if (delay == ceph::signedspan::zero()) {
9351 osd->enqueue_peering_evt(
9352 spgid,
9353 PGPeeringEventRef(
9354 std::make_shared<PGPeeringEvent>(
9355 lpr, lpr,
9356 PeeringState::CheckReadable())));
9357 } else {
9358 mono_timer.add_event(
9359 delay,
9360 [this, spgid, lpr]() {
9361 queue_check_readable(spgid, lpr);
9362 });
9363 }
9364 }
9365
9366
9367 // =========================================================
9368 // RECOVERY
9369
void OSDService::_maybe_queue_recovery() {
  // Drain awaiting_throttle while the recovery throttle has capacity,
  // reserving pushes for each pg as it is queued.
  // Caller must hold recovery_lock (asserted below).
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
	 _recover_now(&available_pushes)) {
    // grant at most osd_recovery_max_single_start pushes per pg
    uint64_t to_start = std::min(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    dout(10) << __func__ << " starting " << to_start
	     << ", recovery_ops_reserved " << recovery_ops_reserved
	     << " -> " << (recovery_ops_reserved + to_start) << dendl;
    recovery_ops_reserved += to_start;
  }
}
9386
bool OSDService::_recover_now(uint64_t *available_pushes)
{
  // Decide whether recovery ops may start now.  On success, optionally
  // report how many pushes remain under the osd-wide recovery cap.
  if (available_pushes)
    *available_pushes = 0;

  // recovery has been deferred until some future time
  if (ceph_clock_now() < defer_recovery_until) {
    dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
    return false;
  }

  if (recovery_paused) {
    dout(15) << __func__ << " paused" << dendl;
    return false;
  }

  // active plus already-reserved ops count against the configured maximum
  uint64_t max = osd->get_recovery_max_active();
  if (max <= recovery_ops_active + recovery_ops_reserved) {
    dout(15) << __func__ << " active " << recovery_ops_active
	     << " + reserved " << recovery_ops_reserved
	     << " >= max " << max << dendl;
    return false;
  }

  if (available_pushes)
    *available_pushes = max - recovery_ops_active - recovery_ops_reserved;

  return true;
}
9415
9416 unsigned OSDService::get_target_pg_log_entries() const
9417 {
9418 auto num_pgs = osd->get_num_pgs();
9419 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9420 if (num_pgs > 0 && target > 0) {
9421 // target an even spread of our budgeted log entries across all
9422 // PGs. note that while we only get to control the entry count
9423 // for primary PGs, we'll normally be responsible for a mix of
9424 // primary and replica PGs (for the same pool(s) even), so this
9425 // will work out.
9426 return std::max<unsigned>(
9427 std::min<unsigned>(target / num_pgs,
9428 cct->_conf->osd_max_pg_log_entries),
9429 cct->_conf->osd_min_pg_log_entries);
9430 } else {
9431 // fall back to a per-pg value.
9432 return cct->_conf->osd_min_pg_log_entries;
9433 }
9434 }
9435
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  // Run up to reserved_pushes recovery ops on pg, honoring the configured
  // recovery sleep throttle.  The reserved pushes are always released
  // before returning (except on the sleep/requeue path, where the requeued
  // work item inherits them).
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
      // timer callback: clear the sleep flag and requeue this recovery work
      auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
	         << ", re-queuing recovery" << dendl;
	std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.

      if (auto now = ceph::real_clock::now();
	  service.recovery_schedule_time < now) {
        service.recovery_schedule_time = now;
      }
      service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
				       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      return;
    }
  }

  {
    {
      std::lock_guard l(service.sleep_lock);
      // the next recovery batch should sleep again first
      service.recovery_needs_sleep = true;
    }

    // pg was reset since this work item was queued; nothing to do
    if (pg->pg_has_reset_since(queued)) {
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
	     << " on " << *pg << dendl;

    // start_recovery_ops asked us to go look for unfound objects
    if (do_unfound) {
      PeeringCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
9511
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  // Account a newly started recovery op against the osd-wide active count.
  std::lock_guard l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
	   << " (" << recovery_ops_active << "/"
	   << osd->get_recovery_max_active() << " rops)"
	   << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  // debug build: track the exact object ids in flight per pg, and assert
  // the same object is not recovered twice concurrently
  dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
  recovery_oids[pg->pg_id].insert(soid);
#endif
}
9527
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  // Account a completed recovery op and let any throttled pgs start.
  std::lock_guard l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
	   << " dequeue=" << dequeue
	   << " (" << recovery_ops_active << "/"
	   << osd->get_recovery_max_active() << " rops)"
	   << dendl;

  // adjust count
  ceph_assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  // debug build: the object must have been registered by start_recovery_op
  dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid));
  recovery_oids[pg->pg_id].erase(soid);
#endif

  // freed capacity may allow queued recovery work to proceed
  _maybe_queue_recovery();
}
9549
9550 bool OSDService::is_recovery_active()
9551 {
9552 if (cct->_conf->osd_debug_pretend_recovery_active) {
9553 return true;
9554 }
9555 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9556 }
9557
void OSDService::release_reserved_pushes(uint64_t pushes)
{
  // Return previously reserved push capacity and let throttled pgs start.
  std::lock_guard l(recovery_lock);
  dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
	   << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
	   << dendl;
  ceph_assert(recovery_ops_reserved >= pushes);
  recovery_ops_reserved -= pushes;
  _maybe_queue_recovery();
}
9568
9569 // =========================================================
9570 // OPS
9571
9572 bool OSD::op_is_discardable(const MOSDOp *op)
9573 {
9574 // drop client request if they are not connected and can't get the
9575 // reply anyway.
9576 if (!op->get_connection()->is_connected()) {
9577 return true;
9578 }
9579 return false;
9580 }
9581
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  // Queue an op on the sharded op queue for the given pg, tagging it with
  // priority/cost/owner taken from the underlying message.
  const utime_t stamp = op->get_req()->get_recv_stamp();
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  const uint64_t owner = op->get_req()->get_source().num();

  dout(15) << "enqueue_op " << op << " prio " << priority
	   << " cost " << cost
	   << " latency " << latency
	   << " epoch " << epoch
	   << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);
  op->mark_queued_for_pg();
  // record time spent between message receipt and queueing
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  op_shardedwq.queue(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
      cost, priority, stamp, owner, epoch));
}
9605
void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
{
  // Queue a peering event for pgid on the sharded op queue at the
  // configured peering priority.
  dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
  op_shardedwq.queue(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
      10,      // nominal fixed cost for peering events
      cct->_conf->osd_peering_op_priority,
      utime_t(),
      0,       // owner: not tied to a client
      evt->get_epoch_sent()));
}
9618
/*
 * NOTE: dequeue called in worker thread, with pg lock
 */
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  // Worker-thread handler for a queued op: record latency, share the map
  // with the sender if needed, and hand the op to the pg.
  const Message *m = op->get_req();

  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);

  utime_t latency = now - m->get_recv_stamp();
  dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
	   << " cost " << m->get_cost()
	   << " latency " << latency
	   << " " << *m
	   << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  // make sure the sender has a map at least as new as the pg's
  service.maybe_share_map(m->get_connection().get(),
			  pg->get_osdmap(),
			  op->sent_epoch);

  // pg is going away; drop the op
  if (pg->is_deleting())
    return;

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << op << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
}
9659
9660
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  // Worker-thread handler for a queued peering event.  pg (when non-null)
  // is locked on entry and unlocked here before returning.
  PeeringCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  bool need_up_thru = false;
  epoch_t same_interval_since = 0;
  if (!pg) {
    // no such pg: only queries can be answered without one
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
    pg->do_peering_event(evt, rctx);
    if (pg->is_deleted()) {
      pg->unlock();
      return;
    }
    dispatch_context(rctx, pg, curmap, &handle);
    // sample these while still holding the pg lock
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }

  // flush any pg_temp updates accumulated during event handling
  service.send_pg_temp();
}
9696
void OSD::dequeue_delete(
  OSDShard *sdata,
  PG *pg,
  epoch_t e,
  ThreadPool::TPHandle& handle)
{
  // Handle a queued pg-delete work item by feeding a DeleteSome event
  // through the normal peering-event path.
  dequeue_peering_evt(
    sdata,
    pg,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	e, e,
	PeeringState::DeleteSome())),
    handle);
}
9712
9713
9714
9715 // --------------------------------
9716
const char** OSD::get_tracked_conf_keys() const
{
  // Config options we want handle_conf_change() notifications for.
  // NULL-terminated array, as the config-observer interface requires.
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_object_clean_region_max_num_intervals",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL
  };
  return KEYS;
}
9756
void OSD::handle_conf_change(const ConfigProxy& conf,
			     const std::set <std::string> &changed)
{
  // Apply runtime changes for the options listed in get_tracked_conf_keys().
  std::lock_guard l{osd_lock};
  // recovery/backfill reserver limits
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  // op tracker tunables
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
					   cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
					     cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
						      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  // osdmap caches share a single size knob
  if (changed.count("osd_map_cache_size")) {
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  // any clog-related option change triggers a full log-config reload
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  // client throttles: only raise/reset when the new value is positive
  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }
  if (changed.count("osd_object_clean_region_max_num_intervals")) {
    ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
  }

  // scrub interval changes require rescheduling all pending scrubs
  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  check_config();
}
9849
void OSD::update_log_config()
{
  // Re-read the clog-related config options and push them into the log
  // client.
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  // only apply the new settings if parsing succeeded (returns 0)
  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
			       log_channel, log_prio, log_to_graylog,
			       log_to_graylog_host, log_to_graylog_port,
			       fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
			log_channel, log_prio, log_to_graylog,
			log_to_graylog_host, log_to_graylog_port,
			fsid, host);
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
9872
void OSD::check_config()
{
  // some sanity checks: warn (via the cluster log) about config value
  // combinations that are legal but likely misconfigured
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
		 << " is not > osd_pg_epoch_persisted_max_stale ("
		 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
  if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
    clog->warn() << "osd_object_clean_region_max_num_intervals ("
		 << cct->_conf->osd_object_clean_region_max_num_intervals
		 << ") is < 0";
  }
}
9887
9888 // --------------------------------
9889
void OSD::get_latest_osdmap()
{
  // Synchronously block until the objecter has fetched the latest osdmap.
  dout(10) << __func__ << " -- start" << dendl;

  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();

  dout(10) << __func__ << " -- finish" << dendl;
}
9900
9901 // --------------------------------
9902
void OSD::set_perf_queries(const ConfigPayload &config_payload) {
  // Install the mgr-provided dynamic perf queries, filtering out
  // unsupported ones, and propagate the supported set to every PG.
  const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
  const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
  dout(10) << "setting " << queries.size() << " queries" << dendl;

  std::list<OSDPerfMetricQuery> supported_queries;
  for (auto &it : queries) {
    auto &query = it.first;
    // queries without a key descriptor are not supported
    if (!query.key_descriptor.empty()) {
      supported_queries.push_back(query);
    }
  }
  if (supported_queries.size() < queries.size()) {
    dout(1) << queries.size() - supported_queries.size()
	    << " unsupported queries" << dendl;
  }
  {
    std::lock_guard locker{m_perf_queries_lock};
    m_perf_queries = supported_queries;
    m_perf_limits = queries;
  }
  // push the new query set down into every PG
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    std::scoped_lock l{*pg};
    pg->set_dynamic_perf_stats_queries(supported_queries);
  }
}
9931
MetricPayload OSD::get_perf_reports() {
  // Collect dynamic perf stats from every PG, merge them, and build the
  // per-query report payload for the mgr.
  OSDMetricPayload payload;
  std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;

  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by by mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    dps.merge(pg_dps);
  }
  dps.add_to_reports(m_perf_limits, &reports);
  dout(20) << "reports for " << reports.size() << " queries" << dendl;

  return payload;
}
9955
9956 // =============================================================
9957
9958 #undef dout_context
9959 #define dout_context cct
9960 #undef dout_prefix
9961 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
9962
void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
{
  // Bind a PG to its shard slot and index the slot by the pg's map epoch.
  // NOTE(review): no locking here -- callers appear to hold shard_lock;
  // confirm before changing call sites.
  dout(10) << pg->pg_id << " " << pg << dendl;
  slot->pg = pg;
  pg->osd_shard = this;
  pg->pg_slot = slot;
  osd->inc_num_pgs();

  // epoch must be set before the slot joins the by-epoch index
  slot->epoch = pg->get_osdmap_epoch();
  pg_slots_by_epoch.insert(*slot);
}
9974
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  // Unbind the slot's PG and remove the slot from the by-epoch index.
  // NOTE(review): no locking here -- callers appear to hold shard_lock;
  // confirm before changing call sites.
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  // removing a slot can raise the shard's minimum pg epoch; wake waiters
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
9989
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  // Move the slot to its new epoch position in the per-shard by-epoch index
  // and wake anyone waiting on the shard's minimum pg epoch.
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  // re-key: remove, update epoch, re-insert
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10005
10006 epoch_t OSDShard::get_min_pg_epoch()
10007 {
10008 std::lock_guard l(shard_lock);
10009 auto p = pg_slots_by_epoch.begin();
10010 if (p == pg_slots_by_epoch.end()) {
10011 return 0;
10012 }
10013 return p->epoch;
10014 }
10015
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  // Block until every pg slot on this shard has advanced to at least the
  // 'need' epoch (or the shard hosts no pgs at all).
  std::unique_lock l{shard_lock};
  // count ourselves so slot updates know to notify the condvar
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      return true;
    } else {
      dout(10) << need << " waiting on "
	       << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10033
10034 epoch_t OSDShard::get_max_waiting_epoch()
10035 {
10036 std::lock_guard l(shard_lock);
10037 epoch_t r = 0;
10038 for (auto& i : pg_slots) {
10039 if (!i.second->waiting_peering.empty()) {
10040 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10041 }
10042 }
10043 return r;
10044 }
10045
void OSDShard::consume_map(
  const OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  // Install new_osdmap as this shard's map, then walk every pg slot
  // and either requeue work the new map makes runnable, keep work
  // that must wait, or drop work the new map makes obsolete.
  // *pushes_to_free accumulates the reserved recovery pushes of any
  // dropped items so the caller can release them afterwards.
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // shard_osdmap is published under its own lock for readers that
    // don't hold shard_lock
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
           << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // a split child is still being instantiated; leave everything parked
      dout(20) << __func__ << " " << pgid
	       << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // slot is a merge participant for a future epoch; don't touch it yet
      dout(20) << __func__ << " " << pgid
	       << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
	       << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
	// the new map satisfies (at least) the oldest waiting peering
	// event; requeue the whole slot (see _wake_pg_slot)
	dout(20) << __func__ << " " << pgid
		 << " pending_peering first epoch " << first
		 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
	_wake_pg_slot(pgid, slot);
	queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
	// pg still maps here; items keep waiting for the pg to appear
	dout(20) << __func__ << " " << pgid << " maps to us, keeping"
		 << dendl;
	++p;
	continue;
      }
      // pg no longer maps here: drop every waiting item whose epoch is
      // covered by the new map (stale or misdirected), crediting its
      // reserved pushes to *pushes_to_free
      while (!slot->waiting.empty() &&
	     slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
	auto& qi = slot->waiting.front();
	dout(20) << __func__ << " " << pgid
		 << " waiting item " << qi
		 << " epoch " << qi.get_map_epoch()
		 << " <= " << new_osdmap->get_epoch()
		 << ", "
		 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
		     "misdirected")
		 << ", dropping" << dendl;
        *pushes_to_free += qi.get_reserved_pushes();
	slot->waiting.pop_front();
      }
    }
    if (slot->waiting.empty() &&
	slot->num_running == 0 &&
	slot->waiting_for_split.empty() &&
	!slot->pg) {
      // nothing queued, nothing running, no pg attached: prune the slot
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    // wake one worker to pick up the requeued items
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10131
10132 void OSDShard::_wake_pg_slot(
10133 spg_t pgid,
10134 OSDShardPGSlot *slot)
10135 {
10136 dout(20) << __func__ << " " << pgid
10137 << " to_process " << slot->to_process
10138 << " waiting " << slot->waiting
10139 << " waiting_peering " << slot->waiting_peering << dendl;
10140 for (auto i = slot->to_process.rbegin();
10141 i != slot->to_process.rend();
10142 ++i) {
10143 scheduler->enqueue_front(std::move(*i));
10144 }
10145 slot->to_process.clear();
10146 for (auto i = slot->waiting.rbegin();
10147 i != slot->waiting.rend();
10148 ++i) {
10149 scheduler->enqueue_front(std::move(*i));
10150 }
10151 slot->waiting.clear();
10152 for (auto i = slot->waiting_peering.rbegin();
10153 i != slot->waiting_peering.rend();
10154 ++i) {
10155 // this is overkill; we requeue everything, even if some of these
10156 // items are waiting for maps we don't have yet. FIXME, maybe,
10157 // someday, if we decide this inefficiency matters
10158 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
10159 scheduler->enqueue_front(std::move(*j));
10160 }
10161 }
10162 slot->waiting_peering.clear();
10163 ++slot->requeue_seq;
10164 }
10165
10166 void OSDShard::identify_splits_and_merges(
10167 const OSDMapRef& as_of_osdmap,
10168 set<pair<spg_t,epoch_t>> *split_pgs,
10169 set<pair<spg_t,epoch_t>> *merge_pgs)
10170 {
10171 std::lock_guard l(shard_lock);
10172 if (shard_osdmap) {
10173 for (auto& i : pg_slots) {
10174 const spg_t& pgid = i.first;
10175 auto *slot = i.second.get();
10176 if (slot->pg) {
10177 osd->service.identify_splits_and_merges(
10178 shard_osdmap, as_of_osdmap, pgid,
10179 split_pgs, merge_pgs);
10180 } else if (!slot->waiting_for_split.empty()) {
10181 osd->service.identify_splits_and_merges(
10182 shard_osdmap, as_of_osdmap, pgid,
10183 split_pgs, nullptr);
10184 } else {
10185 dout(20) << __func__ << " slot " << pgid
10186 << " has no pg and waiting_for_split " << dendl;
10187 }
10188 }
10189 }
10190 }
10191
void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *pgids)
{
  // Prime slots for the split children in *pgids that hash to this
  // shard (those entries are consumed from *pgids).  If this shard's
  // map is already newer than as_of_osdmap, additionally compute any
  // grandchildren produced between the two epochs and prime those too.
  std::lock_guard l(shard_lock);
  _prime_splits(pgids);
  if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
    set<pair<spg_t,epoch_t>> newer_children;
    for (auto i : *pgids) {
      osd->service.identify_splits_and_merges(
	as_of_osdmap, shard_osdmap, i.first,
	&newer_children, nullptr);
    }
    newer_children.insert(pgids->begin(), pgids->end());
    dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
	     << shard_osdmap->get_epoch() << ", new children " << newer_children
	     << dendl;
    _prime_splits(&newer_children);
    // note: we don't care what is left over here for other shards.
    // if this shard is ahead of us and one isn't, e.g., one thread is
    // calling into prime_splits via _process (due to a newly created
    // pg) and this shard has a newer map due to a racing consume_map,
    // then any grandchildren left here will be identified (or were
    // identified) when the slower shard's osdmap is advanced.
    // _prime_splits() will tolerate the case where the pgid is
    // already primed.
  }
}
10219
10220 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10221 {
10222 dout(10) << *pgids << dendl;
10223 auto p = pgids->begin();
10224 while (p != pgids->end()) {
10225 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10226 if (shard_index == shard_id) {
10227 auto r = pg_slots.emplace(p->first, nullptr);
10228 if (r.second) {
10229 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10230 r.first->second = make_unique<OSDShardPGSlot>();
10231 r.first->second->waiting_for_split.insert(p->second);
10232 } else {
10233 auto q = r.first;
10234 ceph_assert(q != pg_slots.end());
10235 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10236 << dendl;
10237 q->second->waiting_for_split.insert(p->second);
10238 }
10239 p = pgids->erase(p);
10240 } else {
10241 ++p;
10242 }
10243 }
10244 }
10245
void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *merge_pgs)
{
  // Ensure every merge participant pg that hashes to this shard has a
  // slot with waiting_for_merge_epoch set (consume_map holds such
  // slots back until that epoch).  If the participant pg does not
  // exist and is not mid-split, instantiate an empty placeholder pg
  // for PG::merge_from() to fill in later.  Entries handled here are
  // erased from *merge_pgs; the rest are left for other shards.
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " checking shard " << shard_id
	   << " for remaining merge pgs " << merge_pgs << dendl;
  auto p = merge_pgs->begin();
  while (p != merge_pgs->end()) {
    spg_t pgid = p->first;
    epoch_t epoch = p->second;
    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
    if (shard_index != shard_id) {
      // belongs to another shard; skip and leave it in *merge_pgs
      ++p;
      continue;
    }
    OSDShardPGSlot *slot;
    auto r = pg_slots.emplace(pgid, nullptr);
    if (r.second) {
      r.first->second = make_unique<OSDShardPGSlot>();
    }
    slot = r.first->second.get();
    if (slot->pg) {
      // already have pg
      dout(20) << __func__ << " have merge participant pg " << pgid
	       << " " << slot->pg << dendl;
    } else if (!slot->waiting_for_split.empty() &&
	       *slot->waiting_for_split.begin() < epoch) {
      // a split completing before the merge epoch will create the pg
      dout(20) << __func__ << " pending split on merge participant pg " << pgid
	       << " " << slot->waiting_for_split << dendl;
    } else {
      dout(20) << __func__ << " creating empty merge participant " << pgid
	       << " for merge in " << epoch << dendl;
      // leave history zeroed; PG::merge_from() will fill it in.
      pg_history_t history;
      PGCreateInfo cinfo(pgid, epoch - 1,
			 history, PastIntervals(), false);
      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
      _attach_pg(r.first->second.get(), pg.get());
      _wake_pg_slot(pgid, slot);
      pg->unlock();
    }
    // mark slot for merge
    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
    slot->waiting_for_merge_epoch = epoch;
    p = merge_pgs->erase(p);
  }
}
10293
void OSDShard::register_and_wake_split_child(PG *pg)
{
  // Attach a freshly created split-child pg to its primed slot, clear
  // the matching waiting_for_split epoch, and requeue the slot once no
  // further splits remain pending.  Finally enqueue a null peering
  // event so the child advances to the latest osdmap, and wake a
  // worker thread.
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());   // slot must have been primed earlier
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      // last pending split for this child; release parked work
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
10332
10333 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10334 {
10335 std::lock_guard l(shard_lock);
10336 vector<spg_t> to_delete;
10337 for (auto& i : pg_slots) {
10338 if (i.first != parent &&
10339 i.first.get_ancestor(old_pg_num) == parent) {
10340 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10341 << dendl;
10342 _wake_pg_slot(i.first, i.second.get());
10343 to_delete.push_back(i.first);
10344 }
10345 }
10346 for (auto pgid : to_delete) {
10347 pg_slots.erase(pgid);
10348 }
10349 }
10350
// Construct one sharded work-queue shard.  Each shard owns its own
// locks, op scheduler, and context (oncommit) queue; `id` selects the
// shard index used in lock names and log output.
OSDShard::OSDShard(
  int id,
  CephContext *cct,
  OSD *osd)
  : shard_id(id),
    cct(cct),
    osd(osd),
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    scheduler(ceph::osd::scheduler::make_scheduler(cct)),
    context_queue(sdata_wait_lock, sdata_cond)
{
  dout(0) << "using op scheduler " << *scheduler << dendl;
}
10369
10370
10371 // =============================================================
10372
10373 #undef dout_context
10374 #define dout_context osd->cct
10375 #undef dout_prefix
10376 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10377
10378 void OSD::ShardedOpWQ::_add_slot_waiter(
10379 spg_t pgid,
10380 OSDShardPGSlot *slot,
10381 OpSchedulerItem&& qi)
10382 {
10383 if (qi.is_peering()) {
10384 dout(20) << __func__ << " " << pgid
10385 << " peering, item epoch is "
10386 << qi.get_map_epoch()
10387 << ", will wait on " << qi << dendl;
10388 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10389 } else {
10390 dout(20) << __func__ << " " << pgid
10391 << " item epoch is "
10392 << qi.get_map_epoch()
10393 << ", will wait on " << qi << dendl;
10394 slot->waiting.push_back(std::move(qi));
10395 }
10396 }
10397
10398 #undef dout_prefix
10399 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10400
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  // Worker-thread entry point.  Waits for work on this thread's shard,
  // dequeues a single OpSchedulerItem, resolves it to a pg slot
  // (possibly creating the pg when the item carries a PGCreateInfo and
  // the osdmap says it should exist here), and runs it.  Also drains
  // the shard's context_queue of oncommit callbacks; only the
  // lowest-indexed thread per shard does that, to keep completions
  // ordered.
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  // callback.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    // no work visible; sleep on sdata_cond unless told to stop waiting
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // suspend the heartbeat timeout while we sleep
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->scheduler->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	// spurious wakeup or work stolen by another thread; bail out
	sdata->shard_lock.unlock();
	return;
      }
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
	  timeout_interval, suicide_interval);
    } else {
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // collect pending oncommit contexts (smallest-index thread only)
  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    sdata->context_queue.move_to(oncommits);
  }

  if (sdata->scheduler->empty()) {
    // only oncommits to handle (or shutdown)
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpSchedulerItem item = sdata->scheduler->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // find or create the pg slot for this item's ordering token (pgid)
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // drop shard_lock while taking the pg lock (lock ordering), then
    // revalidate the slot since it may have changed underneath us
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg attached: decide whether to wait, create the pg, run a
  // pg-less event, or drop the item, based on the shard's osdmap
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      // pg is mid-split; park the item on the slot
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // item is from the future; wait for the map to catch up
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg! drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      // non-peering item for a pg that should exist; wait for it
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
				     sdata->shard_osdmap,
				     (*_op)->sent_epoch);
      }
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    // peering events need a map at least as new as their epoch
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // hand any split children discovered during pg creation to their shards
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
10695
10696 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
10697 uint32_t shard_index =
10698 item.get_ordering_token().hash_to_shard(osd->shards.size());
10699
10700 dout(20) << __func__ << " " << item << dendl;
10701
10702 OSDShard* sdata = osd->shards[shard_index];
10703 assert (NULL != sdata);
10704
10705 bool empty = true;
10706 {
10707 std::lock_guard l{sdata->shard_lock};
10708 empty = sdata->scheduler->empty();
10709 sdata->scheduler->enqueue(std::move(item));
10710 }
10711
10712 if (empty) {
10713 std::lock_guard l{sdata->sdata_wait_lock};
10714 sdata->sdata_cond.notify_one();
10715 }
10716 }
10717
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  // Requeue an item at the front of its shard's scheduler, preserving
  // ordering relative to any item a racing _process() has already
  // staged on the slot's to_process list.
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    //
    // i.e. swap: our old item goes to the front of to_process, and the
    // newest staged item is the one we push back into the scheduler.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // wake a worker to pick the item up
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
10745
10746 namespace ceph {
10747 namespace osd_cmds {
10748
10749 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
10750 std::ostream& os)
10751 {
10752 if (!ceph_using_tcmalloc()) {
10753 os << "could not issue heap profiler command -- not using tcmalloc!";
10754 return -EOPNOTSUPP;
10755 }
10756
10757 string cmd;
10758 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
10759 os << "unable to get value for command \"" << cmd << "\"";
10760 return -EINVAL;
10761 }
10762
10763 std::vector<std::string> cmd_vec;
10764 get_str_vec(cmd, cmd_vec);
10765
10766 string val;
10767 if (cmd_getval(cmdmap, "value", val)) {
10768 cmd_vec.push_back(val);
10769 }
10770
10771 ceph_heap_profiler_handle_command(cmd_vec, os);
10772
10773 return 0;
10774 }
10775
10776 }} // namespace ceph::osd_cmds