// ceph/src/osd/OSD.cc — Ceph 15.2.5 ("Octopus") snapshot, via the git.proxmox.com ceph.git mirror
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16 #include "acconfig.h"
17
18 #include <cctype>
19 #include <fstream>
20 #include <iostream>
21 #include <iterator>
22
23 #include <unistd.h>
24 #include <sys/stat.h>
25 #include <signal.h>
26 #include <time.h>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
29
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
32 #endif
33
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
36 #endif
37
38 #include "osd/PG.h"
39
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
43
44 #include "OSD.h"
45 #include "OSDMap.h"
46 #include "Watch.h"
47 #include "osdc/Objecter.h"
48
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_releases.h"
52 #include "common/ceph_time.h"
53 #include "common/version.h"
54 #include "common/pick_address.h"
55 #include "common/blkdev.h"
56 #include "common/numa.h"
57
58 #include "os/ObjectStore.h"
59 #ifdef HAVE_LIBFUSE
60 #include "os/FuseStore.h"
61 #endif
62
63 #include "PrimaryLogPG.h"
64
65 #include "msg/Messenger.h"
66 #include "msg/Message.h"
67
68 #include "mon/MonClient.h"
69
70 #include "messages/MLog.h"
71
72 #include "messages/MGenericMessage.h"
73 #include "messages/MOSDPing.h"
74 #include "messages/MOSDFailure.h"
75 #include "messages/MOSDMarkMeDown.h"
76 #include "messages/MOSDMarkMeDead.h"
77 #include "messages/MOSDFull.h"
78 #include "messages/MOSDOp.h"
79 #include "messages/MOSDOpReply.h"
80 #include "messages/MOSDBackoff.h"
81 #include "messages/MOSDBeacon.h"
82 #include "messages/MOSDRepOp.h"
83 #include "messages/MOSDRepOpReply.h"
84 #include "messages/MOSDBoot.h"
85 #include "messages/MOSDPGTemp.h"
86 #include "messages/MOSDPGReadyToMerge.h"
87
88 #include "messages/MOSDMap.h"
89 #include "messages/MMonGetOSDMap.h"
90 #include "messages/MOSDPGNotify.h"
91 #include "messages/MOSDPGNotify2.h"
92 #include "messages/MOSDPGQuery.h"
93 #include "messages/MOSDPGQuery2.h"
94 #include "messages/MOSDPGLog.h"
95 #include "messages/MOSDPGRemove.h"
96 #include "messages/MOSDPGInfo.h"
97 #include "messages/MOSDPGInfo2.h"
98 #include "messages/MOSDPGCreate.h"
99 #include "messages/MOSDPGCreate2.h"
100 #include "messages/MOSDPGScan.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
111
112 #include "messages/MOSDPeeringOp.h"
113
114 #include "messages/MOSDAlive.h"
115
116 #include "messages/MOSDScrub.h"
117 #include "messages/MOSDScrub2.h"
118 #include "messages/MOSDRepScrub.h"
119
120 #include "messages/MCommand.h"
121 #include "messages/MCommandReply.h"
122
123 #include "messages/MPGStats.h"
124 #include "messages/MPGStatsAck.h"
125
126 #include "messages/MWatchNotify.h"
127 #include "messages/MOSDPGPush.h"
128 #include "messages/MOSDPGPushReply.h"
129 #include "messages/MOSDPGPull.h"
130
131 #include "messages/MMonGetPurgedSnaps.h"
132 #include "messages/MMonGetPurgedSnapsReply.h"
133
134 #include "common/perf_counters.h"
135 #include "common/Timer.h"
136 #include "common/LogClient.h"
137 #include "common/AsyncReserver.h"
138 #include "common/HeartbeatMap.h"
139 #include "common/admin_socket.h"
140 #include "common/ceph_context.h"
141
142 #include "global/signal_handler.h"
143 #include "global/pidfile.h"
144
145 #include "include/color.h"
146 #include "perfglue/cpu_profiler.h"
147 #include "perfglue/heap_profiler.h"
148
149 #include "osd/OpRequest.h"
150
151 #include "auth/AuthAuthorizeHandler.h"
152 #include "auth/RotatingKeyRing.h"
153
154 #include "objclass/objclass.h"
155
156 #include "common/cmdparse.h"
157 #include "include/str_list.h"
158 #include "include/util.h"
159
160 #include "include/ceph_assert.h"
161 #include "common/config.h"
162 #include "common/EventTrace.h"
163
164 #include "json_spirit/json_spirit_reader.h"
165 #include "json_spirit/json_spirit_writer.h"
166
167 #ifdef WITH_LTTNG
168 #define TRACEPOINT_DEFINE
169 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170 #include "tracing/osd.h"
171 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #undef TRACEPOINT_DEFINE
173 #else
174 #define tracepoint(...)
175 #endif
176
177 #define dout_context cct
178 #define dout_subsys ceph_subsys_osd
179 #undef dout_prefix
180 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
181
182 using namespace ceph::osd::scheduler;
183 using TOPNSPC::common::cmd_getval;
184
185 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
186 return *_dout << "osd." << whoami << " " << epoch << " ";
187 }
188
189 //Initial features in new superblock.
190 //Features here are also automatically upgraded
191 CompatSet OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
194 CompatSet::FeatureSet ceph_osd_feature_incompat;
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
202 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
203 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
204 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
205 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
206 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
207 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
208 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
209 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
210 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
211 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
212 ceph_osd_feature_incompat);
213 }
214
215 //Features are added here that this OSD supports.
216 CompatSet OSD::get_osd_compat_set() {
217 CompatSet compat = get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
220 return compat;
221 }
222
// OSDService constructor: mirrors references to the owning OSD's messengers,
// loggers and monitor client, binds config observers, and sets every counter,
// timer, reserver and cache to its initial state.  The body then starts the
// Objecter and spawns its finisher threads.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  // config-tracked values (updated when the conf option changes)
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  scrubs_local(0),
  scrubs_remote(0),
  // tiering agent state
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  // promote throttle state
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
				 osd->objecter_messenger,
				 osd->monc, nullptr, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  // backfill/recovery/snap-trim slot reservers share one finisher
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  // OSDMap caches, all sized by osd_map_cache_size
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  // fullness tracking starts empty until the first statfs/map arrives
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  // Spawn the configured number of finisher threads that service Objecter
  // completions; each is named "objecter-finisher-<i>".
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}
285
#ifdef PG_DEBUG_REFS
// Debug-only PG reference tracking: pgid_tracker counts outstanding
// references per pgid, live_pgs remembers the PG* for dumping.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
// Drop one reference for pgid; forget the PG entirely once the count
// reaches zero.  Asserts the pgid is currently tracked with count > 0.
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
// Dump every tracked pgid (and its per-PG live ref ids) to the error log.
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif
317
318
319 ceph::signedspan OSDService::get_mnow()
320 {
321 return ceph::mono_clock::now() - osd->startup_time;
322 }
323
// Walk the recorded pg_num history for pgid's pool between old_map and
// new_map and report every PG that is created by a split (into
// *split_children) or participates in a merge (into *merge_pgs), each tagged
// with the epoch at which the pg_num change takes effect.  A BFS over a work
// queue is used because a child/parent discovered at one epoch may itself
// split or merge at a later epoch in the same range.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    // no recorded pg_num changes for this pool
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;   // PGs already scanned, to avoid re-queueing
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // replay each pg_num change in (old_map, new_map]
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  // cur existed before this change; record any children it splits into
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?  (pg_num shrank at this epoch; only taken when the caller
	// asked for merges)
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    // cur is one of the sources being folded into a parent
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the shrink: it is the merge target; record it and
	  // all of its sources
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
430
// Forward the heartbeat-peer refresh request to the owning OSD.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
435
436 HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
437 {
438 std::lock_guard l(hb_stamp_lock);
439 if (peer >= hb_stamps.size()) {
440 hb_stamps.resize(peer + 1);
441 }
442 if (!hb_stamps[peer]) {
443 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
444 }
445 return hb_stamps[peer];
446 }
447
// Queue a RenewLease peering event for the given PG at the given epoch.
void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch, epoch,
	RenewLease())));
}
457
458 void OSDService::start_shutdown()
459 {
460 {
461 std::lock_guard l(agent_timer_lock);
462 agent_timer.shutdown();
463 }
464
465 {
466 std::lock_guard l(sleep_lock);
467 sleep_timer.shutdown();
468 }
469
470 {
471 std::lock_guard l(recovery_request_lock);
472 recovery_request_timer.shutdown();
473 }
474 }
475
// Drain and stop the finisher shared by the backfill/recovery reservers.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
481
// Second-phase shutdown: suspend the monotonic timer, stop the watch timer,
// tear down the Objecter and drain its finishers, then drop the published
// and next OSDMap references so those maps can be freed.
void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  // Drain each objecter finisher so no completion callbacks run after this.
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
500
// Start the service's worker machinery: finishers, Objecter, timers, the
// tiering agent thread, and (optionally) an initial recovery delay.
void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  // If configured, hold off recovery for osd_recovery_delay_start seconds.
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
521
// Late initialization: start the Objecter once the current OSDMap is known.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
526
527 void OSDService::activate_map()
528 {
529 // wake/unwake the tiering agent
530 std::lock_guard l{agent_lock};
531 agent_active =
532 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
533 osd->is_active();
534 agent_cond.notify_all();
535 }
536
// Ask the monitors for OSDMaps starting at epoch e (non-forced subscribe).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
541
542
543 class AgentTimeoutCB : public Context {
544 PGRef pg;
545 public:
546 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
547 void finish(int) override {
548 pg->agent_choose_mode_restart();
549 }
550 };
551
// Tiering agent thread body.  Repeatedly takes the highest-priority tier
// from agent_queue and asks one of its PGs to do flush/evict work, rotating
// through the PGs via agent_queue_pos.  Holds agent_lock except while the
// PG does its (potentially slow) work.  Exits when agent_stop_flag is set.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    // work on the highest priority level present in the queue
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // remaining op budget; in low-flush mode the flush quota is computed
    // against osd_agent_max_low_ops instead
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      // nothing we can do right now; wait to be notified
      agent_cond.wait(agent_locker);
      continue;
    }

    // round-robin position within the top tier; reset if it was invalidated
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // drop the lock while the PG does its work
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
607
// Stop the tiering agent thread.  Asserts that no agent ops are in flight
// and aborts if any PG is still queued, then signals the thread and joins.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  // join outside the lock so agent_entry() can observe the stop flag
  agent_thread.join();
}
627
628 // -------------------------------------
629
// Feedback controller for cache-tier promotion throttling.  Samples the
// promote counters since the last tick, computes the promotion probability
// (in thousandths) that would hit the configured object/sec and bytes/sec
// targets, corrects for observed skew between the target and actual rate,
// and blends the result with the previous probability.  Also sets per-tick
// hard caps (promote_max_objects/bytes) to mitigate stampedes.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // probability (in thousandths) needed to hit the object-rate target...
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    // ...and the byte-rate target, using the observed average object size
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);   // both targets set: respect the stricter
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;               // no targets: promote unconditionally
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move halfway from the old probability toward the new one
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
700
701 // -------------------------------------
702
703 float OSDService::get_failsafe_full_ratio()
704 {
705 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
706 if (full_ratio > 1.0) full_ratio /= 100.0;
707 return full_ratio;
708 }
709
// Classify the given usage ratios into NONE/NEARFULL/BACKFILLFULL/FULL/
// FAILSAFE using the thresholds published in the current OSDMap, clamped so
// that nearfull <= backfillfull <= full <= failsafe.  `inject` is set to a
// marker string when an injected fullness state is in effect.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    // no usable map yet; report nothing
    return NONE;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    // test hook: report the injected state instead of the real one
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    // failsafe and nearfull use the physical ratio; full and backfillfull
    // use the (possibly adjusted) logical ratio
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
756
// Record the latest usage ratios, recompute the fullness state, and log a
// cluster-log error when the failsafe engages or disengages.
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
788
789 bool OSDService::need_fullness_update()
790 {
791 OSDMapRef osdmap = get_osdmap();
792 s_names cur = NONE;
793 if (osdmap->exists(whoami)) {
794 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
795 cur = FULL;
796 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
797 cur = BACKFILLFULL;
798 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
799 cur = NEARFULL;
800 }
801 }
802 s_names want = NONE;
803 if (is_full())
804 want = FULL;
805 else if (is_backfillfull())
806 want = BACKFILLFULL;
807 else if (is_nearfull())
808 want = NEARFULL;
809 return want != cur;
810 }
811
// Test hook: report an injected fullness state of at least `type`.
// injectfull > 0 is a countdown of how many checks report full; -1 means
// always full.  NOTE(review): this is a const method yet decrements
// injectfull — injectfull is presumably declared mutable in the header;
// confirm there.  Caller must hold full_status_lock.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;
  }
  return false;
}
826
827 bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
828 {
829 std::lock_guard l(full_status_lock);
830
831 if (_check_inject_full(dpp, type))
832 return true;
833
834 if (cur_state >= type)
835 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
836 << " physical " << physical_ratio << dendl;
837
838 return cur_state >= type;
839 }
840
// "What-if" fullness check: would the OSD be at least `type` full if
// adjust_used extra bytes were consumed on top of adjusted_stat?  Used to
// decide whether there is room for an operation (e.g. backfill) before
// starting it.  Honors injected test state.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // injected state check needs full_status_lock; drop it before the
    // (lock-free) recomputation below
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
862
// Public fullness checks: thin wrappers over _check_full()/_tentative_full()
// with the severity fixed per call site.

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

// Would adding adjust_used bytes (on top of stats) reach backfillfull?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
887
// Lock-protected fullness predicates over the cached cur_state.  Note that
// is_failsafe_full() tests equality while the others test "at least".

bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
911
// Test hook: set the injected fullness state and its remaining-use count
// (count < 0 means "always"); consumed by _check_inject_full().
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
918
919 void OSDService::set_statfs(const struct store_statfs_t &stbuf,
920 osd_alert_list_t& alerts)
921 {
922 uint64_t bytes = stbuf.total;
923 uint64_t avail = stbuf.available;
924 uint64_t used = stbuf.get_used_raw();
925
926 // For testing fake statfs values so it doesn't matter if all
927 // OSDs are using the same partition.
928 if (cct->_conf->fake_statfs_for_testing) {
929 uint64_t total_num_bytes = 0;
930 vector<PGRef> pgs;
931 osd->_get_pgs(&pgs);
932 for (auto p : pgs) {
933 total_num_bytes += p->get_stats_num_bytes();
934 }
935 bytes = cct->_conf->fake_statfs_for_testing;
936 if (total_num_bytes < bytes)
937 avail = bytes - total_num_bytes;
938 else
939 avail = 0;
940 dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
941 << " adjust available " << avail
942 << dendl;
943 used = bytes - avail;
944 }
945
946 osd->logger->set(l_osd_stat_bytes, bytes);
947 osd->logger->set(l_osd_stat_bytes_used, used);
948 osd->logger->set(l_osd_stat_bytes_avail, avail);
949
950 std::lock_guard l(stat_lock);
951 osd_stat.statfs = stbuf;
952 osd_stat.os_alerts.clear();
953 osd_stat.os_alerts[whoami].swap(alerts);
954 if (cct->_conf->fake_statfs_for_testing) {
955 osd_stat.statfs.total = bytes;
956 osd_stat.statfs.available = avail;
957 // For testing don't want used to go negative, so clear reserved
958 osd_stat.statfs.internally_reserved = 0;
959 }
960 }
961
962 osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
963 int num_pgs)
964 {
965 utime_t now = ceph_clock_now();
966 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
967 std::lock_guard l(stat_lock);
968 osd_stat.hb_peers.swap(hb_peers);
969 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
970 osd_stat.num_pgs = num_pgs;
971 // Clean entries that aren't updated
972 // This is called often enough that we can just remove 1 at a time
973 for (auto i: osd_stat.hb_pingtime) {
974 if (i.second.last_update == 0)
975 continue;
976 if (stale_time && now.sec() - i.second.last_update > stale_time) {
977 dout(20) << __func__ << " time out heartbeat for osd " << i.first
978 << " last_update " << i.second.last_update << dendl;
979 osd_stat.hb_pingtime.erase(i.first);
980 break;
981 }
982 }
983 return osd_stat;
984 }
985
986 void OSDService::inc_osd_stat_repaired()
987 {
988 std::lock_guard l(stat_lock);
989 osd_stat.num_shards_repaired++;
990 return;
991 }
992
993 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
994 uint64_t adjust_used)
995 {
996 *pratio =
997 ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
998
999 if (adjust_used) {
1000 dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
1001 if (new_stat.statfs.available > adjust_used)
1002 new_stat.statfs.available -= adjust_used;
1003 else
1004 new_stat.statfs.available = 0;
1005 dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
1006 }
1007
1008 // Check all pgs and adjust kb_used to include all pending backfill data
1009 int backfill_adjusted = 0;
1010 vector<PGRef> pgs;
1011 osd->_get_pgs(&pgs);
1012 for (auto p : pgs) {
1013 backfill_adjusted += p->pg_stat_adjust(&new_stat);
1014 }
1015 if (backfill_adjusted) {
1016 dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
1017 }
1018 return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
1019 }
1020
1021 void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
1022 {
1023 OSDMapRef next_map = get_nextmap_reserved();
1024 // service map is always newer/newest
1025 ceph_assert(from_epoch <= next_map->get_epoch());
1026
1027 if (next_map->is_down(peer) ||
1028 next_map->get_info(peer).up_from > from_epoch) {
1029 m->put();
1030 release_map(next_map);
1031 return;
1032 }
1033 ConnectionRef peer_con;
1034 if (peer == whoami) {
1035 peer_con = osd->cluster_messenger->get_loopback_connection();
1036 } else {
1037 peer_con = osd->cluster_messenger->connect_to_osd(
1038 next_map->get_cluster_addrs(peer), false, true);
1039 }
1040 maybe_share_map(peer_con.get(), next_map);
1041 peer_con->send_message(m);
1042 release_map(next_map);
1043 }
1044
1045 void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
1046 {
1047 OSDMapRef next_map = get_nextmap_reserved();
1048 // service map is always newer/newest
1049 ceph_assert(from_epoch <= next_map->get_epoch());
1050
1051 for (auto& iter : messages) {
1052 if (next_map->is_down(iter.first) ||
1053 next_map->get_info(iter.first).up_from > from_epoch) {
1054 iter.second->put();
1055 continue;
1056 }
1057 ConnectionRef peer_con;
1058 if (iter.first == whoami) {
1059 peer_con = osd->cluster_messenger->get_loopback_connection();
1060 } else {
1061 peer_con = osd->cluster_messenger->connect_to_osd(
1062 next_map->get_cluster_addrs(iter.first), false, true);
1063 }
1064 maybe_share_map(peer_con.get(), next_map);
1065 peer_con->send_message(iter.second);
1066 }
1067 release_map(next_map);
1068 }
1069 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1070 {
1071 OSDMapRef next_map = get_nextmap_reserved();
1072 // service map is always newer/newest
1073 ceph_assert(from_epoch <= next_map->get_epoch());
1074
1075 if (next_map->is_down(peer) ||
1076 next_map->get_info(peer).up_from > from_epoch) {
1077 release_map(next_map);
1078 return NULL;
1079 }
1080 ConnectionRef con;
1081 if (peer == whoami) {
1082 con = osd->cluster_messenger->get_loopback_connection();
1083 } else {
1084 con = osd->cluster_messenger->connect_to_osd(
1085 next_map->get_cluster_addrs(peer), false, true);
1086 }
1087 release_map(next_map);
1088 return con;
1089 }
1090
1091 pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1092 {
1093 OSDMapRef next_map = get_nextmap_reserved();
1094 // service map is always newer/newest
1095 ceph_assert(from_epoch <= next_map->get_epoch());
1096
1097 pair<ConnectionRef,ConnectionRef> ret;
1098 if (next_map->is_down(peer) ||
1099 next_map->get_info(peer).up_from > from_epoch) {
1100 release_map(next_map);
1101 return ret;
1102 }
1103 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1104 next_map->get_hb_back_addrs(peer));
1105 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1106 next_map->get_hb_front_addrs(peer));
1107 release_map(next_map);
1108 return ret;
1109 }
1110
// Name by which this OSD is known on the cluster (backend) messenger.
entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}
1115
1116 void OSDService::queue_want_pg_temp(pg_t pgid,
1117 const vector<int>& want,
1118 bool forced)
1119 {
1120 std::lock_guard l(pg_temp_lock);
1121 auto p = pg_temp_pending.find(pgid);
1122 if (p == pg_temp_pending.end() ||
1123 p->second.acting != want ||
1124 forced) {
1125 pg_temp_wanted[pgid] = {want, forced};
1126 }
1127 }
1128
// Drop any wanted or in-flight pg_temp request for this PG.
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
1135
// Move everything queued in pg_temp_wanted into pg_temp_pending, leaving
// pg_temp_wanted empty.  NOTE(review): '_' prefix and the callers
// (send_pg_temp/requeue_pg_temp) suggest pg_temp_lock must be held — confirm.
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  // splice map nodes directly; avoids copying the values
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1146
1147 void OSDService::requeue_pg_temp()
1148 {
1149 std::lock_guard l(pg_temp_lock);
1150 // wanted overrides pending. note that remove_want_pg_temp
1151 // clears the item out of both.
1152 unsigned old_wanted = pg_temp_wanted.size();
1153 unsigned old_pending = pg_temp_pending.size();
1154 _sent_pg_temp();
1155 pg_temp_wanted.swap(pg_temp_pending);
1156 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1157 << pg_temp_wanted.size() << dendl;
1158 }
1159
1160 std::ostream& operator<<(std::ostream& out,
1161 const OSDService::pg_temp_t& pg_temp)
1162 {
1163 out << pg_temp.acting;
1164 if (pg_temp.forced) {
1165 out << " (forced)";
1166 }
1167 return out;
1168 }
1169
1170 void OSDService::send_pg_temp()
1171 {
1172 std::lock_guard l(pg_temp_lock);
1173 if (pg_temp_wanted.empty())
1174 return;
1175 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
1176 MOSDPGTemp *ms[2] = {nullptr, nullptr};
1177 for (auto& [pgid, pg_temp] : pg_temp_wanted) {
1178 auto& m = ms[pg_temp.forced];
1179 if (!m) {
1180 m = new MOSDPGTemp(osdmap->get_epoch());
1181 m->forced = pg_temp.forced;
1182 }
1183 m->pg_temp.emplace(pgid, pg_temp.acting);
1184 }
1185 for (auto m : ms) {
1186 if (m) {
1187 monc->send_mon_message(m);
1188 }
1189 }
1190 _sent_pg_temp();
1191 }
1192
1193 void OSDService::send_pg_created(pg_t pgid)
1194 {
1195 std::lock_guard l(pg_created_lock);
1196 dout(20) << __func__ << dendl;
1197 auto o = get_osdmap();
1198 if (o->require_osd_release >= ceph_release_t::luminous) {
1199 pg_created.insert(pgid);
1200 monc->send_mon_message(new MOSDPGCreated(pgid));
1201 }
1202 }
1203
1204 void OSDService::send_pg_created()
1205 {
1206 std::lock_guard l(pg_created_lock);
1207 dout(20) << __func__ << dendl;
1208 auto o = get_osdmap();
1209 if (o->require_osd_release >= ceph_release_t::luminous) {
1210 for (auto pgid : pg_created) {
1211 monc->send_mon_message(new MOSDPGCreated(pgid));
1212 }
1213 }
1214 }
1215
1216 void OSDService::prune_pg_created()
1217 {
1218 std::lock_guard l(pg_created_lock);
1219 dout(20) << __func__ << dendl;
1220 auto o = get_osdmap();
1221 auto i = pg_created.begin();
1222 while (i != pg_created.end()) {
1223 auto p = o->get_pg_pool(i->pool());
1224 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1225 dout(20) << __func__ << " pruning " << *i << dendl;
1226 i = pg_created.erase(i);
1227 } else {
1228 dout(20) << __func__ << " keeping " << *i << dendl;
1229 ++i;
1230 }
1231 }
1232 }
1233
1234
1235 // --------------------------------------
1236 // dispatch
1237
1238 bool OSDService::can_inc_scrubs()
1239 {
1240 bool can_inc = false;
1241 std::lock_guard l(sched_scrub_lock);
1242
1243 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1244 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1245 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
1246 can_inc = true;
1247 } else {
1248 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1249 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1250 }
1251
1252 return can_inc;
1253 }
1254
1255 bool OSDService::inc_scrubs_local()
1256 {
1257 bool result = false;
1258 std::lock_guard l{sched_scrub_lock};
1259 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1260 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1261 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1262 result = true;
1263 ++scrubs_local;
1264 } else {
1265 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1266 }
1267 return result;
1268 }
1269
// Release one local scrub reservation taken via inc_scrubs_local().
void OSDService::dec_scrubs_local()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
  --scrubs_local;
  ceph_assert(scrubs_local >= 0);  // going negative means an unmatched dec
}
1278
1279 bool OSDService::inc_scrubs_remote()
1280 {
1281 bool result = false;
1282 std::lock_guard l{sched_scrub_lock};
1283 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1284 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1285 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1286 result = true;
1287 ++scrubs_remote;
1288 } else {
1289 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1290 }
1291 return result;
1292 }
1293
// Release one remote scrub reservation taken via inc_scrubs_remote().
void OSDService::dec_scrubs_remote()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
  --scrubs_remote;
  ceph_assert(scrubs_remote >= 0);  // going negative means an unmatched dec
}
1302
// Dump the current scrub reservation counters (admin socket helper).
void OSDService::dump_scrub_reservations(Formatter *f)
{
  std::lock_guard l{sched_scrub_lock};
  f->dump_int("scrubs_local", scrubs_local);
  f->dump_int("scrubs_remote", scrubs_remote);
  f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}
1310
// Read any subset of {boot, up, bind} epochs atomically under epoch_lock.
// Pass nullptr for values the caller does not need.
void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}
1322
// Update any subset of {boot, up, bind} epochs atomically under epoch_lock.
// Pass nullptr to leave a value unchanged.  Each epoch may only move
// forward; setting it to 0 (a reset) is the one allowed exception.
void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}
1340
// Begin an orderly shutdown.  If this OSD is up in the osdmap, first ask
// the mon to mark it down (MOSDMarkMeDown, ack requested) and wait up to
// osd_mon_shutdown_timeout seconds for got_stop_ack() to move the state to
// STOPPING; after the ack or the timeout, proceed with shutdown anyway.
// @return false if a stop is already in progress, true otherwise.
bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true // request ack
        ));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    // released while waiting; got_stop_ack() signals is_stopping_cond
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1367
1368 void OSDService::got_stop_ack()
1369 {
1370 std::scoped_lock l(is_stopping_lock);
1371 if (get_state() == PREPARING_TO_STOP) {
1372 dout(0) << __func__ << " starting shutdown" << dendl;
1373 set_state(STOPPING);
1374 is_stopping_cond.notify_all();
1375 } else {
1376 dout(10) << __func__ << " ignoring msg" << dendl;
1377 }
1378 }
1379
// Build an MOSDMap covering epochs (since, to], using incremental maps
// where available and falling back to full maps when an incremental is
// missing locally.  The osd_map_message_max / osd_map_message_max_bytes
// budgets may truncate the range; the peer is expected to ask again for
// the remainder.  Never returns nullptr: on unrecoverable gaps it sends
// whatever it has, or at minimum the newest map (or aborts if even that
// cannot be loaded).
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  // per-message budgets: epoch count and encoded byte size
  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
             << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since].claim(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else {
      // incremental missing; try the full map for this epoch instead
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
        derr << __func__ << " also missing full map " << e << dendl;
        goto panic;
      }
      m->maps[e].claim(bl);
    }
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      // budget exhausted; send a partial message
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map].claim(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
           << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map].claim(bl);
  }
  return m;
}
1447
// Deliver a prepared MOSDMap to the peer over the given connection.
void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}
1452
1453 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1454 const OSDMapRef& osdmap)
1455 {
1456 epoch_t to = osdmap->get_epoch();
1457 dout(10) << "send_incremental_map " << since << " -> " << to
1458 << " to " << con << " " << con->get_peer_addr() << dendl;
1459
1460 MOSDMap *m = NULL;
1461 while (!m) {
1462 OSDSuperblock sblock(get_superblock());
1463 if (since < sblock.oldest_map) {
1464 // just send latest full map
1465 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1466 osdmap->get_encoding_features());
1467 m->oldest_map = max_oldest_map;
1468 m->newest_map = sblock.newest_map;
1469 get_map_bl(to, m->maps[to]);
1470 send_map(m, con);
1471 return;
1472 }
1473
1474 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1475 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl;
1477 since = to - cct->_conf->osd_map_share_max_epochs;
1478 }
1479
1480 m = build_incremental_map_msg(since, to, sblock);
1481 }
1482 send_map(m, con);
1483 }
1484
// Fetch the encoded full osdmap for epoch e: first from the in-memory bl
// cache, else from the meta collection (populating the cache on success).
// Returns false when the map is unavailable.  NOTE(review): the leading
// underscore and the locked sibling get_inc_map_bl() suggest the caller
// must hold map_cache_lock — confirm.
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}
1503
1504 bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1505 {
1506 std::lock_guard l(map_cache_lock);
1507 bool found = map_bl_inc_cache.lookup(e, &bl);
1508 if (found) {
1509 if (logger)
1510 logger->inc(l_osd_map_bl_cache_hit);
1511 return true;
1512 }
1513 if (logger)
1514 logger->inc(l_osd_map_bl_cache_miss);
1515 found = store->read(meta_ch,
1516 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1518 if (found) {
1519 _add_map_inc_bl(e, bl);
1520 }
1521 return found;
1522 }
1523
// Insert the encoded full map for epoch e into the bl cache, first
// compacting it into one buffer and moving it to the osd_mapbl mempool.
void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}
1534
// Insert the encoded incremental map for epoch e into the bl cache, first
// compacting it into one buffer and moving it to the osd_mapbl mempool.
void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}
1545
1546 OSDMapRef OSDService::_add_map(OSDMap *o)
1547 {
1548 epoch_t e = o->get_epoch();
1549
1550 if (cct->_conf->osd_map_dedup) {
1551 // Dedup against an existing map at a nearby epoch
1552 OSDMapRef for_dedup = map_cache.lower_bound(e);
1553 if (for_dedup) {
1554 OSDMap::dedup(for_dedup.get(), o);
1555 }
1556 }
1557 bool existed;
1558 OSDMapRef l = map_cache.add(e, o, &existed);
1559 if (existed) {
1560 delete o;
1561 }
1562 return l;
1563 }
1564
// Return the OSDMap for the given epoch, from the cache if possible,
// otherwise loaded and decoded from disk (and then cached).  Returns a
// null ref when the map cannot be loaded.  epoch 0 yields a fresh,
// empty map.
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    // track how far below the cached window the miss landed
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);  // cache takes ownership of 'map'
}
1601
1602 // ops
1603
1604
// Convenience overload: error reply with no versions or per-op returns.
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}
1609
1610 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1611 version_t uv,
1612 vector<pg_log_op_return_item_t> op_returns)
1613 {
1614 auto m = op->get_req<MOSDOp>();
1615 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1616 int flags;
1617 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1618
1619 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1620 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
1621 reply->set_reply_versions(v, uv);
1622 reply->set_op_returns(op_returns);
1623 m->get_connection()->send_message(reply);
1624 }
1625
// Report (dout + cluster log) a client op that arrived at a PG which is not
// its primary.  No-op unless osd_debug_misdirected_ops is set.  For EC
// pools, a shard mismatch that is explained by map-epoch skew is expected
// and dropped silently instead of being reported (see comment below).
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->pg_id.shard) {
      // shard mismatch is explained by the epoch skew scenario above
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->get_acting()
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1680
// Queue an item at the back of the sharded op queue (normal order).
void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}
1685
// Queue an item at the front of the sharded op queue (ahead of other work).
void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}
1690
1691 void OSDService::queue_recovery_context(
1692 PG *pg,
1693 GenContext<ThreadPool::TPHandle&> *c)
1694 {
1695 epoch_t e = get_osdmap_epoch();
1696 enqueue_back(
1697 OpSchedulerItem(
1698 unique_ptr<OpSchedulerItem::OpQueueable>(
1699 new PGRecoveryContext(pg->get_pgid(), c, e)),
1700 cct->_conf->osd_recovery_cost,
1701 cct->_conf->osd_recovery_priority,
1702 ceph_clock_now(),
1703 0,
1704 e));
1705 }
1706
1707 void OSDService::queue_for_snap_trim(PG *pg)
1708 {
1709 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1710 enqueue_back(
1711 OpSchedulerItem(
1712 unique_ptr<OpSchedulerItem::OpQueueable>(
1713 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1714 cct->_conf->osd_snap_trim_cost,
1715 cct->_conf->osd_snap_trim_priority,
1716 ceph_clock_now(),
1717 0,
1718 pg->get_osdmap_epoch()));
1719 }
1720
1721 void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
1722 {
1723 unsigned scrub_queue_priority = pg->scrubber.priority;
1724 if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
1725 scrub_queue_priority = cct->_conf->osd_client_op_priority;
1726 }
1727 const auto epoch = pg->get_osdmap_epoch();
1728 enqueue_back(
1729 OpSchedulerItem(
1730 unique_ptr<OpSchedulerItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
1731 cct->_conf->osd_scrub_cost,
1732 scrub_queue_priority,
1733 ceph_clock_now(),
1734 0,
1735 epoch));
1736 }
1737
1738 void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1739 {
1740 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1741 enqueue_back(
1742 OpSchedulerItem(
1743 unique_ptr<OpSchedulerItem::OpQueueable>(
1744 new PGDelete(pgid, e)),
1745 cct->_conf->osd_pg_delete_cost,
1746 cct->_conf->osd_pg_delete_priority,
1747 ceph_clock_now(),
1748 0,
1749 e));
1750 }
1751
// Thin forwarder to OSD::try_finish_pg_delete().
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1756
1757 // ---
1758
1759 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1760 {
1761 std::lock_guard l(merge_lock);
1762 dout(10) << __func__ << " " << pg->pg_id << dendl;
1763 ready_to_merge_source[pg->pg_id.pgid] = version;
1764 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1765 _send_ready_to_merge();
1766 }
1767
1768 void OSDService::set_ready_to_merge_target(PG *pg,
1769 eversion_t version,
1770 epoch_t last_epoch_started,
1771 epoch_t last_epoch_clean)
1772 {
1773 std::lock_guard l(merge_lock);
1774 dout(10) << __func__ << " " << pg->pg_id << dendl;
1775 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1776 make_tuple(version,
1777 last_epoch_started,
1778 last_epoch_clean)));
1779 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1780 _send_ready_to_merge();
1781 }
1782
1783 void OSDService::set_not_ready_to_merge_source(pg_t source)
1784 {
1785 std::lock_guard l(merge_lock);
1786 dout(10) << __func__ << " " << source << dendl;
1787 not_ready_to_merge_source.insert(source);
1788 assert(ready_to_merge_source.count(source) == 0);
1789 _send_ready_to_merge();
1790 }
1791
1792 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1793 {
1794 std::lock_guard l(merge_lock);
1795 dout(10) << __func__ << " " << target << " source " << source << dendl;
1796 not_ready_to_merge_target[target] = source;
1797 assert(ready_to_merge_target.count(target) == 0);
1798 _send_ready_to_merge();
1799 }
1800
// Public entry point: take merge_lock and flush any pending ready/not-ready
// merge notifications to the mon.
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1806
1807 void OSDService::_send_ready_to_merge()
1808 {
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1815 << dendl;
1816 for (auto src : not_ready_to_merge_source) {
1817 if (sent_ready_to_merge_source.count(src) == 0) {
1818 monc->send_mon_message(new MOSDPGReadyToMerge(
1819 src,
1820 {}, {}, 0, 0,
1821 false,
1822 osdmap->get_epoch()));
1823 sent_ready_to_merge_source.insert(src);
1824 }
1825 }
1826 for (auto p : not_ready_to_merge_target) {
1827 if (sent_ready_to_merge_source.count(p.second) == 0) {
1828 monc->send_mon_message(new MOSDPGReadyToMerge(
1829 p.second,
1830 {}, {}, 0, 0,
1831 false,
1832 osdmap->get_epoch()));
1833 sent_ready_to_merge_source.insert(p.second);
1834 }
1835 }
1836 for (auto src : ready_to_merge_source) {
1837 if (not_ready_to_merge_source.count(src.first) ||
1838 not_ready_to_merge_target.count(src.first.get_parent())) {
1839 continue;
1840 }
1841 auto p = ready_to_merge_target.find(src.first.get_parent());
1842 if (p != ready_to_merge_target.end() &&
1843 sent_ready_to_merge_source.count(src.first) == 0) {
1844 monc->send_mon_message(new MOSDPGReadyToMerge(
1845 src.first, // source pgid
1846 src.second, // src version
1847 std::get<0>(p->second), // target version
1848 std::get<1>(p->second), // PG's last_epoch_started
1849 std::get<2>(p->second), // PG's last_epoch_clean
1850 true,
1851 osdmap->get_epoch()));
1852 sent_ready_to_merge_source.insert(src.first);
1853 }
1854 }
1855 }
1856
// Forget all merge readiness state (ready, not-ready, and sent markers)
// for this PG.
void OSDService::clear_ready_to_merge(PG *pg)
{
  std::lock_guard l(merge_lock);
  dout(10) << __func__ << " " << pg->pg_id << dendl;
  ready_to_merge_source.erase(pg->pg_id.pgid);
  ready_to_merge_target.erase(pg->pg_id.pgid);
  not_ready_to_merge_source.erase(pg->pg_id.pgid);
  not_ready_to_merge_target.erase(pg->pg_id.pgid);
  sent_ready_to_merge_source.erase(pg->pg_id.pgid);
}
1867
// Forget which ready-to-merge messages were already sent, allowing them
// all to be resent by a later _send_ready_to_merge().
void OSDService::clear_sent_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  sent_ready_to_merge_source.clear();
}
1873
1874 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1875 {
1876 std::lock_guard l(merge_lock);
1877 auto i = sent_ready_to_merge_source.begin();
1878 while (i != sent_ready_to_merge_source.end()) {
1879 if (!osdmap->pg_exists(*i)) {
1880 dout(10) << __func__ << " " << *i << dendl;
1881 i = sent_ready_to_merge_source.erase(i);
1882 } else {
1883 ++i;
1884 }
1885 }
1886 }
1887
1888 // ---
1889
1890 void OSDService::_queue_for_recovery(
1891 std::pair<epoch_t, PGRef> p,
1892 uint64_t reserved_pushes)
1893 {
1894 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
1895 enqueue_back(
1896 OpSchedulerItem(
1897 unique_ptr<OpSchedulerItem::OpQueueable>(
1898 new PGRecovery(
1899 p.second->get_pgid(), p.first, reserved_pushes)),
1900 cct->_conf->osd_recovery_cost,
1901 cct->_conf->osd_recovery_priority,
1902 ceph_clock_now(),
1903 0,
1904 p.first));
1905 }
1906
1907 // ====================================================================
1908 // OSD
1909
1910 #undef dout_prefix
1911 #define dout_prefix *_dout
1912
1913 // Commands shared between OSD's console and admin console:
namespace ceph {
namespace osd_cmds {

// forward declaration of the 'heap' admin-command handler
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

}} // namespace ceph::osd_cmds
1920
1921 int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
1922 {
1923 int ret;
1924
1925 OSDSuperblock sb;
1926 bufferlist sbbl;
1927 ObjectStore::CollectionHandle ch;
1928
1929 // if we are fed a uuid for this osd, use it.
1930 store->set_fsid(cct->_conf->osd_uuid);
1931
1932 ret = store->mkfs();
1933 if (ret) {
1934 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret) << dendl;
1936 goto free_store;
1937 }
1938
1939 store->set_cache_shards(1); // doesn't matter for mkfs!
1940
1941 ret = store->mount();
1942 if (ret) {
1943 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret) << dendl;
1945 goto free_store;
1946 }
1947
1948 ch = store->open_collection(coll_t::meta());
1949 if (ch) {
1950 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1951 if (ret < 0) {
1952 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
1953 goto free_store;
1954 }
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl;
1957 auto p = sbbl.cbegin();
1958 decode(sb, p);
1959 if (whoami != sb.whoami) {
1960 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1961 << dendl;
1962 ret = -EINVAL;
1963 goto umount_store;
1964 }
1965 if (fsid != sb.cluster_fsid) {
1966 derr << "provided cluster fsid " << fsid
1967 << " != superblock's " << sb.cluster_fsid << dendl;
1968 ret = -EINVAL;
1969 goto umount_store;
1970 }
1971 } else {
1972 // create superblock
1973 sb.cluster_fsid = fsid;
1974 sb.osd_fsid = store->get_fsid();
1975 sb.whoami = whoami;
1976 sb.compat_features = get_osd_initial_compat_set();
1977
1978 bufferlist bl;
1979 encode(sb, bl);
1980
1981 ObjectStore::CollectionHandle ch = store->create_new_collection(
1982 coll_t::meta());
1983 ObjectStore::Transaction t;
1984 t.create_collection(coll_t::meta(), 0);
1985 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
1986 ret = store->queue_transaction(ch, std::move(t));
1987 if (ret) {
1988 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1989 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
1990 goto umount_store;
1991 }
1992 }
1993
1994 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
1995 if (ret) {
1996 derr << "OSD::mkfs: failed to write fsid file: error "
1997 << cpp_strerror(ret) << dendl;
1998 goto umount_store;
1999 }
2000
2001 umount_store:
2002 if (ch) {
2003 ch.reset();
2004 }
2005 store->umount();
2006 free_store:
2007 delete store;
2008 return ret;
2009 }
2010
// Persist this OSD's identity to the store's meta area: magic, osd id,
// cluster fsid, optional key material ('key' config or 'keyfile' contents),
// optional osdspec affinity, and finally the "ready" marker.
// Returns 0 on success or the first negative error encountered.
int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
{
  char val[80];
  int r;

  snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
  r = store->write_meta("magic", val);
  if (r < 0)
    return r;

  snprintf(val, sizeof(val), "%d", whoami);
  r = store->write_meta("whoami", val);
  if (r < 0)
    return r;

  cluster_fsid.print(val);
  r = store->write_meta("ceph_fsid", val);
  if (r < 0)
    return r;

  // key from config takes precedence; otherwise fall back to keyfile
  string key = cct->_conf.get_val<string>("key");
  if (key.size()) {
    r = store->write_meta("osd_key", key);
    if (r < 0)
      return r;
  } else {
    string keyfile = cct->_conf.get_val<string>("keyfile");
    if (!keyfile.empty()) {
      bufferlist keybl;
      string err;
      r = keybl.read_file(keyfile.c_str(), &err);
      if (r < 0) {
        derr << __func__ << " failed to read keyfile " << keyfile << ": "
             << err << ": " << cpp_strerror(r) << dendl;
        return r;
      }
      r = store->write_meta("osd_key", keybl.to_str());
      if (r < 0)
        return r;
    }
  }
  if (!osdspec_affinity.empty()) {
    r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
    if (r < 0)
      return r;
  }

  // "ready" is written last so its presence implies the rest succeeded
  r = store->write_meta("ready", "ready");
  if (r < 0)
    return r;

  return 0;
}
2064
2065 int OSD::peek_meta(ObjectStore *store,
2066 std::string *magic,
2067 uuid_d *cluster_fsid,
2068 uuid_d *osd_fsid,
2069 int *whoami,
2070 ceph_release_t *require_osd_release)
2071 {
2072 string val;
2073
2074 int r = store->read_meta("magic", &val);
2075 if (r < 0)
2076 return r;
2077 *magic = val;
2078
2079 r = store->read_meta("whoami", &val);
2080 if (r < 0)
2081 return r;
2082 *whoami = atoi(val.c_str());
2083
2084 r = store->read_meta("ceph_fsid", &val);
2085 if (r < 0)
2086 return r;
2087 r = cluster_fsid->parse(val.c_str());
2088 if (!r)
2089 return -EINVAL;
2090
2091 r = store->read_meta("fsid", &val);
2092 if (r < 0) {
2093 *osd_fsid = uuid_d();
2094 } else {
2095 r = osd_fsid->parse(val.c_str());
2096 if (!r)
2097 return -EINVAL;
2098 }
2099
2100 r = store->read_meta("require_osd_release", &val);
2101 if (r >= 0) {
2102 *require_osd_release = ceph_release_from_name(val);
2103 }
2104
2105 return 0;
2106 }
2107
2108
2109 #undef dout_prefix
2110 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2111
2112 // cons/des
2113
// OSD constructor: wires together the messengers, monitor/mgr clients,
// log channel, op tracker and sharded op queue.  Heavy lifting (store
// mount, map load, thread startup) is NOT done here.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  tick_timer(cct, osd_lock),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger, &mc->monmap),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  last_pg_create_epoch(0),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  service(this)
{

  // If a GSSAPI client keytab was configured, export it via the
  // environment so the Kerberos library picks it up.
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
      The default client keytab is used, if it is present and readable,
      to automatically obtain initial credentials for GSSAPI client
      applications. The principal name of the first entry in the client
      keytab is used by default when obtaining initial credentials.
      1. The KRB5_CLIENT_KTNAME environment variable.
      2. The default_client_keytab_name profile variable in [libdefaults].
      3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
                                    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  // The mon client talks over the public (client) messenger.
  monc->set_messenger(client_messenger);
  // Seed the op tracker's complaint/history thresholds from config.
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
  ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
#ifdef WITH_BLKIN
  // Name the blkin trace endpoint after this daemon, e.g. "osd.3".
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this);
    shards.push_back(one_shard);
  }
}
2214
2215 OSD::~OSD()
2216 {
2217 while (!shards.empty()) {
2218 delete shards.back();
2219 shards.pop_back();
2220 }
2221 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2222 cct->get_perfcounters_collection()->remove(logger);
2223 delete recoverystate_perf;
2224 delete logger;
2225 delete store;
2226 }
2227
2228 double OSD::get_tick_interval() const
2229 {
2230 // vary +/- 5% to avoid scrub scheduling livelocks
2231 constexpr auto delta = 0.05;
2232 return (OSD_TICK_INTERVAL *
2233 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2234 }
2235
// Process-level signal handler entry point: log the signal and begin an
// orderly shutdown.  Only SIGINT and SIGTERM are expected here.
void OSD::handle_signal(int signum)
{
  ceph_assert(signum == SIGINT || signum == SIGTERM);
  derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
  shutdown();
}
2242
2243 int OSD::pre_init()
2244 {
2245 std::lock_guard lock(osd_lock);
2246 if (is_stopping())
2247 return 0;
2248
2249 if (store->test_mount_in_use()) {
2250 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2251 << "currently in use. (Is ceph-osd already running?)" << dendl;
2252 return -EBUSY;
2253 }
2254
2255 cct->_conf.add_observer(this);
2256 return 0;
2257 }
2258
// Detect the NUMA node of the storage device and of the public/cluster
// network interfaces; if they all agree (and osd_numa_auto_affinity is
// enabled) — or if osd_numa_node is set explicitly — pin all OSD
// threads to that node's CPUs.  Always returns 0: any failure only
// downgrades to "no affinity" with a log message.
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
            << front_node << dendl;
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
              << back_node << dendl;
      if (front_node == back_node &&
          front_node == store_node) {
        dout(1) << " objectstore and network numa nodes all match" << dendl;
        if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
          numa_node = front_node;
        }
      } else if (front_node != back_node) {
        dout(1) << __func__ << " public and cluster network numa nodes do not match"
                << dendl;
      } else {
        dout(1) << __func__ << " objectstore and network numa nodes do not match"
                << dendl;
      }
    } else if (back_node == -2) {
      // -2: the interface's ports report differing numa nodes
      dout(1) << __func__ << " cluster network " << back_iface
              << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
           << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    // -2: the interface's ports report differing numa nodes
    dout(1) << __func__ << " public network " << front_iface
            << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
         << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    // Resolve the node's CPU set and apply it to every OSD thread;
    // on any failure fall back to numa_node = -1 (no affinity).
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
              << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
              << " cpus "
              << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
              << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
        r = -errno;
        derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
             << dendl;
        numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2339
2340 // asok
2341
// Admin-socket hook that forwards "ceph daemon osd.N <cmd>" requests to
// OSD::asok_command().  Only the asynchronous path is supported; the
// synchronous call() deliberately aborts.
class OSDSocketHook : public AdminSocketHook {
  OSD *osd;  // back-pointer to the owning OSD (not deleted here)
public:
  explicit OSDSocketHook(OSD *o) : osd(o) {}
  // Synchronous entry point is unused for the OSD; all commands must go
  // through call_async() below.
  int call(std::string_view prefix, const cmdmap_t& cmdmap,
	   Formatter *f,
	   std::ostream& ss,
	   bufferlist& out) override {
    ceph_abort("should use async hook");
  }
  void call_async(
    std::string_view prefix,
    const cmdmap_t& cmdmap,
    Formatter *f,
    const bufferlist& inbl,
    std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
    try {
      osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
    } catch (const TOPNSPC::common::bad_cmd_get& e) {
      // Malformed command arguments: report EINVAL back to the caller.
      bufferlist empty;
      on_finish(-EINVAL, e.what(), empty);
    }
  }
};
2366
2367 std::set<int64_t> OSD::get_mapped_pools()
2368 {
2369 std::set<int64_t> pools;
2370 std::vector<spg_t> pgids;
2371 _get_pgids(&pgids);
2372 for (const auto &pgid : pgids) {
2373 pools.insert(pgid.pool());
2374 }
2375 return pools;
2376 }
2377
2378 void OSD::asok_command(
2379 std::string_view prefix, const cmdmap_t& cmdmap,
2380 Formatter *f,
2381 const bufferlist& inbl,
2382 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2383 {
2384 int ret = 0;
2385 stringstream ss; // stderr error message stream
2386 bufferlist outbl; // if empty at end, we'll dump formatter as output
2387
2388 // --- PG commands are routed here to PG::do_command ---
2389 if (prefix == "pg" ||
2390 prefix == "query" ||
2391 prefix == "mark_unfound_lost" ||
2392 prefix == "list_unfound" ||
2393 prefix == "scrub" ||
2394 prefix == "deep_scrub"
2395 ) {
2396 string pgidstr;
2397 pg_t pgid;
2398 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2399 ss << "no pgid specified";
2400 ret = -EINVAL;
2401 goto out;
2402 }
2403 if (!pgid.parse(pgidstr.c_str())) {
2404 ss << "couldn't parse pgid '" << pgidstr << "'";
2405 ret = -EINVAL;
2406 goto out;
2407 }
2408 spg_t pcand;
2409 PGRef pg;
2410 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2411 (pg = _lookup_lock_pg(pcand))) {
2412 if (pg->is_primary()) {
2413 cmdmap_t new_cmdmap = cmdmap;
2414 try {
2415 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2416 pg->unlock();
2417 return; // the pg handler calls on_finish directly
2418 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2419 pg->unlock();
2420 ss << e.what();
2421 ret = -EINVAL;
2422 goto out;
2423 }
2424 } else {
2425 ss << "not primary for pgid " << pgid;
2426 // do not reply; they will get newer maps and realize they
2427 // need to resend.
2428 pg->unlock();
2429 ret = -EAGAIN;
2430 goto out;
2431 }
2432 } else {
2433 ss << "i don't have pgid " << pgid;
2434 ret = -ENOENT;
2435 }
2436 }
2437
2438 // --- OSD commands follow ---
2439
2440 else if (prefix == "status") {
2441 lock_guard l(osd_lock);
2442 f->open_object_section("status");
2443 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2444 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2445 f->dump_unsigned("whoami", superblock.whoami);
2446 f->dump_string("state", get_state_name(get_state()));
2447 f->dump_unsigned("oldest_map", superblock.oldest_map);
2448 f->dump_unsigned("newest_map", superblock.newest_map);
2449 f->dump_unsigned("num_pgs", num_pgs);
2450 f->close_section();
2451 } else if (prefix == "flush_journal") {
2452 store->flush_journal();
2453 } else if (prefix == "dump_ops_in_flight" ||
2454 prefix == "ops" ||
2455 prefix == "dump_blocked_ops" ||
2456 prefix == "dump_historic_ops" ||
2457 prefix == "dump_historic_ops_by_duration" ||
2458 prefix == "dump_historic_slow_ops") {
2459
2460 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2461 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2462 will start to track new ops received afterwards.";
2463
2464 set<string> filters;
2465 vector<string> filter_str;
2466 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2467 copy(filter_str.begin(), filter_str.end(),
2468 inserter(filters, filters.end()));
2469 }
2470
2471 if (prefix == "dump_ops_in_flight" ||
2472 prefix == "ops") {
2473 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2474 ss << error_str;
2475 ret = -EINVAL;
2476 goto out;
2477 }
2478 }
2479 if (prefix == "dump_blocked_ops") {
2480 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2481 ss << error_str;
2482 ret = -EINVAL;
2483 goto out;
2484 }
2485 }
2486 if (prefix == "dump_historic_ops") {
2487 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2488 ss << error_str;
2489 ret = -EINVAL;
2490 goto out;
2491 }
2492 }
2493 if (prefix == "dump_historic_ops_by_duration") {
2494 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2495 ss << error_str;
2496 ret = -EINVAL;
2497 goto out;
2498 }
2499 }
2500 if (prefix == "dump_historic_slow_ops") {
2501 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2502 ss << error_str;
2503 ret = -EINVAL;
2504 goto out;
2505 }
2506 }
2507 } else if (prefix == "dump_op_pq_state") {
2508 f->open_object_section("pq");
2509 op_shardedwq.dump(f);
2510 f->close_section();
2511 } else if (prefix == "dump_blacklist") {
2512 list<pair<entity_addr_t,utime_t> > bl;
2513 OSDMapRef curmap = service.get_osdmap();
2514
2515 f->open_array_section("blacklist");
2516 curmap->get_blacklist(&bl);
2517 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2518 it != bl.end(); ++it) {
2519 f->open_object_section("entry");
2520 f->open_object_section("entity_addr_t");
2521 it->first.dump(f);
2522 f->close_section(); //entity_addr_t
2523 it->second.localtime(f->dump_stream("expire_time"));
2524 f->close_section(); //entry
2525 }
2526 f->close_section(); //blacklist
2527 } else if (prefix == "dump_watchers") {
2528 list<obj_watch_item_t> watchers;
2529 // scan pg's
2530 vector<PGRef> pgs;
2531 _get_pgs(&pgs);
2532 for (auto& pg : pgs) {
2533 list<obj_watch_item_t> pg_watchers;
2534 pg->get_watchers(&pg_watchers);
2535 watchers.splice(watchers.end(), pg_watchers);
2536 }
2537
2538 f->open_array_section("watchers");
2539 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2540 it != watchers.end(); ++it) {
2541
2542 f->open_object_section("watch");
2543
2544 f->dump_string("namespace", it->obj.nspace);
2545 f->dump_string("object", it->obj.oid.name);
2546
2547 f->open_object_section("entity_name");
2548 it->wi.name.dump(f);
2549 f->close_section(); //entity_name_t
2550
2551 f->dump_unsigned("cookie", it->wi.cookie);
2552 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2553
2554 f->open_object_section("entity_addr_t");
2555 it->wi.addr.dump(f);
2556 f->close_section(); //entity_addr_t
2557
2558 f->close_section(); //watch
2559 }
2560
2561 f->close_section(); //watchers
2562 } else if (prefix == "dump_recovery_reservations") {
2563 f->open_object_section("reservations");
2564 f->open_object_section("local_reservations");
2565 service.local_reserver.dump(f);
2566 f->close_section();
2567 f->open_object_section("remote_reservations");
2568 service.remote_reserver.dump(f);
2569 f->close_section();
2570 f->close_section();
2571 } else if (prefix == "dump_scrub_reservations") {
2572 f->open_object_section("scrub_reservations");
2573 service.dump_scrub_reservations(f);
2574 f->close_section();
2575 } else if (prefix == "get_latest_osdmap") {
2576 get_latest_osdmap();
2577 } else if (prefix == "set_heap_property") {
2578 string property;
2579 int64_t value = 0;
2580 string error;
2581 bool success = false;
2582 if (!cmd_getval(cmdmap, "property", property)) {
2583 error = "unable to get property";
2584 success = false;
2585 } else if (!cmd_getval(cmdmap, "value", value)) {
2586 error = "unable to get value";
2587 success = false;
2588 } else if (value < 0) {
2589 error = "negative value not allowed";
2590 success = false;
2591 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2592 error = "invalid property";
2593 success = false;
2594 } else {
2595 success = true;
2596 }
2597 f->open_object_section("result");
2598 f->dump_string("error", error);
2599 f->dump_bool("success", success);
2600 f->close_section();
2601 } else if (prefix == "get_heap_property") {
2602 string property;
2603 size_t value = 0;
2604 string error;
2605 bool success = false;
2606 if (!cmd_getval(cmdmap, "property", property)) {
2607 error = "unable to get property";
2608 success = false;
2609 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2610 error = "invalid property";
2611 success = false;
2612 } else {
2613 success = true;
2614 }
2615 f->open_object_section("result");
2616 f->dump_string("error", error);
2617 f->dump_bool("success", success);
2618 f->dump_int("value", value);
2619 f->close_section();
2620 } else if (prefix == "dump_objectstore_kv_stats") {
2621 store->get_db_statistics(f);
2622 } else if (prefix == "dump_scrubs") {
2623 service.dumps_scrub(f);
2624 } else if (prefix == "calc_objectstore_db_histogram") {
2625 store->generate_db_histogram(f);
2626 } else if (prefix == "flush_store_cache") {
2627 store->flush_cache(&ss);
2628 } else if (prefix == "dump_pgstate_history") {
2629 f->open_object_section("pgstate_history");
2630 f->open_array_section("pgs");
2631 vector<PGRef> pgs;
2632 _get_pgs(&pgs);
2633 for (auto& pg : pgs) {
2634 f->open_object_section("pg");
2635 f->dump_stream("pg") << pg->pg_id;
2636 f->dump_string("currently", pg->get_current_state());
2637 pg->dump_pgstate_history(f);
2638 f->close_section();
2639 }
2640 f->close_section();
2641 f->close_section();
2642 } else if (prefix == "compact") {
2643 dout(1) << "triggering manual compaction" << dendl;
2644 auto start = ceph::coarse_mono_clock::now();
2645 store->compact();
2646 auto end = ceph::coarse_mono_clock::now();
2647 double duration = std::chrono::duration<double>(end-start).count();
2648 dout(1) << "finished manual compaction in "
2649 << duration
2650 << " seconds" << dendl;
2651 f->open_object_section("compact_result");
2652 f->dump_float("elapsed_time", duration);
2653 f->close_section();
2654 } else if (prefix == "get_mapped_pools") {
2655 f->open_array_section("mapped_pools");
2656 set<int64_t> poollist = get_mapped_pools();
2657 for (auto pool : poollist) {
2658 f->dump_int("pool_id", pool);
2659 }
2660 f->close_section();
2661 } else if (prefix == "smart") {
2662 string devid;
2663 cmd_getval(cmdmap, "devid", devid);
2664 ostringstream out;
2665 probe_smart(devid, out);
2666 outbl.append(out.str());
2667 } else if (prefix == "list_devices") {
2668 set<string> devnames;
2669 store->get_devices(&devnames);
2670 f->open_array_section("list_devices");
2671 for (auto dev : devnames) {
2672 if (dev.find("dm-") == 0) {
2673 continue;
2674 }
2675 string err;
2676 f->open_object_section("device");
2677 f->dump_string("device", "/dev/" + dev);
2678 f->dump_string("device_id", get_device_id(dev, &err));
2679 f->close_section();
2680 }
2681 f->close_section();
2682 } else if (prefix == "send_beacon") {
2683 lock_guard l(osd_lock);
2684 if (is_active()) {
2685 send_beacon(ceph::coarse_mono_clock::now());
2686 }
2687 }
2688
2689 else if (prefix == "cluster_log") {
2690 vector<string> msg;
2691 cmd_getval(cmdmap, "message", msg);
2692 if (msg.empty()) {
2693 ret = -EINVAL;
2694 ss << "ignoring empty log message";
2695 goto out;
2696 }
2697 string message = msg.front();
2698 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2699 message += " " + *a;
2700 string lvl;
2701 cmd_getval(cmdmap, "level", lvl);
2702 clog_type level = string_to_clog_type(lvl);
2703 if (level < 0) {
2704 ret = -EINVAL;
2705 ss << "unknown level '" << lvl << "'";
2706 goto out;
2707 }
2708 clog->do_log(level, message);
2709 }
2710
2711 else if (prefix == "bench") {
2712 int64_t count;
2713 int64_t bsize;
2714 int64_t osize, onum;
2715 // default count 1G, size 4MB
2716 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2717 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2718 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2719 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2720
2721 uint32_t duration = cct->_conf->osd_bench_duration;
2722
2723 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2724 // let us limit the block size because the next checks rely on it
2725 // having a sane value. If we allow any block size to be set things
2726 // can still go sideways.
2727 ss << "block 'size' values are capped at "
2728 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2729 << " a higher value, please adjust 'osd_bench_max_block_size'";
2730 ret = -EINVAL;
2731 goto out;
2732 } else if (bsize < (int64_t) (1 << 20)) {
2733 // entering the realm of small block sizes.
2734 // limit the count to a sane value, assuming a configurable amount of
2735 // IOPS and duration, so that the OSD doesn't get hung up on this,
2736 // preventing timeouts from going off
2737 int64_t max_count =
2738 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2739 if (count > max_count) {
2740 ss << "'count' values greater than " << max_count
2741 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2742 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2743 << " for " << duration << " seconds,"
2744 << " can cause ill effects on osd. "
2745 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2746 << " value if you wish to use a higher 'count'.";
2747 ret = -EINVAL;
2748 goto out;
2749 }
2750 } else {
2751 // 1MB block sizes are big enough so that we get more stuff done.
2752 // However, to avoid the osd from getting hung on this and having
2753 // timers being triggered, we are going to limit the count assuming
2754 // a configurable throughput and duration.
2755 // NOTE: max_count is the total amount of bytes that we believe we
2756 // will be able to write during 'duration' for the given
2757 // throughput. The block size hardly impacts this unless it's
2758 // way too big. Given we already check how big the block size
2759 // is, it's safe to assume everything will check out.
2760 int64_t max_count =
2761 cct->_conf->osd_bench_large_size_max_throughput * duration;
2762 if (count > max_count) {
2763 ss << "'count' values greater than " << max_count
2764 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2765 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2766 << " for " << duration << " seconds,"
2767 << " can cause ill effects on osd. "
2768 << " Please adjust 'osd_bench_large_size_max_throughput'"
2769 << " with a higher value if you wish to use a higher 'count'.";
2770 ret = -EINVAL;
2771 goto out;
2772 }
2773 }
2774
2775 if (osize && bsize > osize)
2776 bsize = osize;
2777
2778 dout(1) << " bench count " << count
2779 << " bsize " << byte_u_t(bsize) << dendl;
2780
2781 ObjectStore::Transaction cleanupt;
2782
2783 if (osize && onum) {
2784 bufferlist bl;
2785 bufferptr bp(osize);
2786 bp.zero();
2787 bl.push_back(std::move(bp));
2788 bl.rebuild_page_aligned();
2789 for (int i=0; i<onum; ++i) {
2790 char nm[30];
2791 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2792 object_t oid(nm);
2793 hobject_t soid(sobject_t(oid, 0));
2794 ObjectStore::Transaction t;
2795 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
2796 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2797 cleanupt.remove(coll_t(), ghobject_t(soid));
2798 }
2799 }
2800
2801 bufferlist bl;
2802 bufferptr bp(bsize);
2803 bp.zero();
2804 bl.push_back(std::move(bp));
2805 bl.rebuild_page_aligned();
2806
2807 {
2808 C_SaferCond waiter;
2809 if (!service.meta_ch->flush_commit(&waiter)) {
2810 waiter.wait();
2811 }
2812 }
2813
2814 utime_t start = ceph_clock_now();
2815 for (int64_t pos = 0; pos < count; pos += bsize) {
2816 char nm[30];
2817 unsigned offset = 0;
2818 if (onum && osize) {
2819 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2820 offset = rand() % (osize / bsize) * bsize;
2821 } else {
2822 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2823 }
2824 object_t oid(nm);
2825 hobject_t soid(sobject_t(oid, 0));
2826 ObjectStore::Transaction t;
2827 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2828 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2829 if (!onum || !osize)
2830 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2831 }
2832
2833 {
2834 C_SaferCond waiter;
2835 if (!service.meta_ch->flush_commit(&waiter)) {
2836 waiter.wait();
2837 }
2838 }
2839 utime_t end = ceph_clock_now();
2840
2841 // clean up
2842 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2843 {
2844 C_SaferCond waiter;
2845 if (!service.meta_ch->flush_commit(&waiter)) {
2846 waiter.wait();
2847 }
2848 }
2849
2850 double elapsed = end - start;
2851 double rate = count / elapsed;
2852 double iops = rate / bsize;
2853 f->open_object_section("osd_bench_results");
2854 f->dump_int("bytes_written", count);
2855 f->dump_int("blocksize", bsize);
2856 f->dump_float("elapsed_sec", elapsed);
2857 f->dump_float("bytes_per_sec", rate);
2858 f->dump_float("iops", iops);
2859 f->close_section();
2860 }
2861
2862 else if (prefix == "flush_pg_stats") {
2863 mgrc.send_pgstats();
2864 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2865 }
2866
2867 else if (prefix == "heap") {
2868 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2869 }
2870
2871 else if (prefix == "debug dump_missing") {
2872 f->open_array_section("pgs");
2873 vector<PGRef> pgs;
2874 _get_pgs(&pgs);
2875 for (auto& pg : pgs) {
2876 string s = stringify(pg->pg_id);
2877 f->open_array_section(s.c_str());
2878 pg->lock();
2879 pg->dump_missing(f);
2880 pg->unlock();
2881 f->close_section();
2882 }
2883 f->close_section();
2884 }
2885
2886 else if (prefix == "debug kick_recovery_wq") {
2887 int64_t delay;
2888 cmd_getval(cmdmap, "delay", delay);
2889 ostringstream oss;
2890 oss << delay;
2891 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2892 if (ret != 0) {
2893 ss << "kick_recovery_wq: error setting "
2894 << "osd_recovery_delay_start to '" << delay << "': error "
2895 << ret;
2896 goto out;
2897 }
2898 cct->_conf.apply_changes(nullptr);
2899 ss << "kicking recovery queue. set osd_recovery_delay_start "
2900 << "to " << cct->_conf->osd_recovery_delay_start;
2901 }
2902
2903 else if (prefix == "cpu_profiler") {
2904 ostringstream ds;
2905 string arg;
2906 cmd_getval(cmdmap, "arg", arg);
2907 vector<string> argvec;
2908 get_str_vec(arg, argvec);
2909 cpu_profiler_handle_command(argvec, ds);
2910 outbl.append(ds.str());
2911 }
2912
2913 else if (prefix == "dump_pg_recovery_stats") {
2914 lock_guard l(osd_lock);
2915 pg_recovery_stats.dump_formatted(f);
2916 }
2917
2918 else if (prefix == "reset_pg_recovery_stats") {
2919 lock_guard l(osd_lock);
2920 pg_recovery_stats.reset();
2921 }
2922
2923 else if (prefix == "perf histogram dump") {
2924 std::string logger;
2925 std::string counter;
2926 cmd_getval(cmdmap, "logger", logger);
2927 cmd_getval(cmdmap, "counter", counter);
2928 cct->get_perfcounters_collection()->dump_formatted_histograms(
2929 f, false, logger, counter);
2930 }
2931
2932 else if (prefix == "cache drop") {
2933 lock_guard l(osd_lock);
2934 dout(20) << "clearing all caches" << dendl;
2935 // Clear the objectstore's cache - onode and buffer for Bluestore,
2936 // system's pagecache for Filestore
2937 ret = store->flush_cache(&ss);
2938 if (ret < 0) {
2939 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2940 goto out;
2941 }
2942 // Clear the objectcontext cache (per PG)
2943 vector<PGRef> pgs;
2944 _get_pgs(&pgs);
2945 for (auto& pg: pgs) {
2946 pg->clear_cache();
2947 }
2948 }
2949
2950 else if (prefix == "cache status") {
2951 lock_guard l(osd_lock);
2952 int obj_ctx_count = 0;
2953 vector<PGRef> pgs;
2954 _get_pgs(&pgs);
2955 for (auto& pg: pgs) {
2956 obj_ctx_count += pg->get_cache_obj_count();
2957 }
2958 f->open_object_section("cache_status");
2959 f->dump_int("object_ctx", obj_ctx_count);
2960 store->dump_cache_stats(f);
2961 f->close_section();
2962 }
2963
2964 else if (prefix == "scrub_purged_snaps") {
2965 lock_guard l(osd_lock);
2966 scrub_purged_snaps();
2967 }
2968
2969 else if (prefix == "dump_osd_network") {
2970 lock_guard l(osd_lock);
2971 int64_t value = 0;
2972 if (!(cmd_getval(cmdmap, "value", value))) {
2973 // Convert milliseconds to microseconds
2974 value = static_cast<double>(g_conf().get_val<double>(
2975 "mon_warn_on_slow_ping_time")) * 1000;
2976 if (value == 0) {
2977 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2978 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2979 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2980 }
2981 } else {
2982 // Convert user input to microseconds
2983 value *= 1000;
2984 }
2985 if (value < 0) value = 0;
2986
2987 struct osd_ping_time_t {
2988 uint32_t pingtime;
2989 int to;
2990 bool back;
2991 std::array<uint32_t,3> times;
2992 std::array<uint32_t,3> min;
2993 std::array<uint32_t,3> max;
2994 uint32_t last;
2995 uint32_t last_update;
2996
2997 bool operator<(const osd_ping_time_t& rhs) const {
2998 if (pingtime < rhs.pingtime)
2999 return true;
3000 if (pingtime > rhs.pingtime)
3001 return false;
3002 if (to < rhs.to)
3003 return true;
3004 if (to > rhs.to)
3005 return false;
3006 return back;
3007 }
3008 };
3009
3010 set<osd_ping_time_t> sorted;
3011 // Get pingtimes under lock and not on the stack
3012 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3013 service.get_hb_pingtime(pingtimes);
3014 for (auto j : *pingtimes) {
3015 if (j.second.last_update == 0)
3016 continue;
3017 osd_ping_time_t item;
3018 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3019 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3020 if (item.pingtime >= value) {
3021 item.to = j.first;
3022 item.times[0] = j.second.back_pingtime[0];
3023 item.times[1] = j.second.back_pingtime[1];
3024 item.times[2] = j.second.back_pingtime[2];
3025 item.min[0] = j.second.back_min[0];
3026 item.min[1] = j.second.back_min[1];
3027 item.min[2] = j.second.back_min[2];
3028 item.max[0] = j.second.back_max[0];
3029 item.max[1] = j.second.back_max[1];
3030 item.max[2] = j.second.back_max[2];
3031 item.last = j.second.back_last;
3032 item.back = true;
3033 item.last_update = j.second.last_update;
3034 sorted.emplace(item);
3035 }
3036 if (j.second.front_last == 0)
3037 continue;
3038 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3039 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3040 if (item.pingtime >= value) {
3041 item.to = j.first;
3042 item.times[0] = j.second.front_pingtime[0];
3043 item.times[1] = j.second.front_pingtime[1];
3044 item.times[2] = j.second.front_pingtime[2];
3045 item.min[0] = j.second.front_min[0];
3046 item.min[1] = j.second.front_min[1];
3047 item.min[2] = j.second.front_min[2];
3048 item.max[0] = j.second.front_max[0];
3049 item.max[1] = j.second.front_max[1];
3050 item.max[2] = j.second.front_max[2];
3051 item.last = j.second.front_last;
3052 item.last_update = j.second.last_update;
3053 item.back = false;
3054 sorted.emplace(item);
3055 }
3056 }
3057 delete pingtimes;
3058 //
3059 // Network ping times (1min 5min 15min)
3060 f->open_object_section("network_ping_times");
3061 f->dump_int("threshold", value / 1000);
3062 f->open_array_section("entries");
3063 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3064 ceph_assert(sitem.pingtime >= value);
3065 f->open_object_section("entry");
3066
3067 const time_t lu(sitem.last_update);
3068 char buffer[26];
3069 string lustr(ctime_r(&lu, buffer));
3070 lustr.pop_back(); // Remove trailing \n
3071 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3072 f->dump_string("last update", lustr);
3073 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3074 f->dump_int("from osd", whoami);
3075 f->dump_int("to osd", sitem.to);
3076 f->dump_string("interface", (sitem.back ? "back" : "front"));
3077 f->open_object_section("average");
3078 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3079 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3080 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3081 f->close_section(); // average
3082 f->open_object_section("min");
3083 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3084 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3085 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3086 f->close_section(); // min
3087 f->open_object_section("max");
3088 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3089 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3090 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3091 f->close_section(); // max
3092 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3093 f->close_section(); // entry
3094 }
3095 f->close_section(); // entries
3096 f->close_section(); // network_ping_times
3097 } else {
3098 ceph_abort_msg("broken asok registration");
3099 }
3100
3101 out:
3102 on_finish(ret, ss.str(), outbl);
3103 }
3104
3105 class TestOpsSocketHook : public AdminSocketHook {
3106 OSDService *service;
3107 ObjectStore *store;
3108 public:
3109 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3110 int call(std::string_view command, const cmdmap_t& cmdmap,
3111 Formatter *f,
3112 std::ostream& errss,
3113 bufferlist& out) override {
3114 int r = 0;
3115 stringstream outss;
3116 try {
3117 test_ops(service, store, command, cmdmap, outss);
3118 out.append(outss);
3119 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3120 errss << e.what();
3121 r = -EINVAL;
3122 }
3123 return r;
3124 }
3125 void test_ops(OSDService *service, ObjectStore *store,
3126 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3127
3128 };
3129
3130 class OSD::C_Tick : public Context {
3131 OSD *osd;
3132 public:
3133 explicit C_Tick(OSD *o) : osd(o) {}
3134 void finish(int r) override {
3135 osd->tick();
3136 }
3137 };
3138
3139 class OSD::C_Tick_WithoutOSDLock : public Context {
3140 OSD *osd;
3141 public:
3142 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3143 void finish(int r) override {
3144 osd->tick_without_osd_lock();
3145 }
3146 };
3147
3148 int OSD::enable_disable_fuse(bool stop)
3149 {
3150 #ifdef HAVE_LIBFUSE
3151 int r;
3152 string mntpath = cct->_conf->osd_data + "/fuse";
3153 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3154 dout(1) << __func__ << " disabling" << dendl;
3155 fuse_store->stop();
3156 delete fuse_store;
3157 fuse_store = NULL;
3158 r = ::rmdir(mntpath.c_str());
3159 if (r < 0) {
3160 r = -errno;
3161 derr << __func__ << " failed to rmdir " << mntpath << ": "
3162 << cpp_strerror(r) << dendl;
3163 return r;
3164 }
3165 return 0;
3166 }
3167 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3168 dout(1) << __func__ << " enabling" << dendl;
3169 r = ::mkdir(mntpath.c_str(), 0700);
3170 if (r < 0)
3171 r = -errno;
3172 if (r < 0 && r != -EEXIST) {
3173 derr << __func__ << " unable to create " << mntpath << ": "
3174 << cpp_strerror(r) << dendl;
3175 return r;
3176 }
3177 fuse_store = new FuseStore(store, mntpath);
3178 r = fuse_store->start();
3179 if (r < 0) {
3180 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3181 delete fuse_store;
3182 fuse_store = NULL;
3183 return r;
3184 }
3185 }
3186 #endif // HAVE_LIBFUSE
3187 return 0;
3188 }
3189
3190 size_t OSD::get_num_cache_shards()
3191 {
3192 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3193 }
3194
3195 int OSD::get_num_op_shards()
3196 {
3197 if (cct->_conf->osd_op_num_shards)
3198 return cct->_conf->osd_op_num_shards;
3199 if (store_is_rotational)
3200 return cct->_conf->osd_op_num_shards_hdd;
3201 else
3202 return cct->_conf->osd_op_num_shards_ssd;
3203 }
3204
3205 int OSD::get_num_op_threads()
3206 {
3207 if (cct->_conf->osd_op_num_threads_per_shard)
3208 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3209 if (store_is_rotational)
3210 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3211 else
3212 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3213 }
3214
3215 float OSD::get_osd_recovery_sleep()
3216 {
3217 if (cct->_conf->osd_recovery_sleep)
3218 return cct->_conf->osd_recovery_sleep;
3219 if (!store_is_rotational && !journal_is_rotational)
3220 return cct->_conf->osd_recovery_sleep_ssd;
3221 else if (store_is_rotational && !journal_is_rotational)
3222 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3223 else
3224 return cct->_conf->osd_recovery_sleep_hdd;
3225 }
3226
3227 float OSD::get_osd_delete_sleep()
3228 {
3229 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3230 if (osd_delete_sleep > 0)
3231 return osd_delete_sleep;
3232 if (!store_is_rotational && !journal_is_rotational)
3233 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3234 if (store_is_rotational && !journal_is_rotational)
3235 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3236 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3237 }
3238
3239 int OSD::get_recovery_max_active()
3240 {
3241 if (cct->_conf->osd_recovery_max_active)
3242 return cct->_conf->osd_recovery_max_active;
3243 if (store_is_rotational)
3244 return cct->_conf->osd_recovery_max_active_hdd;
3245 else
3246 return cct->_conf->osd_recovery_max_active_ssd;
3247 }
3248
3249 float OSD::get_osd_snap_trim_sleep()
3250 {
3251 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3252 if (osd_snap_trim_sleep > 0)
3253 return osd_snap_trim_sleep;
3254 if (!store_is_rotational && !journal_is_rotational)
3255 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3256 if (store_is_rotational && !journal_is_rotational)
3257 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3258 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3259 }
3260
/**
 * Start up the OSD daemon.
 *
 * Mounts the object store, validates the superblock and compat features,
 * loads the current OSDMap and PGs, wires up messengers/monc/mgrc, starts
 * worker threads and timers, authenticates with the monitors, and kicks
 * off the boot process.
 *
 * Called with no locks held; takes osd_lock for most of the body and
 * drops it around the (blocking) monitor authentication steps.
 *
 * @return 0 on success (or if already stopping); negative error code on
 *         failure, after unmounting and deleting the store (see `out:`).
 */
int OSD::init()
{
  OSDMapRef osdmap;
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  // Timers and finishers must exist before anything can schedule on them.
  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  {
    // Remember the release recorded on disk (may be empty on older stores).
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = ceph_release_from_name(val);
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_cache_shards());

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
	  << dendl;

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  // sanity check long object name handling
  {
    // Build a maximally-long hobject and let the backend veto it.
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << " osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << " osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // Refuse to run against an on-disk format newer than this binary supports.
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  // The on-disk superblock must belong to this OSD id.
  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  startup_time = ceph::mono_clock::now();

  // load up "current" osdmap
  assert_warn(!get_osdmap());
  if (get_osdmap()) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);
  set_osdmap(osdmap);

  // make sure we don't have legacy pgs deleting
  {
    // A PG collection for a pool absent from the map must have its final
    // pool info persisted; otherwise the deletion predates this format.
    vector<coll_t> ls;
    int r = store->list_collections(ls);
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  // Merge this binary's initial compat set into the superblock; persist
  // (and run any feature-triggered conversions) if anything was added.
  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // Are we adding SNAPMAPPER2?
    if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
      dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
	      << dendl;
      auto ch = service.meta_ch;
      auto hoid = make_snapmapper_oid();
      unsigned max = cct->_conf->osd_target_transaction_size;
      r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
      if (r < 0)
	goto out;
    }
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }
  // likewise for the purged_snaps object
  if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
    dout(10) << "init creating/touching purged_snaps object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // Optionally preload all rados object classes; failures are non-fatal.
  if (cct->_conf->osd_open_classes_on_start) {
    int r = ClassHandler::get_instance().open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;

  create_logger();

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger auth_client is already set up by monc.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  // Server-side auth only on the messengers that accept connections.
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
		      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  // Hook up mgr stats/perf-query callbacks before mgrc starts.
  mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const ConfigPayload &config_payload) {
      set_perf_queries(config_payload);
    },
    [this] {
      return get_perf_reports();
    });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter.get());

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // Prime pending PG splits/merges for each loaded PG against the
  // current map, so sharded queues know about children in advance.
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      std::scoped_lock l{*pg};
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	// prime_splits consumes entries; all must have been claimed.
	assert(new_children.empty());
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	assert(merge_pgs.empty());
      }
    }
  }

  osd_op_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // Drop osd_lock across the blocking monitor handshake below.
  osd_lock.unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
	 << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  osd_lock.lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;

out:
  // Failure path: tear down the fuse mount and the mounted store.
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
3662
3663 void OSD::final_init()
3664 {
3665 AdminSocket *admin_socket = cct->get_admin_socket();
3666 asok_hook = new OSDSocketHook(this);
3667 int r = admin_socket->register_command("status", asok_hook,
3668 "high-level status of OSD");
3669 ceph_assert(r == 0);
3670 r = admin_socket->register_command("flush_journal",
3671 asok_hook,
3672 "flush the journal to permanent store");
3673 ceph_assert(r == 0);
3674 r = admin_socket->register_command("dump_ops_in_flight " \
3675 "name=filterstr,type=CephString,n=N,req=false",
3676 asok_hook,
3677 "show the ops currently in flight");
3678 ceph_assert(r == 0);
3679 r = admin_socket->register_command("ops " \
3680 "name=filterstr,type=CephString,n=N,req=false",
3681 asok_hook,
3682 "show the ops currently in flight");
3683 ceph_assert(r == 0);
3684 r = admin_socket->register_command("dump_blocked_ops " \
3685 "name=filterstr,type=CephString,n=N,req=false",
3686 asok_hook,
3687 "show the blocked ops currently in flight");
3688 ceph_assert(r == 0);
3689 r = admin_socket->register_command("dump_historic_ops " \
3690 "name=filterstr,type=CephString,n=N,req=false",
3691 asok_hook,
3692 "show recent ops");
3693 ceph_assert(r == 0);
3694 r = admin_socket->register_command("dump_historic_slow_ops " \
3695 "name=filterstr,type=CephString,n=N,req=false",
3696 asok_hook,
3697 "show slowest recent ops");
3698 ceph_assert(r == 0);
3699 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3700 "name=filterstr,type=CephString,n=N,req=false",
3701 asok_hook,
3702 "show slowest recent ops, sorted by duration");
3703 ceph_assert(r == 0);
3704 r = admin_socket->register_command("dump_op_pq_state",
3705 asok_hook,
3706 "dump op priority queue state");
3707 ceph_assert(r == 0);
3708 r = admin_socket->register_command("dump_blacklist",
3709 asok_hook,
3710 "dump blacklisted clients and times");
3711 ceph_assert(r == 0);
3712 r = admin_socket->register_command("dump_watchers",
3713 asok_hook,
3714 "show clients which have active watches,"
3715 " and on which objects");
3716 ceph_assert(r == 0);
3717 r = admin_socket->register_command("dump_recovery_reservations",
3718 asok_hook,
3719 "show recovery reservations");
3720 ceph_assert(r == 0);
3721 r = admin_socket->register_command("dump_scrub_reservations",
3722 asok_hook,
3723 "show scrub reservations");
3724 ceph_assert(r == 0);
3725 r = admin_socket->register_command("get_latest_osdmap",
3726 asok_hook,
3727 "force osd to update the latest map from "
3728 "the mon");
3729 ceph_assert(r == 0);
3730
3731 r = admin_socket->register_command("set_heap_property " \
3732 "name=property,type=CephString " \
3733 "name=value,type=CephInt",
3734 asok_hook,
3735 "update malloc extension heap property");
3736 ceph_assert(r == 0);
3737
3738 r = admin_socket->register_command("get_heap_property " \
3739 "name=property,type=CephString",
3740 asok_hook,
3741 "get malloc extension heap property");
3742 ceph_assert(r == 0);
3743
3744 r = admin_socket->register_command("dump_objectstore_kv_stats",
3745 asok_hook,
3746 "print statistics of kvdb which used by bluestore");
3747 ceph_assert(r == 0);
3748
3749 r = admin_socket->register_command("dump_scrubs",
3750 asok_hook,
3751 "print scheduled scrubs");
3752 ceph_assert(r == 0);
3753
3754 r = admin_socket->register_command("calc_objectstore_db_histogram",
3755 asok_hook,
3756 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3757 ceph_assert(r == 0);
3758
3759 r = admin_socket->register_command("flush_store_cache",
3760 asok_hook,
3761 "Flush bluestore internal cache");
3762 ceph_assert(r == 0);
3763 r = admin_socket->register_command("dump_pgstate_history",
3764 asok_hook,
3765 "show recent state history");
3766 ceph_assert(r == 0);
3767
3768 r = admin_socket->register_command("compact",
3769 asok_hook,
3770 "Commpact object store's omap."
3771 " WARNING: Compaction probably slows your requests");
3772 ceph_assert(r == 0);
3773
3774 r = admin_socket->register_command("get_mapped_pools",
3775 asok_hook,
3776 "dump pools whose PG(s) are mapped to this OSD.");
3777
3778 ceph_assert(r == 0);
3779
3780 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3781 asok_hook,
3782 "probe OSD devices for SMART data.");
3783
3784 ceph_assert(r == 0);
3785
3786 r = admin_socket->register_command("list_devices",
3787 asok_hook,
3788 "list OSD devices.");
3789 r = admin_socket->register_command("send_beacon",
3790 asok_hook,
3791 "send OSD beacon to mon immediately");
3792
3793 r = admin_socket->register_command(
3794 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3795 "Dump osd heartbeat network ping times");
3796 ceph_assert(r == 0);
3797
3798 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3799 // Note: pools are CephString instead of CephPoolname because
3800 // these commands traditionally support both pool names and numbers
3801 r = admin_socket->register_command(
3802 "setomapval " \
3803 "name=pool,type=CephString " \
3804 "name=objname,type=CephObjectname " \
3805 "name=key,type=CephString "\
3806 "name=val,type=CephString",
3807 test_ops_hook,
3808 "set omap key");
3809 ceph_assert(r == 0);
3810 r = admin_socket->register_command(
3811 "rmomapkey " \
3812 "name=pool,type=CephString " \
3813 "name=objname,type=CephObjectname " \
3814 "name=key,type=CephString",
3815 test_ops_hook,
3816 "remove omap key");
3817 ceph_assert(r == 0);
3818 r = admin_socket->register_command(
3819 "setomapheader " \
3820 "name=pool,type=CephString " \
3821 "name=objname,type=CephObjectname " \
3822 "name=header,type=CephString",
3823 test_ops_hook,
3824 "set omap header");
3825 ceph_assert(r == 0);
3826
3827 r = admin_socket->register_command(
3828 "getomap " \
3829 "name=pool,type=CephString " \
3830 "name=objname,type=CephObjectname",
3831 test_ops_hook,
3832 "output entire object map");
3833 ceph_assert(r == 0);
3834
3835 r = admin_socket->register_command(
3836 "truncobj " \
3837 "name=pool,type=CephString " \
3838 "name=objname,type=CephObjectname " \
3839 "name=len,type=CephInt",
3840 test_ops_hook,
3841 "truncate object to length");
3842 ceph_assert(r == 0);
3843
3844 r = admin_socket->register_command(
3845 "injectdataerr " \
3846 "name=pool,type=CephString " \
3847 "name=objname,type=CephObjectname " \
3848 "name=shardid,type=CephInt,req=false,range=0|255",
3849 test_ops_hook,
3850 "inject data error to an object");
3851 ceph_assert(r == 0);
3852
3853 r = admin_socket->register_command(
3854 "injectmdataerr " \
3855 "name=pool,type=CephString " \
3856 "name=objname,type=CephObjectname " \
3857 "name=shardid,type=CephInt,req=false,range=0|255",
3858 test_ops_hook,
3859 "inject metadata error to an object");
3860 ceph_assert(r == 0);
3861 r = admin_socket->register_command(
3862 "set_recovery_delay " \
3863 "name=utime,type=CephInt,req=false",
3864 test_ops_hook,
3865 "Delay osd recovery by specified seconds");
3866 ceph_assert(r == 0);
3867 r = admin_socket->register_command(
3868 "injectfull " \
3869 "name=type,type=CephString,req=false " \
3870 "name=count,type=CephInt,req=false ",
3871 test_ops_hook,
3872 "Inject a full disk (optional count times)");
3873 ceph_assert(r == 0);
3874 r = admin_socket->register_command(
3875 "bench " \
3876 "name=count,type=CephInt,req=false " \
3877 "name=size,type=CephInt,req=false " \
3878 "name=object_size,type=CephInt,req=false " \
3879 "name=object_num,type=CephInt,req=false ",
3880 asok_hook,
3881 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3882 "(default count=1G default size=4MB). Results in log.");
3883 ceph_assert(r == 0);
3884 r = admin_socket->register_command(
3885 "cluster_log " \
3886 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3887 "name=message,type=CephString,n=N",
3888 asok_hook,
3889 "log a message to the cluster log");
3890 ceph_assert(r == 0);
3891 r = admin_socket->register_command(
3892 "flush_pg_stats",
3893 asok_hook,
3894 "flush pg stats");
3895 ceph_assert(r == 0);
3896 r = admin_socket->register_command(
3897 "heap " \
3898 "name=heapcmd,type=CephChoices,strings=" \
3899 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3900 "name=value,type=CephString,req=false",
3901 asok_hook,
3902 "show heap usage info (available only if compiled with tcmalloc)");
3903 ceph_assert(r == 0);
3904 r = admin_socket->register_command(
3905 "debug dump_missing " \
3906 "name=filename,type=CephFilepath",
3907 asok_hook,
3908 "dump missing objects to a named file");
3909 ceph_assert(r == 0);
3910 r = admin_socket->register_command(
3911 "debug kick_recovery_wq " \
3912 "name=delay,type=CephInt,range=0",
3913 asok_hook,
3914 "set osd_recovery_delay_start to <val>");
3915 ceph_assert(r == 0);
3916 r = admin_socket->register_command(
3917 "cpu_profiler " \
3918 "name=arg,type=CephChoices,strings=status|flush",
3919 asok_hook,
3920 "run cpu profiling on daemon");
3921 ceph_assert(r == 0);
3922 r = admin_socket->register_command(
3923 "dump_pg_recovery_stats",
3924 asok_hook,
3925 "dump pg recovery statistics");
3926 ceph_assert(r == 0);
3927 r = admin_socket->register_command(
3928 "reset_pg_recovery_stats",
3929 asok_hook,
3930 "reset pg recovery statistics");
3931 ceph_assert(r == 0);
3932 r = admin_socket->register_command(
3933 "cache drop",
3934 asok_hook,
3935 "Drop all OSD caches");
3936 ceph_assert(r == 0);
3937 r = admin_socket->register_command(
3938 "cache status",
3939 asok_hook,
3940 "Get OSD caches statistics");
3941 ceph_assert(r == 0);
3942 r = admin_socket->register_command(
3943 "scrub_purged_snaps",
3944 asok_hook,
3945 "Scrub purged_snaps vs snapmapper index");
3946 ceph_assert(r == 0);
3947
3948 // -- pg commands --
3949 // old form: ceph pg <pgid> command ...
3950 r = admin_socket->register_command(
3951 "pg " \
3952 "name=pgid,type=CephPgid " \
3953 "name=cmd,type=CephChoices,strings=query",
3954 asok_hook,
3955 "");
3956 ceph_assert(r == 0);
3957 r = admin_socket->register_command(
3958 "pg " \
3959 "name=pgid,type=CephPgid " \
3960 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3961 "name=mulcmd,type=CephChoices,strings=revert|delete",
3962 asok_hook,
3963 "");
3964 ceph_assert(r == 0);
3965 r = admin_socket->register_command(
3966 "pg " \
3967 "name=pgid,type=CephPgid " \
3968 "name=cmd,type=CephChoices,strings=list_unfound " \
3969 "name=offset,type=CephString,req=false",
3970 asok_hook,
3971 "");
3972 ceph_assert(r == 0);
3973 r = admin_socket->register_command(
3974 "pg " \
3975 "name=pgid,type=CephPgid " \
3976 "name=cmd,type=CephChoices,strings=scrub " \
3977 "name=time,type=CephInt,req=false",
3978 asok_hook,
3979 "");
3980 ceph_assert(r == 0);
3981 r = admin_socket->register_command(
3982 "pg " \
3983 "name=pgid,type=CephPgid " \
3984 "name=cmd,type=CephChoices,strings=deep_scrub " \
3985 "name=time,type=CephInt,req=false",
3986 asok_hook,
3987 "");
3988 ceph_assert(r == 0);
3989 // new form: tell <pgid> <cmd> for both cli and rest
3990 r = admin_socket->register_command(
3991 "query",
3992 asok_hook,
3993 "show details of a specific pg");
3994 ceph_assert(r == 0);
3995 r = admin_socket->register_command(
3996 "mark_unfound_lost " \
3997 "name=pgid,type=CephPgid,req=false " \
3998 "name=mulcmd,type=CephChoices,strings=revert|delete",
3999 asok_hook,
4000 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4001 ceph_assert(r == 0);
4002 r = admin_socket->register_command(
4003 "list_unfound " \
4004 "name=pgid,type=CephPgid,req=false " \
4005 "name=offset,type=CephString,req=false",
4006 asok_hook,
4007 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4008 ceph_assert(r == 0);
4009 r = admin_socket->register_command(
4010 "scrub " \
4011 "name=pgid,type=CephPgid,req=false " \
4012 "name=time,type=CephInt,req=false",
4013 asok_hook,
4014 "Trigger a scheduled scrub ");
4015 ceph_assert(r == 0);
4016 r = admin_socket->register_command(
4017 "deep_scrub " \
4018 "name=pgid,type=CephPgid,req=false " \
4019 "name=time,type=CephInt,req=false",
4020 asok_hook,
4021 "Trigger a scheduled deep scrub ");
4022 ceph_assert(r == 0);
4023 }
4024
4025 void OSD::create_logger()
4026 {
4027 dout(10) << "create_logger" << dendl;
4028
4029 logger = build_osd_logger(cct);
4030 cct->get_perfcounters_collection()->add(logger);
4031 }
4032
4033 void OSD::create_recoverystate_perf()
4034 {
4035 dout(10) << "create_recoverystate_perf" << dendl;
4036
4037 recoverystate_perf = build_recoverystate_perf(cct);
4038 cct->get_perfcounters_collection()->add(recoverystate_perf);
4039 }
4040
// Cleanly stop the OSD.  The sequence here is order-sensitive: each stage
// quiesces the producers that feed the next one (work queues before PGs,
// PGs before the store, the store before the messengers).  Returns the
// result of the final superblock write (0 or negative errno), unless the
// fast-shutdown path exits the process outright.
int OSD::shutdown()
{
  // Fast path: skip the orderly teardown entirely and let the process die;
  // used to minimize shutdown latency at the cost of a clean unmount.
  if (cct->_conf->osd_fast_shutdown) {
    derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
    cct->_log->flush();
    _exit(0);
  }

  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.lock();
  if (is_stopping()) {
    // another thread won the race to shut down
    osd_lock.unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging: optionally crank all relevant debug levels to 100 so the
  // teardown sequence is fully traced in the log.
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs.  this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs: tell every live PG to stop processing.
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands so the admin socket no longer dispatches into us
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  osd_lock.unlock();

  // Stop the heartbeat thread: set the flag, wake it, then join.
  {
    std::lock_guard l{heartbeat_lock};
    heartbeat_stop = true;
    heartbeat_cond.notify_all();
    heartbeat_peers.clear();
  }
  heartbeat_thread.join();

  // Drop all existing heartbeat connections on both interfaces.
  hb_back_server_messenger->mark_down_all();
  hb_front_server_messenger->mark_down_all();
  hb_front_client_messenger->mark_down_all();
  hb_back_client_messenger->mark_down_all();

  // Quiesce and stop the sharded op threadpool.
  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch: record that we are going down cleanly so the next
  // boot knows how far this OSD was clean through.
  dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = get_osdmap_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs: detach every PG from its shard slot and drop our refs.
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
	// something still holds a reference to this pg; complain loudly and
	// optionally abort so the leak can be debugged.
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // remove_observer may block on config callbacks; drop osd_lock around it
  osd_lock.unlock();
  cct->_conf.remove_observer(this);
  osd_lock.lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.unlock();
  // Release our osdmap references (global and per-shard).
  {
    std::unique_lock l{map_lock};
    set_osdmap(OSDMapRef());
  }
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  // Unmount and destroy the object store while holding osd_lock.
  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  ClassHandler::get_instance().shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  // propagate the superblock-write result
  return r;
}
4234
4235 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4236 {
4237 bool created = false;
4238 while (true) {
4239 dout(10) << __func__ << " cmd: " << cmd << dendl;
4240 vector<string> vcmd{cmd};
4241 bufferlist inbl;
4242 C_SaferCond w;
4243 string outs;
4244 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4245 int r = w.wait();
4246 if (r < 0) {
4247 if (r == -ENOENT && !created) {
4248 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4249 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4250 vector<string> vnewcmd{newcmd};
4251 bufferlist inbl;
4252 C_SaferCond w;
4253 string outs;
4254 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4255 int r = w.wait();
4256 if (r < 0) {
4257 derr << __func__ << " fail: osd does not exist and created failed: "
4258 << cpp_strerror(r) << dendl;
4259 return r;
4260 }
4261 created = true;
4262 continue;
4263 }
4264 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4265 return r;
4266 }
4267 break;
4268 }
4269
4270 return 0;
4271 }
4272
4273 int OSD::update_crush_location()
4274 {
4275 if (!cct->_conf->osd_crush_update_on_start) {
4276 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4277 return 0;
4278 }
4279
4280 char weight[32];
4281 if (cct->_conf->osd_crush_initial_weight >= 0) {
4282 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4283 } else {
4284 struct store_statfs_t st;
4285 osd_alert_list_t alerts;
4286 int r = store->statfs(&st, &alerts);
4287 if (r < 0) {
4288 derr << "statfs: " << cpp_strerror(r) << dendl;
4289 return r;
4290 }
4291 snprintf(weight, sizeof(weight), "%.4lf",
4292 std::max(.00001,
4293 double(st.total) /
4294 double(1ull << 40 /* TB */)));
4295 }
4296
4297 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4298
4299 string cmd =
4300 string("{\"prefix\": \"osd crush create-or-move\", ") +
4301 string("\"id\": ") + stringify(whoami) + ", " +
4302 string("\"weight\":") + weight + ", " +
4303 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4304 return mon_cmd_maybe_osd_create(cmd);
4305 }
4306
4307 int OSD::update_crush_device_class()
4308 {
4309 if (!cct->_conf->osd_class_update_on_start) {
4310 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4311 return 0;
4312 }
4313
4314 string device_class;
4315 int r = store->read_meta("crush_device_class", &device_class);
4316 if (r < 0 || device_class.empty()) {
4317 device_class = store->get_default_device_class();
4318 }
4319
4320 if (device_class.empty()) {
4321 dout(20) << __func__ << " no device class stored locally" << dendl;
4322 return 0;
4323 }
4324
4325 string cmd =
4326 string("{\"prefix\": \"osd crush set-device-class\", ") +
4327 string("\"class\": \"") + device_class + string("\", ") +
4328 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4329
4330 r = mon_cmd_maybe_osd_create(cmd);
4331 if (r == -EBUSY) {
4332 // good, already bound to a device-class
4333 return 0;
4334 } else {
4335 return r;
4336 }
4337 }
4338
4339 void OSD::write_superblock(ObjectStore::Transaction& t)
4340 {
4341 dout(10) << "write_superblock " << superblock << dendl;
4342
4343 //hack: at minimum it's using the baseline feature set
4344 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4345 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4346
4347 bufferlist bl;
4348 encode(superblock, bl);
4349 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4350 }
4351
4352 int OSD::read_superblock()
4353 {
4354 bufferlist bl;
4355 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4356 if (r < 0)
4357 return r;
4358
4359 auto p = bl.cbegin();
4360 decode(superblock, p);
4361
4362 dout(10) << "read_superblock " << superblock << dendl;
4363
4364 return 0;
4365 }
4366
4367 void OSD::clear_temp_objects()
4368 {
4369 dout(10) << __func__ << dendl;
4370 vector<coll_t> ls;
4371 store->list_collections(ls);
4372 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4373 spg_t pgid;
4374 if (!p->is_pg(&pgid))
4375 continue;
4376
4377 // list temp objects
4378 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4379
4380 vector<ghobject_t> temps;
4381 ghobject_t next;
4382 while (1) {
4383 vector<ghobject_t> objects;
4384 auto ch = store->open_collection(*p);
4385 ceph_assert(ch);
4386 store->collection_list(ch, next, ghobject_t::get_max(),
4387 store->get_ideal_list_max(),
4388 &objects, &next);
4389 if (objects.empty())
4390 break;
4391 vector<ghobject_t>::iterator q;
4392 for (q = objects.begin(); q != objects.end(); ++q) {
4393 // Hammer set pool for temps to -1, so check for clean-up
4394 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4395 temps.push_back(*q);
4396 } else {
4397 break;
4398 }
4399 }
4400 // If we saw a non-temp object and hit the break above we can
4401 // break out of the while loop too.
4402 if (q != objects.end())
4403 break;
4404 }
4405 if (!temps.empty()) {
4406 ObjectStore::Transaction t;
4407 int removed = 0;
4408 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4409 dout(20) << " removing " << *p << " object " << *q << dendl;
4410 t.remove(*p, *q);
4411 if (++removed > cct->_conf->osd_target_transaction_size) {
4412 store->queue_transaction(service.meta_ch, std::move(t));
4413 t = ObjectStore::Transaction();
4414 removed = 0;
4415 }
4416 }
4417 if (removed) {
4418 store->queue_transaction(service.meta_ch, std::move(t));
4419 }
4420 }
4421 }
4422 }
4423
// Delete every object in collection 'tmp' -- removing each from the snap
// mapper metadata as well -- then remove the collection itself, and wait
// for the final transaction to commit.  Work is batched into transactions
// of at most osd_target_transaction_size objects.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  // driver + mapper give access to the snap mapping records that must be
  // cleaned up alongside the objects themselves
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    // list the next batch, resuming from 'next'
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      // drop the snap-mapper entry first (ENOENT is fine: no entry existed),
      // then stage the object removal in the same transaction
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  // all objects are gone; now drop the (empty) collection itself
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // block until the removal has actually committed
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4468
4469
4470 // ======================================================
4471 // PG's
4472
4473 PG* OSD::_make_pg(
4474 OSDMapRef createmap,
4475 spg_t pgid)
4476 {
4477 dout(10) << __func__ << " " << pgid << dendl;
4478 pg_pool_t pi;
4479 map<string,string> ec_profile;
4480 string name;
4481 if (createmap->have_pg_pool(pgid.pool())) {
4482 pi = *createmap->get_pg_pool(pgid.pool());
4483 name = createmap->get_pool_name(pgid.pool());
4484 if (pi.is_erasure()) {
4485 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4486 }
4487 } else {
4488 // pool was deleted; grab final pg_pool_t off disk.
4489 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4490 bufferlist bl;
4491 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4492 if (r < 0) {
4493 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4494 << dendl;
4495 return nullptr;
4496 }
4497 ceph_assert(r >= 0);
4498 auto p = bl.cbegin();
4499 decode(pi, p);
4500 decode(name, p);
4501 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4502 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4503 << " tombstone" << dendl;
4504 return nullptr;
4505 }
4506 decode(ec_profile, p);
4507 }
4508 PGPool pool(cct, createmap, pgid.pool(), pi, name);
4509 PG *pg;
4510 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4511 pi.type == pg_pool_t::TYPE_ERASURE)
4512 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4513 else
4514 ceph_abort();
4515 return pg;
4516 }
4517
4518 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4519 {
4520 v->clear();
4521 v->reserve(get_num_pgs());
4522 for (auto& s : shards) {
4523 std::lock_guard l(s->shard_lock);
4524 for (auto& j : s->pg_slots) {
4525 if (j.second->pg &&
4526 !j.second->pg->is_deleted()) {
4527 v->push_back(j.second->pg);
4528 if (clear_too) {
4529 s->_detach_pg(j.second.get());
4530 }
4531 }
4532 }
4533 }
4534 }
4535
4536 void OSD::_get_pgids(vector<spg_t> *v)
4537 {
4538 v->clear();
4539 v->reserve(get_num_pgs());
4540 for (auto& s : shards) {
4541 std::lock_guard l(s->shard_lock);
4542 for (auto& j : s->pg_slots) {
4543 if (j.second->pg &&
4544 !j.second->pg->is_deleted()) {
4545 v->push_back(j.first);
4546 }
4547 }
4548 }
4549 }
4550
4551 void OSD::register_pg(PGRef pg)
4552 {
4553 spg_t pgid = pg->get_pgid();
4554 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4555 auto sdata = shards[shard_index];
4556 std::lock_guard l(sdata->shard_lock);
4557 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4558 ceph_assert(r.second);
4559 auto *slot = r.first->second.get();
4560 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4561 sdata->_attach_pg(slot, pg.get());
4562 }
4563
// Detach 'pg' from its shard slot once its deletion has completed.
// Returns false (caller must retry later) if the pg is no longer
// registered or a merge is pending on the slot; returns true after the
// pg has been detached and the pg-count perf counters decremented.
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
	!p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      // a merge is in flight for this slot; deletion must not race with it
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // un-prime any split children that were primed from this (now gone) pg
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_nonprimary())
    service.logger->dec(l_osd_pg_replica); // misnomer
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
4598
4599 PGRef OSD::_lookup_pg(spg_t pgid)
4600 {
4601 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4602 auto sdata = shards[shard_index];
4603 std::lock_guard l(sdata->shard_lock);
4604 auto p = sdata->pg_slots.find(pgid);
4605 if (p == sdata->pg_slots.end()) {
4606 return nullptr;
4607 }
4608 return p->second->pg;
4609 }
4610
4611 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4612 {
4613 PGRef pg = _lookup_pg(pgid);
4614 if (!pg) {
4615 return nullptr;
4616 }
4617 pg->lock();
4618 if (!pg->is_deleted()) {
4619 return pg;
4620 }
4621 pg->unlock();
4622 return nullptr;
4623 }
4624
4625 PGRef OSD::lookup_lock_pg(spg_t pgid)
4626 {
4627 return _lookup_lock_pg(pgid);
4628 }
4629
4630 void OSD::load_pgs()
4631 {
4632 ceph_assert(ceph_mutex_is_locked(osd_lock));
4633 dout(0) << "load_pgs" << dendl;
4634
4635 {
4636 auto pghist = make_pg_num_history_oid();
4637 bufferlist bl;
4638 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4639 if (r >= 0 && bl.length() > 0) {
4640 auto p = bl.cbegin();
4641 decode(pg_num_history, p);
4642 }
4643 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
4644 }
4645
4646 vector<coll_t> ls;
4647 int r = store->list_collections(ls);
4648 if (r < 0) {
4649 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4650 }
4651
4652 int num = 0;
4653 for (vector<coll_t>::iterator it = ls.begin();
4654 it != ls.end();
4655 ++it) {
4656 spg_t pgid;
4657 if (it->is_temp(&pgid) ||
4658 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
4659 dout(10) << "load_pgs " << *it
4660 << " removing, legacy or flagged for removal pg" << dendl;
4661 recursive_remove_collection(cct, store, pgid, *it);
4662 continue;
4663 }
4664
4665 if (!it->is_pg(&pgid)) {
4666 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4667 continue;
4668 }
4669
4670 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4671 epoch_t map_epoch = 0;
4672 int r = PG::peek_map_epoch(store, pgid, &map_epoch);
4673 if (r < 0) {
4674 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4675 << dendl;
4676 continue;
4677 }
4678
4679 PGRef pg;
4680 if (map_epoch > 0) {
4681 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4682 if (!pgosdmap) {
4683 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
4684 derr << __func__ << ": could not find map for epoch " << map_epoch
4685 << " on pg " << pgid << ", but the pool is not present in the "
4686 << "current map, so this is probably a result of bug 10617. "
4687 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4688 << "to clean it up later." << dendl;
4689 continue;
4690 } else {
4691 derr << __func__ << ": have pgid " << pgid << " at epoch "
4692 << map_epoch << ", but missing map. Crashing."
4693 << dendl;
4694 ceph_abort_msg("Missing map in load_pgs");
4695 }
4696 }
4697 pg = _make_pg(pgosdmap, pgid);
4698 } else {
4699 pg = _make_pg(get_osdmap(), pgid);
4700 }
4701 if (!pg) {
4702 recursive_remove_collection(cct, store, pgid, *it);
4703 continue;
4704 }
4705
4706 // there can be no waiters here, so we don't call _wake_pg_slot
4707
4708 pg->lock();
4709 pg->ch = store->open_collection(pg->coll);
4710
4711 // read pg state, log
4712 pg->read_state(store);
4713
4714 if (pg->dne()) {
4715 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4716 pg->ch = nullptr;
4717 pg->unlock();
4718 recursive_remove_collection(cct, store, pgid, *it);
4719 continue;
4720 }
4721 {
4722 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4723 assert(NULL != shards[shard_index]);
4724 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4725 }
4726
4727 pg->reg_next_scrub();
4728
4729 dout(10) << __func__ << " loaded " << *pg << dendl;
4730 pg->unlock();
4731
4732 register_pg(pg);
4733 ++num;
4734 }
4735 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
4736 }
4737
4738
// Create a brand-new PG described by 'info': allocate the on-disk
// collection, construct the PG object, and drive it through the initial
// initialize/activate peering events.  Returns nullptr when creation is
// dropped (max-pg backpressure, pool gone, or a stale mon create).
PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
				 const PGCreateInfo *info)
{
  spg_t pgid = info->pgid;

  if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
    dout(10) << __func__ << " hit max pg, dropping" << dendl;
    return nullptr;
  }

  PeeringCtx rctx = create_context();

  // use the map as of the creation epoch, not necessarily the current one
  OSDMapRef startmap = get_map(info->epoch);

  if (info->by_mon) {
    int64_t pool_id = pgid.pgid.pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    if (!pool) {
      dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
      return nullptr;
    }
    if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
	!pool->has_flag(pg_pool_t::FLAG_CREATING)) {
      // this ensures we do not process old creating messages after the
      // pool's initial pgs have been created (and pg are subsequently
      // allowed to split or merge).
      // NOTE(review): the message below appears to be missing a space
      // between the pgid and "create" -- cosmetic only.
      dout(20) << __func__ << " dropping " << pgid
	       << "create, pool does not have CREATING flag set" << dendl;
      return nullptr;
    }
  }

  int up_primary, acting_primary;
  vector<int> up, acting;
  startmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  // NOTE(review): pp is dereferenced without a null check; this assumes
  // the pool still exists in 'startmap' -- confirm callers guarantee that.
  const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
  if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
      store->get_type() != "bluestore") {
    clog->warn() << "pg " << pgid
		 << " is at risk of silent data corruption: "
		 << "the pool allows ec overwrites but is not stored in "
		 << "bluestore, so deep scrubbing will not detect bitrot";
  }
  // stage collection creation + on-disk pg metadata in the peering txn
  create_pg_collection(
    rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
  init_pg_ondisk(rctx.transaction, pgid, pp);

  int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

  // NOTE(review): _make_pg can return nullptr (missing pool tombstone);
  // the result is used unchecked here -- presumably the pool is known to
  // exist in startmap on this path, but confirm.
  PGRef pg = _make_pg(startmap, pgid);
  pg->ch = store->create_new_collection(pg->coll);

  // route this collection's commit completions to the owning shard
  {
    uint32_t shard_index = pgid.hash_to_shard(shards.size());
    assert(NULL != shards[shard_index]);
    store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
  }

  pg->lock(true);

  // we are holding the shard lock
  ceph_assert(!pg->is_deleted());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    info->history,
    info->past_intervals,
    false,
    rctx.transaction);

  pg->init_collection_pool_opts();

  if (pg->is_primary()) {
    // attach any active dynamic perf-stats queries to the new primary pg
    std::lock_guard locker{m_perf_queries_lock};
    pg->set_dynamic_perf_stats_queries(m_perf_queries);
  }

  // run the initial peering events and dispatch any resulting messages
  pg->handle_initialize(rctx);
  pg->handle_activate_map(rctx);

  dispatch_context(rctx, pg.get(), osdmap, nullptr);

  dout(10) << __func__ << " new pg " << *pg << dendl;
  return pg;
}
4830
4831 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4832 spg_t pgid,
4833 bool is_mon_create)
4834 {
4835 const auto max_pgs_per_osd =
4836 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4837 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4838
4839 if (num_pgs < max_pgs_per_osd) {
4840 return false;
4841 }
4842
4843 std::lock_guard l(pending_creates_lock);
4844 if (is_mon_create) {
4845 pending_creates_from_mon++;
4846 } else {
4847 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4848 pending_creates_from_osd.emplace(pgid, is_primary);
4849 }
4850 dout(1) << __func__ << " withhold creation of pg " << pgid
4851 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4852 return true;
4853 }
4854
// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
// to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  std::vector<int32_t> twiddled;
  if (acting.size() > 1) {
    // shrink the mapping down to just the first acting osd
    twiddled.push_back(acting[0]);
  } else {
    // pad the empty or single-entry mapping with a hole (-1)
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
4867
// Retry pg creations that were deferred by maybe_wait_for_max_pg().  For
// osd-initiated creates we nudge peering via a pg_temp twiddle; for
// mon-initiated creates we re-solicit the create messages, adjusting our
// osdmap subscription so we do not miss deletions either.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
	       << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      // consume spare capacity with mon-initiated creates first
      if (pending_creates_from_mon >= spare_pgs) {
	spare_pgs = pending_creates_from_mon = 0;
      } else {
	spare_pgs -= pending_creates_from_mon;
	pending_creates_from_mon = 0;
      }
    }
    // use any remaining capacity for osd-initiated creates, re-triggering
    // peering with a twiddled pg_temp for each
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
			    !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
	      << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = get_osdmap_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
	      << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
	      << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  // flush the queued pg_temp requests to the mon
  service.send_pg_temp();
}
4939
// Reconstruct a newly created pg's history and past intervals by walking
// every osdmap epoch from 'created' up to the current epoch, recording
// each interval boundary (changes of up/acting sets or primaries).
// Outputs into *h and *pi.
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  *h = pg_history_t(created, created_stamp);

  // mapping as of the creation epoch; updated as we advance epoch by epoch
  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    // recoverability predicate: an interval is recoverable if at least
    // min_size shards are available
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
	return have.size() >= pi->min_size;
      }
      explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    // did the mapping change between lastmap and osdmap in a way that
    // starts a new past interval?
    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      pgid.pgid,
      min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      // record the boundary and roll the "current" mapping forward
      h->same_interval_since = e;
      if (up != new_up) {
	h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
	h->same_primary_since = e;
      }
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
			     osdmap->get_pg_num(pgid.pgid.pool()),
			     nullptr)) {
	h->last_epoch_split = e;
      }
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
	   << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
		       pi->get_bounds()) << ")"
	   << dendl;
}
5014
// Add osd.p to the heartbeat peer set (no-op for ourselves).  For a new
// peer, open back+front heartbeat connections and attach Session objects
// carrying the shared ping-time stamps; for an existing peer, just refresh
// the epoch at which it was last wanted.  Caller holds heartbeat_lock.
void OSD::_add_heartbeat_peer(int p)
{
  if (p == whoami)
    return;
  HeartbeatInfo *hi;

  map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
  if (i == heartbeat_peers.end()) {
    // new peer: get (back, front) hb connections for the current epoch
    pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
    if (!cons.first)
      return;
    assert(cons.second);

    hi = &heartbeat_peers[p];
    hi->peer = p;

    // stamps are shared between the two sessions for this peer
    auto stamps = service.get_hb_stamps(p);

    auto sb = ceph::make_ref<Session>(cct, cons.first.get());
    sb->peer = p;
    sb->stamps = stamps;
    hi->hb_interval_start = ceph_clock_now();
    hi->con_back = cons.first.get();
    hi->con_back->set_priv(sb);

    auto sf = ceph::make_ref<Session>(cct, cons.second.get());
    sf->peer = p;
    sf->stamps = stamps;
    hi->con_front = cons.second.get();
    hi->con_front->set_priv(sf);

    dout(10) << "_add_heartbeat_peer: new peer osd." << p
	     << " " << hi->con_back->get_peer_addr()
	     << " " << hi->con_front->get_peer_addr()
	     << dendl;
  } else {
    hi = &i->second;
  }
  // remember the newest epoch at which this peer was wanted; peers with a
  // stale epoch are candidates for removal in maybe_update_heartbeat_peers()
  hi->epoch = get_osdmap_epoch();
}
5055
5056 void OSD::_remove_heartbeat_peer(int n)
5057 {
5058 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5059 ceph_assert(q != heartbeat_peers.end());
5060 dout(20) << " removing heartbeat peer osd." << n
5061 << " " << q->second.con_back->get_peer_addr()
5062 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5063 << dendl;
5064 q->second.clear_mark_down();
5065 heartbeat_peers.erase(q);
5066 }
5067
5068 void OSD::need_heartbeat_peer_update()
5069 {
5070 if (is_stopping())
5071 return;
5072 dout(20) << "need_heartbeat_peer_update" << dendl;
5073 heartbeat_set_peers_need_update();
5074 }
5075
/*
 * Refresh the set of OSDs we exchange heartbeats with.
 *
 * Requires osd_lock (asserted); takes heartbeat_lock internally.
 * The peer set is (re)built from:
 *  - peers reported by each PG,
 *  - the next/previous up OSDs around us (keeps the ring connected),
 *  - random up OSDs spread across distinct failure-domain subtrees so
 *    at least mon_osd_min_down_reporters reporters exist,
 * then trimmed of down/extra peers while keeping at least
 * osd_heartbeat_min_peers entries.  Finally, in-flight failure reports
 * for OSDs we no longer track are cancelled via send_still_alive().
 */
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));

  // Force a periodic resample: if nothing has requested an update for a
  // full heartbeat grace period, refresh anyway and drop stale peers
  // (all peers when we are unhealthy).
  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
	dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
	heartbeat_set_peers_need_update();
	last_heartbeat_resample = now;
	// automatically clean up any stale heartbeat peers
	// if we are unhealthy, then clean all
	reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
	if (get_osdmap()->is_up(peer)) {
	  _add_heartbeat_peer(peer);
	}
      });
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = get_osdmap()->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = get_osdmap()->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
  get_osdmap()->get_random_up_osds_by_subtree(
    whoami, subtree, limit, want, &want);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!get_osdmap()->is_up(p->first)) {
      int o = p->first;
      ++p;  // advance before erasing to keep the iterator valid
      _remove_heartbeat_peer(o);
      continue;
    }
    // entries not refreshed for the current epoch are removal candidates
    if (p->second.epoch < get_osdmap_epoch()) {
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  // walk the ring of up osds starting at 'next' until we hit the minimum
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = get_osdmap()->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?
  // drop extras (never 'want' peers) back down to the minimum
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;

  // clean up stale failure pending
  // if a pending failure report targets an osd we no longer heartbeat
  // with, tell the mon it is still alive so the report is withdrawn
  for (auto it = failure_pending.begin(); it != failure_pending.end();) {
    if (heartbeat_peers.count(it->first) == 0) {
      send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
      failure_pending.erase(it++);
    } else {
      it++;
    }
  }
}
5193
5194 void OSD::reset_heartbeat_peers(bool all)
5195 {
5196 ceph_assert(ceph_mutex_is_locked(osd_lock));
5197 dout(10) << "reset_heartbeat_peers" << dendl;
5198 utime_t stale = ceph_clock_now();
5199 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5200 std::lock_guard l(heartbeat_lock);
5201 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5202 HeartbeatInfo& hi = it->second;
5203 if (all || hi.is_stale(stale)) {
5204 hi.clear_mark_down();
5205 // stop sending failure_report to mon too
5206 failure_queue.erase(it->first);
5207 heartbeat_peers.erase(it++);
5208 } else {
5209 it++;
5210 }
5211 }
5212 }
5213
/*
 * Handle an incoming MOSDPing (PING, PING_REPLY, or YOU_DIED).
 *
 * PING: fold the sender's clock stamps into our per-peer delta estimate
 *   and answer with a PING_REPLY (unless our internal heartbeat map says
 *   our own threads are wedged); if our map shows the sender down or
 *   gone, send YOU_DIED instead.
 * PING_REPLY: account the reply against the pending ping_history entry,
 *   update per-peer ping-time statistics, and once the peer looks
 *   healthy again cancel any queued or in-flight failure reports.
 * YOU_DIED: a peer's map shows us down; subscribe to newer maps.
 *
 * Consumes (puts) the message on every path.
 */
void OSD::handle_osd_ping(MOSDPing *m)
{
  // ignore pings from another cluster
  if (superblock.cluster_fsid != m->fsid) {
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
	     << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
	     << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.lock();
  if (is_stopping()) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  ConnectionRef con(m->get_connection());
  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  // the heartbeat Session attached to the connection carries the
  // per-peer clock-delta stamps; attach them lazily on first use
  auto sref = con->get_priv();
  Session *s = static_cast<Session*>(sref.get());
  if (!s) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }
  if (!s->stamps) {
    s->peer = from;
    s->stamps = service.get_hb_stamps(from);
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug hook: probabilistically drop a run of incoming pings
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
	auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
	if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
	  if (heartbeat_drop->second == 0) {
	    debug_heartbeat_drops_remaining.erase(heartbeat_drop);
	  } else {
	    --heartbeat_drop->second;
	    dout(5) << "Dropping heartbeat from " << from
		    << ", " << heartbeat_drop->second
		    << " remaining to drop" << dendl;
	    break;
	  }
	} else if (cct->_conf->osd_debug_drop_ping_probability >
	           ((((double)(rand()%100))/100.0))) {
	  heartbeat_drop =
	    debug_heartbeat_drops_remaining.insert(std::make_pair(from,
	                     cct->_conf->osd_debug_drop_ping_duration)).first;
	  dout(5) << "Dropping heartbeat from " << from
		  << ", " << heartbeat_drop->second
		  << " remaining to drop" << dendl;
	  break;
	}
      }

      // update our clock-delta estimate from the sender's stamps; the
      // computed upper bound is echoed back in the reply
      ceph::signedspan sender_delta_ub{};
      s->stamps->got_ping(
	m->up_from,
	mnow,
	m->mono_send_stamp,
	m->delta_ub,
	&sender_delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;

      // do not reply while our own worker threads look stuck -- the
      // peer should see us as unhealthy in that case
      if (!cct->get_heartbeat_map()->is_healthy()) {
	dout(10) << "internal heartbeat not healthy, dropping ping request"
		 << dendl;
	break;
      }

      Message *r = new MOSDPing(monc->get_fsid(),
				curmap->get_epoch(),
				MOSDPing::PING_REPLY,
				m->ping_stamp,
				m->mono_ping_stamp,
				mnow,
				service.get_up_epoch(),
				cct->_conf->osd_heartbeat_min_size,
				sender_delta_ub);
      con->send_message(r);

      if (curmap->is_up(from)) {
	// opportunistically share a newer osdmap over the cluster conn
	if (is_active()) {
	  ConnectionRef cluster_con = service.get_con_osd_cluster(
	    from, curmap->get_epoch());
	  if (cluster_con) {
	    service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
	  }
	}
      } else if (!curmap->exists(from) ||
		 curmap->get_down_at(from) > m->map_epoch) {
	// tell them they have died
	Message *r = new MOSDPing(monc->get_fsid(),
				  curmap->get_epoch(),
				  MOSDPing::YOU_DIED,
				  m->ping_stamp,
				  m->mono_ping_stamp,
				  mnow,
				  service.get_up_epoch(),
				  cct->_conf->osd_heartbeat_min_size);
	con->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
	auto acked = i->second.ping_history.find(m->ping_stamp);
	if (acked != i->second.ping_history.end()) {
	  // each pending ping expects one reply per connection
	  int &unacknowledged = acked->second.second;
	  if (con == i->second.con_back) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " -> " << now
		     << " last_rx_front " << i->second.last_rx_front
		     << dendl;
	    i->second.last_rx_back = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	    // if there is no front con, set both stamps.
	    if (i->second.con_front == NULL) {
	      i->second.last_rx_front = now;
	      ceph_assert(unacknowledged > 0);
	      --unacknowledged;
	    }
	  } else if (con == i->second.con_front) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " last_rx_front " << i->second.last_rx_front
		     << " -> " << now
		     << dendl;
	    i->second.last_rx_front = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	  }

	  if (unacknowledged == 0) {
	    // succeeded in getting all replies
	    dout(25) << "handle_osd_ping got all replies from osd." << from
		     << " , erase pending ping(sent at " << m->ping_stamp << ")"
		     << " and older pending ping(s)"
		     << dendl;

	    // accumulate per-interval min/max/average ping times
#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
	    ++i->second.hb_average_count;
	    uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
	    i->second.hb_total_back += back_pingtime;
	    if (back_pingtime < i->second.hb_min_back)
	      i->second.hb_min_back = back_pingtime;
	    if (back_pingtime > i->second.hb_max_back)
	      i->second.hb_max_back = back_pingtime;
	    uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
	    i->second.hb_total_front += front_pingtime;
	    if (front_pingtime < i->second.hb_min_front)
	      i->second.hb_min_front = front_pingtime;
	    if (front_pingtime > i->second.hb_max_front)
	      i->second.hb_max_front = front_pingtime;

	    // NOTE(review): with ceph_assert compiled in, the 'if' just
	    // below can never fire -- looks like leftover defensive
	    // code; confirm before relying on it
	    ceph_assert(i->second.hb_interval_start != utime_t());
	    if (i->second.hb_interval_start == utime_t())
	      i->second.hb_interval_start = now;
	    // interval over which averages are published (shortened for
	    // testing via debug_heartbeat_testing_span)
	    int64_t hb_avg_time_period = 60;
	    if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
	      hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
	    }
	    if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
	      uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
	      uint32_t back_min = i->second.hb_min_back;
	      uint32_t back_max = i->second.hb_max_back;
	      uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
	      uint32_t front_min = i->second.hb_min_front;
	      uint32_t front_max = i->second.hb_max_front;

	      // Reset for new interval
	      i->second.hb_average_count = 0;
	      i->second.hb_interval_start = now;
	      i->second.hb_total_back = i->second.hb_max_back = 0;
	      i->second.hb_min_back = UINT_MAX;
	      i->second.hb_total_front = i->second.hb_max_front = 0;
	      i->second.hb_min_front = UINT_MAX;

	      // Record per osd interface ping times
	      // Based on osd_heartbeat_interval ignoring that it is randomly shorter than this interval
	      if (i->second.hb_back_pingtime.size() == 0) {
		// first completed interval: seed the whole ring buffer
		// with this sample
		ceph_assert(i->second.hb_front_pingtime.size() == 0);
		for (unsigned k = 0 ; k < hb_vector_size; ++k) {
		  i->second.hb_back_pingtime.push_back(back_avg);
		  i->second.hb_back_min.push_back(back_min);
		  i->second.hb_back_max.push_back(back_max);
		  i->second.hb_front_pingtime.push_back(front_avg);
		  i->second.hb_front_min.push_back(front_min);
		  i->second.hb_front_max.push_back(front_max);
		  ++i->second.hb_index;
		}
	      } else {
		// ring buffer; the mask assumes hb_vector_size is a
		// power of two -- TODO confirm
		int index = i->second.hb_index & (hb_vector_size - 1);
		i->second.hb_back_pingtime[index] = back_avg;
		i->second.hb_back_min[index] = back_min;
		i->second.hb_back_max[index] = back_max;
		i->second.hb_front_pingtime[index] = front_avg;
		i->second.hb_front_min[index] = front_min;
		i->second.hb_front_max[index] = front_max;
		++i->second.hb_index;
	      }

	      {
		// publish 1/5/15-interval aggregates into osd_stat
		std::lock_guard l(service.stat_lock);
		service.osd_stat.hb_pingtime[from].last_update = now.sec();
		service.osd_stat.hb_pingtime[from].back_last = back_pingtime;

		uint32_t total = 0;
		uint32_t min = UINT_MAX;
		uint32_t max = 0;
		uint32_t count = 0;
		uint32_t which = 0;
		uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
		for (int32_t k = size - 1 ; k >= 0; --k) {
		  ++count;
		  int index = (i->second.hb_index + k) % size;
		  total += i->second.hb_back_pingtime[index];
		  if (i->second.hb_back_min[index] < min)
		    min = i->second.hb_back_min[index];
		  if (i->second.hb_back_max[index] > max)
		    max = i->second.hb_back_max[index];
		  if (count == 1 || count == 5 || count == 15) {
		    service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
		    service.osd_stat.hb_pingtime[from].back_min[which] = min;
		    service.osd_stat.hb_pingtime[from].back_max[which] = max;
		    which++;
		    if (count == 15)
		      break;
		  }
		}

		if (i->second.con_front != NULL) {
		  service.osd_stat.hb_pingtime[from].front_last = front_pingtime;

		  total = 0;
		  min = UINT_MAX;
		  max = 0;
		  count = 0;
		  which = 0;
		  for (int32_t k = size - 1 ; k >= 0; --k) {
		    ++count;
		    int index = (i->second.hb_index + k) % size;
		    total += i->second.hb_front_pingtime[index];
		    if (i->second.hb_front_min[index] < min)
		      min = i->second.hb_front_min[index];
		    if (i->second.hb_front_max[index] > max)
		      max = i->second.hb_front_max[index];
		    if (count == 1 || count == 5 || count == 15) {
		      service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
		      service.osd_stat.hb_pingtime[from].front_min[which] = min;
		      service.osd_stat.hb_pingtime[from].front_max[which] = max;
		      which++;
		      if (count == 15)
			break;
		    }
		  }
		}
	      }
	    } else {
	      // interval not complete; just record the latest samples
	      std::lock_guard l(service.stat_lock);
	      service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
	      if (i->second.con_front != NULL)
		service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
	    }
	    // this ping and everything older is fully acknowledged now
	    i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
	  }

	  if (i->second.is_healthy(now)) {
	    // Cancel false reports
	    auto failure_queue_entry = failure_queue.find(from);
	    if (failure_queue_entry != failure_queue.end()) {
	      dout(10) << "handle_osd_ping canceling queued "
		       << "failure report for osd." << from << dendl;
	      failure_queue.erase(failure_queue_entry);
	    }

	    auto failure_pending_entry = failure_pending.find(from);
	    if (failure_pending_entry != failure_pending.end()) {
	      dout(10) << "handle_osd_ping canceling in-flight "
		       << "failure report for osd." << from << dendl;
	      send_still_alive(curmap->get_epoch(),
			       from,
			       failure_pending_entry->second.second);
	      failure_pending.erase(failure_pending_entry);
	    }
	  }
	} else {
	  // old replies, deprecated by newly sent pings.
	  dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
		   << ") is found, treat as covered by newly sent pings "
		   << "and ignore"
		   << dendl;
	}
      }

      if (m->map_epoch &&
	  curmap->is_up(from)) {
	// opportunistically share a newer osdmap over the cluster conn
	if (is_active()) {
	  ConnectionRef cluster_con = service.get_con_osd_cluster(
	    from, curmap->get_epoch());
	  if (cluster_con) {
	    service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
	  }
	}
      }

      // fold the peer's reply stamps into our clock-delta estimate
      s->stamps->got_ping_reply(
	mnow,
	m->mono_send_stamp,
	m->delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
    }
    break;

  case MOSDPing::YOU_DIED:
    // a peer's map shows us down; catch up on maps
    dout(10) << "handle_osd_ping " << m->get_source_inst()
	     << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.unlock();
  m->put();
}
5561
5562 void OSD::heartbeat_entry()
5563 {
5564 std::unique_lock l(heartbeat_lock);
5565 if (is_stopping())
5566 return;
5567 while (!heartbeat_stop) {
5568 heartbeat();
5569
5570 double wait;
5571 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5572 wait = (float)cct->_conf->osd_heartbeat_interval;
5573 } else {
5574 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5575 }
5576 auto w = ceph::make_timespan(wait);
5577 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5578 heartbeat_cond.wait_for(l, w);
5579 if (is_stopping())
5580 return;
5581 dout(30) << "heartbeat_entry woke up" << dendl;
5582 }
5583 }
5584
5585 void OSD::heartbeat_check()
5586 {
5587 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5588 utime_t now = ceph_clock_now();
5589
5590 // check for incoming heartbeats (move me elsewhere?)
5591 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5592 p != heartbeat_peers.end();
5593 ++p) {
5594
5595 if (p->second.first_tx == utime_t()) {
5596 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5597 << " yet, skipping" << dendl;
5598 continue;
5599 }
5600
5601 dout(25) << "heartbeat_check osd." << p->first
5602 << " first_tx " << p->second.first_tx
5603 << " last_tx " << p->second.last_tx
5604 << " last_rx_back " << p->second.last_rx_back
5605 << " last_rx_front " << p->second.last_rx_front
5606 << dendl;
5607 if (p->second.is_unhealthy(now)) {
5608 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5609 if (p->second.last_rx_back == utime_t() ||
5610 p->second.last_rx_front == utime_t()) {
5611 derr << "heartbeat_check: no reply from "
5612 << p->second.con_front->get_peer_addr().get_sockaddr()
5613 << " osd." << p->first
5614 << " ever on either front or back, first ping sent "
5615 << p->second.first_tx
5616 << " (oldest deadline " << oldest_deadline << ")"
5617 << dendl;
5618 // fail
5619 failure_queue[p->first] = p->second.first_tx;
5620 } else {
5621 derr << "heartbeat_check: no reply from "
5622 << p->second.con_front->get_peer_addr().get_sockaddr()
5623 << " osd." << p->first << " since back " << p->second.last_rx_back
5624 << " front " << p->second.last_rx_front
5625 << " (oldest deadline " << oldest_deadline << ")"
5626 << dendl;
5627 // fail
5628 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5629 }
5630 }
5631 }
5632 }
5633
5634 void OSD::heartbeat()
5635 {
5636 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
5637 dout(30) << "heartbeat" << dendl;
5638
5639 // get CPU load avg
5640 double loadavgs[1];
5641 int hb_interval = cct->_conf->osd_heartbeat_interval;
5642 int n_samples = 86400;
5643 if (hb_interval > 1) {
5644 n_samples /= hb_interval;
5645 if (n_samples < 1)
5646 n_samples = 1;
5647 }
5648
5649 if (getloadavg(loadavgs, 1) == 1) {
5650 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5651 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5652 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5653 }
5654
5655 dout(30) << "heartbeat checking stats" << dendl;
5656
5657 // refresh peer list and osd stats
5658 vector<int> hb_peers;
5659 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5660 p != heartbeat_peers.end();
5661 ++p)
5662 hb_peers.push_back(p->first);
5663
5664 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5665 dout(5) << __func__ << " " << new_stat << dendl;
5666 ceph_assert(new_stat.statfs.total);
5667
5668 float pratio;
5669 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5670
5671 service.check_full_status(ratio, pratio);
5672
5673 utime_t now = ceph_clock_now();
5674 auto mnow = service.get_mnow();
5675 utime_t deadline = now;
5676 deadline += cct->_conf->osd_heartbeat_grace;
5677
5678 // send heartbeats
5679 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5680 i != heartbeat_peers.end();
5681 ++i) {
5682 int peer = i->first;
5683 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5684
5685 i->second.last_tx = now;
5686 if (i->second.first_tx == utime_t())
5687 i->second.first_tx = now;
5688 i->second.ping_history[now] = make_pair(deadline,
5689 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5690 if (i->second.hb_interval_start == utime_t())
5691 i->second.hb_interval_start = now;
5692
5693 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5694 std::optional<ceph::signedspan> delta_ub;
5695 s->stamps->sent_ping(&delta_ub);
5696
5697 i->second.con_back->send_message(
5698 new MOSDPing(monc->get_fsid(),
5699 service.get_osdmap_epoch(),
5700 MOSDPing::PING,
5701 now,
5702 mnow,
5703 mnow,
5704 service.get_up_epoch(),
5705 cct->_conf->osd_heartbeat_min_size,
5706 delta_ub));
5707
5708 if (i->second.con_front)
5709 i->second.con_front->send_message(
5710 new MOSDPing(monc->get_fsid(),
5711 service.get_osdmap_epoch(),
5712 MOSDPing::PING,
5713 now,
5714 mnow,
5715 mnow,
5716 service.get_up_epoch(),
5717 cct->_conf->osd_heartbeat_min_size,
5718 delta_ub));
5719 }
5720
5721 logger->set(l_osd_hb_to, heartbeat_peers.size());
5722
5723 // hmm.. am i all alone?
5724 dout(30) << "heartbeat lonely?" << dendl;
5725 if (heartbeat_peers.empty()) {
5726 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5727 last_mon_heartbeat = now;
5728 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5729 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5730 }
5731 }
5732
5733 dout(30) << "heartbeat done" << dendl;
5734 }
5735
/*
 * Messenger callback: a heartbeat connection was reset.
 *
 * If the failed connection belongs to a tracked peer, open fresh back
 * (and, when available, front) connections, move the existing Session
 * onto them, and clear the peer's ping history so stale deadlines do
 * not trigger spurious failure reports.  If no replacement connections
 * can be obtained (we raced with an osdmap update), the peer is dropped
 * entirely.  Always returns true (the reset is considered handled).
 */
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  auto s = con->get_priv();
  dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
  // detach the Session from the dead connection before deciding its fate
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      return true;
    }
    auto session = static_cast<Session*>(s.get());
    auto p = heartbeat_peers.find(session->peer);
    // only act if this con is still one of the peer's current hb cons;
    // otherwise it is a leftover from an earlier reopen and can be ignored
    if (p != heartbeat_peers.end() &&
	(p->second.con_back == con ||
	 p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
	       << ", reopening" << dendl;
      p->second.clear_mark_down(con);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
	p->second.con_back = newcon.first.get();
	// reuse the same Session (clock stamps) on the replacements
	p->second.con_back->set_priv(s);
	if (newcon.second) {
	  p->second.con_front = newcon.second.get();
	  p->second.con_front->set_priv(s);
	}
	p->second.ping_history.clear();
      } else {
	dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
		 << ", raced with osdmap update, closing out peer" << dendl;
	heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
5774
5775
5776
5777 // =========================================
5778
5779 void OSD::tick()
5780 {
5781 ceph_assert(ceph_mutex_is_locked(osd_lock));
5782 dout(10) << "tick" << dendl;
5783
5784 utime_t now = ceph_clock_now();
5785 // throw out any obsolete markdown log
5786 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5787 while (!osd_markdown_log.empty() &&
5788 osd_markdown_log.front() + grace < now)
5789 osd_markdown_log.pop_front();
5790
5791 if (is_active() || is_waiting_for_healthy()) {
5792 maybe_update_heartbeat_peers();
5793 }
5794
5795 if (is_waiting_for_healthy()) {
5796 start_boot();
5797 }
5798
5799 if (is_waiting_for_healthy() || is_booting()) {
5800 std::lock_guard l(heartbeat_lock);
5801 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5802 last_mon_heartbeat = now;
5803 dout(1) << __func__ << " checking mon for new map" << dendl;
5804 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5805 }
5806 }
5807
5808 do_waiters();
5809
5810 // scrub purged_snaps every deep scrub interval
5811 {
5812 const utime_t last = superblock.last_purged_snaps_scrub;
5813 utime_t next = last;
5814 next += cct->_conf->osd_scrub_min_interval;
5815 std::mt19937 rng;
5816 // use a seed that is stable for each scrub interval, but varies
5817 // by OSD to avoid any herds.
5818 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
5819 double r = (rng() % 1024) / 1024;
5820 next +=
5821 cct->_conf->osd_scrub_min_interval *
5822 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5823 if (next < ceph_clock_now()) {
5824 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5825 << " next " << next << " ... now" << dendl;
5826 scrub_purged_snaps();
5827 } else {
5828 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5829 << " next " << next << dendl;
5830 }
5831 }
5832
5833 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5834 }
5835
/*
 * Periodic tick that runs WITHOUT osd_lock: refreshes CRC counters and
 * store statfs, runs heartbeat_check, sends mon reports/failures when
 * due, requests newer maps shards are waiting on, schedules scrub,
 * recovery and beacons, and reschedules itself.
 * Requires tick_timer_lock (asserted).
 */
void OSD::tick_without_osd_lock()
{
  ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    {
      std::lock_guard l{heartbeat_lock};
      heartbeat_check();
    }
    map_lock.lock_shared();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
	now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.unlock_shared();

    // if any shard is stalled waiting on a map newer than ours, fetch it
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
				   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
	       << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
	cct->_conf->osd_beacon_report_interval) {
	need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
					      new C_Tick_WithoutOSDLock(this));
}
5911
5912 // Usage:
5913 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5914 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5915 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5916 // getomap <pool> [namespace/]<obj-name>
5917 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5918 // injectmdataerr [namespace/]<obj-name> [shardid]
5919 // injectdataerr [namespace/]<obj-name> [shardid]
5920 //
5921 // set_recovery_delay [utime]
/*
 * Admin-socket test hook: directly manipulate object omap/data on this
 * OSD, or tweak recovery/fullness behavior, for testing.  Commands are
 * listed in the usage comment above; results and errors are written to
 * 'ss'.  Unknown commands report an internal error.
 */
void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
     std::string_view command,
     const cmdmap_t& cmdmap, ostream &ss)
{
  //Test support
  //Support changing the omap on a single osd by using the Admin Socket to
  //directly request the osd make a change.
  if (command == "setomapval" || command == "rmomapkey" ||
      command == "setomapheader" || command == "getomap" ||
      command == "truncobj" || command == "injectmdataerr" ||
      command == "injectdataerr"
    ) {
    pg_t rawpg;
    int64_t pool;
    OSDMapRef curmap = service->get_osdmap();
    int r = -1;

    string poolstr;

    cmd_getval(cmdmap, "pool", poolstr);
    pool = curmap->lookup_pg_pool_name(poolstr);
    //If we can't find it by name then maybe id specified
    // NOTE(review): poolstr[0] on an empty string yields '\0' (safe in
    // C++11), but isdigit() on a negative char value is UB -- assumes
    // ASCII pool names; confirm
    if (pool < 0 && isdigit(poolstr[0]))
      pool = atoll(poolstr.c_str());
    if (pool < 0) {
      ss << "Invalid pool '" << poolstr << "''";
      return;
    }

    string objname, nspace;
    cmd_getval(cmdmap, "objname", objname);
    // "[namespace/]objname" -- split on the first '/'
    std::size_t found = objname.find_first_of('/');
    if (found != string::npos) {
      nspace = objname.substr(0, found);
      objname = objname.substr(found+1);
    }
    object_locator_t oloc(pool, nspace);
    r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);

    if (r < 0) {
      ss << "Invalid namespace/objname";
      return;
    }

    int64_t shardid;
    cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
    hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
    ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
    spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
    // only the error-injection commands are allowed on EC pools
    if (curmap->pg_is_ec(rawpg)) {
      if ((command != "injectdataerr") && (command != "injectmdataerr")) {
        ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
        return;
      }
    }

    ObjectStore::Transaction t;

    if (command == "setomapval") {
      map<string, bufferlist> newattrs;
      bufferlist val;
      string key, valstr;
      cmd_getval(cmdmap, "key", key);
      cmd_getval(cmdmap, "val", valstr);

      val.append(valstr);
      newattrs[key] = val;
      t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "rmomapkey") {
      string key;
      cmd_getval(cmdmap, "key", key);

      t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "setomapheader") {
      bufferlist newheader;
      string headerstr;

      cmd_getval(cmdmap, "header", headerstr);
      newheader.append(headerstr);
      t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "getomap") {
      //Debug: Output entire omap
      bufferlist hdrbl;
      map<string, bufferlist> keyvals;
      auto ch = store->open_collection(coll_t(pgid));
      if (!ch) {
        ss << "unable to open collection for " << pgid;
        r = -ENOENT;
      } else {
        r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
        if (r >= 0) {
          ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
          for (map<string, bufferlist>::iterator it = keyvals.begin();
              it != keyvals.end(); ++it)
            ss << " key=" << (*it).first << " val="
               << string((*it).second.c_str(), (*it).second.length());
        } else {
          ss << "error=" << r;
        }
      }
    } else if (command == "truncobj") {
      int64_t trunclen;
      cmd_getval(cmdmap, "len", trunclen);
      t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "injectdataerr") {
      store->inject_data_error(gobj);
      ss << "ok";
    } else if (command == "injectmdataerr") {
      store->inject_mdata_error(gobj);
      ss << "ok";
    }
    return;
  }
  if (command == "set_recovery_delay") {
    // override osd_recovery_delay_start at runtime (seconds)
    int64_t delay;
    cmd_getval(cmdmap, "utime", delay, (int64_t)0);
    ostringstream oss;
    oss << delay;
    int r = service->cct->_conf.set_val("osd_recovery_delay_start",
					oss.str().c_str());
    if (r != 0) {
      ss << "set_recovery_delay: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      return;
    }
    service->cct->_conf.apply_changes(nullptr);
    ss << "set_recovery_delay: set osd_recovery_delay_start "
       << "to " << service->cct->_conf->osd_recovery_delay_start;
    return;
  }
  if (command == "injectfull") {
    // simulate a fullness state ("full", "nearfull", ...) for 'count' ops
    int64_t count;
    string type;
    OSDService::s_names state;
    cmd_getval(cmdmap, "type", type, string("full"));
    cmd_getval(cmdmap, "count", count, (int64_t)-1);
    if (type == "none" || count == 0) {
      type = "none";
      count = 0;
    }
    state = service->get_full_state(type);
    if (state == OSDService::s_names::INVALID) {
      ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
      return;
    }
    service->set_injectfull(state, count);
    return;
  }
  ss << "Internal error - command=" << command;
}
6093
6094 // =========================================
6095
void OSD::ms_handle_connect(Connection *con)
{
  // Messenger callback: an outgoing connection has been (re)established.
  // Only the mon connection matters here: a new mon session means any
  // previously reported state may have been lost, so resend all of it.
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // lock order: map_lock (shared) before mon_report_lock
      map_lock.lock_shared();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.unlock_shared();
      if (is_active()) {
        send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6139
6140 void OSD::ms_handle_fast_connect(Connection *con)
6141 {
6142 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6143 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6144 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6145 s = ceph::make_ref<Session>(cct, con);
6146 con->set_priv(s);
6147 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6148 << " addr=" << s->con->get_peer_addr() << dendl;
6149 // we don't connect to clients
6150 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6151 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6152 }
6153 }
6154 }
6155
6156 void OSD::ms_handle_fast_accept(Connection *con)
6157 {
6158 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6159 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6160 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6161 s = ceph::make_ref<Session>(cct, con);
6162 con->set_priv(s);
6163 dout(10) << "new session (incoming)" << s << " con=" << con
6164 << " addr=" << con->get_peer_addr()
6165 << " must have raced with connect" << dendl;
6166 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6167 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6168 }
6169 }
6170 }
6171
bool OSD::ms_handle_reset(Connection *con)
{
  // Messenger reset callback: tear down the Session attached to the
  // connection, if any.  Returns true iff a session existed (i.e. we
  // handled the reset).
  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset();  // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(session);
  return true;
}
6187
bool OSD::ms_handle_refused(Connection *con)
{
  // Connection-refused callback.  With osd_fast_fail_on_connection_refused
  // enabled, a refused connection to a peer OSD that the map still shows
  // as up is reported to the mon immediately (FLAG_IMMEDIATE|FLAG_FAILED)
  // instead of waiting out the heartbeat grace.
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      // the refused address may belong to any of the peer's channels
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
        // I'm cheating mon heartbeat grace logic, because we know it's not going
        // to respawn alone. +1 so we won't hit any boundary case.
        monc->send_mon_message(
          new MOSDFailure(
            monc->get_fsid(),
            id,
            osdmap->get_addrs(id),
            cct->_conf->osd_heartbeat_grace + 1,
            osdmap->get_epoch(),
            MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
            ));
      }
    }
  }
  return true;
}
6220
6221 struct C_OSD_GetVersion : public Context {
6222 OSD *osd;
6223 uint64_t oldest, newest;
6224 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6225 void finish(int r) override {
6226 if (r >= 0)
6227 osd->_got_mon_epochs(oldest, newest);
6228 }
6229 };
6230
void OSD::start_boot()
{
  // Begin (or defer) the boot sequence.  If health checks fail we wait
  // instead of marking ourselves up; otherwise enter PREBOOT and ask the
  // mon for its oldest/newest osdmap epochs (continues asynchronously in
  // _got_mon_epochs -> _preboot).
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
           << ".." << superblock.newest_map << dendl;
  // completion deletes itself; fills oldest/newest before finish()
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
6249
6250 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6251 {
6252 std::lock_guard l(osd_lock);
6253 if (is_preboot()) {
6254 _preboot(oldest, newest);
6255 }
6256 }
6257
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  // Second boot stage, entered once the mon reports its oldest/newest
  // osdmap epochs.  Runs through a chain of preconditions that must hold
  // before we mark ourselves up; if any fails we fall through to the
  // osdmap subscription at the bottom and wait to be called again.
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
           << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  const auto& monmap = monc->monmap;
  const auto osdmap = get_osdmap();
  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
         << dendl;
  } else if (osdmap->require_osd_release < ceph_release_t::luminous) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
         << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
             superblock.purged_snaps_last < superblock.current_epoch) {
    // octopus+ mons track purged snaps; catch up before booting
    // (handle_get_purged_snaps_reply re-enters start_boot when done)
    dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
             << " < newest_map " << superblock.current_epoch << dendl;
    _get_purged_snaps();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
             osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new LambdaContext(
        [this](int r) {
          std::unique_lock l(osd_lock);
          if (is_preboot()) {
            dout(10) << __func__ << " waiting for peering work to drain"
                     << dendl;
            // drop osd_lock while waiting so peering can proceed
            l.unlock();
            for (auto shard : shards) {
              shard->wait_min_pg_epoch(get_osdmap_epoch());
            }
            l.lock();
          }
          // re-check under the lock: state may have changed while unlocked
          if (is_preboot()) {
            _send_boot();
          }
        }));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6330
void OSD::_get_purged_snaps()
{
  // Ask the mon for the snaps purged between our last recorded point
  // (superblock.purged_snaps_last) and the current epoch; the reply is
  // handled in handle_get_purged_snaps_reply().
  // NOTE: this is a naive, stateless implementation. it may send multiple
  // overlapping requests to the mon, which will be somewhat inefficient, but
  // it should be reliable.
  dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
           << ", newest_map " << superblock.current_epoch << dendl;
  MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
    superblock.purged_snaps_last + 1,
    superblock.current_epoch + 1);
  monc->send_mon_message(m);
}
6343
void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
{
  // Reply handler for _get_purged_snaps(): persist the purged-snap
  // records via SnapMapper, advance purged_snaps_last in the superblock,
  // then either request the next range or resume booting.
  dout(10) << __func__ << " " << *m << dendl;
  ObjectStore::Transaction t;
  if (!is_preboot() ||
      m->last < superblock.purged_snaps_last) {
    // stale reply: we already left preboot, or this range was recorded
    goto out;
  }
  SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
                                  make_purged_snaps_oid(), &t,
                                  m->purged_snaps);
  superblock.purged_snaps_last = m->last;
  write_superblock(t);
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
  if (m->last < superblock.current_epoch) {
    // more ranges remain; keep fetching
    _get_purged_snaps();
  } else {
    // caught up; re-run the boot sequence from the top
    start_boot();
  }
 out:
  m->put();
}
6369
6370 void OSD::send_full_update()
6371 {
6372 if (!service.need_fullness_update())
6373 return;
6374 unsigned state = 0;
6375 if (service.is_full()) {
6376 state = CEPH_OSD_FULL;
6377 } else if (service.is_backfillfull()) {
6378 state = CEPH_OSD_BACKFILLFULL;
6379 } else if (service.is_nearfull()) {
6380 state = CEPH_OSD_NEARFULL;
6381 }
6382 set<string> s;
6383 OSDMap::calc_state_set(state, s);
6384 dout(10) << __func__ << " want state " << s << dendl;
6385 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6386 }
6387
void OSD::start_waiting_for_healthy()
{
  // Enter WAITING_FOR_HEALTHY: boot is deferred until _is_healthy()
  // passes (see start_boot).  Resets the heartbeat resample timestamp so
  // peers are re-picked promptly.
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(get_osdmap_epoch() + 1, false);
}
6397
6398 bool OSD::_is_healthy()
6399 {
6400 if (!cct->get_heartbeat_map()->is_healthy()) {
6401 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6402 return false;
6403 }
6404
6405 if (is_waiting_for_healthy()) {
6406 utime_t now = ceph_clock_now();
6407 if (osd_markdown_log.empty()) {
6408 dout(5) << __func__ << " force returning true since last markdown"
6409 << " was " << cct->_conf->osd_max_markdown_period
6410 << "s ago" << dendl;
6411 return true;
6412 }
6413 std::lock_guard l(heartbeat_lock);
6414 int num = 0, up = 0;
6415 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6416 p != heartbeat_peers.end();
6417 ++p) {
6418 if (p->second.is_healthy(now))
6419 ++up;
6420 ++num;
6421 }
6422 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6423 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6424 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6425 return false;
6426 }
6427 }
6428
6429 return true;
6430 }
6431
void OSD::_send_boot()
{
  // Final boot step: resolve our public/cluster/heartbeat addresses
  // (filling unknowns from the already-bound messengers), ensure each
  // loopback connection carries a Session, collect metadata, and send
  // MOSDBoot to the mon.  Transitions state to BOOTING.
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  // unknown cluster addrs default to the client (public) addrs
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
             << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // unknown hb back addrs default to the cluster addrs
  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
             << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // unknown hb front addrs default to the client (public) addrs
  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
             << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6494
void OSD::_collect_metadata(map<string,string> *pm)
{
  // Fill *pm with this OSD's metadata: paths, addresses, objectstore
  // info, system info, NUMA topology, and per-device metadata.  Sent to
  // the mon with MOSDBoot (see _send_boot).
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  string osdspec_affinity;
  int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
  if (r < 0 || osdspec_affinity.empty()) {
    // best effort: absent/unreadable affinity becomes an empty string
    osdspec_affinity = "";
  }
  (*pm)["osdspec_affinity"] = osdspec_affinity;
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    // determine which NUMA node(s) the front/back interfaces live on;
    // only report a single network_numa_node when it is unambiguous
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
        unknown.insert(nm);
        continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
        unknown.insert((*pm)[nm]);
        continue;
      }
      nodes.insert(n);
      if (node < 0) {
        node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
                                                  &numa_cpu_set);
  }

  // per-device metadata; errors are logged but non-fatal
  set<string> devnames;
  store->get_devices(&devnames);
  map<string,string> errs;
  get_device_metadata(devnames, pm, &errs);
  for (auto& i : errs) {
    dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
  }
  dout(10) << __func__ << " " << *pm << dendl;
}
6577
6578 void OSD::queue_want_up_thru(epoch_t want)
6579 {
6580 std::shared_lock map_locker{map_lock};
6581 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6582 std::lock_guard report_locker(mon_report_lock);
6583 if (want > up_thru_wanted) {
6584 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6585 << ", currently " << cur
6586 << dendl;
6587 up_thru_wanted = want;
6588 send_alive();
6589 } else {
6590 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6591 << ", currently " << cur
6592 << dendl;
6593 }
6594 }
6595
6596 void OSD::send_alive()
6597 {
6598 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6599 const auto osdmap = get_osdmap();
6600 if (!osdmap->exists(whoami))
6601 return;
6602 epoch_t up_thru = osdmap->get_up_thru(whoami);
6603 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6604 if (up_thru_wanted > up_thru) {
6605 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6606 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6607 }
6608 }
6609
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  // Ask the mon for full (non-incremental) osdmaps covering [first,last],
  // merging with any outstanding request window tracked in
  // requested_full_first/last (got_full_map advances/clears the window).
  dout(10) << __func__ << " " << first << ".." << last
           << ", previously requested "
           << requested_full_first << ".." << requested_full_last << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  ceph_assert(first > 0 && last > 0);
  ceph_assert(first <= last);
  ceph_assert(first >= requested_full_first);  // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request: only fetch the part past the current window
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
6635
void OSD::got_full_map(epoch_t e)
{
  // Account a received full map against the requested window
  // [requested_full_first, requested_full_last]: ignore stale epochs,
  // reset the window when satisfied, otherwise advance the lower bound.
  ceph_assert(requested_full_first <= requested_full_last);
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  if (requested_full_first == 0) {
    // no outstanding request window
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // map predates the window; nothing to account
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
             << ".." << requested_full_last
             << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    // window fully satisfied; clear it
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
             << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // partial progress: next map we still need is e + 1
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
           << ".." << requested_full_last
           << ", still need more" << dendl;
}
6663
6664 void OSD::requeue_failures()
6665 {
6666 std::lock_guard l(heartbeat_lock);
6667 unsigned old_queue = failure_queue.size();
6668 unsigned old_pending = failure_pending.size();
6669 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6670 failure_queue[p->first] = p->second.first;
6671 failure_pending.erase(p++);
6672 }
6673 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6674 << failure_queue.size() << dendl;
6675 }
6676
void OSD::send_failures()
{
  // Drain failure_queue: report each failed peer to the mon (unless a
  // report is already in flight) and move the entry to failure_pending
  // so it can later be canceled (send_still_alive) or requeued.
  ceph_assert(ceph_mutex_is_locked(map_lock));
  ceph_assert(ceph_mutex_is_locked(mon_report_lock));
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  const auto osdmap = get_osdmap();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    if (!failure_pending.count(osd)) {
      // failure_queue value is the time we first noticed the failure
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
        new MOSDFailure(
          monc->get_fsid(),
          osd,
          osdmap->get_addrs(osd),
          failed_for,
          osdmap->get_epoch()));
      // remember (time, addrs) so the report can be canceled/requeued
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
                                       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
6701
6702 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6703 {
6704 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6705 MOSDFailure::FLAG_ALIVE);
6706 monc->send_mon_message(m);
6707 }
6708
6709 void OSD::cancel_pending_failures()
6710 {
6711 std::lock_guard l(heartbeat_lock);
6712 auto it = failure_pending.begin();
6713 while (it != failure_pending.end()) {
6714 dout(10) << __func__ << " canceling in-flight failure report for osd."
6715 << it->first << dendl;
6716 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6717 failure_pending.erase(it++);
6718 }
6719 }
6720
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  // Send an MOSDBeacon (liveness + min last-epoch-clean across our PGs)
  // to the mon, provided the monmap is initialized and the quorum
  // requires the LUMINOUS feature.
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // snapshot min_last_epoch_clean{,_pgs} under its lock; send after
      // the lock is released
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(get_osdmap_epoch(),
                              min_last_epoch_clean,
                              superblock.last_purged_snaps_scrub);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
6744
6745 void OSD::handle_command(MCommand *m)
6746 {
6747 ConnectionRef con = m->get_connection();
6748 auto session = ceph::ref_cast<Session>(con->get_priv());
6749 if (!session) {
6750 con->send_message(new MCommandReply(m, -EACCES));
6751 m->put();
6752 return;
6753 }
6754 if (!session->caps.allow_all()) {
6755 con->send_message(new MCommandReply(m, -EACCES));
6756 m->put();
6757 return;
6758 }
6759 cct->get_admin_socket()->queue_tell_command(m);
6760 m->put();
6761 }
6762
6763 namespace {
6764 class unlock_guard {
6765 ceph::mutex& m;
6766 public:
6767 explicit unlock_guard(ceph::mutex& mutex)
6768 : m(mutex)
6769 {
6770 m.unlock();
6771 }
6772 unlock_guard(unlock_guard&) = delete;
6773 ~unlock_guard() {
6774 m.lock();
6775 }
6776 };
6777 }
6778
void OSD::scrub_purged_snaps()
{
  // Run the SnapMapper purged-snaps scrubber, requeue snap trim for any
  // stray mappings it finds, then record the scrub time in the
  // superblock.  osd_lock is dropped for the (potentially long) scrub
  // and re-acquired afterwards.
  dout(10) << __func__ << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  SnapMapper::Scrubber s(cct, store, service.meta_ch,
                         make_snapmapper_oid(),
                         make_purged_snaps_oid());
  clog->debug() << "purged_snaps scrub starts";
  osd_lock.unlock();
  s.run();
  if (s.stray.size()) {
    clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
  } else {
    clog->debug() << "purged_snaps scrub ok";
  }
  // queue each (pg, snap) at most once
  set<pair<spg_t,snapid_t>> queued;
  for (auto& [pool, snap, hash, shard] : s.stray) {
    const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
    if (!pi) {
      dout(20) << __func__ << " pool " << pool << " dne" << dendl;
      continue;
    }
    pg_t pgid(pi->raw_hash_to_pg(hash), pool);
    spg_t spgid(pgid, shard);
    pair<spg_t,snapid_t> p(spgid, snap);
    if (queued.count(p)) {
      dout(20) << __func__ << " pg " << spgid << " snap " << snap
               << " already queued" << dendl;
      continue;
    }
    PGRef pg = lookup_lock_pg(spgid);
    if (!pg) {
      dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
      continue;
    }
    queued.insert(p);
    dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
             << snap << dendl;
    pg->queue_snap_retrim(snap);
    pg->unlock();
  }
  osd_lock.lock();
  // state may have changed while unlocked
  if (is_stopping()) {
    return;
  }
  dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
  ObjectStore::Transaction t;
  superblock.last_purged_snaps_scrub = ceph_clock_now();
  write_superblock(t);
  int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
  ceph_assert(tr == 0);
  if (is_active()) {
    send_beacon(ceph::coarse_mono_clock::now());
  }
  dout(10) << __func__ << " done" << dendl;
}
6835
6836 void OSD::probe_smart(const string& only_devid, ostream& ss)
6837 {
6838 set<string> devnames;
6839 store->get_devices(&devnames);
6840 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6841 "osd_smart_report_timeout");
6842
6843 // == typedef std::map<std::string, mValue> mObject;
6844 json_spirit::mObject json_map;
6845
6846 for (auto dev : devnames) {
6847 // smartctl works only on physical devices; filter out any logical device
6848 if (dev.find("dm-") == 0) {
6849 continue;
6850 }
6851
6852 string err;
6853 string devid = get_device_id(dev, &err);
6854 if (devid.size() == 0) {
6855 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6856 << err << "), skipping" << dendl;
6857 continue;
6858 }
6859 if (only_devid.size() && devid != only_devid) {
6860 continue;
6861 }
6862
6863 json_spirit::mValue smart_json;
6864 if (block_device_get_metrics(dev, smart_timeout,
6865 &smart_json)) {
6866 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6867 continue;
6868 }
6869 json_map[devid] = smart_json;
6870 }
6871 json_spirit::write(json_map, ss, json_spirit::pretty_print);
6872 }
6873
6874 bool OSD::heartbeat_dispatch(Message *m)
6875 {
6876 dout(30) << "heartbeat_dispatch " << m << dendl;
6877 switch (m->get_type()) {
6878
6879 case CEPH_MSG_PING:
6880 dout(10) << "ping from " << m->get_source_inst() << dendl;
6881 m->put();
6882 break;
6883
6884 case MSG_OSD_PING:
6885 handle_osd_ping(static_cast<MOSDPing*>(m));
6886 break;
6887
6888 default:
6889 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6890 m->put();
6891 }
6892
6893 return true;
6894 }
6895
bool OSD::ms_dispatch(Message *m)
{
  // Slow-path dispatch; runs under osd_lock.  MARK_ME_DOWN acks are
  // handled without the lock since they only notify the service.
  dout(20) << "OSD::ms_dispatch: " << *m << dendl;
  if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
    service.got_stop_ack();
    m->put();
    return true;
  }

  // lock!

  osd_lock.lock();
  if (is_stopping()) {
    // drop the lock before releasing the message ref
    osd_lock.unlock();
    m->put();
    return true;
  }

  // process anything deferred earlier, then this message
  do_waiters();
  _dispatch(m);

  osd_lock.unlock();

  return true;
}
6921
void OSDService::maybe_share_map(
  Connection *con,
  const OSDMapRef& osdmap,
  epoch_t peer_epoch_lb)
{
  // Share incremental maps with the peer if our map is newer than the
  // last epoch we believe they have.  peer_epoch_lb is a caller-supplied
  // lower bound on the peer's epoch (e.g. an op's sent_epoch).
  // NOTE: we assume caller hold something that keeps the Connection itself
  // pinned (e.g., an OpRequest's MessageRef).
  auto session = ceph::ref_cast<Session>(con->get_priv());
  if (!session) {
    return;
  }

  // assume the peer has the newer of the op's sent_epoch and what
  // we think we sent them.
  session->sent_epoch_lock.lock();
  if (peer_epoch_lb > session->last_sent_epoch) {
    dout(10) << __func__ << " con " << con
             << " " << con->get_peer_addr()
             << " map epoch " << session->last_sent_epoch
             << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
    session->last_sent_epoch = peer_epoch_lb;
  }
  epoch_t last_sent_epoch = session->last_sent_epoch;
  session->sent_epoch_lock.unlock();

  if (osdmap->get_epoch() <= last_sent_epoch) {
    return;  // peer already up to date
  }

  // send outside the lock, then re-take it to record our progress
  // (another thread may have advanced last_sent_epoch concurrently)
  send_incremental_map(last_sent_epoch, con, osdmap);
  last_sent_epoch = osdmap->get_epoch();

  session->sent_epoch_lock.lock();
  if (session->last_sent_epoch < last_sent_epoch) {
    dout(10) << __func__ << " con " << con
             << " " << con->get_peer_addr()
             << " map epoch " << session->last_sent_epoch
             << " -> " << last_sent_epoch << " (shared)" << dendl;
    session->last_sent_epoch = last_sent_epoch;
  }
  session->sent_epoch_lock.unlock();
}
6964
void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
{
  // Drain the session's queue of ops that were waiting for a newer map.
  // The queue preserves arrival order; we stop at the first op whose
  // min_epoch is still ahead of the given map.
  ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    auto m = op->get_req<MOSDFastDispatchOp>();
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      break;
    }
    // unlink from the intrusive list, then drop the ref taken when the
    // op was queued (see ms_fast_dispatch); `op` still holds a ref
    session->waiting_on_map.erase(i++);
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries a raw pg; resolve it to the primary shard
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
        static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
        continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
6999
void OSD::ms_fast_dispatch(Message *m)
{
  // Fast (lock-free) dispatch path.  Peering/control messages are routed
  // directly to their handlers; everything else becomes an OpRequest and
  // is enqueued, either directly (messages carrying an spg_t) or via the
  // per-session ordering path for legacy MOSDOp clients.
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_PG_NOTIFY2:
  case MSG_OSD_PG_QUERY2:
  case MSG_OSD_PG_INFO2:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
  case MSG_OSD_PG_LEASE:
  case MSG_OSD_PG_LEASE_ACK:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
        enqueue_peering_evt(
          pm->get_spg(),
          PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // everything else is tracked as an op request
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // extra ref for the intrusive waiting_on_map list; released in
      // dispatch_session_waiting
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7096
7097 int OSD::ms_handle_authentication(Connection *con)
7098 {
7099 int ret = 0;
7100 auto s = ceph::ref_cast<Session>(con->get_priv());
7101 if (!s) {
7102 s = ceph::make_ref<Session>(cct, con);
7103 con->set_priv(s);
7104 s->entity_name = con->get_peer_entity_name();
7105 dout(10) << __func__ << " new session " << s << " con " << s->con
7106 << " entity " << s->entity_name
7107 << " addr " << con->get_peer_addrs() << dendl;
7108 } else {
7109 dout(10) << __func__ << " existing session " << s << " con " << s->con
7110 << " entity " << s->entity_name
7111 << " addr " << con->get_peer_addrs() << dendl;
7112 }
7113
7114 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7115 if (caps_info.allow_all) {
7116 s->caps.set_allow_all();
7117 } else if (caps_info.caps.length() > 0) {
7118 bufferlist::const_iterator p = caps_info.caps.cbegin();
7119 string str;
7120 try {
7121 decode(str, p);
7122 }
7123 catch (buffer::error& e) {
7124 dout(10) << __func__ << " session " << s << " " << s->entity_name
7125 << " failed to decode caps string" << dendl;
7126 ret = -EACCES;
7127 }
7128 if (!ret) {
7129 bool success = s->caps.parse(str);
7130 if (success) {
7131 dout(10) << __func__ << " session " << s
7132 << " " << s->entity_name
7133 << " has caps " << s->caps << " '" << str << "'" << dendl;
7134 ret = 1;
7135 } else {
7136 dout(10) << __func__ << " session " << s << " " << s->entity_name
7137 << " failed to parse caps '" << str << "'" << dendl;
7138 ret = -EACCES;
7139 }
7140 }
7141 }
7142 return ret;
7143 }
7144
7145 void OSD::do_waiters()
7146 {
7147 ceph_assert(ceph_mutex_is_locked(osd_lock));
7148
7149 dout(10) << "do_waiters -- start" << dendl;
7150 while (!finished.empty()) {
7151 OpRequestRef next = finished.front();
7152 finished.pop_front();
7153 dispatch_op(next);
7154 }
7155 dout(10) << "do_waiters -- finish" << dendl;
7156 }
7157
7158 void OSD::dispatch_op(OpRequestRef op)
7159 {
7160 switch (op->get_req()->get_type()) {
7161
7162 case MSG_OSD_PG_CREATE:
7163 handle_pg_create(op);
7164 break;
7165 }
7166 }
7167
void OSD::_dispatch(Message *m)
{
  // Slow-path dispatch for message types that require osd_lock.  Note the
  // asymmetry: MSG_COMMAND returns early, every other case falls out of the
  // switch to the common function exit.
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;
  case MSG_MON_GET_PURGED_SNAPS_REPLY:
    handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
    break;

  // osd
  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;  // note: early return, does not fall through to the common exit

  // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
        op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map? starting up?
      if (!get_osdmap()) {
        dout(7) << "no OSDMap, not booted" << dendl;
        logger->inc(l_osd_waiting_for_map);
        // park the op until a map arrives (re-dispatched via do_waiters)
        waiting_for_osdmap.push_back(op);
        op->mark_delayed("no osdmap");
        break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7214
7215 // remove me post-nautilus
void OSD::handle_scrub(MOSDScrub *m)
{
  // Legacy (pre-nautilus) scrub request from a mon/mgr peer: translate it
  // into RequestScrub peering events on the targeted PGs, or on all locally
  // hosted PGs when no explicit list was sent.  The message is consumed on
  // every path.
  dout(10) << "handle_scrub " << *m << dendl;
  if (!require_mon_or_mgr_peer(m)) {
    m->put();
    return;
  }
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
            << dendl;
    m->put();
    return;
  }

  vector<spg_t> spgs;
  _get_pgids(&spgs);

  if (!m->scrub_pgs.empty()) {
    // restrict to the requested pgs: keep only those whose primary shard
    // resolves in the current map AND is among the PGs we actually host
    vector<spg_t> v;
    for (auto pgid : m->scrub_pgs) {
      spg_t pcand;
      if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
          std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
        v.push_back(pcand);
      }
    }
    spgs.swap(v);
  }

  for (auto pgid : spgs) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          get_osdmap_epoch(),
          get_osdmap_epoch(),
          PeeringState::RequestScrub(m->deep, m->repair))));
  }

  m->put();
}
7257
7258 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7259 {
7260 dout(10) << __func__ << " " << *m << dendl;
7261 if (!require_mon_or_mgr_peer(m)) {
7262 m->put();
7263 return;
7264 }
7265 if (m->fsid != monc->get_fsid()) {
7266 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7267 << dendl;
7268 m->put();
7269 return;
7270 }
7271 for (auto pgid : m->scrub_pgs) {
7272 enqueue_peering_evt(
7273 pgid,
7274 PGPeeringEventRef(
7275 std::make_shared<PGPeeringEvent>(
7276 m->epoch,
7277 m->epoch,
7278 PeeringState::RequestScrub(m->deep, m->repair))));
7279 }
7280 m->put();
7281 }
7282
7283 bool OSD::scrub_random_backoff()
7284 {
7285 bool coin_flip = (rand() / (double)RAND_MAX >=
7286 cct->_conf->osd_scrub_backoff_ratio);
7287 if (!coin_flip) {
7288 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7289 return true;
7290 }
7291 return false;
7292 }
7293
7294 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7295 const spg_t& pg, const utime_t& timestamp,
7296 double pool_scrub_min_interval,
7297 double pool_scrub_max_interval, bool must)
7298 : cct(cct),
7299 pgid(pg),
7300 sched_time(timestamp),
7301 deadline(timestamp)
7302 {
7303 // if not explicitly requested, postpone the scrub with a random delay
7304 if (!must) {
7305 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7306 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7307 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7308 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7309
7310 sched_time += scrub_min_interval;
7311 double r = rand() / (double)RAND_MAX;
7312 sched_time +=
7313 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7314 if (scrub_max_interval == 0) {
7315 deadline = utime_t();
7316 } else {
7317 deadline += scrub_max_interval;
7318 }
7319
7320 }
7321 }
7322
7323 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7324 if (sched_time < rhs.sched_time)
7325 return true;
7326 if (sched_time > rhs.sched_time)
7327 return false;
7328 return pgid < rhs.pgid;
7329 }
7330
7331 double OSD::scrub_sleep_time(bool must_scrub)
7332 {
7333 if (must_scrub) {
7334 return cct->_conf->osd_scrub_sleep;
7335 }
7336 utime_t now = ceph_clock_now();
7337 if (scrub_time_permit(now)) {
7338 return cct->_conf->osd_scrub_sleep;
7339 }
7340 double normal_sleep = cct->_conf->osd_scrub_sleep;
7341 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7342 return std::max(extended_sleep, normal_sleep);
7343 }
7344
7345 bool OSD::scrub_time_permit(utime_t now)
7346 {
7347 struct tm bdt;
7348 time_t tt = now.sec();
7349 localtime_r(&tt, &bdt);
7350
7351 bool day_permit = false;
7352 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7353 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7354 day_permit = true;
7355 }
7356 } else {
7357 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7358 day_permit = true;
7359 }
7360 }
7361
7362 if (!day_permit) {
7363 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7364 << " - " << cct->_conf->osd_scrub_end_week_day
7365 << " now " << bdt.tm_wday << " = no" << dendl;
7366 return false;
7367 }
7368
7369 bool time_permit = false;
7370 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7371 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7372 time_permit = true;
7373 }
7374 } else {
7375 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7376 time_permit = true;
7377 }
7378 }
7379 if (!time_permit) {
7380 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7381 << " - " << cct->_conf->osd_scrub_end_hour
7382 << " now " << bdt.tm_hour << " = no" << dendl;
7383 } else {
7384 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7385 << " - " << cct->_conf->osd_scrub_end_hour
7386 << " now " << bdt.tm_hour << " = yes" << dendl;
7387 }
7388 return time_permit;
7389 }
7390
7391 bool OSD::scrub_load_below_threshold()
7392 {
7393 double loadavgs[3];
7394 if (getloadavg(loadavgs, 3) != 3) {
7395 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7396 return false;
7397 }
7398
7399 // allow scrub if below configured threshold
7400 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7401 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7402 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7403 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7404 << " < max " << cct->_conf->osd_scrub_load_threshold
7405 << " = yes" << dendl;
7406 return true;
7407 }
7408
7409 // allow scrub if below daily avg and currently decreasing
7410 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7411 dout(20) << __func__ << " loadavg " << loadavgs[0]
7412 << " < daily_loadavg " << daily_loadavg
7413 << " and < 15m avg " << loadavgs[2]
7414 << " = yes" << dendl;
7415 return true;
7416 }
7417
7418 dout(20) << __func__ << " loadavg " << loadavgs[0]
7419 << " >= max " << cct->_conf->osd_scrub_load_threshold
7420 << " and ( >= daily_loadavg " << daily_loadavg
7421 << " or >= 15m avg " << loadavgs[2]
7422 << ") = no" << dendl;
7423 return false;
7424 }
7425
void OSD::sched_scrub()
{
  // Walk the scrub-job queue in sched_time order and try to start one scrub.
  // The queue is sorted by sched_time, so hitting a job scheduled in the
  // future means nothing later is due either (hence `break`, not `continue`).

  // if not permitted, fail fast
  if (!service.can_inc_scrubs()) {
    return;
  }
  bool allow_requested_repair_only = false;
  if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
    if (!cct->_conf->osd_repair_during_recovery) {
      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
      return;
    }
    dout(10) << __func__
             << " will only schedule explicitly requested repair due to active recovery"
             << dendl;
    allow_requested_repair_only = true;
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      if (scrub.sched_time > now) {
        // save ourselves some effort
        dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
                 << " > " << now << dendl;
        break;
      }

      // a job past its (non-zero) deadline may run even outside the time
      // window or under high load; otherwise both conditions must hold
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
        dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
                 << (!time_permit ? "time not permit" : "high load") << dendl;
        continue;
      }

      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
        continue;
      // This has already started, so go on to the next scrub job
      if (pg->scrubber.active) {
        pg->unlock();
        dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
        continue;
      }
      // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
      if (allow_requested_repair_only && !pg->scrubber.must_repair) {
        pg->unlock();
        dout(10) << __func__ << " skip " << scrub.pgid
                 << " because repairing is not explicitly requested on it"
                 << dendl;
        continue;
      }
      // If it is reserving, let it resolve before going to the next scrub job
      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
        pg->unlock();
        dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
        break;
      }
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
               << (pg->get_must_scrub() ? ", explicitly requested" :
                   (load_is_low ? ", load_is_low" : " deadline < now"))
               << dendl;
      // sched_scrub() returning true means the scrub was kicked off; we only
      // start one per invocation, so stop iterating.
      if (pg->sched_scrub()) {
        pg->unlock();
        break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
7503
7504 void OSD::resched_all_scrubs()
7505 {
7506 dout(10) << __func__ << ": start" << dendl;
7507 OSDService::ScrubJob scrub;
7508 if (service.first_scrub_stamp(&scrub)) {
7509 do {
7510 dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
7511
7512 PGRef pg = _lookup_lock_pg(scrub.pgid);
7513 if (!pg)
7514 continue;
7515 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
7516 dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
7517 pg->on_info_history_change();
7518 }
7519 pg->unlock();
7520 } while (service.next_scrub_stamp(scrub, &scrub));
7521 }
7522 dout(10) << __func__ << ": done" << dendl;
7523 }
7524
7525 MPGStats* OSD::collect_pg_stats()
7526 {
7527 // This implementation unconditionally sends every is_primary PG's
7528 // stats every time we're called. This has equivalent cost to the
7529 // previous implementation's worst case where all PGs are busy and
7530 // their stats are always enqueued for sending.
7531 std::shared_lock l{map_lock};
7532
7533 osd_stat_t cur_stat = service.get_osd_stat();
7534 cur_stat.os_perf_stat = store->get_cur_stats();
7535
7536 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
7537 m->osd_stat = cur_stat;
7538
7539 std::lock_guard lec{min_last_epoch_clean_lock};
7540 min_last_epoch_clean = get_osdmap_epoch();
7541 min_last_epoch_clean_pgs.clear();
7542
7543 std::set<int64_t> pool_set;
7544 vector<PGRef> pgs;
7545 _get_pgs(&pgs);
7546 for (auto& pg : pgs) {
7547 auto pool = pg->pg_id.pgid.pool();
7548 pool_set.emplace((int64_t)pool);
7549 if (!pg->is_primary()) {
7550 continue;
7551 }
7552 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7553 m->pg_stat[pg->pg_id.pgid] = s;
7554 min_last_epoch_clean = min(min_last_epoch_clean, lec);
7555 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7556 });
7557 }
7558 store_statfs_t st;
7559 bool per_pool_stats = false;
7560 bool per_pool_omap_stats = false;
7561 for (auto p : pool_set) {
7562 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
7563 if (r == -ENOTSUP) {
7564 break;
7565 } else {
7566 assert(r >= 0);
7567 m->pool_stat[p] = st;
7568 per_pool_stats = true;
7569 }
7570 }
7571
7572 // indicate whether we are reporting per-pool stats
7573 m->osd_stat.num_osds = 1;
7574 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7575 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
7576
7577 return m;
7578 }
7579
vector<DaemonHealthMetric> OSD::get_health_metrics()
{
  // Build the daemon health metrics reported upward: the number of slow ops
  // (with the age of the oldest in-flight op) and the number of pending
  // PG creations.
  vector<DaemonHealthMetric> metrics;
  {
    utime_t oldest_secs;
    const utime_t now = ceph_clock_now();
    auto too_old = now;
    too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
    int slow = 0;
    TrackedOpRef oldest_op;
    // counts ops initiated before the complaint horizon, logs a warning for
    // each, and remembers the oldest one; returns whether the op was slow
    auto count_slow_ops = [&](TrackedOp& op) {
      if (op.get_initiated() < too_old) {
        stringstream ss;
        ss << "slow request " << op.get_desc()
           << " initiated "
           << op.get_initiated()
           << " currently "
           << op.state_string();
        lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
        clog->warn() << ss.str();
        slow++;
        if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
          oldest_op = &op;
        }
        return true;
      } else {
        return false;
      }
    };
    if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
      if (slow) {
        derr << __func__ << " reporting " << slow << " slow ops, oldest is "
             << oldest_op->get_desc() << dendl;
      }
      metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
    } else {
      // no news is not good news.
      metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
    }
  }
  {
    std::lock_guard l(pending_creates_lock);
    auto n_primaries = pending_creates_from_mon;
    for (const auto& create : pending_creates_from_osd) {
      // create.second presumably flags creates for which we are primary,
      // matching the n_primaries naming -- TODO confirm against the map's
      // declaration
      if (create.second) {
        n_primaries++;
      }
    }
    metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
  }
  return metrics;
}
7632
7633 // =====================================================
7634 // MAP
7635
7636 void OSD::wait_for_new_map(OpRequestRef op)
7637 {
7638 // ask?
7639 if (waiting_for_osdmap.empty()) {
7640 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7641 }
7642
7643 logger->inc(l_osd_waiting_for_map);
7644 waiting_for_osdmap.push_back(op);
7645 op->mark_delayed("wait for new map");
7646 }
7647
7648
7649 /** update_map
7650 * assimilate new OSDMap(s). scan pgs, etc.
7651 */
7652
7653 void OSD::note_down_osd(int peer)
7654 {
7655 ceph_assert(ceph_mutex_is_locked(osd_lock));
7656 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7657
7658 std::lock_guard l{heartbeat_lock};
7659 failure_queue.erase(peer);
7660 failure_pending.erase(peer);
7661 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7662 if (p != heartbeat_peers.end()) {
7663 p->second.clear_mark_down();
7664 heartbeat_peers.erase(p);
7665 }
7666 }
7667
void OSD::note_up_osd(int peer)
{
  // A peer came up in a new map: flag the heartbeat peer set for a refresh.
  heartbeat_set_peers_need_update();
}
7672
// Completion context fired once the osdmap epochs [first, last] have been
// committed to the objectstore; it finishes map activation and releases the
// originating MOSDMap message.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive epoch range just persisted
  MOSDMap *msg;         // consumed (put) in finish()
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
7684
7685 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7686 {
7687 std::lock_guard l(osdmap_subscribe_lock);
7688 if (latest_subscribed_epoch >= epoch && !force_request)
7689 return;
7690
7691 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7692
7693 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7694 force_request) {
7695 monc->renew_subs();
7696 }
7697 }
7698
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // Delete stored osdmaps older than `oldest`, but never ones still pinned
  // by the map cache.  superblock.oldest_map advances as epochs are removed.
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    // flush in batches so a single transaction does not grow unboundedly
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
        // skip_maps leaves us with a range of old maps if we fail to remove all
        // of them before moving superblock.oldest_map forward to the first map
        // in the incoming MOSDMap msg. so we should continue removing them in
        // this case, even we could do huge series of delete transactions all at
        // once.
        break;
      }
    }
  }
  // flush whatever remains in the final partial batch
  if (num > 0) {
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
7738
void OSD::handle_osd_map(MOSDMap *m)
{
  // Ingest a batch of new osdmaps: validate the sender, persist full and
  // incremental maps to the meta collection, record pg_num/pool history and
  // purged snaps, then commit; C_OnMapCommit completes activation.  The
  // message is consumed on every path (here, or by the commit context).

  // wait for pgs to catch up
  {
    // we extend the map cache pins to accomodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop injesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    // osd_min = the minimum pg epoch across all shards
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
        osd_min = min;
      }
    }
    epoch_t osdmap_epoch = get_osdmap_epoch();
    if (osd_min > 0 &&
        osdmap_epoch > max_lag &&
        osdmap_epoch - max_lag > osd_min) {
      epoch_t need = osdmap_epoch - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
               << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
        epoch_t min = shard->get_min_pg_epoch();
        if (need > min) {
          dout(10) << __func__ << " waiting for pgs to consume " << need
                   << " (shard " << shard->shard_id << " min " << min
                   << ", map cache is " << cct->_conf->osd_map_cache_size
                   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
                   << ")" << dendl;
          // drop osd_lock while blocking so pgs can make progress
          unlock_guard unlock{osd_lock};
          shard->wait_min_pg_epoch(need);
        }
      }
    }
  }

  ceph_assert(ceph_mutex_is_locked(osd_lock));
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
            << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only accept maps from mon or osd peers
  auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
  if (session && !(session->entity_name.is_mon() ||
                   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
             << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
          << superblock.newest_map
          << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
          << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
             << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    if (m->oldest_map <= superblock.newest_map + 1) {
      // the gap is still available from the mon; re-subscribe for it
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can. this
    // 1- is good to have
    // 2- is at present the only way to ensure that we get a *full* map as
    // the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // sanity: the transaction must only ever grow
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // a full map was supplied for this epoch: persist and cache it
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // only an incremental: persist it, then reconstruct and persist the
      // corresponding full map by applying it to the previous epoch's map
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
        // base: previous full map, from disk or from this same batch
        bufferlist obl;
        bool got = get_map_bl(e - 1, obl);
        if (!got) {
          auto p = added_maps_bl.find(e - 1);
          ceph_assert(p != added_maps_bl.end());
          obl = p->second;
        }
        o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);

      if (o->apply_incremental(inc) < 0) {
        derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
        ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // optionally inject a crc failure for testing the recovery path below
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
          (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
        derr << __func__ << " injecting map crc failure" << dendl;
        injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
        // our reconstructed full map doesn't match the mon's crc: discard it
        // and fall back to requesting full maps for [e, last]
        dout(2) << "got incremental " << e
                << " but failed to encode full with correct crc; requesting"
                << dendl;
        clog->warn() << "failed to encode map e" << e << " with expected crc";
        dout(20) << "my encoded map was:\n";
        fbl.hexdump(*_dout);
        *_dout << dendl;
        delete o;
        request_full_map(e, last);
        last = e - 1;

        // don't continue committing if we failed to enc the first inc map
        if (last < start) {
          dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
          m->put();
          return;
        }
        break;
      }
      got_full_map(e);
      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
             << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
        dout(10) << __func__ << " can't get previous map " << i.first - 1
                 << " probably first start of this osd" << dendl;
        continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
        pg_num_history.log_pool_delete(i.first, j.first);
        dout(10) << __func__ << " recording final pg_pool_t for pool "
                 << j.first << dendl;
        // this information is needed by _make_pg() if have to restart before
        // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
        ghobject_t obj = make_final_pool_info_oid(j.first);
        bufferlist bl;
        encode(j.second, bl, CEPH_FEATURES_ALL);
        string name = lastmap->get_pool_name(j.first);
        encode(name, bl);
        map<string,string> profile;
        if (lastmap->get_pg_pool(j.first)->is_erasure()) {
          profile = lastmap->get_erasure_code_profile(
            lastmap->get_pg_pool(j.first)->erasure_code_profile);
        }
        encode(profile, bl);
        t.write(coll_t::meta(), obj, 0, bl.length(), bl);
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
                 new_pg_num != j.second.get_pg_num()) {
        dout(10) << __func__ << " recording pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    // newly created pools
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
        dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first,
                                         j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // record new purged_snaps
  if (superblock.purged_snaps_last == start - 1) {
    SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
                                    make_purged_snaps_oid(), &t,
                                    purged_snaps);
    superblock.purged_snaps_last = last;
  } else {
    dout(10) << __func__ << " superblock purged_snaps_last is "
             << superblock.purged_snaps_last
             << ", not recording new purged_snaps" << dendl;
  }

  // superblock and commit
  write_superblock(t);
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8059
// Commit callback for a batch of newly persisted OSDMaps [first..last].
// Runs after the store transaction queued by handle_osd_map() commits.
// Advances the in-memory map epoch one map at a time, tears down
// connections to newly-down peers, and decides whether this OSD must
// restart its boot sequence (do_restart) or shut down entirely
// (do_shutdown), e.g. when the map says we do not exist or we were
// wrongly marked down too many times.
// Takes osd_lock for the whole body and map_lock for the map-advance
// portion; the order (osd_lock, then map_lock) must not change.
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock: shutdown may have raced with us above
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.lock();

  ceph_assert(first <= last);

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  OSDMapRef osdmap = get_osdmap();

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
             << " (<= last " << last
             << " <= newest_map " << superblock.newest_map
             << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap = get_osdmap();
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
          osdmap->is_up(*p) && // in old map
          newmap->is_down(*p)) {    // but not the new one
        if (!waited_for_reservations) {
          // wait (once per batch) for in-flight map reservations to
          // drain before tearing down connections to the downed peer
          service.await_reserved_maps();
          waited_for_reservations = true;
        }
        note_down_osd(*p);
      } else if (*p != whoami &&
                 osdmap->is_down(*p) &&
                 newmap->is_up(*p)) {
        note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
               << dendl;
      if (is_booting()) {
        // this captures the case where we sent the boot message while
        // NOUP was being set on the mon and our boot request was
        // dropped, and then later it is cleared.  it imperfectly
        // handles the case where our original boot message was not
        // dropped and we restart even though we might have booted, but
        // that is harmless (boot will just take slightly longer).
        do_restart = true;
      }
    }

    // publish this epoch as the current map before moving on
    osdmap = std::move(newmap);
    set_osdmap(osdmap);
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    // record the first epoch in which the map shows us up at our own
    // address; that is the epoch we actually came up
    if (!up_epoch &&
        osdmap->is_up(whoami) &&
        osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
        boot_epoch = osdmap->get_epoch();
        dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  // if we (re)bound before the map marked us up, the boot is complete:
  // transition booting -> active
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
        client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  // active OSD sanity checks against the newest map: do we still exist,
  // are we still up, and do all four messenger addresses still match?
  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      derr << "map says i do not exist. shutting down." << dendl;
      do_shutdown = true; // don't call shutdown() while we have
                          // everything paused
    } else if (osdmap->is_stop(whoami)) {
      derr << "map says i am stopped by admin. shutting down." << dendl;
      do_shutdown = true;
    } else if (!osdmap->is_up(whoami) ||
               !osdmap->get_addrs(whoami).legacy_equals(
                 client_messenger->get_myaddrs()) ||
               !osdmap->get_cluster_addrs(whoami).legacy_equals(
                 cluster_messenger->get_myaddrs()) ||
               !osdmap->get_hb_back_addrs(whoami).legacy_equals(
                 hb_back_server_messenger->get_myaddrs()) ||
               !osdmap->get_hb_front_addrs(whoami).legacy_equals(
                 hb_front_server_messenger->get_myaddrs())) {
      if (!osdmap->is_up(whoami)) {
        if (service.is_preparing_to_stop() || service.is_stopping()) {
          service.got_stop_ack();
        } else {
          clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
                          "but it is still running";
          clog->debug() << "map e" << osdmap->get_epoch()
                        << " wrongly marked me down at e"
                        << osdmap->get_down_at(whoami);
        }
        if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
          // note that this is best-effort...
          monc->send_mon_message(
            new MOSDMarkMeDead(
              monc->get_fsid(),
              whoami,
              osdmap->get_epoch()));
        }
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
                   client_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong client addr (" << osdmap->get_addrs(whoami)
                      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
                   cluster_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong cluster addr ("
                      << osdmap->get_cluster_addrs(whoami)
                      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
                   hb_back_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat back addr ("
                      << osdmap->get_hb_back_addrs(whoami)
                      << " != my " << hb_back_server_messenger->get_myaddrs()
                      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
                   hb_front_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat front addr ("
                      << osdmap->get_hb_front_addrs(whoami)
                      << " != my " << hb_front_server_messenger->get_myaddrs()
                      << ")";
      }

      if (!service.is_stopping()) {
        // reset up/bind epochs and rebind the cluster messenger so we
        // come back with a fresh address, then try to boot again
        epoch_t up_epoch = 0;
        epoch_t bind_epoch = osdmap->get_epoch();
        service.set_epochs(NULL,&up_epoch, &bind_epoch);
        do_restart = true;

        //add markdown log
        utime_t now = ceph_clock_now();
        utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
        osd_markdown_log.push_back(now);
        // too many markdowns within the grace period -> give up and
        // shut down instead of flapping
        if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
          derr << __func__ << " marked down "
               << osd_markdown_log.size()
               << " > osd_max_markdown_count "
               << cct->_conf->osd_max_markdown_count
               << " in last " << grace << " seconds, shutting down"
               << dendl;
          do_restart = false;
          do_shutdown = true;
        }

        start_waiting_for_healthy();

        set<int> avoid_ports;
#if defined(__FreeBSD__)
        // prevent FreeBSD from grabbing the client_messenger port during
        // rebinding. In which case a cluster_meesneger will connect also
        // to the same port
        client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
        cluster_messenger->get_myaddrs().get_ports(&avoid_ports);

        int r = cluster_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true;  // FIXME: do_restart?
          network_error = true;
          derr << __func__ << " marked down:"
               << " rebind cluster_messenger failed" << dendl;
        }

        hb_back_server_messenger->mark_down_all();
        hb_front_server_messenger->mark_down_all();
        hb_front_client_messenger->mark_down_all();
        hb_back_client_messenger->mark_down_all();

        reset_heartbeat_peers(true);
      }
    }
  }

  map_lock.unlock();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
             << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8315
// Align messenger feature requirements and on-disk compat flags with
// the current OSDMap: adjusts required feature bits for client, mon and
// osd policies, records the SHARDS incompat feature in the superblock
// if missing, toggles heartbeat authorizer requirements by release, and
// persists require_osd_release via store->write_meta when it changes.
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures a their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  const auto osdmap = get_osdmap();
  {
    // default policy governs client connections
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for clients" << dendl;
      // only replace the bits covered by the mask, keep the rest
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    // monitor connections are made through the client messenger
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << " was " << p.features_required
              << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    // peer OSD connections use the cluster messenger
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // one-way upgrade: once set, the SHARDS incompat flag is never cleared
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // pre-nautilus clusters cannot authenticate heartbeats
  if (osdmap->require_osd_release < ceph_release_t::nautilus) {
    hb_front_server_messenger->set_require_authorizer(false);
    hb_back_server_messenger->set_require_authorizer(false);
  } else {
    hb_front_server_messenger->set_require_authorizer(true);
    hb_back_server_messenger->set_require_authorizer(true);
  }

  // persist the release floor so it survives restarts
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
            << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
                      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8389
// Transaction-applied callback: once the split transaction is applied,
// hand the new child PGs back to the OSD for registration.
struct C_FinishSplits : public Context {
  OSD *osd;       // owning OSD; assumed to outlive this context
  set<PGRef> pgs; // split children to finish (refs keep them alive)
  C_FinishSplits(OSD *osd, const set<PGRef> &in)
    : osd(osd), pgs(in) {}
  void finish(int r) override {
    // r (apply result) is ignored; registration proceeds regardless
    osd->_finish_splits(pgs);
  }
};
8399
8400 void OSD::_finish_splits(set<PGRef>& pgs)
8401 {
8402 dout(10) << __func__ << " " << pgs << dendl;
8403 if (is_stopping())
8404 return;
8405 for (set<PGRef>::iterator i = pgs.begin();
8406 i != pgs.end();
8407 ++i) {
8408 PG *pg = i->get();
8409
8410 PeeringCtx rctx = create_context();
8411 pg->lock();
8412 dout(10) << __func__ << " " << *pg << dendl;
8413 epoch_t e = pg->get_osdmap_epoch();
8414 pg->handle_initialize(rctx);
8415 pg->queue_null(e, e);
8416 dispatch_context(rctx, pg, service.get_osdmap());
8417 pg->unlock();
8418
8419 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8420 shards[shard_index]->register_and_wake_split_child(pg);
8421 }
8422 };
8423
8424 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8425 unsigned need)
8426 {
8427 std::lock_guard l(merge_lock);
8428 auto& p = merge_waiters[nextmap->get_epoch()][target];
8429 p[src->pg_id] = src;
8430 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8431 << " for " << target << ", have " << p.size() << "/" << need
8432 << dendl;
8433 return p.size() == need;
8434 }
8435
// Advance @pg map-by-map from its current epoch up to @osd_epoch,
// handling pool deletions, PG merges (this PG may be a merge source or
// target) and PG splits along the way.  Called with the PG lock held.
// Returns true when the PG was fully advanced (and is still locked);
// returns false when the PG was consumed as a merge source or must wait
// for merge sources — in those paths the PG has been unlocked.
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs; // any split children
  bool ret = true;

  // pg_num of the pool in the PG's current map (0 if pool gone)
  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // gaps are tolerated; skip to the next epoch we do have
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
        spg_t parent;
        if (pg->pg_id.is_merge_source(
              old_pg_num,
              new_pg_num,
              &parent)) {
          // we are merge source
          PGRef spg = pg; // carry a ref
          dout(1) << __func__ << " " << pg->pg_id
                  << " is merge source, target is " << parent
                  << dendl;
          pg->write_if_dirty(rctx);
          // flush any pending split children before this PG disappears
          if (!new_pgs.empty()) {
            rctx.transaction.register_on_applied(new C_FinishSplits(this,
                                                                    new_pgs));
            new_pgs.clear();
          }
          dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
          pg->ch->flush();
          // release backoffs explicitly, since the on_shutdown path
          // aggressively tears down backoff state.
          if (pg->is_primary()) {
            pg->release_pg_backoffs();
          }
          pg->on_shutdown();
          OSDShard *sdata = pg->osd_shard;
          {
            std::lock_guard l(sdata->shard_lock);
            if (pg->pg_slot) {
              sdata->_detach_pg(pg->pg_slot);
              // update pg count now since we might not get an osdmap
              // any time soon.
              if (pg->is_primary())
                logger->dec(l_osd_pg_primary);
              else if (pg->is_nonprimary())
                logger->dec(l_osd_pg_replica); // misnomer
              else
                logger->dec(l_osd_pg_stray);
            }
          }
          pg->unlock();

          set<spg_t> children;
          parent.is_split(new_pg_num, old_pg_num, &children);
          // if we are the last source to arrive, kick the merge target
          if (add_merge_waiter(nextmap, parent, pg, children.size())) {
            enqueue_peering_evt(
              parent,
              PGPeeringEventRef(
                std::make_shared<PGPeeringEvent>(
                  nextmap->get_epoch(),
                  nextmap->get_epoch(),
                  NullEvt())));
          }
          ret = false;
          goto out;
        } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
          // we are merge target
          set<spg_t> children;
          pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
          dout(20) << __func__ << " " << pg->pg_id
                   << " is merge target, sources are " << children
                   << dendl;
          map<spg_t,PGRef> sources;
          {
            std::lock_guard l(merge_lock);
            auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
            unsigned need = children.size();
            dout(20) << __func__ << " have " << s.size() << "/"
                     << need << dendl;
            if (s.size() == need) {
              // all sources present: claim them and clean up the map
              sources.swap(s);
              merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
              if (merge_waiters[nextmap->get_epoch()].empty()) {
                merge_waiters.erase(nextmap->get_epoch());
              }
            }
          }
          if (!sources.empty()) {
            unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
            unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
            dout(1) << __func__ << " merging " << pg->pg_id << dendl;
            pg->merge_from(
              sources, rctx, split_bits,
              nextmap->get_pg_pool(
                pg->pg_id.pool())->last_pg_merge_meta);
            pg->pg_slot->waiting_for_merge_epoch = 0;
          } else {
            // not all sources ready: park this PG and poke the sources
            dout(20) << __func__ << " not ready to merge yet" << dendl;
            pg->write_if_dirty(rctx);
            if (!new_pgs.empty()) {
              rctx.transaction.register_on_applied(new C_FinishSplits(this,
                                                                      new_pgs));
              new_pgs.clear();
            }
            dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
            pg->unlock();
            // kick source(s) to get them ready
            for (auto& i : children) {
              dout(20) << __func__ << " kicking source " << i << dendl;
              enqueue_peering_evt(
                i,
                PGPeeringEventRef(
                  std::make_shared<PGPeeringEvent>(
                    nextmap->get_epoch(),
                    nextmap->get_epoch(),
                    NullEvt())));
            }
            ret = false;
            goto out;
          }
        }
      }
    }

    // normal per-epoch advance: recompute mappings and feed them to the
    // PG's peering state machine
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
        && newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
               << " new pool opts " << newpool->second.opts
               << " old pool opts " << oldpool->second.opts
               << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is change from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      // unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
        pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
            old_pg_num,
            new_pg_num,
            &children)) {
        split_pgs(
          pg, children, &new_pgs, lastmap, nextmap,
          rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  // register any split children produced along the way, even on the
  // early-exit (merge) paths
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8642
// Publish the current OSDMap to the rest of the OSD: prime pending
// splits/merges on every shard, push the map into each shard's queue,
// recount PG role stats, prune stale pending creates, wake sessions
// waiting on the map, and queue null peering events so every PG
// advances to the new epoch.  Caller holds osd_lock.
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   *  speak the older sorting version any more. Be careful not to force
   *  a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  // pre-publish, drain reservations, then publish — in this order
  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      // prime_splits consumes the entries it handles
      shard->prime_splits(osdmap, &newly_split);
    }
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    // each shard accumulates recovery pushes it can release
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    std::lock_guard l(pending_creates_lock);
    // drop pending creates for PGs that no longer map to this OSD
    for (auto pg = pending_creates_from_osd.begin();
         pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
        dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
                 << "discarding pending_create_from_osd" << dendl;
        pg = pending_creates_from_osd.erase(pg);
      } else {
        ++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          osdmap->get_epoch(),
          osdmap->get_epoch(),
          NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8760
8761 void OSD::activate_map()
8762 {
8763 ceph_assert(ceph_mutex_is_locked(osd_lock));
8764 auto osdmap = get_osdmap();
8765
8766 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8767
8768 // norecover?
8769 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8770 if (!service.recovery_is_paused()) {
8771 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8772 service.pause_recovery();
8773 }
8774 } else {
8775 if (service.recovery_is_paused()) {
8776 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8777 service.unpause_recovery();
8778 }
8779 }
8780
8781 service.activate_map();
8782
8783 // process waiters
8784 take_waiters(waiting_for_osdmap);
8785 }
8786
8787 bool OSD::require_mon_peer(const Message *m)
8788 {
8789 if (!m->get_connection()->peer_is_mon()) {
8790 dout(0) << "require_mon_peer received from non-mon "
8791 << m->get_connection()->get_peer_addr()
8792 << " " << *m << dendl;
8793 return false;
8794 }
8795 return true;
8796 }
8797
8798 bool OSD::require_mon_or_mgr_peer(const Message *m)
8799 {
8800 if (!m->get_connection()->peer_is_mon() &&
8801 !m->get_connection()->peer_is_mgr()) {
8802 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8803 << m->get_connection()->get_peer_addr()
8804 << " " << *m << dendl;
8805 return false;
8806 }
8807 return true;
8808 }
8809
8810 bool OSD::require_osd_peer(const Message *m)
8811 {
8812 if (!m->get_connection()->peer_is_osd()) {
8813 dout(0) << "require_osd_peer received from non-osd "
8814 << m->get_connection()->get_peer_addr()
8815 << " " << *m << dendl;
8816 return false;
8817 }
8818 return true;
8819 }
8820
8821 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8822 {
8823 epoch_t up_epoch = service.get_up_epoch();
8824 if (epoch < up_epoch) {
8825 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8826 return false;
8827 }
8828
8829 if (!is_active()) {
8830 dout(7) << "still in boot state, dropping message " << *m << dendl;
8831 return false;
8832 }
8833
8834 return true;
8835 }
8836
8837 bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
8838 bool is_fast_dispatch)
8839 {
8840 int from = m->get_source().num();
8841
8842 if (map->is_down(from) ||
8843 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
8844 dout(5) << "from dead osd." << from << ", marking down, "
8845 << " msg was " << m->get_source_inst().addr
8846 << " expected "
8847 << (map->is_up(from) ?
8848 map->get_cluster_addrs(from) : entity_addrvec_t())
8849 << dendl;
8850 ConnectionRef con = m->get_connection();
8851 con->mark_down();
8852 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
8853 if (!is_fast_dispatch)
8854 s->session_dispatch_lock.lock();
8855 clear_session_waiting_on_map(s);
8856 con->set_priv(nullptr); // break ref <-> session cycle, if any
8857 s->con.reset();
8858 if (!is_fast_dispatch)
8859 s->session_dispatch_lock.unlock();
8860 }
8861 return false;
8862 }
8863 return true;
8864 }
8865
8866
8867 /*
8868 * require that we have same (or newer) map, and that
8869 * the source is the pg primary.
8870 */
8871 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8872 bool is_fast_dispatch)
8873 {
8874 const Message *m = op->get_req();
8875 const auto osdmap = get_osdmap();
8876 dout(15) << "require_same_or_newer_map " << epoch
8877 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8878
8879 ceph_assert(ceph_mutex_is_locked(osd_lock));
8880
8881 // do they have a newer map?
8882 if (epoch > osdmap->get_epoch()) {
8883 dout(7) << "waiting for newer map epoch " << epoch
8884 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8885 wait_for_new_map(op);
8886 return false;
8887 }
8888
8889 if (!require_self_aliveness(op->get_req(), epoch)) {
8890 return false;
8891 }
8892
8893 // ok, our map is same or newer.. do they still exist?
8894 if (m->get_connection()->get_messenger() == cluster_messenger &&
8895 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8896 return false;
8897 }
8898
8899 return true;
8900 }
8901
8902
8903
8904
8905
8906 // ----------------------------------------
8907 // pg creation
8908
// Split @parent into the child PGs named in @childpgids (per the pg_num
// increase between @curmap and @nextmap).  Creates each child PG and its
// collection, moves the child's share of objects and metadata via
// split_colls/split_into, and distributes the parent's stats.  New
// children are returned (locked lifetime handled here; they are
// unlocked before return) via @out_pgs for later registration.
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // one stat bucket per child plus a final one for the parent itself
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the new collection's commit completions to its shard
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
             << ", m_seed " << i->ps()
             << ", split_bits is " << split_bits << dendl;
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx.transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // the remaining bucket holds the parent's post-split stats
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
8962
/*
 * holding osd_lock
 */
// Handle a legacy MOSDPGCreate from a monitor: for each requested pg,
// skip pgs on deleted pools or pgs we are not the acting primary for,
// build an initial history/past-intervals, and enqueue a peering event
// carrying a PGCreateInfo so the pg gets instantiated.
void OSD::handle_pg_create(OpRequestRef op)
{
  // NOTE: this can be removed in P release (mimic is the last version to
  // send MOSDPGCreate messages).

  auto m = op->get_req<MOSDPGCreate>();
  ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  const auto osdmap = get_osdmap();
  // mkpg and ctimes are parallel maps keyed by pg_t; walk them together
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    ceph_assert(mapped);

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
               << "), my role=" << role << ", skipping" << dendl;
      continue;
    }


    PastIntervals pi;
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so we ignore
    // same_interval_since.  We'll pass this history with the current
    // epoch as the event.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
               << pgid << " from epoch " << m->epoch
               << ", primary changed in " << history.same_primary_since
               << dendl;
      continue;
    }
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          osdmap->get_epoch(),
          osdmap->get_epoch(),
          NullEvt(),
          true,
          new PGCreateInfo(
            pgid,
            osdmap->get_epoch(),
            history,
            pi,
            true)
          )));
  }

  {
    std::lock_guard l(pending_creates_lock);
    // no mon-driven creates outstanding: remember the epoch we are
    // caught up to
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
9061
9062
9063 // ----------------------------------------
9064 // peering and recovery
9065
9066 PeeringCtx OSD::create_context()
9067 {
9068 return PeeringCtx(get_osdmap()->require_osd_release);
9069 }
9070
void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
                           ThreadPool::TPHandle *handle)
{
  // Deliver the side effects accumulated in a PeeringCtx: send the queued
  // per-OSD messages (only while we are up in the map and active), then
  // queue the accumulated transaction on the PG's collection handle.
  if (!service.get_osdmap()->is_up(whoami)) {
    dout(20) << __func__ << " not up in osdmap" << dendl;
  } else if (!is_active()) {
    dout(20) << __func__ << " not active" << dendl;
  } else {
    for (auto& [osd, ls] : ctx.message_map) {
      if (!curmap->is_up(osd)) {
        dout(20) << __func__ << " skipping down osd." << osd << dendl;
        continue;
      }
      ConnectionRef con = service.get_con_osd_cluster(
        osd, curmap->get_epoch());
      if (!con) {
        dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
                 << dendl;
        continue;
      }
      // make sure the peer has a map at least as new as ours before the
      // peering messages arrive
      service.maybe_share_map(con.get(), curmap);
      for (auto m : ls) {
        con->send_message2(m);
      }
      ls.clear();
    }
  }
  // the transaction is queued even when we are not up/active, as long as it
  // holds ops or completion contexts and there is a PG to queue it against
  if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
    int tr = store->queue_transaction(
      pg->ch,
      std::move(ctx.transaction), TrackedOpRef(),
      handle);
    ceph_assert(tr == 0);
  }
}
9106
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  // Fast-dispatch handler for pg creations from the monitor (MOSDPGCreate2).
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    // pg_extra carries {history, past_intervals} when the mon is new enough
    auto q = m->pg_extra.find(pgid);
    if (q == m->pg_extra.end()) {
      dout(20) << __func__ << " " << pgid << " e" << created
               << "@" << created_stamp
               << " (no history or past_intervals)" << dendl;
      // pre-octopus ... no pg history.  this can be removed in Q release.
      enqueue_peering_evt(
        pgid,
        PGPeeringEventRef(
          std::make_shared<PGPeeringEvent>(
            m->epoch,
            m->epoch,
            NullEvt(),
            true,
            new PGCreateInfo(
              pgid,
              created,
              pg_history_t(created, created_stamp),
              PastIntervals(),
              true)
            )));
    } else {
      dout(20) << __func__ << " " << pgid << " e" << created
               << "@" << created_stamp
               << " history " << q->second.first
               << " pi " << q->second.second << dendl;
      // sanity check: a non-empty past_intervals must not extend beyond the
      // epoch the message was sent at
      if (!q->second.second.empty() &&
          m->epoch < q->second.second.get_bounds().second) {
        clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
                      << " and unmatched past_intervals " << q->second.second
                      << " (history " << q->second.first << ")";
      } else {
        enqueue_peering_evt(
          pgid,
          PGPeeringEventRef(
            std::make_shared<PGPeeringEvent>(
              m->epoch,
              m->epoch,
              NullEvt(),
              true,
              new PGCreateInfo(
                pgid,
                m->epoch,
                q->second.first,
                q->second.second,
                true)
              )));
      }
    }
  }

  {
    // advance last_pg_create_epoch only when no mon-initiated creates are
    // still pending
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9178
9179 void OSD::handle_fast_pg_query(MOSDPGQuery *m)
9180 {
9181 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9182 if (!require_osd_peer(m)) {
9183 m->put();
9184 return;
9185 }
9186 int from = m->get_source().num();
9187 for (auto& p : m->pg_list) {
9188 enqueue_peering_evt(
9189 p.first,
9190 PGPeeringEventRef(
9191 std::make_shared<PGPeeringEvent>(
9192 p.second.epoch_sent, p.second.epoch_sent,
9193 MQuery(
9194 p.first,
9195 pg_shard_t(from, p.second.from),
9196 p.second,
9197 p.second.epoch_sent),
9198 false))
9199 );
9200 }
9201 m->put();
9202 }
9203
9204 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9205 {
9206 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9207 if (!require_osd_peer(m)) {
9208 m->put();
9209 return;
9210 }
9211 int from = m->get_source().num();
9212 for (auto& p : m->get_pg_list()) {
9213 spg_t pgid(p.info.pgid.pgid, p.to);
9214 enqueue_peering_evt(
9215 pgid,
9216 PGPeeringEventRef(
9217 std::make_shared<PGPeeringEvent>(
9218 p.epoch_sent,
9219 p.query_epoch,
9220 MNotifyRec(
9221 pgid, pg_shard_t(from, p.from),
9222 p,
9223 m->get_connection()->get_features()),
9224 true,
9225 new PGCreateInfo(
9226 pgid,
9227 p.query_epoch,
9228 p.info.history,
9229 p.past_intervals,
9230 false)
9231 )));
9232 }
9233 m->put();
9234 }
9235
9236 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9237 {
9238 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9239 if (!require_osd_peer(m)) {
9240 m->put();
9241 return;
9242 }
9243 int from = m->get_source().num();
9244 for (auto& p : m->pg_list) {
9245 enqueue_peering_evt(
9246 spg_t(p.info.pgid.pgid, p.to),
9247 PGPeeringEventRef(
9248 std::make_shared<PGPeeringEvent>(
9249 p.epoch_sent, p.query_epoch,
9250 MInfoRec(
9251 pg_shard_t(from, p.from),
9252 p.info,
9253 p.epoch_sent)))
9254 );
9255 }
9256 m->put();
9257 }
9258
9259 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9260 {
9261 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9262 if (!require_osd_peer(m)) {
9263 m->put();
9264 return;
9265 }
9266 for (auto& pgid : m->pg_list) {
9267 enqueue_peering_evt(
9268 pgid,
9269 PGPeeringEventRef(
9270 std::make_shared<PGPeeringEvent>(
9271 m->get_epoch(), m->get_epoch(),
9272 PeeringState::DeleteStart())));
9273 }
9274 m->put();
9275 }
9276
9277 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9278 {
9279 dout(10) << __func__ << " " << *m << dendl;
9280 if (!require_mon_or_mgr_peer(m)) {
9281 m->put();
9282 return;
9283 }
9284 epoch_t epoch = get_osdmap_epoch();
9285 for (auto pgid : m->forced_pgs) {
9286 if (m->options & OFR_BACKFILL) {
9287 if (m->options & OFR_CANCEL) {
9288 enqueue_peering_evt(
9289 pgid,
9290 PGPeeringEventRef(
9291 std::make_shared<PGPeeringEvent>(
9292 epoch, epoch,
9293 PeeringState::UnsetForceBackfill())));
9294 } else {
9295 enqueue_peering_evt(
9296 pgid,
9297 PGPeeringEventRef(
9298 std::make_shared<PGPeeringEvent>(
9299 epoch, epoch,
9300 PeeringState::SetForceBackfill())));
9301 }
9302 } else if (m->options & OFR_RECOVERY) {
9303 if (m->options & OFR_CANCEL) {
9304 enqueue_peering_evt(
9305 pgid,
9306 PGPeeringEventRef(
9307 std::make_shared<PGPeeringEvent>(
9308 epoch, epoch,
9309 PeeringState::UnsetForceRecovery())));
9310 } else {
9311 enqueue_peering_evt(
9312 pgid,
9313 PGPeeringEventRef(
9314 std::make_shared<PGPeeringEvent>(
9315 epoch, epoch,
9316 PeeringState::SetForceRecovery())));
9317 }
9318 }
9319 }
9320 m->put();
9321 }
9322
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  // Answer a peer's query about a PG that does not exist locally.
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  // if the pool itself is gone, stay silent
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;

  dout(10) << " pg " << pgid << " dne" << dendl;
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
        q.query.type == pg_query_t::FULLLOG) {
      // log queries get an empty log reply
      m = new MOSDPGLog(
        q.query.from, q.query.to,
        osdmap->get_epoch(), empty,
        q.query.epoch_sent);
    } else {
      // any other query type gets an empty notify
      vector<pg_notify_t> ls;
      ls.push_back(
        pg_notify_t(
          q.query.from, q.query.to,
          q.query.epoch_sent,
          osdmap->get_epoch(),
          empty,
          PastIntervals()));
      m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
    }
    service.maybe_share_map(con.get(), osdmap);
    con->send_message(m);
  }
}
9358
9359 void OSDService::queue_check_readable(spg_t spgid,
9360 epoch_t lpr,
9361 ceph::signedspan delay)
9362 {
9363 if (delay == ceph::signedspan::zero()) {
9364 osd->enqueue_peering_evt(
9365 spgid,
9366 PGPeeringEventRef(
9367 std::make_shared<PGPeeringEvent>(
9368 lpr, lpr,
9369 PeeringState::CheckReadable())));
9370 } else {
9371 mono_timer.add_event(
9372 delay,
9373 [this, spgid, lpr]() {
9374 queue_check_readable(spgid, lpr);
9375 });
9376 }
9377 }
9378
9379
9380 // =========================================================
9381 // RECOVERY
9382
void OSDService::_maybe_queue_recovery() {
  // Start as many queued (throttled) recovery items as the current push
  // budget allows.  Caller must hold recovery_lock.
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
         _recover_now(&available_pushes)) {
    // each queued item may start at most osd_recovery_max_single_start ops
    uint64_t to_start = std::min(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    dout(10) << __func__ << " starting " << to_start
             << ", recovery_ops_reserved " << recovery_ops_reserved
             << " -> " << (recovery_ops_reserved + to_start) << dendl;
    recovery_ops_reserved += to_start;
  }
}
9399
9400 bool OSDService::_recover_now(uint64_t *available_pushes)
9401 {
9402 if (available_pushes)
9403 *available_pushes = 0;
9404
9405 if (ceph_clock_now() < defer_recovery_until) {
9406 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9407 return false;
9408 }
9409
9410 if (recovery_paused) {
9411 dout(15) << __func__ << " paused" << dendl;
9412 return false;
9413 }
9414
9415 uint64_t max = osd->get_recovery_max_active();
9416 if (max <= recovery_ops_active + recovery_ops_reserved) {
9417 dout(15) << __func__ << " active " << recovery_ops_active
9418 << " + reserved " << recovery_ops_reserved
9419 << " >= max " << max << dendl;
9420 return false;
9421 }
9422
9423 if (available_pushes)
9424 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9425
9426 return true;
9427 }
9428
9429 unsigned OSDService::get_target_pg_log_entries() const
9430 {
9431 auto num_pgs = osd->get_num_pgs();
9432 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9433 if (num_pgs > 0 && target > 0) {
9434 // target an even spread of our budgeted log entries across all
9435 // PGs. note that while we only get to control the entry count
9436 // for primary PGs, we'll normally be responsible for a mix of
9437 // primary and replica PGs (for the same pool(s) even), so this
9438 // will work out.
9439 return std::max<unsigned>(
9440 std::min<unsigned>(target / num_pgs,
9441 cct->_conf->osd_max_pg_log_entries),
9442 cct->_conf->osd_min_pg_log_entries);
9443 } else {
9444 // fall back to a per-pg value.
9445 return cct->_conf->osd_min_pg_log_entries;
9446 }
9447 }
9448
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  // number of recovery ops actually started below
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
      auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
                 << ", re-queuing recovery" << dendl;
        std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.

      if (auto now = ceph::real_clock::now();
          service.recovery_schedule_time < now) {
        service.recovery_schedule_time = now;
      }
      service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
                                       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      // pushes stay reserved; the requeued event will release them later
      return;
    }
  }

  {
    {
      // the next recovery pass for this shard should sleep again
      std::lock_guard l(service.sleep_lock);
      service.recovery_needs_sleep = true;
    }

    // bail if the pg went through an interval change since this was queued
    if (pg->pg_has_reset_since(queued)) {
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
             << " on " << *pg << dendl;

    // start_recovery_ops signalled that unfound objects should be rechecked
    if (do_unfound) {
      PeeringCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  // always return the reserved pushes, whether we used them or not
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
9524
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  // Account a newly started recovery op (per-OSD counter; debug builds
  // additionally track the individual object ids per pg).
  std::lock_guard l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
           << " (" << recovery_ops_active << "/"
           << osd->get_recovery_max_active() << " rops)"
           << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
  // the same object must not be started twice
  ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
  recovery_oids[pg->pg_id].insert(soid);
#endif
}
9540
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  // Account a completed recovery op, then kick any throttled recovery
  // work that the freed slot now permits.
  std::lock_guard l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
           << " dequeue=" << dequeue
           << " (" << recovery_ops_active << "/"
           << osd->get_recovery_max_active() << " rops)"
           << dendl;

  // adjust count
  ceph_assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid));
  recovery_oids[pg->pg_id].erase(soid);
#endif

  _maybe_queue_recovery();
}
9562
9563 bool OSDService::is_recovery_active()
9564 {
9565 if (cct->_conf->osd_debug_pretend_recovery_active) {
9566 return true;
9567 }
9568 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9569 }
9570
void OSDService::release_reserved_pushes(uint64_t pushes)
{
  // Return reserved (finished or unused) pushes to the budget and see
  // whether throttled recovery work can now start.
  std::lock_guard l(recovery_lock);
  dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
           << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
           << dendl;
  ceph_assert(recovery_ops_reserved >= pushes);
  recovery_ops_reserved -= pushes;
  _maybe_queue_recovery();
}
9581
9582 // =========================================================
9583 // OPS
9584
9585 bool OSD::op_is_discardable(const MOSDOp *op)
9586 {
9587 // drop client request if they are not connected and can't get the
9588 // reply anyway.
9589 if (!op->get_connection()->is_connected()) {
9590 return true;
9591 }
9592 return false;
9593 }
9594
9595 void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
9596 {
9597 const utime_t stamp = op->get_req()->get_recv_stamp();
9598 const utime_t latency = ceph_clock_now() - stamp;
9599 const unsigned priority = op->get_req()->get_priority();
9600 const int cost = op->get_req()->get_cost();
9601 const uint64_t owner = op->get_req()->get_source().num();
9602
9603 dout(15) << "enqueue_op " << op << " prio " << priority
9604 << " cost " << cost
9605 << " latency " << latency
9606 << " epoch " << epoch
9607 << " " << *(op->get_req()) << dendl;
9608 op->osd_trace.event("enqueue op");
9609 op->osd_trace.keyval("priority", priority);
9610 op->osd_trace.keyval("cost", cost);
9611 op->mark_queued_for_pg();
9612 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9613 op_shardedwq.queue(
9614 OpSchedulerItem(
9615 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9616 cost, priority, stamp, owner, epoch));
9617 }
9618
9619 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9620 {
9621 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9622 op_shardedwq.queue(
9623 OpSchedulerItem(
9624 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9625 10,
9626 cct->_conf->osd_peering_op_priority,
9627 utime_t(),
9628 0,
9629 evt->get_epoch_sent()));
9630 }
9631
9632 /*
9633 * NOTE: dequeue called in worker thread, with pg lock
9634 */
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  // Deliver a dequeued op to its PG (worker-thread context; see the NOTE
  // above -- the pg lock is held by the caller).
  const Message *m = op->get_req();

  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);

  // time between arrival on the wire and dequeue here
  utime_t latency = now - m->get_recv_stamp();
  dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
           << " cost " << m->get_cost()
           << " latency " << latency
           << " " << *m
           << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  // make sure the sender has a map at least as new as this PG's
  service.maybe_share_map(m->get_connection().get(),
                          pg->get_osdmap(),
                          op->sent_epoch);

  // drop ops aimed at a PG that is being deleted
  if (pg->is_deleting())
    return;

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << op << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
}
9672
9673
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  // Apply one peering event to a PG (or handle the pg-less case), then
  // dispatch the accumulated side effects.
  PeeringCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  bool need_up_thru = false;
  epoch_t same_interval_since = 0;
  if (!pg) {
    // only queries may legitimately target a non-existent PG
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
    pg->do_peering_event(evt, rctx);
    if (pg->is_deleted()) {
      // the event deleted the PG; just drop the pg lock and bail
      pg->unlock();
      return;
    }
    dispatch_context(rctx, pg, curmap, &handle);
    // sample these under the pg lock; they are acted on below without it
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }

  service.send_pg_temp();
}
9709
9710 void OSD::dequeue_delete(
9711 OSDShard *sdata,
9712 PG *pg,
9713 epoch_t e,
9714 ThreadPool::TPHandle& handle)
9715 {
9716 dequeue_peering_evt(
9717 sdata,
9718 pg,
9719 PGPeeringEventRef(
9720 std::make_shared<PGPeeringEvent>(
9721 e, e,
9722 PeeringState::DeleteSome())),
9723 handle);
9724 }
9725
9726
9727
9728 // --------------------------------
9729
const char** OSD::get_tracked_conf_keys() const
{
  // Config keys we want handle_conf_change() to be notified about;
  // NULL-terminated, as required by the observer interface.
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_object_clean_region_max_num_intervals",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL  // sentinel
  };
  return KEYS;
}
9769
void OSD::handle_conf_change(const ConfigProxy& conf,
                             const std::set <std::string> &changed)
{
  // React to runtime changes of the config keys listed in
  // get_tracked_conf_keys(): push the new values into the affected
  // subsystems.  Serialized against the rest of the OSD via osd_lock.
  std::lock_guard l{osd_lock};
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  if (changed.count("osd_map_cache_size")) {
    // all three map caches share the one size knob
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    // any cluster-log related option: re-parse and push the whole set
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    // only applied when a message throttler exists and the cap is positive
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }
  if (changed.count("osd_object_clean_region_max_num_intervals")) {
    ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
  }

  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  // re-validate cross-option invariants after any change
  check_config();
}
9862
void OSD::update_log_config()
{
  // Re-read the cluster-log (clog) client options from config and push
  // them into our LogClient.
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  // NOTE: the if below has no braces -- only the update_config() call is
  // conditional on a successful parse; the derr below always runs.
  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
                               log_channel, log_prio, log_to_graylog,
                               log_to_graylog_host, log_to_graylog_port,
                               fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
                        log_channel, log_prio, log_to_graylog,
                        log_to_graylog_host, log_to_graylog_port,
                        fsid, host);
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
9885
void OSD::check_config()
{
  // some sanity checks
  // warn (but do not fail) on suspicious cross-option combinations
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
                 << " is not > osd_pg_epoch_persisted_max_stale ("
                 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
  if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
    clog->warn() << "osd_object_clean_region_max_num_intervals ("
                 << cct->_conf->osd_object_clean_region_max_num_intervals
                 << ") is < 0";
  }
}
9900
9901 // --------------------------------
9902
void OSD::get_latest_osdmap()
{
  // Synchronously ask the objecter for the newest osdmap; blocks the
  // caller until it has arrived.
  dout(10) << __func__ << " -- start" << dendl;

  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();

  dout(10) << __func__ << " -- finish" << dendl;
}
9913
9914 // --------------------------------
9915
void OSD::set_perf_queries(const ConfigPayload &config_payload) {
  // Install the dynamic perf-metric queries pushed down by the mgr and
  // propagate them to every PG.
  const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
  const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
  dout(10) << "setting " << queries.size() << " queries" << dendl;

  // only queries with a non-empty key descriptor are supported; the rest
  // are dropped (and counted in the log line below)
  std::list<OSDPerfMetricQuery> supported_queries;
  for (auto &it : queries) {
    auto &query = it.first;
    if (!query.key_descriptor.empty()) {
      supported_queries.push_back(query);
    }
  }
  if (supported_queries.size() < queries.size()) {
    dout(1) << queries.size() - supported_queries.size()
            << " unsupported queries" << dendl;
  }
  {
    std::lock_guard locker{m_perf_queries_lock};
    m_perf_queries = supported_queries;
    m_perf_limits = queries;
  }
  // push the new query set to all PGs, each under its own lock
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    std::scoped_lock l{*pg};
    pg->set_dynamic_perf_stats_queries(supported_queries);
  }
}
9944
MetricPayload OSD::get_perf_reports() {
  // Aggregate dynamic perf stats from all PGs into a per-query report
  // payload for the mgr.
  OSDMetricPayload payload;
  std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;

  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by by mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    dps.merge(pg_dps);
  }
  dps.add_to_reports(m_perf_limits, &reports);
  dout(20) << "reports for " << reports.size() << " queries" << dendl;

  return payload;
}
9968
9969 // =============================================================
9970
9971 #undef dout_context
9972 #define dout_context cct
9973 #undef dout_prefix
9974 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
9975
9976 void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
9977 {
9978 dout(10) << pg->pg_id << " " << pg << dendl;
9979 slot->pg = pg;
9980 pg->osd_shard = this;
9981 pg->pg_slot = slot;
9982 osd->inc_num_pgs();
9983
9984 slot->epoch = pg->get_osdmap_epoch();
9985 pg_slots_by_epoch.insert(*slot);
9986 }
9987
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  // Sever the pg <-> slot links, drop the epoch-index entry, and wake any
  // wait_min_pg_epoch() waiters (the minimum may have changed).
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10002
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  // Move a slot to its new position in the epoch-ordered intrusive set
  // (erase, update key, re-insert) and wake wait_min_pg_epoch() waiters.
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10018
10019 epoch_t OSDShard::get_min_pg_epoch()
10020 {
10021 std::lock_guard l(shard_lock);
10022 auto p = pg_slots_by_epoch.begin();
10023 if (p == pg_slots_by_epoch.end()) {
10024 return 0;
10025 }
10026 return p->epoch;
10027 }
10028
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  // Block until every PG slot on this shard has reached at least epoch
  // `need` (or there are no slots at all).
  std::unique_lock l{shard_lock};
  ++waiting_for_min_pg_epoch;  // tells updaters to notify the condvar
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      // begin() is the minimum-epoch slot in the ordered set
      return true;
    } else {
      dout(10) << need << " waiting on "
               << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10046
10047 epoch_t OSDShard::get_max_waiting_epoch()
10048 {
10049 std::lock_guard l(shard_lock);
10050 epoch_t r = 0;
10051 for (auto& i : pg_slots) {
10052 if (!i.second->waiting_peering.empty()) {
10053 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10054 }
10055 }
10056 return r;
10057 }
10058
// Install new_osdmap as this shard's map and reconcile every PG slot
// with it: requeue peering work that the new map unblocks, drop stale
// or misdirected waiting items (crediting their reserved recovery
// pushes back via *pushes_to_free), and prune slots that are now
// completely empty.
void OSDShard::consume_map(
  const OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // osdmap_lock protects readers that only need shard_osdmap and do
    // not hold shard_lock.
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
	   << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
	   << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // a split is in flight for this pg; leave everything parked
      dout(20) << __func__ << " " << pgid
	       << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // merge target epoch is still in the future; leave slot alone
      dout(20) << __func__ << " " << pgid
	       << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
	       << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
	// the new map satisfies (at least) the earliest waiter; wake
	// the whole slot and let _process re-sort what is still early
	dout(20) << __func__ << " " << pgid
		 << " pending_peering first epoch " << first
		 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
	_wake_pg_slot(pgid, slot);
	queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
	// pg still maps here; waiters stay parked until the pg appears
	dout(20) << __func__ << " " << pgid << " maps to us, keeping"
		 << dendl;
	++p;
	continue;
      }
      // pg no longer maps to this OSD: drop items whose epoch the new
      // map already covers (the client will resend if appropriate)
      while (!slot->waiting.empty() &&
	     slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
	auto& qi = slot->waiting.front();
	dout(20) << __func__ << " " << pgid
		 << " waiting item " << qi
		 << " epoch " << qi.get_map_epoch()
		 << " <= " << new_osdmap->get_epoch()
		 << ", "
		 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
		     "misdirected")
		 << ", dropping" << dendl;
	// return the recovery push reservations held by the dropped item
	*pushes_to_free += qi.get_reserved_pushes();
	slot->waiting.pop_front();
      }
    }
    if (slot->waiting.empty() &&
	slot->num_running == 0 &&
	slot->waiting_for_split.empty() &&
	!slot->pg) {
      // nothing queued, nothing running, no pg attached: slot is dead
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    // wake one worker to pick up the requeued peering work
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10144
// Requeue everything parked on a slot back onto the scheduler.  Caller
// holds shard_lock.  Each list is walked in REVERSE and pushed with
// enqueue_front() so the original relative order is preserved once all
// items are in the scheduler.  Bumping requeue_seq lets _process detect
// that it raced with this requeue and must not assume its noted queue
// state is still valid.
void OSDShard::_wake_pg_slot(
  spg_t pgid,
  OSDShardPGSlot *slot)
{
  dout(20) << __func__ << " " << pgid
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;
  for (auto i = slot->to_process.rbegin();
       i != slot->to_process.rend();
       ++i) {
    scheduler->enqueue_front(std::move(*i));
  }
  slot->to_process.clear();
  for (auto i = slot->waiting.rbegin();
       i != slot->waiting.rend();
       ++i) {
    scheduler->enqueue_front(std::move(*i));
  }
  slot->waiting.clear();
  for (auto i = slot->waiting_peering.rbegin();
       i != slot->waiting_peering.rend();
       ++i) {
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet. FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
      scheduler->enqueue_front(std::move(*j));
    }
  }
  slot->waiting_peering.clear();
  // signal any in-flight _process that the slot's queues were rebuilt
  ++slot->requeue_seq;
}
10178
10179 void OSDShard::identify_splits_and_merges(
10180 const OSDMapRef& as_of_osdmap,
10181 set<pair<spg_t,epoch_t>> *split_pgs,
10182 set<pair<spg_t,epoch_t>> *merge_pgs)
10183 {
10184 std::lock_guard l(shard_lock);
10185 if (shard_osdmap) {
10186 for (auto& i : pg_slots) {
10187 const spg_t& pgid = i.first;
10188 auto *slot = i.second.get();
10189 if (slot->pg) {
10190 osd->service.identify_splits_and_merges(
10191 shard_osdmap, as_of_osdmap, pgid,
10192 split_pgs, merge_pgs);
10193 } else if (!slot->waiting_for_split.empty()) {
10194 osd->service.identify_splits_and_merges(
10195 shard_osdmap, as_of_osdmap, pgid,
10196 split_pgs, nullptr);
10197 } else {
10198 dout(20) << __func__ << " slot " << pgid
10199 << " has no pg and waiting_for_split " << dendl;
10200 }
10201 }
10202 }
10203 }
10204
// Prime split-child slots on this shard for the pgids that hash here
// (consuming them from *pgids; entries for other shards are left in the
// set for the caller to distribute).  If this shard's map has already
// advanced past as_of_osdmap, also identify and prime any grandchildren
// created by the newer map.
void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *pgids)
{
  std::lock_guard l(shard_lock);
  _prime_splits(pgids);
  if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
    set<pair<spg_t,epoch_t>> newer_children;
    for (auto i : *pgids) {
      osd->service.identify_splits_and_merges(
	as_of_osdmap, shard_osdmap, i.first,
	&newer_children, nullptr);
    }
    newer_children.insert(pgids->begin(), pgids->end());
    dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
	     << shard_osdmap->get_epoch() << ", new children " << newer_children
	     << dendl;
    _prime_splits(&newer_children);
    // note: we don't care what is left over here for other shards.
    // if this shard is ahead of us and one isn't, e.g., one thread is
    // calling into prime_splits via _process (due to a newly created
    // pg) and this shard has a newer map due to a racing consume_map,
    // then any grandchildren left here will be identified (or were
    // identified) when the slower shard's osdmap is advanced.
    // _prime_splits() will tolerate the case where the pgid is
    // already primed.
  }
}
10232
10233 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10234 {
10235 dout(10) << *pgids << dendl;
10236 auto p = pgids->begin();
10237 while (p != pgids->end()) {
10238 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10239 if (shard_index == shard_id) {
10240 auto r = pg_slots.emplace(p->first, nullptr);
10241 if (r.second) {
10242 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10243 r.first->second = make_unique<OSDShardPGSlot>();
10244 r.first->second->waiting_for_split.insert(p->second);
10245 } else {
10246 auto q = r.first;
10247 ceph_assert(q != pg_slots.end());
10248 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10249 << dendl;
10250 q->second->waiting_for_split.insert(p->second);
10251 }
10252 p = pgids->erase(p);
10253 } else {
10254 ++p;
10255 }
10256 }
10257 }
10258
// Prepare this shard's slots for PGs that will participate in merges at
// the given epochs, consuming the entries of *merge_pgs that hash to
// this shard.  A merge participant that does not yet exist (and is not
// blocked behind an earlier split) gets an empty placeholder pg created
// so that PG::merge_from() has something to merge into.
void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " checking shard " << shard_id
	   << " for remaining merge pgs " << merge_pgs << dendl;
  auto p = merge_pgs->begin();
  while (p != merge_pgs->end()) {
    spg_t pgid = p->first;
    epoch_t epoch = p->second;
    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
    if (shard_index != shard_id) {
      // belongs to another shard; leave it for that shard's call
      ++p;
      continue;
    }
    OSDShardPGSlot *slot;
    // find-or-create the slot for this merge participant
    auto r = pg_slots.emplace(pgid, nullptr);
    if (r.second) {
      r.first->second = make_unique<OSDShardPGSlot>();
    }
    slot = r.first->second.get();
    if (slot->pg) {
      // already have pg
      dout(20) << __func__ << " have merge participant pg " << pgid
	       << " " << slot->pg << dendl;
    } else if (!slot->waiting_for_split.empty() &&
	       *slot->waiting_for_split.begin() < epoch) {
      // a split earlier than the merge epoch must complete first; the
      // split machinery will produce the pg, so don't create one here
      dout(20) << __func__ << " pending split on merge participant pg " << pgid
	       << " " << slot->waiting_for_split << dendl;
    } else {
      dout(20) << __func__ << " creating empty merge participant " << pgid
	       << " for merge in " << epoch << dendl;
      // leave history zeroed; PG::merge_from() will fill it in.
      pg_history_t history;
      // epoch - 1: the placeholder exists in the last pre-merge map
      PGCreateInfo cinfo(pgid, epoch - 1,
			 history, PastIntervals(), false);
      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
      _attach_pg(r.first->second.get(), pg.get());
      _wake_pg_slot(pgid, slot);
      // handle_pg_create_info returns the pg locked; release it
      pg->unlock();
    }
    // mark slot for merge
    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
    slot->waiting_for_merge_epoch = epoch;
    p = merge_pgs->erase(p);
  }
}
10306
// Attach a freshly split child pg to its (primed) slot and clear the
// split marker for the pg's current epoch.  If other split epochs are
// still pending on the slot, the parked work stays parked.  The shard
// lock is released before enqueueing the peering event and notifying a
// worker, to respect lock ordering.
void OSDShard::register_and_wake_split_child(PG *pg)
{
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    // the slot must have been primed by _prime_splits beforehand
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      // no more pending splits: release all parked work for this pg
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
10345
10346 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10347 {
10348 std::lock_guard l(shard_lock);
10349 vector<spg_t> to_delete;
10350 for (auto& i : pg_slots) {
10351 if (i.first != parent &&
10352 i.first.get_ancestor(old_pg_num) == parent) {
10353 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10354 << dendl;
10355 _wake_pg_slot(i.first, i.second.get());
10356 to_delete.push_back(i.first);
10357 }
10358 }
10359 for (auto pgid : to_delete) {
10360 pg_slots.erase(pgid);
10361 }
10362 }
10363
// Construct one OSD work-queue shard.  All mutexes are named after the
// shard ("OSDShard.<id>::...") so lockdep/debug output identifies them.
// NOTE: initializer order follows the member declaration order; the
// name strings must be initialized before the mutexes that use them.
OSDShard::OSDShard(
  int id,
  CephContext *cct,
  OSD *osd)
  : shard_id(id),
    cct(cct),
    osd(osd),
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    scheduler(ceph::osd::scheduler::make_scheduler(cct)),
    context_queue(sdata_wait_lock, sdata_cond)
{
  // log at level 0 so the chosen scheduler is always visible in logs
  dout(0) << "using op scheduler " << *scheduler << dendl;
}
10382
10383
10384 // =============================================================
10385
10386 #undef dout_context
10387 #define dout_context osd->cct
10388 #undef dout_prefix
10389 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10390
10391 void OSD::ShardedOpWQ::_add_slot_waiter(
10392 spg_t pgid,
10393 OSDShardPGSlot *slot,
10394 OpSchedulerItem&& qi)
10395 {
10396 if (qi.is_peering()) {
10397 dout(20) << __func__ << " " << pgid
10398 << " peering, item epoch is "
10399 << qi.get_map_epoch()
10400 << ", will wait on " << qi << dendl;
10401 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10402 } else {
10403 dout(20) << __func__ << " " << pgid
10404 << " item epoch is "
10405 << qi.get_map_epoch()
10406 << ", will wait on " << qi << dendl;
10407 slot->waiting.push_back(std::move(qi));
10408 }
10409 }
10410
10411 #undef dout_prefix
10412 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10413
// Worker-thread entry: dequeue one item from this thread's shard and
// run it (or park it on its PG slot if it cannot run yet).  Handles
// idle waiting, shutdown, the pg-lock/shard-lock ordering dance, races
// with pg removal and _wake_pg_slot, pg creation for peering events,
// and split-child priming.  Also drains oncommit contexts (on the
// designated thread only) via handle_oncommits on every exit path.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  // several worker threads may serve one shard; map thread -> shard
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem. So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  // callback.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    // nothing to do: idle-wait on sdata_cond until an enqueue (or a
    // context_queue addition) wakes us
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // suspend the heartbeat timeout while idle so a long wait is not
      // mistaken for a stuck worker
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->scheduler->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	// woke up but the work is gone (or wakeup was spurious); return
	// and let the thread pool call us again
	sdata->shard_lock.unlock();
	return;
      }
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
	  timeout_interval, suicide_interval);
    } else {
      // stop_waiting is set (shutdown path); bail out immediately
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // only the designated thread drains oncommit contexts (see above)
  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    sdata->context_queue.move_to(oncommits);
  }

  if (sdata->scheduler->empty()) {
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }
    // no queued items, but we may still hold oncommits to run
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpSchedulerItem item = sdata->scheduler->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  const auto token = item.get_ordering_token();
  // find-or-create the per-PG slot for this ordering token
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  // stage the item on the slot so ordering is preserved across the
  // lock juggling below
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // lock ordering requires dropping shard_lock before taking the pg
    // lock; everything about the slot must be revalidated afterwards
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      // the slot's queues were rebuilt while we were taking the pg
      // lock; our staged item went back to the scheduler
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg attached: decide whether to wait, create, run pg-less, or drop.
  // (written as a loop only so `break` can exit after pg creation; every
  // other path returns from inside the body)
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      // pg is mid-split; park the item until the split completes
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // item is from a newer map than the shard has; wait for the map
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg! drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      // pg maps here but isn't instantiated yet; park until it is
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
				     sdata->shard_osdmap,
				     (*_op)->sent_epoch);
      }
      // give back any recovery push reservations the dropped item held
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    // re-check against the current shard map: a peering item from a
    // newer epoch must wait even though the pg exists
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // if pg creation above surfaced split children that hash to other
  // shards, distribute them now (shard_lock must not be held here)
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    // NOTE: reqid only exists under WITH_LTTNG; presumably tracepoint()
    // expands to a no-op otherwise -- confirm against the tracing macros.
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // run the item; pg (if set) is still locked at this point --
  // NOTE(review): qi.run() is expected to release the pg lock; confirm
  // against OpSchedulerItem::run() implementations.
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
10708
10709 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
10710 uint32_t shard_index =
10711 item.get_ordering_token().hash_to_shard(osd->shards.size());
10712
10713 dout(20) << __func__ << " " << item << dendl;
10714
10715 OSDShard* sdata = osd->shards[shard_index];
10716 assert (NULL != sdata);
10717
10718 bool empty = true;
10719 {
10720 std::lock_guard l{sdata->shard_lock};
10721 empty = sdata->scheduler->empty();
10722 sdata->scheduler->enqueue(std::move(item));
10723 }
10724
10725 if (empty) {
10726 std::lock_guard l{sdata->sdata_wait_lock};
10727 sdata->sdata_cond.notify_all();
10728 }
10729 }
10730
// Requeue an item at the FRONT of its shard's scheduler (used for
// items being retried), preserving ordering against any item that
// _process has already staged on the slot's to_process list.
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock. ensure this old requeued item is ordered before any
    // such newer item in to_process.
    // (swap trick: push our older item to the front of to_process and
    // take the newest staged item back out to re-enqueue instead)
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // wake one worker to pick the item up promptly
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
10758
10759 namespace ceph {
10760 namespace osd_cmds {
10761
10762 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
10763 std::ostream& os)
10764 {
10765 if (!ceph_using_tcmalloc()) {
10766 os << "could not issue heap profiler command -- not using tcmalloc!";
10767 return -EOPNOTSUPP;
10768 }
10769
10770 string cmd;
10771 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
10772 os << "unable to get value for command \"" << cmd << "\"";
10773 return -EINVAL;
10774 }
10775
10776 std::vector<std::string> cmd_vec;
10777 get_str_vec(cmd, cmd_vec);
10778
10779 string val;
10780 if (cmd_getval(cmdmap, "value", val)) {
10781 cmd_vec.push_back(val);
10782 }
10783
10784 ceph_heap_profiler_handle_command(cmd_vec, os);
10785
10786 return 0;
10787 }
10788
10789 }} // namespace ceph::osd_cmds