// ceph/src/osd/OSD.cc
// (mirrored from ceph.git, commit 8ae62212e35999bcb31dec4a257abaa71e0b0cf2)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16 #include "acconfig.h"
17
18 #include <cctype>
19 #include <fstream>
20 #include <iostream>
21 #include <iterator>
22
23 #include <unistd.h>
24 #include <sys/stat.h>
25 #include <signal.h>
26 #include <time.h>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
29
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
32 #endif
33
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
36 #endif
37
38 #include "osd/PG.h"
39
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
43
44 #include "OSD.h"
45 #include "OSDMap.h"
46 #include "Watch.h"
47 #include "osdc/Objecter.h"
48
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_releases.h"
52 #include "common/ceph_time.h"
53 #include "common/version.h"
54 #include "common/pick_address.h"
55 #include "common/blkdev.h"
56 #include "common/numa.h"
57
58 #include "os/ObjectStore.h"
59 #ifdef HAVE_LIBFUSE
60 #include "os/FuseStore.h"
61 #endif
62
63 #include "PrimaryLogPG.h"
64
65 #include "msg/Messenger.h"
66 #include "msg/Message.h"
67
68 #include "mon/MonClient.h"
69
70 #include "messages/MLog.h"
71
72 #include "messages/MGenericMessage.h"
73 #include "messages/MOSDPing.h"
74 #include "messages/MOSDFailure.h"
75 #include "messages/MOSDMarkMeDown.h"
76 #include "messages/MOSDMarkMeDead.h"
77 #include "messages/MOSDFull.h"
78 #include "messages/MOSDOp.h"
79 #include "messages/MOSDOpReply.h"
80 #include "messages/MOSDBackoff.h"
81 #include "messages/MOSDBeacon.h"
82 #include "messages/MOSDRepOp.h"
83 #include "messages/MOSDRepOpReply.h"
84 #include "messages/MOSDBoot.h"
85 #include "messages/MOSDPGTemp.h"
86 #include "messages/MOSDPGReadyToMerge.h"
87
88 #include "messages/MOSDMap.h"
89 #include "messages/MMonGetOSDMap.h"
90 #include "messages/MOSDPGNotify.h"
91 #include "messages/MOSDPGNotify2.h"
92 #include "messages/MOSDPGQuery.h"
93 #include "messages/MOSDPGQuery2.h"
94 #include "messages/MOSDPGLog.h"
95 #include "messages/MOSDPGRemove.h"
96 #include "messages/MOSDPGInfo.h"
97 #include "messages/MOSDPGInfo2.h"
98 #include "messages/MOSDPGCreate.h"
99 #include "messages/MOSDPGCreate2.h"
100 #include "messages/MOSDPGScan.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
111
112 #include "messages/MOSDPeeringOp.h"
113
114 #include "messages/MOSDAlive.h"
115
116 #include "messages/MOSDScrub.h"
117 #include "messages/MOSDScrub2.h"
118 #include "messages/MOSDRepScrub.h"
119
120 #include "messages/MCommand.h"
121 #include "messages/MCommandReply.h"
122
123 #include "messages/MPGStats.h"
124 #include "messages/MPGStatsAck.h"
125
126 #include "messages/MWatchNotify.h"
127 #include "messages/MOSDPGPush.h"
128 #include "messages/MOSDPGPushReply.h"
129 #include "messages/MOSDPGPull.h"
130
131 #include "messages/MMonGetPurgedSnaps.h"
132 #include "messages/MMonGetPurgedSnapsReply.h"
133
134 #include "common/perf_counters.h"
135 #include "common/Timer.h"
136 #include "common/LogClient.h"
137 #include "common/AsyncReserver.h"
138 #include "common/HeartbeatMap.h"
139 #include "common/admin_socket.h"
140 #include "common/ceph_context.h"
141
142 #include "global/signal_handler.h"
143 #include "global/pidfile.h"
144
145 #include "include/color.h"
146 #include "perfglue/cpu_profiler.h"
147 #include "perfglue/heap_profiler.h"
148
149 #include "osd/OpRequest.h"
150
151 #include "auth/AuthAuthorizeHandler.h"
152 #include "auth/RotatingKeyRing.h"
153
154 #include "objclass/objclass.h"
155
156 #include "common/cmdparse.h"
157 #include "include/str_list.h"
158 #include "include/util.h"
159
160 #include "include/ceph_assert.h"
161 #include "common/config.h"
162 #include "common/EventTrace.h"
163
164 #include "json_spirit/json_spirit_reader.h"
165 #include "json_spirit/json_spirit_writer.h"
166
167 #ifdef WITH_LTTNG
168 #define TRACEPOINT_DEFINE
169 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170 #include "tracing/osd.h"
171 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #undef TRACEPOINT_DEFINE
173 #else
174 #define tracepoint(...)
175 #endif
176
177 #define dout_context cct
178 #define dout_subsys ceph_subsys_osd
179 #undef dout_prefix
180 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
181
182 using namespace ceph::osd::scheduler;
183 using TOPNSPC::common::cmd_getval;
184
185 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
186 return *_dout << "osd." << whoami << " " << epoch << " ";
187 }
188
189 //Initial features in new superblock.
190 //Features here are also automatically upgraded
191 CompatSet OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
194 CompatSet::FeatureSet ceph_osd_feature_incompat;
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
202 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
203 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
204 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
205 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
206 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
207 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
208 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
209 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
210 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
211 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
212 ceph_osd_feature_incompat);
213 }
214
215 //Features are added here that this OSD supports.
216 CompatSet OSD::get_osd_compat_set() {
217 CompatSet compat = get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
220 return compat;
221 }
222
// OSDService aggregates shared state and helpers for the owning OSD:
// messengers, perf counters, the Objecter, timers, recovery/backfill
// reservers, and OSDMap caches. Most reference members simply alias
// fields of the parent OSD. NOTE: initializer order must match member
// declaration order in OSD.h.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  // config-tracked values; these follow runtime config changes
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  scrubs_local(0),
  scrubs_remote(0),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
				 osd->objecter_messenger,
				 osd->monc, nullptr)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  // One named finisher per configured objecter finisher; the distinct
  // names make the threads identifiable in process listings.
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}
285
#ifdef PG_DEBUG_REFS
// Debug-build-only PG reference tracking: counts outstanding references
// per spg_t so leaked PG refs can be reported at shutdown.

// Record one more reference to pg; remembers the PG* on first sight.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
// Drop one reference to pg; forget the PG entirely once the count hits 0.
// Asserts the pg was tracked with a positive count.
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
// Dump every pg with outstanding references (and ask each PG to dump the
// individual ref holders) to the error log.
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif
317
318
319 ceph::signedspan OSDService::get_mnow()
320 {
321 return ceph::mono_clock::now() - osd->startup_time;
322 }
323
// Walk the recorded pg_num history of pgid's pool over the epoch window
// (old_map, new_map] and collect every (pg, epoch) pair affected by a
// split, and — when merge_pgs is non-null — by a merge. A BFS queue is
// used because a discovered child or merge parent may itself split or
// merge again at a later epoch inside the same window.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    // no recorded pg_num changes for this pool: nothing can have split/merged
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;  // pgs already scanned; prevents re-queueing
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // replay each pg_num change in epoch order within the window
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      // children may split again later in the window
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge). note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    // cur disappears in this merge; record the target and every source
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the merge as its target; record cur and its sources
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;  // carry the new pg_num into the next epoch step
    }
  }
}
430
// Forward heartbeat-peer refresh requests to the owning OSD.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
435
436 HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
437 {
438 std::lock_guard l(hb_stamp_lock);
439 if (peer >= hb_stamps.size()) {
440 hb_stamps.resize(peer + 1);
441 }
442 if (!hb_stamps[peer]) {
443 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
444 }
445 return hb_stamps[peer];
446 }
447
// Enqueue a RenewLease peering event for the given PG at the given epoch.
void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch, epoch,
	RenewLease())));
}
457
458 void OSDService::start_shutdown()
459 {
460 {
461 std::lock_guard l(agent_timer_lock);
462 agent_timer.shutdown();
463 }
464
465 {
466 std::lock_guard l(sleep_lock);
467 sleep_timer.shutdown();
468 }
469
470 {
471 std::lock_guard l(recovery_request_lock);
472 recovery_request_timer.shutdown();
473 }
474 }
475
// Drain and stop the reserver finisher thread (used by the backfill and
// snap-trim AsyncReservers).
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
481
// Final phase of OSDService shutdown: stop remaining timers, the objecter
// and its finishers, then drop the published/pre-published OSDMap refs.
void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  // drain each finisher before stopping it so queued contexts run
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  // release our map references so the maps can be freed
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
500
// Start OSDService threads and timers: reserver finisher, objecter
// finishers, the objecter itself, watch/agent timers, and the agent thread.
void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  // optionally hold off recovery for a configured number of seconds at boot
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
521
// Late initialization: start the objecter once an initial osdmap is in hand.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
526
527 void OSDService::activate_map()
528 {
529 // wake/unwake the tiering agent
530 std::lock_guard l{agent_lock};
531 agent_active =
532 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
533 osd->is_active();
534 agent_cond.notify_all();
535 }
536
// Ask the monitor for osdmaps starting at epoch e (non-continuous subscribe).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
541
542
543 class AgentTimeoutCB : public Context {
544 PGRef pg;
545 public:
546 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
547 void finish(int) override {
548 pg->agent_choose_mode_restart();
549 }
550 };
551
// Main loop of the cache-tiering agent thread. Repeatedly picks the
// highest-priority tier in agent_queue and asks one of its PGs to do
// flush/evict work, respecting the configured op quotas. Sleeps on
// agent_cond whenever there is nothing to do or the agent is disabled.
// NOTE: agent_lock is dropped around the (potentially slow) agent_work
// call and re-taken afterwards.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    // work on the highest tier level present in the queue
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // evict quota always uses the high limit; flush quota drops to the low
    // limit unless some PG is in high-speed flush mode
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    // round-robin across the PGs of the top tier via agent_queue_pos;
    // the iterator is invalidated elsewhere, hence agent_valid_iterator
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // drop the lock while the PG does (possibly blocking) agent work
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
607
// Stop the agent thread. By the time this runs, all agent ops must have
// been cancelled and all PGs dequeued; aborts otherwise.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  // join outside the lock so the thread can finish its loop iteration
  agent_thread.join();
}
627
628 // -------------------------------------
629
// Periodically re-tune promote_probability_millis (probability, in
// thousandths, of promoting an object into the cache tier) so the actual
// promotion rate tracks the configured object/sec and bytes/sec targets.
// Also sets hard per-tick promotion caps to damp stampedes.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // probability (in millis) that would have hit the object-rate target
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    // probability (in millis) that would have hit the byte-rate target
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);   // both targets set: obey the tighter one
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;               // no targets: promote unconditionally
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  // NOTE(review): ratio divides by prob; prob appears to be kept >= 1 by the
  // min_prob clamp on previous runs — confirm promote_probability_millis'
  // initial value is nonzero.
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move halfway toward the computed probability, clamped to [min,1000]
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
700
701 // -------------------------------------
702
703 float OSDService::get_failsafe_full_ratio()
704 {
705 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
706 if (full_ratio > 1.0) full_ratio /= 100.0;
707 return full_ratio;
708 }
709
// Classify the OSD's fullness (NONE .. FAILSAFE) from the given usage ratio
// and physical ratio, using the thresholds published in the current osdmap.
// On injected fullness, sets `inject` to a marker string for logging.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  // enforce nearfull <= backfillfull <= full <= failsafe ordering
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    // failsafe and nearfull use the *physical* ratio; full/backfillfull use
    // the (possibly backfill-adjusted) logical ratio
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
756
// Cache the latest usage ratios, recompute the fullness state, and emit
// cluster-log errors on FAILSAFE transitions (engage/disengage).
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
	   << ", physical ratio " << pratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
788
789 bool OSDService::need_fullness_update()
790 {
791 OSDMapRef osdmap = get_osdmap();
792 s_names cur = NONE;
793 if (osdmap->exists(whoami)) {
794 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
795 cur = FULL;
796 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
797 cur = BACKFILLFULL;
798 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
799 cur = NEARFULL;
800 }
801 }
802 s_names want = NONE;
803 if (is_full())
804 want = FULL;
805 else if (is_backfillfull())
806 want = BACKFILLFULL;
807 else if (is_nearfull())
808 want = NEARFULL;
809 return want != cur;
810 }
811
// Test hook: report fullness of at least `type` when fault injection is
// armed. Caller must hold full_status_lock.
// NOTE(review): this const method decrements injectfull — presumably the
// member is declared mutable (or similar) in OSD.h; confirm there.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
		       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
		       << dendl;
    return true;
  }
  return false;
}
826
827 bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
828 {
829 std::lock_guard l(full_status_lock);
830
831 if (_check_inject_full(dpp, type))
832 return true;
833
834 if (cur_state >= type)
835 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
836 << " physical " << physical_ratio << dendl;
837
838 return cur_state >= type;
839 }
840
// "What if" fullness check: would we be at least `type` full after consuming
// an additional adjust_used bytes on top of adjusted_stat? Used to decide
// whether e.g. a backfill reservation would push us over a threshold.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  // recompute ratios with the hypothetical extra usage folded in
  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
862
// Convenience wrapper: are we at or beyond the failsafe-full threshold?
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}
867
// Convenience wrapper: are we at or beyond the full threshold?
bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}
872
// Would an extra adjust_used bytes (on top of stats) put us at or beyond
// the backfillfull threshold?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}
877
// Convenience wrapper: are we at or beyond the backfillfull threshold?
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}
882
// Convenience wrapper: are we at or beyond the nearfull threshold?
bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
887
// Cached-state query (no logging, no injection): exactly FAILSAFE?
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}
893
// Cached-state query: FULL or worse?
bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}
899
// Cached-state query: BACKFILLFULL or worse?
bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
905
// Cached-state query: NEARFULL or worse?
bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
911
// Arm fullness fault injection: report `type` for the next `count` checks
// (count < 0 means "always"). Test/admin-socket hook.
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
918
919 void OSDService::set_statfs(const struct store_statfs_t &stbuf,
920 osd_alert_list_t& alerts)
921 {
922 uint64_t bytes = stbuf.total;
923 uint64_t avail = stbuf.available;
924 uint64_t used = stbuf.get_used_raw();
925
926 // For testing fake statfs values so it doesn't matter if all
927 // OSDs are using the same partition.
928 if (cct->_conf->fake_statfs_for_testing) {
929 uint64_t total_num_bytes = 0;
930 vector<PGRef> pgs;
931 osd->_get_pgs(&pgs);
932 for (auto p : pgs) {
933 total_num_bytes += p->get_stats_num_bytes();
934 }
935 bytes = cct->_conf->fake_statfs_for_testing;
936 if (total_num_bytes < bytes)
937 avail = bytes - total_num_bytes;
938 else
939 avail = 0;
940 dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
941 << " adjust available " << avail
942 << dendl;
943 used = bytes - avail;
944 }
945
946 osd->logger->set(l_osd_stat_bytes, bytes);
947 osd->logger->set(l_osd_stat_bytes_used, used);
948 osd->logger->set(l_osd_stat_bytes_avail, avail);
949
950 std::lock_guard l(stat_lock);
951 osd_stat.statfs = stbuf;
952 osd_stat.os_alerts.clear();
953 osd_stat.os_alerts[whoami].swap(alerts);
954 if (cct->_conf->fake_statfs_for_testing) {
955 osd_stat.statfs.total = bytes;
956 osd_stat.statfs.available = avail;
957 // For testing don't want used to go negative, so clear reserved
958 osd_stat.statfs.internally_reserved = 0;
959 }
960 }
961
962 osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
963 int num_pgs)
964 {
965 utime_t now = ceph_clock_now();
966 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
967 std::lock_guard l(stat_lock);
968 osd_stat.hb_peers.swap(hb_peers);
969 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
970 osd_stat.num_pgs = num_pgs;
971 // Clean entries that aren't updated
972 // This is called often enough that we can just remove 1 at a time
973 for (auto i: osd_stat.hb_pingtime) {
974 if (i.second.last_update == 0)
975 continue;
976 if (stale_time && now.sec() - i.second.last_update > stale_time) {
977 dout(20) << __func__ << " time out heartbeat for osd " << i.first
978 << " last_update " << i.second.last_update << dendl;
979 osd_stat.hb_pingtime.erase(i.first);
980 break;
981 }
982 }
983 return osd_stat;
984 }
985
986 void OSDService::inc_osd_stat_repaired()
987 {
988 std::lock_guard l(stat_lock);
989 osd_stat.num_shards_repaired++;
990 return;
991 }
992
993 float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
994 uint64_t adjust_used)
995 {
996 *pratio =
997 ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
998
999 if (adjust_used) {
1000 dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
1001 if (new_stat.statfs.available > adjust_used)
1002 new_stat.statfs.available -= adjust_used;
1003 else
1004 new_stat.statfs.available = 0;
1005 dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
1006 }
1007
1008 // Check all pgs and adjust kb_used to include all pending backfill data
1009 int backfill_adjusted = 0;
1010 vector<PGRef> pgs;
1011 osd->_get_pgs(&pgs);
1012 for (auto p : pgs) {
1013 backfill_adjusted += p->pg_stat_adjust(&new_stat);
1014 }
1015 if (backfill_adjusted) {
1016 dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
1017 }
1018 return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
1019 }
1020
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  // Send m to peer over the cluster messenger, but only if peer is still
  // up and has not restarted since from_epoch (up_from > from_epoch means
  // the peer came up after the caller's view, so the message would target
  // the wrong incarnation).  Drops the message otherwise.
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();  // release our ref; message is not sent
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    // talking to ourselves: use the loopback connection
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  // opportunistically share our (possibly newer) osdmap with the peer
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);  // must pair with get_nextmap_reserved()
}
1044
void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  // Batched variant of send_message_osd_cluster(): one map
  // reservation/release for the whole set.  Messages whose target is
  // down or restarted after from_epoch are dropped individually.
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    // iter.first = target osd id, iter.second = message
    if (next_map->is_down(iter.first) ||
        next_map->get_info(iter.first).up_from > from_epoch) {
      iter.second->put();  // drop: stale target
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}
1069 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1070 {
1071 OSDMapRef next_map = get_nextmap_reserved();
1072 // service map is always newer/newest
1073 ceph_assert(from_epoch <= next_map->get_epoch());
1074
1075 if (next_map->is_down(peer) ||
1076 next_map->get_info(peer).up_from > from_epoch) {
1077 release_map(next_map);
1078 return NULL;
1079 }
1080 ConnectionRef con;
1081 if (peer == whoami) {
1082 con = osd->cluster_messenger->get_loopback_connection();
1083 } else {
1084 con = osd->cluster_messenger->connect_to_osd(
1085 next_map->get_cluster_addrs(peer), false, true);
1086 }
1087 release_map(next_map);
1088 return con;
1089 }
1090
1091 pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1092 {
1093 OSDMapRef next_map = get_nextmap_reserved();
1094 // service map is always newer/newest
1095 ceph_assert(from_epoch <= next_map->get_epoch());
1096
1097 pair<ConnectionRef,ConnectionRef> ret;
1098 if (next_map->is_down(peer) ||
1099 next_map->get_info(peer).up_from > from_epoch) {
1100 release_map(next_map);
1101 return ret;
1102 }
1103 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1104 next_map->get_hb_back_addrs(peer));
1105 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1106 next_map->get_hb_front_addrs(peer));
1107 release_map(next_map);
1108 return ret;
1109 }
1110
entity_name_t OSDService::get_cluster_msgr_name() const
{
  // Entity name of our cluster-network messenger.
  return cluster_messenger->get_myname();
}
1115
void OSDService::queue_want_pg_temp(pg_t pgid,
                                    const vector<int>& want,
                                    bool forced)
{
  // Record that we want the monitor to set pg_temp for pgid to `want`.
  // Skip queueing only when an identical request is already in flight
  // (pending) and this one is not forced; send_pg_temp() drains
  // pg_temp_wanted.
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}
1128
1129 void OSDService::remove_want_pg_temp(pg_t pgid)
1130 {
1131 std::lock_guard l(pg_temp_lock);
1132 pg_temp_wanted.erase(pgid);
1133 pg_temp_pending.erase(pgid);
1134 }
1135
void OSDService::_sent_pg_temp()
{
  // Move everything just sent from pg_temp_wanted into pg_temp_pending
  // (in flight).  Caller holds pg_temp_lock (underscore convention).
#ifdef HAVE_STDLIB_MAP_SPLICING
  // splice nodes without copying; keys already pending keep their
  // existing (pending) entry
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1146
void OSDService::requeue_pg_temp()
{
  // Move all in-flight pg_temp requests back into the wanted queue so
  // the next send_pg_temp() re-sends them.
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  // merge wanted into pending, then swap: the combined set ends up in
  // pg_temp_wanted
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}
1159
1160 std::ostream& operator<<(std::ostream& out,
1161 const OSDService::pg_temp_t& pg_temp)
1162 {
1163 out << pg_temp.acting;
1164 if (pg_temp.forced) {
1165 out << " (forced)";
1166 }
1167 return out;
1168 }
1169
void OSDService::send_pg_temp()
{
  // Flush pg_temp_wanted to the monitor, then mark everything sent as
  // in flight via _sent_pg_temp().
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  // Batch into at most two messages: ms[0] for normal requests,
  // ms[1] for forced ones (MOSDPGTemp carries a single forced flag).
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}
1192
1193 void OSDService::send_pg_created(pg_t pgid)
1194 {
1195 std::lock_guard l(pg_created_lock);
1196 dout(20) << __func__ << dendl;
1197 auto o = get_osdmap();
1198 if (o->require_osd_release >= ceph_release_t::luminous) {
1199 pg_created.insert(pgid);
1200 monc->send_mon_message(new MOSDPGCreated(pgid));
1201 }
1202 }
1203
1204 void OSDService::send_pg_created()
1205 {
1206 std::lock_guard l(pg_created_lock);
1207 dout(20) << __func__ << dendl;
1208 auto o = get_osdmap();
1209 if (o->require_osd_release >= ceph_release_t::luminous) {
1210 for (auto pgid : pg_created) {
1211 monc->send_mon_message(new MOSDPGCreated(pgid));
1212 }
1213 }
1214 }
1215
void OSDService::prune_pg_created()
{
  // Drop created-pg records whose pool is gone or no longer has the
  // CREATING flag set — the monitor no longer needs the notification.
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}
1233
1234
1235 // --------------------------------------
1236 // dispatch
1237
1238 bool OSDService::can_inc_scrubs()
1239 {
1240 bool can_inc = false;
1241 std::lock_guard l(sched_scrub_lock);
1242
1243 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1244 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1245 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
1246 can_inc = true;
1247 } else {
1248 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1249 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1250 }
1251
1252 return can_inc;
1253 }
1254
1255 bool OSDService::inc_scrubs_local()
1256 {
1257 bool result = false;
1258 std::lock_guard l{sched_scrub_lock};
1259 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1260 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1261 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1262 result = true;
1263 ++scrubs_local;
1264 } else {
1265 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1266 }
1267 return result;
1268 }
1269
void OSDService::dec_scrubs_local()
{
  // Release a local scrub slot previously taken by inc_scrubs_local().
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
  --scrubs_local;
  ceph_assert(scrubs_local >= 0);  // catch unmatched dec
}
1278
1279 bool OSDService::inc_scrubs_remote()
1280 {
1281 bool result = false;
1282 std::lock_guard l{sched_scrub_lock};
1283 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1284 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1285 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1286 result = true;
1287 ++scrubs_remote;
1288 } else {
1289 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1290 }
1291 return result;
1292 }
1293
void OSDService::dec_scrubs_remote()
{
  // Release a remote scrub slot previously taken by inc_scrubs_remote().
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
  --scrubs_remote;
  ceph_assert(scrubs_remote >= 0);  // catch unmatched dec
}
1302
void OSDService::dump_scrub_reservations(Formatter *f)
{
  // Emit current scrub slot usage (for admin-socket style output).
  std::lock_guard l{sched_scrub_lock};
  f->dump_int("scrubs_local", scrubs_local);
  f->dump_int("scrubs_remote", scrubs_remote);
  f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}
1310
1311 void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1312 epoch_t *_bind_epoch) const
1313 {
1314 std::lock_guard l(epoch_lock);
1315 if (_boot_epoch)
1316 *_boot_epoch = boot_epoch;
1317 if (_up_epoch)
1318 *_up_epoch = up_epoch;
1319 if (_bind_epoch)
1320 *_bind_epoch = bind_epoch;
1321 }
1322
void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  // Update whichever epochs the caller supplied (null pointers are
  // skipped).  Each epoch may be reset to 0 or must advance — asserts
  // enforce monotonicity otherwise.
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}
1340
bool OSDService::prepare_to_stop()
{
  // Begin an orderly shutdown.  If we are up in the osdmap, ask the
  // monitor to mark us down first and wait — bounded by
  // osd_mon_shutdown_timeout — for the ack, which arrives via
  // got_stop_ack() and flips the state to STOPPING.  Returns false if a
  // shutdown is already in progress.
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true  // request ack
        ));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    // releases the lock while waiting; proceeds on ack or timeout
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }
  // Even on timeout we proceed with the shutdown.
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1367
1368 void OSDService::got_stop_ack()
1369 {
1370 std::scoped_lock l(is_stopping_lock);
1371 if (get_state() == PREPARING_TO_STOP) {
1372 dout(0) << __func__ << " starting shutdown" << dendl;
1373 set_state(STOPPING);
1374 is_stopping_cond.notify_all();
1375 } else {
1376 dout(10) << __func__ << " ignoring msg" << dendl;
1377 }
1378 }
1379
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  // Build an MOSDMap carrying incremental (and, where necessary, full)
  // maps covering (since, to].  The payload is bounded by
  // osd_map_message_max epochs and osd_map_message_max_bytes; note the
  // budget is checked *after* adding each map, so at least one map is
  // always included.
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
             << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since].claim(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else {
      // incremental is gone (trimmed?); fall back to the full map for
      // this epoch
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
        derr << __func__ << " also missing full map " << e << dendl;
        goto panic;
      }
      m->maps[e].claim(bl);
    }
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;  // budget exhausted
    }
  }
  return m;

 panic:
  // A map we should have is unreadable.  Send whatever we already
  // collected; otherwise fall back to the newest incremental (or full)
  // map, and abort only if even that is unreadable.
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map].claim(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
           << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map].claim(bl);
  }
  return m;
}
1447
void OSDService::send_map(MOSDMap *m, Connection *con)
{
  // Thin wrapper around Connection::send_message.
  con->send_message(m);
}
1452
1453 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1454 const OSDMapRef& osdmap)
1455 {
1456 epoch_t to = osdmap->get_epoch();
1457 dout(10) << "send_incremental_map " << since << " -> " << to
1458 << " to " << con << " " << con->get_peer_addr() << dendl;
1459
1460 MOSDMap *m = NULL;
1461 while (!m) {
1462 OSDSuperblock sblock(get_superblock());
1463 if (since < sblock.oldest_map) {
1464 // just send latest full map
1465 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1466 osdmap->get_encoding_features());
1467 m->oldest_map = max_oldest_map;
1468 m->newest_map = sblock.newest_map;
1469 get_map_bl(to, m->maps[to]);
1470 send_map(m, con);
1471 return;
1472 }
1473
1474 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1475 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl;
1477 since = to - cct->_conf->osd_map_share_max_epochs;
1478 }
1479
1480 m = build_incremental_map_msg(since, to, sblock);
1481 }
1482 send_map(m, con);
1483 }
1484
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  // Fetch the full osdmap blob for epoch e: try the in-memory cache
  // first, then the object store (populating the cache on success).
  // NOTE(review): no lock is taken here; the leading underscore suggests
  // the caller holds map_cache_lock (cf. get_inc_map_bl) — confirm at
  // call sites.
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}
1503
1504 bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1505 {
1506 std::lock_guard l(map_cache_lock);
1507 bool found = map_bl_inc_cache.lookup(e, &bl);
1508 if (found) {
1509 if (logger)
1510 logger->inc(l_osd_map_bl_cache_hit);
1511 return true;
1512 }
1513 if (logger)
1514 logger->inc(l_osd_map_bl_cache_miss);
1515 found = store->read(meta_ch,
1516 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1518 if (found) {
1519 _add_map_inc_bl(e, bl);
1520 }
1521 return found;
1522 }
1523
void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  // Insert a full-map blob into the cache.
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  // account the buffer to the osd_mapbl mempool
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}
1534
void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  // Insert an incremental-map blob into the cache.
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  // account the buffer to the osd_mapbl mempool
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}
1545
1546 OSDMapRef OSDService::_add_map(OSDMap *o)
1547 {
1548 epoch_t e = o->get_epoch();
1549
1550 if (cct->_conf->osd_map_dedup) {
1551 // Dedup against an existing map at a nearby epoch
1552 OSDMapRef for_dedup = map_cache.lower_bound(e);
1553 if (for_dedup) {
1554 OSDMap::dedup(for_dedup.get(), o);
1555 }
1556 }
1557 bool existed;
1558 OSDMapRef l = map_cache.add(e, o, &existed);
1559 if (existed) {
1560 delete o;
1561 }
1562 return l;
1563 }
1564
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  // Return the OSDMap for `epoch`, from cache if possible, otherwise by
  // loading and decoding it from the store.  Returns a null ref if the
  // map blob cannot be loaded (e.g. already trimmed).
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      // miss because the request is older than anything cached
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    // epoch 0: hand back an empty initial map
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}
1601
1602 // ops
1603
1604
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  // Convenience overload: error reply with zero versions and no per-op
  // return values.
  reply_op_error(op, err, eversion_t(), 0, {});
}
1609
1610 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1611 version_t uv,
1612 vector<pg_log_op_return_item_t> op_returns)
1613 {
1614 auto m = op->get_req<MOSDOp>();
1615 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1616 int flags;
1617 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1618
1619 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1620 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
1621 reply->set_reply_versions(v, uv);
1622 reply->set_op_returns(op_returns);
1623 m->get_connection()->send_message(reply);
1624 }
1625
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  // Complain (clog warning) about an op that reached a PG we are not
  // the right target for.  No-op unless osd_debug_misdirected_ops is
  // set.  Legitimate EC-split races are detected and silently dropped
  // (see the long comment below).
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      // the client's map has been trimmed; can't re-derive the target
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->pg_id.shard) {
      // shard mismatch at the client's epoch: the race described above
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->get_acting()
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1680
void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  // Push a work item onto the tail of the sharded op queue.
  osd->op_shardedwq.queue(std::move(qi));
}
1685
void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  // Push a work item onto the head of the sharded op queue (requeue).
  osd->op_shardedwq.queue_front(std::move(qi));
}
1690
1691 void OSDService::queue_recovery_context(
1692 PG *pg,
1693 GenContext<ThreadPool::TPHandle&> *c)
1694 {
1695 epoch_t e = get_osdmap_epoch();
1696 enqueue_back(
1697 OpSchedulerItem(
1698 unique_ptr<OpSchedulerItem::OpQueueable>(
1699 new PGRecoveryContext(pg->get_pgid(), c, e)),
1700 cct->_conf->osd_recovery_cost,
1701 cct->_conf->osd_recovery_priority,
1702 ceph_clock_now(),
1703 0,
1704 e));
1705 }
1706
1707 void OSDService::queue_for_snap_trim(PG *pg)
1708 {
1709 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1710 enqueue_back(
1711 OpSchedulerItem(
1712 unique_ptr<OpSchedulerItem::OpQueueable>(
1713 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1714 cct->_conf->osd_snap_trim_cost,
1715 cct->_conf->osd_snap_trim_priority,
1716 ceph_clock_now(),
1717 0,
1718 pg->get_osdmap_epoch()));
1719 }
1720
1721 void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
1722 {
1723 unsigned scrub_queue_priority = pg->scrubber.priority;
1724 if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
1725 scrub_queue_priority = cct->_conf->osd_client_op_priority;
1726 }
1727 const auto epoch = pg->get_osdmap_epoch();
1728 enqueue_back(
1729 OpSchedulerItem(
1730 unique_ptr<OpSchedulerItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
1731 cct->_conf->osd_scrub_cost,
1732 scrub_queue_priority,
1733 ceph_clock_now(),
1734 0,
1735 epoch));
1736 }
1737
1738 void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1739 {
1740 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1741 enqueue_back(
1742 OpSchedulerItem(
1743 unique_ptr<OpSchedulerItem::OpQueueable>(
1744 new PGDelete(pgid, e)),
1745 cct->_conf->osd_pg_delete_cost,
1746 cct->_conf->osd_pg_delete_priority,
1747 ceph_clock_now(),
1748 0,
1749 e));
1750 }
1751
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  // Forward to the owning OSD; returns its result.
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1756
1757 // ---
1758
1759 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1760 {
1761 std::lock_guard l(merge_lock);
1762 dout(10) << __func__ << " " << pg->pg_id << dendl;
1763 ready_to_merge_source[pg->pg_id.pgid] = version;
1764 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1765 _send_ready_to_merge();
1766 }
1767
1768 void OSDService::set_ready_to_merge_target(PG *pg,
1769 eversion_t version,
1770 epoch_t last_epoch_started,
1771 epoch_t last_epoch_clean)
1772 {
1773 std::lock_guard l(merge_lock);
1774 dout(10) << __func__ << " " << pg->pg_id << dendl;
1775 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1776 make_tuple(version,
1777 last_epoch_started,
1778 last_epoch_clean)));
1779 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1780 _send_ready_to_merge();
1781 }
1782
1783 void OSDService::set_not_ready_to_merge_source(pg_t source)
1784 {
1785 std::lock_guard l(merge_lock);
1786 dout(10) << __func__ << " " << source << dendl;
1787 not_ready_to_merge_source.insert(source);
1788 assert(ready_to_merge_source.count(source) == 0);
1789 _send_ready_to_merge();
1790 }
1791
1792 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1793 {
1794 std::lock_guard l(merge_lock);
1795 dout(10) << __func__ << " " << target << " source " << source << dendl;
1796 not_ready_to_merge_target[target] = source;
1797 assert(ready_to_merge_target.count(target) == 0);
1798 _send_ready_to_merge();
1799 }
1800
void OSDService::send_ready_to_merge()
{
  // Public entry point: take merge_lock and flush pending
  // ready/not-ready-to-merge notifications to the monitor.
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1806
1807 void OSDService::_send_ready_to_merge()
1808 {
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1815 << dendl;
1816 for (auto src : not_ready_to_merge_source) {
1817 if (sent_ready_to_merge_source.count(src) == 0) {
1818 monc->send_mon_message(new MOSDPGReadyToMerge(
1819 src,
1820 {}, {}, 0, 0,
1821 false,
1822 osdmap->get_epoch()));
1823 sent_ready_to_merge_source.insert(src);
1824 }
1825 }
1826 for (auto p : not_ready_to_merge_target) {
1827 if (sent_ready_to_merge_source.count(p.second) == 0) {
1828 monc->send_mon_message(new MOSDPGReadyToMerge(
1829 p.second,
1830 {}, {}, 0, 0,
1831 false,
1832 osdmap->get_epoch()));
1833 sent_ready_to_merge_source.insert(p.second);
1834 }
1835 }
1836 for (auto src : ready_to_merge_source) {
1837 if (not_ready_to_merge_source.count(src.first) ||
1838 not_ready_to_merge_target.count(src.first.get_parent())) {
1839 continue;
1840 }
1841 auto p = ready_to_merge_target.find(src.first.get_parent());
1842 if (p != ready_to_merge_target.end() &&
1843 sent_ready_to_merge_source.count(src.first) == 0) {
1844 monc->send_mon_message(new MOSDPGReadyToMerge(
1845 src.first, // source pgid
1846 src.second, // src version
1847 std::get<0>(p->second), // target version
1848 std::get<1>(p->second), // PG's last_epoch_started
1849 std::get<2>(p->second), // PG's last_epoch_clean
1850 true,
1851 osdmap->get_epoch()));
1852 sent_ready_to_merge_source.insert(src.first);
1853 }
1854 }
1855 }
1856
1857 void OSDService::clear_ready_to_merge(PG *pg)
1858 {
1859 std::lock_guard l(merge_lock);
1860 dout(10) << __func__ << " " << pg->pg_id << dendl;
1861 ready_to_merge_source.erase(pg->pg_id.pgid);
1862 ready_to_merge_target.erase(pg->pg_id.pgid);
1863 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1864 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1865 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1866 }
1867
1868 void OSDService::clear_sent_ready_to_merge()
1869 {
1870 std::lock_guard l(merge_lock);
1871 sent_ready_to_merge_source.clear();
1872 }
1873
1874 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1875 {
1876 std::lock_guard l(merge_lock);
1877 auto i = sent_ready_to_merge_source.begin();
1878 while (i != sent_ready_to_merge_source.end()) {
1879 if (!osdmap->pg_exists(*i)) {
1880 dout(10) << __func__ << " " << *i << dendl;
1881 i = sent_ready_to_merge_source.erase(i);
1882 } else {
1883 ++i;
1884 }
1885 }
1886 }
1887
1888 // ---
1889
1890 void OSDService::_queue_for_recovery(
1891 std::pair<epoch_t, PGRef> p,
1892 uint64_t reserved_pushes)
1893 {
1894 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
1895 enqueue_back(
1896 OpSchedulerItem(
1897 unique_ptr<OpSchedulerItem::OpQueueable>(
1898 new PGRecovery(
1899 p.second->get_pgid(), p.first, reserved_pushes)),
1900 cct->_conf->osd_recovery_cost,
1901 cct->_conf->osd_recovery_priority,
1902 ceph_clock_now(),
1903 0,
1904 p.first));
1905 }
1906
1907 // ====================================================================
1908 // OSD
1909
1910 #undef dout_prefix
1911 #define dout_prefix *_dout
1912
// Commands shared between OSD's console and admin console:
namespace ceph {
namespace osd_cmds {

// Forward declaration; the definition is not visible in this chunk.
// NOTE(review): presumably handles the tcmalloc "heap" admin command —
// confirm at the definition.
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

}} // namespace ceph::osd_cmds
1920
int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
{
  // Initialize a fresh object store for this OSD: mkfs + mount the
  // store, create (or validate an existing) OSD superblock, and write
  // the meta files (magic/whoami/fsid/key/ready).  Cleanup is
  // goto-based: umount_store unmounts, free_store deletes `store`; the
  // success path falls through both labels with ret == 0.
  int ret;

  OSDSuperblock sb;
  bufferlist sbbl;
  ObjectStore::CollectionHandle ch;

  // if we are fed a uuid for this osd, use it.
  store->set_fsid(cct->_conf->osd_uuid);

  ret = store->mkfs();
  if (ret) {
    derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  store->set_cache_shards(1);  // doesn't matter for mkfs!

  ret = store->mount();
  if (ret) {
    derr << "OSD::mkfs: couldn't mount ObjectStore: error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  ch = store->open_collection(coll_t::meta());
  if (ch) {
    // Meta collection already exists (mkfs re-run on a prepared store):
    // validate the existing superblock against the given id and fsid.
    ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
    if (ret < 0) {
      derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
      // NOTE(review): this path skips umount_store — store stays
      // mounted and ch un-reset before delete; confirm intended.
      goto free_store;
    }
    /* if we already have superblock, check content of superblock */
    dout(0) << " have superblock" << dendl;
    auto p = sbbl.cbegin();
    decode(sb, p);
    if (whoami != sb.whoami) {
      derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
           << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
    if (fsid != sb.cluster_fsid) {
      derr << "provided cluster fsid " << fsid
           << " != superblock's " << sb.cluster_fsid << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
  } else {
    // create superblock
    sb.cluster_fsid = fsid;
    sb.osd_fsid = store->get_fsid();
    sb.whoami = whoami;
    sb.compat_features = get_osd_initial_compat_set();

    bufferlist bl;
    encode(sb, bl);

    // NOTE(review): this inner `ch` shadows the outer handle; the outer
    // one stays null on this path, so the `if (ch)` reset at
    // umount_store is skipped.  The inner handle is released when this
    // scope ends.
    ObjectStore::CollectionHandle ch = store->create_new_collection(
      coll_t::meta());
    ObjectStore::Transaction t;
    t.create_collection(coll_t::meta(), 0);
    t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
    ret = store->queue_transaction(ch, std::move(t));
    if (ret) {
      derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
           << "queue_transaction returned " << cpp_strerror(ret) << dendl;
      goto umount_store;
    }
  }

  ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
  if (ret) {
    derr << "OSD::mkfs: failed to write fsid file: error "
         << cpp_strerror(ret) << dendl;
    goto umount_store;
  }

 umount_store:
  if (ch) {
    ch.reset();
  }
  store->umount();
 free_store:
  delete store;
  return ret;
}
2010
int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
{
  // Write the OSD's identity meta files into the store: magic, whoami,
  // ceph_fsid, the auth key (from the "key" config or a keyfile),
  // optional osdspec_affinity, and finally the "ready" marker.
  // Returns 0 on success or the first failing write_meta's error code.
  // NOTE(review): osd_fsid is unused here — presumably persisted
  // elsewhere; confirm.
  char val[80];
  int r;

  snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
  r = store->write_meta("magic", val);
  if (r < 0)
    return r;

  snprintf(val, sizeof(val), "%d", whoami);
  r = store->write_meta("whoami", val);
  if (r < 0)
    return r;

  cluster_fsid.print(val);
  r = store->write_meta("ceph_fsid", val);
  if (r < 0)
    return r;

  // "key" config wins over "keyfile" when both are set
  string key = cct->_conf.get_val<string>("key");
  if (key.size()) {
    r = store->write_meta("osd_key", key);
    if (r < 0)
      return r;
  } else {
    string keyfile = cct->_conf.get_val<string>("keyfile");
    if (!keyfile.empty()) {
      bufferlist keybl;
      string err;
      r = keybl.read_file(keyfile.c_str(), &err);
      if (r < 0) {
        derr << __func__ << " failed to read keyfile " << keyfile << ": "
             << err << ": " << cpp_strerror(r) << dendl;
        return r;
      }
      r = store->write_meta("osd_key", keybl.to_str());
      if (r < 0)
        return r;
    }
  }
  if (!osdspec_affinity.empty()) {
    r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
    if (r < 0)
      return r;
  }

  // "ready" is written last: its presence marks a complete meta set
  r = store->write_meta("ready", "ready");
  if (r < 0)
    return r;

  return 0;
}
2064
2065 int OSD::peek_meta(ObjectStore *store,
2066 std::string *magic,
2067 uuid_d *cluster_fsid,
2068 uuid_d *osd_fsid,
2069 int *whoami,
2070 ceph_release_t *require_osd_release)
2071 {
2072 string val;
2073
2074 int r = store->read_meta("magic", &val);
2075 if (r < 0)
2076 return r;
2077 *magic = val;
2078
2079 r = store->read_meta("whoami", &val);
2080 if (r < 0)
2081 return r;
2082 *whoami = atoi(val.c_str());
2083
2084 r = store->read_meta("ceph_fsid", &val);
2085 if (r < 0)
2086 return r;
2087 r = cluster_fsid->parse(val.c_str());
2088 if (!r)
2089 return -EINVAL;
2090
2091 r = store->read_meta("fsid", &val);
2092 if (r < 0) {
2093 *osd_fsid = uuid_d();
2094 } else {
2095 r = osd_fsid->parse(val.c_str());
2096 if (!r)
2097 return -EINVAL;
2098 }
2099
2100 r = store->read_meta("require_osd_release", &val);
2101 if (r >= 0) {
2102 *require_osd_release = ceph_release_from_name(val);
2103 }
2104
2105 return 0;
2106 }
2107
2108
2109 #undef dout_prefix
2110 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2111
2112 // cons/des
2113
// OSD constructor: wires together the messengers, mon/mgr/log clients,
// the object store, op tracking, the sharded op workqueue, and the
// heartbeat machinery, then creates the per-shard OSDShard objects.
// NOTE: C++ initializes members in declaration order, not in the order
// written in this initializer list.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  tick_timer(cct, osd_lock),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger, &mc->monmap),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  last_pg_create_epoch(0),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  service(this)
{

  // Export the client keytab location for GSSAPI/Kerberos before any
  // authentication is attempted.
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
     The default client keytab is used, if it is present and readable,
     to automatically obtain initial credentials for GSSAPI client
     applications. The principal name of the first entry in the client
     keytab is used by default when obtaining initial credentials.
     1. The KRB5_CLIENT_KTNAME environment variable.
     2. The default_client_keytab_name profile variable in [libdefaults].
     3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
                                    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // Seed op-tracker thresholds and history sizes from the current
  // configuration values.
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
  ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
#ifdef WITH_BLKIN
  // Name the blkin trace endpoint after this OSD ("osd.<id>").
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this);
    shards.push_back(one_shard);
  }
}
2214
2215 OSD::~OSD()
2216 {
2217 while (!shards.empty()) {
2218 delete shards.back();
2219 shards.pop_back();
2220 }
2221 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2222 cct->get_perfcounters_collection()->remove(logger);
2223 delete recoverystate_perf;
2224 delete logger;
2225 delete store;
2226 }
2227
2228 double OSD::get_tick_interval() const
2229 {
2230 // vary +/- 5% to avoid scrub scheduling livelocks
2231 constexpr auto delta = 0.05;
2232 return (OSD_TICK_INTERVAL *
2233 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2234 }
2235
// Signal handler entry point: log the signal and begin an orderly
// shutdown.  Only SIGINT and SIGTERM are expected here.
void OSD::handle_signal(int signum)
{
  ceph_assert(signum == SIGINT || signum == SIGTERM);
  derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
  shutdown();
}
2242
2243 int OSD::pre_init()
2244 {
2245 std::lock_guard lock(osd_lock);
2246 if (is_stopping())
2247 return 0;
2248
2249 if (store->test_mount_in_use()) {
2250 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2251 << "currently in use. (Is ceph-osd already running?)" << dendl;
2252 return -EBUSY;
2253 }
2254
2255 cct->_conf.add_observer(this);
2256 return 0;
2257 }
2258
// Determine a NUMA node for this OSD by comparing the NUMA locality of
// the object store and of the public/cluster network interfaces, then
// optionally pin all threads to that node's CPU set.  The explicit
// osd_numa_node config option overrides the automatic detection.
// Failures are logged but never fatal: this always returns 0.
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
            << front_node << dendl;
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
              << back_node << dendl;
      // Auto-affinity only when storage and both networks agree on the
      // same node (and osd_numa_auto_affinity is enabled).
      if (front_node == back_node &&
	  front_node == store_node) {
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else if (front_node != back_node) {
	dout(1) << __func__ << " public and cluster network numa nodes do not match"
		<< dendl;
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    } else if (back_node == -2) {
      // NOTE(review): -2 apparently signals that the interface's ports
      // span multiple numa nodes — confirm against get_iface_numa_node().
      dout(1) << __func__ << " cluster network " << back_iface
	      << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
           << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    dout(1) << __func__ << " public network " << front_iface
	    << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    // Resolve the node's CPU set and apply it to every thread; on any
    // failure fall back to no affinity (numa_node = -1).
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2339
2340 // asok
2341
// Admin socket hook that routes asok commands to the owning OSD.
class OSDSocketHook : public AdminSocketHook {
  OSD *osd;
public:
  explicit OSDSocketHook(OSD *o) : osd(o) {}
  // Synchronous path is not supported; all OSD asok commands must go
  // through the asynchronous interface below.
  int call(std::string_view prefix, const cmdmap_t& cmdmap,
	   Formatter *f,
	   std::ostream& ss,
	   bufferlist& out) override {
    ceph_abort("should use async hook");
  }
  // Forward the command to OSD::asok_command().  A malformed command
  // map (bad_cmd_get) is reported to the caller as -EINVAL instead of
  // letting the exception propagate.
  void call_async(
    std::string_view prefix,
    const cmdmap_t& cmdmap,
    Formatter *f,
    const bufferlist& inbl,
    std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
    try {
      osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
    } catch (const TOPNSPC::common::bad_cmd_get& e) {
      bufferlist empty;
      on_finish(-EINVAL, e.what(), empty);
    }
  }
};
2366
2367 std::set<int64_t> OSD::get_mapped_pools()
2368 {
2369 std::set<int64_t> pools;
2370 std::vector<spg_t> pgids;
2371 _get_pgids(&pgids);
2372 for (const auto &pgid : pgids) {
2373 pools.insert(pgid.pool());
2374 }
2375 return pools;
2376 }
2377
2378 void OSD::asok_command(
2379 std::string_view prefix, const cmdmap_t& cmdmap,
2380 Formatter *f,
2381 const bufferlist& inbl,
2382 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2383 {
2384 int ret = 0;
2385 stringstream ss; // stderr error message stream
2386 bufferlist outbl; // if empty at end, we'll dump formatter as output
2387
2388 // --- PG commands are routed here to PG::do_command ---
2389 if (prefix == "pg" ||
2390 prefix == "query" ||
2391 prefix == "mark_unfound_lost" ||
2392 prefix == "list_unfound" ||
2393 prefix == "scrub" ||
2394 prefix == "deep_scrub"
2395 ) {
2396 string pgidstr;
2397 pg_t pgid;
2398 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2399 ss << "no pgid specified";
2400 ret = -EINVAL;
2401 goto out;
2402 }
2403 if (!pgid.parse(pgidstr.c_str())) {
2404 ss << "couldn't parse pgid '" << pgidstr << "'";
2405 ret = -EINVAL;
2406 goto out;
2407 }
2408 spg_t pcand;
2409 PGRef pg;
2410 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2411 (pg = _lookup_lock_pg(pcand))) {
2412 if (pg->is_primary()) {
2413 cmdmap_t new_cmdmap = cmdmap;
2414 try {
2415 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2416 pg->unlock();
2417 return; // the pg handler calls on_finish directly
2418 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2419 pg->unlock();
2420 ss << e.what();
2421 ret = -EINVAL;
2422 goto out;
2423 }
2424 } else {
2425 ss << "not primary for pgid " << pgid;
2426 // do not reply; they will get newer maps and realize they
2427 // need to resend.
2428 pg->unlock();
2429 ret = -EAGAIN;
2430 goto out;
2431 }
2432 } else {
2433 ss << "i don't have pgid " << pgid;
2434 ret = -ENOENT;
2435 }
2436 }
2437
2438 // --- OSD commands follow ---
2439
2440 else if (prefix == "status") {
2441 lock_guard l(osd_lock);
2442 f->open_object_section("status");
2443 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2444 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2445 f->dump_unsigned("whoami", superblock.whoami);
2446 f->dump_string("state", get_state_name(get_state()));
2447 f->dump_unsigned("oldest_map", superblock.oldest_map);
2448 f->dump_unsigned("newest_map", superblock.newest_map);
2449 f->dump_unsigned("num_pgs", num_pgs);
2450 f->close_section();
2451 } else if (prefix == "flush_journal") {
2452 store->flush_journal();
2453 } else if (prefix == "dump_ops_in_flight" ||
2454 prefix == "ops" ||
2455 prefix == "dump_blocked_ops" ||
2456 prefix == "dump_historic_ops" ||
2457 prefix == "dump_historic_ops_by_duration" ||
2458 prefix == "dump_historic_slow_ops") {
2459
2460 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2461 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2462 will start to track new ops received afterwards.";
2463
2464 set<string> filters;
2465 vector<string> filter_str;
2466 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2467 copy(filter_str.begin(), filter_str.end(),
2468 inserter(filters, filters.end()));
2469 }
2470
2471 if (prefix == "dump_ops_in_flight" ||
2472 prefix == "ops") {
2473 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2474 ss << error_str;
2475 ret = -EINVAL;
2476 goto out;
2477 }
2478 }
2479 if (prefix == "dump_blocked_ops") {
2480 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2481 ss << error_str;
2482 ret = -EINVAL;
2483 goto out;
2484 }
2485 }
2486 if (prefix == "dump_historic_ops") {
2487 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2488 ss << error_str;
2489 ret = -EINVAL;
2490 goto out;
2491 }
2492 }
2493 if (prefix == "dump_historic_ops_by_duration") {
2494 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2495 ss << error_str;
2496 ret = -EINVAL;
2497 goto out;
2498 }
2499 }
2500 if (prefix == "dump_historic_slow_ops") {
2501 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2502 ss << error_str;
2503 ret = -EINVAL;
2504 goto out;
2505 }
2506 }
2507 } else if (prefix == "dump_op_pq_state") {
2508 f->open_object_section("pq");
2509 op_shardedwq.dump(f);
2510 f->close_section();
2511 } else if (prefix == "dump_blacklist") {
2512 list<pair<entity_addr_t,utime_t> > bl;
2513 OSDMapRef curmap = service.get_osdmap();
2514
2515 f->open_array_section("blacklist");
2516 curmap->get_blacklist(&bl);
2517 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2518 it != bl.end(); ++it) {
2519 f->open_object_section("entry");
2520 f->open_object_section("entity_addr_t");
2521 it->first.dump(f);
2522 f->close_section(); //entity_addr_t
2523 it->second.localtime(f->dump_stream("expire_time"));
2524 f->close_section(); //entry
2525 }
2526 f->close_section(); //blacklist
2527 } else if (prefix == "dump_watchers") {
2528 list<obj_watch_item_t> watchers;
2529 // scan pg's
2530 vector<PGRef> pgs;
2531 _get_pgs(&pgs);
2532 for (auto& pg : pgs) {
2533 list<obj_watch_item_t> pg_watchers;
2534 pg->get_watchers(&pg_watchers);
2535 watchers.splice(watchers.end(), pg_watchers);
2536 }
2537
2538 f->open_array_section("watchers");
2539 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2540 it != watchers.end(); ++it) {
2541
2542 f->open_object_section("watch");
2543
2544 f->dump_string("namespace", it->obj.nspace);
2545 f->dump_string("object", it->obj.oid.name);
2546
2547 f->open_object_section("entity_name");
2548 it->wi.name.dump(f);
2549 f->close_section(); //entity_name_t
2550
2551 f->dump_unsigned("cookie", it->wi.cookie);
2552 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2553
2554 f->open_object_section("entity_addr_t");
2555 it->wi.addr.dump(f);
2556 f->close_section(); //entity_addr_t
2557
2558 f->close_section(); //watch
2559 }
2560
2561 f->close_section(); //watchers
2562 } else if (prefix == "dump_recovery_reservations") {
2563 f->open_object_section("reservations");
2564 f->open_object_section("local_reservations");
2565 service.local_reserver.dump(f);
2566 f->close_section();
2567 f->open_object_section("remote_reservations");
2568 service.remote_reserver.dump(f);
2569 f->close_section();
2570 f->close_section();
2571 } else if (prefix == "dump_scrub_reservations") {
2572 f->open_object_section("scrub_reservations");
2573 service.dump_scrub_reservations(f);
2574 f->close_section();
2575 } else if (prefix == "get_latest_osdmap") {
2576 get_latest_osdmap();
2577 } else if (prefix == "set_heap_property") {
2578 string property;
2579 int64_t value = 0;
2580 string error;
2581 bool success = false;
2582 if (!cmd_getval(cmdmap, "property", property)) {
2583 error = "unable to get property";
2584 success = false;
2585 } else if (!cmd_getval(cmdmap, "value", value)) {
2586 error = "unable to get value";
2587 success = false;
2588 } else if (value < 0) {
2589 error = "negative value not allowed";
2590 success = false;
2591 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2592 error = "invalid property";
2593 success = false;
2594 } else {
2595 success = true;
2596 }
2597 f->open_object_section("result");
2598 f->dump_string("error", error);
2599 f->dump_bool("success", success);
2600 f->close_section();
2601 } else if (prefix == "get_heap_property") {
2602 string property;
2603 size_t value = 0;
2604 string error;
2605 bool success = false;
2606 if (!cmd_getval(cmdmap, "property", property)) {
2607 error = "unable to get property";
2608 success = false;
2609 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2610 error = "invalid property";
2611 success = false;
2612 } else {
2613 success = true;
2614 }
2615 f->open_object_section("result");
2616 f->dump_string("error", error);
2617 f->dump_bool("success", success);
2618 f->dump_int("value", value);
2619 f->close_section();
2620 } else if (prefix == "dump_objectstore_kv_stats") {
2621 store->get_db_statistics(f);
2622 } else if (prefix == "dump_scrubs") {
2623 service.dumps_scrub(f);
2624 } else if (prefix == "calc_objectstore_db_histogram") {
2625 store->generate_db_histogram(f);
2626 } else if (prefix == "flush_store_cache") {
2627 store->flush_cache(&ss);
2628 } else if (prefix == "dump_pgstate_history") {
2629 f->open_object_section("pgstate_history");
2630 f->open_array_section("pgs");
2631 vector<PGRef> pgs;
2632 _get_pgs(&pgs);
2633 for (auto& pg : pgs) {
2634 f->open_object_section("pg");
2635 f->dump_stream("pg") << pg->pg_id;
2636 f->dump_string("currently", pg->get_current_state());
2637 pg->dump_pgstate_history(f);
2638 f->close_section();
2639 }
2640 f->close_section();
2641 f->close_section();
2642 } else if (prefix == "compact") {
2643 dout(1) << "triggering manual compaction" << dendl;
2644 auto start = ceph::coarse_mono_clock::now();
2645 store->compact();
2646 auto end = ceph::coarse_mono_clock::now();
2647 double duration = std::chrono::duration<double>(end-start).count();
2648 dout(1) << "finished manual compaction in "
2649 << duration
2650 << " seconds" << dendl;
2651 f->open_object_section("compact_result");
2652 f->dump_float("elapsed_time", duration);
2653 f->close_section();
2654 } else if (prefix == "get_mapped_pools") {
2655 f->open_array_section("mapped_pools");
2656 set<int64_t> poollist = get_mapped_pools();
2657 for (auto pool : poollist) {
2658 f->dump_int("pool_id", pool);
2659 }
2660 f->close_section();
2661 } else if (prefix == "smart") {
2662 string devid;
2663 cmd_getval(cmdmap, "devid", devid);
2664 ostringstream out;
2665 probe_smart(devid, out);
2666 outbl.append(out.str());
2667 } else if (prefix == "list_devices") {
2668 set<string> devnames;
2669 store->get_devices(&devnames);
2670 f->open_array_section("list_devices");
2671 for (auto dev : devnames) {
2672 if (dev.find("dm-") == 0) {
2673 continue;
2674 }
2675 string err;
2676 f->open_object_section("device");
2677 f->dump_string("device", "/dev/" + dev);
2678 f->dump_string("device_id", get_device_id(dev, &err));
2679 f->close_section();
2680 }
2681 f->close_section();
2682 } else if (prefix == "send_beacon") {
2683 lock_guard l(osd_lock);
2684 if (is_active()) {
2685 send_beacon(ceph::coarse_mono_clock::now());
2686 }
2687 }
2688
2689 else if (prefix == "cluster_log") {
2690 vector<string> msg;
2691 cmd_getval(cmdmap, "message", msg);
2692 if (msg.empty()) {
2693 ret = -EINVAL;
2694 ss << "ignoring empty log message";
2695 goto out;
2696 }
2697 string message = msg.front();
2698 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2699 message += " " + *a;
2700 string lvl;
2701 cmd_getval(cmdmap, "level", lvl);
2702 clog_type level = string_to_clog_type(lvl);
2703 if (level < 0) {
2704 ret = -EINVAL;
2705 ss << "unknown level '" << lvl << "'";
2706 goto out;
2707 }
2708 clog->do_log(level, message);
2709 }
2710
2711 else if (prefix == "bench") {
2712 int64_t count;
2713 int64_t bsize;
2714 int64_t osize, onum;
2715 // default count 1G, size 4MB
2716 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2717 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2718 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2719 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2720
2721 uint32_t duration = cct->_conf->osd_bench_duration;
2722
2723 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2724 // let us limit the block size because the next checks rely on it
2725 // having a sane value. If we allow any block size to be set things
2726 // can still go sideways.
2727 ss << "block 'size' values are capped at "
2728 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2729 << " a higher value, please adjust 'osd_bench_max_block_size'";
2730 ret = -EINVAL;
2731 goto out;
2732 } else if (bsize < (int64_t) (1 << 20)) {
2733 // entering the realm of small block sizes.
2734 // limit the count to a sane value, assuming a configurable amount of
2735 // IOPS and duration, so that the OSD doesn't get hung up on this,
2736 // preventing timeouts from going off
2737 int64_t max_count =
2738 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2739 if (count > max_count) {
2740 ss << "'count' values greater than " << max_count
2741 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2742 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2743 << " for " << duration << " seconds,"
2744 << " can cause ill effects on osd. "
2745 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2746 << " value if you wish to use a higher 'count'.";
2747 ret = -EINVAL;
2748 goto out;
2749 }
2750 } else {
2751 // 1MB block sizes are big enough so that we get more stuff done.
2752 // However, to avoid the osd from getting hung on this and having
2753 // timers being triggered, we are going to limit the count assuming
2754 // a configurable throughput and duration.
2755 // NOTE: max_count is the total amount of bytes that we believe we
2756 // will be able to write during 'duration' for the given
2757 // throughput. The block size hardly impacts this unless it's
2758 // way too big. Given we already check how big the block size
2759 // is, it's safe to assume everything will check out.
2760 int64_t max_count =
2761 cct->_conf->osd_bench_large_size_max_throughput * duration;
2762 if (count > max_count) {
2763 ss << "'count' values greater than " << max_count
2764 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2765 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2766 << " for " << duration << " seconds,"
2767 << " can cause ill effects on osd. "
2768 << " Please adjust 'osd_bench_large_size_max_throughput'"
2769 << " with a higher value if you wish to use a higher 'count'.";
2770 ret = -EINVAL;
2771 goto out;
2772 }
2773 }
2774
2775 if (osize && bsize > osize)
2776 bsize = osize;
2777
2778 dout(1) << " bench count " << count
2779 << " bsize " << byte_u_t(bsize) << dendl;
2780
2781 ObjectStore::Transaction cleanupt;
2782
2783 if (osize && onum) {
2784 bufferlist bl;
2785 bufferptr bp(osize);
2786 bp.zero();
2787 bl.push_back(std::move(bp));
2788 bl.rebuild_page_aligned();
2789 for (int i=0; i<onum; ++i) {
2790 char nm[30];
2791 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2792 object_t oid(nm);
2793 hobject_t soid(sobject_t(oid, 0));
2794 ObjectStore::Transaction t;
2795 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
2796 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2797 cleanupt.remove(coll_t(), ghobject_t(soid));
2798 }
2799 }
2800
2801 bufferlist bl;
2802 bufferptr bp(bsize);
2803 bp.zero();
2804 bl.push_back(std::move(bp));
2805 bl.rebuild_page_aligned();
2806
2807 {
2808 C_SaferCond waiter;
2809 if (!service.meta_ch->flush_commit(&waiter)) {
2810 waiter.wait();
2811 }
2812 }
2813
2814 utime_t start = ceph_clock_now();
2815 for (int64_t pos = 0; pos < count; pos += bsize) {
2816 char nm[30];
2817 unsigned offset = 0;
2818 if (onum && osize) {
2819 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2820 offset = rand() % (osize / bsize) * bsize;
2821 } else {
2822 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2823 }
2824 object_t oid(nm);
2825 hobject_t soid(sobject_t(oid, 0));
2826 ObjectStore::Transaction t;
2827 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2828 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2829 if (!onum || !osize)
2830 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2831 }
2832
2833 {
2834 C_SaferCond waiter;
2835 if (!service.meta_ch->flush_commit(&waiter)) {
2836 waiter.wait();
2837 }
2838 }
2839 utime_t end = ceph_clock_now();
2840
2841 // clean up
2842 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2843 {
2844 C_SaferCond waiter;
2845 if (!service.meta_ch->flush_commit(&waiter)) {
2846 waiter.wait();
2847 }
2848 }
2849
2850 double elapsed = end - start;
2851 double rate = count / elapsed;
2852 double iops = rate / bsize;
2853 f->open_object_section("osd_bench_results");
2854 f->dump_int("bytes_written", count);
2855 f->dump_int("blocksize", bsize);
2856 f->dump_float("elapsed_sec", elapsed);
2857 f->dump_float("bytes_per_sec", rate);
2858 f->dump_float("iops", iops);
2859 f->close_section();
2860 }
2861
2862 else if (prefix == "flush_pg_stats") {
2863 mgrc.send_pgstats();
2864 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2865 }
2866
2867 else if (prefix == "heap") {
2868 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2869 }
2870
2871 else if (prefix == "debug dump_missing") {
2872 f->open_array_section("pgs");
2873 vector<PGRef> pgs;
2874 _get_pgs(&pgs);
2875 for (auto& pg : pgs) {
2876 string s = stringify(pg->pg_id);
2877 f->open_array_section(s.c_str());
2878 pg->lock();
2879 pg->dump_missing(f);
2880 pg->unlock();
2881 f->close_section();
2882 }
2883 f->close_section();
2884 }
2885
2886 else if (prefix == "debug kick_recovery_wq") {
2887 int64_t delay;
2888 cmd_getval(cmdmap, "delay", delay);
2889 ostringstream oss;
2890 oss << delay;
2891 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2892 if (ret != 0) {
2893 ss << "kick_recovery_wq: error setting "
2894 << "osd_recovery_delay_start to '" << delay << "': error "
2895 << ret;
2896 goto out;
2897 }
2898 cct->_conf.apply_changes(nullptr);
2899 ss << "kicking recovery queue. set osd_recovery_delay_start "
2900 << "to " << cct->_conf->osd_recovery_delay_start;
2901 }
2902
2903 else if (prefix == "cpu_profiler") {
2904 ostringstream ds;
2905 string arg;
2906 cmd_getval(cmdmap, "arg", arg);
2907 vector<string> argvec;
2908 get_str_vec(arg, argvec);
2909 cpu_profiler_handle_command(argvec, ds);
2910 outbl.append(ds.str());
2911 }
2912
2913 else if (prefix == "dump_pg_recovery_stats") {
2914 lock_guard l(osd_lock);
2915 pg_recovery_stats.dump_formatted(f);
2916 }
2917
2918 else if (prefix == "reset_pg_recovery_stats") {
2919 lock_guard l(osd_lock);
2920 pg_recovery_stats.reset();
2921 }
2922
2923 else if (prefix == "perf histogram dump") {
2924 std::string logger;
2925 std::string counter;
2926 cmd_getval(cmdmap, "logger", logger);
2927 cmd_getval(cmdmap, "counter", counter);
2928 cct->get_perfcounters_collection()->dump_formatted_histograms(
2929 f, false, logger, counter);
2930 }
2931
2932 else if (prefix == "cache drop") {
2933 lock_guard l(osd_lock);
2934 dout(20) << "clearing all caches" << dendl;
2935 // Clear the objectstore's cache - onode and buffer for Bluestore,
2936 // system's pagecache for Filestore
2937 ret = store->flush_cache(&ss);
2938 if (ret < 0) {
2939 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2940 goto out;
2941 }
2942 // Clear the objectcontext cache (per PG)
2943 vector<PGRef> pgs;
2944 _get_pgs(&pgs);
2945 for (auto& pg: pgs) {
2946 pg->clear_cache();
2947 }
2948 }
2949
2950 else if (prefix == "cache status") {
2951 lock_guard l(osd_lock);
2952 int obj_ctx_count = 0;
2953 vector<PGRef> pgs;
2954 _get_pgs(&pgs);
2955 for (auto& pg: pgs) {
2956 obj_ctx_count += pg->get_cache_obj_count();
2957 }
2958 f->open_object_section("cache_status");
2959 f->dump_int("object_ctx", obj_ctx_count);
2960 store->dump_cache_stats(f);
2961 f->close_section();
2962 }
2963
2964 else if (prefix == "scrub_purged_snaps") {
2965 lock_guard l(osd_lock);
2966 scrub_purged_snaps();
2967 }
2968
2969 else if (prefix == "dump_osd_network") {
2970 lock_guard l(osd_lock);
2971 int64_t value = 0;
2972 if (!(cmd_getval(cmdmap, "value", value))) {
2973 // Convert milliseconds to microseconds
2974 value = static_cast<double>(g_conf().get_val<double>(
2975 "mon_warn_on_slow_ping_time")) * 1000;
2976 if (value == 0) {
2977 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2978 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2979 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2980 }
2981 } else {
2982 // Convert user input to microseconds
2983 value *= 1000;
2984 }
2985 if (value < 0) value = 0;
2986
2987 struct osd_ping_time_t {
2988 uint32_t pingtime;
2989 int to;
2990 bool back;
2991 std::array<uint32_t,3> times;
2992 std::array<uint32_t,3> min;
2993 std::array<uint32_t,3> max;
2994 uint32_t last;
2995 uint32_t last_update;
2996
2997 bool operator<(const osd_ping_time_t& rhs) const {
2998 if (pingtime < rhs.pingtime)
2999 return true;
3000 if (pingtime > rhs.pingtime)
3001 return false;
3002 if (to < rhs.to)
3003 return true;
3004 if (to > rhs.to)
3005 return false;
3006 return back;
3007 }
3008 };
3009
3010 set<osd_ping_time_t> sorted;
3011 // Get pingtimes under lock and not on the stack
3012 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3013 service.get_hb_pingtime(pingtimes);
3014 for (auto j : *pingtimes) {
3015 if (j.second.last_update == 0)
3016 continue;
3017 osd_ping_time_t item;
3018 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3019 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3020 if (item.pingtime >= value) {
3021 item.to = j.first;
3022 item.times[0] = j.second.back_pingtime[0];
3023 item.times[1] = j.second.back_pingtime[1];
3024 item.times[2] = j.second.back_pingtime[2];
3025 item.min[0] = j.second.back_min[0];
3026 item.min[1] = j.second.back_min[1];
3027 item.min[2] = j.second.back_min[2];
3028 item.max[0] = j.second.back_max[0];
3029 item.max[1] = j.second.back_max[1];
3030 item.max[2] = j.second.back_max[2];
3031 item.last = j.second.back_last;
3032 item.back = true;
3033 item.last_update = j.second.last_update;
3034 sorted.emplace(item);
3035 }
3036 if (j.second.front_last == 0)
3037 continue;
3038 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3039 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3040 if (item.pingtime >= value) {
3041 item.to = j.first;
3042 item.times[0] = j.second.front_pingtime[0];
3043 item.times[1] = j.second.front_pingtime[1];
3044 item.times[2] = j.second.front_pingtime[2];
3045 item.min[0] = j.second.front_min[0];
3046 item.min[1] = j.second.front_min[1];
3047 item.min[2] = j.second.front_min[2];
3048 item.max[0] = j.second.front_max[0];
3049 item.max[1] = j.second.front_max[1];
3050 item.max[2] = j.second.front_max[2];
3051 item.last = j.second.front_last;
3052 item.last_update = j.second.last_update;
3053 item.back = false;
3054 sorted.emplace(item);
3055 }
3056 }
3057 delete pingtimes;
3058 //
3059 // Network ping times (1min 5min 15min)
3060 f->open_object_section("network_ping_times");
3061 f->dump_int("threshold", value / 1000);
3062 f->open_array_section("entries");
3063 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3064 ceph_assert(sitem.pingtime >= value);
3065 f->open_object_section("entry");
3066
3067 const time_t lu(sitem.last_update);
3068 char buffer[26];
3069 string lustr(ctime_r(&lu, buffer));
3070 lustr.pop_back(); // Remove trailing \n
3071 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3072 f->dump_string("last update", lustr);
3073 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3074 f->dump_int("from osd", whoami);
3075 f->dump_int("to osd", sitem.to);
3076 f->dump_string("interface", (sitem.back ? "back" : "front"));
3077 f->open_object_section("average");
3078 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3079 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3080 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3081 f->close_section(); // average
3082 f->open_object_section("min");
3083 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3084 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3085 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3086 f->close_section(); // min
3087 f->open_object_section("max");
3088 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3089 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3090 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3091 f->close_section(); // max
3092 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3093 f->close_section(); // entry
3094 }
3095 f->close_section(); // entries
3096 f->close_section(); // network_ping_times
3097 } else {
3098 ceph_abort_msg("broken asok registration");
3099 }
3100
3101 out:
3102 on_finish(ret, ss.str(), outbl);
3103 }
3104
3105 class TestOpsSocketHook : public AdminSocketHook {
3106 OSDService *service;
3107 ObjectStore *store;
3108 public:
3109 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3110 int call(std::string_view command, const cmdmap_t& cmdmap,
3111 Formatter *f,
3112 std::ostream& errss,
3113 bufferlist& out) override {
3114 int r = 0;
3115 stringstream outss;
3116 try {
3117 test_ops(service, store, command, cmdmap, outss);
3118 out.append(outss);
3119 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3120 errss << e.what();
3121 r = -EINVAL;
3122 }
3123 return r;
3124 }
3125 void test_ops(OSDService *service, ObjectStore *store,
3126 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3127
3128 };
3129
3130 class OSD::C_Tick : public Context {
3131 OSD *osd;
3132 public:
3133 explicit C_Tick(OSD *o) : osd(o) {}
3134 void finish(int r) override {
3135 osd->tick();
3136 }
3137 };
3138
3139 class OSD::C_Tick_WithoutOSDLock : public Context {
3140 OSD *osd;
3141 public:
3142 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3143 void finish(int r) override {
3144 osd->tick_without_osd_lock();
3145 }
3146 };
3147
3148 int OSD::enable_disable_fuse(bool stop)
3149 {
3150 #ifdef HAVE_LIBFUSE
3151 int r;
3152 string mntpath = cct->_conf->osd_data + "/fuse";
3153 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3154 dout(1) << __func__ << " disabling" << dendl;
3155 fuse_store->stop();
3156 delete fuse_store;
3157 fuse_store = NULL;
3158 r = ::rmdir(mntpath.c_str());
3159 if (r < 0) {
3160 r = -errno;
3161 derr << __func__ << " failed to rmdir " << mntpath << ": "
3162 << cpp_strerror(r) << dendl;
3163 return r;
3164 }
3165 return 0;
3166 }
3167 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3168 dout(1) << __func__ << " enabling" << dendl;
3169 r = ::mkdir(mntpath.c_str(), 0700);
3170 if (r < 0)
3171 r = -errno;
3172 if (r < 0 && r != -EEXIST) {
3173 derr << __func__ << " unable to create " << mntpath << ": "
3174 << cpp_strerror(r) << dendl;
3175 return r;
3176 }
3177 fuse_store = new FuseStore(store, mntpath);
3178 r = fuse_store->start();
3179 if (r < 0) {
3180 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3181 delete fuse_store;
3182 fuse_store = NULL;
3183 return r;
3184 }
3185 }
3186 #endif // HAVE_LIBFUSE
3187 return 0;
3188 }
3189
3190 size_t OSD::get_num_cache_shards()
3191 {
3192 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3193 }
3194
3195 int OSD::get_num_op_shards()
3196 {
3197 if (cct->_conf->osd_op_num_shards)
3198 return cct->_conf->osd_op_num_shards;
3199 if (store_is_rotational)
3200 return cct->_conf->osd_op_num_shards_hdd;
3201 else
3202 return cct->_conf->osd_op_num_shards_ssd;
3203 }
3204
3205 int OSD::get_num_op_threads()
3206 {
3207 if (cct->_conf->osd_op_num_threads_per_shard)
3208 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3209 if (store_is_rotational)
3210 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3211 else
3212 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3213 }
3214
3215 float OSD::get_osd_recovery_sleep()
3216 {
3217 if (cct->_conf->osd_recovery_sleep)
3218 return cct->_conf->osd_recovery_sleep;
3219 if (!store_is_rotational && !journal_is_rotational)
3220 return cct->_conf->osd_recovery_sleep_ssd;
3221 else if (store_is_rotational && !journal_is_rotational)
3222 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3223 else
3224 return cct->_conf->osd_recovery_sleep_hdd;
3225 }
3226
3227 float OSD::get_osd_delete_sleep()
3228 {
3229 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3230 if (osd_delete_sleep > 0)
3231 return osd_delete_sleep;
3232 if (!store_is_rotational && !journal_is_rotational)
3233 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3234 if (store_is_rotational && !journal_is_rotational)
3235 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3236 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3237 }
3238
3239 int OSD::get_recovery_max_active()
3240 {
3241 if (cct->_conf->osd_recovery_max_active)
3242 return cct->_conf->osd_recovery_max_active;
3243 if (store_is_rotational)
3244 return cct->_conf->osd_recovery_max_active_hdd;
3245 else
3246 return cct->_conf->osd_recovery_max_active_ssd;
3247 }
3248
3249 float OSD::get_osd_snap_trim_sleep()
3250 {
3251 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3252 if (osd_snap_trim_sleep > 0)
3253 return osd_snap_trim_sleep;
3254 if (!store_is_rotational && !journal_is_rotational)
3255 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3256 if (store_is_rotational && !journal_is_rotational)
3257 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3258 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3259 }
3260
// Bring the OSD online after construction and pre_init(): mount the
// ObjectStore, validate superblock/compat features, load PGs, wire up
// messengers and the mon/mgr clients, authenticate, and start the boot
// process.  Called once at daemon startup; returns 0 on success or a
// negative errno, in which case the store has been unmounted and freed
// (the "out:" path).  NOTE(review): statement order here is load-bearing
// (goto targets, lock hand-offs) — do not reorder.
int OSD::init()
{
  OSDMapRef osdmap;
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  // -- timers/finisher must exist before anything can schedule work --
  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  // remember the release recorded on disk (empty/unknown if never set)
  {
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = ceph_release_from_name(val);
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_cache_shards());

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;  // nothing to unwind yet; do NOT take the out: path
  }
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
	  << dendl;

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  // declared up here because code below this point uses goto out
  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  // sanity check long object name handling
  {
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << " osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << " osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // refuse to run against a store with incompat features we lack
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  // the on-disk identity must match the id we were started as
  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  startup_time = ceph::mono_clock::now();

  // load up "current" osdmap
  assert_warn(!get_osdmap());
  if (get_osdmap()) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);
  set_osdmap(osdmap);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  // -- superblock feature upgrade (first boot on a newer release) --
  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // Are we adding SNAPMAPPER2?
    if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
      dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
	      << dendl;
      auto ch = service.meta_ch;
      auto hoid = make_snapmapper_oid();
      unsigned max = cct->_conf->osd_target_transaction_size;
      r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
      if (r < 0)
	goto out;
    }
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }
  // ... and the purged_snaps object
  if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
    dout(10) << "init creating/touching purged_snaps object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // preload rados classes if configured; failures are non-fatal
  if (cct->_conf->osd_open_classes_on_start) {
    int r = ClassHandler::get_instance().open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;

  create_logger();

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // -- messenger auth wiring --
  // client_messenger auth_client is already set up by monc.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
		      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  // -- mgr client: pg stats + perf metric query callbacks --
  mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const ConfigPayload &config_payload) {
      set_perf_queries(config_payload);
    },
    [this] {
      return get_perf_reports();
    });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter.get());

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // -- prime pending PG splits/merges from any map gap we slept through --
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      std::scoped_lock l{*pg};
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	assert(new_children.empty());  // every child consumed by some shard
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	assert(merge_pgs.empty());  // every merge consumed by some shard
      }
    }
  }

  osd_op_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // drop osd_lock while we block on the monitor; re-taken below
  osd_lock.unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
	 << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  osd_lock.lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;

out:
  // error path: unwind the mount done above and free the store
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
3662
3663 void OSD::final_init()
3664 {
3665 AdminSocket *admin_socket = cct->get_admin_socket();
3666 asok_hook = new OSDSocketHook(this);
3667 int r = admin_socket->register_command("status", asok_hook,
3668 "high-level status of OSD");
3669 ceph_assert(r == 0);
3670 r = admin_socket->register_command("flush_journal",
3671 asok_hook,
3672 "flush the journal to permanent store");
3673 ceph_assert(r == 0);
3674 r = admin_socket->register_command("dump_ops_in_flight " \
3675 "name=filterstr,type=CephString,n=N,req=false",
3676 asok_hook,
3677 "show the ops currently in flight");
3678 ceph_assert(r == 0);
3679 r = admin_socket->register_command("ops " \
3680 "name=filterstr,type=CephString,n=N,req=false",
3681 asok_hook,
3682 "show the ops currently in flight");
3683 ceph_assert(r == 0);
3684 r = admin_socket->register_command("dump_blocked_ops " \
3685 "name=filterstr,type=CephString,n=N,req=false",
3686 asok_hook,
3687 "show the blocked ops currently in flight");
3688 ceph_assert(r == 0);
3689 r = admin_socket->register_command("dump_historic_ops " \
3690 "name=filterstr,type=CephString,n=N,req=false",
3691 asok_hook,
3692 "show recent ops");
3693 ceph_assert(r == 0);
3694 r = admin_socket->register_command("dump_historic_slow_ops " \
3695 "name=filterstr,type=CephString,n=N,req=false",
3696 asok_hook,
3697 "show slowest recent ops");
3698 ceph_assert(r == 0);
3699 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3700 "name=filterstr,type=CephString,n=N,req=false",
3701 asok_hook,
3702 "show slowest recent ops, sorted by duration");
3703 ceph_assert(r == 0);
3704 r = admin_socket->register_command("dump_op_pq_state",
3705 asok_hook,
3706 "dump op priority queue state");
3707 ceph_assert(r == 0);
3708 r = admin_socket->register_command("dump_blacklist",
3709 asok_hook,
3710 "dump blacklisted clients and times");
3711 ceph_assert(r == 0);
3712 r = admin_socket->register_command("dump_watchers",
3713 asok_hook,
3714 "show clients which have active watches,"
3715 " and on which objects");
3716 ceph_assert(r == 0);
3717 r = admin_socket->register_command("dump_recovery_reservations",
3718 asok_hook,
3719 "show recovery reservations");
3720 ceph_assert(r == 0);
3721 r = admin_socket->register_command("dump_scrub_reservations",
3722 asok_hook,
3723 "show scrub reservations");
3724 ceph_assert(r == 0);
3725 r = admin_socket->register_command("get_latest_osdmap",
3726 asok_hook,
3727 "force osd to update the latest map from "
3728 "the mon");
3729 ceph_assert(r == 0);
3730
3731 r = admin_socket->register_command("set_heap_property " \
3732 "name=property,type=CephString " \
3733 "name=value,type=CephInt",
3734 asok_hook,
3735 "update malloc extension heap property");
3736 ceph_assert(r == 0);
3737
3738 r = admin_socket->register_command("get_heap_property " \
3739 "name=property,type=CephString",
3740 asok_hook,
3741 "get malloc extension heap property");
3742 ceph_assert(r == 0);
3743
3744 r = admin_socket->register_command("dump_objectstore_kv_stats",
3745 asok_hook,
3746 "print statistics of kvdb which used by bluestore");
3747 ceph_assert(r == 0);
3748
3749 r = admin_socket->register_command("dump_scrubs",
3750 asok_hook,
3751 "print scheduled scrubs");
3752 ceph_assert(r == 0);
3753
3754 r = admin_socket->register_command("calc_objectstore_db_histogram",
3755 asok_hook,
3756 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3757 ceph_assert(r == 0);
3758
3759 r = admin_socket->register_command("flush_store_cache",
3760 asok_hook,
3761 "Flush bluestore internal cache");
3762 ceph_assert(r == 0);
3763 r = admin_socket->register_command("dump_pgstate_history",
3764 asok_hook,
3765 "show recent state history");
3766 ceph_assert(r == 0);
3767
3768 r = admin_socket->register_command("compact",
3769 asok_hook,
3770 "Commpact object store's omap."
3771 " WARNING: Compaction probably slows your requests");
3772 ceph_assert(r == 0);
3773
3774 r = admin_socket->register_command("get_mapped_pools",
3775 asok_hook,
3776 "dump pools whose PG(s) are mapped to this OSD.");
3777
3778 ceph_assert(r == 0);
3779
3780 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3781 asok_hook,
3782 "probe OSD devices for SMART data.");
3783
3784 ceph_assert(r == 0);
3785
3786 r = admin_socket->register_command("list_devices",
3787 asok_hook,
3788 "list OSD devices.");
3789 r = admin_socket->register_command("send_beacon",
3790 asok_hook,
3791 "send OSD beacon to mon immediately");
3792
3793 r = admin_socket->register_command(
3794 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3795 "Dump osd heartbeat network ping times");
3796 ceph_assert(r == 0);
3797
3798 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3799 // Note: pools are CephString instead of CephPoolname because
3800 // these commands traditionally support both pool names and numbers
3801 r = admin_socket->register_command(
3802 "setomapval " \
3803 "name=pool,type=CephString " \
3804 "name=objname,type=CephObjectname " \
3805 "name=key,type=CephString "\
3806 "name=val,type=CephString",
3807 test_ops_hook,
3808 "set omap key");
3809 ceph_assert(r == 0);
3810 r = admin_socket->register_command(
3811 "rmomapkey " \
3812 "name=pool,type=CephString " \
3813 "name=objname,type=CephObjectname " \
3814 "name=key,type=CephString",
3815 test_ops_hook,
3816 "remove omap key");
3817 ceph_assert(r == 0);
3818 r = admin_socket->register_command(
3819 "setomapheader " \
3820 "name=pool,type=CephString " \
3821 "name=objname,type=CephObjectname " \
3822 "name=header,type=CephString",
3823 test_ops_hook,
3824 "set omap header");
3825 ceph_assert(r == 0);
3826
3827 r = admin_socket->register_command(
3828 "getomap " \
3829 "name=pool,type=CephString " \
3830 "name=objname,type=CephObjectname",
3831 test_ops_hook,
3832 "output entire object map");
3833 ceph_assert(r == 0);
3834
3835 r = admin_socket->register_command(
3836 "truncobj " \
3837 "name=pool,type=CephString " \
3838 "name=objname,type=CephObjectname " \
3839 "name=len,type=CephInt",
3840 test_ops_hook,
3841 "truncate object to length");
3842 ceph_assert(r == 0);
3843
3844 r = admin_socket->register_command(
3845 "injectdataerr " \
3846 "name=pool,type=CephString " \
3847 "name=objname,type=CephObjectname " \
3848 "name=shardid,type=CephInt,req=false,range=0|255",
3849 test_ops_hook,
3850 "inject data error to an object");
3851 ceph_assert(r == 0);
3852
3853 r = admin_socket->register_command(
3854 "injectmdataerr " \
3855 "name=pool,type=CephString " \
3856 "name=objname,type=CephObjectname " \
3857 "name=shardid,type=CephInt,req=false,range=0|255",
3858 test_ops_hook,
3859 "inject metadata error to an object");
3860 ceph_assert(r == 0);
3861 r = admin_socket->register_command(
3862 "set_recovery_delay " \
3863 "name=utime,type=CephInt,req=false",
3864 test_ops_hook,
3865 "Delay osd recovery by specified seconds");
3866 ceph_assert(r == 0);
3867 r = admin_socket->register_command(
3868 "injectfull " \
3869 "name=type,type=CephString,req=false " \
3870 "name=count,type=CephInt,req=false ",
3871 test_ops_hook,
3872 "Inject a full disk (optional count times)");
3873 ceph_assert(r == 0);
3874 r = admin_socket->register_command(
3875 "bench " \
3876 "name=count,type=CephInt,req=false " \
3877 "name=size,type=CephInt,req=false " \
3878 "name=object_size,type=CephInt,req=false " \
3879 "name=object_num,type=CephInt,req=false ",
3880 asok_hook,
3881 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3882 "(default count=1G default size=4MB). Results in log.");
3883 ceph_assert(r == 0);
3884 r = admin_socket->register_command(
3885 "cluster_log " \
3886 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3887 "name=message,type=CephString,n=N",
3888 asok_hook,
3889 "log a message to the cluster log");
3890 ceph_assert(r == 0);
3891 r = admin_socket->register_command(
3892 "flush_pg_stats",
3893 asok_hook,
3894 "flush pg stats");
3895 ceph_assert(r == 0);
3896 r = admin_socket->register_command(
3897 "heap " \
3898 "name=heapcmd,type=CephChoices,strings=" \
3899 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3900 "name=value,type=CephString,req=false",
3901 asok_hook,
3902 "show heap usage info (available only if compiled with tcmalloc)");
3903 ceph_assert(r == 0);
3904 r = admin_socket->register_command(
3905 "debug dump_missing " \
3906 "name=filename,type=CephFilepath",
3907 asok_hook,
3908 "dump missing objects to a named file");
3909 ceph_assert(r == 0);
3910 r = admin_socket->register_command(
3911 "debug kick_recovery_wq " \
3912 "name=delay,type=CephInt,range=0",
3913 asok_hook,
3914 "set osd_recovery_delay_start to <val>");
3915 ceph_assert(r == 0);
3916 r = admin_socket->register_command(
3917 "cpu_profiler " \
3918 "name=arg,type=CephChoices,strings=status|flush",
3919 asok_hook,
3920 "run cpu profiling on daemon");
3921 ceph_assert(r == 0);
3922 r = admin_socket->register_command(
3923 "dump_pg_recovery_stats",
3924 asok_hook,
3925 "dump pg recovery statistics");
3926 ceph_assert(r == 0);
3927 r = admin_socket->register_command(
3928 "reset_pg_recovery_stats",
3929 asok_hook,
3930 "reset pg recovery statistics");
3931 ceph_assert(r == 0);
3932 r = admin_socket->register_command(
3933 "cache drop",
3934 asok_hook,
3935 "Drop all OSD caches");
3936 ceph_assert(r == 0);
3937 r = admin_socket->register_command(
3938 "cache status",
3939 asok_hook,
3940 "Get OSD caches statistics");
3941 ceph_assert(r == 0);
3942 r = admin_socket->register_command(
3943 "scrub_purged_snaps",
3944 asok_hook,
3945 "Scrub purged_snaps vs snapmapper index");
3946 ceph_assert(r == 0);
3947
3948 // -- pg commands --
3949 // old form: ceph pg <pgid> command ...
3950 r = admin_socket->register_command(
3951 "pg " \
3952 "name=pgid,type=CephPgid " \
3953 "name=cmd,type=CephChoices,strings=query",
3954 asok_hook,
3955 "");
3956 ceph_assert(r == 0);
3957 r = admin_socket->register_command(
3958 "pg " \
3959 "name=pgid,type=CephPgid " \
3960 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3961 "name=mulcmd,type=CephChoices,strings=revert|delete",
3962 asok_hook,
3963 "");
3964 ceph_assert(r == 0);
3965 r = admin_socket->register_command(
3966 "pg " \
3967 "name=pgid,type=CephPgid " \
3968 "name=cmd,type=CephChoices,strings=list_unfound " \
3969 "name=offset,type=CephString,req=false",
3970 asok_hook,
3971 "");
3972 ceph_assert(r == 0);
3973 r = admin_socket->register_command(
3974 "pg " \
3975 "name=pgid,type=CephPgid " \
3976 "name=cmd,type=CephChoices,strings=scrub " \
3977 "name=time,type=CephInt,req=false",
3978 asok_hook,
3979 "");
3980 ceph_assert(r == 0);
3981 r = admin_socket->register_command(
3982 "pg " \
3983 "name=pgid,type=CephPgid " \
3984 "name=cmd,type=CephChoices,strings=deep_scrub " \
3985 "name=time,type=CephInt,req=false",
3986 asok_hook,
3987 "");
3988 ceph_assert(r == 0);
3989 // new form: tell <pgid> <cmd> for both cli and rest
3990 r = admin_socket->register_command(
3991 "query",
3992 asok_hook,
3993 "show details of a specific pg");
3994 ceph_assert(r == 0);
3995 r = admin_socket->register_command(
3996 "mark_unfound_lost " \
3997 "name=pgid,type=CephPgid,req=false " \
3998 "name=mulcmd,type=CephChoices,strings=revert|delete",
3999 asok_hook,
4000 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4001 ceph_assert(r == 0);
4002 r = admin_socket->register_command(
4003 "list_unfound " \
4004 "name=pgid,type=CephPgid,req=false " \
4005 "name=offset,type=CephString,req=false",
4006 asok_hook,
4007 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4008 ceph_assert(r == 0);
4009 r = admin_socket->register_command(
4010 "scrub " \
4011 "name=pgid,type=CephPgid,req=false " \
4012 "name=time,type=CephInt,req=false",
4013 asok_hook,
4014 "Trigger a scheduled scrub ");
4015 ceph_assert(r == 0);
4016 r = admin_socket->register_command(
4017 "deep_scrub " \
4018 "name=pgid,type=CephPgid,req=false " \
4019 "name=time,type=CephInt,req=false",
4020 asok_hook,
4021 "Trigger a scheduled deep scrub ");
4022 ceph_assert(r == 0);
4023 }
4024
4025 void OSD::create_logger()
4026 {
4027 dout(10) << "create_logger" << dendl;
4028
4029 logger = build_osd_logger(cct);
4030 cct->get_perfcounters_collection()->add(logger);
4031 }
4032
4033 void OSD::create_recoverystate_perf()
4034 {
4035 dout(10) << "create_recoverystate_perf" << dendl;
4036
4037 recoverystate_perf = build_recoverystate_perf(cct);
4038 cct->get_perfcounters_collection()->add(recoverystate_perf);
4039 }
4040
/**
 * Shut this OSD down.
 *
 * Fast path: with osd_fast_shutdown set we skip all orderly teardown —
 * optionally tell the mon we are stopping (osd_fast_shutdown_notify_mon),
 * flush the log, and _exit(0) immediately.
 *
 * Orderly path: transition to STATE_STOPPING and tear subsystems down
 * in dependency order.  The sequencing below is deliberate: the op
 * queue is drained before PGs shut down, and drained again afterwards
 * for anything the PGs requeued; osdmap references are dropped before
 * the store is unmounted; messengers are shut down last.
 *
 * @return 0, or the error from queueing the superblock write.
 */
int OSD::shutdown()
{
  if (cct->_conf->osd_fast_shutdown) {
    derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
    if (cct->_conf->osd_fast_shutdown_notify_mon)
      service.prepare_to_stop();
    cct->_log->flush();
    _exit(0);
  }

  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.lock();
  if (is_stopping()) {
    osd_lock.unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging: crank all relevant debug levels to 100 for the rest of
  // shutdown when osd_debug_shutdown is set.
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs. this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  osd_lock.unlock();

  // stop the heartbeat thread and sever heartbeat connections
  {
    std::lock_guard l{heartbeat_lock};
    heartbeat_stop = true;
    heartbeat_cond.notify_all();
    heartbeat_peers.clear();
  }
  heartbeat_thread.join();

  hb_back_server_messenger->mark_down_all();
  hb_front_server_messenger->mark_down_all();
  hb_front_client_messenger->mark_down_all();
  hb_back_client_messenger->mark_down_all();

  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch in the superblock so the next boot knows we
  // stopped cleanly
  dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = get_osdmap_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
         << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs: detach every PG from its shard.  Anything still
  // holding an extra reference here is a leak (optionally fatal via
  // osd_shutdown_pgref_assert).
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
        continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
        derr << "pgid " << pg->get_pgid() << " has ref count of "
             << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
        pg->dump_live_ids();
#endif
        if (cct->_conf->osd_shutdown_pgref_assert) {
          ceph_abort();
        }
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  osd_lock.unlock();
  cct->_conf.remove_observer(this);
  osd_lock.lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.unlock();
  // drop all osdmap references (including each shard's copy) before
  // tearing down the service and the store
  {
    std::unique_lock l{map_lock};
    set_osdmap(OSDMapRef());
  }
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  ClassHandler::get_instance().shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  // propagate any superblock-write error recorded above
  return r;
}
4236
/**
 * Send a mon command, creating this OSD's id first if necessary.
 *
 * If the command fails with -ENOENT (this osd id does not exist yet),
 * issue a one-time "osd create" with our id and fsid and retry the
 * original command.  Any other failure — or a second -ENOENT after
 * the create — is returned to the caller.
 *
 * @param cmd  JSON-formatted mon command
 * @return 0 on success, negative errno on failure
 */
int OSD::mon_cmd_maybe_osd_create(string &cmd)
{
  bool created = false;
  while (true) {
    dout(10) << __func__ << " cmd: " << cmd << dendl;
    vector<string> vcmd{cmd};
    bufferlist inbl;
    C_SaferCond w;
    string outs;
    monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
    int r = w.wait();
    if (r < 0) {
      if (r == -ENOENT && !created) {
        // osd does not exist in the osdmap yet: create it (once) and
        // then retry the original command
        string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
          + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
        vector<string> vnewcmd{newcmd};
        bufferlist inbl;
        C_SaferCond w;
        string outs;
        monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
        int r = w.wait();
        if (r < 0) {
          derr << __func__ << " fail: osd does not exist and created failed: "
               << cpp_strerror(r) << dendl;
          return r;
        }
        created = true;
        continue;
      }
      derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
      return r;
    }
    break;
  }

  return 0;
}
4274
/**
 * Place (and weight) this OSD's crush item according to the configured
 * crush location, via "osd crush create-or-move".
 *
 * No-op when osd_crush_update_on_start is disabled.  The weight comes
 * from osd_crush_initial_weight when set (>= 0), otherwise from the
 * store's total capacity in TiB, floored at 0.00001 so the item is
 * never completely weightless.
 *
 * @return 0 on success, negative errno on failure
 */
int OSD::update_crush_location()
{
  if (!cct->_conf->osd_crush_update_on_start) {
    dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
    return 0;
  }

  char weight[32];
  if (cct->_conf->osd_crush_initial_weight >= 0) {
    snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
  } else {
    struct store_statfs_t st;
    osd_alert_list_t alerts;
    int r = store->statfs(&st, &alerts);
    if (r < 0) {
      derr << "statfs: " << cpp_strerror(r) << dendl;
      return r;
    }
    // weight = total capacity in TiB, with a tiny floor
    snprintf(weight, sizeof(weight), "%.4lf",
             std::max(.00001,
                      double(st.total) /
                      double(1ull << 40 /* TB */)));
  }

  dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;

  string cmd =
    string("{\"prefix\": \"osd crush create-or-move\", ") +
    string("\"id\": ") + stringify(whoami) + ", " +
    string("\"weight\":") + weight + ", " +
    string("\"args\": [") + stringify(cct->crush_location) + "]}";
  return mon_cmd_maybe_osd_create(cmd);
}
4308
4309 int OSD::update_crush_device_class()
4310 {
4311 if (!cct->_conf->osd_class_update_on_start) {
4312 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4313 return 0;
4314 }
4315
4316 string device_class;
4317 int r = store->read_meta("crush_device_class", &device_class);
4318 if (r < 0 || device_class.empty()) {
4319 device_class = store->get_default_device_class();
4320 }
4321
4322 if (device_class.empty()) {
4323 dout(20) << __func__ << " no device class stored locally" << dendl;
4324 return 0;
4325 }
4326
4327 string cmd =
4328 string("{\"prefix\": \"osd crush set-device-class\", ") +
4329 string("\"class\": \"") + device_class + string("\", ") +
4330 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4331
4332 r = mon_cmd_maybe_osd_create(cmd);
4333 if (r == -EBUSY) {
4334 // good, already bound to a device-class
4335 return 0;
4336 } else {
4337 return r;
4338 }
4339 }
4340
/**
 * Serialize the in-memory superblock and stage the write into
 * transaction t (object OSD_SUPERBLOCK_GOBJECT in the meta collection,
 * at offset 0).  The caller queues the transaction.
 */
void OSD::write_superblock(ObjectStore::Transaction& t)
{
  dout(10) << "write_superblock " << superblock << dendl;

  // hack: at minimum it's using the baseline feature set
  if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
    superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);

  bufferlist bl;
  encode(superblock, bl);
  t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
}
4353
/**
 * Read and decode the superblock from the meta collection into the
 * in-memory `superblock` member.
 *
 * @return 0 on success, negative errno from the store read on failure
 */
int OSD::read_superblock()
{
  bufferlist bl;
  // len==0 reads the whole object
  int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
  if (r < 0)
    return r;

  auto p = bl.cbegin();
  decode(superblock, p);

  dout(10) << "read_superblock " << superblock << dendl;

  return 0;
}
4368
/**
 * Remove leftover temp objects from every PG collection.
 *
 * Temp objects (and legacy Hammer-era objects stored with pool == -1)
 * are transient; any that survived a restart are deleted here.
 * Removals are batched into transactions of at most
 * osd_target_transaction_size operations.
 */
void OSD::clear_temp_objects()
{
  dout(10) << __func__ << dendl;
  vector<coll_t> ls;
  store->list_collections(ls);
  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
    spg_t pgid;
    if (!p->is_pg(&pgid))
      continue;

    // list temp objects
    dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;

    vector<ghobject_t> temps;
    ghobject_t next;
    while (1) {
      vector<ghobject_t> objects;
      auto ch = store->open_collection(*p);
      ceph_assert(ch);
      store->collection_list(ch, next, ghobject_t::get_max(),
                             store->get_ideal_list_max(),
                             &objects, &next);
      if (objects.empty())
        break;
      vector<ghobject_t>::iterator q;
      for (q = objects.begin(); q != objects.end(); ++q) {
        // Hammer set pool for temps to -1, so check for clean-up
        // NOTE: the scan assumes temp objects sort first in collection
        // order; it stops at the first non-temp object.
        if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
          temps.push_back(*q);
        } else {
          break;
        }
      }
      // If we saw a non-temp object and hit the break above we can
      // break out of the while loop too.
      if (q != objects.end())
        break;
    }
    if (!temps.empty()) {
      ObjectStore::Transaction t;
      int removed = 0;
      for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
        dout(20) << " removing " << *p << " object " << *q << dendl;
        t.remove(*p, *q);
        // flush a batch once it exceeds the target transaction size
        if (++removed > cct->_conf->osd_target_transaction_size) {
          store->queue_transaction(service.meta_ch, std::move(t));
          t = ObjectStore::Transaction();
          removed = 0;
        }
      }
      if (removed) {
        store->queue_transaction(service.meta_ch, std::move(t));
      }
    }
  }
}
4425
/**
 * Delete collection `tmp` and everything in it.
 *
 * Each object is removed from the snap mapper and then from the
 * collection, in batches of osd_target_transaction_size; finally the
 * collection itself is removed and the function blocks until the last
 * transaction commits.  Used at startup to clean out removed/legacy
 * PG collections.
 */
void OSD::recursive_remove_collection(CephContext* cct,
                                      ObjectStore *store, spg_t pgid,
                                      coll_t tmp)
{
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    store->collection_list(ch, next, ghobject_t::get_max(),
                           max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      // drop the object's snap mapping first; -ENOENT just means it
      // had none
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
        ceph_abort();
      t.remove(tmp, p);
    }
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // wait for the final transaction to commit before returning
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4470
4471
4472 // ======================================================
4473 // PG's
4474
4475 PG* OSD::_make_pg(
4476 OSDMapRef createmap,
4477 spg_t pgid)
4478 {
4479 dout(10) << __func__ << " " << pgid << dendl;
4480 pg_pool_t pi;
4481 map<string,string> ec_profile;
4482 string name;
4483 if (createmap->have_pg_pool(pgid.pool())) {
4484 pi = *createmap->get_pg_pool(pgid.pool());
4485 name = createmap->get_pool_name(pgid.pool());
4486 if (pi.is_erasure()) {
4487 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4488 }
4489 } else {
4490 // pool was deleted; grab final pg_pool_t off disk.
4491 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4492 bufferlist bl;
4493 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4494 if (r < 0) {
4495 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4496 << dendl;
4497 return nullptr;
4498 }
4499 ceph_assert(r >= 0);
4500 auto p = bl.cbegin();
4501 decode(pi, p);
4502 decode(name, p);
4503 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4504 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4505 << " tombstone" << dendl;
4506 return nullptr;
4507 }
4508 decode(ec_profile, p);
4509 }
4510 PGPool pool(cct, createmap, pgid.pool(), pi, name);
4511 PG *pg;
4512 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4513 pi.type == pg_pool_t::TYPE_ERASURE)
4514 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4515 else
4516 ceph_abort();
4517 return pg;
4518 }
4519
4520 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4521 {
4522 v->clear();
4523 v->reserve(get_num_pgs());
4524 for (auto& s : shards) {
4525 std::lock_guard l(s->shard_lock);
4526 for (auto& j : s->pg_slots) {
4527 if (j.second->pg &&
4528 !j.second->pg->is_deleted()) {
4529 v->push_back(j.second->pg);
4530 if (clear_too) {
4531 s->_detach_pg(j.second.get());
4532 }
4533 }
4534 }
4535 }
4536 }
4537
4538 void OSD::_get_pgids(vector<spg_t> *v)
4539 {
4540 v->clear();
4541 v->reserve(get_num_pgs());
4542 for (auto& s : shards) {
4543 std::lock_guard l(s->shard_lock);
4544 for (auto& j : s->pg_slots) {
4545 if (j.second->pg &&
4546 !j.second->pg->is_deleted()) {
4547 v->push_back(j.first);
4548 }
4549 }
4550 }
4551 }
4552
4553 void OSD::register_pg(PGRef pg)
4554 {
4555 spg_t pgid = pg->get_pgid();
4556 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4557 auto sdata = shards[shard_index];
4558 std::lock_guard l(sdata->shard_lock);
4559 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4560 ceph_assert(r.second);
4561 auto *slot = r.first->second.get();
4562 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4563 sdata->_attach_pg(slot, pg.get());
4564 }
4565
/**
 * Complete a PG deletion by detaching the PG from its shard slot.
 *
 * Returns false (deletion must not finish yet) when the slot is gone
 * or empty, or when the slot is waiting for a merge epoch.  On success
 * also unprimes any split children of this pg and decrements the
 * appropriate pg-count perf counter.
 */
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
        !p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // drop any split-child priming that referenced this pg
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_nonprimary())
    service.logger->dec(l_osd_pg_replica); // misnomer: counts non-primaries
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
4600
4601 PGRef OSD::_lookup_pg(spg_t pgid)
4602 {
4603 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4604 auto sdata = shards[shard_index];
4605 std::lock_guard l(sdata->shard_lock);
4606 auto p = sdata->pg_slots.find(pgid);
4607 if (p == sdata->pg_slots.end()) {
4608 return nullptr;
4609 }
4610 return p->second->pg;
4611 }
4612
4613 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4614 {
4615 PGRef pg = _lookup_pg(pgid);
4616 if (!pg) {
4617 return nullptr;
4618 }
4619 pg->lock();
4620 if (!pg->is_deleted()) {
4621 return pg;
4622 }
4623 pg->unlock();
4624 return nullptr;
4625 }
4626
// Public wrapper around _lookup_lock_pg(): return the PG for pgid
// with its lock held, or null.
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
4631
/**
 * Load all PGs from the object store at startup.
 *
 * Reads the stored pg_num_history, then walks every collection:
 *  - temp collections and PG collections flagged for removal are
 *    deleted via recursive_remove_collection();
 *  - for each remaining PG collection, the PG's stored map epoch is
 *    peeked, a PG object is constructed against that map (or the
 *    current map when no epoch was recorded), its on-disk state is
 *    read, and it is registered with its shard.
 *
 * Must be called with osd_lock held.
 */
void OSD::load_pgs()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(0) << "load_pgs" << dendl;

  // recover the pg_num history (used for split/merge detection)
  {
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    if (it->is_temp(&pgid) ||
        (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it
               << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
           << dendl;
      continue;
    }

    PGRef pg;
    if (map_epoch > 0) {
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
        if (!get_osdmap()->have_pg_pool(pgid.pool())) {
          derr << __func__ << ": could not find map for epoch " << map_epoch
               << " on pg " << pgid << ", but the pool is not present in the "
               << "current map, so this is probably a result of bug 10617.  "
               << "Skipping the pg for now, you can use ceph-objectstore-tool "
               << "to clean it up later." << dendl;
          continue;
        } else {
          derr << __func__ << ": have pgid " << pgid << " at epoch "
               << map_epoch << ", but missing map.  Crashing."
               << dendl;
          ceph_abort_msg("Missing map in load_pgs");
        }
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(get_osdmap(), pgid);
    }
    if (!pg) {
      // no pool info available (see _make_pg); drop the collection
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store);

    if (pg->dne()) {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }
    // route the collection's commit completions to this pg's shard
    {
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    pg->reg_next_scrub();

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
4739
4740
/**
 * Instantiate a new PG in response to a create request, or return
 * nullptr when creation is dropped/deferred:
 *  - maybe_wait_for_max_pg() withheld it (over the pg-per-osd hard
 *    limit), or
 *  - a mon-initiated create references a pool that no longer exists,
 *    or (nautilus+) a pool whose CREATING flag is already cleared
 *    (stale create message).
 *
 * Otherwise the PG's collection is created, the PG is constructed and
 * initialized against the map at info->epoch, activated, and any
 * resulting messages dispatched.
 */
PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
                                 const PGCreateInfo *info)
{
  spg_t pgid = info->pgid;

  if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
    dout(10) << __func__ << " hit max pg, dropping" << dendl;
    return nullptr;
  }

  PeeringCtx rctx = create_context();

  OSDMapRef startmap = get_map(info->epoch);

  if (info->by_mon) {
    int64_t pool_id = pgid.pgid.pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    if (!pool) {
      dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
      return nullptr;
    }
    if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
        !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
      // this ensures we do not process old creating messages after the
      // pool's initial pgs have been created (and pg are subsequently
      // allowed to split or merge).
      dout(20) << __func__ << " dropping " << pgid
               << "create, pool does not have CREATING flag set" << dendl;
      return nullptr;
    }
  }

  int up_primary, acting_primary;
  vector<int> up, acting;
  startmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  // NOTE(review): assumes the pool exists in startmap (info->epoch);
  // only the by_mon path above re-checked existence, and against
  // osdmap rather than startmap — confirm pp cannot be null here.
  const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
  if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
      store->get_type() != "bluestore") {
    clog->warn() << "pg " << pgid
                 << " is at risk of silent data corruption: "
                 << "the pool allows ec overwrites but is not stored in "
                 << "bluestore, so deep scrubbing will not detect bitrot";
  }
  create_pg_collection(
    rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
  init_pg_ondisk(rctx.transaction, pgid, pp);

  int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

  PGRef pg = _make_pg(startmap, pgid);
  pg->ch = store->create_new_collection(pg->coll);

  // route the collection's commit completions to this pg's shard
  {
    uint32_t shard_index = pgid.hash_to_shard(shards.size());
    assert(NULL != shards[shard_index]);
    store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
  }

  pg->lock(true);

  // we are holding the shard lock
  ceph_assert(!pg->is_deleted());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    info->history,
    info->past_intervals,
    false,
    rctx.transaction);

  pg->init_collection_pool_opts();

  // new primaries pick up any active dynamic perf-stat queries
  if (pg->is_primary()) {
    std::lock_guard locker{m_perf_queries_lock};
    pg->set_dynamic_perf_stats_queries(m_perf_queries);
  }

  pg->handle_initialize(rctx);
  pg->handle_activate_map(rctx);

  dispatch_context(rctx, pg.get(), osdmap, nullptr);

  dout(10) << __func__ << " new pg " << *pg << dendl;
  return pg;
}
4832
/**
 * Check whether instantiating pgid would exceed the hard pg-per-osd
 * limit (mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio).
 *
 * If so, record the creation as pending — mon-initiated creates as a
 * counter, osd-initiated ones keyed by pgid (with whether we would be
 * primary) — so resume_creating_pg() can retry later, and return true
 * (withhold creation).  Returns false when under the limit.
 */
bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
                                spg_t pgid,
                                bool is_mon_create)
{
  const auto max_pgs_per_osd =
    (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
     cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));

  if (num_pgs < max_pgs_per_osd) {
    return false;
  }

  std::lock_guard l(pending_creates_lock);
  if (is_mon_create) {
    pending_creates_from_mon++;
  } else {
    bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
    pending_creates_from_osd.emplace(pgid, is_primary);
  }
  dout(1) << __func__ << " withhold creation of pg " << pgid
          << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
  return true;
}
4856
// Re-triggering peering requires perturbing the pg's mapping a bit:
// PG::should_restart_peering() only fires on a mapping change, and
// OSDMap::pg_to_up_acting_osds() falls back to the up set when pg_temp
// is empty, so an empty pg_temp would be a no-op.  Shrink a multi-OSD
// acting set down to its primary; pad a 0/1-element set with -1.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  std::vector<int32_t> tweaked;
  if (acting.size() > 1) {
    tweaked.push_back(acting[0]);
  } else {
    tweaked.assign(acting.begin(), acting.end());
    tweaked.push_back(-1);
  }
  return tweaked;
}
4869
/**
 * Retry pg creations previously withheld by maybe_wait_for_max_pg().
 *
 * Computes how many pg slots are free under the hard limit, consumes
 * pending mon-initiated creates first (re-soliciting pg creates from
 * the mon), then kicks pending osd-initiated creates by requesting a
 * perturbed pg_temp (see twiddle()) to force re-peering.  Keeps a
 * continuous osdmap subscription while creates remain pending,
 * otherwise downgrades to a one-shot subscription.
 */
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    // mon-initiated creates get first claim on the spare slots
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
               << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      if (pending_creates_from_mon >= spare_pgs) {
        spare_pgs = pending_creates_from_mon = 0;
      } else {
        spare_pgs -= pending_creates_from_mon;
        pending_creates_from_mon = 0;
      }
    }
    // kick one pending osd-initiated create per remaining spare slot
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
                            !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
              << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = get_osdmap_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
              << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
              << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  service.send_pg_temp();
}
4941
/**
 * Build the initial pg_history_t and PastIntervals for a newly
 * created pg by replaying osdmaps from its creation epoch to the
 * current epoch.
 *
 * For each epoch the up/acting sets are recomputed and
 * PastIntervals::check_new_interval() decides whether an interval
 * boundary occurred; same_interval_since / same_up_since /
 * same_primary_since and last_epoch_split are updated accordingly.
 *
 * @param pgid           pg being created
 * @param created        epoch the pg was created in
 * @param created_stamp  creation timestamp
 * @param h   [out] resulting history
 * @param pi  [out] resulting past intervals
 */
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  *h = pg_history_t(created, created_stamp);

  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    // recoverability predicate: "recoverable" when at least min_size
    // shards are present
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
        return have.size() >= pi->min_size;
      }
      explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      pgid.pgid,
      min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      h->same_interval_since = e;
      if (up != new_up) {
        h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
        h->same_primary_since = e;
      }
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
                             osdmap->get_pg_num(pgid.pgid.pool()),
                             nullptr)) {
        h->last_epoch_split = e;
      }
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
           << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
                       pi->get_bounds()) << ")"
           << dendl;
}
5016
/**
 * Ensure osd p is in our heartbeat peer set (no-op for ourselves).
 *
 * For a new peer: obtain back+front heartbeat connections, attach a
 * Session (sharing the peer's hb stamps) to each connection, and
 * record both in heartbeat_peers.  New or existing, the peer's epoch
 * is bumped to the current osdmap epoch so the pruning pass in
 * maybe_update_heartbeat_peers() keeps it.
 */
void OSD::_add_heartbeat_peer(int p)
{
  if (p == whoami)
    return;
  HeartbeatInfo *hi;

  map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
  if (i == heartbeat_peers.end()) {
    // connections could not be established; skip for now
    pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
    if (!cons.first)
      return;
    assert(cons.second);

    hi = &heartbeat_peers[p];
    hi->peer = p;

    auto stamps = service.get_hb_stamps(p);

    // back connection session
    auto sb = ceph::make_ref<Session>(cct, cons.first.get());
    sb->peer = p;
    sb->stamps = stamps;
    hi->hb_interval_start = ceph_clock_now();
    hi->con_back = cons.first.get();
    hi->con_back->set_priv(sb);

    // front connection session
    auto sf = ceph::make_ref<Session>(cct, cons.second.get());
    sf->peer = p;
    sf->stamps = stamps;
    hi->con_front = cons.second.get();
    hi->con_front->set_priv(sf);

    dout(10) << "_add_heartbeat_peer: new peer osd." << p
             << " " << hi->con_back->get_peer_addr()
             << " " << hi->con_front->get_peer_addr()
             << dendl;
  } else {
    hi = &i->second;
  }
  hi->epoch = get_osdmap_epoch();
}
5057
5058 void OSD::_remove_heartbeat_peer(int n)
5059 {
5060 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5061 ceph_assert(q != heartbeat_peers.end());
5062 dout(20) << " removing heartbeat peer osd." << n
5063 << " " << q->second.con_back->get_peer_addr()
5064 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5065 << dendl;
5066 q->second.clear_mark_down();
5067 heartbeat_peers.erase(q);
5068 }
5069
// Flag that the heartbeat peer set is stale; the actual refresh is
// performed later by maybe_update_heartbeat_peers().  No-op while
// stopping.
void OSD::need_heartbeat_peer_update()
{
  if (is_stopping())
    return;
  dout(20) << "need_heartbeat_peer_update" << dendl;
  heartbeat_set_peers_need_update();
}
5077
// Recompute the heartbeat peer set if it has been flagged as needing an
// update (or periodically force a refresh).
//
// Called with osd_lock held (asserted).  The peer set is built from:
//   1. every up OSD any of our PGs lists as a heartbeat peer,
//   2. the next/previous up OSDs in the map (fully-connected ring),
//   3. random up OSDs spread across failure-domain subtrees, and
//   4. extra random peers if we are still under osd_heartbeat_min_peers.
// Down peers are removed, surplus "extra" peers are trimmed back down to
// the minimum, and failure reports for OSDs no longer in the set are
// retracted via send_still_alive().
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      // first call since startup/reset: force an immediate resample
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
        dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
        heartbeat_set_peers_need_update();
        last_heartbeat_resample = now;
        // automatically clean up any stale heartbeat peers
        // if we are unhealthy, then clean all
        reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
	if (get_osdmap()->is_up(peer)) {
	  _add_heartbeat_peer(peer);
	}
      });
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = get_osdmap()->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = get_osdmap()->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
  get_osdmap()->get_random_up_osds_by_subtree(
    whoami, subtree, limit, want, &want);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!get_osdmap()->is_up(p->first)) {
      // advance the iterator before erasing to keep it valid
      int o = p->first;
      ++p;
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < get_osdmap_epoch()) {
      // not re-added this round; candidate for trimming below
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = get_osdmap()->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?  trim extras (but never 'want' peers) back to the minimum
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;

  // clean up stale failure pending
  for (auto it = failure_pending.begin(); it != failure_pending.end();) {
    if (heartbeat_peers.count(it->first) == 0) {
      // we no longer monitor this osd; retract any in-flight failure report
      send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
      failure_pending.erase(it++);
    } else {
      it++;
    }
  }
}
5195
5196 void OSD::reset_heartbeat_peers(bool all)
5197 {
5198 ceph_assert(ceph_mutex_is_locked(osd_lock));
5199 dout(10) << "reset_heartbeat_peers" << dendl;
5200 utime_t stale = ceph_clock_now();
5201 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5202 std::lock_guard l(heartbeat_lock);
5203 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5204 auto& [peer, hi] = *it;
5205 if (all || hi.is_stale(stale)) {
5206 hi.clear_mark_down();
5207 // stop sending failure_report to mon too
5208 failure_queue.erase(peer);
5209 failure_pending.erase(peer);
5210 it = heartbeat_peers.erase(it);
5211 } else {
5212 ++it;
5213 }
5214 }
5215 }
5216
// Handle an incoming heartbeat message (PING, PING_REPLY, or YOU_DIED).
//
// PING: record the sender's clock-delta stamps, and reply with PING_REPLY
//   (or YOU_DIED if our map says the sender is already down).  Debug knobs
//   can probabilistically drop pings, and we refuse to reply while our own
//   internal heartbeat map is unhealthy so peers will notice we are stuck.
// PING_REPLY: match the reply against ping_history, update last_rx_back/
//   last_rx_front, accumulate min/avg/max ping-time stats per interval,
//   and retract any queued/in-flight failure report once the peer looks
//   healthy again.
// YOU_DIED: a peer says our map marked us down; subscribe to a newer map.
//
// Consumes the message (m->put() on every path).  Runs under
// heartbeat_lock, taken and released manually around the whole body.
void OSD::handle_osd_ping(MOSDPing *m)
{
  if (superblock.cluster_fsid != m->fsid) {
    // ping from a different cluster; ignore
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
	     << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
	     << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.lock();
  if (is_stopping()) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  ConnectionRef con(m->get_connection());
  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  auto sref = con->get_priv();
  Session *s = static_cast<Session*>(sref.get());
  if (!s) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }
  if (!s->stamps) {
    // first message on this session: attach the shared per-peer stamps
    s->peer = from;
    s->stamps = service.get_hb_stamps(from);
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug hook: randomly drop pings for a configured duration
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
	auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
	if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
	  if (heartbeat_drop->second == 0) {
	    debug_heartbeat_drops_remaining.erase(heartbeat_drop);
	  } else {
	    --heartbeat_drop->second;
	    dout(5) << "Dropping heartbeat from " << from
		    << ", " << heartbeat_drop->second
		    << " remaining to drop" << dendl;
	    break;
	  }
	} else if (cct->_conf->osd_debug_drop_ping_probability >
		   ((((double)(rand()%100))/100.0))) {
	  heartbeat_drop =
	    debug_heartbeat_drops_remaining.insert(std::make_pair(from,
			     cct->_conf->osd_debug_drop_ping_duration)).first;
	  dout(5) << "Dropping heartbeat from " << from
		  << ", " << heartbeat_drop->second
		  << " remaining to drop" << dendl;
	  break;
	}
      }

      ceph::signedspan sender_delta_ub{};
      s->stamps->got_ping(
	m->up_from,
	mnow,
	m->mono_send_stamp,
	m->delta_ub,
	&sender_delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;

      if (!cct->get_heartbeat_map()->is_healthy()) {
	// deliberately stay silent so peers report us as unresponsive
	dout(10) << "internal heartbeat not healthy, dropping ping request"
		 << dendl;
	break;
      }

      Message *r = new MOSDPing(monc->get_fsid(),
				curmap->get_epoch(),
				MOSDPing::PING_REPLY,
				m->ping_stamp,
				m->mono_ping_stamp,
				mnow,
				service.get_up_epoch(),
				cct->_conf->osd_heartbeat_min_size,
				sender_delta_ub);
      con->send_message(r);

      if (curmap->is_up(from)) {
	if (is_active()) {
	  // opportunistically share a newer osdmap over the cluster conn
	  ConnectionRef cluster_con = service.get_con_osd_cluster(
	    from, curmap->get_epoch());
	  if (cluster_con) {
	    service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
	  }
	}
      } else if (!curmap->exists(from) ||
		 curmap->get_down_at(from) > m->map_epoch) {
	// tell them they have died
	Message *r = new MOSDPing(monc->get_fsid(),
				  curmap->get_epoch(),
				  MOSDPing::YOU_DIED,
				  m->ping_stamp,
				  m->mono_ping_stamp,
				  mnow,
				  service.get_up_epoch(),
				  cct->_conf->osd_heartbeat_min_size);
	con->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
	auto acked = i->second.ping_history.find(m->ping_stamp);
	if (acked != i->second.ping_history.end()) {
	  // each pending ping expects one reply per connection (back/front)
	  int &unacknowledged = acked->second.second;
	  if (con == i->second.con_back) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " -> " << now
		     << " last_rx_front " << i->second.last_rx_front
		     << dendl;
	    i->second.last_rx_back = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	    // if there is no front con, set both stamps.
	    if (i->second.con_front == NULL) {
	      i->second.last_rx_front = now;
	      ceph_assert(unacknowledged > 0);
	      --unacknowledged;
	    }
	  } else if (con == i->second.con_front) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " last_rx_front " << i->second.last_rx_front
		     << " -> " << now
		     << dendl;
	    i->second.last_rx_front = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	  }

	  if (unacknowledged == 0) {
	    // succeeded in getting all replies
	    dout(25) << "handle_osd_ping got all replies from osd." << from
		     << " , erase pending ping(sent at " << m->ping_stamp << ")"
		     << " and older pending ping(s)"
		     << dendl;

// convert a utime_t difference (seconds) to rounded microseconds
#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
	    ++i->second.hb_average_count;
	    uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
	    i->second.hb_total_back += back_pingtime;
	    if (back_pingtime < i->second.hb_min_back)
	      i->second.hb_min_back = back_pingtime;
	    if (back_pingtime > i->second.hb_max_back)
	      i->second.hb_max_back = back_pingtime;
	    uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
	    i->second.hb_total_front += front_pingtime;
	    if (front_pingtime < i->second.hb_min_front)
	      i->second.hb_min_front = front_pingtime;
	    if (front_pingtime > i->second.hb_max_front)
	      i->second.hb_max_front = front_pingtime;

	    ceph_assert(i->second.hb_interval_start != utime_t());
	    // NOTE(review): the assert above already guarantees
	    // hb_interval_start is set, so this fallback looks unreachable
	    // (except in NDEBUG builds where ceph_assert may compile out) —
	    // confirm intent before removing.
	    if (i->second.hb_interval_start == utime_t())
	      i->second.hb_interval_start = now;
	    int64_t hb_avg_time_period = 60;
	    if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
	      hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
	    }
	    if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
	      // interval elapsed: fold the accumulated stats into the
	      // per-interval ring buffers and reset the accumulators
	      uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
	      uint32_t back_min = i->second.hb_min_back;
	      uint32_t back_max = i->second.hb_max_back;
	      uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
	      uint32_t front_min = i->second.hb_min_front;
	      uint32_t front_max = i->second.hb_max_front;

	      // Reset for new interval
	      i->second.hb_average_count = 0;
	      i->second.hb_interval_start = now;
	      i->second.hb_total_back = i->second.hb_max_back = 0;
	      i->second.hb_min_back = UINT_MAX;
	      i->second.hb_total_front = i->second.hb_max_front = 0;
	      i->second.hb_min_front = UINT_MAX;

	      // Record per osd interace ping times
	      // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
	      if (i->second.hb_back_pingtime.size() == 0) {
		// first interval: seed the whole ring with this sample
		ceph_assert(i->second.hb_front_pingtime.size() == 0);
		for (unsigned k = 0 ; k < hb_vector_size; ++k) {
		  i->second.hb_back_pingtime.push_back(back_avg);
		  i->second.hb_back_min.push_back(back_min);
		  i->second.hb_back_max.push_back(back_max);
		  i->second.hb_front_pingtime.push_back(front_avg);
		  i->second.hb_front_min.push_back(front_min);
		  i->second.hb_front_max.push_back(front_max);
		  ++i->second.hb_index;
		}
	      } else {
		// hb_vector_size is assumed to be a power of two here
		int index = i->second.hb_index & (hb_vector_size - 1);
		i->second.hb_back_pingtime[index] = back_avg;
		i->second.hb_back_min[index] = back_min;
		i->second.hb_back_max[index] = back_max;
		i->second.hb_front_pingtime[index] = front_avg;
		i->second.hb_front_min[index] = front_min;
		i->second.hb_front_max[index] = front_max;
		++i->second.hb_index;
	      }

	      {
		// publish 1/5/15-interval aggregates into osd_stat
		std::lock_guard l(service.stat_lock);
		service.osd_stat.hb_pingtime[from].last_update = now.sec();
		service.osd_stat.hb_pingtime[from].back_last = back_pingtime;

		uint32_t total = 0;
		uint32_t min = UINT_MAX;
		uint32_t max = 0;
		uint32_t count = 0;
		uint32_t which = 0;
		uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
		for (int32_t k = size - 1 ; k >= 0; --k) {
		  ++count;
		  int index = (i->second.hb_index + k) % size;
		  total += i->second.hb_back_pingtime[index];
		  if (i->second.hb_back_min[index] < min)
		    min = i->second.hb_back_min[index];
		  if (i->second.hb_back_max[index] > max)
		    max = i->second.hb_back_max[index];
		  if (count == 1 || count == 5 || count == 15) {
		    service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
		    service.osd_stat.hb_pingtime[from].back_min[which] = min;
		    service.osd_stat.hb_pingtime[from].back_max[which] = max;
		    which++;
		    if (count == 15)
		      break;
		  }
		}

		if (i->second.con_front != NULL) {
		  service.osd_stat.hb_pingtime[from].front_last = front_pingtime;

		  total = 0;
		  min = UINT_MAX;
		  max = 0;
		  count = 0;
		  which = 0;
		  for (int32_t k = size - 1 ; k >= 0; --k) {
		    ++count;
		    int index = (i->second.hb_index + k) % size;
		    total += i->second.hb_front_pingtime[index];
		    if (i->second.hb_front_min[index] < min)
		      min = i->second.hb_front_min[index];
		    if (i->second.hb_front_max[index] > max)
		      max = i->second.hb_front_max[index];
		    if (count == 1 || count == 5 || count == 15) {
		      service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
		      service.osd_stat.hb_pingtime[from].front_min[which] = min;
		      service.osd_stat.hb_pingtime[from].front_max[which] = max;
		      which++;
		      if (count == 15)
			break;
		    }
		  }
		}
	      }
	    } else {
	      // interval not elapsed yet; just record the latest samples
	      std::lock_guard l(service.stat_lock);
	      service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
	      if (i->second.con_front != NULL)
		service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
	    }
	    // drop this ping and all older pending pings from the history
	    i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
	  }

	  if (i->second.is_healthy(now)) {
	    // Cancel false reports
	    auto failure_queue_entry = failure_queue.find(from);
	    if (failure_queue_entry != failure_queue.end()) {
	      dout(10) << "handle_osd_ping canceling queued "
		       << "failure report for osd." << from << dendl;
	      failure_queue.erase(failure_queue_entry);
	    }

	    auto failure_pending_entry = failure_pending.find(from);
	    if (failure_pending_entry != failure_pending.end()) {
	      dout(10) << "handle_osd_ping canceling in-flight "
		       << "failure report for osd." << from << dendl;
	      send_still_alive(curmap->get_epoch(),
			       from,
			       failure_pending_entry->second.second);
	      failure_pending.erase(failure_pending_entry);
	    }
	  }
	} else {
	  // old replies, deprecated by newly sent pings.
	  dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
		   << ") is found, treat as covered by newly sent pings "
		   << "and ignore"
		   << dendl;
	}
      }

      if (m->map_epoch &&
	  curmap->is_up(from)) {
	if (is_active()) {
	  ConnectionRef cluster_con = service.get_con_osd_cluster(
	    from, curmap->get_epoch());
	  if (cluster_con) {
	    service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
	  }
	}
      }

      s->stamps->got_ping_reply(
	mnow,
	m->mono_send_stamp,
	m->delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
    }
    break;

  case MOSDPing::YOU_DIED:
    // a peer's map says we were marked down; fetch a newer map to react
    dout(10) << "handle_osd_ping " << m->get_source_inst()
	     << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.unlock();
  m->put();
}
5564
5565 void OSD::heartbeat_entry()
5566 {
5567 std::unique_lock l(heartbeat_lock);
5568 if (is_stopping())
5569 return;
5570 while (!heartbeat_stop) {
5571 heartbeat();
5572
5573 double wait;
5574 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5575 wait = (float)cct->_conf->osd_heartbeat_interval;
5576 } else {
5577 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5578 }
5579 auto w = ceph::make_timespan(wait);
5580 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5581 heartbeat_cond.wait_for(l, w);
5582 if (is_stopping())
5583 return;
5584 dout(30) << "heartbeat_entry woke up" << dendl;
5585 }
5586 }
5587
5588 void OSD::heartbeat_check()
5589 {
5590 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5591 utime_t now = ceph_clock_now();
5592
5593 // check for incoming heartbeats (move me elsewhere?)
5594 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5595 p != heartbeat_peers.end();
5596 ++p) {
5597
5598 if (p->second.first_tx == utime_t()) {
5599 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5600 << " yet, skipping" << dendl;
5601 continue;
5602 }
5603
5604 dout(25) << "heartbeat_check osd." << p->first
5605 << " first_tx " << p->second.first_tx
5606 << " last_tx " << p->second.last_tx
5607 << " last_rx_back " << p->second.last_rx_back
5608 << " last_rx_front " << p->second.last_rx_front
5609 << dendl;
5610 if (p->second.is_unhealthy(now)) {
5611 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5612 if (p->second.last_rx_back == utime_t() ||
5613 p->second.last_rx_front == utime_t()) {
5614 derr << "heartbeat_check: no reply from "
5615 << p->second.con_front->get_peer_addr().get_sockaddr()
5616 << " osd." << p->first
5617 << " ever on either front or back, first ping sent "
5618 << p->second.first_tx
5619 << " (oldest deadline " << oldest_deadline << ")"
5620 << dendl;
5621 // fail
5622 failure_queue[p->first] = p->second.first_tx;
5623 } else {
5624 derr << "heartbeat_check: no reply from "
5625 << p->second.con_front->get_peer_addr().get_sockaddr()
5626 << " osd." << p->first << " since back " << p->second.last_rx_back
5627 << " front " << p->second.last_rx_front
5628 << " (oldest deadline " << oldest_deadline << ")"
5629 << dendl;
5630 // fail
5631 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5632 }
5633 }
5634 }
5635 }
5636
// Send one round of pings to every heartbeat peer and refresh local stats.
//
// Must be called with heartbeat_lock held by this thread (asserted).
// Also samples the 1-minute CPU load average into the perf counters and a
// daily moving average, refreshes osd_stat, updates the full/nearfull
// status, and — if we have no peers at all — periodically asks the monitor
// for a newer map so we can find some.
void OSD::heartbeat()
{
  ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
  dout(30) << "heartbeat" << dendl;

  // get CPU load avg
  double loadavgs[1];
  int hb_interval = cct->_conf->osd_heartbeat_interval;
  // number of heartbeat samples per day, for the daily moving average
  int n_samples = 86400;
  if (hb_interval > 1) {
    n_samples /= hb_interval;
    if (n_samples < 1)
      n_samples = 1;
  }

  if (getloadavg(loadavgs, 1) == 1) {
    logger->set(l_osd_loadavg, 100 * loadavgs[0]);
    // exponential-style daily average over n_samples heartbeats
    daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
    dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
  }

  dout(30) << "heartbeat checking stats" << dendl;

  // refresh peer list and osd stats
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);

  auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
  dout(5) << __func__ << " " << new_stat << dendl;
  ceph_assert(new_stat.statfs.total);

  float pratio;
  float ratio = service.compute_adjusted_ratio(new_stat, &pratio);

  service.check_full_status(ratio, pratio);

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  // a reply must arrive within osd_heartbeat_grace or the peer is suspect
  utime_t deadline = now;
  deadline += cct->_conf->osd_heartbeat_grace;

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
    if (!s) {
      dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
      continue;
    }
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;

    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    // expect one reply per connection (back + front)
    i->second.ping_history[now] = make_pair(deadline,
      HeartbeatInfo::HEARTBEAT_MAX_CONN);
    if (i->second.hb_interval_start == utime_t())
      i->second.hb_interval_start = now;

    std::optional<ceph::signedspan> delta_ub;
    s->stamps->sent_ping(&delta_ub);

    i->second.con_back->send_message(
      new MOSDPing(monc->get_fsid(),
		   service.get_osdmap_epoch(),
		   MOSDPing::PING,
		   now,
		   mnow,
		   mnow,
		   service.get_up_epoch(),
		   cct->_conf->osd_heartbeat_min_size,
		   delta_ub));

    if (i->second.con_front)
      i->second.con_front->send_message(
	new MOSDPing(monc->get_fsid(),
		     service.get_osdmap_epoch(),
		     MOSDPing::PING,
		     now,
		     mnow,
		     mnow,
		     service.get_up_epoch(),
		     cct->_conf->osd_heartbeat_min_size,
		     delta_ub));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(get_osdmap_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
5742
// Messenger callback: a heartbeat connection was reset.
//
// If the failed connection is the current back or front connection of a
// known peer, try to reopen both connections for that peer (re-attaching
// the same Session as priv and clearing its ping history); if reopening
// fails we raced with an osdmap update and the peer is dropped entirely.
// Always returns true (the reset is considered handled).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  auto s = con->get_priv();
  dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
  // detach the session from the dead connection
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      return true;
    }
    auto session = static_cast<Session*>(s.get());
    auto p = heartbeat_peers.find(session->peer);
    if (p != heartbeat_peers.end() &&
	(p->second.con_back == con ||
	 p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
	       << ", reopening" << dendl;
      // mark down the surviving sibling connection too, then reopen both
      p->second.clear_mark_down(con);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
	p->second.con_back = newcon.first.get();
	p->second.con_back->set_priv(s);
	if (newcon.second) {
	  p->second.con_front = newcon.second.get();
	  p->second.con_front->set_priv(s);
	}
	// pending pings belonged to the old connections; start fresh
	p->second.ping_history.clear();
      } else {
	dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
		 << ", raced with osdmap update, closing out peer" << dendl;
	heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
5781
5782
5783
5784 // =========================================
5785
5786 void OSD::tick()
5787 {
5788 ceph_assert(ceph_mutex_is_locked(osd_lock));
5789 dout(10) << "tick" << dendl;
5790
5791 utime_t now = ceph_clock_now();
5792 // throw out any obsolete markdown log
5793 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5794 while (!osd_markdown_log.empty() &&
5795 osd_markdown_log.front() + grace < now)
5796 osd_markdown_log.pop_front();
5797
5798 if (is_active() || is_waiting_for_healthy()) {
5799 maybe_update_heartbeat_peers();
5800 }
5801
5802 if (is_waiting_for_healthy()) {
5803 start_boot();
5804 }
5805
5806 if (is_waiting_for_healthy() || is_booting()) {
5807 std::lock_guard l(heartbeat_lock);
5808 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5809 last_mon_heartbeat = now;
5810 dout(1) << __func__ << " checking mon for new map" << dendl;
5811 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5812 }
5813 }
5814
5815 do_waiters();
5816
5817 // scrub purged_snaps every deep scrub interval
5818 {
5819 const utime_t last = superblock.last_purged_snaps_scrub;
5820 utime_t next = last;
5821 next += cct->_conf->osd_scrub_min_interval;
5822 std::mt19937 rng;
5823 // use a seed that is stable for each scrub interval, but varies
5824 // by OSD to avoid any herds.
5825 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
5826 double r = (rng() % 1024) / 1024;
5827 next +=
5828 cct->_conf->osd_scrub_min_interval *
5829 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5830 if (next < ceph_clock_now()) {
5831 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5832 << " next " << next << " ... now" << dendl;
5833 scrub_purged_snaps();
5834 } else {
5835 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5836 << " next " << next << dendl;
5837 }
5838 }
5839
5840 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5841 }
5842
// Periodic housekeeping that deliberately avoids osd_lock: CRC perf
// counters, statfs refresh, heartbeat checking, monitor reports, scrub
// scheduling, beacon sending, and recovery-queue kicking.  Runs under
// tick_timer_lock (asserted) and re-arms itself at the end.
void OSD::tick_without_osd_lock()
{
  ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    {
      std::lock_guard l{heartbeat_lock};
      heartbeat_check();
    }
    map_lock.lock_shared();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
	now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.unlock_shared();

    // if any shard is waiting on an epoch newer than what we have,
    // ask for a fresh map
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
				   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
	       << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
	cct->_conf->osd_beacon_report_interval) {
	need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      // send outside the lock scope above
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
					      new C_Tick_WithoutOSDLock(this));
}
5918
5919 // Usage:
5920 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5921 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5922 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5923 // getomap <pool> [namespace/]<obj-name>
5924 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5925 // injectmdataerr [namespace/]<obj-name> [shardid]
5926 // injectdataerr [namespace/]<obj-name> [shardid]
5927 //
5928 // set_recovery_delay [utime]
// Admin-socket test-ops dispatcher (debug/testing only).
//
// Parses the command name and its cmdmap arguments, resolves pool/object
// into store coordinates, and performs the requested direct object-store
// mutation or error injection.  Results and errors are written to ss.
// See the usage comment above this function for the supported commands.
void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
				 std::string_view command,
				 const cmdmap_t& cmdmap, ostream &ss)
{
  //Test support
  //Support changing the omap on a single osd by using the Admin Socket to
  //directly request the osd make a change.
  if (command == "setomapval" || command == "rmomapkey" ||
      command == "setomapheader" || command == "getomap" ||
      command == "truncobj" || command == "injectmdataerr" ||
      command == "injectdataerr"
    ) {
    pg_t rawpg;
    int64_t pool;
    OSDMapRef curmap = service->get_osdmap();
    int r = -1;

    string poolstr;

    cmd_getval(cmdmap, "pool", poolstr);
    pool = curmap->lookup_pg_pool_name(poolstr);
    //If we can't find it by name then maybe id specified
    if (pool < 0 && isdigit(poolstr[0]))
      pool = atoll(poolstr.c_str());
    if (pool < 0) {
      // NOTE(review): trailing "''" in this message looks like a typo for
      // a single closing quote — confirm before changing output format.
      ss << "Invalid pool '" << poolstr << "''";
      return;
    }

    // optional leading "namespace/" prefix on the object name
    string objname, nspace;
    cmd_getval(cmdmap, "objname", objname);
    std::size_t found = objname.find_first_of('/');
    if (found != string::npos) {
      nspace = objname.substr(0, found);
      objname = objname.substr(found+1);
    }
    object_locator_t oloc(pool, nspace);
    r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);

    if (r < 0) {
      ss << "Invalid namespace/objname";
      return;
    }

    int64_t shardid;
    cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
    hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
    ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
    spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
    if (curmap->pg_is_ec(rawpg)) {
      // only the error-injection commands are meaningful on EC pools
      if ((command != "injectdataerr") && (command != "injectmdataerr")) {
	ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
	return;
      }
    }

    ObjectStore::Transaction t;

    if (command == "setomapval") {
      map<string, bufferlist> newattrs;
      bufferlist val;
      string key, valstr;
      cmd_getval(cmdmap, "key", key);
      cmd_getval(cmdmap, "val", valstr);

      val.append(valstr);
      newattrs[key] = val;
      t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "rmomapkey") {
      string key;
      cmd_getval(cmdmap, "key", key);

      t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "setomapheader") {
      bufferlist newheader;
      string headerstr;

      cmd_getval(cmdmap, "header", headerstr);
      newheader.append(headerstr);
      t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "getomap") {
      //Debug: Output entire omap
      bufferlist hdrbl;
      map<string, bufferlist> keyvals;
      auto ch = store->open_collection(coll_t(pgid));
      if (!ch) {
	ss << "unable to open collection for " << pgid;
	r = -ENOENT;
      } else {
	r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
	if (r >= 0) {
	  ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
	  for (map<string, bufferlist>::iterator it = keyvals.begin();
	       it != keyvals.end(); ++it)
	    ss << " key=" << (*it).first << " val="
	       << string((*it).second.c_str(), (*it).second.length());
	} else {
	  ss << "error=" << r;
	}
      }
    } else if (command == "truncobj") {
      int64_t trunclen;
      cmd_getval(cmdmap, "len", trunclen);
      t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
      r = store->queue_transaction(service->meta_ch, std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "injectdataerr") {
      store->inject_data_error(gobj);
      ss << "ok";
    } else if (command == "injectmdataerr") {
      store->inject_mdata_error(gobj);
      ss << "ok";
    }
    return;
  }
  if (command == "set_recovery_delay") {
    int64_t delay;
    cmd_getval(cmdmap, "utime", delay, (int64_t)0);
    ostringstream oss;
    oss << delay;
    // apply the delay via the live config so it takes effect immediately
    int r = service->cct->_conf.set_val("osd_recovery_delay_start",
					oss.str().c_str());
    if (r != 0) {
      ss << "set_recovery_delay: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      return;
    }
    service->cct->_conf.apply_changes(nullptr);
    ss << "set_recovery_delay: set osd_recovery_delay_start "
       << "to " << service->cct->_conf->osd_recovery_delay_start;
    return;
  }
  if (command == "injectfull") {
    int64_t count;
    string type;
    OSDService::s_names state;
    cmd_getval(cmdmap, "type", type, string("full"));
    cmd_getval(cmdmap, "count", count, (int64_t)-1);
    if (type == "none" || count == 0) {
      // either form means "disable injection"
      type = "none";
      count = 0;
    }
    state = service->get_full_state(type);
    if (state == OSDService::s_names::INVALID) {
      ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
      return;
    }
    service->set_injectfull(state, count);
    return;
  }
  ss << "Internal error - command=" << command;
}
6100
6101 // =========================================
6102
// Messenger callback: an outgoing connection has been (re)established.
// For monitor connections this means a brand-new mon session, so all
// session-scoped state must be resent: boot message, fullness, pg_temp,
// ready-to-merge, pg-created acks, and in-flight failure reports.
// Connections to non-mon peers need no action here.
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      // restart the boot handshake from the beginning
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // lock order: map_lock (shared) before mon_report_lock
      map_lock.lock_shared();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.unlock_shared();
      if (is_active()) {
        send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6146
6147 void OSD::ms_handle_fast_connect(Connection *con)
6148 {
6149 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6150 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6151 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6152 s = ceph::make_ref<Session>(cct, con);
6153 con->set_priv(s);
6154 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6155 << " addr=" << s->con->get_peer_addr() << dendl;
6156 // we don't connect to clients
6157 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6158 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6159 }
6160 }
6161 }
6162
6163 void OSD::ms_handle_fast_accept(Connection *con)
6164 {
6165 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6166 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6167 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6168 s = ceph::make_ref<Session>(cct, con);
6169 con->set_priv(s);
6170 dout(10) << "new session (incoming)" << s << " con=" << con
6171 << " addr=" << con->get_peer_addr()
6172 << " must have raced with connect" << dendl;
6173 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6174 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6175 }
6176 }
6177 }
6178
// Messenger callback: the remote end reset an established connection.
// Detaches the Session from the Connection and cleans up per-session
// state.  Returns true iff a session was attached (i.e. we handled it).
bool OSD::ms_handle_reset(Connection *con)
{
  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset(); // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(session);
  return true;
}
6194
// Messenger callback: our connection attempt was actively refused.
// If osd_fast_fail_on_connection_refused is enabled and the refusing
// peer is an OSD the current map still considers up, report it failed
// to the mon immediately (FLAG_IMMEDIATE) rather than waiting for the
// normal heartbeat-grace machinery to notice.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      // match the peer address against any of the OSD's channels
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
        // I'm cheating mon heartbeat grace logic, because we know it's not going
        // to respawn alone. +1 so we won't hit any boundary case.
        monc->send_mon_message(
          new MOSDFailure(
            monc->get_fsid(),
            id,
            osdmap->get_addrs(id),
            cct->_conf->osd_heartbeat_grace + 1,
            osdmap->get_epoch(),
            MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
            ));
      }
    }
  }
  return true;
}
6227
// Completion for monc->get_version("osdmap", ...) issued by start_boot():
// the MonClient fills in oldest/newest before invoking finish().  On
// success the epoch range is handed to the OSD to continue booting.
struct C_OSD_GetVersion : public Context {
  OSD *osd;
  uint64_t oldest, newest;  // populated by MonClient before finish()
  explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
  void finish(int r) override {
    if (r >= 0)
      osd->_got_mon_epochs(oldest, newest);
  }
};
6237
// Begin the boot sequence.  If internal/peer heartbeats look unhealthy
// we defer (and keep pinging) rather than marking ourselves up;
// otherwise we enter PREBOOT and ask the mon which osdmap epochs it
// has, continuing asynchronously via C_OSD_GetVersion.
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
           << ".." << superblock.newest_map << dendl;
  // reply lands in _got_mon_epochs() via the completion below
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
6256
6257 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6258 {
6259 std::lock_guard l(osd_lock);
6260 if (is_preboot()) {
6261 _preboot(oldest, newest);
6262 }
6263 }
6264
// Continue the boot sequence once the mon has reported its osdmap epoch
// range [oldest, newest].  Depending on cluster state we either wait
// for a condition to clear (no map yet, NOUP, missing flags, fullness),
// catch up on purged snaps, or — if our map is close enough to current —
// queue _send_boot() on the boot finisher.  Otherwise we subscribe for
// more maps and try again when they arrive.  Called with osd_lock held.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
           << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  const auto& monmap = monc->monmap;
  const auto osdmap = get_osdmap();
  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
         << dendl;
  } else if (osdmap->require_osd_release < ceph_release_t::luminous) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
         << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
             superblock.purged_snaps_last < superblock.current_epoch) {
    // catch up on purged snaps first (octopus+ mons track them)
    dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
             << " < newest_map " << superblock.current_epoch << dendl;
    _get_purged_snaps();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
             osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new LambdaContext(
        [this](int r) {
          std::unique_lock l(osd_lock);
          if (is_preboot()) {
            dout(10) << __func__ << " waiting for peering work to drain"
                     << dendl;
            // drop osd_lock while waiting so peering work can progress
            l.unlock();
            for (auto shard : shards) {
              shard->wait_min_pg_epoch(get_osdmap_epoch());
            }
            l.lock();
          }
          // re-check: state may have changed while the lock was dropped
          if (is_preboot()) {
            _send_boot();
          }
        }));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6337
// Ask the mon for purged-snap ranges covering the epochs between the
// last range we persisted (superblock.purged_snaps_last) and our
// current epoch.  The reply is handled by handle_get_purged_snaps_reply().
void OSD::_get_purged_snaps()
{
  // NOTE: this is a naive, stateless implementation. it may send multiple
  // overlapping requests to the mon, which will be somewhat inefficient, but
  // it should be reliable.
  dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
           << ", newest_map " << superblock.current_epoch << dendl;
  MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
    superblock.purged_snaps_last + 1,
    superblock.current_epoch + 1);
  monc->send_mon_message(m);
}
6350
// Handle the mon's reply to _get_purged_snaps(): persist the received
// purged-snap ranges via the SnapMapper, advance purged_snaps_last in
// the superblock, then either request the next chunk (still behind) or
// resume the boot sequence.
void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
{
  dout(10) << __func__ << " " << *m << dendl;
  ObjectStore::Transaction t;
  // drop stale replies, or anything arriving after we left PREBOOT
  if (!is_preboot() ||
      m->last < superblock.purged_snaps_last) {
    goto out;
  }
  SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
                                  make_purged_snaps_oid(), &t,
                                  m->purged_snaps);
  superblock.purged_snaps_last = m->last;
  write_superblock(t);
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
  if (m->last < superblock.current_epoch) {
    // still behind; fetch the next range
    _get_purged_snaps();
  } else {
    start_boot();
  }
 out:
  m->put();
}
6376
6377 void OSD::send_full_update()
6378 {
6379 if (!service.need_fullness_update())
6380 return;
6381 unsigned state = 0;
6382 if (service.is_full()) {
6383 state = CEPH_OSD_FULL;
6384 } else if (service.is_backfillfull()) {
6385 state = CEPH_OSD_BACKFILLFULL;
6386 } else if (service.is_nearfull()) {
6387 state = CEPH_OSD_NEARFULL;
6388 }
6389 set<string> s;
6390 OSDMap::calc_state_set(state, s);
6391 dout(10) << __func__ << " want state " << s << dendl;
6392 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6393 }
6394
6395 void OSD::start_waiting_for_healthy()
6396 {
6397 dout(1) << "start_waiting_for_healthy" << dendl;
6398 set_state(STATE_WAITING_FOR_HEALTHY);
6399 last_heartbeat_resample = utime_t();
6400
6401 // subscribe to osdmap updates, in case our peers really are known to be dead
6402 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6403 }
6404
6405 bool OSD::_is_healthy()
6406 {
6407 if (!cct->get_heartbeat_map()->is_healthy()) {
6408 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6409 return false;
6410 }
6411
6412 if (is_waiting_for_healthy()) {
6413 utime_t now = ceph_clock_now();
6414 if (osd_markdown_log.empty()) {
6415 dout(5) << __func__ << " force returning true since last markdown"
6416 << " was " << cct->_conf->osd_max_markdown_period
6417 << "s ago" << dendl;
6418 return true;
6419 }
6420 std::lock_guard l(heartbeat_lock);
6421 int num = 0, up = 0;
6422 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6423 p != heartbeat_peers.end();
6424 ++p) {
6425 if (p->second.is_healthy(now))
6426 ++up;
6427 ++num;
6428 }
6429 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6430 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6431 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6432 return false;
6433 }
6434 }
6435
6436 return true;
6437 }
6438
// Build and send the MOSDBoot message that asks the mon to mark us up.
// Before sending we finalize our four address vectors: any messenger
// whose addresses are still unknown inherits them from a messenger
// whose addresses are bound (cluster from client, hb_back from cluster,
// hb_front from client), and loopback connections get sessions attached.
// Finishes by moving us into the BOOTING state.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
             << client_addrs << dendl;
    // re-read: set_addr_unknowns may have filled in addresses
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    // give the loopback connection a session like any other peer
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
             << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
             << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6501
// Populate *pm with the OSD metadata reported to the mon in MOSDBoot:
// config paths, bound addresses, objectstore properties, system info,
// network interfaces and their NUMA placement, CPU NUMA affinity, and
// per-device metadata from the objectstore.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  string osdspec_affinity;
  int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
  if (r < 0 || osdspec_affinity.empty()) {
    osdspec_affinity = "";
  }
  (*pm)["osdspec_affinity"] = osdspec_affinity;
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  // resolve the interface names behind the front/back bound addresses
  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
        // interface could not be determined above
        unknown.insert(nm);
        continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
        unknown.insert((*pm)[nm]);
        continue;
      }
      nodes.insert(n);
      if (node < 0) {
        node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    // only report a single node when both ifaces resolved to the same one
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
                                                  &numa_cpu_set);
  }

  // per-device metadata (model, serial, etc.) from the objectstore
  set<string> devnames;
  store->get_devices(&devnames);
  map<string,string> errs;
  get_device_metadata(devnames, pm, &errs);
  for (auto& i : errs) {
    dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
  }
  dout(10) << __func__ << " " << *pm << dendl;
}
6584
// Record that we want the mon to bump our up_thru to at least `want`;
// if this raises the previously queued target, send an MOSDAlive now.
// Lock order: map_lock (shared) is taken before mon_report_lock, which
// send_alive() asserts is held.
void OSD::queue_want_up_thru(epoch_t want)
{
  std::shared_lock map_locker{map_lock};
  epoch_t cur = get_osdmap()->get_up_thru(whoami);
  std::lock_guard report_locker(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
             << ", currently " << cur
             << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    // already asked for something at least as new; nothing to do
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
             << ", currently " << cur
             << dendl;
  }
}
6602
6603 void OSD::send_alive()
6604 {
6605 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6606 const auto osdmap = get_osdmap();
6607 if (!osdmap->exists(whoami))
6608 return;
6609 epoch_t up_thru = osdmap->get_up_thru(whoami);
6610 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6611 if (up_thru_wanted > up_thru) {
6612 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6613 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6614 }
6615 }
6616
// Ask the mon for full osdmaps covering [first, last], merging with any
// outstanding request.  requested_full_{first,last} track the range we
// are still waiting on; got_full_map() trims it as maps arrive.
// Called with osd_lock held.
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  dout(10) << __func__ << " " << first << ".." << last
           << ", previously requested "
           << requested_full_first << ".." << requested_full_last << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  ceph_assert(first > 0 && last > 0);
  ceph_assert(first <= last);
  ceph_assert(first >= requested_full_first);  // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request: extend the outstanding range, only asking for
    // the epochs we have not already requested
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
6642
// Note that the full map for epoch e has arrived, and trim the
// outstanding request window [requested_full_first, requested_full_last]
// accordingly (resetting it to 0..0 when fully satisfied).
// Called with osd_lock held.
void OSD::got_full_map(epoch_t e)
{
  ceph_assert(requested_full_first <= requested_full_last);
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  if (requested_full_first == 0) {
    // no outstanding request
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // older than anything we are waiting for
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
             << ".." << requested_full_last
             << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    // request fully satisfied
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
             << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // partially satisfied; still waiting on (e, requested_full_last]
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
           << ".." << requested_full_last
           << ", still need more" << dendl;
}
6670
6671 void OSD::requeue_failures()
6672 {
6673 std::lock_guard l(heartbeat_lock);
6674 unsigned old_queue = failure_queue.size();
6675 unsigned old_pending = failure_pending.size();
6676 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6677 failure_queue[p->first] = p->second.first;
6678 failure_pending.erase(p++);
6679 }
6680 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6681 << failure_queue.size() << dendl;
6682 }
6683
// Drain failure_queue: for each failed peer not already reported, send
// an MOSDFailure to the mon and record it in failure_pending so it can
// later be re-sent (requeue_failures) or cancelled (send_still_alive).
// Caller must hold map_lock and mon_report_lock; takes heartbeat_lock.
void OSD::send_failures()
{
  ceph_assert(ceph_mutex_is_locked(map_lock));
  ceph_assert(ceph_mutex_is_locked(mon_report_lock));
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  const auto osdmap = get_osdmap();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    if (!failure_pending.count(osd)) {
      // elapsed seconds since we first noticed the failure (the queued
      // value is the failure timestamp); converted via double by design
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
        new MOSDFailure(
          monc->get_fsid(),
          osd,
          osdmap->get_addrs(osd),
          failed_for,
          osdmap->get_epoch()));
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
                                       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
6708
6709 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6710 {
6711 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6712 MOSDFailure::FLAG_ALIVE);
6713 monc->send_mon_message(m);
6714 }
6715
6716 void OSD::cancel_pending_failures()
6717 {
6718 std::lock_guard l(heartbeat_lock);
6719 auto it = failure_pending.begin();
6720 while (it != failure_pending.end()) {
6721 dout(10) << __func__ << " canceling in-flight failure report for osd."
6722 << it->first << dendl;
6723 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6724 failure_pending.erase(it++);
6725 }
6726 }
6727
// Send an MOSDBeacon (liveness + min_last_epoch_clean) to the mon.
// Skipped until we have a monmap whose required features include
// LUMINOUS, since older mons do not understand beacons.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // snapshot min_last_epoch_clean state under its lock
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(get_osdmap_epoch(),
                              min_last_epoch_clean,
                              superblock.last_purged_snaps_scrub);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
6751
6752 void OSD::handle_command(MCommand *m)
6753 {
6754 ConnectionRef con = m->get_connection();
6755 auto session = ceph::ref_cast<Session>(con->get_priv());
6756 if (!session) {
6757 con->send_message(new MCommandReply(m, -EACCES));
6758 m->put();
6759 return;
6760 }
6761 if (!session->caps.allow_all()) {
6762 con->send_message(new MCommandReply(m, -EACCES));
6763 m->put();
6764 return;
6765 }
6766 cct->get_admin_socket()->queue_tell_command(m);
6767 m->put();
6768 }
6769
6770 namespace {
6771 class unlock_guard {
6772 ceph::mutex& m;
6773 public:
6774 explicit unlock_guard(ceph::mutex& mutex)
6775 : m(mutex)
6776 {
6777 m.unlock();
6778 }
6779 unlock_guard(unlock_guard&) = delete;
6780 ~unlock_guard() {
6781 m.lock();
6782 }
6783 };
6784 }
6785
// Run the SnapMapper purged-snaps scrubber: find stray snap mappings,
// queue a re-trim on each affected PG (deduplicated per pg/snap pair),
// then persist the scrub timestamp in the superblock.  osd_lock is
// dropped for the duration of the scrub and the PG queueing, and
// re-taken before touching the superblock.
void OSD::scrub_purged_snaps()
{
  dout(10) << __func__ << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  SnapMapper::Scrubber s(cct, store, service.meta_ch,
                         make_snapmapper_oid(),
                         make_purged_snaps_oid());
  clog->debug() << "purged_snaps scrub starts";
  // drop osd_lock while the (potentially long) scrub runs
  osd_lock.unlock();
  s.run();
  if (s.stray.size()) {
    clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
  } else {
    clog->debug() << "purged_snaps scrub ok";
  }
  set<pair<spg_t,snapid_t>> queued;
  for (auto& [pool, snap, hash, shard] : s.stray) {
    const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
    if (!pi) {
      dout(20) << __func__ << " pool " << pool << " dne" << dendl;
      continue;
    }
    // map the stray's hash back to the owning pg/shard
    pg_t pgid(pi->raw_hash_to_pg(hash), pool);
    spg_t spgid(pgid, shard);
    pair<spg_t,snapid_t> p(spgid, snap);
    if (queued.count(p)) {
      dout(20) << __func__ << " pg " << spgid << " snap " << snap
               << " already queued" << dendl;
      continue;
    }
    PGRef pg = lookup_lock_pg(spgid);
    if (!pg) {
      dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
      continue;
    }
    queued.insert(p);
    dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
             << snap << dendl;
    pg->queue_snap_retrim(snap);
    pg->unlock();
  }
  // re-take osd_lock before updating the superblock
  osd_lock.lock();
  if (is_stopping()) {
    return;
  }
  dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
  ObjectStore::Transaction t;
  superblock.last_purged_snaps_scrub = ceph_clock_now();
  write_superblock(t);
  int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
  ceph_assert(tr == 0);
  if (is_active()) {
    send_beacon(ceph::coarse_mono_clock::now());
  }
  dout(10) << __func__ << " done" << dendl;
}
6842
6843 void OSD::probe_smart(const string& only_devid, ostream& ss)
6844 {
6845 set<string> devnames;
6846 store->get_devices(&devnames);
6847 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6848 "osd_smart_report_timeout");
6849
6850 // == typedef std::map<std::string, mValue> mObject;
6851 json_spirit::mObject json_map;
6852
6853 for (auto dev : devnames) {
6854 // smartctl works only on physical devices; filter out any logical device
6855 if (dev.find("dm-") == 0) {
6856 continue;
6857 }
6858
6859 string err;
6860 string devid = get_device_id(dev, &err);
6861 if (devid.size() == 0) {
6862 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6863 << err << "), skipping" << dendl;
6864 continue;
6865 }
6866 if (only_devid.size() && devid != only_devid) {
6867 continue;
6868 }
6869
6870 json_spirit::mValue smart_json;
6871 if (block_device_get_metrics(dev, smart_timeout,
6872 &smart_json)) {
6873 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6874 continue;
6875 }
6876 json_map[devid] = smart_json;
6877 }
6878 json_spirit::write(json_map, ss, json_spirit::pretty_print);
6879 }
6880
6881 bool OSD::heartbeat_dispatch(Message *m)
6882 {
6883 dout(30) << "heartbeat_dispatch " << m << dendl;
6884 switch (m->get_type()) {
6885
6886 case CEPH_MSG_PING:
6887 dout(10) << "ping from " << m->get_source_inst() << dendl;
6888 m->put();
6889 break;
6890
6891 case MSG_OSD_PING:
6892 handle_osd_ping(static_cast<MOSDPing*>(m));
6893 break;
6894
6895 default:
6896 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6897 m->put();
6898 }
6899
6900 return true;
6901 }
6902
6903 bool OSD::ms_dispatch(Message *m)
6904 {
6905 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6906 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6907 service.got_stop_ack();
6908 m->put();
6909 return true;
6910 }
6911
6912 // lock!
6913
6914 osd_lock.lock();
6915 if (is_stopping()) {
6916 osd_lock.unlock();
6917 m->put();
6918 return true;
6919 }
6920
6921 do_waiters();
6922 _dispatch(m);
6923
6924 osd_lock.unlock();
6925
6926 return true;
6927 }
6928
// Share incremental osdmap updates with a peer if our map is newer than
// what we believe the peer has.  peer_epoch_lb is a lower bound on the
// peer's epoch supplied by the caller (e.g. an op's sent_epoch); the
// per-session last_sent_epoch is raised monotonically under
// sent_epoch_lock, which is dropped across the actual send.
void OSDService::maybe_share_map(
  Connection *con,
  const OSDMapRef& osdmap,
  epoch_t peer_epoch_lb)
{
  // NOTE: we assume caller hold something that keeps the Connection itself
  // pinned (e.g., an OpRequest's MessageRef).
  auto session = ceph::ref_cast<Session>(con->get_priv());
  if (!session) {
    return;
  }

  // assume the peer has the newer of the op's sent_epoch and what
  // we think we sent them.
  session->sent_epoch_lock.lock();
  if (peer_epoch_lb > session->last_sent_epoch) {
    dout(10) << __func__ << " con " << con
             << " " << con->get_peer_addr()
             << " map epoch " << session->last_sent_epoch
             << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
    session->last_sent_epoch = peer_epoch_lb;
  }
  epoch_t last_sent_epoch = session->last_sent_epoch;
  session->sent_epoch_lock.unlock();

  if (osdmap->get_epoch() <= last_sent_epoch) {
    // peer already has everything we do
    return;
  }

  // send outside the lock; another thread may race with us here
  send_incremental_map(last_sent_epoch, con, osdmap);
  last_sent_epoch = osdmap->get_epoch();

  // only advance last_sent_epoch, never move it backwards
  session->sent_epoch_lock.lock();
  if (session->last_sent_epoch < last_sent_epoch) {
    dout(10) << __func__ << " con " << con
             << " " << con->get_peer_addr()
             << " map epoch " << session->last_sent_epoch
             << " -> " << last_sent_epoch << " (shared)" << dendl;
    session->last_sent_epoch = last_sent_epoch;
  }
  session->sent_epoch_lock.unlock();
}
6971
// Drain the session's waiting_on_map queue: enqueue every op whose
// min_epoch is satisfied by `osdmap`, stopping at the first op that
// still needs a newer map (the queue is processed in arrival order).
// Caller holds session->session_dispatch_lock.
void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
{
  ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    auto m = op->get_req<MOSDFastDispatchOp>();
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // this (and everything behind it) must wait for a newer map
      break;
    }
    session->waiting_on_map.erase(i++);
    // drop the reference the intrusive list held; `op` still holds one
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries a raw pg; resolve it to the primary shard
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
        static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
        continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7006
// Fast dispatch entry point: route incoming messages without taking
// osd_lock.  Peering/control messages are handled (or enqueued as
// peering events) directly; client ops become OpRequests and are either
// queued straight to their spg_t or, for legacy clients without
// RESEND_ON_SPLIT, ordered through the session's waiting_on_map queue.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_PG_NOTIFY2:
  case MSG_OSD_PG_QUERY2:
  case MSG_OSD_PG_INFO2:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
  case MSG_OSD_PG_LEASE:
  case MSG_OSD_PG_LEASE_ACK:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      if (require_osd_peer(pm)) {
        enqueue_peering_evt(
          pm->get_spg(),
          PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // everything else becomes a tracked OpRequest
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // the intrusive waiting_on_map list takes its own reference
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7103
7104 int OSD::ms_handle_authentication(Connection *con)
7105 {
7106 int ret = 0;
7107 auto s = ceph::ref_cast<Session>(con->get_priv());
7108 if (!s) {
7109 s = ceph::make_ref<Session>(cct, con);
7110 con->set_priv(s);
7111 s->entity_name = con->get_peer_entity_name();
7112 dout(10) << __func__ << " new session " << s << " con " << s->con
7113 << " entity " << s->entity_name
7114 << " addr " << con->get_peer_addrs() << dendl;
7115 } else {
7116 dout(10) << __func__ << " existing session " << s << " con " << s->con
7117 << " entity " << s->entity_name
7118 << " addr " << con->get_peer_addrs() << dendl;
7119 }
7120
7121 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7122 if (caps_info.allow_all) {
7123 s->caps.set_allow_all();
7124 } else if (caps_info.caps.length() > 0) {
7125 bufferlist::const_iterator p = caps_info.caps.cbegin();
7126 string str;
7127 try {
7128 decode(str, p);
7129 }
7130 catch (buffer::error& e) {
7131 dout(10) << __func__ << " session " << s << " " << s->entity_name
7132 << " failed to decode caps string" << dendl;
7133 ret = -EACCES;
7134 }
7135 if (!ret) {
7136 bool success = s->caps.parse(str);
7137 if (success) {
7138 dout(10) << __func__ << " session " << s
7139 << " " << s->entity_name
7140 << " has caps " << s->caps << " '" << str << "'" << dendl;
7141 ret = 1;
7142 } else {
7143 dout(10) << __func__ << " session " << s << " " << s->entity_name
7144 << " failed to parse caps '" << str << "'" << dendl;
7145 ret = -EACCES;
7146 }
7147 }
7148 }
7149 return ret;
7150 }
7151
7152 void OSD::do_waiters()
7153 {
7154 ceph_assert(ceph_mutex_is_locked(osd_lock));
7155
7156 dout(10) << "do_waiters -- start" << dendl;
7157 while (!finished.empty()) {
7158 OpRequestRef next = finished.front();
7159 finished.pop_front();
7160 dispatch_op(next);
7161 }
7162 dout(10) << "do_waiters -- finish" << dendl;
7163 }
7164
7165 void OSD::dispatch_op(OpRequestRef op)
7166 {
7167 switch (op->get_req()->get_type()) {
7168
7169 case MSG_OSD_PG_CREATE:
7170 handle_pg_create(op);
7171 break;
7172 }
7173 }
7174
void OSD::_dispatch(Message *m)
{
  // Legacy (slow-path) message dispatch; runs with osd_lock held.  Only
  // message types not covered by fast dispatch arrive here.
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;
  case MSG_MON_GET_PURGED_SNAPS_REPLY:
    handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
    break;

  // osd
  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    // handle_command consumes the message itself; return, don't break
    handle_command(static_cast<MCommand*>(m));
    return;

  // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      // wrap in an OpRequest so it can be tracked and possibly delayed
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
	op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map? starting up?
      if (!get_osdmap()) {
	dout(7) << "no OSDMap, not booted" << dendl;
	logger->inc(l_osd_waiting_for_map);
	waiting_for_osdmap.push_back(op);
	op->mark_delayed("no osdmap");
	break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7221
7222 // remove me post-nautilus
7223 void OSD::handle_scrub(MOSDScrub *m)
7224 {
7225 dout(10) << "handle_scrub " << *m << dendl;
7226 if (!require_mon_or_mgr_peer(m)) {
7227 m->put();
7228 return;
7229 }
7230 if (m->fsid != monc->get_fsid()) {
7231 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7232 << dendl;
7233 m->put();
7234 return;
7235 }
7236
7237 vector<spg_t> spgs;
7238 _get_pgids(&spgs);
7239
7240 if (!m->scrub_pgs.empty()) {
7241 vector<spg_t> v;
7242 for (auto pgid : m->scrub_pgs) {
7243 spg_t pcand;
7244 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
7245 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7246 v.push_back(pcand);
7247 }
7248 }
7249 spgs.swap(v);
7250 }
7251
7252 for (auto pgid : spgs) {
7253 enqueue_peering_evt(
7254 pgid,
7255 PGPeeringEventRef(
7256 std::make_shared<PGPeeringEvent>(
7257 get_osdmap_epoch(),
7258 get_osdmap_epoch(),
7259 PeeringState::RequestScrub(m->deep, m->repair))));
7260 }
7261
7262 m->put();
7263 }
7264
7265 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7266 {
7267 dout(10) << __func__ << " " << *m << dendl;
7268 if (!require_mon_or_mgr_peer(m)) {
7269 m->put();
7270 return;
7271 }
7272 if (m->fsid != monc->get_fsid()) {
7273 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7274 << dendl;
7275 m->put();
7276 return;
7277 }
7278 for (auto pgid : m->scrub_pgs) {
7279 enqueue_peering_evt(
7280 pgid,
7281 PGPeeringEventRef(
7282 std::make_shared<PGPeeringEvent>(
7283 m->epoch,
7284 m->epoch,
7285 PeeringState::RequestScrub(m->deep, m->repair))));
7286 }
7287 m->put();
7288 }
7289
7290 bool OSD::scrub_random_backoff()
7291 {
7292 bool coin_flip = (rand() / (double)RAND_MAX >=
7293 cct->_conf->osd_scrub_backoff_ratio);
7294 if (!coin_flip) {
7295 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7296 return true;
7297 }
7298 return false;
7299 }
7300
7301 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7302 const spg_t& pg, const utime_t& timestamp,
7303 double pool_scrub_min_interval,
7304 double pool_scrub_max_interval, bool must)
7305 : cct(cct),
7306 pgid(pg),
7307 sched_time(timestamp),
7308 deadline(timestamp)
7309 {
7310 // if not explicitly requested, postpone the scrub with a random delay
7311 if (!must) {
7312 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7313 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7314 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7315 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7316
7317 sched_time += scrub_min_interval;
7318 double r = rand() / (double)RAND_MAX;
7319 sched_time +=
7320 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7321 if (scrub_max_interval == 0) {
7322 deadline = utime_t();
7323 } else {
7324 deadline += scrub_max_interval;
7325 }
7326
7327 }
7328 }
7329
7330 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7331 if (sched_time < rhs.sched_time)
7332 return true;
7333 if (sched_time > rhs.sched_time)
7334 return false;
7335 return pgid < rhs.pgid;
7336 }
7337
7338 double OSD::scrub_sleep_time(bool must_scrub)
7339 {
7340 if (must_scrub) {
7341 return cct->_conf->osd_scrub_sleep;
7342 }
7343 utime_t now = ceph_clock_now();
7344 if (scrub_time_permit(now)) {
7345 return cct->_conf->osd_scrub_sleep;
7346 }
7347 double normal_sleep = cct->_conf->osd_scrub_sleep;
7348 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7349 return std::max(extended_sleep, normal_sleep);
7350 }
7351
7352 bool OSD::scrub_time_permit(utime_t now)
7353 {
7354 struct tm bdt;
7355 time_t tt = now.sec();
7356 localtime_r(&tt, &bdt);
7357
7358 bool day_permit = false;
7359 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7360 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7361 day_permit = true;
7362 }
7363 } else {
7364 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7365 day_permit = true;
7366 }
7367 }
7368
7369 if (!day_permit) {
7370 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7371 << " - " << cct->_conf->osd_scrub_end_week_day
7372 << " now " << bdt.tm_wday << " = no" << dendl;
7373 return false;
7374 }
7375
7376 bool time_permit = false;
7377 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7378 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7379 time_permit = true;
7380 }
7381 } else {
7382 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7383 time_permit = true;
7384 }
7385 }
7386 if (!time_permit) {
7387 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7388 << " - " << cct->_conf->osd_scrub_end_hour
7389 << " now " << bdt.tm_hour << " = no" << dendl;
7390 } else {
7391 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7392 << " - " << cct->_conf->osd_scrub_end_hour
7393 << " now " << bdt.tm_hour << " = yes" << dendl;
7394 }
7395 return time_permit;
7396 }
7397
7398 bool OSD::scrub_load_below_threshold()
7399 {
7400 double loadavgs[3];
7401 if (getloadavg(loadavgs, 3) != 3) {
7402 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7403 return false;
7404 }
7405
7406 // allow scrub if below configured threshold
7407 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7408 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7409 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7410 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7411 << " < max " << cct->_conf->osd_scrub_load_threshold
7412 << " = yes" << dendl;
7413 return true;
7414 }
7415
7416 // allow scrub if below daily avg and currently decreasing
7417 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7418 dout(20) << __func__ << " loadavg " << loadavgs[0]
7419 << " < daily_loadavg " << daily_loadavg
7420 << " and < 15m avg " << loadavgs[2]
7421 << " = yes" << dendl;
7422 return true;
7423 }
7424
7425 dout(20) << __func__ << " loadavg " << loadavgs[0]
7426 << " >= max " << cct->_conf->osd_scrub_load_threshold
7427 << " and ( >= daily_loadavg " << daily_loadavg
7428 << " or >= 15m avg " << loadavgs[2]
7429 << ") = no" << dendl;
7430 return false;
7431 }
7432
void OSD::sched_scrub()
{
  // Walk the scrub-job queue (ordered by sched_time) and try to start at
  // most one new scrub, subject to the global scrub slot count, active
  // recovery, the time-of-day window and system load.
  //
  // if not permitted, fail fast
  if (!service.can_inc_scrubs()) {
    return;
  }
  bool allow_requested_repair_only = false;
  if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
    if (!cct->_conf->osd_repair_during_recovery) {
      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
      return;
    }
    dout(10) << __func__
	     << " will only schedule explicitly requested repair due to active recovery"
	     << dendl;
    allow_requested_repair_only = true;
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      if (scrub.sched_time > now) {
	// save ourselves some effort
	// (queue is time-ordered; everything after this is even later)
	dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
		 << " > " << now << dendl;
	break;
      }

      // not yet past its deadline: honor the time window and load limit
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
	dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
		 << (!time_permit ? "time not permit" : "high load") << dendl;
	continue;
      }

      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
	continue;
      // This has already started, so go on to the next scrub job
      if (pg->scrubber.active) {
	pg->unlock();
	dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
	continue;
      }
      // Skip other kinds of scrubing if only explicitly requested repairing is allowed
      if (allow_requested_repair_only && !pg->scrubber.must_repair) {
	pg->unlock();
	dout(10) << __func__ << " skip " << scrub.pgid
		 << " because repairing is not explicitly requested on it"
		 << dendl;
	continue;
      }
      // If it is reserving, let it resolve before going to the next scrub job
      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
	pg->unlock();
	dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
	break;
      }
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
	       << (pg->get_must_scrub() ? ", explicitly requested" :
		   (load_is_low ? ", load_is_low" : " deadline < now"))
	       << dendl;
      if (pg->sched_scrub()) {
	// started one scrub; only one new scrub per pass
	pg->unlock();
	break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
7510
7511 void OSD::resched_all_scrubs()
7512 {
7513 dout(10) << __func__ << ": start" << dendl;
7514 OSDService::ScrubJob scrub;
7515 if (service.first_scrub_stamp(&scrub)) {
7516 do {
7517 dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
7518
7519 PGRef pg = _lookup_lock_pg(scrub.pgid);
7520 if (!pg)
7521 continue;
7522 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
7523 dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
7524 pg->on_info_history_change();
7525 }
7526 pg->unlock();
7527 } while (service.next_scrub_stamp(scrub, &scrub));
7528 }
7529 dout(10) << __func__ << ": done" << dendl;
7530 }
7531
7532 MPGStats* OSD::collect_pg_stats()
7533 {
7534 // This implementation unconditionally sends every is_primary PG's
7535 // stats every time we're called. This has equivalent cost to the
7536 // previous implementation's worst case where all PGs are busy and
7537 // their stats are always enqueued for sending.
7538 std::shared_lock l{map_lock};
7539
7540 osd_stat_t cur_stat = service.get_osd_stat();
7541 cur_stat.os_perf_stat = store->get_cur_stats();
7542
7543 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
7544 m->osd_stat = cur_stat;
7545
7546 std::lock_guard lec{min_last_epoch_clean_lock};
7547 min_last_epoch_clean = get_osdmap_epoch();
7548 min_last_epoch_clean_pgs.clear();
7549
7550 std::set<int64_t> pool_set;
7551 vector<PGRef> pgs;
7552 _get_pgs(&pgs);
7553 for (auto& pg : pgs) {
7554 auto pool = pg->pg_id.pgid.pool();
7555 pool_set.emplace((int64_t)pool);
7556 if (!pg->is_primary()) {
7557 continue;
7558 }
7559 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7560 m->pg_stat[pg->pg_id.pgid] = s;
7561 min_last_epoch_clean = min(min_last_epoch_clean, lec);
7562 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7563 });
7564 }
7565 store_statfs_t st;
7566 bool per_pool_stats = false;
7567 bool per_pool_omap_stats = false;
7568 for (auto p : pool_set) {
7569 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
7570 if (r == -ENOTSUP) {
7571 break;
7572 } else {
7573 assert(r >= 0);
7574 m->pool_stat[p] = st;
7575 per_pool_stats = true;
7576 }
7577 }
7578
7579 // indicate whether we are reporting per-pool stats
7580 m->osd_stat.num_osds = 1;
7581 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7582 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
7583
7584 return m;
7585 }
7586
vector<DaemonHealthMetric> OSD::get_health_metrics()
{
  // Build the health metrics reported to the mgr: the count of slow ops
  // (in flight longer than osd_op_complaint_time) and the number of
  // pending pg creations.
  vector<DaemonHealthMetric> metrics;
  {
    utime_t oldest_secs;
    const utime_t now = ceph_clock_now();
    auto too_old = now;
    too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
    int slow = 0;
    TrackedOpRef oldest_op;
    // visitor: returns true (and logs) for each op older than the
    // complaint threshold, remembering the oldest such op
    auto count_slow_ops = [&](TrackedOp& op) {
      if (op.get_initiated() < too_old) {
	stringstream ss;
	ss << "slow request " << op.get_desc()
	   << " initiated "
	   << op.get_initiated()
	   << " currently "
	   << op.state_string();
	lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
	clog->warn() << ss.str();
	slow++;
	if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
	  oldest_op = &op;
	}
	return true;
      } else {
	return false;
      }
    };
    if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
      if (slow) {
	derr << __func__ << " reporting " << slow << " slow ops, oldest is "
	     << oldest_op->get_desc() << dendl;
      }
      metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
    } else {
      // no news is not good news.
      metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
    }
  }
  {
    std::lock_guard l(pending_creates_lock);
    auto n_primaries = pending_creates_from_mon;
    // count osd-sourced creates whose flag is set
    // (presumably marking the ones we'd be primary for -- TODO confirm
    // against the pending_creates_from_osd writers)
    for (const auto& create : pending_creates_from_osd) {
      if (create.second) {
	n_primaries++;
      }
    }
    metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
  }
  return metrics;
}
7639
7640 // =====================================================
7641 // MAP
7642
7643 void OSD::wait_for_new_map(OpRequestRef op)
7644 {
7645 // ask?
7646 if (waiting_for_osdmap.empty()) {
7647 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7648 }
7649
7650 logger->inc(l_osd_waiting_for_map);
7651 waiting_for_osdmap.push_back(op);
7652 op->mark_delayed("wait for new map");
7653 }
7654
7655
7656 /** update_map
7657 * assimilate new OSDMap(s). scan pgs, etc.
7658 */
7659
7660 void OSD::note_down_osd(int peer)
7661 {
7662 ceph_assert(ceph_mutex_is_locked(osd_lock));
7663 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7664
7665 std::lock_guard l{heartbeat_lock};
7666 failure_queue.erase(peer);
7667 failure_pending.erase(peer);
7668 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7669 if (p != heartbeat_peers.end()) {
7670 p->second.clear_mark_down();
7671 heartbeat_peers.erase(p);
7672 }
7673 }
7674
void OSD::note_up_osd(int peer)
{
  // A peer came up in the new map; flag the heartbeat peer set for
  // recomputation (the actual peer list is rebuilt elsewhere).
  heartbeat_set_peers_need_update();
}
7679
// Completion context queued by handle_osd_map(): once the transaction
// storing osdmaps [first,last] commits, hand control back to
// OSD::_committed_osd_maps() and release the original message.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive epoch range that was committed
  MOSDMap *msg;         // original message; put() after the callback
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
7691
7692 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7693 {
7694 std::lock_guard l(osdmap_subscribe_lock);
7695 if (latest_subscribed_epoch >= epoch && !force_request)
7696 return;
7697
7698 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7699
7700 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7701 force_request) {
7702 monc->renew_subs();
7703 }
7704 }
7705
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // Delete stored osdmaps older than both `oldest` and the lowest epoch
  // still pinned in the map cache, advancing superblock.oldest_map as we
  // go.  Removals are batched into transactions; each batch also writes
  // the superblock so oldest_map never points at already-deleted maps.
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    // flush a batch once it is both big enough and covers at least as
    // many maps as we just received
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg. so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  if (num > 0) {
    // flush the final partial batch
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
7745
void OSD::handle_osd_map(MOSDMap *m)
{
  // Ingest a batch of full and/or incremental osdmaps: throttle against
  // pgs that are slow to consume maps, validate the sender, persist the
  // new maps (reconstructing and crc-checking full maps from
  // incrementals), trim old maps, record pg_num/pool-deletion history,
  // then queue the transaction; activation continues in
  // _committed_osd_maps() once it commits.
  //
  // wait for pgs to catch up
  {
    // we extend the map cache pins to accomodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop injesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    // osd_min = lowest pg epoch across all shards
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
	osd_min = min;
      }
    }
    epoch_t osdmap_epoch = get_osdmap_epoch();
    if (osd_min > 0 &&
	osdmap_epoch > max_lag &&
	osdmap_epoch - max_lag > osd_min) {
      epoch_t need = osdmap_epoch - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
	       << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
	epoch_t min = shard->get_min_pg_epoch();
	if (need > min) {
	  dout(10) << __func__ << " waiting for pgs to consume " << need
		   << " (shard " << shard->shard_id << " min " << min
		   << ", map cache is " << cct->_conf->osd_map_cache_size
		   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
		   << ")" << dendl;
	  // drop osd_lock while blocking on the shard
	  unlock_guard unlock{osd_lock};
	  shard->wait_min_pg_epoch(need);
	}
      }
    }
  }

  ceph_assert(ceph_mutex_is_locked(osd_lock));
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
	    << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only mons and osds may feed us maps
  auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
  if (session && !(session->entity_name.is_mon() ||
		   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
	     << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
	  << superblock.newest_map
	  << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
	  << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
	     << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    if (m->oldest_map <= superblock.newest_map + 1) {
      // the source still has what we're missing; re-subscribe for it
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can. this
    // 1- is good to have
    // 2- is at present the only way to ensure that we get a *full* map as
    // the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // sanity: transaction byte count must grow monotonically
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // full map supplied for this epoch: persist and cache as-is
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // incremental supplied: apply to the previous epoch's full map,
      // verify the resulting crc, and persist both inc and full forms
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
	// start from the previous epoch's full map (stored or just added)
	bufferlist obl;
	bool got = get_map_bl(e - 1, obl);
	if (!got) {
	  auto p = added_maps_bl.find(e - 1);
	  ceph_assert(p != added_maps_bl.end());
	  obl = p->second;
	}
	o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);

      if (o->apply_incremental(inc) < 0) {
	derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
	ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // optional fault injection for crc-mismatch handling
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
	  (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
	derr << __func__ << " injecting map crc failure" << dendl;
	injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
	// locally reconstructed full map disagrees with the mon's crc:
	// discard it and fall back to requesting full maps
	dout(2) << "got incremental " << e
		<< " but failed to encode full with correct crc; requesting"
		<< dendl;
	clog->warn() << "failed to encode map e" << e << " with expected crc";
	dout(20) << "my encoded map was:\n";
	fbl.hexdump(*_dout);
	*_dout << dendl;
	delete o;
	request_full_map(e, last);
	last = e - 1;

	// don't continue committing if we failed to enc the first inc map
	if (last < start) {
	  dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
	  m->put();
	  return;
	}
	break;
      }
      got_full_map(e);
      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
	     << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
	dout(10) << __func__ << " can't get previous map " << i.first - 1
		 << " probably first start of this osd" << dendl;
	continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
	// pool deleted in this epoch: record its final pg_pool_t
	pg_num_history.log_pool_delete(i.first, j.first);
	dout(10) << __func__ << " recording final pg_pool_t for pool "
		 << j.first << dendl;
	// this information is needed by _make_pg() if have to restart before
	// the pool is deleted and need to instantiate a new (zombie) PG[Pool].
	ghobject_t obj = make_final_pool_info_oid(j.first);
	bufferlist bl;
	encode(j.second, bl, CEPH_FEATURES_ALL);
	string name = lastmap->get_pool_name(j.first);
	encode(name, bl);
	map<string,string> profile;
	if (lastmap->get_pg_pool(j.first)->is_erasure()) {
	  profile = lastmap->get_erasure_code_profile(
	    lastmap->get_pg_pool(j.first)->erasure_code_profile);
	}
	encode(profile, bl);
	t.write(coll_t::meta(), obj, 0, bl.length(), bl);
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
		 new_pg_num != j.second.get_pg_num()) {
	// pg_num changed for an existing pool
	dout(10) << __func__ << " recording pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
	// brand-new pool in this epoch
	dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first,
					 j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    // persist the updated pg_num history alongside the maps
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // record new purged_snaps
  if (superblock.purged_snaps_last == start - 1) {
    SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
				    make_purged_snaps_oid(), &t,
				    purged_snaps);
    superblock.purged_snaps_last = last;
  } else {
    dout(10) << __func__ << " superblock purged_snaps_last is "
	     << superblock.purged_snaps_last
	     << ", not recording new purged_snaps" << dendl;
  }

  // superblock and commit
  write_superblock(t);
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8066
/**
 * Called once the transaction persisting incoming OSDMaps [first..last]
 * has committed.  Advances the in-memory map epoch by epoch, reacting to
 * peer up/down transitions and to changes that affect this OSD itself
 * (NOUP flag flips, being wrongly marked down, address mismatches,
 * removal from the map), then performs the resulting follow-up action:
 * restart (rebind + reboot), shutdown, or requesting more maps.
 *
 * @param first first epoch just committed
 * @param last  last epoch just committed
 * @param m     the MOSDMap message that delivered these maps
 */
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock: shutdown may have started while we waited
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.lock();

  ceph_assert(first <= last);

  // decided while walking the maps; acted on after map_lock is released
  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  OSDMapRef osdmap = get_osdmap();

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
             << " (<= last " << last
             << " <= newest_map " << superblock.newest_map
             << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap = get_osdmap();
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
          osdmap->is_up(*p) &&   // in old map
          newmap->is_down(*p)) { // but not the new one
        // make sure reserved map refs are drained before tearing down
        // connections; only needs to happen once per epoch.
        if (!waited_for_reservations) {
          service.await_reserved_maps();
          waited_for_reservations = true;
        }
        note_down_osd(*p);
      } else if (*p != whoami &&
                 osdmap->is_down(*p) &&
                 newmap->is_up(*p)) {
        note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
               << dendl;
      if (is_booting()) {
        // this captures the case where we sent the boot message while
        // NOUP was being set on the mon and our boot request was
        // dropped, and then later it is cleared.  it imperfectly
        // handles the case where our original boot message was not
        // dropped and we restart even though we might have booted, but
        // that is harmless (boot will just take slightly longer).
        do_restart = true;
      }
    }

    osdmap = std::move(newmap);
    set_osdmap(osdmap);
    // record up/boot epochs the first time the map shows us up at our
    // current address
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    if (!up_epoch &&
        osdmap->is_up(whoami) &&
        osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
        boot_epoch = osdmap->get_epoch();
        dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  // if the latest map marks us up at our current binding, complete boot
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
        client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  // detect conditions that force us back out of active state
  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      derr << "map says i do not exist. shutting down." << dendl;
      do_shutdown = true;   // don't call shutdown() while we have
                            // everything paused
    } else if (osdmap->is_stop(whoami)) {
      derr << "map says i am stopped by admin. shutting down." << dendl;
      do_shutdown = true;
    } else if (!osdmap->is_up(whoami) ||
               !osdmap->get_addrs(whoami).legacy_equals(
                 client_messenger->get_myaddrs()) ||
               !osdmap->get_cluster_addrs(whoami).legacy_equals(
                 cluster_messenger->get_myaddrs()) ||
               !osdmap->get_hb_back_addrs(whoami).legacy_equals(
                 hb_back_server_messenger->get_myaddrs()) ||
               !osdmap->get_hb_front_addrs(whoami).legacy_equals(
                 hb_front_server_messenger->get_myaddrs())) {
      // we are marked down or one of our addresses no longer matches;
      // log why, then (unless we're stopping anyway) rebind and reboot
      if (!osdmap->is_up(whoami)) {
        if (service.is_preparing_to_stop() || service.is_stopping()) {
          service.got_stop_ack();
        } else {
          clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
                          "but it is still running";
          clog->debug() << "map e" << osdmap->get_epoch()
                        << " wrongly marked me down at e"
                        << osdmap->get_down_at(whoami);
        }
        if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
          // note that this is best-effort...
          monc->send_mon_message(
            new MOSDMarkMeDead(
              monc->get_fsid(),
              whoami,
              osdmap->get_epoch()));
        }
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
                   client_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong client addr (" << osdmap->get_addrs(whoami)
                      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
                   cluster_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong cluster addr ("
                      << osdmap->get_cluster_addrs(whoami)
                      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
                   hb_back_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat back addr ("
                      << osdmap->get_hb_back_addrs(whoami)
                      << " != my " << hb_back_server_messenger->get_myaddrs()
                      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
                   hb_front_server_messenger->get_myaddrs())) {
        clog->error() << "map e" << osdmap->get_epoch()
                      << " had wrong heartbeat front addr ("
                      << osdmap->get_hb_front_addrs(whoami)
                      << " != my " << hb_front_server_messenger->get_myaddrs()
                      << ")";
      }

      if (!service.is_stopping()) {
        // reset up epoch; remember when we rebound
        epoch_t up_epoch = 0;
        epoch_t bind_epoch = osdmap->get_epoch();
        service.set_epochs(NULL,&up_epoch, &bind_epoch);
        do_restart = true;

        //add markdown log
        utime_t now = ceph_clock_now();
        utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
        osd_markdown_log.push_back(now);
        // too many markdowns in the grace window -> give up and shut down
        if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
          derr << __func__ << " marked down "
               << osd_markdown_log.size()
               << " > osd_max_markdown_count "
               << cct->_conf->osd_max_markdown_count
               << " in last " << grace << " seconds, shutting down"
               << dendl;
          do_restart = false;
          do_shutdown = true;
        }

        start_waiting_for_healthy();

        set<int> avoid_ports;
#if defined(__FreeBSD__)
        // prevent FreeBSD from grabbing the client_messenger port during
        // rebinding. In which case a cluster_messenger will connect also
        // to the same port
        client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
        cluster_messenger->get_myaddrs().get_ports(&avoid_ports);

        int r = cluster_messenger->rebind(avoid_ports);
        if (r != 0) {
          do_shutdown = true;  // FIXME: do_restart?
          network_error = true;
          derr << __func__ << " marked down:"
               << " rebind cluster_messenger failed" << dendl;
        }

        // heartbeat connections are re-established on reboot
        hb_back_server_messenger->mark_down_all();
        hb_front_server_messenger->mark_down_all();
        hb_front_client_messenger->mark_down_all();
        hb_back_client_messenger->mark_down_all();

        reset_heartbeat_peers(true);
      }
    }
  }

  map_lock.unlock();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
             << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8322
/**
 * Reconcile messenger feature requirements, on-disk compat features, and
 * heartbeat authorizer requirements with the current OSDMap.  Called after
 * new maps are committed (see _committed_osd_maps).
 */
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures at their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  const auto osdmap = get_osdmap();
  {
    // features required of clients
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for clients" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    // features required of monitors
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << " was " << p.features_required
              << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    // features required of other OSDs (cluster messenger)
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
              << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // one-time upgrade: persist the SHARDS incompat flag in the superblock
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // pre-nautilus peers cannot provide heartbeat authorizers
  if (osdmap->require_osd_release < ceph_release_t::nautilus) {
    hb_front_server_messenger->set_require_authorizer(false);
    hb_back_server_messenger->set_require_authorizer(false);
  } else {
    hb_front_server_messenger->set_require_authorizer(true);
    hb_back_server_messenger->set_require_authorizer(true);
  }

  // persist require_osd_release changes in the store metadata
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
            << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
                      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8396
8397 struct C_FinishSplits : public Context {
8398 OSD *osd;
8399 set<PGRef> pgs;
8400 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8401 : osd(osd), pgs(in) {}
8402 void finish(int r) override {
8403 osd->_finish_splits(pgs);
8404 }
8405 };
8406
8407 void OSD::_finish_splits(set<PGRef>& pgs)
8408 {
8409 dout(10) << __func__ << " " << pgs << dendl;
8410 if (is_stopping())
8411 return;
8412 for (set<PGRef>::iterator i = pgs.begin();
8413 i != pgs.end();
8414 ++i) {
8415 PG *pg = i->get();
8416
8417 PeeringCtx rctx = create_context();
8418 pg->lock();
8419 dout(10) << __func__ << " " << *pg << dendl;
8420 epoch_t e = pg->get_osdmap_epoch();
8421 pg->handle_initialize(rctx);
8422 pg->queue_null(e, e);
8423 dispatch_context(rctx, pg, service.get_osdmap());
8424 pg->unlock();
8425
8426 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8427 shards[shard_index]->register_and_wake_split_child(pg);
8428 }
8429 };
8430
8431 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8432 unsigned need)
8433 {
8434 std::lock_guard l(merge_lock);
8435 auto& p = merge_waiters[nextmap->get_epoch()][target];
8436 p[src->pg_id] = src;
8437 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8438 << " for " << target << ", have " << p.size() << "/" << need
8439 << dendl;
8440 return p.size() == need;
8441 }
8442
/**
 * Walk a PG forward through OSDMap epochs up to osd_epoch, handling any
 * pool pg_num changes (splits and merges) encountered along the way.
 *
 * Returns true when the PG has been fully advanced; returns false when
 * the PG was consumed by a merge (as a source) or must wait for merge
 * sources to arrive (as a target) -- in the false case the PG has been
 * unlocked and the caller must not touch it further.
 *
 * Called with pg->lock held (asserted below).
 *
 * @param osd_epoch target epoch to advance to
 * @param pg        the PG, locked
 * @param handle    thread-pool handle (heartbeat timeout reset per epoch)
 * @param rctx      peering context accumulating transactions/messages
 */
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;   // already caught up
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  // pg_num of our pool in the map the PG currently has (0 if pool gone)
  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // gap in our cached maps; skip ahead to the next one we have
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
        spg_t parent;
        if (pg->pg_id.is_merge_source(
              old_pg_num,
              new_pg_num,
              &parent)) {
          // we are merge source
          PGRef spg = pg;       // carry a ref
          dout(1) << __func__ << " " << pg->pg_id
                  << " is merge source, target is " << parent
                  << dendl;
          pg->write_if_dirty(rctx);
          // flush any accumulated split children before tearing down
          if (!new_pgs.empty()) {
            rctx.transaction.register_on_applied(new C_FinishSplits(this,
                                                                    new_pgs));
            new_pgs.clear();
          }
          dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
          pg->ch->flush();
          // release backoffs explicitly, since the on_shutdown path
          // aggressively tears down backoff state.
          if (pg->is_primary()) {
            pg->release_pg_backoffs();
          }
          pg->on_shutdown();
          OSDShard *sdata = pg->osd_shard;
          {
            std::lock_guard l(sdata->shard_lock);
            if (pg->pg_slot) {
              sdata->_detach_pg(pg->pg_slot);
              // update pg count now since we might not get an osdmap
              // any time soon.
              if (pg->is_primary())
                logger->dec(l_osd_pg_primary);
              else if (pg->is_nonprimary())
                logger->dec(l_osd_pg_replica); // misnomer
              else
                logger->dec(l_osd_pg_stray);
            }
          }
          pg->unlock();

          // register ourselves with the target; the last source to
          // arrive kicks the target with a null peering event
          set<spg_t> children;
          parent.is_split(new_pg_num, old_pg_num, &children);
          if (add_merge_waiter(nextmap, parent, pg, children.size())) {
            enqueue_peering_evt(
              parent,
              PGPeeringEventRef(
                std::make_shared<PGPeeringEvent>(
                  nextmap->get_epoch(),
                  nextmap->get_epoch(),
                  NullEvt())));
          }
          ret = false;
          goto out;
        } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
          // we are merge target
          set<spg_t> children;
          pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
          dout(20) << __func__ << " " << pg->pg_id
                   << " is merge target, sources are " << children
                   << dendl;
          map<spg_t,PGRef> sources;
          {
            std::lock_guard l(merge_lock);
            auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
            unsigned need = children.size();
            dout(20) << __func__ << " have " << s.size() << "/"
                     << need << dendl;
            if (s.size() == need) {
              // all sources are here; take them and clean up the waiters
              sources.swap(s);
              merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
              if (merge_waiters[nextmap->get_epoch()].empty()) {
                merge_waiters.erase(nextmap->get_epoch());
              }
            }
          }
          if (!sources.empty()) {
            // note: shadows the outer new_pg_num (same pool, same map)
            unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
            unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
            dout(1) << __func__ << " merging " << pg->pg_id << dendl;
            pg->merge_from(
              sources, rctx, split_bits,
              nextmap->get_pg_pool(
                pg->pg_id.pool())->last_pg_merge_meta);
            pg->pg_slot->waiting_for_merge_epoch = 0;
          } else {
            dout(20) << __func__ << " not ready to merge yet" << dendl;
            pg->write_if_dirty(rctx);
            if (!new_pgs.empty()) {
              rctx.transaction.register_on_applied(new C_FinishSplits(this,
                                                                      new_pgs));
              new_pgs.clear();
            }
            dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
            pg->unlock();
            // kick source(s) to get them ready
            for (auto& i : children) {
              dout(20) << __func__ << " kicking source " << i << dendl;
              enqueue_peering_evt(
                i,
                PGPeeringEventRef(
                  std::make_shared<PGPeeringEvent>(
                    nextmap->get_epoch(),
                    nextmap->get_epoch(),
                    NullEvt())));
            }
            ret = false;
            goto out;
          }
        }
      }
    }

    // normal per-epoch advance: recompute mappings and feed the map in
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    // detect scrub-interval pool option changes so scrub schedules are
    // recomputed
    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
        && newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
               << " new pool opts " << newpool->second.opts
               << " old pool opts " << oldpool->second.opts
               << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is changed from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      // unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
        pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
            old_pg_num,
            new_pg_num,
            &children)) {
        split_pgs(
          pg, children, &new_pgs, lastmap, nextmap,
          rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  // any split children accumulated above are finished once the
  // transaction applies (see C_FinishSplits)
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8649
/**
 * Publish the current OSDMap to the rest of the OSD: prime pending
 * splits/merges on every shard, hand the map to each shard, prune
 * now-irrelevant bookkeeping, requeue sessions waiting on the map, and
 * push null peering events so every PG advances.  Also refreshes the
 * primary/replica/stray PG counters.
 *
 * Called with osd_lock held (asserted below).
 */
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   * speak the older sorting version any more. Be careful not to force
   * a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      // each shard claims (erases) the entries it owns
      shard->prime_splits(osdmap, &newly_split);
    }
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  // per-shard map consumption may free reserved recovery pushes
  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    // drop pending peer-requested creates that no longer map to us
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
         pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
        dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
                 << "discarding pending_create_from_osd" << dendl;
        pg = pending_creates_from_osd.erase(pg);
      } else {
        ++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          osdmap->get_epoch(),
          osdmap->get_epoch(),
          NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8767
8768 void OSD::activate_map()
8769 {
8770 ceph_assert(ceph_mutex_is_locked(osd_lock));
8771 auto osdmap = get_osdmap();
8772
8773 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8774
8775 // norecover?
8776 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8777 if (!service.recovery_is_paused()) {
8778 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8779 service.pause_recovery();
8780 }
8781 } else {
8782 if (service.recovery_is_paused()) {
8783 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8784 service.unpause_recovery();
8785 }
8786 }
8787
8788 service.activate_map();
8789
8790 // process waiters
8791 take_waiters(waiting_for_osdmap);
8792 }
8793
8794 bool OSD::require_mon_peer(const Message *m)
8795 {
8796 if (!m->get_connection()->peer_is_mon()) {
8797 dout(0) << "require_mon_peer received from non-mon "
8798 << m->get_connection()->get_peer_addr()
8799 << " " << *m << dendl;
8800 return false;
8801 }
8802 return true;
8803 }
8804
8805 bool OSD::require_mon_or_mgr_peer(const Message *m)
8806 {
8807 if (!m->get_connection()->peer_is_mon() &&
8808 !m->get_connection()->peer_is_mgr()) {
8809 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8810 << m->get_connection()->get_peer_addr()
8811 << " " << *m << dendl;
8812 return false;
8813 }
8814 return true;
8815 }
8816
8817 bool OSD::require_osd_peer(const Message *m)
8818 {
8819 if (!m->get_connection()->peer_is_osd()) {
8820 dout(0) << "require_osd_peer received from non-osd "
8821 << m->get_connection()->get_peer_addr()
8822 << " " << *m << dendl;
8823 return false;
8824 }
8825 return true;
8826 }
8827
8828 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8829 {
8830 epoch_t up_epoch = service.get_up_epoch();
8831 if (epoch < up_epoch) {
8832 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8833 return false;
8834 }
8835
8836 if (!is_active()) {
8837 dout(7) << "still in boot state, dropping message " << *m << dendl;
8838 return false;
8839 }
8840
8841 return true;
8842 }
8843
/**
 * Verify the sending OSD is still the same daemon instance we know from
 * `map`: it must be up and its cluster addresses must match the message
 * source.  If not, the connection is to a dead/stale peer: mark it down,
 * detach its Session, and return false.
 *
 * @param m               incoming message
 * @param map             OSDMap to validate against
 * @param is_fast_dispatch true when called from the fast-dispatch path,
 *                         where session_dispatch_lock is already held by
 *                         the caller and must not be re-taken here
 */
bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
                                     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addrs(from) != m->get_source_addrs())) {
    dout(5) << "from dead osd." << from << ", marking down, "
            << " msg was " << m->get_source_inst().addr
            << " expected "
            << (map->is_up(from) ?
                map->get_cluster_addrs(from) : entity_addrvec_t())
            << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
      if (!is_fast_dispatch)
        s->session_dispatch_lock.lock();
      clear_session_waiting_on_map(s);
      con->set_priv(nullptr);   // break ref <-> session cycle, if any
      s->con.reset();
      if (!is_fast_dispatch)
        s->session_dispatch_lock.unlock();
    }
    return false;
  }
  return true;
}
8872
8873
8874 /*
8875 * require that we have same (or newer) map, and that
8876 * the source is the pg primary.
8877 */
8878 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8879 bool is_fast_dispatch)
8880 {
8881 const Message *m = op->get_req();
8882 const auto osdmap = get_osdmap();
8883 dout(15) << "require_same_or_newer_map " << epoch
8884 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8885
8886 ceph_assert(ceph_mutex_is_locked(osd_lock));
8887
8888 // do they have a newer map?
8889 if (epoch > osdmap->get_epoch()) {
8890 dout(7) << "waiting for newer map epoch " << epoch
8891 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8892 wait_for_new_map(op);
8893 return false;
8894 }
8895
8896 if (!require_self_aliveness(op->get_req(), epoch)) {
8897 return false;
8898 }
8899
8900 // ok, our map is same or newer.. do they still exist?
8901 if (m->get_connection()->get_messenger() == cluster_messenger &&
8902 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8903 return false;
8904 }
8905
8906 return true;
8907 }
8908
8909
8910
8911
8912
8913 // ----------------------------------------
8914 // pg creation
8915
/**
 * Create the child PGs produced when `parent` splits under the pg_num
 * increase between curmap and nextmap.  For each child: instantiate the
 * PG, create its collection, split the parent's collections and
 * in-memory state into it, and hand it back via out_pgs (finished later
 * by _finish_splits once the transaction applies).
 *
 * @param parent     the PG being split (locked by caller)
 * @param childpgids ids of the children to create
 * @param out_pgs    receives the newly created child PGs
 * @param curmap     map before the split
 * @param nextmap    map in which the split takes effect
 * @param rctx       peering context whose transaction collects the work
 */
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // one stats share per child, plus a final entry for the parent
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  // stat_iter advances in lock-step with the child iterator
  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // wire the child's collection completions to its shard's queue
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
             << ", m_seed " << i->ps()
             << ", split_bits is " << split_bits << dendl;
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx.transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // the remaining stats entry belongs to the parent itself
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
8969
8970 /*
8971 * holding osd_lock
8972 */
/**
 * Handle a legacy MOSDPGCreate message from the monitor: for each
 * requested PG that still maps to us as acting primary, build its
 * initial history/past-intervals and enqueue a peering event that will
 * instantiate it.
 *
 * Called while holding osd_lock (see require_same_or_newer_map, which
 * asserts it).
 *
 * @param op the wrapped MOSDPGCreate request
 */
void OSD::handle_pg_create(OpRequestRef op)
{
  // NOTE: this can be removed in P release (mimic is the last version to
  // send MOSDPGCreate messages).

  auto m = op->get_req<MOSDPGCreate>();
  ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  // only the monitor may ask us to create PGs
  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  const auto osdmap = get_osdmap();
  // mkpg and ctimes are parallel maps keyed by pg_t; walk them together
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    ceph_assert(mapped);

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
               << "), my role=" << role << ", skipping" << dendl;
      continue;
    }


    PastIntervals pi;
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so we ignore
    // same_interval_since.  We'll pass this history with the current
    // epoch as the event.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
               << pgid << " from epoch " << m->epoch
               << ", primary changed in " << history.same_primary_since
               << dendl;
      continue;
    }
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          osdmap->get_epoch(),
          osdmap->get_epoch(),
          NullEvt(),
          true,
          new PGCreateInfo(
            pgid,
            osdmap->get_epoch(),
            history,
            pi,
            true)
          )));
  }

  {
    std::lock_guard l(pending_creates_lock);
    // no osd-requested creates outstanding: remember how far the mon got
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
9068
9069
9070 // ----------------------------------------
9071 // peering and recovery
9072
9073 PeeringCtx OSD::create_context()
9074 {
9075 return PeeringCtx(get_osdmap()->require_osd_release);
9076 }
9077
9078 void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
9079 ThreadPool::TPHandle *handle)
9080 {
9081 if (!service.get_osdmap()->is_up(whoami)) {
9082 dout(20) << __func__ << " not up in osdmap" << dendl;
9083 } else if (!is_active()) {
9084 dout(20) << __func__ << " not active" << dendl;
9085 } else {
9086 for (auto& [osd, ls] : ctx.message_map) {
9087 if (!curmap->is_up(osd)) {
9088 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9089 continue;
9090 }
9091 ConnectionRef con = service.get_con_osd_cluster(
9092 osd, curmap->get_epoch());
9093 if (!con) {
9094 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9095 << dendl;
9096 continue;
9097 }
9098 service.maybe_share_map(con.get(), curmap);
9099 for (auto m : ls) {
9100 con->send_message2(m);
9101 }
9102 ls.clear();
9103 }
9104 }
9105 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
9106 int tr = store->queue_transaction(
9107 pg->ch,
9108 std::move(ctx.transaction), TrackedOpRef(),
9109 handle);
9110 ceph_assert(tr == 0);
9111 }
9112 }
9113
// Modern (nautilus+) PG-create path, driven by MOSDPGCreate2 from the mon.
// Consumes (puts) the message.
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  // only the monitor may instruct us to create PGs
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    // pg_extra (history + past_intervals) is only sent by octopus+ mons
    auto q = m->pg_extra.find(pgid);
    if (q == m->pg_extra.end()) {
      dout(20) << __func__ << " " << pgid << " e" << created
               << "@" << created_stamp
               << " (no history or past_intervals)" << dendl;
      // pre-octopus ... no pg history.  this can be removed in Q release.
      enqueue_peering_evt(
        pgid,
        PGPeeringEventRef(
          std::make_shared<PGPeeringEvent>(
            m->epoch,
            m->epoch,
            NullEvt(),
            true,
            new PGCreateInfo(
              pgid,
              created,
              pg_history_t(created, created_stamp),
              PastIntervals(),
              true)
            )));
    } else {
      dout(20) << __func__ << " " << pgid << " e" << created
               << "@" << created_stamp
               << " history " << q->second.first
               << " pi " << q->second.second << dendl;
      // sanity check: the message epoch must cover the supplied
      // past_intervals; a mismatch means the create is inconsistent
      if (!q->second.second.empty() &&
          m->epoch < q->second.second.get_bounds().second) {
        clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
                      << " and unmatched past_intervals " << q->second.second
                      << " (history " << q->second.first << ")";
      } else {
        enqueue_peering_evt(
          pgid,
          PGPeeringEventRef(
            std::make_shared<PGPeeringEvent>(
              m->epoch,
              m->epoch,
              NullEvt(),
              true,
              new PGCreateInfo(
                pgid,
                m->epoch,
                q->second.first,
                q->second.second,
                true)
              )));
      }
    }
  }

  {
    std::lock_guard l(pending_creates_lock);
    // only advance last_pg_create_epoch once no mon-initiated creates
    // remain pending
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9185
9186 void OSD::handle_fast_pg_query(MOSDPGQuery *m)
9187 {
9188 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9189 if (!require_osd_peer(m)) {
9190 m->put();
9191 return;
9192 }
9193 int from = m->get_source().num();
9194 for (auto& p : m->pg_list) {
9195 enqueue_peering_evt(
9196 p.first,
9197 PGPeeringEventRef(
9198 std::make_shared<PGPeeringEvent>(
9199 p.second.epoch_sent, p.second.epoch_sent,
9200 MQuery(
9201 p.first,
9202 pg_shard_t(from, p.second.from),
9203 p.second,
9204 p.second.epoch_sent),
9205 false))
9206 );
9207 }
9208 m->put();
9209 }
9210
9211 void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9212 {
9213 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9214 if (!require_osd_peer(m)) {
9215 m->put();
9216 return;
9217 }
9218 int from = m->get_source().num();
9219 for (auto& p : m->get_pg_list()) {
9220 spg_t pgid(p.info.pgid.pgid, p.to);
9221 enqueue_peering_evt(
9222 pgid,
9223 PGPeeringEventRef(
9224 std::make_shared<PGPeeringEvent>(
9225 p.epoch_sent,
9226 p.query_epoch,
9227 MNotifyRec(
9228 pgid, pg_shard_t(from, p.from),
9229 p,
9230 m->get_connection()->get_features()),
9231 true,
9232 new PGCreateInfo(
9233 pgid,
9234 p.query_epoch,
9235 p.info.history,
9236 p.past_intervals,
9237 false)
9238 )));
9239 }
9240 m->put();
9241 }
9242
9243 void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9244 {
9245 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9246 if (!require_osd_peer(m)) {
9247 m->put();
9248 return;
9249 }
9250 int from = m->get_source().num();
9251 for (auto& p : m->pg_list) {
9252 enqueue_peering_evt(
9253 spg_t(p.info.pgid.pgid, p.to),
9254 PGPeeringEventRef(
9255 std::make_shared<PGPeeringEvent>(
9256 p.epoch_sent, p.query_epoch,
9257 MInfoRec(
9258 pg_shard_t(from, p.from),
9259 p.info,
9260 p.epoch_sent)))
9261 );
9262 }
9263 m->put();
9264 }
9265
9266 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9267 {
9268 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9269 if (!require_osd_peer(m)) {
9270 m->put();
9271 return;
9272 }
9273 for (auto& pgid : m->pg_list) {
9274 enqueue_peering_evt(
9275 pgid,
9276 PGPeeringEventRef(
9277 std::make_shared<PGPeeringEvent>(
9278 m->get_epoch(), m->get_epoch(),
9279 PeeringState::DeleteStart())));
9280 }
9281 m->put();
9282 }
9283
9284 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9285 {
9286 dout(10) << __func__ << " " << *m << dendl;
9287 if (!require_mon_or_mgr_peer(m)) {
9288 m->put();
9289 return;
9290 }
9291 epoch_t epoch = get_osdmap_epoch();
9292 for (auto pgid : m->forced_pgs) {
9293 if (m->options & OFR_BACKFILL) {
9294 if (m->options & OFR_CANCEL) {
9295 enqueue_peering_evt(
9296 pgid,
9297 PGPeeringEventRef(
9298 std::make_shared<PGPeeringEvent>(
9299 epoch, epoch,
9300 PeeringState::UnsetForceBackfill())));
9301 } else {
9302 enqueue_peering_evt(
9303 pgid,
9304 PGPeeringEventRef(
9305 std::make_shared<PGPeeringEvent>(
9306 epoch, epoch,
9307 PeeringState::SetForceBackfill())));
9308 }
9309 } else if (m->options & OFR_RECOVERY) {
9310 if (m->options & OFR_CANCEL) {
9311 enqueue_peering_evt(
9312 pgid,
9313 PGPeeringEventRef(
9314 std::make_shared<PGPeeringEvent>(
9315 epoch, epoch,
9316 PeeringState::UnsetForceRecovery())));
9317 } else {
9318 enqueue_peering_evt(
9319 pgid,
9320 PGPeeringEventRef(
9321 std::make_shared<PGPeeringEvent>(
9322 epoch, epoch,
9323 PeeringState::SetForceRecovery())));
9324 }
9325 }
9326 }
9327 m->put();
9328 }
9329
9330 void OSD::handle_pg_query_nopg(const MQuery& q)
9331 {
9332 spg_t pgid = q.pgid;
9333 dout(10) << __func__ << " " << pgid << dendl;
9334
9335 OSDMapRef osdmap = get_osdmap();
9336 if (!osdmap->have_pg_pool(pgid.pool()))
9337 return;
9338
9339 dout(10) << " pg " << pgid << " dne" << dendl;
9340 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9341 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9342 if (con) {
9343 Message *m;
9344 if (q.query.type == pg_query_t::LOG ||
9345 q.query.type == pg_query_t::FULLLOG) {
9346 m = new MOSDPGLog(
9347 q.query.from, q.query.to,
9348 osdmap->get_epoch(), empty,
9349 q.query.epoch_sent);
9350 } else {
9351 vector<pg_notify_t> ls;
9352 ls.push_back(
9353 pg_notify_t(
9354 q.query.from, q.query.to,
9355 q.query.epoch_sent,
9356 osdmap->get_epoch(),
9357 empty,
9358 PastIntervals()));
9359 m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
9360 }
9361 service.maybe_share_map(con.get(), osdmap);
9362 con->send_message(m);
9363 }
9364 }
9365
9366 void OSDService::queue_check_readable(spg_t spgid,
9367 epoch_t lpr,
9368 ceph::signedspan delay)
9369 {
9370 if (delay == ceph::signedspan::zero()) {
9371 osd->enqueue_peering_evt(
9372 spgid,
9373 PGPeeringEventRef(
9374 std::make_shared<PGPeeringEvent>(
9375 lpr, lpr,
9376 PeeringState::CheckReadable())));
9377 } else {
9378 mono_timer.add_event(
9379 delay,
9380 [this, spgid, lpr]() {
9381 queue_check_readable(spgid, lpr);
9382 });
9383 }
9384 }
9385
9386
9387 // =========================================================
9388 // RECOVERY
9389
// Start as many queued recovery items as the global recovery throttle
// allows.  Caller must hold recovery_lock.
void OSDService::_maybe_queue_recovery() {
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
         _recover_now(&available_pushes)) {
    // cap each batch at osd_recovery_max_single_start
    uint64_t to_start = std::min(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    dout(10) << __func__ << " starting " << to_start
             << ", recovery_ops_reserved " << recovery_ops_reserved
             << " -> " << (recovery_ops_reserved + to_start) << dendl;
    // reservation is returned later via release_reserved_pushes()
    recovery_ops_reserved += to_start;
  }
}
9406
// Return true if recovery is allowed to start right now; when it is,
// *available_pushes (if non-null) is set to how many additional recovery
// ops may be reserved under the global cap.
bool OSDService::_recover_now(uint64_t *available_pushes)
{
  if (available_pushes)
    *available_pushes = 0;

  // recovery explicitly deferred (see defer_recovery())?
  if (ceph_clock_now() < defer_recovery_until) {
    dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
    return false;
  }

  if (recovery_paused) {
    dout(15) << __func__ << " paused" << dendl;
    return false;
  }

  // enforce the cap on concurrent recovery ops (active + already reserved)
  uint64_t max = osd->get_recovery_max_active();
  if (max <= recovery_ops_active + recovery_ops_reserved) {
    dout(15) << __func__ << " active " << recovery_ops_active
             << " + reserved " << recovery_ops_reserved
             << " >= max " << max << dendl;
    return false;
  }

  if (available_pushes)
    *available_pushes = max - recovery_ops_active - recovery_ops_reserved;

  return true;
}
9435
9436 unsigned OSDService::get_target_pg_log_entries() const
9437 {
9438 auto num_pgs = osd->get_num_pgs();
9439 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9440 if (num_pgs > 0 && target > 0) {
9441 // target an even spread of our budgeted log entries across all
9442 // PGs. note that while we only get to control the entry count
9443 // for primary PGs, we'll normally be responsible for a mix of
9444 // primary and replica PGs (for the same pool(s) even), so this
9445 // will work out.
9446 return std::max<unsigned>(
9447 std::min<unsigned>(target / num_pgs,
9448 cct->_conf->osd_max_pg_log_entries),
9449 cct->_conf->osd_min_pg_log_entries);
9450 } else {
9451 // fall back to a per-pg value.
9452 return cct->_conf->osd_min_pg_log_entries;
9453 }
9454 }
9455
// Run one batch of recovery ops for a pg (called from the worker thread
// with the pg lock held by the caller path).  reserved_pushes were
// reserved earlier and are always released on exit.
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
      // re-queue this exact work item once the sleep interval elapses
      auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
                 << ", re-queuing recovery" << dendl;
        std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.

      if (auto now = ceph::real_clock::now();
          service.recovery_schedule_time < now) {
        service.recovery_schedule_time = now;
      }
      service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
                                       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      return;
    }
  }

  {
    // the next recovery pass should sleep again (reset the flag we may
    // have cleared in the requeue callback above)
    {
      std::lock_guard l(service.sleep_lock);
      service.recovery_needs_sleep = true;
    }

    // the pg's interval changed since this work was queued; drop it
    if (pg->pg_has_reset_since(queued)) {
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    // do_unfound indicates we should go search peers for unfound objects
    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
             << " on " << *pg << dendl;

    if (do_unfound) {
      PeeringCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  // always return the reservation, whether or not we made progress
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
9531
// Account the start of a recovery op for one object; pairs with
// finish_recovery_op().
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
           << " (" << recovery_ops_active << "/"
           << osd->get_recovery_max_active() << " rops)"
           << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  // debug build: track the exact object ids in flight per pg and assert
  // that an object is never recovered twice concurrently
  dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
  recovery_oids[pg->pg_id].insert(soid);
#endif
}
9547
// Account the completion of a recovery op for one object and let more
// queued recovery work start if the throttle now permits.
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
           << " dequeue=" << dequeue
           << " (" << recovery_ops_active << "/"
           << osd->get_recovery_max_active() << " rops)"
           << dendl;

  // adjust count
  ceph_assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  // debug build: the object must have been registered by start_recovery_op
  dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid));
  recovery_oids[pg->pg_id].erase(soid);
#endif

  _maybe_queue_recovery();
}
9569
9570 bool OSDService::is_recovery_active()
9571 {
9572 if (cct->_conf->osd_debug_pretend_recovery_active) {
9573 return true;
9574 }
9575 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9576 }
9577
// Return previously reserved recovery pushes to the pool and kick the
// recovery queue so waiting items can proceed.
void OSDService::release_reserved_pushes(uint64_t pushes)
{
  std::lock_guard l(recovery_lock);
  dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
           << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
           << dendl;
  ceph_assert(recovery_ops_reserved >= pushes);
  recovery_ops_reserved -= pushes;
  _maybe_queue_recovery();
}
9588
9589 // =========================================================
9590 // OPS
9591
9592 bool OSD::op_is_discardable(const MOSDOp *op)
9593 {
9594 // drop client request if they are not connected and can't get the
9595 // reply anyway.
9596 if (!op->get_connection()->is_connected()) {
9597 return true;
9598 }
9599 return false;
9600 }
9601
// Queue a client/peer op for its pg on the sharded op queue, recording
// queueing latency and trace events along the way.
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  // time from message receipt to this enqueue
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  // scheduler "owner" is the source entity id (used for fairness)
  const uint64_t owner = op->get_req()->get_source().num();

  dout(15) << "enqueue_op " << op << " prio " << priority
           << " cost " << cost
           << " latency " << latency
           << " epoch " << epoch
           << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);
  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  op_shardedwq.queue(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
      cost, priority, stamp, owner, epoch));
}
9625
// Queue a peering event for the given pg on the sharded op queue.
void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
{
  dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
  op_shardedwq.queue(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
      10,  // nominal scheduling cost for peering events
      cct->_conf->osd_peering_op_priority,
      utime_t(),  // no receive stamp for internally generated events
      0,          // no owning client
      evt->get_epoch_sent()));
}
9638
9639 /*
9640 * NOTE: dequeue called in worker thread, with pg lock
9641 */
9642 void OSD::dequeue_op(
9643 PGRef pg, OpRequestRef op,
9644 ThreadPool::TPHandle &handle)
9645 {
9646 const Message *m = op->get_req();
9647
9648 FUNCTRACE(cct);
9649 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
9650
9651 utime_t now = ceph_clock_now();
9652 op->set_dequeued_time(now);
9653
9654 utime_t latency = now - m->get_recv_stamp();
9655 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9656 << " cost " << m->get_cost()
9657 << " latency " << latency
9658 << " " << *m
9659 << " pg " << *pg << dendl;
9660
9661 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9662
9663 service.maybe_share_map(m->get_connection().get(),
9664 pg->get_osdmap(),
9665 op->sent_epoch);
9666
9667 if (pg->is_deleting())
9668 return;
9669
9670 op->mark_reached_pg();
9671 op->osd_trace.event("dequeue_op");
9672
9673 pg->do_request(op, handle);
9674
9675 // finish
9676 dout(10) << "dequeue_op " << op << " finish" << dendl;
9677 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
9678 }
9679
9680
// Process one peering event for a pg (or a pg-less MQuery).  Called from
// the worker thread; pg (if non-null) is locked by the caller path and is
// unlocked here before returning.
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  PeeringCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  bool need_up_thru = false;
  epoch_t same_interval_since = 0;
  if (!pg) {
    // no such pg: only queries are meaningful without one
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
    pg->do_peering_event(evt, rctx);
    if (pg->is_deleted()) {
      // pg went away while handling the event; nothing left to flush
      pg->unlock();
      return;
    }
    dispatch_context(rctx, pg, curmap, &handle);
    // capture these under the pg lock; acted on below without it
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }

  // flush any pg_temp updates accumulated while processing the event
  service.send_pg_temp();
}
9716
9717 void OSD::dequeue_delete(
9718 OSDShard *sdata,
9719 PG *pg,
9720 epoch_t e,
9721 ThreadPool::TPHandle& handle)
9722 {
9723 dequeue_peering_evt(
9724 sdata,
9725 pg,
9726 PGPeeringEventRef(
9727 std::make_shared<PGPeeringEvent>(
9728 e, e,
9729 PeeringState::DeleteSome())),
9730 handle);
9731 }
9732
9733
9734
9735 // --------------------------------
9736
// Config keys whose changes we want delivered to handle_conf_change().
// The array is NULL-terminated, per the md_config_obs_t contract.
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_object_clean_region_max_num_intervals",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL
  };
  return KEYS;
}
9776
// React to runtime changes of the config keys listed in
// get_tracked_conf_keys(): push the new values into the relevant
// subsystems.  Takes osd_lock for the duration.
void OSD::handle_conf_change(const ConfigProxy& conf,
                             const std::set <std::string> &changed)
{
  std::lock_guard l{osd_lock};
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  // op tracker tunables
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  // all three osdmap caches share a single size knob
  if (changed.count("osd_map_cache_size")) {
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  // any clog routing option change requires a full log-config refresh
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  // client throttles can be resized live on the messenger policy
  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }
  if (changed.count("osd_object_clean_region_max_num_intervals")) {
    ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
  }

  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  check_config();
}
9869
9870 void OSD::update_log_config()
9871 {
9872 map<string,string> log_to_monitors;
9873 map<string,string> log_to_syslog;
9874 map<string,string> log_channel;
9875 map<string,string> log_prio;
9876 map<string,string> log_to_graylog;
9877 map<string,string> log_to_graylog_host;
9878 map<string,string> log_to_graylog_port;
9879 uuid_d fsid;
9880 string host;
9881
9882 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9883 log_channel, log_prio, log_to_graylog,
9884 log_to_graylog_host, log_to_graylog_port,
9885 fsid, host) == 0)
9886 clog->update_config(log_to_monitors, log_to_syslog,
9887 log_channel, log_prio, log_to_graylog,
9888 log_to_graylog_host, log_to_graylog_port,
9889 fsid, host);
9890 derr << "log_to_monitors " << log_to_monitors << dendl;
9891 }
9892
// Emit cluster-log warnings for config combinations that are legal but
// likely misconfigured.
void OSD::check_config()
{
  // some sanity checks
  // the map cache must be able to hold more epochs than we allow pgs to
  // lag behind in their persisted epoch, or pgs will thrash the cache
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
                 << " is not > osd_pg_epoch_persisted_max_stale ("
                 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
  if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
    clog->warn() << "osd_object_clean_region_max_num_intervals ("
                 << cct->_conf->osd_object_clean_region_max_num_intervals
                 << ") is < 0";
  }
}
9907
9908 // --------------------------------
9909
// Block until the objecter has fetched the latest osdmap from the monitors.
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();

  dout(10) << __func__ << " -- finish" << dendl;
}
9920
9921 // --------------------------------
9922
9923 void OSD::set_perf_queries(const ConfigPayload &config_payload) {
9924 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
9925 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
9926 dout(10) << "setting " << queries.size() << " queries" << dendl;
9927
9928 std::list<OSDPerfMetricQuery> supported_queries;
9929 for (auto &it : queries) {
9930 auto &query = it.first;
9931 if (!query.key_descriptor.empty()) {
9932 supported_queries.push_back(query);
9933 }
9934 }
9935 if (supported_queries.size() < queries.size()) {
9936 dout(1) << queries.size() - supported_queries.size()
9937 << " unsupported queries" << dendl;
9938 }
9939 {
9940 std::lock_guard locker{m_perf_queries_lock};
9941 m_perf_queries = supported_queries;
9942 m_perf_limits = queries;
9943 }
9944 std::vector<PGRef> pgs;
9945 _get_pgs(&pgs);
9946 for (auto& pg : pgs) {
9947 std::scoped_lock l{*pg};
9948 pg->set_dynamic_perf_stats_queries(supported_queries);
9949 }
9950 }
9951
// Collect dynamic perf-metric reports from every pg and merge them into a
// single payload for the mgr.
MetricPayload OSD::get_perf_reports() {
  OSDMetricPayload payload;
  std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;

  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by by mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    dps.merge(pg_dps);
  }
  dps.add_to_reports(m_perf_limits, &reports);
  dout(20) << "reports for " << reports.size() << " queries" << dendl;

  return payload;
}
9975
9976 // =============================================================
9977
9978 #undef dout_context
9979 #define dout_context cct
9980 #undef dout_prefix
9981 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
9982
9983 void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
9984 {
9985 dout(10) << pg->pg_id << " " << pg << dendl;
9986 slot->pg = pg;
9987 pg->osd_shard = this;
9988 pg->pg_slot = slot;
9989 osd->inc_num_pgs();
9990
9991 slot->epoch = pg->get_osdmap_epoch();
9992 pg_slots_by_epoch.insert(*slot);
9993 }
9994
9995 void OSDShard::_detach_pg(OSDShardPGSlot *slot)
9996 {
9997 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
9998 slot->pg->osd_shard = nullptr;
9999 slot->pg->pg_slot = nullptr;
10000 slot->pg = nullptr;
10001 osd->dec_num_pgs();
10002
10003 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10004 slot->epoch = 0;
10005 if (waiting_for_min_pg_epoch) {
10006 min_pg_epoch_cond.notify_all();
10007 }
10008 }
10009
// Move a pg's slot to a new epoch in the ordered per-epoch index and wake
// any waiters, since the shard's minimum pg epoch may have advanced.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  // re-key: intrusive set members must be removed before mutation
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10025
10026 epoch_t OSDShard::get_min_pg_epoch()
10027 {
10028 std::lock_guard l(shard_lock);
10029 auto p = pg_slots_by_epoch.begin();
10030 if (p == pg_slots_by_epoch.end()) {
10031 return 0;
10032 }
10033 return p->epoch;
10034 }
10035
10036 void OSDShard::wait_min_pg_epoch(epoch_t need)
10037 {
10038 std::unique_lock l{shard_lock};
10039 ++waiting_for_min_pg_epoch;
10040 min_pg_epoch_cond.wait(l, [need, this] {
10041 if (pg_slots_by_epoch.empty()) {
10042 return true;
10043 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10044 return true;
10045 } else {
10046 dout(10) << need << " waiting on "
10047 << pg_slots_by_epoch.begin()->epoch << dendl;
10048 return false;
10049 }
10050 });
10051 --waiting_for_min_pg_epoch;
10052 }
10053
10054 epoch_t OSDShard::get_max_waiting_epoch()
10055 {
10056 std::lock_guard l(shard_lock);
10057 epoch_t r = 0;
10058 for (auto& i : pg_slots) {
10059 if (!i.second->waiting_peering.empty()) {
10060 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10061 }
10062 }
10063 return r;
10064 }
10065
// Adopt a newer osdmap for this shard and re-evaluate every pg slot:
// requeue peering waiters whose target epoch has arrived, drop waiting
// items that are now stale or misdirected, and prune slots that have
// become completely empty.  *pushes_to_free accumulates the recovery
// push reservations released by dropped items so the caller can return
// them to the reservation limiter.
void OSDShard::consume_map(
  const OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // osdmap_lock guards shard_osdmap for readers that do not take
    // shard_lock; swap the map under both locks.
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
           << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // split still pending: leave everything parked until the child
      // pg is registered.
      dout(20) << __func__ << " " << pgid
	       << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // merge scheduled for a future epoch: keep the slot intact.
      dout(20) << __func__ << " " << pgid
	       << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
	       << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      // If the earliest epoch a peering event needs is now covered by
      // the new map, requeue the whole slot for processing.
      if (first <= new_osdmap->get_epoch()) {
	dout(20) << __func__ << " " << pgid
		 << " pending_peering first epoch " << first
		 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
	_wake_pg_slot(pgid, slot);
	queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
	// pg still maps to us; waiters stay until the pg instantiates.
	dout(20) << __func__ << " " << pgid << " maps to us, keeping"
		 << dendl;
	++p;
	continue;
      }
      // pg no longer maps here: drop every waiter the new map makes
      // stale (older epoch) or misdirected (same epoch), releasing any
      // reserved recovery pushes.
      while (!slot->waiting.empty() &&
	     slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
	auto& qi = slot->waiting.front();
	dout(20) << __func__ << " " << pgid
		 << " waiting item " << qi
		 << " epoch " << qi.get_map_epoch()
		 << " <= " << new_osdmap->get_epoch()
		 << ", "
		 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
		     "misdirected")
		 << ", dropping" << dendl;
	*pushes_to_free += qi.get_reserved_pushes();
	slot->waiting.pop_front();
      }
    }
    // Prune the slot only when nothing references it anymore: no
    // waiters, no in-flight work, no pending split, and no pg attached.
    if (slot->waiting.empty() &&
	slot->num_running == 0 &&
	slot->waiting_for_split.empty() &&
	!slot->pg) {
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  // Poke one worker if we requeued anything above.
  if (queued) {
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10151
// Push everything parked on this slot back onto the scheduler.  Each
// list is walked in reverse while enqueueing at the *front*, so the
// items' original relative order is preserved.  Caller holds shard_lock.
void OSDShard::_wake_pg_slot(
  spg_t pgid,
  OSDShardPGSlot *slot)
{
  dout(20) << __func__ << " " << pgid
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;
  for (auto& qi : boost::adaptors::reverse(slot->to_process)) {
    scheduler->enqueue_front(std::move(qi));
  }
  slot->to_process.clear();
  for (auto& qi : boost::adaptors::reverse(slot->waiting)) {
    scheduler->enqueue_front(std::move(qi));
  }
  slot->waiting.clear();
  for (auto& [epoch, items] : boost::adaptors::reverse(slot->waiting_peering)) {
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet.  FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto& qi : boost::adaptors::reverse(items)) {
      scheduler->enqueue_front(std::move(qi));
    }
  }
  slot->waiting_peering.clear();
  // Bump requeue_seq so a racing _process() thread (which sampled the
  // old value before dropping shard_lock) notices this requeue.
  ++slot->requeue_seq;
}
10185
10186 void OSDShard::identify_splits_and_merges(
10187 const OSDMapRef& as_of_osdmap,
10188 set<pair<spg_t,epoch_t>> *split_pgs,
10189 set<pair<spg_t,epoch_t>> *merge_pgs)
10190 {
10191 std::lock_guard l(shard_lock);
10192 if (shard_osdmap) {
10193 for (auto& i : pg_slots) {
10194 const spg_t& pgid = i.first;
10195 auto *slot = i.second.get();
10196 if (slot->pg) {
10197 osd->service.identify_splits_and_merges(
10198 shard_osdmap, as_of_osdmap, pgid,
10199 split_pgs, merge_pgs);
10200 } else if (!slot->waiting_for_split.empty()) {
10201 osd->service.identify_splits_and_merges(
10202 shard_osdmap, as_of_osdmap, pgid,
10203 split_pgs, nullptr);
10204 } else {
10205 dout(20) << __func__ << " slot " << pgid
10206 << " has no pg and waiting_for_split " << dendl;
10207 }
10208 }
10209 }
10210 }
10211
// Prime this shard's slots for the split children in *pgids (as of
// as_of_osdmap).  Entries belonging to this shard are consumed; entries
// for other shards are left in *pgids for their owners.
// NOTE(review): shard_osdmap is dereferenced unconditionally here,
// unlike identify_splits_and_merges() which checks it -- presumably
// callers only invoke this once the shard has a map; confirm.
void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *pgids)
{
  std::lock_guard l(shard_lock);
  _prime_splits(pgids);
  // If this shard's map is already ahead of as_of_osdmap, the children
  // we just primed may themselves have split again; find and prime
  // those grandchildren as well.
  if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
    set<pair<spg_t,epoch_t>> newer_children;
    for (auto i : *pgids) {
      osd->service.identify_splits_and_merges(
	as_of_osdmap, shard_osdmap, i.first,
	&newer_children, nullptr);
    }
    newer_children.insert(pgids->begin(), pgids->end());
    dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
	     << shard_osdmap->get_epoch() << ", new children " << newer_children
	     << dendl;
    _prime_splits(&newer_children);
    // note: we don't care what is left over here for other shards.
    // if this shard is ahead of us and one isn't, e.g., one thread is
    // calling into prime_splits via _process (due to a newly created
    // pg) and this shard has a newer map due to a racing consume_map,
    // then any grandchildren left here will be identified (or were
    // identified) when the slower shard's osdmap is advanced.
    // _prime_splits() will tolerate the case where the pgid is
    // already primed.
  }
}
10239
10240 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10241 {
10242 dout(10) << *pgids << dendl;
10243 auto p = pgids->begin();
10244 while (p != pgids->end()) {
10245 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10246 if (shard_index == shard_id) {
10247 auto r = pg_slots.emplace(p->first, nullptr);
10248 if (r.second) {
10249 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10250 r.first->second = make_unique<OSDShardPGSlot>();
10251 r.first->second->waiting_for_split.insert(p->second);
10252 } else {
10253 auto q = r.first;
10254 ceph_assert(q != pg_slots.end());
10255 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10256 << dendl;
10257 q->second->waiting_for_split.insert(p->second);
10258 }
10259 p = pgids->erase(p);
10260 } else {
10261 ++p;
10262 }
10263 }
10264 }
10265
// Prime this shard's slots for pending PG merges.  Entries of
// *merge_pgs that hash to this shard are consumed; each participant
// slot is marked with the merge epoch, and if no pg exists yet (and no
// earlier split is pending) an empty placeholder pg is created so
// PG::merge_from() has something to merge into.
void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " checking shard " << shard_id
	   << " for remaining merge pgs " << merge_pgs << dendl;
  auto p = merge_pgs->begin();
  while (p != merge_pgs->end()) {
    spg_t pgid = p->first;
    epoch_t epoch = p->second;
    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
    if (shard_index != shard_id) {
      // another shard's pg; leave it in *merge_pgs
      ++p;
      continue;
    }
    OSDShardPGSlot *slot;
    auto r = pg_slots.emplace(pgid, nullptr);
    if (r.second) {
      r.first->second = make_unique<OSDShardPGSlot>();
    }
    slot = r.first->second.get();
    if (slot->pg) {
      // already have pg
      dout(20) << __func__ << " have merge participant pg " << pgid
	       << " " << slot->pg << dendl;
    } else if (!slot->waiting_for_split.empty() &&
	       *slot->waiting_for_split.begin() < epoch) {
      // a split precedes the merge; the split path will create the pg
      dout(20) << __func__ << " pending split on merge participant pg " << pgid
	       << " " << slot->waiting_for_split << dendl;
    } else {
      dout(20) << __func__ << " creating empty merge participant " << pgid
	       << " for merge in " << epoch << dendl;
      // leave history zeroed; PG::merge_from() will fill it in.
      pg_history_t history;
      // epoch - 1: the placeholder exists in the last pre-merge map.
      PGCreateInfo cinfo(pgid, epoch - 1,
			 history, PastIntervals(), false);
      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
      _attach_pg(r.first->second.get(), pg.get());
      _wake_pg_slot(pgid, slot);
      // handle_pg_create_info() returns the pg locked; release it.
      pg->unlock();
    }
    // mark slot for merge
    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
    slot->waiting_for_merge_epoch = epoch;
    p = merge_pgs->erase(p);
  }
}
10313
// Attach a freshly created split-child pg to its (previously primed)
// slot, requeue any parked work once all pending splits for the slot
// are resolved, and kick the child with a null peering event so it
// advances to the latest osdmap.
void OSDShard::register_and_wake_split_child(PG *pg)
{
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    // The slot must have been primed by _prime_splits() beforehand.
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      // all splits resolved: requeue everything parked on the slot
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }
  // (shard_lock released before enqueueing, below)

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  // wake a worker to process the event we just queued
  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
10352
10353 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10354 {
10355 std::lock_guard l(shard_lock);
10356 vector<spg_t> to_delete;
10357 for (auto& i : pg_slots) {
10358 if (i.first != parent &&
10359 i.first.get_ancestor(old_pg_num) == parent) {
10360 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10361 << dendl;
10362 _wake_pg_slot(i.first, i.second.get());
10363 to_delete.push_back(i.first);
10364 }
10365 }
10366 for (auto pgid : to_delete) {
10367 pg_slots.erase(pgid);
10368 }
10369 }
10370
// Construct one OSD work shard: set up its named locks, the op
// scheduler, and the context (oncommit) queue.  Lock names embed the
// shard id so lockdep/perf output can distinguish shards.
OSDShard::OSDShard(
  int id,
  CephContext *cct,
  OSD *osd)
  : shard_id(id),
    cct(cct),
    osd(osd),
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    scheduler(ceph::osd::scheduler::make_scheduler(cct)),
    context_queue(sdata_wait_lock, sdata_cond)
{
  // log the scheduler choice at level 0 so it is always visible
  dout(0) << "using op scheduler " << *scheduler << dendl;
}
10389
10390
10391 // =============================================================
10392
10393 #undef dout_context
10394 #define dout_context osd->cct
10395 #undef dout_prefix
10396 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10397
10398 void OSD::ShardedOpWQ::_add_slot_waiter(
10399 spg_t pgid,
10400 OSDShardPGSlot *slot,
10401 OpSchedulerItem&& qi)
10402 {
10403 if (qi.is_peering()) {
10404 dout(20) << __func__ << " " << pgid
10405 << " peering, item epoch is "
10406 << qi.get_map_epoch()
10407 << ", will wait on " << qi << dendl;
10408 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10409 } else {
10410 dout(20) << __func__ << " " << pgid
10411 << " item epoch is "
10412 << qi.get_map_epoch()
10413 << ", will wait on " << qi << dendl;
10414 slot->waiting.push_back(std::move(qi));
10415 }
10416 }
10417
10418 #undef dout_prefix
10419 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10420
// Worker-thread main loop body: wait for work on this thread's shard,
// dequeue one OpSchedulerItem, resolve it to a pg slot (possibly
// creating the pg, parking the item, or dropping it), and run it.
// Also drains the shard's oncommit context queue -- but only on the
// lowest-numbered thread of each shard, to keep oncommits ordered.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  // callback.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    // nothing to do right now; sleep on the shard condvar
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // suspend the heartbeat timeout while idle so an empty queue
      // is not mistaken for a stuck thread
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->scheduler->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	// spurious wakeup (or work stolen by another thread): go back
	sdata->shard_lock.unlock();
	return;
      }
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
	timeout_interval, suicide_interval);
    } else {
      // stop_waiting set (e.g. draining): return without sleeping
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // grab pending oncommit contexts (lowest-index thread only)
  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    sdata->context_queue.move_to(oncommits);
  }

  if (sdata->scheduler->empty()) {
    // only oncommits to run (or shutdown)
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpSchedulerItem item = sdata->scheduler->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // find (or create) the slot for this item's pg and queue the item on
  // it, preserving per-pg ordering via to_process
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // pg->lock() must be taken without shard_lock held (lock ordering);
    // re-validate everything after reacquiring shard_lock below.
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      // items were requeued to the scheduler while we were locking the
      // pg; our to_process snapshot is invalid
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg attached: decide whether to create it, wait, run pg-less, or
  // drop the item.  (loops only via the pg-creation "break" below)
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      // a split is pending; park the item until the child registers
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // item is from the future; wait for the map to catch up
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    // mon-requested creates only apply on the acting primary
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg!  drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      // non-peering item for a pg that should exist here but doesn't
      // yet: wait for it to be created
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
				     sdata->shard_osdmap,
				     (*_op)->sent_epoch);
      }
      // return any recovery pushes the dropped item had reserved
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // peering event from a future epoch: park it and bail
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // hand any split children that hash to other shards to their owners
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // actually execute the item (with the pg lock held, if pg != null)
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
10715
10716 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
10717 uint32_t shard_index =
10718 item.get_ordering_token().hash_to_shard(osd->shards.size());
10719
10720 dout(20) << __func__ << " " << item << dendl;
10721
10722 OSDShard* sdata = osd->shards[shard_index];
10723 assert (NULL != sdata);
10724
10725 bool empty = true;
10726 {
10727 std::lock_guard l{sdata->shard_lock};
10728 empty = sdata->scheduler->empty();
10729 sdata->scheduler->enqueue(std::move(item));
10730 }
10731
10732 if (empty) {
10733 std::lock_guard l{sdata->sdata_wait_lock};
10734 sdata->sdata_cond.notify_all();
10735 }
10736 }
10737
// Requeue an item at the *front* of its shard's scheduler (used for
// retries/requeues that must keep their original ordering).
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    // (swap: our old item goes to the front of to_process, and the
    // newest to_process item is pulled out to be re-pushed onto the
    // scheduler front instead.)
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // wake one worker to pick the item up
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
10765
10766 namespace ceph {
10767 namespace osd_cmds {
10768
10769 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
10770 std::ostream& os)
10771 {
10772 if (!ceph_using_tcmalloc()) {
10773 os << "could not issue heap profiler command -- not using tcmalloc!";
10774 return -EOPNOTSUPP;
10775 }
10776
10777 string cmd;
10778 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
10779 os << "unable to get value for command \"" << cmd << "\"";
10780 return -EINVAL;
10781 }
10782
10783 std::vector<std::string> cmd_vec;
10784 get_str_vec(cmd, cmd_vec);
10785
10786 string val;
10787 if (cmd_getval(cmdmap, "value", val)) {
10788 cmd_vec.push_back(val);
10789 }
10790
10791 ceph_heap_profiler_handle_command(cmd_vec, os);
10792
10793 return 0;
10794 }
10795
10796 }} // namespace ceph::osd_cmds