]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSD.cc
7ff0ee94e9bfb2225da217e6a6cfd536d7e340a2
[ceph.git] / ceph / src / osd / OSD.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16 #include "acconfig.h"
17
18 #include <cctype>
19 #include <fstream>
20 #include <iostream>
21 #include <iterator>
22
23 #include <unistd.h>
24 #include <sys/stat.h>
25 #include <signal.h>
26 #include <time.h>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
29
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
32 #endif
33
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
36 #endif
37
38 #include "osd/PG.h"
39
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
43
44 #include "OSD.h"
45 #include "OSDMap.h"
46 #include "Watch.h"
47 #include "osdc/Objecter.h"
48
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_releases.h"
52 #include "common/ceph_time.h"
53 #include "common/version.h"
54 #include "common/pick_address.h"
55 #include "common/blkdev.h"
56 #include "common/numa.h"
57
58 #include "os/ObjectStore.h"
59 #ifdef HAVE_LIBFUSE
60 #include "os/FuseStore.h"
61 #endif
62
63 #include "PrimaryLogPG.h"
64
65 #include "msg/Messenger.h"
66 #include "msg/Message.h"
67
68 #include "mon/MonClient.h"
69
70 #include "messages/MLog.h"
71
72 #include "messages/MGenericMessage.h"
73 #include "messages/MOSDPing.h"
74 #include "messages/MOSDFailure.h"
75 #include "messages/MOSDMarkMeDown.h"
76 #include "messages/MOSDMarkMeDead.h"
77 #include "messages/MOSDFull.h"
78 #include "messages/MOSDOp.h"
79 #include "messages/MOSDOpReply.h"
80 #include "messages/MOSDBackoff.h"
81 #include "messages/MOSDBeacon.h"
82 #include "messages/MOSDRepOp.h"
83 #include "messages/MOSDRepOpReply.h"
84 #include "messages/MOSDBoot.h"
85 #include "messages/MOSDPGTemp.h"
86 #include "messages/MOSDPGReadyToMerge.h"
87
88 #include "messages/MOSDMap.h"
89 #include "messages/MMonGetOSDMap.h"
90 #include "messages/MOSDPGNotify.h"
91 #include "messages/MOSDPGNotify2.h"
92 #include "messages/MOSDPGQuery.h"
93 #include "messages/MOSDPGQuery2.h"
94 #include "messages/MOSDPGLog.h"
95 #include "messages/MOSDPGRemove.h"
96 #include "messages/MOSDPGInfo.h"
97 #include "messages/MOSDPGInfo2.h"
98 #include "messages/MOSDPGCreate.h"
99 #include "messages/MOSDPGCreate2.h"
100 #include "messages/MOSDPGScan.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
111
112 #include "messages/MOSDPeeringOp.h"
113
114 #include "messages/MOSDAlive.h"
115
116 #include "messages/MOSDScrub.h"
117 #include "messages/MOSDScrub2.h"
118 #include "messages/MOSDRepScrub.h"
119
120 #include "messages/MCommand.h"
121 #include "messages/MCommandReply.h"
122
123 #include "messages/MPGStats.h"
124 #include "messages/MPGStatsAck.h"
125
126 #include "messages/MWatchNotify.h"
127 #include "messages/MOSDPGPush.h"
128 #include "messages/MOSDPGPushReply.h"
129 #include "messages/MOSDPGPull.h"
130
131 #include "messages/MMonGetPurgedSnaps.h"
132 #include "messages/MMonGetPurgedSnapsReply.h"
133
134 #include "common/perf_counters.h"
135 #include "common/Timer.h"
136 #include "common/LogClient.h"
137 #include "common/AsyncReserver.h"
138 #include "common/HeartbeatMap.h"
139 #include "common/admin_socket.h"
140 #include "common/ceph_context.h"
141
142 #include "global/signal_handler.h"
143 #include "global/pidfile.h"
144
145 #include "include/color.h"
146 #include "perfglue/cpu_profiler.h"
147 #include "perfglue/heap_profiler.h"
148
149 #include "osd/OpRequest.h"
150
151 #include "auth/AuthAuthorizeHandler.h"
152 #include "auth/RotatingKeyRing.h"
153
154 #include "objclass/objclass.h"
155
156 #include "common/cmdparse.h"
157 #include "include/str_list.h"
158 #include "include/util.h"
159
160 #include "include/ceph_assert.h"
161 #include "common/config.h"
162 #include "common/EventTrace.h"
163
164 #include "json_spirit/json_spirit_reader.h"
165 #include "json_spirit/json_spirit_writer.h"
166
167 #ifdef WITH_LTTNG
168 #define TRACEPOINT_DEFINE
169 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170 #include "tracing/osd.h"
171 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #undef TRACEPOINT_DEFINE
173 #else
174 #define tracepoint(...)
175 #endif
176
177 #define dout_context cct
178 #define dout_subsys ceph_subsys_osd
179 #undef dout_prefix
180 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
181
182 using namespace ceph::osd::scheduler;
183 using TOPNSPC::common::cmd_getval;
184
185 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
186 return *_dout << "osd." << whoami << " " << epoch << " ";
187 }
188
189 //Initial features in new superblock.
190 //Features here are also automatically upgraded
191 CompatSet OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
194 CompatSet::FeatureSet ceph_osd_feature_incompat;
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
202 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
203 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
204 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
205 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
206 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
207 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
208 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
209 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
210 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
211 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
212 ceph_osd_feature_incompat);
213 }
214
215 //Features are added here that this OSD supports.
216 CompatSet OSD::get_osd_compat_set() {
217 CompatSet compat = get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
220 return compat;
221 }
222
// OSDService wraps the parts of the OSD that PGs and other subsystems
// use directly.  The constructor only wires up references to state
// owned by the parent OSD and initializes counters/timers; real
// startup work happens in init()/final_init().
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  // config observers bound to their option names
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  scrubs_local(0),
  scrubs_remote(0),
  // tiering agent state (see agent_entry())
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
                                 osd->objecter_messenger,
                                 osd->monc, nullptr, 0, 0)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  // local/remote backfill reservers share osd_max_backfills
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  // One Finisher per configured objecter finisher thread; objecter
  // completion callbacks are spread across these.
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}
285
286 #ifdef PG_DEBUG_REFS
287 void OSDService::add_pgid(spg_t pgid, PG *pg){
288 std::lock_guard l(pgid_lock);
289 if (!pgid_tracker.count(pgid)) {
290 live_pgs[pgid] = pg;
291 }
292 pgid_tracker[pgid]++;
293 }
294 void OSDService::remove_pgid(spg_t pgid, PG *pg)
295 {
296 std::lock_guard l(pgid_lock);
297 ceph_assert(pgid_tracker.count(pgid));
298 ceph_assert(pgid_tracker[pgid] > 0);
299 pgid_tracker[pgid]--;
300 if (pgid_tracker[pgid] == 0) {
301 pgid_tracker.erase(pgid);
302 live_pgs.erase(pgid);
303 }
304 }
305 void OSDService::dump_live_pgids()
306 {
307 std::lock_guard l(pgid_lock);
308 derr << "live pgids:" << dendl;
309 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
310 i != pgid_tracker.cend();
311 ++i) {
312 derr << "\t" << *i << dendl;
313 live_pgs[i->first]->dump_live_ids();
314 }
315 }
316 #endif
317
318
319 ceph::signedspan OSDService::get_mnow()
320 {
321 return ceph::mono_clock::now() - osd->startup_time;
322 }
323
// Walk the recorded pg_num history for pgid's pool between old_map and
// new_map and collect every PG that participates in a split
// (into *split_children) and, if merge_pgs is non-null, every PG that
// participates in a merge (into *merge_pgs), each tagged with the
// epoch at which the pg_num change took effect.  A BFS over `queue`
// re-scans newly discovered children/parents so that chained
// split/merge sequences across multiple epochs are all caught.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  // Pool deleted (or never existed) as of old_map: nothing to do.
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
           << " to e" << new_map->get_epoch()
           << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;  // PGs already scanned, to avoid re-queueing
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // Replay each pg_num change in (old_map, new_map] in epoch order.
    for (auto q = p->second.lower_bound(old_map->get_epoch());
         q != p->second.end() &&
           q->first <= new_map->get_epoch();
         ++q) {
      if (pgnum < q->second) {
        // split?
        if (cur.ps() < pgnum) {
          set<spg_t> children;
          if (cur.is_split(pgnum, q->second, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " children " << children << dendl;
            for (auto i : children) {
              split_children->insert(make_pair(i, q->first));
              if (!did.count(i))
                queue.push_back(i);
            }
          }
        } else if (cur.ps() < q->second) {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is a child" << dendl;
          // normally we'd capture this from the parent, but it's
          // possible the parent doesn't exist yet (it will be
          // fabricated to allow an intervening merge).  note this PG
          // as a split child here to be sure we catch it.
          split_children->insert(make_pair(cur, q->first));
        } else {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is post-split, skipping" << dendl;
        }
      } else if (merge_pgs) {
        // merge?
        if (cur.ps() >= q->second) {
          if (cur.ps() < pgnum) {
            // cur disappears in this merge; record both the target
            // (parent) and all of its sources.
            spg_t parent;
            if (cur.is_merge_source(pgnum, q->second, &parent)) {
              set<spg_t> children;
              parent.is_split(q->second, pgnum, &children);
              dout(20) << __func__ << " " << cur << " e" << q->first
                       << " pg_num " << pgnum << " -> " << q->second
                       << " is merge source, target " << parent
                       << ", source(s) " << children << dendl;
              merge_pgs->insert(make_pair(parent, q->first));
              if (!did.count(parent)) {
                // queue (and re-scan) parent in case it might not exist yet
                // and there are some future splits pending on it
                queue.push_back(parent);
              }
              for (auto c : children) {
                merge_pgs->insert(make_pair(c, q->first));
                if (!did.count(c))
                  queue.push_back(c);
              }
            }
          } else {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is beyond old pgnum, skipping" << dendl;
          }
        } else {
          // cur survives the merge as a target; record it and its
          // merge sources.
          set<spg_t> children;
          if (cur.is_split(q->second, pgnum, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is merge target, source " << children << dendl;
            for (auto c : children) {
              merge_pgs->insert(make_pair(c, q->first));
              if (!did.count(c))
                queue.push_back(c);
            }
            merge_pgs->insert(make_pair(cur, q->first));
          }
        }
      }
      pgnum = q->second;  // carry the new pg_num into the next interval
    }
  }
}
430
// Forward to the owning OSD, which maintains the heartbeat peer set.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
435
436 HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
437 {
438 std::lock_guard l(hb_stamp_lock);
439 if (peer >= hb_stamps.size()) {
440 hb_stamps.resize(peer + 1);
441 }
442 if (!hb_stamps[peer]) {
443 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
444 }
445 return hb_stamps[peer];
446 }
447
448 void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
449 {
450 osd->enqueue_peering_evt(
451 spgid,
452 PGPeeringEventRef(
453 std::make_shared<PGPeeringEvent>(
454 epoch, epoch,
455 RenewLease())));
456 }
457
458 void OSDService::start_shutdown()
459 {
460 {
461 std::lock_guard l(agent_timer_lock);
462 agent_timer.shutdown();
463 }
464
465 {
466 std::lock_guard l(sleep_lock);
467 sleep_timer.shutdown();
468 }
469
470 {
471 std::lock_guard l(recovery_request_lock);
472 recovery_request_timer.shutdown();
473 }
474 }
475
// Drain any queued reservation callbacks, then stop the reserver's
// finisher thread.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
481
482 void OSDService::shutdown()
483 {
484 mono_timer.suspend();
485
486 {
487 std::lock_guard l(watch_lock);
488 watch_timer.shutdown();
489 }
490
491 objecter->shutdown();
492 for (auto& f : objecter_finishers) {
493 f->wait_for_empty();
494 f->stop();
495 }
496
497 publish_map(OSDMapRef());
498 next_osdmap = OSDMapRef();
499 }
500
501 void OSDService::init()
502 {
503 reserver_finisher.start();
504 for (auto& f : objecter_finishers) {
505 f->start();
506 }
507 objecter->set_client_incarnation(0);
508
509 // deprioritize objecter in daemonperf output
510 objecter->get_logger()->set_prio_adjust(-3);
511
512 watch_timer.init();
513 agent_timer.init();
514 mono_timer.resume();
515
516 agent_thread.create("osd_srv_agent");
517
518 if (cct->_conf->osd_recovery_delay_start)
519 defer_recovery(cct->_conf->osd_recovery_delay_start);
520 }
521
// Late-stage init: start the objecter against the current osdmap.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
526
527 void OSDService::activate_map()
528 {
529 // wake/unwake the tiering agent
530 std::lock_guard l{agent_lock};
531 agent_active =
532 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
533 osd->is_active();
534 agent_cond.notify_all();
535 }
536
// Ask the OSD to subscribe to osdmaps starting at epoch e
// (non-onetime subscription).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
541
542
543 class AgentTimeoutCB : public Context {
544 PGRef pg;
545 public:
546 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
547 void finish(int) override {
548 pg->agent_choose_mode_restart();
549 }
550 };
551
// Body of the tiering-agent worker thread.  Repeatedly picks the
// highest-priority level in agent_queue and lets one of its PGs do
// flush/evict work; loops until agent_stop() sets agent_stop_flag.
// agent_lock is held except while the PG is actually doing work.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    // Work on the highest level present in the queue (rbegin of the
    // level -> PG-set map).
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // Quota: remaining op budget; a lower flush budget applies unless
    // some PG is in high-speed flush mode.
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    // Round-robin across the PGs at this level; agent_valid_iterator
    // is cleared elsewhere when the set is mutated, forcing a restart
    // from begin().
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    // Drop the lock while the PG does (possibly slow) agent work.
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
607
608 void OSDService::agent_stop()
609 {
610 {
611 std::lock_guard l(agent_lock);
612
613 // By this time all ops should be cancelled
614 ceph_assert(agent_ops == 0);
615 // By this time all PGs are shutdown and dequeued
616 if (!agent_queue.empty()) {
617 set<PGRef>& top = agent_queue.rbegin()->second;
618 derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
619 ceph_abort_msg("agent queue not empty");
620 }
621
622 agent_stop_flag = true;
623 agent_cond.notify_all();
624 }
625 agent_thread.join();
626 }
627
628 // -------------------------------------
629
// Periodically recompute promote_probability_millis (the per-mille
// probability that a read triggers a cache-tier promotion) so that the
// observed promotion rate tracks the configured object/sec and
// bytes/sec targets, and set hard per-interval caps.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  // Sample (and decay) the promotion counters since the last call.
  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << byte_u_t(bytes) << "; target "
           << target_obj_sec << " obj/sec or "
           << byte_u_t(target_bytes_sec) << "/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb: per-mille probabilities that would hit the object-rate
    // and byte-rate targets respectively over this interval.
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);   // honor the tighter target
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;               // no targets: promote everything
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: move halfway from the current probability toward the goal,
  // clamped to [min_prob, 1000].
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
700
701 // -------------------------------------
702
703 float OSDService::get_failsafe_full_ratio()
704 {
705 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
706 if (full_ratio > 1.0) full_ratio /= 100.0;
707 return full_ratio;
708 }
709
// Classify the OSD's fullness (NONE..FAILSAFE) from the logical usage
// ratio and the physical usage ratio (pratio), using thresholds from
// the current osdmap.  `inject` is set to a marker string when an
// injected full state is returned (testing hook).
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  // Force the thresholds into non-decreasing order:
  // nearfull <= backfillfull <= full <= failsafe.
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  // Note: FAILSAFE and NEARFULL are judged on the physical ratio,
  // FULL and BACKFILLFULL on the (possibly adjusted) logical ratio.
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
756
757 void OSDService::check_full_status(float ratio, float pratio)
758 {
759 std::lock_guard l(full_status_lock);
760
761 cur_ratio = ratio;
762 physical_ratio = pratio;
763
764 string inject;
765 s_names new_state;
766 new_state = recalc_full_state(ratio, pratio, inject);
767
768 dout(20) << __func__ << " cur ratio " << ratio
769 << ", physical ratio " << pratio
770 << ", new state " << get_full_state_name(new_state)
771 << " " << inject
772 << dendl;
773
774 // warn
775 if (cur_state != new_state) {
776 dout(10) << __func__ << " " << get_full_state_name(cur_state)
777 << " -> " << get_full_state_name(new_state) << dendl;
778 if (new_state == FAILSAFE) {
779 clog->error() << "full status failsafe engaged, dropping updates, now "
780 << (int)roundf(ratio * 100) << "% full";
781 } else if (cur_state == FAILSAFE) {
782 clog->error() << "full status failsafe disengaged, no longer dropping "
783 << "updates, now " << (int)roundf(ratio * 100) << "% full";
784 }
785 cur_state = new_state;
786 }
787 }
788
789 bool OSDService::need_fullness_update()
790 {
791 OSDMapRef osdmap = get_osdmap();
792 s_names cur = NONE;
793 if (osdmap->exists(whoami)) {
794 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
795 cur = FULL;
796 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
797 cur = BACKFILLFULL;
798 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
799 cur = NEARFULL;
800 }
801 }
802 s_names want = NONE;
803 if (is_full())
804 want = FULL;
805 else if (is_backfillfull())
806 want = BACKFILLFULL;
807 else if (is_nearfull())
808 want = NEARFULL;
809 return want != cur;
810 }
811
// Testing hook: report an injected full state of at least `type`.
// NOTE: this decrements injectfull inside a const method, so
// injectfull is presumably declared mutable in OSD.h — confirm there.
bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
                       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
                       << dendl;
    return true;
  }
  return false;
}
826
827 bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
828 {
829 std::lock_guard l(full_status_lock);
830
831 if (_check_inject_full(dpp, type))
832 return true;
833
834 if (cur_state >= type)
835 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
836 << " physical " << physical_ratio << dendl;
837
838 return cur_state >= type;
839 }
840
// "What if" fullness check: would the OSD be at least `type` full if
// adjust_used additional bytes were consumed on top of adjusted_stat?
// Used (via tentative_backfill_full) before accepting backfill work.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // Injected full states short-circuit the computation.
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  // Recompute ratios with the hypothetical extra usage applied.
  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
862
// At or above the failsafe-full threshold?
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}
867
// At or above the full threshold?
bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}
872
// Would adjust_used extra bytes push us to backfillfull or beyond?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}
877
// At or above the backfillfull threshold?
bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}
882
// At or above the nearfull threshold?
bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
887
// Exactly in FAILSAFE state (no logging, no inject hook).
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}
893
// FULL or worse (FAILSAFE).
bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}
899
// BACKFILLFULL or worse.
bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
905
// NEARFULL or worse.
bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
911
// Testing hook: force the given fullness state for `count` checks
// (count < 0 means "until cleared"); consumed by _check_inject_full().
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
918
// Publish new store statfs numbers (and any objectstore alerts) into
// osd_stat and the perf counters.  When fake_statfs_for_testing is
// set, total/available are synthesized from the configured fake total
// and the PGs' logical byte counts.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
                            osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
            << " adjust available " << avail
            << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
961
962 osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
963 int num_pgs)
964 {
965 utime_t now = ceph_clock_now();
966 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
967 std::lock_guard l(stat_lock);
968 osd_stat.hb_peers.swap(hb_peers);
969 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
970 osd_stat.num_pgs = num_pgs;
971 // Clean entries that aren't updated
972 // This is called often enough that we can just remove 1 at a time
973 for (auto i: osd_stat.hb_pingtime) {
974 if (i.second.last_update == 0)
975 continue;
976 if (stale_time && now.sec() - i.second.last_update > stale_time) {
977 dout(20) << __func__ << " time out heartbeat for osd " << i.first
978 << " last_update " << i.second.last_update << dendl;
979 osd_stat.hb_pingtime.erase(i.first);
980 break;
981 }
982 }
983 return osd_stat;
984 }
985
986 void OSDService::inc_osd_stat_repaired()
987 {
988 std::lock_guard l(stat_lock);
989 osd_stat.num_shards_repaired++;
990 return;
991 }
992
// Compute the logical usage ratio after (a) charging adjust_used extra
// bytes against available space and (b) letting every PG fold pending
// backfill data into the stats.  *pratio receives the raw physical
// ratio computed from new_stat before any adjustment.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                         uint64_t adjust_used)
{
  // Physical ratio: unadjusted used / total.
  *pratio =
    ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    // Charge the hypothetical extra usage by shrinking 'available'
    // (clamped at zero).
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used()) / ((float)new_stat.statfs.total);
}
1020
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  // Send a message to a peer OSD over the cluster messenger, but only
  // if the peer is still up and has not restarted since from_epoch;
  // otherwise the message is dropped (the peer no longer wants it).
  // Takes ownership of m's reference in either case.
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();                // drop our ref; message is not sent
    release_map(next_map);   // balance get_nextmap_reserved() on every path
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    // sending to ourselves: use the loopback connection
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);  // piggyback a newer map if needed
  peer_con->send_message(m);
  release_map(next_map);
}
1044
1045 void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
1046 {
1047 OSDMapRef next_map = get_nextmap_reserved();
1048 // service map is always newer/newest
1049 ceph_assert(from_epoch <= next_map->get_epoch());
1050
1051 for (auto& iter : messages) {
1052 if (next_map->is_down(iter.first) ||
1053 next_map->get_info(iter.first).up_from > from_epoch) {
1054 iter.second->put();
1055 continue;
1056 }
1057 ConnectionRef peer_con;
1058 if (iter.first == whoami) {
1059 peer_con = osd->cluster_messenger->get_loopback_connection();
1060 } else {
1061 peer_con = osd->cluster_messenger->connect_to_osd(
1062 next_map->get_cluster_addrs(iter.first), false, true);
1063 }
1064 maybe_share_map(peer_con.get(), next_map);
1065 peer_con->send_message(iter.second);
1066 }
1067 release_map(next_map);
1068 }
1069 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1070 {
1071 OSDMapRef next_map = get_nextmap_reserved();
1072 // service map is always newer/newest
1073 ceph_assert(from_epoch <= next_map->get_epoch());
1074
1075 if (next_map->is_down(peer) ||
1076 next_map->get_info(peer).up_from > from_epoch) {
1077 release_map(next_map);
1078 return NULL;
1079 }
1080 ConnectionRef con;
1081 if (peer == whoami) {
1082 con = osd->cluster_messenger->get_loopback_connection();
1083 } else {
1084 con = osd->cluster_messenger->connect_to_osd(
1085 next_map->get_cluster_addrs(peer), false, true);
1086 }
1087 release_map(next_map);
1088 return con;
1089 }
1090
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  // Return the (back, front) heartbeat connections to 'peer', or a
  // pair of null refs if the peer is down or restarted since from_epoch.
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;  // both refs null
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}
1110
// Name by which this OSD appears on the cluster (internal) network.
entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}
1115
1116 void OSDService::queue_want_pg_temp(pg_t pgid,
1117 const vector<int>& want,
1118 bool forced)
1119 {
1120 std::lock_guard l(pg_temp_lock);
1121 auto p = pg_temp_pending.find(pgid);
1122 if (p == pg_temp_pending.end() ||
1123 p->second.acting != want ||
1124 forced) {
1125 pg_temp_wanted[pgid] = {want, forced};
1126 }
1127 }
1128
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  // Drop any wanted or mon-pending pg_temp request for this pg.
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
1135
void OSDService::_sent_pg_temp()
{
  // Move everything we just sent from 'wanted' to 'pending' (awaiting
  // the monitor's response).  Caller must hold pg_temp_lock.
#ifdef HAVE_STDLIB_MAP_SPLICING
  // C++17 node splicing: no per-element reallocation
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1146
void OSDService::requeue_pg_temp()
{
  // Re-arm all pg_temp requests (e.g. after a mon session reset) so
  // the next send_pg_temp() resends everything, wanted and pending.
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  // fold wanted into pending (wanted entries win), then swap the maps
  // so the combined set ends up back in 'wanted'
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
	   << pg_temp_wanted.size() << dendl;
}
1159
1160 std::ostream& operator<<(std::ostream& out,
1161 const OSDService::pg_temp_t& pg_temp)
1162 {
1163 out << pg_temp.acting;
1164 if (pg_temp.forced) {
1165 out << " (forced)";
1166 }
1167 return out;
1168 }
1169
void OSDService::send_pg_temp()
{
  // Flush all wanted pg_temp requests to the monitor, then move them
  // to 'pending' until the mon responds.
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  // Batch into at most two messages: ms[0] for normal requests,
  // ms[1] for forced ones (the bool 'forced' is used as the index).
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      // lazily allocate each message on first use
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}
1192
1193 void OSDService::send_pg_created(pg_t pgid)
1194 {
1195 std::lock_guard l(pg_created_lock);
1196 dout(20) << __func__ << dendl;
1197 auto o = get_osdmap();
1198 if (o->require_osd_release >= ceph_release_t::luminous) {
1199 pg_created.insert(pgid);
1200 monc->send_mon_message(new MOSDPGCreated(pgid));
1201 }
1202 }
1203
1204 void OSDService::send_pg_created()
1205 {
1206 std::lock_guard l(pg_created_lock);
1207 dout(20) << __func__ << dendl;
1208 auto o = get_osdmap();
1209 if (o->require_osd_release >= ceph_release_t::luminous) {
1210 for (auto pgid : pg_created) {
1211 monc->send_mon_message(new MOSDPGCreated(pgid));
1212 }
1213 }
1214 }
1215
1216 void OSDService::prune_pg_created()
1217 {
1218 std::lock_guard l(pg_created_lock);
1219 dout(20) << __func__ << dendl;
1220 auto o = get_osdmap();
1221 auto i = pg_created.begin();
1222 while (i != pg_created.end()) {
1223 auto p = o->get_pg_pool(i->pool());
1224 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1225 dout(20) << __func__ << " pruning " << *i << dendl;
1226 i = pg_created.erase(i);
1227 } else {
1228 dout(20) << __func__ << " keeping " << *i << dendl;
1229 ++i;
1230 }
1231 }
1232 }
1233
1234
1235 // --------------------------------------
1236 // dispatch
1237
1238 bool OSDService::can_inc_scrubs()
1239 {
1240 bool can_inc = false;
1241 std::lock_guard l(sched_scrub_lock);
1242
1243 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1244 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1245 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
1246 can_inc = true;
1247 } else {
1248 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1249 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1250 }
1251
1252 return can_inc;
1253 }
1254
1255 bool OSDService::inc_scrubs_local()
1256 {
1257 bool result = false;
1258 std::lock_guard l{sched_scrub_lock};
1259 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1260 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1261 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1262 result = true;
1263 ++scrubs_local;
1264 } else {
1265 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1266 }
1267 return result;
1268 }
1269
1270 void OSDService::dec_scrubs_local()
1271 {
1272 std::lock_guard l{sched_scrub_lock};
1273 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
1274 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1275 --scrubs_local;
1276 ceph_assert(scrubs_local >= 0);
1277 }
1278
1279 bool OSDService::inc_scrubs_remote()
1280 {
1281 bool result = false;
1282 std::lock_guard l{sched_scrub_lock};
1283 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1284 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1285 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1286 result = true;
1287 ++scrubs_remote;
1288 } else {
1289 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1290 }
1291 return result;
1292 }
1293
1294 void OSDService::dec_scrubs_remote()
1295 {
1296 std::lock_guard l{sched_scrub_lock};
1297 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
1298 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1299 --scrubs_remote;
1300 ceph_assert(scrubs_remote >= 0);
1301 }
1302
// Dump the current scrub reservation counters (admin socket helper).
void OSDService::dump_scrub_reservations(Formatter *f)
{
  std::lock_guard l{sched_scrub_lock};
  f->dump_int("scrubs_local", scrubs_local);
  f->dump_int("scrubs_remote", scrubs_remote);
  f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}
1310
1311 void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1312 epoch_t *_bind_epoch) const
1313 {
1314 std::lock_guard l(epoch_lock);
1315 if (_boot_epoch)
1316 *_boot_epoch = boot_epoch;
1317 if (_up_epoch)
1318 *_up_epoch = up_epoch;
1319 if (_bind_epoch)
1320 *_bind_epoch = bind_epoch;
1321 }
1322
1323 void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1324 const epoch_t *_bind_epoch)
1325 {
1326 std::lock_guard l(epoch_lock);
1327 if (_boot_epoch) {
1328 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1329 boot_epoch = *_boot_epoch;
1330 }
1331 if (_up_epoch) {
1332 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1333 up_epoch = *_up_epoch;
1334 }
1335 if (_bind_epoch) {
1336 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1337 bind_epoch = *_bind_epoch;
1338 }
1339 }
1340
bool OSDService::prepare_to_stop()
{
  // Begin an orderly shutdown.  Returns false if a shutdown is already
  // in progress; otherwise transitions to STOPPING and returns true.
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    // Tell the mon we are going down so it can mark us down right
    // away instead of waiting for heartbeat failure detection.
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
	monc->get_fsid(),
	whoami,
	osdmap->get_addrs(whoami),
	osdmap->get_epoch(),
	true  // request ack
	));
    // Wait (bounded by osd_mon_shutdown_timeout) for got_stop_ack()
    // to flip the state to STOPPING.
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }
  // Proceed regardless of whether the ack arrived before the timeout.
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1367
1368 void OSDService::got_stop_ack()
1369 {
1370 std::scoped_lock l(is_stopping_lock);
1371 if (get_state() == PREPARING_TO_STOP) {
1372 dout(0) << __func__ << " starting shutdown" << dendl;
1373 set_state(STOPPING);
1374 is_stopping_cond.notify_all();
1375 } else {
1376 dout(10) << __func__ << " ignoring msg" << dendl;
1377 }
1378 }
1379
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  // Build an MOSDMap carrying map epochs in (since, to].  Normally
  // these are incrementals; if epoch since+1 has been trimmed we start
  // with a full map.  The payload is bounded by osd_map_message_max
  // (count) and osd_map_message_max_bytes, so it may stop before 'to';
  // the peer will request the remainder later.
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
			   osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
	     << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since].claim(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else {
      // incremental is gone; fall back to the full map for this epoch
      derr << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
	derr << __func__ << " also missing full map " << e << dendl;
	goto panic;
      }
      m->maps[e].claim(bl);
    }
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      // budget exhausted; send what we have so far
      break;
    }
  }
  return m;

 panic:
  // We could not load a map we expected to have.  Still return a
  // usable message: whatever was collected so far, or at minimum the
  // newest map, so the peer can make progress.
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map].claim(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      // cannot even load our own newest map: unrecoverable
      derr << __func__ << " unable to load latest full map " << m->newest_map
	   << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map].claim(bl);
  }
  return m;
}
1447
// Hand a prepared MOSDMap off to the given connection.
void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}
1452
1453 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1454 const OSDMapRef& osdmap)
1455 {
1456 epoch_t to = osdmap->get_epoch();
1457 dout(10) << "send_incremental_map " << since << " -> " << to
1458 << " to " << con << " " << con->get_peer_addr() << dendl;
1459
1460 MOSDMap *m = NULL;
1461 while (!m) {
1462 OSDSuperblock sblock(get_superblock());
1463 if (since < sblock.oldest_map) {
1464 // just send latest full map
1465 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1466 osdmap->get_encoding_features());
1467 m->oldest_map = max_oldest_map;
1468 m->newest_map = sblock.newest_map;
1469 get_map_bl(to, m->maps[to]);
1470 send_map(m, con);
1471 return;
1472 }
1473
1474 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1475 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl;
1477 since = to - cct->_conf->osd_map_share_max_epochs;
1478 }
1479
1480 m = build_incremental_map_msg(since, to, sblock);
1481 }
1482 send_map(m, con);
1483 }
1484
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  // Fetch the encoded full map for epoch e: LRU cache first, then the
  // object store.  Returns false if the map is unavailable.
  // NOTE(review): the leading underscore and the lock taken by the
  // analogous get_inc_map_bl() suggest callers must hold
  // map_cache_lock here -- confirm at call sites.
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    // populate the cache for the next reader
    _add_map_bl(e, bl);
  }
  return found;
}
1503
1504 bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1505 {
1506 std::lock_guard l(map_cache_lock);
1507 bool found = map_bl_inc_cache.lookup(e, &bl);
1508 if (found) {
1509 if (logger)
1510 logger->inc(l_osd_map_bl_cache_hit);
1511 return true;
1512 }
1513 if (logger)
1514 logger->inc(l_osd_map_bl_cache_miss);
1515 found = store->read(meta_ch,
1516 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1518 if (found) {
1519 _add_map_inc_bl(e, bl);
1520 }
1521 return found;
1522 }
1523
1524 void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1525 {
1526 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
1527 // cache a contiguous buffer
1528 if (bl.get_num_buffers() > 1) {
1529 bl.rebuild();
1530 }
1531 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1532 map_bl_cache.add(e, bl);
1533 }
1534
1535 void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1536 {
1537 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
1538 // cache a contiguous buffer
1539 if (bl.get_num_buffers() > 1) {
1540 bl.rebuild();
1541 }
1542 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1543 map_bl_inc_cache.add(e, bl);
1544 }
1545
1546 OSDMapRef OSDService::_add_map(OSDMap *o)
1547 {
1548 epoch_t e = o->get_epoch();
1549
1550 if (cct->_conf->osd_map_dedup) {
1551 // Dedup against an existing map at a nearby epoch
1552 OSDMapRef for_dedup = map_cache.lower_bound(e);
1553 if (for_dedup) {
1554 OSDMap::dedup(for_dedup.get(), o);
1555 }
1556 }
1557 bool existed;
1558 OSDMapRef l = map_cache.add(e, o, &existed);
1559 if (existed) {
1560 delete o;
1561 }
1562 return l;
1563 }
1564
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  // Return the OSDMap for 'epoch', loading and decoding it from the
  // store on a cache miss.  Returns a null ref if the map cannot be
  // loaded; epoch 0 yields a fresh, empty map.
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      // miss because the requested epoch has aged out of the cache
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}
1601
1602 // ops
1603
1604
// Convenience overload: error reply with no version info or op returns.
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}
1609
1610 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1611 version_t uv,
1612 vector<pg_log_op_return_item_t> op_returns)
1613 {
1614 auto m = op->get_req<MOSDOp>();
1615 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1616 int flags;
1617 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1618
1619 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1620 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
1621 reply->set_reply_versions(v, uv);
1622 reply->set_op_returns(op_returns);
1623 m->get_connection()->send_message(reply);
1624 }
1625
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  // Debug aid (gated by osd_debug_misdirected_ops): emit a clog
  // warning when a client op lands on a PG we are not primary for.
  // EC false positives (see comment below) are dropped silently.
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      // the client's map epoch has been trimmed; can't check, so drop
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
	pgid.shard != pg->pg_id.shard) {
      // shard mapping changed between epochs: benign, drop quietly
      dout(7) << __func__ << ": " << *pg << " primary changed since "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
	       << " pg " << m->get_raw_pg()
	       << " to osd." << whoami
	       << " not " << pg->get_acting()
	       << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1680
// Queue an item at the back of the sharded op work queue.
void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}
1685
// Queue an item at the front of the sharded op work queue
// (used for requeues that must not lose their place).
void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}
1690
void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c)
{
  // Schedule a recovery continuation for this pg on the op queue,
  // at recovery cost/priority, tagged with the current map epoch.
  epoch_t e = get_osdmap_epoch();
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGRecoveryContext(pg->get_pgid(), c, e)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}
1706
void OSDService::queue_for_snap_trim(PG *pg)
{
  // Schedule snapshot trimming for this pg at snap-trim cost/priority.
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}
1720
void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
{
  // Schedule a scrub for this pg.  When with_high_priority is set, the
  // item is bumped to at least client-op priority so it isn't starved.
  unsigned scrub_queue_priority = pg->scrubber.priority;
  if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
    scrub_queue_priority = cct->_conf->osd_client_op_priority;
  }
  const auto epoch = pg->get_osdmap_epoch();
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
      cct->_conf->osd_scrub_cost,
      scrub_queue_priority,
      ceph_clock_now(),
      0,
      epoch));
}
1737
void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
{
  // Schedule asynchronous deletion of this pg's data (runs in chunks
  // via PGDelete items on the op queue).
  dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGDelete(pgid, e)),
      cct->_conf->osd_pg_delete_cost,
      cct->_conf->osd_pg_delete_priority,
      ceph_clock_now(),
      0,
      e));
}
1751
// Thin forwarder to OSD::try_finish_pg_delete.
// NOTE(review): the return value's exact semantics are defined by the
// OSD-side implementation -- confirm there before relying on them.
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1756
1757 // ---
1758
1759 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1760 {
1761 std::lock_guard l(merge_lock);
1762 dout(10) << __func__ << " " << pg->pg_id << dendl;
1763 ready_to_merge_source[pg->pg_id.pgid] = version;
1764 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1765 _send_ready_to_merge();
1766 }
1767
1768 void OSDService::set_ready_to_merge_target(PG *pg,
1769 eversion_t version,
1770 epoch_t last_epoch_started,
1771 epoch_t last_epoch_clean)
1772 {
1773 std::lock_guard l(merge_lock);
1774 dout(10) << __func__ << " " << pg->pg_id << dendl;
1775 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1776 make_tuple(version,
1777 last_epoch_started,
1778 last_epoch_clean)));
1779 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1780 _send_ready_to_merge();
1781 }
1782
1783 void OSDService::set_not_ready_to_merge_source(pg_t source)
1784 {
1785 std::lock_guard l(merge_lock);
1786 dout(10) << __func__ << " " << source << dendl;
1787 not_ready_to_merge_source.insert(source);
1788 assert(ready_to_merge_source.count(source) == 0);
1789 _send_ready_to_merge();
1790 }
1791
1792 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1793 {
1794 std::lock_guard l(merge_lock);
1795 dout(10) << __func__ << " " << target << " source " << source << dendl;
1796 not_ready_to_merge_target[target] = source;
1797 assert(ready_to_merge_target.count(target) == 0);
1798 _send_ready_to_merge();
1799 }
1800
// Public entry point: take merge_lock and flush any pending
// (not-)ready-to-merge notifications to the monitor.
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1806
1807 void OSDService::_send_ready_to_merge()
1808 {
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1815 << dendl;
1816 for (auto src : not_ready_to_merge_source) {
1817 if (sent_ready_to_merge_source.count(src) == 0) {
1818 monc->send_mon_message(new MOSDPGReadyToMerge(
1819 src,
1820 {}, {}, 0, 0,
1821 false,
1822 osdmap->get_epoch()));
1823 sent_ready_to_merge_source.insert(src);
1824 }
1825 }
1826 for (auto p : not_ready_to_merge_target) {
1827 if (sent_ready_to_merge_source.count(p.second) == 0) {
1828 monc->send_mon_message(new MOSDPGReadyToMerge(
1829 p.second,
1830 {}, {}, 0, 0,
1831 false,
1832 osdmap->get_epoch()));
1833 sent_ready_to_merge_source.insert(p.second);
1834 }
1835 }
1836 for (auto src : ready_to_merge_source) {
1837 if (not_ready_to_merge_source.count(src.first) ||
1838 not_ready_to_merge_target.count(src.first.get_parent())) {
1839 continue;
1840 }
1841 auto p = ready_to_merge_target.find(src.first.get_parent());
1842 if (p != ready_to_merge_target.end() &&
1843 sent_ready_to_merge_source.count(src.first) == 0) {
1844 monc->send_mon_message(new MOSDPGReadyToMerge(
1845 src.first, // source pgid
1846 src.second, // src version
1847 std::get<0>(p->second), // target version
1848 std::get<1>(p->second), // PG's last_epoch_started
1849 std::get<2>(p->second), // PG's last_epoch_clean
1850 true,
1851 osdmap->get_epoch()));
1852 sent_ready_to_merge_source.insert(src.first);
1853 }
1854 }
1855 }
1856
1857 void OSDService::clear_ready_to_merge(PG *pg)
1858 {
1859 std::lock_guard l(merge_lock);
1860 dout(10) << __func__ << " " << pg->pg_id << dendl;
1861 ready_to_merge_source.erase(pg->pg_id.pgid);
1862 ready_to_merge_target.erase(pg->pg_id.pgid);
1863 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1864 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1865 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1866 }
1867
// Forget which ready-to-merge notices were already sent, so the next
// _send_ready_to_merge() resends everything.
void OSDService::clear_sent_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  sent_ready_to_merge_source.clear();
}
1873
1874 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1875 {
1876 std::lock_guard l(merge_lock);
1877 auto i = sent_ready_to_merge_source.begin();
1878 while (i != sent_ready_to_merge_source.end()) {
1879 if (!osdmap->pg_exists(*i)) {
1880 dout(10) << __func__ << " " << *i << dendl;
1881 i = sent_ready_to_merge_source.erase(i);
1882 } else {
1883 ++i;
1884 }
1885 }
1886 }
1887
1888 // ---
1889
void OSDService::_queue_for_recovery(
  std::pair<epoch_t, PGRef> p,
  uint64_t reserved_pushes)
{
  // Queue a PGRecovery item for this pg with the number of pushes we
  // have reserved.  Caller must hold recovery_lock (asserted below).
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGRecovery(
	  p.second->get_pgid(), p.first, reserved_pushes)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      p.first));
}
1906
1907 // ====================================================================
1908 // OSD
1909
1910 #undef dout_prefix
1911 #define dout_prefix *_dout
1912
1913 // Commands shared between OSD's console and admin console:
namespace ceph {
namespace osd_cmds {

// Forward declaration; shared by the OSD's admin socket and tell
// interfaces.  Defined later in this file.
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

}} // namespace ceph::osd_cmds
1920
1921 int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami)
1922 {
1923 int ret;
1924
1925 OSDSuperblock sb;
1926 bufferlist sbbl;
1927 ObjectStore::CollectionHandle ch;
1928
1929 // if we are fed a uuid for this osd, use it.
1930 store->set_fsid(cct->_conf->osd_uuid);
1931
1932 ret = store->mkfs();
1933 if (ret) {
1934 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret) << dendl;
1936 goto free_store;
1937 }
1938
1939 store->set_cache_shards(1); // doesn't matter for mkfs!
1940
1941 ret = store->mount();
1942 if (ret) {
1943 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret) << dendl;
1945 goto free_store;
1946 }
1947
1948 ch = store->open_collection(coll_t::meta());
1949 if (ch) {
1950 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1951 if (ret < 0) {
1952 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
1953 goto free_store;
1954 }
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl;
1957 auto p = sbbl.cbegin();
1958 decode(sb, p);
1959 if (whoami != sb.whoami) {
1960 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1961 << dendl;
1962 ret = -EINVAL;
1963 goto umount_store;
1964 }
1965 if (fsid != sb.cluster_fsid) {
1966 derr << "provided cluster fsid " << fsid
1967 << " != superblock's " << sb.cluster_fsid << dendl;
1968 ret = -EINVAL;
1969 goto umount_store;
1970 }
1971 } else {
1972 // create superblock
1973 sb.cluster_fsid = fsid;
1974 sb.osd_fsid = store->get_fsid();
1975 sb.whoami = whoami;
1976 sb.compat_features = get_osd_initial_compat_set();
1977
1978 bufferlist bl;
1979 encode(sb, bl);
1980
1981 ObjectStore::CollectionHandle ch = store->create_new_collection(
1982 coll_t::meta());
1983 ObjectStore::Transaction t;
1984 t.create_collection(coll_t::meta(), 0);
1985 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
1986 ret = store->queue_transaction(ch, std::move(t));
1987 if (ret) {
1988 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1989 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
1990 goto umount_store;
1991 }
1992 }
1993
1994 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
1995 if (ret) {
1996 derr << "OSD::mkfs: failed to write fsid file: error "
1997 << cpp_strerror(ret) << dendl;
1998 goto umount_store;
1999 }
2000
2001 umount_store:
2002 if (ch) {
2003 ch.reset();
2004 }
2005 store->umount();
2006 free_store:
2007 delete store;
2008 return ret;
2009 }
2010
2011 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
2012 {
2013 char val[80];
2014 int r;
2015
2016 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2017 r = store->write_meta("magic", val);
2018 if (r < 0)
2019 return r;
2020
2021 snprintf(val, sizeof(val), "%d", whoami);
2022 r = store->write_meta("whoami", val);
2023 if (r < 0)
2024 return r;
2025
2026 cluster_fsid.print(val);
2027 r = store->write_meta("ceph_fsid", val);
2028 if (r < 0)
2029 return r;
2030
2031 string key = cct->_conf.get_val<string>("key");
2032 if (key.size()) {
2033 r = store->write_meta("osd_key", key);
2034 if (r < 0)
2035 return r;
2036 } else {
2037 string keyfile = cct->_conf.get_val<string>("keyfile");
2038 if (!keyfile.empty()) {
2039 bufferlist keybl;
2040 string err;
2041 r = keybl.read_file(keyfile.c_str(), &err);
2042 if (r < 0) {
2043 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2044 << err << ": " << cpp_strerror(r) << dendl;
2045 return r;
2046 }
2047 r = store->write_meta("osd_key", keybl.to_str());
2048 if (r < 0)
2049 return r;
2050 }
2051 }
2052
2053 r = store->write_meta("ready", "ready");
2054 if (r < 0)
2055 return r;
2056
2057 return 0;
2058 }
2059
2060 int OSD::peek_meta(ObjectStore *store,
2061 std::string *magic,
2062 uuid_d *cluster_fsid,
2063 uuid_d *osd_fsid,
2064 int *whoami,
2065 ceph_release_t *require_osd_release)
2066 {
2067 string val;
2068
2069 int r = store->read_meta("magic", &val);
2070 if (r < 0)
2071 return r;
2072 *magic = val;
2073
2074 r = store->read_meta("whoami", &val);
2075 if (r < 0)
2076 return r;
2077 *whoami = atoi(val.c_str());
2078
2079 r = store->read_meta("ceph_fsid", &val);
2080 if (r < 0)
2081 return r;
2082 r = cluster_fsid->parse(val.c_str());
2083 if (!r)
2084 return -EINVAL;
2085
2086 r = store->read_meta("fsid", &val);
2087 if (r < 0) {
2088 *osd_fsid = uuid_d();
2089 } else {
2090 r = osd_fsid->parse(val.c_str());
2091 if (!r)
2092 return -EINVAL;
2093 }
2094
2095 r = store->read_meta("require_osd_release", &val);
2096 if (r >= 0) {
2097 *require_osd_release = ceph_release_from_name(val);
2098 }
2099
2100 return 0;
2101 }
2102
2103
2104 #undef dout_prefix
2105 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2106
2107 // cons/des
2108
// Construct an OSD bound to an already-opened ObjectStore and the set of
// messengers created by the daemon's main().  The constructor only wires
// up dependencies, applies op-tracker configuration, and creates the
// sharded op-queue shards; the heavy startup work happens later in
// pre_init()/init().
// NOTE: initializer order below must match the member declaration order
// in OSD.h — do not reorder entries here.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  tick_timer(cct, osd_lock),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger, &mc->monmap),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  last_pg_create_epoch(0),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  service(this)
{

  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
     The default client keytab is used, if it is present and readable,
     to automatically obtain initial credentials for GSSAPI client
     applications. The principal name of the first entry in the client
     keytab is used by default when obtaining initial credentials.
     1. The KRB5_CLIENT_KTNAME environment variable.
     2. The default_client_keytab_name profile variable in [libdefaults].
     3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
				    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // Apply op-tracker tuning from the configuration: when to complain
  // about slow ops, and how much op history to retain for asok dumps.
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
					 cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
					   cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
						    cct->_conf->osd_op_history_slow_op_threshold);
  ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
#ifdef WITH_BLKIN
  // Label blkin traces with this OSD's identity.
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this);
    shards.push_back(one_shard);
  }
}
2209
2210 OSD::~OSD()
2211 {
2212 while (!shards.empty()) {
2213 delete shards.back();
2214 shards.pop_back();
2215 }
2216 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2217 cct->get_perfcounters_collection()->remove(logger);
2218 delete recoverystate_perf;
2219 delete logger;
2220 delete store;
2221 }
2222
2223 double OSD::get_tick_interval() const
2224 {
2225 // vary +/- 5% to avoid scrub scheduling livelocks
2226 constexpr auto delta = 0.05;
2227 return (OSD_TICK_INTERVAL *
2228 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2229 }
2230
2231 void OSD::handle_signal(int signum)
2232 {
2233 ceph_assert(signum == SIGINT || signum == SIGTERM);
2234 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2235 shutdown();
2236 }
2237
2238 int OSD::pre_init()
2239 {
2240 std::lock_guard lock(osd_lock);
2241 if (is_stopping())
2242 return 0;
2243
2244 if (store->test_mount_in_use()) {
2245 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2246 << "currently in use. (Is ceph-osd already running?)" << dendl;
2247 return -EBUSY;
2248 }
2249
2250 cct->_conf.add_observer(this);
2251 return 0;
2252 }
2253
// Best-effort: determine whether the object store and both network
// interfaces live on the same NUMA node and, if so (or if the operator
// forced a node via "osd_numa_node"), pin all OSD threads to that node's
// CPUs.  Any failure only disables the pinning; the function always
// returns 0.
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  // Resolve the interfaces backing the public (client) and cluster
  // messenger addresses, then look up each interface's NUMA node.
  // get_iface_numa_node() reports -2 in the node out-param when an
  // interface's ports span multiple nodes.
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
	    << front_node << dendl;
    // Only bother checking the cluster interface once the public one
    // resolved cleanly.
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      if (front_node == back_node &&
	  front_node == store_node) {
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	// Auto-pin only when everything agrees and the operator has not
	// disabled the automatic behavior.
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else if (front_node != back_node) {
	dout(1) << __func__ << " public and cluster network numa nodes do not match"
		<< dendl;
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    } else if (back_node == -2) {
      dout(1) << __func__ << " cluster network " << back_iface
	      << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
	   << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    dout(1) << __func__ << " public network " << front_iface
	    << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	// Give up on pinning, but continue running unpinned.
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2334
2335 // asok
2336
2337 class OSDSocketHook : public AdminSocketHook {
2338 OSD *osd;
2339 public:
2340 explicit OSDSocketHook(OSD *o) : osd(o) {}
2341 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2342 Formatter *f,
2343 std::ostream& ss,
2344 bufferlist& out) override {
2345 ceph_abort("should use async hook");
2346 }
2347 void call_async(
2348 std::string_view prefix,
2349 const cmdmap_t& cmdmap,
2350 Formatter *f,
2351 const bufferlist& inbl,
2352 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
2353 try {
2354 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2355 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2356 bufferlist empty;
2357 on_finish(-EINVAL, e.what(), empty);
2358 }
2359 }
2360 };
2361
2362 std::set<int64_t> OSD::get_mapped_pools()
2363 {
2364 std::set<int64_t> pools;
2365 std::vector<spg_t> pgids;
2366 _get_pgids(&pgids);
2367 for (const auto &pgid : pgids) {
2368 pools.insert(pgid.pool());
2369 }
2370 return pools;
2371 }
2372
2373 void OSD::asok_command(
2374 std::string_view prefix, const cmdmap_t& cmdmap,
2375 Formatter *f,
2376 const bufferlist& inbl,
2377 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2378 {
2379 int ret = 0;
2380 stringstream ss; // stderr error message stream
2381 bufferlist outbl; // if empty at end, we'll dump formatter as output
2382
2383 // --- PG commands are routed here to PG::do_command ---
2384 if (prefix == "pg" ||
2385 prefix == "query" ||
2386 prefix == "mark_unfound_lost" ||
2387 prefix == "list_unfound" ||
2388 prefix == "scrub" ||
2389 prefix == "deep_scrub"
2390 ) {
2391 string pgidstr;
2392 pg_t pgid;
2393 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2394 ss << "no pgid specified";
2395 ret = -EINVAL;
2396 goto out;
2397 }
2398 if (!pgid.parse(pgidstr.c_str())) {
2399 ss << "couldn't parse pgid '" << pgidstr << "'";
2400 ret = -EINVAL;
2401 goto out;
2402 }
2403 spg_t pcand;
2404 PGRef pg;
2405 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2406 (pg = _lookup_lock_pg(pcand))) {
2407 if (pg->is_primary()) {
2408 cmdmap_t new_cmdmap = cmdmap;
2409 try {
2410 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2411 pg->unlock();
2412 return; // the pg handler calls on_finish directly
2413 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2414 pg->unlock();
2415 ss << e.what();
2416 ret = -EINVAL;
2417 goto out;
2418 }
2419 } else {
2420 ss << "not primary for pgid " << pgid;
2421 // do not reply; they will get newer maps and realize they
2422 // need to resend.
2423 pg->unlock();
2424 ret = -EAGAIN;
2425 goto out;
2426 }
2427 } else {
2428 ss << "i don't have pgid " << pgid;
2429 ret = -ENOENT;
2430 }
2431 }
2432
2433 // --- OSD commands follow ---
2434
2435 else if (prefix == "status") {
2436 lock_guard l(osd_lock);
2437 f->open_object_section("status");
2438 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2439 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2440 f->dump_unsigned("whoami", superblock.whoami);
2441 f->dump_string("state", get_state_name(get_state()));
2442 f->dump_unsigned("oldest_map", superblock.oldest_map);
2443 f->dump_unsigned("newest_map", superblock.newest_map);
2444 f->dump_unsigned("num_pgs", num_pgs);
2445 f->close_section();
2446 } else if (prefix == "flush_journal") {
2447 store->flush_journal();
2448 } else if (prefix == "dump_ops_in_flight" ||
2449 prefix == "ops" ||
2450 prefix == "dump_blocked_ops" ||
2451 prefix == "dump_historic_ops" ||
2452 prefix == "dump_historic_ops_by_duration" ||
2453 prefix == "dump_historic_slow_ops") {
2454
2455 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2456 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2457 will start to track new ops received afterwards.";
2458
2459 set<string> filters;
2460 vector<string> filter_str;
2461 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2462 copy(filter_str.begin(), filter_str.end(),
2463 inserter(filters, filters.end()));
2464 }
2465
2466 if (prefix == "dump_ops_in_flight" ||
2467 prefix == "ops") {
2468 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2469 ss << error_str;
2470 ret = -EINVAL;
2471 goto out;
2472 }
2473 }
2474 if (prefix == "dump_blocked_ops") {
2475 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2476 ss << error_str;
2477 ret = -EINVAL;
2478 goto out;
2479 }
2480 }
2481 if (prefix == "dump_historic_ops") {
2482 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2483 ss << error_str;
2484 ret = -EINVAL;
2485 goto out;
2486 }
2487 }
2488 if (prefix == "dump_historic_ops_by_duration") {
2489 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2490 ss << error_str;
2491 ret = -EINVAL;
2492 goto out;
2493 }
2494 }
2495 if (prefix == "dump_historic_slow_ops") {
2496 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2497 ss << error_str;
2498 ret = -EINVAL;
2499 goto out;
2500 }
2501 }
2502 } else if (prefix == "dump_op_pq_state") {
2503 f->open_object_section("pq");
2504 op_shardedwq.dump(f);
2505 f->close_section();
2506 } else if (prefix == "dump_blacklist") {
2507 list<pair<entity_addr_t,utime_t> > bl;
2508 OSDMapRef curmap = service.get_osdmap();
2509
2510 f->open_array_section("blacklist");
2511 curmap->get_blacklist(&bl);
2512 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2513 it != bl.end(); ++it) {
2514 f->open_object_section("entry");
2515 f->open_object_section("entity_addr_t");
2516 it->first.dump(f);
2517 f->close_section(); //entity_addr_t
2518 it->second.localtime(f->dump_stream("expire_time"));
2519 f->close_section(); //entry
2520 }
2521 f->close_section(); //blacklist
2522 } else if (prefix == "dump_watchers") {
2523 list<obj_watch_item_t> watchers;
2524 // scan pg's
2525 vector<PGRef> pgs;
2526 _get_pgs(&pgs);
2527 for (auto& pg : pgs) {
2528 list<obj_watch_item_t> pg_watchers;
2529 pg->get_watchers(&pg_watchers);
2530 watchers.splice(watchers.end(), pg_watchers);
2531 }
2532
2533 f->open_array_section("watchers");
2534 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2535 it != watchers.end(); ++it) {
2536
2537 f->open_object_section("watch");
2538
2539 f->dump_string("namespace", it->obj.nspace);
2540 f->dump_string("object", it->obj.oid.name);
2541
2542 f->open_object_section("entity_name");
2543 it->wi.name.dump(f);
2544 f->close_section(); //entity_name_t
2545
2546 f->dump_unsigned("cookie", it->wi.cookie);
2547 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2548
2549 f->open_object_section("entity_addr_t");
2550 it->wi.addr.dump(f);
2551 f->close_section(); //entity_addr_t
2552
2553 f->close_section(); //watch
2554 }
2555
2556 f->close_section(); //watchers
2557 } else if (prefix == "dump_recovery_reservations") {
2558 f->open_object_section("reservations");
2559 f->open_object_section("local_reservations");
2560 service.local_reserver.dump(f);
2561 f->close_section();
2562 f->open_object_section("remote_reservations");
2563 service.remote_reserver.dump(f);
2564 f->close_section();
2565 f->close_section();
2566 } else if (prefix == "dump_scrub_reservations") {
2567 f->open_object_section("scrub_reservations");
2568 service.dump_scrub_reservations(f);
2569 f->close_section();
2570 } else if (prefix == "get_latest_osdmap") {
2571 get_latest_osdmap();
2572 } else if (prefix == "set_heap_property") {
2573 string property;
2574 int64_t value = 0;
2575 string error;
2576 bool success = false;
2577 if (!cmd_getval(cmdmap, "property", property)) {
2578 error = "unable to get property";
2579 success = false;
2580 } else if (!cmd_getval(cmdmap, "value", value)) {
2581 error = "unable to get value";
2582 success = false;
2583 } else if (value < 0) {
2584 error = "negative value not allowed";
2585 success = false;
2586 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2587 error = "invalid property";
2588 success = false;
2589 } else {
2590 success = true;
2591 }
2592 f->open_object_section("result");
2593 f->dump_string("error", error);
2594 f->dump_bool("success", success);
2595 f->close_section();
2596 } else if (prefix == "get_heap_property") {
2597 string property;
2598 size_t value = 0;
2599 string error;
2600 bool success = false;
2601 if (!cmd_getval(cmdmap, "property", property)) {
2602 error = "unable to get property";
2603 success = false;
2604 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2605 error = "invalid property";
2606 success = false;
2607 } else {
2608 success = true;
2609 }
2610 f->open_object_section("result");
2611 f->dump_string("error", error);
2612 f->dump_bool("success", success);
2613 f->dump_int("value", value);
2614 f->close_section();
2615 } else if (prefix == "dump_objectstore_kv_stats") {
2616 store->get_db_statistics(f);
2617 } else if (prefix == "dump_scrubs") {
2618 service.dumps_scrub(f);
2619 } else if (prefix == "calc_objectstore_db_histogram") {
2620 store->generate_db_histogram(f);
2621 } else if (prefix == "flush_store_cache") {
2622 store->flush_cache(&ss);
2623 } else if (prefix == "dump_pgstate_history") {
2624 f->open_object_section("pgstate_history");
2625 f->open_array_section("pgs");
2626 vector<PGRef> pgs;
2627 _get_pgs(&pgs);
2628 for (auto& pg : pgs) {
2629 f->open_object_section("pg");
2630 f->dump_stream("pg") << pg->pg_id;
2631 f->dump_string("currently", pg->get_current_state());
2632 pg->dump_pgstate_history(f);
2633 f->close_section();
2634 }
2635 f->close_section();
2636 f->close_section();
2637 } else if (prefix == "compact") {
2638 dout(1) << "triggering manual compaction" << dendl;
2639 auto start = ceph::coarse_mono_clock::now();
2640 store->compact();
2641 auto end = ceph::coarse_mono_clock::now();
2642 double duration = std::chrono::duration<double>(end-start).count();
2643 dout(1) << "finished manual compaction in "
2644 << duration
2645 << " seconds" << dendl;
2646 f->open_object_section("compact_result");
2647 f->dump_float("elapsed_time", duration);
2648 f->close_section();
2649 } else if (prefix == "get_mapped_pools") {
2650 f->open_array_section("mapped_pools");
2651 set<int64_t> poollist = get_mapped_pools();
2652 for (auto pool : poollist) {
2653 f->dump_int("pool_id", pool);
2654 }
2655 f->close_section();
2656 } else if (prefix == "smart") {
2657 string devid;
2658 cmd_getval(cmdmap, "devid", devid);
2659 ostringstream out;
2660 probe_smart(devid, out);
2661 outbl.append(out.str());
2662 } else if (prefix == "list_devices") {
2663 set<string> devnames;
2664 store->get_devices(&devnames);
2665 f->open_array_section("list_devices");
2666 for (auto dev : devnames) {
2667 if (dev.find("dm-") == 0) {
2668 continue;
2669 }
2670 string err;
2671 f->open_object_section("device");
2672 f->dump_string("device", "/dev/" + dev);
2673 f->dump_string("device_id", get_device_id(dev, &err));
2674 f->close_section();
2675 }
2676 f->close_section();
2677 } else if (prefix == "send_beacon") {
2678 lock_guard l(osd_lock);
2679 if (is_active()) {
2680 send_beacon(ceph::coarse_mono_clock::now());
2681 }
2682 }
2683
2684 else if (prefix == "cluster_log") {
2685 vector<string> msg;
2686 cmd_getval(cmdmap, "message", msg);
2687 if (msg.empty()) {
2688 ret = -EINVAL;
2689 ss << "ignoring empty log message";
2690 goto out;
2691 }
2692 string message = msg.front();
2693 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2694 message += " " + *a;
2695 string lvl;
2696 cmd_getval(cmdmap, "level", lvl);
2697 clog_type level = string_to_clog_type(lvl);
2698 if (level < 0) {
2699 ret = -EINVAL;
2700 ss << "unknown level '" << lvl << "'";
2701 goto out;
2702 }
2703 clog->do_log(level, message);
2704 }
2705
2706 else if (prefix == "bench") {
2707 lock_guard l(osd_lock);
2708 int64_t count;
2709 int64_t bsize;
2710 int64_t osize, onum;
2711 // default count 1G, size 4MB
2712 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2713 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2714 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2715 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2716
2717 uint32_t duration = cct->_conf->osd_bench_duration;
2718
2719 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2720 // let us limit the block size because the next checks rely on it
2721 // having a sane value. If we allow any block size to be set things
2722 // can still go sideways.
2723 ss << "block 'size' values are capped at "
2724 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2725 << " a higher value, please adjust 'osd_bench_max_block_size'";
2726 ret = -EINVAL;
2727 goto out;
2728 } else if (bsize < (int64_t) (1 << 20)) {
2729 // entering the realm of small block sizes.
2730 // limit the count to a sane value, assuming a configurable amount of
2731 // IOPS and duration, so that the OSD doesn't get hung up on this,
2732 // preventing timeouts from going off
2733 int64_t max_count =
2734 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2735 if (count > max_count) {
2736 ss << "'count' values greater than " << max_count
2737 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2738 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2739 << " for " << duration << " seconds,"
2740 << " can cause ill effects on osd. "
2741 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2742 << " value if you wish to use a higher 'count'.";
2743 ret = -EINVAL;
2744 goto out;
2745 }
2746 } else {
2747 // 1MB block sizes are big enough so that we get more stuff done.
2748 // However, to avoid the osd from getting hung on this and having
2749 // timers being triggered, we are going to limit the count assuming
2750 // a configurable throughput and duration.
2751 // NOTE: max_count is the total amount of bytes that we believe we
2752 // will be able to write during 'duration' for the given
2753 // throughput. The block size hardly impacts this unless it's
2754 // way too big. Given we already check how big the block size
2755 // is, it's safe to assume everything will check out.
2756 int64_t max_count =
2757 cct->_conf->osd_bench_large_size_max_throughput * duration;
2758 if (count > max_count) {
2759 ss << "'count' values greater than " << max_count
2760 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2761 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2762 << " for " << duration << " seconds,"
2763 << " can cause ill effects on osd. "
2764 << " Please adjust 'osd_bench_large_size_max_throughput'"
2765 << " with a higher value if you wish to use a higher 'count'.";
2766 ret = -EINVAL;
2767 goto out;
2768 }
2769 }
2770
2771 if (osize && bsize > osize)
2772 bsize = osize;
2773
2774 dout(1) << " bench count " << count
2775 << " bsize " << byte_u_t(bsize) << dendl;
2776
2777 ObjectStore::Transaction cleanupt;
2778
2779 if (osize && onum) {
2780 bufferlist bl;
2781 bufferptr bp(osize);
2782 bp.zero();
2783 bl.push_back(std::move(bp));
2784 bl.rebuild_page_aligned();
2785 for (int i=0; i<onum; ++i) {
2786 char nm[30];
2787 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2788 object_t oid(nm);
2789 hobject_t soid(sobject_t(oid, 0));
2790 ObjectStore::Transaction t;
2791 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
2792 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2793 cleanupt.remove(coll_t(), ghobject_t(soid));
2794 }
2795 }
2796
2797 bufferlist bl;
2798 bufferptr bp(bsize);
2799 bp.zero();
2800 bl.push_back(std::move(bp));
2801 bl.rebuild_page_aligned();
2802
2803 {
2804 C_SaferCond waiter;
2805 if (!service.meta_ch->flush_commit(&waiter)) {
2806 waiter.wait();
2807 }
2808 }
2809
2810 utime_t start = ceph_clock_now();
2811 for (int64_t pos = 0; pos < count; pos += bsize) {
2812 char nm[30];
2813 unsigned offset = 0;
2814 if (onum && osize) {
2815 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2816 offset = rand() % (osize / bsize) * bsize;
2817 } else {
2818 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2819 }
2820 object_t oid(nm);
2821 hobject_t soid(sobject_t(oid, 0));
2822 ObjectStore::Transaction t;
2823 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2824 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2825 if (!onum || !osize)
2826 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2827 }
2828
2829 {
2830 C_SaferCond waiter;
2831 if (!service.meta_ch->flush_commit(&waiter)) {
2832 waiter.wait();
2833 }
2834 }
2835 utime_t end = ceph_clock_now();
2836
2837 // clean up
2838 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2839 {
2840 C_SaferCond waiter;
2841 if (!service.meta_ch->flush_commit(&waiter)) {
2842 waiter.wait();
2843 }
2844 }
2845
2846 double elapsed = end - start;
2847 double rate = count / elapsed;
2848 double iops = rate / bsize;
2849 f->open_object_section("osd_bench_results");
2850 f->dump_int("bytes_written", count);
2851 f->dump_int("blocksize", bsize);
2852 f->dump_float("elapsed_sec", elapsed);
2853 f->dump_float("bytes_per_sec", rate);
2854 f->dump_float("iops", iops);
2855 f->close_section();
2856 }
2857
2858 else if (prefix == "flush_pg_stats") {
2859 mgrc.send_pgstats();
2860 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2861 }
2862
2863 else if (prefix == "heap") {
2864 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2865 }
2866
2867 else if (prefix == "debug dump_missing") {
2868 f->open_array_section("pgs");
2869 vector<PGRef> pgs;
2870 _get_pgs(&pgs);
2871 for (auto& pg : pgs) {
2872 string s = stringify(pg->pg_id);
2873 f->open_array_section(s.c_str());
2874 pg->lock();
2875 pg->dump_missing(f);
2876 pg->unlock();
2877 f->close_section();
2878 }
2879 f->close_section();
2880 }
2881
2882 else if (prefix == "debug kick_recovery_wq") {
2883 int64_t delay;
2884 cmd_getval(cmdmap, "delay", delay);
2885 ostringstream oss;
2886 oss << delay;
2887 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2888 if (ret != 0) {
2889 ss << "kick_recovery_wq: error setting "
2890 << "osd_recovery_delay_start to '" << delay << "': error "
2891 << ret;
2892 goto out;
2893 }
2894 cct->_conf.apply_changes(nullptr);
2895 ss << "kicking recovery queue. set osd_recovery_delay_start "
2896 << "to " << cct->_conf->osd_recovery_delay_start;
2897 }
2898
2899 else if (prefix == "cpu_profiler") {
2900 ostringstream ds;
2901 string arg;
2902 cmd_getval(cmdmap, "arg", arg);
2903 vector<string> argvec;
2904 get_str_vec(arg, argvec);
2905 cpu_profiler_handle_command(argvec, ds);
2906 outbl.append(ds.str());
2907 }
2908
2909 else if (prefix == "dump_pg_recovery_stats") {
2910 lock_guard l(osd_lock);
2911 pg_recovery_stats.dump_formatted(f);
2912 }
2913
2914 else if (prefix == "reset_pg_recovery_stats") {
2915 lock_guard l(osd_lock);
2916 pg_recovery_stats.reset();
2917 }
2918
2919 else if (prefix == "perf histogram dump") {
2920 std::string logger;
2921 std::string counter;
2922 cmd_getval(cmdmap, "logger", logger);
2923 cmd_getval(cmdmap, "counter", counter);
2924 cct->get_perfcounters_collection()->dump_formatted_histograms(
2925 f, false, logger, counter);
2926 }
2927
2928 else if (prefix == "cache drop") {
2929 lock_guard l(osd_lock);
2930 dout(20) << "clearing all caches" << dendl;
2931 // Clear the objectstore's cache - onode and buffer for Bluestore,
2932 // system's pagecache for Filestore
2933 ret = store->flush_cache(&ss);
2934 if (ret < 0) {
2935 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2936 goto out;
2937 }
2938 // Clear the objectcontext cache (per PG)
2939 vector<PGRef> pgs;
2940 _get_pgs(&pgs);
2941 for (auto& pg: pgs) {
2942 pg->clear_cache();
2943 }
2944 }
2945
2946 else if (prefix == "cache status") {
2947 lock_guard l(osd_lock);
2948 int obj_ctx_count = 0;
2949 vector<PGRef> pgs;
2950 _get_pgs(&pgs);
2951 for (auto& pg: pgs) {
2952 obj_ctx_count += pg->get_cache_obj_count();
2953 }
2954 f->open_object_section("cache_status");
2955 f->dump_int("object_ctx", obj_ctx_count);
2956 store->dump_cache_stats(f);
2957 f->close_section();
2958 }
2959
2960 else if (prefix == "scrub_purged_snaps") {
2961 lock_guard l(osd_lock);
2962 scrub_purged_snaps();
2963 }
2964
2965 else if (prefix == "dump_osd_network") {
2966 lock_guard l(osd_lock);
2967 int64_t value = 0;
2968 if (!(cmd_getval(cmdmap, "value", value))) {
2969 // Convert milliseconds to microseconds
2970 value = static_cast<double>(g_conf().get_val<double>(
2971 "mon_warn_on_slow_ping_time")) * 1000;
2972 if (value == 0) {
2973 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2974 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2975 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2976 }
2977 } else {
2978 // Convert user input to microseconds
2979 value *= 1000;
2980 }
2981 if (value < 0) value = 0;
2982
2983 struct osd_ping_time_t {
2984 uint32_t pingtime;
2985 int to;
2986 bool back;
2987 std::array<uint32_t,3> times;
2988 std::array<uint32_t,3> min;
2989 std::array<uint32_t,3> max;
2990 uint32_t last;
2991 uint32_t last_update;
2992
2993 bool operator<(const osd_ping_time_t& rhs) const {
2994 if (pingtime < rhs.pingtime)
2995 return true;
2996 if (pingtime > rhs.pingtime)
2997 return false;
2998 if (to < rhs.to)
2999 return true;
3000 if (to > rhs.to)
3001 return false;
3002 return back;
3003 }
3004 };
3005
3006 set<osd_ping_time_t> sorted;
3007 // Get pingtimes under lock and not on the stack
3008 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3009 service.get_hb_pingtime(pingtimes);
3010 for (auto j : *pingtimes) {
3011 if (j.second.last_update == 0)
3012 continue;
3013 osd_ping_time_t item;
3014 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3015 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3016 if (item.pingtime >= value) {
3017 item.to = j.first;
3018 item.times[0] = j.second.back_pingtime[0];
3019 item.times[1] = j.second.back_pingtime[1];
3020 item.times[2] = j.second.back_pingtime[2];
3021 item.min[0] = j.second.back_min[0];
3022 item.min[1] = j.second.back_min[1];
3023 item.min[2] = j.second.back_min[2];
3024 item.max[0] = j.second.back_max[0];
3025 item.max[1] = j.second.back_max[1];
3026 item.max[2] = j.second.back_max[2];
3027 item.last = j.second.back_last;
3028 item.back = true;
3029 item.last_update = j.second.last_update;
3030 sorted.emplace(item);
3031 }
3032 if (j.second.front_last == 0)
3033 continue;
3034 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3035 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3036 if (item.pingtime >= value) {
3037 item.to = j.first;
3038 item.times[0] = j.second.front_pingtime[0];
3039 item.times[1] = j.second.front_pingtime[1];
3040 item.times[2] = j.second.front_pingtime[2];
3041 item.min[0] = j.second.front_min[0];
3042 item.min[1] = j.second.front_min[1];
3043 item.min[2] = j.second.front_min[2];
3044 item.max[0] = j.second.front_max[0];
3045 item.max[1] = j.second.front_max[1];
3046 item.max[2] = j.second.front_max[2];
3047 item.last = j.second.front_last;
3048 item.last_update = j.second.last_update;
3049 item.back = false;
3050 sorted.emplace(item);
3051 }
3052 }
3053 delete pingtimes;
3054 //
3055 // Network ping times (1min 5min 15min)
3056 f->open_object_section("network_ping_times");
3057 f->dump_int("threshold", value / 1000);
3058 f->open_array_section("entries");
3059 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3060 ceph_assert(sitem.pingtime >= value);
3061 f->open_object_section("entry");
3062
3063 const time_t lu(sitem.last_update);
3064 char buffer[26];
3065 string lustr(ctime_r(&lu, buffer));
3066 lustr.pop_back(); // Remove trailing \n
3067 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3068 f->dump_string("last update", lustr);
3069 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3070 f->dump_int("from osd", whoami);
3071 f->dump_int("to osd", sitem.to);
3072 f->dump_string("interface", (sitem.back ? "back" : "front"));
3073 f->open_object_section("average");
3074 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3075 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3076 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3077 f->close_section(); // average
3078 f->open_object_section("min");
3079 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3080 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3081 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3082 f->close_section(); // min
3083 f->open_object_section("max");
3084 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3085 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3086 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3087 f->close_section(); // max
3088 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3089 f->close_section(); // entry
3090 }
3091 f->close_section(); // entries
3092 f->close_section(); // network_ping_times
3093 } else {
3094 ceph_abort_msg("broken asok registration");
3095 }
3096
3097 out:
3098 on_finish(ret, ss.str(), outbl);
3099 }
3100
3101 class TestOpsSocketHook : public AdminSocketHook {
3102 OSDService *service;
3103 ObjectStore *store;
3104 public:
3105 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3106 int call(std::string_view command, const cmdmap_t& cmdmap,
3107 Formatter *f,
3108 std::ostream& errss,
3109 bufferlist& out) override {
3110 int r = 0;
3111 stringstream outss;
3112 try {
3113 test_ops(service, store, command, cmdmap, outss);
3114 out.append(outss);
3115 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3116 errss << e.what();
3117 r = -EINVAL;
3118 }
3119 return r;
3120 }
3121 void test_ops(OSDService *service, ObjectStore *store,
3122 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3123
3124 };
3125
3126 class OSD::C_Tick : public Context {
3127 OSD *osd;
3128 public:
3129 explicit C_Tick(OSD *o) : osd(o) {}
3130 void finish(int r) override {
3131 osd->tick();
3132 }
3133 };
3134
3135 class OSD::C_Tick_WithoutOSDLock : public Context {
3136 OSD *osd;
3137 public:
3138 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3139 void finish(int r) override {
3140 osd->tick_without_osd_lock();
3141 }
3142 };
3143
// Reconcile the FuseStore mount under <osd_data>/fuse with the
// osd_objectstore_fuse option.
//
// @param stop  if true, force-disable fuse regardless of the option
//              (used on shutdown/error cleanup paths).
// @return 0 on success (or when built without libfuse); negative errno
//         on mkdir/rmdir failure or FuseStore start failure.
int OSD::enable_disable_fuse(bool stop)
{
#ifdef HAVE_LIBFUSE
  int r;
  string mntpath = cct->_conf->osd_data + "/fuse";
  if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
    // fuse is running but should not be: stop it and remove the mountpoint
    dout(1) << __func__ << " disabling" << dendl;
    fuse_store->stop();
    delete fuse_store;
    fuse_store = NULL;
    r = ::rmdir(mntpath.c_str());
    if (r < 0) {
      // capture errno immediately, before any other call can clobber it
      r = -errno;
      derr << __func__ << " failed to rmdir " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    return 0;
  }
  if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
    // fuse is not running but should be: create the mountpoint (an
    // already-existing directory is fine) and start a FuseStore on it
    dout(1) << __func__ << " enabling" << dendl;
    r = ::mkdir(mntpath.c_str(), 0700);
    if (r < 0)
      r = -errno;
    if (r < 0 && r != -EEXIST) {
      derr << __func__ << " unable to create " << mntpath << ": "
	   << cpp_strerror(r) << dendl;
      return r;
    }
    fuse_store = new FuseStore(store, mntpath);
    r = fuse_store->start();
    if (r < 0) {
      // start failed: tear the FuseStore back down so state stays consistent
      derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
      delete fuse_store;
      fuse_store = NULL;
      return r;
    }
  }
#endif  // HAVE_LIBFUSE
  return 0;
}
3185
3186 size_t OSD::get_num_cache_shards()
3187 {
3188 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3189 }
3190
3191 int OSD::get_num_op_shards()
3192 {
3193 if (cct->_conf->osd_op_num_shards)
3194 return cct->_conf->osd_op_num_shards;
3195 if (store_is_rotational)
3196 return cct->_conf->osd_op_num_shards_hdd;
3197 else
3198 return cct->_conf->osd_op_num_shards_ssd;
3199 }
3200
3201 int OSD::get_num_op_threads()
3202 {
3203 if (cct->_conf->osd_op_num_threads_per_shard)
3204 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3205 if (store_is_rotational)
3206 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3207 else
3208 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3209 }
3210
3211 float OSD::get_osd_recovery_sleep()
3212 {
3213 if (cct->_conf->osd_recovery_sleep)
3214 return cct->_conf->osd_recovery_sleep;
3215 if (!store_is_rotational && !journal_is_rotational)
3216 return cct->_conf->osd_recovery_sleep_ssd;
3217 else if (store_is_rotational && !journal_is_rotational)
3218 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3219 else
3220 return cct->_conf->osd_recovery_sleep_hdd;
3221 }
3222
3223 float OSD::get_osd_delete_sleep()
3224 {
3225 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3226 if (osd_delete_sleep > 0)
3227 return osd_delete_sleep;
3228 if (!store_is_rotational && !journal_is_rotational)
3229 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3230 if (store_is_rotational && !journal_is_rotational)
3231 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3232 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3233 }
3234
3235 int OSD::get_recovery_max_active()
3236 {
3237 if (cct->_conf->osd_recovery_max_active)
3238 return cct->_conf->osd_recovery_max_active;
3239 if (store_is_rotational)
3240 return cct->_conf->osd_recovery_max_active_hdd;
3241 else
3242 return cct->_conf->osd_recovery_max_active_ssd;
3243 }
3244
3245 float OSD::get_osd_snap_trim_sleep()
3246 {
3247 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3248 if (osd_snap_trim_sleep > 0)
3249 return osd_snap_trim_sleep;
3250 if (!store_is_rotational && !journal_is_rotational)
3251 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3252 if (store_is_rotational && !journal_is_rotational)
3253 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3254 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3255 }
3256
3257 int OSD::init()
3258 {
3259 OSDMapRef osdmap;
3260 CompatSet initial, diff;
3261 std::lock_guard lock(osd_lock);
3262 if (is_stopping())
3263 return 0;
3264
3265 tick_timer.init();
3266 tick_timer_without_osd_lock.init();
3267 service.recovery_request_timer.init();
3268 service.sleep_timer.init();
3269
3270 boot_finisher.start();
3271
3272 {
3273 string val;
3274 store->read_meta("require_osd_release", &val);
3275 last_require_osd_release = ceph_release_from_name(val);
3276 }
3277
3278 // mount.
3279 dout(2) << "init " << dev_path
3280 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3281 << dendl;
3282 dout(2) << "journal " << journal_path << dendl;
3283 ceph_assert(store); // call pre_init() first!
3284
3285 store->set_cache_shards(get_num_cache_shards());
3286
3287 int r = store->mount();
3288 if (r < 0) {
3289 derr << "OSD:init: unable to mount object store" << dendl;
3290 return r;
3291 }
3292 journal_is_rotational = store->is_journal_rotational();
3293 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3294 << dendl;
3295
3296 enable_disable_fuse(false);
3297
3298 dout(2) << "boot" << dendl;
3299
3300 service.meta_ch = store->open_collection(coll_t::meta());
3301
3302 // initialize the daily loadavg with current 15min loadavg
3303 double loadavgs[3];
3304 if (getloadavg(loadavgs, 3) == 3) {
3305 daily_loadavg = loadavgs[2];
3306 } else {
3307 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
3308 daily_loadavg = 1.0;
3309 }
3310
3311 int rotating_auth_attempts = 0;
3312 auto rotating_auth_timeout =
3313 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
3314
3315 // sanity check long object name handling
3316 {
3317 hobject_t l;
3318 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3319 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3320 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3321 r = store->validate_hobject_key(l);
3322 if (r < 0) {
3323 derr << "backend (" << store->get_type() << ") is unable to support max "
3324 << "object name[space] len" << dendl;
3325 derr << " osd max object name len = "
3326 << cct->_conf->osd_max_object_name_len << dendl;
3327 derr << " osd max object namespace len = "
3328 << cct->_conf->osd_max_object_namespace_len << dendl;
3329 derr << cpp_strerror(r) << dendl;
3330 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3331 goto out;
3332 }
3333 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3334 << dendl;
3335 } else {
3336 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3337 }
3338 }
3339
3340 // read superblock
3341 r = read_superblock();
3342 if (r < 0) {
3343 derr << "OSD::init() : unable to read osd superblock" << dendl;
3344 r = -EINVAL;
3345 goto out;
3346 }
3347
3348 if (osd_compat.compare(superblock.compat_features) < 0) {
3349 derr << "The disk uses features unsupported by the executable." << dendl;
3350 derr << " ondisk features " << superblock.compat_features << dendl;
3351 derr << " daemon features " << osd_compat << dendl;
3352
3353 if (osd_compat.writeable(superblock.compat_features)) {
3354 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3355 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3356 r = -EOPNOTSUPP;
3357 goto out;
3358 }
3359 else {
3360 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3361 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3362 r = -EOPNOTSUPP;
3363 goto out;
3364 }
3365 }
3366
3367 assert_warn(whoami == superblock.whoami);
3368 if (whoami != superblock.whoami) {
3369 derr << "OSD::init: superblock says osd"
3370 << superblock.whoami << " but I am osd." << whoami << dendl;
3371 r = -EINVAL;
3372 goto out;
3373 }
3374
3375 startup_time = ceph::mono_clock::now();
3376
3377 // load up "current" osdmap
3378 assert_warn(!get_osdmap());
3379 if (get_osdmap()) {
3380 derr << "OSD::init: unable to read current osdmap" << dendl;
3381 r = -EINVAL;
3382 goto out;
3383 }
3384 osdmap = get_map(superblock.current_epoch);
3385 set_osdmap(osdmap);
3386
3387 // make sure we don't have legacy pgs deleting
3388 {
3389 vector<coll_t> ls;
3390 int r = store->list_collections(ls);
3391 ceph_assert(r >= 0);
3392 for (auto c : ls) {
3393 spg_t pgid;
3394 if (c.is_pg(&pgid) &&
3395 !osdmap->have_pg_pool(pgid.pool())) {
3396 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3397 if (!store->exists(service.meta_ch, oid)) {
3398 derr << __func__ << " missing pg_pool_t for deleted pool "
3399 << pgid.pool() << " for pg " << pgid
3400 << "; please downgrade to luminous and allow "
3401 << "pg deletion to complete before upgrading" << dendl;
3402 ceph_abort();
3403 }
3404 }
3405 }
3406 }
3407
3408 initial = get_osd_initial_compat_set();
3409 diff = superblock.compat_features.unsupported(initial);
3410 if (superblock.compat_features.merge(initial)) {
3411 // Are we adding SNAPMAPPER2?
3412 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3413 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3414 << dendl;
3415 auto ch = service.meta_ch;
3416 auto hoid = make_snapmapper_oid();
3417 unsigned max = cct->_conf->osd_target_transaction_size;
3418 r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
3419 if (r < 0)
3420 goto out;
3421 }
3422 // We need to persist the new compat_set before we
3423 // do anything else
3424 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3425 ObjectStore::Transaction t;
3426 write_superblock(t);
3427 r = store->queue_transaction(service.meta_ch, std::move(t));
3428 if (r < 0)
3429 goto out;
3430 }
3431
3432 // make sure snap mapper object exists
3433 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
3434 dout(10) << "init creating/touching snapmapper object" << dendl;
3435 ObjectStore::Transaction t;
3436 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3437 r = store->queue_transaction(service.meta_ch, std::move(t));
3438 if (r < 0)
3439 goto out;
3440 }
3441 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3442 dout(10) << "init creating/touching purged_snaps object" << dendl;
3443 ObjectStore::Transaction t;
3444 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3445 r = store->queue_transaction(service.meta_ch, std::move(t));
3446 if (r < 0)
3447 goto out;
3448 }
3449
3450 if (cct->_conf->osd_open_classes_on_start) {
3451 int r = ClassHandler::get_instance().open_all_classes();
3452 if (r)
3453 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3454 }
3455
3456 check_osdmap_features();
3457
3458 create_recoverystate_perf();
3459
3460 {
3461 epoch_t bind_epoch = osdmap->get_epoch();
3462 service.set_epochs(NULL, NULL, &bind_epoch);
3463 }
3464
3465 clear_temp_objects();
3466
3467 // initialize osdmap references in sharded wq
3468 for (auto& shard : shards) {
3469 std::lock_guard l(shard->osdmap_lock);
3470 shard->shard_osdmap = osdmap;
3471 }
3472
3473 // load up pgs (as they previously existed)
3474 load_pgs();
3475
3476 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
3477
3478 create_logger();
3479
3480 // prime osd stats
3481 {
3482 struct store_statfs_t stbuf;
3483 osd_alert_list_t alerts;
3484 int r = store->statfs(&stbuf, &alerts);
3485 ceph_assert(r == 0);
3486 service.set_statfs(stbuf, alerts);
3487 }
3488
3489 // client_messenger auth_client is already set up by monc.
3490 for (auto m : { cluster_messenger,
3491 objecter_messenger,
3492 hb_front_client_messenger,
3493 hb_back_client_messenger,
3494 hb_front_server_messenger,
3495 hb_back_server_messenger } ) {
3496 m->set_auth_client(monc);
3497 }
3498 for (auto m : { client_messenger,
3499 cluster_messenger,
3500 hb_front_server_messenger,
3501 hb_back_server_messenger }) {
3502 m->set_auth_server(monc);
3503 }
3504 monc->set_handle_authentication_dispatcher(this);
3505
3506 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3507 | CEPH_ENTITY_TYPE_MGR);
3508 r = monc->init();
3509 if (r < 0)
3510 goto out;
3511
3512 mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
3513 mgrc.set_perf_metric_query_cb(
3514 [this](const ConfigPayload &config_payload) {
3515 set_perf_queries(config_payload);
3516 },
3517 [this] {
3518 return get_perf_reports();
3519 });
3520 mgrc.init();
3521
3522 // tell monc about log_client so it will know about mon session resets
3523 monc->set_log_client(&log_client);
3524 update_log_config();
3525
3526 // i'm ready!
3527 client_messenger->add_dispatcher_tail(&mgrc);
3528 client_messenger->add_dispatcher_tail(this);
3529 cluster_messenger->add_dispatcher_head(this);
3530
3531 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3532 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3533 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3534 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3535
3536 objecter_messenger->add_dispatcher_head(service.objecter.get());
3537
3538 service.init();
3539 service.publish_map(osdmap);
3540 service.publish_superblock(superblock);
3541 service.max_oldest_map = superblock.oldest_map;
3542
3543 for (auto& shard : shards) {
3544 // put PGs in a temporary set because we may modify pg_slots
3545 // unordered_map below.
3546 set<PGRef> pgs;
3547 for (auto& i : shard->pg_slots) {
3548 PGRef pg = i.second->pg;
3549 if (!pg) {
3550 continue;
3551 }
3552 pgs.insert(pg);
3553 }
3554 for (auto pg : pgs) {
3555 std::scoped_lock l{*pg};
3556 set<pair<spg_t,epoch_t>> new_children;
3557 set<pair<spg_t,epoch_t>> merge_pgs;
3558 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3559 &new_children, &merge_pgs);
3560 if (!new_children.empty()) {
3561 for (auto shard : shards) {
3562 shard->prime_splits(osdmap, &new_children);
3563 }
3564 assert(new_children.empty());
3565 }
3566 if (!merge_pgs.empty()) {
3567 for (auto shard : shards) {
3568 shard->prime_merges(osdmap, &merge_pgs);
3569 }
3570 assert(merge_pgs.empty());
3571 }
3572 }
3573 }
3574
3575 osd_op_tp.start();
3576
3577 // start the heartbeat
3578 heartbeat_thread.create("osd_srv_heartbt");
3579
3580 // tick
3581 tick_timer.add_event_after(get_tick_interval(),
3582 new C_Tick(this));
3583 {
3584 std::lock_guard l(tick_timer_lock);
3585 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3586 new C_Tick_WithoutOSDLock(this));
3587 }
3588
3589 osd_lock.unlock();
3590
3591 r = monc->authenticate();
3592 if (r < 0) {
3593 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3594 << dendl;
3595 exit(1);
3596 }
3597
3598 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
3599 derr << "unable to obtain rotating service keys; retrying" << dendl;
3600 ++rotating_auth_attempts;
3601 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
3602 derr << __func__ << " wait_auth_rotating timed out" << dendl;
3603 exit(1);
3604 }
3605 }
3606
3607 r = update_crush_device_class();
3608 if (r < 0) {
3609 derr << __func__ << " unable to update_crush_device_class: "
3610 << cpp_strerror(r) << dendl;
3611 exit(1);
3612 }
3613
3614 r = update_crush_location();
3615 if (r < 0) {
3616 derr << __func__ << " unable to update_crush_location: "
3617 << cpp_strerror(r) << dendl;
3618 exit(1);
3619 }
3620
3621 osd_lock.lock();
3622 if (is_stopping())
3623 return 0;
3624
3625 // start objecter *after* we have authenticated, so that we don't ignore
3626 // the OSDMaps it requests.
3627 service.final_init();
3628
3629 check_config();
3630
3631 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3632 consume_map();
3633
3634 dout(0) << "done with init, starting boot process" << dendl;
3635
3636 // subscribe to any pg creations
3637 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3638
3639 // MgrClient needs this (it doesn't have MonClient reference itself)
3640 monc->sub_want("mgrmap", 0, 0);
3641
3642 // we don't need to ask for an osdmap here; objecter will
3643 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3644
3645 monc->renew_subs();
3646
3647 start_boot();
3648
3649 return 0;
3650
3651 out:
3652 enable_disable_fuse(true);
3653 store->umount();
3654 delete store;
3655 store = NULL;
3656 return r;
3657 }
3658
3659 void OSD::final_init()
3660 {
3661 AdminSocket *admin_socket = cct->get_admin_socket();
3662 asok_hook = new OSDSocketHook(this);
3663 int r = admin_socket->register_command("status", asok_hook,
3664 "high-level status of OSD");
3665 ceph_assert(r == 0);
3666 r = admin_socket->register_command("flush_journal",
3667 asok_hook,
3668 "flush the journal to permanent store");
3669 ceph_assert(r == 0);
3670 r = admin_socket->register_command("dump_ops_in_flight " \
3671 "name=filterstr,type=CephString,n=N,req=false",
3672 asok_hook,
3673 "show the ops currently in flight");
3674 ceph_assert(r == 0);
3675 r = admin_socket->register_command("ops " \
3676 "name=filterstr,type=CephString,n=N,req=false",
3677 asok_hook,
3678 "show the ops currently in flight");
3679 ceph_assert(r == 0);
3680 r = admin_socket->register_command("dump_blocked_ops " \
3681 "name=filterstr,type=CephString,n=N,req=false",
3682 asok_hook,
3683 "show the blocked ops currently in flight");
3684 ceph_assert(r == 0);
3685 r = admin_socket->register_command("dump_historic_ops " \
3686 "name=filterstr,type=CephString,n=N,req=false",
3687 asok_hook,
3688 "show recent ops");
3689 ceph_assert(r == 0);
3690 r = admin_socket->register_command("dump_historic_slow_ops " \
3691 "name=filterstr,type=CephString,n=N,req=false",
3692 asok_hook,
3693 "show slowest recent ops");
3694 ceph_assert(r == 0);
3695 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3696 "name=filterstr,type=CephString,n=N,req=false",
3697 asok_hook,
3698 "show slowest recent ops, sorted by duration");
3699 ceph_assert(r == 0);
3700 r = admin_socket->register_command("dump_op_pq_state",
3701 asok_hook,
3702 "dump op priority queue state");
3703 ceph_assert(r == 0);
3704 r = admin_socket->register_command("dump_blacklist",
3705 asok_hook,
3706 "dump blacklisted clients and times");
3707 ceph_assert(r == 0);
3708 r = admin_socket->register_command("dump_watchers",
3709 asok_hook,
3710 "show clients which have active watches,"
3711 " and on which objects");
3712 ceph_assert(r == 0);
3713 r = admin_socket->register_command("dump_recovery_reservations",
3714 asok_hook,
3715 "show recovery reservations");
3716 ceph_assert(r == 0);
3717 r = admin_socket->register_command("dump_scrub_reservations",
3718 asok_hook,
3719 "show recovery reservations");
3720 ceph_assert(r == 0);
3721 r = admin_socket->register_command("get_latest_osdmap",
3722 asok_hook,
3723 "force osd to update the latest map from "
3724 "the mon");
3725 ceph_assert(r == 0);
3726
3727 r = admin_socket->register_command("set_heap_property " \
3728 "name=property,type=CephString " \
3729 "name=value,type=CephInt",
3730 asok_hook,
3731 "update malloc extension heap property");
3732 ceph_assert(r == 0);
3733
3734 r = admin_socket->register_command("get_heap_property " \
3735 "name=property,type=CephString",
3736 asok_hook,
3737 "get malloc extension heap property");
3738 ceph_assert(r == 0);
3739
3740 r = admin_socket->register_command("dump_objectstore_kv_stats",
3741 asok_hook,
3742 "print statistics of kvdb which used by bluestore");
3743 ceph_assert(r == 0);
3744
3745 r = admin_socket->register_command("dump_scrubs",
3746 asok_hook,
3747 "print scheduled scrubs");
3748 ceph_assert(r == 0);
3749
3750 r = admin_socket->register_command("calc_objectstore_db_histogram",
3751 asok_hook,
3752 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3753 ceph_assert(r == 0);
3754
3755 r = admin_socket->register_command("flush_store_cache",
3756 asok_hook,
3757 "Flush bluestore internal cache");
3758 ceph_assert(r == 0);
3759 r = admin_socket->register_command("dump_pgstate_history",
3760 asok_hook,
3761 "show recent state history");
3762 ceph_assert(r == 0);
3763
3764 r = admin_socket->register_command("compact",
3765 asok_hook,
3766 "Commpact object store's omap."
3767 " WARNING: Compaction probably slows your requests");
3768 ceph_assert(r == 0);
3769
3770 r = admin_socket->register_command("get_mapped_pools",
3771 asok_hook,
3772 "dump pools whose PG(s) are mapped to this OSD.");
3773
3774 ceph_assert(r == 0);
3775
3776 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3777 asok_hook,
3778 "probe OSD devices for SMART data.");
3779
3780 ceph_assert(r == 0);
3781
3782 r = admin_socket->register_command("list_devices",
3783 asok_hook,
3784 "list OSD devices.");
3785 r = admin_socket->register_command("send_beacon",
3786 asok_hook,
3787 "send OSD beacon to mon immediately");
3788
3789 r = admin_socket->register_command(
3790 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3791 "Dump osd heartbeat network ping times");
3792 ceph_assert(r == 0);
3793
3794 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3795 // Note: pools are CephString instead of CephPoolname because
3796 // these commands traditionally support both pool names and numbers
3797 r = admin_socket->register_command(
3798 "setomapval " \
3799 "name=pool,type=CephString " \
3800 "name=objname,type=CephObjectname " \
3801 "name=key,type=CephString "\
3802 "name=val,type=CephString",
3803 test_ops_hook,
3804 "set omap key");
3805 ceph_assert(r == 0);
3806 r = admin_socket->register_command(
3807 "rmomapkey " \
3808 "name=pool,type=CephString " \
3809 "name=objname,type=CephObjectname " \
3810 "name=key,type=CephString",
3811 test_ops_hook,
3812 "remove omap key");
3813 ceph_assert(r == 0);
3814 r = admin_socket->register_command(
3815 "setomapheader " \
3816 "name=pool,type=CephString " \
3817 "name=objname,type=CephObjectname " \
3818 "name=header,type=CephString",
3819 test_ops_hook,
3820 "set omap header");
3821 ceph_assert(r == 0);
3822
3823 r = admin_socket->register_command(
3824 "getomap " \
3825 "name=pool,type=CephString " \
3826 "name=objname,type=CephObjectname",
3827 test_ops_hook,
3828 "output entire object map");
3829 ceph_assert(r == 0);
3830
3831 r = admin_socket->register_command(
3832 "truncobj " \
3833 "name=pool,type=CephString " \
3834 "name=objname,type=CephObjectname " \
3835 "name=len,type=CephInt",
3836 test_ops_hook,
3837 "truncate object to length");
3838 ceph_assert(r == 0);
3839
3840 r = admin_socket->register_command(
3841 "injectdataerr " \
3842 "name=pool,type=CephString " \
3843 "name=objname,type=CephObjectname " \
3844 "name=shardid,type=CephInt,req=false,range=0|255",
3845 test_ops_hook,
3846 "inject data error to an object");
3847 ceph_assert(r == 0);
3848
3849 r = admin_socket->register_command(
3850 "injectmdataerr " \
3851 "name=pool,type=CephString " \
3852 "name=objname,type=CephObjectname " \
3853 "name=shardid,type=CephInt,req=false,range=0|255",
3854 test_ops_hook,
3855 "inject metadata error to an object");
3856 ceph_assert(r == 0);
3857 r = admin_socket->register_command(
3858 "set_recovery_delay " \
3859 "name=utime,type=CephInt,req=false",
3860 test_ops_hook,
3861 "Delay osd recovery by specified seconds");
3862 ceph_assert(r == 0);
3863 r = admin_socket->register_command(
3864 "injectfull " \
3865 "name=type,type=CephString,req=false " \
3866 "name=count,type=CephInt,req=false ",
3867 test_ops_hook,
3868 "Inject a full disk (optional count times)");
3869 ceph_assert(r == 0);
3870 r = admin_socket->register_command(
3871 "bench " \
3872 "name=count,type=CephInt,req=false " \
3873 "name=size,type=CephInt,req=false " \
3874 "name=object_size,type=CephInt,req=false " \
3875 "name=object_num,type=CephInt,req=false ",
3876 asok_hook,
3877 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3878 "(default count=1G default size=4MB). Results in log.");
3879 ceph_assert(r == 0);
3880 r = admin_socket->register_command(
3881 "cluster_log " \
3882 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3883 "name=message,type=CephString,n=N",
3884 asok_hook,
3885 "log a message to the cluster log");
3886 ceph_assert(r == 0);
3887 r = admin_socket->register_command(
3888 "flush_pg_stats",
3889 asok_hook,
3890 "flush pg stats");
3891 ceph_assert(r == 0);
3892 r = admin_socket->register_command(
3893 "heap " \
3894 "name=heapcmd,type=CephChoices,strings=" \
3895 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3896 "name=value,type=CephString,req=false",
3897 asok_hook,
3898 "show heap usage info (available only if compiled with tcmalloc)");
3899 ceph_assert(r == 0);
3900 r = admin_socket->register_command(
3901 "debug dump_missing " \
3902 "name=filename,type=CephFilepath",
3903 asok_hook,
3904 "dump missing objects to a named file");
3905 ceph_assert(r == 0);
3906 r = admin_socket->register_command(
3907 "debug kick_recovery_wq " \
3908 "name=delay,type=CephInt,range=0",
3909 asok_hook,
3910 "set osd_recovery_delay_start to <val>");
3911 ceph_assert(r == 0);
3912 r = admin_socket->register_command(
3913 "cpu_profiler " \
3914 "name=arg,type=CephChoices,strings=status|flush",
3915 asok_hook,
3916 "run cpu profiling on daemon");
3917 ceph_assert(r == 0);
3918 r = admin_socket->register_command(
3919 "dump_pg_recovery_stats",
3920 asok_hook,
3921 "dump pg recovery statistics");
3922 ceph_assert(r == 0);
3923 r = admin_socket->register_command(
3924 "reset_pg_recovery_stats",
3925 asok_hook,
3926 "reset pg recovery statistics");
3927 ceph_assert(r == 0);
3928 r = admin_socket->register_command(
3929 "cache drop",
3930 asok_hook,
3931 "Drop all OSD caches");
3932 ceph_assert(r == 0);
3933 r = admin_socket->register_command(
3934 "cache status",
3935 asok_hook,
3936 "Get OSD caches statistics");
3937 ceph_assert(r == 0);
3938 r = admin_socket->register_command(
3939 "scrub_purged_snaps",
3940 asok_hook,
3941 "Scrub purged_snaps vs snapmapper index");
3942 ceph_assert(r == 0);
3943
3944 // -- pg commands --
3945 // old form: ceph pg <pgid> command ...
3946 r = admin_socket->register_command(
3947 "pg " \
3948 "name=pgid,type=CephPgid " \
3949 "name=cmd,type=CephChoices,strings=query",
3950 asok_hook,
3951 "");
3952 ceph_assert(r == 0);
3953 r = admin_socket->register_command(
3954 "pg " \
3955 "name=pgid,type=CephPgid " \
3956 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3957 "name=mulcmd,type=CephChoices,strings=revert|delete",
3958 asok_hook,
3959 "");
3960 ceph_assert(r == 0);
3961 r = admin_socket->register_command(
3962 "pg " \
3963 "name=pgid,type=CephPgid " \
3964 "name=cmd,type=CephChoices,strings=list_unfound " \
3965 "name=offset,type=CephString,req=false",
3966 asok_hook,
3967 "");
3968 ceph_assert(r == 0);
3969 r = admin_socket->register_command(
3970 "pg " \
3971 "name=pgid,type=CephPgid " \
3972 "name=cmd,type=CephChoices,strings=scrub " \
3973 "name=time,type=CephInt,req=false",
3974 asok_hook,
3975 "");
3976 ceph_assert(r == 0);
3977 r = admin_socket->register_command(
3978 "pg " \
3979 "name=pgid,type=CephPgid " \
3980 "name=cmd,type=CephChoices,strings=deep_scrub " \
3981 "name=time,type=CephInt,req=false",
3982 asok_hook,
3983 "");
3984 ceph_assert(r == 0);
3985 // new form: tell <pgid> <cmd> for both cli and rest
3986 r = admin_socket->register_command(
3987 "query",
3988 asok_hook,
3989 "show details of a specific pg");
3990 ceph_assert(r == 0);
3991 r = admin_socket->register_command(
3992 "mark_unfound_lost " \
3993 "name=pgid,type=CephPgid,req=false " \
3994 "name=mulcmd,type=CephChoices,strings=revert|delete",
3995 asok_hook,
3996 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
3997 ceph_assert(r == 0);
3998 r = admin_socket->register_command(
3999 "list_unfound " \
4000 "name=pgid,type=CephPgid,req=false " \
4001 "name=offset,type=CephString,req=false",
4002 asok_hook,
4003 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4004 ceph_assert(r == 0);
4005 r = admin_socket->register_command(
4006 "scrub " \
4007 "name=pgid,type=CephPgid,req=false " \
4008 "name=time,type=CephInt,req=false",
4009 asok_hook,
4010 "Trigger a scheduled scrub ");
4011 ceph_assert(r == 0);
4012 r = admin_socket->register_command(
4013 "deep_scrub " \
4014 "name=pgid,type=CephPgid,req=false " \
4015 "name=time,type=CephInt,req=false",
4016 asok_hook,
4017 "Trigger a scheduled deep scrub ");
4018 ceph_assert(r == 0);
4019 }
4020
4021 void OSD::create_logger()
4022 {
4023 dout(10) << "create_logger" << dendl;
4024
4025 logger = build_osd_logger(cct);
4026 cct->get_perfcounters_collection()->add(logger);
4027 }
4028
4029 void OSD::create_recoverystate_perf()
4030 {
4031 dout(10) << "create_recoverystate_perf" << dendl;
4032
4033 recoverystate_perf = build_recoverystate_perf(cct);
4034 cct->get_perfcounters_collection()->add(recoverystate_perf);
4035 }
4036
// Orderly teardown of the entire OSD: stop accepting new work, shut
// down and drain the PGs, persist a clean-unmount superblock, release
// each subsystem, and finally unmount the object store.  Returns the
// result of the superblock write (0 on success).  The statement order
// below is deliberate; be careful when rearranging.
int OSD::shutdown()
{
  // Fast path: skip the orderly teardown entirely and exit the process.
  if (cct->_conf->osd_fast_shutdown) {
    derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
    cct->_log->flush();
    _exit(0);
  }

  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.lock();
  if (is_stopping()) {
    // lost the race with a concurrent shutdown
    osd_lock.unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging: crank up logging for the teardown if requested.
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs. this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  osd_lock.unlock();

  // Stop the heartbeat thread, then tear down all four heartbeat
  // messengers (server + client, front + back).
  {
    std::lock_guard l{heartbeat_lock};
    heartbeat_stop = true;
    heartbeat_cond.notify_all();
    heartbeat_peers.clear();
  }
  heartbeat_thread.join();

  hb_back_server_messenger->mark_down_all();
  hb_front_server_messenger->mark_down_all();
  hb_front_client_messenger->mark_down_all();
  hb_back_client_messenger->mark_down_all();

  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = get_osdmap_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  // r is the value ultimately returned from shutdown()
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs: detach every PG from its shard slot and drop our
  // reference; complain (and optionally abort) about leaked PG refs.
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
	// someone else still holds a reference to this PG -- a leak
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  osd_lock.unlock();
  cct->_conf.remove_observer(this);
  osd_lock.lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.unlock();
  // drop our osdmap references (global and per-shard) before shutting
  // down the service
  {
    std::unique_lock l{map_lock};
    set_osdmap(OSDMapRef());
  }
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  ClassHandler::get_instance().shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  return r;
}
4230
4231 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4232 {
4233 bool created = false;
4234 while (true) {
4235 dout(10) << __func__ << " cmd: " << cmd << dendl;
4236 vector<string> vcmd{cmd};
4237 bufferlist inbl;
4238 C_SaferCond w;
4239 string outs;
4240 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4241 int r = w.wait();
4242 if (r < 0) {
4243 if (r == -ENOENT && !created) {
4244 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4245 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4246 vector<string> vnewcmd{newcmd};
4247 bufferlist inbl;
4248 C_SaferCond w;
4249 string outs;
4250 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4251 int r = w.wait();
4252 if (r < 0) {
4253 derr << __func__ << " fail: osd does not exist and created failed: "
4254 << cpp_strerror(r) << dendl;
4255 return r;
4256 }
4257 created = true;
4258 continue;
4259 }
4260 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4261 return r;
4262 }
4263 break;
4264 }
4265
4266 return 0;
4267 }
4268
4269 int OSD::update_crush_location()
4270 {
4271 if (!cct->_conf->osd_crush_update_on_start) {
4272 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4273 return 0;
4274 }
4275
4276 char weight[32];
4277 if (cct->_conf->osd_crush_initial_weight >= 0) {
4278 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4279 } else {
4280 struct store_statfs_t st;
4281 osd_alert_list_t alerts;
4282 int r = store->statfs(&st, &alerts);
4283 if (r < 0) {
4284 derr << "statfs: " << cpp_strerror(r) << dendl;
4285 return r;
4286 }
4287 snprintf(weight, sizeof(weight), "%.4lf",
4288 std::max(.00001,
4289 double(st.total) /
4290 double(1ull << 40 /* TB */)));
4291 }
4292
4293 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4294
4295 string cmd =
4296 string("{\"prefix\": \"osd crush create-or-move\", ") +
4297 string("\"id\": ") + stringify(whoami) + ", " +
4298 string("\"weight\":") + weight + ", " +
4299 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4300 return mon_cmd_maybe_osd_create(cmd);
4301 }
4302
4303 int OSD::update_crush_device_class()
4304 {
4305 if (!cct->_conf->osd_class_update_on_start) {
4306 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4307 return 0;
4308 }
4309
4310 string device_class;
4311 int r = store->read_meta("crush_device_class", &device_class);
4312 if (r < 0 || device_class.empty()) {
4313 device_class = store->get_default_device_class();
4314 }
4315
4316 if (device_class.empty()) {
4317 dout(20) << __func__ << " no device class stored locally" << dendl;
4318 return 0;
4319 }
4320
4321 string cmd =
4322 string("{\"prefix\": \"osd crush set-device-class\", ") +
4323 string("\"class\": \"") + device_class + string("\", ") +
4324 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4325
4326 r = mon_cmd_maybe_osd_create(cmd);
4327 if (r == -EBUSY) {
4328 // good, already bound to a device-class
4329 return 0;
4330 } else {
4331 return r;
4332 }
4333 }
4334
4335 void OSD::write_superblock(ObjectStore::Transaction& t)
4336 {
4337 dout(10) << "write_superblock " << superblock << dendl;
4338
4339 //hack: at minimum it's using the baseline feature set
4340 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4341 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4342
4343 bufferlist bl;
4344 encode(superblock, bl);
4345 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4346 }
4347
4348 int OSD::read_superblock()
4349 {
4350 bufferlist bl;
4351 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4352 if (r < 0)
4353 return r;
4354
4355 auto p = bl.cbegin();
4356 decode(superblock, p);
4357
4358 dout(10) << "read_superblock " << superblock << dendl;
4359
4360 return 0;
4361 }
4362
4363 void OSD::clear_temp_objects()
4364 {
4365 dout(10) << __func__ << dendl;
4366 vector<coll_t> ls;
4367 store->list_collections(ls);
4368 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4369 spg_t pgid;
4370 if (!p->is_pg(&pgid))
4371 continue;
4372
4373 // list temp objects
4374 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4375
4376 vector<ghobject_t> temps;
4377 ghobject_t next;
4378 while (1) {
4379 vector<ghobject_t> objects;
4380 auto ch = store->open_collection(*p);
4381 ceph_assert(ch);
4382 store->collection_list(ch, next, ghobject_t::get_max(),
4383 store->get_ideal_list_max(),
4384 &objects, &next);
4385 if (objects.empty())
4386 break;
4387 vector<ghobject_t>::iterator q;
4388 for (q = objects.begin(); q != objects.end(); ++q) {
4389 // Hammer set pool for temps to -1, so check for clean-up
4390 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4391 temps.push_back(*q);
4392 } else {
4393 break;
4394 }
4395 }
4396 // If we saw a non-temp object and hit the break above we can
4397 // break out of the while loop too.
4398 if (q != objects.end())
4399 break;
4400 }
4401 if (!temps.empty()) {
4402 ObjectStore::Transaction t;
4403 int removed = 0;
4404 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4405 dout(20) << " removing " << *p << " object " << *q << dendl;
4406 t.remove(*p, *q);
4407 if (++removed > cct->_conf->osd_target_transaction_size) {
4408 store->queue_transaction(service.meta_ch, std::move(t));
4409 t = ObjectStore::Transaction();
4410 removed = 0;
4411 }
4412 }
4413 if (removed) {
4414 store->queue_transaction(service.meta_ch, std::move(t));
4415 }
4416 }
4417 }
4418 }
4419
// Delete every object in collection `tmp` (also removing each from the
// pg's snap mapper) and then remove the collection itself.  Work is
// batched into transactions of at most osd_target_transaction_size
// objects; the final transaction's commit is waited for before
// returning.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  // snap-mapper records live in the meta collection
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      // -ENOENT is tolerated: the object may have no snap mapping
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    // queue this batch and start a fresh transaction
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // make sure the removal has committed before we return
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4464
4465
4466 // ======================================================
4467 // PG's
4468
4469 PG* OSD::_make_pg(
4470 OSDMapRef createmap,
4471 spg_t pgid)
4472 {
4473 dout(10) << __func__ << " " << pgid << dendl;
4474 pg_pool_t pi;
4475 map<string,string> ec_profile;
4476 string name;
4477 if (createmap->have_pg_pool(pgid.pool())) {
4478 pi = *createmap->get_pg_pool(pgid.pool());
4479 name = createmap->get_pool_name(pgid.pool());
4480 if (pi.is_erasure()) {
4481 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4482 }
4483 } else {
4484 // pool was deleted; grab final pg_pool_t off disk.
4485 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4486 bufferlist bl;
4487 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4488 if (r < 0) {
4489 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4490 << dendl;
4491 return nullptr;
4492 }
4493 ceph_assert(r >= 0);
4494 auto p = bl.cbegin();
4495 decode(pi, p);
4496 decode(name, p);
4497 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4498 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4499 << " tombstone" << dendl;
4500 return nullptr;
4501 }
4502 decode(ec_profile, p);
4503 }
4504 PGPool pool(cct, createmap, pgid.pool(), pi, name);
4505 PG *pg;
4506 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4507 pi.type == pg_pool_t::TYPE_ERASURE)
4508 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4509 else
4510 ceph_abort();
4511 return pg;
4512 }
4513
4514 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4515 {
4516 v->clear();
4517 v->reserve(get_num_pgs());
4518 for (auto& s : shards) {
4519 std::lock_guard l(s->shard_lock);
4520 for (auto& j : s->pg_slots) {
4521 if (j.second->pg &&
4522 !j.second->pg->is_deleted()) {
4523 v->push_back(j.second->pg);
4524 if (clear_too) {
4525 s->_detach_pg(j.second.get());
4526 }
4527 }
4528 }
4529 }
4530 }
4531
4532 void OSD::_get_pgids(vector<spg_t> *v)
4533 {
4534 v->clear();
4535 v->reserve(get_num_pgs());
4536 for (auto& s : shards) {
4537 std::lock_guard l(s->shard_lock);
4538 for (auto& j : s->pg_slots) {
4539 if (j.second->pg &&
4540 !j.second->pg->is_deleted()) {
4541 v->push_back(j.first);
4542 }
4543 }
4544 }
4545 }
4546
4547 void OSD::register_pg(PGRef pg)
4548 {
4549 spg_t pgid = pg->get_pgid();
4550 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4551 auto sdata = shards[shard_index];
4552 std::lock_guard l(sdata->shard_lock);
4553 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4554 ceph_assert(r.second);
4555 auto *slot = r.first->second.get();
4556 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4557 sdata->_attach_pg(slot, pg.get());
4558 }
4559
// Attempt to finalize deletion of pg: detach it from its shard slot,
// clear any primed split-children bookkeeping, and decrement the
// appropriate pg perf counter.  Returns false (and does nothing) if
// the pg is no longer in its slot or is waiting on a merge epoch.
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
	!p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      // a pending merge still needs this pg; don't delete yet
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // drop any primed split children of this pg on every shard
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_nonprimary())
    service.logger->dec(l_osd_pg_replica); // misnomer
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
4594
4595 PGRef OSD::_lookup_pg(spg_t pgid)
4596 {
4597 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4598 auto sdata = shards[shard_index];
4599 std::lock_guard l(sdata->shard_lock);
4600 auto p = sdata->pg_slots.find(pgid);
4601 if (p == sdata->pg_slots.end()) {
4602 return nullptr;
4603 }
4604 return p->second->pg;
4605 }
4606
4607 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4608 {
4609 PGRef pg = _lookup_pg(pgid);
4610 if (!pg) {
4611 return nullptr;
4612 }
4613 pg->lock();
4614 if (!pg->is_deleted()) {
4615 return pg;
4616 }
4617 pg->unlock();
4618 return nullptr;
4619 }
4620
// Public wrapper around _lookup_lock_pg(): returns the PG for pgid
// with its lock held, or null if it does not exist (or is deleted).
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
4625
// Boot-time scan of the object store: load pg_num_history, then walk
// every collection, deleting leftover/flagged pg collections and
// instantiating + registering a PG object for each surviving pg
// collection.  Must be called with osd_lock held.
void OSD::load_pgs()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(0) << "load_pgs" << dendl;

  // load the persisted pg_num history (used for split/merge decisions)
  {
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // temp collections and pgs flagged for removal are deleted outright
    if (it->is_temp(&pgid) ||
	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it
	       << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
	   << dendl;
      continue;
    }

    // instantiate the PG against the osdmap it was last written with
    // (or the current map if no epoch was recorded)
    PGRef pg;
    if (map_epoch > 0) {
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
	if (!get_osdmap()->have_pg_pool(pgid.pool())) {
	  derr << __func__ << ": could not find map for epoch " << map_epoch
	       << " on pg " << pgid << ", but the pool is not present in the "
	       << "current map, so this is probably a result of bug 10617. "
	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
	       << "to clean it up later." << dendl;
	  continue;
	} else {
	  derr << __func__ << ": have pgid " << pgid << " at epoch "
	       << map_epoch << ", but missing map. Crashing."
	       << dendl;
	  ceph_abort_msg("Missing map in load_pgs");
	}
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(get_osdmap(), pgid);
    }
    if (!pg) {
      // pool tombstone missing/unusable; drop the orphaned collection
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store);

    if (pg->dne()) {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }
    {
      // route this collection's commit completions to the owning shard
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    pg->reg_next_scrub();

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
4733
4734
// Create a brand-new PG locally from a create request (from the mon or
// from a peer OSD): build its on-disk collection, instantiate the PG,
// initialize it with the supplied history/past-intervals, and kick off
// peering.  Returns null if creation is withheld (max-pg limit) or the
// request is stale (pool gone / CREATING flag cleared).
PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
				 const PGCreateInfo *info)
{
  spg_t pgid = info->pgid;

  if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
    dout(10) << __func__ << " hit max pg, dropping" << dendl;
    return nullptr;
  }

  PeeringCtx rctx = create_context();

  // use the map from the epoch the pg was created in, not the current one
  OSDMapRef startmap = get_map(info->epoch);

  if (info->by_mon) {
    int64_t pool_id = pgid.pgid.pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    if (!pool) {
      dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
      return nullptr;
    }
    if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
	!pool->has_flag(pg_pool_t::FLAG_CREATING)) {
      // this ensures we do not process old creating messages after the
      // pool's initial pgs have been created (and pg are subsequently
      // allowed to split or merge).
      dout(20) << __func__ << " dropping " << pgid
	       << "create, pool does not have CREATING flag set" << dendl;
      return nullptr;
    }
  }

  int up_primary, acting_primary;
  vector<int> up, acting;
  startmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  // NOTE(review): pp is dereferenced unchecked below; this appears to
  // assume the pool still exists in startmap -- confirm that invariant.
  const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
  if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
      store->get_type() != "bluestore") {
    clog->warn() << "pg " << pgid
		 << " is at risk of silent data corruption: "
		 << "the pool allows ec overwrites but is not stored in "
		 << "bluestore, so deep scrubbing will not detect bitrot";
  }
  create_pg_collection(
    rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
  init_pg_ondisk(rctx.transaction, pgid, pp);

  int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

  PGRef pg = _make_pg(startmap, pgid);
  pg->ch = store->create_new_collection(pg->coll);

  {
    // route this collection's commit completions to the owning shard
    uint32_t shard_index = pgid.hash_to_shard(shards.size());
    assert(NULL != shards[shard_index]);
    store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
  }

  pg->lock(true);

  // we are holding the shard lock
  ceph_assert(!pg->is_deleted());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    info->history,
    info->past_intervals,
    false,
    rctx.transaction);

  pg->init_collection_pool_opts();

  if (pg->is_primary()) {
    // new primary: apply any active dynamic perf-stats queries
    std::lock_guard locker{m_perf_queries_lock};
    pg->set_dynamic_perf_stats_queries(m_perf_queries);
  }

  // begin peering
  pg->handle_initialize(rctx);
  pg->handle_activate_map(rctx);

  dispatch_context(rctx, pg.get(), osdmap, nullptr);

  dout(10) << __func__ << " new pg " << *pg << dendl;
  return pg;
}
4826
4827 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4828 spg_t pgid,
4829 bool is_mon_create)
4830 {
4831 const auto max_pgs_per_osd =
4832 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4833 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4834
4835 if (num_pgs < max_pgs_per_osd) {
4836 return false;
4837 }
4838
4839 std::lock_guard l(pending_creates_lock);
4840 if (is_mon_create) {
4841 pending_creates_from_mon++;
4842 } else {
4843 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4844 pending_creates_from_osd.emplace(pgid, is_primary);
4845 }
4846 dout(1) << __func__ << " withhold creation of pg " << pgid
4847 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4848 return true;
4849 }
4850
// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
// to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  std::vector<int32_t> twiddled;
  if (acting.size() > 1) {
    // shrink the set to just the primary
    twiddled.push_back(acting.front());
  } else {
    // grow: keep the existing entry (if any) and append a hole
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
4863
// Re-attempt PG creations previously withheld by maybe_wait_for_max_pg().
// If head-room has opened up below the per-OSD pg cap, re-solicit
// mon-initiated creates and nudge peering for osd-initiated ones (via a
// pg_temp "twiddle"), renewing mon subscriptions as needed.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    // mon-initiated creates consume head-room first
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
	       << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      if (pending_creates_from_mon >= spare_pgs) {
	spare_pgs = pending_creates_from_mon = 0;
      } else {
	spare_pgs -= pending_creates_from_mon;
	pending_creates_from_mon = 0;
      }
    }
    // spend remaining head-room on osd-initiated creates: twiddle
    // pg_temp to force a new interval and restart peering
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
			    !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
	      << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = get_osdmap_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
	      << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
	      << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  service.send_pg_temp();
}
4935
// Reconstruct pg_history_t (*h) and PastIntervals (*pi) for a
// newly-created pg by walking every osdmap epoch from `created`+1 up
// to the current epoch and recording each interval change (mapping,
// primary, split).
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  *h = pg_history_t(created, created_stamp);

  // mapping as of the creation epoch; updated as we walk forward
  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
	return have.size() >= pi->min_size;
      }
      explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      pgid.pgid,
      min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      // record which aspects changed at this epoch boundary
      h->same_interval_since = e;
      if (up != new_up) {
	h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
	h->same_primary_since = e;
      }
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
			     osdmap->get_pg_num(pgid.pgid.pool()),
			     nullptr)) {
	h->last_epoch_split = e;
      }
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
	   << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
		       pi->get_bounds()) << ")"
	   << dendl;
}
5010
5011 void OSD::_add_heartbeat_peer(int p)
5012 {
5013 if (p == whoami)
5014 return;
5015 HeartbeatInfo *hi;
5016
5017 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5018 if (i == heartbeat_peers.end()) {
5019 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
5020 if (!cons.first)
5021 return;
5022 assert(cons.second);
5023
5024 hi = &heartbeat_peers[p];
5025 hi->peer = p;
5026
5027 auto stamps = service.get_hb_stamps(p);
5028
5029 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5030 sb->peer = p;
5031 sb->stamps = stamps;
5032 hi->hb_interval_start = ceph_clock_now();
5033 hi->con_back = cons.first.get();
5034 hi->con_back->set_priv(sb);
5035
5036 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5037 sf->peer = p;
5038 sf->stamps = stamps;
5039 hi->con_front = cons.second.get();
5040 hi->con_front->set_priv(sf);
5041
5042 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5043 << " " << hi->con_back->get_peer_addr()
5044 << " " << hi->con_front->get_peer_addr()
5045 << dendl;
5046 } else {
5047 hi = &i->second;
5048 }
5049 hi->epoch = get_osdmap_epoch();
5050 }
5051
5052 void OSD::_remove_heartbeat_peer(int n)
5053 {
5054 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5055 ceph_assert(q != heartbeat_peers.end());
5056 dout(20) << " removing heartbeat peer osd." << n
5057 << " " << q->second.con_back->get_peer_addr()
5058 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5059 << dendl;
5060 q->second.clear_mark_down();
5061 heartbeat_peers.erase(q);
5062 }
5063
5064 void OSD::need_heartbeat_peer_update()
5065 {
5066 if (is_stopping())
5067 return;
5068 dout(20) << "need_heartbeat_peer_update" << dendl;
5069 heartbeat_set_peers_need_update();
5070 }
5071
// Rebuild the heartbeat peer set when it has been flagged as needing an
// update (or force a periodic resample after osd_heartbeat_grace).
// Peers come from three sources: each PG's heartbeat peers, our
// immediate next/previous up OSDs, and a random sample spread across
// failure-domain subtrees.  Called with osd_lock held.
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      // first pass: start the resample clock and force a rebuild
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
	dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
	heartbeat_set_peers_need_update();
	last_heartbeat_resample = now;
	// automatically clean up any stale heartbeat peers
	// if we are unhealthy, then clean all
	// (must happen before we take heartbeat_lock below, since
	// reset_heartbeat_peers() acquires heartbeat_lock itself)
	reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
	if (get_osdmap()->is_up(peer)) {
	  _add_heartbeat_peer(peer);
	}
      });
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = get_osdmap()->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = get_osdmap()->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
  get_osdmap()->get_random_up_osds_by_subtree(
    whoami, subtree, limit, want, &want);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!get_osdmap()->is_up(p->first)) {
      int o = p->first;
      ++p;  // advance before erasing: _remove_heartbeat_peer invalidates o's entry
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < get_osdmap_epoch()) {
      // not re-added this round; candidate for trimming below
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = get_osdmap()->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?  trim extras (never 'want' peers) back down to the minimum
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;

  // clean up stale failure pending
  for (auto it = failure_pending.begin(); it != failure_pending.end();) {
    if (heartbeat_peers.count(it->first) == 0) {
      // no longer a peer: retract the in-flight failure report to the mon
      send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
      failure_pending.erase(it++);
    } else {
      it++;
    }
  }
}
5189
5190 void OSD::reset_heartbeat_peers(bool all)
5191 {
5192 ceph_assert(ceph_mutex_is_locked(osd_lock));
5193 dout(10) << "reset_heartbeat_peers" << dendl;
5194 utime_t stale = ceph_clock_now();
5195 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5196 std::lock_guard l(heartbeat_lock);
5197 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5198 HeartbeatInfo& hi = it->second;
5199 if (all || hi.is_stale(stale)) {
5200 hi.clear_mark_down();
5201 // stop sending failure_report to mon too
5202 failure_queue.erase(it->first);
5203 heartbeat_peers.erase(it++);
5204 } else {
5205 it++;
5206 }
5207 }
5208 }
5209
// Handle an incoming message on one of the heartbeat messengers:
//  - PING: record peer clock stamps and answer with PING_REPLY (or
//    YOU_DIED if the sender is marked down in our map).
//  - PING_REPLY: account the reply against the matching pending ping,
//    update ping-time statistics, and cancel failure reports for a peer
//    that has become healthy again.
//  - YOU_DIED: a peer believes we are down; subscribe to newer maps.
// Consumes (puts) the message.  heartbeat_lock is held for the duration.
void OSD::handle_osd_ping(MOSDPing *m)
{
  // ignore traffic from a different cluster
  if (superblock.cluster_fsid != m->fsid) {
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
	     << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
	     << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.lock();
  if (is_stopping()) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  ConnectionRef con(m->get_connection());
  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  // per-connection session carries the peer's clock-delta stamps
  auto sref = con->get_priv();
  Session *s = static_cast<Session*>(sref.get());
  if (!s) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }
  if (!s->stamps) {
    s->peer = from;
    s->stamps = service.get_hb_stamps(from);
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug hook: probabilistically drop incoming pings for a while
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
	auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
	if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
	  if (heartbeat_drop->second == 0) {
	    debug_heartbeat_drops_remaining.erase(heartbeat_drop);
	  } else {
	    --heartbeat_drop->second;
	    dout(5) << "Dropping heartbeat from " << from
		    << ", " << heartbeat_drop->second
		    << " remaining to drop" << dendl;
	    break;
	  }
	} else if (cct->_conf->osd_debug_drop_ping_probability >
		   ((((double)(rand()%100))/100.0))) {
	  heartbeat_drop =
	    debug_heartbeat_drops_remaining.insert(std::make_pair(from,
				     cct->_conf->osd_debug_drop_ping_duration)).first;
	  dout(5) << "Dropping heartbeat from " << from
		  << ", " << heartbeat_drop->second
		  << " remaining to drop" << dendl;
	  break;
	}
      }

      // record sender's mono stamps; sender_delta_ub is echoed back in
      // the reply below
      ceph::signedspan sender_delta_ub{};
      s->stamps->got_ping(
	m->up_from,
	mnow,
	m->mono_send_stamp,
	m->delta_ub,
	&sender_delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;

      // do not advertise ourselves as alive if our own worker threads
      // are stuck
      if (!cct->get_heartbeat_map()->is_healthy()) {
	dout(10) << "internal heartbeat not healthy, dropping ping request"
		 << dendl;
	break;
      }

      Message *r = new MOSDPing(monc->get_fsid(),
				curmap->get_epoch(),
				MOSDPing::PING_REPLY,
				m->ping_stamp,
				m->mono_ping_stamp,
				mnow,
				service.get_up_epoch(),
				cct->_conf->osd_heartbeat_min_size,
				sender_delta_ub);
      con->send_message(r);

      if (curmap->is_up(from)) {
	// opportunistically share a newer osdmap over the cluster con
	if (is_active()) {
	  ConnectionRef cluster_con = service.get_con_osd_cluster(
	    from, curmap->get_epoch());
	  if (cluster_con) {
	    service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
	  }
	}
      } else if (!curmap->exists(from) ||
		 curmap->get_down_at(from) > m->map_epoch) {
	// tell them they have died
	Message *r = new MOSDPing(monc->get_fsid(),
				  curmap->get_epoch(),
				  MOSDPing::YOU_DIED,
				  m->ping_stamp,
				  m->mono_ping_stamp,
				  mnow,
				  service.get_up_epoch(),
				  cct->_conf->osd_heartbeat_min_size);
	con->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
	// ping_history is keyed by the send stamp; each entry tracks
	// (deadline, replies still expected across back+front cons)
	auto acked = i->second.ping_history.find(m->ping_stamp);
	if (acked != i->second.ping_history.end()) {
	  int &unacknowledged = acked->second.second;
	  if (con == i->second.con_back) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " -> " << now
		     << " last_rx_front " << i->second.last_rx_front
		     << dendl;
	    i->second.last_rx_back = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	    // if there is no front con, set both stamps.
	    if (i->second.con_front == NULL) {
	      i->second.last_rx_front = now;
	      ceph_assert(unacknowledged > 0);
	      --unacknowledged;
	    }
	  } else if (con == i->second.con_front) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " last_rx_front " << i->second.last_rx_front
		     << " -> " << now
		     << dendl;
	    i->second.last_rx_front = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	  }

	  if (unacknowledged == 0) {
	    // succeeded in getting all replies
	    dout(25) << "handle_osd_ping got all replies from osd." << from
		     << " , erase pending ping(sent at " << m->ping_stamp << ")"
		     << " and older pending ping(s)"
		     << dendl;

	    // accumulate round-trip times (in usec) for this averaging
	    // interval
#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
	    ++i->second.hb_average_count;
	    uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
	    i->second.hb_total_back += back_pingtime;
	    if (back_pingtime < i->second.hb_min_back)
	      i->second.hb_min_back = back_pingtime;
	    if (back_pingtime > i->second.hb_max_back)
	      i->second.hb_max_back = back_pingtime;
	    uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
	    i->second.hb_total_front += front_pingtime;
	    if (front_pingtime < i->second.hb_min_front)
	      i->second.hb_min_front = front_pingtime;
	    if (front_pingtime > i->second.hb_max_front)
	      i->second.hb_max_front = front_pingtime;

	    ceph_assert(i->second.hb_interval_start != utime_t());
	    // NOTE(review): unreachable while the assert above is compiled
	    // in; kept only as a fallback for assert-free builds.
	    if (i->second.hb_interval_start == utime_t())
	      i->second.hb_interval_start = now;
	    int64_t hb_avg_time_period = 60;
	    if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
	      hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
	    }
	    if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
	      // interval elapsed: fold the accumulated totals into the
	      // rolling per-interval vectors and publish to osd_stat
	      uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
	      uint32_t back_min = i->second.hb_min_back;
	      uint32_t back_max = i->second.hb_max_back;
	      uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
	      uint32_t front_min = i->second.hb_min_front;
	      uint32_t front_max = i->second.hb_max_front;

	      // Reset for new interval
	      i->second.hb_average_count = 0;
	      i->second.hb_interval_start = now;
	      i->second.hb_total_back = i->second.hb_max_back = 0;
	      i->second.hb_min_back = UINT_MAX;
	      i->second.hb_total_front = i->second.hb_max_front = 0;
	      i->second.hb_min_front = UINT_MAX;

	      // Record per osd interface ping times
	      // Based on osd_heartbeat_interval ignoring that it is
	      // randomly shorter than this interval
	      if (i->second.hb_back_pingtime.size() == 0) {
		// first interval: seed the whole ring with this sample
		ceph_assert(i->second.hb_front_pingtime.size() == 0);
		for (unsigned k = 0 ; k < hb_vector_size; ++k) {
		  i->second.hb_back_pingtime.push_back(back_avg);
		  i->second.hb_back_min.push_back(back_min);
		  i->second.hb_back_max.push_back(back_max);
		  i->second.hb_front_pingtime.push_back(front_avg);
		  i->second.hb_front_min.push_back(front_min);
		  i->second.hb_front_max.push_back(front_max);
		  ++i->second.hb_index;
		}
	      } else {
		// ring buffer write (hb_vector_size is assumed to be a
		// power of two for the mask to work)
		int index = i->second.hb_index & (hb_vector_size - 1);
		i->second.hb_back_pingtime[index] = back_avg;
		i->second.hb_back_min[index] = back_min;
		i->second.hb_back_max[index] = back_max;
		i->second.hb_front_pingtime[index] = front_avg;
		i->second.hb_front_min[index] = front_min;
		i->second.hb_front_max[index] = front_max;
		++i->second.hb_index;
	      }

	      {
		// publish 1/5/15-interval aggregates into osd_stat
		std::lock_guard l(service.stat_lock);
		service.osd_stat.hb_pingtime[from].last_update = now.sec();
		service.osd_stat.hb_pingtime[from].back_last = back_pingtime;

		uint32_t total = 0;
		uint32_t min = UINT_MAX;
		uint32_t max = 0;
		uint32_t count = 0;
		uint32_t which = 0;
		uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
		for (int32_t k = size - 1 ; k >= 0; --k) {
		  ++count;
		  int index = (i->second.hb_index + k) % size;
		  total += i->second.hb_back_pingtime[index];
		  if (i->second.hb_back_min[index] < min)
		    min = i->second.hb_back_min[index];
		  if (i->second.hb_back_max[index] > max)
		    max = i->second.hb_back_max[index];
		  if (count == 1 || count == 5 || count == 15) {
		    service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
		    service.osd_stat.hb_pingtime[from].back_min[which] = min;
		    service.osd_stat.hb_pingtime[from].back_max[which] = max;
		    which++;
		    if (count == 15)
		      break;
		  }
		}

		if (i->second.con_front != NULL) {
		  service.osd_stat.hb_pingtime[from].front_last = front_pingtime;

		  total = 0;
		  min = UINT_MAX;
		  max = 0;
		  count = 0;
		  which = 0;
		  for (int32_t k = size - 1 ; k >= 0; --k) {
		    ++count;
		    int index = (i->second.hb_index + k) % size;
		    total += i->second.hb_front_pingtime[index];
		    if (i->second.hb_front_min[index] < min)
		      min = i->second.hb_front_min[index];
		    if (i->second.hb_front_max[index] > max)
		      max = i->second.hb_front_max[index];
		    if (count == 1 || count == 5 || count == 15) {
		      service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
		      service.osd_stat.hb_pingtime[from].front_min[which] = min;
		      service.osd_stat.hb_pingtime[from].front_max[which] = max;
		      which++;
		      if (count == 15)
			break;
		    }
		  }
		}
	      }
	    } else {
	      // interval not yet elapsed: just publish the latest samples
	      std::lock_guard l(service.stat_lock);
	      service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
	      if (i->second.con_front != NULL)
		service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
	    }
	    // this ping and any older ones are fully answered
	    i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
	  }

	  if (i->second.is_healthy(now)) {
	    // Cancel false reports
	    auto failure_queue_entry = failure_queue.find(from);
	    if (failure_queue_entry != failure_queue.end()) {
	      dout(10) << "handle_osd_ping canceling queued "
		       << "failure report for osd." << from << dendl;
	      failure_queue.erase(failure_queue_entry);
	    }

	    auto failure_pending_entry = failure_pending.find(from);
	    if (failure_pending_entry != failure_pending.end()) {
	      dout(10) << "handle_osd_ping canceling in-flight "
		       << "failure report for osd." << from << dendl;
	      send_still_alive(curmap->get_epoch(),
			       from,
			       failure_pending_entry->second.second);
	      failure_pending.erase(failure_pending_entry);
	    }
	  }
	} else {
	  // old replies, deprecated by newly sent pings.
	  dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
		   << ") is found, treat as covered by newly sent pings "
		   << "and ignore"
		   << dendl;
	}
      }

      if (m->map_epoch &&
	  curmap->is_up(from)) {
	// opportunistically share a newer osdmap over the cluster con
	if (is_active()) {
	  ConnectionRef cluster_con = service.get_con_osd_cluster(
	    from, curmap->get_epoch());
	  if (cluster_con) {
	    service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
	  }
	}
      }

      s->stamps->got_ping_reply(
	mnow,
	m->mono_send_stamp,
	m->delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
    }
    break;

  case MOSDPing::YOU_DIED:
    // a peer saw us marked down; grab newer maps so we can react
    dout(10) << "handle_osd_ping " << m->get_source_inst()
	     << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.unlock();
  m->put();
}
5557
5558 void OSD::heartbeat_entry()
5559 {
5560 std::unique_lock l(heartbeat_lock);
5561 if (is_stopping())
5562 return;
5563 while (!heartbeat_stop) {
5564 heartbeat();
5565
5566 double wait;
5567 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5568 wait = (float)cct->_conf->osd_heartbeat_interval;
5569 } else {
5570 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5571 }
5572 auto w = ceph::make_timespan(wait);
5573 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5574 heartbeat_cond.wait_for(l, w);
5575 if (is_stopping())
5576 return;
5577 dout(30) << "heartbeat_entry woke up" << dendl;
5578 }
5579 }
5580
5581 void OSD::heartbeat_check()
5582 {
5583 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5584 utime_t now = ceph_clock_now();
5585
5586 // check for incoming heartbeats (move me elsewhere?)
5587 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5588 p != heartbeat_peers.end();
5589 ++p) {
5590
5591 if (p->second.first_tx == utime_t()) {
5592 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5593 << " yet, skipping" << dendl;
5594 continue;
5595 }
5596
5597 dout(25) << "heartbeat_check osd." << p->first
5598 << " first_tx " << p->second.first_tx
5599 << " last_tx " << p->second.last_tx
5600 << " last_rx_back " << p->second.last_rx_back
5601 << " last_rx_front " << p->second.last_rx_front
5602 << dendl;
5603 if (p->second.is_unhealthy(now)) {
5604 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5605 if (p->second.last_rx_back == utime_t() ||
5606 p->second.last_rx_front == utime_t()) {
5607 derr << "heartbeat_check: no reply from "
5608 << p->second.con_front->get_peer_addr().get_sockaddr()
5609 << " osd." << p->first
5610 << " ever on either front or back, first ping sent "
5611 << p->second.first_tx
5612 << " (oldest deadline " << oldest_deadline << ")"
5613 << dendl;
5614 // fail
5615 failure_queue[p->first] = p->second.first_tx;
5616 } else {
5617 derr << "heartbeat_check: no reply from "
5618 << p->second.con_front->get_peer_addr().get_sockaddr()
5619 << " osd." << p->first << " since back " << p->second.last_rx_back
5620 << " front " << p->second.last_rx_front
5621 << " (oldest deadline " << oldest_deadline << ")"
5622 << dendl;
5623 // fail
5624 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5625 }
5626 }
5627 }
5628 }
5629
5630 void OSD::heartbeat()
5631 {
5632 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
5633 dout(30) << "heartbeat" << dendl;
5634
5635 // get CPU load avg
5636 double loadavgs[1];
5637 int hb_interval = cct->_conf->osd_heartbeat_interval;
5638 int n_samples = 86400;
5639 if (hb_interval > 1) {
5640 n_samples /= hb_interval;
5641 if (n_samples < 1)
5642 n_samples = 1;
5643 }
5644
5645 if (getloadavg(loadavgs, 1) == 1) {
5646 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5647 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5648 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5649 }
5650
5651 dout(30) << "heartbeat checking stats" << dendl;
5652
5653 // refresh peer list and osd stats
5654 vector<int> hb_peers;
5655 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5656 p != heartbeat_peers.end();
5657 ++p)
5658 hb_peers.push_back(p->first);
5659
5660 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5661 dout(5) << __func__ << " " << new_stat << dendl;
5662 ceph_assert(new_stat.statfs.total);
5663
5664 float pratio;
5665 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5666
5667 service.check_full_status(ratio, pratio);
5668
5669 utime_t now = ceph_clock_now();
5670 auto mnow = service.get_mnow();
5671 utime_t deadline = now;
5672 deadline += cct->_conf->osd_heartbeat_grace;
5673
5674 // send heartbeats
5675 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5676 i != heartbeat_peers.end();
5677 ++i) {
5678 int peer = i->first;
5679 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5680
5681 i->second.last_tx = now;
5682 if (i->second.first_tx == utime_t())
5683 i->second.first_tx = now;
5684 i->second.ping_history[now] = make_pair(deadline,
5685 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5686 if (i->second.hb_interval_start == utime_t())
5687 i->second.hb_interval_start = now;
5688
5689 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5690 std::optional<ceph::signedspan> delta_ub;
5691 s->stamps->sent_ping(&delta_ub);
5692
5693 i->second.con_back->send_message(
5694 new MOSDPing(monc->get_fsid(),
5695 service.get_osdmap_epoch(),
5696 MOSDPing::PING,
5697 now,
5698 mnow,
5699 mnow,
5700 service.get_up_epoch(),
5701 cct->_conf->osd_heartbeat_min_size,
5702 delta_ub));
5703
5704 if (i->second.con_front)
5705 i->second.con_front->send_message(
5706 new MOSDPing(monc->get_fsid(),
5707 service.get_osdmap_epoch(),
5708 MOSDPing::PING,
5709 now,
5710 mnow,
5711 mnow,
5712 service.get_up_epoch(),
5713 cct->_conf->osd_heartbeat_min_size,
5714 delta_ub));
5715 }
5716
5717 logger->set(l_osd_hb_to, heartbeat_peers.size());
5718
5719 // hmm.. am i all alone?
5720 dout(30) << "heartbeat lonely?" << dendl;
5721 if (heartbeat_peers.empty()) {
5722 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5723 last_mon_heartbeat = now;
5724 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5725 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5726 }
5727 }
5728
5729 dout(30) << "heartbeat done" << dendl;
5730 }
5731
// Messenger fault callback for heartbeat connections.  If the failed con
// still belongs to a current heartbeat peer, reopen fresh back/front
// connections for that peer and carry the session over; otherwise just
// drop the stale session ref.  Always returns true (reset handled).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  auto s = con->get_priv();
  dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      return true;
    }
    auto session = static_cast<Session*>(s.get());
    auto p = heartbeat_peers.find(session->peer);
    if (p != heartbeat_peers.end() &&
	(p->second.con_back == con ||
	 p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
	       << ", reopening" << dendl;
      // mark down only the failed con; a surviving sibling con stays up
      p->second.clear_mark_down(con);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
	p->second.con_back = newcon.first.get();
	p->second.con_back->set_priv(s);
	if (newcon.second) {
	  p->second.con_front = newcon.second.get();
	  p->second.con_front->set_priv(s);
	}
	// outstanding pings were sent on the old con(s); forget them so
	// they are not counted as missed replies
	p->second.ping_history.clear();
      } else {
	// peer no longer has heartbeat addrs in the map we know about
	dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
		 << ", raced with osdmap update, closing out peer" << dendl;
	heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
5770
5771
5772
5773 // =========================================
5774
5775 void OSD::tick()
5776 {
5777 ceph_assert(ceph_mutex_is_locked(osd_lock));
5778 dout(10) << "tick" << dendl;
5779
5780 utime_t now = ceph_clock_now();
5781 // throw out any obsolete markdown log
5782 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5783 while (!osd_markdown_log.empty() &&
5784 osd_markdown_log.front() + grace < now)
5785 osd_markdown_log.pop_front();
5786
5787 if (is_active() || is_waiting_for_healthy()) {
5788 maybe_update_heartbeat_peers();
5789 }
5790
5791 if (is_waiting_for_healthy()) {
5792 start_boot();
5793 }
5794
5795 if (is_waiting_for_healthy() || is_booting()) {
5796 std::lock_guard l(heartbeat_lock);
5797 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5798 last_mon_heartbeat = now;
5799 dout(1) << __func__ << " checking mon for new map" << dendl;
5800 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5801 }
5802 }
5803
5804 do_waiters();
5805
5806 // scrub purged_snaps every deep scrub interval
5807 {
5808 const utime_t last = superblock.last_purged_snaps_scrub;
5809 utime_t next = last;
5810 next += cct->_conf->osd_scrub_min_interval;
5811 std::mt19937 rng;
5812 // use a seed that is stable for each scrub interval, but varies
5813 // by OSD to avoid any herds.
5814 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
5815 double r = (rng() % 1024) / 1024;
5816 next +=
5817 cct->_conf->osd_scrub_min_interval *
5818 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5819 if (next < ceph_clock_now()) {
5820 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5821 << " next " << next << " ... now" << dendl;
5822 scrub_purged_snaps();
5823 } else {
5824 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5825 << " next " << next << dendl;
5826 }
5827 }
5828
5829 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5830 }
5831
// Periodic tick that deliberately runs WITHOUT osd_lock (only
// tick_timer_lock is held): refresh statfs/crc counters, run heartbeat
// checks and mon reports, kick scrub/recovery, and send the beacon.
// Reschedules itself at the end.
void OSD::tick_without_osd_lock()
{
  ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    {
      std::lock_guard l{heartbeat_lock};
      heartbeat_check();
    }
    // lock order: map_lock (shared) before mon_report_lock
    map_lock.lock_shared();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
	now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.unlock_shared();

    // if any shard has ops waiting on a map newer than ours, go fetch it
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
				   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
	       << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
	  cct->_conf->osd_beacon_report_interval) {
	need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      // send_beacon takes min_last_epoch_clean_lock itself; call outside
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
					      new C_Tick_WithoutOSDLock(this));
}
5907
5908 // Usage:
5909 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5910 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5911 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5912 // getomap <pool> [namespace/]<obj-name>
5913 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5914 // injectmdataerr [namespace/]<obj-name> [shardid]
5915 // injectdataerr [namespace/]<obj-name> [shardid]
5916 //
5917 // set_recovery_delay [utime]
5918 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5919 std::string_view command,
5920 const cmdmap_t& cmdmap, ostream &ss)
5921 {
5922 //Test support
5923 //Support changing the omap on a single osd by using the Admin Socket to
5924 //directly request the osd make a change.
5925 if (command == "setomapval" || command == "rmomapkey" ||
5926 command == "setomapheader" || command == "getomap" ||
5927 command == "truncobj" || command == "injectmdataerr" ||
5928 command == "injectdataerr"
5929 ) {
5930 pg_t rawpg;
5931 int64_t pool;
5932 OSDMapRef curmap = service->get_osdmap();
5933 int r = -1;
5934
5935 string poolstr;
5936
5937 cmd_getval(cmdmap, "pool", poolstr);
5938 pool = curmap->lookup_pg_pool_name(poolstr);
5939 //If we can't find it by name then maybe id specified
5940 if (pool < 0 && isdigit(poolstr[0]))
5941 pool = atoll(poolstr.c_str());
5942 if (pool < 0) {
5943 ss << "Invalid pool '" << poolstr << "''";
5944 return;
5945 }
5946
5947 string objname, nspace;
5948 cmd_getval(cmdmap, "objname", objname);
5949 std::size_t found = objname.find_first_of('/');
5950 if (found != string::npos) {
5951 nspace = objname.substr(0, found);
5952 objname = objname.substr(found+1);
5953 }
5954 object_locator_t oloc(pool, nspace);
5955 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5956
5957 if (r < 0) {
5958 ss << "Invalid namespace/objname";
5959 return;
5960 }
5961
5962 int64_t shardid;
5963 cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5964 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5965 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5966 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5967 if (curmap->pg_is_ec(rawpg)) {
5968 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5969 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5970 return;
5971 }
5972 }
5973
5974 ObjectStore::Transaction t;
5975
5976 if (command == "setomapval") {
5977 map<string, bufferlist> newattrs;
5978 bufferlist val;
5979 string key, valstr;
5980 cmd_getval(cmdmap, "key", key);
5981 cmd_getval(cmdmap, "val", valstr);
5982
5983 val.append(valstr);
5984 newattrs[key] = val;
5985 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5986 r = store->queue_transaction(service->meta_ch, std::move(t));
5987 if (r < 0)
5988 ss << "error=" << r;
5989 else
5990 ss << "ok";
5991 } else if (command == "rmomapkey") {
5992 string key;
5993 cmd_getval(cmdmap, "key", key);
5994
5995 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
5996 r = store->queue_transaction(service->meta_ch, std::move(t));
5997 if (r < 0)
5998 ss << "error=" << r;
5999 else
6000 ss << "ok";
6001 } else if (command == "setomapheader") {
6002 bufferlist newheader;
6003 string headerstr;
6004
6005 cmd_getval(cmdmap, "header", headerstr);
6006 newheader.append(headerstr);
6007 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6008 r = store->queue_transaction(service->meta_ch, std::move(t));
6009 if (r < 0)
6010 ss << "error=" << r;
6011 else
6012 ss << "ok";
6013 } else if (command == "getomap") {
6014 //Debug: Output entire omap
6015 bufferlist hdrbl;
6016 map<string, bufferlist> keyvals;
6017 auto ch = store->open_collection(coll_t(pgid));
6018 if (!ch) {
6019 ss << "unable to open collection for " << pgid;
6020 r = -ENOENT;
6021 } else {
6022 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6023 if (r >= 0) {
6024 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6025 for (map<string, bufferlist>::iterator it = keyvals.begin();
6026 it != keyvals.end(); ++it)
6027 ss << " key=" << (*it).first << " val="
6028 << string((*it).second.c_str(), (*it).second.length());
6029 } else {
6030 ss << "error=" << r;
6031 }
6032 }
6033 } else if (command == "truncobj") {
6034 int64_t trunclen;
6035 cmd_getval(cmdmap, "len", trunclen);
6036 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6037 r = store->queue_transaction(service->meta_ch, std::move(t));
6038 if (r < 0)
6039 ss << "error=" << r;
6040 else
6041 ss << "ok";
6042 } else if (command == "injectdataerr") {
6043 store->inject_data_error(gobj);
6044 ss << "ok";
6045 } else if (command == "injectmdataerr") {
6046 store->inject_mdata_error(gobj);
6047 ss << "ok";
6048 }
6049 return;
6050 }
6051 if (command == "set_recovery_delay") {
6052 int64_t delay;
6053 cmd_getval(cmdmap, "utime", delay, (int64_t)0);
6054 ostringstream oss;
6055 oss << delay;
6056 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6057 oss.str().c_str());
6058 if (r != 0) {
6059 ss << "set_recovery_delay: error setting "
6060 << "osd_recovery_delay_start to '" << delay << "': error "
6061 << r;
6062 return;
6063 }
6064 service->cct->_conf.apply_changes(nullptr);
6065 ss << "set_recovery_delay: set osd_recovery_delay_start "
6066 << "to " << service->cct->_conf->osd_recovery_delay_start;
6067 return;
6068 }
6069 if (command == "injectfull") {
6070 int64_t count;
6071 string type;
6072 OSDService::s_names state;
6073 cmd_getval(cmdmap, "type", type, string("full"));
6074 cmd_getval(cmdmap, "count", count, (int64_t)-1);
6075 if (type == "none" || count == 0) {
6076 type = "none";
6077 count = 0;
6078 }
6079 state = service->get_full_state(type);
6080 if (state == OSDService::s_names::INVALID) {
6081 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6082 return;
6083 }
6084 service->set_injectfull(state, count);
6085 return;
6086 }
6087 ss << "Internal error - command=" << command;
6088 }
6089
6090 // =========================================
6091
6092 void OSD::ms_handle_connect(Connection *con)
6093 {
6094 dout(10) << __func__ << " con " << con << dendl;
6095 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
6096 std::lock_guard l(osd_lock);
6097 if (is_stopping())
6098 return;
6099 dout(10) << __func__ << " on mon" << dendl;
6100
6101 if (is_preboot()) {
6102 start_boot();
6103 } else if (is_booting()) {
6104 _send_boot(); // resend boot message
6105 } else {
6106 map_lock.lock_shared();
6107 std::lock_guard l2(mon_report_lock);
6108
6109 utime_t now = ceph_clock_now();
6110 last_mon_report = now;
6111
6112 // resend everything, it's a new session
6113 send_full_update();
6114 send_alive();
6115 service.requeue_pg_temp();
6116 service.clear_sent_ready_to_merge();
6117 service.send_pg_temp();
6118 service.send_ready_to_merge();
6119 service.send_pg_created();
6120 requeue_failures();
6121 send_failures();
6122
6123 map_lock.unlock_shared();
6124 if (is_active()) {
6125 send_beacon(ceph::coarse_mono_clock::now());
6126 }
6127 }
6128
6129 // full map requests may happen while active or pre-boot
6130 if (requested_full_first) {
6131 rerequest_full_maps();
6132 }
6133 }
6134 }
6135
6136 void OSD::ms_handle_fast_connect(Connection *con)
6137 {
6138 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6139 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6140 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6141 s = ceph::make_ref<Session>(cct, con);
6142 con->set_priv(s);
6143 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6144 << " addr=" << s->con->get_peer_addr() << dendl;
6145 // we don't connect to clients
6146 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6147 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6148 }
6149 }
6150 }
6151
6152 void OSD::ms_handle_fast_accept(Connection *con)
6153 {
6154 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6155 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6156 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6157 s = ceph::make_ref<Session>(cct, con);
6158 con->set_priv(s);
6159 dout(10) << "new session (incoming)" << s << " con=" << con
6160 << " addr=" << con->get_peer_addr()
6161 << " must have raced with connect" << dendl;
6162 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6163 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6164 }
6165 }
6166 }
6167
// Messenger callback: the remote side reset the connection.  Tear down
// the Session attached to it.  Returns true if we owned/handled a
// session on this connection.
bool OSD::ms_handle_reset(Connection *con)
{
  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  // drop watch/notify state tied to this connection
  session->wstate.reset(con);
  // detach the session from the connection...
  session->con->set_priv(nullptr);
  session->con.reset();  // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(session);
  return true;
}
6183
6184 bool OSD::ms_handle_refused(Connection *con)
6185 {
6186 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6187 return false;
6188
6189 auto session = ceph::ref_cast<Session>(con->get_priv());
6190 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
6191 if (!session)
6192 return false;
6193 int type = con->get_peer_type();
6194 // handle only OSD failures here
6195 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6196 OSDMapRef osdmap = get_osdmap();
6197 if (osdmap) {
6198 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6199 if (id >= 0 && osdmap->is_up(id)) {
6200 // I'm cheating mon heartbeat grace logic, because we know it's not going
6201 // to respawn alone. +1 so we won't hit any boundary case.
6202 monc->send_mon_message(
6203 new MOSDFailure(
6204 monc->get_fsid(),
6205 id,
6206 osdmap->get_addrs(id),
6207 cct->_conf->osd_heartbeat_grace + 1,
6208 osdmap->get_epoch(),
6209 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6210 ));
6211 }
6212 }
6213 }
6214 return true;
6215 }
6216
6217 struct C_OSD_GetVersion : public Context {
6218 OSD *osd;
6219 uint64_t oldest, newest;
6220 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6221 void finish(int r) override {
6222 if (r >= 0)
6223 osd->_got_mon_epochs(oldest, newest);
6224 }
6225 };
6226
// Begin the boot sequence: once we believe we are healthy, enter
// PREBOOT and ask the mon which osdmap epochs it has; the reply (via
// C_OSD_GetVersion) drives _got_mon_epochs() -> _preboot().
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
	   << ".." << superblock.newest_map << dendl;
  // c is owned by the monc callback machinery; finish() fires when the
  // mon reports its osdmap version range
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
6245
6246 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6247 {
6248 std::lock_guard l(osd_lock);
6249 if (is_preboot()) {
6250 _preboot(oldest, newest);
6251 }
6252 }
6253
// Decide whether we can send MOSDBoot yet.  The if/else-if chain below
// is a priority-ordered list of reasons to hold off; only when none
// apply (and our map is recent enough) do we queue _send_boot().
// [oldest, newest] is the range of full osdmaps the mon has.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
	   << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  const auto& monmap = monc->monmap;
  const auto osdmap = get_osdmap();
  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (osdmap->require_osd_release < ceph_release_t::luminous) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
	 << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
	     superblock.purged_snaps_last < superblock.current_epoch) {
    // octopus+ mons track purged snaps; catch up our local record first
    dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
	     << " < newest_map " << superblock.current_epoch << dendl;
    _get_purged_snaps();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
	     osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new LambdaContext(
	[this](int r) {
	  std::unique_lock l(osd_lock);
	  if (is_preboot()) {
	    dout(10) << __func__ << " waiting for peering work to drain"
		     << dendl;
	    // drop osd_lock while waiting so peering threads can proceed
	    l.unlock();
	    for (auto shard : shards) {
	      shard->wait_min_pg_epoch(get_osdmap_epoch());
	    }
	    l.lock();
	  }
	  // re-check: state may have changed while the lock was dropped
	  if (is_preboot()) {
	    _send_boot();
	  }
	}));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);  // incremental catch-up
  else
    osdmap_subscribe(oldest - 1, true);  // too far behind; need full maps
}
6326
// Ask the mon for the purged-snaps records covering the epochs between
// our last recorded point and the current superblock epoch.  The reply
// is handled by handle_get_purged_snaps_reply().
void OSD::_get_purged_snaps()
{
  // NOTE: this is a naive, stateless implementation. it may send multiple
  // overlapping requests to the mon, which will be somewhat inefficient, but
  // it should be reliable.
  dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
	   << ", newest_map " << superblock.current_epoch << dendl;
  MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
    superblock.purged_snaps_last + 1,
    superblock.current_epoch + 1);
  monc->send_mon_message(m);
}
6339
6340 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6341 {
6342 dout(10) << __func__ << " " << *m << dendl;
6343 ObjectStore::Transaction t;
6344 if (!is_preboot() ||
6345 m->last < superblock.purged_snaps_last) {
6346 goto out;
6347 }
6348 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
6349 make_purged_snaps_oid(), &t,
6350 m->purged_snaps);
6351 superblock.purged_snaps_last = m->last;
6352 write_superblock(t);
6353 store->queue_transaction(
6354 service.meta_ch,
6355 std::move(t));
6356 service.publish_superblock(superblock);
6357 if (m->last < superblock.current_epoch) {
6358 _get_purged_snaps();
6359 } else {
6360 start_boot();
6361 }
6362 out:
6363 m->put();
6364 }
6365
6366 void OSD::send_full_update()
6367 {
6368 if (!service.need_fullness_update())
6369 return;
6370 unsigned state = 0;
6371 if (service.is_full()) {
6372 state = CEPH_OSD_FULL;
6373 } else if (service.is_backfillfull()) {
6374 state = CEPH_OSD_BACKFILLFULL;
6375 } else if (service.is_nearfull()) {
6376 state = CEPH_OSD_NEARFULL;
6377 }
6378 set<string> s;
6379 OSDMap::calc_state_set(state, s);
6380 dout(10) << __func__ << " want state " << s << dendl;
6381 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6382 }
6383
// Enter the WAITING_FOR_HEALTHY state: we will not boot until
// _is_healthy() passes.  Also subscribe to osdmap updates so we learn
// if our unhealthy-looking peers are in fact marked down.
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  // reset so the heartbeat peer set is re-sampled promptly
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(get_osdmap_epoch() + 1, false);
}
6393
6394 bool OSD::_is_healthy()
6395 {
6396 if (!cct->get_heartbeat_map()->is_healthy()) {
6397 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6398 return false;
6399 }
6400
6401 if (is_waiting_for_healthy()) {
6402 utime_t now = ceph_clock_now();
6403 if (osd_markdown_log.empty()) {
6404 dout(5) << __func__ << " force returning true since last markdown"
6405 << " was " << cct->_conf->osd_max_markdown_period
6406 << "s ago" << dendl;
6407 return true;
6408 }
6409 std::lock_guard l(heartbeat_lock);
6410 int num = 0, up = 0;
6411 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6412 p != heartbeat_peers.end();
6413 ++p) {
6414 if (p->second.is_healthy(now))
6415 ++up;
6416 ++num;
6417 }
6418 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6419 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6420 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6421 return false;
6422 }
6423 }
6424
6425 return true;
6426 }
6427
// Send MOSDBoot to the mon, marking us as wanting to come up.  Before
// sending we fill in any still-unknown messenger bind addresses by
// propagating known ones (client -> cluster -> heartbeat messengers),
// and make sure each loopback connection has a Session attached.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
	     << client_addrs << dendl;
    // re-read: set_addr_unknowns may have filled in addresses
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    // ensure the loopback connection has a session like any other
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
	     << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
	     << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6490
// Populate *pm with this OSD's metadata key/value pairs for the mon:
// paths, bind addresses, objectstore/backend info, system info, network
// interfaces and their NUMA placement, CPU NUMA binding, and per-device
// metadata.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    int node = -1;         // first NUMA node seen, if any
    set<int> nodes;        // all NUMA nodes our interfaces live on
    set<string> unknown;   // interfaces whose NUMA node we couldn't resolve
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
	unknown.insert(nm);
	continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
	unknown.insert((*pm)[nm]);
	continue;
      }
      nodes.insert(n);
      if (node < 0) {
	node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    // only report a single definitive node when everything resolved to it
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
						  &numa_cpu_set);
  }

  // per-device metadata (model, serial, etc.); errors are logged only
  set<string> devnames;
  store->get_devices(&devnames);
  map<string,string> errs;
  get_device_metadata(devnames, pm, &errs);
  for (auto& i : errs) {
    dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
  }
  dout(10) << __func__ << " " << *pm << dendl;
}
6567
6568 void OSD::queue_want_up_thru(epoch_t want)
6569 {
6570 std::shared_lock map_locker{map_lock};
6571 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6572 std::lock_guard report_locker(mon_report_lock);
6573 if (want > up_thru_wanted) {
6574 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6575 << ", currently " << cur
6576 << dendl;
6577 up_thru_wanted = want;
6578 send_alive();
6579 } else {
6580 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6581 << ", currently " << cur
6582 << dendl;
6583 }
6584 }
6585
6586 void OSD::send_alive()
6587 {
6588 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6589 const auto osdmap = get_osdmap();
6590 if (!osdmap->exists(whoami))
6591 return;
6592 epoch_t up_thru = osdmap->get_up_thru(whoami);
6593 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6594 if (up_thru_wanted > up_thru) {
6595 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6596 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6597 }
6598 }
6599
// Request full osdmaps [first, last] from the mon, merging with any
// outstanding request so we never re-request epochs already in flight.
// State is tracked in requested_full_first/last (0 == no request).
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  dout(10) << __func__ << " " << first << ".." << last
	   << ", previously requested "
	   << requested_full_first << ".." << requested_full_last << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  ceph_assert(first > 0 && last > 0);
  ceph_assert(first <= last);
  ceph_assert(first >= requested_full_first);  // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request: only ask for the epochs beyond what's in flight
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
6625
// Note that full osdmap epoch e has arrived, advancing or clearing the
// outstanding-request window tracked by request_full_map().
void OSD::got_full_map(epoch_t e)
{
  ceph_assert(requested_full_first <= requested_full_last);
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  if (requested_full_first == 0) {
    // no request outstanding; nothing to account for
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // older than our window; irrelevant
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last
	     << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    // window satisfied; clear the request state
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // partial progress: shrink the window past what we've received
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	   << ".." << requested_full_last
	   << ", still need more" << dendl;
}
6653
6654 void OSD::requeue_failures()
6655 {
6656 std::lock_guard l(heartbeat_lock);
6657 unsigned old_queue = failure_queue.size();
6658 unsigned old_pending = failure_pending.size();
6659 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6660 failure_queue[p->first] = p->second.first;
6661 failure_pending.erase(p++);
6662 }
6663 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6664 << failure_queue.size() << dendl;
6665 }
6666
6667 void OSD::send_failures()
6668 {
6669 ceph_assert(ceph_mutex_is_locked(map_lock));
6670 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6671 std::lock_guard l(heartbeat_lock);
6672 utime_t now = ceph_clock_now();
6673 const auto osdmap = get_osdmap();
6674 while (!failure_queue.empty()) {
6675 int osd = failure_queue.begin()->first;
6676 if (!failure_pending.count(osd)) {
6677 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6678 monc->send_mon_message(
6679 new MOSDFailure(
6680 monc->get_fsid(),
6681 osd,
6682 osdmap->get_addrs(osd),
6683 failed_for,
6684 osdmap->get_epoch()));
6685 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6686 osdmap->get_addrs(osd));
6687 }
6688 failure_queue.erase(osd);
6689 }
6690 }
6691
6692 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6693 {
6694 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6695 MOSDFailure::FLAG_ALIVE);
6696 monc->send_mon_message(m);
6697 }
6698
6699 void OSD::cancel_pending_failures()
6700 {
6701 std::lock_guard l(heartbeat_lock);
6702 auto it = failure_pending.begin();
6703 while (it != failure_pending.end()) {
6704 dout(10) << __func__ << " canceling in-flight failure report for osd."
6705 << it->first << dendl;
6706 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6707 failure_pending.erase(it++);
6708 }
6709 }
6710
// Send a liveness beacon (with our min last-epoch-clean info) to the
// mon, provided the mon cluster is known to support it.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // snapshot the epoch-clean state and update last_sent_beacon
      // under the same lock
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(get_osdmap_epoch(),
			      min_last_epoch_clean,
			      superblock.last_purged_snaps_scrub);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    // send outside the lock
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
6734
6735 void OSD::handle_command(MCommand *m)
6736 {
6737 ConnectionRef con = m->get_connection();
6738 auto session = ceph::ref_cast<Session>(con->get_priv());
6739 if (!session) {
6740 con->send_message(new MCommandReply(m, -EACCES));
6741 m->put();
6742 return;
6743 }
6744 if (!session->caps.allow_all()) {
6745 con->send_message(new MCommandReply(m, -EACCES));
6746 m->put();
6747 return;
6748 }
6749 cct->get_admin_socket()->queue_tell_command(m);
6750 m->put();
6751 }
6752
6753 namespace {
6754 class unlock_guard {
6755 ceph::mutex& m;
6756 public:
6757 explicit unlock_guard(ceph::mutex& mutex)
6758 : m(mutex)
6759 {
6760 m.unlock();
6761 }
6762 unlock_guard(unlock_guard&) = delete;
6763 ~unlock_guard() {
6764 m.lock();
6765 }
6766 };
6767 }
6768
// Scrub the purged-snaps records against the snap mapper, requeue a
// snap re-trim on every PG with a stray mapping, and record the scrub
// time in the superblock.  Enters with osd_lock held; the lock is
// dropped for the long-running scrub and retaken afterwards.
void OSD::scrub_purged_snaps()
{
  dout(10) << __func__ << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  SnapMapper::Scrubber s(cct, store, service.meta_ch,
			 make_snapmapper_oid(),
			 make_purged_snaps_oid());
  clog->debug() << "purged_snaps scrub starts";
  // drop osd_lock: s.run() can take a long time
  osd_lock.unlock();
  s.run();
  if (s.stray.size()) {
    clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
  } else {
    clog->debug() << "purged_snaps scrub ok";
  }
  // queue each (pg, snap) at most once
  set<pair<spg_t,snapid_t>> queued;
  for (auto& [pool, snap, hash, shard] : s.stray) {
    const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
    if (!pi) {
      dout(20) << __func__ << " pool " << pool << " dne" << dendl;
      continue;
    }
    // map the stray's hash back to the PG that owns it
    pg_t pgid(pi->raw_hash_to_pg(hash), pool);
    spg_t spgid(pgid, shard);
    pair<spg_t,snapid_t> p(spgid, snap);
    if (queued.count(p)) {
      dout(20) << __func__ << " pg " << spgid << " snap " << snap
	       << " already queued" << dendl;
      continue;
    }
    PGRef pg = lookup_lock_pg(spgid);
    if (!pg) {
      dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
      continue;
    }
    queued.insert(p);
    dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
	     << snap << dendl;
    pg->queue_snap_retrim(snap);
    pg->unlock();
  }
  // retake osd_lock before touching OSD state / returning to caller
  osd_lock.lock();
  if (is_stopping()) {
    return;
  }
  dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
  ObjectStore::Transaction t;
  superblock.last_purged_snaps_scrub = ceph_clock_now();
  write_superblock(t);
  int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
  ceph_assert(tr == 0);
  if (is_active()) {
    send_beacon(ceph::coarse_mono_clock::now());
  }
  dout(10) << __func__ << " done" << dendl;
}
6825
6826 void OSD::probe_smart(const string& only_devid, ostream& ss)
6827 {
6828 set<string> devnames;
6829 store->get_devices(&devnames);
6830 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6831 "osd_smart_report_timeout");
6832
6833 // == typedef std::map<std::string, mValue> mObject;
6834 json_spirit::mObject json_map;
6835
6836 for (auto dev : devnames) {
6837 // smartctl works only on physical devices; filter out any logical device
6838 if (dev.find("dm-") == 0) {
6839 continue;
6840 }
6841
6842 string err;
6843 string devid = get_device_id(dev, &err);
6844 if (devid.size() == 0) {
6845 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6846 << err << "), skipping" << dendl;
6847 continue;
6848 }
6849 if (only_devid.size() && devid != only_devid) {
6850 continue;
6851 }
6852
6853 json_spirit::mValue smart_json;
6854 if (block_device_get_metrics(dev, smart_timeout,
6855 &smart_json)) {
6856 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6857 continue;
6858 }
6859 json_map[devid] = smart_json;
6860 }
6861 json_spirit::write(json_map, ss, json_spirit::pretty_print);
6862 }
6863
// Dispatch entry point for the heartbeat messengers.  Handles pings and
// OSD heartbeat messages; everything else is dropped.  Always claims
// the message (returns true); m's reference is consumed on every path
// (handle_osd_ping takes ownership of its argument).
bool OSD::heartbeat_dispatch(Message *m)
{
  dout(30) << "heartbeat_dispatch " << m << dendl;
  switch (m->get_type()) {

  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source_inst() << dendl;
    m->put();
    break;

  case MSG_OSD_PING:
    handle_osd_ping(static_cast<MOSDPing*>(m));
    break;

  default:
    dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
    m->put();
  }

  return true;
}
6885
6886 bool OSD::ms_dispatch(Message *m)
6887 {
6888 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6889 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6890 service.got_stop_ack();
6891 m->put();
6892 return true;
6893 }
6894
6895 // lock!
6896
6897 osd_lock.lock();
6898 if (is_stopping()) {
6899 osd_lock.unlock();
6900 m->put();
6901 return true;
6902 }
6903
6904 do_waiters();
6905 _dispatch(m);
6906
6907 osd_lock.unlock();
6908
6909 return true;
6910 }
6911
6912 void OSDService::maybe_share_map(
6913 Connection *con,
6914 const OSDMapRef& osdmap,
6915 epoch_t peer_epoch_lb)
6916 {
6917 // NOTE: we assume caller hold something that keeps the Connection itself
6918 // pinned (e.g., an OpRequest's MessageRef).
6919 auto session = ceph::ref_cast<Session>(con->get_priv());
6920 if (!session) {
6921 return;
6922 }
6923
6924 // assume the peer has the newer of the op's sent_epoch and what
6925 // we think we sent them.
6926 session->sent_epoch_lock.lock();
6927 if (peer_epoch_lb > session->last_sent_epoch) {
6928 dout(10) << __func__ << " con " << con
6929 << " " << con->get_peer_addr()
6930 << " map epoch " << session->last_sent_epoch
6931 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
6932 session->last_sent_epoch = peer_epoch_lb;
6933 }
6934 epoch_t last_sent_epoch = session->last_sent_epoch;
6935 session->sent_epoch_lock.unlock();
6936
6937 if (osdmap->get_epoch() <= last_sent_epoch) {
6938 return;
6939 }
6940
6941 send_incremental_map(last_sent_epoch, con, osdmap);
6942 last_sent_epoch = osdmap->get_epoch();
6943
6944 session->sent_epoch_lock.lock();
6945 if (session->last_sent_epoch < last_sent_epoch) {
6946 dout(10) << __func__ << " con " << con
6947 << " " << con->get_peer_addr()
6948 << " map epoch " << session->last_sent_epoch
6949 << " -> " << last_sent_epoch << " (shared)" << dendl;
6950 session->last_sent_epoch = last_sent_epoch;
6951 }
6952 session->sent_epoch_lock.unlock();
6953 }
6954
// Drain the session's waiting_on_map queue: enqueue every op whose
// minimum required epoch is covered by osdmap, stopping at the first
// op that still needs a newer map (the queue is ordered, so later ops
// must keep waiting too).  Caller must hold session_dispatch_lock.
void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
{
  ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    // waiting_on_map is an intrusive list of OpRequests; take a ref
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    auto m = op->get_req<MOSDFastDispatchOp>();
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      break;  // this (and all later) ops need a newer map
    }
    session->waiting_on_map.erase(i++);
    // drop the reference the list held; `op` still keeps the request alive
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries a raw pg; resolve it against this map
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	continue;  // no primary shard for this pg in this map; drop
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  // keep the session registered only while it still has waiters
  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
6989
// Fast-dispatch entry point: route messages without taking osd_lock.
// Peering/admin messages are handled (and consumed) directly; ordinary
// ops are wrapped in an OpRequest and enqueued, either straight to
// their spg_t or — for legacy clients — via the session's
// waiting_on_map queue to preserve delivery order.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_PG_NOTIFY2:
  case MSG_OSD_PG_QUERY2:
  case MSG_OSD_PG_INFO2:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
  case MSG_OSD_PG_LEASE:
  case MSG_OSD_PG_LEASE_ACK:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      // only accept peering traffic from actual OSD peers
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // anything else becomes a tracked OpRequest
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch);  // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // take an extra ref for the intrusive waiting_on_map list
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7086
// Messenger callback: a connection has completed authentication.
// Attaches a Session to the connection (creating one if needed) and
// parses the peer's capability blob into it.
// Returns 1 when caps were parsed successfully, 0 when no caps were
// supplied (and allow_all was not set), -EACCES when the caps blob
// failed to decode or parse.
7087 int OSD::ms_handle_authentication(Connection *con)
7088 {
7089 int ret = 0;
7090 auto s = ceph::ref_cast<Session>(con->get_priv());
7091 if (!s) {
// first time we see this connection: create and attach a Session
7092 s = ceph::make_ref<Session>(cct, con);
7093 con->set_priv(s);
7094 s->entity_name = con->get_peer_entity_name();
7095 dout(10) << __func__ << " new session " << s << " con " << s->con
7096 << " entity " << s->entity_name
7097 << " addr " << con->get_peer_addrs() << dendl;
7098 } else {
7099 dout(10) << __func__ << " existing session " << s << " con " << s->con
7100 << " entity " << s->entity_name
7101 << " addr " << con->get_peer_addrs() << dendl;
7102 }
7103 
7104 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7105 if (caps_info.allow_all) {
7106 s->caps.set_allow_all();
7107 } else if (caps_info.caps.length() > 0) {
// caps arrive as an encoded string; decode then parse into OSDCap
7108 bufferlist::const_iterator p = caps_info.caps.cbegin();
7109 string str;
7110 try {
7111 decode(str, p);
7112 }
7113 catch (buffer::error& e) {
// malformed caps blob: reject the peer
7114 dout(10) << __func__ << " session " << s << " " << s->entity_name
7115 << " failed to decode caps string" << dendl;
7116 ret = -EACCES;
7117 }
7118 if (!ret) {
7119 bool success = s->caps.parse(str);
7120 if (success) {
7121 dout(10) << __func__ << " session " << s
7122 << " " << s->entity_name
7123 << " has caps " << s->caps << " '" << str << "'" << dendl;
7124 ret = 1;
7125 } else {
7126 dout(10) << __func__ << " session " << s << " " << s->entity_name
7127 << " failed to parse caps '" << str << "'" << dendl;
7128 ret = -EACCES;
7129 }
7130 }
7131 }
7132 return ret;
7133 }
7134
7135 void OSD::do_waiters()
7136 {
7137 ceph_assert(ceph_mutex_is_locked(osd_lock));
7138
7139 dout(10) << "do_waiters -- start" << dendl;
7140 while (!finished.empty()) {
7141 OpRequestRef next = finished.front();
7142 finished.pop_front();
7143 dispatch_op(next);
7144 }
7145 dout(10) << "do_waiters -- finish" << dendl;
7146 }
7147
7148 void OSD::dispatch_op(OpRequestRef op)
7149 {
7150 switch (op->get_req()->get_type()) {
7151
7152 case MSG_OSD_PG_CREATE:
7153 handle_pg_create(op);
7154 break;
7155 }
7156 }
7157
// Slow (non-fast-dispatch) message handler.  Routes the few message types
// that still take osd_lock.  Caller must hold osd_lock.
7158 void OSD::_dispatch(Message *m)
7159 {
7160 ceph_assert(ceph_mutex_is_locked(osd_lock));
7161 dout(20) << "_dispatch " << m << " " << *m << dendl;
7162 
7163 switch (m->get_type()) {
7164 // -- don't need OSDMap --
7165 
7166 // map and replication
7167 case CEPH_MSG_OSD_MAP:
7168 handle_osd_map(static_cast<MOSDMap*>(m));
7169 break;
7170 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7171 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7172 break;
7173 
7174 // osd
7175 case MSG_OSD_SCRUB:
7176 handle_scrub(static_cast<MOSDScrub*>(m));
7177 break;
7178 
7179 case MSG_COMMAND:
7180 handle_command(static_cast<MCommand*>(m));
// note: return (not break) -- handle_command consumes the message
7181 return;
7182 
7183 // -- need OSDMap --
7184 
7185 case MSG_OSD_PG_CREATE:
7186 {
// wrap in an OpRequest so it can be tracked and/or queued for later
7187 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7188 if (m->trace)
7189 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7190 // no map? starting up?
7191 if (!get_osdmap()) {
7192 dout(7) << "no OSDMap, not booted" << dendl;
7193 logger->inc(l_osd_waiting_for_map);
7194 waiting_for_osdmap.push_back(op);
7195 op->mark_delayed("no osdmap");
7196 break;
7197 }
7198 
7199 // need OSDMap
7200 dispatch_op(op);
7201 }
7202 }
7203 }
7204
7205 // remove me post-nautilus
// Legacy (pre-octopus) scrub request from mon/mgr: MOSDScrub names PGs by
// pg_t, so we must map each one to the primary shard we actually host
// before queueing a RequestScrub peering event.
7206 void OSD::handle_scrub(MOSDScrub *m)
7207 {
7208 dout(10) << "handle_scrub " << *m << dendl;
7209 if (!require_mon_or_mgr_peer(m)) {
7210 m->put();
7211 return;
7212 }
7213 if (m->fsid != monc->get_fsid()) {
7214 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7215 << dendl;
7216 m->put();
7217 return;
7218 }
7219 
// start from every PG we host ...
7220 vector<spg_t> spgs;
7221 _get_pgids(&spgs);
7222 
// ... then, if the message names specific PGs, keep only those for which
// we host the primary shard
7223 if (!m->scrub_pgs.empty()) {
7224 vector<spg_t> v;
7225 for (auto pgid : m->scrub_pgs) {
7226 spg_t pcand;
7227 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
7228 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7229 v.push_back(pcand);
7230 }
7231 }
7232 spgs.swap(v);
7233 }
7234 
7235 for (auto pgid : spgs) {
7236 enqueue_peering_evt(
7237 pgid,
7238 PGPeeringEventRef(
7239 std::make_shared<PGPeeringEvent>(
7240 get_osdmap_epoch(),
7241 get_osdmap_epoch(),
7242 PeeringState::RequestScrub(m->deep, m->repair))));
7243 }
7244 
7245 m->put();
7246 }
7247
// Fast-dispatch scrub request (MOSDScrub2): PGs are already named by
// spg_t and the message carries its own epoch, so we can queue
// RequestScrub peering events directly without consulting the osdmap.
7248 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7249 {
7250 dout(10) << __func__ << " " << *m << dendl;
7251 if (!require_mon_or_mgr_peer(m)) {
7252 m->put();
7253 return;
7254 }
7255 if (m->fsid != monc->get_fsid()) {
7256 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7257 << dendl;
7258 m->put();
7259 return;
7260 }
7261 for (auto pgid : m->scrub_pgs) {
7262 enqueue_peering_evt(
7263 pgid,
7264 PGPeeringEventRef(
7265 std::make_shared<PGPeeringEvent>(
7266 m->epoch,
7267 m->epoch,
7268 PeeringState::RequestScrub(m->deep, m->repair))));
7269 }
7270 m->put();
7271 }
7272
7273 bool OSD::scrub_random_backoff()
7274 {
7275 bool coin_flip = (rand() / (double)RAND_MAX >=
7276 cct->_conf->osd_scrub_backoff_ratio);
7277 if (!coin_flip) {
7278 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7279 return true;
7280 }
7281 return false;
7282 }
7283
// Build a scheduling entry for a PG scrub.
//   timestamp: base time (typically the last scrub stamp)
//   pool_scrub_{min,max}_interval: per-pool overrides; <= 0 falls back to
//     the global osd_scrub_{min,max}_interval config values
//   must: an operator-requested scrub, scheduled immediately with no jitter
7284 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7285 const spg_t& pg, const utime_t& timestamp,
7286 double pool_scrub_min_interval,
7287 double pool_scrub_max_interval, bool must)
7288 : cct(cct),
7289 pgid(pg),
7290 sched_time(timestamp),
7291 deadline(timestamp)
7292 {
7293 // if not explicitly requested, postpone the scrub with a random delay
7294 if (!must) {
7295 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7296 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7297 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7298 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7299 
// earliest time: min interval plus a random jitter fraction of it,
// to spread scrubs out over time
7300 sched_time += scrub_min_interval;
7301 double r = rand() / (double)RAND_MAX;
7302 sched_time +=
7303 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
// a zero max interval means "no deadline" (utime_t() == is_zero())
7304 if (scrub_max_interval == 0) {
7305 deadline = utime_t();
7306 } else {
7307 deadline += scrub_max_interval;
7308 }
7309 
7310 }
7311 }
7312
7313 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7314 if (sched_time < rhs.sched_time)
7315 return true;
7316 if (sched_time > rhs.sched_time)
7317 return false;
7318 return pgid < rhs.pgid;
7319 }
7320
7321 double OSD::scrub_sleep_time(bool must_scrub)
7322 {
7323 if (must_scrub) {
7324 return cct->_conf->osd_scrub_sleep;
7325 }
7326 utime_t now = ceph_clock_now();
7327 if (scrub_time_permit(now)) {
7328 return cct->_conf->osd_scrub_sleep;
7329 }
7330 double normal_sleep = cct->_conf->osd_scrub_sleep;
7331 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7332 return std::max(extended_sleep, normal_sleep);
7333 }
7334
7335 bool OSD::scrub_time_permit(utime_t now)
7336 {
7337 struct tm bdt;
7338 time_t tt = now.sec();
7339 localtime_r(&tt, &bdt);
7340
7341 bool day_permit = false;
7342 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7343 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7344 day_permit = true;
7345 }
7346 } else {
7347 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7348 day_permit = true;
7349 }
7350 }
7351
7352 if (!day_permit) {
7353 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7354 << " - " << cct->_conf->osd_scrub_end_week_day
7355 << " now " << bdt.tm_wday << " = no" << dendl;
7356 return false;
7357 }
7358
7359 bool time_permit = false;
7360 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7361 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7362 time_permit = true;
7363 }
7364 } else {
7365 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7366 time_permit = true;
7367 }
7368 }
7369 if (!time_permit) {
7370 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7371 << " - " << cct->_conf->osd_scrub_end_hour
7372 << " now " << bdt.tm_hour << " = no" << dendl;
7373 } else {
7374 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7375 << " - " << cct->_conf->osd_scrub_end_hour
7376 << " now " << bdt.tm_hour << " = yes" << dendl;
7377 }
7378 return time_permit;
7379 }
7380
// Decide whether system load is low enough to start a scrub.
// Allows scrubbing when either (a) the 1-minute loadavg per CPU is below
// osd_scrub_load_threshold, or (b) the 1-minute loadavg is below both the
// daily average and the 15-minute average (load is trending down).
7381 bool OSD::scrub_load_below_threshold()
7382 {
// loadavgs[0..2] = 1, 5, and 15 minute load averages
7383 double loadavgs[3];
7384 if (getloadavg(loadavgs, 3) != 3) {
7385 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7386 return false;
7387 }
7388 
7389 // allow scrub if below configured threshold
7390 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
// guard against sysconf failure (<= 0): fall back to the raw loadavg
7391 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7392 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7393 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7394 << " < max " << cct->_conf->osd_scrub_load_threshold
7395 << " = yes" << dendl;
7396 return true;
7397 }
7398 
7399 // allow scrub if below daily avg and currently decreasing
7400 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7401 dout(20) << __func__ << " loadavg " << loadavgs[0]
7402 << " < daily_loadavg " << daily_loadavg
7403 << " and < 15m avg " << loadavgs[2]
7404 << " = yes" << dendl;
7405 return true;
7406 }
7407 
7408 dout(20) << __func__ << " loadavg " << loadavgs[0]
7409 << " >= max " << cct->_conf->osd_scrub_load_threshold
7410 << " and ( >= daily_loadavg " << daily_loadavg
7411 << " or >= 15m avg " << loadavgs[2]
7412 << ") = no" << dendl;
7413 return false;
7414 }
7415
// Walk the scrub-job queue (ordered by sched_time) and start the first
// eligible PG scrub.  Stops early when jobs are not yet due, when a PG is
// still resolving a scrub reservation, or once a scrub is launched.
7416 void OSD::sched_scrub()
7417 {
7418 // if not permitted, fail fast
7419 if (!service.can_inc_scrubs()) {
7420 return;
7421 }
// during recovery, either restrict to explicitly requested repairs or
// skip scheduling entirely, depending on config
7422 bool allow_requested_repair_only = false;
7423 if (service.is_recovery_active()) {
7424 if (!cct->_conf->osd_scrub_during_recovery && cct->_conf->osd_repair_during_recovery) {
7425 dout(10) << __func__
7426 << " will only schedule explicitly requested repair due to active recovery"
7427 << dendl;
7428 allow_requested_repair_only = true;
7429 } else if (!cct->_conf->osd_scrub_during_recovery && !cct->_conf->osd_repair_during_recovery) {
7430 dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
7431 return;
7432 }
7433 }
7434 
7435 utime_t now = ceph_clock_now();
7436 bool time_permit = scrub_time_permit(now);
7437 bool load_is_low = scrub_load_below_threshold();
7438 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7439 
7440 OSDService::ScrubJob scrub;
7441 if (service.first_scrub_stamp(&scrub)) {
7442 do {
7443 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7444 
// jobs are sorted by sched_time, so the first future job ends the scan
7445 if (scrub.sched_time > now) {
7446 // save ourselves some effort
7447 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7448 << " > " << now << dendl;
7449 break;
7450 }
7451 
// before the deadline, scrubs require both the time window and low
// load; once the deadline passes (non-zero and < now) they run anyway
7452 if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
7453 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7454 << (!time_permit ? "time not permit" : "high load") << dendl;
7455 continue;
7456 }
7457 
// _lookup_lock_pg returns the PG locked; every path below must unlock
7458 PGRef pg = _lookup_lock_pg(scrub.pgid);
7459 if (!pg)
7460 continue;
7461 // This has already started, so go on to the next scrub job
7462 if (pg->scrubber.active) {
7463 pg->unlock();
7464 dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
7465 continue;
7466 }
7467 // Skip other kinds of scrubing if only explicitly requested repairing is allowed
7468 if (allow_requested_repair_only && !pg->scrubber.must_repair) {
7469 pg->unlock();
7470 dout(10) << __func__ << " skip " << scrub.pgid
7471 << " because repairing is not explicitly requested on it"
7472 << dendl;
7473 continue;
7474 }
7475 // If it is reserving, let it resolve before going to the next scrub job
7476 if (pg->scrubber.local_reserved && !pg->scrubber.active) {
7477 pg->unlock();
7478 dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
7479 break;
7480 }
7481 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7482 << (pg->get_must_scrub() ? ", explicitly requested" :
7483 (load_is_low ? ", load_is_low" : " deadline < now"))
7484 << dendl;
// sched_scrub() returning true means the scrub was launched; only one
// scrub is started per pass
7485 if (pg->sched_scrub()) {
7486 pg->unlock();
7487 break;
7488 }
7489 pg->unlock();
7490 } while (service.next_scrub_stamp(scrub, &scrub));
7491 }
7492 dout(20) << "sched_scrub done" << dendl;
7493 }
7494
// Recompute the scheduled time of every queued scrub job (e.g. after a
// scrub-interval config change).  Jobs pinned by must_scrub/need_auto
// are left alone; on_info_history_change() re-registers the others.
7495 void OSD::resched_all_scrubs()
7496 {
7497 dout(10) << __func__ << ": start" << dendl;
7498 OSDService::ScrubJob scrub;
7499 if (service.first_scrub_stamp(&scrub)) {
7500 do {
7501 dout(20) << __func__ << ": examine " << scrub.pgid << dendl;
7502 
// _lookup_lock_pg returns the PG locked (or null if it went away)
7503 PGRef pg = _lookup_lock_pg(scrub.pgid);
7504 if (!pg)
7505 continue;
7506 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
7507 dout(20) << __func__ << ": reschedule " << scrub.pgid << dendl;
7508 pg->on_info_history_change();
7509 }
7510 pg->unlock();
7511 } while (service.next_scrub_stamp(scrub, &scrub));
7512 }
7513 dout(10) << __func__ << ": done" << dendl;
7514 }
7515
7516 MPGStats* OSD::collect_pg_stats()
7517 {
7518 // This implementation unconditionally sends every is_primary PG's
7519 // stats every time we're called. This has equivalent cost to the
7520 // previous implementation's worst case where all PGs are busy and
7521 // their stats are always enqueued for sending.
7522 std::shared_lock l{map_lock};
7523
7524 osd_stat_t cur_stat = service.get_osd_stat();
7525 cur_stat.os_perf_stat = store->get_cur_stats();
7526
7527 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
7528 m->osd_stat = cur_stat;
7529
7530 std::lock_guard lec{min_last_epoch_clean_lock};
7531 min_last_epoch_clean = get_osdmap_epoch();
7532 min_last_epoch_clean_pgs.clear();
7533
7534 std::set<int64_t> pool_set;
7535 vector<PGRef> pgs;
7536 _get_pgs(&pgs);
7537 for (auto& pg : pgs) {
7538 auto pool = pg->pg_id.pgid.pool();
7539 pool_set.emplace((int64_t)pool);
7540 if (!pg->is_primary()) {
7541 continue;
7542 }
7543 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7544 m->pg_stat[pg->pg_id.pgid] = s;
7545 min_last_epoch_clean = min(min_last_epoch_clean, lec);
7546 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7547 });
7548 }
7549 store_statfs_t st;
7550 bool per_pool_stats = false;
7551 bool per_pool_omap_stats = false;
7552 for (auto p : pool_set) {
7553 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
7554 if (r == -ENOTSUP) {
7555 break;
7556 } else {
7557 assert(r >= 0);
7558 m->pool_stat[p] = st;
7559 per_pool_stats = true;
7560 }
7561 }
7562
7563 // indicate whether we are reporting per-pool stats
7564 m->osd_stat.num_osds = 1;
7565 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7566 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
7567
7568 return m;
7569 }
7570
// Collect health metrics reported to the mgr: the count (and age) of
// slow ops older than osd_op_complaint_time, and the number of pending
// PG creations for which we would be primary.
7571 vector<DaemonHealthMetric> OSD::get_health_metrics()
7572 {
7573 vector<DaemonHealthMetric> metrics;
7574 {
7575 utime_t oldest_secs;
7576 const utime_t now = ceph_clock_now();
7577 auto too_old = now;
7578 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7579 int slow = 0;
7580 TrackedOpRef oldest_op;
// visitor: count in-flight ops initiated before the complaint cutoff,
// remembering the oldest; also warns to the cluster log per slow op
7581 auto count_slow_ops = [&](TrackedOp& op) {
7582 if (op.get_initiated() < too_old) {
7583 stringstream ss;
7584 ss << "slow request " << op.get_desc()
7585 << " initiated "
7586 << op.get_initiated()
7587 << " currently "
7588 << op.state_string();
7589 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7590 clog->warn() << ss.str();
7591 slow++;
7592 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7593 oldest_op = &op;
7594 }
7595 return true;
7596 } else {
7597 return false;
7598 }
7599 };
7600 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7601 if (slow) {
7602 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7603 << oldest_op->get_desc() << dendl;
7604 }
7605 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7606 } else {
7607 // no news is not good news.
7608 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7609 }
7610 }
7611 {
7612 std::lock_guard l(pending_creates_lock);
// primaries = creates requested by the mon plus osd-requested creates
// flagged (create.second) as primary here
7613 auto n_primaries = pending_creates_from_mon;
7614 for (const auto& create : pending_creates_from_osd) {
7615 if (create.second) {
7616 n_primaries++;
7617 }
7618 }
7619 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
7620 }
7621 return metrics;
7622 }
7623
7624 // =====================================================
7625 // MAP
7626
// Park an op until a newer OSDMap arrives.  Subscribes for the next
// epoch only when the wait queue is empty (a non-empty queue implies a
// subscription is already outstanding).
7627 void OSD::wait_for_new_map(OpRequestRef op)
7628 {
7629 // ask?
7630 if (waiting_for_osdmap.empty()) {
7631 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7632 }
7633 
7634 logger->inc(l_osd_waiting_for_map);
7635 waiting_for_osdmap.push_back(op);
7636 op->mark_delayed("wait for new map");
7637 }
7638
7639
7640 /** update_map
7641 * assimilate new OSDMap(s). scan pgs, etc.
7642 */
7643
// React to a peer OSD going down in the map: sever cluster connections
// to it and drop any failure-report and heartbeat state we held for it.
// Caller must hold osd_lock; heartbeat_lock is taken here.
7644 void OSD::note_down_osd(int peer)
7645 {
7646 ceph_assert(ceph_mutex_is_locked(osd_lock));
7647 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7648 
7649 std::lock_guard l{heartbeat_lock};
7650 failure_queue.erase(peer);
7651 failure_pending.erase(peer);
7652 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7653 if (p != heartbeat_peers.end()) {
// tear down the heartbeat connections before forgetting the peer
7654 p->second.clear_mark_down();
7655 heartbeat_peers.erase(p);
7656 }
7657 }
7658
// React to a peer OSD coming up: just flag the heartbeat peer set for
// recalculation (the peer argument is unused; the full set is rebuilt).
7659 void OSD::note_up_osd(int peer)
7660 {
7661 heartbeat_set_peers_need_update();
7662 }
7663
// Completion context queued with the objectstore transaction that
// persists osdmaps [first, last]; on commit it advances the OSD to the
// new maps and releases the MOSDMap message (whose ref was transferred
// to this context by handle_osd_map).
7664 struct C_OnMapCommit : public Context {
7665 OSD *osd;
7666 epoch_t first, last;
7667 MOSDMap *msg;
7668 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7669 : osd(o), first(f), last(l), msg(m) {}
7670 void finish(int r) override {
7671 osd->_committed_osd_maps(first, last, msg);
7672 msg->put();
7673 }
7674 };
7675
// Ask the monitor for osdmaps starting at `epoch` (one-shot
// subscription).  Duplicate requests for epochs we already asked for are
// suppressed unless force_request is set.
7676 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7677 {
7678 std::lock_guard l(osdmap_subscribe_lock);
7679 if (latest_subscribed_epoch >= epoch && !force_request)
7680 return;
7681 
7682 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7683 
// renew only if the want-list actually changed (or we're forcing)
7684 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7685 force_request) {
7686 monc->renew_subs();
7687 }
7688 }
7689
// Delete stored osdmaps older than `oldest` (bounded by what the map
// cache still pins), batching the removals into transactions of at most
// osd_target_transaction_size deletes.
//   nreceived: number of maps in the incoming MOSDMap, used to pace
//     trimming relative to ingest
//   skip_maps: true when the incoming message skipped epochs; in that
//     case keep trimming past the batch limit to close the gap
7690 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7691 {
7692 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7693 if (min <= superblock.oldest_map)
7694 return;
7695 
7696 int num = 0;
7697 ObjectStore::Transaction t;
7698 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7699 dout(20) << " removing old osdmap epoch " << e << dendl;
7700 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7701 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7702 superblock.oldest_map = e + 1;
7703 num++;
// flush a batch once it is both big enough and has kept pace with
// the number of maps just received
7704 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7705 service.publish_superblock(superblock);
7706 write_superblock(t);
7707 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7708 ceph_assert(tr == 0);
7709 num = 0;
7710 if (!skip_maps) {
7711 // skip_maps leaves us with a range of old maps if we fail to remove all
7712 // of them before moving superblock.oldest_map forward to the first map
7713 // in the incoming MOSDMap msg. so we should continue removing them in
7714 // this case, even we could do huge series of delete transactions all at
7715 // once.
7716 break;
7717 }
7718 }
7719 }
// flush any remainder that didn't fill a full batch
7720 if (num > 0) {
7721 service.publish_superblock(superblock);
7722 write_superblock(t);
7723 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7724 ceph_assert(tr == 0);
7725 }
7726 // we should not remove the cached maps
7727 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7728 }
7729
// Ingest new OSDMaps from an MOSDMap message: validate the sender,
// decode full and incremental maps, persist them (and derived metadata
// such as pg_num history and purged snaps) in one transaction, and queue
// a C_OnMapCommit to activate the maps once the transaction commits.
// Caller holds osd_lock; the message ref is consumed here (either put()
// on an early return or handed to C_OnMapCommit).
7730 void OSD::handle_osd_map(MOSDMap *m)
7731 {
7732 // wait for pgs to catch up
7733 {
7734 // we extend the map cache pins to accomodate pgs slow to consume maps
7735 // for some period, until we hit the max_lag_factor bound, at which point
7736 // we block here to stop injesting more maps than they are able to keep
7737 // up with.
7738 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7739 m_osd_pg_epoch_max_lag_factor;
7740 ceph_assert(max_lag > 0);
7741 epoch_t osd_min = 0;
// osd_min = minimum epoch any shard's PGs have consumed so far
7742 for (auto shard : shards) {
7743 epoch_t min = shard->get_min_pg_epoch();
7744 if (osd_min == 0 || min < osd_min) {
7745 osd_min = min;
7746 }
7747 }
7748 epoch_t osdmap_epoch = get_osdmap_epoch();
7749 if (osd_min > 0 &&
7750 osdmap_epoch > max_lag &&
7751 osdmap_epoch - max_lag > osd_min) {
7752 epoch_t need = osdmap_epoch - max_lag;
7753 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7754 << " max_lag " << max_lag << ")" << dendl;
7755 for (auto shard : shards) {
7756 epoch_t min = shard->get_min_pg_epoch();
7757 if (need > min) {
7758 dout(10) << __func__ << " waiting for pgs to consume " << need
7759 << " (shard " << shard->shard_id << " min " << min
7760 << ", map cache is " << cct->_conf->osd_map_cache_size
7761 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7762 << ")" << dendl;
// osd_lock is dropped while blocking so PGs can make progress
7763 unlock_guard unlock{osd_lock};
7764 shard->wait_min_pg_epoch(need);
7765 }
7766 }
7767 }
7768 }
7769 
7770 ceph_assert(ceph_mutex_is_locked(osd_lock));
7771 map<epoch_t,OSDMapRef> added_maps;
7772 map<epoch_t,bufferlist> added_maps_bl;
7773 if (m->fsid != monc->get_fsid()) {
7774 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7775 << monc->get_fsid() << dendl;
7776 m->put();
7777 return;
7778 }
7779 if (is_initializing()) {
7780 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7781 m->put();
7782 return;
7783 }
7784 
// only mons and osds are trusted sources of maps
7785 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7786 if (session && !(session->entity_name.is_mon() ||
7787 session->entity_name.is_osd())) {
7788 //not enough perms!
7789 dout(10) << "got osd map from Session " << session
7790 << " which we can't take maps from (not a mon or osd)" << dendl;
7791 m->put();
7792 return;
7793 }
7794 
7795 // share with the objecter
7796 if (!is_preboot())
7797 service.objecter->handle_osd_map(m);
7798 
7799 epoch_t first = m->get_first();
7800 epoch_t last = m->get_last();
7801 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7802 << superblock.newest_map
7803 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7804 << dendl;
7805 
7806 logger->inc(l_osd_map);
7807 logger->inc(l_osd_mape, last - first + 1);
7808 if (first <= superblock.newest_map)
7809 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7810 if (service.max_oldest_map < m->oldest_map) {
7811 service.max_oldest_map = m->oldest_map;
7812 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
7813 }
7814 
7815 // make sure there is something new, here, before we bother flushing
7816 // the queues and such
7817 if (last <= superblock.newest_map) {
7818 dout(10) << " no new maps here, dropping" << dendl;
7819 m->put();
7820 return;
7821 }
7822 
7823 // missing some?
7824 bool skip_maps = false;
7825 if (first > superblock.newest_map + 1) {
7826 dout(10) << "handle_osd_map message skips epochs "
7827 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
// if the mon still has the missing range, re-request it instead
7828 if (m->oldest_map <= superblock.newest_map + 1) {
7829 osdmap_subscribe(superblock.newest_map + 1, false);
7830 m->put();
7831 return;
7832 }
7833 // always try to get the full range of maps--as many as we can. this
7834 // 1- is good to have
7835 // 2- is at present the only way to ensure that we get a *full* map as
7836 // the first map!
7837 if (m->oldest_map < first) {
7838 osdmap_subscribe(m->oldest_map - 1, true);
7839 m->put();
7840 return;
7841 }
7842 skip_maps = true;
7843 }
7844 
7845 ObjectStore::Transaction t;
7846 uint64_t txn_size = 0;
7847 
7848 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
7849 
7850 // store new maps: queue for disk and put in the osdmap cache
7851 epoch_t start = std::max(superblock.newest_map + 1, first);
7852 for (epoch_t e = start; e <= last; e++) {
// sanity check that the transaction keeps growing monotonically
7853 if (txn_size >= t.get_num_bytes()) {
7854 derr << __func__ << " transaction size overflowed" << dendl;
7855 ceph_assert(txn_size < t.get_num_bytes());
7856 }
7857 txn_size = t.get_num_bytes();
7858 map<epoch_t,bufferlist>::iterator p;
7859 p = m->maps.find(e);
7860 if (p != m->maps.end()) {
// case 1: message carries a full map for this epoch
7861 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7862 OSDMap *o = new OSDMap;
7863 bufferlist& bl = p->second;
7864 
7865 o->decode(bl);
7866 
7867 purged_snaps[e] = o->get_new_purged_snaps();
7868 
7869 ghobject_t fulloid = get_osdmap_pobject_name(e);
7870 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7871 added_maps[e] = add_map(o);
7872 added_maps_bl[e] = bl;
7873 got_full_map(e);
7874 continue;
7875 }
7876 
7877 p = m->incremental_maps.find(e);
7878 if (p != m->incremental_maps.end()) {
// case 2: incremental map -- apply it to the previous epoch's full map
// (from disk or from the maps added earlier in this loop)
7879 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7880 bufferlist& bl = p->second;
7881 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7882 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7883 
7884 OSDMap *o = new OSDMap;
7885 if (e > 1) {
7886 bufferlist obl;
7887 bool got = get_map_bl(e - 1, obl);
7888 if (!got) {
7889 auto p = added_maps_bl.find(e - 1);
7890 ceph_assert(p != added_maps_bl.end());
7891 obl = p->second;
7892 }
7893 o->decode(obl);
7894 }
7895 
7896 OSDMap::Incremental inc;
7897 auto p = bl.cbegin();
7898 inc.decode(p);
7899 
7900 if (o->apply_incremental(inc) < 0) {
7901 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
7902 ceph_abort_msg("bad fsid");
7903 }
7904 
7905 bufferlist fbl;
7906 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7907 
// optionally inject a crc failure to exercise the recovery path below
7908 bool injected_failure = false;
7909 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7910 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7911 derr << __func__ << " injecting map crc failure" << dendl;
7912 injected_failure = true;
7913 }
7914 
// crc mismatch: our re-encoded full map differs from the mon's; ask
// for the full maps from e onward and stop ingesting at e-1
7915 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7916 dout(2) << "got incremental " << e
7917 << " but failed to encode full with correct crc; requesting"
7918 << dendl;
7919 clog->warn() << "failed to encode map e" << e << " with expected crc";
7920 dout(20) << "my encoded map was:\n";
7921 fbl.hexdump(*_dout);
7922 *_dout << dendl;
7923 delete o;
7924 request_full_map(e, last);
7925 last = e - 1;
7926 break;
7927 }
7928 got_full_map(e);
7929 purged_snaps[e] = o->get_new_purged_snaps();
7930 
7931 ghobject_t fulloid = get_osdmap_pobject_name(e);
7932 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7933 added_maps[e] = add_map(o);
7934 added_maps_bl[e] = fbl;
7935 continue;
7936 }
7937 
7938 ceph_abort_msg("MOSDMap lied about what maps it had?");
7939 }
7940 
7941 // even if this map isn't from a mon, we may have satisfied our subscription
7942 monc->sub_got("osdmap", last);
7943 
// if a previous full-map request is still outstanding, re-issue it
7944 if (!m->maps.empty() && requested_full_first) {
7945 dout(10) << __func__ << " still missing full maps " << requested_full_first
7946 << ".." << requested_full_last << dendl;
7947 rerequest_full_maps();
7948 }
7949 
7950 if (superblock.oldest_map) {
7951 // make sure we at least keep pace with incoming maps
7952 trim_maps(m->oldest_map, last - first + 1, skip_maps);
7953 pg_num_history.prune(superblock.oldest_map);
7954 }
7955 
7956 if (!superblock.oldest_map || skip_maps)
7957 superblock.oldest_map = first;
7958 superblock.newest_map = last;
7959 superblock.current_epoch = last;
7960 
7961 // note in the superblock that we were clean thru the prior epoch
7962 epoch_t boot_epoch = service.get_boot_epoch();
7963 if (boot_epoch && boot_epoch >= superblock.mounted) {
7964 superblock.mounted = boot_epoch;
7965 superblock.clean_thru = last;
7966 }
7967 
7968 // check for pg_num changes and deleted pools
7969 OSDMapRef lastmap;
7970 for (auto& i : added_maps) {
7971 if (!lastmap) {
7972 if (!(lastmap = service.try_get_map(i.first - 1))) {
7973 dout(10) << __func__ << " can't get previous map " << i.first - 1
7974 << " probably first start of this osd" << dendl;
7975 continue;
7976 }
7977 }
7978 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
// pools present in the previous map: detect deletion or pg_num change
7979 for (auto& j : lastmap->get_pools()) {
7980 if (!i.second->have_pg_pool(j.first)) {
7981 pg_num_history.log_pool_delete(i.first, j.first);
7982 dout(10) << __func__ << " recording final pg_pool_t for pool "
7983 << j.first << dendl;
7984 // this information is needed by _make_pg() if have to restart before
7985 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
7986 ghobject_t obj = make_final_pool_info_oid(j.first);
7987 bufferlist bl;
7988 encode(j.second, bl, CEPH_FEATURES_ALL);
7989 string name = lastmap->get_pool_name(j.first);
7990 encode(name, bl);
7991 map<string,string> profile;
7992 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
7993 profile = lastmap->get_erasure_code_profile(
7994 lastmap->get_pg_pool(j.first)->erasure_code_profile);
7995 }
7996 encode(profile, bl);
7997 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
7998 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
7999 new_pg_num != j.second.get_pg_num()) {
8000 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8001 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8002 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8003 }
8004 }
// pools that appear only in the new map: record their initial pg_num
8005 for (auto& j : i.second->get_pools()) {
8006 if (!lastmap->have_pg_pool(j.first)) {
8007 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8008 << j.second.get_pg_num() << dendl;
8009 pg_num_history.log_pg_num_change(i.first, j.first,
8010 j.second.get_pg_num());
8011 }
8012 }
8013 lastmap = i.second;
8014 }
8015 pg_num_history.epoch = last;
8016 {
8017 bufferlist bl;
8018 ::encode(pg_num_history, bl);
8019 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8020 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8021 }
8022 
8023 // record new purged_snaps
// only if the record is contiguous with what we already persisted
8024 if (superblock.purged_snaps_last == start - 1) {
8025 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
8026 make_purged_snaps_oid(), &t,
8027 purged_snaps);
8028 superblock.purged_snaps_last = last;
8029 } else {
8030 dout(10) << __func__ << " superblock purged_snaps_last is "
8031 << superblock.purged_snaps_last
8032 << ", not recording new purged_snaps" << dendl;
8033 }
8034 
8035 // superblock and commit
8036 write_superblock(t);
// C_OnMapCommit takes over the message ref; _committed_osd_maps runs on commit
8037 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
8038 store->queue_transaction(
8039 service.meta_ch,
8040 std::move(t));
8041 service.publish_superblock(superblock);
8042 }
8043
// Completion callback for the transaction that persisted incoming osdmaps
// [first..last] (queued via C_OnMapCommit in handle_osd_map).  Under osd_lock
// and map_lock this advances the published OSDMap one epoch at a time,
// notes peers going up/down, and checks whether the cluster's view of this
// OSD (existence, up/down state, recorded addresses) still matches reality.
// If not, it either rebinds and re-boots (do_restart) or shuts the daemon
// down (do_shutdown).
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock: shutdown may have raced with the commit callback
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.lock();

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  OSDMapRef osdmap;

  // advance through the new maps, one epoch at a time
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
	     << " (<= last " << last
	     << " <= newest_map " << superblock.newest_map
	     << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap = get_osdmap();
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
	  osdmap->is_up(*p) && // in old map
	  newmap->is_down(*p)) {    // but not the new one
	// wait (once) for any in-flight map reservations before tearing
	// down state for a newly-down peer
	if (!waited_for_reservations) {
	  service.await_reserved_maps();
	  waited_for_reservations = true;
	}
	note_down_osd(*p);
      } else if (*p != whoami &&
		 osdmap->is_down(*p) &&
		 newmap->is_up(*p)) {
	note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
	       << dendl;
      if (is_booting()) {
	// this captures the case where we sent the boot message while
	// NOUP was being set on the mon and our boot request was
	// dropped, and then later it is cleared.  it imperfectly
	// handles the case where our original boot message was not
	// dropped and we restart even though we might have booted, but
	// that is harmless (boot will just take slightly longer).
	do_restart = true;
      }
    }

    // publish the new map as our current map
    osdmap = std::move(newmap);
    set_osdmap(osdmap);
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    // first epoch in which this map marks us up at our current address:
    // record up_epoch (and boot_epoch if not yet set)
    if (!up_epoch &&
	osdmap->is_up(whoami) &&
	osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
	boot_epoch = osdmap->get_epoch();
	dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  // if the final map says we are up at our current address, and that
  // happened after we last (re)bound, the boot handshake completed
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
	client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      derr << "map says i do not exist. shutting down." << dendl;
      do_shutdown = true;   // don't call shutdown() while we have
			    // everything paused
    } else if (osdmap->is_stop(whoami)) {
      derr << "map says i am stopped by admin. shutting down." << dendl;
      do_shutdown = true;
    } else if (!osdmap->is_up(whoami) ||
	       !osdmap->get_addrs(whoami).legacy_equals(
		 client_messenger->get_myaddrs()) ||
	       !osdmap->get_cluster_addrs(whoami).legacy_equals(
		 cluster_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_back_addrs(whoami).legacy_equals(
		 hb_back_server_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_front_addrs(whoami).legacy_equals(
		 hb_front_server_messenger->get_myaddrs())) {
      // we are active but the map disagrees with reality: we were marked
      // down, or one of our recorded addresses is stale.  log which one.
      if (!osdmap->is_up(whoami)) {
	if (service.is_preparing_to_stop() || service.is_stopping()) {
	  // being marked down is the expected ack of our stop request
	  service.got_stop_ack();
	} else {
	  clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
			  "but it is still running";
	  clog->debug() << "map e" << osdmap->get_epoch()
			<< " wrongly marked me down at e"
			<< osdmap->get_down_at(whoami);
	}
	if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
	  // note that this is best-effort...
	  monc->send_mon_message(
	    new MOSDMarkMeDead(
	      monc->get_fsid(),
	      whoami,
	      osdmap->get_epoch()));
	}
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
		   client_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong client addr (" << osdmap->get_addrs(whoami)
		      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
		   cluster_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong cluster addr ("
		      << osdmap->get_cluster_addrs(whoami)
		      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
		   hb_back_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat back addr ("
		      << osdmap->get_hb_back_addrs(whoami)
		      << " != my " << hb_back_server_messenger->get_myaddrs()
		      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
		   hb_front_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat front addr ("
		      << osdmap->get_hb_front_addrs(whoami)
		      << " != my " << hb_front_server_messenger->get_myaddrs()
		      << ")";
      }

      if (!service.is_stopping()) {
	// prepare to rebind and re-boot
	epoch_t up_epoch = 0;
	epoch_t bind_epoch = osdmap->get_epoch();
	service.set_epochs(NULL,&up_epoch, &bind_epoch);
	do_restart = true;

	// add markdown log; too many markdowns within the grace period
	// means something is persistently wrong -- shut down instead of
	// flapping forever
	utime_t now = ceph_clock_now();
	utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
	osd_markdown_log.push_back(now);
	if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
	  derr << __func__ << " marked down "
	       << osd_markdown_log.size()
	       << " > osd_max_markdown_count "
	       << cct->_conf->osd_max_markdown_count
	       << " in last " << grace << " seconds, shutting down"
	       << dendl;
	  do_restart = false;
	  do_shutdown = true;
	}

	start_waiting_for_healthy();

	set<int> avoid_ports;
#if defined(__FreeBSD__)
	// prevent FreeBSD from grabbing the client_messenger port during
	// rebinding. In which case a cluster_messenger will connect also
	// to the same port
	client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
	cluster_messenger->get_myaddrs().get_ports(&avoid_ports);

	int r = cluster_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  derr << __func__ << " marked down:"
	       << " rebind cluster_messenger failed" << dendl;
	}

	hb_back_server_messenger->mark_down_all();
	hb_front_server_messenger->mark_down_all();
	hb_front_client_messenger->mark_down_all();
	hb_back_client_messenger->mark_down_all();

	reset_heartbeat_peers(true);
      }
    }
  }

  map_lock.unlock();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    // the sender knows of newer maps than we just committed; ask for them
    dout(10) << " msg say newest map is " << m->newest_map
	     << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8297
// Align messenger feature requirements (and on-disk compat flags) with what
// the current OSDMap/crush map demands for clients, mons, and peer OSDs.
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures at their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  const auto osdmap = get_osdmap();
  {
    // client connections
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for clients" << dendl;
      // replace only the masked bits, preserving requirements outside mask
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    // monitor connections
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << " was " << p.features_required
	      << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    // peer OSD (cluster) connections
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // persist the SHARDS incompat flag in the superblock if not yet present
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // heartbeat authorizers are only required from nautilus on
  if (osdmap->require_osd_release < ceph_release_t::nautilus) {
    hb_front_server_messenger->set_require_authorizer(false);
    hb_back_server_messenger->set_require_authorizer(false);
  } else {
    hb_front_server_messenger->set_require_authorizer(true);
    hb_back_server_messenger->set_require_authorizer(true);
  }

  // persist require_osd_release changes in the store's metadata
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
	    << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
		      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8371
8372 struct C_FinishSplits : public Context {
8373 OSD *osd;
8374 set<PGRef> pgs;
8375 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8376 : osd(osd), pgs(in) {}
8377 void finish(int r) override {
8378 osd->_finish_splits(pgs);
8379 }
8380 };
8381
8382 void OSD::_finish_splits(set<PGRef>& pgs)
8383 {
8384 dout(10) << __func__ << " " << pgs << dendl;
8385 if (is_stopping())
8386 return;
8387 for (set<PGRef>::iterator i = pgs.begin();
8388 i != pgs.end();
8389 ++i) {
8390 PG *pg = i->get();
8391
8392 PeeringCtx rctx = create_context();
8393 pg->lock();
8394 dout(10) << __func__ << " " << *pg << dendl;
8395 epoch_t e = pg->get_osdmap_epoch();
8396 pg->handle_initialize(rctx);
8397 pg->queue_null(e, e);
8398 dispatch_context(rctx, pg, service.get_osdmap());
8399 pg->unlock();
8400
8401 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8402 shards[shard_index]->register_and_wake_split_child(pg);
8403 }
8404 };
8405
8406 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8407 unsigned need)
8408 {
8409 std::lock_guard l(merge_lock);
8410 auto& p = merge_waiters[nextmap->get_epoch()][target];
8411 p[src->pg_id] = src;
8412 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8413 << " for " << target << ", have " << p.size() << "/" << need
8414 << dendl;
8415 return p.size() == need;
8416 }
8417
// Advance a (locked) PG's map one epoch at a time up to osd_epoch, handling
// any pool pg_num changes along the way: merge sources park themselves as
// merge waiters, merge targets absorb their sources (or wait for them), and
// splits spawn child PGs collected in new_pgs.
//
// Returns true if the PG was fully advanced (caller keeps the PG lock);
// returns false if the PG was consumed by a merge or must wait (in which
// case the PG has been unlocked here).
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  // pg_num of the pool in the PG's current map (0 if pool is gone)
  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // map not cached; skip this epoch (the PG will be advanced again later)
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source: flush our state, detach from the shard,
	  // and register as a waiter on the merge target.
	  PGRef spg = pg; // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  if (!new_pgs.empty()) {
	    rctx.transaction.register_on_applied(new C_FinishSplits(this,
								    new_pgs));
	    new_pgs.clear();
	  }
	  dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	  pg->ch->flush();
	  // release backoffs explicitly, since the on_shutdown path
	  // aggressively tears down backoff state.
	  if (pg->is_primary()) {
	    pg->release_pg_backoffs();
	  }
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_nonprimary())
		logger->dec(l_osd_pg_replica); // misnomer
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  // if we are the last expected source, kick the target to merge
	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    // claim our sources if they have all arrived
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    // all sources present: absorb them now
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    // sources not all ready: flush, unlock, and nudge the sources
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    if (!new_pgs.empty()) {
	      rctx.transaction.register_on_applied(new C_FinishSplits(this,
								      new_pgs));
	      new_pgs.clear();
	    }
	    dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    // normal per-epoch advance: recompute mapping and feed it to the PG
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
        && newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval changed from set to unset or vice versa the
      // actual config is different.  Keep it simple even if it is possible
      // to call resched_all_scrub() unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  // hand any collected split children off once the transaction applies
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8624
// Publish the current OSDMap to the service and all shards: prime pending
// splits/merges, prune stale state, discard pg-creates that no longer map
// here, and queue null peering events so every PG catches up to this epoch.
// Caller must hold osd_lock.
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   *  speak the older sorting version any more. Be careful not to force
   *  a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      shard->prime_splits(osdmap, &newly_split);
    }
    // prime_splits consumes the entries it handles; all must be claimed
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // drop pending pg-creates that no longer map to this OSD
    // FIXME (as part of seastar rewrite): move to OSDShard
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8742
8743 void OSD::activate_map()
8744 {
8745 ceph_assert(ceph_mutex_is_locked(osd_lock));
8746 auto osdmap = get_osdmap();
8747
8748 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8749
8750 // norecover?
8751 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8752 if (!service.recovery_is_paused()) {
8753 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8754 service.pause_recovery();
8755 }
8756 } else {
8757 if (service.recovery_is_paused()) {
8758 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8759 service.unpause_recovery();
8760 }
8761 }
8762
8763 service.activate_map();
8764
8765 // process waiters
8766 take_waiters(waiting_for_osdmap);
8767 }
8768
8769 bool OSD::require_mon_peer(const Message *m)
8770 {
8771 if (!m->get_connection()->peer_is_mon()) {
8772 dout(0) << "require_mon_peer received from non-mon "
8773 << m->get_connection()->get_peer_addr()
8774 << " " << *m << dendl;
8775 return false;
8776 }
8777 return true;
8778 }
8779
8780 bool OSD::require_mon_or_mgr_peer(const Message *m)
8781 {
8782 if (!m->get_connection()->peer_is_mon() &&
8783 !m->get_connection()->peer_is_mgr()) {
8784 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8785 << m->get_connection()->get_peer_addr()
8786 << " " << *m << dendl;
8787 return false;
8788 }
8789 return true;
8790 }
8791
8792 bool OSD::require_osd_peer(const Message *m)
8793 {
8794 if (!m->get_connection()->peer_is_osd()) {
8795 dout(0) << "require_osd_peer received from non-osd "
8796 << m->get_connection()->get_peer_addr()
8797 << " " << *m << dendl;
8798 return false;
8799 }
8800 return true;
8801 }
8802
8803 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8804 {
8805 epoch_t up_epoch = service.get_up_epoch();
8806 if (epoch < up_epoch) {
8807 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8808 return false;
8809 }
8810
8811 if (!is_active()) {
8812 dout(7) << "still in boot state, dropping message " << *m << dendl;
8813 return false;
8814 }
8815
8816 return true;
8817 }
8818
// Verify the sending OSD is still the same daemon instance @p map knows
// about.  If the map says it is down, or its cluster addresses no longer
// match the message's source, the peer must have restarted: mark the
// connection down, detach its Session, and return false so the caller
// drops the message.  When is_fast_dispatch is true the session dispatch
// lock is already held by the caller, so it is not re-taken here.
bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
				     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addrs(from) != m->get_source_addrs())) {
    dout(5) << "from dead osd." << from << ", marking down, "
	    << " msg was " << m->get_source_inst().addr
	    << " expected "
	    << (map->is_up(from) ?
		map->get_cluster_addrs(from) : entity_addrvec_t())
	    << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
      if (!is_fast_dispatch)
	s->session_dispatch_lock.lock();
      clear_session_waiting_on_map(s);
      con->set_priv(nullptr);   // break ref <-> session cycle, if any
      s->con.reset();
      if (!is_fast_dispatch)
	s->session_dispatch_lock.unlock();
    }
    return false;
  }
  return true;
}
8847
8848
/*
 * require that we have same (or newer) map, and that
 * the source is the pg primary.
 *
 * If the sender's epoch is newer than ours, the op is parked via
 * wait_for_new_map() and false is returned; the op will be re-dispatched
 * once the newer map arrives.  Also checks our own aliveness and, for
 * cluster-messenger peers, that the sender is still the same OSD instance.
 * Caller must hold osd_lock.
 */
bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
				    bool is_fast_dispatch)
{
  const Message *m = op->get_req();
  const auto osdmap = get_osdmap();
  dout(15) << "require_same_or_newer_map " << epoch
	   << " (i am " << osdmap->get_epoch() << ") " << m << dendl;

  ceph_assert(ceph_mutex_is_locked(osd_lock));

  // do they have a newer map?
  if (epoch > osdmap->get_epoch()) {
    dout(7) << "waiting for newer map epoch " << epoch
	    << " > my " << osdmap->get_epoch() << " with " << m << dendl;
    wait_for_new_map(op);
    return false;
  }

  if (!require_self_aliveness(op->get_req(), epoch)) {
    return false;
  }

  // ok, our map is same or newer.. do they still exist?
  if (m->get_connection()->get_messenger() == cluster_messenger &&
      !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
    return false;
  }

  return true;
}
8883
8884
8885
8886
8887
8888 // ----------------------------------------
8889 // pg creation
8890
// Split @p parent into the given child PGs: for each child, create the PG
// object and its collection, carve out its objects from the parent's
// collection, and transfer its share of the parent's stats.  Children are
// inserted (locked then unlocked here) into @p out_pgs for later
// registration via C_FinishSplits.
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // one stats bucket per child, plus a trailing one for the parent itself
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the child's commit completions through its shard's queue
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    // move the child's objects out of the parent's collection...
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx.transaction);
    // ...and the child's in-memory state out of the parent
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // the final stats bucket belongs to the (shrunken) parent
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
8944
/*
 * holding osd_lock
 *
 * Handle a legacy MOSDPGCreate from the monitor: for each requested pg,
 * verify the pool still exists and we are still the acting primary, build
 * the initial PG history, and queue a peering event carrying a PGCreateInfo
 * so the PG gets instantiated.
 */
void OSD::handle_pg_create(OpRequestRef op)
{
  // NOTE: this can be removed in P release (mimic is the last version to
  // send MOSDPGCreate messages).

  auto m = op->get_req<MOSDPGCreate>();
  ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  const auto osdmap = get_osdmap();
  // mkpg and ctimes are parallel maps keyed by pg_t; walk them in lockstep
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    ceph_assert(mapped);

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
	       << "), my role=" << role << ", skipping" << dendl;
      continue;
    }


    PastIntervals pi;
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so we ignore
    // same_interval_since.  We'll pass this history with the current
    // epoch as the event.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
	       << pgid << " from epoch " << m->epoch
	       << ", primary changed in " << history.same_primary_since
	       << dendl;
      continue;
    }
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt(),
	  true,
	  new PGCreateInfo(
	    pgid,
	    osdmap->get_epoch(),
	    history,
	    pi,
	    true)
	  )));
  }

  {
    std::lock_guard l(pending_creates_lock);
    // only advance last_pg_create_epoch once no mon-originated creates
    // are still pending
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
9043
9044
9045 // ----------------------------------------
9046 // peering and recovery
9047
9048 PeeringCtx OSD::create_context()
9049 {
9050 return PeeringCtx(get_osdmap()->require_osd_release);
9051 }
9052
9053 void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
9054 ThreadPool::TPHandle *handle)
9055 {
9056 if (!service.get_osdmap()->is_up(whoami)) {
9057 dout(20) << __func__ << " not up in osdmap" << dendl;
9058 } else if (!is_active()) {
9059 dout(20) << __func__ << " not active" << dendl;
9060 } else {
9061 for (auto& [osd, ls] : ctx.message_map) {
9062 if (!curmap->is_up(osd)) {
9063 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9064 continue;
9065 }
9066 ConnectionRef con = service.get_con_osd_cluster(
9067 osd, curmap->get_epoch());
9068 if (!con) {
9069 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9070 << dendl;
9071 continue;
9072 }
9073 service.maybe_share_map(con.get(), curmap);
9074 for (auto m : ls) {
9075 con->send_message2(m);
9076 }
9077 ls.clear();
9078 }
9079 }
9080 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
9081 int tr = store->queue_transaction(
9082 pg->ch,
9083 std::move(ctx.transaction), TrackedOpRef(),
9084 handle);
9085 ceph_assert(tr == 0);
9086 }
9087 }
9088
// Handle a modern (MOSDPGCreate2) mon-initiated PG-create message.  Each pg
// is forwarded to the peering machinery as a NullEvt carrying a PGCreateInfo;
// octopus+ mons also supply history/past_intervals in m->pg_extra.
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  // only the monitor may ask us to create PGs
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    auto q = m->pg_extra.find(pgid);
    if (q == m->pg_extra.end()) {
      dout(20) << __func__ << " " << pgid << " e" << created
               << "@" << created_stamp
               << " (no history or past_intervals)" << dendl;
      // pre-octopus ... no pg history.  this can be removed in Q release.
      enqueue_peering_evt(
        pgid,
        PGPeeringEventRef(
          std::make_shared<PGPeeringEvent>(
            m->epoch,
            m->epoch,
            NullEvt(),
            true,
            new PGCreateInfo(
              pgid,
              created,
              pg_history_t(created, created_stamp),
              PastIntervals(),
              true)
            )));
    } else {
      dout(20) << __func__ << " " << pgid << " e" << created
               << "@" << created_stamp
               << " history " << q->second.first
               << " pi " << q->second.second << dendl;
      // sanity: the supplied past_intervals must not extend beyond the
      // message epoch; if they do, log an error rather than create the pg
      if (!q->second.second.empty() &&
          m->epoch < q->second.second.get_bounds().second) {
        clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
                      << " and unmatched past_intervals " << q->second.second
                      << " (history " << q->second.first << ")";
      } else {
        enqueue_peering_evt(
          pgid,
          PGPeeringEventRef(
            std::make_shared<PGPeeringEvent>(
              m->epoch,
              m->epoch,
              NullEvt(),
              true,
              new PGCreateInfo(
                pgid,
                m->epoch,
                q->second.first,
                q->second.second,
                true)
              )));
      }
    }
  }

  {
    std::lock_guard l(pending_creates_lock);
    // only advance last_pg_create_epoch once no mon-driven creates remain
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9160
// Handle a batched PG query from a peer OSD: wrap each per-pg query in an
// MQuery peering event and queue it to the owning shard.
void OSD::handle_fast_pg_query(MOSDPGQuery *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  // queries only come from other OSDs
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->pg_list) {
    enqueue_peering_evt(
      p.first,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          p.second.epoch_sent, p.second.epoch_sent,
          MQuery(
            p.first,
            pg_shard_t(from, p.second.from),
            p.second,
            p.second.epoch_sent),
          false))  // false: do not create the pg if it doesn't exist
      );
  }
  m->put();
}
9185
// Handle a batched PG notify from a peer OSD.  Each notify becomes an
// MNotifyRec peering event; a PGCreateInfo is attached so a missing pg can
// be instantiated (created=false marks it as peer-driven, not mon-driven).
void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->get_pg_list()) {
    // target shard is taken from the notify's "to" field
    spg_t pgid(p.info.pgid.pgid, p.to);
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          p.epoch_sent,
          p.query_epoch,
          MNotifyRec(
            pgid, pg_shard_t(from, p.from),
            p,
            m->get_connection()->get_features()),
          true,
          new PGCreateInfo(
            pgid,
            p.query_epoch,
            p.info.history,
            p.past_intervals,
            false)
          )));
  }
  m->put();
}
9217
// Handle a batched PG info message from a peer OSD: queue an MInfoRec
// peering event for each listed pg (no create-info; the pg must exist).
void OSD::handle_fast_pg_info(MOSDPGInfo* m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->pg_list) {
    enqueue_peering_evt(
      spg_t(p.info.pgid.pgid, p.to),
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          p.epoch_sent, p.query_epoch,
          MInfoRec(
            pg_shard_t(from, p.from),
            p.info,
            p.epoch_sent)))
      );
  }
  m->put();
}
9240
9241 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9242 {
9243 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9244 if (!require_osd_peer(m)) {
9245 m->put();
9246 return;
9247 }
9248 for (auto& pgid : m->pg_list) {
9249 enqueue_peering_evt(
9250 pgid,
9251 PGPeeringEventRef(
9252 std::make_shared<PGPeeringEvent>(
9253 m->get_epoch(), m->get_epoch(),
9254 PeeringState::DeleteStart())));
9255 }
9256 m->put();
9257 }
9258
// Handle a force-recovery/backfill request from the mon or mgr: translate
// the OFR_* option bits into Set/Unset ForceBackfill/ForceRecovery peering
// events for each listed pg.
void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
{
  dout(10) << __func__ << " " << *m << dendl;
  // only the mon or mgr may force recovery priorities
  if (!require_mon_or_mgr_peer(m)) {
    m->put();
    return;
  }
  epoch_t epoch = get_osdmap_epoch();
  for (auto pgid : m->forced_pgs) {
    if (m->options & OFR_BACKFILL) {
      // OFR_CANCEL clears a previous force, otherwise we set it
      if (m->options & OFR_CANCEL) {
        enqueue_peering_evt(
          pgid,
          PGPeeringEventRef(
            std::make_shared<PGPeeringEvent>(
              epoch, epoch,
              PeeringState::UnsetForceBackfill())));
      } else {
        enqueue_peering_evt(
          pgid,
          PGPeeringEventRef(
            std::make_shared<PGPeeringEvent>(
              epoch, epoch,
              PeeringState::SetForceBackfill())));
      }
    } else if (m->options & OFR_RECOVERY) {
      if (m->options & OFR_CANCEL) {
        enqueue_peering_evt(
          pgid,
          PGPeeringEventRef(
            std::make_shared<PGPeeringEvent>(
              epoch, epoch,
              PeeringState::UnsetForceRecovery())));
      } else {
        enqueue_peering_evt(
          pgid,
          PGPeeringEventRef(
            std::make_shared<PGPeeringEvent>(
              epoch, epoch,
              PeeringState::SetForceRecovery())));
      }
    }
  }
  m->put();
}
9304
// Answer a peer's pg query when we do not have the pg at all: reply with an
// empty log (for LOG/FULLLOG queries) or an empty notify, so the querying
// primary learns the pg does not exist here.
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  // pool gone: nothing useful to reply
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;

  dout(10) << " pg " << pgid << " dne" << dendl;
  // an info with no log/stats, addressed to the shard the query targeted
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
        q.query.type == pg_query_t::FULLLOG) {
      m = new MOSDPGLog(
        q.query.from, q.query.to,
        osdmap->get_epoch(), empty,
        q.query.epoch_sent);
    } else {
      vector<pg_notify_t> ls;
      ls.push_back(
        pg_notify_t(
          q.query.from, q.query.to,
          q.query.epoch_sent,
          osdmap->get_epoch(),
          empty,
          PastIntervals()));
      m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
    }
    service.maybe_share_map(con.get(), osdmap);
    con->send_message(m);
  }
}
9340
9341 void OSDService::queue_check_readable(spg_t spgid,
9342 epoch_t lpr,
9343 ceph::signedspan delay)
9344 {
9345 if (delay == ceph::signedspan::zero()) {
9346 osd->enqueue_peering_evt(
9347 spgid,
9348 PGPeeringEventRef(
9349 std::make_shared<PGPeeringEvent>(
9350 lpr, lpr,
9351 PeeringState::CheckReadable())));
9352 } else {
9353 mono_timer.add_event(
9354 delay,
9355 [this, spgid, lpr]() {
9356 queue_check_readable(spgid, lpr);
9357 });
9358 }
9359 }
9360
9361
9362 // =========================================================
9363 // RECOVERY
9364
// Drain the awaiting_throttle queue while the recovery throttle permits,
// reserving up to osd_recovery_max_single_start pushes per pg.
// Caller must hold recovery_lock.
void OSDService::_maybe_queue_recovery() {
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
         _recover_now(&available_pushes)) {
    // cap each pg's grant by the per-start config limit
    uint64_t to_start = std::min(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    dout(10) << __func__ << " starting " << to_start
             << ", recovery_ops_reserved " << recovery_ops_reserved
             << " -> " << (recovery_ops_reserved + to_start) << dendl;
    // account the reservation; released later via release_reserved_pushes()
    recovery_ops_reserved += to_start;
  }
}
9381
9382 bool OSDService::_recover_now(uint64_t *available_pushes)
9383 {
9384 if (available_pushes)
9385 *available_pushes = 0;
9386
9387 if (ceph_clock_now() < defer_recovery_until) {
9388 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9389 return false;
9390 }
9391
9392 if (recovery_paused) {
9393 dout(15) << __func__ << " paused" << dendl;
9394 return false;
9395 }
9396
9397 uint64_t max = osd->get_recovery_max_active();
9398 if (max <= recovery_ops_active + recovery_ops_reserved) {
9399 dout(15) << __func__ << " active " << recovery_ops_active
9400 << " + reserved " << recovery_ops_reserved
9401 << " >= max " << max << dendl;
9402 return false;
9403 }
9404
9405 if (available_pushes)
9406 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9407
9408 return true;
9409 }
9410
9411 unsigned OSDService::get_target_pg_log_entries() const
9412 {
9413 auto num_pgs = osd->get_num_pgs();
9414 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9415 if (num_pgs > 0 && target > 0) {
9416 // target an even spread of our budgeted log entries across all
9417 // PGs. note that while we only get to control the entry count
9418 // for primary PGs, we'll normally be responsible for a mix of
9419 // primary and replica PGs (for the same pool(s) even), so this
9420 // will work out.
9421 return std::max<unsigned>(
9422 std::min<unsigned>(target / num_pgs,
9423 cct->_conf->osd_max_pg_log_entries),
9424 cct->_conf->osd_min_pg_log_entries);
9425 } else {
9426 // fall back to a per-pg value.
9427 return cct->_conf->osd_min_pg_log_entries;
9428 }
9429 }
9430
// Run one batch of recovery ops for pg, honoring the configured recovery
// sleep by re-queueing via a timer instead of blocking the worker thread.
// Always releases the reserved pushes on exit (including the sleep path,
// which releases after requeue runs).
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      // pin the pg so it survives until the timer fires
      PGRef pgref(pg);
      auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
                 << ", re-queuing recovery" << dendl;
        std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.

      if (auto now = ceph::real_clock::now();
          service.recovery_schedule_time < now) {
        service.recovery_schedule_time = now;
      }
      service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
                                       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      // pushes stay reserved; the requeued event will consume them
      return;
    }
  }

  {
    {
      // next invocation must sleep again (cleared by the timer callback)
      std::lock_guard l(service.sleep_lock);
      service.recovery_needs_sleep = true;
    }

    // the pg may have been reset since this work item was queued
    if (pg->pg_has_reset_since(queued)) {
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
             << " on " << *pg << dendl;

    // recovery stalled on unfound objects: kick off a peering-level search
    if (do_unfound) {
      PeeringCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
9506
// Account the start of one recovery op for object soid on pg; the
// matching finish_recovery_op() decrements the counter.
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
           << " (" << recovery_ops_active << "/"
           << osd->get_recovery_max_active() << " rops)"
           << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  // debug builds also track the exact object ids in flight per pg
  dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
  recovery_oids[pg->pg_id].insert(soid);
#endif
}
9522
// Account completion of one recovery op for soid on pg (pairs with
// start_recovery_op) and kick the throttle so queued pgs may start.
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
           << " dequeue=" << dequeue
           << " (" << recovery_ops_active << "/"
           << osd->get_recovery_max_active() << " rops)"
           << dendl;

  // adjust count
  ceph_assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  // debug builds verify the exact object id was tracked as in flight
  dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid));
  recovery_oids[pg->pg_id].erase(soid);
#endif

  // freed capacity may let a throttled pg proceed
  _maybe_queue_recovery();
}
9544
9545 bool OSDService::is_recovery_active()
9546 {
9547 if (cct->_conf->osd_debug_pretend_recovery_active) {
9548 return true;
9549 }
9550 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9551 }
9552
9553 void OSDService::release_reserved_pushes(uint64_t pushes)
9554 {
9555 std::lock_guard l(recovery_lock);
9556 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9557 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9558 << dendl;
9559 ceph_assert(recovery_ops_reserved >= pushes);
9560 recovery_ops_reserved -= pushes;
9561 _maybe_queue_recovery();
9562 }
9563
9564 // =========================================================
9565 // OPS
9566
9567 bool OSD::op_is_discardable(const MOSDOp *op)
9568 {
9569 // drop client request if they are not connected and can't get the
9570 // reply anyway.
9571 if (!op->get_connection()->is_connected()) {
9572 return true;
9573 }
9574 return false;
9575 }
9576
// Queue a client/replica op for the given pg on the sharded op workqueue,
// recording queue latency and tracing metadata along the way.
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  // time spent between receive and enqueue
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  // scheduler fairness key: the sending entity's id
  const uint64_t owner = op->get_req()->get_source().num();

  dout(15) << "enqueue_op " << op << " prio " << priority
           << " cost " << cost
           << " latency " << latency
           << " epoch " << epoch
           << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);
  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  op_shardedwq.queue(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
      cost, priority, stamp, owner, epoch));
}
9600
// Queue a peering event for pgid on the sharded op workqueue at the
// configured peering priority.
void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
{
  dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
  op_shardedwq.queue(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
      10,  // nominal cost for a peering event (not a client op)
      cct->_conf->osd_peering_op_priority,
      utime_t(),  // no receive stamp for internally generated events
      0,          // owner 0: not attributable to a client
      evt->get_epoch_sent()));
}
9613
/*
 * NOTE: dequeue called in worker thread, with pg lock
 */
// Process one queued op against its pg: record dequeue latency, opportunely
// share our map with the sender, then hand the op to PG::do_request().
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  const Message *m = op->get_req();

  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);

  // time spent queued (receive -> dequeue)
  utime_t latency = now - m->get_recv_stamp();
  dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
           << " cost " << m->get_cost()
           << " latency " << latency
           << " " << *m
           << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  // make sure the sender has a map at least as new as the pg's
  service.maybe_share_map(m->get_connection().get(),
                          pg->get_osdmap(),
                          op->sent_epoch);

  // pg is going away; drop the op on the floor
  if (pg->is_deleting())
    return;

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << op << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
}
9654
9655
// Process one queued peering event.  With no pg, only pg-less queries are
// legal (answered via handle_pg_query_nopg); otherwise the pg is advanced
// to the shard's map epoch and the event delivered to its state machine.
// Called with the pg lock held (when pg != null); unlocks it before return.
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  PeeringCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  bool need_up_thru = false;
  epoch_t same_interval_since = 0;
  if (!pg) {
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      // any other pg-less event indicates a logic error
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
    pg->do_peering_event(evt, rctx);
    if (pg->is_deleted()) {
      // deleted during the event; nothing more to dispatch
      pg->unlock();
      return;
    }
    dispatch_context(rctx, pg, curmap, &handle);
    // capture these under the pg lock for use after unlock
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }

  // flush any pg_temp requests the event generated
  service.send_pg_temp();
}
9691
9692 void OSD::dequeue_delete(
9693 OSDShard *sdata,
9694 PG *pg,
9695 epoch_t e,
9696 ThreadPool::TPHandle& handle)
9697 {
9698 dequeue_peering_evt(
9699 sdata,
9700 pg,
9701 PGPeeringEventRef(
9702 std::make_shared<PGPeeringEvent>(
9703 e, e,
9704 PeeringState::DeleteSome())),
9705 handle);
9706 }
9707
9708
9709
9710 // --------------------------------
9711
// Config-observer hook: the list of config options whose runtime changes
// this daemon wants delivered to handle_conf_change().  NULL-terminated.
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_object_clean_region_max_num_intervals",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL
  };
  return KEYS;
}
9751
// Config-observer callback: apply runtime changes for every tracked option
// in `changed` to the relevant subsystem, then re-run config sanity checks.
void OSD::handle_conf_change(const ConfigProxy& conf,
                             const std::set <std::string> &changed)
{
  std::lock_guard l{osd_lock};
  // backfill reservation limits (both directions)
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  // op tracker tuning
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  // all three osdmap caches share the same size knob
  if (changed.count("osd_map_cache_size")) {
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  // any clog routing option -> rebuild the log client config
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      // re-evaluate fuse mount state against the new setting
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  // client throttles: only raise/adjust when the new cap is positive
  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }
  if (changed.count("osd_object_clean_region_max_num_intervals")) {
    ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
  }

  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  check_config();
}
9844
// Re-parse the clog routing options from config and push them into the
// log client.  The final derr is unconditional and serves as a startup /
// reconfig breadcrumb in the daemon log.
void OSD::update_log_config()
{
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  // only apply when parsing succeeded; on failure the old config stands
  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
                               log_channel, log_prio, log_to_graylog,
                               log_to_graylog_host, log_to_graylog_port,
                               fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
                        log_channel, log_prio, log_to_graylog,
                        log_to_graylog_host, log_to_graylog_port,
                        fsid, host);
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
9867
// Emit cluster-log warnings for config combinations that are legal but
// likely harmful.
void OSD::check_config()
{
  // some sanity checks
  // NOTE(review): the check compares against persisted_max_stale + 2, but
  // the warning text quotes only the raw option value — confirm whether the
  // message should mention the +2 slack.
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
                 << " is not > osd_pg_epoch_persisted_max_stale ("
                 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
  if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
    clog->warn() << "osd_object_clean_region_max_num_intervals ("
                 << cct->_conf->osd_object_clean_region_max_num_intervals
                 << ") is < 0";
  }
}
9882
9883 // --------------------------------
9884
9885 void OSD::get_latest_osdmap()
9886 {
9887 dout(10) << __func__ << " -- start" << dendl;
9888
9889 C_SaferCond cond;
9890 service.objecter->wait_for_latest_osdmap(&cond);
9891 cond.wait();
9892
9893 dout(10) << __func__ << " -- finish" << dendl;
9894 }
9895
9896 // --------------------------------
9897
// mgr hook: install a new set of dynamic perf-metric queries.  Queries with
// an empty key descriptor are unsupported and filtered out; the surviving
// set is stored and pushed down to every pg.
void OSD::set_perf_queries(const ConfigPayload &config_payload) {
  const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
  const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
  dout(10) << "setting " << queries.size() << " queries" << dendl;

  std::list<OSDPerfMetricQuery> supported_queries;
  for (auto &it : queries) {
    auto &query = it.first;
    // queries without a key descriptor cannot be evaluated here
    if (!query.key_descriptor.empty()) {
      supported_queries.push_back(query);
    }
  }
  if (supported_queries.size() < queries.size()) {
    dout(1) << queries.size() - supported_queries.size()
            << " unsupported queries" << dendl;
  }
  {
    std::lock_guard locker{m_perf_queries_lock};
    m_perf_queries = supported_queries;
    m_perf_limits = queries;
  }
  // propagate the new query set to every pg
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    std::scoped_lock l{*pg};
    pg->set_dynamic_perf_stats_queries(supported_queries);
  }
}
9926
// mgr hook: collect dynamic perf stats from every pg and merge them into a
// single per-query report payload.
MetricPayload OSD::get_perf_reports() {
  OSDMetricPayload payload;
  std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;

  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by by mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    dps.merge(pg_dps);
  }
  dps.add_to_reports(m_perf_limits, &reports);
  dout(20) << "reports for " << reports.size() << " queries" << dendl;

  return payload;
}
9950
9951 // =============================================================
9952
9953 #undef dout_context
9954 #define dout_context cct
9955 #undef dout_prefix
9956 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
9957
9958 void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
9959 {
9960 dout(10) << pg->pg_id << " " << pg << dendl;
9961 slot->pg = pg;
9962 pg->osd_shard = this;
9963 pg->pg_slot = slot;
9964 osd->inc_num_pgs();
9965
9966 slot->epoch = pg->get_osdmap_epoch();
9967 pg_slots_by_epoch.insert(*slot);
9968 }
9969
// Unbind a pg from its shard slot: clear the cross-links, drop it from the
// epoch-ordered index, and wake anyone waiting on the shard's min pg epoch.
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  // remove from the epoch index before zeroing the epoch
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  // removing a slot can raise the shard's minimum epoch
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
9984
// Record that slot's pg has advanced to epoch e: re-key the slot in the
// epoch-ordered set (erase, update, re-insert) and wake min-epoch waiters.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  // begin() is safe to dereference here: slot itself is in the set
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
           << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  // advancing this slot may have raised the shard-wide minimum epoch
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10000
10001 epoch_t OSDShard::get_min_pg_epoch()
10002 {
10003 std::lock_guard l(shard_lock);
10004 auto p = pg_slots_by_epoch.begin();
10005 if (p == pg_slots_by_epoch.end()) {
10006 return 0;
10007 }
10008 return p->epoch;
10009 }
10010
// Block until every pg on this shard has caught up to epoch `need` (or the
// shard is empty).  waiting_for_min_pg_epoch tells notifiers someone is
// parked on the condition variable.
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  std::unique_lock l{shard_lock};
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      // ordered by epoch: front is the minimum
      return true;
    } else {
      dout(10) << need << " waiting on "
               << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10028
10029 epoch_t OSDShard::get_max_waiting_epoch()
10030 {
10031 std::lock_guard l(shard_lock);
10032 epoch_t r = 0;
10033 for (auto& i : pg_slots) {
10034 if (!i.second->waiting_peering.empty()) {
10035 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10036 }
10037 }
10038 return r;
10039 }
10040
// Install a new osdmap on this shard and reconcile every pg slot against
// it: requeue peering work that is now runnable, drop queued ops that are
// stale or misdirected under the new map (crediting their reserved pushes
// back via *pushes_to_free), and prune slots that are completely idle.
void OSDShard::consume_map(
  const OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // shard_osdmap has its own lock for lock-free-ish readers
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
           << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    // slots blocked on a split or a pending merge are left untouched
    if (!slot->waiting_for_split.empty()) {
      dout(20) << __func__ << " " << pgid
               << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      dout(20) << __func__ << " " << pgid
               << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
               << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      // peering events whose epoch we have now reached become runnable
      if (first <= new_osdmap->get_epoch()) {
        dout(20) << __func__ << " " << pgid
                 << " pending_peering first epoch " << first
                 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
        _wake_pg_slot(pgid, slot);
        queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      // still mapped to us: keep the queued ops
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
        dout(20) << __func__ << " " << pgid << " maps to us, keeping"
                 << dendl;
        ++p;
        continue;
      }
      // no longer ours: drop queued items whose map epoch the new map
      // supersedes, returning their reserved recovery pushes
      while (!slot->waiting.empty() &&
             slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
        auto& qi = slot->waiting.front();
        dout(20) << __func__ << " " << pgid
                 << " waiting item " << qi
                 << " epoch " << qi.get_map_epoch()
                 << " <= " << new_osdmap->get_epoch()
                 << ", "
                 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
                     "misdirected")
                 << ", dropping" << dendl;
        *pushes_to_free += qi.get_reserved_pushes();
        slot->waiting.pop_front();
      }
    }
    // fully idle slot with no pg instantiated: prune it
    if (slot->waiting.empty() &&
        slot->num_running == 0 &&
        slot->waiting_for_split.empty() &&
        !slot->pg) {
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    // wake a worker to pick up the requeued peering events
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10126
// Requeue everything parked on this slot back onto the scheduler.
// Each list is walked in reverse because enqueue_front() is used:
// pushing items to the front in reverse order preserves their original
// relative ordering in the queue.  Caller must hold shard_lock.
void OSDShard::_wake_pg_slot(
  spg_t pgid,
  OSDShardPGSlot *slot)
{
  dout(20) << __func__ << " " << pgid
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;
  for (auto i = slot->to_process.rbegin();
       i != slot->to_process.rend();
       ++i) {
    scheduler->enqueue_front(std::move(*i));
  }
  slot->to_process.clear();
  for (auto i = slot->waiting.rbegin();
       i != slot->waiting.rend();
       ++i) {
    scheduler->enqueue_front(std::move(*i));
  }
  slot->waiting.clear();
  for (auto i = slot->waiting_peering.rbegin();
       i != slot->waiting_peering.rend();
       ++i) {
    // this is overkill; we requeue everything, even if some of these
    // items are waiting for maps we don't have yet.  FIXME, maybe,
    // someday, if we decide this inefficiency matters
    for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
      scheduler->enqueue_front(std::move(*j));
    }
  }
  slot->waiting_peering.clear();
  // bump requeue_seq so a racing _process() can detect that its snapshot
  // of the slot is stale and must be retried
  ++slot->requeue_seq;
}
10160
10161 void OSDShard::identify_splits_and_merges(
10162 const OSDMapRef& as_of_osdmap,
10163 set<pair<spg_t,epoch_t>> *split_pgs,
10164 set<pair<spg_t,epoch_t>> *merge_pgs)
10165 {
10166 std::lock_guard l(shard_lock);
10167 if (shard_osdmap) {
10168 for (auto& i : pg_slots) {
10169 const spg_t& pgid = i.first;
10170 auto *slot = i.second.get();
10171 if (slot->pg) {
10172 osd->service.identify_splits_and_merges(
10173 shard_osdmap, as_of_osdmap, pgid,
10174 split_pgs, merge_pgs);
10175 } else if (!slot->waiting_for_split.empty()) {
10176 osd->service.identify_splits_and_merges(
10177 shard_osdmap, as_of_osdmap, pgid,
10178 split_pgs, nullptr);
10179 } else {
10180 dout(20) << __func__ << " slot " << pgid
10181 << " has no pg and waiting_for_split " << dendl;
10182 }
10183 }
10184 }
10185 }
10186
10187 void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10188 set<pair<spg_t,epoch_t>> *pgids)
10189 {
10190 std::lock_guard l(shard_lock);
10191 _prime_splits(pgids);
10192 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10193 set<pair<spg_t,epoch_t>> newer_children;
10194 for (auto i : *pgids) {
10195 osd->service.identify_splits_and_merges(
10196 as_of_osdmap, shard_osdmap, i.first,
10197 &newer_children, nullptr);
10198 }
10199 newer_children.insert(pgids->begin(), pgids->end());
10200 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10201 << shard_osdmap->get_epoch() << ", new children " << newer_children
10202 << dendl;
10203 _prime_splits(&newer_children);
10204 // note: we don't care what is left over here for other shards.
10205 // if this shard is ahead of us and one isn't, e.g., one thread is
10206 // calling into prime_splits via _process (due to a newly created
10207 // pg) and this shard has a newer map due to a racing consume_map,
10208 // then any grandchildren left here will be identified (or were
10209 // identified) when the slower shard's osdmap is advanced.
10210 // _prime_splits() will tolerate the case where the pgid is
10211 // already primed.
10212 }
10213 }
10214
// Pre-create (or mark) slots for split children that hash to this shard,
// recording the epoch at which each split occurs.  Consumed entries are
// erased from *pgids; entries belonging to other shards are left behind
// for the caller to distribute.  Caller must hold shard_lock.
void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
{
  dout(10) << *pgids << dendl;
  auto p = pgids->begin();
  while (p != pgids->end()) {
    unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
    if (shard_index == shard_id) {
      auto r = pg_slots.emplace(p->first, nullptr);
      if (r.second) {
	// brand-new slot
	dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
	r.first->second = make_unique<OSDShardPGSlot>();
	r.first->second->waiting_for_split.insert(p->second);
      } else {
	// slot already exists (already primed, or pg still present);
	// just record the additional split epoch
	auto q = r.first;
	ceph_assert(q != pg_slots.end());
	dout(10) << "priming (existing) slot " << p->first << " e" << p->second
		 << dendl;
	q->second->waiting_for_split.insert(p->second);
      }
      p = pgids->erase(p);
    } else {
      ++p;
    }
  }
}
10240
10241 void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10242 set<pair<spg_t,epoch_t>> *merge_pgs)
10243 {
10244 std::lock_guard l(shard_lock);
10245 dout(20) << __func__ << " checking shard " << shard_id
10246 << " for remaining merge pgs " << merge_pgs << dendl;
10247 auto p = merge_pgs->begin();
10248 while (p != merge_pgs->end()) {
10249 spg_t pgid = p->first;
10250 epoch_t epoch = p->second;
10251 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10252 if (shard_index != shard_id) {
10253 ++p;
10254 continue;
10255 }
10256 OSDShardPGSlot *slot;
10257 auto r = pg_slots.emplace(pgid, nullptr);
10258 if (r.second) {
10259 r.first->second = make_unique<OSDShardPGSlot>();
10260 }
10261 slot = r.first->second.get();
10262 if (slot->pg) {
10263 // already have pg
10264 dout(20) << __func__ << " have merge participant pg " << pgid
10265 << " " << slot->pg << dendl;
10266 } else if (!slot->waiting_for_split.empty() &&
10267 *slot->waiting_for_split.begin() < epoch) {
10268 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10269 << " " << slot->waiting_for_split << dendl;
10270 } else {
10271 dout(20) << __func__ << " creating empty merge participant " << pgid
10272 << " for merge in " << epoch << dendl;
10273 // leave history zeroed; PG::merge_from() will fill it in.
10274 pg_history_t history;
10275 PGCreateInfo cinfo(pgid, epoch - 1,
10276 history, PastIntervals(), false);
10277 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10278 _attach_pg(r.first->second.get(), pg.get());
10279 _wake_pg_slot(pgid, slot);
10280 pg->unlock();
10281 }
10282 // mark slot for merge
10283 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10284 slot->waiting_for_merge_epoch = epoch;
10285 p = merge_pgs->erase(p);
10286 }
10287 }
10288
// Attach a newly created split-child PG to its (previously primed) slot,
// clear that split epoch from the slot, and — once no further splits are
// pending — requeue everything parked on the slot.  Finally kick the
// child with a null peering event so it advances to the latest osdmap.
void OSDShard::register_and_wake_split_child(PG *pg)
{
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    // the slot must have been primed (with no pg attached) by
    // _prime_splits() before the child was created
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      // no more pending splits; requeue anything parked on the slot
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  // wake a worker in case the shard was idle
  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
10327
10328 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10329 {
10330 std::lock_guard l(shard_lock);
10331 vector<spg_t> to_delete;
10332 for (auto& i : pg_slots) {
10333 if (i.first != parent &&
10334 i.first.get_ancestor(old_pg_num) == parent) {
10335 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10336 << dendl;
10337 _wake_pg_slot(i.first, i.second.get());
10338 to_delete.push_back(i.first);
10339 }
10340 }
10341 for (auto pgid : to_delete) {
10342 pg_slots.erase(pgid);
10343 }
10344 }
10345
// Construct one work-queue shard.  Named mutexes are built from the
// shard name for lockdep/debugging; the op scheduler implementation is
// chosen from configuration via make_scheduler().
OSDShard::OSDShard(
  int id,
  CephContext *cct,
  OSD *osd)
  : shard_id(id),
    cct(cct),
    osd(osd),
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    scheduler(ceph::osd::scheduler::make_scheduler(cct)),
    // context_queue wakes sdata_cond under sdata_wait_lock when contexts
    // are queued
    context_queue(sdata_wait_lock, sdata_cond)
{
  dout(0) << "using op scheduler " << *scheduler << dendl;
}
10364
10365
10366 // =============================================================
10367
10368 #undef dout_context
10369 #define dout_context osd->cct
10370 #undef dout_prefix
10371 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10372
10373 void OSD::ShardedOpWQ::_add_slot_waiter(
10374 spg_t pgid,
10375 OSDShardPGSlot *slot,
10376 OpSchedulerItem&& qi)
10377 {
10378 if (qi.is_peering()) {
10379 dout(20) << __func__ << " " << pgid
10380 << " peering, item epoch is "
10381 << qi.get_map_epoch()
10382 << ", will wait on " << qi << dendl;
10383 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10384 } else {
10385 dout(20) << __func__ << " " << pgid
10386 << " item epoch is "
10387 << qi.get_map_epoch()
10388 << ", will wait on " << qi << dendl;
10389 slot->waiting.push_back(std::move(qi));
10390 }
10391 }
10392
10393 #undef dout_prefix
10394 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10395
// Worker-thread main loop body: dequeue one item from this thread's
// shard, resolve the pg slot it belongs to (possibly creating the pg,
// parking the item, or dropping it depending on the osdmap), then run
// it.  Also drains the shard's oncommit context queue on the designated
// lowest-index thread.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  // threads are striped across shards
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  // callback.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    // nothing to do: block on the shard's condvar until work (or a stop
    // request) arrives
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // suspend the heartbeat timeout while idle; we are not stuck
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->scheduler->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	// spurious wakeup (or work stolen by another thread); try again
	// on the next _process() call
	sdata->shard_lock.unlock();
	return;
      }
      // found work; re-arm the heartbeat timeout
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
	osd->cct->_conf->threadpool_default_timeout, 0);
    } else {
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // grab any pending oncommit contexts (lowest-index thread only)
  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    sdata->context_queue.move_to(oncommits);
  }

  if (sdata->scheduler->empty()) {
    // only contexts to run (or shutdown)
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpSchedulerItem item = sdata->scheduler->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // find (or create) the slot for this item's pg and queue the item on
  // its to_process list
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // we must drop shard_lock to take the pg lock (lock ordering), so
    // re-validate the slot afterwards — it may have changed underneath us
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      // slot was requeued while we were taking the pg lock; our item has
      // been moved back to the scheduler
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg attached to the slot: decide whether to create one, park the
  // item, run it pg-less, or drop it, based on the shard's osdmap.
  // (written as a loop only so 'break' can exit the pg-created case; it
  // otherwise returns.)
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      // a split is pending; park the item until it completes
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // item is from the future relative to our map; park it
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg! drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      // non-peering item for a pg that should exist but hasn't been
      // instantiated yet; park it
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
				     sdata->shard_osdmap,
				     (*_op)->sent_epoch);
      }
      // give back any recovery pushes the dropped item had reserved
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // peering event needs a newer map than we have; park it
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // children that hash to other shards (from the pg-create path above)
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // actually execute the item (pg lock is held if pg is set)
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
10689
10690 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
10691 uint32_t shard_index =
10692 item.get_ordering_token().hash_to_shard(osd->shards.size());
10693
10694 dout(20) << __func__ << " " << item << dendl;
10695
10696 OSDShard* sdata = osd->shards[shard_index];
10697 assert (NULL != sdata);
10698
10699 bool empty = true;
10700 {
10701 std::lock_guard l{sdata->shard_lock};
10702 empty = sdata->scheduler->empty();
10703 sdata->scheduler->enqueue(std::move(item));
10704 }
10705
10706 if (empty) {
10707 std::lock_guard l{sdata->sdata_wait_lock};
10708 sdata->sdata_cond.notify_one();
10709 }
10710 }
10711
// Requeue an item at the front of its shard's queue so it runs before
// anything newer for the same pg.
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process: swap it with the slot's last
    // to_process entry and front-enqueue that one instead.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // wake a worker in case the shard was idle
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
10739
10740 namespace ceph {
10741 namespace osd_cmds {
10742
10743 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
10744 std::ostream& os)
10745 {
10746 if (!ceph_using_tcmalloc()) {
10747 os << "could not issue heap profiler command -- not using tcmalloc!";
10748 return -EOPNOTSUPP;
10749 }
10750
10751 string cmd;
10752 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
10753 os << "unable to get value for command \"" << cmd << "\"";
10754 return -EINVAL;
10755 }
10756
10757 std::vector<std::string> cmd_vec;
10758 get_str_vec(cmd, cmd_vec);
10759
10760 string val;
10761 if (cmd_getval(cmdmap, "value", val)) {
10762 cmd_vec.push_back(val);
10763 }
10764
10765 ceph_heap_profiler_handle_command(cmd_vec, os);
10766
10767 return 0;
10768 }
10769
10770 }} // namespace ceph::osd_cmds