// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "acconfig.h"

#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>

#include <unistd.h>
#include <sys/stat.h>
#include <signal.h>
#include <time.h>
#include <boost/range/adaptor/reversed.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"
#include "osd/scrubber/scrub_machine.h"
#include "osd/scrubber/pg_scrubber.h"

#include "include/types.h"
#include "include/compat.h"
#include "include/random.h"
#include "include/scope_guard.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_releases.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/async/blocked_completion.h"
#include "common/pick_address.h"
#include "common/blkdev.h"
#include "common/numa.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"

#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDMarkMeDead.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"
#include "messages/MOSDPGReadyToMerge.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGNotify2.h"
#include "messages/MOSDPGQuery2.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGInfo2.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGCreate2.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDPeeringOp.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrub2.h"

#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"

#include "messages/MMonGetPurgedSnaps.h"
#include "messages/MMonGetPurgedSnapsReply.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/ClassHandler.h"
#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/ceph_assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_writer.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#include "osd_tracer.h"


#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())

using std::deque;
using std::list;
using std::lock_guard;
using std::make_pair;
using std::make_tuple;
using std::make_unique;
using std::map;
using std::ostream;
using std::ostringstream;
using std::pair;
using std::set;
using std::string;
using std::stringstream;
using std::to_string;
using std::unique_ptr;
using std::vector;

using ceph::bufferlist;
using ceph::bufferptr;
using ceph::decode;
using ceph::encode;
using ceph::fixed_u_to_string;
using ceph::Formatter;
using ceph::heartbeat_handle_d;
using ceph::make_mutex;

using namespace ceph::osd::scheduler;
using TOPNSPC::common::cmd_getval;
using TOPNSPC::common::cmd_getval_or;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}


// Initial features in new superblock.
// Features here are also automatically upgraded.
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}

// Features that this OSD supports are added here.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in the initial superblock.
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}

OSDService::OSDService(OSD *osd, ceph::async::io_context_pool& poolctx) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store.get()),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  m_scrub_queue{cct, *this},
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  poolctx(poolctx),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
                                 osd->objecter_messenger,
                                 osd->monc, poolctx)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}

#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif


ceph::signedspan OSDService::get_mnow()
{
  return ceph::mono_clock::now() - osd->startup_time;
}

void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
           << " to e" << new_map->get_epoch()
           << " pg_nums " << p->second << dendl;
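  // Example (assuming the usual split rule, where a split child's ps() is
  // the parent's ps() plus a multiple of the old pg_num): if pg_num for
  // pool 3 goes 4 -> 8 at some epoch, pg 3.2 gains child 3.6 (ps 2 + old
  // pg_num 4); a later 8 -> 4 merge makes 3.6 a merge source whose target
  // is 3.2 again.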
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    for (auto q = p->second.lower_bound(old_map->get_epoch());
         q != p->second.end() &&
           q->first <= new_map->get_epoch();
         ++q) {
      if (pgnum < q->second) {
        // split?
        if (cur.ps() < pgnum) {
          set<spg_t> children;
          if (cur.is_split(pgnum, q->second, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " children " << children << dendl;
            for (auto i : children) {
              split_children->insert(make_pair(i, q->first));
              if (!did.count(i))
                queue.push_back(i);
            }
          }
        } else if (cur.ps() < q->second) {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is a child" << dendl;
          // normally we'd capture this from the parent, but it's
          // possible the parent doesn't exist yet (it will be
          // fabricated to allow an intervening merge). note this PG
          // as a split child here to be sure we catch it.
          split_children->insert(make_pair(cur, q->first));
        } else {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is post-split, skipping" << dendl;
        }
      } else if (merge_pgs) {
        // merge?
        if (cur.ps() >= q->second) {
          if (cur.ps() < pgnum) {
            spg_t parent;
            if (cur.is_merge_source(pgnum, q->second, &parent)) {
              set<spg_t> children;
              parent.is_split(q->second, pgnum, &children);
              dout(20) << __func__ << " " << cur << " e" << q->first
                       << " pg_num " << pgnum << " -> " << q->second
                       << " is merge source, target " << parent
                       << ", source(s) " << children << dendl;
              merge_pgs->insert(make_pair(parent, q->first));
              if (!did.count(parent)) {
                // queue (and re-scan) parent in case it might not exist yet
                // and there are some future splits pending on it
                queue.push_back(parent);
              }
              for (auto c : children) {
                merge_pgs->insert(make_pair(c, q->first));
                if (!did.count(c))
                  queue.push_back(c);
              }
            }
          } else {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is beyond old pgnum, skipping" << dendl;
          }
        } else {
          set<spg_t> children;
          if (cur.is_split(q->second, pgnum, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is merge target, source " << children << dendl;
            for (auto c : children) {
              merge_pgs->insert(make_pair(c, q->first));
              if (!did.count(c))
                queue.push_back(c);
            }
            merge_pgs->insert(make_pair(cur, q->first));
          }
        }
      }
      pgnum = q->second;
    }
  }
}

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
{
  std::lock_guard l(hb_stamp_lock);
  if (peer >= hb_stamps.size()) {
    hb_stamps.resize(peer + 1);
  }
  if (!hb_stamps[peer]) {
    hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
  }
  return hb_stamps[peer];
}

void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
        epoch, epoch,
        RenewLease())));
}

void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  std::lock_guard l{agent_lock};
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.notify_all();
}

void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}


class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
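    // e.g. (hypothetical numbers): with osd_agent_max_ops=4 and agent_ops=1,
    // max is 3; when no pool is in high-speed flush mode, the quota is
    // instead derived from the lower osd_agent_max_low_ops bound.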
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time (5 seconds by default)
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}

void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  agent_thread.join();
}

// -------------------------------------

void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << byte_u_t(bytes) << "; target "
           << target_obj_sec << " obj/sec or "
           << byte_u_t(target_bytes_sec) << "/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
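  // Worked example (hypothetical numbers): with target_obj_sec=100, dur=1s
  // and attempts=500, po = 100 * 1 * 1000 / 500 = 200, i.e. promoting ~20%
  // of attempts (probabilities are in thousandths) would hit the object
  // target; pb is derived the same way from the byte target and the
  // observed average object size.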
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
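  // e.g., a configured value of 97 is interpreted as the ratio 0.97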
  return full_ratio;
}

OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precedence. So if the failsafe is .95 and the
  // admin sets the cluster full to .96, the failsafe moves up to .96 too.
  // (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
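  // Example (hypothetical values): with nearfull=.85, backfillfull=.90,
  // full=.95 and osd_failsafe_full_ratio=.97, the max() chain below keeps
  // nearfull <= backfillfull <= full <= failsafe; raising full to .98 would
  // pull failsafe up to .98 as well.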
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}

void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
           << ", physical ratio " << pratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
                    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
{
  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe
    // full, or, if -1, a request to always return full
    if (injectfull > 0)
      --injectfull;
    ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
                       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
                       << dendl;
    return true;
  }
  return false;
}

bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
{
  std::lock_guard l(full_status_lock);

  if (_check_inject_full(dpp, type))
    return true;

  if (cur_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
                       << " physical " << physical_ratio << dendl;

  return cur_state >= type;
}

bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}

bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}

bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

void OSDService::set_statfs(const struct store_statfs_t &stbuf,
                            osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
            << " adjust available " << avail
            << dendl;
    used = bytes - avail;
  }
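  // e.g. (hypothetical): with fake_statfs_for_testing=100GiB and PGs
  // accounting for 10GiB, avail becomes 90GiB and used 10GiB, regardless
  // of what the real backing device reports.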

  logger->set(l_osd_stat_bytes, bytes);
  logger->set(l_osd_stat_bytes_used, used);
  logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing we don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}

osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
                                    int num_pgs)
{
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated
  // This is called often enough that we can just remove 1 at a time
  for (auto i: osd_stat.hb_pingtime) {
    if (i.second.last_update == 0)
      continue;
    if (stale_time && now.sec() - i.second.last_update > stale_time) {
      dout(20) << __func__ << " time out heartbeat for osd " << i.first
               << " last_update " << i.second.last_update << dendl;
      osd_stat.hb_pingtime.erase(i.first);
      break;
    }
  }
  return osd_stat;
}

void OSDService::inc_osd_stat_repaired()
{
  std::lock_guard l(stat_lock);
  osd_stat.num_shards_repaired++;
  return;
}

float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                         uint64_t adjust_used)
{
  *pratio =
    ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used() << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
}

void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
        next_map->get_info(iter.first).up_from > from_epoch) {
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  release_map(next_map);
}

ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con;
  if (peer == whoami) {
    con = osd->cluster_messenger->get_loopback_connection();
  } else {
    con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}

entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}

void OSDService::queue_want_pg_temp(pg_t pgid,
                                    const vector<int>& want,
                                    bool forced)
{
  std::lock_guard l(pg_temp_lock);
  auto p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second.acting != want ||
      forced) {
    pg_temp_wanted[pgid] = {want, forced};
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

std::ostream& operator<<(std::ostream& out,
                         const OSDService::pg_temp_t& pg_temp)
{
  out << pg_temp.acting;
  if (pg_temp.forced) {
    out << " (forced)";
  }
  return out;
}

void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}

void OSDService::send_pg_created(pg_t pgid)
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    pg_created.insert(pgid);
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}

void OSDService::send_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  if (o->require_osd_release >= ceph_release_t::luminous) {
    for (auto pgid : pg_created) {
      monc->send_mon_message(new MOSDPGCreated(pgid));
    }
  }
}

void OSDService::prune_pg_created()
{
  std::lock_guard l(pg_created_lock);
  dout(20) << __func__ << dendl;
  auto o = get_osdmap();
  auto i = pg_created.begin();
  while (i != pg_created.end()) {
    auto p = o->get_pg_pool(i->pool());
    if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
      dout(20) << __func__ << " pruning " << *i << dendl;
      i = pg_created.erase(i);
    } else {
      dout(20) << __func__ << " keeping " << *i << dendl;
      ++i;
    }
  }
}


// --------------------------------------
// dispatch

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down and dead " << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true,  // request ack
        true   // mark as down and dead
        ));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
                              [this] { return get_state() == STOPPING; });
  }

  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  std::scoped_lock l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.notify_all();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
             << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since] = std::move(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e] = std::move(bl);
    } else {
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
        derr << __func__ << " also missing full map " << e << dendl;
        goto panic;
      }
      m->maps[e] = std::move(bl);
    }
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map] = std::move(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
           << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map] = std::move(bl);
  }
  return m;
}

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      const OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid(),
                               osdmap->get_encoding_features());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }
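    // e.g. (hypothetical): since=100, to=500 with osd_map_share_max_epochs=40
    // trims the request to since=460, so only the 40 newest epochs are sent.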

    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  std::lock_guard l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
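  // (Dedup is an optimization: maps at adjacent epochs can share storage
  // for fields that did not change between them, reducing cache memory.)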
1508 bool existed;
1509 OSDMapRef l = map_cache.add(e, o, &existed);
1510 if (existed) {
1511 delete o;
1512 }
1513 return l;
1514}
1515
1516OSDMapRef OSDService::try_get_map(epoch_t epoch)
1517{
1518 std::lock_guard l(map_cache_lock);
1519 OSDMapRef retval = map_cache.lookup(epoch);
1520 if (retval) {
1521 dout(30) << "get_map " << epoch << " -cached" << dendl;
1522 logger->inc(l_osd_map_cache_hit);
1523 return retval;
1524 }
1525 {
1526 logger->inc(l_osd_map_cache_miss);
1527 epoch_t lb = map_cache.cached_key_lower_bound();
1528 if (epoch < lb) {
1529 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1530 logger->inc(l_osd_map_cache_miss_low);
1531 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1532 }
1533 }
1534
1535 OSDMap *map = new OSDMap;
1536 if (epoch > 0) {
1537 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1538 bufferlist bl;
1539 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1540 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1541 delete map;
1542 return OSDMapRef();
1543 }
1544 map->decode(bl);
1545 } else {
1546 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1547 }
1548 return _add_map(map);
1549}
1550
1551// ops
1552
1553
1554void OSDService::reply_op_error(OpRequestRef op, int err)
1555{
1556 reply_op_error(op, err, eversion_t(), 0, {});
1557}
1558
1559void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1560 version_t uv,
1561 vector<pg_log_op_return_item_t> op_returns)
1562{
1563 auto m = op->get_req<MOSDOp>();
1564 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1565 int flags;
1566 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1567
1568 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1569 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
1570 reply->set_reply_versions(v, uv);
1571 reply->set_op_returns(op_returns);
1572 m->get_connection()->send_message(reply);
1573}
1574
1575void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1576{
1577 if (!cct->_conf->osd_debug_misdirected_ops) {
1578 return;
1579 }
1580
1581 auto m = op->get_req<MOSDOp>();
1582 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1583
1584 ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);
1585
1586 if (pg->is_ec_pg()) {
1587 /**
1588 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1589 * can get this result:
1590 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1591 * [CRUSH_ITEM_NONE, 2, 3]/3
1592 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1593 * [3, 2, 3]/3
1594 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1595 * -- misdirected op
1596 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1597 * it and fulfils it
1598 *
1599 * We can't compute the op target based on the sending map epoch due to
1600 * splitting. The simplest thing is to detect such cases here and drop
1601 * them without an error (the client will resend anyway).
1602 */
1603 ceph_assert(m->get_map_epoch() <= superblock.newest_map);
1604 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1605 if (!opmap) {
1606 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1607 << m->get_map_epoch() << ", dropping" << dendl;
1608 return;
1609 }
1610 pg_t _pgid = m->get_raw_pg();
1611 spg_t pgid;
1612 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1613 _pgid = opmap->raw_pg_to_pg(_pgid);
1614 if (opmap->get_primary_shard(_pgid, &pgid) &&
1615 pgid.shard != pg->pg_id.shard) {
1616 dout(7) << __func__ << ": " << *pg << " primary changed since "
1617 << m->get_map_epoch() << ", dropping" << dendl;
1618 return;
1619 }
1620 }
1621
1622 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1623 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1624 << " pg " << m->get_raw_pg()
1625 << " to osd." << whoami
1626 << " not " << pg->get_acting()
1627 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
1628}
1629
1630void OSDService::enqueue_back(OpSchedulerItem&& qi)
1631{
1632 osd->op_shardedwq.queue(std::move(qi));
1633}
1634
1635void OSDService::enqueue_front(OpSchedulerItem&& qi)
1636{
1637 osd->op_shardedwq.queue_front(std::move(qi));
1638}
1639
1640void OSDService::queue_recovery_context(
1641 PG *pg,
1642 GenContext<ThreadPool::TPHandle&> *c)
1643{
1644 epoch_t e = get_osdmap_epoch();
1645 enqueue_back(
1646 OpSchedulerItem(
1647 unique_ptr<OpSchedulerItem::OpQueueable>(
1648 new PGRecoveryContext(pg->get_pgid(), c, e)),
1649 cct->_conf->osd_recovery_cost,
1650 cct->_conf->osd_recovery_priority,
1651 ceph_clock_now(),
1652 0,
1653 e));
1654}
1655
1656void OSDService::queue_for_snap_trim(PG *pg)
1657{
1658 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1659 enqueue_back(
1660 OpSchedulerItem(
1661 unique_ptr<OpSchedulerItem::OpQueueable>(
1662 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1663 cct->_conf->osd_snap_trim_cost,
1664 cct->_conf->osd_snap_trim_priority,
1665 ceph_clock_now(),
1666 0,
1667 pg->get_osdmap_epoch()));
1668}
1669
1670template <class MSG_TYPE>
1671void OSDService::queue_scrub_event_msg(PG* pg,
1672 Scrub::scrub_prio_t with_priority,
1673 unsigned int qu_priority,
1674 Scrub::act_token_t act_token)
1675{
1676 const auto epoch = pg->get_osdmap_epoch();
1677 auto msg = new MSG_TYPE(pg->get_pgid(), epoch, act_token);
1678 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg
1679 << ". Epoch: " << epoch << " token: " << act_token << dendl;
1680
1681 enqueue_back(OpSchedulerItem(
1682 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1683 pg->scrub_requeue_priority(with_priority, qu_priority), ceph_clock_now(), 0, epoch));
1684}
1685
1686template <class MSG_TYPE>
1687void OSDService::queue_scrub_event_msg(PG* pg,
1688 Scrub::scrub_prio_t with_priority)
1689{
1690 const auto epoch = pg->get_osdmap_epoch();
1691 auto msg = new MSG_TYPE(pg->get_pgid(), epoch);
1692 dout(15) << "queue a scrub event (" << *msg << ") for " << *pg << ". Epoch: " << epoch << dendl;
1693
1694 enqueue_back(OpSchedulerItem(
1695 unique_ptr<OpSchedulerItem::OpQueueable>(msg), cct->_conf->osd_scrub_cost,
1696 pg->scrub_requeue_priority(with_priority), ceph_clock_now(), 0, epoch));
1697}
1698
1699void OSDService::queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority)
1700{
1701 queue_scrub_event_msg<PGScrub>(pg, with_priority);
1702}
1703
1704void OSDService::queue_scrub_after_repair(PG* pg, Scrub::scrub_prio_t with_priority)
1705{
1706 queue_scrub_event_msg<PGScrubAfterRepair>(pg, with_priority);
1707}
1708
1709void OSDService::queue_for_rep_scrub(PG* pg,
1710 Scrub::scrub_prio_t with_priority,
1711 unsigned int qu_priority,
1712 Scrub::act_token_t act_token)
1713{
1714 queue_scrub_event_msg<PGRepScrub>(pg, with_priority, qu_priority, act_token);
1715}
1716
1717void OSDService::queue_for_rep_scrub_resched(PG* pg,
1718 Scrub::scrub_prio_t with_priority,
1719 unsigned int qu_priority,
1720 Scrub::act_token_t act_token)
1721{
1722 // Resulting scrub event: 'SchedReplica'
1723 queue_scrub_event_msg<PGRepScrubResched>(pg, with_priority, qu_priority,
1724 act_token);
1725}
1726
1727void OSDService::queue_for_scrub_granted(PG* pg, Scrub::scrub_prio_t with_priority)
1728{
1729 // Resulting scrub event: 'RemotesReserved'
1730 queue_scrub_event_msg<PGScrubResourcesOK>(pg, with_priority);
1731}
1732
1733void OSDService::queue_for_scrub_denied(PG* pg, Scrub::scrub_prio_t with_priority)
1734{
1735 // Resulting scrub event: 'ReservationFailure'
1736 queue_scrub_event_msg<PGScrubDenied>(pg, with_priority);
1737}
1738
1739void OSDService::queue_for_scrub_resched(PG* pg, Scrub::scrub_prio_t with_priority)
1740{
1741 // Resulting scrub event: 'InternalSchedScrub'
1742 queue_scrub_event_msg<PGScrubResched>(pg, with_priority);
1743}
1744
1745void OSDService::queue_scrub_pushes_update(PG* pg, Scrub::scrub_prio_t with_priority)
1746{
1747 // Resulting scrub event: 'ActivePushesUpd'
1748 queue_scrub_event_msg<PGScrubPushesUpdate>(pg, with_priority);
1749}
1750
1751void OSDService::queue_scrub_chunk_free(PG* pg, Scrub::scrub_prio_t with_priority)
1752{
1753 // Resulting scrub event: 'SelectedChunkFree'
1754 queue_scrub_event_msg<PGScrubChunkIsFree>(pg, with_priority);
1755}
1756
1757void OSDService::queue_scrub_chunk_busy(PG* pg, Scrub::scrub_prio_t with_priority)
1758{
1759 // Resulting scrub event: 'ChunkIsBusy'
1760 queue_scrub_event_msg<PGScrubChunkIsBusy>(pg, with_priority);
1761}
1762
1763void OSDService::queue_scrub_applied_update(PG* pg, Scrub::scrub_prio_t with_priority)
1764{
1765 queue_scrub_event_msg<PGScrubAppliedUpdate>(pg, with_priority);
1766}
1767
1768void OSDService::queue_scrub_unblocking(PG* pg, Scrub::scrub_prio_t with_priority)
1769{
1770 // Resulting scrub event: 'Unblocked'
1771 queue_scrub_event_msg<PGScrubUnblocked>(pg, with_priority);
1772}
1773
1774void OSDService::queue_scrub_digest_update(PG* pg, Scrub::scrub_prio_t with_priority)
1775{
1776 // Resulting scrub event: 'DigestUpdate'
1777 queue_scrub_event_msg<PGScrubDigestUpdate>(pg, with_priority);
1778}
1779
1780void OSDService::queue_scrub_got_local_map(PG* pg, Scrub::scrub_prio_t with_priority)
1781{
1782 // Resulting scrub event: 'IntLocalMapDone'
1783 queue_scrub_event_msg<PGScrubGotLocalMap>(pg, with_priority);
1784}
1785
1786void OSDService::queue_scrub_got_repl_maps(PG* pg, Scrub::scrub_prio_t with_priority)
1787{
1788 // Resulting scrub event: 'GotReplicas'
1789 queue_scrub_event_msg<PGScrubGotReplMaps>(pg, with_priority);
1790}
1791
1792void OSDService::queue_scrub_maps_compared(PG* pg, Scrub::scrub_prio_t with_priority)
1793{
1794 // Resulting scrub event: 'MapsCompared'
1795 queue_scrub_event_msg<PGScrubMapsCompared>(pg, with_priority);
1796}
1797
1798void OSDService::queue_scrub_replica_pushes(PG *pg, Scrub::scrub_prio_t with_priority)
1799{
1800 // Resulting scrub event: 'ReplicaPushesUpd'
1801 queue_scrub_event_msg<PGScrubReplicaPushes>(pg, with_priority);
1802}
1803
1804void OSDService::queue_scrub_is_finished(PG *pg)
1805{
1806 // Resulting scrub event: 'ScrubFinished'
1807 queue_scrub_event_msg<PGScrubScrubFinished>(pg, Scrub::scrub_prio_t::high_priority);
1808}
1809
1810void OSDService::queue_scrub_next_chunk(PG *pg, Scrub::scrub_prio_t with_priority)
1811{
1812 // Resulting scrub event: 'NextChunk'
1813 queue_scrub_event_msg<PGScrubGetNextChunk>(pg, with_priority);
1814}
1815
1816void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1817{
1818 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1819 enqueue_back(
1820 OpSchedulerItem(
1821 unique_ptr<OpSchedulerItem::OpQueueable>(
1822 new PGDelete(pgid, e)),
1823 cct->_conf->osd_pg_delete_cost,
1824 cct->_conf->osd_pg_delete_priority,
1825 ceph_clock_now(),
1826 0,
1827 e));
1828}
1829
1830bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
1831{
1832 return osd->try_finish_pg_delete(pg, old_pg_num);
1833}
1834
1835// ---
1836
1837void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1838{
1839 std::lock_guard l(merge_lock);
1840 dout(10) << __func__ << " " << pg->pg_id << dendl;
1841 ready_to_merge_source[pg->pg_id.pgid] = version;
1842 ceph_assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1843 _send_ready_to_merge();
1844}
1845
1846void OSDService::set_ready_to_merge_target(PG *pg,
1847 eversion_t version,
1848 epoch_t last_epoch_started,
1849 epoch_t last_epoch_clean)
1850{
1851 std::lock_guard l(merge_lock);
1852 dout(10) << __func__ << " " << pg->pg_id << dendl;
1853 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1854 make_tuple(version,
1855 last_epoch_started,
1856 last_epoch_clean)));
1857 ceph_assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1858 _send_ready_to_merge();
1859}
1860
1861void OSDService::set_not_ready_to_merge_source(pg_t source)
1862{
1863 std::lock_guard l(merge_lock);
1864 dout(10) << __func__ << " " << source << dendl;
1865 not_ready_to_merge_source.insert(source);
1866 ceph_assert(ready_to_merge_source.count(source) == 0);
1867 _send_ready_to_merge();
1868}
1869
1870void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1871{
1872 std::lock_guard l(merge_lock);
1873 dout(10) << __func__ << " " << target << " source " << source << dendl;
1874 not_ready_to_merge_target[target] = source;
1875 ceph_assert(ready_to_merge_target.count(target) == 0);
1876 _send_ready_to_merge();
1877}
1878
1879void OSDService::send_ready_to_merge()
1880{
1881 std::lock_guard l(merge_lock);
1882 _send_ready_to_merge();
1883}
1884
1885void OSDService::_send_ready_to_merge()
1886{
1887 dout(20) << __func__
1888 << " ready_to_merge_source " << ready_to_merge_source
1889 << " not_ready_to_merge_source " << not_ready_to_merge_source
1890 << " ready_to_merge_target " << ready_to_merge_target
1891 << " not_ready_to_merge_target " << not_ready_to_merge_target
1892 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1893 << dendl;
1894 for (auto src : not_ready_to_merge_source) {
1895 if (sent_ready_to_merge_source.count(src) == 0) {
1896 monc->send_mon_message(new MOSDPGReadyToMerge(
1897 src,
1898 {}, {}, 0, 0,
1899 false,
1900 osdmap->get_epoch()));
1901 sent_ready_to_merge_source.insert(src);
1902 }
1903 }
1904 for (auto p : not_ready_to_merge_target) {
1905 if (sent_ready_to_merge_source.count(p.second) == 0) {
1906 monc->send_mon_message(new MOSDPGReadyToMerge(
1907 p.second,
1908 {}, {}, 0, 0,
1909 false,
1910 osdmap->get_epoch()));
1911 sent_ready_to_merge_source.insert(p.second);
1912 }
1913 }
1914 for (auto src : ready_to_merge_source) {
1915 if (not_ready_to_merge_source.count(src.first) ||
1916 not_ready_to_merge_target.count(src.first.get_parent())) {
1917 continue;
1918 }
1919 auto p = ready_to_merge_target.find(src.first.get_parent());
1920 if (p != ready_to_merge_target.end() &&
1921 sent_ready_to_merge_source.count(src.first) == 0) {
1922 monc->send_mon_message(new MOSDPGReadyToMerge(
1923 src.first, // source pgid
1924 src.second, // src version
1925 std::get<0>(p->second), // target version
1926 std::get<1>(p->second), // PG's last_epoch_started
1927 std::get<2>(p->second), // PG's last_epoch_clean
1928 true,
1929 osdmap->get_epoch()));
1930 sent_ready_to_merge_source.insert(src.first);
1931 }
1932 }
1933}
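// Illustrative sketch (not part of the build): the rule enforced by
// _send_ready_to_merge() is that a positive ready-to-merge message for a
// source PG goes out at most once, and only when neither half of the merge
// is marked not-ready and the target has reported its versions. As a
// hypothetical predicate over the maps above:
//
//   bool would_send_positive(pg_t src) const {
//     return ready_to_merge_source.count(src) &&
//            !not_ready_to_merge_source.count(src) &&
//            !not_ready_to_merge_target.count(src.get_parent()) &&
//            ready_to_merge_target.count(src.get_parent()) &&
//            !sent_ready_to_merge_source.count(src);
//   }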
1934
1935void OSDService::clear_ready_to_merge(PG *pg)
1936{
1937 std::lock_guard l(merge_lock);
1938 dout(10) << __func__ << " " << pg->pg_id << dendl;
1939 ready_to_merge_source.erase(pg->pg_id.pgid);
1940 ready_to_merge_target.erase(pg->pg_id.pgid);
1941 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1942 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1943 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1944}
1945
1946void OSDService::clear_sent_ready_to_merge()
1947{
1948 std::lock_guard l(merge_lock);
1949 sent_ready_to_merge_source.clear();
1950}
1951
1952void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1953{
1954 std::lock_guard l(merge_lock);
1955 auto i = sent_ready_to_merge_source.begin();
1956 while (i != sent_ready_to_merge_source.end()) {
1957 if (!osdmap->pg_exists(*i)) {
1958 dout(10) << __func__ << " " << *i << dendl;
1959 i = sent_ready_to_merge_source.erase(i);
1960 } else {
1961 ++i;
1962 }
1963 }
1964}
1965
1966// ---
1967
1968void OSDService::_queue_for_recovery(
1969 std::pair<epoch_t, PGRef> p,
1970 uint64_t reserved_pushes)
1971{
1972 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
1973 enqueue_back(
1974 OpSchedulerItem(
1975 unique_ptr<OpSchedulerItem::OpQueueable>(
1976 new PGRecovery(
1977 p.second->get_pgid(), p.first, reserved_pushes)),
1978 cct->_conf->osd_recovery_cost,
1979 cct->_conf->osd_recovery_priority,
1980 ceph_clock_now(),
1981 0,
1982 p.first));
1983}
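// Illustrative call (a sketch; 'queued_epoch' and 'pgref' are hypothetical
// locals): callers must already hold recovery_lock, per the assertion
// above, and pass along the pushes they reserved:
//
//   {
//     std::lock_guard l(recovery_lock);
//     _queue_for_recovery(std::make_pair(queued_epoch, pgref),
//                         /* reserved_pushes */ 1);
//   }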
1984
1985// ====================================================================
1986// OSD
1987
1988#undef dout_prefix
1989#define dout_prefix *_dout
1990
1991// Commands shared between OSD's console and admin console:
1992namespace ceph::osd_cmds {
1993
1994int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
1995
1996} // namespace ceph::osd_cmds
1997
1998int OSD::mkfs(CephContext *cct,
1999 std::unique_ptr<ObjectStore> store,
2000 uuid_d fsid,
2001 int whoami,
2002 string osdspec_affinity)
2003{
2004 int ret;
2005
2006 OSDSuperblock sb;
2007 bufferlist sbbl;
2008 // if we are fed a uuid for this osd, use it.
2009 store->set_fsid(cct->_conf->osd_uuid);
2010
2011 ret = store->mkfs();
2012 if (ret) {
2013 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
2014 << cpp_strerror(ret) << dendl;
2015 return ret;
2016 }
2017
2018 store->set_cache_shards(1); // doesn't matter for mkfs!
2019
2020 ret = store->mount();
2021 if (ret) {
2022 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
2023 << cpp_strerror(ret) << dendl;
2024 return ret;
2025 }
2026
2027 auto umount_store = make_scope_guard([&] {
2028 store->umount();
2029 });
2030
2031 ObjectStore::CollectionHandle ch =
2032 store->open_collection(coll_t::meta());
2033 if (ch) {
2034 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
2035 if (ret < 0) {
2036 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
2037 return ret;
2038 }
2039 /* if we already have a superblock, validate its contents */
2040 dout(0) << " have superblock" << dendl;
2041 auto p = sbbl.cbegin();
2042 decode(sb, p);
2043 if (whoami != sb.whoami) {
2044 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
2045 << dendl;
2046 return -EINVAL;
2047 }
2048 if (fsid != sb.cluster_fsid) {
2049 derr << "provided cluster fsid " << fsid
2050 << " != superblock's " << sb.cluster_fsid << dendl;
2051 return -EINVAL;
2052 }
2053 } else {
2054 // create superblock
2055 sb.cluster_fsid = fsid;
2056 sb.osd_fsid = store->get_fsid();
2057 sb.whoami = whoami;
2058 sb.compat_features = get_osd_initial_compat_set();
2059
2060 bufferlist bl;
2061 encode(sb, bl);
2062
2063 ObjectStore::CollectionHandle ch = store->create_new_collection(
2064 coll_t::meta());
2065 ObjectStore::Transaction t;
2066 t.create_collection(coll_t::meta(), 0);
2067 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
2068 ret = store->queue_transaction(ch, std::move(t));
2069 if (ret) {
2070 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
2071 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
2072 return ret;
2073 }
2074 ch->flush();
2075 }
2076
2077 ret = write_meta(cct, store.get(), sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
2078 if (ret) {
2079 derr << "OSD::mkfs: failed to write fsid file: error "
2080 << cpp_strerror(ret) << dendl;
2081 }
2082 return ret;
2083}
2084
2085int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
2086{
2087 char val[80];
2088 int r;
2089
2090 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
2091 r = store->write_meta("magic", val);
2092 if (r < 0)
2093 return r;
2094
2095 snprintf(val, sizeof(val), "%d", whoami);
2096 r = store->write_meta("whoami", val);
2097 if (r < 0)
2098 return r;
2099
2100 cluster_fsid.print(val);
2101 r = store->write_meta("ceph_fsid", val);
2102 if (r < 0)
2103 return r;
2104
2105 string key = cct->_conf.get_val<string>("key");
2106 if (key.size()) {
2107 r = store->write_meta("osd_key", key);
2108 if (r < 0)
2109 return r;
2110 } else {
2111 string keyfile = cct->_conf.get_val<string>("keyfile");
2112 if (!keyfile.empty()) {
2113 bufferlist keybl;
2114 string err;
2115 r = keybl.read_file(keyfile.c_str(), &err);
2116 if (r < 0) {
2117 derr << __func__ << " failed to read keyfile " << keyfile << ": "
2118 << err << ": " << cpp_strerror(r) << dendl;
2119 return r;
2120 }
2121 r = store->write_meta("osd_key", keybl.to_str());
2122 if (r < 0)
2123 return r;
2124 }
2125 }
2126 if (!osdspec_affinity.empty()) {
2127 r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
2128 if (r < 0)
2129 return r;
2130 }
2131
2132 r = store->write_meta("ready", "ready");
2133 if (r < 0)
2134 return r;
2135
2136 return 0;
2137}
2138
2139int OSD::peek_meta(ObjectStore *store,
2140 std::string *magic,
2141 uuid_d *cluster_fsid,
2142 uuid_d *osd_fsid,
2143 int *whoami,
2144 ceph_release_t *require_osd_release)
2145{
2146 string val;
2147
2148 int r = store->read_meta("magic", &val);
2149 if (r < 0)
2150 return r;
2151 *magic = val;
2152
2153 r = store->read_meta("whoami", &val);
2154 if (r < 0)
2155 return r;
2156 *whoami = atoi(val.c_str());
2157
2158 r = store->read_meta("ceph_fsid", &val);
2159 if (r < 0)
2160 return r;
2161 r = cluster_fsid->parse(val.c_str());
2162 if (!r)
2163 return -EINVAL;
2164
2165 r = store->read_meta("fsid", &val);
2166 if (r < 0) {
2167 *osd_fsid = uuid_d();
2168 } else {
2169 r = osd_fsid->parse(val.c_str());
2170 if (!r)
2171 return -EINVAL;
2172 }
2173
2174 r = store->read_meta("require_osd_release", &val);
2175 if (r >= 0) {
2176 *require_osd_release = ceph_release_from_name(val);
2177 }
2178
2179 return 0;
2180}
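// Illustrative sketch (not part of the build): reading back the keys
// written by write_meta() via peek_meta():
//
//   std::string magic;
//   uuid_d cluster_fsid, osd_fsid;
//   int id = -1;
//   ceph_release_t rel{};
//   if (OSD::peek_meta(store, &magic, &cluster_fsid, &osd_fsid,
//                      &id, &rel) == 0 &&
//       magic == CEPH_OSD_ONDISK_MAGIC) {
//     // the store was prepared by OSD::mkfs() for osd.<id>
//   }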
2181
2182
2183#undef dout_prefix
2184#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2185
2186// construction / destruction
2187
2188OSD::OSD(CephContext *cct_,
2189 std::unique_ptr<ObjectStore> store_,
2190 int id,
2191 Messenger *internal_messenger,
2192 Messenger *external_messenger,
2193 Messenger *hb_client_front,
2194 Messenger *hb_client_back,
2195 Messenger *hb_front_serverm,
2196 Messenger *hb_back_serverm,
2197 Messenger *osdc_messenger,
2198 MonClient *mc,
2199 const std::string &dev, const std::string &jdev,
2200 ceph::async::io_context_pool& poolctx) :
2201 Dispatcher(cct_),
2202 tick_timer(cct, osd_lock),
2203 tick_timer_without_osd_lock(cct, tick_timer_lock),
2204 gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
2205 cluster_messenger(internal_messenger),
2206 client_messenger(external_messenger),
2207 objecter_messenger(osdc_messenger),
2208 monc(mc),
2209 mgrc(cct_, client_messenger, &mc->monmap),
2210 logger(create_logger()),
2211 recoverystate_perf(create_recoverystate_perf()),
2212 store(std::move(store_)),
2213 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
2214 clog(log_client.create_channel()),
2215 whoami(id),
2216 dev_path(dev), journal_path(jdev),
2217 store_is_rotational(store->is_rotational()),
2218 trace_endpoint("0.0.0.0", 0, "osd"),
2219 asok_hook(NULL),
2220 m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
2221 "osd_pg_epoch_max_lag_factor")),
2222 osd_compat(get_osd_compat_set()),
2223 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
2224 get_num_op_threads()),
2225 heartbeat_stop(false),
2226 heartbeat_need_update(true),
2227 hb_front_client_messenger(hb_client_front),
2228 hb_back_client_messenger(hb_client_back),
2229 hb_front_server_messenger(hb_front_serverm),
2230 hb_back_server_messenger(hb_back_serverm),
2231 daily_loadavg(0.0),
2232 heartbeat_thread(this),
2233 heartbeat_dispatcher(this),
2234 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
2235 cct->_conf->osd_num_op_tracker_shard),
2236 test_ops_hook(NULL),
2237 op_shardedwq(
2238 this,
2239 ceph::make_timespan(cct->_conf->osd_op_thread_timeout),
2240 ceph::make_timespan(cct->_conf->osd_op_thread_suicide_timeout),
2241 &osd_op_tp),
2242 last_pg_create_epoch(0),
2243 boot_finisher(cct),
2244 up_thru_wanted(0),
2245 requested_full_first(0),
2246 requested_full_last(0),
2247 service(this, poolctx)
2248{
2249
2250 if (!gss_ktfile_client.empty()) {
2251 // Export the client keytab path to the environment; assert success.
2252 /*
2253 The default client keytab is used, if it is present and readable,
2254 to automatically obtain initial credentials for GSSAPI client
2255 applications. The principal name of the first entry in the client
2256 keytab is used by default when obtaining initial credentials.
2257 1. The KRB5_CLIENT_KTNAME environment variable.
2258 2. The default_client_keytab_name profile variable in [libdefaults].
2259 3. The hardcoded default, DEFCKTNAME.
2260 */
2261 const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
2262 gss_ktfile_client.c_str(), 1));
2263 ceph_assert(set_result == 0);
2264 }
2265
2266 monc->set_messenger(client_messenger);
2267 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
2268 cct->_conf->osd_op_log_threshold);
2269 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
2270 cct->_conf->osd_op_history_duration);
2271 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
2272 cct->_conf->osd_op_history_slow_op_threshold);
2273 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
2274#ifdef WITH_BLKIN
2275 std::stringstream ss;
2276 ss << "osd." << whoami;
2277 trace_endpoint.copy_name(ss.str());
2278#endif
2279
2280 // initialize shards
2281 num_shards = get_num_op_shards();
2282 for (uint32_t i = 0; i < num_shards; i++) {
2283 OSDShard *one_shard = new OSDShard(
2284 i,
2285 cct,
2286 this);
2287 shards.push_back(one_shard);
2288 }
2289}
2290
2291OSD::~OSD()
2292{
2293 while (!shards.empty()) {
2294 delete shards.back();
2295 shards.pop_back();
2296 }
2297 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2298 cct->get_perfcounters_collection()->remove(logger);
2299 delete recoverystate_perf;
2300 delete logger;
2301}
2302
2303double OSD::get_tick_interval() const
2304{
2305 // vary +/- 5% to avoid scrub scheduling livelocks
2306 constexpr auto delta = 0.05;
2307 return (OSD_TICK_INTERVAL *
2308 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2309}
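// e.g. with OSD_TICK_INTERVAL == 1.0 the interval is drawn uniformly from
// [0.95, 1.05]. Hypothetical use, mirroring how the tick is re-armed
// elsewhere in this file:
//
//   tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));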
2310
2311void OSD::handle_signal(int signum)
2312{
2313 ceph_assert(signum == SIGINT || signum == SIGTERM);
2314 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2315 shutdown();
2316}
2317
2318int OSD::pre_init()
2319{
2320 std::lock_guard lock(osd_lock);
2321 if (is_stopping())
2322 return 0;
2323
2324 if (store->test_mount_in_use()) {
2325 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2326 << "currently in use. (Is ceph-osd already running?)" << dendl;
2327 return -EBUSY;
2328 }
2329
2330 cct->_conf.add_observer(this);
2331 return 0;
2332}
2333
2334int OSD::set_numa_affinity()
2335{
2336 // storage numa node
2337 int store_node = -1;
2338 store->get_numa_node(&store_node, nullptr, nullptr);
2339 if (store_node >= 0) {
2340 dout(1) << __func__ << " storage numa node " << store_node << dendl;
2341 }
2342
2343 // check network numa node(s)
2344 int front_node = -1, back_node = -1;
2345 string front_iface = pick_iface(
2346 cct,
2347 client_messenger->get_myaddrs().front().get_sockaddr_storage());
2348 string back_iface = pick_iface(
2349 cct,
2350 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
2351 int r = get_iface_numa_node(front_iface, &front_node);
2352 if (r >= 0 && front_node >= 0) {
2353 dout(1) << __func__ << " public network " << front_iface << " numa node "
2354 << front_node << dendl;
2355 r = get_iface_numa_node(back_iface, &back_node);
2356 if (r >= 0 && back_node >= 0) {
2357 dout(1) << __func__ << " cluster network " << back_iface << " numa node "
2358 << back_node << dendl;
2359 if (front_node == back_node &&
2360 front_node == store_node) {
2361 dout(1) << " objectstore and network numa nodes all match" << dendl;
2362 if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
2363 numa_node = front_node;
2364 }
2365 } else if (front_node != back_node) {
2366 dout(1) << __func__ << " public and cluster network numa nodes do not match"
2367 << dendl;
2368 } else {
2369 dout(1) << __func__ << " objectstore and network numa nodes do not match"
2370 << dendl;
2371 }
2372 } else if (back_node == -2) {
2373 dout(1) << __func__ << " cluster network " << back_iface
2374 << " ports numa nodes do not match" << dendl;
2375 } else {
2376 derr << __func__ << " unable to identify cluster interface '" << back_iface
2377 << "' numa node: " << cpp_strerror(r) << dendl;
2378 }
2379 } else if (front_node == -2) {
2380 dout(1) << __func__ << " public network " << front_iface
2381 << " ports numa nodes do not match" << dendl;
2382 } else {
2383 derr << __func__ << " unable to identify public interface '" << front_iface
2384 << "' numa node: " << cpp_strerror(r) << dendl;
2385 }
2386 if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
2387 // this takes precedence over the automagic logic above
2388 numa_node = node;
2389 }
2390 if (numa_node >= 0) {
2391 int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
2392 if (r < 0) {
2393 dout(1) << __func__ << " unable to determine numa node " << numa_node
2394 << " CPUs" << dendl;
2395 numa_node = -1;
2396 } else {
2397 dout(1) << __func__ << " setting numa affinity to node " << numa_node
2398 << " cpus "
2399 << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
2400 << dendl;
2401 r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
2402 if (r < 0) {
2403 r = -errno;
2404 derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
2405 << dendl;
2406 numa_node = -1;
2407 }
2408 }
2409 } else {
2410 dout(1) << __func__ << " not setting numa affinity" << dendl;
2411 }
2412 return 0;
2413}
2414
2415// asok
2416
2417class OSDSocketHook : public AdminSocketHook {
2418 OSD *osd;
2419public:
2420 explicit OSDSocketHook(OSD *o) : osd(o) {}
2421 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2422 Formatter *f,
2423 std::ostream& ss,
2424 bufferlist& out) override {
2425 ceph_abort("should use async hook");
2426 }
2427 void call_async(
2428 std::string_view prefix,
2429 const cmdmap_t& cmdmap,
2430 Formatter *f,
2431 const bufferlist& inbl,
2432 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
2433 try {
2434 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2435 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2436 bufferlist empty;
2437 on_finish(-EINVAL, e.what(), empty);
2438 }
2439 }
2440};
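// Illustrative registration (a sketch; the real wiring happens later in
// this file, and the AdminSocket::register_command() signature is assumed
// from its use there):
//
//   AdminSocket* admin_socket = cct->get_admin_socket();
//   asok_hook = new OSDSocketHook(this);
//   int r = admin_socket->register_command(
//     "status", asok_hook, "high-level status of OSD");
//   ceph_assert(r == 0);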
2441
2442std::set<int64_t> OSD::get_mapped_pools()
2443{
2444 std::set<int64_t> pools;
2445 std::vector<spg_t> pgids;
2446 _get_pgids(&pgids);
2447 for (const auto &pgid : pgids) {
2448 pools.insert(pgid.pool());
2449 }
2450 return pools;
2451}
2452
2453OSD::PGRefOrError OSD::locate_asok_target(const cmdmap_t& cmdmap,
2454 stringstream& ss,
2455 bool only_primary)
2456{
2457 string pgidstr;
2458 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2459 ss << "no pgid specified";
2460 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2461 }
2462
2463 pg_t pgid;
2464 if (!pgid.parse(pgidstr.c_str())) {
2465 ss << "couldn't parse pgid '" << pgidstr << "'";
2466 return OSD::PGRefOrError{std::nullopt, -EINVAL};
2467 }
2468
2469 spg_t pcand;
2470 PGRef pg;
2471 if (get_osdmap()->get_primary_shard(pgid, &pcand) && (pg = _lookup_lock_pg(pcand))) {
2472 if (pg->is_primary() || !only_primary) {
2473 return OSD::PGRefOrError{pg, 0};
2474 }
2475
2476 ss << "not primary for pgid " << pgid;
2477 pg->unlock();
2478 return OSD::PGRefOrError{std::nullopt, -EAGAIN};
2479 } else {
2480 ss << "i don't have pgid " << pgid;
2481 return OSD::PGRefOrError{std::nullopt, -ENOENT};
2482 }
2483}
2484
2485// note that the cmdmap is explicitly copied into asok_route_to_pg()
2486int OSD::asok_route_to_pg(
2487 bool only_primary,
2488 std::string_view prefix,
2489 cmdmap_t cmdmap,
2490 Formatter* f,
2491 stringstream& ss,
2492 const bufferlist& inbl,
2493 bufferlist& outbl,
2494 std::function<void(int, const std::string&, bufferlist&)> on_finish)
2495{
2496 auto [target_pg, ret] = locate_asok_target(cmdmap, ss, only_primary);
2497
2498 if (!target_pg.has_value()) {
2499 // 'ss' and 'ret' already contain the error information
2500 on_finish(ret, ss.str(), outbl);
2501 return ret;
2502 }
2503
2504 // the PG was locked by locate_asok_target()
2505 try {
2506 (*target_pg)->do_command(prefix, cmdmap, inbl, on_finish);
2507 (*target_pg)->unlock();
2508 return 0; // the pg handler calls on_finish directly
2509 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2510 (*target_pg)->unlock();
2511 ss << e.what();
2512 on_finish(-EINVAL, ss.str(), outbl);
2513 return -EINVAL;
2514 }
2515}
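// Illustrative call (mirrors the "scrubdebug" dispatch below): route an
// asok command to the PG named in the cmdmap, answering even when this OSD
// is not the primary for that PG:
//
//   asok_route_to_pg(false /* only_primary */, "scrubdebug", cmdmap,
//                    f, ss, inbl, outbl, on_finish);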
2516
2517void OSD::asok_command(
2518 std::string_view prefix, const cmdmap_t& cmdmap,
2519 Formatter *f,
2520 const bufferlist& inbl,
2521 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2522{
2523 int ret = 0;
2524 stringstream ss; // stderr error message stream
2525 bufferlist outbl; // if empty at end, we'll dump formatter as output
2526
2527 // --- PG commands are routed here to PG::do_command ---
2528 if (prefix == "pg" ||
2529 prefix == "query" ||
2530 prefix == "mark_unfound_lost" ||
2531 prefix == "list_unfound" ||
2532 prefix == "scrub" ||
2533 prefix == "deep_scrub"
2534 ) {
2535 string pgidstr;
2536 pg_t pgid;
2537 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2538 ss << "no pgid specified";
2539 ret = -EINVAL;
2540 goto out;
2541 }
2542 if (!pgid.parse(pgidstr.c_str())) {
2543 ss << "couldn't parse pgid '" << pgidstr << "'";
2544 ret = -EINVAL;
2545 goto out;
2546 }
2547 spg_t pcand;
2548 PGRef pg;
2549 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2550 (pg = _lookup_lock_pg(pcand))) {
2551 if (pg->is_primary()) {
2552 cmdmap_t new_cmdmap = cmdmap;
2553 try {
2554 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2555 pg->unlock();
2556 return; // the pg handler calls on_finish directly
2557 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2558 pg->unlock();
2559 ss << e.what();
2560 ret = -EINVAL;
2561 goto out;
2562 }
2563 } else {
2564 ss << "not primary for pgid " << pgid;
2565 // do not reply; they will get newer maps and realize they
2566 // need to resend.
2567 pg->unlock();
2568 ret = -EAGAIN;
2569 goto out;
2570 }
2571 } else {
2572 ss << "i don't have pgid " << pgid;
2573 ret = -ENOENT;
2574 }
2575 }
2576
2577 // --- PG commands that will be answered even if !primary ---
2578
2579 else if (prefix == "scrubdebug") {
2580 asok_route_to_pg(false, prefix, cmdmap, f, ss, inbl, outbl, on_finish);
2581 return;
2582 }
2583
2584 // --- OSD commands follow ---
2585
2586 else if (prefix == "status") {
2587 lock_guard l(osd_lock);
2588 f->open_object_section("status");
2589 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2590 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2591 f->dump_unsigned("whoami", superblock.whoami);
2592 f->dump_string("state", get_state_name(get_state()));
2593 f->dump_unsigned("oldest_map", superblock.oldest_map);
2594 f->dump_unsigned("newest_map", superblock.newest_map);
2595 f->dump_unsigned("num_pgs", num_pgs);
2596 f->close_section();
2597 } else if (prefix == "flush_journal") {
2598 store->flush_journal();
2599 } else if (prefix == "dump_ops_in_flight" ||
2600 prefix == "ops" ||
2601 prefix == "dump_blocked_ops" ||
2602 prefix == "dump_historic_ops" ||
2603 prefix == "dump_historic_ops_by_duration" ||
2604 prefix == "dump_historic_slow_ops") {
2605
2606 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2607not even those that are stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2608will start to track new ops received afterwards.";
2609
2610 set<string> filters;
2611 vector<string> filter_str;
2612 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2613 copy(filter_str.begin(), filter_str.end(),
2614 inserter(filters, filters.end()));
2615 }
2616
2617 if (prefix == "dump_ops_in_flight" ||
2618 prefix == "ops") {
2619 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2620 ss << error_str;
2621 ret = -EINVAL;
2622 goto out;
2623 }
2624 }
2625 if (prefix == "dump_blocked_ops") {
2626 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2627 ss << error_str;
2628 ret = -EINVAL;
2629 goto out;
2630 }
2631 }
2632 if (prefix == "dump_historic_ops") {
2633 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2634 ss << error_str;
2635 ret = -EINVAL;
2636 goto out;
2637 }
2638 }
2639 if (prefix == "dump_historic_ops_by_duration") {
2640 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2641 ss << error_str;
2642 ret = -EINVAL;
2643 goto out;
2644 }
2645 }
2646 if (prefix == "dump_historic_slow_ops") {
2647 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2648 ss << error_str;
2649 ret = -EINVAL;
2650 goto out;
2651 }
2652 }
2653 } else if (prefix == "dump_op_pq_state") {
2654 f->open_object_section("pq");
2655 op_shardedwq.dump(f);
2656 f->close_section();
2657 } else if (prefix == "dump_blocklist") {
2658 list<pair<entity_addr_t,utime_t> > bl;
2659 OSDMapRef curmap = service.get_osdmap();
2660
2661 f->open_array_section("blocklist");
2662 curmap->get_blocklist(&bl);
2663 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2664 it != bl.end(); ++it) {
2665 f->open_object_section("entry");
2666 f->open_object_section("entity_addr_t");
2667 it->first.dump(f);
2668 f->close_section(); //entity_addr_t
2669 it->second.localtime(f->dump_stream("expire_time"));
2670 f->close_section(); //entry
2671 }
2672 f->close_section(); //blocklist
2673 } else if (prefix == "dump_watchers") {
2674 list<obj_watch_item_t> watchers;
2675 // scan pg's
2676 vector<PGRef> pgs;
2677 _get_pgs(&pgs);
2678 for (auto& pg : pgs) {
2679 list<obj_watch_item_t> pg_watchers;
2680 pg->get_watchers(&pg_watchers);
2681 watchers.splice(watchers.end(), pg_watchers);
2682 }
2683
2684 f->open_array_section("watchers");
2685 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2686 it != watchers.end(); ++it) {
2687
2688 f->open_object_section("watch");
2689
2690 f->dump_string("namespace", it->obj.nspace);
2691 f->dump_string("object", it->obj.oid.name);
2692
2693 f->open_object_section("entity_name");
2694 it->wi.name.dump(f);
2695 f->close_section(); //entity_name_t
2696
2697 f->dump_unsigned("cookie", it->wi.cookie);
2698 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2699
2700 f->open_object_section("entity_addr_t");
2701 it->wi.addr.dump(f);
2702 f->close_section(); //entity_addr_t
2703
2704 f->close_section(); //watch
2705 }
2706
2707 f->close_section(); //watchers
2708 } else if (prefix == "dump_recovery_reservations") {
2709 f->open_object_section("reservations");
2710 f->open_object_section("local_reservations");
2711 service.local_reserver.dump(f);
2712 f->close_section();
2713 f->open_object_section("remote_reservations");
2714 service.remote_reserver.dump(f);
2715 f->close_section();
2716 f->close_section();
2717 } else if (prefix == "dump_scrub_reservations") {
2718 f->open_object_section("scrub_reservations");
2719 service.get_scrub_services().dump_scrub_reservations(f);
2720 f->close_section();
2721 } else if (prefix == "get_latest_osdmap") {
2722 get_latest_osdmap();
2723 } else if (prefix == "set_heap_property") {
2724 string property;
2725 int64_t value = 0;
2726 string error;
2727 bool success = false;
2728 if (!cmd_getval(cmdmap, "property", property)) {
2729 error = "unable to get property";
2730 success = false;
2731 } else if (!cmd_getval(cmdmap, "value", value)) {
2732 error = "unable to get value";
2733 success = false;
2734 } else if (value < 0) {
2735 error = "negative value not allowed";
2736 success = false;
2737 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2738 error = "invalid property";
2739 success = false;
2740 } else {
2741 success = true;
2742 }
2743 f->open_object_section("result");
2744 f->dump_string("error", error);
2745 f->dump_bool("success", success);
2746 f->close_section();
2747 } else if (prefix == "get_heap_property") {
2748 string property;
2749 size_t value = 0;
2750 string error;
2751 bool success = false;
2752 if (!cmd_getval(cmdmap, "property", property)) {
2753 error = "unable to get property";
2754 success = false;
2755 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2756 error = "invalid property";
2757 success = false;
2758 } else {
2759 success = true;
2760 }
2761 f->open_object_section("result");
2762 f->dump_string("error", error);
2763 f->dump_bool("success", success);
2764 f->dump_int("value", value);
2765 f->close_section();
2766 } else if (prefix == "dump_objectstore_kv_stats") {
2767 store->get_db_statistics(f);
2768 } else if (prefix == "dump_scrubs") {
2769 service.get_scrub_services().dump_scrubs(f);
2770 } else if (prefix == "calc_objectstore_db_histogram") {
2771 store->generate_db_histogram(f);
2772 } else if (prefix == "flush_store_cache") {
2773 store->flush_cache(&ss);
2774 } else if (prefix == "dump_pgstate_history") {
2775 f->open_object_section("pgstate_history");
2776 f->open_array_section("pgs");
2777 vector<PGRef> pgs;
2778 _get_pgs(&pgs);
2779 for (auto& pg : pgs) {
2780 f->open_object_section("pg");
2781 f->dump_stream("pg") << pg->pg_id;
2782 f->dump_string("currently", pg->get_current_state());
2783 pg->dump_pgstate_history(f);
2784 f->close_section();
2785 }
2786 f->close_section();
2787 f->close_section();
2788 } else if (prefix == "compact") {
2789 dout(1) << "triggering manual compaction" << dendl;
2790 auto start = ceph::coarse_mono_clock::now();
2791 store->compact();
2792 auto end = ceph::coarse_mono_clock::now();
2793 double duration = std::chrono::duration<double>(end-start).count();
2794 dout(1) << "finished manual compaction in "
2795 << duration
2796 << " seconds" << dendl;
2797 f->open_object_section("compact_result");
2798 f->dump_float("elapsed_time", duration);
2799 f->close_section();
2800 } else if (prefix == "get_mapped_pools") {
2801 f->open_array_section("mapped_pools");
2802 set<int64_t> poollist = get_mapped_pools();
2803 for (auto pool : poollist) {
2804 f->dump_int("pool_id", pool);
2805 }
2806 f->close_section();
2807 } else if (prefix == "smart") {
2808 string devid;
2809 cmd_getval(cmdmap, "devid", devid);
2810 ostringstream out;
2811 probe_smart(devid, out);
2812 outbl.append(out.str());
2813 } else if (prefix == "list_devices") {
2814 set<string> devnames;
2815 store->get_devices(&devnames);
2816 f->open_array_section("list_devices");
2817 for (auto dev : devnames) {
2818 if (dev.find("dm-") == 0) {
2819 continue;
2820 }
2821 string err;
2822 f->open_object_section("device");
2823 f->dump_string("device", "/dev/" + dev);
2824 f->dump_string("device_id", get_device_id(dev, &err));
2825 f->close_section();
2826 }
2827 f->close_section();
2828 } else if (prefix == "send_beacon") {
2829 lock_guard l(osd_lock);
2830 if (is_active()) {
2831 send_beacon(ceph::coarse_mono_clock::now());
2832 }
2833 }
2834
2835 else if (prefix == "cluster_log") {
2836 vector<string> msg;
2837 cmd_getval(cmdmap, "message", msg);
2838 if (msg.empty()) {
2839 ret = -EINVAL;
2840 ss << "ignoring empty log message";
2841 goto out;
2842 }
2843 string message = msg.front();
2844 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2845 message += " " + *a;
2846 string lvl;
2847 cmd_getval(cmdmap, "level", lvl);
2848 clog_type level = string_to_clog_type(lvl);
2849 if (level < 0) {
2850 ret = -EINVAL;
2851 ss << "unknown level '" << lvl << "'";
2852 goto out;
2853 }
2854 clog->do_log(level, message);
2855 }
2856
2857 else if (prefix == "bench") {
2858 // default count 1G, size 4MB
2859 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", 1LL << 30);
2860 int64_t bsize = cmd_getval_or<int64_t>(cmdmap, "size", 4LL << 20);
2861 int64_t osize = cmd_getval_or<int64_t>(cmdmap, "object_size", 0);
2862 int64_t onum = cmd_getval_or<int64_t>(cmdmap, "object_num", 0);
2863 double elapsed = 0.0;
2864
2865 ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
2866 if (ret != 0) {
2867 goto out;
2868 }
2869
2870 double rate = count / elapsed;
2871 double iops = rate / bsize;
2872 f->open_object_section("osd_bench_results");
2873 f->dump_int("bytes_written", count);
2874 f->dump_int("blocksize", bsize);
2875 f->dump_float("elapsed_sec", elapsed);
2876 f->dump_float("bytes_per_sec", rate);
2877 f->dump_float("iops", iops);
2878 f->close_section();
2879 }
2880
2881 else if (prefix == "flush_pg_stats") {
2882 mgrc.send_pgstats();
2883 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2884 }
2885
2886 else if (prefix == "heap") {
2887 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2888 }
2889
2890 else if (prefix == "debug dump_missing") {
2891 f->open_array_section("pgs");
2892 vector<PGRef> pgs;
2893 _get_pgs(&pgs);
2894 for (auto& pg : pgs) {
2895 string s = stringify(pg->pg_id);
2896 f->open_array_section(s.c_str());
2897 pg->lock();
2898 pg->dump_missing(f);
2899 pg->unlock();
2900 f->close_section();
2901 }
2902 f->close_section();
2903 }
2904
2905 else if (prefix == "debug kick_recovery_wq") {
2906 int64_t delay;
2907 cmd_getval(cmdmap, "delay", delay);
2908 ostringstream oss;
2909 oss << delay;
2910 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2911 if (ret != 0) {
2912 ss << "kick_recovery_wq: error setting "
2913 << "osd_recovery_delay_start to '" << delay << "': error "
2914 << ret;
2915 goto out;
2916 }
2917 cct->_conf.apply_changes(nullptr);
2918 ss << "kicking recovery queue. set osd_recovery_delay_start "
2919 << "to " << cct->_conf->osd_recovery_delay_start;
2920 }
2921
2922 else if (prefix == "cpu_profiler") {
2923 ostringstream ds;
2924 string arg;
2925 cmd_getval(cmdmap, "arg", arg);
2926 vector<string> argvec;
2927 get_str_vec(arg, argvec);
2928 cpu_profiler_handle_command(argvec, ds);
2929 outbl.append(ds.str());
2930 }
2931
2932 else if (prefix == "dump_pg_recovery_stats") {
2933 lock_guard l(osd_lock);
2934 pg_recovery_stats.dump_formatted(f);
2935 }
2936
2937 else if (prefix == "reset_pg_recovery_stats") {
2938 lock_guard l(osd_lock);
2939 pg_recovery_stats.reset();
2940 }
2941
2942 else if (prefix == "perf histogram dump") {
2943 std::string logger;
2944 std::string counter;
2945 cmd_getval(cmdmap, "logger", logger);
2946 cmd_getval(cmdmap, "counter", counter);
2947 cct->get_perfcounters_collection()->dump_formatted_histograms(
2948 f, false, logger, counter);
2949 }
2950
2951 else if (prefix == "cache drop") {
2952 lock_guard l(osd_lock);
2953 dout(20) << "clearing all caches" << dendl;
2954 // Clear the objectstore's cache - onode and buffer for Bluestore,
2955 // system's pagecache for Filestore
2956 ret = store->flush_cache(&ss);
2957 if (ret < 0) {
2958 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2959 goto out;
2960 }
2961 // Clear the objectcontext cache (per PG)
2962 vector<PGRef> pgs;
2963 _get_pgs(&pgs);
2964 for (auto& pg: pgs) {
2965 pg->clear_cache();
2966 }
2967 }
2968
2969 else if (prefix == "cache status") {
2970 lock_guard l(osd_lock);
2971 int obj_ctx_count = 0;
2972 vector<PGRef> pgs;
2973 _get_pgs(&pgs);
2974 for (auto& pg: pgs) {
2975 obj_ctx_count += pg->get_cache_obj_count();
2976 }
2977 f->open_object_section("cache_status");
2978 f->dump_int("object_ctx", obj_ctx_count);
2979 store->dump_cache_stats(f);
2980 f->close_section();
2981 }
2982
2983 else if (prefix == "scrub_purged_snaps") {
2984 lock_guard l(osd_lock);
2985 scrub_purged_snaps();
2986 }
2987
2988 else if (prefix == "dump_osd_network") {
2989 lock_guard l(osd_lock);
2990 int64_t value = 0;
2991 if (!(cmd_getval(cmdmap, "value", value))) {
2992 // Convert milliseconds to microseconds
2993 value = static_cast<double>(g_conf().get_val<double>(
2994 "mon_warn_on_slow_ping_time")) * 1000;
2995 if (value == 0) {
2996 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2997 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2998 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2999 }
3000 } else {
3001 // Convert user input to microseconds
3002 value *= 1000;
3003 }
3004 if (value < 0) value = 0;
3005
3006 struct osd_ping_time_t {
3007 uint32_t pingtime;
3008 int to;
3009 bool back;
3010 std::array<uint32_t,3> times;
3011 std::array<uint32_t,3> min;
3012 std::array<uint32_t,3> max;
3013 uint32_t last;
3014 uint32_t last_update;
3015
3016 bool operator<(const osd_ping_time_t& rhs) const {
3017 if (pingtime < rhs.pingtime)
3018 return true;
3019 if (pingtime > rhs.pingtime)
3020 return false;
3021 if (to < rhs.to)
3022 return true;
3023 if (to > rhs.to)
3024 return false;
3025 return back;
3026 }
3027 };
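    // e.g. an entry {pingtime: 1500, to: 7} orders after one with
    // {pingtime: 1200, to: 3}; the reversed iteration further below
    // therefore reports the slowest peers first.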
3028
3029 set<osd_ping_time_t> sorted;
3030 // Get pingtimes under lock and not on the stack
3031 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3032 service.get_hb_pingtime(pingtimes);
3033 for (auto j : *pingtimes) {
3034 if (j.second.last_update == 0)
3035 continue;
3036 osd_ping_time_t item;
3037 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3038 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3039 if (item.pingtime >= value) {
3040 item.to = j.first;
3041 item.times[0] = j.second.back_pingtime[0];
3042 item.times[1] = j.second.back_pingtime[1];
3043 item.times[2] = j.second.back_pingtime[2];
3044 item.min[0] = j.second.back_min[0];
3045 item.min[1] = j.second.back_min[1];
3046 item.min[2] = j.second.back_min[2];
3047 item.max[0] = j.second.back_max[0];
3048 item.max[1] = j.second.back_max[1];
3049 item.max[2] = j.second.back_max[2];
3050 item.last = j.second.back_last;
3051 item.back = true;
3052 item.last_update = j.second.last_update;
3053 sorted.emplace(item);
3054 }
3055 if (j.second.front_last == 0)
3056 continue;
3057 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3058 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3059 if (item.pingtime >= value) {
3060 item.to = j.first;
3061 item.times[0] = j.second.front_pingtime[0];
3062 item.times[1] = j.second.front_pingtime[1];
3063 item.times[2] = j.second.front_pingtime[2];
3064 item.min[0] = j.second.front_min[0];
3065 item.min[1] = j.second.front_min[1];
3066 item.min[2] = j.second.front_min[2];
3067 item.max[0] = j.second.front_max[0];
3068 item.max[1] = j.second.front_max[1];
3069 item.max[2] = j.second.front_max[2];
3070 item.last = j.second.front_last;
3071 item.last_update = j.second.last_update;
3072 item.back = false;
3073 sorted.emplace(item);
3074 }
3075 }
3076 delete pingtimes;
3078 // Network ping times (1min 5min 15min)
3079 f->open_object_section("network_ping_times");
3080 f->dump_int("threshold", value / 1000);
3081 f->open_array_section("entries");
3082 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3083 ceph_assert(sitem.pingtime >= value);
3084 f->open_object_section("entry");
3085
3086 const time_t lu(sitem.last_update);
3087 char buffer[26];
3088 string lustr(ctime_r(&lu, buffer));
3089 lustr.pop_back(); // Remove trailing \n
3090 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3091 f->dump_string("last update", lustr);
3092 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3093 f->dump_int("from osd", whoami);
3094 f->dump_int("to osd", sitem.to);
3095 f->dump_string("interface", (sitem.back ? "back" : "front"));
3096 f->open_object_section("average");
3097 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3098 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3099 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3100 f->close_section(); // average
3101 f->open_object_section("min");
3102 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.min[0],3).c_str());
3103 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.min[1],3).c_str());
3104 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.min[2],3).c_str());
3105 f->close_section(); // min
3106 f->open_object_section("max");
3107 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3108 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3109 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3110 f->close_section(); // max
3111 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3112 f->close_section(); // entry
3113 }
3114 f->close_section(); // entries
3115 f->close_section(); // network_ping_times
3116 } else if (prefix == "dump_pool_statfs") {
3117 lock_guard l(osd_lock);
3118
3119 int64_t p = 0;
3120 if (!(cmd_getval(cmdmap, "poolid", p))) {
3121 ss << "Error dumping pool statfs: no poolid provided";
3122 ret = -EINVAL;
3123 goto out;
3124 }
3125
3126 store_statfs_t st;
3127 bool per_pool_omap_stats = false;
3128
3129 ret = store->pool_statfs(p, &st, &per_pool_omap_stats);
3130 if (ret < 0) {
3131 ss << "Error dumping pool statfs: " << cpp_strerror(ret);
3132 goto out;
3133 } else {
3134 ss << "dumping pool statfs...";
3135 f->open_object_section("pool_statfs");
3136 f->dump_int("poolid", p);
3137 st.dump(f);
3138 f->close_section();
3139 }
3140 } else {
3141 ceph_abort_msg("broken asok registration");
3142 }
3143
3144 out:
3145 on_finish(ret, ss.str(), outbl);
3146}
3147
3148int OSD::run_osd_bench_test(
3149 int64_t count,
3150 int64_t bsize,
3151 int64_t osize,
3152 int64_t onum,
3153 double *elapsed,
3154 ostream &ss)
3155{
3156 int ret = 0;
3157 uint32_t duration = cct->_conf->osd_bench_duration;
3158
3159 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
3160 // let us limit the block size because the next checks rely on it
3161 // having a sane value. If we allow any block size to be set things
3162 // can still go sideways.
3163 ss << "block 'size' values are capped at "
3164 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
3165 << " a higher value, please adjust 'osd_bench_max_block_size'";
3166 ret = -EINVAL;
3167 return ret;
3168 } else if (bsize < (int64_t) (1 << 20)) {
3169 // entering the realm of small block sizes.
3170 // limit the count to a sane value, assuming a configurable amount of
3171 // IOPS and duration, so that the OSD doesn't get hung up on this,
3172 // preventing timeouts from going off
3173 int64_t max_count =
3174 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
3175 if (count > max_count) {
3176 ss << "'count' values greater than " << max_count
3177 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3178 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
3179 << " for " << duration << " seconds,"
3180 << " can cause ill effects on osd. "
3181 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
3182 << " value if you wish to use a higher 'count'.";
3183 ret = -EINVAL;
3184 return ret;
3185 }
3186 } else {
3187 // 1MB block sizes are big enough so that we get more stuff done.
3188 // However, to avoid the osd from getting hung on this and having
3189 // timers being triggered, we are going to limit the count assuming
3190 // a configurable throughput and duration.
3191 // NOTE: max_count is the total amount of bytes that we believe we
3192 // will be able to write during 'duration' for the given
3193 // throughput. The block size hardly impacts this unless it's
3194 // way too big. Given we already check how big the block size
3195 // is, it's safe to assume everything will check out.
3196 int64_t max_count =
3197 cct->_conf->osd_bench_large_size_max_throughput * duration;
3198 if (count > max_count) {
3199 ss << "'count' values greater than " << max_count
3200 << " for a block size of " << byte_u_t(bsize) << ", assuming "
3201 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
3202 << " for " << duration << " seconds,"
3203 << " can cause ill effects on osd. "
3204 << " Please adjust 'osd_bench_large_size_max_throughput'"
3205 << " with a higher value if you wish to use a higher 'count'.";
3206 ret = -EINVAL;
3207 return ret;
3208 }
3209 }
3210
3211 if (osize && bsize > osize) {
3212 bsize = osize;
3213 }
3214
3215 dout(1) << " bench count " << count
3216 << " bsize " << byte_u_t(bsize) << dendl;
3217
3218 ObjectStore::Transaction cleanupt;
3219
3220 if (osize && onum) {
3221 bufferlist bl;
3222 bufferptr bp(osize);
3223 memset(bp.c_str(), 'a', bp.length());
3224 bl.push_back(std::move(bp));
3225 bl.rebuild_page_aligned();
3226 for (int i=0; i<onum; ++i) {
3227 char nm[30];
3228 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
3229 object_t oid(nm);
3230 hobject_t soid(sobject_t(oid, 0));
3231 ObjectStore::Transaction t;
3232 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
3233 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3234 cleanupt.remove(coll_t(), ghobject_t(soid));
3235 }
3236 }
3237
3238 bufferlist bl;
3239 bufferptr bp(bsize);
3240 memset(bp.c_str(), 'a', bp.length());
3241 bl.push_back(std::move(bp));
3242 bl.rebuild_page_aligned();
3243
3244 {
3245 C_SaferCond waiter;
3246 if (!service.meta_ch->flush_commit(&waiter)) {
3247 waiter.wait();
3248 }
3249 }
3250
3251 utime_t start = ceph_clock_now();
3252 for (int64_t pos = 0; pos < count; pos += bsize) {
3253 char nm[30];
3254 unsigned offset = 0;
3255 if (onum && osize) {
3256 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
3257 offset = rand() % (osize / bsize) * bsize;
3258 } else {
3259 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
3260 }
3261 object_t oid(nm);
3262 hobject_t soid(sobject_t(oid, 0));
3263 ObjectStore::Transaction t;
3264 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
3265 store->queue_transaction(service.meta_ch, std::move(t), nullptr);
3266 if (!onum || !osize) {
3267 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
3268 }
3269 }
3270
3271 {
3272 C_SaferCond waiter;
3273 if (!service.meta_ch->flush_commit(&waiter)) {
3274 waiter.wait();
3275 }
3276 }
3277 utime_t end = ceph_clock_now();
3278 *elapsed = end - start;
3279
3280 // clean up
3281 store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
3282 {
3283 C_SaferCond waiter;
3284 if (!service.meta_ch->flush_commit(&waiter)) {
3285 waiter.wait();
3286 }
3287 }
3288
3289 return ret;
3290}
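// Illustrative invocation (a sketch; the values mirror the asok defaults
// of 1 GiB total written in 4 MiB blocks):
//
//   double elapsed = 0.0;
//   stringstream ss;
//   if (run_osd_bench_test(1LL << 30, 4LL << 20, 0, 0, &elapsed, ss) == 0) {
//     double bytes_per_sec = (1LL << 30) / elapsed;
//     double iops = bytes_per_sec / (4LL << 20);
//   }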
3291
3292class TestOpsSocketHook : public AdminSocketHook {
3293 OSDService *service;
3294 ObjectStore *store;
3295public:
3296 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3297 int call(std::string_view command, const cmdmap_t& cmdmap,
3298 Formatter *f,
3299 std::ostream& errss,
3300 bufferlist& out) override {
3301 int r = 0;
3302 stringstream outss;
3303 try {
3304 test_ops(service, store, command, cmdmap, outss);
3305 out.append(outss);
3306 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3307 errss << e.what();
3308 r = -EINVAL;
3309 }
3310 return r;
3311 }
3312 void test_ops(OSDService *service, ObjectStore *store,
3313 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3314
3315};
3316
3317class OSD::C_Tick : public Context {
3318 OSD *osd;
3319 public:
3320 explicit C_Tick(OSD *o) : osd(o) {}
3321 void finish(int r) override {
3322 osd->tick();
3323 }
3324};
3325
3326class OSD::C_Tick_WithoutOSDLock : public Context {
3327 OSD *osd;
3328 public:
3329 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3330 void finish(int r) override {
3331 osd->tick_without_osd_lock();
3332 }
3333};
3334
3335int OSD::enable_disable_fuse(bool stop)
3336{
3337#ifdef HAVE_LIBFUSE
3338 int r;
3339 string mntpath = cct->_conf->osd_data + "/fuse";
3340 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3341 dout(1) << __func__ << " disabling" << dendl;
3342 fuse_store->stop();
3343 delete fuse_store;
3344 fuse_store = NULL;
3345 r = ::rmdir(mntpath.c_str());
3346 if (r < 0) {
3347 r = -errno;
3348 derr << __func__ << " failed to rmdir " << mntpath << ": "
3349 << cpp_strerror(r) << dendl;
3350 return r;
3351 }
3352 return 0;
3353 }
3354 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3355 dout(1) << __func__ << " enabling" << dendl;
3356 r = ::mkdir(mntpath.c_str(), 0700);
3357 if (r < 0)
3358 r = -errno;
3359 if (r < 0 && r != -EEXIST) {
3360 derr << __func__ << " unable to create " << mntpath << ": "
3361 << cpp_strerror(r) << dendl;
3362 return r;
3363 }
3364 fuse_store = new FuseStore(store.get(), mntpath);
3365 r = fuse_store->start();
3366 if (r < 0) {
3367 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3368 delete fuse_store;
3369 fuse_store = NULL;
3370 return r;
3371 }
3372 }
3373#endif // HAVE_LIBFUSE
3374 return 0;
3375}
3376
3377size_t OSD::get_num_cache_shards()
3378{
3379 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3380}
3381
3382int OSD::get_num_op_shards()
3383{
3384 if (cct->_conf->osd_op_num_shards)
3385 return cct->_conf->osd_op_num_shards;
3386 if (store_is_rotational)
3387 return cct->_conf->osd_op_num_shards_hdd;
3388 else
3389 return cct->_conf->osd_op_num_shards_ssd;
3390}
3391
3392int OSD::get_num_op_threads()
3393{
3394 if (cct->_conf->osd_op_num_threads_per_shard)
3395 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3396 if (store_is_rotational)
3397 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3398 else
3399 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3400}
3401
3402float OSD::get_osd_recovery_sleep()
3403{
3404 if (cct->_conf->osd_recovery_sleep)
3405 return cct->_conf->osd_recovery_sleep;
3406 if (!store_is_rotational && !journal_is_rotational)
3407 return cct->_conf->osd_recovery_sleep_ssd;
3408 else if (store_is_rotational && !journal_is_rotational)
3409 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3410 else
3411 return cct->_conf->osd_recovery_sleep_hdd;
3412}
3413
3414float OSD::get_osd_delete_sleep()
3415{
3416 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3417 if (osd_delete_sleep > 0)
3418 return osd_delete_sleep;
3419 if (!store_is_rotational && !journal_is_rotational)
3420 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3421 if (store_is_rotational && !journal_is_rotational)
3422 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3423 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3424}
3425
3426int OSD::get_recovery_max_active()
3427{
3428 if (cct->_conf->osd_recovery_max_active)
3429 return cct->_conf->osd_recovery_max_active;
3430 if (store_is_rotational)
3431 return cct->_conf->osd_recovery_max_active_hdd;
3432 else
3433 return cct->_conf->osd_recovery_max_active_ssd;
3434}
3435
3436float OSD::get_osd_snap_trim_sleep()
3437{
3438 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3439 if (osd_snap_trim_sleep > 0)
3440 return osd_snap_trim_sleep;
3441 if (!store_is_rotational && !journal_is_rotational)
3442 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3443 if (store_is_rotational && !journal_is_rotational)
3444 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3445 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3446}
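// Illustrative sketch (not part of the build): the three sleep getters
// above (recovery/delete/snap-trim) share one device-class dispatch; a
// hypothetical standalone form:
//
//   static double pick_by_device_class(bool store_hdd, bool journal_hdd,
//                                      double ssd, double hybrid,
//                                      double hdd) {
//     if (!store_hdd && !journal_hdd) return ssd;     // all-flash
//     if (store_hdd && !journal_hdd)  return hybrid;  // hdd data, ssd journal
//     return hdd;                                     // rotational
//   }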
3447
3448int OSD::init()
3449{
3450 OSDMapRef osdmap;
3451 CompatSet initial, diff;
3452 std::lock_guard lock(osd_lock);
3453 if (is_stopping())
3454 return 0;
3455 tracing::osd::tracer.init("osd");
3456 tick_timer.init();
3457 tick_timer_without_osd_lock.init();
3458 service.recovery_request_timer.init();
3459 service.sleep_timer.init();
3460
3461 boot_finisher.start();
3462
3463 {
3464 string val;
3465 store->read_meta("require_osd_release", &val);
3466 last_require_osd_release = ceph_release_from_name(val);
3467 }
3468
3469 // mount.
3470 dout(2) << "init " << dev_path
3471 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
3472 << dendl;
3473 dout(2) << "journal " << journal_path << dendl;
3474 ceph_assert(store); // call pre_init() first!
3475
3476 store->set_cache_shards(get_num_cache_shards());
3477
3478 int rotating_auth_attempts = 0;
3479 auto rotating_auth_timeout =
3480 g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");
3481
3482 int r = store->mount();
3483 if (r < 0) {
3484 derr << "OSD:init: unable to mount object store" << dendl;
3485 return r;
3486 }
3487 journal_is_rotational = store->is_journal_rotational();
3488 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
3489 << dendl;
3490
3491 enable_disable_fuse(false);
3492
3493 dout(2) << "boot" << dendl;
3494
3495 service.meta_ch = store->open_collection(coll_t::meta());
3496 if (!service.meta_ch) {
3497 derr << "OSD:init: unable to open meta collection"
3498 << dendl;
3499 r = -ENOENT;
3500 goto out;
3501 }
3502 // initialize the daily loadavg with current 15min loadavg
3503 double loadavgs[3];
3504 if (getloadavg(loadavgs, 3) == 3) {
3505 daily_loadavg = loadavgs[2];
3506 } else {
3507 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
3508 daily_loadavg = 1.0;
3509 }
3510
3511 // sanity check long object name handling
3512 {
3513 hobject_t l;
3514 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
3515 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
3516 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
3517 r = store->validate_hobject_key(l);
3518 if (r < 0) {
3519 derr << "backend (" << store->get_type() << ") is unable to support max "
3520 << "object name[space] len" << dendl;
3521 derr << " osd max object name len = "
3522 << cct->_conf->osd_max_object_name_len << dendl;
3523 derr << " osd max object namespace len = "
3524 << cct->_conf->osd_max_object_namespace_len << dendl;
3525 derr << cpp_strerror(r) << dendl;
3526 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
3527 goto out;
3528 }
3529 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
3530 << dendl;
3531 } else {
3532 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
3533 }
3534 }
3535
3536 // read superblock
3537 r = read_superblock();
3538 if (r < 0) {
3539 derr << "OSD::init() : unable to read osd superblock" << dendl;
3540 r = -EINVAL;
3541 goto out;
3542 }
3543
3544 if (osd_compat.compare(superblock.compat_features) < 0) {
3545 derr << "The disk uses features unsupported by the executable." << dendl;
3546 derr << " ondisk features " << superblock.compat_features << dendl;
3547 derr << " daemon features " << osd_compat << dendl;
3548
3549 if (osd_compat.writeable(superblock.compat_features)) {
3550 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3551 derr << "it is still writeable, though. Missing features: " << diff << dendl;
3552 r = -EOPNOTSUPP;
3553 goto out;
3554 }
3555 else {
3556 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
3557 derr << "Cannot write to disk! Missing features: " << diff << dendl;
3558 r = -EOPNOTSUPP;
3559 goto out;
3560 }
3561 }
3562
3563 assert_warn(whoami == superblock.whoami);
3564 if (whoami != superblock.whoami) {
3565 derr << "OSD::init: superblock says osd"
3566 << superblock.whoami << " but I am osd." << whoami << dendl;
3567 r = -EINVAL;
3568 goto out;
3569 }
3570
3571 startup_time = ceph::mono_clock::now();
3572
3573 // load up "current" osdmap
3574 assert_warn(!get_osdmap());
3575 if (get_osdmap()) {
3576 derr << "OSD::init: unable to read current osdmap" << dendl;
3577 r = -EINVAL;
3578 goto out;
3579 }
3580 osdmap = get_map(superblock.current_epoch);
3581 set_osdmap(osdmap);
3582
3583 // make sure we don't have legacy pgs deleting
3584 {
3585 vector<coll_t> ls;
3586 int r = store->list_collections(ls);
3587 ceph_assert(r >= 0);
3588 for (auto c : ls) {
3589 spg_t pgid;
3590 if (c.is_pg(&pgid) &&
3591 !osdmap->have_pg_pool(pgid.pool())) {
3592 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
3593 if (!store->exists(service.meta_ch, oid)) {
3594 derr << __func__ << " missing pg_pool_t for deleted pool "
3595 << pgid.pool() << " for pg " << pgid
3596 << "; please downgrade to luminous and allow "
3597 << "pg deletion to complete before upgrading" << dendl;
3598 ceph_abort();
3599 }
3600 }
3601 }
3602 }
3603
3604 initial = get_osd_initial_compat_set();
3605 diff = superblock.compat_features.unsupported(initial);
3606 if (superblock.compat_features.merge(initial)) {
3607 // Are we adding SNAPMAPPER2?
3608 if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
3609 dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
3610 << dendl;
3611 auto ch = service.meta_ch;
3612 auto hoid = make_snapmapper_oid();
3613 unsigned max = cct->_conf->osd_target_transaction_size;
3614 r = SnapMapper::convert_legacy(cct, store.get(), ch, hoid, max);
3615 if (r < 0)
3616 goto out;
3617 }
3618 // We need to persist the new compat_set before we
3619 // do anything else
3620 dout(5) << "Upgrading superblock adding: " << diff << dendl;
3621 ObjectStore::Transaction t;
3622 write_superblock(t);
3623 r = store->queue_transaction(service.meta_ch, std::move(t));
3624 if (r < 0)
3625 goto out;
3626 }
3627
3628 // make sure snap mapper object exists
3629 if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
3630 dout(10) << "init creating/touching snapmapper object" << dendl;
3631 ObjectStore::Transaction t;
3632 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
3633 r = store->queue_transaction(service.meta_ch, std::move(t));
3634 if (r < 0)
3635 goto out;
3636 }
3637 if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
3638 dout(10) << "init creating/touching purged_snaps object" << dendl;
3639 ObjectStore::Transaction t;
3640 t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
3641 r = store->queue_transaction(service.meta_ch, std::move(t));
3642 if (r < 0)
3643 goto out;
3644 }
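// Both touch() calls above are idempotent: touch creates the object when
// absent and leaves an existing one unchanged, so re-running this on every
// start is safe.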
3645
3646 if (cct->_conf->osd_open_classes_on_start) {
3647 int r = ClassHandler::get_instance().open_all_classes();
3648 if (r)
3649 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
3650 }
3651
3652 check_osdmap_features();
3653
3654 {
3655 epoch_t bind_epoch = osdmap->get_epoch();
3656 service.set_epochs(NULL, NULL, &bind_epoch);
3657 }
3658
3659 clear_temp_objects();
3660
3661 // initialize osdmap references in sharded wq
3662 for (auto& shard : shards) {
3663 std::lock_guard l(shard->osdmap_lock);
3664 shard->shard_osdmap = osdmap;
3665 }
3666
3667 // load up pgs (as they previously existed)
3668 load_pgs();
3669
3670 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
3671
3672 if (cct->_conf.get_val<bool>("osd_compact_on_start")) {
3673 dout(2) << "compacting object store's omap" << dendl;
3674 store->compact();
3675 }
3676
3677 // prime osd stats
3678 {
3679 struct store_statfs_t stbuf;
3680 osd_alert_list_t alerts;
3681 int r = store->statfs(&stbuf, &alerts);
3682 ceph_assert(r == 0);
3683 service.set_statfs(stbuf, alerts);
3684 }
3685
3686 // client_messenger's auth_client will be set up by monc->init() later.
3687 for (auto m : { cluster_messenger,
3688 objecter_messenger,
3689 hb_front_client_messenger,
3690 hb_back_client_messenger,
3691 hb_front_server_messenger,
3692 hb_back_server_messenger } ) {
3693 m->set_auth_client(monc);
3694 }
3695 for (auto m : { client_messenger,
3696 cluster_messenger,
3697 hb_front_server_messenger,
3698 hb_back_server_messenger }) {
3699 m->set_auth_server(monc);
3700 }
3701 monc->set_handle_authentication_dispatcher(this);
3702
3703 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
3704 | CEPH_ENTITY_TYPE_MGR);
3705 r = monc->init();
3706 if (r < 0)
3707 goto out;
3708
3709 mgrc.set_pgstats_cb([this]() { return collect_pg_stats(); });
3710 mgrc.set_perf_metric_query_cb(
3711 [this](const ConfigPayload &config_payload) {
3712 set_perf_queries(config_payload);
3713 },
3714 [this] {
3715 return get_perf_reports();
3716 });
3717 mgrc.init();
3718
3719 // tell monc about log_client so it will know about mon session resets
3720 monc->set_log_client(&log_client);
3721 update_log_config();
3722
3723 // i'm ready!
3724 client_messenger->add_dispatcher_tail(&mgrc);
3725 client_messenger->add_dispatcher_tail(this);
3726 cluster_messenger->add_dispatcher_head(this);
3727
3728 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3729 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3730 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3731 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
3732
3733 objecter_messenger->add_dispatcher_head(service.objecter.get());
3734
3735 service.init();
3736 service.publish_map(osdmap);
3737 service.publish_superblock(superblock);
3738 service.max_oldest_map = superblock.oldest_map;
3739
3740 for (auto& shard : shards) {
3741 // put PGs in a temporary set because we may modify pg_slots
3742 // unordered_map below.
3743 set<PGRef> pgs;
3744 for (auto& i : shard->pg_slots) {
3745 PGRef pg = i.second->pg;
3746 if (!pg) {
3747 continue;
3748 }
3749 pgs.insert(pg);
3750 }
3751 for (auto pg : pgs) {
3752 std::scoped_lock l{*pg};
3753 set<pair<spg_t,epoch_t>> new_children;
3754 set<pair<spg_t,epoch_t>> merge_pgs;
3755 service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
3756 &new_children, &merge_pgs);
3757 if (!new_children.empty()) {
3758 for (auto shard : shards) {
3759 shard->prime_splits(osdmap, &new_children);
3760 }
3761 assert(new_children.empty());
3762 }
3763 if (!merge_pgs.empty()) {
3764 for (auto shard : shards) {
3765 shard->prime_merges(osdmap, &merge_pgs);
3766 }
3767 assert(merge_pgs.empty());
3768 }
3769 }
3770 }
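// Each shard's prime_splits()/prime_merges() removes the entries it claims
// from the passed-in set, so the asserts above verify that, across all
// shards, every pending split/merge child found an owner.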
3771
3772 osd_op_tp.start();
3773
3774 // start the heartbeat
3775 heartbeat_thread.create("osd_srv_heartbt");
3776
3777 // tick
3778 tick_timer.add_event_after(get_tick_interval(),
3779 new C_Tick(this));
3780 {
3781 std::lock_guard l(tick_timer_lock);
3782 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
3783 new C_Tick_WithoutOSDLock(this));
3784 }
3785
3786 osd_lock.unlock();
3787
3788 r = monc->authenticate();
3789 if (r < 0) {
3790 derr << __func__ << " authentication failed: " << cpp_strerror(r)
3791 << dendl;
3792 exit(1);
3793 }
3794
3795 while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
3796 derr << "unable to obtain rotating service keys; retrying" << dendl;
3797 ++rotating_auth_attempts;
3798 if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
3799 derr << __func__ << " wait_auth_rotating timed out" << dendl;
3800 exit(1);
3801 }
3802 }
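// Each wait_auth_rotating() call blocks for up to rotating_auth_timeout;
// after max_rotating_auth_attempts consecutive failures (commonly 10 by
// default) the daemon exits rather than boot without usable service keys.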
3803
3804 r = update_crush_device_class();
3805 if (r < 0) {
3806 derr << __func__ << " unable to update_crush_device_class: "
3807 << cpp_strerror(r) << dendl;
3808 exit(1);
3809 }
3810
3811 r = update_crush_location();
3812 if (r < 0) {
3813 derr << __func__ << " unable to update_crush_location: "
3814 << cpp_strerror(r) << dendl;
3815 exit(1);
3816 }
3817
3818 osd_lock.lock();
3819 if (is_stopping())
3820 return 0;
3821
3822 // start objecter *after* we have authenticated, so that we don't ignore
3823 // the OSDMaps it requests.
3824 service.final_init();
3825
3826 check_config();
3827
3828 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
3829 consume_map();
3830
3831 dout(0) << "done with init, starting boot process" << dendl;
3832
3833 // subscribe to any pg creations
3834 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
3835
3836 // MgrClient needs this (it doesn't have MonClient reference itself)
3837 monc->sub_want("mgrmap", 0, 0);
3838
3839 // we don't need to ask for an osdmap here; objecter will
3840 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
3841
3842 monc->renew_subs();
3843
3844 start_boot();
3845
3846 // Override a few options if mclock scheduler is enabled.
3847 maybe_override_max_osd_capacity_for_qos();
3848 maybe_override_options_for_qos();
3849
3850 return 0;
3851
3852out:
3853 enable_disable_fuse(true);
3854 store->umount();
3855 store.reset();
3856 return r;
3857}
3858
3859void OSD::final_init()
3860{
3861 AdminSocket *admin_socket = cct->get_admin_socket();
3862 asok_hook = new OSDSocketHook(this);
3863 int r = admin_socket->register_command("status", asok_hook,
3864 "high-level status of OSD");
3865 ceph_assert(r == 0);
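// The commands registered below are reachable through the admin socket,
// e.g. (assuming the default socket path):
//   ceph daemon osd.<id> status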
3866 r = admin_socket->register_command("flush_journal",
3867 asok_hook,
3868 "flush the journal to permanent store");
3869 ceph_assert(r == 0);
3870 r = admin_socket->register_command("dump_ops_in_flight " \
3871 "name=filterstr,type=CephString,n=N,req=false",
3872 asok_hook,
3873 "show the ops currently in flight");
3874 ceph_assert(r == 0);
3875 r = admin_socket->register_command("ops " \
3876 "name=filterstr,type=CephString,n=N,req=false",
3877 asok_hook,
3878 "show the ops currently in flight");
3879 ceph_assert(r == 0);
3880 r = admin_socket->register_command("dump_blocked_ops " \
3881 "name=filterstr,type=CephString,n=N,req=false",
3882 asok_hook,
3883 "show the blocked ops currently in flight");
3884 ceph_assert(r == 0);
3885 r = admin_socket->register_command("dump_historic_ops " \
3886 "name=filterstr,type=CephString,n=N,req=false",
3887 asok_hook,
3888 "show recent ops");
3889 ceph_assert(r == 0);
3890 r = admin_socket->register_command("dump_historic_slow_ops " \
3891 "name=filterstr,type=CephString,n=N,req=false",
3892 asok_hook,
3893 "show slowest recent ops");
3894 ceph_assert(r == 0);
3895 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3896 "name=filterstr,type=CephString,n=N,req=false",
3897 asok_hook,
3898 "show slowest recent ops, sorted by duration");
3899 ceph_assert(r == 0);
3900 r = admin_socket->register_command("dump_op_pq_state",
3901 asok_hook,
3902 "dump op queue state");
3903 ceph_assert(r == 0);
3904 r = admin_socket->register_command("dump_blocklist",
3905 asok_hook,
3906 "dump blocklisted clients and times");
3907 ceph_assert(r == 0);
3908 r = admin_socket->register_command("dump_watchers",
3909 asok_hook,
3910 "show clients which have active watches,"
3911 " and on which objects");
3912 ceph_assert(r == 0);
3913 r = admin_socket->register_command("dump_recovery_reservations",
3914 asok_hook,
3915 "show recovery reservations");
3916 ceph_assert(r == 0);
3917 r = admin_socket->register_command("dump_scrub_reservations",
3918 asok_hook,
3919 "show scrub reservations");
3920 ceph_assert(r == 0);
3921 r = admin_socket->register_command("get_latest_osdmap",
3922 asok_hook,
3923 "force osd to update the latest map from "
3924 "the mon");
3925 ceph_assert(r == 0);
3926
3927 r = admin_socket->register_command("set_heap_property " \
3928 "name=property,type=CephString " \
3929 "name=value,type=CephInt",
3930 asok_hook,
3931 "update malloc extension heap property");
3932 ceph_assert(r == 0);
3933
3934 r = admin_socket->register_command("get_heap_property " \
3935 "name=property,type=CephString",
3936 asok_hook,
3937 "get malloc extension heap property");
3938 ceph_assert(r == 0);
3939
3940 r = admin_socket->register_command("dump_objectstore_kv_stats",
3941 asok_hook,
3942 "print statistics of kvdb which used by bluestore");
3943 ceph_assert(r == 0);
3944
3945 r = admin_socket->register_command("dump_scrubs",
3946 asok_hook,
3947 "print scheduled scrubs");
3948 ceph_assert(r == 0);
3949
3950 r = admin_socket->register_command("calc_objectstore_db_histogram",
3951 asok_hook,
3952 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3953 ceph_assert(r == 0);
3954
3955 r = admin_socket->register_command("flush_store_cache",
3956 asok_hook,
3957 "Flush bluestore internal cache");
3958 ceph_assert(r == 0);
3959 r = admin_socket->register_command("dump_pgstate_history",
3960 asok_hook,
3961 "show recent state history");
3962 ceph_assert(r == 0);
3963
3964 r = admin_socket->register_command("compact",
3965 asok_hook,
3966 "Commpact object store's omap."
3967 " WARNING: Compaction probably slows your requests");
3968 ceph_assert(r == 0);
3969
3970 r = admin_socket->register_command("get_mapped_pools",
3971 asok_hook,
3972 "dump pools whose PG(s) are mapped to this OSD.");
3973
3974 ceph_assert(r == 0);
3975
3976 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3977 asok_hook,
3978 "probe OSD devices for SMART data.");
3979
3980 ceph_assert(r == 0);
3981
3982 r = admin_socket->register_command("list_devices",
3983 asok_hook,
3984 "list OSD devices.");
3985 r = admin_socket->register_command("send_beacon",
3986 asok_hook,
3987 "send OSD beacon to mon immediately");
3988
3989 r = admin_socket->register_command(
3990 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3991 "Dump osd heartbeat network ping times");
3992 ceph_assert(r == 0);
3993
3994 r = admin_socket->register_command(
3995 "dump_pool_statfs name=poolid,type=CephInt,req=true", asok_hook,
3996 "Dump store's statistics for the given pool");
3997 ceph_assert(r == 0);
3998
3999 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store.get());
4000 // Note: pools are CephString instead of CephPoolname because
4001 // these commands traditionally support both pool names and numbers
4002 r = admin_socket->register_command(
4003 "setomapval " \
4004 "name=pool,type=CephString " \
4005 "name=objname,type=CephObjectname " \
4006 "name=key,type=CephString "\
4007 "name=val,type=CephString",
4008 test_ops_hook,
4009 "set omap key");
4010 ceph_assert(r == 0);
4011 r = admin_socket->register_command(
4012 "rmomapkey " \
4013 "name=pool,type=CephString " \
4014 "name=objname,type=CephObjectname " \
4015 "name=key,type=CephString",
4016 test_ops_hook,
4017 "remove omap key");
4018 ceph_assert(r == 0);
4019 r = admin_socket->register_command(
4020 "setomapheader " \
4021 "name=pool,type=CephString " \
4022 "name=objname,type=CephObjectname " \
4023 "name=header,type=CephString",
4024 test_ops_hook,
4025 "set omap header");
4026 ceph_assert(r == 0);
4027
4028 r = admin_socket->register_command(
4029 "getomap " \
4030 "name=pool,type=CephString " \
4031 "name=objname,type=CephObjectname",
4032 test_ops_hook,
4033 "output entire object map");
4034 ceph_assert(r == 0);
4035
4036 r = admin_socket->register_command(
4037 "truncobj " \
4038 "name=pool,type=CephString " \
4039 "name=objname,type=CephObjectname " \
4040 "name=len,type=CephInt",
4041 test_ops_hook,
4042 "truncate object to length");
4043 ceph_assert(r == 0);
4044
4045 r = admin_socket->register_command(
4046 "injectdataerr " \
4047 "name=pool,type=CephString " \
4048 "name=objname,type=CephObjectname " \
4049 "name=shardid,type=CephInt,req=false,range=0|255",
4050 test_ops_hook,
4051 "inject data error to an object");
4052 ceph_assert(r == 0);
4053
4054 r = admin_socket->register_command(
4055 "injectmdataerr " \
4056 "name=pool,type=CephString " \
4057 "name=objname,type=CephObjectname " \
4058 "name=shardid,type=CephInt,req=false,range=0|255",
4059 test_ops_hook,
4060 "inject metadata error to an object");
4061 ceph_assert(r == 0);
4062 r = admin_socket->register_command(
4063 "set_recovery_delay " \
4064 "name=utime,type=CephInt,req=false",
4065 test_ops_hook,
4066 "Delay osd recovery by specified seconds");
4067 ceph_assert(r == 0);
4068 r = admin_socket->register_command(
4069 "injectfull " \
4070 "name=type,type=CephString,req=false " \
4071 "name=count,type=CephInt,req=false ",
4072 test_ops_hook,
4073 "Inject a full disk (optional count times)");
4074 ceph_assert(r == 0);
4075 r = admin_socket->register_command(
4076 "bench " \
4077 "name=count,type=CephInt,req=false " \
4078 "name=size,type=CephInt,req=false " \
4079 "name=object_size,type=CephInt,req=false " \
4080 "name=object_num,type=CephInt,req=false ",
4081 asok_hook,
4082 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
4083 "(default count=1G default size=4MB). Results in log.");
4084 ceph_assert(r == 0);
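// Example invocation (per the help text above, all arguments optional):
//   ceph daemon osd.<id> bench
// which writes ~1G in 4MB objects and reports results in the OSD log.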
4085 r = admin_socket->register_command(
4086 "cluster_log " \
4087 "name=level,type=CephChoices,strings=error,warning,info,debug " \
4088 "name=message,type=CephString,n=N",
4089 asok_hook,
4090 "log a message to the cluster log");
4091 ceph_assert(r == 0);
4092 r = admin_socket->register_command(
4093 "flush_pg_stats",
4094 asok_hook,
4095 "flush pg stats");
4096 ceph_assert(r == 0);
4097 r = admin_socket->register_command(
4098 "heap " \
4099 "name=heapcmd,type=CephChoices,strings=" \
4100 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
4101 "name=value,type=CephString,req=false",
4102 asok_hook,
4103 "show heap usage info (available only if compiled with tcmalloc)");
4104 ceph_assert(r == 0);
4105 r = admin_socket->register_command(
4106 "debug dump_missing " \
4107 "name=filename,type=CephFilepath",
4108 asok_hook,
4109 "dump missing objects to a named file");
4110 ceph_assert(r == 0);
4111 r = admin_socket->register_command(
4112 "debug kick_recovery_wq " \
4113 "name=delay,type=CephInt,range=0",
4114 asok_hook,
4115 "set osd_recovery_delay_start to <val>");
4116 ceph_assert(r == 0);
4117 r = admin_socket->register_command(
4118 "cpu_profiler " \
4119 "name=arg,type=CephChoices,strings=status|flush",
4120 asok_hook,
4121 "run cpu profiling on daemon");
4122 ceph_assert(r == 0);
4123 r = admin_socket->register_command(
4124 "dump_pg_recovery_stats",
4125 asok_hook,
4126 "dump pg recovery statistics");
4127 ceph_assert(r == 0);
4128 r = admin_socket->register_command(
4129 "reset_pg_recovery_stats",
4130 asok_hook,
4131 "reset pg recovery statistics");
4132 ceph_assert(r == 0);
4133 r = admin_socket->register_command(
4134 "cache drop",
4135 asok_hook,
4136 "Drop all OSD caches");
4137 ceph_assert(r == 0);
4138 r = admin_socket->register_command(
4139 "cache status",
4140 asok_hook,
4141 "Get OSD caches statistics");
4142 ceph_assert(r == 0);
4143 r = admin_socket->register_command(
4144 "scrub_purged_snaps",
4145 asok_hook,
4146 "Scrub purged_snaps vs snapmapper index");
4147 ceph_assert(r == 0);
4148 r = admin_socket->register_command(
4149 "scrubdebug " \
4150 "name=pgid,type=CephPgid " \
4151 "name=cmd,type=CephChoices,strings=block|unblock|set|unset " \
4152 "name=value,type=CephString,req=false",
4153 asok_hook,
4154 "debug the scrubber");
4155 ceph_assert(r == 0);
4156
4157 // -- pg commands --
4158 // old form: ceph pg <pgid> command ...
4159 r = admin_socket->register_command(
4160 "pg " \
4161 "name=pgid,type=CephPgid " \
4162 "name=cmd,type=CephChoices,strings=query",
4163 asok_hook,
4164 "");
4165 ceph_assert(r == 0);
4166 r = admin_socket->register_command(
4167 "pg " \
4168 "name=pgid,type=CephPgid " \
4169 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
4170 "name=mulcmd,type=CephChoices,strings=revert|delete",
4171 asok_hook,
4172 "");
4173 ceph_assert(r == 0);
4174 r = admin_socket->register_command(
4175 "pg " \
4176 "name=pgid,type=CephPgid " \
4177 "name=cmd,type=CephChoices,strings=list_unfound " \
4178 "name=offset,type=CephString,req=false",
4179 asok_hook,
4180 "");
4181 ceph_assert(r == 0);
4182 r = admin_socket->register_command(
4183 "pg " \
4184 "name=pgid,type=CephPgid " \
4185 "name=cmd,type=CephChoices,strings=scrub " \
4186 "name=time,type=CephInt,req=false",
4187 asok_hook,
4188 "");
4189 ceph_assert(r == 0);
4190 r = admin_socket->register_command(
4191 "pg " \
4192 "name=pgid,type=CephPgid " \
4193 "name=cmd,type=CephChoices,strings=deep_scrub " \
4194 "name=time,type=CephInt,req=false",
4195 asok_hook,
4196 "");
4197 ceph_assert(r == 0);
4198 // new form: tell <pgid> <cmd> for both cli and rest
4199 r = admin_socket->register_command(
4200 "query",
4201 asok_hook,
4202 "show details of a specific pg");
4203 ceph_assert(r == 0);
4204 r = admin_socket->register_command(
4205 "mark_unfound_lost " \
4206 "name=pgid,type=CephPgid,req=false " \
4207 "name=mulcmd,type=CephChoices,strings=revert|delete",
4208 asok_hook,
4209 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4210 ceph_assert(r == 0);
4211 r = admin_socket->register_command(
4212 "list_unfound " \
4213 "name=pgid,type=CephPgid,req=false " \
4214 "name=offset,type=CephString,req=false",
4215 asok_hook,
4216 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4217 ceph_assert(r == 0);
4218 r = admin_socket->register_command(
4219 "scrub " \
4220 "name=pgid,type=CephPgid,req=false " \
4221 "name=time,type=CephInt,req=false",
4222 asok_hook,
4223 "Trigger a scheduled scrub ");
4224 ceph_assert(r == 0);
4225 r = admin_socket->register_command(
4226 "deep_scrub " \
4227 "name=pgid,type=CephPgid,req=false " \
4228 "name=time,type=CephInt,req=false",
4229 asok_hook,
4230 "Trigger a scheduled deep scrub ");
4231 ceph_assert(r == 0);
4232}
4233
4234PerfCounters* OSD::create_logger()
4235{
4236 PerfCounters* logger = build_osd_logger(cct);
4237 cct->get_perfcounters_collection()->add(logger);
4238 return logger;
4239}
4240
4241PerfCounters* OSD::create_recoverystate_perf()
4242{
4243 PerfCounters* recoverystate_perf = build_recoverystate_perf(cct);
4244 cct->get_perfcounters_collection()->add(recoverystate_perf);
4245 return recoverystate_perf;
4246}
4247
4248int OSD::shutdown()
4249{
4250 // vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
4251 //cct->_conf->osd_fast_shutdown = true;
4252
4253 dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
4254 << cct->_conf->osd_fast_shutdown
4255 << ", null-fm = " << store->has_null_manager() << dendl;
4256
4257 utime_t start_time_func = ceph_clock_now();
4258
4259 if (cct->_conf->osd_fast_shutdown) {
4260 derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
4261 if (cct->_conf->osd_fast_shutdown_notify_mon)
4262 service.prepare_to_stop();
4263
4264 // There is no state we need to keep when running in NULL-FM mode
4265 if (!store->has_null_manager()) {
4266 cct->_log->flush();
4267 _exit(0);
4268 }
4269 } else if (!service.prepare_to_stop()) {
4270 return 0; // already shutting down
4271 }
4272
4273 osd_lock.lock();
4274 if (is_stopping()) {
4275 osd_lock.unlock();
4276 return 0;
4277 }
4278
4279 if (!cct->_conf->osd_fast_shutdown) {
4280 dout(0) << "shutdown" << dendl;
4281 }
4282
4283 // don't accept new task for this OSD
4284 set_state(STATE_STOPPING);
4285
4286 // Debugging is not enabled during fast shutdown
4287 if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4288 cct->_conf.set_val("debug_osd", "100");
4289 cct->_conf.set_val("debug_journal", "100");
4290 cct->_conf.set_val("debug_filestore", "100");
4291 cct->_conf.set_val("debug_bluestore", "100");
4292 cct->_conf.set_val("debug_ms", "100");
4293 cct->_conf.apply_changes(nullptr);
4294 }
4295
4296 if (cct->_conf->osd_fast_shutdown) {
4297 // first, stop new task from being taken from op_shardedwq
4298 // and clear all pending tasks
4299 op_shardedwq.stop_for_fast_shutdown();
4300
4301 utime_t start_time_timer = ceph_clock_now();
4302 tick_timer.shutdown();
4303 {
4304 std::lock_guard l(tick_timer_lock);
4305 tick_timer_without_osd_lock.shutdown();
4306 }
4307
4308 osd_lock.unlock();
4309 utime_t start_time_osd_drain = ceph_clock_now();
4310
4311 // then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
4312 osd_op_tp.drain();
4313 osd_op_tp.stop();
4314
4315 utime_t start_time_umount = ceph_clock_now();
4316 store->prepare_for_fast_shutdown();
4317 std::lock_guard lock(osd_lock);
4318 // TBD: assert in allocator that nothing is being added
4319 store->umount();
4320
4321 utime_t end_time = ceph_clock_now();
4322 if (cct->_conf->osd_fast_shutdown_timeout) {
4323 ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
4324 }
4325 dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl;
4326 dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl;
4327 dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl;
4328 dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl;
4329 cct->_log->flush();
4330
4331 // now it is safe to exit
4332 _exit(0);
4333 }
4334
4335 // stop MgrClient earlier as it's more like an internal consumer of OSD
4336 mgrc.shutdown();
4337
4338 service.start_shutdown();
4339
4340 // stop sending work to pgs. this just prevents any new work in _process
4341 // from racing with on_shutdown and potentially entering the pg after.
4342 op_shardedwq.drain();
4343
4344 // Shutdown PGs
4345 {
4346 vector<PGRef> pgs;
4347 _get_pgs(&pgs);
4348 for (auto pg : pgs) {
4349 pg->shutdown();
4350 }
4351 }
4352
4353 // drain op queue again (in case PGs requeued something)
4354 op_shardedwq.drain();
4355 {
4356 finished.clear(); // zap waiters (bleh, this is messy)
4357 waiting_for_osdmap.clear();
4358 }
4359
4360 // unregister commands
4361 cct->get_admin_socket()->unregister_commands(asok_hook);
4362 delete asok_hook;
4363 asok_hook = NULL;
4364
4365 cct->get_admin_socket()->unregister_commands(test_ops_hook);
4366 delete test_ops_hook;
4367 test_ops_hook = NULL;
4368
4369 osd_lock.unlock();
4370
4371 {
4372 std::lock_guard l{heartbeat_lock};
4373 heartbeat_stop = true;
4374 heartbeat_cond.notify_all();
4375 heartbeat_peers.clear();
4376 }
4377 heartbeat_thread.join();
4378
4379 hb_back_server_messenger->mark_down_all();
4380 hb_front_server_messenger->mark_down_all();
4381 hb_front_client_messenger->mark_down_all();
4382 hb_back_client_messenger->mark_down_all();
4383
4384 osd_op_tp.drain();
4385 osd_op_tp.stop();
4386 dout(10) << "op sharded tp stopped" << dendl;
4387
4388 dout(10) << "stopping agent" << dendl;
4389 service.agent_stop();
4390
4391 boot_finisher.wait_for_empty();
4392
4393 osd_lock.lock();
4394
4395 boot_finisher.stop();
4396 reset_heartbeat_peers(true);
4397
4398 tick_timer.shutdown();
4399
4400 {
4401 std::lock_guard l(tick_timer_lock);
4402 tick_timer_without_osd_lock.shutdown();
4403 }
4404
4405 // note unmount epoch
4406 dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
4407 superblock.mounted = service.get_boot_epoch();
4408 superblock.clean_thru = get_osdmap_epoch();
4409 ObjectStore::Transaction t;
4410 write_superblock(t);
4411 int r = store->queue_transaction(service.meta_ch, std::move(t));
4412 if (r) {
4413 derr << "OSD::shutdown: error writing superblock: "
4414 << cpp_strerror(r) << dendl;
4415 }
4416
4417
4418 service.shutdown_reserver();
4419
4420 // Remove PGs
4421#ifdef PG_DEBUG_REFS
4422 service.dump_live_pgids();
4423#endif
4424 while (true) {
4425 vector<PGRef> pgs;
4426 _get_pgs(&pgs, true);
4427 if (pgs.empty()) {
4428 break;
4429 }
4430 for (auto& pg : pgs) {
4431 if (pg->is_deleted()) {
4432 continue;
4433 }
4434 dout(20) << " kicking pg " << pg << dendl;
4435 pg->lock();
4436 if (pg->get_num_ref() != 1) {
4437 derr << "pgid " << pg->get_pgid() << " has ref count of "
4438 << pg->get_num_ref() << dendl;
4439#ifdef PG_DEBUG_REFS
4440 pg->dump_live_ids();
4441#endif
4442 if (cct->_conf->osd_shutdown_pgref_assert) {
4443 ceph_abort();
4444 }
4445 }
4446 pg->ch.reset();
4447 pg->unlock();
4448 }
4449 }
4450#ifdef PG_DEBUG_REFS
4451 service.dump_live_pgids();
4452#endif
4453
4454 osd_lock.unlock();
4455 cct->_conf.remove_observer(this);
4456 osd_lock.lock();
4457
4458 service.meta_ch.reset();
4459
4460 dout(10) << "syncing store" << dendl;
4461 enable_disable_fuse(true);
4462
4463 if (cct->_conf->osd_journal_flush_on_shutdown) {
4464 dout(10) << "flushing journal" << dendl;
4465 store->flush_journal();
4466 }
4467
4468 monc->shutdown();
4469 osd_lock.unlock();
4470 {
4471 std::unique_lock l{map_lock};
4472 set_osdmap(OSDMapRef());
4473 }
4474 for (auto s : shards) {
4475 std::lock_guard l(s->osdmap_lock);
4476 s->shard_osdmap = OSDMapRef();
4477 }
4478 service.shutdown();
4479
4480 std::lock_guard lock(osd_lock);
4481 store->umount();
4482 store.reset();
4483 dout(10) << "Store synced" << dendl;
4484
4485 op_tracker.on_shutdown();
4486
4487 ClassHandler::get_instance().shutdown();
4488 client_messenger->shutdown();
4489 cluster_messenger->shutdown();
4490 hb_front_client_messenger->shutdown();
4491 hb_back_client_messenger->shutdown();
4492 objecter_messenger->shutdown();
4493 hb_front_server_messenger->shutdown();
4494 hb_back_server_messenger->shutdown();
4495
4496 utime_t duration = ceph_clock_now() - start_time_func;
4497 dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;
4498
4499 tracing::osd::tracer.shutdown();
4500
4501 return r;
4502}
4503
4504int OSD::mon_cmd_maybe_osd_create(string &cmd)
4505{
4506 bool created = false;
4507 while (true) {
4508 dout(10) << __func__ << " cmd: " << cmd << dendl;
4509 vector<string> vcmd{cmd};
4510 bufferlist inbl;
4511 C_SaferCond w;
4512 string outs;
4513 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4514 int r = w.wait();
4515 if (r < 0) {
4516 if (r == -ENOENT && !created) {
4517 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4518 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4519 vector<string> vnewcmd{newcmd};
4520 bufferlist inbl;
4521 C_SaferCond w;
4522 string outs;
4523 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4524 int r = w.wait();
4525 if (r < 0) {
4526 derr << __func__ << " fail: osd does not exist and created failed: "
4527 << cpp_strerror(r) << dendl;
4528 return r;
4529 }
4530 created = true;
4531 continue;
4532 }
4533 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4534 return r;
4535 }
4536 break;
4537 }
4538
4539 return 0;
4540}
4541
4542int OSD::update_crush_location()
4543{
4544 if (!cct->_conf->osd_crush_update_on_start) {
4545 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4546 return 0;
4547 }
4548
4549 char weight[32];
4550 if (cct->_conf->osd_crush_initial_weight >= 0) {
4551 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4552 } else {
4553 struct store_statfs_t st;
4554 osd_alert_list_t alerts;
4555 int r = store->statfs(&st, &alerts);
4556 if (r < 0) {
4557 derr << "statfs: " << cpp_strerror(r) << dendl;
4558 return r;
4559 }
4560 snprintf(weight, sizeof(weight), "%.4lf",
4561 std::max(.00001,
4562 double(st.total) /
4563 double(1ull << 40 /* TB */)));
4564 }
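// The computed weight is the store capacity in TiB, floored at 0.00001:
// e.g. st.total = 4398046511104 bytes (4 TiB) yields weight "4.0000".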
4565
4566 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4567
4568 string cmd =
4569 string("{\"prefix\": \"osd crush create-or-move\", ") +
4570 string("\"id\": ") + stringify(whoami) + ", " +
4571 string("\"weight\":") + weight + ", " +
4572 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4573 return mon_cmd_maybe_osd_create(cmd);
4574}
4575
4576int OSD::update_crush_device_class()
4577{
4578 if (!cct->_conf->osd_class_update_on_start) {
4579 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4580 return 0;
4581 }
4582
4583 string device_class;
4584 int r = store->read_meta("crush_device_class", &device_class);
4585 if (r < 0 || device_class.empty()) {
4586 device_class = store->get_default_device_class();
4587 }
4588
4589 if (device_class.empty()) {
4590 dout(20) << __func__ << " no device class stored locally" << dendl;
4591 return 0;
4592 }
4593
4594 string cmd =
4595 string("{\"prefix\": \"osd crush set-device-class\", ") +
4596 string("\"class\": \"") + device_class + string("\", ") +
4597 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4598
4599 r = mon_cmd_maybe_osd_create(cmd);
4600 if (r == -EBUSY) {
4601 // good, already bound to a device-class
4602 return 0;
4603 } else {
4604 return r;
4605 }
4606}
4607
4608void OSD::write_superblock(ObjectStore::Transaction& t)
4609{
4610 dout(10) << "write_superblock " << superblock << dendl;
4611
4612 // hack: at minimum it's using the baseline feature set
4613 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4614 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4615
4616 bufferlist bl;
4617 encode(superblock, bl);
4618 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4619}
4620
4621int OSD::read_superblock()
4622{
4623 bufferlist bl;
4624 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4625 if (r < 0)
4626 return r;
4627
4628 auto p = bl.cbegin();
4629 decode(superblock, p);
4630
4631 dout(10) << "read_superblock " << superblock << dendl;
4632
4633 return 0;
4634}
4635
4636void OSD::clear_temp_objects()
4637{
4638 dout(10) << __func__ << dendl;
4639 vector<coll_t> ls;
4640 store->list_collections(ls);
4641 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4642 spg_t pgid;
4643 if (!p->is_pg(&pgid))
4644 continue;
4645
4646 // list temp objects
4647 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4648
4649 vector<ghobject_t> temps;
4650 ghobject_t next;
4651 while (1) {
4652 vector<ghobject_t> objects;
4653 auto ch = store->open_collection(*p);
4654 ceph_assert(ch);
4655 store->collection_list(ch, next, ghobject_t::get_max(),
4656 store->get_ideal_list_max(),
4657 &objects, &next);
4658 if (objects.empty())
4659 break;
4660 vector<ghobject_t>::iterator q;
4661 for (q = objects.begin(); q != objects.end(); ++q) {
4662 // Hammer set the pool for temp objects to -1, so check for those too when cleaning up
4663 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4664 temps.push_back(*q);
4665 } else {
4666 break;
4667 }
4668 }
4669 // If we saw a non-temp object and hit the break above we can
4670 // break out of the while loop too.
4671 if (q != objects.end())
4672 break;
4673 }
4674 if (!temps.empty()) {
4675 ObjectStore::Transaction t;
4676 int removed = 0;
4677 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4678 dout(20) << " removing " << *p << " object " << *q << dendl;
4679 t.remove(*p, *q);
4680 if (++removed > cct->_conf->osd_target_transaction_size) {
4681 store->queue_transaction(service.meta_ch, std::move(t));
4682 t = ObjectStore::Transaction();
4683 removed = 0;
4684 }
4685 }
4686 if (removed) {
4687 store->queue_transaction(service.meta_ch, std::move(t));
4688 }
4689 }
4690 }
4691}
4692
4693void OSD::recursive_remove_collection(CephContext* cct,
4694 ObjectStore *store, spg_t pgid,
4695 coll_t tmp)
4696{
4697 OSDriver driver(
4698 store,
4699 coll_t(),
4700 make_snapmapper_oid());
4701
4702 ObjectStore::CollectionHandle ch = store->open_collection(tmp);
4703 ObjectStore::Transaction t;
4704 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
4705
4706 ghobject_t next;
4707 int max = cct->_conf->osd_target_transaction_size;
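// Removals are batched into transactions of at most
// osd_target_transaction_size ops (commonly 30 by default) so no single
// transaction grows unbounded.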
4708 vector<ghobject_t> objects;
4709 objects.reserve(max);
4710 while (true) {
4711 objects.clear();
4712 store->collection_list(ch, next, ghobject_t::get_max(),
4713 max, &objects, &next);
4714 generic_dout(10) << __func__ << " " << objects << dendl;
4715 if (objects.empty())
4716 break;
4717 for (auto& p: objects) {
4718 OSDriver::OSTransaction _t(driver.get_transaction(&t));
4719 int r = mapper.remove_oid(p.hobj, &_t);
4720 if (r != 0 && r != -ENOENT)
4721 ceph_abort();
4722 t.remove(tmp, p);
4723 }
4724 int r = store->queue_transaction(ch, std::move(t));
4725 ceph_assert(r == 0);
4726 t = ObjectStore::Transaction();
4727 }
4728 t.remove_collection(tmp);
4729 int r = store->queue_transaction(ch, std::move(t));
4730 ceph_assert(r == 0);
4731
4732 C_SaferCond waiter;
4733 if (!ch->flush_commit(&waiter)) {
4734 waiter.wait();
4735 }
4736}
4737
4738
4739// ======================================================
4740// PG's
4741
4742PG* OSD::_make_pg(
4743 OSDMapRef createmap,
4744 spg_t pgid)
4745{
4746 dout(10) << __func__ << " " << pgid << dendl;
4747 pg_pool_t pi;
4748 map<string,string> ec_profile;
4749 string name;
4750 if (createmap->have_pg_pool(pgid.pool())) {
4751 pi = *createmap->get_pg_pool(pgid.pool());
4752 name = createmap->get_pool_name(pgid.pool());
4753 if (pi.is_erasure()) {
4754 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4755 }
4756 } else {
4757 // pool was deleted; grab final pg_pool_t off disk.
4758 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4759 bufferlist bl;
4760 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4761 if (r < 0) {
4762 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4763 << dendl;
4764 return nullptr;
4765 }
4766 ceph_assert(r >= 0);
4767 auto p = bl.cbegin();
4768 decode(pi, p);
4769 decode(name, p);
4770 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4771 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4772 << " tombstone" << dendl;
4773 return nullptr;
4774 }
4775 decode(ec_profile, p);
4776 }
4777 PGPool pool(createmap, pgid.pool(), pi, name);
4778 PG *pg;
4779 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4780 pi.type == pg_pool_t::TYPE_ERASURE)
4781 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4782 else
4783 ceph_abort();
4784 return pg;
4785}
4786
4787void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4788{
4789 v->clear();
4790 v->reserve(get_num_pgs());
4791 for (auto& s : shards) {
4792 std::lock_guard l(s->shard_lock);
4793 for (auto& j : s->pg_slots) {
4794 if (j.second->pg &&
4795 !j.second->pg->is_deleted()) {
4796 v->push_back(j.second->pg);
4797 if (clear_too) {
4798 s->_detach_pg(j.second.get());
4799 }
4800 }
4801 }
4802 }
4803}
4804
4805void OSD::_get_pgids(vector<spg_t> *v)
4806{
4807 v->clear();
4808 v->reserve(get_num_pgs());
4809 for (auto& s : shards) {
4810 std::lock_guard l(s->shard_lock);
4811 for (auto& j : s->pg_slots) {
4812 if (j.second->pg &&
4813 !j.second->pg->is_deleted()) {
4814 v->push_back(j.first);
4815 }
4816 }
4817 }
4818}
4819
4820void OSD::register_pg(PGRef pg)
4821{
4822 spg_t pgid = pg->get_pgid();
4823 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4824 auto sdata = shards[shard_index];
4825 std::lock_guard l(sdata->shard_lock);
4826 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4827 ceph_assert(r.second);
4828 auto *slot = r.first->second.get();
4829 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4830 sdata->_attach_pg(slot, pg.get());
4831}
4832
4833bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4834{
4835 auto sdata = pg->osd_shard;
4836 ceph_assert(sdata);
4837 {
4838 std::lock_guard l(sdata->shard_lock);
4839 auto p = sdata->pg_slots.find(pg->pg_id);
4840 if (p == sdata->pg_slots.end() ||
4841 !p->second->pg) {
4842 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4843 return false;
4844 }
4845 if (p->second->waiting_for_merge_epoch) {
4846 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4847 return false;
4848 }
4849 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4850 sdata->_detach_pg(p->second.get());
4851 }
4852
4853 for (auto shard : shards) {
4854 shard->unprime_split_children(pg->pg_id, old_pg_num);
4855 }
4856
4857 // update pg count now since we might not get an osdmap any time soon.
4858 if (pg->is_primary())
4859 service.logger->dec(l_osd_pg_primary);
4860 else if (pg->is_nonprimary())
4861 service.logger->dec(l_osd_pg_replica); // misnomer
4862 else
4863 service.logger->dec(l_osd_pg_stray);
4864
4865 return true;
4866}
4867
4868PGRef OSD::_lookup_pg(spg_t pgid)
4869{
4870 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4871 auto sdata = shards[shard_index];
4872 std::lock_guard l(sdata->shard_lock);
4873 auto p = sdata->pg_slots.find(pgid);
4874 if (p == sdata->pg_slots.end()) {
4875 return nullptr;
4876 }
4877 return p->second->pg;
4878}
4879
4880PGRef OSD::_lookup_lock_pg(spg_t pgid)
4881{
4882 PGRef pg = _lookup_pg(pgid);
4883 if (!pg) {
4884 return nullptr;
4885 }
4886 pg->lock();
4887 if (!pg->is_deleted()) {
4888 return pg;
4889 }
4890 pg->unlock();
4891 return nullptr;
4892}
4893
4894PGRef OSD::lookup_lock_pg(spg_t pgid)
4895{
4896 return _lookup_lock_pg(pgid);
4897}
4898
4899void OSD::load_pgs()
4900{
4901 ceph_assert(ceph_mutex_is_locked(osd_lock));
4902 dout(0) << "load_pgs" << dendl;
4903
4904 {
4905 auto pghist = make_pg_num_history_oid();
4906 bufferlist bl;
4907 int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
4908 if (r >= 0 && bl.length() > 0) {
4909 auto p = bl.cbegin();
4910 decode(pg_num_history, p);
4911 }
4912 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
4913 }
4914
4915 vector<coll_t> ls;
4916 int r = store->list_collections(ls);
4917 if (r < 0) {
4918 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
4919 }
4920
4921 int num = 0;
4922 for (vector<coll_t>::iterator it = ls.begin();
4923 it != ls.end();
4924 ++it) {
4925 spg_t pgid;
4926 if (it->is_temp(&pgid) ||
4927 (it->is_pg(&pgid) && PG::_has_removal_flag(store.get(), pgid))) {
4928 dout(10) << "load_pgs " << *it
4929 << " removing, legacy or flagged for removal pg" << dendl;
4930 recursive_remove_collection(cct, store.get(), pgid, *it);
4931 continue;
4932 }
4933
4934 if (!it->is_pg(&pgid)) {
4935 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
4936 continue;
4937 }
4938
4939 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
4940 epoch_t map_epoch = 0;
4941 int r = PG::peek_map_epoch(store.get(), pgid, &map_epoch);
4942 if (r < 0) {
4943 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
4944 << dendl;
4945 continue;
4946 }
4947
4948 PGRef pg;
4949 if (map_epoch > 0) {
4950 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
4951 if (!pgosdmap) {
4952 if (!get_osdmap()->have_pg_pool(pgid.pool())) {
4953 derr << __func__ << ": could not find map for epoch " << map_epoch
4954 << " on pg " << pgid << ", but the pool is not present in the "
4955 << "current map, so this is probably a result of bug 10617. "
4956 << "Skipping the pg for now, you can use ceph-objectstore-tool "
4957 << "to clean it up later." << dendl;
4958 continue;
4959 } else {
4960 derr << __func__ << ": have pgid " << pgid << " at epoch "
4961 << map_epoch << ", but missing map. Crashing."
4962 << dendl;
4963 ceph_abort_msg("Missing map in load_pgs");
4964 }
4965 }
4966 pg = _make_pg(pgosdmap, pgid);
4967 } else {
4968 pg = _make_pg(get_osdmap(), pgid);
4969 }
4970 if (!pg) {
4971 recursive_remove_collection(cct, store.get(), pgid, *it);
4972 continue;
4973 }
4974
4975 // there can be no waiters here, so we don't call _wake_pg_slot
4976
4977 pg->lock();
4978 pg->ch = store->open_collection(pg->coll);
4979
4980 // read pg state, log
4981 pg->read_state(store.get());
4982
4983 if (pg->dne()) {
4984 dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
4985 pg->ch = nullptr;
4986 pg->unlock();
4987 recursive_remove_collection(cct, store.get(), pgid, *it);
4988 continue;
4989 }
4990 {
4991 uint32_t shard_index = pgid.hash_to_shard(shards.size());
4992 assert(NULL != shards[shard_index]);
4993 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
4994 }
4995
4996 dout(10) << __func__ << " loaded " << *pg << dendl;
4997 pg->unlock();
4998
4999 register_pg(pg);
5000 ++num;
5001 }
5002 dout(0) << __func__ << " opened " << num << " pgs" << dendl;
5003}
5004
5005
5006PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
5007 const PGCreateInfo *info)
5008{
5009 spg_t pgid = info->pgid;
5010
5011 if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
5012 dout(10) << __func__ << " hit max pg, dropping" << dendl;
5013 return nullptr;
5014 }
5015
5016 OSDMapRef startmap = get_map(info->epoch);
5017
5018 if (info->by_mon) {
5019 int64_t pool_id = pgid.pgid.pool();
5020 const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
5021 if (!pool) {
5022 dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
5023 return nullptr;
5024 }
5025 if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
5026 !pool->has_flag(pg_pool_t::FLAG_CREATING)) {
5027 // this ensures we do not process old creating messages after the
5028 // pool's initial pgs have been created (and pgs are subsequently
5029 // allowed to split or merge).
5030 dout(20) << __func__ << " dropping " << pgid
5031 << "create, pool does not have CREATING flag set" << dendl;
5032 return nullptr;
5033 }
5034 }
5035
5036 int up_primary, acting_primary;
5037 vector<int> up, acting;
5038 startmap->pg_to_up_acting_osds(
5039 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5040
5041 const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
5042 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
5043 store->get_type() != "bluestore") {
5044 clog->warn() << "pg " << pgid
5045 << " is at risk of silent data corruption: "
5046 << "the pool allows ec overwrites but is not stored in "
5047 << "bluestore, so deep scrubbing will not detect bitrot";
5048 }
5049 PeeringCtx rctx;
5050 create_pg_collection(
5051 rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
5052 init_pg_ondisk(rctx.transaction, pgid, pp);
5053
5054 int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
5055
5056 PGRef pg = _make_pg(startmap, pgid);
5057 pg->ch = store->create_new_collection(pg->coll);
5058
5059 {
5060 uint32_t shard_index = pgid.hash_to_shard(shards.size());
5061 assert(NULL != shards[shard_index]);
5062 store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
5063 }
5064
5065 pg->lock(true);
5066
5067 // we are holding the shard lock
5068 ceph_assert(!pg->is_deleted());
5069
5070 pg->init(
5071 role,
5072 up,
5073 up_primary,
5074 acting,
5075 acting_primary,
5076 info->history,
5077 info->past_intervals,
5078 rctx.transaction);
5079
5080 pg->init_collection_pool_opts();
5081
5082 if (pg->is_primary()) {
5083 std::lock_guard locker{m_perf_queries_lock};
5084 pg->set_dynamic_perf_stats_queries(m_perf_queries);
5085 }
5086
5087 pg->handle_initialize(rctx);
5088 pg->handle_activate_map(rctx);
5089
5090 dispatch_context(rctx, pg.get(), osdmap, nullptr);
5091
5092 dout(10) << __func__ << " new pg " << *pg << dendl;
5093 return pg;
5094}
5095
5096bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
5097 spg_t pgid,
5098 bool is_mon_create)
5099{
5100 const auto max_pgs_per_osd =
5101 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5102 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
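// e.g. with mon_max_pg_per_osd = 250 and osd_max_pg_per_osd_hard_ratio = 3
// (typical defaults), max_pgs_per_osd = 750; creation is withheld once
// this OSD hosts that many PGs.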
5103
5104 if (num_pgs < max_pgs_per_osd) {
5105 return false;
5106 }
5107
5108 std::lock_guard l(pending_creates_lock);
5109 if (is_mon_create) {
5110 pending_creates_from_mon++;
5111 } else {
5112 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
5113 pending_creates_from_osd.emplace(pgid, is_primary);
5114 }
5115 dout(1) << __func__ << " withhold creation of pg " << pgid
5116 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
5117 return true;
5118}
5119
5120 // to re-trigger peering, we have to twiddle the pg mapping a little bit,
5121 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls
5122 // back to the up set if pg_temp is empty, so an empty pg_temp won't work.
5123static vector<int32_t> twiddle(const vector<int>& acting) {
5124 if (acting.size() > 1) {
5125 return {acting[0]};
5126 } else {
5127 vector<int32_t> twiddled(acting.begin(), acting.end());
5128 twiddled.push_back(-1);
5129 return twiddled;
5130 }
5131}
5132
5133void OSD::resume_creating_pg()
5134{
5135 bool do_sub_pg_creates = false;
5136 bool have_pending_creates = false;
5137 {
5138 const auto max_pgs_per_osd =
5139 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
5140 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
5141 if (max_pgs_per_osd <= num_pgs) {
5142 // this could happen if the admin decreases this setting before a PG is removed
5143 return;
5144 }
5145 unsigned spare_pgs = max_pgs_per_osd - num_pgs;
5146 std::lock_guard l(pending_creates_lock);
5147 if (pending_creates_from_mon > 0) {
5148 dout(20) << __func__ << " pending_creates_from_mon "
5149 << pending_creates_from_mon << dendl;
5150 do_sub_pg_creates = true;
5151 if (pending_creates_from_mon >= spare_pgs) {
5152 spare_pgs = pending_creates_from_mon = 0;
5153 } else {
5154 spare_pgs -= pending_creates_from_mon;
5155 pending_creates_from_mon = 0;
5156 }
5157 }
5158 auto pg = pending_creates_from_osd.cbegin();
5159 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
5160 dout(20) << __func__ << " pg " << pg->first << dendl;
5161 vector<int> acting;
5162 get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
5163 service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
5164 pg = pending_creates_from_osd.erase(pg);
5165 do_sub_pg_creates = true;
5166 spare_pgs--;
5167 }
5168 have_pending_creates = (pending_creates_from_mon > 0 ||
5169 !pending_creates_from_osd.empty());
5170 }
5171
5172 bool do_renew_subs = false;
5173 if (do_sub_pg_creates) {
5174 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
5175 dout(4) << __func__ << ": resolicit pg creates from mon since "
5176 << last_pg_create_epoch << dendl;
5177 do_renew_subs = true;
5178 }
5179 }
5180 version_t start = get_osdmap_epoch() + 1;
5181 if (have_pending_creates) {
5182 // don't miss any new osdmap deleting PGs
5183 if (monc->sub_want("osdmap", start, 0)) {
5184 dout(4) << __func__ << ": resolicit osdmap from mon since "
5185 << start << dendl;
5186 do_renew_subs = true;
5187 }
5188 } else if (do_sub_pg_creates) {
5189 // no need to subscribe the osdmap continuously anymore
5190 // once the pgtemp and/or mon_subscribe(pg_creates) is sent
5191 if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
5192 dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
5193 << start << dendl;
5194 do_renew_subs = true;
5195 }
5196 }
5197
5198 if (do_renew_subs) {
5199 monc->renew_subs();
5200 }
5201
5202 service.send_pg_temp();
5203}
5204
5205void OSD::build_initial_pg_history(
5206 spg_t pgid,
5207 epoch_t created,
5208 utime_t created_stamp,
5209 pg_history_t *h,
5210 PastIntervals *pi)
5211{
5212 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
5213 *h = pg_history_t(created, created_stamp);
5214
5215 OSDMapRef lastmap = service.get_map(created);
5216 int up_primary, acting_primary;
5217 vector<int> up, acting;
5218 lastmap->pg_to_up_acting_osds(
5219 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
5220
5221 ostringstream debug;
5222 for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
5223 OSDMapRef osdmap = service.get_map(e);
5224 int new_up_primary, new_acting_primary;
5225 vector<int> new_up, new_acting;
5226 osdmap->pg_to_up_acting_osds(
5227 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
5228
5229 // this is a bit imprecise, but sufficient?
5230 struct min_size_predicate_t : public IsPGRecoverablePredicate {
5231 const pg_pool_t *pi;
5232 bool operator()(const set<pg_shard_t> &have) const {
5233 return have.size() >= pi->min_size;
5234 }
5235 explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
5236 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
5237
5238 bool new_interval = PastIntervals::check_new_interval(
5239 acting_primary,
5240 new_acting_primary,
5241 acting, new_acting,
5242 up_primary,
5243 new_up_primary,
5244 up, new_up,
5245 h->same_interval_since,
5246 h->last_epoch_clean,
5247 osdmap.get(),
5248 lastmap.get(),
5249 pgid.pgid,
5250 min_size_predicate,
5251 pi,
5252 &debug);
5253 if (new_interval) {
5254 h->same_interval_since = e;
5255 if (up != new_up) {
5256 h->same_up_since = e;
5257 }
5258 if (acting_primary != new_acting_primary) {
5259 h->same_primary_since = e;
5260 }
5261 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
5262 osdmap->get_pg_num(pgid.pgid.pool()),
5263 nullptr)) {
5264 h->last_epoch_split = e;
5265 }
5266 up = new_up;
5267 acting = new_acting;
5268 up_primary = new_up_primary;
5269 acting_primary = new_acting_primary;
5270 }
5271 lastmap = osdmap;
5272 }
5273 dout(20) << __func__ << " " << debug.str() << dendl;
5274 dout(10) << __func__ << " " << *h << " " << *pi
5275 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
5276 pi->get_bounds()) << ")"
5277 << dendl;
5278}
5279
5280void OSD::_add_heartbeat_peer(int p)
5281{
5282 if (p == whoami)
5283 return;
5284 HeartbeatInfo *hi;
5285
5286 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5287 if (i == heartbeat_peers.end()) {
5288 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
5289 if (!cons.first)
5290 return;
5291 assert(cons.second);
5292
5293 hi = &heartbeat_peers[p];
5294 hi->peer = p;
5295
5296 auto stamps = service.get_hb_stamps(p);
5297
5298 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5299 sb->peer = p;
5300 sb->stamps = stamps;
5301 hi->hb_interval_start = ceph_clock_now();
5302 hi->con_back = cons.first.get();
5303 hi->con_back->set_priv(sb);
5304
5305 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5306 sf->peer = p;
5307 sf->stamps = stamps;
5308 hi->con_front = cons.second.get();
5309 hi->con_front->set_priv(sf);
5310
5311 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5312 << " " << hi->con_back->get_peer_addr()
5313 << " " << hi->con_front->get_peer_addr()
5314 << dendl;
5315 } else {
5316 hi = &i->second;
5317 }
5318 hi->epoch = get_osdmap_epoch();
5319}
5320
5321void OSD::_remove_heartbeat_peer(int n)
5322{
5323 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5324 ceph_assert(q != heartbeat_peers.end());
5325 dout(20) << " removing heartbeat peer osd." << n
5326 << " " << q->second.con_back->get_peer_addr()
5327 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5328 << dendl;
5329 q->second.clear_mark_down();
5330 heartbeat_peers.erase(q);
5331}
5332
5333void OSD::need_heartbeat_peer_update()
5334{
5335 if (is_stopping())
5336 return;
5337 dout(20) << "need_heartbeat_peer_update" << dendl;
5338 heartbeat_set_peers_need_update();
5339}
5340
5341void OSD::maybe_update_heartbeat_peers()
5342{
5343 ceph_assert(ceph_mutex_is_locked(osd_lock));
5344
5345 if (is_waiting_for_healthy() || is_active()) {
5346 utime_t now = ceph_clock_now();
5347 if (last_heartbeat_resample == utime_t()) {
5348 last_heartbeat_resample = now;
5349 heartbeat_set_peers_need_update();
5350 } else if (!heartbeat_peers_need_update()) {
5351 utime_t dur = now - last_heartbeat_resample;
5352 if (dur > cct->_conf->osd_heartbeat_grace) {
5353 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
5354 heartbeat_set_peers_need_update();
5355 last_heartbeat_resample = now;
5356 // automatically clean up any stale heartbeat peers
5357 // if we are unhealthy, then clean all
5358 reset_heartbeat_peers(is_waiting_for_healthy());
5359 }
5360 }
5361 }
5362
5363 if (!heartbeat_peers_need_update())
5364 return;
5365 heartbeat_clear_peers_need_update();
5366
5367 std::lock_guard l(heartbeat_lock);
5368
5369 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
5370
5371
5372 // build heartbeat from set
5373 if (is_active()) {
5374 vector<PGRef> pgs;
5375 _get_pgs(&pgs);
5376 for (auto& pg : pgs) {
5377 pg->with_heartbeat_peers([&](int peer) {
5378 if (get_osdmap()->is_up(peer)) {
5379 _add_heartbeat_peer(peer);
5380 }
5381 });
5382 }
5383 }
5384
5385 // include next and previous up osds to ensure we have a fully-connected set
5386 set<int> want, extras;
5387 const int next = get_osdmap()->get_next_up_osd_after(whoami);
5388 if (next >= 0)
5389 want.insert(next);
5390 int prev = get_osdmap()->get_previous_up_osd_before(whoami);
5391 if (prev >= 0 && prev != next)
5392 want.insert(prev);
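// e.g. with up OSDs {1, 4, 9} and whoami == 4: next == 9 and prev == 1,
// so both ring neighbors land in the want set.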
5393
5394 // make sure we have at least **min_down** osds coming from different
5395 // subtrees at the configured level (e.g., hosts) for fast failure detection.
5396 auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
5397 auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
5398 auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
5399 get_osdmap()->get_random_up_osds_by_subtree(
5400 whoami, subtree, limit, want, &want);
5401
5402 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
5403 dout(10) << " adding neighbor peer osd." << *p << dendl;
5404 extras.insert(*p);
5405 _add_heartbeat_peer(*p);
5406 }
5407
5408 // remove down peers; enumerate extras
5409 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5410 while (p != heartbeat_peers.end()) {
5411 if (!get_osdmap()->is_up(p->first)) {
5412 int o = p->first;
5413 ++p;
5414 _remove_heartbeat_peer(o);
5415 continue;
5416 }
5417 if (p->second.epoch < get_osdmap_epoch()) {
5418 extras.insert(p->first);
5419 }
5420 ++p;
5421 }
5422
5423 // too few?
5424 for (int n = next; n >= 0; ) {
5425 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
5426 break;
5427 if (!extras.count(n) && !want.count(n) && n != whoami) {
5428 dout(10) << " adding random peer osd." << n << dendl;
5429 extras.insert(n);
5430 _add_heartbeat_peer(n);
5431 }
5432 n = get_osdmap()->get_next_up_osd_after(n);
5433 if (n == next)
5434 break; // came full circle; stop
5435 }
5436
5437 // too many?
5438 for (set<int>::iterator p = extras.begin();
5439 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
5440 ++p) {
5441 if (want.count(*p))
5442 continue;
5443 _remove_heartbeat_peer(*p);
5444 }
5445
5446 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
5447
5448 // clean up stale failure pending
5449 for (auto it = failure_pending.begin(); it != failure_pending.end();) {
5450 if (heartbeat_peers.count(it->first) == 0) {
5451 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
5452 failure_pending.erase(it++);
5453 } else {
5454 it++;
5455 }
5456 }
5457}
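// [Editor's note] The selection above merges three peer sources -- PG peers,
// ring neighbours (the next/previous up OSDs), and random OSDs drawn from
// distinct failure-domain subtrees -- then pads or trims toward
// osd_heartbeat_min_peers. Below is a minimal standalone sketch of just the
// "too few?" ring walk, simplified to a single peer set; next_up_after() and
// the vector of up ids are hypothetical stand-ins for the OSDMap helpers,
// not this file's API.
#if 0
#include <set>
#include <vector>
#include <cstddef>

// next up osd id after n in a sorted ring of up ids (wraps around)
static int next_up_after(const std::vector<int>& up, int n) {
  for (int id : up)
    if (id > n)
      return id;
  return up.empty() ? -1 : up.front();
}

static void fill_to_min(const std::vector<int>& up, int whoami,
                        std::size_t min_peers, std::set<int>* peers) {
  const int start = next_up_after(up, whoami);
  for (int n = start; n >= 0 && peers->size() < min_peers; ) {
    if (n != whoami)
      peers->insert(n);          // the set makes re-adds harmless
    n = next_up_after(up, n);
    if (n == start)
      break;                     // came full circle; stop
  }
}
#endif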
5458
5459void OSD::reset_heartbeat_peers(bool all)
5460{
5461 ceph_assert(ceph_mutex_is_locked(osd_lock));
5462 dout(10) << "reset_heartbeat_peers" << dendl;
5463 utime_t stale = ceph_clock_now();
5464 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5465 std::lock_guard l(heartbeat_lock);
5466 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5467 auto& [peer, hi] = *it;
5468 if (all || hi.is_stale(stale)) {
5469 hi.clear_mark_down();
5470 // stop sending failure_report to mon too
5471 failure_queue.erase(peer);
5472 failure_pending.erase(peer);
5473 it = heartbeat_peers.erase(it);
5474 } else {
5475 ++it;
5476 }
5477 }
5478}
5479
5480void OSD::handle_osd_ping(MOSDPing *m)
5481{
5482 if (superblock.cluster_fsid != m->fsid) {
5483 dout(20) << "handle_osd_ping from " << m->get_source_inst()
5484 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
5485 << dendl;
5486 m->put();
5487 return;
5488 }
5489
5490 int from = m->get_source().num();
5491
5492 heartbeat_lock.lock();
5493 if (is_stopping()) {
5494 heartbeat_lock.unlock();
5495 m->put();
5496 return;
5497 }
5498
5499 utime_t now = ceph_clock_now();
5500 auto mnow = service.get_mnow();
5501 ConnectionRef con(m->get_connection());
5502 OSDMapRef curmap = service.get_osdmap();
5503 if (!curmap) {
5504 heartbeat_lock.unlock();
5505 m->put();
5506 return;
5507 }
5508
5509 auto sref = con->get_priv();
5510 Session *s = static_cast<Session*>(sref.get());
5511 if (!s) {
5512 heartbeat_lock.unlock();
5513 m->put();
5514 return;
5515 }
5516 if (!s->stamps) {
5517 s->peer = from;
5518 s->stamps = service.get_hb_stamps(from);
5519 }
5520
5521 switch (m->op) {
5522
5523 case MOSDPing::PING:
5524 {
5525 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
5526 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
5527 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
5528 if (heartbeat_drop->second == 0) {
5529 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
5530 } else {
5531 --heartbeat_drop->second;
5532 dout(5) << "Dropping heartbeat from " << from
5533 << ", " << heartbeat_drop->second
5534 << " remaining to drop" << dendl;
5535 break;
5536 }
5537 } else if (cct->_conf->osd_debug_drop_ping_probability >
5538 ((((double)(rand()%100))/100.0))) {
5539 heartbeat_drop =
5540 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
5541 cct->_conf->osd_debug_drop_ping_duration)).first;
5542 dout(5) << "Dropping heartbeat from " << from
5543 << ", " << heartbeat_drop->second
5544 << " remaining to drop" << dendl;
5545 break;
5546 }
5547 }
5548
5549 ceph::signedspan sender_delta_ub{};
5550 s->stamps->got_ping(
5551 m->up_from,
5552 mnow,
5553 m->mono_send_stamp,
5554 m->delta_ub,
5555 &sender_delta_ub);
5556 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5557
5558 if (!cct->get_heartbeat_map()->is_healthy()) {
5559 dout(10) << "internal heartbeat not healthy, dropping ping request"
5560 << dendl;
5561 break;
5562 }
5563
5564 Message *r = new MOSDPing(monc->get_fsid(),
5565 curmap->get_epoch(),
5566 MOSDPing::PING_REPLY,
5567 m->ping_stamp,
5568 m->mono_ping_stamp,
5569 mnow,
5570 service.get_up_epoch(),
5571 cct->_conf->osd_heartbeat_min_size,
5572 sender_delta_ub);
5573 con->send_message(r);
5574
5575 if (curmap->is_up(from)) {
5576 if (is_active()) {
5577 ConnectionRef cluster_con = service.get_con_osd_cluster(
5578 from, curmap->get_epoch());
5579 if (cluster_con) {
5580 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5581 }
5582 }
5583 } else if (!curmap->exists(from) ||
5584 curmap->get_down_at(from) > m->map_epoch) {
5585 // tell them they have died
5586 Message *r = new MOSDPing(monc->get_fsid(),
5587 curmap->get_epoch(),
5588 MOSDPing::YOU_DIED,
5589 m->ping_stamp,
5590 m->mono_ping_stamp,
5591 mnow,
5592 service.get_up_epoch(),
5593 cct->_conf->osd_heartbeat_min_size);
5594 con->send_message(r);
5595 }
5596 }
5597 break;
5598
5599 case MOSDPing::PING_REPLY:
5600 {
5601 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
5602 if (i != heartbeat_peers.end()) {
5603 auto acked = i->second.ping_history.find(m->ping_stamp);
5604 if (acked != i->second.ping_history.end()) {
5605 int &unacknowledged = acked->second.second;
5606 if (con == i->second.con_back) {
5607 dout(25) << "handle_osd_ping got reply from osd." << from
5608 << " first_tx " << i->second.first_tx
5609 << " last_tx " << i->second.last_tx
5610 << " last_rx_back " << i->second.last_rx_back
5611 << " -> " << now
5612 << " last_rx_front " << i->second.last_rx_front
5613 << dendl;
5614 i->second.last_rx_back = now;
5615 ceph_assert(unacknowledged > 0);
5616 --unacknowledged;
5617 // if there is no front con, set both stamps.
5618 if (i->second.con_front == NULL) {
5619 i->second.last_rx_front = now;
5620 ceph_assert(unacknowledged > 0);
5621 --unacknowledged;
5622 }
5623 } else if (con == i->second.con_front) {
5624 dout(25) << "handle_osd_ping got reply from osd." << from
5625 << " first_tx " << i->second.first_tx
5626 << " last_tx " << i->second.last_tx
5627 << " last_rx_back " << i->second.last_rx_back
5628 << " last_rx_front " << i->second.last_rx_front
5629 << " -> " << now
5630 << dendl;
5631 i->second.last_rx_front = now;
5632 ceph_assert(unacknowledged > 0);
5633 --unacknowledged;
5634 }
5635
5636 if (unacknowledged == 0) {
5637 // succeeded in getting all replies
5638 dout(25) << "handle_osd_ping got all replies from osd." << from
5639		 << ", erase pending ping (sent at " << m->ping_stamp << ")"
5640 << " and older pending ping(s)"
5641 << dendl;
5642
5643#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
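	  // [Editor's note] ROUND_S_TO_USEC takes a second count (the utime_t
	  // differences below convert implicitly to double seconds) and yields
	  // microseconds rounded to nearest rather than truncated, e.g.
	  // ROUND_S_TO_USEC(0.0123456) == 12346.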
5644 ++i->second.hb_average_count;
5645 uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
5646 i->second.hb_total_back += back_pingtime;
5647 if (back_pingtime < i->second.hb_min_back)
5648 i->second.hb_min_back = back_pingtime;
5649 if (back_pingtime > i->second.hb_max_back)
5650 i->second.hb_max_back = back_pingtime;
5651 uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
5652 i->second.hb_total_front += front_pingtime;
5653 if (front_pingtime < i->second.hb_min_front)
5654 i->second.hb_min_front = front_pingtime;
5655 if (front_pingtime > i->second.hb_max_front)
5656 i->second.hb_max_front = front_pingtime;
5657
5658 ceph_assert(i->second.hb_interval_start != utime_t());
5659	    if (i->second.hb_interval_start == utime_t())  // defensive; unreachable after the assert above
5660	      i->second.hb_interval_start = now;
5661 int64_t hb_avg_time_period = 60;
5662 if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
5663 hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
5664 }
5665 if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
5666 uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
5667 uint32_t back_min = i->second.hb_min_back;
5668 uint32_t back_max = i->second.hb_max_back;
5669 uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
5670 uint32_t front_min = i->second.hb_min_front;
5671 uint32_t front_max = i->second.hb_max_front;
5672
5673 // Reset for new interval
5674 i->second.hb_average_count = 0;
5675 i->second.hb_interval_start = now;
5676 i->second.hb_total_back = i->second.hb_max_back = 0;
5677 i->second.hb_min_back = UINT_MAX;
5678 i->second.hb_total_front = i->second.hb_max_front = 0;
5679 i->second.hb_min_front = UINT_MAX;
5680
5681	    // Record per-osd, per-interface ping times
5682	    // Based on osd_heartbeat_interval, ignoring that each actual interval is randomly shorter than that
5683 if (i->second.hb_back_pingtime.size() == 0) {
5684 ceph_assert(i->second.hb_front_pingtime.size() == 0);
5685 for (unsigned k = 0 ; k < hb_vector_size; ++k) {
5686 i->second.hb_back_pingtime.push_back(back_avg);
5687 i->second.hb_back_min.push_back(back_min);
5688 i->second.hb_back_max.push_back(back_max);
5689 i->second.hb_front_pingtime.push_back(front_avg);
5690 i->second.hb_front_min.push_back(front_min);
5691 i->second.hb_front_max.push_back(front_max);
5692 ++i->second.hb_index;
5693 }
5694 } else {
5695 int index = i->second.hb_index & (hb_vector_size - 1);
5696 i->second.hb_back_pingtime[index] = back_avg;
5697 i->second.hb_back_min[index] = back_min;
5698 i->second.hb_back_max[index] = back_max;
5699 i->second.hb_front_pingtime[index] = front_avg;
5700 i->second.hb_front_min[index] = front_min;
5701 i->second.hb_front_max[index] = front_max;
5702 ++i->second.hb_index;
5703 }
5704
5705 {
5706 std::lock_guard l(service.stat_lock);
5707 service.osd_stat.hb_pingtime[from].last_update = now.sec();
5708 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5709
5710 uint32_t total = 0;
5711 uint32_t min = UINT_MAX;
5712 uint32_t max = 0;
5713 uint32_t count = 0;
5714 uint32_t which = 0;
5715 uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
5716 for (int32_t k = size - 1 ; k >= 0; --k) {
5717 ++count;
5718 int index = (i->second.hb_index + k) % size;
5719 total += i->second.hb_back_pingtime[index];
5720 if (i->second.hb_back_min[index] < min)
5721 min = i->second.hb_back_min[index];
5722 if (i->second.hb_back_max[index] > max)
5723 max = i->second.hb_back_max[index];
5724 if (count == 1 || count == 5 || count == 15) {
5725 service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
5726 service.osd_stat.hb_pingtime[from].back_min[which] = min;
5727 service.osd_stat.hb_pingtime[from].back_max[which] = max;
5728 which++;
5729 if (count == 15)
5730 break;
5731 }
5732 }
5733
5734 if (i->second.con_front != NULL) {
5735 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5736
5737 total = 0;
5738 min = UINT_MAX;
5739 max = 0;
5740 count = 0;
5741 which = 0;
5742 for (int32_t k = size - 1 ; k >= 0; --k) {
5743 ++count;
5744 int index = (i->second.hb_index + k) % size;
5745 total += i->second.hb_front_pingtime[index];
5746 if (i->second.hb_front_min[index] < min)
5747 min = i->second.hb_front_min[index];
5748 if (i->second.hb_front_max[index] > max)
5749 max = i->second.hb_front_max[index];
5750 if (count == 1 || count == 5 || count == 15) {
5751 service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
5752 service.osd_stat.hb_pingtime[from].front_min[which] = min;
5753 service.osd_stat.hb_pingtime[from].front_max[which] = max;
5754 which++;
5755 if (count == 15)
5756 break;
5757 }
5758 }
5759 }
5760 }
5761 } else {
5762 std::lock_guard l(service.stat_lock);
5763 service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
5764 if (i->second.con_front != NULL)
5765 service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
5766 }
5767 i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
5768 }
5769
5770 if (i->second.is_healthy(now)) {
5771 // Cancel false reports
5772 auto failure_queue_entry = failure_queue.find(from);
5773 if (failure_queue_entry != failure_queue.end()) {
5774 dout(10) << "handle_osd_ping canceling queued "
5775 << "failure report for osd." << from << dendl;
5776 failure_queue.erase(failure_queue_entry);
5777 }
5778
5779 auto failure_pending_entry = failure_pending.find(from);
5780 if (failure_pending_entry != failure_pending.end()) {
5781 dout(10) << "handle_osd_ping canceling in-flight "
5782 << "failure report for osd." << from << dendl;
5783 send_still_alive(curmap->get_epoch(),
5784 from,
5785 failure_pending_entry->second.second);
5786 failure_pending.erase(failure_pending_entry);
5787 }
5788 }
5789 } else {
5790      // old reply, already superseded by newer pings
5791      dout(10) << "handle_osd_ping no pending ping (sent at " << m->ping_stamp
5792	       << ") found; treating it as superseded by newer pings "
5793	       << "and ignoring"
5794 << dendl;
5795 }
5796 }
5797
5798 if (m->map_epoch &&
5799 curmap->is_up(from)) {
5800 if (is_active()) {
5801 ConnectionRef cluster_con = service.get_con_osd_cluster(
5802 from, curmap->get_epoch());
5803 if (cluster_con) {
5804 service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
5805 }
5806 }
5807 }
5808
5809 s->stamps->got_ping_reply(
5810 mnow,
5811 m->mono_send_stamp,
5812 m->delta_ub);
5813 dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
5814 }
5815 break;
5816
5817 case MOSDPing::YOU_DIED:
5818 dout(10) << "handle_osd_ping " << m->get_source_inst()
5819 << " says i am down in " << m->map_epoch << dendl;
5820 osdmap_subscribe(curmap->get_epoch()+1, false);
5821 break;
5822 }
5823
5824 heartbeat_lock.unlock();
5825 m->put();
5826}
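// [Editor's note] The 1/5/15-interval ping statistics above walk a fixed
// ring of per-interval averages backwards from the most recent slot
// (hb_index points one past it). A standalone sketch of that traversal,
// simplified to a single series (the real code keeps separate avg/min/max
// rings per interface); all names here are illustrative:
#if 0
#include <climits>
#include <cstdint>
#include <vector>

struct WindowStat { uint32_t avg, min, max; };

// Emit aggregates over the trailing 1, 5 and 15 ring entries.
static std::vector<WindowStat> trailing_windows(
    const std::vector<uint32_t>& ring, uint32_t next_write_index)
{
  std::vector<WindowStat> out;
  uint32_t total = 0, min = UINT_MAX, max = 0, count = 0;
  const uint32_t size = (uint32_t)ring.size();
  for (int32_t k = size - 1; k >= 0; --k) {
    ++count;
    const uint32_t v = ring[(next_write_index + k) % size];
    total += v;
    if (v < min) min = v;
    if (v > max) max = v;
    if (count == 1 || count == 5 || count == 15) {
      out.push_back({total / count, min, max});
      if (count == 15)
        break;
    }
  }
  return out;
}
#endif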
5827
5828void OSD::heartbeat_entry()
5829{
5830 std::unique_lock l(heartbeat_lock);
5831 if (is_stopping())
5832 return;
5833 while (!heartbeat_stop) {
5834 heartbeat();
5835
5836 double wait;
5837 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5838 wait = (float)cct->_conf->osd_heartbeat_interval;
5839 } else {
5840 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5841 }
5842 auto w = ceph::make_timespan(wait);
5843 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5844 heartbeat_cond.wait_for(l, w);
5845 if (is_stopping())
5846 return;
5847 dout(30) << "heartbeat_entry woke up" << dendl;
5848 }
5849}
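// [Editor's note] The sleep above is deliberately jittered: a fixed 0.5 s
// floor plus a random multiple of 10% (0%..90%) of osd_heartbeat_interval,
// so peers do not ping in lockstep. A standalone sketch; with a 6 s
// interval the result is one of 0.5, 1.1, 1.7, ..., 5.9 seconds:
#if 0
#include <cstdlib>

static double jittered_wait(double heartbeat_interval_s) {
  return 0.5 + ((double)(rand() % 10) / 10.0) * heartbeat_interval_s;
}
#endif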
5850
5851void OSD::heartbeat_check()
5852{
5853 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5854 utime_t now = ceph_clock_now();
5855
5856 // check for incoming heartbeats (move me elsewhere?)
5857 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5858 p != heartbeat_peers.end();
5859 ++p) {
5860
5861 if (p->second.first_tx == utime_t()) {
5862 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5863 << " yet, skipping" << dendl;
5864 continue;
5865 }
5866
5867 dout(25) << "heartbeat_check osd." << p->first
5868 << " first_tx " << p->second.first_tx
5869 << " last_tx " << p->second.last_tx
5870 << " last_rx_back " << p->second.last_rx_back
5871 << " last_rx_front " << p->second.last_rx_front
5872 << dendl;
5873 if (p->second.is_unhealthy(now)) {
5874 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5875 if (p->second.last_rx_back == utime_t() ||
5876 p->second.last_rx_front == utime_t()) {
5877 derr << "heartbeat_check: no reply from "
5878 << p->second.con_front->get_peer_addr().get_sockaddr()
5879 << " osd." << p->first
5880 << " ever on either front or back, first ping sent "
5881 << p->second.first_tx
5882 << " (oldest deadline " << oldest_deadline << ")"
5883 << dendl;
5884 // fail
5885 failure_queue[p->first] = p->second.first_tx;
5886 } else {
5887 derr << "heartbeat_check: no reply from "
5888 << p->second.con_front->get_peer_addr().get_sockaddr()
5889 << " osd." << p->first << " since back " << p->second.last_rx_back
5890 << " front " << p->second.last_rx_front
5891 << " (oldest deadline " << oldest_deadline << ")"
5892 << dendl;
5893 // fail
5894 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5895 }
5896 }
5897 }
5898}
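// [Editor's note] ping_history, filled in by heartbeat() below and drained
// by the PING_REPLY handling above, is keyed by send stamp and stores
// (deadline, number of connections still expected to ack). A reduced sketch
// of the shape and of the overdue test used above; 'double' stands in for
// utime_t and the names are illustrative:
#if 0
#include <map>
#include <utility>

using PingHistory = std::map<double /*sent*/,
                             std::pair<double /*deadline*/, int /*unacked*/>>;

// A peer is overdue once the deadline of its oldest outstanding ping has
// passed; fully acked entries have already been erased from the map.
static bool is_overdue(const PingHistory& h, double now) {
  return !h.empty() && now > h.begin()->second.first;
}
#endif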
5899
5900void OSD::heartbeat()
5901{
5902 ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
5903 dout(30) << "heartbeat" << dendl;
5904
5905 auto load_for_logger = service.get_scrub_services().update_load_average();
5906 if (load_for_logger) {
5907 logger->set(l_osd_loadavg, load_for_logger.value());
5908 }
5909 dout(30) << "heartbeat checking stats" << dendl;
5910
5911 // refresh peer list and osd stats
5912 vector<int> hb_peers;
5913 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5914 p != heartbeat_peers.end();
5915 ++p)
5916 hb_peers.push_back(p->first);
5917
5918 auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
5919 dout(5) << __func__ << " " << new_stat << dendl;
5920 ceph_assert(new_stat.statfs.total);
5921
5922 float pratio;
5923 float ratio = service.compute_adjusted_ratio(new_stat, &pratio);
5924
5925 service.check_full_status(ratio, pratio);
5926
5927 utime_t now = ceph_clock_now();
5928 auto mnow = service.get_mnow();
5929 utime_t deadline = now;
5930 deadline += cct->_conf->osd_heartbeat_grace;
5931
5932 // send heartbeats
5933 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5934 i != heartbeat_peers.end();
5935 ++i) {
5936 int peer = i->first;
5937 Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
5938 if (!s) {
5939 dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
5940 continue;
5941 }
5942 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5943
5944 i->second.last_tx = now;
5945 if (i->second.first_tx == utime_t())
5946 i->second.first_tx = now;
5947 i->second.ping_history[now] = make_pair(deadline,
5948 HeartbeatInfo::HEARTBEAT_MAX_CONN);
5949 if (i->second.hb_interval_start == utime_t())
5950 i->second.hb_interval_start = now;
5951
5952 std::optional<ceph::signedspan> delta_ub;
5953 s->stamps->sent_ping(&delta_ub);
5954
5955 i->second.con_back->send_message(
5956 new MOSDPing(monc->get_fsid(),
5957 service.get_osdmap_epoch(),
5958 MOSDPing::PING,
5959 now,
5960 mnow,
5961 mnow,
5962 service.get_up_epoch(),
5963 cct->_conf->osd_heartbeat_min_size,
5964 delta_ub));
5965
5966 if (i->second.con_front)
5967 i->second.con_front->send_message(
5968 new MOSDPing(monc->get_fsid(),
5969 service.get_osdmap_epoch(),
5970 MOSDPing::PING,
5971 now,
5972 mnow,
5973 mnow,
5974 service.get_up_epoch(),
5975 cct->_conf->osd_heartbeat_min_size,
5976 delta_ub));
5977 }
5978
5979 logger->set(l_osd_hb_to, heartbeat_peers.size());
5980
5981 // hmm.. am i all alone?
5982 dout(30) << "heartbeat lonely?" << dendl;
5983 if (heartbeat_peers.empty()) {
5984 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5985 last_mon_heartbeat = now;
5986 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5987 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5988 }
5989 }
5990
5991 dout(30) << "heartbeat done" << dendl;
5992}
5993
5994bool OSD::heartbeat_reset(Connection *con)
5995{
5996 std::lock_guard l(heartbeat_lock);
5997 auto s = con->get_priv();
5998 dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
5999 con->set_priv(nullptr);
6000 if (s) {
6001 if (is_stopping()) {
6002 return true;
6003 }
6004 auto session = static_cast<Session*>(s.get());
6005 auto p = heartbeat_peers.find(session->peer);
6006 if (p != heartbeat_peers.end() &&
6007 (p->second.con_back == con ||
6008 p->second.con_front == con)) {
6009 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
6010 << ", reopening" << dendl;
6011 p->second.clear_mark_down(con);
6012 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
6013 if (newcon.first) {
6014 p->second.con_back = newcon.first.get();
6015 p->second.con_back->set_priv(s);
6016 if (newcon.second) {
6017 p->second.con_front = newcon.second.get();
6018 p->second.con_front->set_priv(s);
6019 }
6020 p->second.ping_history.clear();
6021 } else {
6022 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
6023 << ", raced with osdmap update, closing out peer" << dendl;
6024 heartbeat_peers.erase(p);
6025 }
6026 } else {
6027 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
6028 }
6029 }
6030 return true;
6031}
6032
6033
6034
6035// =========================================
6036
6037void OSD::tick()
6038{
6039 ceph_assert(ceph_mutex_is_locked(osd_lock));
6040 dout(10) << "tick" << dendl;
6041
6042 utime_t now = ceph_clock_now();
6043 // throw out any obsolete markdown log
6044 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
6045 while (!osd_markdown_log.empty() &&
6046 osd_markdown_log.front() + grace < now)
6047 osd_markdown_log.pop_front();
6048
6049 if (is_active() || is_waiting_for_healthy()) {
6050 maybe_update_heartbeat_peers();
6051 }
6052
6053 if (is_waiting_for_healthy()) {
6054 start_boot();
6055 }
6056
6057 if (is_waiting_for_healthy() || is_booting()) {
6058 std::lock_guard l(heartbeat_lock);
6059 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
6060 last_mon_heartbeat = now;
6061 dout(1) << __func__ << " checking mon for new map" << dendl;
6062 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6063 }
6064 }
6065
6066 do_waiters();
6067
6068  // scrub purged_snaps periodically (every osd_scrub_min_interval, with randomized jitter)
6069 {
6070 const utime_t last = superblock.last_purged_snaps_scrub;
6071 utime_t next = last;
6072 next += cct->_conf->osd_scrub_min_interval;
6073 std::mt19937 rng;
6074 // use a seed that is stable for each scrub interval, but varies
6075    // by OSD to avoid thundering herds.
6076 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
6077 double r = (rng() % 1024) / 1024.0;
6078 next +=
6079 cct->_conf->osd_scrub_min_interval *
6080 cct->_conf->osd_scrub_interval_randomize_ratio * r;
6081 if (next < ceph_clock_now()) {
6082 dout(20) << __func__ << " last_purged_snaps_scrub " << last
6083 << " next " << next << " ... now" << dendl;
6084 scrub_purged_snaps();
6085 } else {
6086 dout(20) << __func__ << " last_purged_snaps_scrub " << last
6087 << " next " << next << dendl;
6088 }
6089 }
6090
6091 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
6092}
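// [Editor's note] The purged_snaps scheduling in tick() above derives its
// jitter from a seed that depends only on (whoami, last scrub stamp), so
// every tick recomputes the same 'next' time until a scrub actually runs.
// A standalone sketch of that computation (names illustrative):
#if 0
#include <cstdint>
#include <random>

static double next_scrub_after(int whoami, uint32_t last_scrub_sec,
                               double min_interval_s,
                               double randomize_ratio)
{
  std::mt19937 rng;
  rng.seed(whoami + last_scrub_sec);         // stable per scrub interval
  const double r = (rng() % 1024) / 1024.0;  // in [0, 1)
  return last_scrub_sec +
         min_interval_s +
         min_interval_s * randomize_ratio * r;
}
#endif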
6093
6094void OSD::tick_without_osd_lock()
6095{
6096 ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
6097 dout(10) << "tick_without_osd_lock" << dendl;
6098
6099 logger->set(l_osd_cached_crc, ceph::buffer::get_cached_crc());
6100 logger->set(l_osd_cached_crc_adjusted, ceph::buffer::get_cached_crc_adjusted());
6101 logger->set(l_osd_missed_crc, ceph::buffer::get_missed_crc());
6102
6103 // refresh osd stats
6104 struct store_statfs_t stbuf;
6105 osd_alert_list_t alerts;
6106 int r = store->statfs(&stbuf, &alerts);
6107 ceph_assert(r == 0);
6108 service.set_statfs(stbuf, alerts);
6109
6110 // osd_lock is not being held, which means the OSD state
6111 // might change when doing the monitor report
6112 if (is_active() || is_waiting_for_healthy()) {
6113 {
6114 std::lock_guard l{heartbeat_lock};
6115 heartbeat_check();
6116 }
6117 map_lock.lock_shared();
6118 std::lock_guard l(mon_report_lock);
6119
6120 // mon report?
6121 utime_t now = ceph_clock_now();
6122 if (service.need_fullness_update() ||
6123 now - last_mon_report > cct->_conf->osd_mon_report_interval) {
6124 last_mon_report = now;
6125 send_full_update();
6126 send_failures();
6127 }
6128 map_lock.unlock_shared();
6129
6130 epoch_t max_waiting_epoch = 0;
6131 for (auto s : shards) {
6132 max_waiting_epoch = std::max(max_waiting_epoch,
6133 s->get_max_waiting_epoch());
6134 }
6135 if (max_waiting_epoch > get_osdmap()->get_epoch()) {
6136 dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
6137 << ", requesting new map" << dendl;
6138 osdmap_subscribe(superblock.newest_map + 1, false);
6139 }
6140 }
6141
6142 if (is_active()) {
6143 if (!scrub_random_backoff()) {
6144 sched_scrub();
6145 }
6146 service.promote_throttle_recalibrate();
6147 resume_creating_pg();
6148 bool need_send_beacon = false;
6149 const auto now = ceph::coarse_mono_clock::now();
6150 {
6151      // borrow the lec lock to protect last_sent_beacon from changing
6152 std::lock_guard l{min_last_epoch_clean_lock};
6153 const auto elapsed = now - last_sent_beacon;
6154 if (std::chrono::duration_cast<std::chrono::seconds>(elapsed).count() >
6155 cct->_conf->osd_beacon_report_interval) {
6156 need_send_beacon = true;
6157 }
6158 }
6159 if (need_send_beacon) {
6160 send_beacon(now);
6161 }
6162 }
6163
6164 mgrc.update_daemon_health(get_health_metrics());
6165 service.kick_recovery_queue();
6166 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
6167 new C_Tick_WithoutOSDLock(this));
6168}
6169
6170// Usage:
6171// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
6172// rmomapkey <pool-id> [namespace/]<obj-name> <key>
6173// setomapheader <pool-id> [namespace/]<obj-name> <header>
6174// getomap <pool> [namespace/]<obj-name>
6175// truncobj <pool-id> [namespace/]<obj-name> <newlen>
6176// injectmdataerr [namespace/]<obj-name> [shardid]
6177// injectdataerr [namespace/]<obj-name> [shardid]
6178//
6179// set_recovery_delay [utime]
6180void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
6181 std::string_view command,
6182 const cmdmap_t& cmdmap, ostream &ss)
6183{
6184 //Test support
6185 //Support changing the omap on a single osd by using the Admin Socket to
6186  //directly request that the osd make a change.
6187 if (command == "setomapval" || command == "rmomapkey" ||
6188 command == "setomapheader" || command == "getomap" ||
6189 command == "truncobj" || command == "injectmdataerr" ||
6190 command == "injectdataerr"
6191 ) {
6192 pg_t rawpg;
6193 int64_t pool;
6194 OSDMapRef curmap = service->get_osdmap();
6195 int r = -1;
6196
6197 string poolstr;
6198
6199 cmd_getval(cmdmap, "pool", poolstr);
6200 pool = curmap->lookup_pg_pool_name(poolstr);
6201    //If we can't find it by name then maybe an id was specified
6202 if (pool < 0 && isdigit(poolstr[0]))
6203 pool = atoll(poolstr.c_str());
6204 if (pool < 0) {
6205      ss << "Invalid pool '" << poolstr << "'";
6206 return;
6207 }
6208
6209 string objname, nspace;
6210 cmd_getval(cmdmap, "objname", objname);
6211 std::size_t found = objname.find_first_of('/');
6212 if (found != string::npos) {
6213 nspace = objname.substr(0, found);
6214 objname = objname.substr(found+1);
6215 }
6216 object_locator_t oloc(pool, nspace);
6217 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
6218
6219 if (r < 0) {
6220 ss << "Invalid namespace/objname";
6221 return;
6222 }
6223
6224 int64_t shardid = cmd_getval_or<int64_t>(cmdmap, "shardid", shard_id_t::NO_SHARD);
6225 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
6226 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
6227 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
6228 if (curmap->pg_is_ec(rawpg)) {
6229 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
6230        ss << "Must not call this on an ec pool, except for injectdataerr or injectmdataerr";
6231 return;
6232 }
6233 }
6234
6235 ObjectStore::Transaction t;
6236
6237 if (command == "setomapval") {
6238 map<string, bufferlist> newattrs;
6239 bufferlist val;
6240 string key, valstr;
6241 cmd_getval(cmdmap, "key", key);
6242 cmd_getval(cmdmap, "val", valstr);
6243
6244 val.append(valstr);
6245 newattrs[key] = val;
6246 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
6247 r = store->queue_transaction(service->meta_ch, std::move(t));
6248 if (r < 0)
6249 ss << "error=" << r;
6250 else
6251 ss << "ok";
6252 } else if (command == "rmomapkey") {
6253 string key;
6254 cmd_getval(cmdmap, "key", key);
6255
6256 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
6257 r = store->queue_transaction(service->meta_ch, std::move(t));
6258 if (r < 0)
6259 ss << "error=" << r;
6260 else
6261 ss << "ok";
6262 } else if (command == "setomapheader") {
6263 bufferlist newheader;
6264 string headerstr;
6265
6266 cmd_getval(cmdmap, "header", headerstr);
6267 newheader.append(headerstr);
6268 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6269 r = store->queue_transaction(service->meta_ch, std::move(t));
6270 if (r < 0)
6271 ss << "error=" << r;
6272 else
6273 ss << "ok";
6274 } else if (command == "getomap") {
6275 //Debug: Output entire omap
6276 bufferlist hdrbl;
6277 map<string, bufferlist> keyvals;
6278 auto ch = store->open_collection(coll_t(pgid));
6279 if (!ch) {
6280 ss << "unable to open collection for " << pgid;
6281 r = -ENOENT;
6282 } else {
6283 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6284 if (r >= 0) {
6285 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6286 for (map<string, bufferlist>::iterator it = keyvals.begin();
6287 it != keyvals.end(); ++it)
6288 ss << " key=" << (*it).first << " val="
6289 << string((*it).second.c_str(), (*it).second.length());
6290 } else {
6291 ss << "error=" << r;
6292 }
6293 }
6294 } else if (command == "truncobj") {
6295 int64_t trunclen;
6296 cmd_getval(cmdmap, "len", trunclen);
6297 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6298 r = store->queue_transaction(service->meta_ch, std::move(t));
6299 if (r < 0)
6300 ss << "error=" << r;
6301 else
6302 ss << "ok";
6303 } else if (command == "injectdataerr") {
6304 store->inject_data_error(gobj);
6305 ss << "ok";
6306 } else if (command == "injectmdataerr") {
6307 store->inject_mdata_error(gobj);
6308 ss << "ok";
6309 }
6310 return;
6311 }
6312 if (command == "set_recovery_delay") {
6313 int64_t delay = cmd_getval_or<int64_t>(cmdmap, "utime", 0);
6314 ostringstream oss;
6315 oss << delay;
6316 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6317 oss.str().c_str());
6318 if (r != 0) {
6319 ss << "set_recovery_delay: error setting "
6320 << "osd_recovery_delay_start to '" << delay << "': error "
6321 << r;
6322 return;
6323 }
6324 service->cct->_conf.apply_changes(nullptr);
6325 ss << "set_recovery_delay: set osd_recovery_delay_start "
6326 << "to " << service->cct->_conf->osd_recovery_delay_start;
6327 return;
6328 }
6329 if (command == "injectfull") {
6330 int64_t count = cmd_getval_or<int64_t>(cmdmap, "count", -1);
6331 string type = cmd_getval_or<string>(cmdmap, "type", "full");
6332 OSDService::s_names state;
6333
6334 if (type == "none" || count == 0) {
6335 type = "none";
6336 count = 0;
6337 }
6338 state = service->get_full_state(type);
6339 if (state == OSDService::s_names::INVALID) {
6340      ss << "Invalid type; use one of: none, nearfull, backfillfull, full, failsafe";
6341 return;
6342 }
6343 service->set_injectfull(state, count);
6344 return;
6345 }
6346 ss << "Internal error - command=" << command;
6347}
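// [Editor's note] The test ops above accept "[namespace/]<obj-name>":
// everything before the first '/' is the namespace and the rest is the
// object name; with no '/' the namespace is empty. A standalone sketch of
// that split, e.g. "ns1/foo" -> ("ns1", "foo") and "foo" -> ("", "foo"):
#if 0
#include <string>

static void split_nspace(const std::string& objname,
                         std::string* nspace, std::string* name)
{
  const auto found = objname.find_first_of('/');
  if (found != std::string::npos) {
    *nspace = objname.substr(0, found);
    *name = objname.substr(found + 1);
  } else {
    nspace->clear();
    *name = objname;
  }
}
#endif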
6348
6349// =========================================
6350
6351void OSD::ms_handle_connect(Connection *con)
6352{
6353 dout(10) << __func__ << " con " << con << dendl;
6354 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
6355 std::lock_guard l(osd_lock);
6356 if (is_stopping())
6357 return;
6358 dout(10) << __func__ << " on mon" << dendl;
6359
6360 if (is_preboot()) {
6361 start_boot();
6362 } else if (is_booting()) {
6363 _send_boot(); // resend boot message
6364 } else {
6365 map_lock.lock_shared();
6366 std::lock_guard l2(mon_report_lock);
6367
6368 utime_t now = ceph_clock_now();
6369 last_mon_report = now;
6370
6371 // resend everything, it's a new session
6372 send_full_update();
6373 send_alive();
6374 service.requeue_pg_temp();
6375 service.clear_sent_ready_to_merge();
6376 service.send_pg_temp();
6377 service.send_ready_to_merge();
6378 service.send_pg_created();
6379 requeue_failures();
6380 send_failures();
6381
6382 map_lock.unlock_shared();
6383 if (is_active()) {
6384 send_beacon(ceph::coarse_mono_clock::now());
6385 }
6386 }
6387
6388 // full map requests may happen while active or pre-boot
6389 if (requested_full_first) {
6390 rerequest_full_maps();
6391 }
6392 }
6393}
6394
6395void OSD::ms_handle_fast_connect(Connection *con)
6396{
6397 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6398 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6399 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6400 s = ceph::make_ref<Session>(cct, con);
6401 con->set_priv(s);
6402 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6403 << " addr=" << s->con->get_peer_addr() << dendl;
6404 // we don't connect to clients
6405 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6406 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6407 }
6408 }
6409}
6410
6411void OSD::ms_handle_fast_accept(Connection *con)
6412{
6413 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6414 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6415 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6416 s = ceph::make_ref<Session>(cct, con);
6417 con->set_priv(s);
6418      dout(10) << "new session (incoming) " << s << " con=" << con
6419 << " addr=" << con->get_peer_addr()
6420 << " must have raced with connect" << dendl;
6421 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6422 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6423 }
6424 }
6425}
6426
6427bool OSD::ms_handle_reset(Connection *con)
6428{
6429 auto session = ceph::ref_cast<Session>(con->get_priv());
6430 dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
6431 if (!session)
6432 return false;
6433 session->wstate.reset(con);
6434 session->con->set_priv(nullptr);
6435 session->con.reset(); // break con <-> session ref cycle
6436 // note that we break session->con *before* the session_handle_reset
6437 // cleanup below. this avoids a race between us and
6438 // PG::add_backoff, Session::check_backoff, etc.
6439 session_handle_reset(session);
6440 return true;
6441}
6442
6443bool OSD::ms_handle_refused(Connection *con)
6444{
6445 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6446 return false;
6447
6448 auto session = ceph::ref_cast<Session>(con->get_priv());
6449 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
6450 if (!session)
6451 return false;
6452 int type = con->get_peer_type();
6453 // handle only OSD failures here
6454 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6455 OSDMapRef osdmap = get_osdmap();
6456 if (osdmap) {
6457 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6458 if (id >= 0 && osdmap->is_up(id)) {
6459        // we're gaming the mon's heartbeat grace logic here, because we know
6460        // the peer isn't going to respawn on its own. +1 so we won't hit any boundary case.
6461 monc->send_mon_message(
6462 new MOSDFailure(
6463 monc->get_fsid(),
6464 id,
6465 osdmap->get_addrs(id),
6466 cct->_conf->osd_heartbeat_grace + 1,
6467 osdmap->get_epoch(),
6468 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6469 ));
6470 }
6471 }
6472 }
6473 return true;
6474}
6475
6476struct CB_OSD_GetVersion {
6477 OSD *osd;
6478 explicit CB_OSD_GetVersion(OSD *o) : osd(o) {}
6479 void operator ()(boost::system::error_code ec, version_t newest,
6480 version_t oldest) {
6481 if (!ec)
6482 osd->_got_mon_epochs(oldest, newest);
6483 }
6484};
6485
6486void OSD::start_boot()
6487{
6488 if (!_is_healthy()) {
6489 // if we are not healthy, do not mark ourselves up (yet)
6490 dout(1) << "not healthy; waiting to boot" << dendl;
6491 if (!is_waiting_for_healthy())
6492 start_waiting_for_healthy();
6493 // send pings sooner rather than later
6494 heartbeat_kick();
6495 return;
6496 }
6497 dout(1) << __func__ << dendl;
6498 set_state(STATE_PREBOOT);
6499 dout(10) << "start_boot - have maps " << superblock.oldest_map
6500 << ".." << superblock.newest_map << dendl;
6501 monc->get_version("osdmap", CB_OSD_GetVersion(this));
6502}
6503
6504void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6505{
6506 std::lock_guard l(osd_lock);
6507 if (is_preboot()) {
6508 _preboot(oldest, newest);
6509 }
6510}
6511
6512void OSD::_preboot(epoch_t oldest, epoch_t newest)
6513{
6514 ceph_assert(is_preboot());
6515 dout(10) << __func__ << " _preboot mon has osdmaps "
6516 << oldest << ".." << newest << dendl;
6517
6518 // ensure our local fullness awareness is accurate
6519 {
6520 std::lock_guard l(heartbeat_lock);
6521 heartbeat();
6522 }
6523
6524 const auto& monmap = monc->monmap;
6525 const auto osdmap = get_osdmap();
6526  // if our map is within recent history, try to add ourselves to the osdmap.
6527 if (osdmap->get_epoch() == 0) {
6528 derr << "waiting for initial osdmap" << dendl;
6529 } else if (osdmap->is_destroyed(whoami)) {
6530 derr << "osdmap says I am destroyed" << dendl;
6531    // provide a small margin so we don't livelock waiting to see if
6532    // we were un-destroyed.
6533 if (osdmap->get_epoch() > newest - 1) {
6534 exit(0);
6535 }
6536 } else if (osdmap->is_noup(whoami)) {
6537 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
6538 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
6539 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
6540 << dendl;
6541 } else if (service.need_fullness_update()) {
6542 derr << "osdmap fullness state needs update" << dendl;
6543 send_full_update();
6544 } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
6545 superblock.purged_snaps_last < superblock.current_epoch) {
6546 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6547 << " < newest_map " << superblock.current_epoch << dendl;
6548 _get_purged_snaps();
6549 } else if (osdmap->get_epoch() >= oldest - 1 &&
6550 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
6551
6552 // wait for pgs to fully catch up in a different thread, since
6553 // this thread might be required for splitting and merging PGs to
6554 // make progress.
6555 boot_finisher.queue(
6556 new LambdaContext(
6557 [this](int r) {
6558 std::unique_lock l(osd_lock);
6559 if (is_preboot()) {
6560 dout(10) << __func__ << " waiting for peering work to drain"
6561 << dendl;
6562 l.unlock();
6563 for (auto shard : shards) {
6564 shard->wait_min_pg_epoch(get_osdmap_epoch());
6565 }
6566 l.lock();
6567 }
6568 if (is_preboot()) {
6569 _send_boot();
6570 }
6571 }));
6572 return;
6573 }
6574
6575 // get all the latest maps
6576 if (osdmap->get_epoch() + 1 >= oldest)
6577 osdmap_subscribe(osdmap->get_epoch() + 1, false);
6578 else
6579 osdmap_subscribe(oldest - 1, true);
6580}
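// [Editor's note] The catch-up decision at the end of _preboot(), reduced
// to a predicate: if the mon still has the increment immediately after our
// current epoch we ask for incrementals from epoch+1, otherwise we jump
// straight to a full map at oldest-1. A sketch under that reading:
#if 0
using epoch_t = unsigned;  // stand-in for the epoch type used above

static bool want_incrementals(epoch_t cur_epoch, epoch_t oldest_on_mon) {
  return cur_epoch + 1 >= oldest_on_mon;
}
#endif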
6581
6582void OSD::_get_purged_snaps()
6583{
6584  // NOTE: this is a naive, stateless implementation. it may send multiple
6585 // overlapping requests to the mon, which will be somewhat inefficient, but
6586 // it should be reliable.
6587 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6588 << ", newest_map " << superblock.current_epoch << dendl;
6589 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6590 superblock.purged_snaps_last + 1,
6591 superblock.current_epoch + 1);
6592 monc->send_mon_message(m);
6593}
6594
6595void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6596{
6597 dout(10) << __func__ << " " << *m << dendl;
6598 ObjectStore::Transaction t;
6599 if (!is_preboot() ||
6600 m->last < superblock.purged_snaps_last) {
6601 goto out;
6602 }
6603 SnapMapper::record_purged_snaps(cct, store.get(), service.meta_ch,
6604 make_purged_snaps_oid(), &t,
6605 m->purged_snaps);
6606 superblock.purged_snaps_last = m->last;
6607 write_superblock(t);
6608 store->queue_transaction(
6609 service.meta_ch,
6610 std::move(t));
6611 service.publish_superblock(superblock);
6612 if (m->last < superblock.current_epoch) {
6613 _get_purged_snaps();
6614 } else {
6615 start_boot();
6616 }
6617out:
6618 m->put();
6619}
6620
6621void OSD::send_full_update()
6622{
6623 if (!service.need_fullness_update())
6624 return;
6625 unsigned state = 0;
6626 if (service.is_full()) {
6627 state = CEPH_OSD_FULL;
6628 } else if (service.is_backfillfull()) {
6629 state = CEPH_OSD_BACKFILLFULL;
6630 } else if (service.is_nearfull()) {
6631 state = CEPH_OSD_NEARFULL;
6632 }
6633 set<string> s;
6634 OSDMap::calc_state_set(state, s);
6635 dout(10) << __func__ << " want state " << s << dendl;
6636 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6637}
6638
6639void OSD::start_waiting_for_healthy()
6640{
6641 dout(1) << "start_waiting_for_healthy" << dendl;
6642 set_state(STATE_WAITING_FOR_HEALTHY);
6643 last_heartbeat_resample = utime_t();
6644
6645 // subscribe to osdmap updates, in case our peers really are known to be dead
6646 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6647}
6648
6649bool OSD::_is_healthy()
6650{
6651 if (!cct->get_heartbeat_map()->is_healthy()) {
6652 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6653 return false;
6654 }
6655
6656 if (is_waiting_for_healthy()) {
6657 utime_t now = ceph_clock_now();
6658 if (osd_markdown_log.empty()) {
6659 dout(5) << __func__ << " force returning true since last markdown"
6660 << " was " << cct->_conf->osd_max_markdown_period
6661 << "s ago" << dendl;
6662 return true;
6663 }
6664 std::lock_guard l(heartbeat_lock);
6665 int num = 0, up = 0;
6666 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6667 p != heartbeat_peers.end();
6668 ++p) {
6669 if (p->second.is_healthy(now))
6670 ++up;
6671 ++num;
6672 }
6673 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6674 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6675 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6676 return false;
6677 }
6678 }
6679
6680 return true;
6681}
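// [Editor's note] The gate in _is_healthy() while waiting_for_healthy: at
// least osd_heartbeat_min_healthy_ratio of our heartbeat peers must
// currently look healthy. Sketch of the comparison; e.g. with ratio 0.33
// and 6 peers, 2 healthy peers is the minimum that passes:
#if 0
static bool enough_healthy_peers(int up, int num, float min_healthy_ratio) {
  return (float)up >= (float)num * min_healthy_ratio;
}
#endif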
6682
6683void OSD::_send_boot()
6684{
6685 dout(10) << "_send_boot" << dendl;
6686 Connection *local_connection =
6687 cluster_messenger->get_loopback_connection().get();
6688 entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
6689 entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
6690 entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6691 entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6692
6693 dout(20) << " initial client_addrs " << client_addrs
6694 << ", cluster_addrs " << cluster_addrs
6695 << ", hb_back_addrs " << hb_back_addrs
6696 << ", hb_front_addrs " << hb_front_addrs
6697 << dendl;
6698 if (cluster_messenger->set_addr_unknowns(client_addrs)) {
6699 dout(10) << " assuming cluster_addrs match client_addrs "
6700 << client_addrs << dendl;
6701 cluster_addrs = cluster_messenger->get_myaddrs();
6702 }
6703 if (auto session = local_connection->get_priv(); !session) {
6704 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
6705 }
6706
6707 local_connection = hb_back_server_messenger->get_loopback_connection().get();
6708 if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
6709 dout(10) << " assuming hb_back_addrs match cluster_addrs "
6710 << cluster_addrs << dendl;
6711 hb_back_addrs = hb_back_server_messenger->get_myaddrs();
6712 }
6713 if (auto session = local_connection->get_priv(); !session) {
6714 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6715 }
6716
6717 local_connection = hb_front_server_messenger->get_loopback_connection().get();
6718 if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
6719 dout(10) << " assuming hb_front_addrs match client_addrs "
6720 << client_addrs << dendl;
6721 hb_front_addrs = hb_front_server_messenger->get_myaddrs();
6722 }
6723 if (auto session = local_connection->get_priv(); !session) {
6724 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
6725 }
6726
6727 // we now know what our front and back addrs will be, and we are
6728 // about to tell the mon what our metadata (including numa bindings)
6729 // are, so now is a good time!
6730 set_numa_affinity();
6731
6732 MOSDBoot *mboot = new MOSDBoot(
6733 superblock, get_osdmap_epoch(), service.get_boot_epoch(),
6734 hb_back_addrs, hb_front_addrs, cluster_addrs,
6735 CEPH_FEATURES_ALL);
6736 dout(10) << " final client_addrs " << client_addrs
6737 << ", cluster_addrs " << cluster_addrs
6738 << ", hb_back_addrs " << hb_back_addrs
6739 << ", hb_front_addrs " << hb_front_addrs
6740 << dendl;
6741 _collect_metadata(&mboot->metadata);
6742 monc->send_mon_message(mboot);
6743 set_state(STATE_BOOTING);
6744}
6745
6746void OSD::_collect_metadata(map<string,string> *pm)
6747{
6748 // config info
6749 (*pm)["osd_data"] = dev_path;
6750 if (store->get_type() == "filestore") {
6751 // not applicable for bluestore
6752 (*pm)["osd_journal"] = journal_path;
6753 }
6754 (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
6755 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
6756 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
6757 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());
6758
6759 // backend
6760 (*pm)["osd_objectstore"] = store->get_type();
6761 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
6762 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
6763 (*pm)["default_device_class"] = store->get_default_device_class();
6764 string osdspec_affinity;
6765 int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
6766 if (r < 0 || osdspec_affinity.empty()) {
6767 osdspec_affinity = "";
6768 }
6769 (*pm)["osdspec_affinity"] = osdspec_affinity;
6770 store->collect_metadata(pm);
6771
6772 collect_sys_info(pm, cct);
6773
6774 (*pm)["front_iface"] = pick_iface(
6775 cct,
6776 client_messenger->get_myaddrs().front().get_sockaddr_storage());
6777 (*pm)["back_iface"] = pick_iface(
6778 cct,
6779 cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
6780
6781 // network numa
6782 {
6783 int node = -1;
6784 set<int> nodes;
6785 set<string> unknown;
6786 for (auto nm : { "front_iface", "back_iface" }) {
6787 if (!(*pm)[nm].size()) {
6788 unknown.insert(nm);
6789 continue;
6790 }
6791 int n = -1;
6792 int r = get_iface_numa_node((*pm)[nm], &n);
6793 if (r < 0) {
6794 unknown.insert((*pm)[nm]);
6795 continue;
6796 }
6797 nodes.insert(n);
6798 if (node < 0) {
6799 node = n;
6800 }
6801 }
6802 if (unknown.size()) {
6803 (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
6804 }
6805 if (!nodes.empty()) {
6806 (*pm)["network_numa_nodes"] = stringify(nodes);
6807 }
6808 if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
6809 (*pm)["network_numa_node"] = stringify(node);
6810 }
6811 }
6812
6813 if (numa_node >= 0) {
6814 (*pm)["numa_node"] = stringify(numa_node);
6815 (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
6816 &numa_cpu_set);
6817 }
6818
6819 set<string> devnames;
6820 store->get_devices(&devnames);
6821 map<string,string> errs;
6822 get_device_metadata(devnames, pm, &errs);
6823 for (auto& i : errs) {
6824 dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
6825 }
6826 dout(10) << __func__ << " " << *pm << dendl;
6827}
6828
6829void OSD::queue_want_up_thru(epoch_t want)
6830{
6831 std::shared_lock map_locker{map_lock};
6832 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6833 std::lock_guard report_locker(mon_report_lock);
6834 if (want > up_thru_wanted) {
6835 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6836 << ", currently " << cur
6837 << dendl;
6838 up_thru_wanted = want;
6839 send_alive();
6840 } else {
6841 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6842 << ", currently " << cur
6843 << dendl;
6844 }
6845}
6846
6847void OSD::send_alive()
6848{
6849 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6850 const auto osdmap = get_osdmap();
6851 if (!osdmap->exists(whoami))
6852 return;
6853 epoch_t up_thru = osdmap->get_up_thru(whoami);
6854 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6855 if (up_thru_wanted > up_thru) {
6856 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6857 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6858 }
6859}
6860
6861void OSD::request_full_map(epoch_t first, epoch_t last)
6862{
6863 dout(10) << __func__ << " " << first << ".." << last
6864 << ", previously requested "
6865 << requested_full_first << ".." << requested_full_last << dendl;
6866 ceph_assert(ceph_mutex_is_locked(osd_lock));
6867 ceph_assert(first > 0 && last > 0);
6868 ceph_assert(first <= last);
6869 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6870 if (requested_full_first == 0) {
6871 // first request
6872 requested_full_first = first;
6873 requested_full_last = last;
6874 } else if (last <= requested_full_last) {
6875 // dup
6876 return;
6877 } else {
6878 // additional request
6879 first = requested_full_last + 1;
6880 requested_full_last = last;
6881 }
6882 MMonGetOSDMap *req = new MMonGetOSDMap;
6883 req->request_full(first, last);
6884 monc->send_mon_message(req);
6885}
6886
6887void OSD::got_full_map(epoch_t e)
6888{
6889 ceph_assert(requested_full_first <= requested_full_last);
6890 ceph_assert(ceph_mutex_is_locked(osd_lock));
6891 if (requested_full_first == 0) {
6892 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6893 return;
6894 }
6895 if (e < requested_full_first) {
6896 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6897 << ".." << requested_full_last
6898 << ", ignoring" << dendl;
6899 return;
6900 }
6901 if (e >= requested_full_last) {
6902 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6903 << ".." << requested_full_last << ", resetting" << dendl;
6904 requested_full_first = requested_full_last = 0;
6905 return;
6906 }
6907
6908 requested_full_first = e + 1;
6909
6910 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6911 << ".." << requested_full_last
6912 << ", still need more" << dendl;
6913}
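// [Editor's note] request_full_map()/got_full_map() above track a single
// contiguous range of outstanding full-map requests in
// (requested_full_first, requested_full_last), with (0, 0) meaning idle.
// A standalone sketch of how a received full map at epoch e advances it:
#if 0
using epoch_t = unsigned;

static void on_full_map(epoch_t e, epoch_t* first, epoch_t* last) {
  if (*first == 0)
    return;               // nothing requested
  if (e < *first)
    return;               // stale; ignore
  if (e >= *last) {
    *first = *last = 0;   // range satisfied; reset
    return;
  }
  *first = e + 1;         // still waiting for (e+1 .. last)
}
#endif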
6914
6915void OSD::requeue_failures()
6916{
6917 std::lock_guard l(heartbeat_lock);
6918 unsigned old_queue = failure_queue.size();
6919 unsigned old_pending = failure_pending.size();
6920 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6921 failure_queue[p->first] = p->second.first;
6922 failure_pending.erase(p++);
6923 }
6924 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6925 << failure_queue.size() << dendl;
6926}
6927
6928void OSD::send_failures()
6929{
6930 ceph_assert(ceph_mutex_is_locked(map_lock));
6931 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6932 std::lock_guard l(heartbeat_lock);
6933 utime_t now = ceph_clock_now();
6934 const auto osdmap = get_osdmap();
6935 while (!failure_queue.empty()) {
6936 int osd = failure_queue.begin()->first;
6937 if (!failure_pending.count(osd)) {
6938 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6939 monc->send_mon_message(
6940 new MOSDFailure(
6941 monc->get_fsid(),
6942 osd,
6943 osdmap->get_addrs(osd),
6944 failed_for,
6945 osdmap->get_epoch()));
6946 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6947 osdmap->get_addrs(osd));
6948 }
6949 failure_queue.erase(osd);
6950 }
6951}
6952
6953void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6954{
6955 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6956 MOSDFailure::FLAG_ALIVE);
6957 monc->send_mon_message(m);
6958}
6959
6960void OSD::cancel_pending_failures()
6961{
6962 std::lock_guard l(heartbeat_lock);
6963 auto it = failure_pending.begin();
6964 while (it != failure_pending.end()) {
6965 dout(10) << __func__ << " canceling in-flight failure report for osd."
6966 << it->first << dendl;
6967 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6968 failure_pending.erase(it++);
6969 }
6970}
6971
6972void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6973{
6974 const auto& monmap = monc->monmap;
6975  // we may have only just connected, in which case the monmap may not be
6976  // initialized yet; only send the beacon once it is and the mons support it.
6977 if (monmap.epoch > 0 &&
6978 monmap.get_required_features().contains_all(
6979 ceph::features::mon::FEATURE_LUMINOUS)) {
6980 dout(20) << __func__ << " sending" << dendl;
6981 MOSDBeacon* beacon = nullptr;
6982 {
6983 std::lock_guard l{min_last_epoch_clean_lock};
6984 beacon = new MOSDBeacon(get_osdmap_epoch(),
6985 min_last_epoch_clean,
6986 superblock.last_purged_snaps_scrub,
6987 cct->_conf->osd_beacon_report_interval);
6988 beacon->pgs = min_last_epoch_clean_pgs;
6989 last_sent_beacon = now;
6990 }
6991 monc->send_mon_message(beacon);
6992 } else {
6993 dout(20) << __func__ << " not sending" << dendl;
6994 }
6995}
6996
6997void OSD::handle_command(MCommand *m)
6998{
6999 ConnectionRef con = m->get_connection();
7000 auto session = ceph::ref_cast<Session>(con->get_priv());
7001 if (!session) {
7002 con->send_message(new MCommandReply(m, -EACCES));
7003 m->put();
7004 return;
7005 }
7006 if (!session->caps.allow_all()) {
7007 con->send_message(new MCommandReply(m, -EACCES));
7008 m->put();
7009 return;
7010 }
7011 cct->get_admin_socket()->queue_tell_command(m);
7012 m->put();
7013}
7014
7015namespace {
7016 class unlock_guard {
7017 ceph::mutex& m;
7018 public:
7019 explicit unlock_guard(ceph::mutex& mutex)
7020 : m(mutex)
7021 {
7022 m.unlock();
7023 }
7024 unlock_guard(unlock_guard&) = delete;
7025 ~unlock_guard() {
7026 m.lock();
7027 }
7028 };
7029}
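// [Editor's note] unlock_guard is the inverse of std::lock_guard: it
// releases an already-held mutex on construction and re-acquires it on
// destruction, so a critical section can call out to slow code without
// holding the lock. Hypothetical usage (do_slow_work() is invented):
#if 0
extern void do_slow_work();

void example(ceph::mutex& m) {
  std::lock_guard l(m);    // m held
  {
    unlock_guard u(m);     // m released for the slow call
    do_slow_work();
  }                        // m re-acquired here
  // ... m held again until l goes out of scope ...
}
#endif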
7030
7031void OSD::scrub_purged_snaps()
7032{
7033 dout(10) << __func__ << dendl;
7034 ceph_assert(ceph_mutex_is_locked(osd_lock));
7035 SnapMapper::Scrubber s(cct, store.get(), service.meta_ch,
7036 make_snapmapper_oid(),
7037 make_purged_snaps_oid());
7038 clog->debug() << "purged_snaps scrub starts";
7039 osd_lock.unlock();
7040 s.run();
7041 if (s.stray.size()) {
7042 clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
7043 } else {
7044 clog->debug() << "purged_snaps scrub ok";
7045 }
7046 set<pair<spg_t,snapid_t>> queued;
7047 for (auto& [pool, snap, hash, shard] : s.stray) {
7048 const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
7049 if (!pi) {
7050 dout(20) << __func__ << " pool " << pool << " dne" << dendl;
7051 continue;
7052 }
7053 pg_t pgid(pi->raw_hash_to_pg(hash), pool);
7054 spg_t spgid(pgid, shard);
7055 pair<spg_t,snapid_t> p(spgid, snap);
7056 if (queued.count(p)) {
7057 dout(20) << __func__ << " pg " << spgid << " snap " << snap
7058 << " already queued" << dendl;
7059 continue;
7060 }
7061 PGRef pg = lookup_lock_pg(spgid);
7062 if (!pg) {
7063 dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
7064 continue;
7065 }
7066 queued.insert(p);
7067 dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
7068 << snap << dendl;
7069 pg->queue_snap_retrim(snap);
7070 pg->unlock();
7071 }
7072 osd_lock.lock();
7073 if (is_stopping()) {
7074 return;
7075 }
7076 dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
7077 ObjectStore::Transaction t;
7078 superblock.last_purged_snaps_scrub = ceph_clock_now();
7079 write_superblock(t);
7080 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7081 ceph_assert(tr == 0);
7082 if (is_active()) {
7083 send_beacon(ceph::coarse_mono_clock::now());
7084 }
7085 dout(10) << __func__ << " done" << dendl;
7086}
7087
7088void OSD::probe_smart(const string& only_devid, ostream& ss)
7089{
7090 set<string> devnames;
7091 store->get_devices(&devnames);
7092 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
7093 "osd_smart_report_timeout");
7094
7095 // == typedef std::map<std::string, mValue> mObject;
7096 json_spirit::mObject json_map;
7097
7098 for (auto dev : devnames) {
7099 // smartctl works only on physical devices; filter out any logical device
7100 if (dev.find("dm-") == 0) {
7101 continue;
7102 }
7103
7104 string err;
7105 string devid = get_device_id(dev, &err);
7106 if (devid.size() == 0) {
7107 dout(10) << __func__ << " no unique id for dev " << dev << " ("
7108 << err << "), skipping" << dendl;
7109 continue;
7110 }
7111 if (only_devid.size() && devid != only_devid) {
7112 continue;
7113 }
7114
7115 json_spirit::mValue smart_json;
7116 if (block_device_get_metrics(dev, smart_timeout,
7117 &smart_json)) {
7118 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
7119 continue;
7120 }
7121 json_map[devid] = smart_json;
7122 }
7123 json_spirit::write(json_map, ss, json_spirit::pretty_print);
7124}
7125
7126bool OSD::heartbeat_dispatch(Message *m)
7127{
7128 dout(30) << "heartbeat_dispatch " << m << dendl;
7129 switch (m->get_type()) {
7130
7131 case CEPH_MSG_PING:
7132 dout(10) << "ping from " << m->get_source_inst() << dendl;
7133 m->put();
7134 break;
7135
7136 case MSG_OSD_PING:
7137 handle_osd_ping(static_cast<MOSDPing*>(m));
7138 break;
7139
7140 default:
7141 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7142 m->put();
7143 }
7144
7145 return true;
7146}
7147
7148bool OSD::ms_dispatch(Message *m)
7149{
7150 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
7151 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
7152 service.got_stop_ack();
7153 m->put();
7154 return true;
7155 }
7156
7157 // lock!
7158
7159 osd_lock.lock();
7160 if (is_stopping()) {
7161 osd_lock.unlock();
7162 m->put();
7163 return true;
7164 }
7165
7166 do_waiters();
7167 _dispatch(m);
7168
7169 osd_lock.unlock();
7170
7171 return true;
7172}
7173
7174void OSDService::maybe_share_map(
7175 Connection *con,
7176 const OSDMapRef& osdmap,
7177 epoch_t peer_epoch_lb)
7178{
7179  // NOTE: we assume the caller holds something that keeps the Connection itself
7180 // pinned (e.g., an OpRequest's MessageRef).
7181 auto session = ceph::ref_cast<Session>(con->get_priv());
7182 if (!session) {
7183 return;
7184 }
7185
7186 // assume the peer has the newer of the op's sent_epoch and what
7187 // we think we sent them.
7188 session->sent_epoch_lock.lock();
7189 if (peer_epoch_lb > session->last_sent_epoch) {
7190 dout(10) << __func__ << " con " << con
7191 << " " << con->get_peer_addr()
7192 << " map epoch " << session->last_sent_epoch
7193 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
7194 session->last_sent_epoch = peer_epoch_lb;
7195 }
7196 epoch_t last_sent_epoch = session->last_sent_epoch;
7197 session->sent_epoch_lock.unlock();
7198
7199 if (osdmap->get_epoch() <= last_sent_epoch) {
7200 return;
7201 }
7202
7203 send_incremental_map(last_sent_epoch, con, osdmap);
7204 last_sent_epoch = osdmap->get_epoch();
7205
7206 session->sent_epoch_lock.lock();
7207 if (session->last_sent_epoch < last_sent_epoch) {
7208 dout(10) << __func__ << " con " << con
7209 << " " << con->get_peer_addr()
7210 << " map epoch " << session->last_sent_epoch
7211 << " -> " << last_sent_epoch << " (shared)" << dendl;
7212 session->last_sent_epoch = last_sent_epoch;
7213 }
7214 session->sent_epoch_lock.unlock();
7215}
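
// A minimal usage sketch for maybe_share_map() (hypothetical caller; the
// real call sites live in the op-handling paths). 'op' is assumed to be an
// in-flight request and 'cur' our current map:
//
//   auto m = op->get_req<MOSDFastDispatchOp>();
//   service.maybe_share_map(m->get_connection().get(), cur,
//                           m->get_map_epoch());
//
// The op's MessageRef keeps the Connection pinned for the duration of the
// call, per the NOTE above.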
7216
7217void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
7218{
7219 ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));
7220
7221 auto i = session->waiting_on_map.begin();
7222 while (i != session->waiting_on_map.end()) {
7223 OpRequestRef op = &(*i);
7224 ceph_assert(ms_can_fast_dispatch(op->get_req()));
7225 auto m = op->get_req<MOSDFastDispatchOp>();
7226 if (m->get_min_epoch() > osdmap->get_epoch()) {
7227 break;
7228 }
7229 session->waiting_on_map.erase(i++);
7230 op->put();
7231
7232 spg_t pgid;
7233 if (m->get_type() == CEPH_MSG_OSD_OP) {
7234 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7235 static_cast<const MOSDOp*>(m)->get_pg());
7236 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7237 continue;
7238 }
7239 } else {
7240 pgid = m->get_spg();
7241 }
7242 enqueue_op(pgid, std::move(op), m->get_map_epoch());
7243 }
7244
7245 if (session->waiting_on_map.empty()) {
7246 clear_session_waiting_on_map(session);
7247 } else {
7248 register_session_waiting_on_map(session);
7249 }
7250}
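
// Ordering note for the loop above: ops drain strictly in arrival order,
// and only while their min_epoch is covered by 'osdmap'. For example
// (epochs hypothetical), with current epoch 119 a queued op requiring 118
// drains immediately, while the next one requiring 120 stops the drain and
// keeps the session registered until a newer map is consumed.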
7251
7252void OSD::ms_fast_dispatch(Message *m)
7253{
7254 auto dispatch_span = tracing::osd::tracer.start_trace(__func__);
7255 FUNCTRACE(cct);
7256 if (service.is_stopping()) {
7257 m->put();
7258 return;
7259 }
7260 // peering event?
7261 switch (m->get_type()) {
7262 case CEPH_MSG_PING:
7263 dout(10) << "ping from " << m->get_source() << dendl;
7264 m->put();
7265 return;
7266 case MSG_OSD_FORCE_RECOVERY:
7267 handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
7268 return;
7269 case MSG_OSD_SCRUB2:
7270 handle_fast_scrub(static_cast<MOSDScrub2*>(m));
7271 return;
7272 case MSG_OSD_PG_CREATE2:
7273 return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
7274 case MSG_OSD_PG_NOTIFY:
7275 return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
7276 case MSG_OSD_PG_INFO:
7277 return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
7278 case MSG_OSD_PG_REMOVE:
7279 return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));
7280 // these are single-pg messages that handle themselves
7281 case MSG_OSD_PG_LOG:
7282 case MSG_OSD_PG_TRIM:
7283 case MSG_OSD_PG_NOTIFY2:
7284 case MSG_OSD_PG_QUERY2:
7285 case MSG_OSD_PG_INFO2:
7286 case MSG_OSD_BACKFILL_RESERVE:
7287 case MSG_OSD_RECOVERY_RESERVE:
7288 case MSG_OSD_PG_LEASE:
7289 case MSG_OSD_PG_LEASE_ACK:
7290 {
7291 MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
7292 if (require_osd_peer(pm)) {
7293 enqueue_peering_evt(
7294 pm->get_spg(),
7295 PGPeeringEventRef(pm->get_event()));
7296 }
7297 pm->put();
7298 return;
7299 }
7300 }
7301
7302 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7303 {
7304#ifdef WITH_LTTNG
7305 osd_reqid_t reqid = op->get_reqid();
7306#endif
7307 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7308 reqid.name._num, reqid.tid, reqid.inc);
7309 }
7310 op->osd_parent_span = tracing::osd::tracer.add_span("op-request-created", dispatch_span);
7311
7312 if (m->trace)
7313 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7314
7315 // note sender epoch, min req's epoch
7316 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7317 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
7318 ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!
7319
7320 service.maybe_inject_dispatch_delay();
7321
7322 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7323 m->get_type() != CEPH_MSG_OSD_OP) {
7324 // queue it directly
7325 enqueue_op(
7326 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
7327 std::move(op),
7328 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7329 } else {
7330    // legacy client, and this is an MOSDOp (the *only* fast dispatch
7331    // message that doesn't carry an explicit spg_t); we need to map
7332    // it to an spg_t while preserving delivery order.
7333 auto priv = m->get_connection()->get_priv();
7334 if (auto session = static_cast<Session*>(priv.get()); session) {
7335 std::lock_guard l{session->session_dispatch_lock};
7336 op->get();
7337 session->waiting_on_map.push_back(*op);
7338 OSDMapRef nextmap = service.get_nextmap_reserved();
7339 dispatch_session_waiting(session, nextmap);
7340 service.release_map(nextmap);
7341 }
7342 }
7343 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
7344}
7345
7346int OSD::ms_handle_authentication(Connection *con)
7347{
7348 int ret = 0;
7349 auto s = ceph::ref_cast<Session>(con->get_priv());
7350 if (!s) {
7351 s = ceph::make_ref<Session>(cct, con);
7352 con->set_priv(s);
7353 s->entity_name = con->get_peer_entity_name();
7354 dout(10) << __func__ << " new session " << s << " con " << s->con
7355 << " entity " << s->entity_name
7356 << " addr " << con->get_peer_addrs() << dendl;
7357 } else {
7358 dout(10) << __func__ << " existing session " << s << " con " << s->con
7359 << " entity " << s->entity_name
7360 << " addr " << con->get_peer_addrs() << dendl;
7361 }
7362
7363 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7364 if (caps_info.allow_all) {
7365 s->caps.set_allow_all();
7366 } else if (caps_info.caps.length() > 0) {
7367 bufferlist::const_iterator p = caps_info.caps.cbegin();
7368 string str;
7369 try {
7370 decode(str, p);
7371 }
7372 catch (ceph::buffer::error& e) {
7373 dout(10) << __func__ << " session " << s << " " << s->entity_name
7374 << " failed to decode caps string" << dendl;
7375 ret = -EACCES;
7376 }
7377 if (!ret) {
7378 bool success = s->caps.parse(str);
7379 if (success) {
7380 dout(10) << __func__ << " session " << s
7381 << " " << s->entity_name
7382 << " has caps " << s->caps << " '" << str << "'" << dendl;
7383 ret = 1;
7384 } else {
7385 dout(10) << __func__ << " session " << s << " " << s->entity_name
7386 << " failed to parse caps '" << str << "'" << dendl;
7387 ret = -EACCES;
7388 }
7389 }
7390 }
7391 return ret;
7392}
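
// The caps string decoded above is the entity's "osd" capability text as
// granted on the monitors, e.g. (illustrative) 'allow rwx pool=rbd' for a
// client key or 'allow *' for an admin key. An undecodable blob or a parse
// failure denies the session with -EACCES; ret == 1 means caps were
// installed on the session.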
7393
7394void OSD::do_waiters()
7395{
7396 ceph_assert(ceph_mutex_is_locked(osd_lock));
7397
7398 dout(10) << "do_waiters -- start" << dendl;
7399 while (!finished.empty()) {
7400 OpRequestRef next = finished.front();
7401 finished.pop_front();
7402 dispatch_op(next);
7403 }
7404 dout(10) << "do_waiters -- finish" << dendl;
7405}
7406
7407void OSD::dispatch_op(OpRequestRef op)
7408{
7409 switch (op->get_req()->get_type()) {
7410
7411 case MSG_OSD_PG_CREATE:
7412 handle_pg_create(op);
7413 break;
7414 }
7415}
7416
7417void OSD::_dispatch(Message *m)
7418{
7419 ceph_assert(ceph_mutex_is_locked(osd_lock));
7420 dout(20) << "_dispatch " << m << " " << *m << dendl;
7421
7422 switch (m->get_type()) {
7423 // -- don't need OSDMap --
7424
7425 // map and replication
7426 case CEPH_MSG_OSD_MAP:
7427 handle_osd_map(static_cast<MOSDMap*>(m));
7428 break;
7429 case MSG_MON_GET_PURGED_SNAPS_REPLY:
7430 handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
7431 break;
7432
7433 // osd
7434 case MSG_OSD_SCRUB:
7435 handle_scrub(static_cast<MOSDScrub*>(m));
7436 break;
7437
7438 case MSG_COMMAND:
7439 handle_command(static_cast<MCommand*>(m));
7440 return;
7441
7442 // -- need OSDMap --
7443
7444 case MSG_OSD_PG_CREATE:
7445 {
7446 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7447 if (m->trace)
7448 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7449 // no map? starting up?
7450 if (!get_osdmap()) {
7451 dout(7) << "no OSDMap, not booted" << dendl;
7452 logger->inc(l_osd_waiting_for_map);
7453 waiting_for_osdmap.push_back(op);
7454 op->mark_delayed("no osdmap");
7455 break;
7456 }
7457
7458 // need OSDMap
7459 dispatch_op(op);
7460 }
7461 }
7462}
7463
7464// remove me post-nautilus
7465void OSD::handle_scrub(MOSDScrub *m)
7466{
7467 dout(10) << "handle_scrub " << *m << dendl;
7468 if (!require_mon_or_mgr_peer(m)) {
7469 m->put();
7470 return;
7471 }
7472 if (m->fsid != monc->get_fsid()) {
7473 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7474 << dendl;
7475 m->put();
7476 return;
7477 }
7478
7479 vector<spg_t> spgs;
7480 _get_pgids(&spgs);
7481
7482 if (!m->scrub_pgs.empty()) {
7483 vector<spg_t> v;
7484 for (auto pgid : m->scrub_pgs) {
7485 spg_t pcand;
7486 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
7487 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7488 v.push_back(pcand);
7489 }
7490 }
7491 spgs.swap(v);
7492 }
7493
7494 for (auto pgid : spgs) {
7495 enqueue_peering_evt(
7496 pgid,
7497 PGPeeringEventRef(
7498 std::make_shared<PGPeeringEvent>(
7499 get_osdmap_epoch(),
7500 get_osdmap_epoch(),
7501 PeeringState::RequestScrub(m->deep, m->repair))));
7502 }
7503
7504 m->put();
7505}
7506
7507void OSD::handle_fast_scrub(MOSDScrub2 *m)
7508{
7509 dout(10) << __func__ << " " << *m << dendl;
7510 if (!require_mon_or_mgr_peer(m)) {
7511 m->put();
7512 return;
7513 }
7514 if (m->fsid != monc->get_fsid()) {
7515 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7516 << dendl;
7517 m->put();
7518 return;
7519 }
7520 for (auto pgid : m->scrub_pgs) {
7521 enqueue_peering_evt(
7522 pgid,
7523 PGPeeringEventRef(
7524 std::make_shared<PGPeeringEvent>(
7525 m->epoch,
7526 m->epoch,
7527 PeeringState::RequestScrub(m->deep, m->repair))));
7528 }
7529 m->put();
7530}
7531
7532bool OSD::scrub_random_backoff()
7533{
7534 bool coin_flip = (rand() / (double)RAND_MAX >=
7535 cct->_conf->osd_scrub_backoff_ratio);
7536 if (!coin_flip) {
7537 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7538 return true;
7539 }
7540 return false;
7541}
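
// Equivalently: a tick backs off with probability osd_scrub_backoff_ratio.
// For example (assuming a ratio of 0.66), roughly two out of every three
// calls return true and skip scrub scheduling, spreading scrub starts out
// over time.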
7542
7543
7544void OSD::sched_scrub()
7545{
7546 auto& scrub_scheduler = service.get_scrub_services();
7547
7548 // fail fast if no resources are available
7549 if (!scrub_scheduler.can_inc_scrubs()) {
7550 dout(20) << __func__ << ": OSD cannot inc scrubs" << dendl;
7551 return;
7552 }
7553
7554  // if there is a PG that is just now trying to reserve scrub replica resources,
7555 // we should wait and not initiate a new scrub
7556 if (scrub_scheduler.is_reserving_now()) {
7557 dout(20) << __func__ << ": scrub resources reservation in progress" << dendl;
7558 return;
7559 }
7560
7561 Scrub::ScrubPreconds env_conditions;
7562
7563 if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
7564 if (!cct->_conf->osd_repair_during_recovery) {
7565 dout(15) << __func__ << ": not scheduling scrubs due to active recovery"
7566 << dendl;
7567 return;
7568 }
7569 dout(10) << __func__
7570 << " will only schedule explicitly requested repair due to active recovery"
7571 << dendl;
7572 env_conditions.allow_requested_repair_only = true;
7573 }
7574
7575 if (g_conf()->subsys.should_gather<ceph_subsys_osd, 20>()) {
7576 dout(20) << __func__ << " sched_scrub starts" << dendl;
7577 auto all_jobs = scrub_scheduler.list_registered_jobs();
7578 for (const auto& sj : all_jobs) {
7579 dout(20) << "sched_scrub scrub-queue jobs: " << *sj << dendl;
7580 }
7581 }
7582
7583 auto was_started = scrub_scheduler.select_pg_and_scrub(env_conditions);
7584 dout(20) << "sched_scrub done (" << ScrubQueue::attempt_res_text(was_started)
7585 << ")" << dendl;
7586}
7587
7588Scrub::schedule_result_t OSDService::initiate_a_scrub(spg_t pgid,
7589 bool allow_requested_repair_only)
7590{
7591 dout(20) << __func__ << " trying " << pgid << dendl;
7592
7593 // we have a candidate to scrub. We need some PG information to know if scrubbing is
7594 // allowed
7595
7596 PGRef pg = osd->lookup_lock_pg(pgid);
7597 if (!pg) {
7598 // the PG was dequeued in the short timespan between creating the candidates list
7599 // (collect_ripe_jobs()) and here
7600 dout(5) << __func__ << " pg " << pgid << " not found" << dendl;
7601 return Scrub::schedule_result_t::no_such_pg;
7602 }
7603
7604 // This has already started, so go on to the next scrub job
7605 if (pg->is_scrub_queued_or_active()) {
7606 pg->unlock();
7607 dout(20) << __func__ << ": already in progress pgid " << pgid << dendl;
7608 return Scrub::schedule_result_t::already_started;
7609 }
7610 // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
7611 if (allow_requested_repair_only && !pg->m_planned_scrub.must_repair) {
7612 pg->unlock();
7613 dout(10) << __func__ << " skip " << pgid
7614 << " because repairing is not explicitly requested on it" << dendl;
7615 return Scrub::schedule_result_t::preconditions;
7616 }
7617
7618 auto scrub_attempt = pg->sched_scrub();
7619 pg->unlock();
7620 return scrub_attempt;
7621}
7622
7623void OSD::resched_all_scrubs()
7624{
7625 dout(10) << __func__ << ": start" << dendl;
7626 auto all_jobs = service.get_scrub_services().list_registered_jobs();
7627 for (auto& e : all_jobs) {
7628
7629 auto& job = *e;
7630 dout(20) << __func__ << ": examine " << job.pgid << dendl;
7631
7632 PGRef pg = _lookup_lock_pg(job.pgid);
7633 if (!pg)
7634 continue;
7635
7636 if (!pg->m_planned_scrub.must_scrub && !pg->m_planned_scrub.need_auto) {
7637 dout(15) << __func__ << ": reschedule " << job.pgid << dendl;
7638 pg->reschedule_scrub();
7639 }
7640 pg->unlock();
7641 }
7642 dout(10) << __func__ << ": done" << dendl;
7643}
7644
7645MPGStats* OSD::collect_pg_stats()
7646{
7647 dout(15) << __func__ << dendl;
7648 // This implementation unconditionally sends every is_primary PG's
7649 // stats every time we're called. This has equivalent cost to the
7650 // previous implementation's worst case where all PGs are busy and
7651 // their stats are always enqueued for sending.
7652 std::shared_lock l{map_lock};
7653
7654 osd_stat_t cur_stat = service.get_osd_stat();
7655 cur_stat.os_perf_stat = store->get_cur_stats();
7656
7657 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
7658 m->osd_stat = cur_stat;
7659
7660 std::lock_guard lec{min_last_epoch_clean_lock};
7661 min_last_epoch_clean = get_osdmap_epoch();
7662 min_last_epoch_clean_pgs.clear();
7663
7664 std::set<int64_t> pool_set;
7665 vector<PGRef> pgs;
7666 _get_pgs(&pgs);
7667 for (auto& pg : pgs) {
7668 auto pool = pg->pg_id.pgid.pool();
7669 pool_set.emplace((int64_t)pool);
7670 if (!pg->is_primary()) {
7671 continue;
7672 }
7673 pg->with_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7674 m->pg_stat[pg->pg_id.pgid] = s;
7675 min_last_epoch_clean = std::min(min_last_epoch_clean, lec);
7676 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7677 });
7678 }
7679 store_statfs_t st;
7680 bool per_pool_stats = false;
7681 bool per_pool_omap_stats = false;
7682 for (auto p : pool_set) {
7683 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
7684 if (r == -ENOTSUP) {
7685 break;
7686 } else {
7687 assert(r >= 0);
7688 m->pool_stat[p] = st;
7689 per_pool_stats = true;
7690 }
7691 }
7692
7693 // indicate whether we are reporting per-pool stats
7694 m->osd_stat.num_osds = 1;
7695 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7696 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
7697
7698 return m;
7699}
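
// Reporting note: num_per_pool_osds / num_per_pool_omap_osds above act as
// per-OSD booleans that the mgr can aggregate to tell whether every OSD is
// reporting per-pool (omap) statistics; an -ENOTSUP from pool_statfs()
// (e.g. an objectstore without per-pool accounting) leaves them at 0.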
7700
7701vector<DaemonHealthMetric> OSD::get_health_metrics()
7702{
7703 vector<DaemonHealthMetric> metrics;
7704 {
7705 utime_t oldest_secs;
7706 const utime_t now = ceph_clock_now();
7707 auto too_old = now;
7708 too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
7709 int slow = 0;
7710 TrackedOpRef oldest_op;
7711 OSDMapRef osdmap = get_osdmap();
7712    // map of slow op counts by slow op event type, for aggregated logging to
7713    // the cluster log.
7714 map<uint8_t, int> slow_op_types;
7715    // map of slow op counts by pool, for reporting the pool with the highest
7716    // slow op count.
7717 map<uint64_t, int> slow_op_pools;
7718 bool log_aggregated_slow_op =
7719 cct->_conf.get_val<bool>("osd_aggregated_slow_ops_logging");
7720 auto count_slow_ops = [&](TrackedOp& op) {
7721 if (op.get_initiated() < too_old) {
7722 stringstream ss;
7723 ss << "slow request " << op.get_desc()
7724 << " initiated "
7725 << op.get_initiated()
7726 << " currently "
7727 << op.state_string();
7728 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7729 if (log_aggregated_slow_op) {
7730 if (const OpRequest *req = dynamic_cast<const OpRequest *>(&op)) {
7731 uint8_t op_type = req->state_flag();
7732 auto m = req->get_req<MOSDFastDispatchOp>();
7733 uint64_t poolid = m->get_spg().pgid.m_pool;
7734 slow_op_types[op_type]++;
7735 if (poolid > 0 && poolid <= (uint64_t) osdmap->get_pool_max()) {
7736 slow_op_pools[poolid]++;
7737 }
7738 }
7739 } else {
7740 clog->warn() << ss.str();
7741 }
7742 slow++;
7743 if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
7744 oldest_op = &op;
7745 }
7746 return true;
7747 } else {
7748 return false;
7749 }
7750 };
7751 if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
7752 if (slow) {
7753 derr << __func__ << " reporting " << slow << " slow ops, oldest is "
7754 << oldest_op->get_desc() << dendl;
7755 if (log_aggregated_slow_op &&
7756 slow_op_types.size() > 0) {
7757 stringstream ss;
7758 ss << slow << " slow requests (by type [ ";
7759 for (const auto& [op_type, count] : slow_op_types) {
7760 ss << "'" << OpRequest::get_state_string(op_type)
7761 << "' : " << count
7762 << " ";
7763 }
7764 auto slow_pool_it = std::max_element(slow_op_pools.begin(), slow_op_pools.end(),
7765 [](std::pair<uint64_t, int> p1, std::pair<uint64_t, int> p2) {
7766 return p1.second < p2.second;
7767 });
7768 if (osdmap->get_pools().find(slow_pool_it->first) != osdmap->get_pools().end()) {
7769 string pool_name = osdmap->get_pool_name(slow_pool_it->first);
7770 ss << "] most affected pool [ '"
7771 << pool_name
7772 << "' : "
7773 << slow_pool_it->second
7774 << " ])";
7775 } else {
7776 ss << "])";
7777 }
7778 lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
7779 clog->warn() << ss.str();
7780 }
7781 }
7782 metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
7783 } else {
7784 // no news is not good news.
7785 metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
7786 }
7787 }
7788 {
7789 std::lock_guard l(pending_creates_lock);
7790 auto n_primaries = pending_creates_from_mon;
7791 for (const auto& create : pending_creates_from_osd) {
7792 if (create.second) {
7793 n_primaries++;
7794 }
7795 }
7796 metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
7797 }
7798 return metrics;
7799}
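
// When aggregated logging is enabled, the cluster-log warning assembled
// above reads roughly like this (illustrative counts and names):
//
//   10 slow requests (by type [ 'delayed' : 6 'queued for pg' : 4 ] most
//   affected pool [ 'rbd' : 8 ])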
7800
7801// =====================================================
7802// MAP
7803
7804void OSD::wait_for_new_map(OpRequestRef op)
7805{
7806 // ask?
7807 if (waiting_for_osdmap.empty()) {
7808 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7809 }
7810
7811 logger->inc(l_osd_waiting_for_map);
7812 waiting_for_osdmap.push_back(op);
7813 op->mark_delayed("wait for new map");
7814}
7815
7816
7817/** update_map
7818 * assimilate new OSDMap(s). scan pgs, etc.
7819 */
7820
7821void OSD::note_down_osd(int peer)
7822{
7823 ceph_assert(ceph_mutex_is_locked(osd_lock));
7824 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7825
7826 std::lock_guard l{heartbeat_lock};
7827 failure_queue.erase(peer);
7828 failure_pending.erase(peer);
7829 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7830 if (p != heartbeat_peers.end()) {
7831 p->second.clear_mark_down();
7832 heartbeat_peers.erase(p);
7833 }
7834}
7835
7836void OSD::note_up_osd(int peer)
7837{
7838 heartbeat_set_peers_need_update();
7839}
7840
7841struct C_OnMapCommit : public Context {
7842 OSD *osd;
7843 epoch_t first, last;
7844 MOSDMap *msg;
7845 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7846 : osd(o), first(f), last(l), msg(m) {}
7847 void finish(int r) override {
7848 osd->_committed_osd_maps(first, last, msg);
7849 msg->put();
7850 }
7851};
7852
7853void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7854{
7855 std::lock_guard l(osdmap_subscribe_lock);
7856 if (latest_subscribed_epoch >= epoch && !force_request)
7857 return;
7858
7859 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7860
7861 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7862 force_request) {
7863 monc->renew_subs();
7864 }
7865}
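
// Sketch of the semantics above (epochs hypothetical): once an epoch has
// been requested, later calls asking for the same or older epochs are
// no-ops unless force_request re-sends the subscription:
//
//   osdmap_subscribe(120, false);  // subscribes
//   osdmap_subscribe(118, false);  // suppressed by latest_subscribed_epoch
//   osdmap_subscribe(118, true);   // forced renewal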
7866
7867void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7868{
7869 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7870 if (min <= superblock.oldest_map)
7871 return;
7872
7873 int num = 0;
7874 ObjectStore::Transaction t;
7875 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7876 dout(20) << " removing old osdmap epoch " << e << dendl;
7877 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7878 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7879 superblock.oldest_map = e + 1;
7880 num++;
7881 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7882 service.publish_superblock(superblock);
7883 write_superblock(t);
7884 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7885 ceph_assert(tr == 0);
7886 num = 0;
7887 if (!skip_maps) {
7888        // skip_maps leaves us with a range of old maps if we fail to remove all
7889        // of them before moving superblock.oldest_map forward to the first map
7890        // in the incoming MOSDMap msg. so we should continue removing them in
7891        // this case, even though it may mean a huge series of delete
7892        // transactions all at once.
7893 break;
7894 }
7895 }
7896 }
7897 if (num > 0) {
7898 service.publish_superblock(superblock);
7899 write_superblock(t);
7900 int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
7901 ceph_assert(tr == 0);
7902 }
7903 // we should not remove the cached maps
7904 ceph_assert(min <= service.map_cache.cached_key_lower_bound());
7905}
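
// Batching sketch for the loop above (numbers illustrative): with
// osd_target_transaction_size = 30 and nreceived = 40, queued removals are
// flushed once 40 epochs have accumulated; without skip_maps the function
// then stops until the next call, while with skip_maps it keeps flushing
// batches until the whole backlog below 'min' is gone.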
7906
7907void OSD::handle_osd_map(MOSDMap *m)
7908{
7909 // wait for pgs to catch up
7910 {
7911    // we extend the map cache pins to accommodate pgs slow to consume maps
7912    // for some period, until we hit the max_lag_factor bound, at which point
7913    // we block here to stop ingesting more maps than they are able to keep
7914    // up with.
7915 epoch_t max_lag = cct->_conf->osd_map_cache_size *
7916 m_osd_pg_epoch_max_lag_factor;
7917 ceph_assert(max_lag > 0);
7918 epoch_t osd_min = 0;
7919 for (auto shard : shards) {
7920 epoch_t min = shard->get_min_pg_epoch();
7921 if (osd_min == 0 || min < osd_min) {
7922 osd_min = min;
7923 }
7924 }
7925 epoch_t osdmap_epoch = get_osdmap_epoch();
7926 if (osd_min > 0 &&
7927 osdmap_epoch > max_lag &&
7928 osdmap_epoch - max_lag > osd_min) {
7929 epoch_t need = osdmap_epoch - max_lag;
7930 dout(10) << __func__ << " waiting for pgs to catch up (need " << need
7931 << " max_lag " << max_lag << ")" << dendl;
7932 for (auto shard : shards) {
7933 epoch_t min = shard->get_min_pg_epoch();
7934 if (need > min) {
7935 dout(10) << __func__ << " waiting for pgs to consume " << need
7936 << " (shard " << shard->shard_id << " min " << min
7937 << ", map cache is " << cct->_conf->osd_map_cache_size
7938 << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
7939 << ")" << dendl;
7940 unlock_guard unlock{osd_lock};
7941 shard->wait_min_pg_epoch(need);
7942 }
7943 }
7944 }
7945 }
7946
7947 ceph_assert(ceph_mutex_is_locked(osd_lock));
7948 map<epoch_t,OSDMapRef> added_maps;
7949 map<epoch_t,bufferlist> added_maps_bl;
7950 if (m->fsid != monc->get_fsid()) {
7951 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7952 << monc->get_fsid() << dendl;
7953 m->put();
7954 return;
7955 }
7956 if (is_initializing()) {
7957 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7958 m->put();
7959 return;
7960 }
7961
7962 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7963 if (session && !(session->entity_name.is_mon() ||
7964 session->entity_name.is_osd())) {
7965 //not enough perms!
7966 dout(10) << "got osd map from Session " << session
7967 << " which we can't take maps from (not a mon or osd)" << dendl;
7968 m->put();
7969 return;
7970 }
7971
7972 // share with the objecter
7973 if (!is_preboot())
7974 service.objecter->handle_osd_map(m);
7975
7976 epoch_t first = m->get_first();
7977 epoch_t last = m->get_last();
7978 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7979 << superblock.newest_map
7980 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7981 << dendl;
7982
7983 logger->inc(l_osd_map);
7984 logger->inc(l_osd_mape, last - first + 1);
7985 if (first <= superblock.newest_map)
7986 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7987 if (service.max_oldest_map < m->oldest_map) {
7988 service.max_oldest_map = m->oldest_map;
7989 ceph_assert(service.max_oldest_map >= superblock.oldest_map);
7990 }
7991
7992 // make sure there is something new, here, before we bother flushing
7993 // the queues and such
7994 if (last <= superblock.newest_map) {
7995 dout(10) << " no new maps here, dropping" << dendl;
7996 m->put();
7997 return;
7998 }
7999
8000 // missing some?
8001 bool skip_maps = false;
8002 if (first > superblock.newest_map + 1) {
8003 dout(10) << "handle_osd_map message skips epochs "
8004 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
8005 if (m->oldest_map <= superblock.newest_map + 1) {
8006 osdmap_subscribe(superblock.newest_map + 1, false);
8007 m->put();
8008 return;
8009 }
8010 // always try to get the full range of maps--as many as we can. this
8011 // 1- is good to have
8012 // 2- is at present the only way to ensure that we get a *full* map as
8013 // the first map!
8014 if (m->oldest_map < first) {
8015 osdmap_subscribe(m->oldest_map - 1, true);
8016 m->put();
8017 return;
8018 }
8019 skip_maps = true;
8020 }
8021
8022 ObjectStore::Transaction t;
8023 uint64_t txn_size = 0;
8024
8025 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;
8026
8027 // store new maps: queue for disk and put in the osdmap cache
8028 epoch_t start = std::max(superblock.newest_map + 1, first);
8029 for (epoch_t e = start; e <= last; e++) {
8030 if (txn_size >= t.get_num_bytes()) {
8031 derr << __func__ << " transaction size overflowed" << dendl;
8032 ceph_assert(txn_size < t.get_num_bytes());
8033 }
8034 txn_size = t.get_num_bytes();
8035 map<epoch_t,bufferlist>::iterator p;
8036 p = m->maps.find(e);
8037 if (p != m->maps.end()) {
8038 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
8039 OSDMap *o = new OSDMap;
8040 bufferlist& bl = p->second;
8041
8042 o->decode(bl);
8043
8044 purged_snaps[e] = o->get_new_purged_snaps();
8045
8046 ghobject_t fulloid = get_osdmap_pobject_name(e);
8047 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
8048 added_maps[e] = add_map(o);
8049 added_maps_bl[e] = bl;
8050 got_full_map(e);
8051 continue;
8052 }
8053
8054 p = m->incremental_maps.find(e);
8055 if (p != m->incremental_maps.end()) {
8056 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
8057 bufferlist& bl = p->second;
8058 ghobject_t oid = get_inc_osdmap_pobject_name(e);
8059 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
8060
8061 OSDMap *o = new OSDMap;
8062 if (e > 1) {
8063 bufferlist obl;
8064 bool got = get_map_bl(e - 1, obl);
8065 if (!got) {
8066 auto p = added_maps_bl.find(e - 1);
8067 ceph_assert(p != added_maps_bl.end());
8068 obl = p->second;
8069 }
8070 o->decode(obl);
8071 }
8072
8073 OSDMap::Incremental inc;
8074 auto p = bl.cbegin();
8075 inc.decode(p);
8076
8077 if (o->apply_incremental(inc) < 0) {
8078 derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
8079 ceph_abort_msg("bad fsid");
8080 }
8081
8082 bufferlist fbl;
8083 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8084
8085 bool injected_failure = false;
8086 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8087 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8088 derr << __func__ << " injecting map crc failure" << dendl;
8089 injected_failure = true;
8090 }
8091
8092 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8093 dout(2) << "got incremental " << e
8094 << " but failed to encode full with correct crc; requesting"
8095 << dendl;
8096 clog->warn() << "failed to encode map e" << e << " with expected crc";
8097 dout(20) << "my encoded map was:\n";
8098 fbl.hexdump(*_dout);
8099 *_dout << dendl;
8100 delete o;
8101 request_full_map(e, last);
8102 last = e - 1;
8103
8104        // don't continue committing if we failed to encode the first inc map
8105 if (last < start) {
8106 dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
8107 m->put();
8108 return;
8109 }
8110 break;
8111 }
8112 got_full_map(e);
8113 purged_snaps[e] = o->get_new_purged_snaps();
8114
8115 ghobject_t fulloid = get_osdmap_pobject_name(e);
8116 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
8117 added_maps[e] = add_map(o);
8118 added_maps_bl[e] = fbl;
8119 continue;
8120 }
8121
8122 ceph_abort_msg("MOSDMap lied about what maps it had?");
8123 }
8124
8125 // even if this map isn't from a mon, we may have satisfied our subscription
8126 monc->sub_got("osdmap", last);
8127
8128 if (!m->maps.empty() && requested_full_first) {
8129 dout(10) << __func__ << " still missing full maps " << requested_full_first
8130 << ".." << requested_full_last << dendl;
8131 rerequest_full_maps();
8132 }
8133
8134 if (superblock.oldest_map) {
8135 // make sure we at least keep pace with incoming maps
8136 trim_maps(m->oldest_map, last - first + 1, skip_maps);
8137 pg_num_history.prune(superblock.oldest_map);
8138 }
8139
8140 if (!superblock.oldest_map || skip_maps)
8141 superblock.oldest_map = first;
8142 superblock.newest_map = last;
8143 superblock.current_epoch = last;
8144
8145 // note in the superblock that we were clean thru the prior epoch
8146 epoch_t boot_epoch = service.get_boot_epoch();
8147 if (boot_epoch && boot_epoch >= superblock.mounted) {
8148 superblock.mounted = boot_epoch;
8149 superblock.clean_thru = last;
8150 }
8151
8152 // check for pg_num changes and deleted pools
8153 OSDMapRef lastmap;
8154 for (auto& i : added_maps) {
8155 if (!lastmap) {
8156 if (!(lastmap = service.try_get_map(i.first - 1))) {
8157 dout(10) << __func__ << " can't get previous map " << i.first - 1
8158 << " probably first start of this osd" << dendl;
8159 continue;
8160 }
8161 }
8162 ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
8163 for (auto& j : lastmap->get_pools()) {
8164 if (!i.second->have_pg_pool(j.first)) {
8165 pg_num_history.log_pool_delete(i.first, j.first);
8166 dout(10) << __func__ << " recording final pg_pool_t for pool "
8167 << j.first << dendl;
8168        // this information is needed by _make_pg() if we have to restart before
8169 // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
8170 ghobject_t obj = make_final_pool_info_oid(j.first);
8171 bufferlist bl;
8172 encode(j.second, bl, CEPH_FEATURES_ALL);
8173 string name = lastmap->get_pool_name(j.first);
8174 encode(name, bl);
8175 map<string,string> profile;
8176 if (lastmap->get_pg_pool(j.first)->is_erasure()) {
8177 profile = lastmap->get_erasure_code_profile(
8178 lastmap->get_pg_pool(j.first)->erasure_code_profile);
8179 }
8180 encode(profile, bl);
8181 t.write(coll_t::meta(), obj, 0, bl.length(), bl);
8182 } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
8183 new_pg_num != j.second.get_pg_num()) {
8184 dout(10) << __func__ << " recording pool " << j.first << " pg_num "
8185 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
8186 pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
8187 }
8188 }
8189 for (auto& j : i.second->get_pools()) {
8190 if (!lastmap->have_pg_pool(j.first)) {
8191 dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
8192 << j.second.get_pg_num() << dendl;
8193 pg_num_history.log_pg_num_change(i.first, j.first,
8194 j.second.get_pg_num());
8195 }
8196 }
8197 lastmap = i.second;
8198 }
8199 pg_num_history.epoch = last;
8200 {
8201 bufferlist bl;
8202 ::encode(pg_num_history, bl);
8203 t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
8204 dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
8205 }
8206
8207 // record new purged_snaps
8208 if (superblock.purged_snaps_last == start - 1) {
8209 SnapMapper::record_purged_snaps(cct, store.get(), service.meta_ch,
8210 make_purged_snaps_oid(), &t,
8211 purged_snaps);
8212 superblock.purged_snaps_last = last;
8213 } else {
8214 dout(10) << __func__ << " superblock purged_snaps_last is "
8215 << superblock.purged_snaps_last
8216 << ", not recording new purged_snaps" << dendl;
8217 }
8218
8219 // superblock and commit
8220 write_superblock(t);
8221 t.register_on_commit(new C_OnMapCommit(this, start, last, m));
8222 store->queue_transaction(
8223 service.meta_ch,
8224 std::move(t));
8225 service.publish_superblock(superblock);
8226}
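
// End-to-end sketch of the ingest above (epochs hypothetical): with
// superblock.newest_map = 100 and an MOSDMap carrying [101..103] where 101
// arrives as an incremental, we load full map 100, apply the incremental,
// re-encode full map 101 and verify its CRC, persist both the incremental
// and the full form, then repeat through 103 before committing the
// superblock with newest_map = 103 and letting C_OnMapCommit drive
// _committed_osd_maps().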
8227
8228void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8229{
8230 dout(10) << __func__ << " " << first << ".." << last << dendl;
8231 if (is_stopping()) {
8232 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8233 return;
8234 }
8235 std::lock_guard l(osd_lock);
8236 if (is_stopping()) {
8237 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8238 return;
8239 }
8240 map_lock.lock();
8241
8242 ceph_assert(first <= last);
8243
8244 bool do_shutdown = false;
8245 bool do_restart = false;
8246 bool network_error = false;
8247 OSDMapRef osdmap = get_osdmap();
8248
8249 // advance through the new maps
8250 for (epoch_t cur = first; cur <= last; cur++) {
8251 dout(10) << " advance to epoch " << cur
8252 << " (<= last " << last
8253 << " <= newest_map " << superblock.newest_map
8254 << ")" << dendl;
8255
8256 OSDMapRef newmap = get_map(cur);
8257 ceph_assert(newmap); // we just cached it above!
8258
8259 // start blocklisting messages sent to peers that go down.
8260 service.pre_publish_map(newmap);
8261
8262 // kill connections to newly down osds
8263 bool waited_for_reservations = false;
8264 set<int> old;
8265 osdmap = get_osdmap();
8266 osdmap->get_all_osds(old);
8267 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8268 if (*p != whoami &&
8269 osdmap->is_up(*p) && // in old map
8270 newmap->is_down(*p)) { // but not the new one
8271 if (!waited_for_reservations) {
8272 service.await_reserved_maps();
8273 waited_for_reservations = true;
8274 }
8275 note_down_osd(*p);
8276 } else if (*p != whoami &&
8277 osdmap->is_down(*p) &&
8278 newmap->is_up(*p)) {
8279 note_up_osd(*p);
8280 }
8281 }
8282
8283 if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
8284 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8285 << dendl;
8286 if (is_booting()) {
8287 // this captures the case where we sent the boot message while
8288 // NOUP was being set on the mon and our boot request was
8289 // dropped, and then later it is cleared. it imperfectly
8290 // handles the case where our original boot message was not
8291 // dropped and we restart even though we might have booted, but
8292 // that is harmless (boot will just take slightly longer).
8293 do_restart = true;
8294 }
8295 }
8296
8297 osdmap = std::move(newmap);
8298 set_osdmap(osdmap);
8299 epoch_t up_epoch;
8300 epoch_t boot_epoch;
8301 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8302 if (!up_epoch &&
8303 osdmap->is_up(whoami) &&
8304 osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
8305 up_epoch = osdmap->get_epoch();
8306 dout(10) << "up_epoch is " << up_epoch << dendl;
8307 if (!boot_epoch) {
8308 boot_epoch = osdmap->get_epoch();
8309 dout(10) << "boot_epoch is " << boot_epoch << dendl;
8310 }
8311 service.set_epochs(&boot_epoch, &up_epoch, NULL);
8312 }
8313 }
8314
8315 epoch_t _bind_epoch = service.get_bind_epoch();
8316 if (osdmap->is_up(whoami) &&
8317 osdmap->get_addrs(whoami).legacy_equals(
8318 client_messenger->get_myaddrs()) &&
8319 _bind_epoch < osdmap->get_up_from(whoami)) {
8320
8321 if (is_booting()) {
8322 dout(1) << "state: booting -> active" << dendl;
8323 set_state(STATE_ACTIVE);
8324 do_restart = false;
8325
8326 // set incarnation so that osd_reqid_t's we generate for our
8327 // objecter requests are unique across restarts.
8328 service.objecter->set_client_incarnation(osdmap->get_epoch());
8329 cancel_pending_failures();
8330 }
8331 }
8332
8333 if (osdmap->get_epoch() > 0 &&
8334 is_active()) {
8335 if (!osdmap->exists(whoami)) {
8336 derr << "map says i do not exist. shutting down." << dendl;
8337 do_shutdown = true; // don't call shutdown() while we have
8338 // everything paused
8339 } else if (osdmap->is_stop(whoami)) {
8340 derr << "map says i am stopped by admin. shutting down." << dendl;
8341 do_shutdown = true;
8342 } else if (!osdmap->is_up(whoami) ||
8343 !osdmap->get_addrs(whoami).legacy_equals(
8344 client_messenger->get_myaddrs()) ||
8345 !osdmap->get_cluster_addrs(whoami).legacy_equals(
8346 cluster_messenger->get_myaddrs()) ||
8347 !osdmap->get_hb_back_addrs(whoami).legacy_equals(
8348 hb_back_server_messenger->get_myaddrs()) ||
8349 !osdmap->get_hb_front_addrs(whoami).legacy_equals(
8350 hb_front_server_messenger->get_myaddrs())) {
8351 if (!osdmap->is_up(whoami)) {
8352 if (service.is_preparing_to_stop() || service.is_stopping()) {
8353 service.got_stop_ack();
8354 } else {
8355 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8356 "but it is still running";
8357 clog->debug() << "map e" << osdmap->get_epoch()
8358 << " wrongly marked me down at e"
8359 << osdmap->get_down_at(whoami);
8360 }
8361 if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
8362 // note that this is best-effort...
8363 monc->send_mon_message(
8364 new MOSDMarkMeDead(
8365 monc->get_fsid(),
8366 whoami,
8367 osdmap->get_epoch()));
8368 }
8369 } else if (!osdmap->get_addrs(whoami).legacy_equals(
8370 client_messenger->get_myaddrs())) {
8371 clog->error() << "map e" << osdmap->get_epoch()
8372 << " had wrong client addr (" << osdmap->get_addrs(whoami)
8373 << " != my " << client_messenger->get_myaddrs() << ")";
8374 } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
8375 cluster_messenger->get_myaddrs())) {
8376 clog->error() << "map e" << osdmap->get_epoch()
8377 << " had wrong cluster addr ("
8378 << osdmap->get_cluster_addrs(whoami)
8379 << " != my " << cluster_messenger->get_myaddrs() << ")";
8380 } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
8381 hb_back_server_messenger->get_myaddrs())) {
8382 clog->error() << "map e" << osdmap->get_epoch()
8383 << " had wrong heartbeat back addr ("
8384 << osdmap->get_hb_back_addrs(whoami)
8385 << " != my " << hb_back_server_messenger->get_myaddrs()
8386 << ")";
8387 } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
8388 hb_front_server_messenger->get_myaddrs())) {
8389 clog->error() << "map e" << osdmap->get_epoch()
8390 << " had wrong heartbeat front addr ("
8391 << osdmap->get_hb_front_addrs(whoami)
8392 << " != my " << hb_front_server_messenger->get_myaddrs()
8393 << ")";
8394 }
8395
8396 if (!service.is_stopping()) {
8397 epoch_t up_epoch = 0;
8398 epoch_t bind_epoch = osdmap->get_epoch();
8399 service.set_epochs(NULL,&up_epoch, &bind_epoch);
8400 do_restart = true;
8401
8402 //add markdown log
8403 utime_t now = ceph_clock_now();
8404 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8405 osd_markdown_log.push_back(now);
8406 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8407 derr << __func__ << " marked down "
8408 << osd_markdown_log.size()
8409 << " > osd_max_markdown_count "
8410 << cct->_conf->osd_max_markdown_count
8411 << " in last " << grace << " seconds, shutting down"
8412 << dendl;
8413 do_restart = false;
8414 do_shutdown = true;
8415 }
8416
8417 start_waiting_for_healthy();
8418
8419 set<int> avoid_ports;
8420#if defined(__FreeBSD__)
8421      // prevent FreeBSD from grabbing the client_messenger port during
8422      // rebinding, in which case the cluster_messenger would also connect
8423      // to the same port
8424 client_messenger->get_myaddrs().get_ports(&avoid_ports);
8425#endif
8426 cluster_messenger->get_myaddrs().get_ports(&avoid_ports);
8427
8428 int r = cluster_messenger->rebind(avoid_ports);
8429 if (r != 0) {
8430 do_shutdown = true; // FIXME: do_restart?
8431 network_error = true;
8432 derr << __func__ << " marked down:"
8433 << " rebind cluster_messenger failed" << dendl;
8434 }
8435
8436 hb_back_server_messenger->mark_down_all();
8437 hb_front_server_messenger->mark_down_all();
8438 hb_front_client_messenger->mark_down_all();
8439 hb_back_client_messenger->mark_down_all();
8440
8441 reset_heartbeat_peers(true);
8442 }
8443 }
8444 } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) {
8445 derr << "map says i am stopped by admin. shutting down." << dendl;
8446 do_shutdown = true;
8447 }
8448
8449 map_lock.unlock();
8450
8451 check_osdmap_features();
8452
8453 // yay!
8454 consume_map();
8455
8456 if (is_active() || is_waiting_for_healthy())
8457 maybe_update_heartbeat_peers();
8458
8459 if (is_active()) {
8460 activate_map();
8461 }
8462
8463 if (do_shutdown) {
8464 if (network_error) {
8465 cancel_pending_failures();
8466 }
8467 // trigger shutdown in a different thread
8468 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8469 queue_async_signal(SIGINT);
8470 }
8471 else if (m->newest_map && m->newest_map > last) {
8472 dout(10) << " msg say newest map is " << m->newest_map
8473 << ", requesting more" << dendl;
8474 osdmap_subscribe(osdmap->get_epoch()+1, false);
8475 }
8476 else if (is_preboot()) {
8477 if (m->get_source().is_mon())
8478 _preboot(m->oldest_map, m->newest_map);
8479 else
8480 start_boot();
8481 }
8482 else if (do_restart)
8483 start_boot();
8484
8485}
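
// Outcome summary for the function above: do_shutdown takes precedence over
// do_restart. An OSD that was wrongly marked down rebinds its cluster
// messenger and reboots, unless it has accumulated more than
// osd_max_markdown_count markdowns (tracked in osd_markdown_log), in which
// case it shuts down instead.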
8486
8487void OSD::check_osdmap_features()
8488{
8489 // adjust required feature bits?
8490
8491 // we have to be a bit careful here, because we are accessing the
8492 // Policy structures without taking any lock. in particular, only
8493 // modify integer values that can safely be read by a racing CPU.
8494  // since we are only accessing existing Policy structures at their
8495 // current memory location, and setting or clearing bits in integer
8496 // fields, and we are the only writer, this is not a problem.
8497
8498 const auto osdmap = get_osdmap();
8499 {
8500 Messenger::Policy p = client_messenger->get_default_policy();
8501 uint64_t mask;
8502 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8503 if ((p.features_required & mask) != features) {
8504 dout(0) << "crush map has features " << features
8505 << ", adjusting msgr requires for clients" << dendl;
8506 p.features_required = (p.features_required & ~mask) | features;
8507 client_messenger->set_default_policy(p);
8508 }
8509 }
8510 {
8511 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8512 uint64_t mask;
8513 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8514 if ((p.features_required & mask) != features) {
8515 dout(0) << "crush map has features " << features
8516 << " was " << p.features_required
8517 << ", adjusting msgr requires for mons" << dendl;
8518 p.features_required = (p.features_required & ~mask) | features;
8519 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8520 }
8521 }
8522 {
8523 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8524 uint64_t mask;
8525 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8526
8527 if ((p.features_required & mask) != features) {
8528 dout(0) << "crush map has features " << features
8529 << ", adjusting msgr requires for osds" << dendl;
8530 p.features_required = (p.features_required & ~mask) | features;
8531 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8532 }
8533
8534 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8535 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8536 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8537 ObjectStore::Transaction t;
8538 write_superblock(t);
8539 int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
8540 ceph_assert(err == 0);
8541 }
8542 }
8543
8544 if (osdmap->require_osd_release < ceph_release_t::nautilus) {
8545 hb_front_server_messenger->set_require_authorizer(false);
8546 hb_back_server_messenger->set_require_authorizer(false);
8547 } else {
8548 hb_front_server_messenger->set_require_authorizer(true);
8549 hb_back_server_messenger->set_require_authorizer(true);
8550 }
8551
8552 if (osdmap->require_osd_release != last_require_osd_release) {
8553 dout(1) << __func__ << " require_osd_release " << last_require_osd_release
8554 << " -> " << to_string(osdmap->require_osd_release) << dendl;
8555 store->write_meta("require_osd_release",
8556 stringify((int)osdmap->require_osd_release));
8557 last_require_osd_release = osdmap->require_osd_release;
8558 }
8559}
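
// The policy updates above only touch the feature bits selected by 'mask':
//
//   required' = (required & ~mask) | features
//
// e.g. (hypothetical bit patterns) required = 0b1010, mask = 0b0110 and
// features = 0b0100 yield (0b1010 & 0b1001) | 0b0100 = 0b1100: bit 3
// (outside the mask) is preserved, bit 1 is cleared, bit 2 is set.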
8560
8561struct C_FinishSplits : public Context {
8562 OSD *osd;
8563 set<PGRef> pgs;
8564 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8565 : osd(osd), pgs(in) {}
8566 void finish(int r) override {
8567 osd->_finish_splits(pgs);
8568 }
8569};
8570
8571void OSD::_finish_splits(set<PGRef>& pgs)
8572{
8573 dout(10) << __func__ << " " << pgs << dendl;
8574 if (is_stopping())
8575 return;
8576 for (set<PGRef>::iterator i = pgs.begin();
8577 i != pgs.end();
8578 ++i) {
8579 PG *pg = i->get();
8580
8581 PeeringCtx rctx;
8582 pg->lock();
8583 dout(10) << __func__ << " " << *pg << dendl;
8584 epoch_t e = pg->get_osdmap_epoch();
8585 pg->handle_initialize(rctx);
8586 pg->queue_null(e, e);
8587 dispatch_context(rctx, pg, service.get_osdmap());
8588 pg->unlock();
8589
8590 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8591 shards[shard_index]->register_and_wake_split_child(pg);
8592 }
8593};
8594
8595bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8596 unsigned need)
8597{
8598 std::lock_guard l(merge_lock);
8599 auto& p = merge_waiters[nextmap->get_epoch()][target];
8600 p[src->pg_id] = src;
8601 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8602 << " for " << target << ", have " << p.size() << "/" << need
8603 << dendl;
8604 return p.size() == need;
8605}
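
// Merge bookkeeping example (ids hypothetical): a pool shrinking from
// pg_num 4 to 2 merges source 1.2 into target 1.0. Each source registers
// itself here; 'need' is the number of sources the target expects (one, in
// this example), and a true return tells the caller the last source has
// arrived, so the target's merge can be kicked.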
8606
8607bool OSD::advance_pg(
8608 epoch_t osd_epoch,
8609 PG *pg,
8610 ThreadPool::TPHandle &handle,
8611 PeeringCtx &rctx)
8612{
8613 if (osd_epoch <= pg->get_osdmap_epoch()) {
8614 return true;
8615 }
8616 ceph_assert(pg->is_locked());
8617 OSDMapRef lastmap = pg->get_osdmap();
8618 set<PGRef> new_pgs; // any split children
8619 bool ret = true;
8620
8621 unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
8622 lastmap->get_pg_num(pg->pg_id.pool()) : 0;
8623 for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
8624 next_epoch <= osd_epoch;
8625 ++next_epoch) {
8626 OSDMapRef nextmap = service.try_get_map(next_epoch);
8627 if (!nextmap) {
8628 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8629 continue;
8630 }
8631
8632 unsigned new_pg_num =
8633 (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
8634 nextmap->get_pg_num(pg->pg_id.pool()) : 0;
8635 if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
8636 // check for merge
8637 if (nextmap->have_pg_pool(pg->pg_id.pool())) {
8638 spg_t parent;
8639 if (pg->pg_id.is_merge_source(
8640 old_pg_num,
8641 new_pg_num,
8642 &parent)) {
8643 // we are merge source
8644 PGRef spg = pg; // carry a ref
8645 dout(1) << __func__ << " " << pg->pg_id
8646 << " is merge source, target is " << parent
8647 << dendl;
8648 pg->write_if_dirty(rctx);
8649 if (!new_pgs.empty()) {
8650 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8651 new_pgs));
8652 new_pgs.clear();
8653 }
8654 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8655 pg->ch->flush();
8656 // release backoffs explicitly, since the on_shutdown path
8657 // aggressively tears down backoff state.
8658 if (pg->is_primary()) {
8659 pg->release_pg_backoffs();
8660 }
8661 pg->on_shutdown();
8662 OSDShard *sdata = pg->osd_shard;
8663 {
8664 std::lock_guard l(sdata->shard_lock);
8665 if (pg->pg_slot) {
8666 sdata->_detach_pg(pg->pg_slot);
8667 // update pg count now since we might not get an osdmap
8668 // any time soon.
8669 if (pg->is_primary())
8670 logger->dec(l_osd_pg_primary);
8671 else if (pg->is_nonprimary())
8672 logger->dec(l_osd_pg_replica); // misnomer
8673 else
8674 logger->dec(l_osd_pg_stray);
8675 }
8676 }
8677 pg->unlock();
8678
8679 set<spg_t> children;
8680 parent.is_split(new_pg_num, old_pg_num, &children);
8681 if (add_merge_waiter(nextmap, parent, pg, children.size())) {
8682 enqueue_peering_evt(
8683 parent,
8684 PGPeeringEventRef(
8685 std::make_shared<PGPeeringEvent>(
8686 nextmap->get_epoch(),
8687 nextmap->get_epoch(),
8688 NullEvt())));
8689 }
8690 ret = false;
8691 goto out;
8692 } else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
8693 // we are merge target
8694 set<spg_t> children;
8695 pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
8696 dout(20) << __func__ << " " << pg->pg_id
8697 << " is merge target, sources are " << children
8698 << dendl;
8699 map<spg_t,PGRef> sources;
8700 {
8701 std::lock_guard l(merge_lock);
8702 auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
8703 unsigned need = children.size();
8704 dout(20) << __func__ << " have " << s.size() << "/"
8705 << need << dendl;
8706 if (s.size() == need) {
8707 sources.swap(s);
8708 merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
8709 if (merge_waiters[nextmap->get_epoch()].empty()) {
8710 merge_waiters.erase(nextmap->get_epoch());
8711 }
8712 }
8713 }
8714 if (!sources.empty()) {
8715 unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
8716 unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
8717 dout(1) << __func__ << " merging " << pg->pg_id << dendl;
8718 pg->merge_from(
8719 sources, rctx, split_bits,
8720 nextmap->get_pg_pool(
8721 pg->pg_id.pool())->last_pg_merge_meta);
8722 pg->pg_slot->waiting_for_merge_epoch = 0;
8723 } else {
8724 dout(20) << __func__ << " not ready to merge yet" << dendl;
8725 pg->write_if_dirty(rctx);
8726 if (!new_pgs.empty()) {
8727 rctx.transaction.register_on_applied(new C_FinishSplits(this,
8728 new_pgs));
8729 new_pgs.clear();
8730 }
8731 dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
8732 pg->unlock();
8733 // kick source(s) to get them ready
8734 for (auto& i : children) {
8735 dout(20) << __func__ << " kicking source " << i << dendl;
8736 enqueue_peering_evt(
8737 i,
8738 PGPeeringEventRef(
8739 std::make_shared<PGPeeringEvent>(
8740 nextmap->get_epoch(),
8741 nextmap->get_epoch(),
8742 NullEvt())));
8743 }
8744 ret = false;
8745 goto out;
8746 }
8747 }
8748 }
8749 }
8750
8751 vector<int> newup, newacting;
8752 int up_primary, acting_primary;
8753 nextmap->pg_to_up_acting_osds(
8754 pg->pg_id.pgid,
8755 &newup, &up_primary,
8756 &newacting, &acting_primary);
8757 pg->handle_advance_map(
8758 nextmap, lastmap, newup, up_primary,
8759 newacting, acting_primary, rctx);
8760
8761 auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
8762 auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
8763 if (oldpool != lastmap->get_pools().end()
8764 && newpool != nextmap->get_pools().end()) {
8765 dout(20) << __func__
8766 << " new pool opts " << newpool->second.opts
8767 << " old pool opts " << oldpool->second.opts
8768 << dendl;
8769
8770 double old_min_interval = 0, new_min_interval = 0;
8771 oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
8772 newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);
8773
8774 double old_max_interval = 0, new_max_interval = 0;
8775 oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
8776 newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);
8777
8778      // Assume that if an interval changes from set to unset (or vice versa)
8779      // the actual config is different. Keep it simple, even if this can call
8780      // resched_all_scrubs() unnecessarily.
8781 if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
8782 pg->on_info_history_change();
8783 }
8784 }
8785
8786 if (new_pg_num && old_pg_num != new_pg_num) {
8787 // check for split
8788 set<spg_t> children;
8789 if (pg->pg_id.is_split(
8790 old_pg_num,
8791 new_pg_num,
8792 &children)) {
8793 split_pgs(
8794 pg, children, &new_pgs, lastmap, nextmap,
8795 rctx);
8796 }
8797 }
8798
8799 lastmap = nextmap;
8800 old_pg_num = new_pg_num;
8801 handle.reset_tp_timeout();
8802 }
8803 pg->handle_activate_map(rctx);
8804
8805 ret = true;
8806 out:
8807 if (!new_pgs.empty()) {
8808 rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
8809 }
8810 return ret;
8811}
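
// Split example for the pg_num check above (ids hypothetical): a pool
// growing from pg_num 2 to 4 makes pg 1.0 a split parent; is_split()
// reports child 1.2, and split_pgs() instantiates it before the parent
// continues advancing through newer maps.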
8812
8813void OSD::consume_map()
8814{
8815 ceph_assert(ceph_mutex_is_locked(osd_lock));
8816 auto osdmap = get_osdmap();
8817 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8818
8819 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8820 * speak the older sorting version any more. Be careful not to force
8821 * a shutdown if we are merely processing old maps, though.
8822 */
8823 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8824 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8825 ceph_abort();
8826 }
8827
8828 service.pre_publish_map(osdmap);
8829 service.await_reserved_maps();
8830 service.publish_map(osdmap);
8831
8832 // prime splits and merges
8833 set<pair<spg_t,epoch_t>> newly_split; // splits, and when
8834 set<pair<spg_t,epoch_t>> merge_pgs; // merge participants, and when
8835 for (auto& shard : shards) {
8836 shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
8837 }
8838 if (!newly_split.empty()) {
8839 for (auto& shard : shards) {
8840 shard->prime_splits(osdmap, &newly_split);
8841 }
8842 ceph_assert(newly_split.empty());
8843 }
8844
8845 // prune sent_ready_to_merge
8846 service.prune_sent_ready_to_merge(osdmap);
8847
8848 // FIXME, maybe: We could race against an incoming peering message
8849 // that instantiates a merge PG after identify_merges() below and
8850 // never set up its peer to complete the merge. An OSD restart
8851 // would clear it up. This is a hard race to resolve,
8852 // extraordinarily rare (we only merge PGs that are stable and
8853 // clean, so it'd have to be an imported PG to an OSD with a
8854 // slightly stale OSDMap...), so I'm ignoring it for now. We plan to
8855  // replace all of this with seastar-based code soon anyway.
8856 if (!merge_pgs.empty()) {
8857 // mark the pgs we already have, or create new and empty merge
8858 // participants for those we are missing. do this all under the
8859 // shard lock so we don't have to worry about racing pg creates
8860 // via _process.
8861 for (auto& shard : shards) {
8862 shard->prime_merges(osdmap, &merge_pgs);
8863 }
8864 ceph_assert(merge_pgs.empty());
8865 }
8866
8867 service.prune_pg_created();
8868
8869 unsigned pushes_to_free = 0;
8870 for (auto& shard : shards) {
8871 shard->consume_map(osdmap, &pushes_to_free);
8872 }
8873
8874 vector<spg_t> pgids;
8875 _get_pgids(&pgids);
8876
8877 // count (FIXME, probably during seastar rewrite)
8878 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8879 vector<PGRef> pgs;
8880 _get_pgs(&pgs);
8881 for (auto& pg : pgs) {
8882 // FIXME (probably during seastar rewrite): this is lockless and
8883 // racy, but we don't want to take pg lock here.
8884 if (pg->is_primary())
8885 num_pg_primary++;
8886 else if (pg->is_nonprimary())
8887 num_pg_replica++; // misnomer
8888 else
8889 num_pg_stray++;
8890 }
8891
8892 {
8893 // FIXME (as part of seastar rewrite): move to OSDShard
8894 std::lock_guard l(pending_creates_lock);
8895 for (auto pg = pending_creates_from_osd.begin();
8896 pg != pending_creates_from_osd.end();) {
8897 if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
8898 dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
8899 << "discarding pending_create_from_osd" << dendl;
8900 pg = pending_creates_from_osd.erase(pg);
8901 } else {
8902 ++pg;
8903 }
8904 }
8905 }
8906
8907 service.maybe_inject_dispatch_delay();
8908
8909 dispatch_sessions_waiting_on_map();
8910
8911 service.maybe_inject_dispatch_delay();
8912
8913 service.release_reserved_pushes(pushes_to_free);
8914
8915 // queue null events to push maps down to individual PGs
8916 for (auto pgid : pgids) {
8917 enqueue_peering_evt(
8918 pgid,
8919 PGPeeringEventRef(
8920 std::make_shared<PGPeeringEvent>(
8921 osdmap->get_epoch(),
8922 osdmap->get_epoch(),
8923 NullEvt())));
8924 }
8925 logger->set(l_osd_pg, pgids.size());
8926 logger->set(l_osd_pg_primary, num_pg_primary);
8927 logger->set(l_osd_pg_replica, num_pg_replica);
8928 logger->set(l_osd_pg_stray, num_pg_stray);
8929}
8930
8931void OSD::activate_map()
8932{
8933 ceph_assert(ceph_mutex_is_locked(osd_lock));
8934 auto osdmap = get_osdmap();
8935
8936 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8937
8938 // norecover?
8939 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8940 if (!service.recovery_is_paused()) {
8941 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8942 service.pause_recovery();
8943 }
8944 } else {
8945 if (service.recovery_is_paused()) {
8946 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8947 service.unpause_recovery();
8948 }
8949 }
8950
8951 service.activate_map();
8952
8953 // process waiters
8954 take_waiters(waiting_for_osdmap);
8955}
8956
8957bool OSD::require_mon_peer(const Message *m)
8958{
8959 if (!m->get_connection()->peer_is_mon()) {
8960 dout(0) << "require_mon_peer received from non-mon "
8961 << m->get_connection()->get_peer_addr()
8962 << " " << *m << dendl;
8963 return false;
8964 }
8965 return true;
8966}
8967
8968bool OSD::require_mon_or_mgr_peer(const Message *m)
8969{
8970 if (!m->get_connection()->peer_is_mon() &&
8971 !m->get_connection()->peer_is_mgr()) {
8972 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8973 << m->get_connection()->get_peer_addr()
8974 << " " << *m << dendl;
8975 return false;
8976 }
8977 return true;
8978}
8979
8980bool OSD::require_osd_peer(const Message *m)
8981{
8982 if (!m->get_connection()->peer_is_osd()) {
8983 dout(0) << "require_osd_peer received from non-osd "
8984 << m->get_connection()->get_peer_addr()
8985 << " " << *m << dendl;
8986 return false;
8987 }
8988 return true;
8989}
8990
8991bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8992{
8993 epoch_t up_epoch = service.get_up_epoch();
8994 if (epoch < up_epoch) {
8995 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8996 return false;
8997 }
8998
8999 if (!is_active()) {
9000 dout(7) << "still in boot state, dropping message " << *m << dendl;
9001 return false;
9002 }
9003
9004 return true;
9005}
9006
9007bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
9008 bool is_fast_dispatch)
9009{
9010 int from = m->get_source().num();
9011
9012 if (map->is_down(from) ||
9013 (map->get_cluster_addrs(from) != m->get_source_addrs())) {
9014 dout(5) << "from dead osd." << from << ", marking down, "
9015 << " msg was " << m->get_source_inst().addr
9016 << " expected "
9017 << (map->is_up(from) ?
9018 map->get_cluster_addrs(from) : entity_addrvec_t())
9019 << dendl;
9020 ConnectionRef con = m->get_connection();
9021 con->mark_down();
9022 if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
9023 if (!is_fast_dispatch)
9024 s->session_dispatch_lock.lock();
9025 clear_session_waiting_on_map(s);
9026 con->set_priv(nullptr); // break ref <-> session cycle, if any
9027 s->con.reset();
9028 if (!is_fast_dispatch)
9029 s->session_dispatch_lock.unlock();
9030 }
9031 return false;
9032 }
9033 return true;
9034}
9035
9036
9037/*
9038 * require that we have the same (or a newer) map, and that
9039 * the source is the pg primary.
9040 */
9041bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
9042 bool is_fast_dispatch)
9043{
9044 const Message *m = op->get_req();
9045 const auto osdmap = get_osdmap();
9046 dout(15) << "require_same_or_newer_map " << epoch
9047 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
9048
9049 ceph_assert(ceph_mutex_is_locked(osd_lock));
9050
9051 // do they have a newer map?
9052 if (epoch > osdmap->get_epoch()) {
9053 dout(7) << "waiting for newer map epoch " << epoch
9054 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
9055 wait_for_new_map(op);
9056 return false;
9057 }
9058
9059 if (!require_self_aliveness(op->get_req(), epoch)) {
9060 return false;
9061 }
9062
9063 // ok, our map is same or newer.. do they still exist?
9064 if (m->get_connection()->get_messenger() == cluster_messenger &&
9065 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
9066 return false;
9067 }
9068
9069 return true;
9070}
9071
9072
9073
9074
9075
9076// ----------------------------------------
9077// pg creation
9078
9079void OSD::split_pgs(
9080 PG *parent,
9081 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
9082 OSDMapRef curmap,
9083 OSDMapRef nextmap,
9084 PeeringCtx &rctx)
9085{
9086 unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
9087 parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));
9088
9089 vector<object_stat_sum_t> updated_stats;
9090 parent->start_split_stats(childpgids, &updated_stats);
9091
9092 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
9093 for (set<spg_t>::const_iterator i = childpgids.begin();
9094 i != childpgids.end();
9095 ++i, ++stat_iter) {
9096 ceph_assert(stat_iter != updated_stats.end());
9097 dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
9098 PG* child = _make_pg(nextmap, *i);
9099 child->lock(true);
9100 out_pgs->insert(child);
9101 child->ch = store->create_new_collection(child->coll);
9102
9103 {
9104 uint32_t shard_index = i->hash_to_shard(shards.size());
9105 assert(NULL != shards[shard_index]);
9106 store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
9107 }
9108
9109 unsigned split_bits = i->get_split_bits(pg_num);
9110 dout(10) << " pg_num is " << pg_num
9111 << ", m_seed " << i->ps()
9112 << ", split_bits is " << split_bits << dendl;
9113 parent->split_colls(
9114 *i,
9115 split_bits,
9116 i->ps(),
9117 &child->get_pool().info,
9118 rctx.transaction);
9119 parent->split_into(
9120 i->pgid,
9121 child,
9122 split_bits);
9123
9124 child->init_collection_pool_opts();
9125
9126 child->finish_split_stats(*stat_iter, rctx.transaction);
9127 child->unlock();
9128 }
9129 ceph_assert(stat_iter != updated_stats.end());
9130 parent->finish_split_stats(*stat_iter, rctx.transaction);
9131}
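// Conceptually, split_colls() re-homes objects by hash: an object moves to
// the child whose seed matches the low split_bits of the object's hash. A
// simplified sketch of that membership test (an assumption for
// illustration; the authoritative filter lives in the ObjectStore split
// path, not here):
//
//   bool belongs_to_child(uint32_t obj_hash, unsigned split_bits,
//                         uint32_t child_seed) {
//     uint32_t mask = (1u << split_bits) - 1;
//     return (obj_hash & mask) == child_seed;
//   }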
9132
9133/*
9134 * holding osd_lock
9135 */
9136void OSD::handle_pg_create(OpRequestRef op)
9137{
9138 // NOTE: this can be removed in P release (mimic is the last version to
9139 // send MOSDPGCreate messages).
9140
9141 auto m = op->get_req<MOSDPGCreate>();
9142 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
9143
9144 dout(10) << "handle_pg_create " << *m << dendl;
9145
9146 if (!require_mon_peer(op->get_req())) {
9147 return;
9148 }
9149
9150 if (!require_same_or_newer_map(op, m->epoch, false))
9151 return;
9152
9153 op->mark_started();
9154
9155 const auto osdmap = get_osdmap();
9156 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9157 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9158 p != m->mkpg.end();
9159 ++p, ++ci) {
9160 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
9161 epoch_t created = p->second.created;
9162 if (p->second.split_bits) // Skip split pgs
9163 continue;
9164 pg_t on = p->first;
9165
9166 if (!osdmap->have_pg_pool(on.pool())) {
9167 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9168 continue;
9169 }
9170
9171 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9172
9173 spg_t pgid;
9174 bool mapped = osdmap->get_primary_shard(on, &pgid);
9175 ceph_assert(mapped);
9176
9177 // is it still ours?
9178 vector<int> up, acting;
9179 int up_primary = -1;
9180 int acting_primary = -1;
9181 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9182 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
9183
9184 if (acting_primary != whoami) {
9185 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9186 << "), my role=" << role << ", skipping" << dendl;
9187 continue;
9188 }
9189
9190
9191 PastIntervals pi;
9192 pg_history_t history;
9193 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9194
9195 // The mon won't resend unless the primary changed, so we ignore
9196 // same_interval_since. We'll pass this history with the current
9197 // epoch as the event.
9198 if (history.same_primary_since > m->epoch) {
9199 dout(10) << __func__ << ": got obsolete pg create on pgid "
9200 << pgid << " from epoch " << m->epoch
9201 << ", primary changed in " << history.same_primary_since
9202 << dendl;
9203 continue;
9204 }
9205 enqueue_peering_evt(
9206 pgid,
9207 PGPeeringEventRef(
9208 std::make_shared<PGPeeringEvent>(
9209 osdmap->get_epoch(),
9210 osdmap->get_epoch(),
9211 NullEvt(),
9212 true,
9213 new PGCreateInfo(
9214 pgid,
9215 osdmap->get_epoch(),
9216 history,
9217 pi,
9218 true)
9219 )));
9220 }
9221
9222 {
9223 std::lock_guard l(pending_creates_lock);
9224 if (pending_creates_from_mon == 0) {
9225 last_pg_create_epoch = m->epoch;
9226 }
9227 }
9228
9229 maybe_update_heartbeat_peers();
9230}
9231
9232
9233// ----------------------------------------
9234// peering and recovery
9235
9236void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
9237 ThreadPool::TPHandle *handle)
9238{
9239 if (!service.get_osdmap()->is_up(whoami)) {
9240 dout(20) << __func__ << " not up in osdmap" << dendl;
9241 } else if (!is_active()) {
9242 dout(20) << __func__ << " not active" << dendl;
9243 } else {
9244 for (auto& [osd, ls] : ctx.message_map) {
9245 if (!curmap->is_up(osd)) {
9246 dout(20) << __func__ << " skipping down osd." << osd << dendl;
9247 continue;
9248 }
9249 ConnectionRef con = service.get_con_osd_cluster(
9250 osd, curmap->get_epoch());
9251 if (!con) {
9252 dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
9253 << dendl;
9254 continue;
9255 }
9256 service.maybe_share_map(con.get(), curmap);
9257 for (auto m : ls) {
9258 con->send_message2(m);
9259 }
9260 ls.clear();
9261 }
9262 }
9263 if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
9264 int tr = store->queue_transaction(
9265 pg->ch,
9266 std::move(ctx.transaction), TrackedOpRef(),
9267 handle);
9268 ceph_assert(tr == 0);
9269 }
9270}
9271
9272void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
9273{
9274 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9275 if (!require_mon_peer(m)) {
9276 m->put();
9277 return;
9278 }
9279 for (auto& p : m->pgs) {
9280 spg_t pgid = p.first;
9281 epoch_t created = p.second.first;
9282 utime_t created_stamp = p.second.second;
9283 auto q = m->pg_extra.find(pgid);
9284 if (q == m->pg_extra.end()) {
9285 dout(20) << __func__ << " " << pgid << " e" << created
9286 << "@" << created_stamp
9287 << " (no history or past_intervals)" << dendl;
9288 // pre-octopus ... no pg history. this can be removed in Q release.
9289 enqueue_peering_evt(
9290 pgid,
9291 PGPeeringEventRef(
9292 std::make_shared<PGPeeringEvent>(
9293 m->epoch,
9294 m->epoch,
9295 NullEvt(),
9296 true,
9297 new PGCreateInfo(
9298 pgid,
9299 created,
9300 pg_history_t(created, created_stamp),
9301 PastIntervals(),
9302 true)
9303 )));
9304 } else {
9305 dout(20) << __func__ << " " << pgid << " e" << created
9306 << "@" << created_stamp
9307 << " history " << q->second.first
9308 << " pi " << q->second.second << dendl;
9309 if (!q->second.second.empty() &&
9310 m->epoch < q->second.second.get_bounds().second) {
9311 clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
9312 << " and unmatched past_intervals " << q->second.second
9313 << " (history " << q->second.first << ")";
9314 } else {
9315 enqueue_peering_evt(
9316 pgid,
9317 PGPeeringEventRef(
9318 std::make_shared<PGPeeringEvent>(
9319 m->epoch,
9320 m->epoch,
9321 NullEvt(),
9322 true,
9323 new PGCreateInfo(
9324 pgid,
9325 m->epoch,
9326 q->second.first,
9327 q->second.second,
9328 true)
9329 )));
9330 }
9331 }
9332 }
9333
9334 {
9335 std::lock_guard l(pending_creates_lock);
9336 if (pending_creates_from_mon == 0) {
9337 last_pg_create_epoch = m->epoch;
9338 }
9339 }
9340
9341 m->put();
9342}
9343
9344void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
9345{
9346 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9347 if (!require_osd_peer(m)) {
9348 m->put();
9349 return;
9350 }
9351 int from = m->get_source().num();
9352 for (auto& p : m->get_pg_list()) {
9353 spg_t pgid(p.info.pgid.pgid, p.to);
9354 enqueue_peering_evt(
9355 pgid,
9356 PGPeeringEventRef(
9357 std::make_shared<PGPeeringEvent>(
9358 p.epoch_sent,
9359 p.query_epoch,
9360 MNotifyRec(
9361 pgid, pg_shard_t(from, p.from),
9362 p,
9363 m->get_connection()->get_features()),
9364 true,
9365 new PGCreateInfo(
9366 pgid,
9367 p.query_epoch,
9368 p.info.history,
9369 p.past_intervals,
9370 false)
9371 )));
9372 }
9373 m->put();
9374}
9375
9376void OSD::handle_fast_pg_info(MOSDPGInfo* m)
9377{
9378 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9379 if (!require_osd_peer(m)) {
9380 m->put();
9381 return;
9382 }
9383 int from = m->get_source().num();
9384 for (auto& p : m->pg_list) {
9385 enqueue_peering_evt(
9386 spg_t(p.info.pgid.pgid, p.to),
9387 PGPeeringEventRef(
9388 std::make_shared<PGPeeringEvent>(
9389 p.epoch_sent, p.query_epoch,
9390 MInfoRec(
9391 pg_shard_t(from, p.from),
9392 p.info,
9393 p.epoch_sent)))
9394 );
9395 }
9396 m->put();
9397}
9398
9399void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9400{
9401 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9402 if (!require_osd_peer(m)) {
9403 m->put();
9404 return;
9405 }
9406 for (auto& pgid : m->pg_list) {
9407 enqueue_peering_evt(
9408 pgid,
9409 PGPeeringEventRef(
9410 std::make_shared<PGPeeringEvent>(
9411 m->get_epoch(), m->get_epoch(),
9412 PeeringState::DeleteStart())));
9413 }
9414 m->put();
9415}
9416
9417void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9418{
9419 dout(10) << __func__ << " " << *m << dendl;
9420 if (!require_mon_or_mgr_peer(m)) {
9421 m->put();
9422 return;
9423 }
9424 epoch_t epoch = get_osdmap_epoch();
9425 for (auto pgid : m->forced_pgs) {
9426 if (m->options & OFR_BACKFILL) {
9427 if (m->options & OFR_CANCEL) {
9428 enqueue_peering_evt(
9429 pgid,
9430 PGPeeringEventRef(
9431 std::make_shared<PGPeeringEvent>(
9432 epoch, epoch,
9433 PeeringState::UnsetForceBackfill())));
9434 } else {
9435 enqueue_peering_evt(
9436 pgid,
9437 PGPeeringEventRef(
9438 std::make_shared<PGPeeringEvent>(
9439 epoch, epoch,
9440 PeeringState::SetForceBackfill())));
9441 }
9442 } else if (m->options & OFR_RECOVERY) {
9443 if (m->options & OFR_CANCEL) {
9444 enqueue_peering_evt(
9445 pgid,
9446 PGPeeringEventRef(
9447 std::make_shared<PGPeeringEvent>(
9448 epoch, epoch,
9449 PeeringState::UnsetForceRecovery())));
9450 } else {
9451 enqueue_peering_evt(
9452 pgid,
9453 PGPeeringEventRef(
9454 std::make_shared<PGPeeringEvent>(
9455 epoch, epoch,
9456 PeeringState::SetForceRecovery())));
9457 }
9458 }
9459 }
9460 m->put();
9461}
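// These OFR_* combinations map onto the operator commands that the
// mon/mgr translate into MOSDForceRecovery (assuming the standard CLI):
//
//   ceph pg force-recovery <pgid>         -> OFR_RECOVERY
//   ceph pg force-backfill <pgid>         -> OFR_BACKFILL
//   ceph pg cancel-force-recovery <pgid>  -> OFR_RECOVERY | OFR_CANCEL
//   ceph pg cancel-force-backfill <pgid>  -> OFR_BACKFILL | OFR_CANCEL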
9462
9463void OSD::handle_pg_query_nopg(const MQuery& q)
9464{
9465 spg_t pgid = q.pgid;
9466 dout(10) << __func__ << " " << pgid << dendl;
9467
9468 OSDMapRef osdmap = get_osdmap();
9469 if (!osdmap->have_pg_pool(pgid.pool()))
9470 return;
9471
9472 dout(10) << " pg " << pgid << " dne" << dendl;
9473 pg_info_t empty(spg_t(pgid.pgid, q.query.to));
9474 ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
9475 if (con) {
9476 Message *m;
9477 if (q.query.type == pg_query_t::LOG ||
9478 q.query.type == pg_query_t::FULLLOG) {
9479 m = new MOSDPGLog(
9480 q.query.from, q.query.to,
9481 osdmap->get_epoch(), empty,
9482 q.query.epoch_sent);
9483 } else {
9484 pg_notify_t notify{q.query.from, q.query.to,
9485 q.query.epoch_sent,
9486 osdmap->get_epoch(),
9487 empty,
9488 PastIntervals()};
9489 m = new MOSDPGNotify2(spg_t{pgid.pgid, q.query.from},
9490 std::move(notify));
9491 }
9492 service.maybe_share_map(con.get(), osdmap);
9493 con->send_message(m);
9494 }
9495}
9496
9497void OSDService::queue_check_readable(spg_t spgid,
9498 epoch_t lpr,
9499 ceph::signedspan delay)
9500{
9501 if (delay == ceph::signedspan::zero()) {
9502 osd->enqueue_peering_evt(
9503 spgid,
9504 PGPeeringEventRef(
9505 std::make_shared<PGPeeringEvent>(
9506 lpr, lpr,
9507 PeeringState::CheckReadable())));
9508 } else {
9509 mono_timer.add_event(
9510 delay,
9511 [this, spgid, lpr]() {
9512 queue_check_readable(spgid, lpr);
9513 });
9514 }
9515}
9516
9517
9518// =========================================================
9519// RECOVERY
9520
9521void OSDService::_maybe_queue_recovery() {
9522 ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
9523 uint64_t available_pushes;
9524 while (!awaiting_throttle.empty() &&
9525 _recover_now(&available_pushes)) {
9526 uint64_t to_start = std::min(
9527 available_pushes,
9528 cct->_conf->osd_recovery_max_single_start);
9529 _queue_for_recovery(awaiting_throttle.front(), to_start);
9530 awaiting_throttle.pop_front();
9531 dout(10) << __func__ << " starting " << to_start
9532 << ", recovery_ops_reserved " << recovery_ops_reserved
9533 << " -> " << (recovery_ops_reserved + to_start) << dendl;
9534 recovery_ops_reserved += to_start;
9535 }
9536}
9537
9538bool OSDService::_recover_now(uint64_t *available_pushes)
9539{
9540 if (available_pushes)
9541 *available_pushes = 0;
9542
9543 if (ceph_clock_now() < defer_recovery_until) {
9544 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9545 return false;
9546 }
9547
9548 if (recovery_paused) {
9549 dout(15) << __func__ << " paused" << dendl;
9550 return false;
9551 }
9552
9553 uint64_t max = osd->get_recovery_max_active();
9554 if (max <= recovery_ops_active + recovery_ops_reserved) {
9555 dout(15) << __func__ << " active " << recovery_ops_active
9556 << " + reserved " << recovery_ops_reserved
9557 << " >= max " << max << dendl;
9558 return false;
9559 }
9560
9561 if (available_pushes)
9562 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9563
9564 return true;
9565}
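// Worked example of the throttle above (illustrative numbers): with
// osd_recovery_max_active = 3, recovery_ops_active = 1 and
// recovery_ops_reserved = 1,
//
//   available_pushes = 3 - 1 - 1 = 1
//
// so _maybe_queue_recovery() can reserve at most one more push (further
// capped per PG by osd_recovery_max_single_start).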
9566
9567unsigned OSDService::get_target_pg_log_entries() const
9568{
9569 auto num_pgs = osd->get_num_pgs();
9570 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9571 if (num_pgs > 0 && target > 0) {
9572 // target an even spread of our budgeted log entries across all
9573 // PGs. note that while we only get to control the entry count
9574 // for primary PGs, we'll normally be responsible for a mix of
9575 // primary and replica PGs (for the same pool(s) even), so this
9576 // will work out.
9577 return std::max<unsigned>(
9578 std::min<unsigned>(target / num_pgs,
9579 cct->_conf->osd_max_pg_log_entries),
9580 cct->_conf->osd_min_pg_log_entries);
9581 } else {
9582 // fall back to a per-pg value.
9583 return cct->_conf->osd_min_pg_log_entries;
9584 }
9585}
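// For example (illustrative values, not the shipped defaults): with
// osd_target_pg_log_entries_per_osd = 300000 and 200 PGs on this OSD, the
// per-PG budget is 300000 / 200 = 1500 entries, clamped into
// [osd_min_pg_log_entries, osd_max_pg_log_entries]:
//
//   unsigned per_pg = std::max<unsigned>(
//     std::min<unsigned>(300000 / 200, max_entries), min_entries);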
9586
9587void OSD::do_recovery(
9588 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9589 ThreadPool::TPHandle &handle)
9590{
9591 uint64_t started = 0;
9592
9593 /*
9594 * When osd_recovery_sleep is greater than zero, recovery ops are
9595 * scheduled osd_recovery_sleep seconds after the previous recovery
9596 * event's schedule time. This is done by adding a
9597 * recovery_requeue_callback event, which re-queues the recovery op via
9598 * queue_recovery_after_sleep.
9599 */
9600 float recovery_sleep = get_osd_recovery_sleep();
9601 {
9602 std::lock_guard l(service.sleep_lock);
9603 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9604 PGRef pgref(pg);
9605 auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
9606 dout(20) << "do_recovery wake up at "
9607 << ceph_clock_now()
9608 << ", re-queuing recovery" << dendl;
9609 std::lock_guard l(service.sleep_lock);
9610 service.recovery_needs_sleep = false;
9611 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9612 });
9613
9614 // This is true for the first recovery op, and whenever the previous
9615 // recovery op was scheduled in the past. In that case the next recovery
9616 // op is scheduled to run one sleep interval from now.
9617
9618 if (auto now = ceph::real_clock::now();
9619 service.recovery_schedule_time < now) {
9620 service.recovery_schedule_time = now;
9621 }
9622 service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
9623 service.sleep_timer.add_event_at(service.recovery_schedule_time,
9624 recovery_requeue_callback);
9625 dout(20) << "Recovery event scheduled at "
9626 << service.recovery_schedule_time << dendl;
9627 return;
9628 }
9629 }
9630
9631 {
9632 {
9633 std::lock_guard l(service.sleep_lock);
9634 service.recovery_needs_sleep = true;
9635 }
9636
9637 if (pg->pg_has_reset_since(queued)) {
9638 goto out;
9639 }
9640
9641 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9642#ifdef DEBUG_RECOVERY_OIDS
9643 dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
9644#endif
9645
9646 bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
9647 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9648 << " on " << *pg << dendl;
9649
9650 if (do_unfound) {
9651 PeeringCtx rctx;
9652 rctx.handle = &handle;
9653 pg->find_unfound(queued, rctx);
9654 dispatch_context(rctx, pg, pg->get_osdmap());
9655 }
9656 }
9657
9658 out:
9659 ceph_assert(started <= reserved_pushes);
9660 service.release_reserved_pushes(reserved_pushes);
9661}
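// Example of the pacing above, assuming osd_recovery_sleep = 0.1s: if
// recovery work is queued at t=0.00 and again at t=0.03, the first op sets
// recovery_schedule_time to 0.10 and sleeps until then; the second finds
// 0.10 still in the future, advances it to 0.20 and sleeps until 0.20.
// Wake-ups thus stay at least one sleep interval apart even when work
// arrives in bursts.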
9662
9663void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9664{
9665 std::lock_guard l(recovery_lock);
9666 dout(10) << "start_recovery_op " << *pg << " " << soid
9667 << " (" << recovery_ops_active << "/"
9668 << osd->get_recovery_max_active() << " rops)"
9669 << dendl;
9670 recovery_ops_active++;
9671
9672#ifdef DEBUG_RECOVERY_OIDS
9673 dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
9674 ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
9675 recovery_oids[pg->pg_id].insert(soid);
9676#endif
9677}
9678
9679void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9680{
9681 std::lock_guard l(recovery_lock);
9682 dout(10) << "finish_recovery_op " << *pg << " " << soid
9683 << " dequeue=" << dequeue
9684 << " (" << recovery_ops_active << "/"
9685 << osd->get_recovery_max_active() << " rops)"
9686 << dendl;
9687
9688 // adjust count
9689 ceph_assert(recovery_ops_active > 0);
9690 recovery_ops_active--;
9691
9692#ifdef DEBUG_RECOVERY_OIDS
9693 dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
9694 ceph_assert(recovery_oids[pg->pg_id].count(soid));
9695 recovery_oids[pg->pg_id].erase(soid);
9696#endif
9697
9698 _maybe_queue_recovery();
9699}
9700
9701bool OSDService::is_recovery_active()
9702{
9703 if (cct->_conf->osd_debug_pretend_recovery_active) {
9704 return true;
9705 }
9706 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9707}
9708
9709void OSDService::release_reserved_pushes(uint64_t pushes)
9710{
9711 std::lock_guard l(recovery_lock);
9712 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9713 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9714 << dendl;
9715 ceph_assert(recovery_ops_reserved >= pushes);
9716 recovery_ops_reserved -= pushes;
9717 _maybe_queue_recovery();
9718}
9719
9720// =========================================================
9721// OPS
9722
9723bool OSD::op_is_discardable(const MOSDOp *op)
9724{
9725 // drop the client request if the client is no longer connected; it
9726 // can't receive the reply anyway.
9727 if (!op->get_connection()->is_connected()) {
9728 return true;
9729 }
9730 return false;
9731}
9732
9733void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
9734{
9735 const utime_t stamp = op->get_req()->get_recv_stamp();
9736 const utime_t latency = ceph_clock_now() - stamp;
9737 const unsigned priority = op->get_req()->get_priority();
9738 const int cost = op->get_req()->get_cost();
9739 const uint64_t owner = op->get_req()->get_source().num();
9740 const int type = op->get_req()->get_type();
9741
9742 dout(15) << "enqueue_op " << op << " prio " << priority
9743 << " type " << type
9744 << " cost " << cost
9745 << " latency " << latency
9746 << " epoch " << epoch
9747 << " " << *(op->get_req()) << dendl;
9748 op->osd_trace.event("enqueue op");
9749 op->osd_trace.keyval("priority", priority);
9750 op->osd_trace.keyval("cost", cost);
9751
9752 auto enqueue_span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
9753 enqueue_span->AddEvent(__func__, {
9754 {"priority", priority},
9755 {"cost", cost},
9756 {"epoch", epoch},
9757 {"owner", owner},
9758 {"type", type}
9759 });
9760
9761 op->mark_queued_for_pg();
9762 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9763 if (type == MSG_OSD_PG_PUSH ||
9764 type == MSG_OSD_PG_PUSH_REPLY) {
9765 op_shardedwq.queue(
9766 OpSchedulerItem(
9767 unique_ptr<OpSchedulerItem::OpQueueable>(new PGRecoveryMsg(pg, std::move(op))),
9768 cost, priority, stamp, owner, epoch));
9769 } else {
9770 op_shardedwq.queue(
9771 OpSchedulerItem(
9772 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
9773 cost, priority, stamp, owner, epoch));
9774 }
9775}
9776
9777void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9778{
9779 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
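  // Peering items are queued with a nominal cost of 10, the fixed
  // osd_peering_op_priority, an empty receive stamp, owner 0, and are
  // stamped with the epoch the event was sent in.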
9780 op_shardedwq.queue(
9781 OpSchedulerItem(
9782 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9783 10,
9784 cct->_conf->osd_peering_op_priority,
9785 utime_t(),
9786 0,
9787 evt->get_epoch_sent()));
9788}
9789
9790/*
9791 * NOTE: dequeue called in worker thread, with pg lock
9792 */
9793void OSD::dequeue_op(
9794 PGRef pg, OpRequestRef op,
9795 ThreadPool::TPHandle &handle)
9796{
9797 const Message *m = op->get_req();
9798
9799 FUNCTRACE(cct);
9800 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);
9801
9802 utime_t now = ceph_clock_now();
9803 op->set_dequeued_time(now);
9804
9805 utime_t latency = now - m->get_recv_stamp();
9806 dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
9807 << " cost " << m->get_cost()
9808 << " latency " << latency
9809 << " " << *m
9810 << " pg " << *pg << dendl;
9811
9812 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9813
9814 service.maybe_share_map(m->get_connection().get(),
9815 pg->get_osdmap(),
9816 op->sent_epoch);
9817
9818 if (pg->is_deleting())
9819 return;
9820
9821 op->mark_reached_pg();
9822 op->osd_trace.event("dequeue_op");
9823
9824 pg->do_request(op, handle);
9825
9826 // finish
9827 dout(10) << "dequeue_op " << op << " finish" << dendl;
9828 OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
9829}
9830
9831
9832void OSD::dequeue_peering_evt(
9833 OSDShard *sdata,
9834 PG *pg,
9835 PGPeeringEventRef evt,
9836 ThreadPool::TPHandle& handle)
9837{
9838 auto curmap = sdata->get_osdmap();
9839 bool need_up_thru = false;
9840 epoch_t same_interval_since = 0;
9841 if (!pg) {
9842 if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
9843 handle_pg_query_nopg(*q);
9844 } else {
9845 derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
9846 ceph_abort();
9847 }
9848 } else if (PeeringCtx rctx;
9849 advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
9850 pg->do_peering_event(evt, rctx);
9851 if (pg->is_deleted()) {
9852 pg->unlock();
9853 return;
9854 }
9855 dispatch_context(rctx, pg, curmap, &handle);
9856 need_up_thru = pg->get_need_up_thru();
9857 same_interval_since = pg->get_same_interval_since();
9858 pg->unlock();
9859 }
9860
9861 if (need_up_thru) {
9862 queue_want_up_thru(same_interval_since);
9863 }
9864
9865 service.send_pg_temp();
9866}
9867
9868void OSD::dequeue_delete(
9869 OSDShard *sdata,
9870 PG *pg,
9871 epoch_t e,
9872 ThreadPool::TPHandle& handle)
9873{
9874 dequeue_peering_evt(
9875 sdata,
9876 pg,
9877 PGPeeringEventRef(
9878 std::make_shared<PGPeeringEvent>(
9879 e, e,
9880 PeeringState::DeleteSome())),
9881 handle);
9882}
9883
9884
9885
9886// --------------------------------
9887
9888const char** OSD::get_tracked_conf_keys() const
9889{
9890 static const char* KEYS[] = {
9891 "osd_max_backfills",
9892 "osd_min_recovery_priority",
9893 "osd_max_trimming_pgs",
9894 "osd_op_complaint_time",
9895 "osd_op_log_threshold",
9896 "osd_op_history_size",
9897 "osd_op_history_duration",
9898 "osd_op_history_slow_op_size",
9899 "osd_op_history_slow_op_threshold",
9900 "osd_enable_op_tracker",
9901 "osd_map_cache_size",
9902 "osd_pg_epoch_max_lag_factor",
9903 "osd_pg_epoch_persisted_max_stale",
9904 "osd_recovery_sleep",
9905 "osd_recovery_sleep_hdd",
9906 "osd_recovery_sleep_ssd",
9907 "osd_recovery_sleep_hybrid",
9908 "osd_delete_sleep",
9909 "osd_delete_sleep_hdd",
9910 "osd_delete_sleep_ssd",
9911 "osd_delete_sleep_hybrid",
9912 "osd_snap_trim_sleep",
9913 "osd_snap_trim_sleep_hdd",
9914 "osd_snap_trim_sleep_ssd",
9915 "osd_snap_trim_sleep_hybrid",
9916 "osd_scrub_sleep",
9917 "osd_recovery_max_active",
9918 "osd_recovery_max_active_hdd",
9919 "osd_recovery_max_active_ssd",
9920 // clog & admin clog
9921 "clog_to_monitors",
9922 "clog_to_syslog",
9923 "clog_to_syslog_facility",
9924 "clog_to_syslog_level",
9925 "osd_objectstore_fuse",
9926 "clog_to_graylog",
9927 "clog_to_graylog_host",
9928 "clog_to_graylog_port",
9929 "host",
9930 "fsid",
9931 "osd_recovery_delay_start",
9932 "osd_client_message_size_cap",
9933 "osd_client_message_cap",
9934 "osd_heartbeat_min_size",
9935 "osd_heartbeat_interval",
9936 "osd_object_clean_region_max_num_intervals",
9937 "osd_scrub_min_interval",
9938 "osd_scrub_max_interval",
9939 NULL
9940 };
9941 return KEYS;
9942}
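// Only changes to the keys listed above are delivered to
// handle_conf_change() below. For example, a runtime change such as
//
//   ceph config set osd.3 osd_max_backfills 2
//
// (illustrative target and value) arrives with changed containing
// "osd_max_backfills" and, under a non-mclock scheduler, resizes the
// local and remote backfill reservers.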
9943
9944void OSD::handle_conf_change(const ConfigProxy& conf,
9945 const std::set <std::string> &changed)
9946{
9947 std::lock_guard l{osd_lock};
9948
9949 if (changed.count("osd_max_backfills") ||
9950 changed.count("osd_delete_sleep") ||
9951 changed.count("osd_delete_sleep_hdd") ||
9952 changed.count("osd_delete_sleep_ssd") ||
9953 changed.count("osd_delete_sleep_hybrid") ||
9954 changed.count("osd_snap_trim_sleep") ||
9955 changed.count("osd_snap_trim_sleep_hdd") ||
9956 changed.count("osd_snap_trim_sleep_ssd") ||
9957 changed.count("osd_snap_trim_sleep_hybrid") ||
9958 changed.count("osd_scrub_sleep") ||
9959 changed.count("osd_recovery_sleep") ||
9960 changed.count("osd_recovery_sleep_hdd") ||
9961 changed.count("osd_recovery_sleep_ssd") ||
9962 changed.count("osd_recovery_sleep_hybrid") ||
9963 changed.count("osd_recovery_max_active") ||
9964 changed.count("osd_recovery_max_active_hdd") ||
9965 changed.count("osd_recovery_max_active_ssd")) {
9966 if (!maybe_override_options_for_qos() &&
9967 changed.count("osd_max_backfills")) {
9968 // Scheduler is not "mclock". Fallback to earlier behavior
9969 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9970 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9971 }
9972 }
9973 if (changed.count("osd_min_recovery_priority")) {
9974 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9975 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9976 }
9977 if (changed.count("osd_max_trimming_pgs")) {
9978 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9979 }
9980 if (changed.count("osd_op_complaint_time") ||
9981 changed.count("osd_op_log_threshold")) {
9982 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9983 cct->_conf->osd_op_log_threshold);
9984 }
9985 if (changed.count("osd_op_history_size") ||
9986 changed.count("osd_op_history_duration")) {
9987 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9988 cct->_conf->osd_op_history_duration);
9989 }
9990 if (changed.count("osd_op_history_slow_op_size") ||
9991 changed.count("osd_op_history_slow_op_threshold")) {
9992 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9993 cct->_conf->osd_op_history_slow_op_threshold);
9994 }
9995 if (changed.count("osd_enable_op_tracker")) {
9996 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9997 }
9998 if (changed.count("osd_map_cache_size")) {
9999 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
10000 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
10001 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
10002 }
10003 if (changed.count("clog_to_monitors") ||
10004 changed.count("clog_to_syslog") ||
10005 changed.count("clog_to_syslog_level") ||
10006 changed.count("clog_to_syslog_facility") ||
10007 changed.count("clog_to_graylog") ||
10008 changed.count("clog_to_graylog_host") ||
10009 changed.count("clog_to_graylog_port") ||
10010 changed.count("host") ||
10011 changed.count("fsid")) {
10012 update_log_config();
10013 }
10014 if (changed.count("osd_pg_epoch_max_lag_factor")) {
10015 m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
10016 "osd_pg_epoch_max_lag_factor");
10017 }
10018
10019#ifdef HAVE_LIBFUSE
10020 if (changed.count("osd_objectstore_fuse")) {
10021 if (store) {
10022 enable_disable_fuse(false);
10023 }
10024 }
10025#endif
10026
10027 if (changed.count("osd_recovery_delay_start")) {
10028 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
10029 service.kick_recovery_queue();
10030 }
10031
10032 if (changed.count("osd_client_message_cap")) {
10033 uint64_t newval = cct->_conf->osd_client_message_cap;
10034 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10035 if (pol.throttler_messages) {
10036 pol.throttler_messages->reset_max(newval);
10037 }
10038 }
10039 if (changed.count("osd_client_message_size_cap")) {
10040 uint64_t newval = cct->_conf->osd_client_message_size_cap;
10041 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
10042 if (pol.throttler_bytes) {
10043 pol.throttler_bytes->reset_max(newval);
10044 }
10045 }
10046 if (changed.count("osd_object_clean_region_max_num_intervals")) {
10047 ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
10048 }
10049
10050 if (changed.count("osd_scrub_min_interval") ||
10051 changed.count("osd_scrub_max_interval")) {
10052 resched_all_scrubs();
10053 dout(0) << __func__ << ": scrub interval change" << dendl;
10054 }
10055 check_config();
10056 if (changed.count("osd_asio_thread_count")) {
10057 service.poolctx.stop();
10058 service.poolctx.start(conf.get_val<std::uint64_t>("osd_asio_thread_count"));
10059 }
10060}
10061
10062void OSD::maybe_override_max_osd_capacity_for_qos()
10063{
10064 // If the enabled scheduler is mclock, override the default
10065 // osd capacity with the value obtained from running the
10066 // osd bench test. This is later used to set up mclock.
10067 if ((cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler") &&
10068 (cct->_conf.get_val<bool>("osd_mclock_skip_benchmark") == false) &&
10069 (!unsupported_objstore_for_qos())) {
10070 std::string max_capacity_iops_config;
10071 bool force_run_benchmark =
10072 cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
10073
10074 if (store_is_rotational) {
10075 max_capacity_iops_config = "osd_mclock_max_capacity_iops_hdd";
10076 } else {
10077 max_capacity_iops_config = "osd_mclock_max_capacity_iops_ssd";
10078 }
10079
10080 if (!force_run_benchmark) {
10081 double default_iops = 0.0;
10082
10083 // Get the current osd iops capacity
10084 double cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
10085
10086 // Get the default max iops capacity
10087 auto val = cct->_conf.get_val_default(max_capacity_iops_config);
10088 if (!val.has_value()) {
10089 derr << __func__ << " Unable to determine default value of "
10090 << max_capacity_iops_config << dendl;
10091 // Cannot determine default iops. Force a run of the OSD benchmark.
10092 force_run_benchmark = true;
10093 } else {
10094 // Default iops
10095 default_iops = std::stod(val.value());
10096 }
10097
10098 // Determine if we really need to run the osd benchmark
10099 if (!force_run_benchmark && (default_iops != cur_iops)) {
10100 dout(1) << __func__ << std::fixed << std::setprecision(2)
10101 << " default_iops: " << default_iops
10102 << " cur_iops: " << cur_iops
10103 << ". Skip OSD benchmark test." << dendl;
10104 return;
10105 }
10106 }
10107
10108 // Run osd bench: write 100 4MiB objects with blocksize 4KiB
10109 int64_t count = 12288000; // Count of bytes to write
10110 int64_t bsize = 4096; // Block size
10111 int64_t osize = 4194304; // Object size
10112 int64_t onum = 100; // Count of objects to write
10113 double elapsed = 0.0; // Time taken to complete the test
10114 double iops = 0.0;
10115 stringstream ss;
10116 int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
10117 if (ret != 0) {
10118 derr << __func__
10119 << " osd bench err: " << ret
10120 << " osd bench errstr: " << ss.str()
10121 << dendl;
10122 return;
10123 }
10124
10125 double rate = count / elapsed;
10126 iops = rate / bsize;
10127 dout(1) << __func__
10128 << " osd bench result -"
10129 << std::fixed << std::setprecision(3)
10130 << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
10131 << " iops: " << iops
10132 << " elapsed_sec: " << elapsed
10133 << dendl;
10134
10135 // Persist iops to the MON store
10136 ret = mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
10137 if (ret < 0) {
10138 // Fall back to setting the config within the in-memory "values" map.
10139 cct->_conf.set_val(max_capacity_iops_config, std::to_string(iops));
10140 }
10141
10142 // Override the max osd capacity for all shards
10143 for (auto& shard : shards) {
10144 shard->update_scheduler_config();
10145 }
10146 }
10147}
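// Worked example of the bench math above: with count = 12288000 bytes and
// an assumed elapsed = 4.0 sec,
//
//   rate = 12288000 / 4.0 = 3072000 bytes/sec (~2.93 MiB/sec)
//   iops = 3072000 / 4096 = 750
//
// so 750 would be persisted as the osd_mclock_max_capacity_iops_{hdd,ssd}
// value for this OSD.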
10148
10149bool OSD::maybe_override_options_for_qos()
10150{
10151 // If the enabled scheduler is mclock, override the recovery, backfill
10152 // and sleep options so that mclock can meet the QoS goals.
10153 if (cct->_conf.get_val<std::string>("osd_op_queue") == "mclock_scheduler" &&
10154 !unsupported_objstore_for_qos()) {
10155 dout(1) << __func__
10156 << ": Changing recovery/backfill/sleep settings for QoS" << dendl;
10157
10158 // Set a high value for recovery max active
10159 uint32_t rec_max_active = 1000;
10160 cct->_conf.set_val(
10161 "osd_recovery_max_active", std::to_string(rec_max_active));
10162 cct->_conf.set_val(
10163 "osd_recovery_max_active_hdd", std::to_string(rec_max_active));
10164 cct->_conf.set_val(
10165 "osd_recovery_max_active_ssd", std::to_string(rec_max_active));
10166
10167 // Set a high value for osd_max_backfills
10168 uint32_t max_backfills = 1000;
10169 cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
10170 service.local_reserver.set_max(max_backfills);
10171 service.remote_reserver.set_max(max_backfills);
10172
10173 // Disable recovery sleep
10174 cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
10175 cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
10176 cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
10177 cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
10178
10179 // Disable delete sleep
10180 cct->_conf.set_val("osd_delete_sleep", std::to_string(0));
10181 cct->_conf.set_val("osd_delete_sleep_hdd", std::to_string(0));
10182 cct->_conf.set_val("osd_delete_sleep_ssd", std::to_string(0));
10183 cct->_conf.set_val("osd_delete_sleep_hybrid", std::to_string(0));
10184
10185 // Disable snap trim sleep
10186 cct->_conf.set_val("osd_snap_trim_sleep", std::to_string(0));
10187 cct->_conf.set_val("osd_snap_trim_sleep_hdd", std::to_string(0));
10188 cct->_conf.set_val("osd_snap_trim_sleep_ssd", std::to_string(0));
10189 cct->_conf.set_val("osd_snap_trim_sleep_hybrid", std::to_string(0));
10190
10191 // Disable scrub sleep
10192 cct->_conf.set_val("osd_scrub_sleep", std::to_string(0));
10193 return true;
10194 }
10195 return false;
10196}
10197
10198int OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
10199{
10200 std::string cmd =
10201 "{"
10202 "\"prefix\": \"config set\", "
10203 "\"who\": \"osd." + std::to_string(whoami) + "\", "
10204 "\"name\": \"" + key + "\", "
10205 "\"value\": \"" + val + "\""
10206 "}";
10207
10208 vector<std::string> vcmd{cmd};
10209 bufferlist inbl;
10210 std::string outs;
10211 C_SaferCond cond;
10212 monc->start_mon_command(vcmd, inbl, nullptr, &outs, &cond);
10213 int r = cond.wait();
10214 if (r < 0) {
10215 derr << __func__ << " Failed to set config key " << key
10216 << " err: " << cpp_strerror(r)
10217 << " errstr: " << outs << dendl;
10218 return r;
10219 }
10220
10221 return 0;
10222}
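// For instance, a (hypothetical) call from osd.3 such as
//
//   mon_cmd_set_config("osd_mclock_max_capacity_iops_ssd", "750.000000");
//
// issues the mon command
//
//   {"prefix": "config set", "who": "osd.3",
//    "name": "osd_mclock_max_capacity_iops_ssd", "value": "750.000000"}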
10223
10224bool OSD::unsupported_objstore_for_qos()
10225{
10226 static const std::vector<std::string> unsupported_objstores = { "filestore" };
10227 return std::find(unsupported_objstores.begin(),
10228 unsupported_objstores.end(),
10229 store->get_type()) != unsupported_objstores.end();
10230}
10231
10232void OSD::update_log_config()
10233{
10234 auto parsed_options = clog->parse_client_options(cct);
10235 derr << "log_to_monitors " << parsed_options.log_to_monitors << dendl;
10236}
10237
10238void OSD::check_config()
10239{
10240 // some sanity checks
10241 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
10242 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
10243 << " is not > osd_pg_epoch_persisted_max_stale ("
10244 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
10245 }
10246 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
10247 clog->warn() << "osd_object_clean_region_max_num_intervals ("
10248 << cct->_conf->osd_object_clean_region_max_num_intervals
10249 << ") is < 0";
10250 }
10251}
10252
10253// --------------------------------
10254
10255void OSD::get_latest_osdmap()
10256{
10257 dout(10) << __func__ << " -- start" << dendl;
10258
10259 boost::system::error_code ec;
10260 service.objecter->wait_for_latest_osdmap(ceph::async::use_blocked[ec]);
10261
10262 dout(10) << __func__ << " -- finish" << dendl;
10263}
10264
10265// --------------------------------
10266
10267void OSD::set_perf_queries(const ConfigPayload &config_payload) {
10268 const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
10269 const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
10270 dout(10) << "setting " << queries.size() << " queries" << dendl;
10271
10272 std::list<OSDPerfMetricQuery> supported_queries;
10273 for (auto &it : queries) {
10274 auto &query = it.first;
10275 if (!query.key_descriptor.empty()) {
10276 supported_queries.push_back(query);
10277 }
10278 }
10279 if (supported_queries.size() < queries.size()) {
10280 dout(1) << queries.size() - supported_queries.size()
10281 << " unsupported queries" << dendl;
10282 }
10283 {
10284 std::lock_guard locker{m_perf_queries_lock};
10285 m_perf_queries = supported_queries;
10286 m_perf_limits = queries;
10287 }
10288 std::vector<PGRef> pgs;
10289 _get_pgs(&pgs);
10290 for (auto& pg : pgs) {
10291 std::scoped_lock l{*pg};
10292 pg->set_dynamic_perf_stats_queries(supported_queries);
10293 }
10294}
10295
10296MetricPayload OSD::get_perf_reports() {
10297 OSDMetricPayload payload;
10298 std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;
10299
10300 std::vector<PGRef> pgs;
10301 _get_pgs(&pgs);
10302 DynamicPerfStats dps;
10303 for (auto& pg : pgs) {
10304 // m_perf_queries can be modified only in set_perf_queries by mgr client
10305 // request, and it is protected by the mgr client's lock, which is held
10306 // when set_perf_queries/get_perf_reports are called, so we need not hold
10307 // m_perf_queries_lock here.
10308 DynamicPerfStats pg_dps(m_perf_queries);
10309 pg->lock();
10310 pg->get_dynamic_perf_stats(&pg_dps);
10311 pg->unlock();
10312 dps.merge(pg_dps);
10313 }
10314 dps.add_to_reports(m_perf_limits, &reports);
10315 dout(20) << "reports for " << reports.size() << " queries" << dendl;
10316
10317 return payload;
10318}
10319
10320// =============================================================
10321
10322#undef dout_context
10323#define dout_context cct
10324#undef dout_prefix
10325#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
10326
10327void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
10328{
10329 dout(10) << pg->pg_id << " " << pg << dendl;
10330 slot->pg = pg;
10331 pg->osd_shard = this;
10332 pg->pg_slot = slot;
10333 osd->inc_num_pgs();
10334
10335 slot->epoch = pg->get_osdmap_epoch();
10336 pg_slots_by_epoch.insert(*slot);
10337}
10338
10339void OSDShard::_detach_pg(OSDShardPGSlot *slot)
10340{
10341 dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
10342 slot->pg->osd_shard = nullptr;
10343 slot->pg->pg_slot = nullptr;
10344 slot->pg = nullptr;
10345 osd->dec_num_pgs();
10346
10347 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10348 slot->epoch = 0;
10349 if (waiting_for_min_pg_epoch) {
10350 min_pg_epoch_cond.notify_all();
10351 }
10352}
10353
10354void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
10355{
10356 std::lock_guard l(shard_lock);
10357 dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
10358 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10359 pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
10360 dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
10361 slot->epoch = e;
10362 pg_slots_by_epoch.insert(*slot);
10363 dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
10364 << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
10365 if (waiting_for_min_pg_epoch) {
10366 min_pg_epoch_cond.notify_all();
10367 }
10368}
10369
10370epoch_t OSDShard::get_min_pg_epoch()
10371{
10372 std::lock_guard l(shard_lock);
10373 auto p = pg_slots_by_epoch.begin();
10374 if (p == pg_slots_by_epoch.end()) {
10375 return 0;
10376 }
10377 return p->epoch;
10378}
10379
10380void OSDShard::wait_min_pg_epoch(epoch_t need)
10381{
10382 std::unique_lock l{shard_lock};
10383 ++waiting_for_min_pg_epoch;
10384 min_pg_epoch_cond.wait(l, [need, this] {
10385 if (pg_slots_by_epoch.empty()) {
10386 return true;
10387 } else if (pg_slots_by_epoch.begin()->epoch >= need) {
10388 return true;
10389 } else {
10390 dout(10) << need << " waiting on "
10391 << pg_slots_by_epoch.begin()->epoch << dendl;
10392 return false;
10393 }
10394 });
10395 --waiting_for_min_pg_epoch;
10396}
10397
10398epoch_t OSDShard::get_max_waiting_epoch()
10399{
10400 std::lock_guard l(shard_lock);
10401 epoch_t r = 0;
10402 for (auto& i : pg_slots) {
10403 if (!i.second->waiting_peering.empty()) {
10404 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10405 }
10406 }
10407 return r;
10408}
10409
10410void OSDShard::consume_map(
10411 const OSDMapRef& new_osdmap,
10412 unsigned *pushes_to_free)
10413{
10414 std::lock_guard l(shard_lock);
10415 OSDMapRef old_osdmap;
10416 {
10417 std::lock_guard l(osdmap_lock);
10418 old_osdmap = std::move(shard_osdmap);
10419 shard_osdmap = new_osdmap;
10420 }
10421 dout(10) << new_osdmap->get_epoch()
10422 << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
10423 << dendl;
10424 int queued = 0;
10425
10426 // check slots
10427 auto p = pg_slots.begin();
10428 while (p != pg_slots.end()) {
10429 OSDShardPGSlot *slot = p->second.get();
10430 const spg_t& pgid = p->first;
10431 dout(20) << __func__ << " " << pgid << dendl;
10432 if (!slot->waiting_for_split.empty()) {
10433 dout(20) << __func__ << " " << pgid
10434 << " waiting for split " << slot->waiting_for_split << dendl;
10435 ++p;
10436 continue;
10437 }
10438 if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
10439 dout(20) << __func__ << " " << pgid
10440 << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
10441 << dendl;
10442 ++p;
10443 continue;
10444 }
10445 if (!slot->waiting_peering.empty()) {
10446 epoch_t first = slot->waiting_peering.begin()->first;
10447 if (first <= new_osdmap->get_epoch()) {
10448 dout(20) << __func__ << " " << pgid
10449 << " pending_peering first epoch " << first
10450 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
10451 queued += _wake_pg_slot(pgid, slot);
10452 }
10453 ++p;
10454 continue;
10455 }
10456 if (!slot->waiting.empty()) {
10457 if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
10458 dout(20) << __func__ << " " << pgid << " maps to us, keeping"
10459 << dendl;
10460 ++p;
10461 continue;
10462 }
10463 while (!slot->waiting.empty() &&
10464 slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
10465 auto& qi = slot->waiting.front();
10466 dout(20) << __func__ << " " << pgid
10467 << " waiting item " << qi
10468 << " epoch " << qi.get_map_epoch()
10469 << " <= " << new_osdmap->get_epoch()
10470 << ", "
10471 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
10472 "misdirected")
10473 << ", dropping" << dendl;
10474 *pushes_to_free += qi.get_reserved_pushes();
10475 slot->waiting.pop_front();
10476 }
10477 }
10478 if (slot->waiting.empty() &&
10479 slot->num_running == 0 &&
10480 slot->waiting_for_split.empty() &&
10481 !slot->pg) {
10482 dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
10483 p = pg_slots.erase(p);
10484 continue;
10485 }
10486
10487 ++p;
10488 }
10489 if (queued) {
10490 std::lock_guard l{sdata_wait_lock};
10491 if (queued == 1)
10492 sdata_cond.notify_one();
10493 else
10494 sdata_cond.notify_all();
10495 }
10496}
10497
10498int OSDShard::_wake_pg_slot(
10499 spg_t pgid,
10500 OSDShardPGSlot *slot)
10501{
10502 int count = 0;
10503 dout(20) << __func__ << " " << pgid
10504 << " to_process " << slot->to_process
10505 << " waiting " << slot->waiting
10506 << " waiting_peering " << slot->waiting_peering << dendl;
10507 for (auto i = slot->to_process.rbegin();
10508 i != slot->to_process.rend();
10509 ++i) {
10510 scheduler->enqueue_front(std::move(*i));
10511 count++;
10512 }
10513 slot->to_process.clear();
10514 for (auto i = slot->waiting.rbegin();
10515 i != slot->waiting.rend();
10516 ++i) {
10517 scheduler->enqueue_front(std::move(*i));
10518 count++;
10519 }
10520 slot->waiting.clear();
10521 for (auto i = slot->waiting_peering.rbegin();
10522 i != slot->waiting_peering.rend();
10523 ++i) {
10524 // this is overkill; we requeue everything, even if some of these
10525 // items are waiting for maps we don't have yet. FIXME, maybe,
10526 // someday, if we decide this inefficiency matters
10527 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
10528 scheduler->enqueue_front(std::move(*j));
10529 count++;
10530 }
10531 }
10532 slot->waiting_peering.clear();
10533 ++slot->requeue_seq;
10534 return count;
10535}
10536
10537void OSDShard::identify_splits_and_merges(
10538 const OSDMapRef& as_of_osdmap,
10539 set<pair<spg_t,epoch_t>> *split_pgs,
10540 set<pair<spg_t,epoch_t>> *merge_pgs)
10541{
10542 std::lock_guard l(shard_lock);
10543 if (shard_osdmap) {
10544 for (auto& i : pg_slots) {
10545 const spg_t& pgid = i.first;
10546 auto *slot = i.second.get();
10547 if (slot->pg) {
10548 osd->service.identify_splits_and_merges(
10549 shard_osdmap, as_of_osdmap, pgid,
10550 split_pgs, merge_pgs);
10551 } else if (!slot->waiting_for_split.empty()) {
10552 osd->service.identify_splits_and_merges(
10553 shard_osdmap, as_of_osdmap, pgid,
10554 split_pgs, nullptr);
10555 } else {
10556 dout(20) << __func__ << " slot " << pgid
10557 << " has no pg and waiting_for_split " << dendl;
10558 }
10559 }
10560 }
10561}
10562
10563void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10564 set<pair<spg_t,epoch_t>> *pgids)
10565{
10566 std::lock_guard l(shard_lock);
10567 _prime_splits(pgids);
10568 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10569 set<pair<spg_t,epoch_t>> newer_children;
10570 for (auto i : *pgids) {
10571 osd->service.identify_splits_and_merges(
10572 as_of_osdmap, shard_osdmap, i.first,
10573 &newer_children, nullptr);
10574 }
10575 newer_children.insert(pgids->begin(), pgids->end());
10576 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10577 << shard_osdmap->get_epoch() << ", new children " << newer_children
10578 << dendl;
10579 _prime_splits(&newer_children);
10580 // note: we don't care what is left over here for other shards.
10581 // if this shard's map is ahead of another shard's, e.g., one thread is
10582 // calling into prime_splits via _process (due to a newly created
10583 // pg) while this shard has a newer map due to a racing consume_map,
10584 // then any grandchildren left here will be identified (or were
10585 // identified) when the slower shard's osdmap is advanced.
10586 // _prime_splits() will tolerate the case where the pgid is
10587 // already primed.
10588 }
10589}
10590
10591void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10592{
10593 dout(10) << *pgids << dendl;
10594 auto p = pgids->begin();
10595 while (p != pgids->end()) {
10596 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10597 if (shard_index == shard_id) {
10598 auto r = pg_slots.emplace(p->first, nullptr);
10599 if (r.second) {
10600 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10601 r.first->second = make_unique<OSDShardPGSlot>();
10602 r.first->second->waiting_for_split.insert(p->second);
10603 } else {
10604 auto q = r.first;
10605 ceph_assert(q != pg_slots.end());
10606 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10607 << dendl;
10608 q->second->waiting_for_split.insert(p->second);
10609 }
10610 p = pgids->erase(p);
10611 } else {
10612 ++p;
10613 }
10614 }
10615}
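// Each child pgid is primed only on the shard it hashes to; entries for
// other shards are left in *pgids for the caller to route. As a sketch of
// the routing (assuming hash_to_shard() reduces the placement seed modulo
// the shard count): with 8 shards, pg 2.13 (seed 0x13 = 19) lands on
// shard 19 % 8 = 3.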
10616
10617void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10618 set<pair<spg_t,epoch_t>> *merge_pgs)
10619{
10620 std::lock_guard l(shard_lock);
10621 dout(20) << __func__ << " checking shard " << shard_id
10622 << " for remaining merge pgs " << merge_pgs << dendl;
10623 auto p = merge_pgs->begin();
10624 while (p != merge_pgs->end()) {
10625 spg_t pgid = p->first;
10626 epoch_t epoch = p->second;
10627 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10628 if (shard_index != shard_id) {
10629 ++p;
10630 continue;
10631 }
10632 OSDShardPGSlot *slot;
10633 auto r = pg_slots.emplace(pgid, nullptr);
10634 if (r.second) {
10635 r.first->second = make_unique<OSDShardPGSlot>();
10636 }
10637 slot = r.first->second.get();
10638 if (slot->pg) {
10639 // already have pg
10640 dout(20) << __func__ << " have merge participant pg " << pgid
10641 << " " << slot->pg << dendl;
10642 } else if (!slot->waiting_for_split.empty() &&
10643 *slot->waiting_for_split.begin() < epoch) {
10644 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10645 << " " << slot->waiting_for_split << dendl;
10646 } else {
10647 dout(20) << __func__ << " creating empty merge participant " << pgid
10648 << " for merge in " << epoch << dendl;
10649 // leave history zeroed; PG::merge_from() will fill it in.
10650 pg_history_t history;
10651 PGCreateInfo cinfo(pgid, epoch - 1,
10652 history, PastIntervals(), false);
10653 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10654 _attach_pg(r.first->second.get(), pg.get());
10655 _wake_pg_slot(pgid, slot);
10656 pg->unlock();
10657 }
10658 // mark slot for merge
10659 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10660 slot->waiting_for_merge_epoch = epoch;
10661 p = merge_pgs->erase(p);
10662 }
10663}
10664
void OSDShard::register_and_wake_split_child(PG *pg)
{
  dout(15) << __func__ << ": " << pg << " #:" << pg_slots.size() << dendl;
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << __func__ << ": " << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << __func__ << ": " << pg->pg_id << " waiting_for_split "
             << slot->waiting_for_split << dendl;
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      dout(10) << __func__ << " still waiting for split on "
               << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
        epoch,
        epoch,
        NullEvt())));

  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}

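// Remove this shard's slots for children of `parent` (under the old
// pg_num), waking anything queued on them first.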
void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
{
  std::lock_guard l(shard_lock);
  vector<spg_t> to_delete;
  for (auto& i : pg_slots) {
    if (i.first != parent &&
        i.first.get_ancestor(old_pg_num) == parent) {
      dout(10) << __func__ << " parent " << parent << " clearing " << i.first
               << dendl;
      _wake_pg_slot(i.first, i.second.get());
      to_delete.push_back(i.first);
    }
  }
  for (auto pgid : to_delete) {
    pg_slots.erase(pgid);
  }
}

void OSDShard::update_scheduler_config()
{
  std::lock_guard l(shard_lock);
  scheduler->update_configuration();
}

std::string OSDShard::get_scheduler_type()
{
  std::ostringstream scheduler_type;
  scheduler_type << *scheduler;
  return scheduler_type.str();
}

OSDShard::OSDShard(
  int id,
  CephContext *cct,
  OSD *osd)
  : shard_id(id),
    cct(cct),
    osd(osd),
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    scheduler(ceph::osd::scheduler::make_scheduler(
      cct, osd->num_shards, osd->store->is_rotational(),
      osd->store->get_type())),
    context_queue(sdata_wait_lock, sdata_cond)
{
  dout(0) << "using op scheduler " << *scheduler << dendl;
}


// =============================================================

#undef dout_context
#define dout_context osd->cct
#undef dout_prefix
#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "

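// Park an item that cannot run yet on its pg slot.  Peering items are
// keyed by the epoch they require, so a later map advance can requeue
// exactly the ones it unblocks; everything else waits in FIFO order on
// the plain waiting list.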
void OSD::ShardedOpWQ::_add_slot_waiter(
  spg_t pgid,
  OSDShardPGSlot *slot,
  OpSchedulerItem&& qi)
{
  if (qi.is_peering()) {
    dout(20) << __func__ << " " << pgid
             << " peering, item epoch is "
             << qi.get_map_epoch()
             << ", will wait on " << qi << dendl;
    slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
  } else {
    dout(20) << __func__ << " " << pgid
             << " item epoch is "
             << qi.get_map_epoch()
             << ", will wait on " << qi << dendl;
    slot->waiting.push_back(std::move(qi));
  }
}

#undef dout_prefix
#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "

void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If every worker thread of a shard ran oncommit contexts, they could
  // complete out of order.  So only the thread with the smallest
  // thread_index for each shard (i.e. thread_index < num_shards) runs
  // the oncommit callbacks.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

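  // Fast path out when idle: if this shard has nothing scheduled (and,
  // for the designated oncommit thread, no queued contexts), block on
  // sdata_cond until _enqueue()/_enqueue_front() signals us or
  // stop_waiting is set.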
  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->scheduler->empty() &&
          !(is_smallest_thread_index && !sdata->context_queue.empty())) {
        sdata->shard_lock.unlock();
        return;
      }
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
        timeout_interval, suicide_interval);
    } else {
      dout(20) << __func__ << " need to return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    sdata->context_queue.move_to(oncommits);
  }

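  // Dequeue until we get a runnable OpSchedulerItem.  The scheduler may
  // instead hand back a double: the (real-clock) time at which the next
  // item becomes eligible to run.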
  WorkItem work_item;
  while (!std::get_if<OpSchedulerItem>(&work_item)) {
    if (sdata->scheduler->empty()) {
      if (osd->is_stopping()) {
        sdata->shard_lock.unlock();
        for (auto c : oncommits) {
          dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
          delete c;
        }
        return;  // OSD shutdown, discard.
      }
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }

    work_item = sdata->scheduler->dequeue();
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
        dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
        delete c;
      }
      return;  // OSD shutdown, discard.
    }

    // If the work item is scheduled in the future, wait until
    // the time returned in the dequeue response before retrying.
    if (auto when_ready = std::get_if<double>(&work_item)) {
      if (is_smallest_thread_index) {
        sdata->shard_lock.unlock();
        handle_oncommits(oncommits);
        return;
      }
      std::unique_lock wait_lock{sdata->sdata_wait_lock};
      auto future_time = ceph::real_clock::from_double(*when_ready);
      dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
      // Disable heartbeat timeout until we find a non-future work item to process.
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      ++sdata->waiting_threads;
      sdata->sdata_cond.wait_until(wait_lock, future_time);
      --sdata->waiting_threads;
      wait_lock.unlock();
      sdata->shard_lock.lock();
      // Reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
        timeout_interval, suicide_interval);
    }
  } // while

  // Access the stored item
  auto item = std::move(std::get<OpSchedulerItem>(work_item));
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;  // OSD shutdown, discard.
  }

  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
           << (r.second ? " (new)" : "")
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering
           << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
           << " queued" << dendl;

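  // The pg lock is taken without holding shard_lock (we drop and retake
  // it around pg->lock()), so after reacquiring shard_lock we must
  // revalidate the slot: it may have been removed, requeued
  // (requeue_seq changed), or detached from this pg in the meantime.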
 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
               << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      dout(20) << __func__ << " " << token
               << " requeue_seq " << slot->requeue_seq << " > our "
               << requeue_seq << ", we raced with _wake_pg_slot"
               << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
               << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
           << " to_process " << slot->to_process
           << " waiting " << slot->waiting
           << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
                                 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

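  // No pg is attached to this slot yet.  Either park the item (a split
  // is pending, it was queued under a newer map than we have, or the pg
  // should exist here but hasn't been created yet), run it directly
  // (pg-less peering events), create the pg here (peering create events
  // that still map to us), or drop it (the pg no longer maps to this
  // OSD in the shard's map).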
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      dout(20) << __func__ << " " << token
               << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      dout(20) << __func__ << " " << token
               << " map " << qi.get_map_epoch() << " > "
               << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
        // for pg-less events, we run them under the ordering lock, since
        // we don't have the pg lock to keep them ordered.
        qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
        if (create_info) {
          if (create_info->by_mon &&
              osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
            dout(20) << __func__ << " " << token
                     << " no pg, no longer primary, ignoring mon create on "
                     << qi << dendl;
          } else {
            dout(20) << __func__ << " " << token
                     << " no pg, should create on " << qi << dendl;
            pg = osd->handle_pg_create_info(osdmap, create_info);
            if (pg) {
              // we created the pg! drop out and continue "normally"!
              sdata->_attach_pg(slot, pg.get());
              sdata->_wake_pg_slot(token, slot);

              // identify split children between create epoch and shard epoch.
              osd->service.identify_splits_and_merges(
                pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
              sdata->_prime_splits(&new_children);
              // distribute remaining split children to other shards below!
              break;
            }
            dout(20) << __func__ << " ignored create on " << qi << dendl;
          }
        } else {
          dout(20) << __func__ << " " << token
                   << " no pg, peering, !create, discarding " << qi << dendl;
        }
      } else {
        dout(20) << __func__ << " " << token
                 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
                 << ", discarding " << qi
                 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      dout(20) << __func__ << " " << token
               << " no pg, should exist e" << osdmap->get_epoch()
               << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
               << " no pg, shouldn't exist e" << osdmap->get_epoch()
               << ", dropping " << qi << dendl;
      // share map with client?
      if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
        osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
                                     sdata->shard_osdmap,
                                     (*_op)->sent_epoch);
      }
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
        sdata->shard_lock.unlock();
        osd->service.release_reserved_pushes(pushes_to_free);
        handle_oncommits(oncommits);
        return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}

void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
  if (unlikely(m_fast_shutdown)) {
    // stop enqueueing when we are in the middle of a fast shutdown
    return;
  }

  uint32_t shard_index =
    item.get_ordering_token().hash_to_shard(osd->shards.size());

  OSDShard* sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  if (sdata->get_scheduler_type() == "mClockScheduler") {
    item.maybe_set_is_qos_item();
  }

  dout(20) << __func__ << " " << item << dendl;

  bool empty = true;
  {
    std::lock_guard l{sdata->shard_lock};
    empty = sdata->scheduler->empty();
    sdata->scheduler->enqueue(std::move(item));
  }

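  // Wake the shard.  If the queue was empty, every worker may be parked
  // in the empty-queue wait, so broadcast; otherwise a single waiter
  // (one parked waiting for a future-scheduled item) is enough.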
  {
    std::lock_guard l{sdata->sdata_wait_lock};
    if (empty) {
      sdata->sdata_cond.notify_all();
    } else if (sdata->waiting_threads) {
      sdata->sdata_cond.notify_one();
    }
  }
}

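// Requeue an item at the head of the scheduler while preserving
// intra-pg ordering with respect to anything _process() has already
// staged on the slot's to_process list.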
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  if (unlikely(m_fast_shutdown)) {
    // stop enqueueing when we are in the middle of a fast shutdown
    return;
  }

  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from the scheduler, put it on to_process, and is now busy taking
    // the pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
             << " " << p->second->to_process.front()
             << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}

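// Flag a fast shutdown and drain every shard's scheduler, discarding
// the queued work items without running them; once m_fast_shutdown is
// set, _enqueue() and _enqueue_front() refuse new items.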
void OSD::ShardedOpWQ::stop_for_fast_shutdown()
{
  uint32_t shard_index = 0;
  m_fast_shutdown = true;

  for (; shard_index < osd->num_shards; shard_index++) {
    auto& sdata = osd->shards[shard_index];
    ceph_assert(sdata);
    sdata->shard_lock.lock();
    int work_count = 0;
    while (!sdata->scheduler->empty()) {
      auto work_item = sdata->scheduler->dequeue();
      work_count++;
    }
    sdata->shard_lock.unlock();
  }
}

namespace ceph::osd_cmds {

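// Handler for the 'heap' tell command: forwards the requested heapcmd
// (plus an optional value argument) to the tcmalloc heap profiler.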
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
         std::ostream& os)
{
  if (!ceph_using_tcmalloc()) {
    os << "could not issue heap profiler command -- not using tcmalloc!";
    return -EOPNOTSUPP;
  }

  string cmd;
  if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
    os << "unable to get value for command \"heapcmd\"";
    return -EINVAL;
  }

  std::vector<std::string> cmd_vec;
  get_str_vec(cmd, cmd_vec);

  string val;
  if (cmd_getval(cmdmap, "value", val)) {
    cmd_vec.push_back(val);
  }

  ceph_heap_profiler_handle_command(cmd_vec, os);

  return 0;
}

} // namespace ceph::osd_cmds