]> git.proxmox.com Git - ceph.git/blame_incremental - ceph/src/osd/OSD.cc
import ceph 15.2.14
[ceph.git] / ceph / src / osd / OSD.cc
... / ...
CommitLineData
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16#include "acconfig.h"
17
18#include <cctype>
19#include <fstream>
20#include <iostream>
21#include <iterator>
22
23#include <unistd.h>
24#include <sys/stat.h>
25#include <signal.h>
26#include <time.h>
27#include <boost/scoped_ptr.hpp>
28#include <boost/range/adaptor/reversed.hpp>
29
30#ifdef HAVE_SYS_PARAM_H
31#include <sys/param.h>
32#endif
33
34#ifdef HAVE_SYS_MOUNT_H
35#include <sys/mount.h>
36#endif
37
38#include "osd/PG.h"
39
40#include "include/types.h"
41#include "include/compat.h"
42#include "include/random.h"
43
44#include "OSD.h"
45#include "OSDMap.h"
46#include "Watch.h"
47#include "osdc/Objecter.h"
48
49#include "common/errno.h"
50#include "common/ceph_argparse.h"
51#include "common/ceph_releases.h"
52#include "common/ceph_time.h"
53#include "common/version.h"
54#include "common/pick_address.h"
55#include "common/blkdev.h"
56#include "common/numa.h"
57
58#include "os/ObjectStore.h"
59#ifdef HAVE_LIBFUSE
60#include "os/FuseStore.h"
61#endif
62
63#include "PrimaryLogPG.h"
64
65#include "msg/Messenger.h"
66#include "msg/Message.h"
67
68#include "mon/MonClient.h"
69
70#include "messages/MLog.h"
71
72#include "messages/MGenericMessage.h"
73#include "messages/MOSDPing.h"
74#include "messages/MOSDFailure.h"
75#include "messages/MOSDMarkMeDown.h"
76#include "messages/MOSDMarkMeDead.h"
77#include "messages/MOSDFull.h"
78#include "messages/MOSDOp.h"
79#include "messages/MOSDOpReply.h"
80#include "messages/MOSDBackoff.h"
81#include "messages/MOSDBeacon.h"
82#include "messages/MOSDRepOp.h"
83#include "messages/MOSDRepOpReply.h"
84#include "messages/MOSDBoot.h"
85#include "messages/MOSDPGTemp.h"
86#include "messages/MOSDPGReadyToMerge.h"
87
88#include "messages/MOSDMap.h"
89#include "messages/MMonGetOSDMap.h"
90#include "messages/MOSDPGNotify.h"
91#include "messages/MOSDPGNotify2.h"
92#include "messages/MOSDPGQuery.h"
93#include "messages/MOSDPGQuery2.h"
94#include "messages/MOSDPGLog.h"
95#include "messages/MOSDPGRemove.h"
96#include "messages/MOSDPGInfo.h"
97#include "messages/MOSDPGInfo2.h"
98#include "messages/MOSDPGCreate.h"
99#include "messages/MOSDPGCreate2.h"
100#include "messages/MOSDPGScan.h"
101#include "messages/MBackfillReserve.h"
102#include "messages/MRecoveryReserve.h"
103#include "messages/MOSDForceRecovery.h"
104#include "messages/MOSDECSubOpWrite.h"
105#include "messages/MOSDECSubOpWriteReply.h"
106#include "messages/MOSDECSubOpRead.h"
107#include "messages/MOSDECSubOpReadReply.h"
108#include "messages/MOSDPGCreated.h"
109#include "messages/MOSDPGUpdateLogMissing.h"
110#include "messages/MOSDPGUpdateLogMissingReply.h"
111
112#include "messages/MOSDPeeringOp.h"
113
114#include "messages/MOSDAlive.h"
115
116#include "messages/MOSDScrub.h"
117#include "messages/MOSDScrub2.h"
118#include "messages/MOSDRepScrub.h"
119
120#include "messages/MCommand.h"
121#include "messages/MCommandReply.h"
122
123#include "messages/MPGStats.h"
124#include "messages/MPGStatsAck.h"
125
126#include "messages/MWatchNotify.h"
127#include "messages/MOSDPGPush.h"
128#include "messages/MOSDPGPushReply.h"
129#include "messages/MOSDPGPull.h"
130
131#include "messages/MMonGetPurgedSnaps.h"
132#include "messages/MMonGetPurgedSnapsReply.h"
133
134#include "common/perf_counters.h"
135#include "common/Timer.h"
136#include "common/LogClient.h"
137#include "common/AsyncReserver.h"
138#include "common/HeartbeatMap.h"
139#include "common/admin_socket.h"
140#include "common/ceph_context.h"
141
142#include "global/signal_handler.h"
143#include "global/pidfile.h"
144
145#include "include/color.h"
146#include "perfglue/cpu_profiler.h"
147#include "perfglue/heap_profiler.h"
148
149#include "osd/OpRequest.h"
150
151#include "auth/AuthAuthorizeHandler.h"
152#include "auth/RotatingKeyRing.h"
153
154#include "objclass/objclass.h"
155
156#include "common/cmdparse.h"
157#include "include/str_list.h"
158#include "include/util.h"
159
160#include "include/ceph_assert.h"
161#include "common/config.h"
162#include "common/EventTrace.h"
163
164#include "json_spirit/json_spirit_reader.h"
165#include "json_spirit/json_spirit_writer.h"
166
167#ifdef WITH_LTTNG
168#define TRACEPOINT_DEFINE
169#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170#include "tracing/osd.h"
171#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172#undef TRACEPOINT_DEFINE
173#else
174#define tracepoint(...)
175#endif
176
177#define dout_context cct
178#define dout_subsys ceph_subsys_osd
179#undef dout_prefix
180#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
181
182using namespace ceph::osd::scheduler;
183using TOPNSPC::common::cmd_getval;
184
185static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
186 return *_dout << "osd." << whoami << " " << epoch << " ";
187}
188
189//Initial features in new superblock.
190//Features here are also automatically upgraded
191CompatSet OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
194 CompatSet::FeatureSet ceph_osd_feature_incompat;
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
202 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
203 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
204 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
205 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
206 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
207 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
208 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
209 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
210 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
211 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
212 ceph_osd_feature_incompat);
213}
214
215//Features are added here that this OSD supports.
216CompatSet OSD::get_osd_compat_set() {
217 CompatSet compat = get_osd_initial_compat_set();
218 //Any features here can be set in code, but not in initial superblock
219 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
220 return compat;
221}
222
// OSDService: shared state and helper services used by PGs and the rest of
// the OSD.  Most members are borrowed from (and owned by) the parent OSD;
// the timers, reservers, caches, and objecter below are constructed here.
// NOTE: initializer order is tied to member declaration order in OSD.h;
// do not reorder casually.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  scrubs_local(0),
  scrubs_remote(0),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
				 osd->objecter_messenger,
				 osd->monc, nullptr)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  // One finisher thread per configured objecter finisher shard; these
  // drain objecter completion callbacks off the messenger threads.
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}
285
#ifdef PG_DEBUG_REFS
// Debug-only PG reference tracking: count outstanding refs per pgid and
// remember a live PG pointer so leaked refs can be dumped at shutdown.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    // first ref for this pgid; remember the PG for later dumping
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
// Drop one tracked ref for pgid; forget the PG once the count hits zero.
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  ceph_assert(pgid_tracker.count(pgid));
  ceph_assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
// Log every pgid with outstanding refs and ask each PG to dump the
// individual ref ids (used to diagnose PG ref leaks).
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif
317
318
319ceph::signedspan OSDService::get_mnow()
320{
321 return ceph::mono_clock::now() - osd->startup_time;
322}
323
// Replay the recorded pg_num history for pgid's pool across the epoch range
// (old_map, new_map] and collect every split child (and, when merge_pgs is
// non-null, every PG participating in a merge) together with the epoch at
// which the change takes effect.  A BFS is used because a split child can
// itself split or merge again at a later epoch within the same range.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    // pool didn't exist in the old map; nothing to compute
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    // no recorded pg_num changes for this pool
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
	   << " to e" << new_map->get_epoch()
	   << " pg_nums " << p->second << dendl;
  // BFS over PGs whose history still needs scanning; 'did' avoids re-queuing.
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // walk each pg_num change in (old epoch, new epoch]
    for (auto q = p->second.lower_bound(old_map->get_epoch());
	 q != p->second.end() &&
	   q->first <= new_map->get_epoch();
	 ++q) {
      if (pgnum < q->second) {
	// split?
	if (cur.ps() < pgnum) {
	  set<spg_t> children;
	  if (cur.is_split(pgnum, q->second, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " children " << children << dendl;
	    for (auto i : children) {
	      split_children->insert(make_pair(i, q->first));
	      if (!did.count(i))
		queue.push_back(i);
	    }
	  }
	} else if (cur.ps() < q->second) {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is a child" << dendl;
	  // normally we'd capture this from the parent, but it's
	  // possible the parent doesn't exist yet (it will be
	  // fabricated to allow an intervening merge).  note this PG
	  // as a split child here to be sure we catch it.
	  split_children->insert(make_pair(cur, q->first));
	} else {
	  dout(20) << __func__ << " " << cur << " e" << q->first
		   << " pg_num " << pgnum << " -> " << q->second
		   << " is post-split, skipping" << dendl;
	}
      } else if (merge_pgs) {
	// merge?
	if (cur.ps() >= q->second) {
	  if (cur.ps() < pgnum) {
	    // cur disappears in the merge; find its surviving target
	    spg_t parent;
	    if (cur.is_merge_source(pgnum, q->second, &parent)) {
	      set<spg_t> children;
	      parent.is_split(q->second, pgnum, &children);
	      dout(20) << __func__ << " " << cur << " e" << q->first
		       << " pg_num " << pgnum << " -> " << q->second
		       << " is merge source, target " << parent
		       << ", source(s) " << children << dendl;
	      merge_pgs->insert(make_pair(parent, q->first));
	      if (!did.count(parent)) {
		// queue (and re-scan) parent in case it might not exist yet
		// and there are some future splits pending on it
		queue.push_back(parent);
	      }
	      for (auto c : children) {
		merge_pgs->insert(make_pair(c, q->first));
		if (!did.count(c))
		  queue.push_back(c);
	      }
	    }
	  } else {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is beyond old pgnum, skipping" << dendl;
	  }
	} else {
	  // cur survives the merge as the target; record it and its sources
	  set<spg_t> children;
	  if (cur.is_split(q->second, pgnum, &children)) {
	    dout(20) << __func__ << " " << cur << " e" << q->first
		     << " pg_num " << pgnum << " -> " << q->second
		     << " is merge target, source " << children << dendl;
	    for (auto c : children) {
	      merge_pgs->insert(make_pair(c, q->first));
	      if (!did.count(c))
		queue.push_back(c);
	    }
	    merge_pgs->insert(make_pair(cur, q->first));
	  }
	}
      }
      pgnum = q->second;
    }
  }
}
430
// Forward to the parent OSD: request a refresh of the heartbeat peer set.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
435
436HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
437{
438 std::lock_guard l(hb_stamp_lock);
439 if (peer >= hb_stamps.size()) {
440 hb_stamps.resize(peer + 1);
441 }
442 if (!hb_stamps[peer]) {
443 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
444 }
445 return hb_stamps[peer];
446}
447
// Queue a RenewLease peering event for the given PG at the given epoch.
void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch, epoch,
	RenewLease())));
}
457
458void OSDService::start_shutdown()
459{
460 {
461 std::lock_guard l(agent_timer_lock);
462 agent_timer.shutdown();
463 }
464
465 {
466 std::lock_guard l(sleep_lock);
467 sleep_timer.shutdown();
468 }
469
470 {
471 std::lock_guard l(recovery_request_lock);
472 recovery_request_timer.shutdown();
473 }
474}
475
// Drain and stop the finisher backing the recovery/backfill reservers.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
481
// Final shutdown: stop timers, the objecter and its finishers, and drop our
// osdmap references.  Order matters: the objecter is stopped before its
// finishers are drained, and maps are released last.
void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  // release our references to the current and pending osdmaps
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
500
// Bring up the service threads and timers.  Called once during OSD startup,
// before final_init().
void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  // optionally hold off recovery for a configured number of seconds
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
521
// Last startup step: start the objecter with the current osdmap.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
526
527void OSDService::activate_map()
528{
529 // wake/unwake the tiering agent
530 std::lock_guard l{agent_lock};
531 agent_active =
532 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
533 osd->is_active();
534 agent_cond.notify_all();
535}
536
// Ask the monitor for osdmaps starting at epoch e (non-continuous sub).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
541
542
543class AgentTimeoutCB : public Context {
544 PGRef pg;
545public:
546 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
547 void finish(int) override {
548 pg->agent_choose_mode_restart();
549 }
550};
551
// Main loop of the tiering-agent thread.  Repeatedly picks the
// highest-priority tier in agent_queue and asks one of its PGs to do
// flush/evict work, respecting the configured op quotas.  Sleeps on
// agent_cond whenever there is nothing to do or the agent is inactive.
// Runs until agent_stop() sets agent_stop_flag.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    // work on the highest evict-effort level (rbegin = largest key)
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // remaining op budget; flushes get a lower quota unless some PG is in
    // high-speed flush mode
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    // round-robin across the PGs in the top tier; the iterator is
    // invalidated whenever the queue changes (agent_valid_iterator)
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // drop agent_lock while the PG does (potentially slow) agent work
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
607
// Stop the tiering-agent thread and join it.  Must be called after all
// agent ops have been cancelled and all PGs dequeued; aborts otherwise.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  agent_thread.join();
}
627
628// -------------------------------------
629
// Periodically recalibrate the cache-tier promotion probability
// (promote_probability_millis, in parts-per-thousand) so the observed
// promotion rate tracks the configured object/sec and bytes/sec targets.
// Also sets hard per-tick caps (promote_max_objects/bytes) to mitigate
// stampedes between recalibrations.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // probability (per-mille) that would hit the object-rate target...
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    // ...and the byte-rate target, given the observed average object size
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust: average toward the computed probability, clamped to [min,1000]
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
700
701// -------------------------------------
702
703float OSDService::get_failsafe_full_ratio()
704{
705 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
706 if (full_ratio > 1.0) full_ratio /= 100.0;
707 return full_ratio;
708}
709
// Map the current usage ratios to a fullness state (NONE .. FAILSAFE).
// 'ratio' is the (possibly backfill-adjusted) usage, 'pratio' the raw
// physical usage; 'inject' is set to a marker string when an injected
// fullness state is being reported.
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precendence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the clusters appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    return NONE;
  }
  // enforce nearfull <= backfillfull <= full <= failsafe ordering
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag.  ick.  the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  // injected state (for testing) wins; otherwise compare against the
  // thresholds from most to least severe.  note FAILSAFE and NEARFULL use
  // the physical ratio, FULL and BACKFILLFULL the adjusted ratio.
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
756
757void OSDService::check_full_status(float ratio, float pratio)
758{
759 std::lock_guard l(full_status_lock);
760
761 cur_ratio = ratio;
762 physical_ratio = pratio;
763
764 string inject;
765 s_names new_state;
766 new_state = recalc_full_state(ratio, pratio, inject);
767
768 dout(20) << __func__ << " cur ratio " << ratio
769 << ", physical ratio " << pratio
770 << ", new state " << get_full_state_name(new_state)
771 << " " << inject
772 << dendl;
773
774 // warn
775 if (cur_state != new_state) {
776 dout(10) << __func__ << " " << get_full_state_name(cur_state)
777 << " -> " << get_full_state_name(new_state) << dendl;
778 if (new_state == FAILSAFE) {
779 clog->error() << "full status failsafe engaged, dropping updates, now "
780 << (int)roundf(ratio * 100) << "% full";
781 } else if (cur_state == FAILSAFE) {
782 clog->error() << "full status failsafe disengaged, no longer dropping "
783 << "updates, now " << (int)roundf(ratio * 100) << "% full";
784 }
785 cur_state = new_state;
786 }
787}
788
789bool OSDService::need_fullness_update()
790{
791 OSDMapRef osdmap = get_osdmap();
792 s_names cur = NONE;
793 if (osdmap->exists(whoami)) {
794 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
795 cur = FULL;
796 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
797 cur = BACKFILLFULL;
798 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
799 cur = NEARFULL;
800 }
801 }
802 s_names want = NONE;
803 if (is_full())
804 want = FULL;
805 else if (is_backfillfull())
806 want = BACKFILLFULL;
807 else if (is_nearfull())
808 want = NEARFULL;
809 return want != cur;
810}
811
812bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
813{
814 if (injectfull && injectfull_state >= type) {
815 // injectfull is either a count of the number of times to return failsafe full
816 // or if -1 then always return full
817 if (injectfull > 0)
818 --injectfull;
819 ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
820 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
821 << dendl;
822 return true;
823 }
824 return false;
825}
826
827bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
828{
829 std::lock_guard l(full_status_lock);
830
831 if (_check_inject_full(dpp, type))
832 return true;
833
834 if (cur_state >= type)
835 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
836 << " physical " << physical_ratio << dendl;
837
838 return cur_state >= type;
839}
840
841bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
842{
843 ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
844 {
845 std::lock_guard l(full_status_lock);
846 if (_check_inject_full(dpp, type)) {
847 return true;
848 }
849 }
850
851 float pratio;
852 float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);
853
854 string notused;
855 s_names tentative_state = recalc_full_state(ratio, pratio, notused);
856
857 if (tentative_state >= type)
858 ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;
859
860 return tentative_state >= type;
861}
862
// Convenience wrappers over _check_full / _tentative_full for each
// fullness level.
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

// Would adding adjust_used bytes (on top of stats) make us backfillfull?
bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
887
// Lock-protected predicates on the cached fullness state.  Note that
// is_failsafe_full tests for exactly FAILSAFE, while the others test for
// "at least" their level.
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
911
// Testing aid: force the OSD to report 'type' fullness for the next
// 'count' checks (or forever when count is -1).
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
918
// Record the latest store statfs result (and any store alerts) into
// osd_stat and the perf counters.  When fake_statfs_for_testing is set,
// total/available are synthesized from the configured fake total minus the
// bytes accounted to PGs, so co-located test OSDs report distinct usage.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
			    osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;  // PGs already account for more than the fake total
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
	    << " adjust available " << avail
	    << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
961
962osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
963 int num_pgs)
964{
965 utime_t now = ceph_clock_now();
966 auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
967 std::lock_guard l(stat_lock);
968 osd_stat.hb_peers.swap(hb_peers);
969 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
970 osd_stat.num_pgs = num_pgs;
971 // Clean entries that aren't updated
972 // This is called often enough that we can just remove 1 at a time
973 for (auto i: osd_stat.hb_pingtime) {
974 if (i.second.last_update == 0)
975 continue;
976 if (stale_time && now.sec() - i.second.last_update > stale_time) {
977 dout(20) << __func__ << " time out heartbeat for osd " << i.first
978 << " last_update " << i.second.last_update << dendl;
979 osd_stat.hb_pingtime.erase(i.first);
980 break;
981 }
982 }
983 return osd_stat;
984}
985
986void OSDService::inc_osd_stat_repaired()
987{
988 std::lock_guard l(stat_lock);
989 osd_stat.num_shards_repaired++;
990 return;
991}
992
// Compute the usage ratio after (a) pretending adjust_used extra bytes are
// consumed and (b) letting each PG add its pending backfill data to the
// stats.  *pratio receives the unadjusted (physical) ratio.  new_stat is
// taken by value on purpose: it is mutated locally and discarded.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
					 uint64_t adjust_used)
{
  *pratio =
   ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used()  << dendl;
    // charge the hypothetical extra usage against available space
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
}
1020
// Send a message to a peer OSD over the cluster messenger, using the
// reserved "next" osdmap to resolve addresses.  The message is dropped
// (and its ref released) if the peer is down or restarted after
// from_epoch.  Consumes one ref on m either way.
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    // peer gone or rebooted since the caller decided to send; drop it
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    // message to ourselves goes over the loopback connection
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
	next_map->get_cluster_addrs(peer), false, true);
  }
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
1044
1045void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
1046{
1047 OSDMapRef next_map = get_nextmap_reserved();
1048 // service map is always newer/newest
1049 ceph_assert(from_epoch <= next_map->get_epoch());
1050
1051 for (auto& iter : messages) {
1052 if (next_map->is_down(iter.first) ||
1053 next_map->get_info(iter.first).up_from > from_epoch) {
1054 iter.second->put();
1055 continue;
1056 }
1057 ConnectionRef peer_con;
1058 if (iter.first == whoami) {
1059 peer_con = osd->cluster_messenger->get_loopback_connection();
1060 } else {
1061 peer_con = osd->cluster_messenger->connect_to_osd(
1062 next_map->get_cluster_addrs(iter.first), false, true);
1063 }
1064 maybe_share_map(peer_con.get(), next_map);
1065 peer_con->send_message(iter.second);
1066 }
1067 release_map(next_map);
1068}
1069ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1070{
1071 OSDMapRef next_map = get_nextmap_reserved();
1072 // service map is always newer/newest
1073 ceph_assert(from_epoch <= next_map->get_epoch());
1074
1075 if (next_map->is_down(peer) ||
1076 next_map->get_info(peer).up_from > from_epoch) {
1077 release_map(next_map);
1078 return NULL;
1079 }
1080 ConnectionRef con;
1081 if (peer == whoami) {
1082 con = osd->cluster_messenger->get_loopback_connection();
1083 } else {
1084 con = osd->cluster_messenger->connect_to_osd(
1085 next_map->get_cluster_addrs(peer), false, true);
1086 }
1087 release_map(next_map);
1088 return con;
1089}
1090
1091pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1092{
1093 OSDMapRef next_map = get_nextmap_reserved();
1094 // service map is always newer/newest
1095 ceph_assert(from_epoch <= next_map->get_epoch());
1096
1097 pair<ConnectionRef,ConnectionRef> ret;
1098 if (next_map->is_down(peer) ||
1099 next_map->get_info(peer).up_from > from_epoch) {
1100 release_map(next_map);
1101 return ret;
1102 }
1103 ret.first = osd->hb_back_client_messenger->connect_to_osd(
1104 next_map->get_hb_back_addrs(peer));
1105 ret.second = osd->hb_front_client_messenger->connect_to_osd(
1106 next_map->get_hb_front_addrs(peer));
1107 release_map(next_map);
1108 return ret;
1109}
1110
// Entity name (e.g. osd.N) of this OSD's cluster-network messenger.
entity_name_t OSDService::get_cluster_msgr_name() const
{
  return cluster_messenger->get_myname();
}
1115
1116void OSDService::queue_want_pg_temp(pg_t pgid,
1117 const vector<int>& want,
1118 bool forced)
1119{
1120 std::lock_guard l(pg_temp_lock);
1121 auto p = pg_temp_pending.find(pgid);
1122 if (p == pg_temp_pending.end() ||
1123 p->second.acting != want ||
1124 forced) {
1125 pg_temp_wanted[pgid] = {want, forced};
1126 }
1127}
1128
// Forget any wanted or mon-pending pg_temp request for this pg.
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  std::lock_guard l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
1135
// Move all wanted pg_temp entries into the pending set (they have been
// sent to the monitor). Caller is expected to hold pg_temp_lock — TODO
// confirm: all call sites in this file do.
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  // splice map nodes directly; no element copies
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1146
// Move everything (wanted + previously-sent pending) back into the
// wanted set so the next send_pg_temp() retransmits it all.
void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
	   << pg_temp_wanted.size() << dendl;
}
1159
1160std::ostream& operator<<(std::ostream& out,
1161 const OSDService::pg_temp_t& pg_temp)
1162{
1163 out << pg_temp.acting;
1164 if (pg_temp.forced) {
1165 out << " (forced)";
1166 }
1167 return out;
1168}
1169
// Flush all wanted pg_temp requests to the monitor. Forced and
// non-forced requests are batched into separate MOSDPGTemp messages
// (ms[] is indexed by the forced flag).
void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];  // lazily create the message for this batch
    if (!m) {
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      monc->send_mon_message(m);
    }
  }
  // everything wanted is now pending with the monitor
  _sent_pg_temp();
}
1192
1193void OSDService::send_pg_created(pg_t pgid)
1194{
1195 std::lock_guard l(pg_created_lock);
1196 dout(20) << __func__ << dendl;
1197 auto o = get_osdmap();
1198 if (o->require_osd_release >= ceph_release_t::luminous) {
1199 pg_created.insert(pgid);
1200 monc->send_mon_message(new MOSDPGCreated(pgid));
1201 }
1202}
1203
1204void OSDService::send_pg_created()
1205{
1206 std::lock_guard l(pg_created_lock);
1207 dout(20) << __func__ << dendl;
1208 auto o = get_osdmap();
1209 if (o->require_osd_release >= ceph_release_t::luminous) {
1210 for (auto pgid : pg_created) {
1211 monc->send_mon_message(new MOSDPGCreated(pgid));
1212 }
1213 }
1214}
1215
1216void OSDService::prune_pg_created()
1217{
1218 std::lock_guard l(pg_created_lock);
1219 dout(20) << __func__ << dendl;
1220 auto o = get_osdmap();
1221 auto i = pg_created.begin();
1222 while (i != pg_created.end()) {
1223 auto p = o->get_pg_pool(i->pool());
1224 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1225 dout(20) << __func__ << " pruning " << *i << dendl;
1226 i = pg_created.erase(i);
1227 } else {
1228 dout(20) << __func__ << " keeping " << *i << dendl;
1229 ++i;
1230 }
1231 }
1232}
1233
1234
1235// --------------------------------------
1236// dispatch
1237
1238bool OSDService::can_inc_scrubs()
1239{
1240 bool can_inc = false;
1241 std::lock_guard l(sched_scrub_lock);
1242
1243 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1244 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1245 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
1246 can_inc = true;
1247 } else {
1248 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1249 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1250 }
1251
1252 return can_inc;
1253}
1254
1255bool OSDService::inc_scrubs_local()
1256{
1257 bool result = false;
1258 std::lock_guard l{sched_scrub_lock};
1259 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1260 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1261 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1262 result = true;
1263 ++scrubs_local;
1264 } else {
1265 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1266 }
1267 return result;
1268}
1269
// Release a local scrub slot previously taken via inc_scrubs_local().
void OSDService::dec_scrubs_local()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
	   << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
  --scrubs_local;
  ceph_assert(scrubs_local >= 0);
}
1278
1279bool OSDService::inc_scrubs_remote()
1280{
1281 bool result = false;
1282 std::lock_guard l{sched_scrub_lock};
1283 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1284 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1285 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1286 result = true;
1287 ++scrubs_remote;
1288 } else {
1289 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1290 }
1291 return result;
1292}
1293
// Release a remote scrub slot previously taken via inc_scrubs_remote().
void OSDService::dec_scrubs_remote()
{
  std::lock_guard l{sched_scrub_lock};
  dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
	   << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
  --scrubs_remote;
  ceph_assert(scrubs_remote >= 0);
}
1302
// Dump current scrub reservation counters (for the admin socket).
void OSDService::dump_scrub_reservations(Formatter *f)
{
  std::lock_guard l{sched_scrub_lock};
  f->dump_int("scrubs_local", scrubs_local);
  f->dump_int("scrubs_remote", scrubs_remote);
  f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
}
1310
1311void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1312 epoch_t *_bind_epoch) const
1313{
1314 std::lock_guard l(epoch_lock);
1315 if (_boot_epoch)
1316 *_boot_epoch = boot_epoch;
1317 if (_up_epoch)
1318 *_up_epoch = up_epoch;
1319 if (_bind_epoch)
1320 *_bind_epoch = bind_epoch;
1321}
1322
// Update any of the three tracked epochs (null pointer = leave as-is).
// Each epoch may only move forward, except an explicit reset to 0.
void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  std::lock_guard l(epoch_lock);
  if (_boot_epoch) {
    ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}
1340
// Begin a clean shutdown. If we are up in the map, ask the monitor to
// mark us down first and wait (bounded by osd_mon_shutdown_timeout) for
// the ack, which arrives via got_stop_ack(). Returns false if a stop is
// already in progress.
bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
	monc->get_fsid(),
	whoami,
	osdmap->get_addrs(whoami),
	osdmap->get_epoch(),
	true  // request ack
	));
    // got_stop_ack() flips the state to STOPPING and notifies us;
    // otherwise we proceed after the timeout.
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1367
1368void OSDService::got_stop_ack()
1369{
1370 std::scoped_lock l(is_stopping_lock);
1371 if (get_state() == PREPARING_TO_STOP) {
1372 dout(0) << __func__ << " starting shutdown" << dendl;
1373 set_state(STOPPING);
1374 is_stopping_cond.notify_all();
1375 } else {
1376 dout(10) << __func__ << " ignoring msg" << dendl;
1377 }
1378}
1379
// Build an MOSDMap covering (since, to], bounded by osd_map_message_max
// epochs and osd_map_message_max_bytes. Falls back to full maps when
// incrementals are missing; on a hard miss (panic) it still returns
// *something* so the peer can make progress.
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
			   osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  // per-message limits on epoch count and encoded size
  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
	     << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    m->maps[since].claim(bl);
  }
  // append incrementals (or full maps as a fallback) until we hit `to`
  // or exhaust the message budget
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else {
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
	derr << __func__ << " also missing full map " << e << dendl;
	goto panic;
      }
      m->maps[e].claim(bl);
    }
    max--;
    max_bytes -= bl.length();
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  // store is missing maps we believed we had; salvage what we can
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map].claim(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
	   << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map].claim(bl);
  }
  return m;
}
1447
// Hand an already-built map message to the given connection.
void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}
1452
1453void OSDService::send_incremental_map(epoch_t since, Connection *con,
1454 const OSDMapRef& osdmap)
1455{
1456 epoch_t to = osdmap->get_epoch();
1457 dout(10) << "send_incremental_map " << since << " -> " << to
1458 << " to " << con << " " << con->get_peer_addr() << dendl;
1459
1460 MOSDMap *m = NULL;
1461 while (!m) {
1462 OSDSuperblock sblock(get_superblock());
1463 if (since < sblock.oldest_map) {
1464 // just send latest full map
1465 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1466 osdmap->get_encoding_features());
1467 m->oldest_map = max_oldest_map;
1468 m->newest_map = sblock.newest_map;
1469 get_map_bl(to, m->maps[to]);
1470 send_map(m, con);
1471 return;
1472 }
1473
1474 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1475 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl;
1477 since = to - cct->_conf->osd_map_share_max_epochs;
1478 }
1479
1480 m = build_incremental_map_msg(since, to, sblock);
1481 }
1482 send_map(m, con);
1483}
1484
// Fetch the encoded full map for epoch e, from the buffer cache or,
// failing that, the object store (populating the cache on success).
// NOTE(review): the leading-underscore convention suggests callers hold
// map_cache_lock when calling this — confirm at call sites.
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(meta_ch,
		      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
		      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}
1503
1504bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1505{
1506 std::lock_guard l(map_cache_lock);
1507 bool found = map_bl_inc_cache.lookup(e, &bl);
1508 if (found) {
1509 if (logger)
1510 logger->inc(l_osd_map_bl_cache_hit);
1511 return true;
1512 }
1513 if (logger)
1514 logger->inc(l_osd_map_bl_cache_miss);
1515 found = store->read(meta_ch,
1516 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1518 if (found) {
1519 _add_map_inc_bl(e, bl);
1520 }
1521 return found;
1522}
1523
1524void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1525{
1526 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
1527 // cache a contiguous buffer
1528 if (bl.get_num_buffers() > 1) {
1529 bl.rebuild();
1530 }
1531 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1532 map_bl_cache.add(e, bl);
1533}
1534
// Insert an encoded incremental map into the buffer cache, rebuilding
// the bufferlist into one contiguous buffer first.
void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}
1545
// Insert a decoded OSDMap into the map cache and return a shared ref.
// Takes ownership of *o: if the epoch is already cached, o is deleted
// and the existing cached map is returned instead.
OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}
1564
// Return the OSDMap for `epoch`, from the cache or by loading and
// decoding it from the store. Returns a null ref when the map cannot be
// loaded. epoch 0 yields a fresh, empty map.
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    // track misses that fall below the oldest cached epoch
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  // _add_map takes ownership of `map`
  return _add_map(map);
}
1601
1602// ops
1603
1604
// Convenience wrapper: error reply with no versions or per-op returns.
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}
1609
1610void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1611 version_t uv,
1612 vector<pg_log_op_return_item_t> op_returns)
1613{
1614 auto m = op->get_req<MOSDOp>();
1615 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1616 int flags;
1617 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1618
1619 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1620 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
1621 reply->set_reply_versions(v, uv);
1622 reply->set_op_returns(op_returns);
1623 m->get_connection()->send_message(reply);
1624}
1625
// Log (to the cluster log) an op that arrived at a PG which is not its
// primary. For EC pools, first rule out the benign shard-remap race
// described below, which is dropped silently. Debug-only: gated on
// osd_debug_misdirected_ops.
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
	pgid.shard != pg->pg_id.shard) {
      // primary shard moved between the client's epoch and ours: benign
      dout(7) << __func__ << ": " << *pg << " primary changed since "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
	       << " pg " << m->get_raw_pg()
	       << " to osd." << whoami
	       << " not " << pg->get_acting()
	       << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1680
// Append a work item to the tail of the sharded op queue.
void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}
1685
// Push a work item to the front of the sharded op queue (requeue path).
void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}
1690
// Schedule a recovery continuation for pg on the op queue at recovery
// cost/priority, stamped with the current map epoch.
void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c)
{
  epoch_t e = get_osdmap_epoch();
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGRecoveryContext(pg->get_pgid(), c, e)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      e));
}
1706
// Schedule a snapshot-trim work item for pg on the op queue.
void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
      cct->_conf->osd_snap_trim_cost,
      cct->_conf->osd_snap_trim_priority,
      ceph_clock_now(),
      0,
      pg->get_osdmap_epoch()));
}
1720
1721void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
1722{
1723 unsigned scrub_queue_priority = pg->scrubber.priority;
1724 if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
1725 scrub_queue_priority = cct->_conf->osd_client_op_priority;
1726 }
1727 const auto epoch = pg->get_osdmap_epoch();
1728 enqueue_back(
1729 OpSchedulerItem(
1730 unique_ptr<OpSchedulerItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
1731 cct->_conf->osd_scrub_cost,
1732 scrub_queue_priority,
1733 ceph_clock_now(),
1734 0,
1735 epoch));
1736}
1737
// Schedule asynchronous deletion of pg `pgid` as of epoch e.
void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
{
  dout(10) << __func__ << " on " << pgid << " e " << e  << dendl;
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGDelete(pgid, e)),
      cct->_conf->osd_pg_delete_cost,
      cct->_conf->osd_pg_delete_priority,
      ceph_clock_now(),
      0,
      e));
}
1751
// Forward to the owning OSD; see OSD::try_finish_pg_delete.
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1756
1757// ---
1758
1759void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1760{
1761 std::lock_guard l(merge_lock);
1762 dout(10) << __func__ << " " << pg->pg_id << dendl;
1763 ready_to_merge_source[pg->pg_id.pgid] = version;
1764 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1765 _send_ready_to_merge();
1766}
1767
1768void OSDService::set_ready_to_merge_target(PG *pg,
1769 eversion_t version,
1770 epoch_t last_epoch_started,
1771 epoch_t last_epoch_clean)
1772{
1773 std::lock_guard l(merge_lock);
1774 dout(10) << __func__ << " " << pg->pg_id << dendl;
1775 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1776 make_tuple(version,
1777 last_epoch_started,
1778 last_epoch_clean)));
1779 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1780 _send_ready_to_merge();
1781}
1782
1783void OSDService::set_not_ready_to_merge_source(pg_t source)
1784{
1785 std::lock_guard l(merge_lock);
1786 dout(10) << __func__ << " " << source << dendl;
1787 not_ready_to_merge_source.insert(source);
1788 assert(ready_to_merge_source.count(source) == 0);
1789 _send_ready_to_merge();
1790}
1791
1792void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1793{
1794 std::lock_guard l(merge_lock);
1795 dout(10) << __func__ << " " << target << " source " << source << dendl;
1796 not_ready_to_merge_target[target] = source;
1797 assert(ready_to_merge_target.count(target) == 0);
1798 _send_ready_to_merge();
1799}
1800
// Public entry point: take merge_lock and flush merge readiness state.
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1806
1807void OSDService::_send_ready_to_merge()
1808{
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1815 << dendl;
1816 for (auto src : not_ready_to_merge_source) {
1817 if (sent_ready_to_merge_source.count(src) == 0) {
1818 monc->send_mon_message(new MOSDPGReadyToMerge(
1819 src,
1820 {}, {}, 0, 0,
1821 false,
1822 osdmap->get_epoch()));
1823 sent_ready_to_merge_source.insert(src);
1824 }
1825 }
1826 for (auto p : not_ready_to_merge_target) {
1827 if (sent_ready_to_merge_source.count(p.second) == 0) {
1828 monc->send_mon_message(new MOSDPGReadyToMerge(
1829 p.second,
1830 {}, {}, 0, 0,
1831 false,
1832 osdmap->get_epoch()));
1833 sent_ready_to_merge_source.insert(p.second);
1834 }
1835 }
1836 for (auto src : ready_to_merge_source) {
1837 if (not_ready_to_merge_source.count(src.first) ||
1838 not_ready_to_merge_target.count(src.first.get_parent())) {
1839 continue;
1840 }
1841 auto p = ready_to_merge_target.find(src.first.get_parent());
1842 if (p != ready_to_merge_target.end() &&
1843 sent_ready_to_merge_source.count(src.first) == 0) {
1844 monc->send_mon_message(new MOSDPGReadyToMerge(
1845 src.first, // source pgid
1846 src.second, // src version
1847 std::get<0>(p->second), // target version
1848 std::get<1>(p->second), // PG's last_epoch_started
1849 std::get<2>(p->second), // PG's last_epoch_clean
1850 true,
1851 osdmap->get_epoch()));
1852 sent_ready_to_merge_source.insert(src.first);
1853 }
1854 }
1855}
1856
1857void OSDService::clear_ready_to_merge(PG *pg)
1858{
1859 std::lock_guard l(merge_lock);
1860 dout(10) << __func__ << " " << pg->pg_id << dendl;
1861 ready_to_merge_source.erase(pg->pg_id.pgid);
1862 ready_to_merge_target.erase(pg->pg_id.pgid);
1863 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1864 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1865 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1866}
1867
// Forget which merge notifications were already sent (e.g. so they are
// re-sent after reconnecting to the monitor).
void OSDService::clear_sent_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  sent_ready_to_merge_source.clear();
}
1873
1874void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1875{
1876 std::lock_guard l(merge_lock);
1877 auto i = sent_ready_to_merge_source.begin();
1878 while (i != sent_ready_to_merge_source.end()) {
1879 if (!osdmap->pg_exists(*i)) {
1880 dout(10) << __func__ << " " << *i << dendl;
1881 i = sent_ready_to_merge_source.erase(i);
1882 } else {
1883 ++i;
1884 }
1885 }
1886}
1887
1888// ---
1889
// Enqueue a recovery work item for (epoch, pg) carrying the number of
// reserved pushes. Caller must hold recovery_lock (asserted).
void OSDService::_queue_for_recovery(
  std::pair<epoch_t, PGRef> p,
  uint64_t reserved_pushes)
{
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
	new PGRecovery(
	  p.second->get_pgid(), p.first, reserved_pushes)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      p.first));
}
1906
1907// ====================================================================
1908// OSD
1909
1910#undef dout_prefix
1911#define dout_prefix *_dout
1912
1913// Commands shared between OSD's console and admin console:
1914namespace ceph {
1915namespace osd_cmds {
1916
1917int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
1918
1919}} // namespace ceph::osd_cmds
1920
1921int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
1922{
1923 int ret;
1924
1925 OSDSuperblock sb;
1926 bufferlist sbbl;
1927 ObjectStore::CollectionHandle ch;
1928
1929 // if we are fed a uuid for this osd, use it.
1930 store->set_fsid(cct->_conf->osd_uuid);
1931
1932 ret = store->mkfs();
1933 if (ret) {
1934 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret) << dendl;
1936 goto free_store;
1937 }
1938
1939 store->set_cache_shards(1); // doesn't matter for mkfs!
1940
1941 ret = store->mount();
1942 if (ret) {
1943 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret) << dendl;
1945 goto free_store;
1946 }
1947
1948 ch = store->open_collection(coll_t::meta());
1949 if (ch) {
1950 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1951 if (ret < 0) {
1952 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
1953 goto free_store;
1954 }
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl;
1957 auto p = sbbl.cbegin();
1958 decode(sb, p);
1959 if (whoami != sb.whoami) {
1960 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1961 << dendl;
1962 ret = -EINVAL;
1963 goto umount_store;
1964 }
1965 if (fsid != sb.cluster_fsid) {
1966 derr << "provided cluster fsid " << fsid
1967 << " != superblock's " << sb.cluster_fsid << dendl;
1968 ret = -EINVAL;
1969 goto umount_store;
1970 }
1971 } else {
1972 // create superblock
1973 sb.cluster_fsid = fsid;
1974 sb.osd_fsid = store->get_fsid();
1975 sb.whoami = whoami;
1976 sb.compat_features = get_osd_initial_compat_set();
1977
1978 bufferlist bl;
1979 encode(sb, bl);
1980
1981 ObjectStore::CollectionHandle ch = store->create_new_collection(
1982 coll_t::meta());
1983 ObjectStore::Transaction t;
1984 t.create_collection(coll_t::meta(), 0);
1985 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
1986 ret = store->queue_transaction(ch, std::move(t));
1987 if (ret) {
1988 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1989 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
1990 goto umount_store;
1991 }
1992 }
1993
1994 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
1995 if (ret) {
1996 derr << "OSD::mkfs: failed to write fsid file: error "
1997 << cpp_strerror(ret) << dendl;
1998 goto umount_store;
1999 }
2000
2001umount_store:
2002 if (ch) {
2003 ch.reset();
2004 }
2005 store->umount();
2006free_store:
2007 delete store;
2008 return ret;
2009}
2010
// Write the per-OSD identity files (magic, whoami, ceph_fsid, optional
// key material and osdspec_affinity) via the store's meta interface,
// finishing with the "ready" marker. Returns 0 or a negative errno from
// the first failing write.
int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
{
  char val[80];
  int r;

  snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
  r = store->write_meta("magic", val);
  if (r < 0)
    return r;

  snprintf(val, sizeof(val), "%d", whoami);
  r = store->write_meta("whoami", val);
  if (r < 0)
    return r;

  cluster_fsid.print(val);
  r = store->write_meta("ceph_fsid", val);
  if (r < 0)
    return r;

  // prefer an explicit key; otherwise fall back to reading a keyfile
  string key = cct->_conf.get_val<string>("key");
  if (key.size()) {
    r = store->write_meta("osd_key", key);
    if (r < 0)
      return r;
  } else {
    string keyfile = cct->_conf.get_val<string>("keyfile");
    if (!keyfile.empty()) {
      bufferlist keybl;
      string err;
      r = keybl.read_file(keyfile.c_str(), &err);
      if (r < 0) {
	derr << __func__ << " failed to read keyfile " << keyfile << ": "
	     << err << ": " << cpp_strerror(r) << dendl;
	return r;
      }
      r = store->write_meta("osd_key", keybl.to_str());
      if (r < 0)
	return r;
    }
  }
  if (!osdspec_affinity.empty()) {
    r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
    if (r < 0)
      return r;
  }

  // the "ready" marker goes last: its presence implies the rest is valid
  r = store->write_meta("ready", "ready");
  if (r < 0)
    return r;

  return 0;
}
2064
2065int OSD::peek_meta(ObjectStore *store,
2066 std::string *magic,
2067 uuid_d *cluster_fsid,
2068 uuid_d *osd_fsid,
2069 int *whoami,
2070 ceph_release_t *require_osd_release)
2071{
2072 string val;
2073
2074 int r = store->read_meta("magic", &val);
2075 if (r < 0)
2076 return r;
2077 *magic = val;
2078
2079 r = store->read_meta("whoami", &val);
2080 if (r < 0)
2081 return r;
2082 *whoami = atoi(val.c_str());
2083
2084 r = store->read_meta("ceph_fsid", &val);
2085 if (r < 0)
2086 return r;
2087 r = cluster_fsid->parse(val.c_str());
2088 if (!r)
2089 return -EINVAL;
2090
2091 r = store->read_meta("fsid", &val);
2092 if (r < 0) {
2093 *osd_fsid = uuid_d();
2094 } else {
2095 r = osd_fsid->parse(val.c_str());
2096 if (!r)
2097 return -EINVAL;
2098 }
2099
2100 r = store->read_meta("require_osd_release", &val);
2101 if (r >= 0) {
2102 *require_osd_release = ceph_release_from_name(val);
2103 }
2104
2105 return 0;
2106}
2107
2108
2109#undef dout_prefix
2110#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2111
2112// cons/des
2113
// OSD constructor.  Wires together the messengers, mon/mgr/log clients,
// op tracker, thread pool and the sharded op work queue, then creates
// the per-shard structures.  No store I/O happens here; mounting the
// store and loading maps is deferred to later init steps.
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  tick_timer(cct, osd_lock),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger, &mc->monmap),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  last_pg_create_epoch(0),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  service(this)
{

  // If a GSSAPI client keytab is configured, export it via the
  // environment so the Kerberos library picks it up; failure to set
  // the variable is fatal.
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
     The default client keytab is used, if it is present and readable,
     to automatically obtain initial credentials for GSSAPI client
     applications. The principal name of the first entry in the client
     keytab is used by default when obtaining initial credentials.
     1. The KRB5_CLIENT_KTNAME environment variable.
     2. The default_client_keytab_name profile variable in [libdefaults].
     3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
                                    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // seed op-tracker complaint/history thresholds from the current config
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
  ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
#ifdef WITH_BLKIN
  // label the blkin trace endpoint with this OSD's id
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this);
    shards.push_back(one_shard);
  }
}
2214
2215OSD::~OSD()
2216{
2217 while (!shards.empty()) {
2218 delete shards.back();
2219 shards.pop_back();
2220 }
2221 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2222 cct->get_perfcounters_collection()->remove(logger);
2223 delete recoverystate_perf;
2224 delete logger;
2225 delete store;
2226}
2227
2228double OSD::get_tick_interval() const
2229{
2230 // vary +/- 5% to avoid scrub scheduling livelocks
2231 constexpr auto delta = 0.05;
2232 return (OSD_TICK_INTERVAL *
2233 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2234}
2235
2236void OSD::handle_signal(int signum)
2237{
2238 ceph_assert(signum == SIGINT || signum == SIGTERM);
2239 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2240 shutdown();
2241}
2242
2243int OSD::pre_init()
2244{
2245 std::lock_guard lock(osd_lock);
2246 if (is_stopping())
2247 return 0;
2248
2249 if (store->test_mount_in_use()) {
2250 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2251 << "currently in use. (Is ceph-osd already running?)" << dendl;
2252 return -EBUSY;
2253 }
2254
2255 cct->_conf.add_observer(this);
2256 return 0;
2257}
2258
// Best-effort NUMA pinning: if the object store and both network
// interfaces resolve to the same NUMA node (and auto-affinity is
// enabled), or if osd_numa_node is explicitly configured, bind all
// threads to that node's CPU set.  Always returns 0 — failures are
// only logged, never fatal.
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
            << front_node << dendl;
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
	      << back_node << dendl;
      // only auto-pin when storage, public and cluster networks all
      // agree on one node
      if (front_node == back_node &&
	  front_node == store_node) {
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else if (front_node != back_node) {
	dout(1) << __func__ << " public and cluster network numa nodes do not match"
		<< dendl;
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    } else if (back_node == -2) {
      // -2: the interface's ports resolved to different numa nodes
      // (per the log text; see get_iface_numa_node)
      dout(1) << __func__ << " cluster network " << back_iface
	      << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
           << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    dout(1) << __func__ << " public network " << front_iface
	    << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    // note: this r intentionally shadows the outer r
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	// fall back to unpinned operation
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  // affinity is best-effort: never fail OSD startup over it
  return 0;
}
2339
2340// asok
2341
// Admin-socket adapter: forwards asok commands to OSD::asok_command().
// Only the asynchronous entry point is used; the synchronous call()
// must never be reached.
class OSDSocketHook : public AdminSocketHook {
  OSD *osd;  // back-pointer to the owning OSD; not owned by this hook
public:
  explicit OSDSocketHook(OSD *o) : osd(o) {}
  // Synchronous interface is unsupported for the OSD; commands are
  // dispatched via call_async() instead.
  int call(std::string_view prefix, const cmdmap_t& cmdmap,
	   Formatter *f,
	   std::ostream& ss,
	   bufferlist& out) override {
    ceph_abort("should use async hook");
  }
  // Dispatch the command to the OSD; translate malformed-argument
  // exceptions (bad_cmd_get) into an -EINVAL completion rather than
  // letting them propagate out of the admin socket thread.
  void call_async(
    std::string_view prefix,
    const cmdmap_t& cmdmap,
    Formatter *f,
    const bufferlist& inbl,
    std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
    try {
      osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
    } catch (const TOPNSPC::common::bad_cmd_get& e) {
      bufferlist empty;
      on_finish(-EINVAL, e.what(), empty);
    }
  }
};
2366
2367std::set<int64_t> OSD::get_mapped_pools()
2368{
2369 std::set<int64_t> pools;
2370 std::vector<spg_t> pgids;
2371 _get_pgids(&pgids);
2372 for (const auto &pgid : pgids) {
2373 pools.insert(pgid.pool());
2374 }
2375 return pools;
2376}
2377
2378void OSD::asok_command(
2379 std::string_view prefix, const cmdmap_t& cmdmap,
2380 Formatter *f,
2381 const bufferlist& inbl,
2382 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2383{
2384 int ret = 0;
2385 stringstream ss; // stderr error message stream
2386 bufferlist outbl; // if empty at end, we'll dump formatter as output
2387
2388 // --- PG commands are routed here to PG::do_command ---
2389 if (prefix == "pg" ||
2390 prefix == "query" ||
2391 prefix == "mark_unfound_lost" ||
2392 prefix == "list_unfound" ||
2393 prefix == "scrub" ||
2394 prefix == "deep_scrub"
2395 ) {
2396 string pgidstr;
2397 pg_t pgid;
2398 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2399 ss << "no pgid specified";
2400 ret = -EINVAL;
2401 goto out;
2402 }
2403 if (!pgid.parse(pgidstr.c_str())) {
2404 ss << "couldn't parse pgid '" << pgidstr << "'";
2405 ret = -EINVAL;
2406 goto out;
2407 }
2408 spg_t pcand;
2409 PGRef pg;
2410 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2411 (pg = _lookup_lock_pg(pcand))) {
2412 if (pg->is_primary()) {
2413 cmdmap_t new_cmdmap = cmdmap;
2414 try {
2415 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2416 pg->unlock();
2417 return; // the pg handler calls on_finish directly
2418 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2419 pg->unlock();
2420 ss << e.what();
2421 ret = -EINVAL;
2422 goto out;
2423 }
2424 } else {
2425 ss << "not primary for pgid " << pgid;
2426 // do not reply; they will get newer maps and realize they
2427 // need to resend.
2428 pg->unlock();
2429 ret = -EAGAIN;
2430 goto out;
2431 }
2432 } else {
2433 ss << "i don't have pgid " << pgid;
2434 ret = -ENOENT;
2435 }
2436 }
2437
2438 // --- OSD commands follow ---
2439
2440 else if (prefix == "status") {
2441 lock_guard l(osd_lock);
2442 f->open_object_section("status");
2443 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2444 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2445 f->dump_unsigned("whoami", superblock.whoami);
2446 f->dump_string("state", get_state_name(get_state()));
2447 f->dump_unsigned("oldest_map", superblock.oldest_map);
2448 f->dump_unsigned("newest_map", superblock.newest_map);
2449 f->dump_unsigned("num_pgs", num_pgs);
2450 f->close_section();
2451 } else if (prefix == "flush_journal") {
2452 store->flush_journal();
2453 } else if (prefix == "dump_ops_in_flight" ||
2454 prefix == "ops" ||
2455 prefix == "dump_blocked_ops" ||
2456 prefix == "dump_historic_ops" ||
2457 prefix == "dump_historic_ops_by_duration" ||
2458 prefix == "dump_historic_slow_ops") {
2459
2460 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2461even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2462will start to track new ops received afterwards.";
2463
2464 set<string> filters;
2465 vector<string> filter_str;
2466 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2467 copy(filter_str.begin(), filter_str.end(),
2468 inserter(filters, filters.end()));
2469 }
2470
2471 if (prefix == "dump_ops_in_flight" ||
2472 prefix == "ops") {
2473 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2474 ss << error_str;
2475 ret = -EINVAL;
2476 goto out;
2477 }
2478 }
2479 if (prefix == "dump_blocked_ops") {
2480 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2481 ss << error_str;
2482 ret = -EINVAL;
2483 goto out;
2484 }
2485 }
2486 if (prefix == "dump_historic_ops") {
2487 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2488 ss << error_str;
2489 ret = -EINVAL;
2490 goto out;
2491 }
2492 }
2493 if (prefix == "dump_historic_ops_by_duration") {
2494 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2495 ss << error_str;
2496 ret = -EINVAL;
2497 goto out;
2498 }
2499 }
2500 if (prefix == "dump_historic_slow_ops") {
2501 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2502 ss << error_str;
2503 ret = -EINVAL;
2504 goto out;
2505 }
2506 }
2507 } else if (prefix == "dump_op_pq_state") {
2508 f->open_object_section("pq");
2509 op_shardedwq.dump(f);
2510 f->close_section();
2511 } else if (prefix == "dump_blacklist") {
2512 list<pair<entity_addr_t,utime_t> > bl;
2513 OSDMapRef curmap = service.get_osdmap();
2514
2515 f->open_array_section("blacklist");
2516 curmap->get_blacklist(&bl);
2517 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2518 it != bl.end(); ++it) {
2519 f->open_object_section("entry");
2520 f->open_object_section("entity_addr_t");
2521 it->first.dump(f);
2522 f->close_section(); //entity_addr_t
2523 it->second.localtime(f->dump_stream("expire_time"));
2524 f->close_section(); //entry
2525 }
2526 f->close_section(); //blacklist
2527 } else if (prefix == "dump_watchers") {
2528 list<obj_watch_item_t> watchers;
2529 // scan pg's
2530 vector<PGRef> pgs;
2531 _get_pgs(&pgs);
2532 for (auto& pg : pgs) {
2533 list<obj_watch_item_t> pg_watchers;
2534 pg->get_watchers(&pg_watchers);
2535 watchers.splice(watchers.end(), pg_watchers);
2536 }
2537
2538 f->open_array_section("watchers");
2539 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2540 it != watchers.end(); ++it) {
2541
2542 f->open_object_section("watch");
2543
2544 f->dump_string("namespace", it->obj.nspace);
2545 f->dump_string("object", it->obj.oid.name);
2546
2547 f->open_object_section("entity_name");
2548 it->wi.name.dump(f);
2549 f->close_section(); //entity_name_t
2550
2551 f->dump_unsigned("cookie", it->wi.cookie);
2552 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2553
2554 f->open_object_section("entity_addr_t");
2555 it->wi.addr.dump(f);
2556 f->close_section(); //entity_addr_t
2557
2558 f->close_section(); //watch
2559 }
2560
2561 f->close_section(); //watchers
2562 } else if (prefix == "dump_recovery_reservations") {
2563 f->open_object_section("reservations");
2564 f->open_object_section("local_reservations");
2565 service.local_reserver.dump(f);
2566 f->close_section();
2567 f->open_object_section("remote_reservations");
2568 service.remote_reserver.dump(f);
2569 f->close_section();
2570 f->close_section();
2571 } else if (prefix == "dump_scrub_reservations") {
2572 f->open_object_section("scrub_reservations");
2573 service.dump_scrub_reservations(f);
2574 f->close_section();
2575 } else if (prefix == "get_latest_osdmap") {
2576 get_latest_osdmap();
2577 } else if (prefix == "set_heap_property") {
2578 string property;
2579 int64_t value = 0;
2580 string error;
2581 bool success = false;
2582 if (!cmd_getval(cmdmap, "property", property)) {
2583 error = "unable to get property";
2584 success = false;
2585 } else if (!cmd_getval(cmdmap, "value", value)) {
2586 error = "unable to get value";
2587 success = false;
2588 } else if (value < 0) {
2589 error = "negative value not allowed";
2590 success = false;
2591 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2592 error = "invalid property";
2593 success = false;
2594 } else {
2595 success = true;
2596 }
2597 f->open_object_section("result");
2598 f->dump_string("error", error);
2599 f->dump_bool("success", success);
2600 f->close_section();
2601 } else if (prefix == "get_heap_property") {
2602 string property;
2603 size_t value = 0;
2604 string error;
2605 bool success = false;
2606 if (!cmd_getval(cmdmap, "property", property)) {
2607 error = "unable to get property";
2608 success = false;
2609 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2610 error = "invalid property";
2611 success = false;
2612 } else {
2613 success = true;
2614 }
2615 f->open_object_section("result");
2616 f->dump_string("error", error);
2617 f->dump_bool("success", success);
2618 f->dump_int("value", value);
2619 f->close_section();
2620 } else if (prefix == "dump_objectstore_kv_stats") {
2621 store->get_db_statistics(f);
2622 } else if (prefix == "dump_scrubs") {
2623 service.dumps_scrub(f);
2624 } else if (prefix == "calc_objectstore_db_histogram") {
2625 store->generate_db_histogram(f);
2626 } else if (prefix == "flush_store_cache") {
2627 store->flush_cache(&ss);
2628 } else if (prefix == "dump_pgstate_history") {
2629 f->open_object_section("pgstate_history");
2630 f->open_array_section("pgs");
2631 vector<PGRef> pgs;
2632 _get_pgs(&pgs);
2633 for (auto& pg : pgs) {
2634 f->open_object_section("pg");
2635 f->dump_stream("pg") << pg->pg_id;
2636 f->dump_string("currently", pg->get_current_state());
2637 pg->dump_pgstate_history(f);
2638 f->close_section();
2639 }
2640 f->close_section();
2641 f->close_section();
2642 } else if (prefix == "compact") {
2643 dout(1) << "triggering manual compaction" << dendl;
2644 auto start = ceph::coarse_mono_clock::now();
2645 store->compact();
2646 auto end = ceph::coarse_mono_clock::now();
2647 double duration = std::chrono::duration<double>(end-start).count();
2648 dout(1) << "finished manual compaction in "
2649 << duration
2650 << " seconds" << dendl;
2651 f->open_object_section("compact_result");
2652 f->dump_float("elapsed_time", duration);
2653 f->close_section();
2654 } else if (prefix == "get_mapped_pools") {
2655 f->open_array_section("mapped_pools");
2656 set<int64_t> poollist = get_mapped_pools();
2657 for (auto pool : poollist) {
2658 f->dump_int("pool_id", pool);
2659 }
2660 f->close_section();
2661 } else if (prefix == "smart") {
2662 string devid;
2663 cmd_getval(cmdmap, "devid", devid);
2664 ostringstream out;
2665 probe_smart(devid, out);
2666 outbl.append(out.str());
2667 } else if (prefix == "list_devices") {
2668 set<string> devnames;
2669 store->get_devices(&devnames);
2670 f->open_array_section("list_devices");
2671 for (auto dev : devnames) {
2672 if (dev.find("dm-") == 0) {
2673 continue;
2674 }
2675 string err;
2676 f->open_object_section("device");
2677 f->dump_string("device", "/dev/" + dev);
2678 f->dump_string("device_id", get_device_id(dev, &err));
2679 f->close_section();
2680 }
2681 f->close_section();
2682 } else if (prefix == "send_beacon") {
2683 lock_guard l(osd_lock);
2684 if (is_active()) {
2685 send_beacon(ceph::coarse_mono_clock::now());
2686 }
2687 }
2688
2689 else if (prefix == "cluster_log") {
2690 vector<string> msg;
2691 cmd_getval(cmdmap, "message", msg);
2692 if (msg.empty()) {
2693 ret = -EINVAL;
2694 ss << "ignoring empty log message";
2695 goto out;
2696 }
2697 string message = msg.front();
2698 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2699 message += " " + *a;
2700 string lvl;
2701 cmd_getval(cmdmap, "level", lvl);
2702 clog_type level = string_to_clog_type(lvl);
2703 if (level < 0) {
2704 ret = -EINVAL;
2705 ss << "unknown level '" << lvl << "'";
2706 goto out;
2707 }
2708 clog->do_log(level, message);
2709 }
2710
2711 else if (prefix == "bench") {
2712 int64_t count;
2713 int64_t bsize;
2714 int64_t osize, onum;
2715 // default count 1G, size 4MB
2716 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2717 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2718 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2719 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2720
2721 uint32_t duration = cct->_conf->osd_bench_duration;
2722
2723 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2724 // let us limit the block size because the next checks rely on it
2725 // having a sane value. If we allow any block size to be set things
2726 // can still go sideways.
2727 ss << "block 'size' values are capped at "
2728 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2729 << " a higher value, please adjust 'osd_bench_max_block_size'";
2730 ret = -EINVAL;
2731 goto out;
2732 } else if (bsize < (int64_t) (1 << 20)) {
2733 // entering the realm of small block sizes.
2734 // limit the count to a sane value, assuming a configurable amount of
2735 // IOPS and duration, so that the OSD doesn't get hung up on this,
2736 // preventing timeouts from going off
2737 int64_t max_count =
2738 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2739 if (count > max_count) {
2740 ss << "'count' values greater than " << max_count
2741 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2742 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2743 << " for " << duration << " seconds,"
2744 << " can cause ill effects on osd. "
2745 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2746 << " value if you wish to use a higher 'count'.";
2747 ret = -EINVAL;
2748 goto out;
2749 }
2750 } else {
2751 // 1MB block sizes are big enough so that we get more stuff done.
2752 // However, to avoid the osd from getting hung on this and having
2753 // timers being triggered, we are going to limit the count assuming
2754 // a configurable throughput and duration.
2755 // NOTE: max_count is the total amount of bytes that we believe we
2756 // will be able to write during 'duration' for the given
2757 // throughput. The block size hardly impacts this unless it's
2758 // way too big. Given we already check how big the block size
2759 // is, it's safe to assume everything will check out.
2760 int64_t max_count =
2761 cct->_conf->osd_bench_large_size_max_throughput * duration;
2762 if (count > max_count) {
2763 ss << "'count' values greater than " << max_count
2764 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2765 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2766 << " for " << duration << " seconds,"
2767 << " can cause ill effects on osd. "
2768 << " Please adjust 'osd_bench_large_size_max_throughput'"
2769 << " with a higher value if you wish to use a higher 'count'.";
2770 ret = -EINVAL;
2771 goto out;
2772 }
2773 }
2774
2775 if (osize && bsize > osize)
2776 bsize = osize;
2777
2778 dout(1) << " bench count " << count
2779 << " bsize " << byte_u_t(bsize) << dendl;
2780
2781 ObjectStore::Transaction cleanupt;
2782
2783 if (osize && onum) {
2784 bufferlist bl;
2785 bufferptr bp(osize);
2786 bp.zero();
2787 bl.push_back(std::move(bp));
2788 bl.rebuild_page_aligned();
2789 for (int i=0; i<onum; ++i) {
2790 char nm[30];
2791 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2792 object_t oid(nm);
2793 hobject_t soid(sobject_t(oid, 0));
2794 ObjectStore::Transaction t;
2795 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
2796 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2797 cleanupt.remove(coll_t(), ghobject_t(soid));
2798 }
2799 }
2800
2801 bufferlist bl;
2802 bufferptr bp(bsize);
2803 bp.zero();
2804 bl.push_back(std::move(bp));
2805 bl.rebuild_page_aligned();
2806
2807 {
2808 C_SaferCond waiter;
2809 if (!service.meta_ch->flush_commit(&waiter)) {
2810 waiter.wait();
2811 }
2812 }
2813
2814 utime_t start = ceph_clock_now();
2815 for (int64_t pos = 0; pos < count; pos += bsize) {
2816 char nm[30];
2817 unsigned offset = 0;
2818 if (onum && osize) {
2819 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2820 offset = rand() % (osize / bsize) * bsize;
2821 } else {
2822 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2823 }
2824 object_t oid(nm);
2825 hobject_t soid(sobject_t(oid, 0));
2826 ObjectStore::Transaction t;
2827 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2828 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2829 if (!onum || !osize)
2830 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2831 }
2832
2833 {
2834 C_SaferCond waiter;
2835 if (!service.meta_ch->flush_commit(&waiter)) {
2836 waiter.wait();
2837 }
2838 }
2839 utime_t end = ceph_clock_now();
2840
2841 // clean up
2842 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2843 {
2844 C_SaferCond waiter;
2845 if (!service.meta_ch->flush_commit(&waiter)) {
2846 waiter.wait();
2847 }
2848 }
2849
2850 double elapsed = end - start;
2851 double rate = count / elapsed;
2852 double iops = rate / bsize;
2853 f->open_object_section("osd_bench_results");
2854 f->dump_int("bytes_written", count);
2855 f->dump_int("blocksize", bsize);
2856 f->dump_float("elapsed_sec", elapsed);
2857 f->dump_float("bytes_per_sec", rate);
2858 f->dump_float("iops", iops);
2859 f->close_section();
2860 }
2861
2862 else if (prefix == "flush_pg_stats") {
2863 mgrc.send_pgstats();
2864 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2865 }
2866
2867 else if (prefix == "heap") {
2868 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2869 }
2870
2871 else if (prefix == "debug dump_missing") {
2872 f->open_array_section("pgs");
2873 vector<PGRef> pgs;
2874 _get_pgs(&pgs);
2875 for (auto& pg : pgs) {
2876 string s = stringify(pg->pg_id);
2877 f->open_array_section(s.c_str());
2878 pg->lock();
2879 pg->dump_missing(f);
2880 pg->unlock();
2881 f->close_section();
2882 }
2883 f->close_section();
2884 }
2885
2886 else if (prefix == "debug kick_recovery_wq") {
2887 int64_t delay;
2888 cmd_getval(cmdmap, "delay", delay);
2889 ostringstream oss;
2890 oss << delay;
2891 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2892 if (ret != 0) {
2893 ss << "kick_recovery_wq: error setting "
2894 << "osd_recovery_delay_start to '" << delay << "': error "
2895 << ret;
2896 goto out;
2897 }
2898 cct->_conf.apply_changes(nullptr);
2899 ss << "kicking recovery queue. set osd_recovery_delay_start "
2900 << "to " << cct->_conf->osd_recovery_delay_start;
2901 }
2902
2903 else if (prefix == "cpu_profiler") {
2904 ostringstream ds;
2905 string arg;
2906 cmd_getval(cmdmap, "arg", arg);
2907 vector<string> argvec;
2908 get_str_vec(arg, argvec);
2909 cpu_profiler_handle_command(argvec, ds);
2910 outbl.append(ds.str());
2911 }
2912
2913 else if (prefix == "dump_pg_recovery_stats") {
2914 lock_guard l(osd_lock);
2915 pg_recovery_stats.dump_formatted(f);
2916 }
2917
2918 else if (prefix == "reset_pg_recovery_stats") {
2919 lock_guard l(osd_lock);
2920 pg_recovery_stats.reset();
2921 }
2922
2923 else if (prefix == "perf histogram dump") {
2924 std::string logger;
2925 std::string counter;
2926 cmd_getval(cmdmap, "logger", logger);
2927 cmd_getval(cmdmap, "counter", counter);
2928 cct->get_perfcounters_collection()->dump_formatted_histograms(
2929 f, false, logger, counter);
2930 }
2931
2932 else if (prefix == "cache drop") {
2933 lock_guard l(osd_lock);
2934 dout(20) << "clearing all caches" << dendl;
2935 // Clear the objectstore's cache - onode and buffer for Bluestore,
2936 // system's pagecache for Filestore
2937 ret = store->flush_cache(&ss);
2938 if (ret < 0) {
2939 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2940 goto out;
2941 }
2942 // Clear the objectcontext cache (per PG)
2943 vector<PGRef> pgs;
2944 _get_pgs(&pgs);
2945 for (auto& pg: pgs) {
2946 pg->clear_cache();
2947 }
2948 }
2949
2950 else if (prefix == "cache status") {
2951 lock_guard l(osd_lock);
2952 int obj_ctx_count = 0;
2953 vector<PGRef> pgs;
2954 _get_pgs(&pgs);
2955 for (auto& pg: pgs) {
2956 obj_ctx_count += pg->get_cache_obj_count();
2957 }
2958 f->open_object_section("cache_status");
2959 f->dump_int("object_ctx", obj_ctx_count);
2960 store->dump_cache_stats(f);
2961 f->close_section();
2962 }
2963
2964 else if (prefix == "scrub_purged_snaps") {
2965 lock_guard l(osd_lock);
2966 scrub_purged_snaps();
2967 }
2968
2969 else if (prefix == "dump_osd_network") {
2970 lock_guard l(osd_lock);
2971 int64_t value = 0;
2972 if (!(cmd_getval(cmdmap, "value", value))) {
2973 // Convert milliseconds to microseconds
2974 value = static_cast<double>(g_conf().get_val<double>(
2975 "mon_warn_on_slow_ping_time")) * 1000;
2976 if (value == 0) {
2977 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2978 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2979 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2980 }
2981 } else {
2982 // Convert user input to microseconds
2983 value *= 1000;
2984 }
2985 if (value < 0) value = 0;
2986
2987 struct osd_ping_time_t {
2988 uint32_t pingtime;
2989 int to;
2990 bool back;
2991 std::array<uint32_t,3> times;
2992 std::array<uint32_t,3> min;
2993 std::array<uint32_t,3> max;
2994 uint32_t last;
2995 uint32_t last_update;
2996
2997 bool operator<(const osd_ping_time_t& rhs) const {
2998 if (pingtime < rhs.pingtime)
2999 return true;
3000 if (pingtime > rhs.pingtime)
3001 return false;
3002 if (to < rhs.to)
3003 return true;
3004 if (to > rhs.to)
3005 return false;
3006 return back;
3007 }
3008 };
3009
3010 set<osd_ping_time_t> sorted;
3011 // Get pingtimes under lock and not on the stack
3012 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3013 service.get_hb_pingtime(pingtimes);
3014 for (auto j : *pingtimes) {
3015 if (j.second.last_update == 0)
3016 continue;
3017 osd_ping_time_t item;
3018 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3019 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3020 if (item.pingtime >= value) {
3021 item.to = j.first;
3022 item.times[0] = j.second.back_pingtime[0];
3023 item.times[1] = j.second.back_pingtime[1];
3024 item.times[2] = j.second.back_pingtime[2];
3025 item.min[0] = j.second.back_min[0];
3026 item.min[1] = j.second.back_min[1];
3027 item.min[2] = j.second.back_min[2];
3028 item.max[0] = j.second.back_max[0];
3029 item.max[1] = j.second.back_max[1];
3030 item.max[2] = j.second.back_max[2];
3031 item.last = j.second.back_last;
3032 item.back = true;
3033 item.last_update = j.second.last_update;
3034 sorted.emplace(item);
3035 }
3036 if (j.second.front_last == 0)
3037 continue;
3038 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3039 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3040 if (item.pingtime >= value) {
3041 item.to = j.first;
3042 item.times[0] = j.second.front_pingtime[0];
3043 item.times[1] = j.second.front_pingtime[1];
3044 item.times[2] = j.second.front_pingtime[2];
3045 item.min[0] = j.second.front_min[0];
3046 item.min[1] = j.second.front_min[1];
3047 item.min[2] = j.second.front_min[2];
3048 item.max[0] = j.second.front_max[0];
3049 item.max[1] = j.second.front_max[1];
3050 item.max[2] = j.second.front_max[2];
3051 item.last = j.second.front_last;
3052 item.last_update = j.second.last_update;
3053 item.back = false;
3054 sorted.emplace(item);
3055 }
3056 }
3057 delete pingtimes;
3058 //
3059 // Network ping times (1min 5min 15min)
3060 f->open_object_section("network_ping_times");
3061 f->dump_int("threshold", value / 1000);
3062 f->open_array_section("entries");
3063 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3064 ceph_assert(sitem.pingtime >= value);
3065 f->open_object_section("entry");
3066
3067 const time_t lu(sitem.last_update);
3068 char buffer[26];
3069 string lustr(ctime_r(&lu, buffer));
3070 lustr.pop_back(); // Remove trailing \n
3071 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3072 f->dump_string("last update", lustr);
3073 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3074 f->dump_int("from osd", whoami);
3075 f->dump_int("to osd", sitem.to);
3076 f->dump_string("interface", (sitem.back ? "back" : "front"));
3077 f->open_object_section("average");
3078 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3079 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3080 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3081 f->close_section(); // average
3082 f->open_object_section("min");
3083 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3084 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3085 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3086 f->close_section(); // min
3087 f->open_object_section("max");
3088 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3089 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3090 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3091 f->close_section(); // max
3092 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3093 f->close_section(); // entry
3094 }
3095 f->close_section(); // entries
3096 f->close_section(); // network_ping_times
3097 } else {
3098 ceph_abort_msg("broken asok registration");
3099 }
3100
3101 out:
3102 on_finish(ret, ss.str(), outbl);
3103}
3104
3105class TestOpsSocketHook : public AdminSocketHook {
3106 OSDService *service;
3107 ObjectStore *store;
3108public:
3109 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3110 int call(std::string_view command, const cmdmap_t& cmdmap,
3111 Formatter *f,
3112 std::ostream& errss,
3113 bufferlist& out) override {
3114 int r = 0;
3115 stringstream outss;
3116 try {
3117 test_ops(service, store, command, cmdmap, outss);
3118 out.append(outss);
3119 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3120 errss << e.what();
3121 r = -EINVAL;
3122 }
3123 return r;
3124 }
3125 void test_ops(OSDService *service, ObjectStore *store,
3126 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3127
3128};
3129
3130class OSD::C_Tick : public Context {
3131 OSD *osd;
3132 public:
3133 explicit C_Tick(OSD *o) : osd(o) {}
3134 void finish(int r) override {
3135 osd->tick();
3136 }
3137};
3138
3139class OSD::C_Tick_WithoutOSDLock : public Context {
3140 OSD *osd;
3141 public:
3142 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3143 void finish(int r) override {
3144 osd->tick_without_osd_lock();
3145 }
3146};
3147
3148int OSD::enable_disable_fuse(bool stop)
3149{
3150#ifdef HAVE_LIBFUSE
3151 int r;
3152 string mntpath = cct->_conf->osd_data + "/fuse";
3153 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3154 dout(1) << __func__ << " disabling" << dendl;
3155 fuse_store->stop();
3156 delete fuse_store;
3157 fuse_store = NULL;
3158 r = ::rmdir(mntpath.c_str());
3159 if (r < 0) {
3160 r = -errno;
3161 derr << __func__ << " failed to rmdir " << mntpath << ": "
3162 << cpp_strerror(r) << dendl;
3163 return r;
3164 }
3165 return 0;
3166 }
3167 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3168 dout(1) << __func__ << " enabling" << dendl;
3169 r = ::mkdir(mntpath.c_str(), 0700);
3170 if (r < 0)
3171 r = -errno;
3172 if (r < 0 && r != -EEXIST) {
3173 derr << __func__ << " unable to create " << mntpath << ": "
3174 << cpp_strerror(r) << dendl;
3175 return r;
3176 }
3177 fuse_store = new FuseStore(store, mntpath);
3178 r = fuse_store->start();
3179 if (r < 0) {
3180 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3181 delete fuse_store;
3182 fuse_store = NULL;
3183 return r;
3184 }
3185 }
3186#endif // HAVE_LIBFUSE
3187 return 0;
3188}
3189
3190size_t OSD::get_num_cache_shards()
3191{
3192 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3193}
3194
3195int OSD::get_num_op_shards()
3196{
3197 if (cct->_conf->osd_op_num_shards)
3198 return cct->_conf->osd_op_num_shards;
3199 if (store_is_rotational)
3200 return cct->_conf->osd_op_num_shards_hdd;
3201 else
3202 return cct->_conf->osd_op_num_shards_ssd;
3203}
3204
3205int OSD::get_num_op_threads()
3206{
3207 if (cct->_conf->osd_op_num_threads_per_shard)
3208 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3209 if (store_is_rotational)
3210 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3211 else
3212 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3213}
3214
3215float OSD::get_osd_recovery_sleep()
3216{
3217 if (cct->_conf->osd_recovery_sleep)
3218 return cct->_conf->osd_recovery_sleep;
3219 if (!store_is_rotational && !journal_is_rotational)
3220 return cct->_conf->osd_recovery_sleep_ssd;
3221 else if (store_is_rotational && !journal_is_rotational)
3222 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3223 else
3224 return cct->_conf->osd_recovery_sleep_hdd;
3225}
3226
3227float OSD::get_osd_delete_sleep()
3228{
3229 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3230 if (osd_delete_sleep > 0)
3231 return osd_delete_sleep;
3232 if (!store_is_rotational && !journal_is_rotational)
3233 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3234 if (store_is_rotational && !journal_is_rotational)
3235 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3236 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3237}
3238
3239int OSD::get_recovery_max_active()
3240{
3241 if (cct->_conf->osd_recovery_max_active)
3242 return cct->_conf->osd_recovery_max_active;
3243 if (store_is_rotational)
3244 return cct->_conf->osd_recovery_max_active_hdd;
3245 else
3246 return cct->_conf->osd_recovery_max_active_ssd;
3247}
3248
3249float OSD::get_osd_snap_trim_sleep()
3250{
3251 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3252 if (osd_snap_trim_sleep > 0)
3253 return osd_snap_trim_sleep;
3254 if (!store_is_rotational && !journal_is_rotational)
3255 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3256 if (store_is_rotational && !journal_is_rotational)
3257 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3258 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3259}
3260
// One-time bring-up of the OSD after construction and pre_init():
// mounts the object store, validates the superblock and compat set,
// loads the current OSDMap and the PGs, wires up messenger auth and
// the mon/mgr clients, starts worker threads and timers, and finally
// kicks off the boot handshake via start_boot().
// Returns 0 on success or a negative error code; on failure (the
// "out" label) the store is unmounted and deleted.
// Called with osd_lock NOT held; the lock is taken here and dropped
// temporarily around monc authentication / crush updates below.
int OSD::init()
{
  OSDMapRef osdmap;
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  // bring up the tick and recovery/sleep timers before anything can
  // schedule on them
  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  // remember the cluster-required release recorded in the store meta
  {
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = ceph_release_from_name(val);
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_cache_shards());

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  // journal media type is only known after mount
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
	  << dendl;

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  // auth retry budget, used further down after osd_lock is dropped
  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  // sanity check long object name handling
  {
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << "   osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << "   osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // refuse to run against an on-disk format newer than this binary
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      // shadows the outer 'diff' declared at the top of init()
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  // the data directory must belong to this OSD id
  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  startup_time = ceph::mono_clock::now();

  // load up "current" osdmap
  assert_warn(!get_osdmap());
  if (get_osdmap()) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);
  set_osdmap(osdmap);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);  // shadows the outer 'r'
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  // upgrade the superblock's compat set to this binary's, persisting
  // the change (and any snap-mapper conversion) before anything else
  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // Are we adding SNAPMAPPER2?
    if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
      dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
	      << dendl;
      auto ch = service.meta_ch;
      auto hoid = make_snapmapper_oid();
      unsigned max = cct->_conf->osd_target_transaction_size;
      r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
      if (r < 0)
	goto out;
    }
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }
  // likewise for the purged_snaps tracking object
  if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
    dout(10) << "init creating/touching purged_snaps object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // optionally pre-load rados classes; failure here is only a warning
  if (cct->_conf->osd_open_classes_on_start) {
    int r = ClassHandler::get_instance().open_all_classes();  // shadows outer 'r'
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;

  create_logger();

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);  // shadows outer 'r'
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger auth_client is already set up by monc.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
                      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  // hook the mgr client up to pg stats and perf-metric queries
  mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const ConfigPayload &config_payload) {
      set_perf_queries(config_payload);
    },
    [this] {
      return get_perf_reports();
    });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter.get());

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // prime pending splits/merges for PGs whose maps lag the current one
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      std::scoped_lock l{*pg};
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	// NOTE(review): plain assert here, unlike ceph_assert used
	// elsewhere in this file -- confirm this is intentional
	assert(new_children.empty());
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	// NOTE(review): plain assert, see note above
	assert(merge_pgs.empty());
      }
    }
  }

  osd_op_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // drop osd_lock while we block on the monitor for auth/crush updates
  osd_lock.unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
	 << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  // re-take the lock; shutdown may have raced with us while it was dropped
  osd_lock.lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;

out:
  // error path: tear down the fuse mount and the mounted store
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
3662
3663void OSD::final_init()
3664{
3665 AdminSocket *admin_socket = cct->get_admin_socket();
3666 asok_hook = new OSDSocketHook(this);
3667 int r = admin_socket->register_command("status", asok_hook,
3668 "high-level status of OSD");
3669 ceph_assert(r == 0);
3670 r = admin_socket->register_command("flush_journal",
3671 asok_hook,
3672 "flush the journal to permanent store");
3673 ceph_assert(r == 0);
3674 r = admin_socket->register_command("dump_ops_in_flight " \
3675 "name=filterstr,type=CephString,n=N,req=false",
3676 asok_hook,
3677 "show the ops currently in flight");
3678 ceph_assert(r == 0);
3679 r = admin_socket->register_command("ops " \
3680 "name=filterstr,type=CephString,n=N,req=false",
3681 asok_hook,
3682 "show the ops currently in flight");
3683 ceph_assert(r == 0);
3684 r = admin_socket->register_command("dump_blocked_ops " \
3685 "name=filterstr,type=CephString,n=N,req=false",
3686 asok_hook,
3687 "show the blocked ops currently in flight");
3688 ceph_assert(r == 0);
3689 r = admin_socket->register_command("dump_historic_ops " \
3690 "name=filterstr,type=CephString,n=N,req=false",
3691 asok_hook,
3692 "show recent ops");
3693 ceph_assert(r == 0);
3694 r = admin_socket->register_command("dump_historic_slow_ops " \
3695 "name=filterstr,type=CephString,n=N,req=false",
3696 asok_hook,
3697 "show slowest recent ops");
3698 ceph_assert(r == 0);
3699 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3700 "name=filterstr,type=CephString,n=N,req=false",
3701 asok_hook,
3702 "show slowest recent ops, sorted by duration");
3703 ceph_assert(r == 0);
3704 r = admin_socket->register_command("dump_op_pq_state",
3705 asok_hook,
3706 "dump op priority queue state");
3707 ceph_assert(r == 0);
3708 r = admin_socket->register_command("dump_blacklist",
3709 asok_hook,
3710 "dump blacklisted clients and times");
3711 ceph_assert(r == 0);
3712 r = admin_socket->register_command("dump_watchers",
3713 asok_hook,
3714 "show clients which have active watches,"
3715 " and on which objects");
3716 ceph_assert(r == 0);
3717 r = admin_socket->register_command("dump_recovery_reservations",
3718 asok_hook,
3719 "show recovery reservations");
3720 ceph_assert(r == 0);
3721 r = admin_socket->register_command("dump_scrub_reservations",
3722 asok_hook,
3723 "show scrub reservations");
3724 ceph_assert(r == 0);
3725 r = admin_socket->register_command("get_latest_osdmap",
3726 asok_hook,
3727 "force osd to update the latest map from "
3728 "the mon");
3729 ceph_assert(r == 0);
3730
3731 r = admin_socket->register_command("set_heap_property " \
3732 "name=property,type=CephString " \
3733 "name=value,type=CephInt",
3734 asok_hook,
3735 "update malloc extension heap property");
3736 ceph_assert(r == 0);
3737
3738 r = admin_socket->register_command("get_heap_property " \
3739 "name=property,type=CephString",
3740 asok_hook,
3741 "get malloc extension heap property");
3742 ceph_assert(r == 0);
3743
3744 r = admin_socket->register_command("dump_objectstore_kv_stats",
3745 asok_hook,
3746 "print statistics of kvdb which used by bluestore");
3747 ceph_assert(r == 0);
3748
3749 r = admin_socket->register_command("dump_scrubs",
3750 asok_hook,
3751 "print scheduled scrubs");
3752 ceph_assert(r == 0);
3753
3754 r = admin_socket->register_command("calc_objectstore_db_histogram",
3755 asok_hook,
3756 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3757 ceph_assert(r == 0);
3758
3759 r = admin_socket->register_command("flush_store_cache",
3760 asok_hook,
3761 "Flush bluestore internal cache");
3762 ceph_assert(r == 0);
3763 r = admin_socket->register_command("dump_pgstate_history",
3764 asok_hook,
3765 "show recent state history");
3766 ceph_assert(r == 0);
3767
3768 r = admin_socket->register_command("compact",
3769 asok_hook,
3770 "Commpact object store's omap."
3771 " WARNING: Compaction probably slows your requests");
3772 ceph_assert(r == 0);
3773
3774 r = admin_socket->register_command("get_mapped_pools",
3775 asok_hook,
3776 "dump pools whose PG(s) are mapped to this OSD.");
3777
3778 ceph_assert(r == 0);
3779
3780 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3781 asok_hook,
3782 "probe OSD devices for SMART data.");
3783
3784 ceph_assert(r == 0);
3785
3786 r = admin_socket->register_command("list_devices",
3787 asok_hook,
3788 "list OSD devices.");
3789 r = admin_socket->register_command("send_beacon",
3790 asok_hook,
3791 "send OSD beacon to mon immediately");
3792
3793 r = admin_socket->register_command(
3794 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3795 "Dump osd heartbeat network ping times");
3796 ceph_assert(r == 0);
3797
3798 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3799 // Note: pools are CephString instead of CephPoolname because
3800 // these commands traditionally support both pool names and numbers
3801 r = admin_socket->register_command(
3802 "setomapval " \
3803 "name=pool,type=CephString " \
3804 "name=objname,type=CephObjectname " \
3805 "name=key,type=CephString "\
3806 "name=val,type=CephString",
3807 test_ops_hook,
3808 "set omap key");
3809 ceph_assert(r == 0);
3810 r = admin_socket->register_command(
3811 "rmomapkey " \
3812 "name=pool,type=CephString " \
3813 "name=objname,type=CephObjectname " \
3814 "name=key,type=CephString",
3815 test_ops_hook,
3816 "remove omap key");
3817 ceph_assert(r == 0);
3818 r = admin_socket->register_command(
3819 "setomapheader " \
3820 "name=pool,type=CephString " \
3821 "name=objname,type=CephObjectname " \
3822 "name=header,type=CephString",
3823 test_ops_hook,
3824 "set omap header");
3825 ceph_assert(r == 0);
3826
3827 r = admin_socket->register_command(
3828 "getomap " \
3829 "name=pool,type=CephString " \
3830 "name=objname,type=CephObjectname",
3831 test_ops_hook,
3832 "output entire object map");
3833 ceph_assert(r == 0);
3834
3835 r = admin_socket->register_command(
3836 "truncobj " \
3837 "name=pool,type=CephString " \
3838 "name=objname,type=CephObjectname " \
3839 "name=len,type=CephInt",
3840 test_ops_hook,
3841 "truncate object to length");
3842 ceph_assert(r == 0);
3843
3844 r = admin_socket->register_command(
3845 "injectdataerr " \
3846 "name=pool,type=CephString " \
3847 "name=objname,type=CephObjectname " \
3848 "name=shardid,type=CephInt,req=false,range=0|255",
3849 test_ops_hook,
3850 "inject data error to an object");
3851 ceph_assert(r == 0);
3852
3853 r = admin_socket->register_command(
3854 "injectmdataerr " \
3855 "name=pool,type=CephString " \
3856 "name=objname,type=CephObjectname " \
3857 "name=shardid,type=CephInt,req=false,range=0|255",
3858 test_ops_hook,
3859 "inject metadata error to an object");
3860 ceph_assert(r == 0);
3861 r = admin_socket->register_command(
3862 "set_recovery_delay " \
3863 "name=utime,type=CephInt,req=false",
3864 test_ops_hook,
3865 "Delay osd recovery by specified seconds");
3866 ceph_assert(r == 0);
3867 r = admin_socket->register_command(
3868 "injectfull " \
3869 "name=type,type=CephString,req=false " \
3870 "name=count,type=CephInt,req=false ",
3871 test_ops_hook,
3872 "Inject a full disk (optional count times)");
3873 ceph_assert(r == 0);
3874 r = admin_socket->register_command(
3875 "bench " \
3876 "name=count,type=CephInt,req=false " \
3877 "name=size,type=CephInt,req=false " \
3878 "name=object_size,type=CephInt,req=false " \
3879 "name=object_num,type=CephInt,req=false ",
3880 asok_hook,
3881 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3882 "(default count=1G default size=4MB). Results in log.");
3883 ceph_assert(r == 0);
3884 r = admin_socket->register_command(
3885 "cluster_log " \
3886 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3887 "name=message,type=CephString,n=N",
3888 asok_hook,
3889 "log a message to the cluster log");
3890 ceph_assert(r == 0);
3891 r = admin_socket->register_command(
3892 "flush_pg_stats",
3893 asok_hook,
3894 "flush pg stats");
3895 ceph_assert(r == 0);
3896 r = admin_socket->register_command(
3897 "heap " \
3898 "name=heapcmd,type=CephChoices,strings=" \
3899 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3900 "name=value,type=CephString,req=false",
3901 asok_hook,
3902 "show heap usage info (available only if compiled with tcmalloc)");
3903 ceph_assert(r == 0);
3904 r = admin_socket->register_command(
3905 "debug dump_missing " \
3906 "name=filename,type=CephFilepath",
3907 asok_hook,
3908 "dump missing objects to a named file");
3909 ceph_assert(r == 0);
3910 r = admin_socket->register_command(
3911 "debug kick_recovery_wq " \
3912 "name=delay,type=CephInt,range=0",
3913 asok_hook,
3914 "set osd_recovery_delay_start to <val>");
3915 ceph_assert(r == 0);
3916 r = admin_socket->register_command(
3917 "cpu_profiler " \
3918 "name=arg,type=CephChoices,strings=status|flush",
3919 asok_hook,
3920 "run cpu profiling on daemon");
3921 ceph_assert(r == 0);
3922 r = admin_socket->register_command(
3923 "dump_pg_recovery_stats",
3924 asok_hook,
3925 "dump pg recovery statistics");
3926 ceph_assert(r == 0);
3927 r = admin_socket->register_command(
3928 "reset_pg_recovery_stats",
3929 asok_hook,
3930 "reset pg recovery statistics");
3931 ceph_assert(r == 0);
3932 r = admin_socket->register_command(
3933 "cache drop",
3934 asok_hook,
3935 "Drop all OSD caches");
3936 ceph_assert(r == 0);
3937 r = admin_socket->register_command(
3938 "cache status",
3939 asok_hook,
3940 "Get OSD caches statistics");
3941 ceph_assert(r == 0);
3942 r = admin_socket->register_command(
3943 "scrub_purged_snaps",
3944 asok_hook,
3945 "Scrub purged_snaps vs snapmapper index");
3946 ceph_assert(r == 0);
3947
3948 // -- pg commands --
3949 // old form: ceph pg <pgid> command ...
3950 r = admin_socket->register_command(
3951 "pg " \
3952 "name=pgid,type=CephPgid " \
3953 "name=cmd,type=CephChoices,strings=query",
3954 asok_hook,
3955 "");
3956 ceph_assert(r == 0);
3957 r = admin_socket->register_command(
3958 "pg " \
3959 "name=pgid,type=CephPgid " \
3960 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3961 "name=mulcmd,type=CephChoices,strings=revert|delete",
3962 asok_hook,
3963 "");
3964 ceph_assert(r == 0);
3965 r = admin_socket->register_command(
3966 "pg " \
3967 "name=pgid,type=CephPgid " \
3968 "name=cmd,type=CephChoices,strings=list_unfound " \
3969 "name=offset,type=CephString,req=false",
3970 asok_hook,
3971 "");
3972 ceph_assert(r == 0);
3973 r = admin_socket->register_command(
3974 "pg " \
3975 "name=pgid,type=CephPgid " \
3976 "name=cmd,type=CephChoices,strings=scrub " \
3977 "name=time,type=CephInt,req=false",
3978 asok_hook,
3979 "");
3980 ceph_assert(r == 0);
3981 r = admin_socket->register_command(
3982 "pg " \
3983 "name=pgid,type=CephPgid " \
3984 "name=cmd,type=CephChoices,strings=deep_scrub " \
3985 "name=time,type=CephInt,req=false",
3986 asok_hook,
3987 "");
3988 ceph_assert(r == 0);
3989 // new form: tell <pgid> <cmd> for both cli and rest
3990 r = admin_socket->register_command(
3991 "query",
3992 asok_hook,
3993 "show details of a specific pg");
3994 ceph_assert(r == 0);
3995 r = admin_socket->register_command(
3996 "mark_unfound_lost " \
3997 "name=pgid,type=CephPgid,req=false " \
3998 "name=mulcmd,type=CephChoices,strings=revert|delete",
3999 asok_hook,
4000 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4001 ceph_assert(r == 0);
4002 r = admin_socket->register_command(
4003 "list_unfound " \
4004 "name=pgid,type=CephPgid,req=false " \
4005 "name=offset,type=CephString,req=false",
4006 asok_hook,
4007 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4008 ceph_assert(r == 0);
4009 r = admin_socket->register_command(
4010 "scrub " \
4011 "name=pgid,type=CephPgid,req=false " \
4012 "name=time,type=CephInt,req=false",
4013 asok_hook,
4014 "Trigger a scheduled scrub ");
4015 ceph_assert(r == 0);
4016 r = admin_socket->register_command(
4017 "deep_scrub " \
4018 "name=pgid,type=CephPgid,req=false " \
4019 "name=time,type=CephInt,req=false",
4020 asok_hook,
4021 "Trigger a scheduled deep scrub ");
4022 ceph_assert(r == 0);
4023}
4024
4025void OSD::create_logger()
4026{
4027 dout(10) << "create_logger" << dendl;
4028
4029 logger = build_osd_logger(cct);
4030 cct->get_perfcounters_collection()->add(logger);
4031}
4032
4033void OSD::create_recoverystate_perf()
4034{
4035 dout(10) << "create_recoverystate_perf" << dendl;
4036
4037 recoverystate_perf = build_recoverystate_perf(cct);
4038 cct->get_perfcounters_collection()->add(recoverystate_perf);
4039}
4040
int OSD::shutdown()
{
  // Fast-shutdown mode: optionally tell the mon we are stopping, flush
  // the log, and exit the process immediately with no orderly teardown.
  if (cct->_conf->osd_fast_shutdown) {
    derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
    if (cct->_conf->osd_fast_shutdown_notify_mon)
      service.prepare_to_stop();
    cct->_log->flush();
    _exit(0);
  }

  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.lock();
  if (is_stopping()) {
    osd_lock.unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging: crank all relevant debug levels to maximum for the
  // remainder of shutdown when osd_debug_shutdown is set.
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs. this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  osd_lock.unlock();

  // Stop the heartbeat machinery: set the stop flag and wake the thread
  // under heartbeat_lock, then join it and tear down the hb connections.
  {
    std::lock_guard l{heartbeat_lock};
    heartbeat_stop = true;
    heartbeat_cond.notify_all();
    heartbeat_peers.clear();
  }
  heartbeat_thread.join();

  hb_back_server_messenger->mark_down_all();
  hb_front_server_messenger->mark_down_all();
  hb_front_client_messenger->mark_down_all();
  hb_back_client_messenger->mark_down_all();

  // Stop the sharded op threadpool after draining any remaining work.
  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = get_osdmap_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  // NOTE: this rc is what shutdown() ultimately returns (see `return r`
  // at the bottom); on success it is 0.
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  // Detach every PG from its shard slot (clear_too=true) and drop its
  // collection handle; loop until no live PGs remain.
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
	continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
	// someone still holds a reference to this pg; complain (and
	// optionally abort, if osd_shutdown_pgref_assert is set)
	derr << "pgid " << pg->get_pgid() << " has ref count of "
	     << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
	pg->dump_live_ids();
#endif
	if (cct->_conf->osd_shutdown_pgref_assert) {
	  ceph_abort();
	}
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // NOTE(review): osd_lock is dropped around remove_observer --
  // presumably to avoid a lock-order inversion with the config lock;
  // verify before changing.
  osd_lock.unlock();
  cct->_conf.remove_observer(this);
  osd_lock.lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.unlock();
  // Drop our osdmap references: the global one (under map_lock) and each
  // shard's cached copy (under its osdmap_lock).
  {
    std::unique_lock l{map_lock};
    set_osdmap(OSDMapRef());
  }
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  // Re-take osd_lock for the remainder of shutdown (released on return).
  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  ClassHandler::get_instance().shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  // rc of the superblock write above (0 on success)
  return r;
}
4236
4237int OSD::mon_cmd_maybe_osd_create(string &cmd)
4238{
4239 bool created = false;
4240 while (true) {
4241 dout(10) << __func__ << " cmd: " << cmd << dendl;
4242 vector<string> vcmd{cmd};
4243 bufferlist inbl;
4244 C_SaferCond w;
4245 string outs;
4246 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4247 int r = w.wait();
4248 if (r < 0) {
4249 if (r == -ENOENT && !created) {
4250 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4251 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4252 vector<string> vnewcmd{newcmd};
4253 bufferlist inbl;
4254 C_SaferCond w;
4255 string outs;
4256 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4257 int r = w.wait();
4258 if (r < 0) {
4259 derr << __func__ << " fail: osd does not exist and created failed: "
4260 << cpp_strerror(r) << dendl;
4261 return r;
4262 }
4263 created = true;
4264 continue;
4265 }
4266 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4267 return r;
4268 }
4269 break;
4270 }
4271
4272 return 0;
4273}
4274
4275int OSD::update_crush_location()
4276{
4277 if (!cct->_conf->osd_crush_update_on_start) {
4278 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4279 return 0;
4280 }
4281
4282 char weight[32];
4283 if (cct->_conf->osd_crush_initial_weight >= 0) {
4284 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4285 } else {
4286 struct store_statfs_t st;
4287 osd_alert_list_t alerts;
4288 int r = store->statfs(&st, &alerts);
4289 if (r < 0) {
4290 derr << "statfs: " << cpp_strerror(r) << dendl;
4291 return r;
4292 }
4293 snprintf(weight, sizeof(weight), "%.4lf",
4294 std::max(.00001,
4295 double(st.total) /
4296 double(1ull << 40 /* TB */)));
4297 }
4298
4299 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4300
4301 string cmd =
4302 string("{\"prefix\": \"osd crush create-or-move\", ") +
4303 string("\"id\": ") + stringify(whoami) + ", " +
4304 string("\"weight\":") + weight + ", " +
4305 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4306 return mon_cmd_maybe_osd_create(cmd);
4307}
4308
4309int OSD::update_crush_device_class()
4310{
4311 if (!cct->_conf->osd_class_update_on_start) {
4312 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4313 return 0;
4314 }
4315
4316 string device_class;
4317 int r = store->read_meta("crush_device_class", &device_class);
4318 if (r < 0 || device_class.empty()) {
4319 device_class = store->get_default_device_class();
4320 }
4321
4322 if (device_class.empty()) {
4323 dout(20) << __func__ << " no device class stored locally" << dendl;
4324 return 0;
4325 }
4326
4327 string cmd =
4328 string("{\"prefix\": \"osd crush set-device-class\", ") +
4329 string("\"class\": \"") + device_class + string("\", ") +
4330 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4331
4332 r = mon_cmd_maybe_osd_create(cmd);
4333 if (r == -EBUSY) {
4334 // good, already bound to a device-class
4335 return 0;
4336 } else {
4337 return r;
4338 }
4339}
4340
4341void OSD::write_superblock(ObjectStore::Transaction& t)
4342{
4343 dout(10) << "write_superblock " << superblock << dendl;
4344
4345 //hack: at minimum it's using the baseline feature set
4346 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4347 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4348
4349 bufferlist bl;
4350 encode(superblock, bl);
4351 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4352}
4353
4354int OSD::read_superblock()
4355{
4356 bufferlist bl;
4357 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4358 if (r < 0)
4359 return r;
4360
4361 auto p = bl.cbegin();
4362 decode(superblock, p);
4363
4364 dout(10) << "read_superblock " << superblock << dendl;
4365
4366 return 0;
4367}
4368
4369void OSD::clear_temp_objects()
4370{
4371 dout(10) << __func__ << dendl;
4372 vector<coll_t> ls;
4373 store->list_collections(ls);
4374 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4375 spg_t pgid;
4376 if (!p->is_pg(&pgid))
4377 continue;
4378
4379 // list temp objects
4380 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4381
4382 vector<ghobject_t> temps;
4383 ghobject_t next;
4384 while (1) {
4385 vector<ghobject_t> objects;
4386 auto ch = store->open_collection(*p);
4387 ceph_assert(ch);
4388 store->collection_list(ch, next, ghobject_t::get_max(),
4389 store->get_ideal_list_max(),
4390 &objects, &next);
4391 if (objects.empty())
4392 break;
4393 vector<ghobject_t>::iterator q;
4394 for (q = objects.begin(); q != objects.end(); ++q) {
4395 // Hammer set pool for temps to -1, so check for clean-up
4396 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4397 temps.push_back(*q);
4398 } else {
4399 break;
4400 }
4401 }
4402 // If we saw a non-temp object and hit the break above we can
4403 // break out of the while loop too.
4404 if (q != objects.end())
4405 break;
4406 }
4407 if (!temps.empty()) {
4408 ObjectStore::Transaction t;
4409 int removed = 0;
4410 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4411 dout(20) << " removing " << *p << " object " << *q << dendl;
4412 t.remove(*p, *q);
4413 if (++removed > cct->_conf->osd_target_transaction_size) {
4414 store->queue_transaction(service.meta_ch, std::move(t));
4415 t = ObjectStore::Transaction();
4416 removed = 0;
4417 }
4418 }
4419 if (removed) {
4420 store->queue_transaction(service.meta_ch, std::move(t));
4421 }
4422 }
4423 }
4424}
4425
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  // Remove every object in collection `tmp` (including each object's
  // snap mapping), then the collection itself, in transactions bounded
  // by osd_target_transaction_size.  Blocks until the final removal
  // commits.
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      // drop the object's snap mapping; ENOENT just means it had none
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    // queue this batch and start a fresh transaction for the next one
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // wait for the removal to commit before returning
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4470
4471
4472// ======================================================
4473// PG's
4474
4475PG* OSD::_make_pg(
4476 OSDMapRef createmap,
4477 spg_t pgid)
4478{
4479 dout(10) << __func__ << " " << pgid << dendl;
4480 pg_pool_t pi;
4481 map<string,string> ec_profile;
4482 string name;
4483 if (createmap->have_pg_pool(pgid.pool())) {
4484 pi = *createmap->get_pg_pool(pgid.pool());
4485 name = createmap->get_pool_name(pgid.pool());
4486 if (pi.is_erasure()) {
4487 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4488 }
4489 } else {
4490 // pool was deleted; grab final pg_pool_t off disk.
4491 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4492 bufferlist bl;
4493 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4494 if (r < 0) {
4495 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4496 << dendl;
4497 return nullptr;
4498 }
4499 ceph_assert(r >= 0);
4500 auto p = bl.cbegin();
4501 decode(pi, p);
4502 decode(name, p);
4503 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4504 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4505 << " tombstone" << dendl;
4506 return nullptr;
4507 }
4508 decode(ec_profile, p);
4509 }
4510 PGPool pool(cct, createmap, pgid.pool(), pi, name);
4511 PG *pg;
4512 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4513 pi.type == pg_pool_t::TYPE_ERASURE)
4514 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4515 else
4516 ceph_abort();
4517 return pg;
4518}
4519
4520void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4521{
4522 v->clear();
4523 v->reserve(get_num_pgs());
4524 for (auto& s : shards) {
4525 std::lock_guard l(s->shard_lock);
4526 for (auto& j : s->pg_slots) {
4527 if (j.second->pg &&
4528 !j.second->pg->is_deleted()) {
4529 v->push_back(j.second->pg);
4530 if (clear_too) {
4531 s->_detach_pg(j.second.get());
4532 }
4533 }
4534 }
4535 }
4536}
4537
4538void OSD::_get_pgids(vector<spg_t> *v)
4539{
4540 v->clear();
4541 v->reserve(get_num_pgs());
4542 for (auto& s : shards) {
4543 std::lock_guard l(s->shard_lock);
4544 for (auto& j : s->pg_slots) {
4545 if (j.second->pg &&
4546 !j.second->pg->is_deleted()) {
4547 v->push_back(j.first);
4548 }
4549 }
4550 }
4551}
4552
4553void OSD::register_pg(PGRef pg)
4554{
4555 spg_t pgid = pg->get_pgid();
4556 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4557 auto sdata = shards[shard_index];
4558 std::lock_guard l(sdata->shard_lock);
4559 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4560 ceph_assert(r.second);
4561 auto *slot = r.first->second.get();
4562 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4563 sdata->_attach_pg(slot, pg.get());
4564}
4565
bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  // Detach a fully-deleted PG from its shard slot and adjust the perf
  // counters.  Returns false (deletion must not finish yet) if the PG is
  // no longer registered or the slot is waiting on a merge epoch.
  auto sdata = pg->osd_shard;
  ceph_assert(sdata);
  {
    std::lock_guard l(sdata->shard_lock);
    auto p = sdata->pg_slots.find(pg->pg_id);
    if (p == sdata->pg_slots.end() ||
	!p->second->pg) {
      dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
      return false;
    }
    if (p->second->waiting_for_merge_epoch) {
      dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
      return false;
    }
    dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
    sdata->_detach_pg(p->second.get());
  }

  // drop any pre-primed split children of this pg on every shard
  for (auto shard : shards) {
    shard->unprime_split_children(pg->pg_id, old_pg_num);
  }

  // update pg count now since we might not get an osdmap any time soon.
  if (pg->is_primary())
    service.logger->dec(l_osd_pg_primary);
  else if (pg->is_nonprimary())
    service.logger->dec(l_osd_pg_replica); // misnomer
  else
    service.logger->dec(l_osd_pg_stray);

  return true;
}
4600
4601PGRef OSD::_lookup_pg(spg_t pgid)
4602{
4603 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4604 auto sdata = shards[shard_index];
4605 std::lock_guard l(sdata->shard_lock);
4606 auto p = sdata->pg_slots.find(pgid);
4607 if (p == sdata->pg_slots.end()) {
4608 return nullptr;
4609 }
4610 return p->second->pg;
4611}
4612
4613PGRef OSD::_lookup_lock_pg(spg_t pgid)
4614{
4615 PGRef pg = _lookup_pg(pgid);
4616 if (!pg) {
4617 return nullptr;
4618 }
4619 pg->lock();
4620 if (!pg->is_deleted()) {
4621 return pg;
4622 }
4623 pg->unlock();
4624 return nullptr;
4625}
4626
// Public wrapper around _lookup_lock_pg(): returns the PG locked, or
// nullptr if it does not exist (or is deleted).  Caller must unlock a
// non-null result.
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
4631
void OSD::load_pgs()
{
  // Startup scan: enumerate the store's collections and instantiate a PG
  // object for every surviving pg collection, removing temp and
  // flagged-for-removal collections along the way.  Requires osd_lock.
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(0) << "load_pgs" << dendl;

  // load the recorded pg_num history (used to reason about past
  // splits/merges); absence is tolerated
  {
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  // number of PGs successfully opened
  int num = 0;
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // temp collections and pgs flagged for removal are deleted outright
    if (it->is_temp(&pgid) ||
	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it
	       << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
	   << dendl;
      continue;
    }

    // construct the PG against the osdmap it was last written with (or
    // the current map when no epoch was recorded)
    PGRef pg;
    if (map_epoch > 0) {
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
	if (!get_osdmap()->have_pg_pool(pgid.pool())) {
	  derr << __func__ << ": could not find map for epoch " << map_epoch
	       << " on pg " << pgid << ", but the pool is not present in the "
	       << "current map, so this is probably a result of bug 10617.  "
	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
	       << "to clean it up later." << dendl;
	  continue;
	} else {
	  derr << __func__ << ": have pgid " << pgid << " at epoch "
	       << map_epoch << ", but missing map.  Crashing."
	       << dendl;
	  ceph_abort_msg("Missing map in load_pgs");
	}
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(get_osdmap(), pgid);
    }
    if (!pg) {
      // pool info could not be reconstructed; remove the collection
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store);

    if (pg->dne()) {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }
    // route this pg's commit callbacks to its shard's context queue
    {
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    pg->reg_next_scrub();

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
4739
4740
PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
				 const PGCreateInfo *info)
{
  // Instantiate a brand-new PG described by `info`: create its on-disk
  // collection, initialize and activate it, and return it locked (the
  // caller unlocks).  Returns nullptr when creation is withheld (pg cap
  // reached) or the create message is stale.
  spg_t pgid = info->pgid;

  if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
    dout(10) << __func__ << " hit max pg, dropping" << dendl;
    return nullptr;
  }

  PeeringCtx rctx = create_context();

  // the map as of the epoch the create was issued in
  OSDMapRef startmap = get_map(info->epoch);

  if (info->by_mon) {
    int64_t pool_id = pgid.pgid.pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    if (!pool) {
      dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
      return nullptr;
    }
    if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
	!pool->has_flag(pg_pool_t::FLAG_CREATING)) {
      // this ensures we do not process old creating messages after the
      // pool's initial pgs have been created (and pg are subsequently
      // allowed to split or merge).
      dout(20) << __func__ << " dropping " << pgid
	       << "create, pool does not have CREATING flag set" << dendl;
      return nullptr;
    }
  }

  int up_primary, acting_primary;
  vector<int> up, acting;
  startmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  // NOTE(review): pp is not null-checked; presumably the pool is
  // guaranteed to exist in the map at info->epoch -- verify callers
  // before relying on this.
  const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
  if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
      store->get_type() != "bluestore") {
    clog->warn() << "pg " << pgid
		 << " is at risk of silent data corruption: "
		 << "the pool allows ec overwrites but is not stored in "
		 << "bluestore, so deep scrubbing will not detect bitrot";
  }
  create_pg_collection(
    rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
  init_pg_ondisk(rctx.transaction, pgid, pp);

  int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

  PGRef pg = _make_pg(startmap, pgid);
  pg->ch = store->create_new_collection(pg->coll);

  // route this pg's commit callbacks to its shard's context queue
  {
    uint32_t shard_index = pgid.hash_to_shard(shards.size());
    assert(NULL != shards[shard_index]);
    store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
  }

  pg->lock(true);

  // we are holding the shard lock
  ceph_assert(!pg->is_deleted());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    info->history,
    info->past_intervals,
    false,
    rctx.transaction);

  pg->init_collection_pool_opts();

  if (pg->is_primary()) {
    // apply any active dynamic perf-stats queries to the new primary pg
    std::lock_guard locker{m_perf_queries_lock};
    pg->set_dynamic_perf_stats_queries(m_perf_queries);
  }

  pg->handle_initialize(rctx);
  pg->handle_activate_map(rctx);

  dispatch_context(rctx, pg.get(), osdmap, nullptr);

  dout(10) << __func__ << " new pg " << *pg << dendl;
  return pg;
}
4832
4833bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4834 spg_t pgid,
4835 bool is_mon_create)
4836{
4837 const auto max_pgs_per_osd =
4838 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4839 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4840
4841 if (num_pgs < max_pgs_per_osd) {
4842 return false;
4843 }
4844
4845 std::lock_guard l(pending_creates_lock);
4846 if (is_mon_create) {
4847 pending_creates_from_mon++;
4848 } else {
4849 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4850 pending_creates_from_osd.emplace(pgid, is_primary);
4851 }
4852 dout(1) << __func__ << " withhold creation of pg " << pgid
4853 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4854 return true;
4855}
4856
// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
// to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  std::vector<int32_t> twiddled;
  if (acting.size() > 1) {
    // shrink the set to just the primary
    twiddled.push_back(acting[0]);
  } else {
    // pad the (zero- or one-element) set with a hole
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
4869
void OSD::resume_creating_pg()
{
  // Called when the PG count may have dropped below the hard cap:
  // release as many withheld PG creations as we now have headroom for,
  // then renew mon subscriptions so the creates/maps actually arrive.
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
	       << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      // mon-initiated creates consume the headroom first
      if (pending_creates_from_mon >= spare_pgs) {
	spare_pgs = pending_creates_from_mon = 0;
      } else {
	spare_pgs -= pending_creates_from_mon;
	pending_creates_from_mon = 0;
      }
    }
    // for osd-initiated creates, nudge peering by proposing a twiddled
    // pg_temp so the create is retried
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
			    !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
	      << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = get_osdmap_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
	      << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
	      << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  service.send_pg_temp();
}
4941
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  // Reconstruct the pg's history (`h`) and past intervals (`pi`) by
  // walking every osdmap epoch from its creation up to the current one.
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  *h = pg_history_t(created, created_stamp);

  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
	return have.size() >= pi->min_size;
      }
      explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    // did the mapping change enough between lastmap and osdmap to start
    // a new interval?  (also appends interval details to *pi)
    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      pgid.pgid,
      min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      // a new interval started at epoch e: advance the history markers
      // and carry the new mapping forward
      h->same_interval_since = e;
      if (up != new_up) {
	h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
	h->same_primary_since = e;
      }
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
			     osdmap->get_pg_num(pgid.pgid.pool()),
			     nullptr)) {
	h->last_epoch_split = e;
      }
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
	   << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
		       pi->get_bounds()) << ")"
	   << dendl;
}
5016
5017void OSD::_add_heartbeat_peer(int p)
5018{
5019 if (p == whoami)
5020 return;
5021 HeartbeatInfo *hi;
5022
5023 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5024 if (i == heartbeat_peers.end()) {
5025 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
5026 if (!cons.first)
5027 return;
5028 assert(cons.second);
5029
5030 hi = &heartbeat_peers[p];
5031 hi->peer = p;
5032
5033 auto stamps = service.get_hb_stamps(p);
5034
5035 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5036 sb->peer = p;
5037 sb->stamps = stamps;
5038 hi->hb_interval_start = ceph_clock_now();
5039 hi->con_back = cons.first.get();
5040 hi->con_back->set_priv(sb);
5041
5042 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5043 sf->peer = p;
5044 sf->stamps = stamps;
5045 hi->con_front = cons.second.get();
5046 hi->con_front->set_priv(sf);
5047
5048 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5049 << " " << hi->con_back->get_peer_addr()
5050 << " " << hi->con_front->get_peer_addr()
5051 << dendl;
5052 } else {
5053 hi = &i->second;
5054 }
5055 hi->epoch = get_osdmap_epoch();
5056}
5057
5058void OSD::_remove_heartbeat_peer(int n)
5059{
5060 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5061 ceph_assert(q != heartbeat_peers.end());
5062 dout(20) << " removing heartbeat peer osd." << n
5063 << " " << q->second.con_back->get_peer_addr()
5064 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5065 << dendl;
5066 q->second.clear_mark_down();
5067 heartbeat_peers.erase(q);
5068}
5069
5070void OSD::need_heartbeat_peer_update()
5071{
5072 if (is_stopping())
5073 return;
5074 dout(20) << "need_heartbeat_peer_update" << dendl;
5075 heartbeat_set_peers_need_update();
5076}
5077
// Rebuild the set of OSDs we exchange heartbeats with, if flagged (or
// periodically forced).  Peers come from three sources: OSDs we share
// PGs with, our immediate neighbors in the up set, and random extras
// spread across failure-domain subtrees so the monitor can collect
// enough independent failure reports.  Caller must hold osd_lock;
// heartbeat_lock is taken below (osd_lock -> heartbeat_lock order).
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      // first pass: just arm the timer and request an update
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
        dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
        heartbeat_set_peers_need_update();
        last_heartbeat_resample = now;
        // automatically clean up any stale heartbeat peers
        // if we are unhealthy, then clean all
        reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
        if (get_osdmap()->is_up(peer)) {
          _add_heartbeat_peer(peer);
        }
      });
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = get_osdmap()->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = get_osdmap()->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
  get_osdmap()->get_random_up_osds_by_subtree(
    whoami, subtree, limit, want, &want);

  // 'want' peers are mandatory; they also count as 'extras' so the
  // trim pass below never removes them before other candidates.
  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!get_osdmap()->is_up(p->first)) {
      // advance before erasing: _remove_heartbeat_peer invalidates p
      int o = p->first;
      ++p;
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < get_osdmap_epoch()) {
      // peer not re-added this epoch; eligible for trimming below
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  // pad up to osd_heartbeat_min_peers by walking the up set from 'next'
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = get_osdmap()->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?
  // drop non-mandatory extras until we are back down to the minimum
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;

  // clean up stale failure pending
  // withdraw in-flight failure reports for OSDs we no longer track
  for (auto it = failure_pending.begin(); it != failure_pending.end();) {
    if (heartbeat_peers.count(it->first) == 0) {
      send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
      failure_pending.erase(it++);
    } else {
      it++;
    }
  }
}
5195
5196void OSD::reset_heartbeat_peers(bool all)
5197{
5198 ceph_assert(ceph_mutex_is_locked(osd_lock));
5199 dout(10) << "reset_heartbeat_peers" << dendl;
5200 utime_t stale = ceph_clock_now();
5201 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5202 std::lock_guard l(heartbeat_lock);
5203 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5204 auto& [peer, hi] = *it;
5205 if (all || hi.is_stale(stale)) {
5206 hi.clear_mark_down();
5207 // stop sending failure_report to mon too
5208 failure_queue.erase(peer);
5209 failure_pending.erase(peer);
5210 it = heartbeat_peers.erase(it);
5211 } else {
5212 ++it;
5213 }
5214 }
5215}
5216
// Handle an incoming heartbeat message (PING, PING_REPLY, or
// YOU_DIED) from another OSD.  All work happens under heartbeat_lock
// (taken/released manually because of the multiple early-out paths);
// the message reference is consumed (m->put()) on every path.
void OSD::handle_osd_ping(MOSDPing *m)
{
  // ignore pings from a different cluster
  if (superblock.cluster_fsid != m->fsid) {
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
             << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
             << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.lock();
  if (is_stopping()) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  ConnectionRef con(m->get_connection());
  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  // lazily bind the per-peer clock-delta stamps to this session
  auto sref = con->get_priv();
  Session *s = static_cast<Session*>(sref.get());
  if (!s) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }
  if (!s->stamps) {
    s->peer = from;
    s->stamps = service.get_hb_stamps(from);
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug hook: probabilistically drop incoming pings for a while
      // to exercise failure detection
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
        auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
        if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
          if (heartbeat_drop->second == 0) {
            debug_heartbeat_drops_remaining.erase(heartbeat_drop);
          } else {
            --heartbeat_drop->second;
            dout(5) << "Dropping heartbeat from " << from
                    << ", " << heartbeat_drop->second
                    << " remaining to drop" << dendl;
            break;
          }
        } else if (cct->_conf->osd_debug_drop_ping_probability >
                   ((((double)(rand()%100))/100.0))) {
          heartbeat_drop =
            debug_heartbeat_drops_remaining.insert(std::make_pair(from,
                           cct->_conf->osd_debug_drop_ping_duration)).first;
          dout(5) << "Dropping heartbeat from " << from
                  << ", " << heartbeat_drop->second
                  << " remaining to drop" << dendl;
          break;
        }
      }

      // update mono-clock delta estimate for this peer
      ceph::signedspan sender_delta_ub{};
      s->stamps->got_ping(
        m->up_from,
        mnow,
        m->mono_send_stamp,
        m->delta_ub,
        &sender_delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;

      // don't reply while our own worker threads are wedged; the lack
      // of a reply is what lets peers report us as failed
      if (!cct->get_heartbeat_map()->is_healthy()) {
        dout(10) << "internal heartbeat not healthy, dropping ping request"
                 << dendl;
        break;
      }

      Message *r = new MOSDPing(monc->get_fsid(),
                                curmap->get_epoch(),
                                MOSDPing::PING_REPLY,
                                m->ping_stamp,
                                m->mono_ping_stamp,
                                mnow,
                                service.get_up_epoch(),
                                cct->_conf->osd_heartbeat_min_size,
                                sender_delta_ub);
      con->send_message(r);

      if (curmap->is_up(from)) {
        // opportunistically share a newer osdmap over the cluster conn
        if (is_active()) {
          ConnectionRef cluster_con = service.get_con_osd_cluster(
            from, curmap->get_epoch());
          if (cluster_con) {
            service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
          }
        }
      } else if (!curmap->exists(from) ||
                 curmap->get_down_at(from) > m->map_epoch) {
        // tell them they have died
        Message *r = new MOSDPing(monc->get_fsid(),
                                  curmap->get_epoch(),
                                  MOSDPing::YOU_DIED,
                                  m->ping_stamp,
                                  m->mono_ping_stamp,
                                  mnow,
                                  service.get_up_epoch(),
                                  cct->_conf->osd_heartbeat_min_size);
        con->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
        // look up the outstanding ping this reply acknowledges
        auto acked = i->second.ping_history.find(m->ping_stamp);
        if (acked != i->second.ping_history.end()) {
          // number of replies (back + front) still expected for it
          int &unacknowledged = acked->second.second;
          if (con == i->second.con_back) {
            dout(25) << "handle_osd_ping got reply from osd." << from
                     << " first_tx " << i->second.first_tx
                     << " last_tx " << i->second.last_tx
                     << " last_rx_back " << i->second.last_rx_back
                     << " -> " << now
                     << " last_rx_front " << i->second.last_rx_front
                     << dendl;
            i->second.last_rx_back = now;
            ceph_assert(unacknowledged > 0);
            --unacknowledged;
            // if there is no front con, set both stamps.
            if (i->second.con_front == NULL) {
              i->second.last_rx_front = now;
              ceph_assert(unacknowledged > 0);
              --unacknowledged;
            }
          } else if (con == i->second.con_front) {
            dout(25) << "handle_osd_ping got reply from osd." << from
                     << " first_tx " << i->second.first_tx
                     << " last_tx " << i->second.last_tx
                     << " last_rx_back " << i->second.last_rx_back
                     << " last_rx_front " << i->second.last_rx_front
                     << " -> " << now
                     << dendl;
            i->second.last_rx_front = now;
            ceph_assert(unacknowledged > 0);
            --unacknowledged;
          }

          if (unacknowledged == 0) {
            // succeeded in getting all replies
            dout(25) << "handle_osd_ping got all replies from osd." << from
                     << " , erase pending ping(sent at " << m->ping_stamp << ")"
                     << " and older pending ping(s)"
                     << dendl;

            // accumulate round-trip time stats (in usec) for this peer
#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
            ++i->second.hb_average_count;
            uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
            i->second.hb_total_back += back_pingtime;
            if (back_pingtime < i->second.hb_min_back)
              i->second.hb_min_back = back_pingtime;
            if (back_pingtime > i->second.hb_max_back)
              i->second.hb_max_back = back_pingtime;
            uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
            i->second.hb_total_front += front_pingtime;
            if (front_pingtime < i->second.hb_min_front)
              i->second.hb_min_front = front_pingtime;
            if (front_pingtime > i->second.hb_max_front)
              i->second.hb_max_front = front_pingtime;

            ceph_assert(i->second.hb_interval_start != utime_t());
            // NOTE(review): the assert above guarantees this condition
            // is false, so the next two lines are dead code; kept as-is.
            if (i->second.hb_interval_start == utime_t())
              i->second.hb_interval_start = now;
            int64_t hb_avg_time_period = 60;
            if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
              hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
            }
            // roll the per-interval stats into the published ring
            // buffers once per averaging period
            if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
              uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
              uint32_t back_min = i->second.hb_min_back;
              uint32_t back_max = i->second.hb_max_back;
              uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
              uint32_t front_min = i->second.hb_min_front;
              uint32_t front_max = i->second.hb_max_front;

              // Reset for new interval
              i->second.hb_average_count = 0;
              i->second.hb_interval_start = now;
              i->second.hb_total_back = i->second.hb_max_back = 0;
              i->second.hb_min_back = UINT_MAX;
              i->second.hb_total_front = i->second.hb_max_front = 0;
              i->second.hb_min_front = UINT_MAX;

              // Record per osd interface ping times
              // Based on osd_heartbeat_interval ignoring that it is randomly shorter than this interval
              if (i->second.hb_back_pingtime.size() == 0) {
                // first interval: seed the whole ring with this sample
                ceph_assert(i->second.hb_front_pingtime.size() == 0);
                for (unsigned k = 0 ; k < hb_vector_size; ++k) {
                  i->second.hb_back_pingtime.push_back(back_avg);
                  i->second.hb_back_min.push_back(back_min);
                  i->second.hb_back_max.push_back(back_max);
                  i->second.hb_front_pingtime.push_back(front_avg);
                  i->second.hb_front_min.push_back(front_min);
                  i->second.hb_front_max.push_back(front_max);
                  ++i->second.hb_index;
                }
              } else {
                // hb_vector_size is a power of two; mask == modulo
                int index = i->second.hb_index & (hb_vector_size - 1);
                i->second.hb_back_pingtime[index] = back_avg;
                i->second.hb_back_min[index] = back_min;
                i->second.hb_back_max[index] = back_max;
                i->second.hb_front_pingtime[index] = front_avg;
                i->second.hb_front_min[index] = front_min;
                i->second.hb_front_max[index] = front_max;
                ++i->second.hb_index;
              }

              {
                // publish 1/5/15-interval aggregates into osd_stat
                std::lock_guard l(service.stat_lock);
                service.osd_stat.hb_pingtime[from].last_update = now.sec();
                service.osd_stat.hb_pingtime[from].back_last = back_pingtime;

                uint32_t total = 0;
                uint32_t min = UINT_MAX;
                uint32_t max = 0;
                uint32_t count = 0;
                uint32_t which = 0;
                uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
                for (int32_t k = size - 1 ; k >= 0; --k) {
                  ++count;
                  int index = (i->second.hb_index + k) % size;
                  total += i->second.hb_back_pingtime[index];
                  if (i->second.hb_back_min[index] < min)
                    min = i->second.hb_back_min[index];
                  if (i->second.hb_back_max[index] > max)
                    max = i->second.hb_back_max[index];
                  if (count == 1 || count == 5 || count == 15) {
                    service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
                    service.osd_stat.hb_pingtime[from].back_min[which] = min;
                    service.osd_stat.hb_pingtime[from].back_max[which] = max;
                    which++;
                    if (count == 15)
                      break;
                  }
                }

                if (i->second.con_front != NULL) {
                  service.osd_stat.hb_pingtime[from].front_last = front_pingtime;

                  total = 0;
                  min = UINT_MAX;
                  max = 0;
                  count = 0;
                  which = 0;
                  for (int32_t k = size - 1 ; k >= 0; --k) {
                    ++count;
                    int index = (i->second.hb_index + k) % size;
                    total += i->second.hb_front_pingtime[index];
                    if (i->second.hb_front_min[index] < min)
                      min = i->second.hb_front_min[index];
                    if (i->second.hb_front_max[index] > max)
                      max = i->second.hb_front_max[index];
                    if (count == 1 || count == 5 || count == 15) {
                      service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
                      service.osd_stat.hb_pingtime[from].front_min[which] = min;
                      service.osd_stat.hb_pingtime[from].front_max[which] = max;
                      which++;
                      if (count == 15)
                        break;
                    }
                  }
                }
              }
            } else {
              // interval not yet elapsed: only refresh the last values
              std::lock_guard l(service.stat_lock);
              service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
              if (i->second.con_front != NULL)
                service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
            }
            // this ping and all older ones are fully acknowledged
            i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
          }

          if (i->second.is_healthy(now)) {
            // Cancel false reports
            auto failure_queue_entry = failure_queue.find(from);
            if (failure_queue_entry != failure_queue.end()) {
              dout(10) << "handle_osd_ping canceling queued "
                       << "failure report for osd." << from << dendl;
              failure_queue.erase(failure_queue_entry);
            }

            auto failure_pending_entry = failure_pending.find(from);
            if (failure_pending_entry != failure_pending.end()) {
              dout(10) << "handle_osd_ping canceling in-flight "
                       << "failure report for osd." << from << dendl;
              send_still_alive(curmap->get_epoch(),
                               from,
                               failure_pending_entry->second.second);
              failure_pending.erase(failure_pending_entry);
            }
          }
        } else {
          // old replies, deprecated by newly sent pings.
          dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
                   << ") is found, treat as covered by newly sent pings "
                   << "and ignore"
                   << dendl;
        }
      }

      if (m->map_epoch &&
          curmap->is_up(from)) {
        // opportunistically share a newer osdmap over the cluster conn
        if (is_active()) {
          ConnectionRef cluster_con = service.get_con_osd_cluster(
            from, curmap->get_epoch());
          if (cluster_con) {
            service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
          }
        }
      }

      s->stamps->got_ping_reply(
        mnow,
        m->mono_send_stamp,
        m->delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
    }
    break;

  case MOSDPing::YOU_DIED:
    // a peer believes we are down; fetch a newer map to find out
    dout(10) << "handle_osd_ping " << m->get_source_inst()
             << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.unlock();
  m->put();
}
5564
5565void OSD::heartbeat_entry()
5566{
5567 std::unique_lock l(heartbeat_lock);
5568 if (is_stopping())
5569 return;
5570 while (!heartbeat_stop) {
5571 heartbeat();
5572
5573 double wait;
5574 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5575 wait = (float)cct->_conf->osd_heartbeat_interval;
5576 } else {
5577 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5578 }
5579 auto w = ceph::make_timespan(wait);
5580 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5581 heartbeat_cond.wait_for(l, w);
5582 if (is_stopping())
5583 return;
5584 dout(30) << "heartbeat_entry woke up" << dendl;
5585 }
5586}
5587
5588void OSD::heartbeat_check()
5589{
5590 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5591 utime_t now = ceph_clock_now();
5592
5593 // check for incoming heartbeats (move me elsewhere?)
5594 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5595 p != heartbeat_peers.end();
5596 ++p) {
5597
5598 if (p->second.first_tx == utime_t()) {
5599 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5600 << " yet, skipping" << dendl;
5601 continue;
5602 }
5603
5604 dout(25) << "heartbeat_check osd." << p->first
5605 << " first_tx " << p->second.first_tx
5606 << " last_tx " << p->second.last_tx
5607 << " last_rx_back " << p->second.last_rx_back
5608 << " last_rx_front " << p->second.last_rx_front
5609 << dendl;
5610 if (p->second.is_unhealthy(now)) {
5611 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5612 if (p->second.last_rx_back == utime_t() ||
5613 p->second.last_rx_front == utime_t()) {
5614 derr << "heartbeat_check: no reply from "
5615 << p->second.con_front->get_peer_addr().get_sockaddr()
5616 << " osd." << p->first
5617 << " ever on either front or back, first ping sent "
5618 << p->second.first_tx
5619 << " (oldest deadline " << oldest_deadline << ")"
5620 << dendl;
5621 // fail
5622 failure_queue[p->first] = p->second.first_tx;
5623 } else {
5624 derr << "heartbeat_check: no reply from "
5625 << p->second.con_front->get_peer_addr().get_sockaddr()
5626 << " osd." << p->first << " since back " << p->second.last_rx_back
5627 << " front " << p->second.last_rx_front
5628 << " (oldest deadline " << oldest_deadline << ")"
5629 << dendl;
5630 // fail
5631 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5632 }
5633 }
5634 }
5635}
5636
// Send one round of pings to every heartbeat peer (back connection,
// plus front connection when present), and refresh the load average,
// osd_stat, and fullness state that ride along with heartbeats.
// Caller must hold heartbeat_lock.
void OSD::heartbeat()
{
  ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
  dout(30) << "heartbeat" << dendl;

  // get CPU load avg
  double loadavgs[1];
  int hb_interval = cct->_conf->osd_heartbeat_interval;
  // daily_loadavg averages roughly one day's worth of samples, one
  // sample per heartbeat interval
  int n_samples = 86400;
  if (hb_interval > 1) {
    n_samples /= hb_interval;
    if (n_samples < 1)
      n_samples = 1;
  }

  if (getloadavg(loadavgs, 1) == 1) {
    logger->set(l_osd_loadavg, 100 * loadavgs[0]);
    daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
    dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
  }

  dout(30) << "heartbeat checking stats" << dendl;

  // refresh peer list and osd stats
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);

  auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
  dout(5) << __func__ << " " << new_stat << dendl;
  ceph_assert(new_stat.statfs.total);

  float pratio;
  float ratio = service.compute_adjusted_ratio(new_stat, &pratio);

  service.check_full_status(ratio, pratio);

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  utime_t deadline = now;
  deadline += cct->_conf->osd_heartbeat_grace;

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
    if (!s) {
      dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
      continue;
    }
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;

    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    // record the outstanding ping, keyed by send stamp; it expects
    // HEARTBEAT_MAX_CONN replies (back + front) before deadline
    i->second.ping_history[now] = make_pair(deadline,
      HeartbeatInfo::HEARTBEAT_MAX_CONN);
    if (i->second.hb_interval_start == utime_t())
      i->second.hb_interval_start = now;

    std::optional<ceph::signedspan> delta_ub;
    s->stamps->sent_ping(&delta_ub);

    i->second.con_back->send_message(
      new MOSDPing(monc->get_fsid(),
                   service.get_osdmap_epoch(),
                   MOSDPing::PING,
                   now,
                   mnow,
                   mnow,
                   service.get_up_epoch(),
                   cct->_conf->osd_heartbeat_min_size,
                   delta_ub));

    if (i->second.con_front)
      i->second.con_front->send_message(
        new MOSDPing(monc->get_fsid(),
                     service.get_osdmap_epoch(),
                     MOSDPing::PING,
                     now,
                     mnow,
                     mnow,
                     service.get_up_epoch(),
                     cct->_conf->osd_heartbeat_min_size,
                     delta_ub));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    // with no peers, poll the mon for map updates instead
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(get_osdmap_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
5742
// Messenger callback for a dropped heartbeat connection.  If the
// connection belonged to a current peer, open fresh back/front
// connections and move the session over; otherwise just discard it.
// Always returns true (the reset is fully handled here).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  auto s = con->get_priv();
  dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      return true;
    }
    auto session = static_cast<Session*>(s.get());
    auto p = heartbeat_peers.find(session->peer);
    if (p != heartbeat_peers.end() &&
        (p->second.con_back == con ||
         p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
               << ", reopening" << dendl;
      p->second.clear_mark_down(con);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
        // rebind the existing session to the new connection(s)
        p->second.con_back = newcon.first.get();
        p->second.con_back->set_priv(s);
        if (newcon.second) {
          p->second.con_front = newcon.second.get();
          p->second.con_front->set_priv(s);
        }
        // pings in flight on the dead connection will never be acked
        p->second.ping_history.clear();
      } else {
        dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
                 << ", raced with osdmap update, closing out peer" << dendl;
        heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
5781
5782
5783
5784// =========================================
5785
5786void OSD::tick()
5787{
5788 ceph_assert(ceph_mutex_is_locked(osd_lock));
5789 dout(10) << "tick" << dendl;
5790
5791 utime_t now = ceph_clock_now();
5792 // throw out any obsolete markdown log
5793 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5794 while (!osd_markdown_log.empty() &&
5795 osd_markdown_log.front() + grace < now)
5796 osd_markdown_log.pop_front();
5797
5798 if (is_active() || is_waiting_for_healthy()) {
5799 maybe_update_heartbeat_peers();
5800 }
5801
5802 if (is_waiting_for_healthy()) {
5803 start_boot();
5804 }
5805
5806 if (is_waiting_for_healthy() || is_booting()) {
5807 std::lock_guard l(heartbeat_lock);
5808 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5809 last_mon_heartbeat = now;
5810 dout(1) << __func__ << " checking mon for new map" << dendl;
5811 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5812 }
5813 }
5814
5815 do_waiters();
5816
5817 // scrub purged_snaps every deep scrub interval
5818 {
5819 const utime_t last = superblock.last_purged_snaps_scrub;
5820 utime_t next = last;
5821 next += cct->_conf->osd_scrub_min_interval;
5822 std::mt19937 rng;
5823 // use a seed that is stable for each scrub interval, but varies
5824 // by OSD to avoid any herds.
5825 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
5826 double r = (rng() % 1024) / 1024;
5827 next +=
5828 cct->_conf->osd_scrub_min_interval *
5829 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5830 if (next < ceph_clock_now()) {
5831 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5832 << " next " << next << " ... now" << dendl;
5833 scrub_purged_snaps();
5834 } else {
5835 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5836 << " next " << next << dendl;
5837 }
5838 }
5839
5840 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5841}
5842
// Periodic work that deliberately runs WITHOUT osd_lock: statfs and
// perf-counter refresh, heartbeat timeout checks, mon reports, scrub
// scheduling, and beacons.  Reschedules itself via
// tick_timer_without_osd_lock.
void OSD::tick_without_osd_lock()
{
  ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    {
      std::lock_guard l{heartbeat_lock};
      heartbeat_check();
    }
    // lock order: map_lock (shared) before mon_report_lock
    map_lock.lock_shared();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
        now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.unlock_shared();

    // if any shard has ops waiting on a newer map, go fetch it
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
                                   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
               << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
          cct->_conf->osd_beacon_report_interval) {
        need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
                                              new C_Tick_WithoutOSDLock(this));
}
5918
5919// Usage:
5920// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5921// rmomapkey <pool-id> [namespace/]<obj-name> <key>
5922// setomapheader <pool-id> [namespace/]<obj-name> <header>
// getomap <pool-id> [namespace/]<obj-name>
5924// truncobj <pool-id> [namespace/]<obj-name> <newlen>
5925// injectmdataerr [namespace/]<obj-name> [shardid]
5926// injectdataerr [namespace/]<obj-name> [shardid]
5927//
5928// set_recovery_delay [utime]
5929void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5930 std::string_view command,
5931 const cmdmap_t& cmdmap, ostream &ss)
5932{
5933 //Test support
5934 //Support changing the omap on a single osd by using the Admin Socket to
5935 //directly request the osd make a change.
5936 if (command == "setomapval" || command == "rmomapkey" ||
5937 command == "setomapheader" || command == "getomap" ||
5938 command == "truncobj" || command == "injectmdataerr" ||
5939 command == "injectdataerr"
5940 ) {
5941 pg_t rawpg;
5942 int64_t pool;
5943 OSDMapRef curmap = service->get_osdmap();
5944 int r = -1;
5945
5946 string poolstr;
5947
5948 cmd_getval(cmdmap, "pool", poolstr);
5949 pool = curmap->lookup_pg_pool_name(poolstr);
5950 //If we can't find it by name then maybe id specified
5951 if (pool < 0 && isdigit(poolstr[0]))
5952 pool = atoll(poolstr.c_str());
5953 if (pool < 0) {
5954 ss << "Invalid pool '" << poolstr << "''";
5955 return;
5956 }
5957
5958 string objname, nspace;
5959 cmd_getval(cmdmap, "objname", objname);
5960 std::size_t found = objname.find_first_of('/');
5961 if (found != string::npos) {
5962 nspace = objname.substr(0, found);
5963 objname = objname.substr(found+1);
5964 }
5965 object_locator_t oloc(pool, nspace);
5966 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5967
5968 if (r < 0) {
5969 ss << "Invalid namespace/objname";
5970 return;
5971 }
5972
5973 int64_t shardid;
5974 cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5975 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5976 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5977 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5978 if (curmap->pg_is_ec(rawpg)) {
5979 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5980 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5981 return;
5982 }
5983 }
5984
5985 ObjectStore::Transaction t;
5986
5987 if (command == "setomapval") {
5988 map<string, bufferlist> newattrs;
5989 bufferlist val;
5990 string key, valstr;
5991 cmd_getval(cmdmap, "key", key);
5992 cmd_getval(cmdmap, "val", valstr);
5993
5994 val.append(valstr);
5995 newattrs[key] = val;
5996 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5997 r = store->queue_transaction(service->meta_ch, std::move(t));
5998 if (r < 0)
5999 ss << "error=" << r;
6000 else
6001 ss << "ok";
6002 } else if (command == "rmomapkey") {
6003 string key;
6004 cmd_getval(cmdmap, "key", key);
6005
6006 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
6007 r = store->queue_transaction(service->meta_ch, std::move(t));
6008 if (r < 0)
6009 ss << "error=" << r;
6010 else
6011 ss << "ok";
6012 } else if (command == "setomapheader") {
6013 bufferlist newheader;
6014 string headerstr;
6015
6016 cmd_getval(cmdmap, "header", headerstr);
6017 newheader.append(headerstr);
6018 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6019 r = store->queue_transaction(service->meta_ch, std::move(t));
6020 if (r < 0)
6021 ss << "error=" << r;
6022 else
6023 ss << "ok";
6024 } else if (command == "getomap") {
6025 //Debug: Output entire omap
6026 bufferlist hdrbl;
6027 map<string, bufferlist> keyvals;
6028 auto ch = store->open_collection(coll_t(pgid));
6029 if (!ch) {
6030 ss << "unable to open collection for " << pgid;
6031 r = -ENOENT;
6032 } else {
6033 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6034 if (r >= 0) {
6035 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6036 for (map<string, bufferlist>::iterator it = keyvals.begin();
6037 it != keyvals.end(); ++it)
6038 ss << " key=" << (*it).first << " val="
6039 << string((*it).second.c_str(), (*it).second.length());
6040 } else {
6041 ss << "error=" << r;
6042 }
6043 }
6044 } else if (command == "truncobj") {
6045 int64_t trunclen;
6046 cmd_getval(cmdmap, "len", trunclen);
6047 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6048 r = store->queue_transaction(service->meta_ch, std::move(t));
6049 if (r < 0)
6050 ss << "error=" << r;
6051 else
6052 ss << "ok";
6053 } else if (command == "injectdataerr") {
6054 store->inject_data_error(gobj);
6055 ss << "ok";
6056 } else if (command == "injectmdataerr") {
6057 store->inject_mdata_error(gobj);
6058 ss << "ok";
6059 }
6060 return;
6061 }
6062 if (command == "set_recovery_delay") {
6063 int64_t delay;
6064 cmd_getval(cmdmap, "utime", delay, (int64_t)0);
6065 ostringstream oss;
6066 oss << delay;
6067 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6068 oss.str().c_str());
6069 if (r != 0) {
6070 ss << "set_recovery_delay: error setting "
6071 << "osd_recovery_delay_start to '" << delay << "': error "
6072 << r;
6073 return;
6074 }
6075 service->cct->_conf.apply_changes(nullptr);
6076 ss << "set_recovery_delay: set osd_recovery_delay_start "
6077 << "to " << service->cct->_conf->osd_recovery_delay_start;
6078 return;
6079 }
6080 if (command == "injectfull") {
6081 int64_t count;
6082 string type;
6083 OSDService::s_names state;
6084 cmd_getval(cmdmap, "type", type, string("full"));
6085 cmd_getval(cmdmap, "count", count, (int64_t)-1);
6086 if (type == "none" || count == 0) {
6087 type = "none";
6088 count = 0;
6089 }
6090 state = service->get_full_state(type);
6091 if (state == OSDService::s_names::INVALID) {
6092 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6093 return;
6094 }
6095 service->set_injectfull(state, count);
6096 return;
6097 }
6098 ss << "Internal error - command=" << command;
6099}
6100
6101// =========================================
6102
// Messenger callback: a connection we initiated has been (re)established.
// We only act on the monitor connection: a fresh mon session means any
// previously sent state (boot message, pg_temp, failure reports, beacon,
// full-map requests) may have been lost, so resend whatever is relevant
// to our current boot state.
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // lock order: map_lock (shared) before mon_report_lock
      map_lock.lock_shared();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.unlock_shared();
      if (is_active()) {
	send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6146
6147void OSD::ms_handle_fast_connect(Connection *con)
6148{
6149 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6150 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6151 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6152 s = ceph::make_ref<Session>(cct, con);
6153 con->set_priv(s);
6154 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6155 << " addr=" << s->con->get_peer_addr() << dendl;
6156 // we don't connect to clients
6157 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6158 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6159 }
6160 }
6161}
6162
6163void OSD::ms_handle_fast_accept(Connection *con)
6164{
6165 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6166 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6167 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6168 s = ceph::make_ref<Session>(cct, con);
6169 con->set_priv(s);
6170 dout(10) << "new session (incoming)" << s << " con=" << con
6171 << " addr=" << con->get_peer_addr()
6172 << " must have raced with connect" << dendl;
6173 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6174 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6175 }
6176 }
6177}
6178
// Messenger callback: connection was reset (peer went away or session was
// torn down). Detach the Session from the connection and clean it up.
// Returns true if a session was found and handled.
bool OSD::ms_handle_reset(Connection *con)
{
  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;  // no session attached; nothing to clean up
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset(); // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(session);
  return true;
}
6194
// Messenger callback: our connection attempt was actively refused (e.g.
// ECONNREFUSED). When osd_fast_fail_on_connection_refused is enabled and
// the peer is an OSD that the current map still believes is up, report it
// failed to the mon immediately instead of waiting out the full heartbeat
// grace period.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      // the refused addr may be any of the peer's channels (front/back/hb)
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
	// I'm cheating mon heartbeat grace logic, because we know it's not going
	// to respawn alone. +1 so we won't hit any boundary case.
	monc->send_mon_message(
	  new MOSDFailure(
	    monc->get_fsid(),
	    id,
	    osdmap->get_addrs(id),
	    cct->_conf->osd_heartbeat_grace + 1,
	    osdmap->get_epoch(),
	    MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
	    ));
      }
    }
  }
  return true;
}
6227
6228struct C_OSD_GetVersion : public Context {
6229 OSD *osd;
6230 uint64_t oldest, newest;
6231 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6232 void finish(int r) override {
6233 if (r >= 0)
6234 osd->_got_mon_epochs(oldest, newest);
6235 }
6236};
6237
// Begin (or defer) the boot sequence. If internal/peer heartbeats say we
// are unhealthy, wait rather than marking ourselves up; otherwise enter
// PREBOOT and ask the mon for its osdmap epoch range, which continues the
// boot via _got_mon_epochs() -> _preboot().
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
	   << ".." << superblock.newest_map << dendl;
  // the Context owns itself; monc fills c->oldest/c->newest and completes it
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
6256
6257void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6258{
6259 std::lock_guard l(osd_lock);
6260 if (is_preboot()) {
6261 _preboot(oldest, newest);
6262 }
6263}
6264
// Continue the boot sequence given the mon's osdmap epoch span
// [oldest, newest]. Either something blocks booting (no initial map,
// we are destroyed, NOUP set, required flags/releases missing, stale
// fullness state, purged-snaps catch-up needed, or our map is too old)
// and we report/subscribe accordingly, or our map is recent enough and
// we queue _send_boot() on the boot_finisher.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
	   << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  const auto& monmap = monc->monmap;
  const auto osdmap = get_osdmap();
  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (osdmap->require_osd_release < ceph_release_t::luminous) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
	 << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
	     superblock.purged_snaps_last < superblock.current_epoch) {
    // octopus+ mons track purged snaps; catch up before booting
    dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
	     << " < newest_map " << superblock.current_epoch << dendl;
    _get_purged_snaps();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
	     osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new LambdaContext(
	[this](int r) {
	  std::unique_lock l(osd_lock);
	  if (is_preboot()) {
	    dout(10) << __func__ << " waiting for peering work to drain"
		     << dendl;
	    // drop osd_lock while waiting so peering can progress
	    l.unlock();
	    for (auto shard : shards) {
	      shard->wait_min_pg_epoch(get_osdmap_epoch());
	    }
	    l.lock();
	  }
	  // re-check: state may have changed while unlocked
	  if (is_preboot()) {
	    _send_boot();
	  }
	}));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6337
6338void OSD::_get_purged_snaps()
6339{
6340 // NOTE: this is a naive, stateless implementaiton. it may send multiple
6341 // overlapping requests to the mon, which will be somewhat inefficient, but
6342 // it should be reliable.
6343 dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
6344 << ", newest_map " << superblock.current_epoch << dendl;
6345 MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
6346 superblock.purged_snaps_last + 1,
6347 superblock.current_epoch + 1);
6348 monc->send_mon_message(m);
6349}
6350
// Handle the mon's reply to _get_purged_snaps(): persist the newly
// learned purged-snap intervals, advance superblock.purged_snaps_last,
// and either keep requesting (still behind) or resume boot (caught up).
void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
{
  dout(10) << __func__ << " " << *m << dendl;
  ObjectStore::Transaction t;
  if (!is_preboot() ||
      m->last < superblock.purged_snaps_last) {
    // stale reply, or we already left preboot; ignore
    goto out;
  }
  SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
				  make_purged_snaps_oid(), &t,
				  m->purged_snaps);
  superblock.purged_snaps_last = m->last;
  write_superblock(t);
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
  if (m->last < superblock.current_epoch) {
    _get_purged_snaps();  // still behind; request the next chunk
  } else {
    start_boot();  // caught up; resume the boot sequence
  }
out:
  m->put();
}
6376
6377void OSD::send_full_update()
6378{
6379 if (!service.need_fullness_update())
6380 return;
6381 unsigned state = 0;
6382 if (service.is_full()) {
6383 state = CEPH_OSD_FULL;
6384 } else if (service.is_backfillfull()) {
6385 state = CEPH_OSD_BACKFILLFULL;
6386 } else if (service.is_nearfull()) {
6387 state = CEPH_OSD_NEARFULL;
6388 }
6389 set<string> s;
6390 OSDMap::calc_state_set(state, s);
6391 dout(10) << __func__ << " want state " << s << dendl;
6392 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6393}
6394
// Enter WAITING_FOR_HEALTHY: booting was deferred because heartbeats look
// bad. Reset the heartbeat resample timestamp so peers are re-evaluated,
// and watch for new osdmaps in case our peers are actually marked down.
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(get_osdmap_epoch() + 1, false);
}
6404
6405bool OSD::_is_healthy()
6406{
6407 if (!cct->get_heartbeat_map()->is_healthy()) {
6408 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6409 return false;
6410 }
6411
6412 if (is_waiting_for_healthy()) {
6413 utime_t now = ceph_clock_now();
6414 if (osd_markdown_log.empty()) {
6415 dout(5) << __func__ << " force returning true since last markdown"
6416 << " was " << cct->_conf->osd_max_markdown_period
6417 << "s ago" << dendl;
6418 return true;
6419 }
6420 std::lock_guard l(heartbeat_lock);
6421 int num = 0, up = 0;
6422 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6423 p != heartbeat_peers.end();
6424 ++p) {
6425 if (p->second.is_healthy(now))
6426 ++up;
6427 ++num;
6428 }
6429 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6430 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6431 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6432 return false;
6433 }
6434 }
6435
6436 return true;
6437}
6438
// Compose and send the MOSDBoot message to the mon. Before sending,
// resolve any still-unknown bind addresses on the cluster and heartbeat
// messengers (deriving them from the client/cluster addrs), make sure
// each loopback connection has a Session attached, and pin our NUMA
// affinity so the reported metadata is accurate.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  // fill unknown cluster addrs from the client (public) addrs
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
	     << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    // ensure the loopback connection carries a Session like any other
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // hb back (cluster-facing) addrs default to the cluster addrs
  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
	     << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // hb front (public-facing) addrs default to the client addrs
  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
	     << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
	   << ", cluster_addrs " << cluster_addrs
	   << ", hb_back_addrs " << hb_back_addrs
	   << ", hb_front_addrs " << hb_front_addrs
	   << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6501
// Populate *pm with this OSD's metadata key/value pairs for the mon:
// config paths, messenger addresses, objectstore backend details, system
// info, network interfaces and their NUMA placement, CPU NUMA binding,
// and per-device metadata for the backing devices.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  string osdspec_affinity;
  int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
  if (r < 0 || osdspec_affinity.empty()) {
    // missing/unreadable affinity is not an error; report it as empty
    osdspec_affinity = "";
  }
  (*pm)["osdspec_affinity"] = osdspec_affinity;
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
	unknown.insert(nm);
	continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
	unknown.insert((*pm)[nm]);
	continue;
      }
      nodes.insert(n);
      if (node < 0) {
	node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    // only report a single numa node if all known ifaces agree on it
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
						  &numa_cpu_set);
  }

  set<string> devnames;
  store->get_devices(&devnames);
  map<string,string> errs;
  get_device_metadata(devnames, pm, &errs);
  // device metadata failures are logged but do not block boot
  for (auto& i : errs) {
    dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
  }
  dout(10) << __func__ << " " << *pm << dendl;
}
6584
// Record that we want the mon to advance up_thru to at least 'want' for
// us, and trigger send_alive() if this raises the previously wanted
// value. Takes map_lock (shared) then mon_report_lock, in that order.
void OSD::queue_want_up_thru(epoch_t want)
{
  std::shared_lock map_locker{map_lock};
  epoch_t cur = get_osdmap()->get_up_thru(whoami);
  std::lock_guard report_locker(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
	     << ", currently " << cur
	     << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    // already asked for this epoch (or newer); nothing to do
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
	     << ", currently " << cur
	     << dendl;
  }
}
6602
6603void OSD::send_alive()
6604{
6605 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6606 const auto osdmap = get_osdmap();
6607 if (!osdmap->exists(whoami))
6608 return;
6609 epoch_t up_thru = osdmap->get_up_thru(whoami);
6610 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6611 if (up_thru_wanted > up_thru) {
6612 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6613 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6614 }
6615}
6616
// Ask the mon for full osdmaps covering [first, last], merging with any
// request already outstanding (tracked in requested_full_first/last) so
// we never re-request epochs already asked for. Caller must hold
// osd_lock.
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  dout(10) << __func__ << " " << first << ".." << last
	   << ", previously requested "
	   << requested_full_first << ".." << requested_full_last << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  ceph_assert(first > 0 && last > 0);
  ceph_assert(first <= last);
  ceph_assert(first >= requested_full_first);  // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
6642
// Note receipt of full osdmap epoch e, advancing or clearing the
// outstanding full-map request window [requested_full_first,
// requested_full_last]. Caller must hold osd_lock.
void OSD::got_full_map(epoch_t e)
{
  ceph_assert(requested_full_first <= requested_full_last);
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  if (requested_full_first == 0) {
    // no request outstanding
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // stray map from before our request window
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last
	     << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    // that was the last epoch we were waiting for; clear the window
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // still expecting more; the next map we need is e + 1
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	   << ".." << requested_full_last
	   << ", still need more" << dendl;
}
6670
6671void OSD::requeue_failures()
6672{
6673 std::lock_guard l(heartbeat_lock);
6674 unsigned old_queue = failure_queue.size();
6675 unsigned old_pending = failure_pending.size();
6676 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6677 failure_queue[p->first] = p->second.first;
6678 failure_pending.erase(p++);
6679 }
6680 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6681 << failure_queue.size() << dendl;
6682}
6683
// Drain failure_queue, sending an MOSDFailure to the mon for each peer
// not already reported, and remember each report in failure_pending so it
// can be canceled (send_still_alive) or requeued after a session reset.
// Caller must hold map_lock and mon_report_lock.
void OSD::send_failures()
{
  ceph_assert(ceph_mutex_is_locked(map_lock));
  ceph_assert(ceph_mutex_is_locked(mon_report_lock));
  std::lock_guard l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  const auto osdmap = get_osdmap();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    if (!failure_pending.count(osd)) {
      // seconds since we first noticed the peer was unresponsive
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(
	new MOSDFailure(
	  monc->get_fsid(),
	  osd,
	  osdmap->get_addrs(osd),
	  failed_for,
	  osdmap->get_epoch()));
      // remember the failure time and addrs for later cancel/requeue
      failure_pending[osd] = make_pair(failure_queue.begin()->second,
				       osdmap->get_addrs(osd));
    }
    failure_queue.erase(osd);
  }
}
6708
6709void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6710{
6711 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6712 MOSDFailure::FLAG_ALIVE);
6713 monc->send_mon_message(m);
6714}
6715
6716void OSD::cancel_pending_failures()
6717{
6718 std::lock_guard l(heartbeat_lock);
6719 auto it = failure_pending.begin();
6720 while (it != failure_pending.end()) {
6721 dout(10) << __func__ << " canceling in-flight failure report for osd."
6722 << it->first << dendl;
6723 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6724 failure_pending.erase(it++);
6725 }
6726}
6727
// Send an MOSDBeacon to the mon (only when the monmap requires luminous+
// features). The beacon carries min_last_epoch_clean and the pgs bounding
// it — used by the mon to trim old osdmaps — and doubles as a liveness
// signal.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
	ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // sample min_last_epoch_clean state atomically under its lock
      std::lock_guard l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(get_osdmap_epoch(),
			      min_last_epoch_clean,
			      superblock.last_purged_snaps_scrub);
      beacon->pgs = min_last_epoch_clean_pgs;
      last_sent_beacon = now;
    }
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
6751
6752void OSD::handle_command(MCommand *m)
6753{
6754 ConnectionRef con = m->get_connection();
6755 auto session = ceph::ref_cast<Session>(con->get_priv());
6756 if (!session) {
6757 con->send_message(new MCommandReply(m, -EACCES));
6758 m->put();
6759 return;
6760 }
6761 if (!session->caps.allow_all()) {
6762 con->send_message(new MCommandReply(m, -EACCES));
6763 m->put();
6764 return;
6765 }
6766 cct->get_admin_socket()->queue_tell_command(m);
6767 m->put();
6768}
6769
6770namespace {
6771 class unlock_guard {
6772 ceph::mutex& m;
6773 public:
6774 explicit unlock_guard(ceph::mutex& mutex)
6775 : m(mutex)
6776 {
6777 m.unlock();
6778 }
6779 unlock_guard(unlock_guard&) = delete;
6780 ~unlock_guard() {
6781 m.lock();
6782 }
6783 };
6784}
6785
// Scrub the SnapMapper against the persistent purged-snaps record and
// requeue snap trim for any stray snaps found. Drops osd_lock around the
// (potentially long) scrub and PG queueing, then re-takes it to update
// the superblock with the scrub completion time.
void OSD::scrub_purged_snaps()
{
  dout(10) << __func__ << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  SnapMapper::Scrubber s(cct, store, service.meta_ch,
			 make_snapmapper_oid(),
			 make_purged_snaps_oid());
  clog->debug() << "purged_snaps scrub starts";
  osd_lock.unlock();
  s.run();
  if (s.stray.size()) {
    clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
  } else {
    clog->debug() << "purged_snaps scrub ok";
  }
  // requeue each stray snap on its pg, at most once per (pg, snap) pair
  set<pair<spg_t,snapid_t>> queued;
  for (auto& [pool, snap, hash, shard] : s.stray) {
    const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
    if (!pi) {
      dout(20) << __func__ << " pool " << pool << " dne" << dendl;
      continue;
    }
    pg_t pgid(pi->raw_hash_to_pg(hash), pool);
    spg_t spgid(pgid, shard);
    pair<spg_t,snapid_t> p(spgid, snap);
    if (queued.count(p)) {
      dout(20) << __func__ << " pg " << spgid << " snap " << snap
	       << " already queued" << dendl;
      continue;
    }
    PGRef pg = lookup_lock_pg(spgid);
    if (!pg) {
      dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
      continue;
    }
    queued.insert(p);
    dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
	     << snap << dendl;
    pg->queue_snap_retrim(snap);
    pg->unlock();
  }
  osd_lock.lock();
  if (is_stopping()) {
    return;
  }
  dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
  // record completion time so the next scrub is scheduled correctly
  ObjectStore::Transaction t;
  superblock.last_purged_snaps_scrub = ceph_clock_now();
  write_superblock(t);
  int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
  ceph_assert(tr == 0);
  if (is_active()) {
    send_beacon(ceph::coarse_mono_clock::now());
  }
  dout(10) << __func__ << " done" << dendl;
}
6842
6843void OSD::probe_smart(const string& only_devid, ostream& ss)
6844{
6845 set<string> devnames;
6846 store->get_devices(&devnames);
6847 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6848 "osd_smart_report_timeout");
6849
6850 // == typedef std::map<std::string, mValue> mObject;
6851 json_spirit::mObject json_map;
6852
6853 for (auto dev : devnames) {
6854 // smartctl works only on physical devices; filter out any logical device
6855 if (dev.find("dm-") == 0) {
6856 continue;
6857 }
6858
6859 string err;
6860 string devid = get_device_id(dev, &err);
6861 if (devid.size() == 0) {
6862 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6863 << err << "), skipping" << dendl;
6864 continue;
6865 }
6866 if (only_devid.size() && devid != only_devid) {
6867 continue;
6868 }
6869
6870 json_spirit::mValue smart_json;
6871 if (block_device_get_metrics(dev, smart_timeout,
6872 &smart_json)) {
6873 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6874 continue;
6875 }
6876 json_map[devid] = smart_json;
6877 }
6878 json_spirit::write(json_map, ss, json_spirit::pretty_print);
6879}
6880
// Dispatcher entry point for the heartbeat messengers. Only CEPH_MSG_PING
// and MSG_OSD_PING are expected here; anything else is dropped with a
// warning. Always claims the message (returns true).
bool OSD::heartbeat_dispatch(Message *m)
{
  dout(30) << "heartbeat_dispatch " << m << dendl;
  switch (m->get_type()) {

  case CEPH_MSG_PING:
    // bare ping needs no reply; just log and drop
    dout(10) << "ping from " << m->get_source_inst() << dendl;
    m->put();
    break;

  case MSG_OSD_PING:
    // handle_osd_ping consumes the message
    handle_osd_ping(static_cast<MOSDPing*>(m));
    break;

  default:
    dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
    m->put();
  }

  return true;
}
6902
// Slow-path dispatcher: handles messages that need osd_lock. Mark-me-down
// acks are processed lock-free; everything else first flushes deferred
// waiters (do_waiters) and then goes through _dispatch() under osd_lock.
// Always claims the message (returns true).
bool OSD::ms_dispatch(Message *m)
{
  dout(20) << "OSD::ms_dispatch: " << *m << dendl;
  if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
    service.got_stop_ack();
    m->put();
    return true;
  }

  // lock!

  osd_lock.lock();
  if (is_stopping()) {
    // shutting down: drop the message rather than dispatching
    osd_lock.unlock();
    m->put();
    return true;
  }

  do_waiters();
  _dispatch(m);

  osd_lock.unlock();

  return true;
}
6928
// Share newer osdmap epochs with the peer on this connection if our map
// is ahead of what we believe the peer has seen. The per-Session
// last_sent_epoch tracks that belief; peer_epoch_lb is a caller-supplied
// lower bound on what the peer already has (e.g. an op's sent epoch).
void OSDService::maybe_share_map(
  Connection *con,
  const OSDMapRef& osdmap,
  epoch_t peer_epoch_lb)
{
  // NOTE: we assume caller hold something that keeps the Connection itself
  // pinned (e.g., an OpRequest's MessageRef).
  auto session = ceph::ref_cast<Session>(con->get_priv());
  if (!session) {
    return;
  }

  // assume the peer has the newer of the op's sent_epoch and what
  // we think we sent them.
  session->sent_epoch_lock.lock();
  if (peer_epoch_lb > session->last_sent_epoch) {
    dout(10) << __func__ << " con " << con
	     << " " << con->get_peer_addr()
	     << " map epoch " << session->last_sent_epoch
	     << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
    session->last_sent_epoch = peer_epoch_lb;
  }
  epoch_t last_sent_epoch = session->last_sent_epoch;
  session->sent_epoch_lock.unlock();

  if (osdmap->get_epoch() <= last_sent_epoch) {
    return;  // peer already (believed to be) up to date
  }

  // send maps outside sent_epoch_lock, then re-take it to record
  // progress; another thread may have advanced last_sent_epoch meanwhile.
  send_incremental_map(last_sent_epoch, con, osdmap);
  last_sent_epoch = osdmap->get_epoch();

  session->sent_epoch_lock.lock();
  if (session->last_sent_epoch < last_sent_epoch) {
    dout(10) << __func__ << " con " << con
	     << " " << con->get_peer_addr()
	     << " map epoch " << session->last_sent_epoch
	     << " -> " << last_sent_epoch << " (shared)" << dendl;
    session->last_sent_epoch = last_sent_epoch;
  }
  session->sent_epoch_lock.unlock();
}
6971
// Re-dispatch ops that were parked on this session waiting for a newer
// osdmap. Processes the queue in order, stopping at the first op whose
// min_epoch is still ahead of osdmap, and (de)registers the session on
// the global waiting-on-map list accordingly. Caller must hold
// session->session_dispatch_lock.
void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
{
  ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    auto m = op->get_req<MOSDFastDispatchOp>();
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      break;  // map still too old for this op (and all later ones)
    }
    // drop the intrusive-list reference; 'op' still holds one
    session->waiting_on_map.erase(i++);
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries no spg_t: compute the target shard here
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7006
// Fast-path dispatcher: runs in the messenger thread, without osd_lock.
// Peering messages are enqueued directly to their pg; client ops are
// either queued to their spg_t or, for legacy (pre-RESEND_ON_SPLIT)
// clients, parked on the session so they can be mapped to a shard while
// preserving delivery order.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_PG_NOTIFY2:
  case MSG_OSD_PG_QUERY2:
  case MSG_OSD_PG_INFO2:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
  case MSG_OSD_PG_LEASE:
  case MSG_OSD_PG_LEASE_ACK:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      // only accept peering traffic from actual OSD peers
      if (require_osd_peer(pm)) {
	enqueue_peering_evt(
	  pm->get_spg(),
	  PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // anything else is a client/replica op tracked by the op tracker
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      op->get();
      session->waiting_on_map.push_back(*op);
      // reserve the next map so it cannot be trimmed out from under us
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7103
7104int OSD::ms_handle_authentication(Connection *con)
7105{
7106 int ret = 0;
7107 auto s = ceph::ref_cast<Session>(con->get_priv());
7108 if (!s) {
7109 s = ceph::make_ref<Session>(cct, con);
7110 con->set_priv(s);
7111 s->entity_name = con->get_peer_entity_name();
7112 dout(10) << __func__ << " new session " << s << " con " << s->con
7113 << " entity " << s->entity_name
7114 << " addr " << con->get_peer_addrs() << dendl;
7115 } else {
7116 dout(10) << __func__ << " existing session " << s << " con " << s->con
7117 << " entity " << s->entity_name
7118 << " addr " << con->get_peer_addrs() << dendl;
7119 }
7120
7121 AuthCapsInfo &caps_info = con->get_peer_caps_info();
7122 if (caps_info.allow_all) {
7123 s->caps.set_allow_all();
7124 } else if (caps_info.caps.length() > 0) {
7125 bufferlist::const_iterator p = caps_info.caps.cbegin();
7126 string str;
7127 try {
7128 decode(str, p);
7129 }
7130 catch (buffer::error& e) {
7131 dout(10) << __func__ << " session " << s << " " << s->entity_name
7132 << " failed to decode caps string" << dendl;
7133 ret = -EACCES;
7134 }
7135 if (!ret) {
7136 bool success = s->caps.parse(str);
7137 if (success) {
7138 dout(10) << __func__ << " session " << s
7139 << " " << s->entity_name
7140 << " has caps " << s->caps << " '" << str << "'" << dendl;
7141 ret = 1;
7142 } else {
7143 dout(10) << __func__ << " session " << s << " " << s->entity_name
7144 << " failed to parse caps '" << str << "'" << dendl;
7145 ret = -EACCES;
7146 }
7147 }
7148 }
7149 return ret;
7150}
7151
7152void OSD::do_waiters()
7153{
7154 ceph_assert(ceph_mutex_is_locked(osd_lock));
7155
7156 dout(10) << "do_waiters -- start" << dendl;
7157 while (!finished.empty()) {
7158 OpRequestRef next = finished.front();
7159 finished.pop_front();
7160 dispatch_op(next);
7161 }
7162 dout(10) << "do_waiters -- finish" << dendl;
7163}
7164
7165void OSD::dispatch_op(OpRequestRef op)
7166{
7167 switch (op->get_req()->get_type()) {
7168
7169 case MSG_OSD_PG_CREATE:
7170 handle_pg_create(op);
7171 break;
7172 }
7173}
7174
void OSD::_dispatch(Message *m)
{
  // Slow-dispatch entry point for the handful of message types that are
  // not fast-dispatched.  Runs with osd_lock held.
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;
  case MSG_MON_GET_PURGED_SNAPS_REPLY:
    handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
    break;

  // osd
  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;

  // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      // wrap the message in a tracked OpRequest so it can be queued if
      // we don't have a map yet
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
	op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map?  starting up?  park the op until one arrives.
      if (!get_osdmap()) {
	dout(7) << "no OSDMap, not booted" << dendl;
	logger->inc(l_osd_waiting_for_map);
	waiting_for_osdmap.push_back(op);
	op->mark_delayed("no osdmap");
	break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7221
7222// remove me post-nautilus
7223void OSD::handle_scrub(MOSDScrub *m)
7224{
7225 dout(10) << "handle_scrub " << *m << dendl;
7226 if (!require_mon_or_mgr_peer(m)) {
7227 m->put();
7228 return;
7229 }
7230 if (m->fsid != monc->get_fsid()) {
7231 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
7232 << dendl;
7233 m->put();
7234 return;
7235 }
7236
7237 vector<spg_t> spgs;
7238 _get_pgids(&spgs);
7239
7240 if (!m->scrub_pgs.empty()) {
7241 vector<spg_t> v;
7242 for (auto pgid : m->scrub_pgs) {
7243 spg_t pcand;
7244 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
7245 std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
7246 v.push_back(pcand);
7247 }
7248 }
7249 spgs.swap(v);
7250 }
7251
7252 for (auto pgid : spgs) {
7253 enqueue_peering_evt(
7254 pgid,
7255 PGPeeringEventRef(
7256 std::make_shared<PGPeeringEvent>(
7257 get_osdmap_epoch(),
7258 get_osdmap_epoch(),
7259 PeeringState::RequestScrub(m->deep, m->repair))));
7260 }
7261
7262 m->put();
7263}
7264
7265void OSD::handle_fast_scrub(MOSDScrub2 *m)
7266{
7267 dout(10) << __func__ << " " << *m << dendl;
7268 if (!require_mon_or_mgr_peer(m)) {
7269 m->put();
7270 return;
7271 }
7272 if (m->fsid != monc->get_fsid()) {
7273 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7274 << dendl;
7275 m->put();
7276 return;
7277 }
7278 for (auto pgid : m->scrub_pgs) {
7279 enqueue_peering_evt(
7280 pgid,
7281 PGPeeringEventRef(
7282 std::make_shared<PGPeeringEvent>(
7283 m->epoch,
7284 m->epoch,
7285 PeeringState::RequestScrub(m->deep, m->repair))));
7286 }
7287 m->put();
7288}
7289
7290bool OSD::scrub_random_backoff()
7291{
7292 bool coin_flip = (rand() / (double)RAND_MAX >=
7293 cct->_conf->osd_scrub_backoff_ratio);
7294 if (!coin_flip) {
7295 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7296 return true;
7297 }
7298 return false;
7299}
7300
7301OSDService::ScrubJob::ScrubJob(CephContext* cct,
7302 const spg_t& pg, const utime_t& timestamp,
7303 double pool_scrub_min_interval,
7304 double pool_scrub_max_interval, bool must)
7305 : cct(cct),
7306 pgid(pg),
7307 sched_time(timestamp),
7308 deadline(timestamp)
7309{
7310 // if not explicitly requested, postpone the scrub with a random delay
7311 if (!must) {
7312 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7313 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7314 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7315 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7316
7317 sched_time += scrub_min_interval;
7318 double r = rand() / (double)RAND_MAX;
7319 sched_time +=
7320 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7321 if (scrub_max_interval == 0) {
7322 deadline = utime_t();
7323 } else {
7324 deadline += scrub_max_interval;
7325 }
7326
7327 }
7328}
7329
7330bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7331 if (sched_time < rhs.sched_time)
7332 return true;
7333 if (sched_time > rhs.sched_time)
7334 return false;
7335 return pgid < rhs.pgid;
7336}
7337
7338double OSD::scrub_sleep_time(bool must_scrub)
7339{
7340 if (must_scrub) {
7341 return cct->_conf->osd_scrub_sleep;
7342 }
7343 utime_t now = ceph_clock_now();
7344 if (scrub_time_permit(now)) {
7345 return cct->_conf->osd_scrub_sleep;
7346 }
7347 double normal_sleep = cct->_conf->osd_scrub_sleep;
7348 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7349 return std::max(extended_sleep, normal_sleep);
7350}
7351
7352bool OSD::scrub_time_permit(utime_t now)
7353{
7354 struct tm bdt;
7355 time_t tt = now.sec();
7356 localtime_r(&tt, &bdt);
7357
7358 bool day_permit = false;
7359 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7360 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7361 day_permit = true;
7362 }
7363 } else {
7364 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7365 day_permit = true;
7366 }
7367 }
7368
7369 if (!day_permit) {
7370 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7371 << " - " << cct->_conf->osd_scrub_end_week_day
7372 << " now " << bdt.tm_wday << " = no" << dendl;
7373 return false;
7374 }
7375
7376 bool time_permit = false;
7377 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7378 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7379 time_permit = true;
7380 }
7381 } else {
7382 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7383 time_permit = true;
7384 }
7385 }
7386 if (!time_permit) {
7387 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7388 << " - " << cct->_conf->osd_scrub_end_hour
7389 << " now " << bdt.tm_hour << " = no" << dendl;
7390 } else {
7391 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7392 << " - " << cct->_conf->osd_scrub_end_hour
7393 << " now " << bdt.tm_hour << " = yes" << dendl;
7394 }
7395 return time_permit;
7396}
7397
7398bool OSD::scrub_load_below_threshold()
7399{
7400 double loadavgs[3];
7401 if (getloadavg(loadavgs, 3) != 3) {
7402 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7403 return false;
7404 }
7405
7406 // allow scrub if below configured threshold
7407 long cpus = sysconf(_SC_NPROCESSORS_ONLN);
7408 double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
7409 if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
7410 dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
7411 << " < max " << cct->_conf->osd_scrub_load_threshold
7412 << " = yes" << dendl;
7413 return true;
7414 }
7415
7416 // allow scrub if below daily avg and currently decreasing
7417 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7418 dout(20) << __func__ << " loadavg " << loadavgs[0]
7419 << " < daily_loadavg " << daily_loadavg
7420 << " and < 15m avg " << loadavgs[2]
7421 << " = yes" << dendl;
7422 return true;
7423 }
7424
7425 dout(20) << __func__ << " loadavg " << loadavgs[0]
7426 << " >= max " << cct->_conf->osd_scrub_load_threshold
7427 << " and ( >= daily_loadavg " << daily_loadavg
7428 << " or >= 15m avg " << loadavgs[2]
7429 << ") = no" << dendl;
7430 return false;
7431}
7432
void OSD::sched_scrub()
{
  // Walk the scrub-job registry in scheduled-time order and try to kick
  // off scrubs, honoring the global scrub-count limit, active recovery,
  // the configured time window, and system load.

  // if not permitted, fail fast
  if (!service.can_inc_scrubs()) {
    return;
  }
  bool allow_requested_repair_only = false;
  if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
    if (!cct->_conf->osd_repair_during_recovery) {
      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
      return;
    }
    dout(10) << __func__
	     << " will only schedule explicitly requested repair due to active recovery"
	     << dendl;
    allow_requested_repair_only = true;
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      if (scrub.sched_time > now) {
	// save ourselves some effort: jobs are ordered by sched_time, so
	// everything after this one is in the future as well
	dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
		 << " > " << now << dendl;
	break;
      }

      // a job whose deadline has already passed may run even outside the
      // time window or under high load; otherwise both must permit
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
	dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
		 << (!time_permit ? "time not permit" : "high load") << dendl;
	continue;
      }

      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
	continue;
      // This has already started, so go on to the next scrub job
      if (pg->scrubber.active) {
	pg->unlock();
	dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
	continue;
      }
      // Skip other kinds of scrubbing if only explicitly requested repairing is allowed
      if (allow_requested_repair_only && !pg->scrubber.must_repair) {
	pg->unlock();
	dout(10) << __func__ << " skip " << scrub.pgid
		 << " because repairing is not explicitly requested on it"
		 << dendl;
	continue;
      }
      // If it is reserving, let it resolve before going to the next scrub job
      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
	pg->unlock();
	dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
	break;
      }
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
	       << (pg->get_must_scrub() ? ", explicitly requested" :
		   (load_is_low ? ", load_is_low" : " deadline < now"))
	       << dendl;
      if (pg->sched_scrub()) {
	// a scrub was kicked off; one per tick is enough
	pg->unlock();
	break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
7510
7511void OSD::resched_all_scrubs()
7512{
7513 dout(10) << __func__ << ": start" << dendl;
7514 const vector<spg_t> pgs = [this] {
7515 vector<spg_t> pgs;
7516 OSDService::ScrubJob job;
7517 if (service.first_scrub_stamp(&job)) {
7518 do {
7519 pgs.push_back(job.pgid);
7520 } while (service.next_scrub_stamp(job, &job));
7521 }
7522 return pgs;
7523 }();
7524 for (auto& pgid : pgs) {
7525 dout(20) << __func__ << ": examine " << pgid << dendl;
7526 PGRef pg = _lookup_lock_pg(pgid);
7527 if (!pg)
7528 continue;
7529 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
7530 dout(15) << __func__ << ": reschedule " << pgid << dendl;
7531 pg->on_info_history_change();
7532 }
7533 pg->unlock();
7534 }
7535 dout(10) << __func__ << ": done" << dendl;
7536}
7537
7538MPGStats* OSD::collect_pg_stats()
7539{
7540 // This implementation unconditionally sends every is_primary PG's
7541 // stats every time we're called. This has equivalent cost to the
7542 // previous implementation's worst case where all PGs are busy and
7543 // their stats are always enqueued for sending.
7544 std::shared_lock l{map_lock};
7545
7546 osd_stat_t cur_stat = service.get_osd_stat();
7547 cur_stat.os_perf_stat = store->get_cur_stats();
7548
7549 auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
7550 m->osd_stat = cur_stat;
7551
7552 std::lock_guard lec{min_last_epoch_clean_lock};
7553 min_last_epoch_clean = get_osdmap_epoch();
7554 min_last_epoch_clean_pgs.clear();
7555
7556 std::set<int64_t> pool_set;
7557 vector<PGRef> pgs;
7558 _get_pgs(&pgs);
7559 for (auto& pg : pgs) {
7560 auto pool = pg->pg_id.pgid.pool();
7561 pool_set.emplace((int64_t)pool);
7562 if (!pg->is_primary()) {
7563 continue;
7564 }
7565 pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
7566 m->pg_stat[pg->pg_id.pgid] = s;
7567 min_last_epoch_clean = min(min_last_epoch_clean, lec);
7568 min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
7569 });
7570 }
7571 store_statfs_t st;
7572 bool per_pool_stats = false;
7573 bool per_pool_omap_stats = false;
7574 for (auto p : pool_set) {
7575 int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
7576 if (r == -ENOTSUP) {
7577 break;
7578 } else {
7579 assert(r >= 0);
7580 m->pool_stat[p] = st;
7581 per_pool_stats = true;
7582 }
7583 }
7584
7585 // indicate whether we are reporting per-pool stats
7586 m->osd_stat.num_osds = 1;
7587 m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
7588 m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;
7589
7590 return m;
7591}
7592
vector<DaemonHealthMetric> OSD::get_health_metrics()
{
  // Build the daemon health metrics reported to the mgr: the number of
  // slow (older than osd_op_complaint_time) in-flight ops plus the age of
  // the oldest, and the number of pending primary pg creations.
  vector<DaemonHealthMetric> metrics;
  {
    utime_t oldest_secs;
    const utime_t now = ceph_clock_now();
    auto too_old = now;
    too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
    int slow = 0;
    TrackedOpRef oldest_op;
    // visitor: counts slow ops, logs each one to the cluster log, and
    // tracks the oldest of them
    auto count_slow_ops = [&](TrackedOp& op) {
      if (op.get_initiated() < too_old) {
	stringstream ss;
	ss << "slow request " << op.get_desc()
	   << " initiated "
	   << op.get_initiated()
	   << " currently "
	   << op.state_string();
	lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
	clog->warn() << ss.str();
	slow++;
	if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
	  oldest_op = &op;
	}
	return true;
      } else {
	return false;
      }
    };
    if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
      if (slow) {
	derr << __func__ << " reporting " << slow << " slow ops, oldest is "
	     << oldest_op->get_desc() << dendl;
      }
      metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
    } else {
      // no news is not good news.
      metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
    }
  }
  {
    std::lock_guard l(pending_creates_lock);
    auto n_primaries = pending_creates_from_mon;
    // NOTE(review): create.second appears to flag creates for which we
    // would be the primary -- confirm against the pending_creates_from_osd
    // producers
    for (const auto& create : pending_creates_from_osd) {
      if (create.second) {
	n_primaries++;
      }
    }
    metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
  }
  return metrics;
}
7645
7646// =====================================================
7647// MAP
7648
7649void OSD::wait_for_new_map(OpRequestRef op)
7650{
7651 // ask?
7652 if (waiting_for_osdmap.empty()) {
7653 osdmap_subscribe(get_osdmap_epoch() + 1, false);
7654 }
7655
7656 logger->inc(l_osd_waiting_for_map);
7657 waiting_for_osdmap.push_back(op);
7658 op->mark_delayed("wait for new map");
7659}
7660
7661
7662/** update_map
7663 * assimilate new OSDMap(s). scan pgs, etc.
7664 */
7665
7666void OSD::note_down_osd(int peer)
7667{
7668 ceph_assert(ceph_mutex_is_locked(osd_lock));
7669 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7670
7671 std::lock_guard l{heartbeat_lock};
7672 failure_queue.erase(peer);
7673 failure_pending.erase(peer);
7674 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7675 if (p != heartbeat_peers.end()) {
7676 p->second.clear_mark_down();
7677 heartbeat_peers.erase(p);
7678 }
7679}
7680
void OSD::note_up_osd(int peer)
{
  // A peer came (back) up in the new map: flag the heartbeat peer set for
  // a refresh so connections get (re)established as needed.
  heartbeat_set_peers_need_update();
}
7685
// Completion context queued with the osdmap transaction: once the maps
// for epochs [first, last] are committed to disk, finish processing them
// via OSD::_committed_osd_maps() and release the originating message.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // epoch range covered by the committed transaction
  MOSDMap *msg;         // message that delivered the maps; put() on finish
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
7697
7698void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7699{
7700 std::lock_guard l(osdmap_subscribe_lock);
7701 if (latest_subscribed_epoch >= epoch && !force_request)
7702 return;
7703
7704 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7705
7706 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7707 force_request) {
7708 monc->renew_subs();
7709 }
7710}
7711
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // Delete stored osdmaps (full and incremental) older than both `oldest`
  // and the map cache's lower bound, advancing superblock.oldest_map as
  // we go.  Work is committed in batches to keep transactions bounded.
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    // flush a batch once it is both big enough and at least as large as
    // the number of maps we just received
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg. so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  if (num > 0) {
    // commit the final partial batch
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
7751
void OSD::handle_osd_map(MOSDMap *m)
{
  // Ingest a batch of new osdmaps: throttle if PGs lag too far behind,
  // validate the sender, persist the maps plus derived metadata
  // (pg_num history, purged snaps, superblock) in one transaction, and
  // finish asynchronously in _committed_osd_maps() once it commits.

  // wait for pgs to catch up
  {
    // we extend the map cache pins to accommodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop ingesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    // osd_min = the oldest epoch any PG (across all shards) is still at
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
	osd_min = min;
      }
    }
    epoch_t osdmap_epoch = get_osdmap_epoch();
    if (osd_min > 0 &&
	osdmap_epoch > max_lag &&
	osdmap_epoch - max_lag > osd_min) {
      epoch_t need = osdmap_epoch - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
	       << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
	epoch_t min = shard->get_min_pg_epoch();
	if (need > min) {
	  dout(10) << __func__ << " waiting for pgs to consume " << need
		   << " (shard " << shard->shard_id << " min " << min
		   << ", map cache is " << cct->_conf->osd_map_cache_size
		   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
		   << ")" << dendl;
	  // drop osd_lock while blocking so PGs can make progress
	  unlock_guard unlock{osd_lock};
	  shard->wait_min_pg_epoch(need);
	}
      }
    }
  }

  ceph_assert(ceph_mutex_is_locked(osd_lock));
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
	    << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only accept maps from mons and other osds
  auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
  if (session && !(session->entity_name.is_mon() ||
		   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
	     << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
	  << superblock.newest_map
	  << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
	  << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
	     << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    if (m->oldest_map <= superblock.newest_map + 1) {
      // the sender still has the epochs we are missing: re-subscribe
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can.  this
    //  1- is good to have
    //  2- is at present the only way to ensure that we get a *full* map as
    //     the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // NOTE(review): guard against the transaction byte count failing to
    // advance between iterations (interpreted as size overflow) -- confirm
    // that get_num_bytes() is nonzero for an empty transaction
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // a full map was supplied for this epoch: store and cache it as-is
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // only an incremental: apply it to the previous full map, then
      // re-encode and store the resulting full map as well
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
	bufferlist obl;
	bool got = get_map_bl(e - 1, obl);
	if (!got) {
	  // the previous map must have arrived in this same message
	  auto p = added_maps_bl.find(e - 1);
	  ceph_assert(p != added_maps_bl.end());
	  obl = p->second;
	}
	o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);

      if (o->apply_incremental(inc) < 0) {
	derr << "ERROR: bad fsid?  i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
	ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // optional fault injection for crc-mismatch handling below
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
	  (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
	derr << __func__ << " injecting map crc failure" << dendl;
	injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
	// our re-encoded full map doesn't match the mon's crc: fall back
	// to requesting the full maps for this range
	dout(2) << "got incremental " << e
		<< " but failed to encode full with correct crc; requesting"
		<< dendl;
	clog->warn() << "failed to encode map e" << e << " with expected crc";
	dout(20) << "my encoded map was:\n";
	fbl.hexdump(*_dout);
	*_dout << dendl;
	delete o;
	request_full_map(e, last);
	last = e - 1;

	// don't continue committing if we failed to enc the first inc map
	if (last < start) {
	  dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
	  m->put();
	  return;
	}
	break;
      }
      got_full_map(e);
      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
	     << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
	dout(10) << __func__ << " can't get previous map " << i.first - 1
		 << " probably first start of this osd" << dendl;
	continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
	pg_num_history.log_pool_delete(i.first, j.first);
	dout(10) << __func__ << " recording final pg_pool_t for pool "
		 << j.first << dendl;
	// this information is needed by _make_pg() if have to restart before
	// the pool is deleted and need to instantiate a new (zombie) PG[Pool].
	ghobject_t obj = make_final_pool_info_oid(j.first);
	bufferlist bl;
	encode(j.second, bl, CEPH_FEATURES_ALL);
	string name = lastmap->get_pool_name(j.first);
	encode(name, bl);
	map<string,string> profile;
	if (lastmap->get_pg_pool(j.first)->is_erasure()) {
	  profile = lastmap->get_erasure_code_profile(
	    lastmap->get_pg_pool(j.first)->erasure_code_profile);
	}
	encode(profile, bl);
	t.write(coll_t::meta(), obj, 0, bl.length(), bl);
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
		 new_pg_num != j.second.get_pg_num()) {
	dout(10) << __func__ << " recording pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
	dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
		 << j.second.get_pg_num() << dendl;
	pg_num_history.log_pg_num_change(i.first, j.first,
					 j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    // persist the updated pg_num history alongside the maps
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // record new purged_snaps
  if (superblock.purged_snaps_last == start - 1) {
    SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
				    make_purged_snaps_oid(), &t,
				    purged_snaps);
    superblock.purged_snaps_last = last;
  } else {
    dout(10) << __func__ << " superblock purged_snaps_last is "
	     << superblock.purged_snaps_last
	     << ", not recording new purged_snaps" << dendl;
  }

  // superblock and commit
  write_superblock(t);
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8072
// Completion for C_OnMapCommit: runs once the transaction persisting
// maps [first..last] has committed.  Advances the published OSDMap one
// epoch at a time, notes peers that changed up/down state, updates our
// boot/up epochs, and decides whether the new map requires us to
// restart (rebind messengers and re-boot) or shut down entirely.
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check under osd_lock; shutdown may have started while we waited
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.lock();

  ceph_assert(first <= last);

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  OSDMapRef osdmap = get_osdmap();

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
	     << " (<= last " << last
	     << " <= newest_map " << superblock.newest_map
	     << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap = get_osdmap();
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
	  osdmap->is_up(*p) && // in old map
	  newmap->is_down(*p)) {    // but not the new one
	if (!waited_for_reservations) {
	  // drain map reservations once (only if at least one peer
	  // actually went down) before tearing down connections
	  service.await_reserved_maps();
	  waited_for_reservations = true;
	}
	note_down_osd(*p);
      } else if (*p != whoami &&
		 osdmap->is_down(*p) &&
		 newmap->is_up(*p)) {
	note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
	       << dendl;
      if (is_booting()) {
	// this captures the case where we sent the boot message while
	// NOUP was being set on the mon and our boot request was
	// dropped, and then later it is cleared.  it imperfectly
	// handles the case where our original boot message was not
	// dropped and we restart even though we might have booted, but
	// that is harmless (boot will just take slightly longer).
	do_restart = true;
      }
    }

    osdmap = std::move(newmap);
    set_osdmap(osdmap);
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    // first epoch in which this map shows us up at our current address:
    // record it (and the boot epoch, if unset) in the service
    if (!up_epoch &&
	osdmap->is_up(whoami) &&
	osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
	boot_epoch = osdmap->get_epoch();
	dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  // if the final map shows us up at the address we bound after our
  // last rebind, the boot we were waiting for has completed
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
	client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      derr << "map says i do not exist.  shutting down." << dendl;
      do_shutdown = true;   // don't call shutdown() while we have
			    // everything paused
    } else if (osdmap->is_stop(whoami)) {
      derr << "map says i am stopped by admin. shutting down." << dendl;
      do_shutdown = true;
    } else if (!osdmap->is_up(whoami) ||
	       !osdmap->get_addrs(whoami).legacy_equals(
		 client_messenger->get_myaddrs()) ||
	       !osdmap->get_cluster_addrs(whoami).legacy_equals(
		 cluster_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_back_addrs(whoami).legacy_equals(
		 hb_back_server_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_front_addrs(whoami).legacy_equals(
		 hb_front_server_messenger->get_myaddrs())) {
      // the map marked us down, or records stale addresses for any of
      // our messengers: log why, then rebind and restart below
      if (!osdmap->is_up(whoami)) {
	if (service.is_preparing_to_stop() || service.is_stopping()) {
	  service.got_stop_ack();
	} else {
	  clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
			  "but it is still running";
	  clog->debug() << "map e" << osdmap->get_epoch()
			<< " wrongly marked me down at e"
			<< osdmap->get_down_at(whoami);
	}
	if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
	  // note that this is best-effort...
	  monc->send_mon_message(
	    new MOSDMarkMeDead(
	      monc->get_fsid(),
	      whoami,
	      osdmap->get_epoch()));
	}
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
		   client_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong client addr (" << osdmap->get_addrs(whoami)
		      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
		   cluster_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong cluster addr ("
		      << osdmap->get_cluster_addrs(whoami)
		      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
		   hb_back_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat back addr ("
		      << osdmap->get_hb_back_addrs(whoami)
		      << " != my " << hb_back_server_messenger->get_myaddrs()
		      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
		   hb_front_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat front addr ("
		      << osdmap->get_hb_front_addrs(whoami)
		      << " != my " << hb_front_server_messenger->get_myaddrs()
		      << ")";
      }

      if (!service.is_stopping()) {
	// prepare to rebind and re-boot; clear up_epoch and remember
	// the epoch at which we rebound
	epoch_t up_epoch = 0;
	epoch_t bind_epoch = osdmap->get_epoch();
	service.set_epochs(NULL,&up_epoch, &bind_epoch);
	do_restart = true;

	//add markdown log
	utime_t now = ceph_clock_now();
	utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
	osd_markdown_log.push_back(now);
	// too many markdowns within the grace period: give up and shut
	// down instead of flapping forever
	if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
	  derr << __func__ << " marked down "
	       << osd_markdown_log.size()
	       << " > osd_max_markdown_count "
	       << cct->_conf->osd_max_markdown_count
	       << " in last " << grace << " seconds, shutting down"
	       << dendl;
	  do_restart = false;
	  do_shutdown = true;
	}

	start_waiting_for_healthy();

	set<int> avoid_ports;
#if defined(__FreeBSD__)
	// prevent FreeBSD from grabbing the client_messenger port during
	// rebinding. In which case a cluster_messenger will connect also
	// to the same port
	client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
	cluster_messenger->get_myaddrs().get_ports(&avoid_ports);

	int r = cluster_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  derr << __func__ << " marked down:"
	       << " rebind cluster_messenger failed" << dendl;
	}

	hb_back_server_messenger->mark_down_all();
	hb_front_server_messenger->mark_down_all();
	hb_front_client_messenger->mark_down_all();
	hb_back_client_messenger->mark_down_all();

	reset_heartbeat_peers(true);
      }
    }
  }

  map_lock.unlock();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    // the sender knows of newer maps than we just applied; keep pulling
    dout(10) << " msg say newest map is " << m->newest_map
	     << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8328
// Reconcile messenger feature requirements, the superblock's compat
// set, heartbeat authorizer requirements, and the persisted
// require_osd_release with the features demanded by the current OSDMap
// (principally its CRUSH map).
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures at their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  const auto osdmap = get_osdmap();
  // clients: adjust the default policy
  {
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for clients" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  // monitors: adjust the mon-specific policy on the client messenger
  {
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << " was " << p.features_required
	      << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  // peer OSDs: adjust the cluster messenger policy
  {
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // persist the SHARDS on-disk compat bit the first time through
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // heartbeat authorizers are only required from nautilus on
  if (osdmap->require_osd_release < ceph_release_t::nautilus) {
    hb_front_server_messenger->set_require_authorizer(false);
    hb_back_server_messenger->set_require_authorizer(false);
  } else {
    hb_front_server_messenger->set_require_authorizer(true);
    hb_back_server_messenger->set_require_authorizer(true);
  }

  // persist require_osd_release in the store's metadata when it changes
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
	    << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
		      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8402
8403struct C_FinishSplits : public Context {
8404 OSD *osd;
8405 set<PGRef> pgs;
8406 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8407 : osd(osd), pgs(in) {}
8408 void finish(int r) override {
8409 osd->_finish_splits(pgs);
8410 }
8411};
8412
8413void OSD::_finish_splits(set<PGRef>& pgs)
8414{
8415 dout(10) << __func__ << " " << pgs << dendl;
8416 if (is_stopping())
8417 return;
8418 for (set<PGRef>::iterator i = pgs.begin();
8419 i != pgs.end();
8420 ++i) {
8421 PG *pg = i->get();
8422
8423 PeeringCtx rctx = create_context();
8424 pg->lock();
8425 dout(10) << __func__ << " " << *pg << dendl;
8426 epoch_t e = pg->get_osdmap_epoch();
8427 pg->handle_initialize(rctx);
8428 pg->queue_null(e, e);
8429 dispatch_context(rctx, pg, service.get_osdmap());
8430 pg->unlock();
8431
8432 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8433 shards[shard_index]->register_and_wake_split_child(pg);
8434 }
8435};
8436
8437bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8438 unsigned need)
8439{
8440 std::lock_guard l(merge_lock);
8441 auto& p = merge_waiters[nextmap->get_epoch()][target];
8442 p[src->pg_id] = src;
8443 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8444 << " for " << target << ", have " << p.size() << "/" << need
8445 << dendl;
8446 return p.size() == need;
8447}
8448
// Advance a (locked) PG through every map epoch up to osd_epoch,
// handling pool pg_num changes along the way: merge sources are torn
// down and parked as merge waiters, merge targets absorb their sources
// once all have arrived, and splits spawn child PGs (collected in
// new_pgs and finished via C_FinishSplits).
//
// Returns true if the PG was fully advanced (caller keeps going);
// false if advancement stopped early for a merge (the PG lock has
// already been released in that case).
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs; // any split children
  bool ret = true;

  // pg_num of our pool in the map the PG currently has (0 if the pool
  // no longer exists there)
  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // map already trimmed; skip ahead to the next one we still have
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source
	  PGRef spg = pg; // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  if (!new_pgs.empty()) {
	    rctx.transaction.register_on_applied(new C_FinishSplits(this,
								    new_pgs));
	    new_pgs.clear();
	  }
	  dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	  pg->ch->flush();
	  // release backoffs explicitly, since the on_shutdown path
	  // aggressively tears down backoff state.
	  if (pg->is_primary()) {
	    pg->release_pg_backoffs();
	  }
	  // tear the source PG down; the target will absorb its state
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_nonprimary())
		logger->dec(l_osd_pg_replica); // misnomer
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  // park ourselves as a merge waiter; if we were the last
	  // source to arrive, poke the target so it can merge
	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    // claim the waiters only if every source has arrived
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    if (!new_pgs.empty()) {
	      rctx.transaction.register_on_applied(new C_FinishSplits(this,
								      new_pgs));
	      new_pgs.clear();
	    }
	    dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    // normal map advance: recompute mapping and feed it to the PG
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
	&& newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is change from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      // unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8655
// Publish the current OSDMap to the rest of the OSD: prime upcoming
// splits/merges on each shard, prune stale creates, refresh PG
// counters, wake sessions waiting on the map, and queue null peering
// events so every PG advances to the new epoch.  Caller holds osd_lock.
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   * speak the older sorting version any more. Be careful not to force
   * a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      shard->prime_splits(osdmap, &newly_split);
    }
    // prime_splits consumes the entries it handles
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    std::lock_guard l(pending_creates_lock);
    // drop pending creates for PGs that no longer map to this OSD
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8773
8774void OSD::activate_map()
8775{
8776 ceph_assert(ceph_mutex_is_locked(osd_lock));
8777 auto osdmap = get_osdmap();
8778
8779 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8780
8781 // norecover?
8782 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8783 if (!service.recovery_is_paused()) {
8784 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8785 service.pause_recovery();
8786 }
8787 } else {
8788 if (service.recovery_is_paused()) {
8789 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8790 service.unpause_recovery();
8791 }
8792 }
8793
8794 service.activate_map();
8795
8796 // process waiters
8797 take_waiters(waiting_for_osdmap);
8798}
8799
8800bool OSD::require_mon_peer(const Message *m)
8801{
8802 if (!m->get_connection()->peer_is_mon()) {
8803 dout(0) << "require_mon_peer received from non-mon "
8804 << m->get_connection()->get_peer_addr()
8805 << " " << *m << dendl;
8806 return false;
8807 }
8808 return true;
8809}
8810
8811bool OSD::require_mon_or_mgr_peer(const Message *m)
8812{
8813 if (!m->get_connection()->peer_is_mon() &&
8814 !m->get_connection()->peer_is_mgr()) {
8815 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8816 << m->get_connection()->get_peer_addr()
8817 << " " << *m << dendl;
8818 return false;
8819 }
8820 return true;
8821}
8822
8823bool OSD::require_osd_peer(const Message *m)
8824{
8825 if (!m->get_connection()->peer_is_osd()) {
8826 dout(0) << "require_osd_peer received from non-osd "
8827 << m->get_connection()->get_peer_addr()
8828 << " " << *m << dendl;
8829 return false;
8830 }
8831 return true;
8832}
8833
8834bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8835{
8836 epoch_t up_epoch = service.get_up_epoch();
8837 if (epoch < up_epoch) {
8838 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8839 return false;
8840 }
8841
8842 if (!is_active()) {
8843 dout(7) << "still in boot state, dropping message " << *m << dendl;
8844 return false;
8845 }
8846
8847 return true;
8848}
8849
// Verify the sending OSD is still up in `map` at the address it sent
// from.  If not, mark the connection down, detach and clean up its
// session, and return false so the caller drops the message.
// `is_fast_dispatch` tells us whether the session_dispatch_lock may be
// taken here (fast dispatch already holds it / must not block on it).
bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
				     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addrs(from) != m->get_source_addrs())) {
    dout(5) << "from dead osd." << from << ", marking down, "
	    << " msg was " << m->get_source_inst().addr
	    << " expected "
	    << (map->is_up(from) ?
		map->get_cluster_addrs(from) : entity_addrvec_t())
	    << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
      if (!is_fast_dispatch)
	s->session_dispatch_lock.lock();
      clear_session_waiting_on_map(s);
      con->set_priv(nullptr);   // break ref <-> session cycle, if any
      s->con.reset();
      if (!is_fast_dispatch)
	s->session_dispatch_lock.unlock();
    }
    return false;
  }
  return true;
}
8878
8879
8880/*
8881 * require that we have same (or newer) map, and that
8882 * the source is the pg primary.
8883 */
8884bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8885 bool is_fast_dispatch)
8886{
8887 const Message *m = op->get_req();
8888 const auto osdmap = get_osdmap();
8889 dout(15) << "require_same_or_newer_map " << epoch
8890 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8891
8892 ceph_assert(ceph_mutex_is_locked(osd_lock));
8893
8894 // do they have a newer map?
8895 if (epoch > osdmap->get_epoch()) {
8896 dout(7) << "waiting for newer map epoch " << epoch
8897 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8898 wait_for_new_map(op);
8899 return false;
8900 }
8901
8902 if (!require_self_aliveness(op->get_req(), epoch)) {
8903 return false;
8904 }
8905
8906 // ok, our map is same or newer.. do they still exist?
8907 if (m->get_connection()->get_messenger() == cluster_messenger &&
8908 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8909 return false;
8910 }
8911
8912 return true;
8913}
8914
8915
8916
8917
8918
8919// ----------------------------------------
8920// pg creation
8921
// Split a (locked) parent PG into the given child pgids: create each
// child PG and its collection, split the parent's collections and
// in-memory state into it, and distribute the parent's stats.  The
// children are returned in *out_pgs (still registered later via
// C_FinishSplits); the actual on-disk work rides on rctx.transaction.
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // one stat bucket per child, plus a final one for the parent
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the child collection's commit completions to its shard
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx.transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // the final stat bucket belongs to the parent itself
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
8975
8976/*
8977 * holding osd_lock
8978 */
8979void OSD::handle_pg_create(OpRequestRef op)
8980{
8981 // NOTE: this can be removed in P release (mimic is the last version to
8982 // send MOSDPGCreate messages).
8983
8984 auto m = op->get_req<MOSDPGCreate>();
8985 ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);
8986
8987 dout(10) << "handle_pg_create " << *m << dendl;
8988
8989 if (!require_mon_peer(op->get_req())) {
8990 return;
8991 }
8992
8993 if (!require_same_or_newer_map(op, m->epoch, false))
8994 return;
8995
8996 op->mark_started();
8997
8998 const auto osdmap = get_osdmap();
8999 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
9000 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
9001 p != m->mkpg.end();
9002 ++p, ++ci) {
9003 ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
9004 epoch_t created = p->second.created;
9005 if (p->second.split_bits) // Skip split pgs
9006 continue;
9007 pg_t on = p->first;
9008
9009 if (!osdmap->have_pg_pool(on.pool())) {
9010 dout(20) << "ignoring pg on deleted pool " << on << dendl;
9011 continue;
9012 }
9013
9014 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
9015
9016 spg_t pgid;
9017 bool mapped = osdmap->get_primary_shard(on, &pgid);
9018 ceph_assert(mapped);
9019
9020 // is it still ours?
9021 vector<int> up, acting;
9022 int up_primary = -1;
9023 int acting_primary = -1;
9024 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
9025 int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);
9026
9027 if (acting_primary != whoami) {
9028 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
9029 << "), my role=" << role << ", skipping" << dendl;
9030 continue;
9031 }
9032
9033
9034 PastIntervals pi;
9035 pg_history_t history;
9036 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
9037
9038 // The mon won't resend unless the primary changed, so we ignore
9039 // same_interval_since. We'll pass this history with the current
9040 // epoch as the event.
9041 if (history.same_primary_since > m->epoch) {
9042 dout(10) << __func__ << ": got obsolete pg create on pgid "
9043 << pgid << " from epoch " << m->epoch
9044 << ", primary changed in " << history.same_primary_since
9045 << dendl;
9046 continue;
9047 }
9048 enqueue_peering_evt(
9049 pgid,
9050 PGPeeringEventRef(
9051 std::make_shared<PGPeeringEvent>(
9052 osdmap->get_epoch(),
9053 osdmap->get_epoch(),
9054 NullEvt(),
9055 true,
9056 new PGCreateInfo(
9057 pgid,
9058 osdmap->get_epoch(),
9059 history,
9060 pi,
9061 true)
9062 )));
9063 }
9064
9065 {
9066 std::lock_guard l(pending_creates_lock);
9067 if (pending_creates_from_mon == 0) {
9068 last_pg_create_epoch = m->epoch;
9069 }
9070 }
9071
9072 maybe_update_heartbeat_peers();
9073}
9074
9075
9076// ----------------------------------------
9077// peering and recovery
9078
9079PeeringCtx OSD::create_context()
9080{
9081 return PeeringCtx(get_osdmap()->require_osd_release);
9082}
9083
// Flush a peering context: send any accumulated per-OSD messages (if we
// are up and active), then queue the accumulated transaction on the PG's
// collection handle.  Messages to down OSDs or unreachable peers are
// silently dropped.
void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
                           ThreadPool::TPHandle *handle)
{
  if (!service.get_osdmap()->is_up(whoami)) {
    dout(20) << __func__ << " not up in osdmap" << dendl;
  } else if (!is_active()) {
    dout(20) << __func__ << " not active" << dendl;
  } else {
    for (auto& [osd, ls] : ctx.message_map) {
      // skip peers the current map says are down
      if (!curmap->is_up(osd)) {
	dout(20) << __func__ << " skipping down osd." << osd << dendl;
	continue;
      }
      ConnectionRef con = service.get_con_osd_cluster(
	osd, curmap->get_epoch());
      if (!con) {
	dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
		 << dendl;
	continue;
      }
      // make sure the peer has (at least) our map before the messages land
      service.maybe_share_map(con.get(), curmap);
      for (auto m : ls) {
	con->send_message2(m);
      }
      // clear so a later dispatch of the same ctx does not resend
      ls.clear();
    }
  }
  // queue the transaction only when there is actual work and a PG to
  // attribute it to
  if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
    int tr = store->queue_transaction(
      pg->ch,
      std::move(ctx.transaction), TrackedOpRef(),
      handle);
    ceph_assert(tr == 0);
  }
}
9119
// Handle an MOSDPGCreate2 from the monitor (fast dispatch path): queue a
// pg-creating peering event for each requested PG.  Octopus+ mons include
// history/past_intervals in pg_extra; older mons do not.
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    auto q = m->pg_extra.find(pgid);
    if (q == m->pg_extra.end()) {
      dout(20) << __func__ << " " << pgid << " e" << created
	       << "@" << created_stamp
	       << " (no history or past_intervals)" << dendl;
      // pre-octopus ... no pg history.  this can be removed in Q release.
      // synthesize a minimal history from the creation epoch/stamp
      enqueue_peering_evt(
	pgid,
	PGPeeringEventRef(
	  std::make_shared<PGPeeringEvent>(
	    m->epoch,
	    m->epoch,
	    NullEvt(),
	    true,
	    new PGCreateInfo(
	      pgid,
	      created,
	      pg_history_t(created, created_stamp),
	      PastIntervals(),
	      true)
	    )));
    } else {
      dout(20) << __func__ << " " << pgid << " e" << created
	       << "@" << created_stamp
	       << " history " << q->second.first
	       << " pi " << q->second.second << dendl;
      // sanity: the supplied past_intervals must not extend beyond the
      // message epoch; if they do, complain loudly and skip the create
      if (!q->second.second.empty() &&
	  m->epoch < q->second.second.get_bounds().second) {
	clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
		      << " and unmatched past_intervals " << q->second.second
		      << " (history " << q->second.first << ")";
      } else {
	enqueue_peering_evt(
	  pgid,
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      m->epoch,
	      m->epoch,
	      NullEvt(),
	      true,
	      new PGCreateInfo(
		pgid,
		m->epoch,
		q->second.first,
		q->second.second,
		true)
	      )));
      }
    }
  }

  {
    std::lock_guard l(pending_creates_lock);
    // only advance the watermark if no mon-requested creates are in flight
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9191
// Handle an MOSDPGQuery from a peer OSD (fast dispatch path): convert each
// per-PG query into an MQuery peering event.  requires_pg=false so a query
// for a PG we do not have is answered via handle_pg_query_nopg().
void OSD::handle_fast_pg_query(MOSDPGQuery *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->pg_list) {
    enqueue_peering_evt(
      p.first,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  p.second.epoch_sent, p.second.epoch_sent,
	  MQuery(
	    p.first,
	    pg_shard_t(from, p.second.from),
	    p.second,
	    p.second.epoch_sent),
	  false))
      );
  }
  m->put();
}
9216
// Handle an MOSDPGNotify from a peer OSD (fast dispatch path): convert
// each notify into an MNotifyRec peering event.  requires_pg=true with a
// PGCreateInfo (create=false) so a missing PG can be instantiated from the
// supplied history/past_intervals if appropriate.
void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->get_pg_list()) {
    // address the shard the notify was aimed at
    spg_t pgid(p.info.pgid.pgid, p.to);
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  p.epoch_sent,
	  p.query_epoch,
	  MNotifyRec(
	    pgid, pg_shard_t(from, p.from),
	    p,
	    m->get_connection()->get_features()),
	  true,
	  new PGCreateInfo(
	    pgid,
	    p.query_epoch,
	    p.info.history,
	    p.past_intervals,
	    false)
	  )));
  }
  m->put();
}
9248
// Handle an MOSDPGInfo from a peer OSD (fast dispatch path): convert each
// entry into an MInfoRec peering event for the targeted shard.
void OSD::handle_fast_pg_info(MOSDPGInfo* m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->pg_list) {
    enqueue_peering_evt(
      spg_t(p.info.pgid.pgid, p.to),
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  p.epoch_sent, p.query_epoch,
	  MInfoRec(
	    pg_shard_t(from, p.from),
	    p.info,
	    p.epoch_sent)))
      );
  }
  m->put();
}
9271
// Handle an MOSDPGRemove from a peer OSD (fast dispatch path): queue a
// DeleteStart peering event for each listed PG to begin its removal.
void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  for (auto& pgid : m->pg_list) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  m->get_epoch(), m->get_epoch(),
	  PeeringState::DeleteStart())));
  }
  m->put();
}
9289
9290void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9291{
9292 dout(10) << __func__ << " " << *m << dendl;
9293 if (!require_mon_or_mgr_peer(m)) {
9294 m->put();
9295 return;
9296 }
9297 epoch_t epoch = get_osdmap_epoch();
9298 for (auto pgid : m->forced_pgs) {
9299 if (m->options & OFR_BACKFILL) {
9300 if (m->options & OFR_CANCEL) {
9301 enqueue_peering_evt(
9302 pgid,
9303 PGPeeringEventRef(
9304 std::make_shared<PGPeeringEvent>(
9305 epoch, epoch,
9306 PeeringState::UnsetForceBackfill())));
9307 } else {
9308 enqueue_peering_evt(
9309 pgid,
9310 PGPeeringEventRef(
9311 std::make_shared<PGPeeringEvent>(
9312 epoch, epoch,
9313 PeeringState::SetForceBackfill())));
9314 }
9315 } else if (m->options & OFR_RECOVERY) {
9316 if (m->options & OFR_CANCEL) {
9317 enqueue_peering_evt(
9318 pgid,
9319 PGPeeringEventRef(
9320 std::make_shared<PGPeeringEvent>(
9321 epoch, epoch,
9322 PeeringState::UnsetForceRecovery())));
9323 } else {
9324 enqueue_peering_evt(
9325 pgid,
9326 PGPeeringEventRef(
9327 std::make_shared<PGPeeringEvent>(
9328 epoch, epoch,
9329 PeeringState::SetForceRecovery())));
9330 }
9331 }
9332 }
9333 m->put();
9334}
9335
// Answer a peer's pg_query for a PG we do not have: if the pool still
// exists, reply with an empty log or an empty notify (depending on the
// query type) so the peer learns the PG does not exist here.
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  // if the pool is gone there is nothing useful to say; stay silent
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;

  dout(10) << " pg " << pgid << " dne" << dendl;
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
	q.query.type == pg_query_t::FULLLOG) {
      // log queries get an (empty) log reply
      m = new MOSDPGLog(
	q.query.from, q.query.to,
	osdmap->get_epoch(), empty,
	q.query.epoch_sent);
    } else {
      // all other query types get an (empty) notify
      vector<pg_notify_t> ls;
      ls.push_back(
	pg_notify_t(
	  q.query.from, q.query.to,
	  q.query.epoch_sent,
	  osdmap->get_epoch(),
	  empty,
	  PastIntervals()));
      m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
    }
    service.maybe_share_map(con.get(), osdmap);
    con->send_message(m);
  }
}
9371
// Queue a CheckReadable peering event for spgid, either immediately
// (delay == 0) or after the given delay via the monotonic timer, which
// re-enters this function with zero delay when it fires.
void OSDService::queue_check_readable(spg_t spgid,
				      epoch_t lpr,
				      ceph::signedspan delay)
{
  if (delay == ceph::signedspan::zero()) {
    osd->enqueue_peering_evt(
      spgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  lpr, lpr,
	  PeeringState::CheckReadable())));
  } else {
    // capture by value; the event may fire after the caller's frame is gone
    mono_timer.add_event(
      delay,
      [this, spgid, lpr]() {
	queue_check_readable(spgid, lpr);
      });
  }
}
9391
9392
9393// =========================================================
9394// RECOVERY
9395
// Drain the awaiting_throttle queue while the recovery throttle permits,
// reserving up to osd_recovery_max_single_start pushes per PG.
// Caller must hold recovery_lock.
void OSDService::_maybe_queue_recovery() {
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
	 _recover_now(&available_pushes)) {
    // start at most osd_recovery_max_single_start ops for this PG
    uint64_t to_start = std::min(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    dout(10) << __func__ << " starting " << to_start
	     << ", recovery_ops_reserved " << recovery_ops_reserved
	     << " -> " << (recovery_ops_reserved + to_start) << dendl;
    // account for the reservation; released in release_reserved_pushes()
    recovery_ops_reserved += to_start;
  }
}
9412
// Decide whether recovery may start now.  Returns false while recovery is
// deferred or paused, or when active+reserved ops have reached the cap;
// otherwise returns true and (optionally) reports how many more pushes
// may be started via *available_pushes.
bool OSDService::_recover_now(uint64_t *available_pushes)
{
  if (available_pushes)
    *available_pushes = 0;

  // honor a temporary deferral (e.g. osd_recovery_delay_start)
  if (ceph_clock_now() < defer_recovery_until) {
    dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
    return false;
  }

  if (recovery_paused) {
    dout(15) << __func__ << " paused" << dendl;
    return false;
  }

  uint64_t max = osd->get_recovery_max_active();
  if (max <= recovery_ops_active + recovery_ops_reserved) {
    dout(15) << __func__ << " active " << recovery_ops_active
	     << " + reserved " << recovery_ops_reserved
	     << " >= max " << max << dendl;
    return false;
  }

  if (available_pushes)
    *available_pushes = max - recovery_ops_active - recovery_ops_reserved;

  return true;
}
9441
9442unsigned OSDService::get_target_pg_log_entries() const
9443{
9444 auto num_pgs = osd->get_num_pgs();
9445 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9446 if (num_pgs > 0 && target > 0) {
9447 // target an even spread of our budgeted log entries across all
9448 // PGs. note that while we only get to control the entry count
9449 // for primary PGs, we'll normally be responsible for a mix of
9450 // primary and replica PGs (for the same pool(s) even), so this
9451 // will work out.
9452 return std::max<unsigned>(
9453 std::min<unsigned>(target / num_pgs,
9454 cct->_conf->osd_max_pg_log_entries),
9455 cct->_conf->osd_min_pg_log_entries);
9456 } else {
9457 // fall back to a per-pg value.
9458 return cct->_conf->osd_min_pg_log_entries;
9459 }
9460}
9461
// Run one batch of recovery ops for pg, honoring osd_recovery_sleep by
// rescheduling the work via a timer instead of blocking the worker thread.
// reserved_pushes are always released on exit (including early exit after
// a PG reset).
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      // defer: arm a timer that requeues this recovery instead of running now
      PGRef pgref(pg);
      auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
	         << ", re-queuing recovery" << dendl;
	std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.

      if (auto now = ceph::real_clock::now();
	  service.recovery_schedule_time < now) {
        service.recovery_schedule_time = now;
      }
      service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
				       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      // NOTE: we return without releasing reserved_pushes; the requeued
      // event still owns them
      return;
    }
  }

  {
    {
      std::lock_guard l(service.sleep_lock);
      // next invocation should sleep (if configured)
      service.recovery_needs_sleep = true;
    }

    // the PG may have been reset since this work item was queued
    if (pg->pg_has_reset_since(queued)) {
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
	     << " on " << *pg << dendl;

    // if there are unfound objects, kick off a peering round to hunt for them
    if (do_unfound) {
      PeeringCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
9537
// Account the start of one recovery op for soid on pg (bumps
// recovery_ops_active under recovery_lock).  Paired with
// finish_recovery_op().
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
	   << " (" << recovery_ops_active << "/"
	   << osd->get_recovery_max_active() << " rops)"
	   << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  // debug build: track the set of in-flight oids to catch double-starts
  dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
  recovery_oids[pg->pg_id].insert(soid);
#endif
}
9553
// Account the completion of one recovery op for soid on pg and kick the
// throttle so more queued recovery work can start.  Counterpart of
// start_recovery_op().
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
	   << " dequeue=" << dequeue
	   << " (" << recovery_ops_active << "/"
	   << osd->get_recovery_max_active() << " rops)"
	   << dendl;

  // adjust count
  ceph_assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  // debug build: the oid must have been registered by start_recovery_op
  dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid));
  recovery_oids[pg->pg_id].erase(soid);
#endif

  _maybe_queue_recovery();
}
9575
9576bool OSDService::is_recovery_active()
9577{
9578 if (cct->_conf->osd_debug_pretend_recovery_active) {
9579 return true;
9580 }
9581 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9582}
9583
// Return previously reserved recovery pushes to the pool and let any
// throttled recovery work proceed.
void OSDService::release_reserved_pushes(uint64_t pushes)
{
  std::lock_guard l(recovery_lock);
  dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
	   << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
	   << dendl;
  // cannot release more than was reserved
  ceph_assert(recovery_ops_reserved >= pushes);
  recovery_ops_reserved -= pushes;
  _maybe_queue_recovery();
}
9594
9595// =========================================================
9596// OPS
9597
9598bool OSD::op_is_discardable(const MOSDOp *op)
9599{
9600 // drop client request if they are not connected and can't get the
9601 // reply anyway.
9602 if (!op->get_connection()->is_connected()) {
9603 return true;
9604 }
9605 return false;
9606}
9607
// Queue a client/replica op for its PG on the sharded work queue, recording
// queueing latency and trace events along the way.
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  // time spent between receive and enqueue
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  const uint64_t owner = op->get_req()->get_source().num();

  dout(15) << "enqueue_op " << op << " prio " << priority
	   << " cost " << cost
	   << " latency " << latency
	   << " epoch " << epoch
	   << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);
  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  op_shardedwq.queue(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
      cost, priority, stamp, owner, epoch));
}
9631
// Queue a peering event for pgid on the sharded work queue at
// osd_peering_op_priority.  The cost (10) and owner (0) are nominal.
void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
{
  dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
  op_shardedwq.queue(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
      10,
      cct->_conf->osd_peering_op_priority,
      utime_t(),
      0,
      evt->get_epoch_sent()));
}
9644
9645/*
9646 * NOTE: dequeue called in worker thread, with pg lock
9647 */
// Process one op popped from the sharded queue: record latency/trace data,
// possibly share our newer map with the sender, and hand the op to the PG.
// Called in a worker thread with the pg lock held.
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  const Message *m = op->get_req();

  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);

  // time spent between receive and dequeue
  utime_t latency = now - m->get_recv_stamp();
  dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
	   << " cost " << m->get_cost()
	   << " latency " << latency
	   << " " << *m
	   << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  // let the sender catch up to our map if it is behind
  service.maybe_share_map(m->get_connection().get(),
			  pg->get_osdmap(),
			  op->sent_epoch);

  // a deleting PG will not service ops
  if (pg->is_deleting())
    return;

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << op << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
}
9685
9686
// Process one peering event popped from the sharded queue.  If the PG does
// not exist, only MQuery events are meaningful (answered via
// handle_pg_query_nopg); anything else pg-less is a bug.  Otherwise the PG
// is advanced to the shard's map epoch and the event is delivered.
// On entry pg (if non-null) is locked; this function unlocks it.
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  PeeringCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  bool need_up_thru = false;
  epoch_t same_interval_since = 0;
  if (!pg) {
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
    pg->do_peering_event(evt, rctx);
    if (pg->is_deleted()) {
      // the event deleted the PG; nothing more to dispatch
      pg->unlock();
      return;
    }
    dispatch_context(rctx, pg, curmap, &handle);
    // sample these under the pg lock, act on them after unlocking
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }

  service.send_pg_temp();
}
9722
// Continue an in-progress PG deletion: deliver a DeleteSome peering event
// at epoch e through the normal peering-event path.
void OSD::dequeue_delete(
  OSDShard *sdata,
  PG *pg,
  epoch_t e,
  ThreadPool::TPHandle& handle)
{
  dequeue_peering_evt(
    sdata,
    pg,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	e, e,
	PeeringState::DeleteSome())),
    handle);
}
9738
9739
9740
9741// --------------------------------
9742
// Config-observer hook: the NULL-terminated list of config option names
// whose changes should be delivered to OSD::handle_conf_change().  Keep in
// sync with the keys handled there.
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_object_clean_region_max_num_intervals",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL
  };
  return KEYS;
}
9782
// Config-observer hook: apply runtime changes for each tracked option in
// `changed`.  Runs under osd_lock; each branch pushes the new value into
// the relevant subsystem.
void OSD::handle_conf_change(const ConfigProxy& conf,
			     const std::set <std::string> &changed)
{
  std::lock_guard l{osd_lock};
  // recovery/backfill reservation limits
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  // op tracker tunables
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  // all three osdmap caches share one size knob
  if (changed.count("osd_map_cache_size")) {
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  // cluster-log routing options are applied together
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  // client throttles: only raise/replace when the new value is positive
  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }
  if (changed.count("osd_object_clean_region_max_num_intervals")) {
    ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
  }

  // scrub interval changes require re-planning all scheduled scrubs
  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  check_config();
}
9875
// Re-read the clog routing options from the config and push them into the
// cluster-log client.  Called from handle_conf_change() when any clog_*
// (or host/fsid) option changes.
void OSD::update_log_config()
{
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  // only apply when the options parse cleanly (returns 0 on success)
  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
			       log_channel, log_prio, log_to_graylog,
			       log_to_graylog_host, log_to_graylog_port,
			       fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
			log_channel, log_prio, log_to_graylog,
			log_to_graylog_host, log_to_graylog_port,
			fsid, host);
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
9898
// Emit cluster-log warnings for config combinations that are legal but
// almost certainly misconfigured.  Purely advisory; nothing is changed.
void OSD::check_config()
{
  // some sanity checks
  // the map cache must be larger than the persisted-stale window or maps
  // can fall out of cache while still needed
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
		 << " is not > osd_pg_epoch_persisted_max_stale ("
		 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
  if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
    clog->warn() << "osd_object_clean_region_max_num_intervals ("
		 << cct->_conf->osd_object_clean_region_max_num_intervals
		 << ") is < 0";
  }
}
9913
9914// --------------------------------
9915
// Block the calling thread until the objecter has fetched the latest
// OSDMap from the monitors.
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();

  dout(10) << __func__ << " -- finish" << dendl;
}
9926
9927// --------------------------------
9928
// mgr hook: install the set of dynamic perf-metric queries.  Queries
// without a key descriptor are unsupported and dropped; the surviving set
// is stored (under m_perf_queries_lock) and pushed into every PG.
void OSD::set_perf_queries(const ConfigPayload &config_payload) {
  const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
  const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
  dout(10) << "setting " << queries.size() << " queries" << dendl;

  std::list<OSDPerfMetricQuery> supported_queries;
  for (auto &it : queries) {
    auto &query = it.first;
    // a query with no key descriptor cannot be evaluated; skip it
    if (!query.key_descriptor.empty()) {
      supported_queries.push_back(query);
    }
  }
  if (supported_queries.size() < queries.size()) {
    dout(1) << queries.size() - supported_queries.size()
            << " unsupported queries" << dendl;
  }
  {
    std::lock_guard locker{m_perf_queries_lock};
    m_perf_queries = supported_queries;
    m_perf_limits = queries;
  }
  // propagate the new query set to every PG
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    std::scoped_lock l{*pg};
    pg->set_dynamic_perf_stats_queries(supported_queries);
  }
}
9957
// mgr hook: collect dynamic perf stats from every PG and merge them into
// one report payload keyed by query.
MetricPayload OSD::get_perf_reports() {
  OSDMetricPayload payload;
  std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;

  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by by mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    dps.merge(pg_dps);
  }
  dps.add_to_reports(m_perf_limits, &reports);
  dout(20) << "reports for " << reports.size() << " queries" << dendl;

  return payload;
}
9981
9982// =============================================================
9983
9984#undef dout_context
9985#define dout_context cct
9986#undef dout_prefix
9987#define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
9988
// Bind a PG to its shard slot: wire the back-pointers, bump the OSD's PG
// count, and index the slot by the PG's current map epoch.
// NOTE(review): callers appear responsible for holding shard_lock — confirm.
void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
{
  dout(10) << pg->pg_id << " " << pg << dendl;
  slot->pg = pg;
  pg->osd_shard = this;
  pg->pg_slot = slot;
  osd->inc_num_pgs();

  // index by epoch so get_min_pg_epoch()/wait_min_pg_epoch() work
  slot->epoch = pg->get_osdmap_epoch();
  pg_slots_by_epoch.insert(*slot);
}
10000
// Unbind a PG from its shard slot: clear the back-pointers, drop the OSD's
// PG count, and remove the slot from the per-epoch index.  Inverse of
// _attach_pg().
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  // removing a slot can raise the shard's minimum epoch; wake any waiters
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10015
// Record that a PG has advanced to epoch e: re-key its slot in the
// per-epoch index and wake anyone waiting on the shard's minimum epoch.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  // intrusive set: must remove before mutating the key
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10031
10032epoch_t OSDShard::get_min_pg_epoch()
10033{
10034 std::lock_guard l(shard_lock);
10035 auto p = pg_slots_by_epoch.begin();
10036 if (p == pg_slots_by_epoch.end()) {
10037 return 0;
10038 }
10039 return p->epoch;
10040}
10041
// Block until every PG on this shard has advanced to at least epoch
// `need` (or the shard has no PGs).  Woken by update_pg_epoch() /
// _detach_pg() via min_pg_epoch_cond.
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  std::unique_lock l{shard_lock};
  // counter (not bool) so nested/concurrent waiters keep notifications on
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      return true;
    } else {
      dout(10) << need << " waiting on "
	       << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10059
10060epoch_t OSDShard::get_max_waiting_epoch()
10061{
10062 std::lock_guard l(shard_lock);
10063 epoch_t r = 0;
10064 for (auto& i : pg_slots) {
10065 if (!i.second->waiting_peering.empty()) {
10066 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10067 }
10068 }
10069 return r;
10070}
10071
// Install a new OSDMap on this shard and revisit every PG slot: requeue
// peering events that are now runnable, drop waiting work for PGs that no
// longer map to us (crediting their reserved pushes back via
// *pushes_to_free), and prune slots that are completely idle.
void OSDShard::consume_map(
  const OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // osdmap_lock guards shard_osdmap for readers that don't take shard_lock
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
           << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
           << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    // slots parked on a pending split are left alone
    if (!slot->waiting_for_split.empty()) {
      dout(20) << __func__ << " " << pgid
	       << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    // slots parked on a merge that is still in the future are left alone
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      dout(20) << __func__ << " " << pgid
	       << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
	       << dendl;
      ++p;
      continue;
    }
    // requeue peering events whose target epoch we have now reached
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
	dout(20) << __func__ << " " << pgid
		 << " pending_peering first epoch " << first
		 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
	_wake_pg_slot(pgid, slot);
	queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      // if the PG still maps to us, its waiting ops remain valid
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
	dout(20) << __func__ << " " << pgid << " maps to us, keeping"
		 << dendl;
	++p;
	continue;
      }
      // otherwise drop ops that were addressed at or before this epoch,
      // returning any recovery pushes they had reserved
      while (!slot->waiting.empty() &&
	     slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
	auto& qi = slot->waiting.front();
	dout(20) << __func__ << " " << pgid
		 << " waiting item " << qi
		 << " epoch " << qi.get_map_epoch()
		 << " <= " << new_osdmap->get_epoch()
		 << ", "
		 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
		     "misdirected")
		 << ", dropping" << dendl;
        *pushes_to_free += qi.get_reserved_pushes();
	slot->waiting.pop_front();
      }
    }
    // prune a slot with no work, no running ops, no split, and no PG
    if (slot->waiting.empty() &&
	slot->num_running == 0 &&
	slot->waiting_for_split.empty() &&
	!slot->pg) {
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  // if anything was requeued, poke a worker thread
  if (queued) {
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10157
10158void OSDShard::_wake_pg_slot(
10159 spg_t pgid,
10160 OSDShardPGSlot *slot)
10161{
10162 dout(20) << __func__ << " " << pgid
10163 << " to_process " << slot->to_process
10164 << " waiting " << slot->waiting
10165 << " waiting_peering " << slot->waiting_peering << dendl;
10166 for (auto i = slot->to_process.rbegin();
10167 i != slot->to_process.rend();
10168 ++i) {
10169 scheduler->enqueue_front(std::move(*i));
10170 }
10171 slot->to_process.clear();
10172 for (auto i = slot->waiting.rbegin();
10173 i != slot->waiting.rend();
10174 ++i) {
10175 scheduler->enqueue_front(std::move(*i));
10176 }
10177 slot->waiting.clear();
10178 for (auto i = slot->waiting_peering.rbegin();
10179 i != slot->waiting_peering.rend();
10180 ++i) {
10181 // this is overkill; we requeue everything, even if some of these
10182 // items are waiting for maps we don't have yet. FIXME, maybe,
10183 // someday, if we decide this inefficiency matters
10184 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
10185 scheduler->enqueue_front(std::move(*j));
10186 }
10187 }
10188 slot->waiting_peering.clear();
10189 ++slot->requeue_seq;
10190}
10191
10192void OSDShard::identify_splits_and_merges(
10193 const OSDMapRef& as_of_osdmap,
10194 set<pair<spg_t,epoch_t>> *split_pgs,
10195 set<pair<spg_t,epoch_t>> *merge_pgs)
10196{
10197 std::lock_guard l(shard_lock);
10198 if (shard_osdmap) {
10199 for (auto& i : pg_slots) {
10200 const spg_t& pgid = i.first;
10201 auto *slot = i.second.get();
10202 if (slot->pg) {
10203 osd->service.identify_splits_and_merges(
10204 shard_osdmap, as_of_osdmap, pgid,
10205 split_pgs, merge_pgs);
10206 } else if (!slot->waiting_for_split.empty()) {
10207 osd->service.identify_splits_and_merges(
10208 shard_osdmap, as_of_osdmap, pgid,
10209 split_pgs, nullptr);
10210 } else {
10211 dout(20) << __func__ << " slot " << pgid
10212 << " has no pg and waiting_for_split " << dendl;
10213 }
10214 }
10215 }
10216}
10217
10218void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
10219 set<pair<spg_t,epoch_t>> *pgids)
10220{
10221 std::lock_guard l(shard_lock);
10222 _prime_splits(pgids);
10223 if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
10224 set<pair<spg_t,epoch_t>> newer_children;
10225 for (auto i : *pgids) {
10226 osd->service.identify_splits_and_merges(
10227 as_of_osdmap, shard_osdmap, i.first,
10228 &newer_children, nullptr);
10229 }
10230 newer_children.insert(pgids->begin(), pgids->end());
10231 dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
10232 << shard_osdmap->get_epoch() << ", new children " << newer_children
10233 << dendl;
10234 _prime_splits(&newer_children);
10235 // note: we don't care what is left over here for other shards.
10236 // if this shard is ahead of us and one isn't, e.g., one thread is
10237 // calling into prime_splits via _process (due to a newly created
10238 // pg) and this shard has a newer map due to a racing consume_map,
10239 // then any grandchildren left here will be identified (or were
10240 // identified) when the slower shard's osdmap is advanced.
10241 // _prime_splits() will tolerate the case where the pgid is
10242 // already primed.
10243 }
10244}
10245
10246void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10247{
10248 dout(10) << *pgids << dendl;
10249 auto p = pgids->begin();
10250 while (p != pgids->end()) {
10251 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10252 if (shard_index == shard_id) {
10253 auto r = pg_slots.emplace(p->first, nullptr);
10254 if (r.second) {
10255 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10256 r.first->second = make_unique<OSDShardPGSlot>();
10257 r.first->second->waiting_for_split.insert(p->second);
10258 } else {
10259 auto q = r.first;
10260 ceph_assert(q != pg_slots.end());
10261 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10262 << dendl;
10263 q->second->waiting_for_split.insert(p->second);
10264 }
10265 p = pgids->erase(p);
10266 } else {
10267 ++p;
10268 }
10269 }
10270}
10271
10272void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
10273 set<pair<spg_t,epoch_t>> *merge_pgs)
10274{
10275 std::lock_guard l(shard_lock);
10276 dout(20) << __func__ << " checking shard " << shard_id
10277 << " for remaining merge pgs " << merge_pgs << dendl;
10278 auto p = merge_pgs->begin();
10279 while (p != merge_pgs->end()) {
10280 spg_t pgid = p->first;
10281 epoch_t epoch = p->second;
10282 unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
10283 if (shard_index != shard_id) {
10284 ++p;
10285 continue;
10286 }
10287 OSDShardPGSlot *slot;
10288 auto r = pg_slots.emplace(pgid, nullptr);
10289 if (r.second) {
10290 r.first->second = make_unique<OSDShardPGSlot>();
10291 }
10292 slot = r.first->second.get();
10293 if (slot->pg) {
10294 // already have pg
10295 dout(20) << __func__ << " have merge participant pg " << pgid
10296 << " " << slot->pg << dendl;
10297 } else if (!slot->waiting_for_split.empty() &&
10298 *slot->waiting_for_split.begin() < epoch) {
10299 dout(20) << __func__ << " pending split on merge participant pg " << pgid
10300 << " " << slot->waiting_for_split << dendl;
10301 } else {
10302 dout(20) << __func__ << " creating empty merge participant " << pgid
10303 << " for merge in " << epoch << dendl;
10304 // leave history zeroed; PG::merge_from() will fill it in.
10305 pg_history_t history;
10306 PGCreateInfo cinfo(pgid, epoch - 1,
10307 history, PastIntervals(), false);
10308 PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
10309 _attach_pg(r.first->second.get(), pg.get());
10310 _wake_pg_slot(pgid, slot);
10311 pg->unlock();
10312 }
10313 // mark slot for merge
10314 dout(20) << __func__ << " marking merge participant " << pgid << dendl;
10315 slot->waiting_for_merge_epoch = epoch;
10316 p = merge_pgs->erase(p);
10317 }
10318}
10319
void OSDShard::register_and_wake_split_child(PG *pg)
{
  // Attach a freshly created split-child pg to its primed slot, clear
  // the corresponding waiting_for_split epoch, and (once all pending
  // split epochs are satisfied) requeue any work parked on the slot.
  // Finally, inject a null peering event so the child advances to the
  // latest osdmap.
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    // The slot must have been primed (no pg yet, split pending).
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  // Wake a worker thread to process the requeued/injected work.
  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
10358
10359void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10360{
10361 std::lock_guard l(shard_lock);
10362 vector<spg_t> to_delete;
10363 for (auto& i : pg_slots) {
10364 if (i.first != parent &&
10365 i.first.get_ancestor(old_pg_num) == parent) {
10366 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10367 << dendl;
10368 _wake_pg_slot(i.first, i.second.get());
10369 to_delete.push_back(i.first);
10370 }
10371 }
10372 for (auto pgid : to_delete) {
10373 pg_slots.erase(pgid);
10374 }
10375}
10376
// OSDShard constructor: names the shard and its locks after the shard
// id, instantiates the configured op scheduler, and wires the context
// (oncommit) queue to this shard's wait lock/condvar.
// NOTE: member-initializer order must match the declaration order of
// the members (e.g. shard_name before the lock names derived from it).
OSDShard::OSDShard(
  int id,
  CephContext *cct,
  OSD *osd)
  : shard_id(id),
    cct(cct),
    osd(osd),
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    scheduler(ceph::osd::scheduler::make_scheduler(cct)),
    context_queue(sdata_wait_lock, sdata_cond)
{
  // Log at level 0 so the chosen scheduler is always visible at startup.
  dout(0) << "using op scheduler " << *scheduler << dendl;
}
10395
10396
10397// =============================================================
10398
10399#undef dout_context
10400#define dout_context osd->cct
10401#undef dout_prefix
10402#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10403
10404void OSD::ShardedOpWQ::_add_slot_waiter(
10405 spg_t pgid,
10406 OSDShardPGSlot *slot,
10407 OpSchedulerItem&& qi)
10408{
10409 if (qi.is_peering()) {
10410 dout(20) << __func__ << " " << pgid
10411 << " peering, item epoch is "
10412 << qi.get_map_epoch()
10413 << ", will wait on " << qi << dendl;
10414 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10415 } else {
10416 dout(20) << __func__ << " " << pgid
10417 << " item epoch is "
10418 << qi.get_map_epoch()
10419 << ", will wait on " << qi << dendl;
10420 slot->waiting.push_back(std::move(qi));
10421 }
10422}
10423
10424#undef dout_prefix
10425#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10426
// Worker-thread body: dequeue one OpSchedulerItem from this thread's
// shard, pair it with its pg slot (acquiring the pg lock and handling
// the races with slot removal/requeue), create the pg if the item says
// to, and finally run the item.  Also drains the shard's context
// (oncommit) queue, but only from the lowest-indexed thread of the
// shard to keep oncommit ordering.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  // callback.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    // Nothing to do: park on sdata_cond until _enqueue()/_enqueue_front()
    // (or a context_queue addition) wakes us.
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // Disable the heartbeat timeout while idle so we aren't flagged
      // as stuck.
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->scheduler->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	// Spurious wakeup (or someone else took the work); bail out and
	// let the thread pool call us again.
	sdata->shard_lock.unlock();
	return;
      }
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
        timeout_interval, suicide_interval);
    } else {
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // Only the designated (smallest-index) thread drains oncommit
  // contexts; they are executed at the exit points below.
  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    sdata->context_queue.move_to(oncommits);
  }

  if (sdata->scheduler->empty()) {
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpSchedulerItem item = sdata->scheduler->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // Find or create the pg slot for this item's ordering token (spg_t)
  // and stage the item on the slot's to_process list.
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // We must drop shard_lock to take the pg lock (lock ordering), so
    // re-validate the slot afterwards: it may have been removed,
    // drained, requeued, or re-pointed at a different pg while we
    // were away.
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // No pg attached: decide whether to wait (split pending, map too
  // new), create the pg (peering create events), run pg-less peering
  // events directly, or drop the item (stale/misdirected).
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg! drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
				     sdata->shard_osdmap,
				     (*_op)->sent_epoch);
      }
      // Return the dropped item's reserved recovery pushes.
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    // Peering items must not run against a map older than they require.
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // Any split children that do not hash to this shard are primed on
  // their owning shards here (shard_lock already released).
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // Run the item with the pg lock held (taken above); qi.run() is
  // responsible for releasing the pg lock.
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
10721
10722void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
10723 uint32_t shard_index =
10724 item.get_ordering_token().hash_to_shard(osd->shards.size());
10725
10726 dout(20) << __func__ << " " << item << dendl;
10727
10728 OSDShard* sdata = osd->shards[shard_index];
10729 assert (NULL != sdata);
10730
10731 bool empty = true;
10732 {
10733 std::lock_guard l{sdata->shard_lock};
10734 empty = sdata->scheduler->empty();
10735 sdata->scheduler->enqueue(std::move(item));
10736 }
10737
10738 if (empty) {
10739 std::lock_guard l{sdata->sdata_wait_lock};
10740 sdata->sdata_cond.notify_all();
10741 }
10742}
10743
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  // Requeue an item at the FRONT of its shard's scheduler, preserving
  // ordering relative to any item a racing _process() thread has
  // already staged on the slot's to_process list.
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    //
    // (swap: the old item goes to the front of to_process; the newest
    // staged item is pulled back out and requeued on the scheduler.)
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // Wake one worker to pick the item up.
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
10771
10772namespace ceph {
10773namespace osd_cmds {
10774
10775int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
10776 std::ostream& os)
10777{
10778 if (!ceph_using_tcmalloc()) {
10779 os << "could not issue heap profiler command -- not using tcmalloc!";
10780 return -EOPNOTSUPP;
10781 }
10782
10783 string cmd;
10784 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
10785 os << "unable to get value for command \"" << cmd << "\"";
10786 return -EINVAL;
10787 }
10788
10789 std::vector<std::string> cmd_vec;
10790 get_str_vec(cmd, cmd_vec);
10791
10792 string val;
10793 if (cmd_getval(cmdmap, "value", val)) {
10794 cmd_vec.push_back(val);
10795 }
10796
10797 ceph_heap_profiler_handle_command(cmd_vec, os);
10798
10799 return 0;
10800}
10801
10802}} // namespace ceph::osd_cmds