]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSD.cc
import ceph 15.2.16
[ceph.git] / ceph / src / osd / OSD.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15
16 #include "acconfig.h"
17
18 #include <cctype>
19 #include <fstream>
20 #include <iostream>
21 #include <iterator>
22
23 #include <unistd.h>
24 #include <sys/stat.h>
25 #include <signal.h>
26 #include <time.h>
27 #include <boost/scoped_ptr.hpp>
28 #include <boost/range/adaptor/reversed.hpp>
29
30 #ifdef HAVE_SYS_PARAM_H
31 #include <sys/param.h>
32 #endif
33
34 #ifdef HAVE_SYS_MOUNT_H
35 #include <sys/mount.h>
36 #endif
37
38 #include "osd/PG.h"
39
40 #include "include/types.h"
41 #include "include/compat.h"
42 #include "include/random.h"
43
44 #include "OSD.h"
45 #include "OSDMap.h"
46 #include "Watch.h"
47 #include "osdc/Objecter.h"
48
49 #include "common/errno.h"
50 #include "common/ceph_argparse.h"
51 #include "common/ceph_releases.h"
52 #include "common/ceph_time.h"
53 #include "common/version.h"
54 #include "common/pick_address.h"
55 #include "common/blkdev.h"
56 #include "common/numa.h"
57
58 #include "os/ObjectStore.h"
59 #ifdef HAVE_LIBFUSE
60 #include "os/FuseStore.h"
61 #endif
62
63 #include "PrimaryLogPG.h"
64
65 #include "msg/Messenger.h"
66 #include "msg/Message.h"
67
68 #include "mon/MonClient.h"
69
70 #include "messages/MLog.h"
71
72 #include "messages/MGenericMessage.h"
73 #include "messages/MOSDPing.h"
74 #include "messages/MOSDFailure.h"
75 #include "messages/MOSDMarkMeDown.h"
76 #include "messages/MOSDMarkMeDead.h"
77 #include "messages/MOSDFull.h"
78 #include "messages/MOSDOp.h"
79 #include "messages/MOSDOpReply.h"
80 #include "messages/MOSDBackoff.h"
81 #include "messages/MOSDBeacon.h"
82 #include "messages/MOSDRepOp.h"
83 #include "messages/MOSDRepOpReply.h"
84 #include "messages/MOSDBoot.h"
85 #include "messages/MOSDPGTemp.h"
86 #include "messages/MOSDPGReadyToMerge.h"
87
88 #include "messages/MOSDMap.h"
89 #include "messages/MMonGetOSDMap.h"
90 #include "messages/MOSDPGNotify.h"
91 #include "messages/MOSDPGNotify2.h"
92 #include "messages/MOSDPGQuery.h"
93 #include "messages/MOSDPGQuery2.h"
94 #include "messages/MOSDPGLog.h"
95 #include "messages/MOSDPGRemove.h"
96 #include "messages/MOSDPGInfo.h"
97 #include "messages/MOSDPGInfo2.h"
98 #include "messages/MOSDPGCreate.h"
99 #include "messages/MOSDPGCreate2.h"
100 #include "messages/MOSDPGScan.h"
101 #include "messages/MBackfillReserve.h"
102 #include "messages/MRecoveryReserve.h"
103 #include "messages/MOSDForceRecovery.h"
104 #include "messages/MOSDECSubOpWrite.h"
105 #include "messages/MOSDECSubOpWriteReply.h"
106 #include "messages/MOSDECSubOpRead.h"
107 #include "messages/MOSDECSubOpReadReply.h"
108 #include "messages/MOSDPGCreated.h"
109 #include "messages/MOSDPGUpdateLogMissing.h"
110 #include "messages/MOSDPGUpdateLogMissingReply.h"
111
112 #include "messages/MOSDPeeringOp.h"
113
114 #include "messages/MOSDAlive.h"
115
116 #include "messages/MOSDScrub.h"
117 #include "messages/MOSDScrub2.h"
118 #include "messages/MOSDRepScrub.h"
119
120 #include "messages/MCommand.h"
121 #include "messages/MCommandReply.h"
122
123 #include "messages/MPGStats.h"
124 #include "messages/MPGStatsAck.h"
125
126 #include "messages/MWatchNotify.h"
127 #include "messages/MOSDPGPush.h"
128 #include "messages/MOSDPGPushReply.h"
129 #include "messages/MOSDPGPull.h"
130
131 #include "messages/MMonGetPurgedSnaps.h"
132 #include "messages/MMonGetPurgedSnapsReply.h"
133
134 #include "common/perf_counters.h"
135 #include "common/Timer.h"
136 #include "common/LogClient.h"
137 #include "common/AsyncReserver.h"
138 #include "common/HeartbeatMap.h"
139 #include "common/admin_socket.h"
140 #include "common/ceph_context.h"
141
142 #include "global/signal_handler.h"
143 #include "global/pidfile.h"
144
145 #include "include/color.h"
146 #include "perfglue/cpu_profiler.h"
147 #include "perfglue/heap_profiler.h"
148
149 #include "osd/OpRequest.h"
150
151 #include "auth/AuthAuthorizeHandler.h"
152 #include "auth/RotatingKeyRing.h"
153
154 #include "objclass/objclass.h"
155
156 #include "common/cmdparse.h"
157 #include "include/str_list.h"
158 #include "include/util.h"
159
160 #include "include/ceph_assert.h"
161 #include "common/config.h"
162 #include "common/EventTrace.h"
163
164 #include "json_spirit/json_spirit_reader.h"
165 #include "json_spirit/json_spirit_writer.h"
166
167 #ifdef WITH_LTTNG
168 #define TRACEPOINT_DEFINE
169 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
170 #include "tracing/osd.h"
171 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
172 #undef TRACEPOINT_DEFINE
173 #else
174 #define tracepoint(...)
175 #endif
176
177 #define dout_context cct
178 #define dout_subsys ceph_subsys_osd
179 #undef dout_prefix
180 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
181
182 using namespace ceph::osd::scheduler;
183 using TOPNSPC::common::cmd_getval;
184
185 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
186 return *_dout << "osd." << whoami << " " << epoch << " ";
187 }
188
189 //Initial features in new superblock.
190 //Features here are also automatically upgraded
191 CompatSet OSD::get_osd_initial_compat_set() {
192 CompatSet::FeatureSet ceph_osd_feature_compat;
193 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
194 CompatSet::FeatureSet ceph_osd_feature_incompat;
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
202 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
203 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
204 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
205 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
206 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
207 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
208 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
209 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
210 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2);
211 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
212 ceph_osd_feature_incompat);
213 }
214
//Features are added here that this OSD supports.
//These are supported by this build but, unlike the initial set above,
//are never written into a brand-new superblock.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  //Any features here can be set in code, but not in initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}
222
// OSDService aggregates the state that PGs and other subsystems share
// with the owning OSD: messengers, perf counters, the objecter, the
// tiering agent, timers, reservers and map caches.  Everything is
// initialized from (or on behalf of) the owning OSD instance.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  // NOTE(review): these appear to be config-backed values keyed by option
  // name — presumably they track runtime config changes; confirm.
  osd_max_object_size(cct->_conf, "osd_max_object_size"),
  osd_skip_data_digest(cct->_conf, "osd_skip_data_digest"),
  publish_lock{ceph::make_mutex("OSDService::publish_lock")},
  pre_publish_lock{ceph::make_mutex("OSDService::pre_publish_lock")},
  max_oldest_map(0),
  scrubs_local(0),
  scrubs_remote(0),
  // Cache-tiering agent bookkeeping; the agent thread itself is started
  // later in init().
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(make_unique<Objecter>(osd->client_messenger->cct,
                                 osd->objecter_messenger,
                                 osd->monc, nullptr)),
  m_objecter_finishers(cct->_conf->osd_objecter_finishers),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_timer(cct, recovery_request_lock, false),
  sleep_timer(cct, sleep_lock, false),
  reserver_finisher(cct),
  // Local/remote backfill and snap-trim reservers share one finisher.
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  snap_reserver(cct, &reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  cur_state(NONE),
  cur_ratio(0), physical_ratio(0),
  boot_epoch(0), up_epoch(0), bind_epoch(0)
{
  objecter->init();

  // One Finisher per configured objecter finisher shard; they are
  // started in init() and drained/stopped in shutdown().
  for (int i = 0; i < m_objecter_finishers; i++) {
    ostringstream str;
    str << "objecter-finisher-" << i;
    auto fin = make_unique<Finisher>(osd->client_messenger->cct, str.str(), "finisher");
    objecter_finishers.push_back(std::move(fin));
  }
}
285
#ifdef PG_DEBUG_REFS
// Debug-only PG reference tracking: count outstanding refs per pgid and
// remember the live PG object so leaked refs can be dumped.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  std::lock_guard l(pgid_lock);
  auto it = pgid_tracker.find(pgid);
  if (it == pgid_tracker.end()) {
    // First reference for this pgid; remember the PG object.
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  std::lock_guard l(pgid_lock);
  auto it = pgid_tracker.find(pgid);
  ceph_assert(it != pgid_tracker.end());
  ceph_assert(it->second > 0);
  if (--it->second == 0) {
    // Last reference dropped; forget the pgid entirely.
    pgid_tracker.erase(it);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  std::lock_guard l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (const auto& tracked : pgid_tracker) {
    derr << "\t" << tracked << dendl;
    live_pgs[tracked.first]->dump_live_ids();
  }
}
#endif
317
318
// Monotonic "now" expressed as time elapsed since this OSD process
// started (used for lease/heartbeat arithmetic that must not be
// affected by wall-clock jumps).
ceph::signedspan OSDService::get_mnow()
{
  return ceph::mono_clock::now() - osd->startup_time;
}
323
// Walk the pool's recorded pg_num change history between old_map and
// new_map and collect, for the given pgid:
//  - split_children: every (child pgid, epoch) created by a split, and
//  - merge_pgs (optional): every (pgid, epoch) participating in a merge.
// Newly discovered PGs are themselves re-scanned (BFS) because a split
// child or merge peer may split/merge again at a later epoch.
void OSDService::identify_splits_and_merges(
  OSDMapRef old_map,
  OSDMapRef new_map,
  spg_t pgid,
  set<pair<spg_t,epoch_t>> *split_children,
  set<pair<spg_t,epoch_t>> *merge_pgs)
{
  if (!old_map->have_pg_pool(pgid.pool())) {
    return;
  }
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  auto p = osd->pg_num_history.pg_nums.find(pgid.pool());
  if (p == osd->pg_num_history.pg_nums.end()) {
    // No recorded pg_num changes for this pool.
    return;
  }
  dout(20) << __func__ << " " << pgid << " e" << old_map->get_epoch()
           << " to e" << new_map->get_epoch()
           << " pg_nums " << p->second << dendl;
  deque<spg_t> queue;
  queue.push_back(pgid);
  set<spg_t> did;   // PGs already scanned, to avoid re-queueing
  while (!queue.empty()) {
    auto cur = queue.front();
    queue.pop_front();
    did.insert(cur);
    unsigned pgnum = old_pgnum;
    // Replay each pg_num change in (old_map, new_map] in epoch order.
    for (auto q = p->second.lower_bound(old_map->get_epoch());
         q != p->second.end() &&
           q->first <= new_map->get_epoch();
         ++q) {
      if (pgnum < q->second) {
        // split?
        if (cur.ps() < pgnum) {
          set<spg_t> children;
          if (cur.is_split(pgnum, q->second, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " children " << children << dendl;
            for (auto i : children) {
              split_children->insert(make_pair(i, q->first));
              if (!did.count(i))
                queue.push_back(i);
            }
          }
        } else if (cur.ps() < q->second) {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is a child" << dendl;
          // normally we'd capture this from the parent, but it's
          // possible the parent doesn't exist yet (it will be
          // fabricated to allow an intervening merge). note this PG
          // as a split child here to be sure we catch it.
          split_children->insert(make_pair(cur, q->first));
        } else {
          dout(20) << __func__ << " " << cur << " e" << q->first
                   << " pg_num " << pgnum << " -> " << q->second
                   << " is post-split, skipping" << dendl;
        }
      } else if (merge_pgs) {
        // merge?  (pg_num decreased at this epoch)
        if (cur.ps() >= q->second) {
          if (cur.ps() < pgnum) {
            // cur disappears in this merge: record the target and all
            // of its sources.
            spg_t parent;
            if (cur.is_merge_source(pgnum, q->second, &parent)) {
              set<spg_t> children;
              parent.is_split(q->second, pgnum, &children);
              dout(20) << __func__ << " " << cur << " e" << q->first
                       << " pg_num " << pgnum << " -> " << q->second
                       << " is merge source, target " << parent
                       << ", source(s) " << children << dendl;
              merge_pgs->insert(make_pair(parent, q->first));
              if (!did.count(parent)) {
                // queue (and re-scan) parent in case it might not exist yet
                // and there are some future splits pending on it
                queue.push_back(parent);
              }
              for (auto c : children) {
                merge_pgs->insert(make_pair(c, q->first));
                if (!did.count(c))
                  queue.push_back(c);
              }
            }
          } else {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is beyond old pgnum, skipping" << dendl;
          }
        } else {
          // cur survives the merge; it may be the merge target.
          set<spg_t> children;
          if (cur.is_split(q->second, pgnum, &children)) {
            dout(20) << __func__ << " " << cur << " e" << q->first
                     << " pg_num " << pgnum << " -> " << q->second
                     << " is merge target, source " << children << dendl;
            for (auto c : children) {
              merge_pgs->insert(make_pair(c, q->first));
              if (!did.count(c))
                queue.push_back(c);
            }
            merge_pgs->insert(make_pair(cur, q->first));
          }
        }
      }
      pgnum = q->second;
    }
  }
}
430
// Forward to the owning OSD: flag that the heartbeat peer set should be
// recomputed.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
435
436 HeartbeatStampsRef OSDService::get_hb_stamps(unsigned peer)
437 {
438 std::lock_guard l(hb_stamp_lock);
439 if (peer >= hb_stamps.size()) {
440 hb_stamps.resize(peer + 1);
441 }
442 if (!hb_stamps[peer]) {
443 hb_stamps[peer] = ceph::make_ref<HeartbeatStamps>(peer);
444 }
445 return hb_stamps[peer];
446 }
447
// Queue a RenewLease peering event for the given PG at the given epoch.
void OSDService::queue_renew_lease(epoch_t epoch, spg_t spgid)
{
  osd->enqueue_peering_evt(
    spgid,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
        epoch, epoch,
        RenewLease())));
}
457
// First phase of shutdown: stop the timers that can schedule new work
// (agent, recovery sleep, recovery requests).  Each timer is shut down
// under its own lock.  The rest of the teardown happens in shutdown().
void OSDService::start_shutdown()
{
  {
    std::lock_guard l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    std::lock_guard l(sleep_lock);
    sleep_timer.shutdown();
  }

  {
    std::lock_guard l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }
}
475
// Drain and stop the finisher shared by the backfill/snap-trim
// reservers; called once no more reservation callbacks can be queued.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
481
// Final teardown: stop the mono timer and watch timer, shut down the
// objecter and drain its finishers, then drop the published/next map
// references.  Order matters: the objecter must be stopped before its
// finishers are drained.
void OSDService::shutdown()
{
  mono_timer.suspend();

  {
    std::lock_guard l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  for (auto& f : objecter_finishers) {
    f->wait_for_empty();
    f->stop();
  }

  // Release our map references so the OSDMap(s) can be freed.
  publish_map(OSDMapRef());
  next_osdmap = OSDMapRef();
}
500
// Start the service's worker machinery: finishers, objecter, timers and
// the tiering agent thread.  Mirrors shutdown()/start_shutdown().
void OSDService::init()
{
  reserver_finisher.start();
  for (auto& f : objecter_finishers) {
    f->start();
  }
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  mono_timer.resume();

  agent_thread.create("osd_srv_agent");

  // Optionally hold off recovery for a configured number of seconds
  // after startup.
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
521
// Late initialization: start the objecter once the initial osdmap is
// available.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
526
// Called when a new osdmap is published: enable or disable the tiering
// agent depending on the NOTIERAGENT flag and whether the OSD is active,
// and wake the agent thread so it notices the change.
void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  std::lock_guard l{agent_lock};
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.notify_all();
}
536
// Ask the OSD to subscribe to osdmaps starting at epoch e (non-forced).
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
541
542
// Timer callback used by agent_entry(): after the configured delay,
// re-run agent mode selection for a PG that had no agent work to do.
class AgentTimeoutCB : public Context {
  PGRef pg;   // keeps the PG alive until the callback fires
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};
551
// Body of the cache-tiering agent thread.  Repeatedly picks the
// highest-priority level in agent_queue and asks one of its PGs to do
// flush/evict work, waiting on agent_cond when there is nothing to do.
// Runs until agent_stop() sets agent_stop_flag.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  std::unique_lock agent_locker{agent_lock};

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.wait(agent_locker);
      continue;
    }
    // Service the highest level (rbegin) first.
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // Remaining op budget; use the lower flush budget unless some PG is
    // in high-speed flush mode.
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.wait(agent_locker);
      continue;
    }

    // Round-robin across the PGs at this level; agent_valid_iterator is
    // cleared elsewhere whenever the set changes under us.
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    // Drop the lock while the PG does (potentially slow) agent work.
    agent_locker.unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->pg_id
        << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
        << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds.
      std::lock_guard timer_locker{agent_timer_lock};
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
    }
    agent_locker.lock();
  }
  dout(10) << __func__ << " finish" << dendl;
}
607
// Stop the tiering agent thread.  Asserts that all agent ops have been
// cancelled and the queue is empty (all PGs must already be shut down),
// then signals the thread and joins it.
void OSDService::agent_stop()
{
  {
    std::lock_guard l(agent_lock);

    // By this time all ops should be cancelled
    ceph_assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->get_pgid() << dendl;
      ceph_abort_msg("agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.notify_all();
  }
  agent_thread.join();
}
627
628 // -------------------------------------
629
// Periodically re-derive promote_probability_millis (the probability, in
// 1/1000 units, of promoting an object into the cache tier) so that the
// observed promotion rate converges on the configured obj/sec and
// bytes/sec targets, and set per-interval hard caps against stampedes.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << byte_u_t(bytes) << "; target "
           << target_obj_sec << " obj/sec or "
           << byte_u_t(target_bytes_sec) << "/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = std::max<uint64_t>(bytes / obj, 1);
    // po/pb are the probabilities (in millis) that would hit the object
    // and byte targets respectively; take the stricter when both apply.
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = std::min(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;   // no target configured: promote unthrottled
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = std::max(new_prob, min_prob);
  new_prob = std::min(new_prob, 1000u);

  // adjust half-way toward the computed value, clamped to [min_prob, 1000]
  prob = (prob + new_prob) / 2;
  prob = std::max(prob, min_prob);
  prob = std::min(prob, 1000u);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
700
701 // -------------------------------------
702
703 float OSDService::get_failsafe_full_ratio()
704 {
705 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
706 if (full_ratio > 1.0) full_ratio /= 100.0;
707 return full_ratio;
708 }
709
// Map the current usage ratios onto a fullness state (NONE..FAILSAFE).
// 'ratio' is the backfill-adjusted usage, 'pratio' the raw physical
// usage; 'inject' is set to a marker string when an injected state is
// returned.  Caller holds full_status_lock (directly or transitively).
OSDService::s_names OSDService::recalc_full_state(float ratio, float pratio, string &inject)
{
  // The OSDMap ratios take precedence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    // No map yet; we cannot judge fullness.
    return NONE;
  }
  // Enforce nearfull <= backfillfull <= full <= failsafe ordering.
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < ceph_release_t::luminous) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  // Injected state (for testing) wins; otherwise compare against the
  // thresholds from most to least severe.  FAILSAFE and NEARFULL use the
  // physical ratio; FULL and BACKFILLFULL use the adjusted ratio.
  if (injectfull_state > NONE && injectfull) {
    inject = "(Injected)";
    return injectfull_state;
  } else if (pratio > failsafe_ratio) {
    return FAILSAFE;
  } else if (ratio > full_ratio) {
    return FULL;
  } else if (ratio > backfillfull_ratio) {
    return BACKFILLFULL;
  } else if (pratio > nearfull_ratio) {
    return NEARFULL;
  }
  return NONE;
}
756
// Recompute and cache the fullness state from fresh usage ratios,
// logging (clog) transitions into and out of FAILSAFE.
void OSDService::check_full_status(float ratio, float pratio)
{
  std::lock_guard l(full_status_lock);

  cur_ratio = ratio;
  physical_ratio = pratio;

  string inject;
  s_names new_state;
  new_state = recalc_full_state(ratio, pratio, inject);

  dout(20) << __func__ << " cur ratio " << ratio
           << ", physical ratio " << pratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
                    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
788
789 bool OSDService::need_fullness_update()
790 {
791 OSDMapRef osdmap = get_osdmap();
792 s_names cur = NONE;
793 if (osdmap->exists(whoami)) {
794 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
795 cur = FULL;
796 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
797 cur = BACKFILLFULL;
798 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
799 cur = NEARFULL;
800 }
801 }
802 s_names want = NONE;
803 if (is_full())
804 want = FULL;
805 else if (is_backfillfull())
806 want = BACKFILLFULL;
807 else if (is_nearfull())
808 want = NEARFULL;
809 return want != cur;
810 }
811
812 bool OSDService::_check_inject_full(DoutPrefixProvider *dpp, s_names type) const
813 {
814 if (injectfull && injectfull_state >= type) {
815 // injectfull is either a count of the number of times to return failsafe full
816 // or if -1 then always return full
817 if (injectfull > 0)
818 --injectfull;
819 ldpp_dout(dpp, 10) << __func__ << " Injected " << get_full_state_name(type) << " OSD ("
820 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")"
821 << dendl;
822 return true;
823 }
824 return false;
825 }
826
827 bool OSDService::_check_full(DoutPrefixProvider *dpp, s_names type) const
828 {
829 std::lock_guard l(full_status_lock);
830
831 if (_check_inject_full(dpp, type))
832 return true;
833
834 if (cur_state >= type)
835 ldpp_dout(dpp, 10) << __func__ << " current usage is " << cur_ratio
836 << " physical " << physical_ratio << dendl;
837
838 return cur_state >= type;
839 }
840
// Like _check_full(), but evaluates what the fullness state WOULD be if
// an additional adjust_used bytes were consumed (e.g. by an incoming
// backfill), using the caller-supplied stats instead of the cached state.
bool OSDService::_tentative_full(DoutPrefixProvider *dpp, s_names type, uint64_t adjust_used, osd_stat_t adjusted_stat)
{
  ldpp_dout(dpp, 20) << __func__ << " type " << get_full_state_name(type) << " adjust_used " << (adjust_used >> 10) << "KiB" << dendl;
  {
    // Injected state still wins, checked under the lock.
    std::lock_guard l(full_status_lock);
    if (_check_inject_full(dpp, type)) {
      return true;
    }
  }

  float pratio;
  float ratio = compute_adjusted_ratio(adjusted_stat, &pratio, adjust_used);

  string notused;
  s_names tentative_state = recalc_full_state(ratio, pratio, notused);

  if (tentative_state >= type)
    ldpp_dout(dpp, 10) << __func__ << " tentative usage is " << ratio << dendl;

  return tentative_state >= type;
}
862
// Convenience wrappers around _check_full()/_tentative_full() for each
// fullness severity level.
bool OSDService::check_failsafe_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FAILSAFE);
}

bool OSDService::check_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, FULL);
}

bool OSDService::tentative_backfill_full(DoutPrefixProvider *dpp, uint64_t adjust_used, osd_stat_t stats)
{
  return _tentative_full(dpp, BACKFILLFULL, adjust_used, stats);
}

bool OSDService::check_backfill_full(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, BACKFILLFULL);
}

bool OSDService::check_nearfull(DoutPrefixProvider *dpp) const
{
  return _check_full(dpp, NEARFULL);
}
887
// Accessors for the cached fullness state, each taking full_status_lock.
// Note is_failsafe_full() tests equality; the others test "at least".
bool OSDService::is_failsafe_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  std::lock_guard l(full_status_lock);
  return cur_state >= NEARFULL;
}
911
// Test hook: inject a fullness state 'type' for the next 'count' checks
// (count == -1 means until cleared).
void OSDService::set_injectfull(s_names type, int64_t count)
{
  std::lock_guard l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
918
// Record fresh store statfs results (and any store alerts) into
// osd_stat and the perf counters.  When fake_statfs_for_testing is set,
// total/available are synthesized from the configured size minus the
// bytes accounted to PGs.
void OSDService::set_statfs(const struct store_statfs_t &stbuf,
                            osd_alert_list_t& alerts)
{
  uint64_t bytes = stbuf.total;
  uint64_t avail = stbuf.available;
  uint64_t used = stbuf.get_used_raw();

  // For testing fake statfs values so it doesn't matter if all
  // OSDs are using the same partition.
  if (cct->_conf->fake_statfs_for_testing) {
    uint64_t total_num_bytes = 0;
    vector<PGRef> pgs;
    osd->_get_pgs(&pgs);
    for (auto p : pgs) {
      total_num_bytes += p->get_stats_num_bytes();
    }
    bytes = cct->_conf->fake_statfs_for_testing;
    if (total_num_bytes < bytes)
      avail = bytes - total_num_bytes;
    else
      avail = 0;
    dout(0) << __func__ << " fake total " << cct->_conf->fake_statfs_for_testing
            << " adjust available " << avail
            << dendl;
    used = bytes - avail;
  }

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  std::lock_guard l(stat_lock);
  osd_stat.statfs = stbuf;
  osd_stat.os_alerts.clear();
  // Alerts are keyed by our own osd id; swap() empties the caller's list.
  osd_stat.os_alerts[whoami].swap(alerts);
  if (cct->_conf->fake_statfs_for_testing) {
    osd_stat.statfs.total = bytes;
    osd_stat.statfs.available = avail;
    // For testing don't want used to go negative, so clear reserved
    osd_stat.statfs.internally_reserved = 0;
  }
}
961
// Refresh osd_stat with the current heartbeat peer list, op-age
// histogram and PG count, expire at most one stale heartbeat ping-time
// entry, and return a snapshot of osd_stat.
osd_stat_t OSDService::set_osd_stat(vector<int>& hb_peers,
                                    int num_pgs)
{
  utime_t now = ceph_clock_now();
  auto stale_time = g_conf().get_val<int64_t>("osd_mon_heartbeat_stat_stale");
  std::lock_guard l(stat_lock);
  osd_stat.hb_peers.swap(hb_peers);
  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
  osd_stat.num_pgs = num_pgs;
  // Clean entries that aren't updated
  // This is called often enough that we can just remove 1 at a time
  for (auto i: osd_stat.hb_pingtime) {
    if (i.second.last_update == 0)
      continue;
    if (stale_time && now.sec() - i.second.last_update > stale_time) {
      dout(20) << __func__ << " time out heartbeat for osd " << i.first
               << " last_update " << i.second.last_update << dendl;
      // break immediately after erase: the range-for's iterator is
      // invalidated by the erase, so we must not continue iterating.
      osd_stat.hb_pingtime.erase(i.first);
      break;
    }
  }
  return osd_stat;
}
985
986 void OSDService::inc_osd_stat_repaired()
987 {
988 std::lock_guard l(stat_lock);
989 osd_stat.num_shards_repaired++;
990 return;
991 }
992
// Compute the usage ratio after (a) subtracting adjust_used bytes from
// availability and (b) letting each PG fold its pending backfill data
// into the stats.  *pratio receives the unadjusted physical ratio.
// NOTE(review): divides by statfs.total without a zero guard — callers
// presumably never pass an empty statfs; confirm.
float OSDService::compute_adjusted_ratio(osd_stat_t new_stat, float *pratio,
                                         uint64_t adjust_used)
{
  *pratio =
   ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);

  if (adjust_used) {
    dout(20) << __func__ << " Before kb_used() " << new_stat.statfs.kb_used()  << dendl;
    if (new_stat.statfs.available > adjust_used)
      new_stat.statfs.available -= adjust_used;
    else
      new_stat.statfs.available = 0;
    dout(20) << __func__ << " After kb_used() " << new_stat.statfs.kb_used() << dendl;
  }

  // Check all pgs and adjust kb_used to include all pending backfill data
  int backfill_adjusted = 0;
  vector<PGRef> pgs;
  osd->_get_pgs(&pgs);
  for (auto p : pgs) {
    backfill_adjusted += p->pg_stat_adjust(&new_stat);
  }
  if (backfill_adjusted) {
    dout(20) << __func__ << " backfill adjusted " << new_stat << dendl;
  }
  return ((float)new_stat.statfs.get_used_raw()) / ((float)new_stat.statfs.total);
}
1020
// Send a message to a peer OSD over the cluster messenger, but only if the
// peer is still up and has not restarted since from_epoch.  Consumes m's
// reference in all cases (dropped via put() if the peer is down/restarted).
// The reserved "next" map is pinned for the duration and released on every
// exit path.
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    // peer is gone or was restarted after the sender's epoch; drop the message
    m->put();
    release_map(next_map);
    return;
  }
  ConnectionRef peer_con;
  if (peer == whoami) {
    // sending to ourselves: short-circuit through the loopback connection
    peer_con = osd->cluster_messenger->get_loopback_connection();
  } else {
    peer_con = osd->cluster_messenger->connect_to_osd(
      next_map->get_cluster_addrs(peer), false, true);
  }
  // piggy-back newer map epochs to the peer if it appears behind
  maybe_share_map(peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
1044
// Batch variant of send_message_osd_cluster(): sends each (osd, message)
// pair under a single reservation of the "next" map.  Messages destined to
// peers that are down or restarted since from_epoch are dropped (put()).
void OSDService::send_message_osd_cluster(std::vector<std::pair<int, Message*>>& messages, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  for (auto& iter : messages) {
    if (next_map->is_down(iter.first) ||
        next_map->get_info(iter.first).up_from > from_epoch) {
      // stale target; consume the message reference and move on
      iter.second->put();
      continue;
    }
    ConnectionRef peer_con;
    if (iter.first == whoami) {
      peer_con = osd->cluster_messenger->get_loopback_connection();
    } else {
      peer_con = osd->cluster_messenger->connect_to_osd(
        next_map->get_cluster_addrs(iter.first), false, true);
    }
    maybe_share_map(peer_con.get(), next_map);
    peer_con->send_message(iter.second);
  }
  // single release matches the single reservation above
  release_map(next_map);
}
1069 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1070 {
1071 OSDMapRef next_map = get_nextmap_reserved();
1072 // service map is always newer/newest
1073 ceph_assert(from_epoch <= next_map->get_epoch());
1074
1075 if (next_map->is_down(peer) ||
1076 next_map->get_info(peer).up_from > from_epoch) {
1077 release_map(next_map);
1078 return NULL;
1079 }
1080 ConnectionRef con;
1081 if (peer == whoami) {
1082 con = osd->cluster_messenger->get_loopback_connection();
1083 } else {
1084 con = osd->cluster_messenger->connect_to_osd(
1085 next_map->get_cluster_addrs(peer), false, true);
1086 }
1087 release_map(next_map);
1088 return con;
1089 }
1090
// Obtain the (back, front) heartbeat connections to a peer OSD.  Returns a
// pair of null ConnectionRefs if the peer is down or restarted since
// from_epoch.
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  ceph_assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;  // both refs null: caller must check
  }
  // back channel first, then front, per the pair's (first, second) convention
  ret.first = osd->hb_back_client_messenger->connect_to_osd(
    next_map->get_hb_back_addrs(peer));
  ret.second = osd->hb_front_client_messenger->connect_to_osd(
    next_map->get_hb_front_addrs(peer));
  release_map(next_map);
  return ret;
}
1110
1111 entity_name_t OSDService::get_cluster_msgr_name() const
1112 {
1113 return cluster_messenger->get_myname();
1114 }
1115
1116 void OSDService::queue_want_pg_temp(pg_t pgid,
1117 const vector<int>& want,
1118 bool forced)
1119 {
1120 std::lock_guard l(pg_temp_lock);
1121 auto p = pg_temp_pending.find(pgid);
1122 if (p == pg_temp_pending.end() ||
1123 p->second.acting != want ||
1124 forced) {
1125 pg_temp_wanted[pgid] = {want, forced};
1126 }
1127 }
1128
1129 void OSDService::remove_want_pg_temp(pg_t pgid)
1130 {
1131 std::lock_guard l(pg_temp_lock);
1132 pg_temp_wanted.erase(pgid);
1133 pg_temp_pending.erase(pgid);
1134 }
1135
// Move the just-sent "wanted" pg_temp entries into the "pending" set
// (awaiting mon acknowledgement).  Caller must hold pg_temp_lock.
// NOTE: in both branches an entry whose key already exists in
// pg_temp_pending is NOT overwritten (map::merge and map::insert both skip
// colliding keys); the colliding wanted entry is then discarded by clear().
void OSDService::_sent_pg_temp()
{
#ifdef HAVE_STDLIB_MAP_SPLICING
  // node-splicing merge: moves elements without reallocating
  pg_temp_pending.merge(pg_temp_wanted);
#else
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
                         make_move_iterator(end(pg_temp_wanted)));
#endif
  pg_temp_wanted.clear();
}
1146
// Move every pg_temp request (both unsent "wanted" and already-sent
// "pending") back into the wanted queue so they are re-sent to the mon,
// e.g. after a mon session reset.
void OSDService::requeue_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  // NOTE(review): on a key collision _sent_pg_temp() keeps the *pending*
  // entry and drops the wanted one (merge/insert do not overwrite), which
  // appears to contradict the comment above — confirm intended semantics.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  // wanted is now empty; swap so the combined set becomes "wanted" and
  // pending becomes empty.
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}
1159
1160 std::ostream& operator<<(std::ostream& out,
1161 const OSDService::pg_temp_t& pg_temp)
1162 {
1163 out << pg_temp.acting;
1164 if (pg_temp.forced) {
1165 out << " (forced)";
1166 }
1167 return out;
1168 }
1169
// Flush all queued pg_temp requests to the monitor.  Forced and non-forced
// requests are batched into (at most) two separate MOSDPGTemp messages,
// since "forced" is a per-message flag.  Sent entries move to the pending
// set via _sent_pg_temp().
void OSDService::send_pg_temp()
{
  std::lock_guard l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  // ms[0] = non-forced batch, ms[1] = forced batch (bool index)
  MOSDPGTemp *ms[2] = {nullptr, nullptr};
  for (auto& [pgid, pg_temp] : pg_temp_wanted) {
    auto& m = ms[pg_temp.forced];
    if (!m) {
      // lazily allocate each batch on first use
      m = new MOSDPGTemp(osdmap->get_epoch());
      m->forced = pg_temp.forced;
    }
    m->pg_temp.emplace(pgid, pg_temp.acting);
  }
  for (auto m : ms) {
    if (m) {
      // send_mon_message consumes the message reference
      monc->send_mon_message(m);
    }
  }
  _sent_pg_temp();
}
1192
1193 void OSDService::send_pg_created(pg_t pgid)
1194 {
1195 std::lock_guard l(pg_created_lock);
1196 dout(20) << __func__ << dendl;
1197 auto o = get_osdmap();
1198 if (o->require_osd_release >= ceph_release_t::luminous) {
1199 pg_created.insert(pgid);
1200 monc->send_mon_message(new MOSDPGCreated(pgid));
1201 }
1202 }
1203
1204 void OSDService::send_pg_created()
1205 {
1206 std::lock_guard l(pg_created_lock);
1207 dout(20) << __func__ << dendl;
1208 auto o = get_osdmap();
1209 if (o->require_osd_release >= ceph_release_t::luminous) {
1210 for (auto pgid : pg_created) {
1211 monc->send_mon_message(new MOSDPGCreated(pgid));
1212 }
1213 }
1214 }
1215
1216 void OSDService::prune_pg_created()
1217 {
1218 std::lock_guard l(pg_created_lock);
1219 dout(20) << __func__ << dendl;
1220 auto o = get_osdmap();
1221 auto i = pg_created.begin();
1222 while (i != pg_created.end()) {
1223 auto p = o->get_pg_pool(i->pool());
1224 if (!p || !p->has_flag(pg_pool_t::FLAG_CREATING)) {
1225 dout(20) << __func__ << " pruning " << *i << dendl;
1226 i = pg_created.erase(i);
1227 } else {
1228 dout(20) << __func__ << " keeping " << *i << dendl;
1229 ++i;
1230 }
1231 }
1232 }
1233
1234
1235 // --------------------------------------
1236 // dispatch
1237
1238 bool OSDService::can_inc_scrubs()
1239 {
1240 bool can_inc = false;
1241 std::lock_guard l(sched_scrub_lock);
1242
1243 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1244 dout(20) << __func__ << " == true " << scrubs_local << " local + " << scrubs_remote
1245 << " remote < max " << cct->_conf->osd_max_scrubs << dendl;
1246 can_inc = true;
1247 } else {
1248 dout(20) << __func__ << " == false " << scrubs_local << " local + " << scrubs_remote
1249 << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1250 }
1251
1252 return can_inc;
1253 }
1254
1255 bool OSDService::inc_scrubs_local()
1256 {
1257 bool result = false;
1258 std::lock_guard l{sched_scrub_lock};
1259 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1260 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local+1)
1261 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1262 result = true;
1263 ++scrubs_local;
1264 } else {
1265 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1266 }
1267 return result;
1268 }
1269
1270 void OSDService::dec_scrubs_local()
1271 {
1272 std::lock_guard l{sched_scrub_lock};
1273 dout(20) << __func__ << " " << scrubs_local << " -> " << (scrubs_local-1)
1274 << " (max " << cct->_conf->osd_max_scrubs << ", remote " << scrubs_remote << ")" << dendl;
1275 --scrubs_local;
1276 ceph_assert(scrubs_local >= 0);
1277 }
1278
1279 bool OSDService::inc_scrubs_remote()
1280 {
1281 bool result = false;
1282 std::lock_guard l{sched_scrub_lock};
1283 if (scrubs_local + scrubs_remote < cct->_conf->osd_max_scrubs) {
1284 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote+1)
1285 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1286 result = true;
1287 ++scrubs_remote;
1288 } else {
1289 dout(20) << __func__ << " " << scrubs_local << " local + " << scrubs_remote << " remote >= max " << cct->_conf->osd_max_scrubs << dendl;
1290 }
1291 return result;
1292 }
1293
1294 void OSDService::dec_scrubs_remote()
1295 {
1296 std::lock_guard l{sched_scrub_lock};
1297 dout(20) << __func__ << " " << scrubs_remote << " -> " << (scrubs_remote-1)
1298 << " (max " << cct->_conf->osd_max_scrubs << ", local " << scrubs_local << ")" << dendl;
1299 --scrubs_remote;
1300 ceph_assert(scrubs_remote >= 0);
1301 }
1302
1303 void OSDService::dump_scrub_reservations(Formatter *f)
1304 {
1305 std::lock_guard l{sched_scrub_lock};
1306 f->dump_int("scrubs_local", scrubs_local);
1307 f->dump_int("scrubs_remote", scrubs_remote);
1308 f->dump_int("osd_max_scrubs", cct->_conf->osd_max_scrubs);
1309 }
1310
1311 void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1312 epoch_t *_bind_epoch) const
1313 {
1314 std::lock_guard l(epoch_lock);
1315 if (_boot_epoch)
1316 *_boot_epoch = boot_epoch;
1317 if (_up_epoch)
1318 *_up_epoch = up_epoch;
1319 if (_bind_epoch)
1320 *_bind_epoch = bind_epoch;
1321 }
1322
1323 void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1324 const epoch_t *_bind_epoch)
1325 {
1326 std::lock_guard l(epoch_lock);
1327 if (_boot_epoch) {
1328 ceph_assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1329 boot_epoch = *_boot_epoch;
1330 }
1331 if (_up_epoch) {
1332 ceph_assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1333 up_epoch = *_up_epoch;
1334 }
1335 if (_bind_epoch) {
1336 ceph_assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1337 bind_epoch = *_bind_epoch;
1338 }
1339 }
1340
// Begin an orderly shutdown.  If we are marked up in the map, ask the mon
// to mark us down first and wait (bounded by osd_mon_shutdown_timeout) for
// the ack, which arrives via got_stop_ack().  Returns false if a shutdown
// is already in progress.
bool OSDService::prepare_to_stop()
{
  std::unique_lock l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;  // someone else already started stopping

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(
      new MOSDMarkMeDown(
        monc->get_fsid(),
        whoami,
        osdmap->get_addrs(whoami),
        osdmap->get_epoch(),
        true // request ack
        ));
    const auto timeout = ceph::make_timespan(cct->_conf->osd_mon_shutdown_timeout);
    // got_stop_ack() flips the state to STOPPING and notifies; proceed on
    // ack or on timeout, whichever comes first
    is_stopping_cond.wait_for(l, timeout,
      [this] { return get_state() == STOPPING; });
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1367
1368 void OSDService::got_stop_ack()
1369 {
1370 std::scoped_lock l(is_stopping_lock);
1371 if (get_state() == PREPARING_TO_STOP) {
1372 dout(0) << __func__ << " starting shutdown" << dendl;
1373 set_state(STOPPING);
1374 is_stopping_cond.notify_all();
1375 } else {
1376 dout(10) << __func__ << " ignoring msg" << dendl;
1377 }
1378 }
1379
// Build an MOSDMap carrying maps (since, to], respecting both the
// osd_map_message_max count limit and osd_map_message_max_bytes size limit.
// If `since` predates our oldest stored map, the message starts with a full
// map at our oldest epoch instead.  On a missing-map error, falls through
// to `panic:` which sends whatever was gathered so far, or as a last resort
// the newest map alone.  Always returns a non-null message (aborts if even
// the newest map cannot be loaded).
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid(),
                           osdmap->get_encoding_features());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  int max = cct->_conf->osd_map_message_max;
  ssize_t max_bytes = cct->_conf->osd_map_message_max_bytes;

  if (since < m->oldest_map) {
    // we don't have the next map the target wants, so start with a
    // full map.
    bufferlist bl;
    dout(10) << __func__ << " oldest map " << max_oldest_map << " > since "
             << since << ", starting with full map" << dendl;
    since = m->oldest_map;
    if (!get_map_bl(since, bl)) {
      derr << __func__ << " missing full map " << since << dendl;
      goto panic;
    }
    max--;
    max_bytes -= bl.length();
    // claim() moves the buffer into the message without copying
    m->maps[since].claim(bl);
  }
  for (epoch_t e = since + 1; e <= to; ++e) {
    bufferlist bl;
    if (get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else {
      // no incremental stored for e; fall back to the full map
      dout(10) << __func__ << " missing incremental map " << e << dendl;
      if (!get_map_bl(e, bl)) {
        derr << __func__ << " also missing full map " << e << dendl;
        goto panic;
      }
      m->maps[e].claim(bl);
    }
    max--;
    max_bytes -= bl.length();
    // limits are checked after adding, so at least one map always goes out
    if (max <= 0 || max_bytes <= 0) {
      break;
    }
  }
  return m;

 panic:
  if (!m->maps.empty() ||
      !m->incremental_maps.empty()) {
    // send what we have so far
    return m;
  }
  // send something
  bufferlist bl;
  if (get_inc_map_bl(m->newest_map, bl)) {
    m->incremental_maps[m->newest_map].claim(bl);
  } else {
    derr << __func__ << " unable to load latest map " << m->newest_map << dendl;
    if (!get_map_bl(m->newest_map, bl)) {
      derr << __func__ << " unable to load latest full map " << m->newest_map
           << dendl;
      ceph_abort();
    }
    m->maps[m->newest_map].claim(bl);
  }
  return m;
}
1447
1448 void OSDService::send_map(MOSDMap *m, Connection *con)
1449 {
1450 con->send_message(m);
1451 }
1452
1453 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1454 const OSDMapRef& osdmap)
1455 {
1456 epoch_t to = osdmap->get_epoch();
1457 dout(10) << "send_incremental_map " << since << " -> " << to
1458 << " to " << con << " " << con->get_peer_addr() << dendl;
1459
1460 MOSDMap *m = NULL;
1461 while (!m) {
1462 OSDSuperblock sblock(get_superblock());
1463 if (since < sblock.oldest_map) {
1464 // just send latest full map
1465 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1466 osdmap->get_encoding_features());
1467 m->oldest_map = max_oldest_map;
1468 m->newest_map = sblock.newest_map;
1469 get_map_bl(to, m->maps[to]);
1470 send_map(m, con);
1471 return;
1472 }
1473
1474 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1475 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1476 << ", only sending most recent" << dendl;
1477 since = to - cct->_conf->osd_map_share_max_epochs;
1478 }
1479
1480 m = build_incremental_map_msg(since, to, sblock);
1481 }
1482 send_map(m, con);
1483 }
1484
// Fetch the encoded full map for epoch e, first from the in-memory cache,
// then from the store (populating the cache on a hit).  Caller must hold
// map_cache_lock — see try_get_map(), which locks before calling here.
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  // fall back to reading the map object from the metadata collection
  found = store->read(meta_ch,
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}
1503
1504 bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1505 {
1506 std::lock_guard l(map_cache_lock);
1507 bool found = map_bl_inc_cache.lookup(e, &bl);
1508 if (found) {
1509 if (logger)
1510 logger->inc(l_osd_map_bl_cache_hit);
1511 return true;
1512 }
1513 if (logger)
1514 logger->inc(l_osd_map_bl_cache_miss);
1515 found = store->read(meta_ch,
1516 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1517 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1518 if (found) {
1519 _add_map_inc_bl(e, bl);
1520 }
1521 return found;
1522 }
1523
1524 void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1525 {
1526 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
1527 // cache a contiguous buffer
1528 if (bl.get_num_buffers() > 1) {
1529 bl.rebuild();
1530 }
1531 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1532 map_bl_cache.add(e, bl);
1533 }
1534
1535 void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1536 {
1537 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
1538 // cache a contiguous buffer
1539 if (bl.get_num_buffers() > 1) {
1540 bl.rebuild();
1541 }
1542 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1543 map_bl_inc_cache.add(e, bl);
1544 }
1545
1546 OSDMapRef OSDService::_add_map(OSDMap *o)
1547 {
1548 epoch_t e = o->get_epoch();
1549
1550 if (cct->_conf->osd_map_dedup) {
1551 // Dedup against an existing map at a nearby epoch
1552 OSDMapRef for_dedup = map_cache.lower_bound(e);
1553 if (for_dedup) {
1554 OSDMap::dedup(for_dedup.get(), o);
1555 }
1556 }
1557 bool existed;
1558 OSDMapRef l = map_cache.add(e, o, &existed);
1559 if (existed) {
1560 delete o;
1561 }
1562 return l;
1563 }
1564
// Return the OSDMap for `epoch`, from the decoded-map cache if possible,
// otherwise by loading and decoding the stored buffer.  Returns a null
// OSDMapRef if the map cannot be loaded.  epoch 0 yields a blank map.
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  std::lock_guard l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      // request is older than anything still cached; track how far below
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    // _get_map_bl requires map_cache_lock, which we hold
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  // _add_map takes ownership of map (and caches it)
  return _add_map(map);
}
1601
1602 // ops
1603
1604
// Convenience overload: reply with an error code only — no version info
// and no per-op return values.
void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0, {});
}
1609
1610 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1611 version_t uv,
1612 vector<pg_log_op_return_item_t> op_returns)
1613 {
1614 auto m = op->get_req<MOSDOp>();
1615 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1616 int flags;
1617 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1618
1619 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1620 !m->has_flag(CEPH_OSD_FLAG_RETURNVEC));
1621 reply->set_reply_versions(v, uv);
1622 reply->set_op_returns(op_returns);
1623 m->get_connection()->send_message(reply);
1624 }
1625
// Debug aid (gated by osd_debug_misdirected_ops): log a cluster warning
// when a client op arrives at a PG that is not its primary.  For EC pools,
// first rule out the benign remap race described in the comment below
// before complaining.
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  auto m = op->get_req<MOSDOp>();
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);

  ceph_assert(m->get_map_epoch() >= pg->get_history().same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    ceph_assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      // the client's map has been trimmed; can't re-check, so stay quiet
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->pg_id.shard) {
      // shard mismatch: this is the benign remap race above, not a bug
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->get_acting()
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1680
// Append a work item to the tail of the sharded op queue.
void OSDService::enqueue_back(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue(std::move(qi));
}
1685
// Push a work item to the front of the sharded op queue (requeue path).
void OSDService::enqueue_front(OpSchedulerItem&& qi)
{
  osd->op_shardedwq.queue_front(std::move(qi));
}
1690
1691 void OSDService::queue_recovery_context(
1692 PG *pg,
1693 GenContext<ThreadPool::TPHandle&> *c)
1694 {
1695 epoch_t e = get_osdmap_epoch();
1696 enqueue_back(
1697 OpSchedulerItem(
1698 unique_ptr<OpSchedulerItem::OpQueueable>(
1699 new PGRecoveryContext(pg->get_pgid(), c, e)),
1700 cct->_conf->osd_recovery_cost,
1701 cct->_conf->osd_recovery_priority,
1702 ceph_clock_now(),
1703 0,
1704 e));
1705 }
1706
1707 void OSDService::queue_for_snap_trim(PG *pg)
1708 {
1709 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1710 enqueue_back(
1711 OpSchedulerItem(
1712 unique_ptr<OpSchedulerItem::OpQueueable>(
1713 new PGSnapTrim(pg->get_pgid(), pg->get_osdmap_epoch())),
1714 cct->_conf->osd_snap_trim_cost,
1715 cct->_conf->osd_snap_trim_priority,
1716 ceph_clock_now(),
1717 0,
1718 pg->get_osdmap_epoch()));
1719 }
1720
1721 void OSDService::queue_for_scrub(PG *pg, bool with_high_priority)
1722 {
1723 unsigned scrub_queue_priority = pg->scrubber.priority;
1724 if (with_high_priority && scrub_queue_priority < cct->_conf->osd_client_op_priority) {
1725 scrub_queue_priority = cct->_conf->osd_client_op_priority;
1726 }
1727 const auto epoch = pg->get_osdmap_epoch();
1728 enqueue_back(
1729 OpSchedulerItem(
1730 unique_ptr<OpSchedulerItem::OpQueueable>(new PGScrub(pg->get_pgid(), epoch)),
1731 cct->_conf->osd_scrub_cost,
1732 scrub_queue_priority,
1733 ceph_clock_now(),
1734 0,
1735 epoch));
1736 }
1737
1738 void OSDService::queue_for_pg_delete(spg_t pgid, epoch_t e)
1739 {
1740 dout(10) << __func__ << " on " << pgid << " e " << e << dendl;
1741 enqueue_back(
1742 OpSchedulerItem(
1743 unique_ptr<OpSchedulerItem::OpQueueable>(
1744 new PGDelete(pgid, e)),
1745 cct->_conf->osd_pg_delete_cost,
1746 cct->_conf->osd_pg_delete_priority,
1747 ceph_clock_now(),
1748 0,
1749 e));
1750 }
1751
// Thin forwarding wrapper: let the OSD attempt to finalize pg's deletion.
bool OSDService::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
{
  return osd->try_finish_pg_delete(pg, old_pg_num);
}
1756
1757 // ---
1758
1759 void OSDService::set_ready_to_merge_source(PG *pg, eversion_t version)
1760 {
1761 std::lock_guard l(merge_lock);
1762 dout(10) << __func__ << " " << pg->pg_id << dendl;
1763 ready_to_merge_source[pg->pg_id.pgid] = version;
1764 assert(not_ready_to_merge_source.count(pg->pg_id.pgid) == 0);
1765 _send_ready_to_merge();
1766 }
1767
1768 void OSDService::set_ready_to_merge_target(PG *pg,
1769 eversion_t version,
1770 epoch_t last_epoch_started,
1771 epoch_t last_epoch_clean)
1772 {
1773 std::lock_guard l(merge_lock);
1774 dout(10) << __func__ << " " << pg->pg_id << dendl;
1775 ready_to_merge_target.insert(make_pair(pg->pg_id.pgid,
1776 make_tuple(version,
1777 last_epoch_started,
1778 last_epoch_clean)));
1779 assert(not_ready_to_merge_target.count(pg->pg_id.pgid) == 0);
1780 _send_ready_to_merge();
1781 }
1782
1783 void OSDService::set_not_ready_to_merge_source(pg_t source)
1784 {
1785 std::lock_guard l(merge_lock);
1786 dout(10) << __func__ << " " << source << dendl;
1787 not_ready_to_merge_source.insert(source);
1788 assert(ready_to_merge_source.count(source) == 0);
1789 _send_ready_to_merge();
1790 }
1791
1792 void OSDService::set_not_ready_to_merge_target(pg_t target, pg_t source)
1793 {
1794 std::lock_guard l(merge_lock);
1795 dout(10) << __func__ << " " << target << " source " << source << dendl;
1796 not_ready_to_merge_target[target] = source;
1797 assert(ready_to_merge_target.count(target) == 0);
1798 _send_ready_to_merge();
1799 }
1800
// Public entry point: take merge_lock and evaluate/send ready-to-merge
// notifications to the monitor.
void OSDService::send_ready_to_merge()
{
  std::lock_guard l(merge_lock);
  _send_ready_to_merge();
}
1806
1807 void OSDService::_send_ready_to_merge()
1808 {
1809 dout(20) << __func__
1810 << " ready_to_merge_source " << ready_to_merge_source
1811 << " not_ready_to_merge_source " << not_ready_to_merge_source
1812 << " ready_to_merge_target " << ready_to_merge_target
1813 << " not_ready_to_merge_target " << not_ready_to_merge_target
1814 << " sent_ready_to_merge_source " << sent_ready_to_merge_source
1815 << dendl;
1816 for (auto src : not_ready_to_merge_source) {
1817 if (sent_ready_to_merge_source.count(src) == 0) {
1818 monc->send_mon_message(new MOSDPGReadyToMerge(
1819 src,
1820 {}, {}, 0, 0,
1821 false,
1822 osdmap->get_epoch()));
1823 sent_ready_to_merge_source.insert(src);
1824 }
1825 }
1826 for (auto p : not_ready_to_merge_target) {
1827 if (sent_ready_to_merge_source.count(p.second) == 0) {
1828 monc->send_mon_message(new MOSDPGReadyToMerge(
1829 p.second,
1830 {}, {}, 0, 0,
1831 false,
1832 osdmap->get_epoch()));
1833 sent_ready_to_merge_source.insert(p.second);
1834 }
1835 }
1836 for (auto src : ready_to_merge_source) {
1837 if (not_ready_to_merge_source.count(src.first) ||
1838 not_ready_to_merge_target.count(src.first.get_parent())) {
1839 continue;
1840 }
1841 auto p = ready_to_merge_target.find(src.first.get_parent());
1842 if (p != ready_to_merge_target.end() &&
1843 sent_ready_to_merge_source.count(src.first) == 0) {
1844 monc->send_mon_message(new MOSDPGReadyToMerge(
1845 src.first, // source pgid
1846 src.second, // src version
1847 std::get<0>(p->second), // target version
1848 std::get<1>(p->second), // PG's last_epoch_started
1849 std::get<2>(p->second), // PG's last_epoch_clean
1850 true,
1851 osdmap->get_epoch()));
1852 sent_ready_to_merge_source.insert(src.first);
1853 }
1854 }
1855 }
1856
1857 void OSDService::clear_ready_to_merge(PG *pg)
1858 {
1859 std::lock_guard l(merge_lock);
1860 dout(10) << __func__ << " " << pg->pg_id << dendl;
1861 ready_to_merge_source.erase(pg->pg_id.pgid);
1862 ready_to_merge_target.erase(pg->pg_id.pgid);
1863 not_ready_to_merge_source.erase(pg->pg_id.pgid);
1864 not_ready_to_merge_target.erase(pg->pg_id.pgid);
1865 sent_ready_to_merge_source.erase(pg->pg_id.pgid);
1866 }
1867
1868 void OSDService::clear_sent_ready_to_merge()
1869 {
1870 std::lock_guard l(merge_lock);
1871 sent_ready_to_merge_source.clear();
1872 }
1873
1874 void OSDService::prune_sent_ready_to_merge(const OSDMapRef& osdmap)
1875 {
1876 std::lock_guard l(merge_lock);
1877 auto i = sent_ready_to_merge_source.begin();
1878 while (i != sent_ready_to_merge_source.end()) {
1879 if (!osdmap->pg_exists(*i)) {
1880 dout(10) << __func__ << " " << *i << dendl;
1881 i = sent_ready_to_merge_source.erase(i);
1882 } else {
1883 ++i;
1884 }
1885 }
1886 }
1887
1888 // ---
1889
// Enqueue a PGRecovery work item for (epoch, pg) with the given number of
// reserved pushes.  Caller must hold recovery_lock (asserted).
void OSDService::_queue_for_recovery(
  std::pair<epoch_t, PGRef> p,
  uint64_t reserved_pushes)
{
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
        new PGRecovery(
          p.second->get_pgid(), p.first, reserved_pushes)),
      cct->_conf->osd_recovery_cost,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
      0,
      p.first));
}
1906
1907 // ====================================================================
1908 // OSD
1909
1910 #undef dout_prefix
1911 #define dout_prefix *_dout
1912
1913 // Commands shared between OSD's console and admin console:
namespace ceph {
namespace osd_cmds {

// Forward declaration of the shared "heap" command handler; the
// definition appears later in this file.
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

}} // namespace ceph::osd_cmds
1920
1921 int OSD::mkfs(CephContext *cct, ObjectStore *store, uuid_d fsid, int whoami, string osdspec_affinity)
1922 {
1923 int ret;
1924
1925 OSDSuperblock sb;
1926 bufferlist sbbl;
1927 ObjectStore::CollectionHandle ch;
1928
1929 // if we are fed a uuid for this osd, use it.
1930 store->set_fsid(cct->_conf->osd_uuid);
1931
1932 ret = store->mkfs();
1933 if (ret) {
1934 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1935 << cpp_strerror(ret) << dendl;
1936 goto free_store;
1937 }
1938
1939 store->set_cache_shards(1); // doesn't matter for mkfs!
1940
1941 ret = store->mount();
1942 if (ret) {
1943 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1944 << cpp_strerror(ret) << dendl;
1945 goto free_store;
1946 }
1947
1948 ch = store->open_collection(coll_t::meta());
1949 if (ch) {
1950 ret = store->read(ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1951 if (ret < 0) {
1952 derr << "OSD::mkfs: have meta collection but no superblock" << dendl;
1953 goto free_store;
1954 }
1955 /* if we already have superblock, check content of superblock */
1956 dout(0) << " have superblock" << dendl;
1957 auto p = sbbl.cbegin();
1958 decode(sb, p);
1959 if (whoami != sb.whoami) {
1960 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1961 << dendl;
1962 ret = -EINVAL;
1963 goto umount_store;
1964 }
1965 if (fsid != sb.cluster_fsid) {
1966 derr << "provided cluster fsid " << fsid
1967 << " != superblock's " << sb.cluster_fsid << dendl;
1968 ret = -EINVAL;
1969 goto umount_store;
1970 }
1971 } else {
1972 // create superblock
1973 sb.cluster_fsid = fsid;
1974 sb.osd_fsid = store->get_fsid();
1975 sb.whoami = whoami;
1976 sb.compat_features = get_osd_initial_compat_set();
1977
1978 bufferlist bl;
1979 encode(sb, bl);
1980
1981 ObjectStore::CollectionHandle ch = store->create_new_collection(
1982 coll_t::meta());
1983 ObjectStore::Transaction t;
1984 t.create_collection(coll_t::meta(), 0);
1985 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
1986 ret = store->queue_transaction(ch, std::move(t));
1987 if (ret) {
1988 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1989 << "queue_transaction returned " << cpp_strerror(ret) << dendl;
1990 goto umount_store;
1991 }
1992 ch->flush();
1993 }
1994
1995 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
1996 if (ret) {
1997 derr << "OSD::mkfs: failed to write fsid file: error "
1998 << cpp_strerror(ret) << dendl;
1999 goto umount_store;
2000 }
2001
2002 umount_store:
2003 if (ch) {
2004 ch.reset();
2005 }
2006 store->umount();
2007 free_store:
2008 delete store;
2009 return ret;
2010 }
2011
// Persist the store-level meta keys that identify this OSD: magic, id,
// cluster fsid, optional auth key (from the "key" or "keyfile" config),
// optional osdspec_affinity, and finally the "ready" marker.  Keys are
// written in this order on purpose — "ready" is written last so a
// partially-initialized store is never considered ready.  Returns 0 on
// success or the first negative errno encountered.
int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami, string& osdspec_affinity)
{
  char val[80];
  int r;

  snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
  r = store->write_meta("magic", val);
  if (r < 0)
    return r;

  snprintf(val, sizeof(val), "%d", whoami);
  r = store->write_meta("whoami", val);
  if (r < 0)
    return r;

  cluster_fsid.print(val);
  r = store->write_meta("ceph_fsid", val);
  if (r < 0)
    return r;

  // auth key: an inline "key" config wins over a "keyfile" path
  string key = cct->_conf.get_val<string>("key");
  if (key.size()) {
    r = store->write_meta("osd_key", key);
    if (r < 0)
      return r;
  } else {
    string keyfile = cct->_conf.get_val<string>("keyfile");
    if (!keyfile.empty()) {
      bufferlist keybl;
      string err;
      r = keybl.read_file(keyfile.c_str(), &err);
      if (r < 0) {
        derr << __func__ << " failed to read keyfile " << keyfile << ": "
             << err << ": " << cpp_strerror(r) << dendl;
        return r;
      }
      r = store->write_meta("osd_key", keybl.to_str());
      if (r < 0)
        return r;
    }
  }
  if (!osdspec_affinity.empty()) {
    r = store->write_meta("osdspec_affinity", osdspec_affinity.c_str());
    if (r < 0)
      return r;
  }

  // written last: marks the store fully initialized
  r = store->write_meta("ready", "ready");
  if (r < 0)
    return r;

  return 0;
}
2065
2066 int OSD::peek_meta(ObjectStore *store,
2067 std::string *magic,
2068 uuid_d *cluster_fsid,
2069 uuid_d *osd_fsid,
2070 int *whoami,
2071 ceph_release_t *require_osd_release)
2072 {
2073 string val;
2074
2075 int r = store->read_meta("magic", &val);
2076 if (r < 0)
2077 return r;
2078 *magic = val;
2079
2080 r = store->read_meta("whoami", &val);
2081 if (r < 0)
2082 return r;
2083 *whoami = atoi(val.c_str());
2084
2085 r = store->read_meta("ceph_fsid", &val);
2086 if (r < 0)
2087 return r;
2088 r = cluster_fsid->parse(val.c_str());
2089 if (!r)
2090 return -EINVAL;
2091
2092 r = store->read_meta("fsid", &val);
2093 if (r < 0) {
2094 *osd_fsid = uuid_d();
2095 } else {
2096 r = osd_fsid->parse(val.c_str());
2097 if (!r)
2098 return -EINVAL;
2099 }
2100
2101 r = store->read_meta("require_osd_release", &val);
2102 if (r >= 0) {
2103 *require_osd_release = ceph_release_from_name(val);
2104 }
2105
2106 return 0;
2107 }
2108
2109
2110 #undef dout_prefix
2111 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
2112
2113 // cons/des
2114
// OSD constructor: wires together the messengers, mon/mgr clients, op
// tracker, thread pools and the sharded op queue.  Heavy startup work
// (mounting the store, loading maps, starting threads) is deliberately
// NOT done here — see init()/pre_init().
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  tick_timer(cct, osd_lock),
  // second timer driven by its own lock so ticks need not contend on osd_lock
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  gss_ktfile_client(cct->_conf.get_val<std::string>("gss_ktab_client_file")),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger, &mc->monmap),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  m_osd_pg_epoch_max_lag_factor(cct->_conf.get_val<double>(
				  "osd_pg_epoch_max_lag_factor")),
  osd_compat(get_osd_compat_set()),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_shardedwq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  last_pg_create_epoch(0),
  boot_finisher(cct),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  service(this)
{

  // When a GSSAPI client keytab is configured, export it via the
  // environment so the Kerberos library picks it up.
  if (!gss_ktfile_client.empty()) {
    // Assert we can export environment variable
    /*
	The default client keytab is used, if it is present and readable,
	to automatically obtain initial credentials for GSSAPI client
	applications. The principal name of the first entry in the client
	keytab is used by default when obtaining initial credentials.
	1. The KRB5_CLIENT_KTNAME environment variable.
	2. The default_client_keytab_name profile variable in [libdefaults].
	3. The hardcoded default, DEFCKTNAME.
    */
    const int32_t set_result(setenv("KRB5_CLIENT_KTNAME",
				    gss_ktfile_client.c_str(), 1));
    ceph_assert(set_result == 0);
  }

  monc->set_messenger(client_messenger);
  // seed the op tracker with the current configuration values
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
  ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
#ifdef WITH_BLKIN
  // name this OSD's trace endpoint "osd.<id>"
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif

  // initialize shards
  num_shards = get_num_op_shards();
  for (uint32_t i = 0; i < num_shards; i++) {
    OSDShard *one_shard = new OSDShard(
      i,
      cct,
      this);
    shards.push_back(one_shard);
  }
}
2215
2216 OSD::~OSD()
2217 {
2218 while (!shards.empty()) {
2219 delete shards.back();
2220 shards.pop_back();
2221 }
2222 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2223 cct->get_perfcounters_collection()->remove(logger);
2224 delete recoverystate_perf;
2225 delete logger;
2226 delete store;
2227 }
2228
2229 double OSD::get_tick_interval() const
2230 {
2231 // vary +/- 5% to avoid scrub scheduling livelocks
2232 constexpr auto delta = 0.05;
2233 return (OSD_TICK_INTERVAL *
2234 ceph::util::generate_random_number(1.0 - delta, 1.0 + delta));
2235 }
2236
2237 void OSD::handle_signal(int signum)
2238 {
2239 ceph_assert(signum == SIGINT || signum == SIGTERM);
2240 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2241 shutdown();
2242 }
2243
2244 int OSD::pre_init()
2245 {
2246 std::lock_guard lock(osd_lock);
2247 if (is_stopping())
2248 return 0;
2249
2250 if (store->test_mount_in_use()) {
2251 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2252 << "currently in use. (Is ceph-osd already running?)" << dendl;
2253 return -EBUSY;
2254 }
2255
2256 cct->_conf.add_observer(this);
2257 return 0;
2258 }
2259
// Determine a NUMA node for this OSD and, when one is selected, pin all
// OSD threads to that node's CPU set.
//
// Automatic selection requires the objectstore, the public interface and
// the cluster interface to all report the same node AND
// osd_numa_auto_affinity to be enabled.  An explicitly configured
// osd_numa_node (>= 0) overrides the automatic result.  All failures are
// only logged; the function always returns 0.
int OSD::set_numa_affinity()
{
  // storage numa node
  int store_node = -1;
  store->get_numa_node(&store_node, nullptr, nullptr);
  if (store_node >= 0) {
    dout(1) << __func__ << " storage numa node " << store_node << dendl;
  }

  // check network numa node(s)
  int front_node = -1, back_node = -1;
  string front_iface = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  string back_iface = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());
  int r = get_iface_numa_node(front_iface, &front_node);
  if (r >= 0 && front_node >= 0) {
    dout(1) << __func__ << " public network " << front_iface << " numa node "
            << front_node << dendl;
    // the cluster interface is only probed once the public one resolved
    r = get_iface_numa_node(back_iface, &back_node);
    if (r >= 0 && back_node >= 0) {
      dout(1) << __func__ << " cluster network " << back_iface << " numa node "
              << back_node << dendl;
      if (front_node == back_node &&
	  front_node == store_node) {
	dout(1) << " objectstore and network numa nodes all match" << dendl;
	if (g_conf().get_val<bool>("osd_numa_auto_affinity")) {
	  numa_node = front_node;
	}
      } else if (front_node != back_node) {
	dout(1) << __func__ << " public and cluster network numa nodes do not match"
		<< dendl;
      } else {
	dout(1) << __func__ << " objectstore and network numa nodes do not match"
		<< dendl;
      }
    } else if (back_node == -2) {
      // -2 sentinel: the interface's ports sit on different numa nodes
      dout(1) << __func__ << " cluster network " << back_iface
	      << " ports numa nodes do not match" << dendl;
    } else {
      derr << __func__ << " unable to identify cluster interface '" << back_iface
	   << "' numa node: " << cpp_strerror(r) << dendl;
    }
  } else if (front_node == -2) {
    // -2 sentinel: the interface's ports sit on different numa nodes
    dout(1) << __func__ << " public network " << front_iface
	    << " ports numa nodes do not match" << dendl;
  } else {
    derr << __func__ << " unable to identify public interface '" << front_iface
	 << "' numa node: " << cpp_strerror(r) << dendl;
  }
  if (int node = g_conf().get_val<int64_t>("osd_numa_node"); node >= 0) {
    // this takes precedence over the automagic logic above
    numa_node = node;
  }
  if (numa_node >= 0) {
    int r = get_numa_node_cpu_set(numa_node, &numa_cpu_set_size, &numa_cpu_set);
    if (r < 0) {
      dout(1) << __func__ << " unable to determine numa node " << numa_node
	      << " CPUs" << dendl;
      numa_node = -1;
    } else {
      dout(1) << __func__ << " setting numa affinity to node " << numa_node
	      << " cpus "
	      << cpu_set_to_str_list(numa_cpu_set_size, &numa_cpu_set)
	      << dendl;
      r = set_cpu_affinity_all_threads(numa_cpu_set_size, &numa_cpu_set);
      if (r < 0) {
	// NOTE(review): the helper's return value is discarded in favor of
	// -errno — confirm set_cpu_affinity_all_threads leaves errno set
	// on failure, otherwise this may log a stale error.
	r = -errno;
	derr << __func__ << " failed to set numa affinity: " << cpp_strerror(r)
	     << dendl;
	numa_node = -1;
      }
    }
  } else {
    dout(1) << __func__ << " not setting numa affinity" << dendl;
  }
  return 0;
}
2340
2341 // asok
2342
2343 class OSDSocketHook : public AdminSocketHook {
2344 OSD *osd;
2345 public:
2346 explicit OSDSocketHook(OSD *o) : osd(o) {}
2347 int call(std::string_view prefix, const cmdmap_t& cmdmap,
2348 Formatter *f,
2349 std::ostream& ss,
2350 bufferlist& out) override {
2351 ceph_abort("should use async hook");
2352 }
2353 void call_async(
2354 std::string_view prefix,
2355 const cmdmap_t& cmdmap,
2356 Formatter *f,
2357 const bufferlist& inbl,
2358 std::function<void(int,const std::string&,bufferlist&)> on_finish) override {
2359 try {
2360 osd->asok_command(prefix, cmdmap, f, inbl, on_finish);
2361 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2362 bufferlist empty;
2363 on_finish(-EINVAL, e.what(), empty);
2364 }
2365 }
2366 };
2367
2368 std::set<int64_t> OSD::get_mapped_pools()
2369 {
2370 std::set<int64_t> pools;
2371 std::vector<spg_t> pgids;
2372 _get_pgids(&pgids);
2373 for (const auto &pgid : pgids) {
2374 pools.insert(pgid.pool());
2375 }
2376 return pools;
2377 }
2378
2379 void OSD::asok_command(
2380 std::string_view prefix, const cmdmap_t& cmdmap,
2381 Formatter *f,
2382 const bufferlist& inbl,
2383 std::function<void(int,const std::string&,bufferlist&)> on_finish)
2384 {
2385 int ret = 0;
2386 stringstream ss; // stderr error message stream
2387 bufferlist outbl; // if empty at end, we'll dump formatter as output
2388
2389 // --- PG commands are routed here to PG::do_command ---
2390 if (prefix == "pg" ||
2391 prefix == "query" ||
2392 prefix == "mark_unfound_lost" ||
2393 prefix == "list_unfound" ||
2394 prefix == "scrub" ||
2395 prefix == "deep_scrub"
2396 ) {
2397 string pgidstr;
2398 pg_t pgid;
2399 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
2400 ss << "no pgid specified";
2401 ret = -EINVAL;
2402 goto out;
2403 }
2404 if (!pgid.parse(pgidstr.c_str())) {
2405 ss << "couldn't parse pgid '" << pgidstr << "'";
2406 ret = -EINVAL;
2407 goto out;
2408 }
2409 spg_t pcand;
2410 PGRef pg;
2411 if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
2412 (pg = _lookup_lock_pg(pcand))) {
2413 if (pg->is_primary()) {
2414 cmdmap_t new_cmdmap = cmdmap;
2415 try {
2416 pg->do_command(prefix, new_cmdmap, inbl, on_finish);
2417 pg->unlock();
2418 return; // the pg handler calls on_finish directly
2419 } catch (const TOPNSPC::common::bad_cmd_get& e) {
2420 pg->unlock();
2421 ss << e.what();
2422 ret = -EINVAL;
2423 goto out;
2424 }
2425 } else {
2426 ss << "not primary for pgid " << pgid;
2427 // do not reply; they will get newer maps and realize they
2428 // need to resend.
2429 pg->unlock();
2430 ret = -EAGAIN;
2431 goto out;
2432 }
2433 } else {
2434 ss << "i don't have pgid " << pgid;
2435 ret = -ENOENT;
2436 }
2437 }
2438
2439 // --- OSD commands follow ---
2440
2441 else if (prefix == "status") {
2442 lock_guard l(osd_lock);
2443 f->open_object_section("status");
2444 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2445 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2446 f->dump_unsigned("whoami", superblock.whoami);
2447 f->dump_string("state", get_state_name(get_state()));
2448 f->dump_unsigned("oldest_map", superblock.oldest_map);
2449 f->dump_unsigned("newest_map", superblock.newest_map);
2450 f->dump_unsigned("num_pgs", num_pgs);
2451 f->close_section();
2452 } else if (prefix == "flush_journal") {
2453 store->flush_journal();
2454 } else if (prefix == "dump_ops_in_flight" ||
2455 prefix == "ops" ||
2456 prefix == "dump_blocked_ops" ||
2457 prefix == "dump_historic_ops" ||
2458 prefix == "dump_historic_ops_by_duration" ||
2459 prefix == "dump_historic_slow_ops") {
2460
2461 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2462 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2463 will start to track new ops received afterwards.";
2464
2465 set<string> filters;
2466 vector<string> filter_str;
2467 if (cmd_getval(cmdmap, "filterstr", filter_str)) {
2468 copy(filter_str.begin(), filter_str.end(),
2469 inserter(filters, filters.end()));
2470 }
2471
2472 if (prefix == "dump_ops_in_flight" ||
2473 prefix == "ops") {
2474 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2475 ss << error_str;
2476 ret = -EINVAL;
2477 goto out;
2478 }
2479 }
2480 if (prefix == "dump_blocked_ops") {
2481 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2482 ss << error_str;
2483 ret = -EINVAL;
2484 goto out;
2485 }
2486 }
2487 if (prefix == "dump_historic_ops") {
2488 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2489 ss << error_str;
2490 ret = -EINVAL;
2491 goto out;
2492 }
2493 }
2494 if (prefix == "dump_historic_ops_by_duration") {
2495 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2496 ss << error_str;
2497 ret = -EINVAL;
2498 goto out;
2499 }
2500 }
2501 if (prefix == "dump_historic_slow_ops") {
2502 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2503 ss << error_str;
2504 ret = -EINVAL;
2505 goto out;
2506 }
2507 }
2508 } else if (prefix == "dump_op_pq_state") {
2509 f->open_object_section("pq");
2510 op_shardedwq.dump(f);
2511 f->close_section();
2512 } else if (prefix == "dump_blacklist") {
2513 list<pair<entity_addr_t,utime_t> > bl;
2514 OSDMapRef curmap = service.get_osdmap();
2515
2516 f->open_array_section("blacklist");
2517 curmap->get_blacklist(&bl);
2518 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2519 it != bl.end(); ++it) {
2520 f->open_object_section("entry");
2521 f->open_object_section("entity_addr_t");
2522 it->first.dump(f);
2523 f->close_section(); //entity_addr_t
2524 it->second.localtime(f->dump_stream("expire_time"));
2525 f->close_section(); //entry
2526 }
2527 f->close_section(); //blacklist
2528 } else if (prefix == "dump_watchers") {
2529 list<obj_watch_item_t> watchers;
2530 // scan pg's
2531 vector<PGRef> pgs;
2532 _get_pgs(&pgs);
2533 for (auto& pg : pgs) {
2534 list<obj_watch_item_t> pg_watchers;
2535 pg->get_watchers(&pg_watchers);
2536 watchers.splice(watchers.end(), pg_watchers);
2537 }
2538
2539 f->open_array_section("watchers");
2540 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2541 it != watchers.end(); ++it) {
2542
2543 f->open_object_section("watch");
2544
2545 f->dump_string("namespace", it->obj.nspace);
2546 f->dump_string("object", it->obj.oid.name);
2547
2548 f->open_object_section("entity_name");
2549 it->wi.name.dump(f);
2550 f->close_section(); //entity_name_t
2551
2552 f->dump_unsigned("cookie", it->wi.cookie);
2553 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2554
2555 f->open_object_section("entity_addr_t");
2556 it->wi.addr.dump(f);
2557 f->close_section(); //entity_addr_t
2558
2559 f->close_section(); //watch
2560 }
2561
2562 f->close_section(); //watchers
2563 } else if (prefix == "dump_recovery_reservations") {
2564 f->open_object_section("reservations");
2565 f->open_object_section("local_reservations");
2566 service.local_reserver.dump(f);
2567 f->close_section();
2568 f->open_object_section("remote_reservations");
2569 service.remote_reserver.dump(f);
2570 f->close_section();
2571 f->close_section();
2572 } else if (prefix == "dump_scrub_reservations") {
2573 f->open_object_section("scrub_reservations");
2574 service.dump_scrub_reservations(f);
2575 f->close_section();
2576 } else if (prefix == "get_latest_osdmap") {
2577 get_latest_osdmap();
2578 } else if (prefix == "set_heap_property") {
2579 string property;
2580 int64_t value = 0;
2581 string error;
2582 bool success = false;
2583 if (!cmd_getval(cmdmap, "property", property)) {
2584 error = "unable to get property";
2585 success = false;
2586 } else if (!cmd_getval(cmdmap, "value", value)) {
2587 error = "unable to get value";
2588 success = false;
2589 } else if (value < 0) {
2590 error = "negative value not allowed";
2591 success = false;
2592 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2593 error = "invalid property";
2594 success = false;
2595 } else {
2596 success = true;
2597 }
2598 f->open_object_section("result");
2599 f->dump_string("error", error);
2600 f->dump_bool("success", success);
2601 f->close_section();
2602 } else if (prefix == "get_heap_property") {
2603 string property;
2604 size_t value = 0;
2605 string error;
2606 bool success = false;
2607 if (!cmd_getval(cmdmap, "property", property)) {
2608 error = "unable to get property";
2609 success = false;
2610 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2611 error = "invalid property";
2612 success = false;
2613 } else {
2614 success = true;
2615 }
2616 f->open_object_section("result");
2617 f->dump_string("error", error);
2618 f->dump_bool("success", success);
2619 f->dump_int("value", value);
2620 f->close_section();
2621 } else if (prefix == "dump_objectstore_kv_stats") {
2622 store->get_db_statistics(f);
2623 } else if (prefix == "dump_scrubs") {
2624 service.dumps_scrub(f);
2625 } else if (prefix == "calc_objectstore_db_histogram") {
2626 store->generate_db_histogram(f);
2627 } else if (prefix == "flush_store_cache") {
2628 store->flush_cache(&ss);
2629 } else if (prefix == "dump_pgstate_history") {
2630 f->open_object_section("pgstate_history");
2631 f->open_array_section("pgs");
2632 vector<PGRef> pgs;
2633 _get_pgs(&pgs);
2634 for (auto& pg : pgs) {
2635 f->open_object_section("pg");
2636 f->dump_stream("pg") << pg->pg_id;
2637 f->dump_string("currently", pg->get_current_state());
2638 pg->dump_pgstate_history(f);
2639 f->close_section();
2640 }
2641 f->close_section();
2642 f->close_section();
2643 } else if (prefix == "compact") {
2644 dout(1) << "triggering manual compaction" << dendl;
2645 auto start = ceph::coarse_mono_clock::now();
2646 store->compact();
2647 auto end = ceph::coarse_mono_clock::now();
2648 double duration = std::chrono::duration<double>(end-start).count();
2649 dout(1) << "finished manual compaction in "
2650 << duration
2651 << " seconds" << dendl;
2652 f->open_object_section("compact_result");
2653 f->dump_float("elapsed_time", duration);
2654 f->close_section();
2655 } else if (prefix == "get_mapped_pools") {
2656 f->open_array_section("mapped_pools");
2657 set<int64_t> poollist = get_mapped_pools();
2658 for (auto pool : poollist) {
2659 f->dump_int("pool_id", pool);
2660 }
2661 f->close_section();
2662 } else if (prefix == "smart") {
2663 string devid;
2664 cmd_getval(cmdmap, "devid", devid);
2665 ostringstream out;
2666 probe_smart(devid, out);
2667 outbl.append(out.str());
2668 } else if (prefix == "list_devices") {
2669 set<string> devnames;
2670 store->get_devices(&devnames);
2671 f->open_array_section("list_devices");
2672 for (auto dev : devnames) {
2673 if (dev.find("dm-") == 0) {
2674 continue;
2675 }
2676 string err;
2677 f->open_object_section("device");
2678 f->dump_string("device", "/dev/" + dev);
2679 f->dump_string("device_id", get_device_id(dev, &err));
2680 f->close_section();
2681 }
2682 f->close_section();
2683 } else if (prefix == "send_beacon") {
2684 lock_guard l(osd_lock);
2685 if (is_active()) {
2686 send_beacon(ceph::coarse_mono_clock::now());
2687 }
2688 }
2689
2690 else if (prefix == "cluster_log") {
2691 vector<string> msg;
2692 cmd_getval(cmdmap, "message", msg);
2693 if (msg.empty()) {
2694 ret = -EINVAL;
2695 ss << "ignoring empty log message";
2696 goto out;
2697 }
2698 string message = msg.front();
2699 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
2700 message += " " + *a;
2701 string lvl;
2702 cmd_getval(cmdmap, "level", lvl);
2703 clog_type level = string_to_clog_type(lvl);
2704 if (level < 0) {
2705 ret = -EINVAL;
2706 ss << "unknown level '" << lvl << "'";
2707 goto out;
2708 }
2709 clog->do_log(level, message);
2710 }
2711
2712 else if (prefix == "bench") {
2713 int64_t count;
2714 int64_t bsize;
2715 int64_t osize, onum;
2716 // default count 1G, size 4MB
2717 cmd_getval(cmdmap, "count", count, (int64_t)1 << 30);
2718 cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
2719 cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
2720 cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
2721
2722 uint32_t duration = cct->_conf->osd_bench_duration;
2723
2724 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
2725 // let us limit the block size because the next checks rely on it
2726 // having a sane value. If we allow any block size to be set things
2727 // can still go sideways.
2728 ss << "block 'size' values are capped at "
2729 << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
2730 << " a higher value, please adjust 'osd_bench_max_block_size'";
2731 ret = -EINVAL;
2732 goto out;
2733 } else if (bsize < (int64_t) (1 << 20)) {
2734 // entering the realm of small block sizes.
2735 // limit the count to a sane value, assuming a configurable amount of
2736 // IOPS and duration, so that the OSD doesn't get hung up on this,
2737 // preventing timeouts from going off
2738 int64_t max_count =
2739 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
2740 if (count > max_count) {
2741 ss << "'count' values greater than " << max_count
2742 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2743 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
2744 << " for " << duration << " seconds,"
2745 << " can cause ill effects on osd. "
2746 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
2747 << " value if you wish to use a higher 'count'.";
2748 ret = -EINVAL;
2749 goto out;
2750 }
2751 } else {
2752 // 1MB block sizes are big enough so that we get more stuff done.
2753 // However, to avoid the osd from getting hung on this and having
2754 // timers being triggered, we are going to limit the count assuming
2755 // a configurable throughput and duration.
2756 // NOTE: max_count is the total amount of bytes that we believe we
2757 // will be able to write during 'duration' for the given
2758 // throughput. The block size hardly impacts this unless it's
2759 // way too big. Given we already check how big the block size
2760 // is, it's safe to assume everything will check out.
2761 int64_t max_count =
2762 cct->_conf->osd_bench_large_size_max_throughput * duration;
2763 if (count > max_count) {
2764 ss << "'count' values greater than " << max_count
2765 << " for a block size of " << byte_u_t(bsize) << ", assuming "
2766 << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
2767 << " for " << duration << " seconds,"
2768 << " can cause ill effects on osd. "
2769 << " Please adjust 'osd_bench_large_size_max_throughput'"
2770 << " with a higher value if you wish to use a higher 'count'.";
2771 ret = -EINVAL;
2772 goto out;
2773 }
2774 }
2775
2776 if (osize && bsize > osize)
2777 bsize = osize;
2778
2779 dout(1) << " bench count " << count
2780 << " bsize " << byte_u_t(bsize) << dendl;
2781
2782 ObjectStore::Transaction cleanupt;
2783
2784 if (osize && onum) {
2785 bufferlist bl;
2786 bufferptr bp(osize);
2787 bp.zero();
2788 bl.push_back(std::move(bp));
2789 bl.rebuild_page_aligned();
2790 for (int i=0; i<onum; ++i) {
2791 char nm[30];
2792 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
2793 object_t oid(nm);
2794 hobject_t soid(sobject_t(oid, 0));
2795 ObjectStore::Transaction t;
2796 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
2797 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2798 cleanupt.remove(coll_t(), ghobject_t(soid));
2799 }
2800 }
2801
2802 bufferlist bl;
2803 bufferptr bp(bsize);
2804 bp.zero();
2805 bl.push_back(std::move(bp));
2806 bl.rebuild_page_aligned();
2807
2808 {
2809 C_SaferCond waiter;
2810 if (!service.meta_ch->flush_commit(&waiter)) {
2811 waiter.wait();
2812 }
2813 }
2814
2815 utime_t start = ceph_clock_now();
2816 for (int64_t pos = 0; pos < count; pos += bsize) {
2817 char nm[30];
2818 unsigned offset = 0;
2819 if (onum && osize) {
2820 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
2821 offset = rand() % (osize / bsize) * bsize;
2822 } else {
2823 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
2824 }
2825 object_t oid(nm);
2826 hobject_t soid(sobject_t(oid, 0));
2827 ObjectStore::Transaction t;
2828 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
2829 store->queue_transaction(service.meta_ch, std::move(t), NULL);
2830 if (!onum || !osize)
2831 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
2832 }
2833
2834 {
2835 C_SaferCond waiter;
2836 if (!service.meta_ch->flush_commit(&waiter)) {
2837 waiter.wait();
2838 }
2839 }
2840 utime_t end = ceph_clock_now();
2841
2842 // clean up
2843 store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
2844 {
2845 C_SaferCond waiter;
2846 if (!service.meta_ch->flush_commit(&waiter)) {
2847 waiter.wait();
2848 }
2849 }
2850
2851 double elapsed = end - start;
2852 double rate = count / elapsed;
2853 double iops = rate / bsize;
2854 f->open_object_section("osd_bench_results");
2855 f->dump_int("bytes_written", count);
2856 f->dump_int("blocksize", bsize);
2857 f->dump_float("elapsed_sec", elapsed);
2858 f->dump_float("bytes_per_sec", rate);
2859 f->dump_float("iops", iops);
2860 f->close_section();
2861 }
2862
2863 else if (prefix == "flush_pg_stats") {
2864 mgrc.send_pgstats();
2865 f->dump_unsigned("stat_seq", service.get_osd_stat_seq());
2866 }
2867
2868 else if (prefix == "heap") {
2869 ret = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2870 }
2871
2872 else if (prefix == "debug dump_missing") {
2873 f->open_array_section("pgs");
2874 vector<PGRef> pgs;
2875 _get_pgs(&pgs);
2876 for (auto& pg : pgs) {
2877 string s = stringify(pg->pg_id);
2878 f->open_array_section(s.c_str());
2879 pg->lock();
2880 pg->dump_missing(f);
2881 pg->unlock();
2882 f->close_section();
2883 }
2884 f->close_section();
2885 }
2886
2887 else if (prefix == "debug kick_recovery_wq") {
2888 int64_t delay;
2889 cmd_getval(cmdmap, "delay", delay);
2890 ostringstream oss;
2891 oss << delay;
2892 ret = cct->_conf.set_val("osd_recovery_delay_start", oss.str().c_str());
2893 if (ret != 0) {
2894 ss << "kick_recovery_wq: error setting "
2895 << "osd_recovery_delay_start to '" << delay << "': error "
2896 << ret;
2897 goto out;
2898 }
2899 cct->_conf.apply_changes(nullptr);
2900 ss << "kicking recovery queue. set osd_recovery_delay_start "
2901 << "to " << cct->_conf->osd_recovery_delay_start;
2902 }
2903
2904 else if (prefix == "cpu_profiler") {
2905 ostringstream ds;
2906 string arg;
2907 cmd_getval(cmdmap, "arg", arg);
2908 vector<string> argvec;
2909 get_str_vec(arg, argvec);
2910 cpu_profiler_handle_command(argvec, ds);
2911 outbl.append(ds.str());
2912 }
2913
2914 else if (prefix == "dump_pg_recovery_stats") {
2915 lock_guard l(osd_lock);
2916 pg_recovery_stats.dump_formatted(f);
2917 }
2918
2919 else if (prefix == "reset_pg_recovery_stats") {
2920 lock_guard l(osd_lock);
2921 pg_recovery_stats.reset();
2922 }
2923
2924 else if (prefix == "perf histogram dump") {
2925 std::string logger;
2926 std::string counter;
2927 cmd_getval(cmdmap, "logger", logger);
2928 cmd_getval(cmdmap, "counter", counter);
2929 cct->get_perfcounters_collection()->dump_formatted_histograms(
2930 f, false, logger, counter);
2931 }
2932
2933 else if (prefix == "cache drop") {
2934 lock_guard l(osd_lock);
2935 dout(20) << "clearing all caches" << dendl;
2936 // Clear the objectstore's cache - onode and buffer for Bluestore,
2937 // system's pagecache for Filestore
2938 ret = store->flush_cache(&ss);
2939 if (ret < 0) {
2940 ss << "Error flushing objectstore cache: " << cpp_strerror(ret);
2941 goto out;
2942 }
2943 // Clear the objectcontext cache (per PG)
2944 vector<PGRef> pgs;
2945 _get_pgs(&pgs);
2946 for (auto& pg: pgs) {
2947 pg->clear_cache();
2948 }
2949 }
2950
2951 else if (prefix == "cache status") {
2952 lock_guard l(osd_lock);
2953 int obj_ctx_count = 0;
2954 vector<PGRef> pgs;
2955 _get_pgs(&pgs);
2956 for (auto& pg: pgs) {
2957 obj_ctx_count += pg->get_cache_obj_count();
2958 }
2959 f->open_object_section("cache_status");
2960 f->dump_int("object_ctx", obj_ctx_count);
2961 store->dump_cache_stats(f);
2962 f->close_section();
2963 }
2964
2965 else if (prefix == "scrub_purged_snaps") {
2966 lock_guard l(osd_lock);
2967 scrub_purged_snaps();
2968 }
2969
2970 else if (prefix == "dump_osd_network") {
2971 lock_guard l(osd_lock);
2972 int64_t value = 0;
2973 if (!(cmd_getval(cmdmap, "value", value))) {
2974 // Convert milliseconds to microseconds
2975 value = static_cast<double>(g_conf().get_val<double>(
2976 "mon_warn_on_slow_ping_time")) * 1000;
2977 if (value == 0) {
2978 double ratio = g_conf().get_val<double>("mon_warn_on_slow_ping_ratio");
2979 value = g_conf().get_val<int64_t>("osd_heartbeat_grace");
2980 value *= 1000000 * ratio; // Seconds of grace to microseconds at ratio
2981 }
2982 } else {
2983 // Convert user input to microseconds
2984 value *= 1000;
2985 }
2986 if (value < 0) value = 0;
2987
2988 struct osd_ping_time_t {
2989 uint32_t pingtime;
2990 int to;
2991 bool back;
2992 std::array<uint32_t,3> times;
2993 std::array<uint32_t,3> min;
2994 std::array<uint32_t,3> max;
2995 uint32_t last;
2996 uint32_t last_update;
2997
2998 bool operator<(const osd_ping_time_t& rhs) const {
2999 if (pingtime < rhs.pingtime)
3000 return true;
3001 if (pingtime > rhs.pingtime)
3002 return false;
3003 if (to < rhs.to)
3004 return true;
3005 if (to > rhs.to)
3006 return false;
3007 return back;
3008 }
3009 };
3010
3011 set<osd_ping_time_t> sorted;
3012 // Get pingtimes under lock and not on the stack
3013 map<int, osd_stat_t::Interfaces> *pingtimes = new map<int, osd_stat_t::Interfaces>;
3014 service.get_hb_pingtime(pingtimes);
3015 for (auto j : *pingtimes) {
3016 if (j.second.last_update == 0)
3017 continue;
3018 osd_ping_time_t item;
3019 item.pingtime = std::max(j.second.back_pingtime[0], j.second.back_pingtime[1]);
3020 item.pingtime = std::max(item.pingtime, j.second.back_pingtime[2]);
3021 if (item.pingtime >= value) {
3022 item.to = j.first;
3023 item.times[0] = j.second.back_pingtime[0];
3024 item.times[1] = j.second.back_pingtime[1];
3025 item.times[2] = j.second.back_pingtime[2];
3026 item.min[0] = j.second.back_min[0];
3027 item.min[1] = j.second.back_min[1];
3028 item.min[2] = j.second.back_min[2];
3029 item.max[0] = j.second.back_max[0];
3030 item.max[1] = j.second.back_max[1];
3031 item.max[2] = j.second.back_max[2];
3032 item.last = j.second.back_last;
3033 item.back = true;
3034 item.last_update = j.second.last_update;
3035 sorted.emplace(item);
3036 }
3037 if (j.second.front_last == 0)
3038 continue;
3039 item.pingtime = std::max(j.second.front_pingtime[0], j.second.front_pingtime[1]);
3040 item.pingtime = std::max(item.pingtime, j.second.front_pingtime[2]);
3041 if (item.pingtime >= value) {
3042 item.to = j.first;
3043 item.times[0] = j.second.front_pingtime[0];
3044 item.times[1] = j.second.front_pingtime[1];
3045 item.times[2] = j.second.front_pingtime[2];
3046 item.min[0] = j.second.front_min[0];
3047 item.min[1] = j.second.front_min[1];
3048 item.min[2] = j.second.front_min[2];
3049 item.max[0] = j.second.front_max[0];
3050 item.max[1] = j.second.front_max[1];
3051 item.max[2] = j.second.front_max[2];
3052 item.last = j.second.front_last;
3053 item.last_update = j.second.last_update;
3054 item.back = false;
3055 sorted.emplace(item);
3056 }
3057 }
3058 delete pingtimes;
3059 //
3060 // Network ping times (1min 5min 15min)
3061 f->open_object_section("network_ping_times");
3062 f->dump_int("threshold", value / 1000);
3063 f->open_array_section("entries");
3064 for (auto &sitem : boost::adaptors::reverse(sorted)) {
3065 ceph_assert(sitem.pingtime >= value);
3066 f->open_object_section("entry");
3067
3068 const time_t lu(sitem.last_update);
3069 char buffer[26];
3070 string lustr(ctime_r(&lu, buffer));
3071 lustr.pop_back(); // Remove trailing \n
3072 auto stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3073 f->dump_string("last update", lustr);
3074 f->dump_bool("stale", ceph_clock_now().sec() - sitem.last_update > stale);
3075 f->dump_int("from osd", whoami);
3076 f->dump_int("to osd", sitem.to);
3077 f->dump_string("interface", (sitem.back ? "back" : "front"));
3078 f->open_object_section("average");
3079 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.times[0],3).c_str());
3080 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.times[1],3).c_str());
3081 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.times[2],3).c_str());
3082 f->close_section(); // average
3083 f->open_object_section("min");
3084 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3085 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3086 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3087 f->close_section(); // min
3088 f->open_object_section("max");
3089 f->dump_format_unquoted("1min", "%s", fixed_u_to_string(sitem.max[0],3).c_str());
3090 f->dump_format_unquoted("5min", "%s", fixed_u_to_string(sitem.max[1],3).c_str());
3091 f->dump_format_unquoted("15min", "%s", fixed_u_to_string(sitem.max[2],3).c_str());
3092 f->close_section(); // max
3093 f->dump_format_unquoted("last", "%s", fixed_u_to_string(sitem.last,3).c_str());
3094 f->close_section(); // entry
3095 }
3096 f->close_section(); // entries
3097 f->close_section(); // network_ping_times
3098 } else {
3099 ceph_abort_msg("broken asok registration");
3100 }
3101
3102 out:
3103 on_finish(ret, ss.str(), outbl);
3104 }
3105
3106 class TestOpsSocketHook : public AdminSocketHook {
3107 OSDService *service;
3108 ObjectStore *store;
3109 public:
3110 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
3111 int call(std::string_view command, const cmdmap_t& cmdmap,
3112 Formatter *f,
3113 std::ostream& errss,
3114 bufferlist& out) override {
3115 int r = 0;
3116 stringstream outss;
3117 try {
3118 test_ops(service, store, command, cmdmap, outss);
3119 out.append(outss);
3120 } catch (const TOPNSPC::common::bad_cmd_get& e) {
3121 errss << e.what();
3122 r = -EINVAL;
3123 }
3124 return r;
3125 }
3126 void test_ops(OSDService *service, ObjectStore *store,
3127 std::string_view command, const cmdmap_t& cmdmap, ostream &ss);
3128
3129 };
3130
3131 class OSD::C_Tick : public Context {
3132 OSD *osd;
3133 public:
3134 explicit C_Tick(OSD *o) : osd(o) {}
3135 void finish(int r) override {
3136 osd->tick();
3137 }
3138 };
3139
3140 class OSD::C_Tick_WithoutOSDLock : public Context {
3141 OSD *osd;
3142 public:
3143 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
3144 void finish(int r) override {
3145 osd->tick_without_osd_lock();
3146 }
3147 };
3148
3149 int OSD::enable_disable_fuse(bool stop)
3150 {
3151 #ifdef HAVE_LIBFUSE
3152 int r;
3153 string mntpath = cct->_conf->osd_data + "/fuse";
3154 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
3155 dout(1) << __func__ << " disabling" << dendl;
3156 fuse_store->stop();
3157 delete fuse_store;
3158 fuse_store = NULL;
3159 r = ::rmdir(mntpath.c_str());
3160 if (r < 0) {
3161 r = -errno;
3162 derr << __func__ << " failed to rmdir " << mntpath << ": "
3163 << cpp_strerror(r) << dendl;
3164 return r;
3165 }
3166 return 0;
3167 }
3168 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
3169 dout(1) << __func__ << " enabling" << dendl;
3170 r = ::mkdir(mntpath.c_str(), 0700);
3171 if (r < 0)
3172 r = -errno;
3173 if (r < 0 && r != -EEXIST) {
3174 derr << __func__ << " unable to create " << mntpath << ": "
3175 << cpp_strerror(r) << dendl;
3176 return r;
3177 }
3178 fuse_store = new FuseStore(store, mntpath);
3179 r = fuse_store->start();
3180 if (r < 0) {
3181 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
3182 delete fuse_store;
3183 fuse_store = NULL;
3184 return r;
3185 }
3186 }
3187 #endif // HAVE_LIBFUSE
3188 return 0;
3189 }
3190
3191 size_t OSD::get_num_cache_shards()
3192 {
3193 return cct->_conf.get_val<Option::size_t>("osd_num_cache_shards");
3194 }
3195
3196 int OSD::get_num_op_shards()
3197 {
3198 if (cct->_conf->osd_op_num_shards)
3199 return cct->_conf->osd_op_num_shards;
3200 if (store_is_rotational)
3201 return cct->_conf->osd_op_num_shards_hdd;
3202 else
3203 return cct->_conf->osd_op_num_shards_ssd;
3204 }
3205
3206 int OSD::get_num_op_threads()
3207 {
3208 if (cct->_conf->osd_op_num_threads_per_shard)
3209 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
3210 if (store_is_rotational)
3211 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
3212 else
3213 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
3214 }
3215
3216 float OSD::get_osd_recovery_sleep()
3217 {
3218 if (cct->_conf->osd_recovery_sleep)
3219 return cct->_conf->osd_recovery_sleep;
3220 if (!store_is_rotational && !journal_is_rotational)
3221 return cct->_conf->osd_recovery_sleep_ssd;
3222 else if (store_is_rotational && !journal_is_rotational)
3223 return cct->_conf.get_val<double>("osd_recovery_sleep_hybrid");
3224 else
3225 return cct->_conf->osd_recovery_sleep_hdd;
3226 }
3227
3228 float OSD::get_osd_delete_sleep()
3229 {
3230 float osd_delete_sleep = cct->_conf.get_val<double>("osd_delete_sleep");
3231 if (osd_delete_sleep > 0)
3232 return osd_delete_sleep;
3233 if (!store_is_rotational && !journal_is_rotational)
3234 return cct->_conf.get_val<double>("osd_delete_sleep_ssd");
3235 if (store_is_rotational && !journal_is_rotational)
3236 return cct->_conf.get_val<double>("osd_delete_sleep_hybrid");
3237 return cct->_conf.get_val<double>("osd_delete_sleep_hdd");
3238 }
3239
3240 int OSD::get_recovery_max_active()
3241 {
3242 if (cct->_conf->osd_recovery_max_active)
3243 return cct->_conf->osd_recovery_max_active;
3244 if (store_is_rotational)
3245 return cct->_conf->osd_recovery_max_active_hdd;
3246 else
3247 return cct->_conf->osd_recovery_max_active_ssd;
3248 }
3249
3250 float OSD::get_osd_snap_trim_sleep()
3251 {
3252 float osd_snap_trim_sleep = cct->_conf.get_val<double>("osd_snap_trim_sleep");
3253 if (osd_snap_trim_sleep > 0)
3254 return osd_snap_trim_sleep;
3255 if (!store_is_rotational && !journal_is_rotational)
3256 return cct->_conf.get_val<double>("osd_snap_trim_sleep_ssd");
3257 if (store_is_rotational && !journal_is_rotational)
3258 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hybrid");
3259 return cct->_conf.get_val<double>("osd_snap_trim_sleep_hdd");
3260 }
3261
/**
 * Bring a constructed OSD fully online.
 *
 * Startup sequence, in order:
 *  - start timers and the boot finisher, then mount the ObjectStore
 *  - sanity-check long object name handling, the on-disk superblock,
 *    and its compat feature set (upgrading the superblock if needed)
 *  - load the current OSDMap, all PGs, and prime pending splits/merges
 *  - wire messengers to auth/dispatchers and connect to mon and mgr
 *  - start heartbeats and tick timers, then begin the boot process
 *
 * Any failure before the monitor handshake jumps to the 'out' label,
 * which unmounts and deletes the store before returning the error.
 *
 * @return 0 on success (also when racing with shutdown), negative
 *         error code on failure.
 */
int OSD::init()
{
  OSDMapRef osdmap;
  CompatSet initial, diff;
  std::lock_guard lock(osd_lock);
  if (is_stopping())
    return 0;

  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.sleep_timer.init();

  boot_finisher.start();

  // remember the release recorded by the last mkfs/boot; read_meta is
  // best-effort (an absent key leaves val empty)
  {
    string val;
    store->read_meta("require_osd_release", &val);
    last_require_osd_release = ceph_release_from_name(val);
  }

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  ceph_assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_cache_shards());

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
          << dendl;

  // reconcile the optional FUSE mirror of the store with config
  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  service.meta_ch = store->open_collection(coll_t::meta());

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  // used by the wait_auth_rotating retry loop near the end of init
  int rotating_auth_attempts = 0;
  auto rotating_auth_timeout =
    g_conf().get_val<int64_t>("rotating_keys_bootstrap_timeout");

  // sanity check long object name handling
  {
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << " osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << " osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // refuse to run against a store whose feature set we don't implement
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  // the store must belong to the osd id we were started as
  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  startup_time = ceph::mono_clock::now();

  // load up "current" osdmap
  assert_warn(!get_osdmap());
  if (get_osdmap()) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);
  set_osdmap(osdmap);

  // make sure we don't have legacy pgs deleting
  {
    vector<coll_t> ls;
    int r = store->list_collections(ls);
    ceph_assert(r >= 0);
    for (auto c : ls) {
      spg_t pgid;
      if (c.is_pg(&pgid) &&
	  !osdmap->have_pg_pool(pgid.pool())) {
	ghobject_t oid = make_final_pool_info_oid(pgid.pool());
	if (!store->exists(service.meta_ch, oid)) {
	  derr << __func__ << " missing pg_pool_t for deleted pool "
	       << pgid.pool() << " for pg " << pgid
	       << "; please downgrade to luminous and allow "
	       << "pg deletion to complete before upgrading" << dendl;
	  ceph_abort();
	}
      }
    }
  }

  // merge our initial compat set into the superblock; if that adds
  // anything, persist it before doing any other work
  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // Are we adding SNAPMAPPER2?
    if (diff.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER2)) {
      dout(1) << __func__ << " upgrade snap_mapper (first start as octopus)"
	      << dendl;
      auto ch = service.meta_ch;
      auto hoid = make_snapmapper_oid();
      unsigned max = cct->_conf->osd_target_transaction_size;
      r = SnapMapper::convert_legacy(cct, store, ch, hoid, max);
      if (r < 0)
	goto out;
    }
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(service.meta_ch, OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }
  // likewise for the purged_snaps object
  if (!store->exists(service.meta_ch, OSD::make_purged_snaps_oid())) {
    dout(10) << "init creating/touching purged_snaps object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_purged_snaps_oid());
    r = store->queue_transaction(service.meta_ch, std::move(t));
    if (r < 0)
      goto out;
  }

  // class load failures are non-fatal here; classes can also load lazily
  if (cct->_conf->osd_open_classes_on_start) {
    int r = ClassHandler::get_instance().open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  check_osdmap_features();

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  for (auto& shard : shards) {
    std::lock_guard l(shard->osdmap_lock);
    shard->shard_osdmap = osdmap;
  }

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;

  create_logger();

  // prime osd stats
  {
    struct store_statfs_t stbuf;
    osd_alert_list_t alerts;
    int r = store->statfs(&stbuf, &alerts);
    ceph_assert(r == 0);
    service.set_statfs(stbuf, alerts);
  }

  // client_messenger auth_client is already set up by monc.
  for (auto m : { cluster_messenger,
	objecter_messenger,
	hb_front_client_messenger,
	hb_back_client_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger } ) {
    m->set_auth_client(monc);
  }
  for (auto m : { client_messenger,
	cluster_messenger,
	hb_front_server_messenger,
	hb_back_server_messenger }) {
    m->set_auth_server(monc);
  }
  monc->set_handle_authentication_dispatcher(this);

  // fetch service keys for mon, osd, and mgr interactions
  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
                      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  // hook the mgr client up to our pg stats and perf-metric queries
  mgrc.set_pgstats_cb([this](){ return collect_pg_stats(); });
  mgrc.set_perf_metric_query_cb(
    [this](const ConfigPayload &config_payload) {
        set_perf_queries(config_payload);
      },
      [this] {
        return get_perf_reports();
      });
  mgrc.init();

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  // i'm ready!
  client_messenger->add_dispatcher_tail(&mgrc);
  client_messenger->add_dispatcher_tail(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter.get());

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // prime pending splits/merges for every loaded PG whose map is behind
  // the current osdmap
  for (auto& shard : shards) {
    // put PGs in a temporary set because we may modify pg_slots
    // unordered_map below.
    set<PGRef> pgs;
    for (auto& i : shard->pg_slots) {
      PGRef pg = i.second->pg;
      if (!pg) {
	continue;
      }
      pgs.insert(pg);
    }
    for (auto pg : pgs) {
      std::scoped_lock l{*pg};
      set<pair<spg_t,epoch_t>> new_children;
      set<pair<spg_t,epoch_t>> merge_pgs;
      service.identify_splits_and_merges(pg->get_osdmap(), osdmap, pg->pg_id,
					 &new_children, &merge_pgs);
      if (!new_children.empty()) {
	for (auto shard : shards) {
	  shard->prime_splits(osdmap, &new_children);
	}
	// every split must have been claimed by some shard
	assert(new_children.empty());
      }
      if (!merge_pgs.empty()) {
	for (auto shard : shards) {
	  shard->prime_merges(osdmap, &merge_pgs);
	}
	// every merge must have been claimed by some shard
	assert(merge_pgs.empty());
      }
    }
  }

  osd_op_tp.start();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(get_tick_interval(),
			     new C_Tick(this));
  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
						new C_Tick_WithoutOSDLock(this));
  }

  // release osd_lock across the (potentially blocking) monitor
  // handshake; re-acquired below, after which we re-check is_stopping()
  osd_lock.unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
	 << dendl;
    exit(1);
  }

  while (monc->wait_auth_rotating(rotating_auth_timeout) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf()->max_rotating_auth_attempts) {
      derr << __func__ << " wait_auth_rotating timed out" << dendl;
      exit(1);
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
	 << cpp_strerror(r) << dendl;
    exit(1);
  }

  osd_lock.lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;

out:
  // error path: tear down the FUSE mirror (if any) and the store
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
3663
3664 void OSD::final_init()
3665 {
3666 AdminSocket *admin_socket = cct->get_admin_socket();
3667 asok_hook = new OSDSocketHook(this);
3668 int r = admin_socket->register_command("status", asok_hook,
3669 "high-level status of OSD");
3670 ceph_assert(r == 0);
3671 r = admin_socket->register_command("flush_journal",
3672 asok_hook,
3673 "flush the journal to permanent store");
3674 ceph_assert(r == 0);
3675 r = admin_socket->register_command("dump_ops_in_flight " \
3676 "name=filterstr,type=CephString,n=N,req=false",
3677 asok_hook,
3678 "show the ops currently in flight");
3679 ceph_assert(r == 0);
3680 r = admin_socket->register_command("ops " \
3681 "name=filterstr,type=CephString,n=N,req=false",
3682 asok_hook,
3683 "show the ops currently in flight");
3684 ceph_assert(r == 0);
3685 r = admin_socket->register_command("dump_blocked_ops " \
3686 "name=filterstr,type=CephString,n=N,req=false",
3687 asok_hook,
3688 "show the blocked ops currently in flight");
3689 ceph_assert(r == 0);
3690 r = admin_socket->register_command("dump_historic_ops " \
3691 "name=filterstr,type=CephString,n=N,req=false",
3692 asok_hook,
3693 "show recent ops");
3694 ceph_assert(r == 0);
3695 r = admin_socket->register_command("dump_historic_slow_ops " \
3696 "name=filterstr,type=CephString,n=N,req=false",
3697 asok_hook,
3698 "show slowest recent ops");
3699 ceph_assert(r == 0);
3700 r = admin_socket->register_command("dump_historic_ops_by_duration " \
3701 "name=filterstr,type=CephString,n=N,req=false",
3702 asok_hook,
3703 "show slowest recent ops, sorted by duration");
3704 ceph_assert(r == 0);
3705 r = admin_socket->register_command("dump_op_pq_state",
3706 asok_hook,
3707 "dump op priority queue state");
3708 ceph_assert(r == 0);
3709 r = admin_socket->register_command("dump_blacklist",
3710 asok_hook,
3711 "dump blacklisted clients and times");
3712 ceph_assert(r == 0);
3713 r = admin_socket->register_command("dump_watchers",
3714 asok_hook,
3715 "show clients which have active watches,"
3716 " and on which objects");
3717 ceph_assert(r == 0);
3718 r = admin_socket->register_command("dump_recovery_reservations",
3719 asok_hook,
3720 "show recovery reservations");
3721 ceph_assert(r == 0);
3722 r = admin_socket->register_command("dump_scrub_reservations",
3723 asok_hook,
3724 "show scrub reservations");
3725 ceph_assert(r == 0);
3726 r = admin_socket->register_command("get_latest_osdmap",
3727 asok_hook,
3728 "force osd to update the latest map from "
3729 "the mon");
3730 ceph_assert(r == 0);
3731
3732 r = admin_socket->register_command("set_heap_property " \
3733 "name=property,type=CephString " \
3734 "name=value,type=CephInt",
3735 asok_hook,
3736 "update malloc extension heap property");
3737 ceph_assert(r == 0);
3738
3739 r = admin_socket->register_command("get_heap_property " \
3740 "name=property,type=CephString",
3741 asok_hook,
3742 "get malloc extension heap property");
3743 ceph_assert(r == 0);
3744
3745 r = admin_socket->register_command("dump_objectstore_kv_stats",
3746 asok_hook,
3747 "print statistics of kvdb which used by bluestore");
3748 ceph_assert(r == 0);
3749
3750 r = admin_socket->register_command("dump_scrubs",
3751 asok_hook,
3752 "print scheduled scrubs");
3753 ceph_assert(r == 0);
3754
3755 r = admin_socket->register_command("calc_objectstore_db_histogram",
3756 asok_hook,
3757 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
3758 ceph_assert(r == 0);
3759
3760 r = admin_socket->register_command("flush_store_cache",
3761 asok_hook,
3762 "Flush bluestore internal cache");
3763 ceph_assert(r == 0);
3764 r = admin_socket->register_command("dump_pgstate_history",
3765 asok_hook,
3766 "show recent state history");
3767 ceph_assert(r == 0);
3768
3769 r = admin_socket->register_command("compact",
3770 asok_hook,
3771 "Commpact object store's omap."
3772 " WARNING: Compaction probably slows your requests");
3773 ceph_assert(r == 0);
3774
3775 r = admin_socket->register_command("get_mapped_pools",
3776 asok_hook,
3777 "dump pools whose PG(s) are mapped to this OSD.");
3778
3779 ceph_assert(r == 0);
3780
3781 r = admin_socket->register_command("smart name=devid,type=CephString,req=false",
3782 asok_hook,
3783 "probe OSD devices for SMART data.");
3784
3785 ceph_assert(r == 0);
3786
3787 r = admin_socket->register_command("list_devices",
3788 asok_hook,
3789 "list OSD devices.");
3790 r = admin_socket->register_command("send_beacon",
3791 asok_hook,
3792 "send OSD beacon to mon immediately");
3793
3794 r = admin_socket->register_command(
3795 "dump_osd_network name=value,type=CephInt,req=false", asok_hook,
3796 "Dump osd heartbeat network ping times");
3797 ceph_assert(r == 0);
3798
3799 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
3800 // Note: pools are CephString instead of CephPoolname because
3801 // these commands traditionally support both pool names and numbers
3802 r = admin_socket->register_command(
3803 "setomapval " \
3804 "name=pool,type=CephString " \
3805 "name=objname,type=CephObjectname " \
3806 "name=key,type=CephString "\
3807 "name=val,type=CephString",
3808 test_ops_hook,
3809 "set omap key");
3810 ceph_assert(r == 0);
3811 r = admin_socket->register_command(
3812 "rmomapkey " \
3813 "name=pool,type=CephString " \
3814 "name=objname,type=CephObjectname " \
3815 "name=key,type=CephString",
3816 test_ops_hook,
3817 "remove omap key");
3818 ceph_assert(r == 0);
3819 r = admin_socket->register_command(
3820 "setomapheader " \
3821 "name=pool,type=CephString " \
3822 "name=objname,type=CephObjectname " \
3823 "name=header,type=CephString",
3824 test_ops_hook,
3825 "set omap header");
3826 ceph_assert(r == 0);
3827
3828 r = admin_socket->register_command(
3829 "getomap " \
3830 "name=pool,type=CephString " \
3831 "name=objname,type=CephObjectname",
3832 test_ops_hook,
3833 "output entire object map");
3834 ceph_assert(r == 0);
3835
3836 r = admin_socket->register_command(
3837 "truncobj " \
3838 "name=pool,type=CephString " \
3839 "name=objname,type=CephObjectname " \
3840 "name=len,type=CephInt",
3841 test_ops_hook,
3842 "truncate object to length");
3843 ceph_assert(r == 0);
3844
3845 r = admin_socket->register_command(
3846 "injectdataerr " \
3847 "name=pool,type=CephString " \
3848 "name=objname,type=CephObjectname " \
3849 "name=shardid,type=CephInt,req=false,range=0|255",
3850 test_ops_hook,
3851 "inject data error to an object");
3852 ceph_assert(r == 0);
3853
3854 r = admin_socket->register_command(
3855 "injectmdataerr " \
3856 "name=pool,type=CephString " \
3857 "name=objname,type=CephObjectname " \
3858 "name=shardid,type=CephInt,req=false,range=0|255",
3859 test_ops_hook,
3860 "inject metadata error to an object");
3861 ceph_assert(r == 0);
3862 r = admin_socket->register_command(
3863 "set_recovery_delay " \
3864 "name=utime,type=CephInt,req=false",
3865 test_ops_hook,
3866 "Delay osd recovery by specified seconds");
3867 ceph_assert(r == 0);
3868 r = admin_socket->register_command(
3869 "injectfull " \
3870 "name=type,type=CephString,req=false " \
3871 "name=count,type=CephInt,req=false ",
3872 test_ops_hook,
3873 "Inject a full disk (optional count times)");
3874 ceph_assert(r == 0);
3875 r = admin_socket->register_command(
3876 "bench " \
3877 "name=count,type=CephInt,req=false " \
3878 "name=size,type=CephInt,req=false " \
3879 "name=object_size,type=CephInt,req=false " \
3880 "name=object_num,type=CephInt,req=false ",
3881 asok_hook,
3882 "OSD benchmark: write <count> <size>-byte objects(with <obj_size> <obj_num>), " \
3883 "(default count=1G default size=4MB). Results in log.");
3884 ceph_assert(r == 0);
3885 r = admin_socket->register_command(
3886 "cluster_log " \
3887 "name=level,type=CephChoices,strings=error,warning,info,debug " \
3888 "name=message,type=CephString,n=N",
3889 asok_hook,
3890 "log a message to the cluster log");
3891 ceph_assert(r == 0);
3892 r = admin_socket->register_command(
3893 "flush_pg_stats",
3894 asok_hook,
3895 "flush pg stats");
3896 ceph_assert(r == 0);
3897 r = admin_socket->register_command(
3898 "heap " \
3899 "name=heapcmd,type=CephChoices,strings=" \
3900 "dump|start_profiler|stop_profiler|release|get_release_rate|set_release_rate|stats " \
3901 "name=value,type=CephString,req=false",
3902 asok_hook,
3903 "show heap usage info (available only if compiled with tcmalloc)");
3904 ceph_assert(r == 0);
3905 r = admin_socket->register_command(
3906 "debug dump_missing " \
3907 "name=filename,type=CephFilepath",
3908 asok_hook,
3909 "dump missing objects to a named file");
3910 ceph_assert(r == 0);
3911 r = admin_socket->register_command(
3912 "debug kick_recovery_wq " \
3913 "name=delay,type=CephInt,range=0",
3914 asok_hook,
3915 "set osd_recovery_delay_start to <val>");
3916 ceph_assert(r == 0);
3917 r = admin_socket->register_command(
3918 "cpu_profiler " \
3919 "name=arg,type=CephChoices,strings=status|flush",
3920 asok_hook,
3921 "run cpu profiling on daemon");
3922 ceph_assert(r == 0);
3923 r = admin_socket->register_command(
3924 "dump_pg_recovery_stats",
3925 asok_hook,
3926 "dump pg recovery statistics");
3927 ceph_assert(r == 0);
3928 r = admin_socket->register_command(
3929 "reset_pg_recovery_stats",
3930 asok_hook,
3931 "reset pg recovery statistics");
3932 ceph_assert(r == 0);
3933 r = admin_socket->register_command(
3934 "cache drop",
3935 asok_hook,
3936 "Drop all OSD caches");
3937 ceph_assert(r == 0);
3938 r = admin_socket->register_command(
3939 "cache status",
3940 asok_hook,
3941 "Get OSD caches statistics");
3942 ceph_assert(r == 0);
3943 r = admin_socket->register_command(
3944 "scrub_purged_snaps",
3945 asok_hook,
3946 "Scrub purged_snaps vs snapmapper index");
3947 ceph_assert(r == 0);
3948
3949 // -- pg commands --
3950 // old form: ceph pg <pgid> command ...
3951 r = admin_socket->register_command(
3952 "pg " \
3953 "name=pgid,type=CephPgid " \
3954 "name=cmd,type=CephChoices,strings=query",
3955 asok_hook,
3956 "");
3957 ceph_assert(r == 0);
3958 r = admin_socket->register_command(
3959 "pg " \
3960 "name=pgid,type=CephPgid " \
3961 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
3962 "name=mulcmd,type=CephChoices,strings=revert|delete",
3963 asok_hook,
3964 "");
3965 ceph_assert(r == 0);
3966 r = admin_socket->register_command(
3967 "pg " \
3968 "name=pgid,type=CephPgid " \
3969 "name=cmd,type=CephChoices,strings=list_unfound " \
3970 "name=offset,type=CephString,req=false",
3971 asok_hook,
3972 "");
3973 ceph_assert(r == 0);
3974 r = admin_socket->register_command(
3975 "pg " \
3976 "name=pgid,type=CephPgid " \
3977 "name=cmd,type=CephChoices,strings=scrub " \
3978 "name=time,type=CephInt,req=false",
3979 asok_hook,
3980 "");
3981 ceph_assert(r == 0);
3982 r = admin_socket->register_command(
3983 "pg " \
3984 "name=pgid,type=CephPgid " \
3985 "name=cmd,type=CephChoices,strings=deep_scrub " \
3986 "name=time,type=CephInt,req=false",
3987 asok_hook,
3988 "");
3989 ceph_assert(r == 0);
3990 // new form: tell <pgid> <cmd> for both cli and rest
3991 r = admin_socket->register_command(
3992 "query",
3993 asok_hook,
3994 "show details of a specific pg");
3995 ceph_assert(r == 0);
3996 r = admin_socket->register_command(
3997 "mark_unfound_lost " \
3998 "name=pgid,type=CephPgid,req=false " \
3999 "name=mulcmd,type=CephChoices,strings=revert|delete",
4000 asok_hook,
4001 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available");
4002 ceph_assert(r == 0);
4003 r = admin_socket->register_command(
4004 "list_unfound " \
4005 "name=pgid,type=CephPgid,req=false " \
4006 "name=offset,type=CephString,req=false",
4007 asok_hook,
4008 "list unfound objects on this pg, perhaps starting at an offset given in JSON");
4009 ceph_assert(r == 0);
4010 r = admin_socket->register_command(
4011 "scrub " \
4012 "name=pgid,type=CephPgid,req=false " \
4013 "name=time,type=CephInt,req=false",
4014 asok_hook,
4015 "Trigger a scheduled scrub ");
4016 ceph_assert(r == 0);
4017 r = admin_socket->register_command(
4018 "deep_scrub " \
4019 "name=pgid,type=CephPgid,req=false " \
4020 "name=time,type=CephInt,req=false",
4021 asok_hook,
4022 "Trigger a scheduled deep scrub ");
4023 ceph_assert(r == 0);
4024 }
4025
4026 void OSD::create_logger()
4027 {
4028 dout(10) << "create_logger" << dendl;
4029
4030 logger = build_osd_logger(cct);
4031 cct->get_perfcounters_collection()->add(logger);
4032 }
4033
4034 void OSD::create_recoverystate_perf()
4035 {
4036 dout(10) << "create_recoverystate_perf" << dendl;
4037
4038 recoverystate_perf = build_recoverystate_perf(cct);
4039 cct->get_perfcounters_collection()->add(recoverystate_perf);
4040 }
4041
// Orderly OSD teardown.  Unless osd_fast_shutdown is set (in which case we
// just flush the log and _exit(0)), this stops new work, shuts down PGs,
// heartbeats, thread pools and messengers, records a clean unmount epoch in
// the superblock, and unmounts the object store.  Returns the result of
// queueing the superblock write (0 on success).  The ordering of steps
// below is deliberate; do not reorder casually.
int OSD::shutdown()
{
  // Fast path: skip the orderly teardown entirely and exit the process.
  if (cct->_conf->osd_fast_shutdown) {
    derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
    if (cct->_conf->osd_fast_shutdown_notify_mon)
      service.prepare_to_stop();
    cct->_log->flush();
    _exit(0);
  }

  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.lock();
  if (is_stopping()) {
    // another thread won the race to shut down
    osd_lock.unlock();
    return 0;
  }
  dout(0) << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging: optionally crank up logging for the teardown itself
  if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
    cct->_conf.set_val("debug_osd", "100");
    cct->_conf.set_val("debug_journal", "100");
    cct->_conf.set_val("debug_filestore", "100");
    cct->_conf.set_val("debug_bluestore", "100");
    cct->_conf.set_val("debug_ms", "100");
    cct->_conf.apply_changes(nullptr);
  }

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs.  this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto pg : pgs) {
      pg->shutdown();
    }
  }

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
    waiting_for_osdmap.clear();
  }

  // unregister commands
  cct->get_admin_socket()->unregister_commands(asok_hook);
  delete asok_hook;
  asok_hook = NULL;

  cct->get_admin_socket()->unregister_commands(test_ops_hook);
  delete test_ops_hook;
  test_ops_hook = NULL;

  osd_lock.unlock();

  // stop the heartbeat thread before tearing its connections down
  {
    std::lock_guard l{heartbeat_lock};
    heartbeat_stop = true;
    heartbeat_cond.notify_all();
    heartbeat_peers.clear();
  }
  heartbeat_thread.join();

  hb_back_server_messenger->mark_down_all();
  hb_front_server_messenger->mark_down_all();
  hb_front_client_messenger->mark_down_all();
  hb_back_client_messenger->mark_down_all();

  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  boot_finisher.wait_for_empty();

  osd_lock.lock();

  boot_finisher.stop();
  reset_heartbeat_peers(true);

  tick_timer.shutdown();

  {
    std::lock_guard l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch so a future boot knows the shutdown was clean
  dout(10) << "noting clean unmount in epoch " << get_osdmap_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = get_osdmap_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  int r = store->queue_transaction(service.meta_ch, std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
         << cpp_strerror(r) << dendl;
  }


  service.shutdown_reserver();

  // Remove PGs: detach them from the shards and drop our references.
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  while (true) {
    vector<PGRef> pgs;
    _get_pgs(&pgs, true);  // clear_too: also detach from the shard slots
    if (pgs.empty()) {
      break;
    }
    for (auto& pg : pgs) {
      if (pg->is_deleted()) {
        continue;
      }
      dout(20) << " kicking pg " << pg << dendl;
      pg->lock();
      if (pg->get_num_ref() != 1) {
        // someone still holds a reference; complain (and optionally abort)
        derr << "pgid " << pg->get_pgid() << " has ref count of "
             << pg->get_num_ref() << dendl;
#ifdef PG_DEBUG_REFS
        pg->dump_live_ids();
#endif
        if (cct->_conf->osd_shutdown_pgref_assert) {
          ceph_abort();
        }
      }
      pg->ch.reset();
      pg->unlock();
    }
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif

  // remove_observer may call back into us; drop osd_lock around it
  osd_lock.unlock();
  cct->_conf.remove_observer(this);
  osd_lock.lock();

  service.meta_ch.reset();

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  monc->shutdown();
  osd_lock.unlock();
  // drop our osdmap references (global and per-shard)
  {
    std::unique_lock l{map_lock};
    set_osdmap(OSDMapRef());
  }
  for (auto s : shards) {
    std::lock_guard l(s->osdmap_lock);
    s->shard_osdmap = OSDMapRef();
  }
  service.shutdown();

  std::lock_guard lock(osd_lock);
  store->umount();
  delete store;
  store = nullptr;
  dout(10) << "Store synced" << dendl;

  op_tracker.on_shutdown();

  ClassHandler::get_instance().shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  // r is the superblock-write result from above
  return r;
}
4237
4238 int OSD::mon_cmd_maybe_osd_create(string &cmd)
4239 {
4240 bool created = false;
4241 while (true) {
4242 dout(10) << __func__ << " cmd: " << cmd << dendl;
4243 vector<string> vcmd{cmd};
4244 bufferlist inbl;
4245 C_SaferCond w;
4246 string outs;
4247 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
4248 int r = w.wait();
4249 if (r < 0) {
4250 if (r == -ENOENT && !created) {
4251 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
4252 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
4253 vector<string> vnewcmd{newcmd};
4254 bufferlist inbl;
4255 C_SaferCond w;
4256 string outs;
4257 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
4258 int r = w.wait();
4259 if (r < 0) {
4260 derr << __func__ << " fail: osd does not exist and created failed: "
4261 << cpp_strerror(r) << dendl;
4262 return r;
4263 }
4264 created = true;
4265 continue;
4266 }
4267 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
4268 return r;
4269 }
4270 break;
4271 }
4272
4273 return 0;
4274 }
4275
4276 int OSD::update_crush_location()
4277 {
4278 if (!cct->_conf->osd_crush_update_on_start) {
4279 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
4280 return 0;
4281 }
4282
4283 char weight[32];
4284 if (cct->_conf->osd_crush_initial_weight >= 0) {
4285 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
4286 } else {
4287 struct store_statfs_t st;
4288 osd_alert_list_t alerts;
4289 int r = store->statfs(&st, &alerts);
4290 if (r < 0) {
4291 derr << "statfs: " << cpp_strerror(r) << dendl;
4292 return r;
4293 }
4294 snprintf(weight, sizeof(weight), "%.4lf",
4295 std::max(.00001,
4296 double(st.total) /
4297 double(1ull << 40 /* TB */)));
4298 }
4299
4300 dout(10) << __func__ << " crush location is " << cct->crush_location << dendl;
4301
4302 string cmd =
4303 string("{\"prefix\": \"osd crush create-or-move\", ") +
4304 string("\"id\": ") + stringify(whoami) + ", " +
4305 string("\"weight\":") + weight + ", " +
4306 string("\"args\": [") + stringify(cct->crush_location) + "]}";
4307 return mon_cmd_maybe_osd_create(cmd);
4308 }
4309
4310 int OSD::update_crush_device_class()
4311 {
4312 if (!cct->_conf->osd_class_update_on_start) {
4313 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
4314 return 0;
4315 }
4316
4317 string device_class;
4318 int r = store->read_meta("crush_device_class", &device_class);
4319 if (r < 0 || device_class.empty()) {
4320 device_class = store->get_default_device_class();
4321 }
4322
4323 if (device_class.empty()) {
4324 dout(20) << __func__ << " no device class stored locally" << dendl;
4325 return 0;
4326 }
4327
4328 string cmd =
4329 string("{\"prefix\": \"osd crush set-device-class\", ") +
4330 string("\"class\": \"") + device_class + string("\", ") +
4331 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
4332
4333 r = mon_cmd_maybe_osd_create(cmd);
4334 if (r == -EBUSY) {
4335 // good, already bound to a device-class
4336 return 0;
4337 } else {
4338 return r;
4339 }
4340 }
4341
4342 void OSD::write_superblock(ObjectStore::Transaction& t)
4343 {
4344 dout(10) << "write_superblock " << superblock << dendl;
4345
4346 //hack: at minimum it's using the baseline feature set
4347 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
4348 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4349
4350 bufferlist bl;
4351 encode(superblock, bl);
4352 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
4353 }
4354
4355 int OSD::read_superblock()
4356 {
4357 bufferlist bl;
4358 int r = store->read(service.meta_ch, OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
4359 if (r < 0)
4360 return r;
4361
4362 auto p = bl.cbegin();
4363 decode(superblock, p);
4364
4365 dout(10) << "read_superblock " << superblock << dendl;
4366
4367 return 0;
4368 }
4369
4370 void OSD::clear_temp_objects()
4371 {
4372 dout(10) << __func__ << dendl;
4373 vector<coll_t> ls;
4374 store->list_collections(ls);
4375 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
4376 spg_t pgid;
4377 if (!p->is_pg(&pgid))
4378 continue;
4379
4380 // list temp objects
4381 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
4382
4383 vector<ghobject_t> temps;
4384 ghobject_t next;
4385 while (1) {
4386 vector<ghobject_t> objects;
4387 auto ch = store->open_collection(*p);
4388 ceph_assert(ch);
4389 store->collection_list(ch, next, ghobject_t::get_max(),
4390 store->get_ideal_list_max(),
4391 &objects, &next);
4392 if (objects.empty())
4393 break;
4394 vector<ghobject_t>::iterator q;
4395 for (q = objects.begin(); q != objects.end(); ++q) {
4396 // Hammer set pool for temps to -1, so check for clean-up
4397 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
4398 temps.push_back(*q);
4399 } else {
4400 break;
4401 }
4402 }
4403 // If we saw a non-temp object and hit the break above we can
4404 // break out of the while loop too.
4405 if (q != objects.end())
4406 break;
4407 }
4408 if (!temps.empty()) {
4409 ObjectStore::Transaction t;
4410 int removed = 0;
4411 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
4412 dout(20) << " removing " << *p << " object " << *q << dendl;
4413 t.remove(*p, *q);
4414 if (++removed > cct->_conf->osd_target_transaction_size) {
4415 store->queue_transaction(service.meta_ch, std::move(t));
4416 t = ObjectStore::Transaction();
4417 removed = 0;
4418 }
4419 }
4420 if (removed) {
4421 store->queue_transaction(service.meta_ch, std::move(t));
4422 }
4423 }
4424 }
4425 }
4426
// Delete every object in collection `tmp` — unregistering each from the
// snap mapper as it goes — then remove the collection itself.  Work is
// batched into transactions of at most osd_target_transaction_size objects,
// and the function blocks until the final transaction commits.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  // snap mapper entries live in the meta collection, keyed per shard
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  ObjectStore::CollectionHandle ch = store->open_collection(tmp);
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  ghobject_t next;
  int max = cct->_conf->osd_target_transaction_size;
  vector<ghobject_t> objects;
  objects.reserve(max);
  while (true) {
    objects.clear();
    // resume listing from `next`, at most `max` objects per pass
    store->collection_list(ch, next, ghobject_t::get_max(),
			   max, &objects, &next);
    generic_dout(10) << __func__ << " " << objects << dendl;
    if (objects.empty())
      break;
    for (auto& p: objects) {
      OSDriver::OSTransaction _t(driver.get_transaction(&t));
      // -ENOENT is tolerated: the object may have had no snap mapping
      int r = mapper.remove_oid(p.hobj, &_t);
      if (r != 0 && r != -ENOENT)
	ceph_abort();
      t.remove(tmp, p);
    }
    // flush this batch before listing the next one
    int r = store->queue_transaction(ch, std::move(t));
    ceph_assert(r == 0);
    t = ObjectStore::Transaction();
  }
  // all objects gone; drop the (now empty) collection itself
  t.remove_collection(tmp);
  int r = store->queue_transaction(ch, std::move(t));
  ceph_assert(r == 0);

  // wait for the final transaction to commit before returning
  C_SaferCond waiter;
  if (!ch->flush_commit(&waiter)) {
    waiter.wait();
  }
}
4471
4472
4473 // ======================================================
4474 // PG's
4475
4476 PG* OSD::_make_pg(
4477 OSDMapRef createmap,
4478 spg_t pgid)
4479 {
4480 dout(10) << __func__ << " " << pgid << dendl;
4481 pg_pool_t pi;
4482 map<string,string> ec_profile;
4483 string name;
4484 if (createmap->have_pg_pool(pgid.pool())) {
4485 pi = *createmap->get_pg_pool(pgid.pool());
4486 name = createmap->get_pool_name(pgid.pool());
4487 if (pi.is_erasure()) {
4488 ec_profile = createmap->get_erasure_code_profile(pi.erasure_code_profile);
4489 }
4490 } else {
4491 // pool was deleted; grab final pg_pool_t off disk.
4492 ghobject_t oid = make_final_pool_info_oid(pgid.pool());
4493 bufferlist bl;
4494 int r = store->read(service.meta_ch, oid, 0, 0, bl);
4495 if (r < 0) {
4496 derr << __func__ << " missing pool " << pgid.pool() << " tombstone"
4497 << dendl;
4498 return nullptr;
4499 }
4500 ceph_assert(r >= 0);
4501 auto p = bl.cbegin();
4502 decode(pi, p);
4503 decode(name, p);
4504 if (p.end()) { // dev release v13.0.2 did not include ec_profile
4505 derr << __func__ << " missing ec_profile from pool " << pgid.pool()
4506 << " tombstone" << dendl;
4507 return nullptr;
4508 }
4509 decode(ec_profile, p);
4510 }
4511 PGPool pool(cct, createmap, pgid.pool(), pi, name);
4512 PG *pg;
4513 if (pi.type == pg_pool_t::TYPE_REPLICATED ||
4514 pi.type == pg_pool_t::TYPE_ERASURE)
4515 pg = new PrimaryLogPG(&service, createmap, pool, ec_profile, pgid);
4516 else
4517 ceph_abort();
4518 return pg;
4519 }
4520
4521 void OSD::_get_pgs(vector<PGRef> *v, bool clear_too)
4522 {
4523 v->clear();
4524 v->reserve(get_num_pgs());
4525 for (auto& s : shards) {
4526 std::lock_guard l(s->shard_lock);
4527 for (auto& j : s->pg_slots) {
4528 if (j.second->pg &&
4529 !j.second->pg->is_deleted()) {
4530 v->push_back(j.second->pg);
4531 if (clear_too) {
4532 s->_detach_pg(j.second.get());
4533 }
4534 }
4535 }
4536 }
4537 }
4538
4539 void OSD::_get_pgids(vector<spg_t> *v)
4540 {
4541 v->clear();
4542 v->reserve(get_num_pgs());
4543 for (auto& s : shards) {
4544 std::lock_guard l(s->shard_lock);
4545 for (auto& j : s->pg_slots) {
4546 if (j.second->pg &&
4547 !j.second->pg->is_deleted()) {
4548 v->push_back(j.first);
4549 }
4550 }
4551 }
4552 }
4553
4554 void OSD::register_pg(PGRef pg)
4555 {
4556 spg_t pgid = pg->get_pgid();
4557 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4558 auto sdata = shards[shard_index];
4559 std::lock_guard l(sdata->shard_lock);
4560 auto r = sdata->pg_slots.emplace(pgid, make_unique<OSDShardPGSlot>());
4561 ceph_assert(r.second);
4562 auto *slot = r.first->second.get();
4563 dout(20) << __func__ << " " << pgid << " " << pg << dendl;
4564 sdata->_attach_pg(slot, pg.get());
4565 }
4566
4567 bool OSD::try_finish_pg_delete(PG *pg, unsigned old_pg_num)
4568 {
4569 auto sdata = pg->osd_shard;
4570 ceph_assert(sdata);
4571 {
4572 std::lock_guard l(sdata->shard_lock);
4573 auto p = sdata->pg_slots.find(pg->pg_id);
4574 if (p == sdata->pg_slots.end() ||
4575 !p->second->pg) {
4576 dout(20) << __func__ << " " << pg->pg_id << " not found" << dendl;
4577 return false;
4578 }
4579 if (p->second->waiting_for_merge_epoch) {
4580 dout(20) << __func__ << " " << pg->pg_id << " waiting for merge" << dendl;
4581 return false;
4582 }
4583 dout(20) << __func__ << " " << pg->pg_id << " " << pg << dendl;
4584 sdata->_detach_pg(p->second.get());
4585 }
4586
4587 for (auto shard : shards) {
4588 shard->unprime_split_children(pg->pg_id, old_pg_num);
4589 }
4590
4591 // update pg count now since we might not get an osdmap any time soon.
4592 if (pg->is_primary())
4593 service.logger->dec(l_osd_pg_primary);
4594 else if (pg->is_nonprimary())
4595 service.logger->dec(l_osd_pg_replica); // misnomver
4596 else
4597 service.logger->dec(l_osd_pg_stray);
4598
4599 return true;
4600 }
4601
4602 PGRef OSD::_lookup_pg(spg_t pgid)
4603 {
4604 uint32_t shard_index = pgid.hash_to_shard(num_shards);
4605 auto sdata = shards[shard_index];
4606 std::lock_guard l(sdata->shard_lock);
4607 auto p = sdata->pg_slots.find(pgid);
4608 if (p == sdata->pg_slots.end()) {
4609 return nullptr;
4610 }
4611 return p->second->pg;
4612 }
4613
4614 PGRef OSD::_lookup_lock_pg(spg_t pgid)
4615 {
4616 PGRef pg = _lookup_pg(pgid);
4617 if (!pg) {
4618 return nullptr;
4619 }
4620 pg->lock();
4621 if (!pg->is_deleted()) {
4622 return pg;
4623 }
4624 pg->unlock();
4625 return nullptr;
4626 }
4627
// Public wrapper around _lookup_lock_pg(): returns the PG with its lock
// held, or nullptr if it does not exist (or has been deleted).
PGRef OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
4632
// Startup: scan the object store and instantiate an in-memory PG for every
// surviving pg collection.  Also loads pg_num_history from the meta
// collection, and removes collections that are temporary, flagged for
// removal, or whose pg turns out not to exist (dne).
void OSD::load_pgs()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(0) << "load_pgs" << dendl;

  // load the history of pg_num changes (needed for split/merge handling)
  {
    auto pghist = make_pg_num_history_oid();
    bufferlist bl;
    int r = store->read(service.meta_ch, pghist, 0, 0, bl, 0);
    if (r >= 0 && bl.length() > 0) {
      auto p = bl.cbegin();
      decode(pg_num_history, p);
    }
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  int num = 0;  // number of pgs successfully opened
  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // temp collections and pgs flagged for removal are deleted outright
    if (it->is_temp(&pgid) ||
	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it
	       << " removing, legacy or flagged for removal pg" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    // find the epoch of the osdmap this pg was last written against
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
	   << dendl;
      continue;
    }

    PGRef pg;
    if (map_epoch > 0) {
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
	if (!get_osdmap()->have_pg_pool(pgid.pool())) {
	  derr << __func__ << ": could not find map for epoch " << map_epoch
	       << " on pg " << pgid << ", but the pool is not present in the "
	       << "current map, so this is probably a result of bug 10617. "
	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
	       << "to clean it up later." << dendl;
	  continue;
	} else {
	  derr << __func__ << ": have pgid " << pgid << " at epoch "
	       << map_epoch << ", but missing map. Crashing."
	       << dendl;
	  ceph_abort_msg("Missing map in load_pgs");
	}
      }
      pg = _make_pg(pgosdmap, pgid);
    } else {
      pg = _make_pg(get_osdmap(), pgid);
    }
    if (!pg) {
      // pool tombstone missing or truncated; drop the orphaned collection
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    // there can be no waiters here, so we don't call _wake_pg_slot

    pg->lock();
    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store);

    if (pg->dne()) {
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      pg->unlock();
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }
    {
      // route this collection's commit callbacks to its shard's queue
      uint32_t shard_index = pgid.hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
    }

    pg->reg_next_scrub();

    dout(10) << __func__ << " loaded " << *pg << dendl;
    pg->unlock();

    register_pg(pg);
    ++num;
  }
  dout(0) << __func__ << " opened " << num << " pgs" << dendl;
}
4740
4741
// Create a brand-new PG from a create request (from the mon or a peer osd):
// allocate its collection on disk, construct and initialize the in-memory
// PG, and kick off its initial peering.  Returns the new PG, or nullptr if
// the request was dropped (pg limit reached, pool gone, or stale mon
// create).
PGRef OSD::handle_pg_create_info(const OSDMapRef& osdmap,
				 const PGCreateInfo *info)
{
  spg_t pgid = info->pgid;

  // defer creation if it would push us over the per-OSD pg limit
  if (maybe_wait_for_max_pg(osdmap, pgid, info->by_mon)) {
    dout(10) << __func__ << " hit max pg, dropping" << dendl;
    return nullptr;
  }

  PeeringCtx rctx = create_context();

  // use the map as of the epoch the create was generated in
  OSDMapRef startmap = get_map(info->epoch);

  if (info->by_mon) {
    int64_t pool_id = pgid.pgid.pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(pool_id);
    if (!pool) {
      dout(10) << __func__ << " ignoring " << pgid << ", pool dne" << dendl;
      return nullptr;
    }
    if (osdmap->require_osd_release >= ceph_release_t::nautilus &&
	!pool->has_flag(pg_pool_t::FLAG_CREATING)) {
      // this ensures we do not process old creating messages after the
      // pool's initial pgs have been created (and pg are subsequently
      // allowed to split or merge).
      dout(20) << __func__ << " dropping " << pgid
	       << "create, pool does not have CREATING flag set" << dendl;
      return nullptr;
    }
  }

  int up_primary, acting_primary;
  vector<int> up, acting;
  startmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  const pg_pool_t* pp = startmap->get_pg_pool(pgid.pool());
  if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
      store->get_type() != "bluestore") {
    clog->warn() << "pg " << pgid
		 << " is at risk of silent data corruption: "
		 << "the pool allows ec overwrites but is not stored in "
		 << "bluestore, so deep scrubbing will not detect bitrot";
  }
  // stage the new collection and pg metadata in the peering transaction
  create_pg_collection(
    rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
  init_pg_ondisk(rctx.transaction, pgid, pp);

  int role = startmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

  PGRef pg = _make_pg(startmap, pgid);
  pg->ch = store->create_new_collection(pg->coll);

  {
    // route this collection's commit callbacks to its shard's queue
    uint32_t shard_index = pgid.hash_to_shard(shards.size());
    assert(NULL != shards[shard_index]);
    store->set_collection_commit_queue(pg->coll, &(shards[shard_index]->context_queue));
  }

  pg->lock(true);

  // we are holding the shard lock
  ceph_assert(!pg->is_deleted());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    info->history,
    info->past_intervals,
    false,
    rctx.transaction);

  pg->init_collection_pool_opts();

  if (pg->is_primary()) {
    // new primaries pick up any active dynamic perf-stat queries
    std::lock_guard locker{m_perf_queries_lock};
    pg->set_dynamic_perf_stats_queries(m_perf_queries);
  }

  pg->handle_initialize(rctx);
  pg->handle_activate_map(rctx);

  // flush the peering transaction and deliver any queued messages
  dispatch_context(rctx, pg.get(), osdmap, nullptr);

  dout(10) << __func__ << " new pg " << *pg << dendl;
  return pg;
}
4833
4834 bool OSD::maybe_wait_for_max_pg(const OSDMapRef& osdmap,
4835 spg_t pgid,
4836 bool is_mon_create)
4837 {
4838 const auto max_pgs_per_osd =
4839 (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
4840 cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4841
4842 if (num_pgs < max_pgs_per_osd) {
4843 return false;
4844 }
4845
4846 std::lock_guard l(pending_creates_lock);
4847 if (is_mon_create) {
4848 pending_creates_from_mon++;
4849 } else {
4850 bool is_primary = osdmap->get_pg_acting_role(pgid, whoami) == 0;
4851 pending_creates_from_osd.emplace(pgid, is_primary);
4852 }
4853 dout(1) << __func__ << " withhold creation of pg " << pgid
4854 << ": " << num_pgs << " >= "<< max_pgs_per_osd << dendl;
4855 return true;
4856 }
4857
// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
// to up set if pg_temp is empty. so an empty pg_temp won't work.
static std::vector<int32_t> twiddle(const std::vector<int>& acting) {
  std::vector<int32_t> twiddled;
  if (acting.size() > 1) {
    // shrink the mapping down to just the primary
    twiddled.push_back(acting[0]);
  } else {
    // pad with CRUSH_ITEM_NONE so the temp mapping differs from acting
    twiddled.assign(acting.begin(), acting.end());
    twiddled.push_back(-1);
  }
  return twiddled;
}
4870
// Retry PG creations previously withheld by maybe_wait_for_max_pg(), now
// that the PG count may have dropped below the hard cap.  Mon-initiated
// creates are resumed by re-soliciting osd_pg_creates; peer-initiated
// creates are re-triggered by twiddling pg_temp, forcing the primary to
// repeer and resend the create.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf.get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf.get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    if (max_pgs_per_osd <= num_pgs) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - num_pgs;
    std::lock_guard l(pending_creates_lock);
    // mon-driven creates get first claim on the spare capacity
    if (pending_creates_from_mon > 0) {
      dout(20) << __func__ << " pending_creates_from_mon "
	       << pending_creates_from_mon << dendl;
      do_sub_pg_creates = true;
      if (pending_creates_from_mon >= spare_pgs) {
	spare_pgs = pending_creates_from_mon = 0;
      } else {
	spare_pgs -= pending_creates_from_mon;
	pending_creates_from_mon = 0;
      }
    }
    // hand remaining spare slots to osd-initiated creates, one pg_temp
    // twiddle each
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      get_osdmap()->pg_to_up_acting_osds(pg->first.pgid, nullptr, nullptr, &acting, nullptr);
      service.queue_want_pg_temp(pg->first.pgid, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
			    !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
	      << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = get_osdmap_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
	      << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since "
	      << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  // push any queued pg_temp changes to the mon
  service.send_pg_temp();
}
4942
// Build the initial pg_history_t (*h) and PastIntervals (*pi) for a
// newly-created pg by replaying every osdmap from `created`+1 up to the
// current epoch and recording each interval boundary (changes of up/acting
// sets, primaries, and splits).
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  *h = pg_history_t(created, created_stamp);

  // mapping as of the creation epoch; updated as we walk forward
  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  for (epoch_t e = created + 1; e <= get_osdmap_epoch(); ++e) {
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
	return have.size() >= pi->min_size;
      }
      explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    // records the interval in *pi and reports whether e starts a new one
    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap.get(),
      lastmap.get(),
      pgid.pgid,
      min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      h->same_interval_since = e;
      if (up != new_up) {
	h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
	h->same_primary_since = e;
      }
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
			     osdmap->get_pg_num(pgid.pgid.pool()),
			     nullptr)) {
	h->last_epoch_split = e;
      }
      // carry the new mapping forward as the baseline for the next epoch
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
	   << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
		       pi->get_bounds()) << ")"
	   << dendl;
}
5017
5018 void OSD::_add_heartbeat_peer(int p)
5019 {
5020 if (p == whoami)
5021 return;
5022 HeartbeatInfo *hi;
5023
5024 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
5025 if (i == heartbeat_peers.end()) {
5026 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, get_osdmap_epoch());
5027 if (!cons.first)
5028 return;
5029 assert(cons.second);
5030
5031 hi = &heartbeat_peers[p];
5032 hi->peer = p;
5033
5034 auto stamps = service.get_hb_stamps(p);
5035
5036 auto sb = ceph::make_ref<Session>(cct, cons.first.get());
5037 sb->peer = p;
5038 sb->stamps = stamps;
5039 hi->hb_interval_start = ceph_clock_now();
5040 hi->con_back = cons.first.get();
5041 hi->con_back->set_priv(sb);
5042
5043 auto sf = ceph::make_ref<Session>(cct, cons.second.get());
5044 sf->peer = p;
5045 sf->stamps = stamps;
5046 hi->con_front = cons.second.get();
5047 hi->con_front->set_priv(sf);
5048
5049 dout(10) << "_add_heartbeat_peer: new peer osd." << p
5050 << " " << hi->con_back->get_peer_addr()
5051 << " " << hi->con_front->get_peer_addr()
5052 << dendl;
5053 } else {
5054 hi = &i->second;
5055 }
5056 hi->epoch = get_osdmap_epoch();
5057 }
5058
5059 void OSD::_remove_heartbeat_peer(int n)
5060 {
5061 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
5062 ceph_assert(q != heartbeat_peers.end());
5063 dout(20) << " removing heartbeat peer osd." << n
5064 << " " << q->second.con_back->get_peer_addr()
5065 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
5066 << dendl;
5067 q->second.clear_mark_down();
5068 heartbeat_peers.erase(q);
5069 }
5070
5071 void OSD::need_heartbeat_peer_update()
5072 {
5073 if (is_stopping())
5074 return;
5075 dout(20) << "need_heartbeat_peer_update" << dendl;
5076 heartbeat_set_peers_need_update();
5077 }
5078
// Rebuild the heartbeat peer set when it has been flagged stale, or force a
// rebuild every osd_heartbeat_grace seconds.
//
// Peers come from three sources: (a) the peers of our PGs, (b) the next and
// previous up OSDs in the map (keeps the whole osd set connected), and
// (c) random up OSDs drawn from distinct CRUSH subtrees so that at least
// mon_osd_min_down_reporters independent reporters exist.  Down peers are
// removed, peers added under an older map epoch become trim candidates, and
// the set is padded/trimmed toward osd_heartbeat_min_peers.
// Requires osd_lock; takes heartbeat_lock internally for the rebuild.
void OSD::maybe_update_heartbeat_peers()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));

  if (is_waiting_for_healthy() || is_active()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      // first call: seed the resample clock and do a full rebuild
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
	dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
	heartbeat_set_peers_need_update();
	last_heartbeat_resample = now;
	// automatically clean up any stale heartbeat peers
	// if we are unhealthy, then clean all
	reset_heartbeat_peers(is_waiting_for_healthy());
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  std::lock_guard l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    vector<PGRef> pgs;
    _get_pgs(&pgs);
    for (auto& pg : pgs) {
      pg->with_heartbeat_peers([&](int peer) {
	if (get_osdmap()->is_up(peer)) {
	  _add_heartbeat_peer(peer);
	}
      });
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  const int next = get_osdmap()->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = get_osdmap()->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  // make sure we have at least **min_down** osds coming from different
  // subtree level (e.g., hosts) for fast failure detection.
  auto min_down = cct->_conf.get_val<uint64_t>("mon_osd_min_down_reporters");
  auto subtree = cct->_conf.get_val<string>("mon_osd_reporter_subtree_level");
  auto limit = std::max(min_down, (uint64_t)cct->_conf->osd_heartbeat_min_peers);
  get_osdmap()->get_random_up_osds_by_subtree(
    whoami, subtree, limit, want, &want);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!get_osdmap()->is_up(p->first)) {
      // advance before erasing; _remove_heartbeat_peer invalidates p's entry
      int o = p->first;
      ++p;
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < get_osdmap_epoch()) {
      // peer not re-confirmed under the current map; eligible for trimming
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  for (int n = next; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = get_osdmap()->get_next_up_osd_after(n);
    if (n == next)
      break;  // came full circle; stop
  }

  // too many?
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;

  // clean up stale failure pending: peers we no longer track get an explicit
  // still-alive so the mon doesn't keep an in-flight failure report for them
  for (auto it = failure_pending.begin(); it != failure_pending.end();) {
    if (heartbeat_peers.count(it->first) == 0) {
      send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
      failure_pending.erase(it++);
    } else {
      it++;
    }
  }
}
5196
5197 void OSD::reset_heartbeat_peers(bool all)
5198 {
5199 ceph_assert(ceph_mutex_is_locked(osd_lock));
5200 dout(10) << "reset_heartbeat_peers" << dendl;
5201 utime_t stale = ceph_clock_now();
5202 stale -= cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
5203 std::lock_guard l(heartbeat_lock);
5204 for (auto it = heartbeat_peers.begin(); it != heartbeat_peers.end();) {
5205 auto& [peer, hi] = *it;
5206 if (all || hi.is_stale(stale)) {
5207 hi.clear_mark_down();
5208 // stop sending failure_report to mon too
5209 failure_queue.erase(peer);
5210 failure_pending.erase(peer);
5211 it = heartbeat_peers.erase(it);
5212 } else {
5213 ++it;
5214 }
5215 }
5216 }
5217
// Handle an incoming heartbeat message (PING, PING_REPLY, or YOU_DIED).
//
// PING: update the peer's clock-delta stamps, reply with PING_REPLY (unless
// our internal heartbeat map says we are wedged), opportunistically share a
// newer osdmap, and tell a peer that the map says is dead that it has died.
//
// PING_REPLY: match the reply against the outstanding ping_history entry,
// update last_rx_back/front, and once both front and back acks for a stamp
// have arrived, fold the round-trip time into the per-peer min/avg/max
// windows published via osd_stat, and cancel any queued/in-flight failure
// report for a now-healthy peer.
//
// YOU_DIED: subscribe to a newer osdmap so we notice our own markdown.
//
// Consumes (puts) m.  Takes heartbeat_lock for the duration.
void OSD::handle_osd_ping(MOSDPing *m)
{
  // ignore pings from a different cluster
  if (superblock.cluster_fsid != m->fsid) {
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
	     << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid
	     << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.lock();
  if (is_stopping()) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  ConnectionRef con(m->get_connection());
  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }

  auto sref = con->get_priv();
  Session *s = static_cast<Session*>(sref.get());
  if (!s) {
    heartbeat_lock.unlock();
    m->put();
    return;
  }
  if (!s->stamps) {
    // lazily attach the shared per-peer clock-delta tracker
    s->peer = from;
    s->stamps = service.get_hb_stamps(from);
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug hook: probabilistically drop incoming pings to exercise
      // failure detection
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
	auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
	if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
	  if (heartbeat_drop->second == 0) {
	    debug_heartbeat_drops_remaining.erase(heartbeat_drop);
	  } else {
	    --heartbeat_drop->second;
	    dout(5) << "Dropping heartbeat from " << from
		    << ", " << heartbeat_drop->second
		    << " remaining to drop" << dendl;
	    break;
	  }
	} else if (cct->_conf->osd_debug_drop_ping_probability >
		   ((((double)(rand()%100))/100.0))) {
	  heartbeat_drop =
	    debug_heartbeat_drops_remaining.insert(std::make_pair(from,
			     cct->_conf->osd_debug_drop_ping_duration)).first;
	  dout(5) << "Dropping heartbeat from " << from
		  << ", " << heartbeat_drop->second
		  << " remaining to drop" << dendl;
	  break;
	}
      }

      ceph::signedspan sender_delta_ub{};
      s->stamps->got_ping(
	m->up_from,
	mnow,
	m->mono_send_stamp,
	m->delta_ub,
	&sender_delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;

      // if our own worker threads are stuck, don't claim to be healthy
      if (!cct->get_heartbeat_map()->is_healthy()) {
	dout(10) << "internal heartbeat not healthy, dropping ping request"
		 << dendl;
	break;
      }

      Message *r = new MOSDPing(monc->get_fsid(),
				curmap->get_epoch(),
				MOSDPing::PING_REPLY,
				m->ping_stamp,
				m->mono_ping_stamp,
				mnow,
				service.get_up_epoch(),
				cct->_conf->osd_heartbeat_min_size,
				sender_delta_ub);
      con->send_message(r);

      if (curmap->is_up(from)) {
	if (is_active()) {
	  ConnectionRef cluster_con = service.get_con_osd_cluster(
	    from, curmap->get_epoch());
	  if (cluster_con) {
	    service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
	  }
	}
      } else if (!curmap->exists(from) ||
		 curmap->get_down_at(from) > m->map_epoch) {
	// tell them they have died
	Message *r = new MOSDPing(monc->get_fsid(),
				  curmap->get_epoch(),
				  MOSDPing::YOU_DIED,
				  m->ping_stamp,
				  m->mono_ping_stamp,
				  mnow,
				  service.get_up_epoch(),
				  cct->_conf->osd_heartbeat_min_size);
	con->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
	auto acked = i->second.ping_history.find(m->ping_stamp);
	if (acked != i->second.ping_history.end()) {
	  // each history entry counts down the expected acks (front + back)
	  int &unacknowledged = acked->second.second;
	  if (con == i->second.con_back) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " -> " << now
		     << " last_rx_front " << i->second.last_rx_front
		     << dendl;
	    i->second.last_rx_back = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	    // if there is no front con, set both stamps.
	    if (i->second.con_front == NULL) {
	      i->second.last_rx_front = now;
	      ceph_assert(unacknowledged > 0);
	      --unacknowledged;
	    }
	  } else if (con == i->second.con_front) {
	    dout(25) << "handle_osd_ping got reply from osd." << from
		     << " first_tx " << i->second.first_tx
		     << " last_tx " << i->second.last_tx
		     << " last_rx_back " << i->second.last_rx_back
		     << " last_rx_front " << i->second.last_rx_front
		     << " -> " << now
		     << dendl;
	    i->second.last_rx_front = now;
	    ceph_assert(unacknowledged > 0);
	    --unacknowledged;
	  }

	  if (unacknowledged == 0) {
	    // succeeded in getting all replies
	    dout(25) << "handle_osd_ping got all replies from osd." << from
		     << " , erase pending ping(sent at " << m->ping_stamp << ")"
		     << " and older pending ping(s)"
		     << dendl;

	    // accumulate round-trip times (in usec) for the current window
#define ROUND_S_TO_USEC(sec) (uint32_t)((sec) * 1000 * 1000 + 0.5)
	    ++i->second.hb_average_count;
	    uint32_t back_pingtime = ROUND_S_TO_USEC(i->second.last_rx_back - m->ping_stamp);
	    i->second.hb_total_back += back_pingtime;
	    if (back_pingtime < i->second.hb_min_back)
	      i->second.hb_min_back = back_pingtime;
	    if (back_pingtime > i->second.hb_max_back)
	      i->second.hb_max_back = back_pingtime;
	    uint32_t front_pingtime = ROUND_S_TO_USEC(i->second.last_rx_front - m->ping_stamp);
	    i->second.hb_total_front += front_pingtime;
	    if (front_pingtime < i->second.hb_min_front)
	      i->second.hb_min_front = front_pingtime;
	    if (front_pingtime > i->second.hb_max_front)
	      i->second.hb_max_front = front_pingtime;

	    // NOTE(review): the if below is unreachable after this assert
	    // (it re-checks the exact condition the assert just excluded)
	    ceph_assert(i->second.hb_interval_start != utime_t());
	    if (i->second.hb_interval_start == utime_t())
	      i->second.hb_interval_start = now;
	    int64_t hb_avg_time_period = 60;
	    if (cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span")) {
	      hb_avg_time_period = cct->_conf.get_val<int64_t>("debug_heartbeat_testing_span");
	    }
	    if (now - i->second.hb_interval_start >= utime_t(hb_avg_time_period, 0)) {
	      // window elapsed: snapshot avg/min/max and start a new window
	      uint32_t back_avg = i->second.hb_total_back / i->second.hb_average_count;
	      uint32_t back_min = i->second.hb_min_back;
	      uint32_t back_max = i->second.hb_max_back;
	      uint32_t front_avg = i->second.hb_total_front / i->second.hb_average_count;
	      uint32_t front_min = i->second.hb_min_front;
	      uint32_t front_max = i->second.hb_max_front;

	      // Reset for new interval
	      i->second.hb_average_count = 0;
	      i->second.hb_interval_start = now;
	      i->second.hb_total_back = i->second.hb_max_back = 0;
	      i->second.hb_min_back =  UINT_MAX;
	      i->second.hb_total_front = i->second.hb_max_front = 0;
	      i->second.hb_min_front = UINT_MAX;

	      // Record per osd interace ping times
	      // Based on osd_heartbeat_interval ignoring that it is randomly short than this interval
	      if (i->second.hb_back_pingtime.size() == 0) {
		// first window: fill the whole ring with this sample
		ceph_assert(i->second.hb_front_pingtime.size() == 0);
		for (unsigned k = 0 ; k < hb_vector_size; ++k) {
		  i->second.hb_back_pingtime.push_back(back_avg);
		  i->second.hb_back_min.push_back(back_min);
		  i->second.hb_back_max.push_back(back_max);
		  i->second.hb_front_pingtime.push_back(front_avg);
		  i->second.hb_front_min.push_back(front_min);
		  i->second.hb_front_max.push_back(front_max);
		  ++i->second.hb_index;
		}
	      } else {
		// ring buffer of the last hb_vector_size window snapshots
		int index = i->second.hb_index & (hb_vector_size - 1);
		i->second.hb_back_pingtime[index] = back_avg;
		i->second.hb_back_min[index] = back_min;
		i->second.hb_back_max[index] = back_max;
		i->second.hb_front_pingtime[index] = front_avg;
		i->second.hb_front_min[index] = front_min;
		i->second.hb_front_max[index] = front_max;
		++i->second.hb_index;
	      }

	      {
		// publish 1/5/15-window aggregates through osd_stat
		std::lock_guard l(service.stat_lock);
		service.osd_stat.hb_pingtime[from].last_update = now.sec();
		service.osd_stat.hb_pingtime[from].back_last = back_pingtime;

		uint32_t total = 0;
		uint32_t min = UINT_MAX;
		uint32_t max = 0;
		uint32_t count = 0;
		uint32_t which = 0;
		uint32_t size = (uint32_t)i->second.hb_back_pingtime.size();
		for (int32_t k = size - 1 ; k >= 0; --k) {
		  ++count;
		  int index = (i->second.hb_index + k) % size;
		  total += i->second.hb_back_pingtime[index];
		  if (i->second.hb_back_min[index] < min)
		    min = i->second.hb_back_min[index];
		  if (i->second.hb_back_max[index] > max)
		    max = i->second.hb_back_max[index];
		  if (count == 1 || count == 5 || count == 15) {
		    service.osd_stat.hb_pingtime[from].back_pingtime[which] = total / count;
		    service.osd_stat.hb_pingtime[from].back_min[which] = min;
		    service.osd_stat.hb_pingtime[from].back_max[which] = max;
		    which++;
		    if (count == 15)
		      break;
		  }
		}

		if (i->second.con_front != NULL) {
		  service.osd_stat.hb_pingtime[from].front_last = front_pingtime;

		  total = 0;
		  min = UINT_MAX;
		  max = 0;
		  count = 0;
		  which = 0;
		  for (int32_t k = size - 1 ; k >= 0; --k) {
		    ++count;
		    int index = (i->second.hb_index + k) % size;
		    total += i->second.hb_front_pingtime[index];
		    if (i->second.hb_front_min[index] < min)
		      min = i->second.hb_front_min[index];
		    if (i->second.hb_front_max[index] > max)
		      max = i->second.hb_front_max[index];
		    if (count == 1 || count == 5 || count == 15) {
		      service.osd_stat.hb_pingtime[from].front_pingtime[which] = total / count;
		      service.osd_stat.hb_pingtime[from].front_min[which] = min;
		      service.osd_stat.hb_pingtime[from].front_max[which] = max;
		      which++;
		      if (count == 15)
			break;
		    }
		  }
		}
	      }
	    } else {
	      // window not elapsed yet; just refresh the latest samples
	      std::lock_guard l(service.stat_lock);
	      service.osd_stat.hb_pingtime[from].back_last = back_pingtime;
	      if (i->second.con_front != NULL)
		service.osd_stat.hb_pingtime[from].front_last = front_pingtime;
	    }
	    // this stamp and everything older is fully acknowledged
	    i->second.ping_history.erase(i->second.ping_history.begin(), ++acked);
	  }

	  if (i->second.is_healthy(now)) {
	    // Cancel false reports
	    auto failure_queue_entry = failure_queue.find(from);
	    if (failure_queue_entry != failure_queue.end()) {
	      dout(10) << "handle_osd_ping canceling queued "
		       << "failure report for osd." << from << dendl;
	      failure_queue.erase(failure_queue_entry);
	    }

	    auto failure_pending_entry = failure_pending.find(from);
	    if (failure_pending_entry != failure_pending.end()) {
	      dout(10) << "handle_osd_ping canceling in-flight "
		       << "failure report for osd." << from << dendl;
	      send_still_alive(curmap->get_epoch(),
			       from,
			       failure_pending_entry->second.second);
	      failure_pending.erase(failure_pending_entry);
	    }
	  }
	} else {
	  // old replies, deprecated by newly sent pings.
	  dout(10) << "handle_osd_ping no pending ping(sent at " << m->ping_stamp
		   << ") is found, treat as covered by newly sent pings "
		   << "and ignore"
		   << dendl;
	}
      }

      if (m->map_epoch &&
	  curmap->is_up(from)) {
	if (is_active()) {
	  ConnectionRef cluster_con = service.get_con_osd_cluster(
	    from, curmap->get_epoch());
	  if (cluster_con) {
	    service.maybe_share_map(cluster_con.get(), curmap, m->map_epoch);
	  }
	}
      }

      s->stamps->got_ping_reply(
	mnow,
	m->mono_send_stamp,
	m->delta_ub);
      dout(20) << __func__ << " new stamps " << *s->stamps << dendl;
    }
    break;

  case MOSDPing::YOU_DIED:
    dout(10) << "handle_osd_ping " << m->get_source_inst()
	     << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.unlock();
  m->put();
}
5565
5566 void OSD::heartbeat_entry()
5567 {
5568 std::unique_lock l(heartbeat_lock);
5569 if (is_stopping())
5570 return;
5571 while (!heartbeat_stop) {
5572 heartbeat();
5573
5574 double wait;
5575 if (cct->_conf.get_val<bool>("debug_disable_randomized_ping")) {
5576 wait = (float)cct->_conf->osd_heartbeat_interval;
5577 } else {
5578 wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5579 }
5580 auto w = ceph::make_timespan(wait);
5581 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5582 heartbeat_cond.wait_for(l, w);
5583 if (is_stopping())
5584 return;
5585 dout(30) << "heartbeat_entry woke up" << dendl;
5586 }
5587 }
5588
5589 void OSD::heartbeat_check()
5590 {
5591 ceph_assert(ceph_mutex_is_locked(heartbeat_lock));
5592 utime_t now = ceph_clock_now();
5593
5594 // check for incoming heartbeats (move me elsewhere?)
5595 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5596 p != heartbeat_peers.end();
5597 ++p) {
5598
5599 if (p->second.first_tx == utime_t()) {
5600 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5601 << " yet, skipping" << dendl;
5602 continue;
5603 }
5604
5605 dout(25) << "heartbeat_check osd." << p->first
5606 << " first_tx " << p->second.first_tx
5607 << " last_tx " << p->second.last_tx
5608 << " last_rx_back " << p->second.last_rx_back
5609 << " last_rx_front " << p->second.last_rx_front
5610 << dendl;
5611 if (p->second.is_unhealthy(now)) {
5612 utime_t oldest_deadline = p->second.ping_history.begin()->second.first;
5613 if (p->second.last_rx_back == utime_t() ||
5614 p->second.last_rx_front == utime_t()) {
5615 derr << "heartbeat_check: no reply from "
5616 << p->second.con_front->get_peer_addr().get_sockaddr()
5617 << " osd." << p->first
5618 << " ever on either front or back, first ping sent "
5619 << p->second.first_tx
5620 << " (oldest deadline " << oldest_deadline << ")"
5621 << dendl;
5622 // fail
5623 failure_queue[p->first] = p->second.first_tx;
5624 } else {
5625 derr << "heartbeat_check: no reply from "
5626 << p->second.con_front->get_peer_addr().get_sockaddr()
5627 << " osd." << p->first << " since back " << p->second.last_rx_back
5628 << " front " << p->second.last_rx_front
5629 << " (oldest deadline " << oldest_deadline << ")"
5630 << dendl;
5631 // fail
5632 failure_queue[p->first] = std::min(p->second.last_rx_back, p->second.last_rx_front);
5633 }
5634 }
5635 }
5636 }
5637
// Send one round of pings to every heartbeat peer (front and back
// connections), refresh load average, osd stats, and fullness status, and
// fall back to polling the mon for a new map when we have no peers at all.
// Called from heartbeat_entry() with heartbeat_lock held.
void OSD::heartbeat()
{
  ceph_assert(ceph_mutex_is_locked_by_me(heartbeat_lock));
  dout(30) << "heartbeat" << dendl;

  // get CPU load avg
  double loadavgs[1];
  int hb_interval = cct->_conf->osd_heartbeat_interval;
  // daily_loadavg approximates a one-day moving average: one sample per
  // heartbeat interval, 86400 seconds per day
  int n_samples = 86400;
  if (hb_interval > 1) {
    n_samples /= hb_interval;
    if (n_samples < 1)
      n_samples = 1;
  }

  if (getloadavg(loadavgs, 1) == 1) {
    logger->set(l_osd_loadavg, 100 * loadavgs[0]);
    daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
    dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
  }

  dout(30) << "heartbeat checking stats" << dendl;

  // refresh peer list and osd stats
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);

  auto new_stat = service.set_osd_stat(hb_peers, get_num_pgs());
  dout(5) << __func__ << " " << new_stat << dendl;
  ceph_assert(new_stat.statfs.total);

  float pratio;
  float ratio = service.compute_adjusted_ratio(new_stat, &pratio);

  service.check_full_status(ratio, pratio);

  utime_t now = ceph_clock_now();
  auto mnow = service.get_mnow();
  utime_t deadline = now;
  deadline += cct->_conf->osd_heartbeat_grace;

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    Session *s = static_cast<Session*>(i->second.con_back->get_priv().get());
    if (!s) {
      dout(30) << "heartbeat osd." << peer << " has no open con" << dendl;
      continue;
    }
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;

    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    // expect HEARTBEAT_MAX_CONN acks (front + back) for this send stamp;
    // handle_osd_ping counts them back down
    i->second.ping_history[now] = make_pair(deadline,
      HeartbeatInfo::HEARTBEAT_MAX_CONN);
    if (i->second.hb_interval_start == utime_t())
      i->second.hb_interval_start = now;

    std::optional<ceph::signedspan> delta_ub;
    s->stamps->sent_ping(&delta_ub);

    i->second.con_back->send_message(
      new MOSDPing(monc->get_fsid(),
		   service.get_osdmap_epoch(),
		   MOSDPing::PING,
		   now,
		   mnow,
		   mnow,
		   service.get_up_epoch(),
		   cct->_conf->osd_heartbeat_min_size,
		   delta_ub));

    // con_front is null when front == back network; only one ping then
    if (i->second.con_front)
      i->second.con_front->send_message(
	new MOSDPing(monc->get_fsid(),
		     service.get_osdmap_epoch(),
		     MOSDPing::PING,
		     now,
		     mnow,
		     mnow,
		     service.get_up_epoch(),
		     cct->_conf->osd_heartbeat_min_size,
		     delta_ub));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(get_osdmap_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
5743
// Messenger fault callback for a heartbeat connection.
//
// If the faulted connection still belongs to a tracked peer, try to open
// fresh front/back connections (reattaching the same Session priv) and
// clear the outstanding ping history; if the osdmap raced and no new
// connection can be obtained, drop the peer entirely.  Always returns true
// (the reset is considered handled either way).
bool OSD::heartbeat_reset(Connection *con)
{
  std::lock_guard l(heartbeat_lock);
  auto s = con->get_priv();
  dout(20) << __func__ << " con " << con << " s " << s.get() << dendl;
  con->set_priv(nullptr);
  if (s) {
    if (is_stopping()) {
      return true;
    }
    auto session = static_cast<Session*>(s.get());
    auto p = heartbeat_peers.find(session->peer);
    // only act if this con is still the peer's current front or back con;
    // otherwise it is a stale connection we already replaced
    if (p != heartbeat_peers.end() &&
	(p->second.con_back == con ||
	 p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
	       << ", reopening" << dendl;
      p->second.clear_mark_down(con);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
	p->second.con_back = newcon.first.get();
	p->second.con_back->set_priv(s);
	if (newcon.second) {
	  p->second.con_front = newcon.second.get();
	  p->second.con_front->set_priv(s);
	}
	// outstanding pings were bound to the old connections; forget them
	p->second.ping_history.clear();
      } else {
	dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
		 << ", raced with osdmap update, closing out peer" << dendl;
	heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
  }
  return true;
}
5782
5783
5784
5785 // =========================================
5786
5787 void OSD::tick()
5788 {
5789 ceph_assert(ceph_mutex_is_locked(osd_lock));
5790 dout(10) << "tick" << dendl;
5791
5792 utime_t now = ceph_clock_now();
5793 // throw out any obsolete markdown log
5794 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
5795 while (!osd_markdown_log.empty() &&
5796 osd_markdown_log.front() + grace < now)
5797 osd_markdown_log.pop_front();
5798
5799 if (is_active() || is_waiting_for_healthy()) {
5800 maybe_update_heartbeat_peers();
5801 }
5802
5803 if (is_waiting_for_healthy()) {
5804 start_boot();
5805 }
5806
5807 if (is_waiting_for_healthy() || is_booting()) {
5808 std::lock_guard l(heartbeat_lock);
5809 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval) {
5810 last_mon_heartbeat = now;
5811 dout(1) << __func__ << " checking mon for new map" << dendl;
5812 osdmap_subscribe(get_osdmap_epoch() + 1, false);
5813 }
5814 }
5815
5816 do_waiters();
5817
5818 // scrub purged_snaps every deep scrub interval
5819 {
5820 const utime_t last = superblock.last_purged_snaps_scrub;
5821 utime_t next = last;
5822 next += cct->_conf->osd_scrub_min_interval;
5823 std::mt19937 rng;
5824 // use a seed that is stable for each scrub interval, but varies
5825 // by OSD to avoid any herds.
5826 rng.seed(whoami + superblock.last_purged_snaps_scrub.sec());
5827 double r = (rng() % 1024) / 1024;
5828 next +=
5829 cct->_conf->osd_scrub_min_interval *
5830 cct->_conf->osd_scrub_interval_randomize_ratio * r;
5831 if (next < ceph_clock_now()) {
5832 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5833 << " next " << next << " ... now" << dendl;
5834 scrub_purged_snaps();
5835 } else {
5836 dout(20) << __func__ << " last_purged_snaps_scrub " << last
5837 << " next " << next << dendl;
5838 }
5839 }
5840
5841 tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
5842 }
5843
// Periodic housekeeping that deliberately avoids osd_lock: publish CRC
// counters, refresh store statfs, run heartbeat_check, send mon reports
// and failure reports, request newer maps if shards are waiting on one,
// drive scrub scheduling and beacons, and push health metrics to the mgr.
// Re-arms itself via tick_timer_without_osd_lock.
void OSD::tick_without_osd_lock()
{
  ceph_assert(ceph_mutex_is_locked(tick_timer_lock));
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());

  // refresh osd stats
  struct store_statfs_t stbuf;
  osd_alert_list_t alerts;
  int r = store->statfs(&stbuf, &alerts);
  ceph_assert(r == 0);
  service.set_statfs(stbuf, alerts);

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    {
      std::lock_guard l{heartbeat_lock};
      heartbeat_check();
    }
    map_lock.lock_shared();
    std::lock_guard l(mon_report_lock);

    // mon report?
    utime_t now = ceph_clock_now();
    if (service.need_fullness_update() ||
	now - last_mon_report > cct->_conf->osd_mon_report_interval) {
      last_mon_report = now;
      send_full_update();
      send_failures();
    }
    map_lock.unlock_shared();

    // if any shard is queued waiting on an epoch we don't have yet,
    // subscribe for newer maps
    epoch_t max_waiting_epoch = 0;
    for (auto s : shards) {
      max_waiting_epoch = std::max(max_waiting_epoch,
				   s->get_max_waiting_epoch());
    }
    if (max_waiting_epoch > get_osdmap()->get_epoch()) {
      dout(20) << __func__ << " max_waiting_epoch " << max_waiting_epoch
	       << ", requesting new map" << dendl;
      osdmap_subscribe(superblock.newest_map + 1, false);
    }
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      std::lock_guard l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
	  cct->_conf->osd_beacon_report_interval) {
	need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      send_beacon(now);
    }
  }

  mgrc.update_daemon_health(get_health_metrics());
  service.kick_recovery_queue();
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
					      new C_Tick_WithoutOSDLock(this));
}
5919
5920 // Usage:
5921 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5922 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5923 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5924 // getomap <pool> [namespace/]<obj-name>
5925 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5926 // injectmdataerr [namespace/]<obj-name> [shardid]
5927 // injectdataerr [namespace/]<obj-name> [shardid]
5928 //
5929 // set_recovery_delay [utime]
5930 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5931 std::string_view command,
5932 const cmdmap_t& cmdmap, ostream &ss)
5933 {
5934 //Test support
5935 //Support changing the omap on a single osd by using the Admin Socket to
5936 //directly request the osd make a change.
5937 if (command == "setomapval" || command == "rmomapkey" ||
5938 command == "setomapheader" || command == "getomap" ||
5939 command == "truncobj" || command == "injectmdataerr" ||
5940 command == "injectdataerr"
5941 ) {
5942 pg_t rawpg;
5943 int64_t pool;
5944 OSDMapRef curmap = service->get_osdmap();
5945 int r = -1;
5946
5947 string poolstr;
5948
5949 cmd_getval(cmdmap, "pool", poolstr);
5950 pool = curmap->lookup_pg_pool_name(poolstr);
5951 //If we can't find it by name then maybe id specified
5952 if (pool < 0 && isdigit(poolstr[0]))
5953 pool = atoll(poolstr.c_str());
5954 if (pool < 0) {
5955 ss << "Invalid pool '" << poolstr << "''";
5956 return;
5957 }
5958
5959 string objname, nspace;
5960 cmd_getval(cmdmap, "objname", objname);
5961 std::size_t found = objname.find_first_of('/');
5962 if (found != string::npos) {
5963 nspace = objname.substr(0, found);
5964 objname = objname.substr(found+1);
5965 }
5966 object_locator_t oloc(pool, nspace);
5967 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5968
5969 if (r < 0) {
5970 ss << "Invalid namespace/objname";
5971 return;
5972 }
5973
5974 int64_t shardid;
5975 cmd_getval(cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5976 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5977 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5978 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5979 if (curmap->pg_is_ec(rawpg)) {
5980 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5981 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5982 return;
5983 }
5984 }
5985
5986 ObjectStore::Transaction t;
5987
5988 if (command == "setomapval") {
5989 map<string, bufferlist> newattrs;
5990 bufferlist val;
5991 string key, valstr;
5992 cmd_getval(cmdmap, "key", key);
5993 cmd_getval(cmdmap, "val", valstr);
5994
5995 val.append(valstr);
5996 newattrs[key] = val;
5997 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5998 r = store->queue_transaction(service->meta_ch, std::move(t));
5999 if (r < 0)
6000 ss << "error=" << r;
6001 else
6002 ss << "ok";
6003 } else if (command == "rmomapkey") {
6004 string key;
6005 cmd_getval(cmdmap, "key", key);
6006
6007 t.omap_rmkey(coll_t(pgid), ghobject_t(obj), key);
6008 r = store->queue_transaction(service->meta_ch, std::move(t));
6009 if (r < 0)
6010 ss << "error=" << r;
6011 else
6012 ss << "ok";
6013 } else if (command == "setomapheader") {
6014 bufferlist newheader;
6015 string headerstr;
6016
6017 cmd_getval(cmdmap, "header", headerstr);
6018 newheader.append(headerstr);
6019 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
6020 r = store->queue_transaction(service->meta_ch, std::move(t));
6021 if (r < 0)
6022 ss << "error=" << r;
6023 else
6024 ss << "ok";
6025 } else if (command == "getomap") {
6026 //Debug: Output entire omap
6027 bufferlist hdrbl;
6028 map<string, bufferlist> keyvals;
6029 auto ch = store->open_collection(coll_t(pgid));
6030 if (!ch) {
6031 ss << "unable to open collection for " << pgid;
6032 r = -ENOENT;
6033 } else {
6034 r = store->omap_get(ch, ghobject_t(obj), &hdrbl, &keyvals);
6035 if (r >= 0) {
6036 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
6037 for (map<string, bufferlist>::iterator it = keyvals.begin();
6038 it != keyvals.end(); ++it)
6039 ss << " key=" << (*it).first << " val="
6040 << string((*it).second.c_str(), (*it).second.length());
6041 } else {
6042 ss << "error=" << r;
6043 }
6044 }
6045 } else if (command == "truncobj") {
6046 int64_t trunclen;
6047 cmd_getval(cmdmap, "len", trunclen);
6048 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
6049 r = store->queue_transaction(service->meta_ch, std::move(t));
6050 if (r < 0)
6051 ss << "error=" << r;
6052 else
6053 ss << "ok";
6054 } else if (command == "injectdataerr") {
6055 store->inject_data_error(gobj);
6056 ss << "ok";
6057 } else if (command == "injectmdataerr") {
6058 store->inject_mdata_error(gobj);
6059 ss << "ok";
6060 }
6061 return;
6062 }
6063 if (command == "set_recovery_delay") {
6064 int64_t delay;
6065 cmd_getval(cmdmap, "utime", delay, (int64_t)0);
6066 ostringstream oss;
6067 oss << delay;
6068 int r = service->cct->_conf.set_val("osd_recovery_delay_start",
6069 oss.str().c_str());
6070 if (r != 0) {
6071 ss << "set_recovery_delay: error setting "
6072 << "osd_recovery_delay_start to '" << delay << "': error "
6073 << r;
6074 return;
6075 }
6076 service->cct->_conf.apply_changes(nullptr);
6077 ss << "set_recovery_delay: set osd_recovery_delay_start "
6078 << "to " << service->cct->_conf->osd_recovery_delay_start;
6079 return;
6080 }
6081 if (command == "injectfull") {
6082 int64_t count;
6083 string type;
6084 OSDService::s_names state;
6085 cmd_getval(cmdmap, "type", type, string("full"));
6086 cmd_getval(cmdmap, "count", count, (int64_t)-1);
6087 if (type == "none" || count == 0) {
6088 type = "none";
6089 count = 0;
6090 }
6091 state = service->get_full_state(type);
6092 if (state == OSDService::s_names::INVALID) {
6093 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
6094 return;
6095 }
6096 service->set_injectfull(state, count);
6097 return;
6098 }
6099 ss << "Internal error - command=" << command;
6100 }
6101
6102 // =========================================
6103
void OSD::ms_handle_connect(Connection *con)
{
  // Called when an outgoing connection is (re)established.  We only act on
  // the monitor connection: a fresh mon session means everything we had
  // previously reported must be resent.
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    std::lock_guard l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      // restart the boot handshake from the top
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // lock order here: map_lock (shared) is taken before mon_report_lock
      map_lock.lock_shared();
      std::lock_guard l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.clear_sent_ready_to_merge();
      service.send_pg_temp();
      service.send_ready_to_merge();
      service.send_pg_created();
      requeue_failures();
      send_failures();

      map_lock.unlock_shared();
      if (is_active()) {
        send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
6147
6148 void OSD::ms_handle_fast_connect(Connection *con)
6149 {
6150 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6151 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6152 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6153 s = ceph::make_ref<Session>(cct, con);
6154 con->set_priv(s);
6155 dout(10) << " new session (outgoing) " << s << " con=" << s->con
6156 << " addr=" << s->con->get_peer_addr() << dendl;
6157 // we don't connect to clients
6158 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6159 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6160 }
6161 }
6162 }
6163
6164 void OSD::ms_handle_fast_accept(Connection *con)
6165 {
6166 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
6167 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
6168 if (auto s = ceph::ref_cast<Session>(con->get_priv()); !s) {
6169 s = ceph::make_ref<Session>(cct, con);
6170 con->set_priv(s);
6171 dout(10) << "new session (incoming)" << s << " con=" << con
6172 << " addr=" << con->get_peer_addr()
6173 << " must have raced with connect" << dendl;
6174 ceph_assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
6175 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
6176 }
6177 }
6178 }
6179
bool OSD::ms_handle_reset(Connection *con)
{
  // The peer reset a connection that had a Session attached: tear down the
  // per-session state (watches, backoffs) tied to that connection.
  // Returns false when the connection carried no session.
  auto session = ceph::ref_cast<Session>(con->get_priv());
  dout(2) << "ms_handle_reset con " << con << " session " << session.get() << dendl;
  if (!session)
    return false;
  session->wstate.reset(con);
  session->con->set_priv(nullptr);
  session->con.reset(); // break con <-> session ref cycle
  // note that we break session->con *before* the session_handle_reset
  // cleanup below. this avoids a race between us and
  // PG::add_backoff, Session::check_backoff, etc.
  session_handle_reset(session);
  return true;
}
6195
6196 bool OSD::ms_handle_refused(Connection *con)
6197 {
6198 if (!cct->_conf->osd_fast_fail_on_connection_refused)
6199 return false;
6200
6201 auto session = ceph::ref_cast<Session>(con->get_priv());
6202 dout(2) << "ms_handle_refused con " << con << " session " << session.get() << dendl;
6203 if (!session)
6204 return false;
6205 int type = con->get_peer_type();
6206 // handle only OSD failures here
6207 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
6208 OSDMapRef osdmap = get_osdmap();
6209 if (osdmap) {
6210 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
6211 if (id >= 0 && osdmap->is_up(id)) {
6212 // I'm cheating mon heartbeat grace logic, because we know it's not going
6213 // to respawn alone. +1 so we won't hit any boundary case.
6214 monc->send_mon_message(
6215 new MOSDFailure(
6216 monc->get_fsid(),
6217 id,
6218 osdmap->get_addrs(id),
6219 cct->_conf->osd_heartbeat_grace + 1,
6220 osdmap->get_epoch(),
6221 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
6222 ));
6223 }
6224 }
6225 }
6226 return true;
6227 }
6228
6229 struct C_OSD_GetVersion : public Context {
6230 OSD *osd;
6231 uint64_t oldest, newest;
6232 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
6233 void finish(int r) override {
6234 if (r >= 0)
6235 osd->_got_mon_epochs(oldest, newest);
6236 }
6237 };
6238
void OSD::start_boot()
{
  // Begin the boot sequence: check health, enter PREBOOT, and ask the mon
  // for its oldest/newest osdmap epochs (reply handled via C_OSD_GetVersion
  // -> _got_mon_epochs -> _preboot).
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
           << ".." << superblock.newest_map << dendl;
  // the completion fills c->oldest / c->newest before finish() runs
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
6257
6258 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
6259 {
6260 std::lock_guard l(osd_lock);
6261 if (is_preboot()) {
6262 _preboot(oldest, newest);
6263 }
6264 }
6265
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  // Decide whether we can send MOSDBoot yet, given the mon's [oldest,newest]
  // osdmap range.  Each branch below names a condition that blocks boot; if
  // none applies and our map is recent enough, queue the boot on a separate
  // thread.  Otherwise fall through and subscribe for newer maps.
  ceph_assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
           << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  {
    std::lock_guard l(heartbeat_lock);
    heartbeat();
  }

  const auto& monmap = monc->monmap;
  const auto osdmap = get_osdmap();
  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
         << dendl;
  } else if (osdmap->require_osd_release < ceph_release_t::luminous) {
    derr << "osdmap require_osd_release < luminous; please upgrade to luminous"
         << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (monmap.min_mon_release >= ceph_release_t::octopus &&
             superblock.purged_snaps_last < superblock.current_epoch) {
    // catch up on purged-snaps records before booting (octopus+ mons only)
    dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
             << " < newest_map " << superblock.current_epoch << dendl;
    _get_purged_snaps();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
             osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {

    // wait for pgs to fully catch up in a different thread, since
    // this thread might be required for splitting and merging PGs to
    // make progress.
    boot_finisher.queue(
      new LambdaContext(
        [this](int r) {
          std::unique_lock l(osd_lock);
          if (is_preboot()) {
            dout(10) << __func__ << " waiting for peering work to drain"
                     << dendl;
            // drop osd_lock while waiting so peering threads can progress
            l.unlock();
            for (auto shard : shards) {
              shard->wait_min_pg_epoch(get_osdmap_epoch());
            }
            l.lock();
          }
          // re-check state: it may have changed while the lock was dropped
          if (is_preboot()) {
            _send_boot();
          }
        }));
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
6338
void OSD::_get_purged_snaps()
{
  // Ask the mon for purged-snaps records covering the epochs between our
  // last recorded one and the current superblock epoch; the reply is
  // processed in handle_get_purged_snaps_reply().
  // NOTE: this is a naive, stateless implementation. it may send multiple
  // overlapping requests to the mon, which will be somewhat inefficient, but
  // it should be reliable.
  dout(10) << __func__ << " purged_snaps_last " << superblock.purged_snaps_last
           << ", newest_map " << superblock.current_epoch << dendl;
  MMonGetPurgedSnaps *m = new MMonGetPurgedSnaps(
    superblock.purged_snaps_last + 1,
    superblock.current_epoch + 1);
  monc->send_mon_message(m);
}
6351
6352 void OSD::handle_get_purged_snaps_reply(MMonGetPurgedSnapsReply *m)
6353 {
6354 dout(10) << __func__ << " " << *m << dendl;
6355 ObjectStore::Transaction t;
6356 if (!is_preboot() ||
6357 m->last < superblock.purged_snaps_last) {
6358 goto out;
6359 }
6360 SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
6361 make_purged_snaps_oid(), &t,
6362 m->purged_snaps);
6363 superblock.purged_snaps_last = m->last;
6364 write_superblock(t);
6365 store->queue_transaction(
6366 service.meta_ch,
6367 std::move(t));
6368 service.publish_superblock(superblock);
6369 if (m->last < superblock.current_epoch) {
6370 _get_purged_snaps();
6371 } else {
6372 start_boot();
6373 }
6374 out:
6375 m->put();
6376 }
6377
6378 void OSD::send_full_update()
6379 {
6380 if (!service.need_fullness_update())
6381 return;
6382 unsigned state = 0;
6383 if (service.is_full()) {
6384 state = CEPH_OSD_FULL;
6385 } else if (service.is_backfillfull()) {
6386 state = CEPH_OSD_BACKFILLFULL;
6387 } else if (service.is_nearfull()) {
6388 state = CEPH_OSD_NEARFULL;
6389 }
6390 set<string> s;
6391 OSDMap::calc_state_set(state, s);
6392 dout(10) << __func__ << " want state " << s << dendl;
6393 monc->send_mon_message(new MOSDFull(get_osdmap_epoch(), state));
6394 }
6395
6396 void OSD::start_waiting_for_healthy()
6397 {
6398 dout(1) << "start_waiting_for_healthy" << dendl;
6399 set_state(STATE_WAITING_FOR_HEALTHY);
6400 last_heartbeat_resample = utime_t();
6401
6402 // subscribe to osdmap updates, in case our peers really are known to be dead
6403 osdmap_subscribe(get_osdmap_epoch() + 1, false);
6404 }
6405
6406 bool OSD::_is_healthy()
6407 {
6408 if (!cct->get_heartbeat_map()->is_healthy()) {
6409 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6410 return false;
6411 }
6412
6413 if (is_waiting_for_healthy()) {
6414 utime_t now = ceph_clock_now();
6415 if (osd_markdown_log.empty()) {
6416 dout(5) << __func__ << " force returning true since last markdown"
6417 << " was " << cct->_conf->osd_max_markdown_period
6418 << "s ago" << dendl;
6419 return true;
6420 }
6421 std::lock_guard l(heartbeat_lock);
6422 int num = 0, up = 0;
6423 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6424 p != heartbeat_peers.end();
6425 ++p) {
6426 if (p->second.is_healthy(now))
6427 ++up;
6428 ++num;
6429 }
6430 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6431 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6432 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6433 return false;
6434 }
6435 }
6436
6437 return true;
6438 }
6439
void OSD::_send_boot()
{
  // Compose and send MOSDBoot to the mon.  Before doing so:
  //  - resolve any still-unknown bind addresses on the cluster and heartbeat
  //    messengers by borrowing from addresses we do know, and
  //  - ensure each relevant loopback connection has a Session attached
  //    (via the fast-connect delivery path).
  dout(10) << "_send_boot" << dendl;
  Connection *local_connection =
    cluster_messenger->get_loopback_connection().get();
  entity_addrvec_t client_addrs = client_messenger->get_myaddrs();
  entity_addrvec_t cluster_addrs = cluster_messenger->get_myaddrs();
  entity_addrvec_t hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  entity_addrvec_t hb_front_addrs = hb_front_server_messenger->get_myaddrs();

  dout(20) << " initial client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  // cluster addrs default to the client (public) addrs when unset
  if (cluster_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming cluster_addrs match client_addrs "
             << client_addrs << dendl;
    cluster_addrs = cluster_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // hb_back addrs default to the cluster addrs when unset
  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_server_messenger->set_addr_unknowns(cluster_addrs)) {
    dout(10) << " assuming hb_back_addrs match cluster_addrs "
             << cluster_addrs << dendl;
    hb_back_addrs = hb_back_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // hb_front addrs default to the client (public) addrs when unset
  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_server_messenger->set_addr_unknowns(client_addrs)) {
    dout(10) << " assuming hb_front_addrs match client_addrs "
             << client_addrs << dendl;
    hb_front_addrs = hb_front_server_messenger->get_myaddrs();
  }
  if (auto session = local_connection->get_priv(); !session) {
    hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  // we now know what our front and back addrs will be, and we are
  // about to tell the mon what our metadata (including numa bindings)
  // are, so now is a good time!
  set_numa_affinity();

  MOSDBoot *mboot = new MOSDBoot(
    superblock, get_osdmap_epoch(), service.get_boot_epoch(),
    hb_back_addrs, hb_front_addrs, cluster_addrs,
    CEPH_FEATURES_ALL);
  dout(10) << " final client_addrs " << client_addrs
           << ", cluster_addrs " << cluster_addrs
           << ", hb_back_addrs " << hb_back_addrs
           << ", hb_front_addrs " << hb_front_addrs
           << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6502
void OSD::_collect_metadata(map<string,string> *pm)
{
  // Populate *pm with key/value metadata describing this OSD: data/journal
  // paths, messenger addresses, objectstore properties, system info, NUMA
  // topology, and backing-device details.  Sent to the mon in MOSDBoot.
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddrs());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddrs());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddrs());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddrs());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  string osdspec_affinity;
  int r = store->read_meta("osdspec_affinity", &osdspec_affinity);
  if (r < 0 || osdspec_affinity.empty()) {
    // missing meta key is not an error; just report an empty affinity
    osdspec_affinity = "";
  }
  (*pm)["osdspec_affinity"] = osdspec_affinity;
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  // which local NIC carries each address (input for NUMA detection below)
  (*pm)["front_iface"] = pick_iface(
    cct,
    client_messenger->get_myaddrs().front().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(
    cct,
    cluster_messenger->get_myaddrs().front().get_sockaddr_storage());

  // network numa
  {
    int node = -1;
    set<int> nodes;
    set<string> unknown;
    for (auto nm : { "front_iface", "back_iface" }) {
      if (!(*pm)[nm].size()) {
        // interface could not be determined above
        unknown.insert(nm);
        continue;
      }
      int n = -1;
      int r = get_iface_numa_node((*pm)[nm], &n);
      if (r < 0) {
        unknown.insert((*pm)[nm]);
        continue;
      }
      nodes.insert(n);
      if (node < 0) {
        node = n;
      }
    }
    if (unknown.size()) {
      (*pm)["network_numa_unknown_ifaces"] = stringify(unknown);
    }
    if (!nodes.empty()) {
      (*pm)["network_numa_nodes"] = stringify(nodes);
    }
    // only report a single network NUMA node when every iface resolved to
    // the same node
    if (node >= 0 && nodes.size() == 1 && unknown.empty()) {
      (*pm)["network_numa_node"] = stringify(node);
    }
  }

  if (numa_node >= 0) {
    (*pm)["numa_node"] = stringify(numa_node);
    (*pm)["numa_node_cpus"] = cpu_set_to_str_list(numa_cpu_set_size,
                                                  &numa_cpu_set);
  }

  // per-device metadata from the objectstore's backing devices
  set<string> devnames;
  store->get_devices(&devnames);
  map<string,string> errs;
  get_device_metadata(devnames, pm, &errs);
  for (auto& i : errs) {
    dout(1) << __func__ << " " << i.first << ": " << i.second << dendl;
  }
  dout(10) << __func__ << " " << *pm << dendl;
}
6585
6586 void OSD::queue_want_up_thru(epoch_t want)
6587 {
6588 std::shared_lock map_locker{map_lock};
6589 epoch_t cur = get_osdmap()->get_up_thru(whoami);
6590 std::lock_guard report_locker(mon_report_lock);
6591 if (want > up_thru_wanted) {
6592 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6593 << ", currently " << cur
6594 << dendl;
6595 up_thru_wanted = want;
6596 send_alive();
6597 } else {
6598 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6599 << ", currently " << cur
6600 << dendl;
6601 }
6602 }
6603
6604 void OSD::send_alive()
6605 {
6606 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6607 const auto osdmap = get_osdmap();
6608 if (!osdmap->exists(whoami))
6609 return;
6610 epoch_t up_thru = osdmap->get_up_thru(whoami);
6611 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6612 if (up_thru_wanted > up_thru) {
6613 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6614 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
6615 }
6616 }
6617
6618 void OSD::request_full_map(epoch_t first, epoch_t last)
6619 {
6620 dout(10) << __func__ << " " << first << ".." << last
6621 << ", previously requested "
6622 << requested_full_first << ".." << requested_full_last << dendl;
6623 ceph_assert(ceph_mutex_is_locked(osd_lock));
6624 ceph_assert(first > 0 && last > 0);
6625 ceph_assert(first <= last);
6626 ceph_assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6627 if (requested_full_first == 0) {
6628 // first request
6629 requested_full_first = first;
6630 requested_full_last = last;
6631 } else if (last <= requested_full_last) {
6632 // dup
6633 return;
6634 } else {
6635 // additional request
6636 first = requested_full_last + 1;
6637 requested_full_last = last;
6638 }
6639 MMonGetOSDMap *req = new MMonGetOSDMap;
6640 req->request_full(first, last);
6641 monc->send_mon_message(req);
6642 }
6643
6644 void OSD::got_full_map(epoch_t e)
6645 {
6646 ceph_assert(requested_full_first <= requested_full_last);
6647 ceph_assert(ceph_mutex_is_locked(osd_lock));
6648 if (requested_full_first == 0) {
6649 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6650 return;
6651 }
6652 if (e < requested_full_first) {
6653 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6654 << ".." << requested_full_last
6655 << ", ignoring" << dendl;
6656 return;
6657 }
6658 if (e >= requested_full_last) {
6659 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6660 << ".." << requested_full_last << ", resetting" << dendl;
6661 requested_full_first = requested_full_last = 0;
6662 return;
6663 }
6664
6665 requested_full_first = e + 1;
6666
6667 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6668 << ".." << requested_full_last
6669 << ", still need more" << dendl;
6670 }
6671
6672 void OSD::requeue_failures()
6673 {
6674 std::lock_guard l(heartbeat_lock);
6675 unsigned old_queue = failure_queue.size();
6676 unsigned old_pending = failure_pending.size();
6677 for (auto p = failure_pending.begin(); p != failure_pending.end(); ) {
6678 failure_queue[p->first] = p->second.first;
6679 failure_pending.erase(p++);
6680 }
6681 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6682 << failure_queue.size() << dendl;
6683 }
6684
6685 void OSD::send_failures()
6686 {
6687 ceph_assert(ceph_mutex_is_locked(map_lock));
6688 ceph_assert(ceph_mutex_is_locked(mon_report_lock));
6689 std::lock_guard l(heartbeat_lock);
6690 utime_t now = ceph_clock_now();
6691 const auto osdmap = get_osdmap();
6692 while (!failure_queue.empty()) {
6693 int osd = failure_queue.begin()->first;
6694 if (!failure_pending.count(osd)) {
6695 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6696 monc->send_mon_message(
6697 new MOSDFailure(
6698 monc->get_fsid(),
6699 osd,
6700 osdmap->get_addrs(osd),
6701 failed_for,
6702 osdmap->get_epoch()));
6703 failure_pending[osd] = make_pair(failure_queue.begin()->second,
6704 osdmap->get_addrs(osd));
6705 }
6706 failure_queue.erase(osd);
6707 }
6708 }
6709
6710 void OSD::send_still_alive(epoch_t epoch, int osd, const entity_addrvec_t &addrs)
6711 {
6712 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), osd, addrs, 0, epoch,
6713 MOSDFailure::FLAG_ALIVE);
6714 monc->send_mon_message(m);
6715 }
6716
6717 void OSD::cancel_pending_failures()
6718 {
6719 std::lock_guard l(heartbeat_lock);
6720 auto it = failure_pending.begin();
6721 while (it != failure_pending.end()) {
6722 dout(10) << __func__ << " canceling in-flight failure report for osd."
6723 << it->first << dendl;
6724 send_still_alive(get_osdmap_epoch(), it->first, it->second.second);
6725 failure_pending.erase(it++);
6726 }
6727 }
6728
6729 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6730 {
6731 const auto& monmap = monc->monmap;
6732 // send beacon to mon even if we are just connected, and the monmap is not
6733 // initialized yet by then.
6734 if (monmap.epoch > 0 &&
6735 monmap.get_required_features().contains_all(
6736 ceph::features::mon::FEATURE_LUMINOUS)) {
6737 dout(20) << __func__ << " sending" << dendl;
6738 MOSDBeacon* beacon = nullptr;
6739 {
6740 std::lock_guard l{min_last_epoch_clean_lock};
6741 beacon = new MOSDBeacon(get_osdmap_epoch(),
6742 min_last_epoch_clean,
6743 superblock.last_purged_snaps_scrub);
6744 beacon->pgs = min_last_epoch_clean_pgs;
6745 last_sent_beacon = now;
6746 }
6747 monc->send_mon_message(beacon);
6748 } else {
6749 dout(20) << __func__ << " not sending" << dendl;
6750 }
6751 }
6752
6753 void OSD::handle_command(MCommand *m)
6754 {
6755 ConnectionRef con = m->get_connection();
6756 auto session = ceph::ref_cast<Session>(con->get_priv());
6757 if (!session) {
6758 con->send_message(new MCommandReply(m, -EACCES));
6759 m->put();
6760 return;
6761 }
6762 if (!session->caps.allow_all()) {
6763 con->send_message(new MCommandReply(m, -EACCES));
6764 m->put();
6765 return;
6766 }
6767 cct->get_admin_socket()->queue_tell_command(m);
6768 m->put();
6769 }
6770
6771 namespace {
6772 class unlock_guard {
6773 ceph::mutex& m;
6774 public:
6775 explicit unlock_guard(ceph::mutex& mutex)
6776 : m(mutex)
6777 {
6778 m.unlock();
6779 }
6780 unlock_guard(unlock_guard&) = delete;
6781 ~unlock_guard() {
6782 m.lock();
6783 }
6784 };
6785 }
6786
void OSD::scrub_purged_snaps()
{
  // Scrub the purged-snaps record against the snap mapper and requeue any
  // stray snaps for re-trim on their PGs.  osd_lock is dropped across the
  // (potentially long-running) scrub and re-taken afterwards, so OSD state
  // must be re-checked once the lock is re-acquired.
  dout(10) << __func__ << dendl;
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  SnapMapper::Scrubber s(cct, store, service.meta_ch,
                         make_snapmapper_oid(),
                         make_purged_snaps_oid());
  clog->debug() << "purged_snaps scrub starts";
  osd_lock.unlock();
  s.run();
  if (s.stray.size()) {
    clog->debug() << "purged_snaps scrub found " << s.stray.size() << " strays";
  } else {
    clog->debug() << "purged_snaps scrub ok";
  }
  // queue each stray for re-trim, at most once per (pg, snap) pair
  set<pair<spg_t,snapid_t>> queued;
  for (auto& [pool, snap, hash, shard] : s.stray) {
    const pg_pool_t *pi = get_osdmap()->get_pg_pool(pool);
    if (!pi) {
      dout(20) << __func__ << " pool " << pool << " dne" << dendl;
      continue;
    }
    pg_t pgid(pi->raw_hash_to_pg(hash), pool);
    spg_t spgid(pgid, shard);
    pair<spg_t,snapid_t> p(spgid, snap);
    if (queued.count(p)) {
      dout(20) << __func__ << " pg " << spgid << " snap " << snap
               << " already queued" << dendl;
      continue;
    }
    PGRef pg = lookup_lock_pg(spgid);
    if (!pg) {
      dout(20) << __func__ << " pg " << spgid << " not found" << dendl;
      continue;
    }
    queued.insert(p);
    dout(10) << __func__ << " requeue pg " << spgid << " " << pg << " snap "
             << snap << dendl;
    pg->queue_snap_retrim(snap);
    pg->unlock();
  }
  osd_lock.lock();
  // state may have changed while osd_lock was dropped
  if (is_stopping()) {
    return;
  }
  // persist the completion time of this scrub in the superblock
  dout(10) << __func__ << " done queueing pgs, updating superblock" << dendl;
  ObjectStore::Transaction t;
  superblock.last_purged_snaps_scrub = ceph_clock_now();
  write_superblock(t);
  int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
  ceph_assert(tr == 0);
  if (is_active()) {
    send_beacon(ceph::coarse_mono_clock::now());
  }
  dout(10) << __func__ << " done" << dendl;
}
6843
6844 void OSD::probe_smart(const string& only_devid, ostream& ss)
6845 {
6846 set<string> devnames;
6847 store->get_devices(&devnames);
6848 uint64_t smart_timeout = cct->_conf.get_val<uint64_t>(
6849 "osd_smart_report_timeout");
6850
6851 // == typedef std::map<std::string, mValue> mObject;
6852 json_spirit::mObject json_map;
6853
6854 for (auto dev : devnames) {
6855 // smartctl works only on physical devices; filter out any logical device
6856 if (dev.find("dm-") == 0) {
6857 continue;
6858 }
6859
6860 string err;
6861 string devid = get_device_id(dev, &err);
6862 if (devid.size() == 0) {
6863 dout(10) << __func__ << " no unique id for dev " << dev << " ("
6864 << err << "), skipping" << dendl;
6865 continue;
6866 }
6867 if (only_devid.size() && devid != only_devid) {
6868 continue;
6869 }
6870
6871 json_spirit::mValue smart_json;
6872 if (block_device_get_metrics(dev, smart_timeout,
6873 &smart_json)) {
6874 dout(10) << "block_device_get_metrics failed for /dev/" << dev << dendl;
6875 continue;
6876 }
6877 json_map[devid] = smart_json;
6878 }
6879 json_spirit::write(json_map, ss, json_spirit::pretty_print);
6880 }
6881
6882 bool OSD::heartbeat_dispatch(Message *m)
6883 {
6884 dout(30) << "heartbeat_dispatch " << m << dendl;
6885 switch (m->get_type()) {
6886
6887 case CEPH_MSG_PING:
6888 dout(10) << "ping from " << m->get_source_inst() << dendl;
6889 m->put();
6890 break;
6891
6892 case MSG_OSD_PING:
6893 handle_osd_ping(static_cast<MOSDPing*>(m));
6894 break;
6895
6896 default:
6897 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6898 m->put();
6899 }
6900
6901 return true;
6902 }
6903
6904 bool OSD::ms_dispatch(Message *m)
6905 {
6906 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6907 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6908 service.got_stop_ack();
6909 m->put();
6910 return true;
6911 }
6912
6913 // lock!
6914
6915 osd_lock.lock();
6916 if (is_stopping()) {
6917 osd_lock.unlock();
6918 m->put();
6919 return true;
6920 }
6921
6922 do_waiters();
6923 _dispatch(m);
6924
6925 osd_lock.unlock();
6926
6927 return true;
6928 }
6929
6930 void OSDService::maybe_share_map(
6931 Connection *con,
6932 const OSDMapRef& osdmap,
6933 epoch_t peer_epoch_lb)
6934 {
6935 // NOTE: we assume caller hold something that keeps the Connection itself
6936 // pinned (e.g., an OpRequest's MessageRef).
6937 auto session = ceph::ref_cast<Session>(con->get_priv());
6938 if (!session) {
6939 return;
6940 }
6941
6942 // assume the peer has the newer of the op's sent_epoch and what
6943 // we think we sent them.
6944 session->sent_epoch_lock.lock();
6945 if (peer_epoch_lb > session->last_sent_epoch) {
6946 dout(10) << __func__ << " con " << con
6947 << " " << con->get_peer_addr()
6948 << " map epoch " << session->last_sent_epoch
6949 << " -> " << peer_epoch_lb << " (as per caller)" << dendl;
6950 session->last_sent_epoch = peer_epoch_lb;
6951 }
6952 epoch_t last_sent_epoch = session->last_sent_epoch;
6953 session->sent_epoch_lock.unlock();
6954
6955 if (osdmap->get_epoch() <= last_sent_epoch) {
6956 return;
6957 }
6958
6959 send_incremental_map(last_sent_epoch, con, osdmap);
6960 last_sent_epoch = osdmap->get_epoch();
6961
6962 session->sent_epoch_lock.lock();
6963 if (session->last_sent_epoch < last_sent_epoch) {
6964 dout(10) << __func__ << " con " << con
6965 << " " << con->get_peer_addr()
6966 << " map epoch " << session->last_sent_epoch
6967 << " -> " << last_sent_epoch << " (shared)" << dendl;
6968 session->last_sent_epoch = last_sent_epoch;
6969 }
6970 session->sent_epoch_lock.unlock();
6971 }
6972
void OSD::dispatch_session_waiting(const ceph::ref_t<Session>& session, OSDMapRef osdmap)
{
  // Drain ops queued on this session that were waiting for a newer osdmap.
  // waiting_on_map is an intrusive list (ops are appended in arrival order
  // by ms_fast_dispatch); each queued op holds an extra ref which we drop
  // (op->put()) once the op is unlinked from the list.
  ceph_assert(ceph_mutex_is_locked(session->session_dispatch_lock));

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);
    ceph_assert(ms_can_fast_dispatch(op->get_req()));
    auto m = op->get_req<MOSDFastDispatchOp>();
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // this op needs a newer map than we have; stop here to preserve
      // delivery order for the rest of the queue
      break;
    }
    session->waiting_on_map.erase(i++);
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries a raw pg; resolve it to the primary shard
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
        static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
        // no primary shard for this pg in the current map; drop the op
        continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, std::move(op), m->get_map_epoch());
  }

  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7007
void OSD::ms_fast_dispatch(Message *m)
{
  // Fast-dispatch entry point: handle peering/control messages directly,
  // and wrap everything else in an OpRequest queued to the op shards.
  // Ownership of m transfers to whichever branch handles it.
  FUNCTRACE(cct);
  if (service.is_stopping()) {
    m->put();
    return;
  }

  // peering event?
  switch (m->get_type()) {
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    return;
  case MSG_OSD_FORCE_RECOVERY:
    handle_fast_force_recovery(static_cast<MOSDForceRecovery*>(m));
    return;
  case MSG_OSD_SCRUB2:
    handle_fast_scrub(static_cast<MOSDScrub2*>(m));
    return;

  case MSG_OSD_PG_CREATE2:
    return handle_fast_pg_create(static_cast<MOSDPGCreate2*>(m));
  case MSG_OSD_PG_QUERY:
    return handle_fast_pg_query(static_cast<MOSDPGQuery*>(m));
  case MSG_OSD_PG_NOTIFY:
    return handle_fast_pg_notify(static_cast<MOSDPGNotify*>(m));
  case MSG_OSD_PG_INFO:
    return handle_fast_pg_info(static_cast<MOSDPGInfo*>(m));
  case MSG_OSD_PG_REMOVE:
    return handle_fast_pg_remove(static_cast<MOSDPGRemove*>(m));

  // these are single-pg messages that handle themselves
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_PG_NOTIFY2:
  case MSG_OSD_PG_QUERY2:
  case MSG_OSD_PG_INFO2:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
  case MSG_OSD_PG_LEASE:
  case MSG_OSD_PG_LEASE_ACK:
    {
      MOSDPeeringOp *pm = static_cast<MOSDPeeringOp*>(m);
      // only accept peering messages from actual OSD peers
      if (require_osd_peer(pm)) {
        enqueue_peering_evt(
          pm->get_spg(),
          PGPeeringEventRef(pm->get_event()));
      }
      pm->put();
      return;
    }
  }

  // everything else becomes a tracked OpRequest
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
               reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req's epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  ceph_assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      std::move(op),
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    auto priv = m->get_connection()->get_priv();
    if (auto session = static_cast<Session*>(priv.get()); session) {
      std::lock_guard l{session->session_dispatch_lock};
      // extra ref for the intrusive waiting_on_map list; dropped when the
      // op is unlinked in dispatch_session_waiting()
      op->get();
      session->waiting_on_map.push_back(*op);
      OSDMapRef nextmap = service.get_nextmap_reserved();
      dispatch_session_waiting(session, nextmap);
      service.release_map(nextmap);
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7104
// Messenger authentication hook: create (or reuse) the connection's Session
// and install the peer's capabilities on it.
// Returns 1 on success, 0 if no explicit caps were supplied, and -EACCES if
// the caps blob could not be decoded or parsed.
int OSD::ms_handle_authentication(Connection *con)
{
  int ret = 0;
  auto s = ceph::ref_cast<Session>(con->get_priv());
  if (!s) {
    // first time we see this connection: attach a fresh session
    s = ceph::make_ref<Session>(cct, con);
    con->set_priv(s);
    s->entity_name = con->get_peer_entity_name();
    dout(10) << __func__ << " new session " << s << " con " << s->con
             << " entity " << s->entity_name
             << " addr " << con->get_peer_addrs() << dendl;
  } else {
    dout(10) << __func__ << " existing session " << s << " con " << s->con
             << " entity " << s->entity_name
             << " addr " << con->get_peer_addrs() << dendl;
  }

  AuthCapsInfo &caps_info = con->get_peer_caps_info();
  if (caps_info.allow_all) {
    s->caps.set_allow_all();
  } else if (caps_info.caps.length() > 0) {
    // caps arrive as an encoded string; decode then parse into OSDCap
    bufferlist::const_iterator p = caps_info.caps.cbegin();
    string str;
    try {
      decode(str, p);
    }
    catch (buffer::error& e) {
      dout(10) << __func__ << " session " << s << " " << s->entity_name
               << " failed to decode caps string" << dendl;
      ret = -EACCES;
    }
    if (!ret) {
      bool success = s->caps.parse(str);
      if (success) {
        dout(10) << __func__ << " session " << s
                 << " " << s->entity_name
                 << " has caps " << s->caps << " '" << str << "'" << dendl;
        ret = 1;
      } else {
        dout(10) << __func__ << " session " << s << " " << s->entity_name
                 << " failed to parse caps '" << str << "'" << dendl;
        ret = -EACCES;
      }
    }
  }
  return ret;
}
7152
7153 void OSD::do_waiters()
7154 {
7155 ceph_assert(ceph_mutex_is_locked(osd_lock));
7156
7157 dout(10) << "do_waiters -- start" << dendl;
7158 while (!finished.empty()) {
7159 OpRequestRef next = finished.front();
7160 finished.pop_front();
7161 dispatch_op(next);
7162 }
7163 dout(10) << "do_waiters -- finish" << dendl;
7164 }
7165
7166 void OSD::dispatch_op(OpRequestRef op)
7167 {
7168 switch (op->get_req()->get_type()) {
7169
7170 case MSG_OSD_PG_CREATE:
7171 handle_pg_create(op);
7172 break;
7173 }
7174 }
7175
// Slow (non-fast) dispatch path, running under osd_lock.  Handles the few
// message types that are not fast-dispatched.  Typed handlers consume the
// message reference themselves.
void OSD::_dispatch(Message *m)
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {
  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;
  case MSG_MON_GET_PURGED_SNAPS_REPLY:
    handle_get_purged_snaps_reply(static_cast<MMonGetPurgedSnapsReply*>(m));
    break;

  // osd
  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    return;

  // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
    {
      // wrap in a tracked op; the tracker takes over the message ref
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
        op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map? starting up?
      if (!get_osdmap()) {
        dout(7) << "no OSDMap, not booted" << dendl;
        logger->inc(l_osd_waiting_for_map);
        // park the op until the first map arrives
        waiting_for_osdmap.push_back(op);
        op->mark_delayed("no osdmap");
        break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7222
// remove me post-nautilus
// Legacy (pre-MOSDScrub2) scrub request from the mon/mgr: translate each
// requested pg (or all pgs, if none listed) into a RequestScrub peering
// event.  Consumes `m` on every path.
void OSD::handle_scrub(MOSDScrub *m)
{
  dout(10) << "handle_scrub " << *m << dendl;
  // only the monitor or a mgr may tell us to scrub
  if (!require_mon_or_mgr_peer(m)) {
    m->put();
    return;
  }
  // ignore requests from a different cluster
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid()
            << dendl;
    m->put();
    return;
  }

  vector<spg_t> spgs;
  _get_pgids(&spgs);

  if (!m->scrub_pgs.empty()) {
    // restrict to the requested pgs for which we are the primary shard
    vector<spg_t> v;
    for (auto pgid : m->scrub_pgs) {
      spg_t pcand;
      if (get_osdmap()->get_primary_shard(pgid, &pcand) &&
          std::find(spgs.begin(), spgs.end(), pcand) != spgs.end()) {
        v.push_back(pcand);
      }
    }
    spgs.swap(v);
  }

  for (auto pgid : spgs) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
        std::make_shared<PGPeeringEvent>(
          get_osdmap_epoch(),
          get_osdmap_epoch(),
          PeeringState::RequestScrub(m->deep, m->repair))));
  }

  m->put();
}
7265
7266 void OSD::handle_fast_scrub(MOSDScrub2 *m)
7267 {
7268 dout(10) << __func__ << " " << *m << dendl;
7269 if (!require_mon_or_mgr_peer(m)) {
7270 m->put();
7271 return;
7272 }
7273 if (m->fsid != monc->get_fsid()) {
7274 dout(0) << __func__ << " fsid " << m->fsid << " != " << monc->get_fsid()
7275 << dendl;
7276 m->put();
7277 return;
7278 }
7279 for (auto pgid : m->scrub_pgs) {
7280 enqueue_peering_evt(
7281 pgid,
7282 PGPeeringEventRef(
7283 std::make_shared<PGPeeringEvent>(
7284 m->epoch,
7285 m->epoch,
7286 PeeringState::RequestScrub(m->deep, m->repair))));
7287 }
7288 m->put();
7289 }
7290
7291 bool OSD::scrub_random_backoff()
7292 {
7293 bool coin_flip = (rand() / (double)RAND_MAX >=
7294 cct->_conf->osd_scrub_backoff_ratio);
7295 if (!coin_flip) {
7296 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7297 return true;
7298 }
7299 return false;
7300 }
7301
// Build a scrub job for pg `pg`, nominally due at `timestamp`.
// Pool-level interval overrides (<= 0 means "unset") take precedence over
// the global osd_scrub_{min,max}_interval config values.  Unless the scrub
// was explicitly requested (`must`), the scheduled time is pushed out by the
// min interval plus a random spread so pgs don't all come due at once.
OSDService::ScrubJob::ScrubJob(CephContext* cct,
                               const spg_t& pg, const utime_t& timestamp,
                               double pool_scrub_min_interval,
                               double pool_scrub_max_interval, bool must)
  : cct(cct),
    pgid(pg),
    sched_time(timestamp),
    deadline(timestamp)
{
  // if not explicitly requested, postpone the scrub with a random delay
  if (!must) {
    double scrub_min_interval = pool_scrub_min_interval > 0 ?
      pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
    double scrub_max_interval = pool_scrub_max_interval > 0 ?
      pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;

    sched_time += scrub_min_interval;
    // random jitter in [0, min_interval * randomize_ratio)
    double r = rand() / (double)RAND_MAX;
    sched_time +=
      scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
    if (scrub_max_interval == 0) {
      // zero max interval means "no hard deadline"
      deadline = utime_t();
    } else {
      deadline += scrub_max_interval;
    }

  }
}
7330
7331 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7332 if (sched_time < rhs.sched_time)
7333 return true;
7334 if (sched_time > rhs.sched_time)
7335 return false;
7336 return pgid < rhs.pgid;
7337 }
7338
7339 double OSD::scrub_sleep_time(bool must_scrub)
7340 {
7341 if (must_scrub) {
7342 return cct->_conf->osd_scrub_sleep;
7343 }
7344 utime_t now = ceph_clock_now();
7345 if (scrub_time_permit(now)) {
7346 return cct->_conf->osd_scrub_sleep;
7347 }
7348 double normal_sleep = cct->_conf->osd_scrub_sleep;
7349 double extended_sleep = cct->_conf->osd_scrub_extended_sleep;
7350 return std::max(extended_sleep, normal_sleep);
7351 }
7352
7353 bool OSD::scrub_time_permit(utime_t now)
7354 {
7355 struct tm bdt;
7356 time_t tt = now.sec();
7357 localtime_r(&tt, &bdt);
7358
7359 bool day_permit = false;
7360 if (cct->_conf->osd_scrub_begin_week_day < cct->_conf->osd_scrub_end_week_day) {
7361 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day && bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7362 day_permit = true;
7363 }
7364 } else {
7365 if (bdt.tm_wday >= cct->_conf->osd_scrub_begin_week_day || bdt.tm_wday < cct->_conf->osd_scrub_end_week_day) {
7366 day_permit = true;
7367 }
7368 }
7369
7370 if (!day_permit) {
7371 dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
7372 << " - " << cct->_conf->osd_scrub_end_week_day
7373 << " now " << bdt.tm_wday << " = no" << dendl;
7374 return false;
7375 }
7376
7377 bool time_permit = false;
7378 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7379 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7380 time_permit = true;
7381 }
7382 } else {
7383 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7384 time_permit = true;
7385 }
7386 }
7387 if (!time_permit) {
7388 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7389 << " - " << cct->_conf->osd_scrub_end_hour
7390 << " now " << bdt.tm_hour << " = no" << dendl;
7391 } else {
7392 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7393 << " - " << cct->_conf->osd_scrub_end_hour
7394 << " now " << bdt.tm_hour << " = yes" << dendl;
7395 }
7396 return time_permit;
7397 }
7398
// Decide whether system load is low enough to scrub.  Permits scrubbing if
// either (a) the 1-minute loadavg per CPU is below
// osd_scrub_load_threshold, or (b) current load is below both the daily
// average and the 15-minute average (i.e. load is trending down).
// Returns false if loadavgs cannot be read.
bool OSD::scrub_load_below_threshold()
{
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) != 3) {
    dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
    return false;
  }

  // allow scrub if below configured threshold
  long cpus = sysconf(_SC_NPROCESSORS_ONLN);
  // fall back to raw loadavg if the CPU count is unavailable
  double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
  if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
    dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
             << " < max " << cct->_conf->osd_scrub_load_threshold
             << " = yes" << dendl;
    return true;
  }

  // allow scrub if below daily avg and currently decreasing
  if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
    dout(20) << __func__ << " loadavg " << loadavgs[0]
             << " < daily_loadavg " << daily_loadavg
             << " and < 15m avg " << loadavgs[2]
             << " = yes" << dendl;
    return true;
  }

  dout(20) << __func__ << " loadavg " << loadavgs[0]
           << " >= max " << cct->_conf->osd_scrub_load_threshold
           << " and ( >= daily_loadavg " << daily_loadavg
           << " or >= 15m avg " << loadavgs[2]
           << ") = no" << dendl;
  return false;
}
7433
// Walk the scrub schedule (ordered by sched_time) and kick off the first
// eligible pg scrub.  Bails out early when no scrub slots are available,
// when the earliest job is not yet due, or when a reservation is already
// in flight.  During recovery, only explicitly requested repairs run
// (subject to osd_scrub_during_recovery / osd_repair_during_recovery).
void OSD::sched_scrub()
{
  // if not permitted, fail fast
  if (!service.can_inc_scrubs()) {
    return;
  }
  bool allow_requested_repair_only = false;
  if (service.is_recovery_active() && !cct->_conf->osd_scrub_during_recovery) {
    if (!cct->_conf->osd_repair_during_recovery) {
      dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
      return;
    }
    dout(10) << __func__
             << " will only schedule explicitly requested repair due to active recovery"
             << dendl;
    allow_requested_repair_only = true;
  }

  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      if (scrub.sched_time > now) {
        // save ourselves some effort
        // (jobs are ordered by sched_time, so nothing later is due either)
        dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
                 << " > " << now << dendl;
        break;
      }

      // a past-deadline job runs regardless of time window and load
      if ((scrub.deadline.is_zero() || scrub.deadline >= now) && !(time_permit && load_is_low)) {
        dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
                 << (!time_permit ? "time not permit" : "high load") << dendl;
        continue;
      }

      PGRef pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
        continue;
      // This has already started, so go on to the next scrub job
      if (pg->scrubber.active) {
        pg->unlock();
        dout(30) << __func__ << ": already in progress pgid " << scrub.pgid << dendl;
        continue;
      }
      // Skip other kinds of scrubing if only explicitly requested repairing is allowed
      if (allow_requested_repair_only && !pg->scrubber.must_repair) {
        pg->unlock();
        dout(10) << __func__ << " skip " << scrub.pgid
                 << " because repairing is not explicitly requested on it"
                 << dendl;
        continue;
      }
      // If it is reserving, let it resolve before going to the next scrub job
      if (pg->scrubber.local_reserved && !pg->scrubber.active) {
        pg->unlock();
        dout(30) << __func__ << ": reserve in progress pgid " << scrub.pgid << dendl;
        break;
      }
      dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
               << (pg->get_must_scrub() ? ", explicitly requested" :
                   (load_is_low ? ", load_is_low" : " deadline < now"))
               << dendl;
      // sched_scrub() returning true means we launched one: stop here
      if (pg->sched_scrub()) {
        pg->unlock();
        break;
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
7511
7512 void OSD::resched_all_scrubs()
7513 {
7514 dout(10) << __func__ << ": start" << dendl;
7515 const vector<spg_t> pgs = [this] {
7516 vector<spg_t> pgs;
7517 OSDService::ScrubJob job;
7518 if (service.first_scrub_stamp(&job)) {
7519 do {
7520 pgs.push_back(job.pgid);
7521 } while (service.next_scrub_stamp(job, &job));
7522 }
7523 return pgs;
7524 }();
7525 for (auto& pgid : pgs) {
7526 dout(20) << __func__ << ": examine " << pgid << dendl;
7527 PGRef pg = _lookup_lock_pg(pgid);
7528 if (!pg)
7529 continue;
7530 if (!pg->scrubber.must_scrub && !pg->scrubber.need_auto) {
7531 dout(15) << __func__ << ": reschedule " << pgid << dendl;
7532 pg->on_info_history_change();
7533 }
7534 pg->unlock();
7535 }
7536 dout(10) << __func__ << ": done" << dendl;
7537 }
7538
// Assemble an MPGStats report for the mgr: current osd_stat, per-pg stats
// for every pg we are primary for, and per-pool statfs where the object
// store supports it.  Also recomputes min_last_epoch_clean as a side
// effect.  Caller owns the returned message.
MPGStats* OSD::collect_pg_stats()
{
  // This implementation unconditionally sends every is_primary PG's
  // stats every time we're called.  This has equivalent cost to the
  // previous implementation's worst case where all PGs are busy and
  // their stats are always enqueued for sending.
  std::shared_lock l{map_lock};

  osd_stat_t cur_stat = service.get_osd_stat();
  cur_stat.os_perf_stat = store->get_cur_stats();

  auto m = new MPGStats(monc->get_fsid(), get_osdmap_epoch());
  m->osd_stat = cur_stat;

  // held while the per-pg callbacks below fold in their last-epoch-clean
  std::lock_guard lec{min_last_epoch_clean_lock};
  min_last_epoch_clean = get_osdmap_epoch();
  min_last_epoch_clean_pgs.clear();

  std::set<int64_t> pool_set;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    auto pool = pg->pg_id.pgid.pool();
    pool_set.emplace((int64_t)pool);
    // only the primary reports pg stats
    if (!pg->is_primary()) {
      continue;
    }
    pg->get_pg_stats([&](const pg_stat_t& s, epoch_t lec) {
      m->pg_stat[pg->pg_id.pgid] = s;
      min_last_epoch_clean = min(min_last_epoch_clean, lec);
      min_last_epoch_clean_pgs.push_back(pg->pg_id.pgid);
    });
  }
  store_statfs_t st;
  bool per_pool_stats = false;
  bool per_pool_omap_stats = false;
  for (auto p : pool_set) {
    int r = store->pool_statfs(p, &st, &per_pool_omap_stats);
    if (r == -ENOTSUP) {
      // object store has no per-pool accounting; stop probing
      break;
    } else {
      assert(r >= 0);
      m->pool_stat[p] = st;
      per_pool_stats = true;
    }
  }

  // indicate whether we are reporting per-pool stats
  m->osd_stat.num_osds = 1;
  m->osd_stat.num_per_pool_osds = per_pool_stats ? 1 : 0;
  m->osd_stat.num_per_pool_omap_osds = per_pool_omap_stats ? 1 : 0;

  return m;
}
7593
// Collect health metrics for the mgr: the number of in-flight ops older
// than osd_op_complaint_time (SLOW_OPS, with the age of the oldest) and
// the count of pending pg creations where we would be primary.
vector<DaemonHealthMetric> OSD::get_health_metrics()
{
  vector<DaemonHealthMetric> metrics;
  {
    utime_t oldest_secs;
    const utime_t now = ceph_clock_now();
    auto too_old = now;
    too_old -= cct->_conf.get_val<double>("osd_op_complaint_time");
    int slow = 0;
    TrackedOpRef oldest_op;
    // visitor: count ops initiated before the complaint cutoff and
    // remember the oldest one for logging
    auto count_slow_ops = [&](TrackedOp& op) {
      if (op.get_initiated() < too_old) {
        stringstream ss;
        ss << "slow request " << op.get_desc()
           << " initiated "
           << op.get_initiated()
           << " currently "
           << op.state_string();
        lgeneric_subdout(cct,osd,20) << ss.str() << dendl;
        clog->warn() << ss.str();
        slow++;
        if (!oldest_op || op.get_initiated() < oldest_op->get_initiated()) {
          oldest_op = &op;
        }
        return true;
      } else {
        return false;
      }
    };
    if (op_tracker.visit_ops_in_flight(&oldest_secs, count_slow_ops)) {
      if (slow) {
        derr << __func__ << " reporting " << slow << " slow ops, oldest is "
             << oldest_op->get_desc() << dendl;
      }
      metrics.emplace_back(daemon_metric::SLOW_OPS, slow, oldest_secs);
    } else {
      // no news is not good news.
      metrics.emplace_back(daemon_metric::SLOW_OPS, 0, 0);
    }
  }
  {
    std::lock_guard l(pending_creates_lock);
    auto n_primaries = pending_creates_from_mon;
    // count osd-driven creates where we would be the primary
    for (const auto& create : pending_creates_from_osd) {
      if (create.second) {
        n_primaries++;
      }
    }
    metrics.emplace_back(daemon_metric::PENDING_CREATING_PGS, n_primaries);
  }
  return metrics;
}
7646
7647 // =====================================================
7648 // MAP
7649
// Park `op` until a newer OSDMap arrives.  Only the first waiter triggers a
// map subscription; subsequent ops just join the queue.
void OSD::wait_for_new_map(OpRequestRef op)
{
  // ask?
  if (waiting_for_osdmap.empty()) {
    osdmap_subscribe(get_osdmap_epoch() + 1, false);
  }

  logger->inc(l_osd_waiting_for_map);
  waiting_for_osdmap.push_back(op);
  op->mark_delayed("wait for new map");
}
7661
7662
7663 /** update_map
7664 * assimilate new OSDMap(s). scan pgs, etc.
7665 */
7666
7667 void OSD::note_down_osd(int peer)
7668 {
7669 ceph_assert(ceph_mutex_is_locked(osd_lock));
7670 cluster_messenger->mark_down_addrs(get_osdmap()->get_cluster_addrs(peer));
7671
7672 std::lock_guard l{heartbeat_lock};
7673 failure_queue.erase(peer);
7674 failure_pending.erase(peer);
7675 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7676 if (p != heartbeat_peers.end()) {
7677 p->second.clear_mark_down();
7678 heartbeat_peers.erase(p);
7679 }
7680 }
7681
// A peer OSD came up in the new map: flag the heartbeat peer set as stale
// so it gets recomputed.
void OSD::note_up_osd(int peer)
{
  heartbeat_set_peers_need_update();
}
7686
// Commit callback for handle_osd_map(): once the maps in [first, last] are
// durable, hand them to OSD::_committed_osd_maps() and drop the message
// reference that was held across the commit.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;  // inclusive epoch range just persisted
  MOSDMap *msg;         // originating message; ref owned until finish()
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
7698
7699 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7700 {
7701 std::lock_guard l(osdmap_subscribe_lock);
7702 if (latest_subscribed_epoch >= epoch && !force_request)
7703 return;
7704
7705 latest_subscribed_epoch = std::max<uint64_t>(epoch, latest_subscribed_epoch);
7706
7707 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7708 force_request) {
7709 monc->renew_subs();
7710 }
7711 }
7712
// Remove stored osdmaps older than `oldest` (bounded by what the map cache
// still pins).  Deletions are batched into transactions of at most
// osd_target_transaction_size removals; `nreceived` throttles work to
// roughly keep pace with incoming maps, and `skip_maps` forces trimming to
// continue across batches when a gap in history must be fully cleared.
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // never trim past what the map cache still references
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      // flush this batch: persist the new oldest_map with the removals
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
      ceph_assert(tr == 0);
      num = 0;
      if (!skip_maps) {
        // skip_maps leaves us with a range of old maps if we fail to remove all
        // of them before moving superblock.oldest_map forward to the first map
        // in the incoming MOSDMap msg. so we should continue removing them in
        // this case, even we could do huge series of delete transactions all at
        // once.
        break;
      }
    }
  }
  if (num > 0) {
    // flush the final partial batch
    service.publish_superblock(superblock);
    write_superblock(t);
    int tr = store->queue_transaction(service.meta_ch, std::move(t), nullptr);
    ceph_assert(tr == 0);
  }
  // we should not remove the cached maps
  ceph_assert(min <= service.map_cache.cached_key_lower_bound());
}
7752
// Ingest a batch of OSDMaps from a mon or peer OSD: validate the sender,
// persist full/incremental maps (re-encoding incrementals and verifying the
// CRC), record pg_num/pool-deletion history and purged snaps, then queue
// the whole thing as one transaction whose commit fires C_OnMapCommit ->
// _committed_osd_maps().  Consumes `m` on every early-return path; on the
// success path the message ref is released by C_OnMapCommit.
void OSD::handle_osd_map(MOSDMap *m)
{
  // wait for pgs to catch up
  {
    // we extend the map cache pins to accomodate pgs slow to consume maps
    // for some period, until we hit the max_lag_factor bound, at which point
    // we block here to stop injesting more maps than they are able to keep
    // up with.
    epoch_t max_lag = cct->_conf->osd_map_cache_size *
      m_osd_pg_epoch_max_lag_factor;
    ceph_assert(max_lag > 0);
    epoch_t osd_min = 0;
    // minimum pg epoch across all shards = how far behind the slowest pg is
    for (auto shard : shards) {
      epoch_t min = shard->get_min_pg_epoch();
      if (osd_min == 0 || min < osd_min) {
        osd_min = min;
      }
    }
    epoch_t osdmap_epoch = get_osdmap_epoch();
    if (osd_min > 0 &&
        osdmap_epoch > max_lag &&
        osdmap_epoch - max_lag > osd_min) {
      epoch_t need = osdmap_epoch - max_lag;
      dout(10) << __func__ << " waiting for pgs to catch up (need " << need
               << " max_lag " << max_lag << ")" << dendl;
      for (auto shard : shards) {
        epoch_t min = shard->get_min_pg_epoch();
        if (need > min) {
          dout(10) << __func__ << " waiting for pgs to consume " << need
                   << " (shard " << shard->shard_id << " min " << min
                   << ", map cache is " << cct->_conf->osd_map_cache_size
                   << ", max_lag_factor " << m_osd_pg_epoch_max_lag_factor
                   << ")" << dendl;
          // drop osd_lock while blocking so pgs can make progress
          unlock_guard unlock{osd_lock};
          shard->wait_min_pg_epoch(need);
        }
      }
    }
  }

  ceph_assert(ceph_mutex_is_locked(osd_lock));
  map<epoch_t,OSDMapRef> added_maps;
  map<epoch_t,bufferlist> added_maps_bl;
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
            << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only mons and osds are trusted sources of maps
  auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
  if (session && !(session->entity_name.is_mon() ||
                   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
             << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    return;
  }

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
          << superblock.newest_map
          << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
          << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    ceph_assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
             << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    if (m->oldest_map <= superblock.newest_map + 1) {
      // the source still has the gap epochs; re-request them
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can. this
    // 1- is good to have
    // 2- is at present the only way to ensure that we get a *full* map as
    // the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> purged_snaps;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = std::max(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // guard against transaction byte counter wrapping
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      ceph_assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // full map supplied: persist and cache as-is
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = bl;
      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // incremental: apply to the previous full map, then re-encode a full
      // map locally and cross-check the CRC against the mon's
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
        bufferlist obl;
        bool got = get_map_bl(e - 1, obl);
        if (!got) {
          // previous map arrived in this same message batch
          auto p = added_maps_bl.find(e - 1);
          ceph_assert(p != added_maps_bl.end());
          obl = p->second;
        }
        o->decode(obl);
      }

      OSDMap::Incremental inc;
      auto p = bl.cbegin();
      inc.decode(p);

      if (o->apply_incremental(inc) < 0) {
        derr << "ERROR: bad fsid? i have " << get_osdmap()->get_fsid() << " and inc has " << inc.fsid << dendl;
        ceph_abort_msg("bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // for testing: optionally pretend the CRC check failed
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
          (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
        derr << __func__ << " injecting map crc failure" << dendl;
        injected_failure = true;
      }

      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
        // our re-encoded full map doesn't match the mon's: fall back to
        // requesting full maps from epoch e onward and commit only what
        // we already validated
        dout(2) << "got incremental " << e
                << " but failed to encode full with correct crc; requesting"
                << dendl;
        clog->warn() << "failed to encode map e" << e << " with expected crc";
        dout(20) << "my encoded map was:\n";
        fbl.hexdump(*_dout);
        *_dout << dendl;
        delete o;
        request_full_map(e, last);
        last = e - 1;

        // don't continue committing if we failed to enc the first inc map
        if (last < start) {
          dout(10) << __func__ << " bailing because last < start (" << last << "<" << start << ")" << dendl;
          m->put();
          return;
        }
        break;
      }
      got_full_map(e);
      purged_snaps[e] = o->get_new_purged_snaps();

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      added_maps[e] = add_map(o);
      added_maps_bl[e] = fbl;
      continue;
    }

    ceph_abort_msg("MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
             << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
    pg_num_history.prune(superblock.oldest_map);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // check for pg_num changes and deleted pools
  OSDMapRef lastmap;
  for (auto& i : added_maps) {
    if (!lastmap) {
      if (!(lastmap = service.try_get_map(i.first - 1))) {
        dout(10) << __func__ << " can't get previous map " << i.first - 1
                 << " probably first start of this osd" << dendl;
        continue;
      }
    }
    ceph_assert(lastmap->get_epoch() + 1 == i.second->get_epoch());
    for (auto& j : lastmap->get_pools()) {
      if (!i.second->have_pg_pool(j.first)) {
        pg_num_history.log_pool_delete(i.first, j.first);
        dout(10) << __func__ << " recording final pg_pool_t for pool "
                 << j.first << dendl;
        // this information is needed by _make_pg() if have to restart before
        // the pool is deleted and need to instantiate a new (zombie) PG[Pool].
        ghobject_t obj = make_final_pool_info_oid(j.first);
        bufferlist bl;
        encode(j.second, bl, CEPH_FEATURES_ALL);
        string name = lastmap->get_pool_name(j.first);
        encode(name, bl);
        map<string,string> profile;
        if (lastmap->get_pg_pool(j.first)->is_erasure()) {
          profile = lastmap->get_erasure_code_profile(
            lastmap->get_pg_pool(j.first)->erasure_code_profile);
        }
        encode(profile, bl);
        t.write(coll_t::meta(), obj, 0, bl.length(), bl);
      } else if (unsigned new_pg_num = i.second->get_pg_num(j.first);
                 new_pg_num != j.second.get_pg_num()) {
        dout(10) << __func__ << " recording pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << " -> " << new_pg_num << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first, new_pg_num);
      }
    }
    for (auto& j : i.second->get_pools()) {
      if (!lastmap->have_pg_pool(j.first)) {
        dout(10) << __func__ << " recording new pool " << j.first << " pg_num "
                 << j.second.get_pg_num() << dendl;
        pg_num_history.log_pg_num_change(i.first, j.first,
                                         j.second.get_pg_num());
      }
    }
    lastmap = i.second;
  }
  pg_num_history.epoch = last;
  {
    // persist the accumulated pg_num history alongside the maps
    bufferlist bl;
    ::encode(pg_num_history, bl);
    t.write(coll_t::meta(), make_pg_num_history_oid(), 0, bl.length(), bl);
    dout(20) << __func__ << " pg_num_history " << pg_num_history << dendl;
  }

  // record new purged_snaps
  // (only if our record is contiguous with this batch; otherwise skip)
  if (superblock.purged_snaps_last == start - 1) {
    SnapMapper::record_purged_snaps(cct, store, service.meta_ch,
                                    make_purged_snaps_oid(), &t,
                                    purged_snaps);
    superblock.purged_snaps_last = last;
  } else {
    dout(10) << __func__ << " superblock purged_snaps_last is "
             << superblock.purged_snaps_last
             << ", not recording new purged_snaps" << dendl;
  }

  // superblock and commit
  write_superblock(t);
  // C_OnMapCommit takes over the message ref and calls _committed_osd_maps
  t.register_on_commit(new C_OnMapCommit(this, start, last, m));
  store->queue_transaction(
    service.meta_ch,
    std::move(t));
  service.publish_superblock(superblock);
}
8073
// Completion callback fired once the transaction persisting osdmap epochs
// [first..last] has committed (queued via C_OnMapCommit).  Advances the
// in-memory osdmap one epoch at a time, notes peers that went up/down,
// and decides whether this OSD must rebind+restart (marked down while
// still alive) or shut down entirely (removed/stopped in the map).
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  std::lock_guard l(osd_lock);
  // re-check after acquiring osd_lock: shutdown may have begun while we
  // were waiting for the lock.
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  map_lock.lock();

  ceph_assert(first <= last);

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;
  OSDMapRef osdmap = get_osdmap();

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
	     << " (<= last " << last
	     << " <= newest_map " << superblock.newest_map
	     << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    ceph_assert(newmap);  // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap = get_osdmap();
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
	  osdmap->is_up(*p) && // in old map
	  newmap->is_down(*p)) {    // but not the new one
        if (!waited_for_reservations) {
	  // wait (once) for in-flight map reservations before tearing
	  // down connections, so nobody is still using the old map
          service.await_reserved_maps();
          waited_for_reservations = true;
        }
	note_down_osd(*p);
      } else if (*p != whoami &&
		 osdmap->is_down(*p) &&
		 newmap->is_up(*p)) {
	note_up_osd(*p);
      }
    }

    if (osdmap->is_noup(whoami) != newmap->is_noup(whoami)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
	       << dendl;
      if (is_booting()) {
	// this captures the case where we sent the boot message while
	// NOUP was being set on the mon and our boot request was
	// dropped, and then later it is cleared.  it imperfectly
	// handles the case where our original boot message was not
	// dropped and we restart even though we might have booted, but
	// that is harmless (boot will just take slightly longer).
	do_restart = true;
      }
    }

    osdmap = std::move(newmap);
    set_osdmap(osdmap);
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    // if this map marks us up at our current address, record the
    // up/boot epochs (boot_epoch only on the first such transition).
    if (!up_epoch &&
	osdmap->is_up(whoami) &&
	osdmap->get_addrs(whoami) == client_messenger->get_myaddrs()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
	boot_epoch = osdmap->get_epoch();
	dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  // if the final map says we are up at the address we bound after our
  // last rebind, the boot completed: flip booting -> active.
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addrs(whoami).legacy_equals(
	client_messenger->get_myaddrs()) &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);
      do_restart = false;

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
      cancel_pending_failures();
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      derr << "map says i do not exist. shutting down." << dendl;
      do_shutdown = true; // don't call shutdown() while we have
			  // everything paused
    } else if (osdmap->is_stop(whoami)) {
      derr << "map says i am stopped by admin. shutting down." << dendl;
      do_shutdown = true;
    } else if (!osdmap->is_up(whoami) ||
	       !osdmap->get_addrs(whoami).legacy_equals(
		 client_messenger->get_myaddrs()) ||
	       !osdmap->get_cluster_addrs(whoami).legacy_equals(
		 cluster_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_back_addrs(whoami).legacy_equals(
		 hb_back_server_messenger->get_myaddrs()) ||
	       !osdmap->get_hb_front_addrs(whoami).legacy_equals(
		 hb_front_server_messenger->get_myaddrs())) {
      // we are marked down, or one of our recorded addresses no longer
      // matches what we are actually bound to: log why, then rebind.
      if (!osdmap->is_up(whoami)) {
	if (service.is_preparing_to_stop() || service.is_stopping()) {
	  // expected mark-down as part of a clean stop
	  service.got_stop_ack();
	} else {
	  clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
			  "but it is still running";
	  clog->debug() << "map e" << osdmap->get_epoch()
			<< " wrongly marked me down at e"
			<< osdmap->get_down_at(whoami);
	}
	if (monc->monmap.min_mon_release >= ceph_release_t::octopus) {
	  // note that this is best-effort...
	  monc->send_mon_message(
	    new MOSDMarkMeDead(
	      monc->get_fsid(),
	      whoami,
	      osdmap->get_epoch()));
	}
      } else if (!osdmap->get_addrs(whoami).legacy_equals(
		   client_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong client addr (" << osdmap->get_addrs(whoami)
		      << " != my " << client_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_cluster_addrs(whoami).legacy_equals(
		   cluster_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong cluster addr ("
		      << osdmap->get_cluster_addrs(whoami)
		      << " != my " << cluster_messenger->get_myaddrs() << ")";
      } else if (!osdmap->get_hb_back_addrs(whoami).legacy_equals(
		   hb_back_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat back addr ("
		      << osdmap->get_hb_back_addrs(whoami)
		      << " != my " << hb_back_server_messenger->get_myaddrs()
		      << ")";
      } else if (!osdmap->get_hb_front_addrs(whoami).legacy_equals(
		   hb_front_server_messenger->get_myaddrs())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong heartbeat front addr ("
		      << osdmap->get_hb_front_addrs(whoami)
		      << " != my " << hb_front_server_messenger->get_myaddrs()
		      << ")";
      }

      if (!service.is_stopping()) {
	// reset up/bind epochs and prepare to rebind + reboot
	epoch_t up_epoch = 0;
	epoch_t bind_epoch = osdmap->get_epoch();
	service.set_epochs(NULL,&up_epoch, &bind_epoch);
	do_restart = true;

	//add markdown log
	utime_t now = ceph_clock_now();
	utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
	osd_markdown_log.push_back(now);
	// too many markdowns within the configured period: give up and
	// shut down instead of flapping forever
	if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
	  derr << __func__ << " marked down "
	       << osd_markdown_log.size()
	       << " > osd_max_markdown_count "
	       << cct->_conf->osd_max_markdown_count
	       << " in last " << grace << " seconds, shutting down"
	       << dendl;
	  do_restart = false;
	  do_shutdown = true;
	}

	start_waiting_for_healthy();

	set<int> avoid_ports;
#if defined(__FreeBSD__)
	// prevent FreeBSD from grabbing the client_messenger port during
	// rebinding. In which case a cluster_messenger will connect also
	// to the same port
	client_messenger->get_myaddrs().get_ports(&avoid_ports);
#endif
	cluster_messenger->get_myaddrs().get_ports(&avoid_ports);

	int r = cluster_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  derr << __func__ << " marked down:"
	       << " rebind cluster_messenger failed" << dendl;
	}

	// drop all heartbeat connections; they will be re-established
	// against the new bind addresses
	hb_back_server_messenger->mark_down_all();
	hb_front_server_messenger->mark_down_all();
	hb_front_client_messenger->mark_down_all();
	hb_back_client_messenger->mark_down_all();

	reset_heartbeat_peers(true);
      }
    }
  } else if (osdmap->get_epoch() > 0 && osdmap->is_stop(whoami)) {
    derr << "map says i am stopped by admin. shutting down." << dendl;
    do_shutdown = true;
  }

  map_lock.unlock();

  check_osdmap_features();

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (is_active()) {
    activate_map();
  }

  if (do_shutdown) {
    if (network_error) {
      cancel_pending_failures();
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (m->newest_map && m->newest_map > last) {
    // the mon has more maps than we were sent; ask for the rest
    dout(10) << " msg say newest map is " << m->newest_map
	     << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
8332
// Re-derive the msgr feature requirements and on-disk compat bits from
// the current osdmap (e.g. after the CRUSH map changes), and persist the
// require_osd_release marker when it moves.
void OSD::check_osdmap_features()
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures at their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  const auto osdmap = get_osdmap();
  // client-facing default policy
  {
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for clients" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  // policy for connections to/from monitors
  {
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << " was " << p.features_required
	      << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  // policy for peer OSDs on the cluster network
  {
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // one-time upgrade of the superblock's on-disk compat set; persisted
    // synchronously (NULL completion) via a small transaction.
    if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_ch, std::move(t), NULL);
      ceph_assert(err == 0);
    }
  }

  // heartbeat authorizers are only required once the whole cluster is
  // at nautilus or newer
  if (osdmap->require_osd_release < ceph_release_t::nautilus) {
    hb_front_server_messenger->set_require_authorizer(false);
    hb_back_server_messenger->set_require_authorizer(false);
  } else {
    hb_front_server_messenger->set_require_authorizer(true);
    hb_back_server_messenger->set_require_authorizer(true);
  }

  // persist the release floor in the store's metadata when it changes
  if (osdmap->require_osd_release != last_require_osd_release) {
    dout(1) << __func__ << " require_osd_release " << last_require_osd_release
	    << " -> " << to_string(osdmap->require_osd_release) << dendl;
    store->write_meta("require_osd_release",
		      stringify((int)osdmap->require_osd_release));
    last_require_osd_release = osdmap->require_osd_release;
  }
}
8406
8407 struct C_FinishSplits : public Context {
8408 OSD *osd;
8409 set<PGRef> pgs;
8410 C_FinishSplits(OSD *osd, const set<PGRef> &in)
8411 : osd(osd), pgs(in) {}
8412 void finish(int r) override {
8413 osd->_finish_splits(pgs);
8414 }
8415 };
8416
8417 void OSD::_finish_splits(set<PGRef>& pgs)
8418 {
8419 dout(10) << __func__ << " " << pgs << dendl;
8420 if (is_stopping())
8421 return;
8422 for (set<PGRef>::iterator i = pgs.begin();
8423 i != pgs.end();
8424 ++i) {
8425 PG *pg = i->get();
8426
8427 PeeringCtx rctx = create_context();
8428 pg->lock();
8429 dout(10) << __func__ << " " << *pg << dendl;
8430 epoch_t e = pg->get_osdmap_epoch();
8431 pg->handle_initialize(rctx);
8432 pg->queue_null(e, e);
8433 dispatch_context(rctx, pg, service.get_osdmap());
8434 pg->unlock();
8435
8436 unsigned shard_index = pg->pg_id.hash_to_shard(num_shards);
8437 shards[shard_index]->register_and_wake_split_child(pg);
8438 }
8439 };
8440
8441 bool OSD::add_merge_waiter(OSDMapRef nextmap, spg_t target, PGRef src,
8442 unsigned need)
8443 {
8444 std::lock_guard l(merge_lock);
8445 auto& p = merge_waiters[nextmap->get_epoch()][target];
8446 p[src->pg_id] = src;
8447 dout(10) << __func__ << " added merge_waiter " << src->pg_id
8448 << " for " << target << ", have " << p.size() << "/" << need
8449 << dendl;
8450 return p.size() == need;
8451 }
8452
// Advance @pg's osdmap one epoch at a time up to @osd_epoch, handling
// any splits and merges encountered along the way.  Returns true when
// the PG reached @osd_epoch; returns false (with the PG already
// unlocked) when it stopped early because the PG is a merge source/
// target that cannot proceed yet.  Caller must hold the PG lock.
bool OSD::advance_pg(
  epoch_t osd_epoch,
  PG *pg,
  ThreadPool::TPHandle &handle,
  PeeringCtx &rctx)
{
  if (osd_epoch <= pg->get_osdmap_epoch()) {
    return true;
  }
  ceph_assert(pg->is_locked());
  OSDMapRef lastmap = pg->get_osdmap();
  ceph_assert(lastmap->get_epoch() < osd_epoch);
  set<PGRef> new_pgs;  // any split children
  bool ret = true;

  // pg_num of this PG's pool in the map we are advancing from
  // (0 if the pool no longer exists in that map)
  unsigned old_pg_num = lastmap->have_pg_pool(pg->pg_id.pool()) ?
    lastmap->get_pg_num(pg->pg_id.pool()) : 0;
  for (epoch_t next_epoch = pg->get_osdmap_epoch() + 1;
       next_epoch <= osd_epoch;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      // gaps are tolerated; just move on to the next cached epoch
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      continue;
    }

    unsigned new_pg_num =
      (old_pg_num && nextmap->have_pg_pool(pg->pg_id.pool())) ?
      nextmap->get_pg_num(pg->pg_id.pool()) : 0;
    if (old_pg_num && new_pg_num && old_pg_num != new_pg_num) {
      // check for merge
      if (nextmap->have_pg_pool(pg->pg_id.pool())) {
	spg_t parent;
	if (pg->pg_id.is_merge_source(
	      old_pg_num,
	      new_pg_num,
	      &parent)) {
	  // we are merge source
	  PGRef spg = pg;  // carry a ref
	  dout(1) << __func__ << " " << pg->pg_id
		  << " is merge source, target is " << parent
		  << dendl;
	  pg->write_if_dirty(rctx);
	  // flush any pending split children before tearing down this PG
	  if (!new_pgs.empty()) {
	    rctx.transaction.register_on_applied(new C_FinishSplits(this,
								    new_pgs));
	    new_pgs.clear();
	  }
	  dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	  pg->ch->flush();
	  // release backoffs explicitly, since the on_shutdown path
	  // aggressively tears down backoff state.
	  if (pg->is_primary()) {
	    pg->release_pg_backoffs();
	  }
	  pg->on_shutdown();
	  OSDShard *sdata = pg->osd_shard;
	  {
	    std::lock_guard l(sdata->shard_lock);
	    if (pg->pg_slot) {
	      sdata->_detach_pg(pg->pg_slot);
	      // update pg count now since we might not get an osdmap
	      // any time soon.
	      if (pg->is_primary())
		logger->dec(l_osd_pg_primary);
	      else if (pg->is_nonprimary())
		logger->dec(l_osd_pg_replica); // misnomer
	      else
		logger->dec(l_osd_pg_stray);
	    }
	  }
	  pg->unlock();

	  // if we were the last source the target was waiting for,
	  // wake the target with a null peering event
	  set<spg_t> children;
	  parent.is_split(new_pg_num, old_pg_num, &children);
	  if (add_merge_waiter(nextmap, parent, pg, children.size())) {
	    enqueue_peering_evt(
	      parent,
	      PGPeeringEventRef(
		std::make_shared<PGPeeringEvent>(
		  nextmap->get_epoch(),
		  nextmap->get_epoch(),
		  NullEvt())));
	  }
	  ret = false;
	  goto out;
	} else if (pg->pg_id.is_merge_target(old_pg_num, new_pg_num)) {
	  // we are merge target
	  set<spg_t> children;
	  pg->pg_id.is_split(new_pg_num, old_pg_num, &children);
	  dout(20) << __func__ << " " << pg->pg_id
		   << " is merge target, sources are " << children
		   << dendl;
	  map<spg_t,PGRef> sources;
	  {
	    // grab the waiting sources (if all have arrived) under
	    // merge_lock, and clean up the waiter bookkeeping
	    std::lock_guard l(merge_lock);
	    auto& s = merge_waiters[nextmap->get_epoch()][pg->pg_id];
	    unsigned need = children.size();
	    dout(20) << __func__ << " have " << s.size() << "/"
		     << need << dendl;
	    if (s.size() == need) {
	      sources.swap(s);
	      merge_waiters[nextmap->get_epoch()].erase(pg->pg_id);
	      if (merge_waiters[nextmap->get_epoch()].empty()) {
		merge_waiters.erase(nextmap->get_epoch());
	      }
	    }
	  }
	  if (!sources.empty()) {
	    // all sources present: perform the merge now
	    unsigned new_pg_num = nextmap->get_pg_num(pg->pg_id.pool());
	    unsigned split_bits = pg->pg_id.get_split_bits(new_pg_num);
	    dout(1) << __func__ << " merging " << pg->pg_id << dendl;
	    pg->merge_from(
	      sources, rctx, split_bits,
	      nextmap->get_pg_pool(
		pg->pg_id.pool())->last_pg_merge_meta);
	    pg->pg_slot->waiting_for_merge_epoch = 0;
	  } else {
	    dout(20) << __func__ << " not ready to merge yet" << dendl;
	    pg->write_if_dirty(rctx);
	    if (!new_pgs.empty()) {
	      rctx.transaction.register_on_applied(new C_FinishSplits(this,
								      new_pgs));
	      new_pgs.clear();
	    }
	    dispatch_context(rctx, pg, pg->get_osdmap(), &handle);
	    pg->unlock();
	    // kick source(s) to get them ready
	    for (auto& i : children) {
	      dout(20) << __func__ << " kicking source " << i << dendl;
	      enqueue_peering_evt(
		i,
		PGPeeringEventRef(
		  std::make_shared<PGPeeringEvent>(
		    nextmap->get_epoch(),
		    nextmap->get_epoch(),
		    NullEvt())));
	    }
	    ret = false;
	    goto out;
	  }
	}
      }
    }

    // feed this epoch's up/acting sets to the PG's peering machinery
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->pg_id.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    auto oldpool = lastmap->get_pools().find(pg->pg_id.pool());
    auto newpool = nextmap->get_pools().find(pg->pg_id.pool());
    if (oldpool != lastmap->get_pools().end()
        && newpool != nextmap->get_pools().end()) {
      dout(20) << __func__
	       << " new pool opts " << newpool->second.opts
	       << " old pool opts " << oldpool->second.opts
	       << dendl;

      double old_min_interval = 0, new_min_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &old_min_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &new_min_interval);

      double old_max_interval = 0, new_max_interval = 0;
      oldpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &old_max_interval);
      newpool->second.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &new_max_interval);

      // Assume if an interval is change from set to unset or vice versa the actual config
      // is different.  Keep it simple even if it is possible to call resched_all_scrub()
      // unnecessarily.
      if (old_min_interval != new_min_interval || old_max_interval != new_max_interval) {
	pg->on_info_history_change();
      }
    }

    if (new_pg_num && old_pg_num != new_pg_num) {
      // check for split
      set<spg_t> children;
      if (pg->pg_id.is_split(
	    old_pg_num,
	    new_pg_num,
	    &children)) {
	split_pgs(
	  pg, children, &new_pgs, lastmap, nextmap,
	  rctx);
      }
    }

    lastmap = nextmap;
    old_pg_num = new_pg_num;
    handle.reset_tp_timeout();
  }
  pg->handle_activate_map(rctx);

  ret = true;
 out:
  // hand any accumulated split children off once the transaction applies
  if (!new_pgs.empty()) {
    rctx.transaction.register_on_applied(new C_FinishSplits(this, new_pgs));
  }
  return ret;
}
8659
// Publish the current osdmap to the rest of the OSD: prime pending
// splits/merges on the shards, push the map to each shard, prune stale
// pending creates, and queue null peering events so every PG advances to
// the new epoch.  Caller must hold osd_lock.
void OSD::consume_map()
{
  ceph_assert(ceph_mutex_is_locked(osd_lock));
  auto osdmap = get_osdmap();
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  /** make sure the cluster is speaking in SORTBITWISE, because we don't
   * speak the older sorting version any more. Be careful not to force
   * a shutdown if we are merely processing old maps, though.
   */
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  // prime splits and merges
  set<pair<spg_t,epoch_t>> newly_split;  // splits, and when
  set<pair<spg_t,epoch_t>> merge_pgs;    // merge participants, and when
  for (auto& shard : shards) {
    shard->identify_splits_and_merges(osdmap, &newly_split, &merge_pgs);
  }
  if (!newly_split.empty()) {
    for (auto& shard : shards) {
      // prime_splits consumes the entries it handles
      shard->prime_splits(osdmap, &newly_split);
    }
    ceph_assert(newly_split.empty());
  }

  // prune sent_ready_to_merge
  service.prune_sent_ready_to_merge(osdmap);

  // FIXME, maybe: We could race against an incoming peering message
  // that instantiates a merge PG after identify_merges() below and
  // never set up its peer to complete the merge.  An OSD restart
  // would clear it up.  This is a hard race to resolve,
  // extraordinarily rare (we only merge PGs that are stable and
  // clean, so it'd have to be an imported PG to an OSD with a
  // slightly stale OSDMap...), so I'm ignoring it for now.  We plan to
  // replace all of this with a seastar-based code soon anyway.
  if (!merge_pgs.empty()) {
    // mark the pgs we already have, or create new and empty merge
    // participants for those we are missing.  do this all under the
    // shard lock so we don't have to worry about racing pg creates
    // via _process.
    for (auto& shard : shards) {
      shard->prime_merges(osdmap, &merge_pgs);
    }
    ceph_assert(merge_pgs.empty());
  }

  service.prune_pg_created();

  unsigned pushes_to_free = 0;
  for (auto& shard : shards) {
    shard->consume_map(osdmap, &pushes_to_free);
  }

  vector<spg_t> pgids;
  _get_pgids(&pgids);

  // count (FIXME, probably during seastar rewrite)
  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    // FIXME (probably during seastar rewrite): this is lockless and
    // racy, but we don't want to take pg lock here.
    if (pg->is_primary())
      num_pg_primary++;
    else if (pg->is_nonprimary())
      num_pg_replica++;  // misnomer
    else
      num_pg_stray++;
  }

  {
    // FIXME (as part of seastar rewrite): move to OSDShard
    // drop pending creates that no longer map to this OSD
    std::lock_guard l(pending_creates_lock);
    for (auto pg = pending_creates_from_osd.begin();
	 pg != pending_creates_from_osd.end();) {
      if (osdmap->get_pg_acting_role(pg->first, whoami) < 0) {
	dout(10) << __func__ << " pg " << pg->first << " doesn't map here, "
		 << "discarding pending_create_from_osd" << dendl;
	pg = pending_creates_from_osd.erase(pg);
      } else {
	++pg;
      }
    }
  }

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  service.release_reserved_pushes(pushes_to_free);

  // queue null events to push maps down to individual PGs
  for (auto pgid : pgids) {
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt())));
  }
  logger->set(l_osd_pg, pgids.size());
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
8777
8778 void OSD::activate_map()
8779 {
8780 ceph_assert(ceph_mutex_is_locked(osd_lock));
8781 auto osdmap = get_osdmap();
8782
8783 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8784
8785 // norecover?
8786 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8787 if (!service.recovery_is_paused()) {
8788 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8789 service.pause_recovery();
8790 }
8791 } else {
8792 if (service.recovery_is_paused()) {
8793 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8794 service.unpause_recovery();
8795 }
8796 }
8797
8798 service.activate_map();
8799
8800 // process waiters
8801 take_waiters(waiting_for_osdmap);
8802 }
8803
8804 bool OSD::require_mon_peer(const Message *m)
8805 {
8806 if (!m->get_connection()->peer_is_mon()) {
8807 dout(0) << "require_mon_peer received from non-mon "
8808 << m->get_connection()->get_peer_addr()
8809 << " " << *m << dendl;
8810 return false;
8811 }
8812 return true;
8813 }
8814
8815 bool OSD::require_mon_or_mgr_peer(const Message *m)
8816 {
8817 if (!m->get_connection()->peer_is_mon() &&
8818 !m->get_connection()->peer_is_mgr()) {
8819 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8820 << m->get_connection()->get_peer_addr()
8821 << " " << *m << dendl;
8822 return false;
8823 }
8824 return true;
8825 }
8826
8827 bool OSD::require_osd_peer(const Message *m)
8828 {
8829 if (!m->get_connection()->peer_is_osd()) {
8830 dout(0) << "require_osd_peer received from non-osd "
8831 << m->get_connection()->get_peer_addr()
8832 << " " << *m << dendl;
8833 return false;
8834 }
8835 return true;
8836 }
8837
8838 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8839 {
8840 epoch_t up_epoch = service.get_up_epoch();
8841 if (epoch < up_epoch) {
8842 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8843 return false;
8844 }
8845
8846 if (!is_active()) {
8847 dout(7) << "still in boot state, dropping message " << *m << dendl;
8848 return false;
8849 }
8850
8851 return true;
8852 }
8853
// Verify that the sending OSD is the same daemon instance @map says is
// up: it must be up and its cluster addrs must match the message's
// source addrs.  On mismatch the connection is torn down (and its
// Session detached) and false is returned.
bool OSD::require_same_peer_instance(const Message *m, const OSDMapRef& map,
				     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addrs(from) != m->get_source_addrs())) {
    dout(5) << "from dead osd." << from << ", marking down, "
	    << " msg was " << m->get_source_inst().addr
	    << " expected "
	    << (map->is_up(from) ?
		map->get_cluster_addrs(from) : entity_addrvec_t())
	    << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    if (auto s = ceph::ref_cast<Session>(con->get_priv()); s) {
      // fast dispatch already holds session_dispatch_lock; only take it
      // ourselves on the slow path
      if (!is_fast_dispatch)
	s->session_dispatch_lock.lock();
      clear_session_waiting_on_map(s);
      con->set_priv(nullptr);   // break ref <-> session cycle, if any
      s->con.reset();
      if (!is_fast_dispatch)
	s->session_dispatch_lock.unlock();
    }
    return false;
  }
  return true;
}
8882
8883
8884 /*
8885 * require that we have same (or newer) map, and that
8886 * the source is the pg primary.
8887 */
8888 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8889 bool is_fast_dispatch)
8890 {
8891 const Message *m = op->get_req();
8892 const auto osdmap = get_osdmap();
8893 dout(15) << "require_same_or_newer_map " << epoch
8894 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8895
8896 ceph_assert(ceph_mutex_is_locked(osd_lock));
8897
8898 // do they have a newer map?
8899 if (epoch > osdmap->get_epoch()) {
8900 dout(7) << "waiting for newer map epoch " << epoch
8901 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8902 wait_for_new_map(op);
8903 return false;
8904 }
8905
8906 if (!require_self_aliveness(op->get_req(), epoch)) {
8907 return false;
8908 }
8909
8910 // ok, our map is same or newer.. do they still exist?
8911 if (m->get_connection()->get_messenger() == cluster_messenger &&
8912 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8913 return false;
8914 }
8915
8916 return true;
8917 }
8918
8919
8920
8921
8922
8923 // ----------------------------------------
8924 // pg creation
8925
// Split @parent into the children listed in @childpgids (as dictated by
// the pg_num increase between @curmap and @nextmap).  Newly created
// child PGs are added to @out_pgs; all store operations go into
// rctx.transaction.  Caller holds the parent's PG lock.
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PeeringCtx &rctx)
{
  unsigned pg_num = nextmap->get_pg_num(parent->pg_id.pool());
  parent->update_snap_mapper_bits(parent->get_pgid().get_split_bits(pg_num));

  // updated_stats gets one entry per child plus a final entry for the
  // parent itself (consumed after the loop)
  vector<object_stat_sum_t> updated_stats;
  parent->start_split_stats(childpgids, &updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    ceph_assert(stat_iter != updated_stats.end());
    dout(10) << __func__ << " splitting " << *parent << " into " << *i << dendl;
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    child->ch = store->create_new_collection(child->coll);

    {
      // route the child's collection completions to its shard's queue
      uint32_t shard_index = i->hash_to_shard(shards.size());
      assert(NULL != shards[shard_index]);
      store->set_collection_commit_queue(child->coll, &(shards[shard_index]->context_queue));
    }

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << " pg_num is " << pg_num
	     << ", m_seed " << i->ps()
	     << ", split_bits is " << split_bits << dendl;
    // move the child's objects into its own collection, then split the
    // in-memory PG state
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->get_pool().info,
      rctx.transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);

    child->init_collection_pool_opts();

    child->finish_split_stats(*stat_iter, rctx.transaction);
    child->unlock();
  }
  // the final stats entry belongs to the parent
  ceph_assert(stat_iter != updated_stats.end());
  parent->finish_split_stats(*stat_iter, rctx.transaction);
}
8979
/*
 * holding osd_lock
 */
// Handle a legacy MOSDPGCreate from the monitor: for each requested pg,
// verify the pool still exists and that we are the acting primary, then
// queue a peering event carrying a PGCreateInfo to instantiate it.
void OSD::handle_pg_create(OpRequestRef op)
{
  // NOTE: this can be removed in P release (mimic is the last version to
  // send MOSDPGCreate messages).

  auto m = op->get_req<MOSDPGCreate>();
  ceph_assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  const auto osdmap = get_osdmap();
  // mkpg and ctimes are parallel containers: advance both together
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    ceph_assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    ceph_assert(mapped);

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(pg_shard_t(whoami, pgid.shard), acting);

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
	       << "), my role=" << role << ", skipping" << dendl;
      continue;
    }


    PastIntervals pi;
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so we ignore
    // same_interval_since.  We'll pass this history with the current
    // epoch as the event.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
	       << pgid << " from epoch " << m->epoch
	       << ", primary changed in " << history.same_primary_since
	       << dendl;
      continue;
    }
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  osdmap->get_epoch(),
	  osdmap->get_epoch(),
	  NullEvt(),
	  true,
	  new PGCreateInfo(
	    pgid,
	    osdmap->get_epoch(),
	    history,
	    pi,
	    true)
	  )));
  }

  {
    // all creates from this message are queued; remember the epoch once
    // no mon-originated creates remain pending
    std::lock_guard l(pending_creates_lock);
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
9078
9079
9080 // ----------------------------------------
9081 // peering and recovery
9082
9083 PeeringCtx OSD::create_context()
9084 {
9085 return PeeringCtx(get_osdmap()->require_osd_release);
9086 }
9087
// Flush a PeeringCtx: send any accumulated per-OSD peering messages to
// peers that are still up, then queue the accumulated transaction on the
// given PG's collection (if any work was recorded).
// Messages are dropped entirely if we are not up or not active.
void OSD::dispatch_context(PeeringCtx &ctx, PG *pg, OSDMapRef curmap,
                           ThreadPool::TPHandle *handle)
{
  if (!service.get_osdmap()->is_up(whoami)) {
    dout(20) << __func__ << " not up in osdmap" << dendl;
  } else if (!is_active()) {
    dout(20) << __func__ << " not active" << dendl;
  } else {
    for (auto& [osd, ls] : ctx.message_map) {
      // skip peers that went down since the messages were queued
      if (!curmap->is_up(osd)) {
	dout(20) << __func__ << " skipping down osd." << osd << dendl;
	continue;
      }
      ConnectionRef con = service.get_con_osd_cluster(
	osd, curmap->get_epoch());
      if (!con) {
	dout(20) << __func__ << " skipping osd." << osd << " (NULL con)"
		 << dendl;
	continue;
      }
      // make sure the peer has a recent-enough map before our messages
      service.maybe_share_map(con.get(), curmap);
      for (auto m : ls) {
	con->send_message2(m);
      }
      ls.clear();
    }
  }
  // queue the transaction even when message sending was skipped above:
  // local state changes must still be persisted
  if ((!ctx.transaction.empty() || ctx.transaction.has_contexts()) && pg) {
    int tr = store->queue_transaction(
      pg->ch,
      std::move(ctx.transaction), TrackedOpRef(),
      handle);
    ceph_assert(tr == 0);
  }
}
9123
// Fast-dispatch handler for MOSDPGCreate2 from the monitor: queue a
// creation peering event for each requested PG.  Consumes the message.
void OSD::handle_fast_pg_create(MOSDPGCreate2 *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_mon_peer(m)) {
    m->put();
    return;
  }
  for (auto& p : m->pgs) {
    spg_t pgid = p.first;
    epoch_t created = p.second.first;
    utime_t created_stamp = p.second.second;
    auto q = m->pg_extra.find(pgid);
    if (q == m->pg_extra.end()) {
      dout(20) << __func__ << " " << pgid << " e" << created
	       << "@" << created_stamp
	       << " (no history or past_intervals)" << dendl;
      // pre-octopus ... no pg history.  this can be removed in Q release.
      // synthesize a fresh history from the creation epoch/stamp.
      enqueue_peering_evt(
	pgid,
	PGPeeringEventRef(
	  std::make_shared<PGPeeringEvent>(
	    m->epoch,
	    m->epoch,
	    NullEvt(),
	    true,
	    new PGCreateInfo(
	      pgid,
	      created,
	      pg_history_t(created, created_stamp),
	      PastIntervals(),
	      true)
	  )));
    } else {
      dout(20) << __func__ << " " << pgid << " e" << created
	       << "@" << created_stamp
	       << " history " << q->second.first
	       << " pi " << q->second.second << dendl;
      // sanity check: the supplied past_intervals must not extend beyond
      // the epoch the message was sent in
      if (!q->second.second.empty() &&
	  m->epoch < q->second.second.get_bounds().second) {
	clog->error() << "got pg_create on " << pgid << " epoch " << m->epoch
		      << " and unmatched past_intervals " << q->second.second
		      << " (history " << q->second.first << ")";
      } else {
	enqueue_peering_evt(
	  pgid,
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      m->epoch,
	      m->epoch,
	      NullEvt(),
	      true,
	      new PGCreateInfo(
		pgid,
		m->epoch,
		q->second.first,
		q->second.second,
		true)
	    )));
      }
    }
  }

  {
    std::lock_guard l(pending_creates_lock);
    // only advance last_pg_create_epoch once nothing else is outstanding
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  m->put();
}
9195
// Fast-dispatch handler for MOSDPGQuery from a peer OSD: convert each
// per-PG query into an MQuery peering event.  Consumes the message.
void OSD::handle_fast_pg_query(MOSDPGQuery *m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  // only other OSDs may query us about PG state
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->pg_list) {
    enqueue_peering_evt(
      p.first,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  p.second.epoch_sent, p.second.epoch_sent,
	  MQuery(
	    p.first,
	    pg_shard_t(from, p.second.from),
	    p.second,
	    p.second.epoch_sent),
	  false))  // a query never implies PG creation
      );
  }
  m->put();
}
9220
// Fast-dispatch handler for MOSDPGNotify from a peer OSD: turn each
// notify into an MNotifyRec peering event, allowing PG creation (with
// the notify's history/past_intervals) if the PG does not exist yet.
// Consumes the message.
void OSD::handle_fast_pg_notify(MOSDPGNotify* m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->get_pg_list()) {
    // target shard on this OSD is carried in p.to
    spg_t pgid(p.info.pgid.pgid, p.to);
    enqueue_peering_evt(
      pgid,
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  p.epoch_sent,
	  p.query_epoch,
	  MNotifyRec(
	    pgid, pg_shard_t(from, p.from),
	    p,
	    m->get_connection()->get_features()),
	  true,
	  new PGCreateInfo(
	    pgid,
	    p.query_epoch,
	    p.info.history,
	    p.past_intervals,
	    false)  // not created by the mon; peer-driven instantiation
	  )));
  }
  m->put();
}
9252
// Fast-dispatch handler for MOSDPGInfo from a peer OSD: queue an
// MInfoRec peering event per PG.  Unlike notify, info events never
// create a PG.  Consumes the message.
void OSD::handle_fast_pg_info(MOSDPGInfo* m)
{
  dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  if (!require_osd_peer(m)) {
    m->put();
    return;
  }
  int from = m->get_source().num();
  for (auto& p : m->pg_list) {
    enqueue_peering_evt(
      spg_t(p.info.pgid.pgid, p.to),
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  p.epoch_sent, p.query_epoch,
	  MInfoRec(
	    pg_shard_t(from, p.from),
	    p.info,
	    p.epoch_sent)))
      );
  }
  m->put();
}
9275
9276 void OSD::handle_fast_pg_remove(MOSDPGRemove *m)
9277 {
9278 dout(7) << __func__ << " " << *m << " from " << m->get_source() << dendl;
9279 if (!require_osd_peer(m)) {
9280 m->put();
9281 return;
9282 }
9283 for (auto& pgid : m->pg_list) {
9284 enqueue_peering_evt(
9285 pgid,
9286 PGPeeringEventRef(
9287 std::make_shared<PGPeeringEvent>(
9288 m->get_epoch(), m->get_epoch(),
9289 PeeringState::DeleteStart())));
9290 }
9291 m->put();
9292 }
9293
9294 void OSD::handle_fast_force_recovery(MOSDForceRecovery *m)
9295 {
9296 dout(10) << __func__ << " " << *m << dendl;
9297 if (!require_mon_or_mgr_peer(m)) {
9298 m->put();
9299 return;
9300 }
9301 epoch_t epoch = get_osdmap_epoch();
9302 for (auto pgid : m->forced_pgs) {
9303 if (m->options & OFR_BACKFILL) {
9304 if (m->options & OFR_CANCEL) {
9305 enqueue_peering_evt(
9306 pgid,
9307 PGPeeringEventRef(
9308 std::make_shared<PGPeeringEvent>(
9309 epoch, epoch,
9310 PeeringState::UnsetForceBackfill())));
9311 } else {
9312 enqueue_peering_evt(
9313 pgid,
9314 PGPeeringEventRef(
9315 std::make_shared<PGPeeringEvent>(
9316 epoch, epoch,
9317 PeeringState::SetForceBackfill())));
9318 }
9319 } else if (m->options & OFR_RECOVERY) {
9320 if (m->options & OFR_CANCEL) {
9321 enqueue_peering_evt(
9322 pgid,
9323 PGPeeringEventRef(
9324 std::make_shared<PGPeeringEvent>(
9325 epoch, epoch,
9326 PeeringState::UnsetForceRecovery())));
9327 } else {
9328 enqueue_peering_evt(
9329 pgid,
9330 PGPeeringEventRef(
9331 std::make_shared<PGPeeringEvent>(
9332 epoch, epoch,
9333 PeeringState::SetForceRecovery())));
9334 }
9335 }
9336 }
9337 m->put();
9338 }
9339
// Answer a peer's PG query when we do not host the PG at all: reply
// with an empty info (log or notify, depending on the query type) so
// the peer learns the PG does not exist here.  Silently drops the query
// if the pool itself is gone.
void OSD::handle_pg_query_nopg(const MQuery& q)
{
  spg_t pgid = q.pgid;
  dout(10) << __func__ << " " << pgid << dendl;

  OSDMapRef osdmap = get_osdmap();
  if (!osdmap->have_pg_pool(pgid.pool()))
    return;

  dout(10) << " pg " << pgid << " dne" << dendl;
  // empty (default) pg_info_t signals "I have nothing for this PG"
  pg_info_t empty(spg_t(pgid.pgid, q.query.to));
  ConnectionRef con = service.get_con_osd_cluster(q.from.osd, osdmap->get_epoch());
  if (con) {
    Message *m;
    if (q.query.type == pg_query_t::LOG ||
	q.query.type == pg_query_t::FULLLOG) {
      // log queries get an (empty) MOSDPGLog reply
      m = new MOSDPGLog(
	q.query.from, q.query.to,
	osdmap->get_epoch(), empty,
	q.query.epoch_sent);
    } else {
      // everything else gets a notify with empty info/past_intervals
      vector<pg_notify_t> ls;
      ls.push_back(
	pg_notify_t(
	  q.query.from, q.query.to,
	  q.query.epoch_sent,
	  osdmap->get_epoch(),
	  empty,
	  PastIntervals()));
      m = new MOSDPGNotify(osdmap->get_epoch(), std::move(ls));
    }
    service.maybe_share_map(con.get(), osdmap);
    con->send_message(m);
  }
}
9375
9376 void OSDService::queue_check_readable(spg_t spgid,
9377 epoch_t lpr,
9378 ceph::signedspan delay)
9379 {
9380 if (delay == ceph::signedspan::zero()) {
9381 osd->enqueue_peering_evt(
9382 spgid,
9383 PGPeeringEventRef(
9384 std::make_shared<PGPeeringEvent>(
9385 lpr, lpr,
9386 PeeringState::CheckReadable())));
9387 } else {
9388 mono_timer.add_event(
9389 delay,
9390 [this, spgid, lpr]() {
9391 queue_check_readable(spgid, lpr);
9392 });
9393 }
9394 }
9395
9396
9397 // =========================================================
9398 // RECOVERY
9399
// Drain the recovery throttle queue: while capacity remains (per
// _recover_now), move PGs from awaiting_throttle into the work queue,
// reserving up to osd_recovery_max_single_start pushes each.
// Caller must hold recovery_lock.
void OSDService::_maybe_queue_recovery() {
  ceph_assert(ceph_mutex_is_locked_by_me(recovery_lock));
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
	 _recover_now(&available_pushes)) {
    uint64_t to_start = std::min(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    dout(10) << __func__ << " starting " << to_start
	     << ", recovery_ops_reserved " << recovery_ops_reserved
	     << " -> " << (recovery_ops_reserved + to_start) << dendl;
    // account for the reservation; released via release_reserved_pushes()
    recovery_ops_reserved += to_start;
  }
}
9416
9417 bool OSDService::_recover_now(uint64_t *available_pushes)
9418 {
9419 if (available_pushes)
9420 *available_pushes = 0;
9421
9422 if (ceph_clock_now() < defer_recovery_until) {
9423 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9424 return false;
9425 }
9426
9427 if (recovery_paused) {
9428 dout(15) << __func__ << " paused" << dendl;
9429 return false;
9430 }
9431
9432 uint64_t max = osd->get_recovery_max_active();
9433 if (max <= recovery_ops_active + recovery_ops_reserved) {
9434 dout(15) << __func__ << " active " << recovery_ops_active
9435 << " + reserved " << recovery_ops_reserved
9436 << " >= max " << max << dendl;
9437 return false;
9438 }
9439
9440 if (available_pushes)
9441 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9442
9443 return true;
9444 }
9445
9446 unsigned OSDService::get_target_pg_log_entries() const
9447 {
9448 auto num_pgs = osd->get_num_pgs();
9449 auto target = cct->_conf->osd_target_pg_log_entries_per_osd;
9450 if (num_pgs > 0 && target > 0) {
9451 // target an even spread of our budgeted log entries across all
9452 // PGs. note that while we only get to control the entry count
9453 // for primary PGs, we'll normally be responsible for a mix of
9454 // primary and replica PGs (for the same pool(s) even), so this
9455 // will work out.
9456 return std::max<unsigned>(
9457 std::min<unsigned>(target / num_pgs,
9458 cct->_conf->osd_max_pg_log_entries),
9459 cct->_conf->osd_min_pg_log_entries);
9460 } else {
9461 // fall back to a per-pg value.
9462 return cct->_conf->osd_min_pg_log_entries;
9463 }
9464 }
9465
// Run one batch of recovery work for a PG (up to reserved_pushes ops),
// honoring the osd_recovery_sleep throttle by rescheduling instead of
// running when a sleep is due.  Always releases the reserved pushes on
// the way out.  Called from the sharded worker with the PG locked.
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
      // fires after the sleep interval; requeues this same work item
      auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
	         << ", re-queuing recovery" << dendl;
	std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.

      if (auto now = ceph::real_clock::now();
	  service.recovery_schedule_time < now) {
        service.recovery_schedule_time = now;
      }
      service.recovery_schedule_time += ceph::make_timespan(recovery_sleep);
      service.sleep_timer.add_event_at(service.recovery_schedule_time,
				       recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      return;  // reserved pushes are kept; the requeued item will use them
    }
  }

  {
    {
      // next invocation must sleep again (unless reset by the callback)
      std::lock_guard l(service.sleep_lock);
      service.recovery_needs_sleep = true;
    }

    // the PG may have restarted peering since this work was queued
    if (pg->pg_has_reset_since(queued)) {
      goto out;
    }

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->pg_id] << dendl;
#endif

    bool do_unfound = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
	     << " on " << *pg << dendl;

    if (do_unfound) {
      // ask peers about objects we could not find locally
      PeeringCtx rctx = create_context();
      rctx.handle = &handle;
      pg->find_unfound(queued, rctx);
      dispatch_context(rctx, pg, pg->get_osdmap());
    }
  }

 out:
  ceph_assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
9541
// Account for one recovery op starting on the given object; pairs with
// finish_recovery_op().  Takes recovery_lock.
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
	   << " (" << recovery_ops_active << "/"
	   << osd->get_recovery_max_active() << " rops)"
	   << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  // debug-only: track exactly which objects are in flight per PG
  dout(20) << " active was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid) == 0);
  recovery_oids[pg->pg_id].insert(soid);
#endif
}
9557
// Account for one recovery op completing on the given object (pairs
// with start_recovery_op) and kick the throttle so waiting PGs may be
// scheduled with the freed slot.  Takes recovery_lock.
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  std::lock_guard l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
	   << " dequeue=" << dequeue
	   << " (" << recovery_ops_active << "/"
	   << osd->get_recovery_max_active() << " rops)"
	   << dendl;

  // adjust count
  ceph_assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  // debug-only: the object must have been registered by start_recovery_op
  dout(20) << " active oids was " << recovery_oids[pg->pg_id] << dendl;
  ceph_assert(recovery_oids[pg->pg_id].count(soid));
  recovery_oids[pg->pg_id].erase(soid);
#endif

  _maybe_queue_recovery();
}
9579
9580 bool OSDService::is_recovery_active()
9581 {
9582 if (cct->_conf->osd_debug_pretend_recovery_active) {
9583 return true;
9584 }
9585 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9586 }
9587
9588 void OSDService::release_reserved_pushes(uint64_t pushes)
9589 {
9590 std::lock_guard l(recovery_lock);
9591 dout(10) << __func__ << "(" << pushes << "), recovery_ops_reserved "
9592 << recovery_ops_reserved << " -> " << (recovery_ops_reserved-pushes)
9593 << dendl;
9594 ceph_assert(recovery_ops_reserved >= pushes);
9595 recovery_ops_reserved -= pushes;
9596 _maybe_queue_recovery();
9597 }
9598
9599 // =========================================================
9600 // OPS
9601
9602 bool OSD::op_is_discardable(const MOSDOp *op)
9603 {
9604 // drop client request if they are not connected and can't get the
9605 // reply anyway.
9606 if (!op->get_connection()->is_connected()) {
9607 return true;
9608 }
9609 return false;
9610 }
9611
// Queue a client/replica op for the given PG on the sharded op queue,
// recording tracing events and the pre-queue latency.
void OSD::enqueue_op(spg_t pg, OpRequestRef&& op, epoch_t epoch)
{
  const utime_t stamp = op->get_req()->get_recv_stamp();
  // time spent between message receipt and this enqueue
  const utime_t latency = ceph_clock_now() - stamp;
  const unsigned priority = op->get_req()->get_priority();
  const int cost = op->get_req()->get_cost();
  // owner (client id) is used by the scheduler for fairness/ordering
  const uint64_t owner = op->get_req()->get_source().num();

  dout(15) << "enqueue_op " << op << " prio " << priority
	   << " cost " << cost
	   << " latency " << latency
	   << " epoch " << epoch
	   << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", priority);
  op->osd_trace.keyval("cost", cost);
  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  op_shardedwq.queue(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(pg, std::move(op))),
      cost, priority, stamp, owner, epoch));
}
9635
9636 void OSD::enqueue_peering_evt(spg_t pgid, PGPeeringEventRef evt)
9637 {
9638 dout(15) << __func__ << " " << pgid << " " << evt->get_desc() << dendl;
9639 op_shardedwq.queue(
9640 OpSchedulerItem(
9641 unique_ptr<OpSchedulerItem::OpQueueable>(new PGPeeringItem(pgid, evt)),
9642 10,
9643 cct->_conf->osd_peering_op_priority,
9644 utime_t(),
9645 0,
9646 evt->get_epoch_sent()));
9647 }
9648
9649 /*
9650 * NOTE: dequeue called in worker thread, with pg lock
9651 */
/*
 * NOTE: dequeue called in worker thread, with pg lock
 */
// Hand a queued op to its PG: record latency/tracing, opportunistically
// share our osdmap with the sender, then call PG::do_request().
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  const Message *m = op->get_req();

  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);

  // time the op spent queued (receipt -> dequeue)
  utime_t latency = now - m->get_recv_stamp();
  dout(10) << "dequeue_op " << op << " prio " << m->get_priority()
	   << " cost " << m->get_cost()
	   << " latency " << latency
	   << " " << *m
	   << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  // make sure the sender is at least as current as the PG's map
  service.maybe_share_map(m->get_connection().get(),
			  pg->get_osdmap(),
			  op->sent_epoch);

  // a PG being deleted drops all incoming work
  if (pg->is_deleting())
    return;

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << op << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(m, "DEQUEUE_OP_END", false);
}
9689
9690
// Process one queued peering event for a PG (or a pg-less MQuery).
// Advances the PG to the shard's current osdmap first; dispatches the
// resulting context, then handles up_thru and pg_temp follow-ups.
// Called with the PG locked (when pg != nullptr); unlocks it.
void OSD::dequeue_peering_evt(
  OSDShard *sdata,
  PG *pg,
  PGPeeringEventRef evt,
  ThreadPool::TPHandle& handle)
{
  PeeringCtx rctx = create_context();
  auto curmap = sdata->get_osdmap();
  bool need_up_thru = false;
  epoch_t same_interval_since = 0;
  if (!pg) {
    // only queries may legitimately target a nonexistent PG
    if (const MQuery *q = dynamic_cast<const MQuery*>(evt->evt.get())) {
      handle_pg_query_nopg(*q);
    } else {
      derr << __func__ << " unrecognized pg-less event " << evt->get_desc() << dendl;
      ceph_abort();
    }
  } else if (advance_pg(curmap->get_epoch(), pg, handle, rctx)) {
    pg->do_peering_event(evt, rctx);
    if (pg->is_deleted()) {
      // PG went away while handling the event; nothing left to flush
      pg->unlock();
      return;
    }
    dispatch_context(rctx, pg, curmap, &handle);
    // capture these under the PG lock for use after unlock below
    need_up_thru = pg->get_need_up_thru();
    same_interval_since = pg->get_same_interval_since();
    pg->unlock();
  }

  if (need_up_thru) {
    queue_want_up_thru(same_interval_since);
  }

  // flush any pg_temp requests the event may have generated
  service.send_pg_temp();
}
9726
9727 void OSD::dequeue_delete(
9728 OSDShard *sdata,
9729 PG *pg,
9730 epoch_t e,
9731 ThreadPool::TPHandle& handle)
9732 {
9733 dequeue_peering_evt(
9734 sdata,
9735 pg,
9736 PGPeeringEventRef(
9737 std::make_shared<PGPeeringEvent>(
9738 e, e,
9739 PeeringState::DeleteSome())),
9740 handle);
9741 }
9742
9743
9744
9745 // --------------------------------
9746
// Config-observer interface: the NULL-terminated list of option names
// whose runtime changes OSD::handle_conf_change() reacts to.  Keep this
// list in sync with the keys checked there.
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_pg_epoch_max_lag_factor",
    "osd_pg_epoch_persisted_max_stale",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    "osd_object_clean_region_max_num_intervals",
    "osd_scrub_min_interval",
    "osd_scrub_max_interval",
    NULL
  };
  return KEYS;
}
9786
// Config-observer callback: apply runtime changes for each tracked key
// (see get_tracked_conf_keys) to the relevant subsystem, then re-run
// the config sanity checks.  Takes osd_lock.
void OSD::handle_conf_change(const ConfigProxy& conf,
			     const std::set <std::string> &changed)
{
  std::lock_guard l{osd_lock};
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  if (changed.count("osd_map_cache_size")) {
    // resize all three map caches together (full maps, full bl, incremental bl)
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }
  if (changed.count("osd_pg_epoch_max_lag_factor")) {
    m_osd_pg_epoch_max_lag_factor = conf.get_val<double>(
      "osd_pg_epoch_max_lag_factor");
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    // 0 means "unlimited"; leave the throttle untouched in that case
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    // 0 means "unlimited"; leave the throttle untouched in that case
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }
  if (changed.count("osd_object_clean_region_max_num_intervals")) {
    ObjectCleanRegions::set_max_num_intervals(cct->_conf->osd_object_clean_region_max_num_intervals);
  }

  if (changed.count("osd_scrub_min_interval") ||
      changed.count("osd_scrub_max_interval")) {
    resched_all_scrubs();
    dout(0) << __func__ << ": scrub interval change" << dendl;
  }
  check_config();
}
9879
// Re-parse the clog-related config options and push the result into the
// cluster-log client.  Called at startup and whenever a clog_* / host /
// fsid option changes (see handle_conf_change).
void OSD::update_log_config()
{
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  // only apply the new settings if parsing succeeded (returns 0)
  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
			       log_channel, log_prio, log_to_graylog,
			       log_to_graylog_host, log_to_graylog_port,
			       fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
			log_channel, log_prio, log_to_graylog,
			log_to_graylog_host, log_to_graylog_port,
			fsid, host);
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
9902
9903 void OSD::check_config()
9904 {
9905 // some sanity checks
9906 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9907 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9908 << " is not > osd_pg_epoch_persisted_max_stale ("
9909 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
9910 }
9911 if (cct->_conf->osd_object_clean_region_max_num_intervals < 0) {
9912 clog->warn() << "osd_object_clean_region_max_num_intervals ("
9913 << cct->_conf->osd_object_clean_region_max_num_intervals
9914 << ") is < 0";
9915 }
9916 }
9917
9918 // --------------------------------
9919
9920 void OSD::get_latest_osdmap()
9921 {
9922 dout(10) << __func__ << " -- start" << dendl;
9923
9924 C_SaferCond cond;
9925 service.objecter->wait_for_latest_osdmap(&cond);
9926 cond.wait();
9927
9928 dout(10) << __func__ << " -- finish" << dendl;
9929 }
9930
9931 // --------------------------------
9932
// mgr interface: install the dynamic perf-metric queries received from
// the manager, filtering out unsupported ones, and propagate the active
// set to every PG.
void OSD::set_perf_queries(const ConfigPayload &config_payload) {
  const OSDConfigPayload &osd_config_payload = boost::get<OSDConfigPayload>(config_payload);
  const std::map<OSDPerfMetricQuery, OSDPerfMetricLimits> &queries = osd_config_payload.config;
  dout(10) << "setting " << queries.size() << " queries" << dendl;

  // only queries with a non-empty key descriptor are supported here
  std::list<OSDPerfMetricQuery> supported_queries;
  for (auto &it : queries) {
    auto &query = it.first;
    if (!query.key_descriptor.empty()) {
      supported_queries.push_back(query);
    }
  }
  if (supported_queries.size() < queries.size()) {
    dout(1) << queries.size() - supported_queries.size()
            << " unsupported queries" << dendl;
  }
  {
    std::lock_guard locker{m_perf_queries_lock};
    m_perf_queries = supported_queries;
    m_perf_limits = queries;
  }
  // push the new query set down to each PG (under its own lock)
  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  for (auto& pg : pgs) {
    std::scoped_lock l{*pg};
    pg->set_dynamic_perf_stats_queries(supported_queries);
  }
}
9961
// mgr interface: collect dynamic perf stats from every PG, merge them,
// and return the aggregated per-query reports.
MetricPayload OSD::get_perf_reports() {
  OSDMetricPayload payload;
  std::map<OSDPerfMetricQuery, OSDPerfMetricReport> &reports = payload.report;

  std::vector<PGRef> pgs;
  _get_pgs(&pgs);
  DynamicPerfStats dps;
  for (auto& pg : pgs) {
    // m_perf_queries can be modified only in set_perf_queries by mgr client
    // request, and it is protected by by mgr client's lock, which is held
    // when set_perf_queries/get_perf_reports are called, so we may not hold
    // m_perf_queries_lock here.
    DynamicPerfStats pg_dps(m_perf_queries);
    pg->lock();
    pg->get_dynamic_perf_stats(&pg_dps);
    pg->unlock();
    dps.merge(pg_dps);
  }
  dps.add_to_reports(m_perf_limits, &reports);
  dout(20) << "reports for " << reports.size() << " queries" << dendl;

  return payload;
}
9985
9986 // =============================================================
9987
9988 #undef dout_context
9989 #define dout_context cct
9990 #undef dout_prefix
9991 #define dout_prefix *_dout << "osd." << osd->get_nodeid() << ":" << shard_id << "." << __func__ << " "
9992
// Bind a PG to a shard slot: wire up the back-pointers, bump the OSD's
// PG count, and index the slot by the PG's current osdmap epoch in the
// epoch-sorted set.  Caller must hold the shard lock — TODO confirm;
// all call sites visible here hold it.
void OSDShard::_attach_pg(OSDShardPGSlot *slot, PG *pg)
{
  dout(10) << pg->pg_id << " " << pg << dendl;
  slot->pg = pg;
  pg->osd_shard = this;
  pg->pg_slot = slot;
  osd->inc_num_pgs();

  slot->epoch = pg->get_osdmap_epoch();
  pg_slots_by_epoch.insert(*slot);
}
10004
// Inverse of _attach_pg: sever the slot/PG back-pointers, drop the
// OSD's PG count, and remove the slot from the epoch-sorted set.
// Wakes wait_min_pg_epoch() waiters since the minimum epoch may have
// changed.
void OSDShard::_detach_pg(OSDShardPGSlot *slot)
{
  dout(10) << slot->pg->pg_id << " " << slot->pg << dendl;
  slot->pg->osd_shard = nullptr;
  slot->pg->pg_slot = nullptr;
  slot->pg = nullptr;
  osd->dec_num_pgs();

  // intrusive set: erase via the iterator for this exact element
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  slot->epoch = 0;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10019
// Record that a PG has advanced to osdmap epoch e: re-key its slot in
// the epoch-sorted set (erase, update, re-insert) and wake any
// wait_min_pg_epoch() waiters, since the shard's minimum epoch may have
// advanced.  Takes shard_lock.
void OSDShard::update_pg_epoch(OSDShardPGSlot *slot, epoch_t e)
{
  std::lock_guard l(shard_lock);
  dout(30) << "min was " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  // intrusive set: the key (epoch) may only change while unlinked
  pg_slots_by_epoch.erase(pg_slots_by_epoch.iterator_to(*slot));
  dout(20) << slot->pg->pg_id << " " << slot->epoch << " -> " << e << dendl;
  slot->epoch = e;
  pg_slots_by_epoch.insert(*slot);
  dout(30) << "min is now " << pg_slots_by_epoch.begin()->epoch
	   << " on " << pg_slots_by_epoch.begin()->pg->pg_id << dendl;
  if (waiting_for_min_pg_epoch) {
    min_pg_epoch_cond.notify_all();
  }
}
10035
10036 epoch_t OSDShard::get_min_pg_epoch()
10037 {
10038 std::lock_guard l(shard_lock);
10039 auto p = pg_slots_by_epoch.begin();
10040 if (p == pg_slots_by_epoch.end()) {
10041 return 0;
10042 }
10043 return p->epoch;
10044 }
10045
// Block until every PG on this shard has advanced to at least osdmap
// epoch `need` (or the shard has no PGs).  Woken by update_pg_epoch()
// and _detach_pg() via min_pg_epoch_cond.
void OSDShard::wait_min_pg_epoch(epoch_t need)
{
  std::unique_lock l{shard_lock};
  // counter (not bool): multiple threads may wait concurrently
  ++waiting_for_min_pg_epoch;
  min_pg_epoch_cond.wait(l, [need, this] {
    if (pg_slots_by_epoch.empty()) {
      return true;
    } else if (pg_slots_by_epoch.begin()->epoch >= need) {
      // set is ordered by epoch, so begin() is the laggiest PG
      return true;
    } else {
      dout(10) << need << " waiting on "
	       << pg_slots_by_epoch.begin()->epoch << dendl;
      return false;
    }
  });
  --waiting_for_min_pg_epoch;
}
10063
10064 epoch_t OSDShard::get_max_waiting_epoch()
10065 {
10066 std::lock_guard l(shard_lock);
10067 epoch_t r = 0;
10068 for (auto& i : pg_slots) {
10069 if (!i.second->waiting_peering.empty()) {
10070 r = std::max(r, i.second->waiting_peering.rbegin()->first);
10071 }
10072 }
10073 return r;
10074 }
10075
// Install a new osdmap on this shard and reconcile every PG slot with
// it: requeue peering work that the new map unblocks, drop stale or
// misdirected waiting items (crediting their reserved pushes to
// *pushes_to_free), and prune slots that are now completely idle.
void OSDShard::consume_map(
  const OSDMapRef& new_osdmap,
  unsigned *pushes_to_free)
{
  std::lock_guard l(shard_lock);
  OSDMapRef old_osdmap;
  {
    // publish the new map under osdmap_lock so readers that take only
    // osdmap_lock (not shard_lock) see a consistent pointer
    std::lock_guard l(osdmap_lock);
    old_osdmap = std::move(shard_osdmap);
    shard_osdmap = new_osdmap;
  }
  dout(10) << new_osdmap->get_epoch()
	   << " (was " << (old_osdmap ? old_osdmap->get_epoch() : 0) << ")"
	   << dendl;
  bool queued = false;

  // check slots
  auto p = pg_slots.begin();
  while (p != pg_slots.end()) {
    OSDShardPGSlot *slot = p->second.get();
    const spg_t& pgid = p->first;
    dout(20) << __func__ << " " << pgid << dendl;
    if (!slot->waiting_for_split.empty()) {
      // a split child is still being instantiated; leave the slot alone
      dout(20) << __func__ << " " << pgid
	       << " waiting for split " << slot->waiting_for_split << dendl;
      ++p;
      continue;
    }
    if (slot->waiting_for_merge_epoch > new_osdmap->get_epoch()) {
      // held back until the merge point; keep everything parked
      dout(20) << __func__ << " " << pgid
	       << " waiting for merge by epoch " << slot->waiting_for_merge_epoch
	       << dendl;
      ++p;
      continue;
    }
    if (!slot->waiting_peering.empty()) {
      epoch_t first = slot->waiting_peering.begin()->first;
      if (first <= new_osdmap->get_epoch()) {
	// the earliest parked peering event is now runnable; requeue
	// everything on the slot (see _wake_pg_slot's overkill note)
	dout(20) << __func__ << " " << pgid
		 << " pending_peering first epoch " << first
		 << " <= " << new_osdmap->get_epoch() << ", requeueing" << dendl;
	_wake_pg_slot(pgid, slot);
	queued = true;
      }
      ++p;
      continue;
    }
    if (!slot->waiting.empty()) {
      if (new_osdmap->is_up_acting_osd_shard(pgid, osd->get_nodeid())) {
	dout(20) << __func__ << " " << pgid << " maps to us, keeping"
		 << dendl;
	++p;
	continue;
      }
      // pg no longer maps here: drop any waiting items the new map
      // makes stale/misdirected, returning their reserved pushes
      while (!slot->waiting.empty() &&
	     slot->waiting.front().get_map_epoch() <= new_osdmap->get_epoch()) {
	auto& qi = slot->waiting.front();
	dout(20) << __func__ << " " << pgid
		 << " waiting item " << qi
		 << " epoch " << qi.get_map_epoch()
		 << " <= " << new_osdmap->get_epoch()
		 << ", "
		 << (qi.get_map_epoch() < new_osdmap->get_epoch() ? "stale" :
		     "misdirected")
		 << ", dropping" << dendl;
	*pushes_to_free += qi.get_reserved_pushes();
	slot->waiting.pop_front();
      }
    }
    if (slot->waiting.empty() &&
	slot->num_running == 0 &&
	slot->waiting_for_split.empty() &&
	!slot->pg) {
      // nothing queued, nothing running, no pg attached: drop the slot
      dout(20) << __func__ << " " << pgid << " empty, pruning" << dendl;
      p = pg_slots.erase(p);
      continue;
    }

    ++p;
  }
  if (queued) {
    // we put work back on the scheduler; wake one worker thread
    std::lock_guard l{sdata_wait_lock};
    sdata_cond.notify_one();
  }
}
10161
10162 void OSDShard::_wake_pg_slot(
10163 spg_t pgid,
10164 OSDShardPGSlot *slot)
10165 {
10166 dout(20) << __func__ << " " << pgid
10167 << " to_process " << slot->to_process
10168 << " waiting " << slot->waiting
10169 << " waiting_peering " << slot->waiting_peering << dendl;
10170 for (auto i = slot->to_process.rbegin();
10171 i != slot->to_process.rend();
10172 ++i) {
10173 scheduler->enqueue_front(std::move(*i));
10174 }
10175 slot->to_process.clear();
10176 for (auto i = slot->waiting.rbegin();
10177 i != slot->waiting.rend();
10178 ++i) {
10179 scheduler->enqueue_front(std::move(*i));
10180 }
10181 slot->waiting.clear();
10182 for (auto i = slot->waiting_peering.rbegin();
10183 i != slot->waiting_peering.rend();
10184 ++i) {
10185 // this is overkill; we requeue everything, even if some of these
10186 // items are waiting for maps we don't have yet. FIXME, maybe,
10187 // someday, if we decide this inefficiency matters
10188 for (auto j = i->second.rbegin(); j != i->second.rend(); ++j) {
10189 scheduler->enqueue_front(std::move(*j));
10190 }
10191 }
10192 slot->waiting_peering.clear();
10193 ++slot->requeue_seq;
10194 }
10195
10196 void OSDShard::identify_splits_and_merges(
10197 const OSDMapRef& as_of_osdmap,
10198 set<pair<spg_t,epoch_t>> *split_pgs,
10199 set<pair<spg_t,epoch_t>> *merge_pgs)
10200 {
10201 std::lock_guard l(shard_lock);
10202 if (shard_osdmap) {
10203 for (auto& i : pg_slots) {
10204 const spg_t& pgid = i.first;
10205 auto *slot = i.second.get();
10206 if (slot->pg) {
10207 osd->service.identify_splits_and_merges(
10208 shard_osdmap, as_of_osdmap, pgid,
10209 split_pgs, merge_pgs);
10210 } else if (!slot->waiting_for_split.empty()) {
10211 osd->service.identify_splits_and_merges(
10212 shard_osdmap, as_of_osdmap, pgid,
10213 split_pgs, nullptr);
10214 } else {
10215 dout(20) << __func__ << " slot " << pgid
10216 << " has no pg and waiting_for_split " << dendl;
10217 }
10218 }
10219 }
10220 }
10221
// Create placeholder slots on this shard for split children in *pgids
// (entries consumed here are erased; entries for other shards remain).
// If this shard's map is already ahead of as_of_osdmap, also prime any
// grandchildren produced between the two epochs.
void OSDShard::prime_splits(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *pgids)
{
  std::lock_guard l(shard_lock);
  _prime_splits(pgids);
  if (shard_osdmap->get_epoch() > as_of_osdmap->get_epoch()) {
    set<pair<spg_t,epoch_t>> newer_children;
    for (auto i : *pgids) {
      osd->service.identify_splits_and_merges(
	as_of_osdmap, shard_osdmap, i.first,
	&newer_children, nullptr);
    }
    newer_children.insert(pgids->begin(), pgids->end());
    dout(10) << "as_of_osdmap " << as_of_osdmap->get_epoch() << " < shard "
	     << shard_osdmap->get_epoch() << ", new children " << newer_children
	     << dendl;
    _prime_splits(&newer_children);
    // note: we don't care what is left over here for other shards.
    // if this shard is ahead of us and one isn't, e.g., one thread is
    // calling into prime_splits via _process (due to a newly created
    // pg) and this shard has a newer map due to a racing consume_map,
    // then any grandchildren left here will be identified (or were
    // identified) when the slower shard's osdmap is advanced.
    // _prime_splits() will tolerate the case where the pgid is
    // already primed.
  }
}
10249
10250 void OSDShard::_prime_splits(set<pair<spg_t,epoch_t>> *pgids)
10251 {
10252 dout(10) << *pgids << dendl;
10253 auto p = pgids->begin();
10254 while (p != pgids->end()) {
10255 unsigned shard_index = p->first.hash_to_shard(osd->num_shards);
10256 if (shard_index == shard_id) {
10257 auto r = pg_slots.emplace(p->first, nullptr);
10258 if (r.second) {
10259 dout(10) << "priming slot " << p->first << " e" << p->second << dendl;
10260 r.first->second = make_unique<OSDShardPGSlot>();
10261 r.first->second->waiting_for_split.insert(p->second);
10262 } else {
10263 auto q = r.first;
10264 ceph_assert(q != pg_slots.end());
10265 dout(10) << "priming (existing) slot " << p->first << " e" << p->second
10266 << dendl;
10267 q->second->waiting_for_split.insert(p->second);
10268 }
10269 p = pgids->erase(p);
10270 } else {
10271 ++p;
10272 }
10273 }
10274 }
10275
// Ensure every merge participant destined for this shard has a slot:
// reuse an attached PG, defer to a pending earlier split, or create an
// empty placeholder PG.  Each slot is stamped with the merge epoch so
// queued work is held back until the merge happens (see consume_map).
// Consumed entries are erased from *merge_pgs; other shards' entries
// are left in place.
void OSDShard::prime_merges(const OSDMapRef& as_of_osdmap,
			    set<pair<spg_t,epoch_t>> *merge_pgs)
{
  std::lock_guard l(shard_lock);
  dout(20) << __func__ << " checking shard " << shard_id
	   << " for remaining merge pgs " << merge_pgs << dendl;
  auto p = merge_pgs->begin();
  while (p != merge_pgs->end()) {
    spg_t pgid = p->first;
    epoch_t epoch = p->second;
    unsigned shard_index = pgid.hash_to_shard(osd->num_shards);
    if (shard_index != shard_id) {
      // belongs to another shard; skip
      ++p;
      continue;
    }
    OSDShardPGSlot *slot;
    auto r = pg_slots.emplace(pgid, nullptr);
    if (r.second) {
      r.first->second = make_unique<OSDShardPGSlot>();
    }
    slot = r.first->second.get();
    if (slot->pg) {
      // already have pg
      dout(20) << __func__ << " have merge participant pg " << pgid
	       << " " << slot->pg << dendl;
    } else if (!slot->waiting_for_split.empty() &&
	       *slot->waiting_for_split.begin() < epoch) {
      // the PG will materialize via an earlier split; let that finish
      dout(20) << __func__ << " pending split on merge participant pg " << pgid
	       << " " << slot->waiting_for_split << dendl;
    } else {
      dout(20) << __func__ << " creating empty merge participant " << pgid
	       << " for merge in " << epoch << dendl;
      // leave history zeroed; PG::merge_from() will fill it in.
      pg_history_t history;
      PGCreateInfo cinfo(pgid, epoch - 1,
			 history, PastIntervals(), false);
      // handle_pg_create_info returns the pg locked; unlock after wiring
      PGRef pg = osd->handle_pg_create_info(shard_osdmap, &cinfo);
      _attach_pg(r.first->second.get(), pg.get());
      _wake_pg_slot(pgid, slot);
      pg->unlock();
    }
    // mark slot for merge
    dout(20) << __func__ << " marking merge participant " << pgid << dendl;
    slot->waiting_for_merge_epoch = epoch;
    p = merge_pgs->erase(p);
  }
}
10323
// Called when a freshly-split child PG is ready: attach it to the slot
// prime_splits reserved for it, clear its epoch from waiting_for_split,
// and (once no more splits are pending on the slot) requeue the parked
// work.  Finally queue a null peering event so the child catches up to
// the latest osdmap, and wake a worker on this shard.
void OSDShard::register_and_wake_split_child(PG *pg)
{
  epoch_t epoch;
  {
    std::lock_guard l(shard_lock);
    dout(10) << pg->pg_id << " " << pg << dendl;
    auto p = pg_slots.find(pg->pg_id);
    // the slot must have been primed earlier by _prime_splits()
    ceph_assert(p != pg_slots.end());
    auto *slot = p->second.get();
    dout(20) << pg->pg_id << " waiting_for_split " << slot->waiting_for_split
	     << dendl;
    ceph_assert(!slot->pg);
    ceph_assert(!slot->waiting_for_split.empty());
    _attach_pg(slot, pg);

    epoch = pg->get_osdmap_epoch();
    ceph_assert(slot->waiting_for_split.count(epoch));
    slot->waiting_for_split.erase(epoch);
    if (slot->waiting_for_split.empty()) {
      _wake_pg_slot(pg->pg_id, slot);
    } else {
      // further splits of this pg are still pending; keep work parked
      dout(10) << __func__ << " still waiting for split on "
	       << slot->waiting_for_split << dendl;
    }
  }

  // kick child to ensure it pulls up to the latest osdmap
  osd->enqueue_peering_evt(
    pg->pg_id,
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	epoch,
	epoch,
	NullEvt())));

  std::lock_guard l{sdata_wait_lock};
  sdata_cond.notify_one();
}
10362
10363 void OSDShard::unprime_split_children(spg_t parent, unsigned old_pg_num)
10364 {
10365 std::lock_guard l(shard_lock);
10366 vector<spg_t> to_delete;
10367 for (auto& i : pg_slots) {
10368 if (i.first != parent &&
10369 i.first.get_ancestor(old_pg_num) == parent) {
10370 dout(10) << __func__ << " parent " << parent << " clearing " << i.first
10371 << dendl;
10372 _wake_pg_slot(i.first, i.second.get());
10373 to_delete.push_back(i.first);
10374 }
10375 }
10376 for (auto pgid : to_delete) {
10377 pg_slots.erase(pgid);
10378 }
10379 }
10380
// Construct one shard of the sharded op queue.  Members initialize in
// declaration order; the *_name strings are built from shard_name and
// feed the mutex constructors, so they must be declared before the
// locks they name (presumably enforced in the header — worth confirming
// if members are ever reordered there).
OSDShard::OSDShard(
  int id,
  CephContext *cct,
  OSD *osd)
  : shard_id(id),
    cct(cct),
    osd(osd),
    shard_name(string("OSDShard.") + stringify(id)),
    sdata_wait_lock_name(shard_name + "::sdata_wait_lock"),
    sdata_wait_lock{make_mutex(sdata_wait_lock_name)},
    osdmap_lock{make_mutex(shard_name + "::osdmap_lock")},
    shard_lock_name(shard_name + "::shard_lock"),
    shard_lock{make_mutex(shard_lock_name)},
    // op scheduler implementation is chosen by configuration
    scheduler(ceph::osd::scheduler::make_scheduler(cct)),
    context_queue(sdata_wait_lock, sdata_cond)
{
  dout(0) << "using op scheduler " << *scheduler << dendl;
}
10399
10400
10401 // =============================================================
10402
10403 #undef dout_context
10404 #define dout_context osd->cct
10405 #undef dout_prefix
10406 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10407
10408 void OSD::ShardedOpWQ::_add_slot_waiter(
10409 spg_t pgid,
10410 OSDShardPGSlot *slot,
10411 OpSchedulerItem&& qi)
10412 {
10413 if (qi.is_peering()) {
10414 dout(20) << __func__ << " " << pgid
10415 << " peering, item epoch is "
10416 << qi.get_map_epoch()
10417 << ", will wait on " << qi << dendl;
10418 slot->waiting_peering[qi.get_map_epoch()].push_back(std::move(qi));
10419 } else {
10420 dout(20) << __func__ << " " << pgid
10421 << " item epoch is "
10422 << qi.get_map_epoch()
10423 << ", will wait on " << qi << dendl;
10424 slot->waiting.push_back(std::move(qi));
10425 }
10426 }
10427
10428 #undef dout_prefix
10429 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10430
// Worker-thread body for the sharded op queue.  One call dequeues (at
// most) one OpSchedulerItem from this thread's shard and runs it,
// handling along the way: idle waiting, oncommit context draining,
// pg lookup (with the shard_lock/pg-lock ordering dance), pg creation,
// requeueing items that need a newer osdmap, and priming split
// children for PGs created here.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % osd->num_shards;
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);

  // If all threads of shards do oncommits, there is a out-of-order
  // problem.  So we choose the thread which has the smallest
  // thread_index(thread_index < num_shards) of shard to do oncommit
  // callback.
  bool is_smallest_thread_index = thread_index < osd->num_shards;

  // peek at spg_t
  sdata->shard_lock.lock();
  if (sdata->scheduler->empty() &&
      (!is_smallest_thread_index || sdata->context_queue.empty())) {
    // nothing to do; block on sdata_cond until work (or shutdown) arrives
    std::unique_lock wait_lock{sdata->sdata_wait_lock};
    if (is_smallest_thread_index && !sdata->context_queue.empty()) {
      // we raced with a context_queue addition, don't wait
      wait_lock.unlock();
    } else if (!sdata->stop_waiting) {
      dout(20) << __func__ << " empty q, waiting" << dendl;
      // suspend the heartbeat timeout while idle so an empty queue
      // doesn't look like a stuck thread
      osd->cct->get_heartbeat_map()->clear_timeout(hb);
      sdata->shard_lock.unlock();
      sdata->sdata_cond.wait(wait_lock);
      wait_lock.unlock();
      sdata->shard_lock.lock();
      if (sdata->scheduler->empty() &&
	  !(is_smallest_thread_index && !sdata->context_queue.empty())) {
	// spurious wakeup, or another thread claimed the work
	sdata->shard_lock.unlock();
	return;
      }
      // found a work item; reapply default wq timeouts
      osd->cct->get_heartbeat_map()->reset_timeout(hb,
	timeout_interval, suicide_interval);
    } else {
      // stop_waiting is set; do not block
      dout(20) << __func__ << " need return immediately" << dendl;
      wait_lock.unlock();
      sdata->shard_lock.unlock();
      return;
    }
  }

  // drain oncommit contexts, but only on the designated lowest-index
  // thread of this shard (see ordering note above)
  list<Context *> oncommits;
  if (is_smallest_thread_index) {
    sdata->context_queue.move_to(oncommits);
  }

  if (sdata->scheduler->empty()) {
    if (osd->is_stopping()) {
      sdata->shard_lock.unlock();
      for (auto c : oncommits) {
	dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
	delete c;
      }
      return;    // OSD shutdown, discard.
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }

  OpSchedulerItem item = sdata->scheduler->dequeue();
  if (osd->is_stopping()) {
    sdata->shard_lock.unlock();
    for (auto c : oncommits) {
      dout(10) << __func__ << " discarding in-flight oncommit " << c << dendl;
      delete c;
    }
    return;    // OSD shutdown, discard.
  }

  // stage the item on its pg slot's to_process list (creating the slot
  // if needed) so ordering survives the pg-lock acquisition below
  const auto token = item.get_ordering_token();
  auto r = sdata->pg_slots.emplace(token, nullptr);
  if (r.second) {
    r.first->second = make_unique<OSDShardPGSlot>();
  }
  OSDShardPGSlot *slot = r.first->second.get();
  dout(20) << __func__ << " " << token
	   << (r.second ? " (new)" : "")
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering
	   << dendl;
  slot->to_process.push_back(std::move(item));
  dout(20) << __func__ << " " << slot->to_process.back()
	   << " queued" << dendl;

 retry_pg:
  PGRef pg = slot->pg;

  // lock pg (if we have it)
  if (pg) {
    // note the requeue seq now...
    uint64_t requeue_seq = slot->requeue_seq;
    ++slot->num_running;

    // shard_lock must be dropped before taking the pg lock; after
    // re-acquiring it, re-validate everything we depend on
    sdata->shard_lock.unlock();
    osd->service.maybe_inject_dispatch_delay();
    pg->lock();
    osd->service.maybe_inject_dispatch_delay();
    sdata->shard_lock.lock();

    auto q = sdata->pg_slots.find(token);
    if (q == sdata->pg_slots.end()) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer there" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    slot = q->second.get();
    --slot->num_running;

    if (slot->to_process.empty()) {
      // raced with _wake_pg_slot or consume_map
      dout(20) << __func__ << " " << token
	       << " nothing queued" << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (requeue_seq != slot->requeue_seq) {
      dout(20) << __func__ << " " << token
	       << " requeue_seq " << slot->requeue_seq << " > our "
	       << requeue_seq << ", we raced with _wake_pg_slot"
	       << dendl;
      pg->unlock();
      sdata->shard_lock.unlock();
      handle_oncommits(oncommits);
      return;
    }
    if (slot->pg != pg) {
      // this can happen if we race with pg removal.
      dout(20) << __func__ << " slot " << token << " no longer attached to "
	       << pg << dendl;
      pg->unlock();
      goto retry_pg;
    }
  }

  dout(20) << __func__ << " " << token
	   << " to_process " << slot->to_process
	   << " waiting " << slot->waiting
	   << " waiting_peering " << slot->waiting_peering << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);

  // take next item
  auto qi = std::move(slot->to_process.front());
  slot->to_process.pop_front();
  dout(20) << __func__ << " " << qi << " pg " << pg << dendl;
  set<pair<spg_t,epoch_t>> new_children;
  OSDMapRef osdmap;

  // no pg attached: decide whether to wait, create the pg, run the
  // item pg-less, or drop it
  while (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    osdmap = sdata->shard_osdmap;
    const PGCreateInfo *create_info = qi.creates_pg();
    if (!slot->waiting_for_split.empty()) {
      // a split is in flight; park the item until the child exists
      dout(20) << __func__ << " " << token
	       << " splitting " << slot->waiting_for_split << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // item is from a newer map than this shard has; park it
      dout(20) << __func__ << " " << token
	       << " map " << qi.get_map_epoch() << " > "
	       << osdmap->get_epoch() << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else if (qi.is_peering()) {
      if (!qi.peering_requires_pg()) {
	// for pg-less events, we run them under the ordering lock, since
	// we don't have the pg lock to keep them ordered.
	qi.run(osd, sdata, pg, tp_handle);
      } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
	if (create_info) {
	  if (create_info->by_mon &&
	      osdmap->get_pg_acting_primary(token.pgid) != osd->whoami) {
	    dout(20) << __func__ << " " << token
		     << " no pg, no longer primary, ignoring mon create on "
		     << qi << dendl;
	  } else {
	    dout(20) << __func__ << " " << token
		     << " no pg, should create on " << qi << dendl;
	    pg = osd->handle_pg_create_info(osdmap, create_info);
	    if (pg) {
	      // we created the pg! drop out and continue "normally"!
	      sdata->_attach_pg(slot, pg.get());
	      sdata->_wake_pg_slot(token, slot);

	      // identify split children between create epoch and shard epoch.
	      osd->service.identify_splits_and_merges(
		pg->get_osdmap(), osdmap, pg->pg_id, &new_children, nullptr);
	      sdata->_prime_splits(&new_children);
	      // distribute remaining split children to other shards below!
	      break;
	    }
	    dout(20) << __func__ << " ignored create on " << qi << dendl;
	  }
	} else {
	  dout(20) << __func__ << " " << token
		   << " no pg, peering, !create, discarding " << qi << dendl;
	}
      } else {
	dout(20) << __func__ << " " << token
		 << " no pg, peering, doesn't map here e" << osdmap->get_epoch()
		 << ", discarding " << qi
		 << dendl;
      }
    } else if (osdmap->is_up_acting_osd_shard(token, osd->whoami)) {
      dout(20) << __func__ << " " << token
	       << " no pg, should exist e" << osdmap->get_epoch()
	       << ", will wait on " << qi << dendl;
      _add_slot_waiter(token, slot, std::move(qi));
    } else {
      dout(20) << __func__ << " " << token
	       << " no pg, shouldn't exist e" << osdmap->get_epoch()
	       << ", dropping " << qi << dendl;
      // share map with client?
      if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
	osd->service.maybe_share_map((*_op)->get_req()->get_connection().get(),
				     sdata->shard_osdmap,
				     (*_op)->sent_epoch);
      }
      // return any recovery pushes the dropped item had reserved
      unsigned pushes_to_free = qi.get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->shard_lock.unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	handle_oncommits(oncommits);
	return;
      }
    }
    sdata->shard_lock.unlock();
    handle_oncommits(oncommits);
    return;
  }
  if (qi.is_peering()) {
    OSDMapRef osdmap = sdata->shard_osdmap;
    if (qi.get_map_epoch() > osdmap->get_epoch()) {
      // peering event needs a newer map; park it and release the pg
      _add_slot_waiter(token, slot, std::move(qi));
      sdata->shard_lock.unlock();
      pg->unlock();
      handle_oncommits(oncommits);
      return;
    }
  }
  sdata->shard_lock.unlock();

  // hand any split children discovered above to their owning shards
  if (!new_children.empty()) {
    for (auto shard : osd->shards) {
      shard->prime_splits(osdmap, &new_children);
    }
    ceph_assert(new_children.empty());
  }

  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    // NOTE(review): reqid is declared only under WITH_LTTNG; presumably
    // tracepoint() expands to nothing otherwise — confirm macro definition.
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // execute the item (with the pg lock held when pg is non-null)
  qi.run(osd, sdata, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (std::optional<OpRequestRef> _op = qi.maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  handle_oncommits(oncommits);
}
10725
10726 void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
10727 uint32_t shard_index =
10728 item.get_ordering_token().hash_to_shard(osd->shards.size());
10729
10730 dout(20) << __func__ << " " << item << dendl;
10731
10732 OSDShard* sdata = osd->shards[shard_index];
10733 assert (NULL != sdata);
10734
10735 bool empty = true;
10736 {
10737 std::lock_guard l{sdata->shard_lock};
10738 empty = sdata->scheduler->empty();
10739 sdata->scheduler->enqueue(std::move(item));
10740 }
10741
10742 if (empty) {
10743 std::lock_guard l{sdata->sdata_wait_lock};
10744 sdata->sdata_cond.notify_all();
10745 }
10746 }
10747
// Requeue an item at the *front* of its shard's scheduler.  If _process
// has already moved a newer item for the same pg onto the slot's
// to_process list, swap with that newer item so this older one still
// runs first, then wake one worker.
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
{
  auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
  auto& sdata = osd->shards[shard_index];
  ceph_assert(sdata);
  sdata->shard_lock.lock();
  auto p = sdata->pg_slots.find(item.get_ordering_token());
  if (p != sdata->pg_slots.end() &&
      !p->second->to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from scheduler, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    // (the old item goes to the front of to_process; the newest item is
    // pulled off the back and sent to the scheduler front instead)
    p->second->to_process.push_front(std::move(item));
    item = std::move(p->second->to_process.back());
    p->second->to_process.pop_back();
    dout(20) << __func__
	     << " " << p->second->to_process.front()
	     << " shuffled w/ " << item << dendl;
  } else {
    dout(20) << __func__ << " " << item << dendl;
  }
  sdata->scheduler->enqueue_front(std::move(item));
  sdata->shard_lock.unlock();
  // wake one worker on this shard to pick the item up
  std::lock_guard l{sdata->sdata_wait_lock};
  sdata->sdata_cond.notify_one();
}
10775
10776 namespace ceph {
10777 namespace osd_cmds {
10778
10779 int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,
10780 std::ostream& os)
10781 {
10782 if (!ceph_using_tcmalloc()) {
10783 os << "could not issue heap profiler command -- not using tcmalloc!";
10784 return -EOPNOTSUPP;
10785 }
10786
10787 string cmd;
10788 if (!cmd_getval(cmdmap, "heapcmd", cmd)) {
10789 os << "unable to get value for command \"" << cmd << "\"";
10790 return -EINVAL;
10791 }
10792
10793 std::vector<std::string> cmd_vec;
10794 get_str_vec(cmd, cmd_vec);
10795
10796 string val;
10797 if (cmd_getval(cmdmap, "value", val)) {
10798 cmd_vec.push_back(val);
10799 }
10800
10801 ceph_heap_profiler_handle_command(cmd_vec, os);
10802
10803 return 0;
10804 }
10805
10806 }} // namespace ceph::osd_cmds