]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/OSD.cc
update sources to v12.1.3
[ceph.git] / ceph / src / osd / OSD.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15 #include "acconfig.h"
16
17 #include <fstream>
18 #include <iostream>
19 #include <errno.h>
20 #include <sys/stat.h>
21 #include <signal.h>
22 #include <ctype.h>
23 #include <boost/scoped_ptr.hpp>
24
25 #ifdef HAVE_SYS_PARAM_H
26 #include <sys/param.h>
27 #endif
28
29 #ifdef HAVE_SYS_MOUNT_H
30 #include <sys/mount.h>
31 #endif
32
33 #include "osd/PG.h"
34
35 #include "include/types.h"
36 #include "include/compat.h"
37
38 #include "OSD.h"
39 #include "OSDMap.h"
40 #include "Watch.h"
41 #include "osdc/Objecter.h"
42
43 #include "common/errno.h"
44 #include "common/ceph_argparse.h"
45 #include "common/ceph_time.h"
46 #include "common/version.h"
47 #include "common/io_priority.h"
48
49 #include "os/ObjectStore.h"
50 #ifdef HAVE_LIBFUSE
51 #include "os/FuseStore.h"
52 #endif
53
54 #include "PrimaryLogPG.h"
55
56
57 #include "msg/Messenger.h"
58 #include "msg/Message.h"
59
60 #include "mon/MonClient.h"
61
62 #include "messages/MLog.h"
63
64 #include "messages/MGenericMessage.h"
65 #include "messages/MOSDPing.h"
66 #include "messages/MOSDFailure.h"
67 #include "messages/MOSDMarkMeDown.h"
68 #include "messages/MOSDFull.h"
69 #include "messages/MOSDOp.h"
70 #include "messages/MOSDOpReply.h"
71 #include "messages/MOSDBackoff.h"
72 #include "messages/MOSDBeacon.h"
73 #include "messages/MOSDRepOp.h"
74 #include "messages/MOSDRepOpReply.h"
75 #include "messages/MOSDBoot.h"
76 #include "messages/MOSDPGTemp.h"
77
78 #include "messages/MOSDMap.h"
79 #include "messages/MMonGetOSDMap.h"
80 #include "messages/MOSDPGNotify.h"
81 #include "messages/MOSDPGQuery.h"
82 #include "messages/MOSDPGLog.h"
83 #include "messages/MOSDPGRemove.h"
84 #include "messages/MOSDPGInfo.h"
85 #include "messages/MOSDPGCreate.h"
86 #include "messages/MOSDPGTrim.h"
87 #include "messages/MOSDPGScan.h"
88 #include "messages/MOSDPGBackfill.h"
89 #include "messages/MBackfillReserve.h"
90 #include "messages/MRecoveryReserve.h"
91 #include "messages/MOSDForceRecovery.h"
92 #include "messages/MOSDECSubOpWrite.h"
93 #include "messages/MOSDECSubOpWriteReply.h"
94 #include "messages/MOSDECSubOpRead.h"
95 #include "messages/MOSDECSubOpReadReply.h"
96 #include "messages/MOSDPGCreated.h"
97 #include "messages/MOSDPGUpdateLogMissing.h"
98 #include "messages/MOSDPGUpdateLogMissingReply.h"
99
100 #include "messages/MOSDAlive.h"
101
102 #include "messages/MOSDScrub.h"
103 #include "messages/MOSDScrubReserve.h"
104 #include "messages/MOSDRepScrub.h"
105
106 #include "messages/MMonCommand.h"
107 #include "messages/MCommand.h"
108 #include "messages/MCommandReply.h"
109
110 #include "messages/MPGStats.h"
111 #include "messages/MPGStatsAck.h"
112
113 #include "messages/MWatchNotify.h"
114 #include "messages/MOSDPGPush.h"
115 #include "messages/MOSDPGPushReply.h"
116 #include "messages/MOSDPGPull.h"
117
118 #include "common/perf_counters.h"
119 #include "common/Timer.h"
120 #include "common/LogClient.h"
121 #include "common/AsyncReserver.h"
122 #include "common/HeartbeatMap.h"
123 #include "common/admin_socket.h"
124 #include "common/ceph_context.h"
125
126 #include "global/signal_handler.h"
127 #include "global/pidfile.h"
128
129 #include "include/color.h"
130 #include "perfglue/cpu_profiler.h"
131 #include "perfglue/heap_profiler.h"
132
133 #include "osd/OpRequest.h"
134
135 #include "auth/AuthAuthorizeHandler.h"
136 #include "auth/RotatingKeyRing.h"
137 #include "common/errno.h"
138
139 #include "objclass/objclass.h"
140
141 #include "common/cmdparse.h"
142 #include "include/str_list.h"
143 #include "include/util.h"
144
145 #include "include/assert.h"
146 #include "common/config.h"
147 #include "common/EventTrace.h"
148
149 #ifdef WITH_LTTNG
150 #define TRACEPOINT_DEFINE
151 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
152 #include "tracing/osd.h"
153 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
154 #undef TRACEPOINT_DEFINE
155 #else
156 #define tracepoint(...)
157 #endif
158
159 #define dout_context cct
160 #define dout_subsys ceph_subsys_osd
161 #undef dout_prefix
162 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
163
164
165 const double OSD::OSD_TICK_INTERVAL = 1.0;
166
167 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
168 return *_dout << "osd." << whoami << " " << epoch << " ";
169 }
170
171 //Initial features in new superblock.
172 //Features here are also automatically upgraded
173 CompatSet OSD::get_osd_initial_compat_set() {
174 CompatSet::FeatureSet ceph_osd_feature_compat;
175 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
176 CompatSet::FeatureSet ceph_osd_feature_incompat;
177 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
178 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
179 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
180 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
181 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
182 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
183 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
184 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
185 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
186 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
192 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
193 ceph_osd_feature_incompat);
194 }
195
196 //Features are added here that this OSD supports.
197 CompatSet OSD::get_osd_compat_set() {
198 CompatSet compat = get_osd_initial_compat_set();
199 //Any features here can be set in code, but not in initial superblock
200 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
201 return compat;
202 }
203
204 OSDService::OSDService(OSD *osd) :
205 osd(osd),
206 cct(osd->cct),
207 meta_osr(new ObjectStore::Sequencer("meta")),
208 whoami(osd->whoami), store(osd->store),
209 log_client(osd->log_client), clog(osd->clog),
210 pg_recovery_stats(osd->pg_recovery_stats),
211 cluster_messenger(osd->cluster_messenger),
212 client_messenger(osd->client_messenger),
213 logger(osd->logger),
214 recoverystate_perf(osd->recoverystate_perf),
215 monc(osd->monc),
216 peering_wq(osd->peering_wq),
217 recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
218 &osd->disk_tp),
219 class_handler(osd->class_handler),
220 pg_epoch_lock("OSDService::pg_epoch_lock"),
221 publish_lock("OSDService::publish_lock"),
222 pre_publish_lock("OSDService::pre_publish_lock"),
223 max_oldest_map(0),
224 peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
225 sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
226 scrubs_active(0),
227 agent_lock("OSDService::agent_lock"),
228 agent_valid_iterator(false),
229 agent_ops(0),
230 flush_mode_high_count(0),
231 agent_active(true),
232 agent_thread(this),
233 agent_stop_flag(false),
234 agent_timer_lock("OSDService::agent_timer_lock"),
235 agent_timer(osd->client_messenger->cct, agent_timer_lock),
236 last_recalibrate(ceph_clock_now()),
237 promote_max_objects(0),
238 promote_max_bytes(0),
239 objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
240 objecter_finisher(osd->client_messenger->cct),
241 watch_lock("OSDService::watch_lock"),
242 watch_timer(osd->client_messenger->cct, watch_lock),
243 next_notif_id(0),
244 recovery_request_lock("OSDService::recovery_request_lock"),
245 recovery_request_timer(cct, recovery_request_lock, false),
246 recovery_sleep_lock("OSDService::recovery_sleep_lock"),
247 recovery_sleep_timer(cct, recovery_sleep_lock, false),
248 reserver_finisher(cct),
249 local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
250 cct->_conf->osd_min_recovery_priority),
251 remote_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
252 cct->_conf->osd_min_recovery_priority),
253 pg_temp_lock("OSDService::pg_temp_lock"),
254 snap_sleep_lock("OSDService::snap_sleep_lock"),
255 snap_sleep_timer(
256 osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
257 scrub_sleep_lock("OSDService::scrub_sleep_lock"),
258 scrub_sleep_timer(
259 osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
260 snap_reserver(&reserver_finisher,
261 cct->_conf->osd_max_trimming_pgs),
262 recovery_lock("OSDService::recovery_lock"),
263 recovery_ops_active(0),
264 recovery_ops_reserved(0),
265 recovery_paused(false),
266 map_cache_lock("OSDService::map_cache_lock"),
267 map_cache(cct, cct->_conf->osd_map_cache_size),
268 map_bl_cache(cct->_conf->osd_map_cache_size),
269 map_bl_inc_cache(cct->_conf->osd_map_cache_size),
270 in_progress_split_lock("OSDService::in_progress_split_lock"),
271 stat_lock("OSDService::stat_lock"),
272 full_status_lock("OSDService::full_status_lock"),
273 cur_state(NONE),
274 cur_ratio(0),
275 epoch_lock("OSDService::epoch_lock"),
276 boot_epoch(0), up_epoch(0), bind_epoch(0),
277 is_stopping_lock("OSDService::is_stopping_lock")
278 #ifdef PG_DEBUG_REFS
279 , pgid_lock("OSDService::pgid_lock")
280 #endif
281 {
282 objecter->init();
283 }
284
285 OSDService::~OSDService()
286 {
287 delete objecter;
288 }
289
290
291
292 #ifdef PG_DEBUG_REFS
293 void OSDService::add_pgid(spg_t pgid, PG *pg){
294 Mutex::Locker l(pgid_lock);
295 if (!pgid_tracker.count(pgid)) {
296 live_pgs[pgid] = pg;
297 }
298 pgid_tracker[pgid]++;
299 }
300 void OSDService::remove_pgid(spg_t pgid, PG *pg)
301 {
302 Mutex::Locker l(pgid_lock);
303 assert(pgid_tracker.count(pgid));
304 assert(pgid_tracker[pgid] > 0);
305 pgid_tracker[pgid]--;
306 if (pgid_tracker[pgid] == 0) {
307 pgid_tracker.erase(pgid);
308 live_pgs.erase(pgid);
309 }
310 }
311 void OSDService::dump_live_pgids()
312 {
313 Mutex::Locker l(pgid_lock);
314 derr << "live pgids:" << dendl;
315 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
316 i != pgid_tracker.cend();
317 ++i) {
318 derr << "\t" << *i << dendl;
319 live_pgs[i->first]->dump_live_ids();
320 }
321 }
322 #endif
323
324
325 void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
326 {
327 for (set<spg_t>::const_iterator i = children.begin();
328 i != children.end();
329 ++i) {
330 dout(10) << __func__ << ": Starting split on pg " << *i
331 << ", parent=" << parent << dendl;
332 assert(!pending_splits.count(*i));
333 assert(!in_progress_splits.count(*i));
334 pending_splits.insert(make_pair(*i, parent));
335
336 assert(!rev_pending_splits[parent].count(*i));
337 rev_pending_splits[parent].insert(*i);
338 }
339 }
340
341 void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
342 {
343 Mutex::Locker l(in_progress_split_lock);
344 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
345 assert(piter != rev_pending_splits.end());
346 for (set<spg_t>::const_iterator i = children.begin();
347 i != children.end();
348 ++i) {
349 assert(piter->second.count(*i));
350 assert(pending_splits.count(*i));
351 assert(!in_progress_splits.count(*i));
352 assert(pending_splits[*i] == parent);
353
354 pending_splits.erase(*i);
355 piter->second.erase(*i);
356 in_progress_splits.insert(*i);
357 }
358 if (piter->second.empty())
359 rev_pending_splits.erase(piter);
360 }
361
362 void OSDService::cancel_pending_splits_for_parent(spg_t parent)
363 {
364 Mutex::Locker l(in_progress_split_lock);
365 _cancel_pending_splits_for_parent(parent);
366 }
367
368 void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
369 {
370 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
371 if (piter == rev_pending_splits.end())
372 return;
373
374 for (set<spg_t>::iterator i = piter->second.begin();
375 i != piter->second.end();
376 ++i) {
377 assert(pending_splits.count(*i));
378 assert(!in_progress_splits.count(*i));
379 pending_splits.erase(*i);
380 dout(10) << __func__ << ": Completing split on pg " << *i
381 << " for parent: " << parent << dendl;
382 _cancel_pending_splits_for_parent(*i);
383 }
384 rev_pending_splits.erase(piter);
385 }
386
387 void OSDService::_maybe_split_pgid(OSDMapRef old_map,
388 OSDMapRef new_map,
389 spg_t pgid)
390 {
391 assert(old_map->have_pg_pool(pgid.pool()));
392 int old_pgnum = old_map->get_pg_num(pgid.pool());
393 if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
394 set<spg_t> children;
395 if (pgid.is_split(old_pgnum,
396 new_map->get_pg_num(pgid.pool()), &children)) {
397 _start_split(pgid, children); }
398 } else {
399 assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
400 }
401 }
402
403 void OSDService::init_splits_between(spg_t pgid,
404 OSDMapRef frommap,
405 OSDMapRef tomap)
406 {
407 // First, check whether we can avoid this potentially expensive check
408 if (tomap->have_pg_pool(pgid.pool()) &&
409 pgid.is_split(
410 frommap->get_pg_num(pgid.pool()),
411 tomap->get_pg_num(pgid.pool()),
412 NULL)) {
413 // Ok, a split happened, so we need to walk the osdmaps
414 set<spg_t> new_pgs; // pgs to scan on each map
415 new_pgs.insert(pgid);
416 OSDMapRef curmap(get_map(frommap->get_epoch()));
417 for (epoch_t e = frommap->get_epoch() + 1;
418 e <= tomap->get_epoch();
419 ++e) {
420 OSDMapRef nextmap(try_get_map(e));
421 if (!nextmap)
422 continue;
423 set<spg_t> even_newer_pgs; // pgs added in this loop
424 for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
425 set<spg_t> split_pgs;
426 if (i->is_split(curmap->get_pg_num(i->pool()),
427 nextmap->get_pg_num(i->pool()),
428 &split_pgs)) {
429 start_split(*i, split_pgs);
430 even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
431 }
432 }
433 new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
434 curmap = nextmap;
435 }
436 assert(curmap == tomap); // we must have had both frommap and tomap
437 }
438 }
439
440 void OSDService::expand_pg_num(OSDMapRef old_map,
441 OSDMapRef new_map)
442 {
443 Mutex::Locker l(in_progress_split_lock);
444 for (set<spg_t>::iterator i = in_progress_splits.begin();
445 i != in_progress_splits.end();
446 ) {
447 if (!new_map->have_pg_pool(i->pool())) {
448 in_progress_splits.erase(i++);
449 } else {
450 _maybe_split_pgid(old_map, new_map, *i);
451 ++i;
452 }
453 }
454 for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
455 i != pending_splits.end();
456 ) {
457 if (!new_map->have_pg_pool(i->first.pool())) {
458 rev_pending_splits.erase(i->second);
459 pending_splits.erase(i++);
460 } else {
461 _maybe_split_pgid(old_map, new_map, i->first);
462 ++i;
463 }
464 }
465 }
466
467 bool OSDService::splitting(spg_t pgid)
468 {
469 Mutex::Locker l(in_progress_split_lock);
470 return in_progress_splits.count(pgid) ||
471 pending_splits.count(pgid);
472 }
473
474 void OSDService::complete_split(const set<spg_t> &pgs)
475 {
476 Mutex::Locker l(in_progress_split_lock);
477 for (set<spg_t>::const_iterator i = pgs.begin();
478 i != pgs.end();
479 ++i) {
480 dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
481 assert(!pending_splits.count(*i));
482 assert(in_progress_splits.count(*i));
483 in_progress_splits.erase(*i);
484 }
485 }
486
487 void OSDService::need_heartbeat_peer_update()
488 {
489 osd->need_heartbeat_peer_update();
490 }
491
492 void OSDService::pg_stat_queue_enqueue(PG *pg)
493 {
494 osd->pg_stat_queue_enqueue(pg);
495 }
496
497 void OSDService::pg_stat_queue_dequeue(PG *pg)
498 {
499 osd->pg_stat_queue_dequeue(pg);
500 }
501
502 void OSDService::start_shutdown()
503 {
504 {
505 Mutex::Locker l(agent_timer_lock);
506 agent_timer.shutdown();
507 }
508
509 {
510 Mutex::Locker l(recovery_sleep_lock);
511 recovery_sleep_timer.shutdown();
512 }
513 }
514
515 void OSDService::shutdown_reserver()
516 {
517 reserver_finisher.wait_for_empty();
518 reserver_finisher.stop();
519 }
520
521 void OSDService::shutdown()
522 {
523 {
524 Mutex::Locker l(watch_lock);
525 watch_timer.shutdown();
526 }
527
528 objecter->shutdown();
529 objecter_finisher.wait_for_empty();
530 objecter_finisher.stop();
531
532 {
533 Mutex::Locker l(recovery_request_lock);
534 recovery_request_timer.shutdown();
535 }
536
537 {
538 Mutex::Locker l(snap_sleep_lock);
539 snap_sleep_timer.shutdown();
540 }
541
542 {
543 Mutex::Locker l(scrub_sleep_lock);
544 scrub_sleep_timer.shutdown();
545 }
546
547 osdmap = OSDMapRef();
548 next_osdmap = OSDMapRef();
549 }
550
551 void OSDService::init()
552 {
553 reserver_finisher.start();
554 objecter_finisher.start();
555 objecter->set_client_incarnation(0);
556
557 // deprioritize objecter in daemonperf output
558 objecter->get_logger()->set_prio_adjust(-3);
559
560 watch_timer.init();
561 agent_timer.init();
562 snap_sleep_timer.init();
563 scrub_sleep_timer.init();
564
565 agent_thread.create("osd_srv_agent");
566
567 if (cct->_conf->osd_recovery_delay_start)
568 defer_recovery(cct->_conf->osd_recovery_delay_start);
569 }
570
571 void OSDService::final_init()
572 {
573 objecter->start(osdmap.get());
574 }
575
576 void OSDService::activate_map()
577 {
578 // wake/unwake the tiering agent
579 agent_lock.Lock();
580 agent_active =
581 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
582 osd->is_active();
583 agent_cond.Signal();
584 agent_lock.Unlock();
585 }
586
587 class AgentTimeoutCB : public Context {
588 PGRef pg;
589 public:
590 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
591 void finish(int) override {
592 pg->agent_choose_mode_restart();
593 }
594 };
595
596 void OSDService::agent_entry()
597 {
598 dout(10) << __func__ << " start" << dendl;
599 agent_lock.Lock();
600
601 while (!agent_stop_flag) {
602 if (agent_queue.empty()) {
603 dout(20) << __func__ << " empty queue" << dendl;
604 agent_cond.Wait(agent_lock);
605 continue;
606 }
607 uint64_t level = agent_queue.rbegin()->first;
608 set<PGRef>& top = agent_queue.rbegin()->second;
609 dout(10) << __func__
610 << " tiers " << agent_queue.size()
611 << ", top is " << level
612 << " with pgs " << top.size()
613 << ", ops " << agent_ops << "/"
614 << cct->_conf->osd_agent_max_ops
615 << (agent_active ? " active" : " NOT ACTIVE")
616 << dendl;
617 dout(20) << __func__ << " oids " << agent_oids << dendl;
618 int max = cct->_conf->osd_agent_max_ops - agent_ops;
619 int agent_flush_quota = max;
620 if (!flush_mode_high_count)
621 agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
622 if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
623 agent_cond.Wait(agent_lock);
624 continue;
625 }
626
627 if (!agent_valid_iterator || agent_queue_pos == top.end()) {
628 agent_queue_pos = top.begin();
629 agent_valid_iterator = true;
630 }
631 PGRef pg = *agent_queue_pos;
632 dout(10) << "high_count " << flush_mode_high_count
633 << " agent_ops " << agent_ops
634 << " flush_quota " << agent_flush_quota << dendl;
635 agent_lock.Unlock();
636 if (!pg->agent_work(max, agent_flush_quota)) {
637 dout(10) << __func__ << " " << pg->get_pgid()
638 << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
639 << " seconds" << dendl;
640
641 osd->logger->inc(l_osd_tier_delay);
642 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
643 agent_timer_lock.Lock();
644 Context *cb = new AgentTimeoutCB(pg);
645 agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
646 agent_timer_lock.Unlock();
647 }
648 agent_lock.Lock();
649 }
650 agent_lock.Unlock();
651 dout(10) << __func__ << " finish" << dendl;
652 }
653
654 void OSDService::agent_stop()
655 {
656 {
657 Mutex::Locker l(agent_lock);
658
659 // By this time all ops should be cancelled
660 assert(agent_ops == 0);
661 // By this time all PGs are shutdown and dequeued
662 if (!agent_queue.empty()) {
663 set<PGRef>& top = agent_queue.rbegin()->second;
664 derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
665 assert(0 == "agent queue not empty");
666 }
667
668 agent_stop_flag = true;
669 agent_cond.Signal();
670 }
671 agent_thread.join();
672 }
673
674 // -------------------------------------
675
676 void OSDService::promote_throttle_recalibrate()
677 {
678 utime_t now = ceph_clock_now();
679 double dur = now - last_recalibrate;
680 last_recalibrate = now;
681 unsigned prob = promote_probability_millis;
682
683 uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
684 uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
685
686 unsigned min_prob = 1;
687
688 uint64_t attempts, obj, bytes;
689 promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
690 dout(10) << __func__ << " " << attempts << " attempts, promoted "
691 << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
692 << target_obj_sec << " obj/sec or "
693 << pretty_si_t(target_bytes_sec) << " bytes/sec"
694 << dendl;
695
696 // calculate what the probability *should* be, given the targets
697 unsigned new_prob;
698 if (attempts && dur > 0) {
699 uint64_t avg_size = 1;
700 if (obj)
701 avg_size = MAX(bytes / obj, 1);
702 unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
703 unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
704 / (double)attempts;
705 dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
706 << avg_size << dendl;
707 if (target_obj_sec && target_bytes_sec)
708 new_prob = MIN(po, pb);
709 else if (target_obj_sec)
710 new_prob = po;
711 else if (target_bytes_sec)
712 new_prob = pb;
713 else
714 new_prob = 1000;
715 } else {
716 new_prob = 1000;
717 }
718 dout(20) << __func__ << " new_prob " << new_prob << dendl;
719
720 // correct for persistent skew between target rate and actual rate, adjust
721 double ratio = 1.0;
722 unsigned actual = 0;
723 if (attempts && obj) {
724 actual = obj * 1000 / attempts;
725 ratio = (double)actual / (double)prob;
726 new_prob = (double)new_prob / ratio;
727 }
728 new_prob = MAX(new_prob, min_prob);
729 new_prob = MIN(new_prob, 1000);
730
731 // adjust
732 prob = (prob + new_prob) / 2;
733 prob = MAX(prob, min_prob);
734 prob = MIN(prob, 1000);
735 dout(10) << __func__ << " actual " << actual
736 << ", actual/prob ratio " << ratio
737 << ", adjusted new_prob " << new_prob
738 << ", prob " << promote_probability_millis << " -> " << prob
739 << dendl;
740 promote_probability_millis = prob;
741
742 // set hard limits for this interval to mitigate stampedes
743 promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
744 promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
745 }
746
747 // -------------------------------------
748
749 float OSDService::get_failsafe_full_ratio()
750 {
751 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
752 if (full_ratio > 1.0) full_ratio /= 100.0;
753 return full_ratio;
754 }
755
756 void OSDService::check_full_status(float ratio)
757 {
758 Mutex::Locker l(full_status_lock);
759
760 cur_ratio = ratio;
761
762 // The OSDMap ratios take precendence. So if the failsafe is .95 and
763 // the admin sets the cluster full to .96, the failsafe moves up to .96
764 // too. (Not that having failsafe == full is ideal, but it's better than
765 // dropping writes before the clusters appears full.)
766 OSDMapRef osdmap = get_osdmap();
767 if (!osdmap || osdmap->get_epoch() == 0) {
768 cur_state = NONE;
769 return;
770 }
771 float nearfull_ratio = osdmap->get_nearfull_ratio();
772 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
773 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
774 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
775
776 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
777 // use the failsafe for nearfull and full; the mon isn't using the
778 // flags anyway because we're mid-upgrade.
779 full_ratio = failsafe_ratio;
780 backfillfull_ratio = failsafe_ratio;
781 nearfull_ratio = failsafe_ratio;
782 } else if (full_ratio <= 0 ||
783 backfillfull_ratio <= 0 ||
784 nearfull_ratio <= 0) {
785 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
786 // use failsafe flag. ick. the monitor did something wrong or the user
787 // did something stupid.
788 full_ratio = failsafe_ratio;
789 backfillfull_ratio = failsafe_ratio;
790 nearfull_ratio = failsafe_ratio;
791 }
792
793 string inject;
794 s_names new_state;
795 if (injectfull_state > NONE && injectfull) {
796 new_state = injectfull_state;
797 inject = "(Injected)";
798 } else if (ratio > failsafe_ratio) {
799 new_state = FAILSAFE;
800 } else if (ratio > full_ratio) {
801 new_state = FULL;
802 } else if (ratio > backfillfull_ratio) {
803 new_state = BACKFILLFULL;
804 } else if (ratio > nearfull_ratio) {
805 new_state = NEARFULL;
806 } else {
807 new_state = NONE;
808 }
809 dout(20) << __func__ << " cur ratio " << ratio
810 << ". nearfull_ratio " << nearfull_ratio
811 << ". backfillfull_ratio " << backfillfull_ratio
812 << ", full_ratio " << full_ratio
813 << ", failsafe_ratio " << failsafe_ratio
814 << ", new state " << get_full_state_name(new_state)
815 << " " << inject
816 << dendl;
817
818 // warn
819 if (cur_state != new_state) {
820 dout(10) << __func__ << " " << get_full_state_name(cur_state)
821 << " -> " << get_full_state_name(new_state) << dendl;
822 if (new_state == FAILSAFE) {
823 clog->error() << "full status failsafe engaged, dropping updates, now "
824 << (int)roundf(ratio * 100) << "% full";
825 } else if (cur_state == FAILSAFE) {
826 clog->error() << "full status failsafe disengaged, no longer dropping "
827 << "updates, now " << (int)roundf(ratio * 100) << "% full";
828 }
829 cur_state = new_state;
830 }
831 }
832
833 bool OSDService::need_fullness_update()
834 {
835 OSDMapRef osdmap = get_osdmap();
836 s_names cur = NONE;
837 if (osdmap->exists(whoami)) {
838 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
839 cur = FULL;
840 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
841 cur = BACKFILLFULL;
842 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
843 cur = NEARFULL;
844 }
845 }
846 s_names want = NONE;
847 if (is_full())
848 want = FULL;
849 else if (is_backfillfull())
850 want = BACKFILLFULL;
851 else if (is_nearfull())
852 want = NEARFULL;
853 return want != cur;
854 }
855
856 bool OSDService::_check_full(s_names type, ostream &ss) const
857 {
858 Mutex::Locker l(full_status_lock);
859
860 if (injectfull && injectfull_state >= type) {
861 // injectfull is either a count of the number of times to return failsafe full
862 // or if -1 then always return full
863 if (injectfull > 0)
864 --injectfull;
865 ss << "Injected " << get_full_state_name(type) << " OSD ("
866 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
867 return true;
868 }
869
870 ss << "current usage is " << cur_ratio;
871 return cur_state >= type;
872 }
873
874 bool OSDService::check_failsafe_full(ostream &ss) const
875 {
876 return _check_full(FAILSAFE, ss);
877 }
878
879 bool OSDService::check_full(ostream &ss) const
880 {
881 return _check_full(FULL, ss);
882 }
883
884 bool OSDService::check_backfill_full(ostream &ss) const
885 {
886 return _check_full(BACKFILLFULL, ss);
887 }
888
889 bool OSDService::check_nearfull(ostream &ss) const
890 {
891 return _check_full(NEARFULL, ss);
892 }
893
894 bool OSDService::is_failsafe_full() const
895 {
896 Mutex::Locker l(full_status_lock);
897 return cur_state == FAILSAFE;
898 }
899
900 bool OSDService::is_full() const
901 {
902 Mutex::Locker l(full_status_lock);
903 return cur_state >= FULL;
904 }
905
906 bool OSDService::is_backfillfull() const
907 {
908 Mutex::Locker l(full_status_lock);
909 return cur_state >= BACKFILLFULL;
910 }
911
912 bool OSDService::is_nearfull() const
913 {
914 Mutex::Locker l(full_status_lock);
915 return cur_state >= NEARFULL;
916 }
917
918 void OSDService::set_injectfull(s_names type, int64_t count)
919 {
920 Mutex::Locker l(full_status_lock);
921 injectfull_state = type;
922 injectfull = count;
923 }
924
925 osd_stat_t OSDService::set_osd_stat(const struct store_statfs_t &stbuf,
926 vector<int>& hb_peers)
927 {
928 uint64_t bytes = stbuf.total;
929 uint64_t used = bytes - stbuf.available;
930 uint64_t avail = stbuf.available;
931
932 osd->logger->set(l_osd_stat_bytes, bytes);
933 osd->logger->set(l_osd_stat_bytes_used, used);
934 osd->logger->set(l_osd_stat_bytes_avail, avail);
935
936 {
937 Mutex::Locker l(stat_lock);
938 osd_stat.hb_peers.swap(hb_peers);
939 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
940 osd_stat.kb = bytes >> 10;
941 osd_stat.kb_used = used >> 10;
942 osd_stat.kb_avail = avail >> 10;
943 return osd_stat;
944 }
945 }
946
947 void OSDService::update_osd_stat(vector<int>& hb_peers)
948 {
949 // load osd stats first
950 struct store_statfs_t stbuf;
951 int r = osd->store->statfs(&stbuf);
952 if (r < 0) {
953 derr << "statfs() failed: " << cpp_strerror(r) << dendl;
954 return;
955 }
956
957 auto new_stat = set_osd_stat(stbuf, hb_peers);
958 dout(20) << "update_osd_stat " << new_stat << dendl;
959 assert(new_stat.kb);
960 float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
961 check_full_status(ratio);
962 }
963
964 bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
965 {
966 OSDMapRef osdmap = get_osdmap();
967 for (auto shard : missing_on) {
968 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
969 return true;
970 }
971 return false;
972 }
973
974 void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
975 {
976 OSDMapRef next_map = get_nextmap_reserved();
977 // service map is always newer/newest
978 assert(from_epoch <= next_map->get_epoch());
979
980 if (next_map->is_down(peer) ||
981 next_map->get_info(peer).up_from > from_epoch) {
982 m->put();
983 release_map(next_map);
984 return;
985 }
986 const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
987 ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
988 share_map_peer(peer, peer_con.get(), next_map);
989 peer_con->send_message(m);
990 release_map(next_map);
991 }
992
993 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
994 {
995 OSDMapRef next_map = get_nextmap_reserved();
996 // service map is always newer/newest
997 assert(from_epoch <= next_map->get_epoch());
998
999 if (next_map->is_down(peer) ||
1000 next_map->get_info(peer).up_from > from_epoch) {
1001 release_map(next_map);
1002 return NULL;
1003 }
1004 ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
1005 release_map(next_map);
1006 return con;
1007 }
1008
1009 pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1010 {
1011 OSDMapRef next_map = get_nextmap_reserved();
1012 // service map is always newer/newest
1013 assert(from_epoch <= next_map->get_epoch());
1014
1015 pair<ConnectionRef,ConnectionRef> ret;
1016 if (next_map->is_down(peer) ||
1017 next_map->get_info(peer).up_from > from_epoch) {
1018 release_map(next_map);
1019 return ret;
1020 }
1021 ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
1022 if (next_map->get_hb_front_addr(peer) != entity_addr_t())
1023 ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
1024 release_map(next_map);
1025 return ret;
1026 }
1027
1028
1029 void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
1030 {
1031 Mutex::Locker l(pg_temp_lock);
1032 map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
1033 if (p == pg_temp_pending.end() ||
1034 p->second != want) {
1035 pg_temp_wanted[pgid] = want;
1036 }
1037 }
1038
1039 void OSDService::remove_want_pg_temp(pg_t pgid)
1040 {
1041 Mutex::Locker l(pg_temp_lock);
1042 pg_temp_wanted.erase(pgid);
1043 pg_temp_pending.erase(pgid);
1044 }
1045
1046 void OSDService::_sent_pg_temp()
1047 {
1048 for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
1049 p != pg_temp_wanted.end();
1050 ++p)
1051 pg_temp_pending[p->first] = p->second;
1052 pg_temp_wanted.clear();
1053 }
1054
1055 void OSDService::requeue_pg_temp()
1056 {
1057 Mutex::Locker l(pg_temp_lock);
1058 // wanted overrides pending. note that remove_want_pg_temp
1059 // clears the item out of both.
1060 unsigned old_wanted = pg_temp_wanted.size();
1061 unsigned old_pending = pg_temp_pending.size();
1062 _sent_pg_temp();
1063 pg_temp_wanted.swap(pg_temp_pending);
1064 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1065 << pg_temp_wanted.size() << dendl;
1066 }
1067
1068 void OSDService::send_pg_temp()
1069 {
1070 Mutex::Locker l(pg_temp_lock);
1071 if (pg_temp_wanted.empty())
1072 return;
1073 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
1074 MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
1075 m->pg_temp = pg_temp_wanted;
1076 monc->send_mon_message(m);
1077 _sent_pg_temp();
1078 }
1079
1080 void OSDService::send_pg_created(pg_t pgid)
1081 {
1082 dout(20) << __func__ << dendl;
1083 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1084 monc->send_mon_message(new MOSDPGCreated(pgid));
1085 }
1086 }
1087
1088 // --------------------------------------
1089 // dispatch
1090
1091 epoch_t OSDService::get_peer_epoch(int peer)
1092 {
1093 Mutex::Locker l(peer_map_epoch_lock);
1094 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1095 if (p == peer_map_epoch.end())
1096 return 0;
1097 return p->second;
1098 }
1099
1100 epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1101 {
1102 Mutex::Locker l(peer_map_epoch_lock);
1103 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1104 if (p != peer_map_epoch.end()) {
1105 if (p->second < e) {
1106 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1107 p->second = e;
1108 } else {
1109 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1110 }
1111 return p->second;
1112 } else {
1113 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1114 peer_map_epoch[peer] = e;
1115 return e;
1116 }
1117 }
1118
1119 void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1120 {
1121 Mutex::Locker l(peer_map_epoch_lock);
1122 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1123 if (p != peer_map_epoch.end()) {
1124 if (p->second <= as_of) {
1125 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1126 << " had " << p->second << dendl;
1127 peer_map_epoch.erase(p);
1128 } else {
1129 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1130 << " has " << p->second << " - not forgetting" << dendl;
1131 }
1132 }
1133 }
1134
1135 bool OSDService::should_share_map(entity_name_t name, Connection *con,
1136 epoch_t epoch, const OSDMapRef& osdmap,
1137 const epoch_t *sent_epoch_p)
1138 {
1139 dout(20) << "should_share_map "
1140 << name << " " << con->get_peer_addr()
1141 << " " << epoch << dendl;
1142
1143 // does client have old map?
1144 if (name.is_client()) {
1145 bool message_sendmap = epoch < osdmap->get_epoch();
1146 if (message_sendmap && sent_epoch_p) {
1147 dout(20) << "client session last_sent_epoch: "
1148 << *sent_epoch_p
1149 << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
1150 if (*sent_epoch_p < osdmap->get_epoch()) {
1151 return true;
1152 } // else we don't need to send it out again
1153 }
1154 }
1155
1156 if (con->get_messenger() == osd->cluster_messenger &&
1157 con != osd->cluster_messenger->get_loopback_connection() &&
1158 osdmap->is_up(name.num()) &&
1159 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1160 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1161 // remember
1162 epoch_t has = MAX(get_peer_epoch(name.num()), epoch);
1163
1164 // share?
1165 if (has < osdmap->get_epoch()) {
1166 dout(10) << name << " " << con->get_peer_addr()
1167 << " has old map " << epoch << " < "
1168 << osdmap->get_epoch() << dendl;
1169 return true;
1170 }
1171 }
1172
1173 return false;
1174 }
1175
1176 void OSDService::share_map(
1177 entity_name_t name,
1178 Connection *con,
1179 epoch_t epoch,
1180 OSDMapRef& osdmap,
1181 epoch_t *sent_epoch_p)
1182 {
1183 dout(20) << "share_map "
1184 << name << " " << con->get_peer_addr()
1185 << " " << epoch << dendl;
1186
1187 if (!osd->is_active()) {
1188 /*It is safe not to proceed as OSD is not in healthy state*/
1189 return;
1190 }
1191
1192 bool want_shared = should_share_map(name, con, epoch,
1193 osdmap, sent_epoch_p);
1194
1195 if (want_shared){
1196 if (name.is_client()) {
1197 dout(10) << name << " has old map " << epoch
1198 << " < " << osdmap->get_epoch() << dendl;
1199 // we know the Session is valid or we wouldn't be sending
1200 if (sent_epoch_p) {
1201 *sent_epoch_p = osdmap->get_epoch();
1202 }
1203 send_incremental_map(epoch, con, osdmap);
1204 } else if (con->get_messenger() == osd->cluster_messenger &&
1205 osdmap->is_up(name.num()) &&
1206 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1207 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1208 dout(10) << name << " " << con->get_peer_addr()
1209 << " has old map " << epoch << " < "
1210 << osdmap->get_epoch() << dendl;
1211 note_peer_epoch(name.num(), osdmap->get_epoch());
1212 send_incremental_map(epoch, con, osdmap);
1213 }
1214 }
1215 }
1216
1217 void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1218 {
1219 if (!map)
1220 map = get_osdmap();
1221
1222 // send map?
1223 epoch_t pe = get_peer_epoch(peer);
1224 if (pe) {
1225 if (pe < map->get_epoch()) {
1226 send_incremental_map(pe, con, map);
1227 note_peer_epoch(peer, map->get_epoch());
1228 } else
1229 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1230 } else {
1231 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1232 // no idea about peer's epoch.
1233 // ??? send recent ???
1234 // do nothing.
1235 }
1236 }
1237
1238 bool OSDService::can_inc_scrubs_pending()
1239 {
1240 bool can_inc = false;
1241 Mutex::Locker l(sched_scrub_lock);
1242
1243 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1244 dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
1245 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1246 can_inc = true;
1247 } else {
1248 dout(20) << __func__ << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1249 }
1250
1251 return can_inc;
1252 }
1253
1254 bool OSDService::inc_scrubs_pending()
1255 {
1256 bool result = false;
1257
1258 sched_scrub_lock.Lock();
1259 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1260 dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
1261 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1262 result = true;
1263 ++scrubs_pending;
1264 } else {
1265 dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1266 }
1267 sched_scrub_lock.Unlock();
1268
1269 return result;
1270 }
1271
1272 void OSDService::dec_scrubs_pending()
1273 {
1274 sched_scrub_lock.Lock();
1275 dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
1276 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1277 --scrubs_pending;
1278 assert(scrubs_pending >= 0);
1279 sched_scrub_lock.Unlock();
1280 }
1281
1282 void OSDService::inc_scrubs_active(bool reserved)
1283 {
1284 sched_scrub_lock.Lock();
1285 ++(scrubs_active);
1286 if (reserved) {
1287 --(scrubs_pending);
1288 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1289 << " (max " << cct->_conf->osd_max_scrubs
1290 << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
1291 assert(scrubs_pending >= 0);
1292 } else {
1293 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1294 << " (max " << cct->_conf->osd_max_scrubs
1295 << ", pending " << scrubs_pending << ")" << dendl;
1296 }
1297 sched_scrub_lock.Unlock();
1298 }
1299
1300 void OSDService::dec_scrubs_active()
1301 {
1302 sched_scrub_lock.Lock();
1303 dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
1304 << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
1305 --scrubs_active;
1306 assert(scrubs_active >= 0);
1307 sched_scrub_lock.Unlock();
1308 }
1309
1310 void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1311 epoch_t *_bind_epoch) const
1312 {
1313 Mutex::Locker l(epoch_lock);
1314 if (_boot_epoch)
1315 *_boot_epoch = boot_epoch;
1316 if (_up_epoch)
1317 *_up_epoch = up_epoch;
1318 if (_bind_epoch)
1319 *_bind_epoch = bind_epoch;
1320 }
1321
1322 void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1323 const epoch_t *_bind_epoch)
1324 {
1325 Mutex::Locker l(epoch_lock);
1326 if (_boot_epoch) {
1327 assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1328 boot_epoch = *_boot_epoch;
1329 }
1330 if (_up_epoch) {
1331 assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1332 up_epoch = *_up_epoch;
1333 }
1334 if (_bind_epoch) {
1335 assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1336 bind_epoch = *_bind_epoch;
1337 }
1338 }
1339
1340 bool OSDService::prepare_to_stop()
1341 {
1342 Mutex::Locker l(is_stopping_lock);
1343 if (get_state() != NOT_STOPPING)
1344 return false;
1345
1346 OSDMapRef osdmap = get_osdmap();
1347 if (osdmap && osdmap->is_up(whoami)) {
1348 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
1349 set_state(PREPARING_TO_STOP);
1350 monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
1351 osdmap->get_inst(whoami),
1352 osdmap->get_epoch(),
1353 true // request ack
1354 ));
1355 utime_t now = ceph_clock_now();
1356 utime_t timeout;
1357 timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
1358 while ((ceph_clock_now() < timeout) &&
1359 (get_state() != STOPPING)) {
1360 is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
1361 }
1362 }
1363 dout(0) << __func__ << " starting shutdown" << dendl;
1364 set_state(STOPPING);
1365 return true;
1366 }
1367
1368 void OSDService::got_stop_ack()
1369 {
1370 Mutex::Locker l(is_stopping_lock);
1371 if (get_state() == PREPARING_TO_STOP) {
1372 dout(0) << __func__ << " starting shutdown" << dendl;
1373 set_state(STOPPING);
1374 is_stopping_cond.Signal();
1375 } else {
1376 dout(10) << __func__ << " ignoring msg" << dendl;
1377 }
1378 }
1379
1380 MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1381 OSDSuperblock& sblock)
1382 {
1383 MOSDMap *m = new MOSDMap(monc->get_fsid());
1384 m->oldest_map = max_oldest_map;
1385 m->newest_map = sblock.newest_map;
1386
1387 for (epoch_t e = to; e > since; e--) {
1388 bufferlist bl;
1389 if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
1390 m->incremental_maps[e].claim(bl);
1391 } else if (get_map_bl(e, bl)) {
1392 m->maps[e].claim(bl);
1393 break;
1394 } else {
1395 derr << "since " << since << " to " << to
1396 << " oldest " << m->oldest_map << " newest " << m->newest_map
1397 << dendl;
1398 m->put();
1399 m = NULL;
1400 break;
1401 }
1402 }
1403 return m;
1404 }
1405
1406 void OSDService::send_map(MOSDMap *m, Connection *con)
1407 {
1408 con->send_message(m);
1409 }
1410
1411 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1412 OSDMapRef& osdmap)
1413 {
1414 epoch_t to = osdmap->get_epoch();
1415 dout(10) << "send_incremental_map " << since << " -> " << to
1416 << " to " << con << " " << con->get_peer_addr() << dendl;
1417
1418 MOSDMap *m = NULL;
1419 while (!m) {
1420 OSDSuperblock sblock(get_superblock());
1421 if (since < sblock.oldest_map) {
1422 // just send latest full map
1423 MOSDMap *m = new MOSDMap(monc->get_fsid());
1424 m->oldest_map = max_oldest_map;
1425 m->newest_map = sblock.newest_map;
1426 get_map_bl(to, m->maps[to]);
1427 send_map(m, con);
1428 return;
1429 }
1430
1431 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1432 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1433 << ", only sending most recent" << dendl;
1434 since = to - cct->_conf->osd_map_share_max_epochs;
1435 }
1436
1437 if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
1438 to = since + cct->_conf->osd_map_message_max;
1439 m = build_incremental_map_msg(since, to, sblock);
1440 }
1441 send_map(m, con);
1442 }
1443
1444 bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1445 {
1446 bool found = map_bl_cache.lookup(e, &bl);
1447 if (found) {
1448 if (logger)
1449 logger->inc(l_osd_map_bl_cache_hit);
1450 return true;
1451 }
1452 if (logger)
1453 logger->inc(l_osd_map_bl_cache_miss);
1454 found = store->read(coll_t::meta(),
1455 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1456 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1457 if (found) {
1458 _add_map_bl(e, bl);
1459 }
1460 return found;
1461 }
1462
1463 bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1464 {
1465 Mutex::Locker l(map_cache_lock);
1466 bool found = map_bl_inc_cache.lookup(e, &bl);
1467 if (found) {
1468 if (logger)
1469 logger->inc(l_osd_map_bl_cache_hit);
1470 return true;
1471 }
1472 if (logger)
1473 logger->inc(l_osd_map_bl_cache_miss);
1474 found = store->read(coll_t::meta(),
1475 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1476 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1477 if (found) {
1478 _add_map_inc_bl(e, bl);
1479 }
1480 return found;
1481 }
1482
1483 void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1484 {
1485 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
1486 // cache a contiguous buffer
1487 if (bl.get_num_buffers() > 1) {
1488 bl.rebuild();
1489 }
1490 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1491 map_bl_cache.add(e, bl);
1492 }
1493
1494 void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1495 {
1496 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
1497 // cache a contiguous buffer
1498 if (bl.get_num_buffers() > 1) {
1499 bl.rebuild();
1500 }
1501 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1502 map_bl_inc_cache.add(e, bl);
1503 }
1504
1505 void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
1506 {
1507 Mutex::Locker l(map_cache_lock);
1508 // cache a contiguous buffer
1509 if (bl.get_num_buffers() > 1) {
1510 bl.rebuild();
1511 }
1512 map_bl_inc_cache.pin(e, bl);
1513 }
1514
1515 void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
1516 {
1517 Mutex::Locker l(map_cache_lock);
1518 // cache a contiguous buffer
1519 if (bl.get_num_buffers() > 1) {
1520 bl.rebuild();
1521 }
1522 map_bl_cache.pin(e, bl);
1523 }
1524
1525 void OSDService::clear_map_bl_cache_pins(epoch_t e)
1526 {
1527 Mutex::Locker l(map_cache_lock);
1528 map_bl_inc_cache.clear_pinned(e);
1529 map_bl_cache.clear_pinned(e);
1530 }
1531
1532 OSDMapRef OSDService::_add_map(OSDMap *o)
1533 {
1534 epoch_t e = o->get_epoch();
1535
1536 if (cct->_conf->osd_map_dedup) {
1537 // Dedup against an existing map at a nearby epoch
1538 OSDMapRef for_dedup = map_cache.lower_bound(e);
1539 if (for_dedup) {
1540 OSDMap::dedup(for_dedup.get(), o);
1541 }
1542 }
1543 bool existed;
1544 OSDMapRef l = map_cache.add(e, o, &existed);
1545 if (existed) {
1546 delete o;
1547 }
1548 return l;
1549 }
1550
1551 OSDMapRef OSDService::try_get_map(epoch_t epoch)
1552 {
1553 Mutex::Locker l(map_cache_lock);
1554 OSDMapRef retval = map_cache.lookup(epoch);
1555 if (retval) {
1556 dout(30) << "get_map " << epoch << " -cached" << dendl;
1557 if (logger) {
1558 logger->inc(l_osd_map_cache_hit);
1559 }
1560 return retval;
1561 }
1562 if (logger) {
1563 logger->inc(l_osd_map_cache_miss);
1564 epoch_t lb = map_cache.cached_key_lower_bound();
1565 if (epoch < lb) {
1566 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1567 logger->inc(l_osd_map_cache_miss_low);
1568 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1569 }
1570 }
1571
1572 OSDMap *map = new OSDMap;
1573 if (epoch > 0) {
1574 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1575 bufferlist bl;
1576 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1577 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1578 delete map;
1579 return OSDMapRef();
1580 }
1581 map->decode(bl);
1582 } else {
1583 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1584 }
1585 return _add_map(map);
1586 }
1587
1588 // ops
1589
1590
1591 void OSDService::reply_op_error(OpRequestRef op, int err)
1592 {
1593 reply_op_error(op, err, eversion_t(), 0);
1594 }
1595
1596 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1597 version_t uv)
1598 {
1599 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1600 assert(m->get_type() == CEPH_MSG_OSD_OP);
1601 int flags;
1602 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1603
1604 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1605 true);
1606 reply->set_reply_versions(v, uv);
1607 m->get_connection()->send_message(reply);
1608 }
1609
1610 void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1611 {
1612 if (!cct->_conf->osd_debug_misdirected_ops) {
1613 return;
1614 }
1615
1616 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1617 assert(m->get_type() == CEPH_MSG_OSD_OP);
1618
1619 assert(m->get_map_epoch() >= pg->info.history.same_primary_since);
1620
1621 if (pg->is_ec_pg()) {
1622 /**
1623 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1624 * can get this result:
1625 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1626 * [CRUSH_ITEM_NONE, 2, 3]/3
1627 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1628 * [3, 2, 3]/3
1629 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1630 * -- misdirected op
1631 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1632 * it and fulfils it
1633 *
1634 * We can't compute the op target based on the sending map epoch due to
1635 * splitting. The simplest thing is to detect such cases here and drop
1636 * them without an error (the client will resend anyway).
1637 */
1638 assert(m->get_map_epoch() <= superblock.newest_map);
1639 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1640 if (!opmap) {
1641 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1642 << m->get_map_epoch() << ", dropping" << dendl;
1643 return;
1644 }
1645 pg_t _pgid = m->get_raw_pg();
1646 spg_t pgid;
1647 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1648 _pgid = opmap->raw_pg_to_pg(_pgid);
1649 if (opmap->get_primary_shard(_pgid, &pgid) &&
1650 pgid.shard != pg->info.pgid.shard) {
1651 dout(7) << __func__ << ": " << *pg << " primary changed since "
1652 << m->get_map_epoch() << ", dropping" << dendl;
1653 return;
1654 }
1655 }
1656
1657 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1658 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1659 << " pg " << m->get_raw_pg()
1660 << " to osd." << whoami
1661 << " not " << pg->acting
1662 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
1663 }
1664
1665 void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
1666 {
1667 osd->op_shardedwq.queue(make_pair(pgid, qi));
1668 }
1669
1670 void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
1671 {
1672 osd->op_shardedwq.queue_front(make_pair(pgid, qi));
1673 }
1674
1675 void OSDService::queue_for_peering(PG *pg)
1676 {
1677 peering_wq.queue(pg);
1678 }
1679
1680 void OSDService::queue_for_snap_trim(PG *pg)
1681 {
1682 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1683 osd->op_shardedwq.queue(
1684 make_pair(
1685 pg->info.pgid,
1686 PGQueueable(
1687 PGSnapTrim(pg->get_osdmap()->get_epoch()),
1688 cct->_conf->osd_snap_trim_cost,
1689 cct->_conf->osd_snap_trim_priority,
1690 ceph_clock_now(),
1691 entity_inst_t(),
1692 pg->get_osdmap()->get_epoch())));
1693 }
1694
1695
1696 // ====================================================================
1697 // OSD
1698
1699 #undef dout_prefix
1700 #define dout_prefix *_dout
1701
1702 // Commands shared between OSD's console and admin console:
1703 namespace ceph {
1704 namespace osd_cmds {
1705
1706 int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
1707
1708 }} // namespace ceph::osd_cmds
1709
1710 int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
1711 uuid_d fsid, int whoami)
1712 {
1713 int ret;
1714
1715 ceph::shared_ptr<ObjectStore::Sequencer> osr(
1716 new ObjectStore::Sequencer("mkfs"));
1717 OSDSuperblock sb;
1718 bufferlist sbbl;
1719 C_SaferCond waiter;
1720
1721 // if we are fed a uuid for this osd, use it.
1722 store->set_fsid(cct->_conf->osd_uuid);
1723
1724 ret = store->mkfs();
1725 if (ret) {
1726 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1727 << cpp_strerror(ret) << dendl;
1728 goto free_store;
1729 }
1730
1731 store->set_cache_shards(1); // doesn't matter for mkfs!
1732
1733 ret = store->mount();
1734 if (ret) {
1735 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1736 << cpp_strerror(ret) << dendl;
1737 goto free_store;
1738 }
1739
1740 ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1741 if (ret >= 0) {
1742 /* if we already have superblock, check content of superblock */
1743 dout(0) << " have superblock" << dendl;
1744 bufferlist::iterator p;
1745 p = sbbl.begin();
1746 ::decode(sb, p);
1747 if (whoami != sb.whoami) {
1748 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1749 << dendl;
1750 ret = -EINVAL;
1751 goto umount_store;
1752 }
1753 if (fsid != sb.cluster_fsid) {
1754 derr << "provided cluster fsid " << fsid
1755 << " != superblock's " << sb.cluster_fsid << dendl;
1756 ret = -EINVAL;
1757 goto umount_store;
1758 }
1759 } else {
1760 // create superblock
1761 sb.cluster_fsid = fsid;
1762 sb.osd_fsid = store->get_fsid();
1763 sb.whoami = whoami;
1764 sb.compat_features = get_osd_initial_compat_set();
1765
1766 bufferlist bl;
1767 ::encode(sb, bl);
1768
1769 ObjectStore::Transaction t;
1770 t.create_collection(coll_t::meta(), 0);
1771 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
1772 ret = store->apply_transaction(osr.get(), std::move(t));
1773 if (ret) {
1774 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1775 << "apply_transaction returned " << cpp_strerror(ret) << dendl;
1776 goto umount_store;
1777 }
1778 }
1779
1780 if (!osr->flush_commit(&waiter)) {
1781 waiter.wait();
1782 }
1783
1784 ret = write_meta(store, sb.cluster_fsid, sb.osd_fsid, whoami);
1785 if (ret) {
1786 derr << "OSD::mkfs: failed to write fsid file: error "
1787 << cpp_strerror(ret) << dendl;
1788 goto umount_store;
1789 }
1790
1791 umount_store:
1792 store->umount();
1793 free_store:
1794 delete store;
1795 return ret;
1796 }
1797
1798 int OSD::write_meta(ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
1799 {
1800 char val[80];
1801 int r;
1802
1803 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
1804 r = store->write_meta("magic", val);
1805 if (r < 0)
1806 return r;
1807
1808 snprintf(val, sizeof(val), "%d", whoami);
1809 r = store->write_meta("whoami", val);
1810 if (r < 0)
1811 return r;
1812
1813 cluster_fsid.print(val);
1814 r = store->write_meta("ceph_fsid", val);
1815 if (r < 0)
1816 return r;
1817
1818 r = store->write_meta("ready", "ready");
1819 if (r < 0)
1820 return r;
1821
1822 return 0;
1823 }
1824
1825 int OSD::peek_meta(ObjectStore *store, std::string& magic,
1826 uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
1827 {
1828 string val;
1829
1830 int r = store->read_meta("magic", &val);
1831 if (r < 0)
1832 return r;
1833 magic = val;
1834
1835 r = store->read_meta("whoami", &val);
1836 if (r < 0)
1837 return r;
1838 whoami = atoi(val.c_str());
1839
1840 r = store->read_meta("ceph_fsid", &val);
1841 if (r < 0)
1842 return r;
1843 r = cluster_fsid.parse(val.c_str());
1844 if (!r)
1845 return -EINVAL;
1846
1847 r = store->read_meta("fsid", &val);
1848 if (r < 0) {
1849 osd_fsid = uuid_d();
1850 } else {
1851 r = osd_fsid.parse(val.c_str());
1852 if (!r)
1853 return -EINVAL;
1854 }
1855
1856 return 0;
1857 }
1858
1859
// From here on, log lines are prefixed through _prefix() with this OSD's id
// (whoami) and its current osdmap epoch.
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1862
1863 // cons/des
1864
1865 OSD::OSD(CephContext *cct_, ObjectStore *store_,
1866 int id,
1867 Messenger *internal_messenger,
1868 Messenger *external_messenger,
1869 Messenger *hb_client_front,
1870 Messenger *hb_client_back,
1871 Messenger *hb_front_serverm,
1872 Messenger *hb_back_serverm,
1873 Messenger *osdc_messenger,
1874 MonClient *mc,
1875 const std::string &dev, const std::string &jdev) :
1876 Dispatcher(cct_),
1877 osd_lock("OSD::osd_lock"),
1878 tick_timer(cct, osd_lock),
1879 tick_timer_lock("OSD::tick_timer_lock"),
1880 tick_timer_without_osd_lock(cct, tick_timer_lock),
1881 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
1882 cct->_conf->auth_supported.empty() ?
1883 cct->_conf->auth_cluster_required :
1884 cct->_conf->auth_supported)),
1885 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
1886 cct->_conf->auth_supported.empty() ?
1887 cct->_conf->auth_service_required :
1888 cct->_conf->auth_supported)),
1889 cluster_messenger(internal_messenger),
1890 client_messenger(external_messenger),
1891 objecter_messenger(osdc_messenger),
1892 monc(mc),
1893 mgrc(cct_, client_messenger),
1894 logger(NULL),
1895 recoverystate_perf(NULL),
1896 store(store_),
1897 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
1898 clog(log_client.create_channel()),
1899 whoami(id),
1900 dev_path(dev), journal_path(jdev),
1901 store_is_rotational(store->is_rotational()),
1902 trace_endpoint("0.0.0.0", 0, "osd"),
1903 asok_hook(NULL),
1904 osd_compat(get_osd_compat_set()),
1905 peering_tp(cct, "OSD::peering_tp", "tp_peering",
1906 cct->_conf->osd_peering_wq_threads,
1907 "osd_peering_tp_threads"),
1908 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
1909 get_num_op_threads()),
1910 disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
1911 command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
1912 session_waiting_lock("OSD::session_waiting_lock"),
1913 heartbeat_lock("OSD::heartbeat_lock"),
1914 heartbeat_stop(false),
1915 heartbeat_need_update(true),
1916 hb_front_client_messenger(hb_client_front),
1917 hb_back_client_messenger(hb_client_back),
1918 hb_front_server_messenger(hb_front_serverm),
1919 hb_back_server_messenger(hb_back_serverm),
1920 daily_loadavg(0.0),
1921 heartbeat_thread(this),
1922 heartbeat_dispatcher(this),
1923 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
1924 cct->_conf->osd_num_op_tracker_shard),
1925 test_ops_hook(NULL),
1926 op_queue(get_io_queue()),
1927 op_prio_cutoff(get_io_prio_cut()),
1928 op_shardedwq(
1929 get_num_op_shards(),
1930 this,
1931 cct->_conf->osd_op_thread_timeout,
1932 cct->_conf->osd_op_thread_suicide_timeout,
1933 &osd_op_tp),
1934 peering_wq(
1935 this,
1936 cct->_conf->osd_op_thread_timeout,
1937 cct->_conf->osd_op_thread_suicide_timeout,
1938 &peering_tp),
1939 map_lock("OSD::map_lock"),
1940 pg_map_lock("OSD::pg_map_lock"),
1941 last_pg_create_epoch(0),
1942 mon_report_lock("OSD::mon_report_lock"),
1943 stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
1944 up_thru_wanted(0),
1945 requested_full_first(0),
1946 requested_full_last(0),
1947 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
1948 osd_stat_updated(false),
1949 pg_stat_tid(0), pg_stat_tid_flushed(0),
1950 command_wq(
1951 this,
1952 cct->_conf->osd_command_thread_timeout,
1953 cct->_conf->osd_command_thread_suicide_timeout,
1954 &command_tp),
1955 remove_wq(
1956 cct,
1957 store,
1958 cct->_conf->osd_remove_thread_timeout,
1959 cct->_conf->osd_remove_thread_suicide_timeout,
1960 &disk_tp),
1961 service(this)
1962 {
1963 monc->set_messenger(client_messenger);
1964 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
1965 cct->_conf->osd_op_log_threshold);
1966 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
1967 cct->_conf->osd_op_history_duration);
1968 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
1969 cct->_conf->osd_op_history_slow_op_threshold);
1970 #ifdef WITH_BLKIN
1971 std::stringstream ss;
1972 ss << "osd." << whoami;
1973 trace_endpoint.copy_name(ss.str());
1974 #endif
1975 }
1976
1977 OSD::~OSD()
1978 {
1979 delete authorize_handler_cluster_registry;
1980 delete authorize_handler_service_registry;
1981 delete class_handler;
1982 cct->get_perfcounters_collection()->remove(recoverystate_perf);
1983 cct->get_perfcounters_collection()->remove(logger);
1984 delete recoverystate_perf;
1985 delete logger;
1986 delete store;
1987 }
1988
1989 void cls_initialize(ClassHandler *ch);
1990
1991 void OSD::handle_signal(int signum)
1992 {
1993 assert(signum == SIGINT || signum == SIGTERM);
1994 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
1995 shutdown();
1996 }
1997
1998 int OSD::pre_init()
1999 {
2000 Mutex::Locker lock(osd_lock);
2001 if (is_stopping())
2002 return 0;
2003
2004 if (store->test_mount_in_use()) {
2005 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2006 << "currently in use. (Is ceph-osd already running?)" << dendl;
2007 return -EBUSY;
2008 }
2009
2010 cct->_conf->add_observer(this);
2011 return 0;
2012 }
2013
2014 // asok
2015
2016 class OSDSocketHook : public AdminSocketHook {
2017 OSD *osd;
2018 public:
2019 explicit OSDSocketHook(OSD *o) : osd(o) {}
2020 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2021 bufferlist& out) override {
2022 stringstream ss;
2023 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
2024 out.append(ss);
2025 return r;
2026 }
2027 };
2028
2029 bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
2030 ostream& ss)
2031 {
2032 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2033 if (admin_command == "status") {
2034 f->open_object_section("status");
2035 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2036 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2037 f->dump_unsigned("whoami", superblock.whoami);
2038 f->dump_string("state", get_state_name(get_state()));
2039 f->dump_unsigned("oldest_map", superblock.oldest_map);
2040 f->dump_unsigned("newest_map", superblock.newest_map);
2041 {
2042 RWLock::RLocker l(pg_map_lock);
2043 f->dump_unsigned("num_pgs", pg_map.size());
2044 }
2045 f->close_section();
2046 } else if (admin_command == "flush_journal") {
2047 store->flush_journal();
2048 } else if (admin_command == "dump_ops_in_flight" ||
2049 admin_command == "ops" ||
2050 admin_command == "dump_blocked_ops" ||
2051 admin_command == "dump_historic_ops" ||
2052 admin_command == "dump_historic_ops_by_duration" ||
2053 admin_command == "dump_historic_slow_ops") {
2054
2055 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2056 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2057 will start to track new ops received afterwards.";
2058
2059 set<string> filters;
2060 vector<string> filter_str;
2061 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2062 copy(filter_str.begin(), filter_str.end(),
2063 inserter(filters, filters.end()));
2064 }
2065
2066 if (admin_command == "dump_ops_in_flight" ||
2067 admin_command == "ops") {
2068 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2069 ss << error_str;
2070 }
2071 }
2072 if (admin_command == "dump_blocked_ops") {
2073 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2074 ss << error_str;
2075 }
2076 }
2077 if (admin_command == "dump_historic_ops") {
2078 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2079 ss << error_str;
2080 }
2081 }
2082 if (admin_command == "dump_historic_ops_by_duration") {
2083 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2084 ss << error_str;
2085 }
2086 }
2087 if (admin_command == "dump_historic_slow_ops") {
2088 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2089 ss << error_str;
2090 }
2091 }
2092 } else if (admin_command == "dump_op_pq_state") {
2093 f->open_object_section("pq");
2094 op_shardedwq.dump(f);
2095 f->close_section();
2096 } else if (admin_command == "dump_blacklist") {
2097 list<pair<entity_addr_t,utime_t> > bl;
2098 OSDMapRef curmap = service.get_osdmap();
2099
2100 f->open_array_section("blacklist");
2101 curmap->get_blacklist(&bl);
2102 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2103 it != bl.end(); ++it) {
2104 f->open_object_section("entry");
2105 f->open_object_section("entity_addr_t");
2106 it->first.dump(f);
2107 f->close_section(); //entity_addr_t
2108 it->second.localtime(f->dump_stream("expire_time"));
2109 f->close_section(); //entry
2110 }
2111 f->close_section(); //blacklist
2112 } else if (admin_command == "dump_watchers") {
2113 list<obj_watch_item_t> watchers;
2114 // scan pg's
2115 {
2116 Mutex::Locker l(osd_lock);
2117 RWLock::RLocker l2(pg_map_lock);
2118 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2119 it != pg_map.end();
2120 ++it) {
2121
2122 list<obj_watch_item_t> pg_watchers;
2123 PG *pg = it->second;
2124 pg->lock();
2125 pg->get_watchers(pg_watchers);
2126 pg->unlock();
2127 watchers.splice(watchers.end(), pg_watchers);
2128 }
2129 }
2130
2131 f->open_array_section("watchers");
2132 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2133 it != watchers.end(); ++it) {
2134
2135 f->open_object_section("watch");
2136
2137 f->dump_string("namespace", it->obj.nspace);
2138 f->dump_string("object", it->obj.oid.name);
2139
2140 f->open_object_section("entity_name");
2141 it->wi.name.dump(f);
2142 f->close_section(); //entity_name_t
2143
2144 f->dump_unsigned("cookie", it->wi.cookie);
2145 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2146
2147 f->open_object_section("entity_addr_t");
2148 it->wi.addr.dump(f);
2149 f->close_section(); //entity_addr_t
2150
2151 f->close_section(); //watch
2152 }
2153
2154 f->close_section(); //watchers
2155 } else if (admin_command == "dump_reservations") {
2156 f->open_object_section("reservations");
2157 f->open_object_section("local_reservations");
2158 service.local_reserver.dump(f);
2159 f->close_section();
2160 f->open_object_section("remote_reservations");
2161 service.remote_reserver.dump(f);
2162 f->close_section();
2163 f->close_section();
2164 } else if (admin_command == "get_latest_osdmap") {
2165 get_latest_osdmap();
2166 } else if (admin_command == "heap") {
2167 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2168
2169 // Note: Failed heap profile commands won't necessarily trigger an error:
2170 f->open_object_section("result");
2171 f->dump_string("error", cpp_strerror(result));
2172 f->dump_bool("success", result >= 0);
2173 f->close_section();
2174 } else if (admin_command == "set_heap_property") {
2175 string property;
2176 int64_t value = 0;
2177 string error;
2178 bool success = false;
2179 if (!cmd_getval(cct, cmdmap, "property", property)) {
2180 error = "unable to get property";
2181 success = false;
2182 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2183 error = "unable to get value";
2184 success = false;
2185 } else if (value < 0) {
2186 error = "negative value not allowed";
2187 success = false;
2188 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2189 error = "invalid property";
2190 success = false;
2191 } else {
2192 success = true;
2193 }
2194 f->open_object_section("result");
2195 f->dump_string("error", error);
2196 f->dump_bool("success", success);
2197 f->close_section();
2198 } else if (admin_command == "get_heap_property") {
2199 string property;
2200 size_t value = 0;
2201 string error;
2202 bool success = false;
2203 if (!cmd_getval(cct, cmdmap, "property", property)) {
2204 error = "unable to get property";
2205 success = false;
2206 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2207 error = "invalid property";
2208 success = false;
2209 } else {
2210 success = true;
2211 }
2212 f->open_object_section("result");
2213 f->dump_string("error", error);
2214 f->dump_bool("success", success);
2215 f->dump_int("value", value);
2216 f->close_section();
2217 } else if (admin_command == "dump_objectstore_kv_stats") {
2218 store->get_db_statistics(f);
2219 } else if (admin_command == "dump_scrubs") {
2220 service.dumps_scrub(f);
2221 } else if (admin_command == "calc_objectstore_db_histogram") {
2222 store->generate_db_histogram(f);
2223 } else if (admin_command == "flush_store_cache") {
2224 store->flush_cache();
2225 } else if (admin_command == "dump_pgstate_history") {
2226 f->open_object_section("pgstate_history");
2227 RWLock::RLocker l2(pg_map_lock);
2228 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2229 it != pg_map.end();
2230 ++it) {
2231
2232 PG *pg = it->second;
2233 f->dump_stream("pg") << pg->get_pgid();
2234 pg->lock();
2235 pg->pgstate_history.dump(f);
2236 pg->unlock();
2237 }
2238 f->close_section();
2239 } else if (admin_command == "compact") {
2240 dout(1) << "triggering manual compaction" << dendl;
2241 auto start = ceph::coarse_mono_clock::now();
2242 store->compact();
2243 auto end = ceph::coarse_mono_clock::now();
2244 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
2245 dout(1) << "finished manual compaction in "
2246 << time_span.count()
2247 << " seconds" << dendl;
2248 f->open_object_section("compact_result");
2249 f->dump_float("elapsed_time", time_span.count());
2250 f->close_section();
2251 } else {
2252 assert(0 == "broken asok registration");
2253 }
2254 f->flush(ss);
2255 delete f;
2256 return true;
2257 }
2258
2259 class TestOpsSocketHook : public AdminSocketHook {
2260 OSDService *service;
2261 ObjectStore *store;
2262 public:
2263 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2264 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2265 bufferlist& out) override {
2266 stringstream ss;
2267 test_ops(service, store, command, cmdmap, ss);
2268 out.append(ss);
2269 return true;
2270 }
2271 void test_ops(OSDService *service, ObjectStore *store,
2272 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2273
2274 };
2275
2276 class OSD::C_Tick : public Context {
2277 OSD *osd;
2278 public:
2279 explicit C_Tick(OSD *o) : osd(o) {}
2280 void finish(int r) override {
2281 osd->tick();
2282 }
2283 };
2284
2285 class OSD::C_Tick_WithoutOSDLock : public Context {
2286 OSD *osd;
2287 public:
2288 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2289 void finish(int r) override {
2290 osd->tick_without_osd_lock();
2291 }
2292 };
2293
2294 int OSD::enable_disable_fuse(bool stop)
2295 {
2296 #ifdef HAVE_LIBFUSE
2297 int r;
2298 string mntpath = cct->_conf->osd_data + "/fuse";
2299 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2300 dout(1) << __func__ << " disabling" << dendl;
2301 fuse_store->stop();
2302 delete fuse_store;
2303 fuse_store = NULL;
2304 r = ::rmdir(mntpath.c_str());
2305 if (r < 0) {
2306 r = -errno;
2307 derr << __func__ << " failed to rmdir " << mntpath << ": "
2308 << cpp_strerror(r) << dendl;
2309 return r;
2310 }
2311 return 0;
2312 }
2313 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2314 dout(1) << __func__ << " enabling" << dendl;
2315 r = ::mkdir(mntpath.c_str(), 0700);
2316 if (r < 0)
2317 r = -errno;
2318 if (r < 0 && r != -EEXIST) {
2319 derr << __func__ << " unable to create " << mntpath << ": "
2320 << cpp_strerror(r) << dendl;
2321 return r;
2322 }
2323 fuse_store = new FuseStore(store, mntpath);
2324 r = fuse_store->start();
2325 if (r < 0) {
2326 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2327 delete fuse_store;
2328 fuse_store = NULL;
2329 return r;
2330 }
2331 }
2332 #endif // HAVE_LIBFUSE
2333 return 0;
2334 }
2335
2336 int OSD::get_num_op_shards()
2337 {
2338 if (cct->_conf->osd_op_num_shards)
2339 return cct->_conf->osd_op_num_shards;
2340 if (store_is_rotational)
2341 return cct->_conf->osd_op_num_shards_hdd;
2342 else
2343 return cct->_conf->osd_op_num_shards_ssd;
2344 }
2345
2346 int OSD::get_num_op_threads()
2347 {
2348 if (cct->_conf->osd_op_num_threads_per_shard)
2349 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2350 if (store_is_rotational)
2351 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2352 else
2353 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2354 }
2355
2356 float OSD::get_osd_recovery_sleep()
2357 {
2358 if (cct->_conf->osd_recovery_sleep)
2359 return cct->_conf->osd_recovery_sleep;
2360 if (!store_is_rotational && !journal_is_rotational)
2361 return cct->_conf->osd_recovery_sleep_ssd;
2362 else if (store_is_rotational && !journal_is_rotational)
2363 return cct->_conf->get_val<double>("osd_recovery_sleep_hybrid");
2364 else
2365 return cct->_conf->osd_recovery_sleep_hdd;
2366 }
2367
/**
 * Bring the OSD up.
 *
 * In order: initialise the timers, mount the object store, validate the
 * configured maximum object name lengths against the backend, read and
 * check the superblock (upgrading its compat features if needed), make sure
 * the snap mapper object exists, set up the class handler, load the current
 * OSDMap and the PGs, register dispatchers on the messengers, initialise
 * the monitor and manager clients, start the thread pools, heartbeat thread
 * and tick timers, authenticate with the monitors, update crush device
 * class/location, and finally call start_boot().
 *
 * Locking: osd_lock is taken by a scoped Locker at the top.  It is dropped
 * by hand before monc->authenticate() and re-taken on every path that
 * leaves the function afterwards, because the Locker unlocks on exit.
 *
 * Returns 0 on success, and also when the OSD is found to be stopping.
 * Failures up to and including monc->init() jump to `out`, which tears down
 * the fuse store, unmounts and deletes the object store and returns the
 * negative error in r.  Failures after the lock has been dropped jump to
 * `monout`, which calls exit(1) instead of returning.
 */
2368 int OSD::init()
2369 {
2370 CompatSet initial, diff;
2371 Mutex::Locker lock(osd_lock);
2372 if (is_stopping())
2373 return 0;
2374
2375 tick_timer.init();
2376 tick_timer_without_osd_lock.init();
2377 service.recovery_request_timer.init();
2378 service.recovery_sleep_timer.init();
2379
2380 // mount.
2381 dout(2) << "init " << dev_path
2382 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2383 << dendl;
2384 dout(2) << "journal " << journal_path << dendl;
2385 assert(store); // call pre_init() first!
2386
2387 store->set_cache_shards(get_num_op_shards());
2388
2389 int r = store->mount();
2390 if (r < 0) {
2391 derr << "OSD:init: unable to mount object store" << dendl;
2392 return r;
2393 }
2394 journal_is_rotational = store->is_journal_rotational();
2395 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
2396 << dendl;
2397
// The result of enabling fuse is not checked here.
2398 enable_disable_fuse(false);
2399
2400 dout(2) << "boot" << dendl;
2401
2402 // initialize the daily loadavg with current 15min loadavg
2403 double loadavgs[3];
2404 if (getloadavg(loadavgs, 3) == 3) {
2405 daily_loadavg = loadavgs[2];
2406 } else {
2407 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
2408 daily_loadavg = 1.0;
2409 }
2410
// Only used by the wait_auth_rotating() retry loop near the end.
// NOTE(review): presumably declared this early so that the `goto out`
// jumps below do not cross its initialization -- confirm before moving.
2411 int rotating_auth_attempts = 0;
2412
2413 // sanity check long object name handling
2414 {
2415 hobject_t l;
2416 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2417 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2418 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2419 r = store->validate_hobject_key(l);
2420 if (r < 0) {
2421 derr << "backend (" << store->get_type() << ") is unable to support max "
2422 << "object name[space] len" << dendl;
2423 derr << " osd max object name len = "
2424 << cct->_conf->osd_max_object_name_len << dendl;
2425 derr << " osd max object namespace len = "
2426 << cct->_conf->osd_max_object_namespace_len << dendl;
2427 derr << cpp_strerror(r) << dendl;
// Fatal only when the startup check is enabled; otherwise log and go on.
2428 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2429 goto out;
2430 }
2431 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2432 << dendl;
2433 } else {
2434 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
2435 }
2436 }
2437
2438 // read superblock
2439 r = read_superblock();
2440 if (r < 0) {
2441 derr << "OSD::init() : unable to read osd superblock" << dendl;
// The specific error from read_superblock() is replaced by -EINVAL.
2442 r = -EINVAL;
2443 goto out;
2444 }
2445
2446 if (osd_compat.compare(superblock.compat_features) < 0) {
2447 derr << "The disk uses features unsupported by the executable." << dendl;
2448 derr << " ondisk features " << superblock.compat_features << dendl;
2449 derr << " daemon features " << osd_compat << dendl;
2450
// Both branches fail with -EOPNOTSUPP; they differ only in the message.
2451 if (osd_compat.writeable(superblock.compat_features)) {
2452 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2453 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2454 r = -EOPNOTSUPP;
2455 goto out;
2456 }
2457 else {
2458 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2459 derr << "Cannot write to disk! Missing features: " << diff << dendl;
2460 r = -EOPNOTSUPP;
2461 goto out;
2462 }
2463 }
2464
2465 assert_warn(whoami == superblock.whoami);
2466 if (whoami != superblock.whoami) {
2467 derr << "OSD::init: superblock says osd"
2468 << superblock.whoami << " but I am osd." << whoami << dendl;
2469 r = -EINVAL;
2470 goto out;
2471 }
2472
2473 initial = get_osd_initial_compat_set();
2474 diff = superblock.compat_features.unsupported(initial);
2475 if (superblock.compat_features.merge(initial)) {
2476 // We need to persist the new compat_set before we
2477 // do anything else
2478 dout(5) << "Upgrading superblock adding: " << diff << dendl;
2479 ObjectStore::Transaction t;
2480 write_superblock(t);
2481 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2482 if (r < 0)
2483 goto out;
2484 }
2485
2486 // make sure snap mapper object exists
2487 if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2488 dout(10) << "init creating/touching snapmapper object" << dendl;
2489 ObjectStore::Transaction t;
2490 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2491 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2492 if (r < 0)
2493 goto out;
2494 }
2495
2496 class_handler = new ClassHandler(cct);
2497 cls_initialize(class_handler);
2498
2499 if (cct->_conf->osd_open_classes_on_start) {
// This r shadows the function-level r; a class loading error is only logged.
2500 int r = class_handler->open_all_classes();
2501 if (r)
2502 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
2503 }
2504
2505 // load up "current" osdmap
// osdmap is expected to be unset at this point; an already-populated map
// is treated as an error.
2506 assert_warn(!osdmap);
2507 if (osdmap) {
2508 derr << "OSD::init: unable to read current osdmap" << dendl;
2509 r = -EINVAL;
2510 goto out;
2511 }
2512 osdmap = get_map(superblock.current_epoch);
2513 check_osdmap_features(store);
2514
2515 create_recoverystate_perf();
2516
2517 {
2518 epoch_t bind_epoch = osdmap->get_epoch();
2519 service.set_epochs(NULL, NULL, &bind_epoch);
2520 }
2521
2522 clear_temp_objects();
2523
2524 // initialize osdmap references in sharded wq
2525 op_shardedwq.prune_pg_waiters(osdmap, whoami);
2526
2527 // load up pgs (as they previously existed)
2528 load_pgs();
2529
2530 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
2531 dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
2532 op_prio_cutoff << "." << dendl;
2533
2534 create_logger();
2535
2536 // i'm ready!
2537 client_messenger->add_dispatcher_head(this);
2538 cluster_messenger->add_dispatcher_head(this);
2539
2540 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2541 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2542 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2543 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2544
2545 objecter_messenger->add_dispatcher_head(service.objecter);
2546
2547 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
2548 | CEPH_ENTITY_TYPE_MGR);
2549 r = monc->init();
2550 if (r < 0)
2551 goto out;
2552
2553 /**
2554 * FIXME: this is a placeholder implementation that unconditionally
2555 * sends every is_primary PG's stats every time we're called, unlike
2556 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2557 * This has equivalent cost to the existing worst case where all
2558 * PGs are busy and their stats are always enqueued for sending.
2559 */
// The callback builds an MPGStats message from the current OSD stats and
// the published stats of every primary PG, and as a side effect recomputes
// min_last_epoch_clean / min_last_epoch_clean_pgs.  Inside it, locks are
// taken in the order map_lock, min_last_epoch_clean_lock, pg_map_lock, then
// each PG's pg_stats_publish_lock.
2560 mgrc.set_pgstats_cb([this](){
2561 RWLock::RLocker l(map_lock);
2562
2563 utime_t had_for = ceph_clock_now() - had_map_since;
2564 osd_stat_t cur_stat = service.get_osd_stat();
2565 cur_stat.os_perf_stat = store->get_cur_stats();
2566
2567 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
2568 m->osd_stat = cur_stat;
2569
2570 Mutex::Locker lec{min_last_epoch_clean_lock};
2571 min_last_epoch_clean = osdmap->get_epoch();
2572 min_last_epoch_clean_pgs.clear();
2573 RWLock::RLocker lpg(pg_map_lock);
2574 for (const auto &i : pg_map) {
2575 PG *pg = i.second;
2576 if (!pg->is_primary()) {
2577 continue;
2578 }
2579
2580 pg->pg_stats_publish_lock.Lock();
2581 if (pg->pg_stats_publish_valid) {
2582 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
// This lec (an epoch) shadows the Mutex::Locker named lec above.
2583 const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
2584 min_last_epoch_clean = min(min_last_epoch_clean, lec);
2585 min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
2586 }
2587 pg->pg_stats_publish_lock.Unlock();
2588 }
2589
2590 return m;
2591 });
2592
2593 mgrc.init();
2594 client_messenger->add_dispatcher_head(&mgrc);
2595
2596 // tell monc about log_client so it will know about mon session resets
2597 monc->set_log_client(&log_client);
2598 update_log_config();
2599
2600 peering_tp.start();
2601 osd_op_tp.start();
2602 disk_tp.start();
2603 command_tp.start();
2604
2605 set_disk_tp_priority();
2606
2607 // start the heartbeat
2608 heartbeat_thread.create("osd_srv_heartbt");
2609
2610 // tick
2611 tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
2612 {
2613 Mutex::Locker l(tick_timer_lock);
2614 tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
2615 }
2616
2617 service.init();
2618 service.publish_map(osdmap);
2619 service.publish_superblock(superblock);
2620 service.max_oldest_map = superblock.oldest_map;
2621
// osd_lock is released by hand for the monitor calls below.  Every exit
// path from here on must re-take it first, since the scoped Locker at the
// top of the function will unlock it on return.
2622 osd_lock.Unlock();
2623
2624 r = monc->authenticate();
2625 if (r < 0) {
2626 derr << __func__ << " authentication failed: " << cpp_strerror(r)
2627 << dendl;
2628 osd_lock.Lock(); // locker is going to unlock this on function exit
2629 if (is_stopping())
2630 r = 0;
2631 goto monout;
2632 }
2633
2634 while (monc->wait_auth_rotating(30.0) < 0) {
2635 derr << "unable to obtain rotating service keys; retrying" << dendl;
2636 ++rotating_auth_attempts;
2637 if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
2638 derr << __func__ << " wait_auth_rotating timed out" << dendl;
2639 osd_lock.Lock(); // make locker happy
2640 if (!is_stopping()) {
2641 r = -ETIMEDOUT;
2642 }
2643 goto monout;
2644 }
2645 }
2646
2647 r = update_crush_device_class();
2648 if (r < 0) {
2649 derr << __func__ << " unable to update_crush_device_class: "
2650 << cpp_strerror(r) << dendl;
2651 osd_lock.Lock();
2652 goto monout;
2653 }
2654
2655 r = update_crush_location();
2656 if (r < 0) {
2657 derr << __func__ << " unable to update_crush_location: "
2658 << cpp_strerror(r) << dendl;
2659 osd_lock.Lock();
2660 goto monout;
2661 }
2662
2663 osd_lock.Lock();
2664 if (is_stopping())
2665 return 0;
2666
2667 // start objecter *after* we have authenticated, so that we don't ignore
2668 // the OSDMaps it requests.
2669 service.final_init();
2670
2671 check_config();
2672
2673 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
2674 consume_map();
2675 peering_wq.drain();
2676
2677 dout(0) << "done with init, starting boot process" << dendl;
2678
2679 // subscribe to any pg creations
2680 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
2681
2682 // MgrClient needs this (it doesn't have MonClient reference itself)
2683 monc->sub_want("mgrmap", 0, 0);
2684
2685 // we don't need to ask for an osdmap here; objecter will
2686 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2687
2688 monc->renew_subs();
2689
2690 start_boot();
2691
2692 return 0;
// Failure after the worker threads were started: the process exits with
// status 1 and the value left in r is not returned to the caller.
2693 monout:
2694 exit(1);
2695
// Failure before the worker threads were started: undo the fuse and object
// store setup and hand the error back.
2696 out:
2697 enable_disable_fuse(true);
2698 store->umount();
2699 delete store;
2700 store = NULL;
2701 return r;
2702 }
2703
2704 void OSD::final_init()
2705 {
2706 AdminSocket *admin_socket = cct->get_admin_socket();
2707 asok_hook = new OSDSocketHook(this);
2708 int r = admin_socket->register_command("status", "status", asok_hook,
2709 "high-level status of OSD");
2710 assert(r == 0);
2711 r = admin_socket->register_command("flush_journal", "flush_journal",
2712 asok_hook,
2713 "flush the journal to permanent store");
2714 assert(r == 0);
2715 r = admin_socket->register_command("dump_ops_in_flight",
2716 "dump_ops_in_flight " \
2717 "name=filterstr,type=CephString,n=N,req=false",
2718 asok_hook,
2719 "show the ops currently in flight");
2720 assert(r == 0);
2721 r = admin_socket->register_command("ops",
2722 "ops " \
2723 "name=filterstr,type=CephString,n=N,req=false",
2724 asok_hook,
2725 "show the ops currently in flight");
2726 assert(r == 0);
2727 r = admin_socket->register_command("dump_blocked_ops",
2728 "dump_blocked_ops " \
2729 "name=filterstr,type=CephString,n=N,req=false",
2730 asok_hook,
2731 "show the blocked ops currently in flight");
2732 assert(r == 0);
2733 r = admin_socket->register_command("dump_historic_ops",
2734 "dump_historic_ops " \
2735 "name=filterstr,type=CephString,n=N,req=false",
2736 asok_hook,
2737 "show recent ops");
2738 assert(r == 0);
2739 r = admin_socket->register_command("dump_historic_slow_ops",
2740 "dump_historic_slow_ops " \
2741 "name=filterstr,type=CephString,n=N,req=false",
2742 asok_hook,
2743 "show slowest recent ops");
2744 assert(r == 0);
2745 r = admin_socket->register_command("dump_historic_ops_by_duration",
2746 "dump_historic_ops_by_duration " \
2747 "name=filterstr,type=CephString,n=N,req=false",
2748 asok_hook,
2749 "show slowest recent ops, sorted by duration");
2750 assert(r == 0);
2751 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2752 asok_hook,
2753 "dump op priority queue state");
2754 assert(r == 0);
2755 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2756 asok_hook,
2757 "dump blacklisted clients and times");
2758 assert(r == 0);
2759 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2760 asok_hook,
2761 "show clients which have active watches,"
2762 " and on which objects");
2763 assert(r == 0);
2764 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2765 asok_hook,
2766 "show recovery reservations");
2767 assert(r == 0);
2768 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2769 asok_hook,
2770 "force osd to update the latest map from "
2771 "the mon");
2772 assert(r == 0);
2773
2774 r = admin_socket->register_command( "heap",
2775 "heap " \
2776 "name=heapcmd,type=CephString",
2777 asok_hook,
2778 "show heap usage info (available only if "
2779 "compiled with tcmalloc)");
2780 assert(r == 0);
2781
2782 r = admin_socket->register_command("set_heap_property",
2783 "set_heap_property " \
2784 "name=property,type=CephString " \
2785 "name=value,type=CephInt",
2786 asok_hook,
2787 "update malloc extension heap property");
2788 assert(r == 0);
2789
2790 r = admin_socket->register_command("get_heap_property",
2791 "get_heap_property " \
2792 "name=property,type=CephString",
2793 asok_hook,
2794 "get malloc extension heap property");
2795 assert(r == 0);
2796
2797 r = admin_socket->register_command("dump_objectstore_kv_stats",
2798 "dump_objectstore_kv_stats",
2799 asok_hook,
2800 "print statistics of kvdb which used by bluestore");
2801 assert(r == 0);
2802
2803 r = admin_socket->register_command("dump_scrubs",
2804 "dump_scrubs",
2805 asok_hook,
2806 "print scheduled scrubs");
2807 assert(r == 0);
2808
2809 r = admin_socket->register_command("calc_objectstore_db_histogram",
2810 "calc_objectstore_db_histogram",
2811 asok_hook,
2812 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2813 assert(r == 0);
2814
2815 r = admin_socket->register_command("flush_store_cache",
2816 "flush_store_cache",
2817 asok_hook,
2818 "Flush bluestore internal cache");
2819 assert(r == 0);
2820 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2821 asok_hook,
2822 "show recent state history");
2823 assert(r == 0);
2824
2825 r = admin_socket->register_command("compact", "compact",
2826 asok_hook,
2827 "Commpact object store's omap."
2828 " WARNING: Compaction probably slows your requests");
2829 assert(r == 0);
2830
2831 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2832 // Note: pools are CephString instead of CephPoolname because
2833 // these commands traditionally support both pool names and numbers
2834 r = admin_socket->register_command(
2835 "setomapval",
2836 "setomapval " \
2837 "name=pool,type=CephString " \
2838 "name=objname,type=CephObjectname " \
2839 "name=key,type=CephString "\
2840 "name=val,type=CephString",
2841 test_ops_hook,
2842 "set omap key");
2843 assert(r == 0);
2844 r = admin_socket->register_command(
2845 "rmomapkey",
2846 "rmomapkey " \
2847 "name=pool,type=CephString " \
2848 "name=objname,type=CephObjectname " \
2849 "name=key,type=CephString",
2850 test_ops_hook,
2851 "remove omap key");
2852 assert(r == 0);
2853 r = admin_socket->register_command(
2854 "setomapheader",
2855 "setomapheader " \
2856 "name=pool,type=CephString " \
2857 "name=objname,type=CephObjectname " \
2858 "name=header,type=CephString",
2859 test_ops_hook,
2860 "set omap header");
2861 assert(r == 0);
2862
2863 r = admin_socket->register_command(
2864 "getomap",
2865 "getomap " \
2866 "name=pool,type=CephString " \
2867 "name=objname,type=CephObjectname",
2868 test_ops_hook,
2869 "output entire object map");
2870 assert(r == 0);
2871
2872 r = admin_socket->register_command(
2873 "truncobj",
2874 "truncobj " \
2875 "name=pool,type=CephString " \
2876 "name=objname,type=CephObjectname " \
2877 "name=len,type=CephInt",
2878 test_ops_hook,
2879 "truncate object to length");
2880 assert(r == 0);
2881
2882 r = admin_socket->register_command(
2883 "injectdataerr",
2884 "injectdataerr " \
2885 "name=pool,type=CephString " \
2886 "name=objname,type=CephObjectname " \
2887 "name=shardid,type=CephInt,req=false,range=0|255",
2888 test_ops_hook,
2889 "inject data error to an object");
2890 assert(r == 0);
2891
2892 r = admin_socket->register_command(
2893 "injectmdataerr",
2894 "injectmdataerr " \
2895 "name=pool,type=CephString " \
2896 "name=objname,type=CephObjectname " \
2897 "name=shardid,type=CephInt,req=false,range=0|255",
2898 test_ops_hook,
2899 "inject metadata error to an object");
2900 assert(r == 0);
2901 r = admin_socket->register_command(
2902 "set_recovery_delay",
2903 "set_recovery_delay " \
2904 "name=utime,type=CephInt,req=false",
2905 test_ops_hook,
2906 "Delay osd recovery by specified seconds");
2907 assert(r == 0);
2908 r = admin_socket->register_command(
2909 "trigger_scrub",
2910 "trigger_scrub " \
2911 "name=pgid,type=CephString ",
2912 test_ops_hook,
2913 "Trigger a scheduled scrub ");
2914 assert(r == 0);
2915 r = admin_socket->register_command(
2916 "injectfull",
2917 "injectfull " \
2918 "name=type,type=CephString,req=false " \
2919 "name=count,type=CephInt,req=false ",
2920 test_ops_hook,
2921 "Inject a full disk (optional count times)");
2922 assert(r == 0);
2923 }
2924
2925 void OSD::create_logger()
2926 {
2927 dout(10) << "create_logger" << dendl;
2928
2929 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
2930
2931 // Latency axis configuration for op histograms, values are in nanoseconds
2932 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
2933 "Latency (usec)",
2934 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
2935 0, ///< Start at 0
2936 100000, ///< Quantization unit is 100usec
2937 32, ///< Enough to cover much longer than slow requests
2938 };
2939
2940 // Op size axis configuration for op histograms, values are in bytes
2941 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
2942 "Request size (bytes)",
2943 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
2944 0, ///< Start at 0
2945 512, ///< Quantization unit is 512 bytes
2946 32, ///< Enough to cover requests larger than GB
2947 };
2948
2949
2950 osd_plb.add_u64(
2951 l_osd_op_wip, "op_wip",
2952 "Replication operations currently being processed (primary)");
2953 osd_plb.add_u64_counter(
2954 l_osd_op, "op",
2955 "Client operations",
2956 "ops", PerfCountersBuilder::PRIO_CRITICAL);
2957 osd_plb.add_u64_counter(
2958 l_osd_op_inb, "op_in_bytes",
2959 "Client operations total write size",
2960 "wr", PerfCountersBuilder::PRIO_INTERESTING);
2961 osd_plb.add_u64_counter(
2962 l_osd_op_outb, "op_out_bytes",
2963 "Client operations total read size",
2964 "rd", PerfCountersBuilder::PRIO_INTERESTING);
2965 osd_plb.add_time_avg(
2966 l_osd_op_lat, "op_latency",
2967 "Latency of client operations (including queue time)",
2968 "l", 9);
2969 osd_plb.add_time_avg(
2970 l_osd_op_process_lat, "op_process_latency",
2971 "Latency of client operations (excluding queue time)");
2972 osd_plb.add_time_avg(
2973 l_osd_op_prepare_lat, "op_prepare_latency",
2974 "Latency of client operations (excluding queue time and wait for finished)");
2975
2976 osd_plb.add_u64_counter(
2977 l_osd_op_r, "op_r", "Client read operations");
2978 osd_plb.add_u64_counter(
2979 l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
2980 osd_plb.add_time_avg(
2981 l_osd_op_r_lat, "op_r_latency",
2982 "Latency of read operation (including queue time)");
2983 osd_plb.add_u64_counter_histogram(
2984 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
2985 op_hist_x_axis_config, op_hist_y_axis_config,
2986 "Histogram of operation latency (including queue time) + data read");
2987 osd_plb.add_time_avg(
2988 l_osd_op_r_process_lat, "op_r_process_latency",
2989 "Latency of read operation (excluding queue time)");
2990 osd_plb.add_time_avg(
2991 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
2992 "Latency of read operations (excluding queue time and wait for finished)");
2993 osd_plb.add_u64_counter(
2994 l_osd_op_w, "op_w", "Client write operations");
2995 osd_plb.add_u64_counter(
2996 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
2997 osd_plb.add_time_avg(
2998 l_osd_op_w_lat, "op_w_latency",
2999 "Latency of write operation (including queue time)");
3000 osd_plb.add_u64_counter_histogram(
3001 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3002 op_hist_x_axis_config, op_hist_y_axis_config,
3003 "Histogram of operation latency (including queue time) + data written");
3004 osd_plb.add_time_avg(
3005 l_osd_op_w_process_lat, "op_w_process_latency",
3006 "Latency of write operation (excluding queue time)");
3007 osd_plb.add_time_avg(
3008 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3009 "Latency of write operations (excluding queue time and wait for finished)");
3010 osd_plb.add_u64_counter(
3011 l_osd_op_rw, "op_rw",
3012 "Client read-modify-write operations");
3013 osd_plb.add_u64_counter(
3014 l_osd_op_rw_inb, "op_rw_in_bytes",
3015 "Client read-modify-write operations write in");
3016 osd_plb.add_u64_counter(
3017 l_osd_op_rw_outb,"op_rw_out_bytes",
3018 "Client read-modify-write operations read out ");
3019 osd_plb.add_time_avg(
3020 l_osd_op_rw_lat, "op_rw_latency",
3021 "Latency of read-modify-write operation (including queue time)");
3022 osd_plb.add_u64_counter_histogram(
3023 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3024 op_hist_x_axis_config, op_hist_y_axis_config,
3025 "Histogram of rw operation latency (including queue time) + data written");
3026 osd_plb.add_u64_counter_histogram(
3027 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3028 op_hist_x_axis_config, op_hist_y_axis_config,
3029 "Histogram of rw operation latency (including queue time) + data read");
3030 osd_plb.add_time_avg(
3031 l_osd_op_rw_process_lat, "op_rw_process_latency",
3032 "Latency of read-modify-write operation (excluding queue time)");
3033 osd_plb.add_time_avg(
3034 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3035 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3036
3037 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3038 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3039 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3040 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3041
3042 osd_plb.add_u64_counter(
3043 l_osd_sop, "subop", "Suboperations");
3044 osd_plb.add_u64_counter(
3045 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
3046 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3047
3048 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3049 osd_plb.add_u64_counter(
3050 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
3051 osd_plb.add_time_avg(
3052 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3053 osd_plb.add_u64_counter(
3054 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3055 osd_plb.add_time_avg(
3056 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3057 osd_plb.add_u64_counter(
3058 l_osd_sop_push, "subop_push", "Suboperations push messages");
3059 osd_plb.add_u64_counter(
3060 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
3061 osd_plb.add_time_avg(
3062 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3063
3064 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3065 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3066 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
3067
3068 osd_plb.add_u64_counter(
3069 l_osd_rop, "recovery_ops",
3070 "Started recovery operations",
3071 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3072
3073 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3074 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
3075 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
3076 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3077 osd_plb.add_u64(
3078 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3079 osd_plb.add_u64(
3080 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3081 "Total number getting crc from crc_cache with adjusting");
3082 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3083 "Total number of crc cache misses");
3084
3085 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3086 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3087 osd_plb.add_u64(
3088 l_osd_pg_primary, "numpg_primary",
3089 "Placement groups for which this osd is primary");
3090 osd_plb.add_u64(
3091 l_osd_pg_replica, "numpg_replica",
3092 "Placement groups for which this osd is replica");
3093 osd_plb.add_u64(
3094 l_osd_pg_stray, "numpg_stray",
3095 "Placement groups ready to be deleted from this osd");
3096 osd_plb.add_u64(
3097 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3098 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3099 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3100 osd_plb.add_u64_counter(
3101 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3102 osd_plb.add_u64_counter(
3103 l_osd_waiting_for_map, "messages_delayed_for_map",
3104 "Operations waiting for OSD map");
3105
3106 osd_plb.add_u64_counter(
3107 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3108 osd_plb.add_u64_counter(
3109 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3110 osd_plb.add_u64_counter(
3111 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3112 "osdmap cache miss below cache lower bound");
3113 osd_plb.add_u64_avg(
3114 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3115 "osdmap cache miss, avg distance below cache lower bound");
3116 osd_plb.add_u64_counter(
3117 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3118 "OSDMap buffer cache hits");
3119 osd_plb.add_u64_counter(
3120 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3121 "OSDMap buffer cache misses");
3122
3123 osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
3124 osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
3125 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
3126
3127 osd_plb.add_u64_counter(
3128 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3129
3130 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3131 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3132 osd_plb.add_u64_counter(
3133 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3134 osd_plb.add_u64_counter(
3135 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3136 osd_plb.add_u64_counter(
3137 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3138 "Failed tier flush attempts");
3139 osd_plb.add_u64_counter(
3140 l_osd_tier_evict, "tier_evict", "Tier evictions");
3141 osd_plb.add_u64_counter(
3142 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3143 osd_plb.add_u64_counter(
3144 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3145 osd_plb.add_u64_counter(
3146 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3147 osd_plb.add_u64_counter(
3148 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3149 osd_plb.add_u64_counter(
3150 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3151 osd_plb.add_u64_counter(
3152 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3153
3154 osd_plb.add_u64_counter(
3155 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3156 osd_plb.add_u64_counter(
3157 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3158 osd_plb.add_u64_counter(
3159 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3160 osd_plb.add_u64_counter(
3161 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3162
3163 osd_plb.add_u64_counter(
3164 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3165 osd_plb.add_u64_counter(
3166 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3167
3168 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3169 osd_plb.add_time_avg(
3170 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3171 osd_plb.add_time_avg(
3172 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3173 osd_plb.add_time_avg(
3174 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3175
3176 osd_plb.add_u64_counter(
3177 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3178 osd_plb.add_u64_counter(
3179 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3180 "PG updated its info using fastinfo attr");
3181 osd_plb.add_u64_counter(
3182 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3183
3184 logger = osd_plb.create_perf_counters();
3185 cct->get_perfcounters_collection()->add(logger);
3186 }
3187
3188 void OSD::create_recoverystate_perf()
3189 {
3190 dout(10) << "create_recoverystate_perf" << dendl;
3191
3192 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3193
3194 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3195 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3196 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3197 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3198 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3199 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3200 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3201 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3202 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3203 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3204 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3205 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3206 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3207 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3208 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3209 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3210 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3211 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3212 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3213 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3214 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3215 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3216 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3217 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3218 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3219 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3220 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3221 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3222 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3223 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3224 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3225
3226 recoverystate_perf = rs_perf.create_perf_counters();
3227 cct->get_perfcounters_collection()->add(recoverystate_perf);
3228 }
3229
3230 int OSD::shutdown()
3231 {
3232 if (!service.prepare_to_stop())
3233 return 0; // already shutting down
3234 osd_lock.Lock();
3235 if (is_stopping()) {
3236 osd_lock.Unlock();
3237 return 0;
3238 }
3239 derr << "shutdown" << dendl;
3240
3241 set_state(STATE_STOPPING);
3242
3243 // Debugging
3244 cct->_conf->set_val("debug_osd", "100");
3245 cct->_conf->set_val("debug_journal", "100");
3246 cct->_conf->set_val("debug_filestore", "100");
3247 cct->_conf->set_val("debug_ms", "100");
3248 cct->_conf->apply_changes(NULL);
3249
3250 // stop MgrClient earlier as it's more like an internal consumer of OSD
3251 mgrc.shutdown();
3252
3253 service.start_shutdown();
3254
3255 // stop sending work to pgs. this just prevents any new work in _process
3256 // from racing with on_shutdown and potentially entering the pg after.
3257 op_shardedwq.drain();
3258
3259 // Shutdown PGs
3260 {
3261 RWLock::RLocker l(pg_map_lock);
3262 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3263 p != pg_map.end();
3264 ++p) {
3265 dout(20) << " kicking pg " << p->first << dendl;
3266 p->second->lock();
3267 p->second->on_shutdown();
3268 p->second->unlock();
3269 p->second->osr->flush();
3270 }
3271 }
3272 clear_pg_stat_queue();
3273
3274 // drain op queue again (in case PGs requeued something)
3275 op_shardedwq.drain();
3276 {
3277 finished.clear(); // zap waiters (bleh, this is messy)
3278 }
3279
3280 op_shardedwq.clear_pg_slots();
3281
3282 // unregister commands
3283 cct->get_admin_socket()->unregister_command("status");
3284 cct->get_admin_socket()->unregister_command("flush_journal");
3285 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3286 cct->get_admin_socket()->unregister_command("ops");
3287 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3288 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3289 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3290 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3291 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3292 cct->get_admin_socket()->unregister_command("dump_blacklist");
3293 cct->get_admin_socket()->unregister_command("dump_watchers");
3294 cct->get_admin_socket()->unregister_command("dump_reservations");
3295 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
3296 cct->get_admin_socket()->unregister_command("heap");
3297 cct->get_admin_socket()->unregister_command("set_heap_property");
3298 cct->get_admin_socket()->unregister_command("get_heap_property");
3299 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
3300 cct->get_admin_socket()->unregister_command("dump_scrubs");
3301 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3302 cct->get_admin_socket()->unregister_command("flush_store_cache");
3303 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
3304 cct->get_admin_socket()->unregister_command("compact");
3305 delete asok_hook;
3306 asok_hook = NULL;
3307
3308 cct->get_admin_socket()->unregister_command("setomapval");
3309 cct->get_admin_socket()->unregister_command("rmomapkey");
3310 cct->get_admin_socket()->unregister_command("setomapheader");
3311 cct->get_admin_socket()->unregister_command("getomap");
3312 cct->get_admin_socket()->unregister_command("truncobj");
3313 cct->get_admin_socket()->unregister_command("injectdataerr");
3314 cct->get_admin_socket()->unregister_command("injectmdataerr");
3315 cct->get_admin_socket()->unregister_command("set_recovery_delay");
3316 cct->get_admin_socket()->unregister_command("trigger_scrub");
3317 cct->get_admin_socket()->unregister_command("injectfull");
3318 delete test_ops_hook;
3319 test_ops_hook = NULL;
3320
3321 osd_lock.Unlock();
3322
3323 heartbeat_lock.Lock();
3324 heartbeat_stop = true;
3325 heartbeat_cond.Signal();
3326 heartbeat_lock.Unlock();
3327 heartbeat_thread.join();
3328
3329 peering_tp.drain();
3330 peering_wq.clear();
3331 peering_tp.stop();
3332 dout(10) << "osd tp stopped" << dendl;
3333
3334 osd_op_tp.drain();
3335 osd_op_tp.stop();
3336 dout(10) << "op sharded tp stopped" << dendl;
3337
3338 command_tp.drain();
3339 command_tp.stop();
3340 dout(10) << "command tp stopped" << dendl;
3341
3342 disk_tp.drain();
3343 disk_tp.stop();
3344 dout(10) << "disk tp paused (new)" << dendl;
3345
3346 dout(10) << "stopping agent" << dendl;
3347 service.agent_stop();
3348
3349 osd_lock.Lock();
3350
3351 reset_heartbeat_peers();
3352
3353 tick_timer.shutdown();
3354
3355 {
3356 Mutex::Locker l(tick_timer_lock);
3357 tick_timer_without_osd_lock.shutdown();
3358 }
3359
3360 // note unmount epoch
3361 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3362 superblock.mounted = service.get_boot_epoch();
3363 superblock.clean_thru = osdmap->get_epoch();
3364 ObjectStore::Transaction t;
3365 write_superblock(t);
3366 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3367 if (r) {
3368 derr << "OSD::shutdown: error writing superblock: "
3369 << cpp_strerror(r) << dendl;
3370 }
3371
3372
3373 {
3374 Mutex::Locker l(pg_stat_queue_lock);
3375 assert(pg_stat_queue.empty());
3376 }
3377
3378 service.shutdown_reserver();
3379
3380 // Remove PGs
3381 #ifdef PG_DEBUG_REFS
3382 service.dump_live_pgids();
3383 #endif
3384 {
3385 RWLock::RLocker l(pg_map_lock);
3386 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3387 p != pg_map.end();
3388 ++p) {
3389 dout(20) << " kicking pg " << p->first << dendl;
3390 p->second->lock();
3391 if (p->second->ref != 1) {
3392 derr << "pgid " << p->first << " has ref count of "
3393 << p->second->ref << dendl;
3394 #ifdef PG_DEBUG_REFS
3395 p->second->dump_live_ids();
3396 #endif
3397 if (cct->_conf->osd_shutdown_pgref_assert) {
3398 ceph_abort();
3399 }
3400 }
3401 p->second->unlock();
3402 p->second->put("PGMap");
3403 }
3404 pg_map.clear();
3405 }
3406 #ifdef PG_DEBUG_REFS
3407 service.dump_live_pgids();
3408 #endif
3409 cct->_conf->remove_observer(this);
3410
3411 dout(10) << "syncing store" << dendl;
3412 enable_disable_fuse(true);
3413
3414 if (cct->_conf->osd_journal_flush_on_shutdown) {
3415 dout(10) << "flushing journal" << dendl;
3416 store->flush_journal();
3417 }
3418
3419 store->umount();
3420 delete store;
3421 store = 0;
3422 dout(10) << "Store synced" << dendl;
3423
3424 monc->shutdown();
3425 osd_lock.Unlock();
3426
3427 osdmap = OSDMapRef();
3428 service.shutdown();
3429 op_tracker.on_shutdown();
3430
3431 class_handler->shutdown();
3432 client_messenger->shutdown();
3433 cluster_messenger->shutdown();
3434 hb_front_client_messenger->shutdown();
3435 hb_back_client_messenger->shutdown();
3436 objecter_messenger->shutdown();
3437 hb_front_server_messenger->shutdown();
3438 hb_back_server_messenger->shutdown();
3439
3440 peering_wq.clear();
3441
3442 return r;
3443 }
3444
3445 int OSD::mon_cmd_maybe_osd_create(string &cmd)
3446 {
3447 bool created = false;
3448 while (true) {
3449 dout(10) << __func__ << " cmd: " << cmd << dendl;
3450 vector<string> vcmd{cmd};
3451 bufferlist inbl;
3452 C_SaferCond w;
3453 string outs;
3454 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3455 int r = w.wait();
3456 if (r < 0) {
3457 if (r == -ENOENT && !created) {
3458 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3459 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3460 vector<string> vnewcmd{newcmd};
3461 bufferlist inbl;
3462 C_SaferCond w;
3463 string outs;
3464 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3465 int r = w.wait();
3466 if (r < 0) {
3467 derr << __func__ << " fail: osd does not exist and created failed: "
3468 << cpp_strerror(r) << dendl;
3469 return r;
3470 }
3471 created = true;
3472 continue;
3473 }
3474 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3475 return r;
3476 }
3477 break;
3478 }
3479
3480 return 0;
3481 }
3482
3483 int OSD::update_crush_location()
3484 {
3485 if (!cct->_conf->osd_crush_update_on_start) {
3486 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3487 return 0;
3488 }
3489
3490 char weight[32];
3491 if (cct->_conf->osd_crush_initial_weight >= 0) {
3492 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3493 } else {
3494 struct store_statfs_t st;
3495 int r = store->statfs(&st);
3496 if (r < 0) {
3497 derr << "statfs: " << cpp_strerror(r) << dendl;
3498 return r;
3499 }
3500 snprintf(weight, sizeof(weight), "%.4lf",
3501 MAX((double).00001,
3502 (double)(st.total) /
3503 (double)(1ull << 40 /* TB */)));
3504 }
3505
3506 std::multimap<string,string> loc = cct->crush_location.get_location();
3507 dout(10) << __func__ << " crush location is " << loc << dendl;
3508
3509 string cmd =
3510 string("{\"prefix\": \"osd crush create-or-move\", ") +
3511 string("\"id\": ") + stringify(whoami) + string(", ") +
3512 string("\"weight\":") + weight + string(", ") +
3513 string("\"args\": [");
3514 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3515 if (p != loc.begin())
3516 cmd += ", ";
3517 cmd += "\"" + p->first + "=" + p->second + "\"";
3518 }
3519 cmd += "]}";
3520
3521 return mon_cmd_maybe_osd_create(cmd);
3522 }
3523
3524 int OSD::update_crush_device_class()
3525 {
3526 if (!cct->_conf->osd_class_update_on_start) {
3527 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
3528 return 0;
3529 }
3530
3531 string device_class;
3532 int r = store->read_meta("crush_device_class", &device_class);
3533 if (r < 0 || device_class.empty()) {
3534 device_class = store->get_default_device_class();
3535 }
3536
3537 if (device_class.empty()) {
3538 dout(20) << __func__ << " no device class stored locally" << dendl;
3539 return 0;
3540 }
3541
3542 string cmd =
3543 string("{\"prefix\": \"osd crush set-device-class\", ") +
3544 string("\"class\": \"") + device_class + string("\", ") +
3545 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
3546
3547 r = mon_cmd_maybe_osd_create(cmd);
3548 // the above cmd can fail for various reasons, e.g.:
3549 // (1) we are connecting to a pre-luminous monitor
3550 // (2) user manually specify a class other than
3551 // 'ceph-disk prepare --crush-device-class'
3552 // simply skip result-checking for now
3553 return 0;
3554 }
3555
3556 void OSD::write_superblock(ObjectStore::Transaction& t)
3557 {
3558 dout(10) << "write_superblock " << superblock << dendl;
3559
3560 //hack: at minimum it's using the baseline feature set
3561 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3562 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3563
3564 bufferlist bl;
3565 ::encode(superblock, bl);
3566 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3567 }
3568
3569 int OSD::read_superblock()
3570 {
3571 bufferlist bl;
3572 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3573 if (r < 0)
3574 return r;
3575
3576 bufferlist::iterator p = bl.begin();
3577 ::decode(superblock, p);
3578
3579 dout(10) << "read_superblock " << superblock << dendl;
3580
3581 return 0;
3582 }
3583
3584 void OSD::clear_temp_objects()
3585 {
3586 dout(10) << __func__ << dendl;
3587 vector<coll_t> ls;
3588 store->list_collections(ls);
3589 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
3590 spg_t pgid;
3591 if (!p->is_pg(&pgid))
3592 continue;
3593
3594 // list temp objects
3595 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
3596
3597 vector<ghobject_t> temps;
3598 ghobject_t next;
3599 while (1) {
3600 vector<ghobject_t> objects;
3601 store->collection_list(*p, next, ghobject_t::get_max(),
3602 store->get_ideal_list_max(),
3603 &objects, &next);
3604 if (objects.empty())
3605 break;
3606 vector<ghobject_t>::iterator q;
3607 for (q = objects.begin(); q != objects.end(); ++q) {
3608 // Hammer set pool for temps to -1, so check for clean-up
3609 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
3610 temps.push_back(*q);
3611 } else {
3612 break;
3613 }
3614 }
3615 // If we saw a non-temp object and hit the break above we can
3616 // break out of the while loop too.
3617 if (q != objects.end())
3618 break;
3619 }
3620 if (!temps.empty()) {
3621 ObjectStore::Transaction t;
3622 int removed = 0;
3623 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
3624 dout(20) << " removing " << *p << " object " << *q << dendl;
3625 t.remove(*p, *q);
3626 if (++removed > cct->_conf->osd_target_transaction_size) {
3627 store->apply_transaction(service.meta_osr.get(), std::move(t));
3628 t = ObjectStore::Transaction();
3629 removed = 0;
3630 }
3631 }
3632 if (removed) {
3633 store->apply_transaction(service.meta_osr.get(), std::move(t));
3634 }
3635 }
3636 }
3637 }
3638
3639 void OSD::recursive_remove_collection(CephContext* cct,
3640 ObjectStore *store, spg_t pgid,
3641 coll_t tmp)
3642 {
3643 OSDriver driver(
3644 store,
3645 coll_t(),
3646 make_snapmapper_oid());
3647
3648 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3649 ObjectStore::Sequencer>("rm"));
3650 ObjectStore::Transaction t;
3651 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
3652
3653 vector<ghobject_t> objects;
3654 store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
3655 INT_MAX, &objects, 0);
3656 generic_dout(10) << __func__ << " " << objects << dendl;
3657 // delete them.
3658 int removed = 0;
3659 for (vector<ghobject_t>::iterator p = objects.begin();
3660 p != objects.end();
3661 ++p, removed++) {
3662 OSDriver::OSTransaction _t(driver.get_transaction(&t));
3663 int r = mapper.remove_oid(p->hobj, &_t);
3664 if (r != 0 && r != -ENOENT)
3665 ceph_abort();
3666 t.remove(tmp, *p);
3667 if (removed > cct->_conf->osd_target_transaction_size) {
3668 int r = store->apply_transaction(osr.get(), std::move(t));
3669 assert(r == 0);
3670 t = ObjectStore::Transaction();
3671 removed = 0;
3672 }
3673 }
3674 t.remove_collection(tmp);
3675 int r = store->apply_transaction(osr.get(), std::move(t));
3676 assert(r == 0);
3677
3678 C_SaferCond waiter;
3679 if (!osr->flush_commit(&waiter)) {
3680 waiter.wait();
3681 }
3682 }
3683
3684
3685 // ======================================================
3686 // PG's
3687
3688 PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3689 {
3690 if (!createmap->have_pg_pool(id)) {
3691 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3692 << id << dendl;
3693 ceph_abort();
3694 }
3695
3696 PGPool p = PGPool(cct, createmap, id);
3697
3698 dout(10) << "_get_pool " << p.id << dendl;
3699 return p;
3700 }
3701
3702 PG *OSD::_open_lock_pg(
3703 OSDMapRef createmap,
3704 spg_t pgid, bool no_lockdep_check)
3705 {
3706 assert(osd_lock.is_locked());
3707
3708 PG* pg = _make_pg(createmap, pgid);
3709 {
3710 RWLock::WLocker l(pg_map_lock);
3711 pg->lock(no_lockdep_check);
3712 pg_map[pgid] = pg;
3713 pg->get("PGMap"); // because it's in pg_map
3714 service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
3715 }
3716 return pg;
3717 }
3718
3719 PG* OSD::_make_pg(
3720 OSDMapRef createmap,
3721 spg_t pgid)
3722 {
3723 dout(10) << "_open_lock_pg " << pgid << dendl;
3724 PGPool pool = _get_pool(pgid.pool(), createmap);
3725
3726 // create
3727 PG *pg;
3728 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3729 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3730 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3731 else
3732 ceph_abort();
3733
3734 return pg;
3735 }
3736
3737
3738 void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
3739 {
3740 epoch_t e(service.get_osdmap()->get_epoch());
3741 pg->get("PGMap"); // For pg_map
3742 pg_map[pg->info.pgid] = pg;
3743 service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
3744
3745 dout(10) << "Adding newly split pg " << *pg << dendl;
3746 pg->handle_loaded(rctx);
3747 pg->write_if_dirty(*(rctx->transaction));
3748 pg->queue_null(e, e);
3749 map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
3750 peering_wait_for_split.find(pg->info.pgid);
3751 if (to_wake != peering_wait_for_split.end()) {
3752 for (list<PG::CephPeeringEvtRef>::iterator i =
3753 to_wake->second.begin();
3754 i != to_wake->second.end();
3755 ++i) {
3756 pg->queue_peering_event(*i);
3757 }
3758 peering_wait_for_split.erase(to_wake);
3759 }
3760 if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
3761 _remove_pg(pg);
3762 }
3763
3764 OSD::res_result OSD::_try_resurrect_pg(
3765 OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
3766 {
3767 assert(resurrected);
3768 assert(old_pg_state);
3769 // find nearest ancestor
3770 DeletingStateRef df;
3771 spg_t cur(pgid);
3772 while (true) {
3773 df = service.deleting_pgs.lookup(cur);
3774 if (df)
3775 break;
3776 if (!cur.ps())
3777 break;
3778 cur = cur.get_parent();
3779 }
3780 if (!df)
3781 return RES_NONE; // good to go
3782
3783 df->old_pg_state->lock();
3784 OSDMapRef create_map = df->old_pg_state->get_osdmap();
3785 df->old_pg_state->unlock();
3786
3787 set<spg_t> children;
3788 if (cur == pgid) {
3789 if (df->try_stop_deletion()) {
3790 dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
3791 *resurrected = cur;
3792 *old_pg_state = df->old_pg_state;
3793 service.deleting_pgs.remove(pgid); // PG is no longer being removed!
3794 return RES_SELF;
3795 } else {
3796 // raced, ensure we don't see DeletingStateRef when we try to
3797 // delete this pg
3798 service.deleting_pgs.remove(pgid);
3799 return RES_NONE;
3800 }
3801 } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
3802 curmap->get_pg_num(cur.pool()),
3803 &children) &&
3804 children.count(pgid)) {
3805 if (df->try_stop_deletion()) {
3806 dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
3807 << dendl;
3808 *resurrected = cur;
3809 *old_pg_state = df->old_pg_state;
3810 service.deleting_pgs.remove(cur); // PG is no longer being removed!
3811 return RES_PARENT;
3812 } else {
3813 /* this is not a problem, failing to cancel proves that all objects
3814 * have been removed, so no hobject_t overlap is possible
3815 */
3816 return RES_NONE;
3817 }
3818 }
3819 return RES_NONE;
3820 }
3821
3822 PG *OSD::_create_lock_pg(
3823 OSDMapRef createmap,
3824 spg_t pgid,
3825 bool hold_map_lock,
3826 bool backfill,
3827 int role,
3828 vector<int>& up, int up_primary,
3829 vector<int>& acting, int acting_primary,
3830 pg_history_t history,
3831 const PastIntervals& pi,
3832 ObjectStore::Transaction& t)
3833 {
3834 assert(osd_lock.is_locked());
3835 dout(20) << "_create_lock_pg pgid " << pgid << dendl;
3836
3837 PG *pg = _open_lock_pg(createmap, pgid, true);
3838
3839 service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
3840
3841 pg->init(
3842 role,
3843 up,
3844 up_primary,
3845 acting,
3846 acting_primary,
3847 history,
3848 pi,
3849 backfill,
3850 &t);
3851
3852 dout(7) << "_create_lock_pg " << *pg << dendl;
3853 return pg;
3854 }
3855
3856 PG *OSD::_lookup_lock_pg(spg_t pgid)
3857 {
3858 RWLock::RLocker l(pg_map_lock);
3859
3860 auto pg_map_entry = pg_map.find(pgid);
3861 if (pg_map_entry == pg_map.end())
3862 return nullptr;
3863 PG *pg = pg_map_entry->second;
3864 pg->lock();
3865 return pg;
3866 }
3867
3868 PG *OSD::lookup_lock_pg(spg_t pgid)
3869 {
3870 return _lookup_lock_pg(pgid);
3871 }
3872
3873 PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3874 {
3875 assert(pg_map.count(pgid));
3876 PG *pg = pg_map[pgid];
3877 pg->lock();
3878 return pg;
3879 }
3880
3881 void OSD::load_pgs()
3882 {
3883 assert(osd_lock.is_locked());
3884 dout(0) << "load_pgs" << dendl;
3885 {
3886 RWLock::RLocker l(pg_map_lock);
3887 assert(pg_map.empty());
3888 }
3889
3890 vector<coll_t> ls;
3891 int r = store->list_collections(ls);
3892 if (r < 0) {
3893 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
3894 }
3895
3896 bool has_upgraded = false;
3897
3898 for (vector<coll_t>::iterator it = ls.begin();
3899 it != ls.end();
3900 ++it) {
3901 spg_t pgid;
3902 if (it->is_temp(&pgid) ||
3903 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
3904 dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
3905 recursive_remove_collection(cct, store, pgid, *it);
3906 continue;
3907 }
3908
3909 if (!it->is_pg(&pgid)) {
3910 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
3911 continue;
3912 }
3913
3914 if (pgid.preferred() >= 0) {
3915 dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
3916 // FIXME: delete it too, eventually
3917 continue;
3918 }
3919
3920 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
3921 bufferlist bl;
3922 epoch_t map_epoch = 0;
3923 int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
3924 if (r < 0) {
3925 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
3926 << dendl;
3927 continue;
3928 }
3929
3930 PG *pg = NULL;
3931 if (map_epoch > 0) {
3932 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
3933 if (!pgosdmap) {
3934 if (!osdmap->have_pg_pool(pgid.pool())) {
3935 derr << __func__ << ": could not find map for epoch " << map_epoch
3936 << " on pg " << pgid << ", but the pool is not present in the "
3937 << "current map, so this is probably a result of bug 10617. "
3938 << "Skipping the pg for now, you can use ceph-objectstore-tool "
3939 << "to clean it up later." << dendl;
3940 continue;
3941 } else {
3942 derr << __func__ << ": have pgid " << pgid << " at epoch "
3943 << map_epoch << ", but missing map. Crashing."
3944 << dendl;
3945 assert(0 == "Missing map in load_pgs");
3946 }
3947 }
3948 pg = _open_lock_pg(pgosdmap, pgid);
3949 } else {
3950 pg = _open_lock_pg(osdmap, pgid);
3951 }
3952 // there can be no waiters here, so we don't call wake_pg_waiters
3953
3954 pg->ch = store->open_collection(pg->coll);
3955
3956 // read pg state, log
3957 pg->read_state(store, bl);
3958
3959 if (pg->must_upgrade()) {
3960 if (!pg->can_upgrade()) {
3961 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
3962 << " an older version first." << dendl;
3963 assert(0 == "PG too old to upgrade");
3964 }
3965 if (!has_upgraded) {
3966 derr << "PGs are upgrading" << dendl;
3967 has_upgraded = true;
3968 }
3969 dout(10) << "PG " << pg->info.pgid
3970 << " must upgrade..." << dendl;
3971 pg->upgrade(store);
3972 }
3973
3974 service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
3975
3976 // generate state for PG's current mapping
3977 int primary, up_primary;
3978 vector<int> acting, up;
3979 pg->get_osdmap()->pg_to_up_acting_osds(
3980 pgid.pgid, &up, &up_primary, &acting, &primary);
3981 pg->init_primary_up_acting(
3982 up,
3983 acting,
3984 up_primary,
3985 primary);
3986 int role = OSDMap::calc_pg_role(whoami, pg->acting);
3987 if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
3988 pg->set_role(role);
3989 else
3990 pg->set_role(-1);
3991
3992 pg->reg_next_scrub();
3993
3994 PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
3995 pg->handle_loaded(&rctx);
3996
3997 dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
3998 if (pg->pg_log.is_dirty()) {
3999 ObjectStore::Transaction t;
4000 pg->write_if_dirty(t);
4001 store->apply_transaction(pg->osr.get(), std::move(t));
4002 }
4003 pg->unlock();
4004 }
4005 {
4006 RWLock::RLocker l(pg_map_lock);
4007 dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
4008 }
4009
4010 // clean up old infos object?
4011 if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
4012 dout(1) << __func__ << " removing legacy infos object" << dendl;
4013 ObjectStore::Transaction t;
4014 t.remove(coll_t::meta(), OSD::make_infos_oid());
4015 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
4016 if (r != 0) {
4017 derr << __func__ << ": apply_transaction returned "
4018 << cpp_strerror(r) << dendl;
4019 ceph_abort();
4020 }
4021 }
4022
4023 build_past_intervals_parallel();
4024 }
4025
4026
4027 /*
4028 * build past_intervals efficiently on old, degraded, and buried
4029 * clusters. this is important for efficiently catching up osds that
4030 * are way behind on maps to the current cluster state.
4031 *
4032 * this is a parallel version of PG::generate_past_intervals().
4033 * follow the same logic, but do all pgs at the same time so that we
4034 * can make a single pass across the osdmap history.
4035 */
4036 void OSD::build_past_intervals_parallel()
4037 {
4038 struct pistate {
4039 epoch_t start, end;
4040 vector<int> old_acting, old_up;
4041 epoch_t same_interval_since;
4042 int primary;
4043 int up_primary;
4044 };
4045 map<PG*,pistate> pis;
4046
4047 // calculate junction of map range
4048 epoch_t end_epoch = superblock.oldest_map;
4049 epoch_t cur_epoch = superblock.newest_map;
4050 {
4051 RWLock::RLocker l(pg_map_lock);
4052 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4053 i != pg_map.end();
4054 ++i) {
4055 PG *pg = i->second;
4056
4057 auto rpib = pg->get_required_past_interval_bounds(
4058 pg->info,
4059 superblock.oldest_map);
4060 if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
4061 if (pg->info.history.same_interval_since == 0) {
4062 pg->info.history.same_interval_since = rpib.second;
4063 }
4064 continue;
4065 } else {
4066 auto apib = pg->past_intervals.get_bounds();
4067 if (apib.second >= rpib.second &&
4068 apib.first <= rpib.first) {
4069 if (pg->info.history.same_interval_since == 0) {
4070 pg->info.history.same_interval_since = rpib.second;
4071 }
4072 continue;
4073 }
4074 }
4075
4076 dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
4077 << rpib.second << dendl;
4078 pistate& p = pis[pg];
4079 p.start = rpib.first;
4080 p.end = rpib.second;
4081 p.same_interval_since = 0;
4082
4083 if (rpib.first < cur_epoch)
4084 cur_epoch = rpib.first;
4085 if (rpib.second > end_epoch)
4086 end_epoch = rpib.second;
4087 }
4088 }
4089 if (pis.empty()) {
4090 dout(10) << __func__ << " nothing to build" << dendl;
4091 return;
4092 }
4093
4094 dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
4095 assert(cur_epoch <= end_epoch);
4096
4097 OSDMapRef cur_map, last_map;
4098 for ( ; cur_epoch <= end_epoch; cur_epoch++) {
4099 dout(10) << __func__ << " epoch " << cur_epoch << dendl;
4100 last_map = cur_map;
4101 cur_map = get_map(cur_epoch);
4102
4103 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4104 PG *pg = i->first;
4105 pistate& p = i->second;
4106
4107 if (cur_epoch < p.start || cur_epoch > p.end)
4108 continue;
4109
4110 vector<int> acting, up;
4111 int up_primary;
4112 int primary;
4113 pg_t pgid = pg->info.pgid.pgid;
4114 if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
4115 pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
4116 cur_map->pg_to_up_acting_osds(
4117 pgid, &up, &up_primary, &acting, &primary);
4118
4119 if (p.same_interval_since == 0) {
4120 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4121 << " first map, acting " << acting
4122 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
4123 p.same_interval_since = cur_epoch;
4124 p.old_up = up;
4125 p.old_acting = acting;
4126 p.primary = primary;
4127 p.up_primary = up_primary;
4128 continue;
4129 }
4130 assert(last_map);
4131
4132 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
4133 pg->get_is_recoverable_predicate());
4134 std::stringstream debug;
4135 bool new_interval = PastIntervals::check_new_interval(
4136 p.primary,
4137 primary,
4138 p.old_acting, acting,
4139 p.up_primary,
4140 up_primary,
4141 p.old_up, up,
4142 p.same_interval_since,
4143 pg->info.history.last_epoch_clean,
4144 cur_map, last_map,
4145 pgid,
4146 recoverable.get(),
4147 &pg->past_intervals,
4148 &debug);
4149 if (new_interval) {
4150 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4151 << " " << debug.str() << dendl;
4152 p.old_up = up;
4153 p.old_acting = acting;
4154 p.primary = primary;
4155 p.up_primary = up_primary;
4156 p.same_interval_since = cur_epoch;
4157 }
4158 }
4159 }
4160
4161 // Now that past_intervals have been recomputed let's fix the same_interval_since
4162 // if it was cleared by import.
4163 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4164 PG *pg = i->first;
4165 pistate& p = i->second;
4166
4167 if (pg->info.history.same_interval_since == 0) {
4168 assert(p.same_interval_since);
4169 dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
4170 dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
4171 // Fix it
4172 pg->info.history.same_interval_since = p.same_interval_since;
4173 }
4174 }
4175
4176 // write info only at the end. this is necessary because we check
4177 // whether the past_intervals go far enough back or forward in time,
4178 // but we don't check for holes. we could avoid it by discarding
4179 // the previous past_intervals and rebuilding from scratch, or we
4180 // can just do this and commit all our work at the end.
4181 ObjectStore::Transaction t;
4182 int num = 0;
4183 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4184 PG *pg = i->first;
4185 pg->lock();
4186 pg->dirty_big_info = true;
4187 pg->dirty_info = true;
4188 pg->write_if_dirty(t);
4189 pg->unlock();
4190
4191 // don't let the transaction get too big
4192 if (++num >= cct->_conf->osd_target_transaction_size) {
4193 store->apply_transaction(service.meta_osr.get(), std::move(t));
4194 t = ObjectStore::Transaction();
4195 num = 0;
4196 }
4197 }
4198 if (!t.empty())
4199 store->apply_transaction(service.meta_osr.get(), std::move(t));
4200 }
4201
4202 /*
4203 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4204 * hasn't changed since the given epoch and we are the primary.
4205 */
4206 int OSD::handle_pg_peering_evt(
4207 spg_t pgid,
4208 const pg_history_t& orig_history,
4209 const PastIntervals& pi,
4210 epoch_t epoch,
4211 PG::CephPeeringEvtRef evt)
4212 {
4213 if (service.splitting(pgid)) {
4214 peering_wait_for_split[pgid].push_back(evt);
4215 return -EEXIST;
4216 }
4217
4218 PG *pg = _lookup_lock_pg(pgid);
4219 if (!pg) {
4220 // same primary?
4221 if (!osdmap->have_pg_pool(pgid.pool()))
4222 return -EINVAL;
4223 int up_primary, acting_primary;
4224 vector<int> up, acting;
4225 osdmap->pg_to_up_acting_osds(
4226 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4227
4228 pg_history_t history = orig_history;
4229 bool valid_history = project_pg_history(
4230 pgid, history, epoch, up, up_primary, acting, acting_primary);
4231
4232 if (!valid_history || epoch < history.same_interval_since) {
4233 dout(10) << __func__ << pgid << " acting changed in "
4234 << history.same_interval_since << " (msg from " << epoch << ")"
4235 << dendl;
4236 return -EINVAL;
4237 }
4238
4239 if (service.splitting(pgid)) {
4240 ceph_abort();
4241 }
4242
4243 // do we need to resurrect a deleting pg?
4244 spg_t resurrected;
4245 PGRef old_pg_state;
4246 res_result result = _try_resurrect_pg(
4247 service.get_osdmap(),
4248 pgid,
4249 &resurrected,
4250 &old_pg_state);
4251
4252 PG::RecoveryCtx rctx = create_context();
4253 switch (result) {
4254 case RES_NONE: {
4255 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4256 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4257 store->get_type() != "bluestore") {
4258 clog->warn() << "pg " << pgid
4259 << " is at risk of silent data corruption: "
4260 << "the pool allows ec overwrites but is not stored in "
4261 << "bluestore, so deep scrubbing will not detect bitrot";
4262 }
4263 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4264 PG::_init(*rctx.transaction, pgid, pp);
4265
4266 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4267 if (!pp->is_replicated() && role != pgid.shard)
4268 role = -1;
4269
4270 pg = _create_lock_pg(
4271 get_map(epoch),
4272 pgid, false, false,
4273 role,
4274 up, up_primary,
4275 acting, acting_primary,
4276 history, pi,
4277 *rctx.transaction);
4278 pg->handle_create(&rctx);
4279 pg->write_if_dirty(*rctx.transaction);
4280 dispatch_context(rctx, pg, osdmap);
4281
4282 dout(10) << *pg << " is new" << dendl;
4283
4284 pg->queue_peering_event(evt);
4285 wake_pg_waiters(pg);
4286 pg->unlock();
4287 return 0;
4288 }
4289 case RES_SELF: {
4290 old_pg_state->lock();
4291 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4292 int old_role = old_pg_state->role;
4293 vector<int> old_up = old_pg_state->up;
4294 int old_up_primary = old_pg_state->up_primary.osd;
4295 vector<int> old_acting = old_pg_state->acting;
4296 int old_primary = old_pg_state->primary.osd;
4297 pg_history_t old_history = old_pg_state->info.history;
4298 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4299 old_pg_state->unlock();
4300 pg = _create_lock_pg(
4301 old_osd_map,
4302 resurrected,
4303 false,
4304 true,
4305 old_role,
4306 old_up,
4307 old_up_primary,
4308 old_acting,
4309 old_primary,
4310 old_history,
4311 old_past_intervals,
4312 *rctx.transaction);
4313 pg->handle_create(&rctx);
4314 pg->write_if_dirty(*rctx.transaction);
4315 dispatch_context(rctx, pg, osdmap);
4316
4317 dout(10) << *pg << " is new (resurrected)" << dendl;
4318
4319 pg->queue_peering_event(evt);
4320 wake_pg_waiters(pg);
4321 pg->unlock();
4322 return 0;
4323 }
4324 case RES_PARENT: {
4325 assert(old_pg_state);
4326 old_pg_state->lock();
4327 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4328 int old_role = old_pg_state->role;
4329 vector<int> old_up = old_pg_state->up;
4330 int old_up_primary = old_pg_state->up_primary.osd;
4331 vector<int> old_acting = old_pg_state->acting;
4332 int old_primary = old_pg_state->primary.osd;
4333 pg_history_t old_history = old_pg_state->info.history;
4334 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4335 old_pg_state->unlock();
4336 PG *parent = _create_lock_pg(
4337 old_osd_map,
4338 resurrected,
4339 false,
4340 true,
4341 old_role,
4342 old_up,
4343 old_up_primary,
4344 old_acting,
4345 old_primary,
4346 old_history,
4347 old_past_intervals,
4348 *rctx.transaction
4349 );
4350 parent->handle_create(&rctx);
4351 parent->write_if_dirty(*rctx.transaction);
4352 dispatch_context(rctx, parent, osdmap);
4353
4354 dout(10) << *parent << " is new" << dendl;
4355
4356 assert(service.splitting(pgid));
4357 peering_wait_for_split[pgid].push_back(evt);
4358
4359 //parent->queue_peering_event(evt);
4360 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4361 wake_pg_waiters(parent);
4362 parent->unlock();
4363 return 0;
4364 }
4365 default:
4366 assert(0);
4367 return 0;
4368 }
4369 } else {
4370 // already had it. did the mapping change?
4371 if (epoch < pg->info.history.same_interval_since) {
4372 dout(10) << *pg << __func__ << " acting changed in "
4373 << pg->info.history.same_interval_since
4374 << " (msg from " << epoch << ")" << dendl;
4375 } else {
4376 pg->queue_peering_event(evt);
4377 }
4378 pg->unlock();
4379 return -EEXIST;
4380 }
4381 }
4382
4383
4384 void OSD::build_initial_pg_history(
4385 spg_t pgid,
4386 epoch_t created,
4387 utime_t created_stamp,
4388 pg_history_t *h,
4389 PastIntervals *pi)
4390 {
4391 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4392 h->epoch_created = created;
4393 h->epoch_pool_created = created;
4394 h->same_interval_since = created;
4395 h->same_up_since = created;
4396 h->same_primary_since = created;
4397 h->last_scrub_stamp = created_stamp;
4398 h->last_deep_scrub_stamp = created_stamp;
4399 h->last_clean_scrub_stamp = created_stamp;
4400
4401 OSDMapRef lastmap = service.get_map(created);
4402 int up_primary, acting_primary;
4403 vector<int> up, acting;
4404 lastmap->pg_to_up_acting_osds(
4405 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4406
4407 ostringstream debug;
4408 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4409 OSDMapRef osdmap = service.get_map(e);
4410 int new_up_primary, new_acting_primary;
4411 vector<int> new_up, new_acting;
4412 osdmap->pg_to_up_acting_osds(
4413 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4414
4415 // this is a bit imprecise, but sufficient?
4416 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4417 const pg_pool_t *pi;
4418 bool operator()(const set<pg_shard_t> &have) const {
4419 return have.size() >= pi->min_size;
4420 }
4421 min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4422 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4423
4424 bool new_interval = PastIntervals::check_new_interval(
4425 acting_primary,
4426 new_acting_primary,
4427 acting, new_acting,
4428 up_primary,
4429 new_up_primary,
4430 up, new_up,
4431 h->same_interval_since,
4432 h->last_epoch_clean,
4433 osdmap,
4434 lastmap,
4435 pgid.pgid,
4436 &min_size_predicate,
4437 pi,
4438 &debug);
4439 if (new_interval) {
4440 h->same_interval_since = e;
4441 }
4442 if (up != new_up) {
4443 h->same_up_since = e;
4444 }
4445 if (acting_primary != new_acting_primary) {
4446 h->same_primary_since = e;
4447 }
4448 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4449 osdmap->get_pg_num(pgid.pgid.pool()),
4450 nullptr)) {
4451 h->last_epoch_split = e;
4452 }
4453 lastmap = osdmap;
4454 }
4455 dout(20) << __func__ << " " << debug.str() << dendl;
4456 dout(10) << __func__ << " " << *h << " " << *pi
4457 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4458 pi->get_bounds()) << ")"
4459 << dendl;
4460 }
4461
4462 /**
4463 * Fill in the passed history so you know same_interval_since, same_up_since,
4464 * and same_primary_since.
4465 */
4466 bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
4467 const vector<int>& currentup,
4468 int currentupprimary,
4469 const vector<int>& currentacting,
4470 int currentactingprimary)
4471 {
4472 dout(15) << "project_pg_history " << pgid
4473 << " from " << from << " to " << osdmap->get_epoch()
4474 << ", start " << h
4475 << dendl;
4476
4477 epoch_t e;
4478 for (e = osdmap->get_epoch();
4479 e > from;
4480 e--) {
4481 // verify during intermediate epoch (e-1)
4482 OSDMapRef oldmap = service.try_get_map(e-1);
4483 if (!oldmap) {
4484 dout(15) << __func__ << ": found map gap, returning false" << dendl;
4485 return false;
4486 }
4487 assert(oldmap->have_pg_pool(pgid.pool()));
4488
4489 int upprimary, actingprimary;
4490 vector<int> up, acting;
4491 oldmap->pg_to_up_acting_osds(
4492 pgid.pgid,
4493 &up,
4494 &upprimary,
4495 &acting,
4496 &actingprimary);
4497
4498 // acting set change?
4499 if ((actingprimary != currentactingprimary ||
4500 upprimary != currentupprimary ||
4501 acting != currentacting ||
4502 up != currentup) && e > h.same_interval_since) {
4503 dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
4504 << " from " << acting << "/" << up
4505 << " " << actingprimary << "/" << upprimary
4506 << " -> " << currentacting << "/" << currentup
4507 << " " << currentactingprimary << "/" << currentupprimary
4508 << dendl;
4509 h.same_interval_since = e;
4510 }
4511 // split?
4512 if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
4513 osdmap->get_pg_num(pgid.pool()),
4514 0) && e > h.same_interval_since) {
4515 h.same_interval_since = e;
4516 }
4517 // up set change?
4518 if ((up != currentup || upprimary != currentupprimary)
4519 && e > h.same_up_since) {
4520 dout(15) << "project_pg_history " << pgid << " up changed in " << e
4521 << " from " << up << " " << upprimary
4522 << " -> " << currentup << " " << currentupprimary << dendl;
4523 h.same_up_since = e;
4524 }
4525
4526 // primary change?
4527 if (OSDMap::primary_changed(
4528 actingprimary,
4529 acting,
4530 currentactingprimary,
4531 currentacting) &&
4532 e > h.same_primary_since) {
4533 dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
4534 h.same_primary_since = e;
4535 }
4536
4537 if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
4538 break;
4539 }
4540
4541 // base case: these floors should be the pg creation epoch if we didn't
4542 // find any changes.
4543 if (e == h.epoch_created) {
4544 if (!h.same_interval_since)
4545 h.same_interval_since = e;
4546 if (!h.same_up_since)
4547 h.same_up_since = e;
4548 if (!h.same_primary_since)
4549 h.same_primary_since = e;
4550 }
4551
4552 dout(15) << "project_pg_history end " << h << dendl;
4553 return true;
4554 }
4555
4556
4557
4558 void OSD::_add_heartbeat_peer(int p)
4559 {
4560 if (p == whoami)
4561 return;
4562 HeartbeatInfo *hi;
4563
4564 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4565 if (i == heartbeat_peers.end()) {
4566 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4567 if (!cons.first)
4568 return;
4569 hi = &heartbeat_peers[p];
4570 hi->peer = p;
4571 HeartbeatSession *s = new HeartbeatSession(p);
4572 hi->con_back = cons.first.get();
4573 hi->con_back->set_priv(s->get());
4574 if (cons.second) {
4575 hi->con_front = cons.second.get();
4576 hi->con_front->set_priv(s->get());
4577 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4578 << " " << hi->con_back->get_peer_addr()
4579 << " " << hi->con_front->get_peer_addr()
4580 << dendl;
4581 } else {
4582 hi->con_front.reset(NULL);
4583 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4584 << " " << hi->con_back->get_peer_addr()
4585 << dendl;
4586 }
4587 s->put();
4588 } else {
4589 hi = &i->second;
4590 }
4591 hi->epoch = osdmap->get_epoch();
4592 }
4593
4594 void OSD::_remove_heartbeat_peer(int n)
4595 {
4596 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4597 assert(q != heartbeat_peers.end());
4598 dout(20) << " removing heartbeat peer osd." << n
4599 << " " << q->second.con_back->get_peer_addr()
4600 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4601 << dendl;
4602 q->second.con_back->mark_down();
4603 if (q->second.con_front) {
4604 q->second.con_front->mark_down();
4605 }
4606 heartbeat_peers.erase(q);
4607 }
4608
4609 void OSD::need_heartbeat_peer_update()
4610 {
4611 if (is_stopping())
4612 return;
4613 dout(20) << "need_heartbeat_peer_update" << dendl;
4614 heartbeat_set_peers_need_update();
4615 }
4616
4617 void OSD::maybe_update_heartbeat_peers()
4618 {
4619 assert(osd_lock.is_locked());
4620
4621 if (is_waiting_for_healthy()) {
4622 utime_t now = ceph_clock_now();
4623 if (last_heartbeat_resample == utime_t()) {
4624 last_heartbeat_resample = now;
4625 heartbeat_set_peers_need_update();
4626 } else if (!heartbeat_peers_need_update()) {
4627 utime_t dur = now - last_heartbeat_resample;
4628 if (dur > cct->_conf->osd_heartbeat_grace) {
4629 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4630 heartbeat_set_peers_need_update();
4631 last_heartbeat_resample = now;
4632 reset_heartbeat_peers(); // we want *new* peers!
4633 }
4634 }
4635 }
4636
4637 if (!heartbeat_peers_need_update())
4638 return;
4639 heartbeat_clear_peers_need_update();
4640
4641 Mutex::Locker l(heartbeat_lock);
4642
4643 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
4644
4645
4646 // build heartbeat from set
4647 if (is_active()) {
4648 RWLock::RLocker l(pg_map_lock);
4649 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4650 i != pg_map.end();
4651 ++i) {
4652 PG *pg = i->second;
4653 pg->heartbeat_peer_lock.Lock();
4654 dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
4655 for (set<int>::iterator p = pg->heartbeat_peers.begin();
4656 p != pg->heartbeat_peers.end();
4657 ++p)
4658 if (osdmap->is_up(*p))
4659 _add_heartbeat_peer(*p);
4660 for (set<int>::iterator p = pg->probe_targets.begin();
4661 p != pg->probe_targets.end();
4662 ++p)
4663 if (osdmap->is_up(*p))
4664 _add_heartbeat_peer(*p);
4665 pg->heartbeat_peer_lock.Unlock();
4666 }
4667 }
4668
4669 // include next and previous up osds to ensure we have a fully-connected set
4670 set<int> want, extras;
4671 int next = osdmap->get_next_up_osd_after(whoami);
4672 if (next >= 0)
4673 want.insert(next);
4674 int prev = osdmap->get_previous_up_osd_before(whoami);
4675 if (prev >= 0 && prev != next)
4676 want.insert(prev);
4677
4678 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4679 dout(10) << " adding neighbor peer osd." << *p << dendl;
4680 extras.insert(*p);
4681 _add_heartbeat_peer(*p);
4682 }
4683
4684 // remove down peers; enumerate extras
4685 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4686 while (p != heartbeat_peers.end()) {
4687 if (!osdmap->is_up(p->first)) {
4688 int o = p->first;
4689 ++p;
4690 _remove_heartbeat_peer(o);
4691 continue;
4692 }
4693 if (p->second.epoch < osdmap->get_epoch()) {
4694 extras.insert(p->first);
4695 }
4696 ++p;
4697 }
4698
4699 // too few?
4700 int start = osdmap->get_next_up_osd_after(whoami);
4701 for (int n = start; n >= 0; ) {
4702 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4703 break;
4704 if (!extras.count(n) && !want.count(n) && n != whoami) {
4705 dout(10) << " adding random peer osd." << n << dendl;
4706 extras.insert(n);
4707 _add_heartbeat_peer(n);
4708 }
4709 n = osdmap->get_next_up_osd_after(n);
4710 if (n == start)
4711 break; // came full circle; stop
4712 }
4713
4714 // too many?
4715 for (set<int>::iterator p = extras.begin();
4716 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4717 ++p) {
4718 if (want.count(*p))
4719 continue;
4720 _remove_heartbeat_peer(*p);
4721 }
4722
4723 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
4724 }
4725
4726 void OSD::reset_heartbeat_peers()
4727 {
4728 assert(osd_lock.is_locked());
4729 dout(10) << "reset_heartbeat_peers" << dendl;
4730 Mutex::Locker l(heartbeat_lock);
4731 while (!heartbeat_peers.empty()) {
4732 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4733 hi.con_back->mark_down();
4734 if (hi.con_front) {
4735 hi.con_front->mark_down();
4736 }
4737 heartbeat_peers.erase(heartbeat_peers.begin());
4738 }
4739 failure_queue.clear();
4740 }
4741
4742 void OSD::handle_osd_ping(MOSDPing *m)
4743 {
4744 if (superblock.cluster_fsid != m->fsid) {
4745 dout(20) << "handle_osd_ping from " << m->get_source_inst()
4746 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
4747 m->put();
4748 return;
4749 }
4750
4751 int from = m->get_source().num();
4752
4753 heartbeat_lock.Lock();
4754 if (is_stopping()) {
4755 heartbeat_lock.Unlock();
4756 m->put();
4757 return;
4758 }
4759
4760 OSDMapRef curmap = service.get_osdmap();
4761 if (!curmap) {
4762 heartbeat_lock.Unlock();
4763 m->put();
4764 return;
4765 }
4766
4767 switch (m->op) {
4768
4769 case MOSDPing::PING:
4770 {
4771 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
4772 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
4773 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
4774 if (heartbeat_drop->second == 0) {
4775 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
4776 } else {
4777 --heartbeat_drop->second;
4778 dout(5) << "Dropping heartbeat from " << from
4779 << ", " << heartbeat_drop->second
4780 << " remaining to drop" << dendl;
4781 break;
4782 }
4783 } else if (cct->_conf->osd_debug_drop_ping_probability >
4784 ((((double)(rand()%100))/100.0))) {
4785 heartbeat_drop =
4786 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
4787 cct->_conf->osd_debug_drop_ping_duration)).first;
4788 dout(5) << "Dropping heartbeat from " << from
4789 << ", " << heartbeat_drop->second
4790 << " remaining to drop" << dendl;
4791 break;
4792 }
4793 }
4794
4795 if (!cct->get_heartbeat_map()->is_healthy()) {
4796 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
4797 break;
4798 }
4799
4800 Message *r = new MOSDPing(monc->get_fsid(),
4801 curmap->get_epoch(),
4802 MOSDPing::PING_REPLY, m->stamp,
4803 cct->_conf->osd_heartbeat_min_size);
4804 m->get_connection()->send_message(r);
4805
4806 if (curmap->is_up(from)) {
4807 service.note_peer_epoch(from, m->map_epoch);
4808 if (is_active()) {
4809 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4810 if (con) {
4811 service.share_map_peer(from, con.get());
4812 }
4813 }
4814 } else if (!curmap->exists(from) ||
4815 curmap->get_down_at(from) > m->map_epoch) {
4816 // tell them they have died
4817 Message *r = new MOSDPing(monc->get_fsid(),
4818 curmap->get_epoch(),
4819 MOSDPing::YOU_DIED,
4820 m->stamp,
4821 cct->_conf->osd_heartbeat_min_size);
4822 m->get_connection()->send_message(r);
4823 }
4824 }
4825 break;
4826
4827 case MOSDPing::PING_REPLY:
4828 {
4829 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
4830 if (i != heartbeat_peers.end()) {
4831 if (m->get_connection() == i->second.con_back) {
4832 dout(25) << "handle_osd_ping got reply from osd." << from
4833 << " first_tx " << i->second.first_tx
4834 << " last_tx " << i->second.last_tx
4835 << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
4836 << " last_rx_front " << i->second.last_rx_front
4837 << dendl;
4838 i->second.last_rx_back = m->stamp;
4839 // if there is no front con, set both stamps.
4840 if (i->second.con_front == NULL)
4841 i->second.last_rx_front = m->stamp;
4842 } else if (m->get_connection() == i->second.con_front) {
4843 dout(25) << "handle_osd_ping got reply from osd." << from
4844 << " first_tx " << i->second.first_tx
4845 << " last_tx " << i->second.last_tx
4846 << " last_rx_back " << i->second.last_rx_back
4847 << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
4848 << dendl;
4849 i->second.last_rx_front = m->stamp;
4850 }
4851
4852 utime_t cutoff = ceph_clock_now();
4853 cutoff -= cct->_conf->osd_heartbeat_grace;
4854 if (i->second.is_healthy(cutoff)) {
4855 // Cancel false reports
4856 auto failure_queue_entry = failure_queue.find(from);
4857 if (failure_queue_entry != failure_queue.end()) {
4858 dout(10) << "handle_osd_ping canceling queued "
4859 << "failure report for osd." << from << dendl;
4860 failure_queue.erase(failure_queue_entry);
4861 }
4862
4863 auto failure_pending_entry = failure_pending.find(from);
4864 if (failure_pending_entry != failure_pending.end()) {
4865 dout(10) << "handle_osd_ping canceling in-flight "
4866 << "failure report for osd." << from << dendl;
4867 send_still_alive(curmap->get_epoch(),
4868 failure_pending_entry->second.second);
4869 failure_pending.erase(failure_pending_entry);
4870 }
4871 }
4872 }
4873
4874 if (m->map_epoch &&
4875 curmap->is_up(from)) {
4876 service.note_peer_epoch(from, m->map_epoch);
4877 if (is_active()) {
4878 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4879 if (con) {
4880 service.share_map_peer(from, con.get());
4881 }
4882 }
4883 }
4884 }
4885 break;
4886
4887 case MOSDPing::YOU_DIED:
4888 dout(10) << "handle_osd_ping " << m->get_source_inst()
4889 << " says i am down in " << m->map_epoch << dendl;
4890 osdmap_subscribe(curmap->get_epoch()+1, false);
4891 break;
4892 }
4893
4894 heartbeat_lock.Unlock();
4895 m->put();
4896 }
4897
4898 void OSD::heartbeat_entry()
4899 {
4900 Mutex::Locker l(heartbeat_lock);
4901 if (is_stopping())
4902 return;
4903 while (!heartbeat_stop) {
4904 heartbeat();
4905
4906 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
4907 utime_t w;
4908 w.set_from_double(wait);
4909 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
4910 heartbeat_cond.WaitInterval(heartbeat_lock, w);
4911 if (is_stopping())
4912 return;
4913 dout(30) << "heartbeat_entry woke up" << dendl;
4914 }
4915 }
4916
4917 void OSD::heartbeat_check()
4918 {
4919 assert(heartbeat_lock.is_locked());
4920 utime_t now = ceph_clock_now();
4921
4922 // check for heartbeat replies (move me elsewhere?)
4923 utime_t cutoff = now;
4924 cutoff -= cct->_conf->osd_heartbeat_grace;
4925 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4926 p != heartbeat_peers.end();
4927 ++p) {
4928
4929 if (p->second.first_tx == utime_t()) {
4930 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
4931 << "yet, skipping" << dendl;
4932 continue;
4933 }
4934
4935 dout(25) << "heartbeat_check osd." << p->first
4936 << " first_tx " << p->second.first_tx
4937 << " last_tx " << p->second.last_tx
4938 << " last_rx_back " << p->second.last_rx_back
4939 << " last_rx_front " << p->second.last_rx_front
4940 << dendl;
4941 if (p->second.is_unhealthy(cutoff)) {
4942 if (p->second.last_rx_back == utime_t() ||
4943 p->second.last_rx_front == utime_t()) {
4944 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
4945 << " osd." << p->first << " ever on either front or back, first ping sent "
4946 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
4947 // fail
4948 failure_queue[p->first] = p->second.last_tx;
4949 } else {
4950 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
4951 << " osd." << p->first << " since back " << p->second.last_rx_back
4952 << " front " << p->second.last_rx_front
4953 << " (cutoff " << cutoff << ")" << dendl;
4954 // fail
4955 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
4956 }
4957 }
4958 }
4959 }
4960
4961 void OSD::heartbeat()
4962 {
4963 dout(30) << "heartbeat" << dendl;
4964
4965 // get CPU load avg
4966 double loadavgs[1];
4967 int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
4968 if (getloadavg(loadavgs, 1) == 1) {
4969 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
4970 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
4971 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
4972 }
4973
4974 dout(30) << "heartbeat checking stats" << dendl;
4975
4976 // refresh stats?
4977 vector<int> hb_peers;
4978 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4979 p != heartbeat_peers.end();
4980 ++p)
4981 hb_peers.push_back(p->first);
4982 service.update_osd_stat(hb_peers);
4983
4984 dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;
4985
4986 utime_t now = ceph_clock_now();
4987
4988 // send heartbeats
4989 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
4990 i != heartbeat_peers.end();
4991 ++i) {
4992 int peer = i->first;
4993 i->second.last_tx = now;
4994 if (i->second.first_tx == utime_t())
4995 i->second.first_tx = now;
4996 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
4997 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
4998 service.get_osdmap()->get_epoch(),
4999 MOSDPing::PING, now,
5000 cct->_conf->osd_heartbeat_min_size));
5001
5002 if (i->second.con_front)
5003 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5004 service.get_osdmap()->get_epoch(),
5005 MOSDPing::PING, now,
5006 cct->_conf->osd_heartbeat_min_size));
5007 }
5008
5009 logger->set(l_osd_hb_to, heartbeat_peers.size());
5010
5011 // hmm.. am i all alone?
5012 dout(30) << "heartbeat lonely?" << dendl;
5013 if (heartbeat_peers.empty()) {
5014 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5015 last_mon_heartbeat = now;
5016 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5017 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5018 }
5019 }
5020
5021 dout(30) << "heartbeat done" << dendl;
5022 }
5023
5024 bool OSD::heartbeat_reset(Connection *con)
5025 {
5026 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
5027 if (s) {
5028 heartbeat_lock.Lock();
5029 if (is_stopping()) {
5030 heartbeat_lock.Unlock();
5031 s->put();
5032 return true;
5033 }
5034 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
5035 if (p != heartbeat_peers.end() &&
5036 (p->second.con_back == con ||
5037 p->second.con_front == con)) {
5038 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5039 << ", reopening" << dendl;
5040 if (con != p->second.con_back) {
5041 p->second.con_back->mark_down();
5042 }
5043 p->second.con_back.reset(NULL);
5044 if (p->second.con_front && con != p->second.con_front) {
5045 p->second.con_front->mark_down();
5046 }
5047 p->second.con_front.reset(NULL);
5048 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5049 if (newcon.first) {
5050 p->second.con_back = newcon.first.get();
5051 p->second.con_back->set_priv(s->get());
5052 if (newcon.second) {
5053 p->second.con_front = newcon.second.get();
5054 p->second.con_front->set_priv(s->get());
5055 }
5056 } else {
5057 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5058 << ", raced with osdmap update, closing out peer" << dendl;
5059 heartbeat_peers.erase(p);
5060 }
5061 } else {
5062 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5063 }
5064 heartbeat_lock.Unlock();
5065 s->put();
5066 }
5067 return true;
5068 }
5069
5070
5071
5072 // =========================================
5073
5074 void OSD::tick()
5075 {
5076 assert(osd_lock.is_locked());
5077 dout(10) << "tick" << dendl;
5078
5079 if (is_active() || is_waiting_for_healthy()) {
5080 maybe_update_heartbeat_peers();
5081 }
5082
5083 if (is_waiting_for_healthy()) {
5084 start_boot();
5085 } else if (is_preboot() &&
5086 waiting_for_luminous_mons &&
5087 monc->monmap.get_required_features().contains_all(
5088 ceph::features::mon::FEATURE_LUMINOUS)) {
5089 // mon upgrade finished!
5090 start_boot();
5091 }
5092
5093 do_waiters();
5094
5095 tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
5096 }
5097
5098 void OSD::tick_without_osd_lock()
5099 {
5100 assert(tick_timer_lock.is_locked());
5101 dout(10) << "tick_without_osd_lock" << dendl;
5102
5103 logger->set(l_osd_buf, buffer::get_total_alloc());
5104 logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5105 logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
5106 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5107 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5108 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5109
5110 // osd_lock is not being held, which means the OSD state
5111 // might change when doing the monitor report
5112 if (is_active() || is_waiting_for_healthy()) {
5113 heartbeat_lock.Lock();
5114 heartbeat_check();
5115 heartbeat_lock.Unlock();
5116
5117 map_lock.get_read();
5118 Mutex::Locker l(mon_report_lock);
5119
5120 // mon report?
5121 bool reset = false;
5122 bool report = false;
5123 utime_t now = ceph_clock_now();
5124 pg_stat_queue_lock.Lock();
5125 double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
5126 double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
5127 // note: we shouldn't adjust max because it must remain < the
5128 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5129 // value).
5130 double max = cct->_conf->osd_mon_report_interval_max;
5131 if (!outstanding_pg_stats.empty() &&
5132 (now - stats_ack_timeout) > last_pg_stats_ack) {
5133 dout(1) << __func__ << " mon hasn't acked PGStats in "
5134 << now - last_pg_stats_ack
5135 << " seconds, reconnecting elsewhere" << dendl;
5136 reset = true;
5137 last_pg_stats_ack = now; // reset clock
5138 last_pg_stats_sent = utime_t();
5139 stats_ack_timeout =
5140 MAX(cct->_conf->osd_mon_ack_timeout,
5141 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
5142 outstanding_pg_stats.clear();
5143 }
5144 if (now - last_pg_stats_sent > max) {
5145 osd_stat_updated = true;
5146 report = true;
5147 } else if (service.need_fullness_update()) {
5148 report = true;
5149 } else if ((int)outstanding_pg_stats.size() >=
5150 cct->_conf->osd_mon_report_max_in_flight) {
5151 dout(20) << __func__ << " have max " << outstanding_pg_stats
5152 << " stats updates in flight" << dendl;
5153 } else {
5154 if (now - last_mon_report > adjusted_min) {
5155 dout(20) << __func__ << " stats backoff " << backoff
5156 << " adjusted_min " << adjusted_min << " - sending report"
5157 << dendl;
5158 osd_stat_updated = true;
5159 report = true;
5160 }
5161 }
5162 pg_stat_queue_lock.Unlock();
5163
5164 if (reset) {
5165 monc->reopen_session();
5166 } else if (report) {
5167 last_mon_report = now;
5168
5169 // do any pending reports
5170 send_full_update();
5171 send_failures();
5172 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5173 send_pg_stats(now);
5174 }
5175 }
5176 map_lock.put_read();
5177 }
5178
5179 if (is_active()) {
5180 if (!scrub_random_backoff()) {
5181 sched_scrub();
5182 }
5183 service.promote_throttle_recalibrate();
5184 bool need_send_beacon = false;
5185 const auto now = ceph::coarse_mono_clock::now();
5186 {
5187 // borrow lec lock to pretect last_sent_beacon from changing
5188 Mutex::Locker l{min_last_epoch_clean_lock};
5189 const auto elapsed = now - last_sent_beacon;
5190 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5191 cct->_conf->osd_beacon_report_interval) {
5192 need_send_beacon = true;
5193 }
5194 }
5195 if (need_send_beacon) {
5196 send_beacon(now);
5197 }
5198 }
5199
5200 check_ops_in_flight();
5201 service.kick_recovery_queue();
5202 tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
5203 }
5204
5205 void OSD::check_ops_in_flight()
5206 {
5207 vector<string> warnings;
5208 if (op_tracker.check_ops_in_flight(warnings)) {
5209 for (vector<string>::iterator i = warnings.begin();
5210 i != warnings.end();
5211 ++i) {
5212 clog->warn() << *i;
5213 }
5214 }
5215 }
5216
5217 // Usage:
5218 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5219 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5220 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5221 // getomap <pool> [namespace/]<obj-name>
5222 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5223 // injectmdataerr [namespace/]<obj-name> [shardid]
5224 // injectdataerr [namespace/]<obj-name> [shardid]
5225 //
5226 // set_recovery_delay [utime]
5227 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5228 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5229 {
5230 //Test support
5231 //Support changing the omap on a single osd by using the Admin Socket to
5232 //directly request the osd make a change.
5233 if (command == "setomapval" || command == "rmomapkey" ||
5234 command == "setomapheader" || command == "getomap" ||
5235 command == "truncobj" || command == "injectmdataerr" ||
5236 command == "injectdataerr"
5237 ) {
5238 pg_t rawpg;
5239 int64_t pool;
5240 OSDMapRef curmap = service->get_osdmap();
5241 int r = -1;
5242
5243 string poolstr;
5244
5245 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5246 pool = curmap->lookup_pg_pool_name(poolstr);
5247 //If we can't find it by name then maybe id specified
5248 if (pool < 0 && isdigit(poolstr[0]))
5249 pool = atoll(poolstr.c_str());
5250 if (pool < 0) {
5251 ss << "Invalid pool" << poolstr;
5252 return;
5253 }
5254
5255 string objname, nspace;
5256 cmd_getval(service->cct, cmdmap, "objname", objname);
5257 std::size_t found = objname.find_first_of('/');
5258 if (found != string::npos) {
5259 nspace = objname.substr(0, found);
5260 objname = objname.substr(found+1);
5261 }
5262 object_locator_t oloc(pool, nspace);
5263 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5264
5265 if (r < 0) {
5266 ss << "Invalid namespace/objname";
5267 return;
5268 }
5269
5270 int64_t shardid;
5271 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5272 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5273 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5274 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5275 if (curmap->pg_is_ec(rawpg)) {
5276 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5277 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5278 return;
5279 }
5280 }
5281
5282 ObjectStore::Transaction t;
5283
5284 if (command == "setomapval") {
5285 map<string, bufferlist> newattrs;
5286 bufferlist val;
5287 string key, valstr;
5288 cmd_getval(service->cct, cmdmap, "key", key);
5289 cmd_getval(service->cct, cmdmap, "val", valstr);
5290
5291 val.append(valstr);
5292 newattrs[key] = val;
5293 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5294 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5295 if (r < 0)
5296 ss << "error=" << r;
5297 else
5298 ss << "ok";
5299 } else if (command == "rmomapkey") {
5300 string key;
5301 set<string> keys;
5302 cmd_getval(service->cct, cmdmap, "key", key);
5303
5304 keys.insert(key);
5305 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5306 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5307 if (r < 0)
5308 ss << "error=" << r;
5309 else
5310 ss << "ok";
5311 } else if (command == "setomapheader") {
5312 bufferlist newheader;
5313 string headerstr;
5314
5315 cmd_getval(service->cct, cmdmap, "header", headerstr);
5316 newheader.append(headerstr);
5317 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5318 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5319 if (r < 0)
5320 ss << "error=" << r;
5321 else
5322 ss << "ok";
5323 } else if (command == "getomap") {
5324 //Debug: Output entire omap
5325 bufferlist hdrbl;
5326 map<string, bufferlist> keyvals;
5327 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5328 if (r >= 0) {
5329 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5330 for (map<string, bufferlist>::iterator it = keyvals.begin();
5331 it != keyvals.end(); ++it)
5332 ss << " key=" << (*it).first << " val="
5333 << string((*it).second.c_str(), (*it).second.length());
5334 } else {
5335 ss << "error=" << r;
5336 }
5337 } else if (command == "truncobj") {
5338 int64_t trunclen;
5339 cmd_getval(service->cct, cmdmap, "len", trunclen);
5340 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5341 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5342 if (r < 0)
5343 ss << "error=" << r;
5344 else
5345 ss << "ok";
5346 } else if (command == "injectdataerr") {
5347 store->inject_data_error(gobj);
5348 ss << "ok";
5349 } else if (command == "injectmdataerr") {
5350 store->inject_mdata_error(gobj);
5351 ss << "ok";
5352 }
5353 return;
5354 }
5355 if (command == "set_recovery_delay") {
5356 int64_t delay;
5357 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5358 ostringstream oss;
5359 oss << delay;
5360 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5361 oss.str().c_str());
5362 if (r != 0) {
5363 ss << "set_recovery_delay: error setting "
5364 << "osd_recovery_delay_start to '" << delay << "': error "
5365 << r;
5366 return;
5367 }
5368 service->cct->_conf->apply_changes(NULL);
5369 ss << "set_recovery_delay: set osd_recovery_delay_start "
5370 << "to " << service->cct->_conf->osd_recovery_delay_start;
5371 return;
5372 }
5373 if (command == "trigger_scrub") {
5374 spg_t pgid;
5375 OSDMapRef curmap = service->get_osdmap();
5376
5377 string pgidstr;
5378
5379 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5380 if (!pgid.parse(pgidstr.c_str())) {
5381 ss << "Invalid pgid specified";
5382 return;
5383 }
5384
5385 PG *pg = service->osd->_lookup_lock_pg(pgid);
5386 if (pg == nullptr) {
5387 ss << "Can't find pg " << pgid;
5388 return;
5389 }
5390
5391 if (pg->is_primary()) {
5392 pg->unreg_next_scrub();
5393 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5394 double pool_scrub_max_interval = 0;
5395 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5396 double scrub_max_interval = pool_scrub_max_interval > 0 ?
5397 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5398 // Instead of marking must_scrub force a schedule scrub
5399 utime_t stamp = ceph_clock_now();
5400 stamp -= scrub_max_interval;
5401 stamp -= 100.0; // push back last scrub more for good measure
5402 pg->info.history.last_scrub_stamp = stamp;
5403 pg->reg_next_scrub();
5404 ss << "ok";
5405 } else {
5406 ss << "Not primary";
5407 }
5408 pg->unlock();
5409 return;
5410 }
5411 if (command == "injectfull") {
5412 int64_t count;
5413 string type;
5414 OSDService::s_names state;
5415 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5416 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5417 if (type == "none" || count == 0) {
5418 type = "none";
5419 count = 0;
5420 }
5421 state = service->get_full_state(type);
5422 if (state == OSDService::s_names::INVALID) {
5423 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5424 return;
5425 }
5426 service->set_injectfull(state, count);
5427 return;
5428 }
5429 ss << "Internal error - command=" << command;
5430 }
5431
5432 // =========================================
5433 bool remove_dir(
5434 CephContext *cct,
5435 ObjectStore *store, SnapMapper *mapper,
5436 OSDriver *osdriver,
5437 ObjectStore::Sequencer *osr,
5438 coll_t coll, DeletingStateRef dstate,
5439 bool *finished,
5440 ThreadPool::TPHandle &handle)
5441 {
5442 vector<ghobject_t> olist;
5443 int64_t num = 0;
5444 ObjectStore::Transaction t;
5445 ghobject_t next;
5446 handle.reset_tp_timeout();
5447 store->collection_list(
5448 coll,
5449 next,
5450 ghobject_t::get_max(),
5451 store->get_ideal_list_max(),
5452 &olist,
5453 &next);
5454 generic_dout(10) << __func__ << " " << olist << dendl;
5455 // default cont to true, this is safe because caller(OSD::RemoveWQ::_process())
5456 // will recheck the answer before it really goes on.
5457 bool cont = true;
5458 for (vector<ghobject_t>::iterator i = olist.begin();
5459 i != olist.end();
5460 ++i) {
5461 if (i->is_pgmeta())
5462 continue;
5463 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5464 int r = mapper->remove_oid(i->hobj, &_t);
5465 if (r != 0 && r != -ENOENT) {
5466 ceph_abort();
5467 }
5468 t.remove(coll, *i);
5469 if (++num >= cct->_conf->osd_target_transaction_size) {
5470 C_SaferCond waiter;
5471 store->queue_transaction(osr, std::move(t), &waiter);
5472 cont = dstate->pause_clearing();
5473 handle.suspend_tp_timeout();
5474 waiter.wait();
5475 handle.reset_tp_timeout();
5476 if (cont)
5477 cont = dstate->resume_clearing();
5478 if (!cont)
5479 return false;
5480 t = ObjectStore::Transaction();
5481 num = 0;
5482 }
5483 }
5484 if (num) {
5485 C_SaferCond waiter;
5486 store->queue_transaction(osr, std::move(t), &waiter);
5487 cont = dstate->pause_clearing();
5488 handle.suspend_tp_timeout();
5489 waiter.wait();
5490 handle.reset_tp_timeout();
5491 if (cont)
5492 cont = dstate->resume_clearing();
5493 }
5494 // whether there are more objects to remove in the collection
5495 *finished = next.is_max();
5496 return cont;
5497 }
5498
5499 void OSD::RemoveWQ::_process(
5500 pair<PGRef, DeletingStateRef> item,
5501 ThreadPool::TPHandle &handle)
5502 {
5503 FUNCTRACE();
5504 PGRef pg(item.first);
5505 SnapMapper &mapper = pg->snap_mapper;
5506 OSDriver &driver = pg->osdriver;
5507 coll_t coll = coll_t(pg->info.pgid);
5508 pg->osr->flush();
5509 bool finished = false;
5510
5511 if (!item.second->start_or_resume_clearing())
5512 return;
5513
5514 bool cont = remove_dir(
5515 pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
5516 &finished, handle);
5517 if (!cont)
5518 return;
5519 if (!finished) {
5520 if (item.second->pause_clearing())
5521 queue_front(item);
5522 return;
5523 }
5524
5525 if (!item.second->start_deleting())
5526 return;
5527
5528 ObjectStore::Transaction t;
5529 PGLog::clear_info_log(pg->info.pgid, &t);
5530
5531 if (cct->_conf->osd_inject_failure_on_pg_removal) {
5532 generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
5533 _exit(1);
5534 }
5535 t.remove_collection(coll);
5536
5537 // We need the sequencer to stick around until the op is complete
5538 store->queue_transaction(
5539 pg->osr.get(),
5540 std::move(t),
5541 0, // onapplied
5542 0, // oncommit
5543 0, // onreadable sync
5544 new ContainerContext<PGRef>(pg),
5545 TrackedOpRef());
5546
5547 item.second->finish_deleting();
5548 }
5549 // =========================================
5550
5551 void OSD::ms_handle_connect(Connection *con)
5552 {
5553 dout(10) << __func__ << " con " << con << dendl;
5554 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
5555 Mutex::Locker l(osd_lock);
5556 if (is_stopping())
5557 return;
5558 dout(10) << __func__ << " on mon" << dendl;
5559
5560 if (is_preboot()) {
5561 start_boot();
5562 } else if (is_booting()) {
5563 _send_boot(); // resend boot message
5564 } else {
5565 map_lock.get_read();
5566 Mutex::Locker l2(mon_report_lock);
5567
5568 utime_t now = ceph_clock_now();
5569 last_mon_report = now;
5570
5571 // resend everything, it's a new session
5572 send_full_update();
5573 send_alive();
5574 service.requeue_pg_temp();
5575 service.send_pg_temp();
5576 requeue_failures();
5577 send_failures();
5578 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5579 send_pg_stats(now);
5580 }
5581
5582 map_lock.put_read();
5583 if (is_active()) {
5584 send_beacon(ceph::coarse_mono_clock::now());
5585 }
5586 }
5587
5588 // full map requests may happen while active or pre-boot
5589 if (requested_full_first) {
5590 rerequest_full_maps();
5591 }
5592 }
5593 }
5594
5595 void OSD::ms_handle_fast_connect(Connection *con)
5596 {
5597 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5598 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5599 Session *s = static_cast<Session*>(con->get_priv());
5600 if (!s) {
5601 s = new Session(cct);
5602 con->set_priv(s->get());
5603 s->con = con;
5604 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5605 << " addr=" << s->con->get_peer_addr() << dendl;
5606 // we don't connect to clients
5607 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5608 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5609 }
5610 s->put();
5611 }
5612 }
5613
5614 void OSD::ms_handle_fast_accept(Connection *con)
5615 {
5616 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5617 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5618 Session *s = static_cast<Session*>(con->get_priv());
5619 if (!s) {
5620 s = new Session(cct);
5621 con->set_priv(s->get());
5622 s->con = con;
5623 dout(10) << "new session (incoming)" << s << " con=" << con
5624 << " addr=" << con->get_peer_addr()
5625 << " must have raced with connect" << dendl;
5626 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5627 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5628 }
5629 s->put();
5630 }
5631 }
5632
5633 bool OSD::ms_handle_reset(Connection *con)
5634 {
5635 Session *session = static_cast<Session*>(con->get_priv());
5636 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5637 if (!session)
5638 return false;
5639 session->wstate.reset(con);
5640 session->con.reset(NULL); // break con <-> session ref cycle
5641 // note that we break session->con *before* the session_handle_reset
5642 // cleanup below. this avoids a race between us and
5643 // PG::add_backoff, Session::check_backoff, etc.
5644 session_handle_reset(session);
5645 session->put();
5646 return true;
5647 }
5648
5649 bool OSD::ms_handle_refused(Connection *con)
5650 {
5651 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5652 return false;
5653
5654 Session *session = static_cast<Session*>(con->get_priv());
5655 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5656 if (!session)
5657 return false;
5658 int type = con->get_peer_type();
5659 // handle only OSD failures here
5660 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5661 OSDMapRef osdmap = get_osdmap();
5662 if (osdmap) {
5663 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5664 if (id >= 0 && osdmap->is_up(id)) {
5665 // I'm cheating mon heartbeat grace logic, because we know it's not going
5666 // to respawn alone. +1 so we won't hit any boundary case.
5667 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5668 osdmap->get_inst(id),
5669 cct->_conf->osd_heartbeat_grace + 1,
5670 osdmap->get_epoch(),
5671 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5672 ));
5673 }
5674 }
5675 }
5676 session->put();
5677 return true;
5678 }
5679
5680 struct C_OSD_GetVersion : public Context {
5681 OSD *osd;
5682 uint64_t oldest, newest;
5683 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5684 void finish(int r) override {
5685 if (r >= 0)
5686 osd->_got_mon_epochs(oldest, newest);
5687 }
5688 };
5689
5690 void OSD::start_boot()
5691 {
5692 if (!_is_healthy()) {
5693 // if we are not healthy, do not mark ourselves up (yet)
5694 dout(1) << "not healthy; waiting to boot" << dendl;
5695 if (!is_waiting_for_healthy())
5696 start_waiting_for_healthy();
5697 // send pings sooner rather than later
5698 heartbeat_kick();
5699 return;
5700 }
5701 dout(1) << __func__ << dendl;
5702 set_state(STATE_PREBOOT);
5703 waiting_for_luminous_mons = false;
5704 dout(10) << "start_boot - have maps " << superblock.oldest_map
5705 << ".." << superblock.newest_map << dendl;
5706 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5707 monc->get_version("osdmap", &c->newest, &c->oldest, c);
5708 }
5709
5710 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5711 {
5712 Mutex::Locker l(osd_lock);
5713 if (is_preboot()) {
5714 _preboot(oldest, newest);
5715 }
5716 }
5717
5718 void OSD::_preboot(epoch_t oldest, epoch_t newest)
5719 {
5720 assert(is_preboot());
5721 dout(10) << __func__ << " _preboot mon has osdmaps "
5722 << oldest << ".." << newest << dendl;
5723
5724 // ensure our local fullness awareness is accurate
5725 heartbeat();
5726
5727 // if our map within recent history, try to add ourselves to the osdmap.
5728 if (osdmap->get_epoch() == 0) {
5729 derr << "waiting for initial osdmap" << dendl;
5730 } else if (osdmap->is_destroyed(whoami)) {
5731 derr << "osdmap says I am destroyed, exiting" << dendl;
5732 exit(0);
5733 } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
5734 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5735 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5736 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5737 << dendl;
5738 } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
5739 derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
5740 << dendl;
5741 } else if (!monc->monmap.get_required_features().contains_all(
5742 ceph::features::mon::FEATURE_LUMINOUS)) {
5743 derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5744 << "Luminous or later before Luminous OSDs will boot" << dendl;
5745 waiting_for_luminous_mons = true;
5746 } else if (service.need_fullness_update()) {
5747 derr << "osdmap fullness state needs update" << dendl;
5748 send_full_update();
5749 } else if (osdmap->get_epoch() >= oldest - 1 &&
5750 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
5751 _send_boot();
5752 return;
5753 }
5754
5755 // get all the latest maps
5756 if (osdmap->get_epoch() + 1 >= oldest)
5757 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5758 else
5759 osdmap_subscribe(oldest - 1, true);
5760 }
5761
5762 void OSD::send_full_update()
5763 {
5764 if (!service.need_fullness_update())
5765 return;
5766 unsigned state = 0;
5767 if (service.is_full()) {
5768 state = CEPH_OSD_FULL;
5769 } else if (service.is_backfillfull()) {
5770 state = CEPH_OSD_BACKFILLFULL;
5771 } else if (service.is_nearfull()) {
5772 state = CEPH_OSD_NEARFULL;
5773 }
5774 set<string> s;
5775 OSDMap::calc_state_set(state, s);
5776 dout(10) << __func__ << " want state " << s << dendl;
5777 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
5778 }
5779
5780 void OSD::start_waiting_for_healthy()
5781 {
5782 dout(1) << "start_waiting_for_healthy" << dendl;
5783 set_state(STATE_WAITING_FOR_HEALTHY);
5784 last_heartbeat_resample = utime_t();
5785 }
5786
5787 bool OSD::_is_healthy()
5788 {
5789 if (!cct->get_heartbeat_map()->is_healthy()) {
5790 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
5791 return false;
5792 }
5793
5794 if (is_waiting_for_healthy()) {
5795 Mutex::Locker l(heartbeat_lock);
5796 utime_t cutoff = ceph_clock_now();
5797 cutoff -= cct->_conf->osd_heartbeat_grace;
5798 int num = 0, up = 0;
5799 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5800 p != heartbeat_peers.end();
5801 ++p) {
5802 if (p->second.is_healthy(cutoff))
5803 ++up;
5804 ++num;
5805 }
5806 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
5807 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
5808 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
5809 return false;
5810 }
5811 }
5812
5813 return true;
5814 }
5815
5816 void OSD::_send_boot()
5817 {
5818 dout(10) << "_send_boot" << dendl;
5819 entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
5820 Connection *local_connection = cluster_messenger->get_loopback_connection().get();
5821 if (cluster_addr.is_blank_ip()) {
5822 int port = cluster_addr.get_port();
5823 cluster_addr = client_messenger->get_myaddr();
5824 cluster_addr.set_port(port);
5825 cluster_messenger->set_addr_unknowns(cluster_addr);
5826 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
5827 } else {
5828 Session *s = static_cast<Session*>(local_connection->get_priv());
5829 if (s)
5830 s->put();
5831 else
5832 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
5833 }
5834
5835 entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
5836 local_connection = hb_back_server_messenger->get_loopback_connection().get();
5837 if (hb_back_addr.is_blank_ip()) {
5838 int port = hb_back_addr.get_port();
5839 hb_back_addr = cluster_addr;
5840 hb_back_addr.set_port(port);
5841 hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
5842 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
5843 } else {
5844 Session *s = static_cast<Session*>(local_connection->get_priv());
5845 if (s)
5846 s->put();
5847 else
5848 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5849 }
5850
5851 entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
5852 local_connection = hb_front_server_messenger->get_loopback_connection().get();
5853 if (hb_front_addr.is_blank_ip()) {
5854 int port = hb_front_addr.get_port();
5855 hb_front_addr = client_messenger->get_myaddr();
5856 hb_front_addr.set_port(port);
5857 hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
5858 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
5859 } else {
5860 Session *s = static_cast<Session*>(local_connection->get_priv());
5861 if (s)
5862 s->put();
5863 else
5864 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5865 }
5866
5867 MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
5868 hb_back_addr, hb_front_addr, cluster_addr,
5869 CEPH_FEATURES_ALL);
5870 dout(10) << " client_addr " << client_messenger->get_myaddr()
5871 << ", cluster_addr " << cluster_addr
5872 << ", hb_back_addr " << hb_back_addr
5873 << ", hb_front_addr " << hb_front_addr
5874 << dendl;
5875 _collect_metadata(&mboot->metadata);
5876 monc->send_mon_message(mboot);
5877 set_state(STATE_BOOTING);
5878 }
5879
5880 void OSD::_collect_metadata(map<string,string> *pm)
5881 {
5882 // config info
5883 (*pm)["osd_data"] = dev_path;
5884 if (store->get_type() == "filestore") {
5885 // not applicable for bluestore
5886 (*pm)["osd_journal"] = journal_path;
5887 }
5888 (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
5889 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
5890 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
5891 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());
5892
5893 // backend
5894 (*pm)["osd_objectstore"] = store->get_type();
5895 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
5896 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
5897 (*pm)["default_device_class"] = store->get_default_device_class();
5898 store->collect_metadata(pm);
5899
5900 collect_sys_info(pm, cct);
5901
5902 dout(10) << __func__ << " " << *pm << dendl;
5903 }
5904
5905 void OSD::queue_want_up_thru(epoch_t want)
5906 {
5907 map_lock.get_read();
5908 epoch_t cur = osdmap->get_up_thru(whoami);
5909 Mutex::Locker l(mon_report_lock);
5910 if (want > up_thru_wanted) {
5911 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
5912 << ", currently " << cur
5913 << dendl;
5914 up_thru_wanted = want;
5915 send_alive();
5916 } else {
5917 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
5918 << ", currently " << cur
5919 << dendl;
5920 }
5921 map_lock.put_read();
5922 }
5923
5924 void OSD::send_alive()
5925 {
5926 assert(mon_report_lock.is_locked());
5927 if (!osdmap->exists(whoami))
5928 return;
5929 epoch_t up_thru = osdmap->get_up_thru(whoami);
5930 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
5931 if (up_thru_wanted > up_thru) {
5932 dout(10) << "send_alive want " << up_thru_wanted << dendl;
5933 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
5934 }
5935 }
5936
5937 void OSD::request_full_map(epoch_t first, epoch_t last)
5938 {
5939 dout(10) << __func__ << " " << first << ".." << last
5940 << ", previously requested "
5941 << requested_full_first << ".." << requested_full_last << dendl;
5942 assert(osd_lock.is_locked());
5943 assert(first > 0 && last > 0);
5944 assert(first <= last);
5945 assert(first >= requested_full_first); // we shouldn't ever ask for older maps
5946 if (requested_full_first == 0) {
5947 // first request
5948 requested_full_first = first;
5949 requested_full_last = last;
5950 } else if (last <= requested_full_last) {
5951 // dup
5952 return;
5953 } else {
5954 // additional request
5955 first = requested_full_last + 1;
5956 requested_full_last = last;
5957 }
5958 MMonGetOSDMap *req = new MMonGetOSDMap;
5959 req->request_full(first, last);
5960 monc->send_mon_message(req);
5961 }
5962
5963 void OSD::got_full_map(epoch_t e)
5964 {
5965 assert(requested_full_first <= requested_full_last);
5966 assert(osd_lock.is_locked());
5967 if (requested_full_first == 0) {
5968 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
5969 return;
5970 }
5971 if (e < requested_full_first) {
5972 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5973 << ".." << requested_full_last
5974 << ", ignoring" << dendl;
5975 return;
5976 }
5977 if (e >= requested_full_last) {
5978 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5979 << ".." << requested_full_last << ", resetting" << dendl;
5980 requested_full_first = requested_full_last = 0;
5981 return;
5982 }
5983
5984 requested_full_first = e + 1;
5985
5986 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5987 << ".." << requested_full_last
5988 << ", still need more" << dendl;
5989 }
5990
5991 void OSD::requeue_failures()
5992 {
5993 Mutex::Locker l(heartbeat_lock);
5994 unsigned old_queue = failure_queue.size();
5995 unsigned old_pending = failure_pending.size();
5996 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
5997 failure_pending.begin();
5998 p != failure_pending.end(); ) {
5999 failure_queue[p->first] = p->second.first;
6000 failure_pending.erase(p++);
6001 }
6002 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6003 << failure_queue.size() << dendl;
6004 }
6005
6006 void OSD::send_failures()
6007 {
6008 assert(map_lock.is_locked());
6009 assert(mon_report_lock.is_locked());
6010 Mutex::Locker l(heartbeat_lock);
6011 utime_t now = ceph_clock_now();
6012 while (!failure_queue.empty()) {
6013 int osd = failure_queue.begin()->first;
6014 if (!failure_pending.count(osd)) {
6015 entity_inst_t i = osdmap->get_inst(osd);
6016 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6017 monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
6018 osdmap->get_epoch()));
6019 failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
6020 }
6021 failure_queue.erase(osd);
6022 }
6023 }
6024
6025 void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
6026 {
6027 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
6028 monc->send_mon_message(m);
6029 }
6030
6031 void OSD::send_pg_stats(const utime_t &now)
6032 {
6033 assert(map_lock.is_locked());
6034 assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
6035 dout(20) << "send_pg_stats" << dendl;
6036
6037 osd_stat_t cur_stat = service.get_osd_stat();
6038
6039 cur_stat.os_perf_stat = store->get_cur_stats();
6040
6041 pg_stat_queue_lock.Lock();
6042
6043 if (osd_stat_updated || !pg_stat_queue.empty()) {
6044 last_pg_stats_sent = now;
6045 osd_stat_updated = false;
6046
6047 dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;
6048
6049 utime_t had_for(now);
6050 had_for -= had_map_since;
6051
6052 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
6053
6054 uint64_t tid = ++pg_stat_tid;
6055 m->set_tid(tid);
6056 m->osd_stat = cur_stat;
6057
6058 xlist<PG*>::iterator p = pg_stat_queue.begin();
6059 while (!p.end()) {
6060 PG *pg = *p;
6061 ++p;
6062 if (!pg->is_primary()) { // we hold map_lock; role is stable.
6063 pg->stat_queue_item.remove_myself();
6064 pg->put("pg_stat_queue");
6065 continue;
6066 }
6067 pg->pg_stats_publish_lock.Lock();
6068 if (pg->pg_stats_publish_valid) {
6069 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
6070 dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6071 << pg->pg_stats_publish.reported_seq << dendl;
6072 } else {
6073 dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6074 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
6075 }
6076 pg->pg_stats_publish_lock.Unlock();
6077 }
6078
6079 if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
6080 last_pg_stats_ack = ceph_clock_now();
6081 }
6082 outstanding_pg_stats.insert(tid);
6083 dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;
6084
6085 monc->send_mon_message(m);
6086 }
6087
6088 pg_stat_queue_lock.Unlock();
6089 }
6090
6091 void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
6092 {
6093 dout(10) << "handle_pg_stats_ack " << dendl;
6094
6095 if (!require_mon_peer(ack)) {
6096 ack->put();
6097 return;
6098 }
6099
6100 // NOTE: we may get replies from a previous mon even while
6101 // outstanding_pg_stats is empty if reconnecting races with replies
6102 // in flight.
6103
6104 pg_stat_queue_lock.Lock();
6105
6106 last_pg_stats_ack = ceph_clock_now();
6107
6108 // decay timeout slowly (analogous to TCP)
6109 stats_ack_timeout =
6110 MAX(cct->_conf->osd_mon_ack_timeout,
6111 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
6112 dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
6113
6114 if (ack->get_tid() > pg_stat_tid_flushed) {
6115 pg_stat_tid_flushed = ack->get_tid();
6116 pg_stat_queue_cond.Signal();
6117 }
6118
6119 xlist<PG*>::iterator p = pg_stat_queue.begin();
6120 while (!p.end()) {
6121 PG *pg = *p;
6122 PGRef _pg(pg);
6123 ++p;
6124
6125 auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
6126 if (acked != ack->pg_stat.end()) {
6127 pg->pg_stats_publish_lock.Lock();
6128 if (acked->second.first == pg->pg_stats_publish.reported_seq &&
6129 acked->second.second == pg->pg_stats_publish.reported_epoch) {
6130 dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6131 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6132 pg->stat_queue_item.remove_myself();
6133 pg->put("pg_stat_queue");
6134 } else {
6135 dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6136 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
6137 << acked->second << dendl;
6138 }
6139 pg->pg_stats_publish_lock.Unlock();
6140 } else {
6141 dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6142 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6143 }
6144 }
6145
6146 outstanding_pg_stats.erase(ack->get_tid());
6147 dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;
6148
6149 pg_stat_queue_lock.Unlock();
6150
6151 ack->put();
6152 }
6153
6154 void OSD::flush_pg_stats()
6155 {
6156 dout(10) << "flush_pg_stats" << dendl;
6157 osd_lock.Unlock();
6158 utime_t now = ceph_clock_now();
6159 map_lock.get_read();
6160 mon_report_lock.Lock();
6161 send_pg_stats(now);
6162 mon_report_lock.Unlock();
6163 map_lock.put_read();
6164
6165
6166 pg_stat_queue_lock.Lock();
6167 uint64_t tid = pg_stat_tid;
6168 dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
6169 while (tid > pg_stat_tid_flushed)
6170 pg_stat_queue_cond.Wait(pg_stat_queue_lock);
6171 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
6172 pg_stat_queue_lock.Unlock();
6173
6174 osd_lock.Lock();
6175 }
6176
6177 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6178 {
6179 const auto& monmap = monc->monmap;
6180 // send beacon to mon even if we are just connected, and the monmap is not
6181 // initialized yet by then.
6182 if (monmap.epoch > 0 &&
6183 monmap.get_required_features().contains_all(
6184 ceph::features::mon::FEATURE_LUMINOUS)) {
6185 dout(20) << __func__ << " sending" << dendl;
6186 MOSDBeacon* beacon = nullptr;
6187 {
6188 Mutex::Locker l{min_last_epoch_clean_lock};
6189 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6190 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
6191 last_sent_beacon = now;
6192 }
6193 monc->send_mon_message(beacon);
6194 } else {
6195 dout(20) << __func__ << " not sending" << dendl;
6196 }
6197 }
6198
6199 void OSD::handle_command(MMonCommand *m)
6200 {
6201 if (!require_mon_peer(m)) {
6202 m->put();
6203 return;
6204 }
6205
6206 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6207 command_wq.queue(c);
6208 m->put();
6209 }
6210
6211 void OSD::handle_command(MCommand *m)
6212 {
6213 ConnectionRef con = m->get_connection();
6214 Session *session = static_cast<Session *>(con->get_priv());
6215 if (!session) {
6216 con->send_message(new MCommandReply(m, -EPERM));
6217 m->put();
6218 return;
6219 }
6220
6221 OSDCap& caps = session->caps;
6222 session->put();
6223
6224 if (!caps.allow_all() || m->get_source().is_mon()) {
6225 con->send_message(new MCommandReply(m, -EPERM));
6226 m->put();
6227 return;
6228 }
6229
6230 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6231 command_wq.queue(c);
6232
6233 m->put();
6234 }
6235
/**
 * One entry of the command table this OSD advertises through
 * "get_command_descriptions".
 *
 * cmdstring is the command signature: a literal prefix followed by
 * space-separated argument descriptors, each a comma-separated list of
 * key=value pairs.  Because ',' separates the pairs, the alternatives of a
 * CephChoices argument are separated with '|'.
 */
struct OSDCommand {
  std::string cmdstring;     // signature (prefix + argument descriptors)
  std::string helpstring;    // one-line help text
  std::string module;        // "osd" for every entry below
  std::string perm;          // "r" or "rw"
  std::string availability;  // "cli" or "cli,rest"
} osd_commands[] = {

#define COMMAND(parsesig, helptext, module, perm, availability) \
  {parsesig, helptext, module, perm, availability},

// yes, these are really pg commands, but there's a limit to how
// much work it's worth.  The OSD returns all of them.  Make this
// form (pg <pgid> <cmd>) valid only for the cli.
// Rest uses "tell <pgid> <cmd>"

COMMAND("pg "
	"name=pgid,type=CephPgid "
	"name=cmd,type=CephChoices,strings=query",
	"show details of a specific pg", "osd", "r", "cli")
COMMAND("pg "
	"name=pgid,type=CephPgid "
	"name=cmd,type=CephChoices,strings=mark_unfound_lost "
	"name=mulcmd,type=CephChoices,strings=revert|delete",
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw", "cli")
COMMAND("pg "
	"name=pgid,type=CephPgid "
	"name=cmd,type=CephChoices,strings=list_missing "
	"name=offset,type=CephString,req=false",
	"list missing objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r", "cli")

// new form: tell <pgid> <cmd> for both cli and rest

COMMAND("query",
	"show details of a specific pg", "osd", "r", "cli,rest")
COMMAND("mark_unfound_lost "
	"name=mulcmd,type=CephChoices,strings=revert|delete",
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw", "cli,rest")
COMMAND("list_missing "
	"name=offset,type=CephString,req=false",
	"list missing objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r", "cli,rest")
COMMAND("perf histogram dump "
	"name=logger,type=CephString,req=false "
	"name=counter,type=CephString,req=false",
	"Get histogram data",
	"osd", "r", "cli,rest")

// tell <osd.n> commands.  Validation of osd.n must be special-cased in client
COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
COMMAND("injectargs "
	"name=injected_args,type=CephString,n=N",
	"inject configuration arguments into running OSD",
	"osd", "rw", "cli,rest")
COMMAND("config set "
	"name=key,type=CephString name=value,type=CephString",
	"Set a configuration option at runtime (not persistent)",
	"osd", "rw", "cli,rest")
// The level choices are '|'-separated like every other CephChoices list in
// this table; with commas only "error" would be read as a choice.
COMMAND("cluster_log "
	"name=level,type=CephChoices,strings=error|warning|info|debug "
	"name=message,type=CephString,n=N",
	"log a message to the cluster log",
	"osd", "rw", "cli,rest")
COMMAND("bench "
	"name=count,type=CephInt,req=false "
	"name=size,type=CephInt,req=false "
	"name=object_size,type=CephInt,req=false "
	"name=object_num,type=CephInt,req=false ",
	"OSD benchmark: write <count> <size>-byte objects, "
	"(default 1G size 4MB). Results in log.",
	"osd", "rw", "cli,rest")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
COMMAND("heap "
	"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats",
	"show heap usage info (available only if compiled with tcmalloc)",
	"osd", "rw", "cli,rest")
COMMAND("debug dump_missing "
	"name=filename,type=CephFilepath",
	"dump missing objects to a named file", "osd", "r", "cli,rest")
COMMAND("debug kick_recovery_wq "
	"name=delay,type=CephInt,range=0",
	"set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
COMMAND("cpu_profiler "
	"name=arg,type=CephChoices,strings=status|flush",
	"run cpu profiling on daemon", "osd", "rw", "cli,rest")
COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
	"osd", "r", "cli,rest")
COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
	"osd", "rw", "cli,rest")
COMMAND("compact",
	"compact object store's omap. "
	"WARNING: Compaction probably slows your requests",
	"osd", "rw", "cli,rest")
};
6334
6335 void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6336 {
6337 int r = 0;
6338 stringstream ss, ds;
6339 string rs;
6340 bufferlist odata;
6341
6342 dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6343
6344 map<string, cmd_vartype> cmdmap;
6345 string prefix;
6346 string format;
6347 string pgidstr;
6348 boost::scoped_ptr<Formatter> f;
6349
6350 if (cmd.empty()) {
6351 ss << "no command given";
6352 goto out;
6353 }
6354
6355 if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6356 r = -EINVAL;
6357 goto out;
6358 }
6359
6360 cmd_getval(cct, cmdmap, "prefix", prefix);
6361
6362 if (prefix == "get_command_descriptions") {
6363 int cmdnum = 0;
6364 JSONFormatter *f = new JSONFormatter();
6365 f->open_object_section("command_descriptions");
6366 for (OSDCommand *cp = osd_commands;
6367 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {
6368
6369 ostringstream secname;
6370 secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
6371 dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
6372 cp->module, cp->perm, cp->availability, 0);
6373 cmdnum++;
6374 }
6375 f->close_section(); // command_descriptions
6376
6377 f->flush(ds);
6378 delete f;
6379 goto out;
6380 }
6381
6382 cmd_getval(cct, cmdmap, "format", format);
6383 f.reset(Formatter::create(format));
6384
6385 if (prefix == "version") {
6386 if (f) {
6387 f->open_object_section("version");
6388 f->dump_string("version", pretty_version_to_str());
6389 f->close_section();
6390 f->flush(ds);
6391 } else {
6392 ds << pretty_version_to_str();
6393 }
6394 goto out;
6395 }
6396 else if (prefix == "injectargs") {
6397 vector<string> argsvec;
6398 cmd_getval(cct, cmdmap, "injected_args", argsvec);
6399
6400 if (argsvec.empty()) {
6401 r = -EINVAL;
6402 ss << "ignoring empty injectargs";
6403 goto out;
6404 }
6405 string args = argsvec.front();
6406 for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
6407 args += " " + *a;
6408 osd_lock.Unlock();
6409 r = cct->_conf->injectargs(args, &ss);
6410 osd_lock.Lock();
6411 }
6412 else if (prefix == "config set") {
6413 std::string key;
6414 std::string val;
6415 cmd_getval(cct, cmdmap, "key", key);
6416 cmd_getval(cct, cmdmap, "value", val);
6417 osd_lock.Unlock();
6418 r = cct->_conf->set_val(key, val, true, &ss);
6419 if (r == 0) {
6420 cct->_conf->apply_changes(nullptr);
6421 }
6422 osd_lock.Lock();
6423 }
6424 else if (prefix == "cluster_log") {
6425 vector<string> msg;
6426 cmd_getval(cct, cmdmap, "message", msg);
6427 if (msg.empty()) {
6428 r = -EINVAL;
6429 ss << "ignoring empty log message";
6430 goto out;
6431 }
6432 string message = msg.front();
6433 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
6434 message += " " + *a;
6435 string lvl;
6436 cmd_getval(cct, cmdmap, "level", lvl);
6437 clog_type level = string_to_clog_type(lvl);
6438 if (level < 0) {
6439 r = -EINVAL;
6440 ss << "unknown level '" << lvl << "'";
6441 goto out;
6442 }
6443 clog->do_log(level, message);
6444 }
6445
6446 // either 'pg <pgid> <command>' or
6447 // 'tell <pgid>' (which comes in without any of that prefix)?
6448
6449 else if (prefix == "pg" ||
6450 prefix == "query" ||
6451 prefix == "mark_unfound_lost" ||
6452 prefix == "list_missing"
6453 ) {
6454 pg_t pgid;
6455
6456 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
6457 ss << "no pgid specified";
6458 r = -EINVAL;
6459 } else if (!pgid.parse(pgidstr.c_str())) {
6460 ss << "couldn't parse pgid '" << pgidstr << "'";
6461 r = -EINVAL;
6462 } else {
6463 spg_t pcand;
6464 PG *pg = nullptr;
6465 if (osdmap->get_primary_shard(pgid, &pcand) &&
6466 (pg = _lookup_lock_pg(pcand))) {
6467 if (pg->is_primary()) {
6468 // simulate pg <pgid> cmd= for pg->do-command
6469 if (prefix != "pg")
6470 cmd_putval(cct, cmdmap, "cmd", prefix);
6471 r = pg->do_command(cmdmap, ss, data, odata, con, tid);
6472 if (r == -EAGAIN) {
6473 pg->unlock();
6474 // don't reply, pg will do so async
6475 return;
6476 }
6477 } else {
6478 ss << "not primary for pgid " << pgid;
6479
6480 // send them the latest diff to ensure they realize the mapping
6481 // has changed.
6482 service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);
6483
6484 // do not reply; they will get newer maps and realize they
6485 // need to resend.
6486 pg->unlock();
6487 return;
6488 }
6489 pg->unlock();
6490 } else {
6491 ss << "i don't have pgid " << pgid;
6492 r = -ENOENT;
6493 }
6494 }
6495 }
6496
6497 else if (prefix == "bench") {
6498 int64_t count;
6499 int64_t bsize;
6500 int64_t osize, onum;
6501 // default count 1G, size 4MB
6502 cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
6503 cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
6504 cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
6505 cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
6506
6507 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
6508 ObjectStore::Sequencer>("bench"));
6509
6510 uint32_t duration = cct->_conf->osd_bench_duration;
6511
6512 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
6513 // let us limit the block size because the next checks rely on it
6514 // having a sane value. If we allow any block size to be set things
6515 // can still go sideways.
6516 ss << "block 'size' values are capped at "
6517 << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
6518 << " a higher value, please adjust 'osd_bench_max_block_size'";
6519 r = -EINVAL;
6520 goto out;
6521 } else if (bsize < (int64_t) (1 << 20)) {
6522 // entering the realm of small block sizes.
6523 // limit the count to a sane value, assuming a configurable amount of
6524 // IOPS and duration, so that the OSD doesn't get hung up on this,
6525 // preventing timeouts from going off
6526 int64_t max_count =
6527 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
6528 if (count > max_count) {
6529 ss << "'count' values greater than " << max_count
6530 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6531 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
6532 << " for " << duration << " seconds,"
6533 << " can cause ill effects on osd. "
6534 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6535 << " value if you wish to use a higher 'count'.";
6536 r = -EINVAL;
6537 goto out;
6538 }
6539 } else {
6540 // 1MB block sizes are big enough so that we get more stuff done.
6541 // However, to avoid the osd from getting hung on this and having
6542 // timers being triggered, we are going to limit the count assuming
6543 // a configurable throughput and duration.
6544 // NOTE: max_count is the total amount of bytes that we believe we
6545 // will be able to write during 'duration' for the given
6546 // throughput. The block size hardly impacts this unless it's
6547 // way too big. Given we already check how big the block size
6548 // is, it's safe to assume everything will check out.
6549 int64_t max_count =
6550 cct->_conf->osd_bench_large_size_max_throughput * duration;
6551 if (count > max_count) {
6552 ss << "'count' values greater than " << max_count
6553 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6554 << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
6555 << " for " << duration << " seconds,"
6556 << " can cause ill effects on osd. "
6557 << " Please adjust 'osd_bench_large_size_max_throughput'"
6558 << " with a higher value if you wish to use a higher 'count'.";
6559 r = -EINVAL;
6560 goto out;
6561 }
6562 }
6563
6564 if (osize && bsize > osize)
6565 bsize = osize;
6566
6567 dout(1) << " bench count " << count
6568 << " bsize " << prettybyte_t(bsize) << dendl;
6569
6570 ObjectStore::Transaction cleanupt;
6571
6572 if (osize && onum) {
6573 bufferlist bl;
6574 bufferptr bp(osize);
6575 bp.zero();
6576 bl.push_back(std::move(bp));
6577 bl.rebuild_page_aligned();
6578 for (int i=0; i<onum; ++i) {
6579 char nm[30];
6580 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
6581 object_t oid(nm);
6582 hobject_t soid(sobject_t(oid, 0));
6583 ObjectStore::Transaction t;
6584 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
6585 store->queue_transaction(osr.get(), std::move(t), NULL);
6586 cleanupt.remove(coll_t(), ghobject_t(soid));
6587 }
6588 }
6589
6590 bufferlist bl;
6591 bufferptr bp(bsize);
6592 bp.zero();
6593 bl.push_back(std::move(bp));
6594 bl.rebuild_page_aligned();
6595
6596 {
6597 C_SaferCond waiter;
6598 if (!osr->flush_commit(&waiter)) {
6599 waiter.wait();
6600 }
6601 }
6602
6603 utime_t start = ceph_clock_now();
6604 for (int64_t pos = 0; pos < count; pos += bsize) {
6605 char nm[30];
6606 unsigned offset = 0;
6607 if (onum && osize) {
6608 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
6609 offset = rand() % (osize / bsize) * bsize;
6610 } else {
6611 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
6612 }
6613 object_t oid(nm);
6614 hobject_t soid(sobject_t(oid, 0));
6615 ObjectStore::Transaction t;
6616 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
6617 store->queue_transaction(osr.get(), std::move(t), NULL);
6618 if (!onum || !osize)
6619 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
6620 }
6621
6622 {
6623 C_SaferCond waiter;
6624 if (!osr->flush_commit(&waiter)) {
6625 waiter.wait();
6626 }
6627 }
6628 utime_t end = ceph_clock_now();
6629
6630 // clean up
6631 store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
6632 {
6633 C_SaferCond waiter;
6634 if (!osr->flush_commit(&waiter)) {
6635 waiter.wait();
6636 }
6637 }
6638
6639 uint64_t rate = (double)count / (end - start);
6640 if (f) {
6641 f->open_object_section("osd_bench_results");
6642 f->dump_int("bytes_written", count);
6643 f->dump_int("blocksize", bsize);
6644 f->dump_unsigned("bytes_per_sec", rate);
6645 f->close_section();
6646 f->flush(ss);
6647 } else {
6648 ss << "bench: wrote " << prettybyte_t(count)
6649 << " in blocks of " << prettybyte_t(bsize) << " in "
6650 << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
6651 }
6652 }
6653
6654 else if (prefix == "flush_pg_stats") {
6655 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6656 mgrc.send_pgstats();
6657 ds << service.get_osd_stat_seq() << "\n";
6658 } else {
6659 flush_pg_stats();
6660 }
6661 }
6662
6663 else if (prefix == "heap") {
6664 r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
6665 }
6666
6667 else if (prefix == "debug dump_missing") {
6668 string file_name;
6669 cmd_getval(cct, cmdmap, "filename", file_name);
6670 std::ofstream fout(file_name.c_str());
6671 if (!fout.is_open()) {
6672 ss << "failed to open file '" << file_name << "'";
6673 r = -EINVAL;
6674 goto out;
6675 }
6676
6677 fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
6678 RWLock::RLocker l(pg_map_lock);
6679 for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
6680 pg_map_e != pg_map.end(); ++pg_map_e) {
6681 PG *pg = pg_map_e->second;
6682 pg->lock();
6683
6684 fout << *pg << std::endl;
6685 std::map<hobject_t, pg_missing_item>::const_iterator mend =
6686 pg->pg_log.get_missing().get_items().end();
6687 std::map<hobject_t, pg_missing_item>::const_iterator mi =
6688 pg->pg_log.get_missing().get_items().begin();
6689 for (; mi != mend; ++mi) {
6690 fout << mi->first << " -> " << mi->second << std::endl;
6691 if (!pg->missing_loc.needs_recovery(mi->first))
6692 continue;
6693 if (pg->missing_loc.is_unfound(mi->first))
6694 fout << " unfound ";
6695 const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
6696 if (mls.empty())
6697 continue;
6698 fout << "missing_loc: " << mls << std::endl;
6699 }
6700 pg->unlock();
6701 fout << std::endl;
6702 }
6703
6704 fout.close();
6705 }
6706 else if (prefix == "debug kick_recovery_wq") {
6707 int64_t delay;
6708 cmd_getval(cct, cmdmap, "delay", delay);
6709 ostringstream oss;
6710 oss << delay;
6711 r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
6712 if (r != 0) {
6713 ss << "kick_recovery_wq: error setting "
6714 << "osd_recovery_delay_start to '" << delay << "': error "
6715 << r;
6716 goto out;
6717 }
6718 cct->_conf->apply_changes(NULL);
6719 ss << "kicking recovery queue. set osd_recovery_delay_start "
6720 << "to " << cct->_conf->osd_recovery_delay_start;
6721 }
6722
6723 else if (prefix == "cpu_profiler") {
6724 string arg;
6725 cmd_getval(cct, cmdmap, "arg", arg);
6726 vector<string> argvec;
6727 get_str_vec(arg, argvec);
6728 cpu_profiler_handle_command(argvec, ds);
6729 }
6730
6731 else if (prefix == "dump_pg_recovery_stats") {
6732 stringstream s;
6733 if (f) {
6734 pg_recovery_stats.dump_formatted(f.get());
6735 f->flush(ds);
6736 } else {
6737 pg_recovery_stats.dump(s);
6738 ds << "dump pg recovery stats: " << s.str();
6739 }
6740 }
6741
6742 else if (prefix == "reset_pg_recovery_stats") {
6743 ss << "reset pg recovery stats";
6744 pg_recovery_stats.reset();
6745 }
6746
6747 else if (prefix == "perf histogram dump") {
6748 std::string logger;
6749 std::string counter;
6750 cmd_getval(cct, cmdmap, "logger", logger);
6751 cmd_getval(cct, cmdmap, "counter", counter);
6752 if (f) {
6753 cct->get_perfcounters_collection()->dump_formatted_histograms(
6754 f.get(), false, logger, counter);
6755 f->flush(ds);
6756 }
6757 }
6758
6759 else if (prefix == "compact") {
6760 dout(1) << "triggering manual compaction" << dendl;
6761 auto start = ceph::coarse_mono_clock::now();
6762 store->compact();
6763 auto end = ceph::coarse_mono_clock::now();
6764 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
6765 dout(1) << "finished manual compaction in "
6766 << time_span.count()
6767 << " seconds" << dendl;
6768 ss << "compacted omap in " << time_span.count() << " seconds";
6769 }
6770
6771 else {
6772 ss << "unrecognized command! " << cmd;
6773 r = -EINVAL;
6774 }
6775
6776 out:
6777 rs = ss.str();
6778 odata.append(ds);
6779 dout(0) << "do_command r=" << r << " " << rs << dendl;
6780 clog->info() << rs;
6781 if (con) {
6782 MCommandReply *reply = new MCommandReply(r, rs);
6783 reply->set_tid(tid);
6784 reply->set_data(odata);
6785 con->send_message(reply);
6786 }
6787 }
6788
6789 bool OSD::heartbeat_dispatch(Message *m)
6790 {
6791 dout(30) << "heartbeat_dispatch " << m << dendl;
6792 switch (m->get_type()) {
6793
6794 case CEPH_MSG_PING:
6795 dout(10) << "ping from " << m->get_source_inst() << dendl;
6796 m->put();
6797 break;
6798
6799 case MSG_OSD_PING:
6800 handle_osd_ping(static_cast<MOSDPing*>(m));
6801 break;
6802
6803 default:
6804 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6805 m->put();
6806 }
6807
6808 return true;
6809 }
6810
6811 bool OSD::ms_dispatch(Message *m)
6812 {
6813 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6814 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6815 service.got_stop_ack();
6816 m->put();
6817 return true;
6818 }
6819
6820 // lock!
6821
6822 osd_lock.Lock();
6823 if (is_stopping()) {
6824 osd_lock.Unlock();
6825 m->put();
6826 return true;
6827 }
6828
6829 do_waiters();
6830 _dispatch(m);
6831
6832 osd_lock.Unlock();
6833
6834 return true;
6835 }
6836
6837 void OSD::maybe_share_map(
6838 Session *session,
6839 OpRequestRef op,
6840 OSDMapRef osdmap)
6841 {
6842 if (!op->check_send_map) {
6843 return;
6844 }
6845 epoch_t last_sent_epoch = 0;
6846
6847 session->sent_epoch_lock.lock();
6848 last_sent_epoch = session->last_sent_epoch;
6849 session->sent_epoch_lock.unlock();
6850
6851 const Message *m = op->get_req();
6852 service.share_map(
6853 m->get_source(),
6854 m->get_connection().get(),
6855 op->sent_epoch,
6856 osdmap,
6857 session ? &last_sent_epoch : NULL);
6858
6859 session->sent_epoch_lock.lock();
6860 if (session->last_sent_epoch < last_sent_epoch) {
6861 session->last_sent_epoch = last_sent_epoch;
6862 }
6863 session->sent_epoch_lock.unlock();
6864
6865 op->check_send_map = false;
6866 }
6867
6868 void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
6869 {
6870 assert(session->session_dispatch_lock.is_locked());
6871
6872 auto i = session->waiting_on_map.begin();
6873 while (i != session->waiting_on_map.end()) {
6874 OpRequestRef op = &(*i);
6875 assert(ms_can_fast_dispatch(op->get_req()));
6876 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
6877 op->get_req());
6878 if (m->get_min_epoch() > osdmap->get_epoch()) {
6879 break;
6880 }
6881 session->waiting_on_map.erase(i++);
6882 op->put();
6883
6884 spg_t pgid;
6885 if (m->get_type() == CEPH_MSG_OSD_OP) {
6886 pg_t actual_pgid = osdmap->raw_pg_to_pg(
6887 static_cast<const MOSDOp*>(m)->get_pg());
6888 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
6889 continue;
6890 }
6891 } else {
6892 pgid = m->get_spg();
6893 }
6894 enqueue_op(pgid, op, m->get_map_epoch());
6895 }
6896
6897 if (session->waiting_on_map.empty()) {
6898 clear_session_waiting_on_map(session);
6899 } else {
6900 register_session_waiting_on_map(session);
6901 }
6902 }
6903
6904 void OSD::ms_fast_dispatch(Message *m)
6905 {
6906 FUNCTRACE();
6907 if (service.is_stopping()) {
6908 m->put();
6909 return;
6910 }
6911 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
6912 {
6913 #ifdef WITH_LTTNG
6914 osd_reqid_t reqid = op->get_reqid();
6915 #endif
6916 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
6917 reqid.name._num, reqid.tid, reqid.inc);
6918 }
6919
6920 if (m->trace)
6921 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
6922
6923 // note sender epoch, min req'd epoch
6924 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
6925 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
6926 assert(op->min_epoch <= op->sent_epoch); // sanity check!
6927
6928 service.maybe_inject_dispatch_delay();
6929
6930 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
6931 m->get_type() != CEPH_MSG_OSD_OP) {
6932 // queue it directly
6933 enqueue_op(
6934 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
6935 op,
6936 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
6937 } else {
6938 // legacy client, and this is an MOSDOp (the *only* fast dispatch
6939 // message that didn't have an explicit spg_t); we need to map
6940 // them to an spg_t while preserving delivery order.
6941 Session *session = static_cast<Session*>(m->get_connection()->get_priv());
6942 if (session) {
6943 {
6944 Mutex::Locker l(session->session_dispatch_lock);
6945 op->get();
6946 session->waiting_on_map.push_back(*op);
6947 OSDMapRef nextmap = service.get_nextmap_reserved();
6948 dispatch_session_waiting(session, nextmap);
6949 service.release_map(nextmap);
6950 }
6951 session->put();
6952 }
6953 }
6954 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
6955 }
6956
6957 void OSD::ms_fast_preprocess(Message *m)
6958 {
6959 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
6960 if (m->get_type() == CEPH_MSG_OSD_MAP) {
6961 MOSDMap *mm = static_cast<MOSDMap*>(m);
6962 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
6963 if (s) {
6964 s->received_map_lock.lock();
6965 s->received_map_epoch = mm->get_last();
6966 s->received_map_lock.unlock();
6967 s->put();
6968 }
6969 }
6970 }
6971 }
6972
6973 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
6974 {
6975 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
6976
6977 if (is_stopping()) {
6978 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
6979 return false;
6980 }
6981
6982 if (dest_type == CEPH_ENTITY_TYPE_MON)
6983 return true;
6984
6985 if (force_new) {
6986 /* the MonClient checks keys every tick(), so we should just wait for that cycle
6987 to get through */
6988 if (monc->wait_auth_rotating(10) < 0) {
6989 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
6990 return false;
6991 }
6992 }
6993
6994 *authorizer = monc->build_authorizer(dest_type);
6995 return *authorizer != NULL;
6996 }
6997
6998
6999 bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
7000 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
7001 bool& isvalid, CryptoKey& session_key)
7002 {
7003 AuthAuthorizeHandler *authorize_handler = 0;
7004 switch (peer_type) {
7005 case CEPH_ENTITY_TYPE_MDS:
7006 /*
7007 * note: mds is technically a client from our perspective, but
7008 * this makes the 'cluster' consistent w/ monitor's usage.
7009 */
7010 case CEPH_ENTITY_TYPE_OSD:
7011 case CEPH_ENTITY_TYPE_MGR:
7012 authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
7013 break;
7014 default:
7015 authorize_handler = authorize_handler_service_registry->get_handler(protocol);
7016 }
7017 if (!authorize_handler) {
7018 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
7019 isvalid = false;
7020 return true;
7021 }
7022
7023 AuthCapsInfo caps_info;
7024 EntityName name;
7025 uint64_t global_id;
7026 uint64_t auid = CEPH_AUTH_UID_DEFAULT;
7027
7028 RotatingKeyRing *keys = monc->rotating_secrets.get();
7029 if (keys) {
7030 isvalid = authorize_handler->verify_authorizer(
7031 cct, keys,
7032 authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
7033 &auid);
7034 } else {
7035 dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
7036 isvalid = false;
7037 }
7038
7039 if (isvalid) {
7040 Session *s = static_cast<Session *>(con->get_priv());
7041 if (!s) {
7042 s = new Session(cct);
7043 con->set_priv(s->get());
7044 s->con = con;
7045 dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
7046 }
7047
7048 s->entity_name = name;
7049 if (caps_info.allow_all)
7050 s->caps.set_allow_all();
7051 s->auid = auid;
7052
7053 if (caps_info.caps.length() > 0) {
7054 bufferlist::iterator p = caps_info.caps.begin();
7055 string str;
7056 try {
7057 ::decode(str, p);
7058 }
7059 catch (buffer::error& e) {
7060 }
7061 bool success = s->caps.parse(str);
7062 if (success)
7063 dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
7064 else
7065 dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
7066 }
7067
7068 s->put();
7069 }
7070 return true;
7071 }
7072
7073 void OSD::do_waiters()
7074 {
7075 assert(osd_lock.is_locked());
7076
7077 dout(10) << "do_waiters -- start" << dendl;
7078 while (!finished.empty()) {
7079 OpRequestRef next = finished.front();
7080 finished.pop_front();
7081 dispatch_op(next);
7082 }
7083 dout(10) << "do_waiters -- finish" << dendl;
7084 }
7085
7086 void OSD::dispatch_op(OpRequestRef op)
7087 {
7088 switch (op->get_req()->get_type()) {
7089
7090 case MSG_OSD_PG_CREATE:
7091 handle_pg_create(op);
7092 break;
7093 case MSG_OSD_PG_NOTIFY:
7094 handle_pg_notify(op);
7095 break;
7096 case MSG_OSD_PG_QUERY:
7097 handle_pg_query(op);
7098 break;
7099 case MSG_OSD_PG_LOG:
7100 handle_pg_log(op);
7101 break;
7102 case MSG_OSD_PG_REMOVE:
7103 handle_pg_remove(op);
7104 break;
7105 case MSG_OSD_PG_INFO:
7106 handle_pg_info(op);
7107 break;
7108 case MSG_OSD_PG_TRIM:
7109 handle_pg_trim(op);
7110 break;
7111 case MSG_OSD_BACKFILL_RESERVE:
7112 handle_pg_backfill_reserve(op);
7113 break;
7114 case MSG_OSD_RECOVERY_RESERVE:
7115 handle_pg_recovery_reserve(op);
7116 break;
7117 }
7118 }
7119
7120 void OSD::_dispatch(Message *m)
7121 {
7122 assert(osd_lock.is_locked());
7123 dout(20) << "_dispatch " << m << " " << *m << dendl;
7124
7125 switch (m->get_type()) {
7126
7127 // -- don't need lock --
7128 case CEPH_MSG_PING:
7129 dout(10) << "ping from " << m->get_source() << dendl;
7130 m->put();
7131 break;
7132
7133 // -- don't need OSDMap --
7134
7135 // map and replication
7136 case CEPH_MSG_OSD_MAP:
7137 handle_osd_map(static_cast<MOSDMap*>(m));
7138 break;
7139
7140 // osd
7141 case MSG_PGSTATSACK:
7142 handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
7143 break;
7144
7145 case MSG_MON_COMMAND:
7146 handle_command(static_cast<MMonCommand*>(m));
7147 break;
7148 case MSG_COMMAND:
7149 handle_command(static_cast<MCommand*>(m));
7150 break;
7151
7152 case MSG_OSD_SCRUB:
7153 handle_scrub(static_cast<MOSDScrub*>(m));
7154 break;
7155
7156 case MSG_OSD_FORCE_RECOVERY:
7157 handle_force_recovery(m);
7158 break;
7159
7160 // -- need OSDMap --
7161
7162 case MSG_OSD_PG_CREATE:
7163 case MSG_OSD_PG_NOTIFY:
7164 case MSG_OSD_PG_QUERY:
7165 case MSG_OSD_PG_LOG:
7166 case MSG_OSD_PG_REMOVE:
7167 case MSG_OSD_PG_INFO:
7168 case MSG_OSD_PG_TRIM:
7169 case MSG_OSD_BACKFILL_RESERVE:
7170 case MSG_OSD_RECOVERY_RESERVE:
7171 {
7172 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7173 if (m->trace)
7174 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7175 // no map? starting up?
7176 if (!osdmap) {
7177 dout(7) << "no OSDMap, not booted" << dendl;
7178 logger->inc(l_osd_waiting_for_map);
7179 waiting_for_osdmap.push_back(op);
7180 op->mark_delayed("no osdmap");
7181 break;
7182 }
7183
7184 // need OSDMap
7185 dispatch_op(op);
7186 }
7187 }
7188 }
7189
7190 void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7191 {
7192 pg->lock();
7193 if (pg->is_primary()) {
7194 pg->unreg_next_scrub();
7195 pg->scrubber.must_scrub = true;
7196 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7197 pg->scrubber.must_repair = m->repair;
7198 pg->reg_next_scrub();
7199 dout(10) << "marking " << *pg << " for scrub" << dendl;
7200 }
7201 pg->unlock();
7202 }
7203
7204 void OSD::handle_scrub(MOSDScrub *m)
7205 {
7206 dout(10) << "handle_scrub " << *m << dendl;
7207 if (!require_mon_or_mgr_peer(m)) {
7208 m->put();
7209 return;
7210 }
7211 if (m->fsid != monc->get_fsid()) {
7212 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7213 m->put();
7214 return;
7215 }
7216
7217 RWLock::RLocker l(pg_map_lock);
7218 if (m->scrub_pgs.empty()) {
7219 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7220 p != pg_map.end();
7221 ++p)
7222 handle_pg_scrub(m, p->second);
7223 } else {
7224 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7225 p != m->scrub_pgs.end();
7226 ++p) {
7227 spg_t pcand;
7228 if (osdmap->get_primary_shard(*p, &pcand)) {
7229 auto pg_map_entry = pg_map.find(pcand);
7230 if (pg_map_entry != pg_map.end()) {
7231 handle_pg_scrub(m, pg_map_entry->second);
7232 }
7233 }
7234 }
7235 }
7236
7237 m->put();
7238 }
7239
7240 bool OSD::scrub_random_backoff()
7241 {
7242 bool coin_flip = (rand() / (double)RAND_MAX >=
7243 cct->_conf->osd_scrub_backoff_ratio);
7244 if (!coin_flip) {
7245 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7246 return true;
7247 }
7248 return false;
7249 }
7250
7251 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7252 const spg_t& pg, const utime_t& timestamp,
7253 double pool_scrub_min_interval,
7254 double pool_scrub_max_interval, bool must)
7255 : cct(cct),
7256 pgid(pg),
7257 sched_time(timestamp),
7258 deadline(timestamp)
7259 {
7260 // if not explicitly requested, postpone the scrub with a random delay
7261 if (!must) {
7262 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7263 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7264 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7265 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7266
7267 sched_time += scrub_min_interval;
7268 double r = rand() / (double)RAND_MAX;
7269 sched_time +=
7270 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7271 deadline += scrub_max_interval;
7272 }
7273 }
7274
7275 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7276 if (sched_time < rhs.sched_time)
7277 return true;
7278 if (sched_time > rhs.sched_time)
7279 return false;
7280 return pgid < rhs.pgid;
7281 }
7282
7283 bool OSD::scrub_time_permit(utime_t now)
7284 {
7285 struct tm bdt;
7286 time_t tt = now.sec();
7287 localtime_r(&tt, &bdt);
7288 bool time_permit = false;
7289 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7290 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7291 time_permit = true;
7292 }
7293 } else {
7294 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7295 time_permit = true;
7296 }
7297 }
7298 if (!time_permit) {
7299 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7300 << " - " << cct->_conf->osd_scrub_end_hour
7301 << " now " << bdt.tm_hour << " = no" << dendl;
7302 } else {
7303 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7304 << " - " << cct->_conf->osd_scrub_end_hour
7305 << " now " << bdt.tm_hour << " = yes" << dendl;
7306 }
7307 return time_permit;
7308 }
7309
7310 bool OSD::scrub_load_below_threshold()
7311 {
7312 double loadavgs[3];
7313 if (getloadavg(loadavgs, 3) != 3) {
7314 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7315 return false;
7316 }
7317
7318 // allow scrub if below configured threshold
7319 if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
7320 dout(20) << __func__ << " loadavg " << loadavgs[0]
7321 << " < max " << cct->_conf->osd_scrub_load_threshold
7322 << " = yes" << dendl;
7323 return true;
7324 }
7325
7326 // allow scrub if below daily avg and currently decreasing
7327 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7328 dout(20) << __func__ << " loadavg " << loadavgs[0]
7329 << " < daily_loadavg " << daily_loadavg
7330 << " and < 15m avg " << loadavgs[2]
7331 << " = yes" << dendl;
7332 return true;
7333 }
7334
7335 dout(20) << __func__ << " loadavg " << loadavgs[0]
7336 << " >= max " << cct->_conf->osd_scrub_load_threshold
7337 << " and ( >= daily_loadavg " << daily_loadavg
7338 << " or >= 15m avg " << loadavgs[2]
7339 << ") = no" << dendl;
7340 return false;
7341 }
7342
7343 void OSD::sched_scrub()
7344 {
7345 // if not permitted, fail fast
7346 if (!service.can_inc_scrubs_pending()) {
7347 return;
7348 }
7349
7350 utime_t now = ceph_clock_now();
7351 bool time_permit = scrub_time_permit(now);
7352 bool load_is_low = scrub_load_below_threshold();
7353 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7354
7355 OSDService::ScrubJob scrub;
7356 if (service.first_scrub_stamp(&scrub)) {
7357 do {
7358 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7359
7360 if (scrub.sched_time > now) {
7361 // save ourselves some effort
7362 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7363 << " > " << now << dendl;
7364 break;
7365 }
7366
7367 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7368 dout(10) << __func__ << "not scheduling scrub of " << scrub.pgid << " due to active recovery ops" << dendl;
7369 break;
7370 }
7371
7372 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7373 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7374 << (!time_permit ? "time not permit" : "high load") << dendl;
7375 continue;
7376 }
7377
7378 PG *pg = _lookup_lock_pg(scrub.pgid);
7379 if (!pg)
7380 continue;
7381 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7382 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7383 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7384 (load_is_low ? ", load_is_low" : " deadline < now"))
7385 << dendl;
7386 if (pg->sched_scrub()) {
7387 pg->unlock();
7388 break;
7389 }
7390 }
7391 pg->unlock();
7392 } while (service.next_scrub_stamp(scrub, &scrub));
7393 }
7394 dout(20) << "sched_scrub done" << dendl;
7395 }
7396
7397
7398
7399 // =====================================================
7400 // MAP
7401
7402 void OSD::wait_for_new_map(OpRequestRef op)
7403 {
7404 // ask?
7405 if (waiting_for_osdmap.empty()) {
7406 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7407 }
7408
7409 logger->inc(l_osd_waiting_for_map);
7410 waiting_for_osdmap.push_back(op);
7411 op->mark_delayed("wait for new map");
7412 }
7413
7414
7415 /** update_map
7416 * assimilate new OSDMap(s). scan pgs, etc.
7417 */
7418
7419 void OSD::note_down_osd(int peer)
7420 {
7421 assert(osd_lock.is_locked());
7422 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7423
7424 heartbeat_lock.Lock();
7425 failure_queue.erase(peer);
7426 failure_pending.erase(peer);
7427 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7428 if (p != heartbeat_peers.end()) {
7429 p->second.con_back->mark_down();
7430 if (p->second.con_front) {
7431 p->second.con_front->mark_down();
7432 }
7433 heartbeat_peers.erase(p);
7434 }
7435 heartbeat_lock.Unlock();
7436 }
7437
7438 void OSD::note_up_osd(int peer)
7439 {
7440 service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7441 heartbeat_set_peers_need_update();
7442 }
7443
7444 struct C_OnMapCommit : public Context {
7445 OSD *osd;
7446 epoch_t first, last;
7447 MOSDMap *msg;
7448 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7449 : osd(o), first(f), last(l), msg(m) {}
7450 void finish(int r) override {
7451 osd->_committed_osd_maps(first, last, msg);
7452 msg->put();
7453 }
7454 };
7455
7456 struct C_OnMapApply : public Context {
7457 OSDService *service;
7458 list<OSDMapRef> pinned_maps;
7459 epoch_t e;
7460 C_OnMapApply(OSDService *service,
7461 const list<OSDMapRef> &pinned_maps,
7462 epoch_t e)
7463 : service(service), pinned_maps(pinned_maps), e(e) {}
7464 void finish(int r) override {
7465 service->clear_map_bl_cache_pins(e);
7466 }
7467 };
7468
7469 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7470 {
7471 OSDMapRef osdmap = service.get_osdmap();
7472 if (osdmap->get_epoch() >= epoch)
7473 return;
7474
7475 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7476 force_request) {
7477 monc->renew_subs();
7478 }
7479 }
7480
7481 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7482 {
7483 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7484 if (min <= superblock.oldest_map)
7485 return;
7486
7487 int num = 0;
7488 ObjectStore::Transaction t;
7489 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7490 dout(20) << " removing old osdmap epoch " << e << dendl;
7491 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7492 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7493 superblock.oldest_map = e + 1;
7494 num++;
7495 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7496 service.publish_superblock(superblock);
7497 write_superblock(t);
7498 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7499 assert(tr == 0);
7500 num = 0;
7501 if (!skip_maps) {
7502 // skip_maps leaves us with a range of old maps if we fail to remove all
7503 // of them before moving superblock.oldest_map forward to the first map
7504 // in the incoming MOSDMap msg. so we should continue removing them in
7505 // this case, even we could do huge series of delete transactions all at
7506 // once.
7507 break;
7508 }
7509 }
7510 }
7511 if (num > 0) {
7512 service.publish_superblock(superblock);
7513 write_superblock(t);
7514 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7515 assert(tr == 0);
7516 }
7517 // we should not remove the cached maps
7518 assert(min <= service.map_cache.cached_key_lower_bound());
7519 }
7520
// Process an incoming MOSDMap message.  Must be called with osd_lock held.
//
// The message is validated (fsid, OSD initialized, sender session is a mon
// or an osd), optionally shared with the objecter, and then every epoch in
// [max(superblock.newest_map + 1, first), last] is written into one
// ObjectStore transaction: full maps as received, incrementals both as
// received and as a full map rebuilt from the previous epoch.  Finally the
// superblock is updated and the transaction is queued with C_OnMapApply and
// C_OnMapCommit callbacks.
//
// Ownership of `m`: every early-return path calls m->put(); on the success
// path the pointer is handed to the C_OnMapCommit context instead.
7521 void OSD::handle_osd_map(MOSDMap *m)
7522 {
7523 assert(osd_lock.is_locked());
7524 // Keep a ref in the list until we get the newly received map written
7525 // onto disk. This is important because as long as the refs are alive,
7526 // the OSDMaps will be pinned in the cache and we won't try to read it
7527 // off of disk. Otherwise these maps will probably not stay in the cache,
7528 // and reading those OSDMaps before they are actually written can result
7529 // in a crash.
7530 list<OSDMapRef> pinned_maps;
7531 if (m->fsid != monc->get_fsid()) {
7532 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7533 << monc->get_fsid() << dendl;
7534 m->put();
7535 return;
7536 }
7537 if (is_initializing()) {
7538 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7539 m->put();
7540 return;
7541 }
7542
// Maps are only accepted from monitors and OSDs.  A connection without a
// session is not rejected by this check.
7543 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7544 if (session && !(session->entity_name.is_mon() ||
7545 session->entity_name.is_osd())) {
7546 //not enough perms!
7547 dout(10) << "got osd map from Session " << session
7548 << " which we can't take maps from (not a mon or osd)" << dendl;
7549 m->put();
7550 session->put();
7551 return;
7552 }
7553 if (session)
7554 session->put();
7555
7556 // share with the objecter
7557 if (!is_preboot())
7558 service.objecter->handle_osd_map(m);
7559
7560 epoch_t first = m->get_first();
7561 epoch_t last = m->get_last();
7562 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7563 << superblock.newest_map
7564 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7565 << dendl;
7566
// perf counters: messages seen, epochs carried, epochs we already had
7567 logger->inc(l_osd_map);
7568 logger->inc(l_osd_mape, last - first + 1);
7569 if (first <= superblock.newest_map)
7570 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
// track the highest oldest_map any sender has reported
7571 if (service.max_oldest_map < m->oldest_map) {
7572 service.max_oldest_map = m->oldest_map;
7573 assert(service.max_oldest_map >= superblock.oldest_map);
7574 }
7575
7576 // make sure there is something new, here, before we bother flushing
7577 // the queues and such
7578 if (last <= superblock.newest_map) {
7579 dout(10) << " no new maps here, dropping" << dendl;
7580 m->put();
7581 return;
7582 }
7583
7584 // missing some?
// There is a gap between our newest map and the first epoch in the message.
// If the sender still has the epochs we are missing, or has older epochs
// than it sent, re-subscribe and drop this message.  Otherwise accept the
// gap and remember it in skip_maps.
7585 bool skip_maps = false;
7586 if (first > superblock.newest_map + 1) {
7587 dout(10) << "handle_osd_map message skips epochs "
7588 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7589 if (m->oldest_map <= superblock.newest_map + 1) {
7590 osdmap_subscribe(superblock.newest_map + 1, false);
7591 m->put();
7592 return;
7593 }
7594 // always try to get the full range of maps--as many as we can. this
7595 // 1- is good to have
7596 // 2- is at present the only way to ensure that we get a *full* map as
7597 // the first map!
7598 if (m->oldest_map < first) {
7599 osdmap_subscribe(m->oldest_map - 1, true);
7600 m->put();
7601 return;
7602 }
7603 skip_maps = true;
7604 }
7605
7606 ObjectStore::Transaction t;
7607 uint64_t txn_size = 0;
7608
7609 // store new maps: queue for disk and put in the osdmap cache
7610 epoch_t start = MAX(superblock.newest_map + 1, first);
7611 for (epoch_t e = start; e <= last; e++) {
// t.get_num_bytes() is expected to have grown since the previous iteration;
// a value that did not grow is reported as an overflow and asserted on.
7612 if (txn_size >= t.get_num_bytes()) {
7613 derr << __func__ << " transaction size overflowed" << dendl;
7614 assert(txn_size < t.get_num_bytes());
7615 }
7616 txn_size = t.get_num_bytes();
7617 map<epoch_t,bufferlist>::iterator p;
// case 1: the message carries a full map for this epoch
7618 p = m->maps.find(e);
7619 if (p != m->maps.end()) {
7620 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7621 OSDMap *o = new OSDMap;
7622 bufferlist& bl = p->second;
7623
7624 o->decode(bl);
7625
7626 ghobject_t fulloid = get_osdmap_pobject_name(e);
7627 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7628 pin_map_bl(e, bl);
7629 pinned_maps.push_back(add_map(o));
7630
7631 got_full_map(e);
7632 continue;
7633 }
7634
// case 2: the message carries an incremental for this epoch
7635 p = m->incremental_maps.find(e);
7636 if (p != m->incremental_maps.end()) {
7637 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7638 bufferlist& bl = p->second;
7639 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7640 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7641 pin_map_inc_bl(e, bl);
7642
// start from the previous epoch's full map (an empty OSDMap for e == 1)
7643 OSDMap *o = new OSDMap;
7644 if (e > 1) {
7645 bufferlist obl;
7646 bool got = get_map_bl(e - 1, obl);
7647 assert(got);
7648 o->decode(obl);
7649 }
7650
7651 OSDMap::Incremental inc;
// NOTE: this `p` shadows the map iterator declared at the top of the loop
7652 bufferlist::iterator p = bl.begin();
7653 inc.decode(p);
7654 if (o->apply_incremental(inc) < 0) {
7655 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
7656 assert(0 == "bad fsid");
7657 }
7658
7659 bufferlist fbl;
7660 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7661
// optional fault injection, driven by osd_inject_bad_map_crc_probability
7662 bool injected_failure = false;
7663 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7664 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7665 derr << __func__ << " injecting map crc failure" << dendl;
7666 injected_failure = true;
7667 }
7668
// If the rebuilt full map does not match the crc carried by the incremental,
// discard it, request full maps for [e, last], and stop at e - 1.
7669 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7670 dout(2) << "got incremental " << e
7671 << " but failed to encode full with correct crc; requesting"
7672 << dendl;
7673 clog->warn() << "failed to encode map e" << e << " with expected crc";
7674 dout(20) << "my encoded map was:\n";
7675 fbl.hexdump(*_dout);
7676 *_dout << dendl;
7677 delete o;
7678 request_full_map(e, last);
7679 last = e - 1;
7680 break;
7681 }
7682 got_full_map(e);
7683
7684 ghobject_t fulloid = get_osdmap_pobject_name(e);
7685 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7686 pin_map_bl(e, fbl);
7687 pinned_maps.push_back(add_map(o));
7688 continue;
7689 }
7690
// neither a full map nor an incremental for an epoch inside [first, last]
7691 assert(0 == "MOSDMap lied about what maps it had?");
7692 }
7693
7694 // even if this map isn't from a mon, we may have satisfied our subscription
7695 monc->sub_got("osdmap", last);
7696
7697 if (!m->maps.empty() && requested_full_first) {
7698 dout(10) << __func__ << " still missing full maps " << requested_full_first
7699 << ".." << requested_full_last << dendl;
7700 rerequest_full_maps();
7701 }
7702
7703 if (superblock.oldest_map) {
7704 // make sure we at least keep pace with incoming maps
7705 trim_maps(m->oldest_map, last - first + 1, skip_maps);
7706 }
7707
// record the new map range in the superblock; after a gap (skip_maps) the
// stored range restarts at `first`
7708 if (!superblock.oldest_map || skip_maps)
7709 superblock.oldest_map = first;
7710 superblock.newest_map = last;
7711 superblock.current_epoch = last;
7712
7713 // note in the superblock that we were clean thru the prior epoch
7714 epoch_t boot_epoch = service.get_boot_epoch();
7715 if (boot_epoch && boot_epoch >= superblock.mounted) {
7716 superblock.mounted = boot_epoch;
7717 superblock.clean_thru = last;
7718 }
7719
7720 // superblock and commit
// `m` is handed to C_OnMapCommit here, so it is not put on this path
7721 write_superblock(t);
7722 store->queue_transaction(
7723 service.meta_osr.get(),
7724 std::move(t),
7725 new C_OnMapApply(&service, pinned_maps, last),
7726 new C_OnMapCommit(this, start, last, m), 0);
7727 service.publish_superblock(superblock);
7728 }
7729
7730 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
7731 {
7732 dout(10) << __func__ << " " << first << ".." << last << dendl;
7733 if (is_stopping()) {
7734 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7735 return;
7736 }
7737 Mutex::Locker l(osd_lock);
7738 if (is_stopping()) {
7739 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7740 return;
7741 }
7742 map_lock.get_write();
7743
7744 bool do_shutdown = false;
7745 bool do_restart = false;
7746 bool network_error = false;
7747
7748 // advance through the new maps
7749 for (epoch_t cur = first; cur <= last; cur++) {
7750 dout(10) << " advance to epoch " << cur
7751 << " (<= last " << last
7752 << " <= newest_map " << superblock.newest_map
7753 << ")" << dendl;
7754
7755 OSDMapRef newmap = get_map(cur);
7756 assert(newmap); // we just cached it above!
7757
7758 // start blacklisting messages sent to peers that go down.
7759 service.pre_publish_map(newmap);
7760
7761 // kill connections to newly down osds
7762 bool waited_for_reservations = false;
7763 set<int> old;
7764 osdmap->get_all_osds(old);
7765 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
7766 if (*p != whoami &&
7767 osdmap->is_up(*p) && // in old map
7768 newmap->is_down(*p)) { // but not the new one
7769 if (!waited_for_reservations) {
7770 service.await_reserved_maps();
7771 waited_for_reservations = true;
7772 }
7773 note_down_osd(*p);
7774 } else if (*p != whoami &&
7775 osdmap->is_down(*p) &&
7776 newmap->is_up(*p)) {
7777 note_up_osd(*p);
7778 }
7779 }
7780
7781 if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
7782 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
7783 (osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
7784 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
7785 << dendl;
7786 if (is_booting()) {
7787 // this captures the case where we sent the boot message while
7788 // NOUP was being set on the mon and our boot request was
7789 // dropped, and then later it is cleared. it imperfectly
7790 // handles the case where our original boot message was not
7791 // dropped and we restart even though we might have booted, but
7792 // that is harmless (boot will just take slightly longer).
7793 do_restart = true;
7794 }
7795 }
7796 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
7797 newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
7798 dout(10) << __func__ << " require_osd_release reached luminous in "
7799 << newmap->get_epoch() << dendl;
7800 clear_pg_stat_queue();
7801 clear_outstanding_pg_stats();
7802 }
7803
7804 osdmap = newmap;
7805 epoch_t up_epoch;
7806 epoch_t boot_epoch;
7807 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
7808 if (!up_epoch &&
7809 osdmap->is_up(whoami) &&
7810 osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
7811 up_epoch = osdmap->get_epoch();
7812 dout(10) << "up_epoch is " << up_epoch << dendl;
7813 if (!boot_epoch) {
7814 boot_epoch = osdmap->get_epoch();
7815 dout(10) << "boot_epoch is " << boot_epoch << dendl;
7816 }
7817 service.set_epochs(&boot_epoch, &up_epoch, NULL);
7818 }
7819 }
7820
7821 had_map_since = ceph_clock_now();
7822
7823 epoch_t _bind_epoch = service.get_bind_epoch();
7824 if (osdmap->is_up(whoami) &&
7825 osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
7826 _bind_epoch < osdmap->get_up_from(whoami)) {
7827
7828 if (is_booting()) {
7829 dout(1) << "state: booting -> active" << dendl;
7830 set_state(STATE_ACTIVE);
7831
7832 // set incarnation so that osd_reqid_t's we generate for our
7833 // objecter requests are unique across restarts.
7834 service.objecter->set_client_incarnation(osdmap->get_epoch());
7835 }
7836 }
7837
7838 if (osdmap->get_epoch() > 0 &&
7839 is_active()) {
7840 if (!osdmap->exists(whoami)) {
7841 dout(0) << "map says i do not exist. shutting down." << dendl;
7842 do_shutdown = true; // don't call shutdown() while we have
7843 // everything paused
7844 } else if (!osdmap->is_up(whoami) ||
7845 !osdmap->get_addr(whoami).probably_equals(
7846 client_messenger->get_myaddr()) ||
7847 !osdmap->get_cluster_addr(whoami).probably_equals(
7848 cluster_messenger->get_myaddr()) ||
7849 !osdmap->get_hb_back_addr(whoami).probably_equals(
7850 hb_back_server_messenger->get_myaddr()) ||
7851 (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7852 !osdmap->get_hb_front_addr(whoami).probably_equals(
7853 hb_front_server_messenger->get_myaddr()))) {
7854 if (!osdmap->is_up(whoami)) {
7855 if (service.is_preparing_to_stop() || service.is_stopping()) {
7856 service.got_stop_ack();
7857 } else {
7858 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
7859 "but it is still running";
7860 clog->debug() << "map e" << osdmap->get_epoch()
7861 << " wrongly marked me down at e"
7862 << osdmap->get_down_at(whoami);
7863 }
7864 } else if (!osdmap->get_addr(whoami).probably_equals(
7865 client_messenger->get_myaddr())) {
7866 clog->error() << "map e" << osdmap->get_epoch()
7867 << " had wrong client addr (" << osdmap->get_addr(whoami)
7868 << " != my " << client_messenger->get_myaddr() << ")";
7869 } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
7870 cluster_messenger->get_myaddr())) {
7871 clog->error() << "map e" << osdmap->get_epoch()
7872 << " had wrong cluster addr ("
7873 << osdmap->get_cluster_addr(whoami)
7874 << " != my " << cluster_messenger->get_myaddr() << ")";
7875 } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
7876 hb_back_server_messenger->get_myaddr())) {
7877 clog->error() << "map e" << osdmap->get_epoch()
7878 << " had wrong heartbeat back addr ("
7879 << osdmap->get_hb_back_addr(whoami)
7880 << " != my " << hb_back_server_messenger->get_myaddr()
7881 << ")";
7882 } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7883 !osdmap->get_hb_front_addr(whoami).probably_equals(
7884 hb_front_server_messenger->get_myaddr())) {
7885 clog->error() << "map e" << osdmap->get_epoch()
7886 << " had wrong heartbeat front addr ("
7887 << osdmap->get_hb_front_addr(whoami)
7888 << " != my " << hb_front_server_messenger->get_myaddr()
7889 << ")";
7890 }
7891
7892 if (!service.is_stopping()) {
7893 epoch_t up_epoch = 0;
7894 epoch_t bind_epoch = osdmap->get_epoch();
7895 service.set_epochs(NULL,&up_epoch, &bind_epoch);
7896 do_restart = true;
7897
7898 //add markdown log
7899 utime_t now = ceph_clock_now();
7900 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
7901 osd_markdown_log.push_back(now);
7902 //clear all out-of-date log
7903 while (!osd_markdown_log.empty() &&
7904 osd_markdown_log.front() + grace < now)
7905 osd_markdown_log.pop_front();
7906 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
7907 dout(0) << __func__ << " marked down "
7908 << osd_markdown_log.size()
7909 << " > osd_max_markdown_count "
7910 << cct->_conf->osd_max_markdown_count
7911 << " in last " << grace << " seconds, shutting down"
7912 << dendl;
7913 do_restart = false;
7914 do_shutdown = true;
7915 }
7916
7917 start_waiting_for_healthy();
7918
7919 set<int> avoid_ports;
7920 #if defined(__FreeBSD__)
7921 // prevent FreeBSD from grabbing the client_messenger port during
7922 // rebinding. In which case a cluster_meesneger will connect also
7923 // to the same port
7924 avoid_ports.insert(client_messenger->get_myaddr().get_port());
7925 #endif
7926 avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
7927 avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
7928 avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
7929
7930 int r = cluster_messenger->rebind(avoid_ports);
7931 if (r != 0) {
7932 do_shutdown = true; // FIXME: do_restart?
7933 network_error = true;
7934 dout(0) << __func__ << " marked down:"
7935 << " rebind cluster_messenger failed" << dendl;
7936 }
7937
7938 r = hb_back_server_messenger->rebind(avoid_ports);
7939 if (r != 0) {
7940 do_shutdown = true; // FIXME: do_restart?
7941 network_error = true;
7942 dout(0) << __func__ << " marked down:"
7943 << " rebind hb_back_server_messenger failed" << dendl;
7944 }
7945
7946 r = hb_front_server_messenger->rebind(avoid_ports);
7947 if (r != 0) {
7948 do_shutdown = true; // FIXME: do_restart?
7949 network_error = true;
7950 dout(0) << __func__ << " marked down:"
7951 << " rebind hb_front_server_messenger failed" << dendl;
7952 }
7953
7954 hb_front_client_messenger->mark_down_all();
7955 hb_back_client_messenger->mark_down_all();
7956
7957 reset_heartbeat_peers();
7958 }
7959 }
7960 }
7961
7962 map_lock.put_write();
7963
7964 check_osdmap_features(store);
7965
7966 // yay!
7967 consume_map();
7968
7969 if (is_active() || is_waiting_for_healthy())
7970 maybe_update_heartbeat_peers();
7971
7972 if (!is_active()) {
7973 dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
7974 peering_wq.drain();
7975 } else {
7976 activate_map();
7977 }
7978
7979 if (do_shutdown) {
7980 if (network_error) {
7981 Mutex::Locker l(heartbeat_lock);
7982 map<int,pair<utime_t,entity_inst_t>>::iterator it =
7983 failure_pending.begin();
7984 while (it != failure_pending.end()) {
7985 dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
7986 << it->first << dendl;
7987 send_still_alive(osdmap->get_epoch(), it->second.second);
7988 failure_pending.erase(it++);
7989 }
7990 }
7991 // trigger shutdown in a different thread
7992 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
7993 queue_async_signal(SIGINT);
7994 }
7995 else if (m->newest_map && m->newest_map > last) {
7996 dout(10) << " msg say newest map is " << m->newest_map
7997 << ", requesting more" << dendl;
7998 osdmap_subscribe(osdmap->get_epoch()+1, false);
7999 }
8000 else if (is_preboot()) {
8001 if (m->get_source().is_mon())
8002 _preboot(m->oldest_map, m->newest_map);
8003 else
8004 start_boot();
8005 }
8006 else if (do_restart)
8007 start_boot();
8008
8009 }
8010
8011 void OSD::check_osdmap_features(ObjectStore *fs)
8012 {
8013 // adjust required feature bits?
8014
8015 // we have to be a bit careful here, because we are accessing the
8016 // Policy structures without taking any lock. in particular, only
8017 // modify integer values that can safely be read by a racing CPU.
8018 // since we are only accessing existing Policy structures a their
8019 // current memory location, and setting or clearing bits in integer
8020 // fields, and we are the only writer, this is not a problem.
8021
8022 {
8023 Messenger::Policy p = client_messenger->get_default_policy();
8024 uint64_t mask;
8025 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8026 if ((p.features_required & mask) != features) {
8027 dout(0) << "crush map has features " << features
8028 << ", adjusting msgr requires for clients" << dendl;
8029 p.features_required = (p.features_required & ~mask) | features;
8030 client_messenger->set_default_policy(p);
8031 }
8032 }
8033 {
8034 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8035 uint64_t mask;
8036 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8037 if ((p.features_required & mask) != features) {
8038 dout(0) << "crush map has features " << features
8039 << " was " << p.features_required
8040 << ", adjusting msgr requires for mons" << dendl;
8041 p.features_required = (p.features_required & ~mask) | features;
8042 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8043 }
8044 }
8045 {
8046 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8047 uint64_t mask;
8048 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8049
8050 if ((p.features_required & mask) != features) {
8051 dout(0) << "crush map has features " << features
8052 << ", adjusting msgr requires for osds" << dendl;
8053 p.features_required = (p.features_required & ~mask) | features;
8054 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8055 }
8056
8057 if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
8058 !superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8059 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8060 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8061 ObjectStore::Transaction t;
8062 write_superblock(t);
8063 int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
8064 assert(err == 0);
8065 }
8066 }
8067 }
8068
8069 bool OSD::advance_pg(
8070 epoch_t osd_epoch, PG *pg,
8071 ThreadPool::TPHandle &handle,
8072 PG::RecoveryCtx *rctx,
8073 set<PGRef> *new_pgs)
8074 {
8075 assert(pg->is_locked());
8076 epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
8077 OSDMapRef lastmap = pg->get_osdmap();
8078
8079 if (lastmap->get_epoch() == osd_epoch)
8080 return true;
8081 assert(lastmap->get_epoch() < osd_epoch);
8082
8083 epoch_t min_epoch = service.get_min_pg_epoch();
8084 epoch_t max;
8085 if (min_epoch) {
8086 max = min_epoch + cct->_conf->osd_map_max_advance;
8087 } else {
8088 max = next_epoch + cct->_conf->osd_map_max_advance;
8089 }
8090
8091 for (;
8092 next_epoch <= osd_epoch && next_epoch <= max;
8093 ++next_epoch) {
8094 OSDMapRef nextmap = service.try_get_map(next_epoch);
8095 if (!nextmap) {
8096 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8097 // make sure max is bumped up so that we can get past any
8098 // gap in maps
8099 max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
8100 continue;
8101 }
8102
8103 vector<int> newup, newacting;
8104 int up_primary, acting_primary;
8105 nextmap->pg_to_up_acting_osds(
8106 pg->info.pgid.pgid,
8107 &newup, &up_primary,
8108 &newacting, &acting_primary);
8109 pg->handle_advance_map(
8110 nextmap, lastmap, newup, up_primary,
8111 newacting, acting_primary, rctx);
8112
8113 // Check for split!
8114 set<spg_t> children;
8115 spg_t parent(pg->info.pgid);
8116 if (parent.is_split(
8117 lastmap->get_pg_num(pg->pool.id),
8118 nextmap->get_pg_num(pg->pool.id),
8119 &children)) {
8120 service.mark_split_in_progress(pg->info.pgid, children);
8121 split_pgs(
8122 pg, children, new_pgs, lastmap, nextmap,
8123 rctx);
8124 }
8125
8126 lastmap = nextmap;
8127 handle.reset_tp_timeout();
8128 }
8129 service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
8130 pg->handle_activate_map(rctx);
8131 if (next_epoch <= osd_epoch) {
8132 dout(10) << __func__ << " advanced to max " << max
8133 << " past min epoch " << min_epoch
8134 << " ... will requeue " << *pg << dendl;
8135 return false;
8136 }
8137 return true;
8138 }
8139
8140 void OSD::consume_map()
8141 {
8142 assert(osd_lock.is_locked());
8143 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8144
8145 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8146 list<PGRef> to_remove;
8147
8148 // scan pg's
8149 {
8150 RWLock::RLocker l(pg_map_lock);
8151 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8152 it != pg_map.end();
8153 ++it) {
8154 PG *pg = it->second;
8155 pg->lock();
8156 if (pg->is_primary())
8157 num_pg_primary++;
8158 else if (pg->is_replica())
8159 num_pg_replica++;
8160 else
8161 num_pg_stray++;
8162
8163 if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
8164 //pool is deleted!
8165 to_remove.push_back(PGRef(pg));
8166 } else {
8167 service.init_splits_between(it->first, service.get_osdmap(), osdmap);
8168 }
8169
8170 pg->unlock();
8171 }
8172 }
8173
8174 for (list<PGRef>::iterator i = to_remove.begin();
8175 i != to_remove.end();
8176 to_remove.erase(i++)) {
8177 RWLock::WLocker locker(pg_map_lock);
8178 (*i)->lock();
8179 _remove_pg(&**i);
8180 (*i)->unlock();
8181 }
8182
8183 service.expand_pg_num(service.get_osdmap(), osdmap);
8184
8185 service.pre_publish_map(osdmap);
8186 service.await_reserved_maps();
8187 service.publish_map(osdmap);
8188
8189 service.maybe_inject_dispatch_delay();
8190
8191 dispatch_sessions_waiting_on_map();
8192
8193 service.maybe_inject_dispatch_delay();
8194
8195 // remove any PGs which we no longer host from the session waiting_for_pg lists
8196 dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8197 op_shardedwq.prune_pg_waiters(osdmap, whoami);
8198
8199 service.maybe_inject_dispatch_delay();
8200
8201 // scan pg's
8202 {
8203 RWLock::RLocker l(pg_map_lock);
8204 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8205 it != pg_map.end();
8206 ++it) {
8207 PG *pg = it->second;
8208 pg->lock();
8209 pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8210 pg->unlock();
8211 }
8212
8213 logger->set(l_osd_pg, pg_map.size());
8214 }
8215 logger->set(l_osd_pg_primary, num_pg_primary);
8216 logger->set(l_osd_pg_replica, num_pg_replica);
8217 logger->set(l_osd_pg_stray, num_pg_stray);
8218 }
8219
8220 void OSD::activate_map()
8221 {
8222 assert(osd_lock.is_locked());
8223
8224 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8225
8226 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8227 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8228 ceph_abort();
8229 }
8230
8231 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8232 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8233 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8234 }
8235
8236 // norecover?
8237 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8238 if (!service.recovery_is_paused()) {
8239 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8240 service.pause_recovery();
8241 }
8242 } else {
8243 if (service.recovery_is_paused()) {
8244 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8245 service.unpause_recovery();
8246 }
8247 }
8248
8249 service.activate_map();
8250
8251 // process waiters
8252 take_waiters(waiting_for_osdmap);
8253 }
8254
8255 bool OSD::require_mon_peer(const Message *m)
8256 {
8257 if (!m->get_connection()->peer_is_mon()) {
8258 dout(0) << "require_mon_peer received from non-mon "
8259 << m->get_connection()->get_peer_addr()
8260 << " " << *m << dendl;
8261 return false;
8262 }
8263 return true;
8264 }
8265
8266 bool OSD::require_mon_or_mgr_peer(const Message *m)
8267 {
8268 if (!m->get_connection()->peer_is_mon() &&
8269 !m->get_connection()->peer_is_mgr()) {
8270 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8271 << m->get_connection()->get_peer_addr()
8272 << " " << *m << dendl;
8273 return false;
8274 }
8275 return true;
8276 }
8277
8278 bool OSD::require_osd_peer(const Message *m)
8279 {
8280 if (!m->get_connection()->peer_is_osd()) {
8281 dout(0) << "require_osd_peer received from non-osd "
8282 << m->get_connection()->get_peer_addr()
8283 << " " << *m << dendl;
8284 return false;
8285 }
8286 return true;
8287 }
8288
8289 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8290 {
8291 epoch_t up_epoch = service.get_up_epoch();
8292 if (epoch < up_epoch) {
8293 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8294 return false;
8295 }
8296
8297 if (!is_active()) {
8298 dout(7) << "still in boot state, dropping message " << *m << dendl;
8299 return false;
8300 }
8301
8302 return true;
8303 }
8304
8305 bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8306 bool is_fast_dispatch)
8307 {
8308 int from = m->get_source().num();
8309
8310 if (map->is_down(from) ||
8311 (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8312 dout(5) << "from dead osd." << from << ", marking down, "
8313 << " msg was " << m->get_source_inst().addr
8314 << " expected " << (map->is_up(from) ?
8315 map->get_cluster_addr(from) : entity_addr_t())
8316 << dendl;
8317 ConnectionRef con = m->get_connection();
8318 con->mark_down();
8319 Session *s = static_cast<Session*>(con->get_priv());
8320 if (s) {
8321 if (!is_fast_dispatch)
8322 s->session_dispatch_lock.Lock();
8323 clear_session_waiting_on_map(s);
8324 con->set_priv(NULL); // break ref <-> session cycle, if any
8325 if (!is_fast_dispatch)
8326 s->session_dispatch_lock.Unlock();
8327 s->put();
8328 }
8329 return false;
8330 }
8331 return true;
8332 }
8333
8334
8335 /*
8336 * require that we have same (or newer) map, and that
8337 * the source is the pg primary.
8338 */
8339 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8340 bool is_fast_dispatch)
8341 {
8342 const Message *m = op->get_req();
8343 dout(15) << "require_same_or_newer_map " << epoch
8344 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8345
8346 assert(osd_lock.is_locked());
8347
8348 // do they have a newer map?
8349 if (epoch > osdmap->get_epoch()) {
8350 dout(7) << "waiting for newer map epoch " << epoch
8351 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8352 wait_for_new_map(op);
8353 return false;
8354 }
8355
8356 if (!require_self_aliveness(op->get_req(), epoch)) {
8357 return false;
8358 }
8359
8360 // ok, our map is same or newer.. do they still exist?
8361 if (m->get_connection()->get_messenger() == cluster_messenger &&
8362 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8363 return false;
8364 }
8365
8366 return true;
8367 }
8368
8369
8370
8371
8372
8373 // ----------------------------------------
8374 // pg creation
8375
8376 void OSD::split_pgs(
8377 PG *parent,
8378 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
8379 OSDMapRef curmap,
8380 OSDMapRef nextmap,
8381 PG::RecoveryCtx *rctx)
8382 {
8383 unsigned pg_num = nextmap->get_pg_num(
8384 parent->pool.id);
8385 parent->update_snap_mapper_bits(
8386 parent->info.pgid.get_split_bits(pg_num)
8387 );
8388
8389 vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
8390 parent->info.stats.stats.sum.split(updated_stats);
8391
8392 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8393 for (set<spg_t>::const_iterator i = childpgids.begin();
8394 i != childpgids.end();
8395 ++i, ++stat_iter) {
8396 assert(stat_iter != updated_stats.end());
8397 dout(10) << "Splitting " << *parent << " into " << *i << dendl;
8398 assert(service.splitting(*i));
8399 PG* child = _make_pg(nextmap, *i);
8400 child->lock(true);
8401 out_pgs->insert(child);
8402 rctx->created_pgs.insert(child);
8403
8404 unsigned split_bits = i->get_split_bits(pg_num);
8405 dout(10) << "pg_num is " << pg_num << dendl;
8406 dout(10) << "m_seed " << i->ps() << dendl;
8407 dout(10) << "split_bits is " << split_bits << dendl;
8408
8409 parent->split_colls(
8410 *i,
8411 split_bits,
8412 i->ps(),
8413 &child->pool.info,
8414 rctx->transaction);
8415 parent->split_into(
8416 i->pgid,
8417 child,
8418 split_bits);
8419 child->info.stats.stats.sum = *stat_iter;
8420
8421 child->write_if_dirty(*(rctx->transaction));
8422 child->unlock();
8423 }
8424 assert(stat_iter != updated_stats.end());
8425 parent->info.stats.stats.sum = *stat_iter;
8426 parent->write_if_dirty(*(rctx->transaction));
8427 }
8428
8429 /*
8430 * holding osd_lock
8431 */
8432 void OSD::handle_pg_create(OpRequestRef op)
8433 {
8434 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
8435 assert(m->get_type() == MSG_OSD_PG_CREATE);
8436
8437 dout(10) << "handle_pg_create " << *m << dendl;
8438
8439 if (!require_mon_peer(op->get_req())) {
8440 return;
8441 }
8442
8443 if (!require_same_or_newer_map(op, m->epoch, false))
8444 return;
8445
8446 op->mark_started();
8447
8448 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8449 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8450 p != m->mkpg.end();
8451 ++p, ++ci) {
8452 assert(ci != m->ctimes.end() && ci->first == p->first);
8453 epoch_t created = p->second.created;
8454 if (p->second.split_bits) // Skip split pgs
8455 continue;
8456 pg_t on = p->first;
8457
8458 if (on.preferred() >= 0) {
8459 dout(20) << "ignoring localized pg " << on << dendl;
8460 continue;
8461 }
8462
8463 if (!osdmap->have_pg_pool(on.pool())) {
8464 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8465 continue;
8466 }
8467
8468 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8469
8470 // is it still ours?
8471 vector<int> up, acting;
8472 int up_primary = -1;
8473 int acting_primary = -1;
8474 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
8475 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
8476
8477 if (acting_primary != whoami) {
8478 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8479 << "), my role=" << role << ", skipping" << dendl;
8480 continue;
8481 }
8482
8483 spg_t pgid;
8484 bool mapped = osdmap->get_primary_shard(on, &pgid);
8485 assert(mapped);
8486
8487 PastIntervals pi(
8488 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8489 *osdmap);
8490 pg_history_t history;
8491 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
8492
8493 // The mon won't resend unless the primary changed, so
8494 // we ignore same_interval_since. We'll pass this history
8495 // to handle_pg_peering_evt with the current epoch as the
8496 // event -- the project_pg_history check in
8497 // handle_pg_peering_evt will be a noop.
8498 if (history.same_primary_since > m->epoch) {
8499 dout(10) << __func__ << ": got obsolete pg create on pgid "
8500 << pgid << " from epoch " << m->epoch
8501 << ", primary changed in " << history.same_primary_since
8502 << dendl;
8503 continue;
8504 }
8505
8506 if (handle_pg_peering_evt(
8507 pgid,
8508 history,
8509 pi,
8510 osdmap->get_epoch(),
8511 PG::CephPeeringEvtRef(
8512 new PG::CephPeeringEvt(
8513 osdmap->get_epoch(),
8514 osdmap->get_epoch(),
8515 PG::NullEvt()))
8516 ) == -EEXIST) {
8517 service.send_pg_created(pgid.pgid);
8518 }
8519 }
8520 last_pg_create_epoch = m->epoch;
8521
8522 maybe_update_heartbeat_peers();
8523 }
8524
8525
8526 // ----------------------------------------
8527 // peering and recovery
8528
8529 PG::RecoveryCtx OSD::create_context()
8530 {
8531 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8532 C_Contexts *on_applied = new C_Contexts(cct);
8533 C_Contexts *on_safe = new C_Contexts(cct);
8534 map<int, map<spg_t,pg_query_t> > *query_map =
8535 new map<int, map<spg_t, pg_query_t> >;
8536 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8537 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8538 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8539 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8540 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8541 on_applied, on_safe, t);
8542 return rctx;
8543 }
8544
8545 struct C_OpenPGs : public Context {
8546 set<PGRef> pgs;
8547 ObjectStore *store;
8548 OSD *osd;
8549 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8550 pgs.swap(p);
8551 }
8552 void finish(int r) override {
8553 RWLock::RLocker l(osd->pg_map_lock);
8554 for (auto p : pgs) {
8555 if (osd->pg_map.count(p->info.pgid)) {
8556 p->ch = store->open_collection(p->coll);
8557 assert(p->ch);
8558 }
8559 }
8560 }
8561 };
8562
8563 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
8564 ThreadPool::TPHandle *handle)
8565 {
8566 if (!ctx.transaction->empty()) {
8567 if (!ctx.created_pgs.empty()) {
8568 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8569 }
8570 int tr = store->queue_transaction(
8571 pg->osr.get(),
8572 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
8573 TrackedOpRef(), handle);
8574 delete (ctx.transaction);
8575 assert(tr == 0);
8576 ctx.transaction = new ObjectStore::Transaction;
8577 ctx.on_applied = new C_Contexts(cct);
8578 ctx.on_safe = new C_Contexts(cct);
8579 }
8580 }
8581
8582 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
8583 ThreadPool::TPHandle *handle)
8584 {
8585 if (service.get_osdmap()->is_up(whoami) &&
8586 is_active()) {
8587 do_notifies(*ctx.notify_list, curmap);
8588 do_queries(*ctx.query_map, curmap);
8589 do_infos(*ctx.info_map, curmap);
8590 }
8591 delete ctx.notify_list;
8592 delete ctx.query_map;
8593 delete ctx.info_map;
8594 if ((ctx.on_applied->empty() &&
8595 ctx.on_safe->empty() &&
8596 ctx.transaction->empty() &&
8597 ctx.created_pgs.empty()) || !pg) {
8598 delete ctx.transaction;
8599 delete ctx.on_applied;
8600 delete ctx.on_safe;
8601 assert(ctx.created_pgs.empty());
8602 } else {
8603 if (!ctx.created_pgs.empty()) {
8604 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8605 }
8606 int tr = store->queue_transaction(
8607 pg->osr.get(),
8608 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
8609 handle);
8610 delete (ctx.transaction);
8611 assert(tr == 0);
8612 }
8613 }
8614
8615 /** do_notifies
8616 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8617 * content for, and they are primary for.
8618 */
8619
8620 void OSD::do_notifies(
8621 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8622 OSDMapRef curmap)
8623 {
8624 for (map<int,
8625 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8626 notify_list.begin();
8627 it != notify_list.end();
8628 ++it) {
8629 if (!curmap->is_up(it->first)) {
8630 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8631 continue;
8632 }
8633 ConnectionRef con = service.get_con_osd_cluster(
8634 it->first, curmap->get_epoch());
8635 if (!con) {
8636 dout(20) << __func__ << " skipping osd." << it->first
8637 << " (NULL con)" << dendl;
8638 continue;
8639 }
8640 service.share_map_peer(it->first, con.get(), curmap);
8641 dout(7) << __func__ << " osd " << it->first
8642 << " on " << it->second.size() << " PGs" << dendl;
8643 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8644 it->second);
8645 con->send_message(m);
8646 }
8647 }
8648
8649
8650 /** do_queries
8651 * send out pending queries for info | summaries
8652 */
8653 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8654 OSDMapRef curmap)
8655 {
8656 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8657 pit != query_map.end();
8658 ++pit) {
8659 if (!curmap->is_up(pit->first)) {
8660 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8661 continue;
8662 }
8663 int who = pit->first;
8664 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8665 if (!con) {
8666 dout(20) << __func__ << " skipping osd." << who
8667 << " (NULL con)" << dendl;
8668 continue;
8669 }
8670 service.share_map_peer(who, con.get(), curmap);
8671 dout(7) << __func__ << " querying osd." << who
8672 << " on " << pit->second.size() << " PGs" << dendl;
8673 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8674 con->send_message(m);
8675 }
8676 }
8677
8678
8679 void OSD::do_infos(map<int,
8680 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8681 OSDMapRef curmap)
8682 {
8683 for (map<int,
8684 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
8685 info_map.begin();
8686 p != info_map.end();
8687 ++p) {
8688 if (!curmap->is_up(p->first)) {
8689 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
8690 continue;
8691 }
8692 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
8693 i != p->second.end();
8694 ++i) {
8695 dout(20) << __func__ << " sending info " << i->first.info
8696 << " to shard " << p->first << dendl;
8697 }
8698 ConnectionRef con = service.get_con_osd_cluster(
8699 p->first, curmap->get_epoch());
8700 if (!con) {
8701 dout(20) << __func__ << " skipping osd." << p->first
8702 << " (NULL con)" << dendl;
8703 continue;
8704 }
8705 service.share_map_peer(p->first, con.get(), curmap);
8706 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
8707 m->pg_list = p->second;
8708 con->send_message(m);
8709 }
8710 info_map.clear();
8711 }
8712
8713
8714 /** PGNotify
8715 * from non-primary to primary
8716 * includes pg_info_t.
8717 * NOTE: called with opqueue active.
8718 */
8719 void OSD::handle_pg_notify(OpRequestRef op)
8720 {
8721 const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
8722 assert(m->get_type() == MSG_OSD_PG_NOTIFY);
8723
8724 dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
8725 int from = m->get_source().num();
8726
8727 if (!require_osd_peer(op->get_req()))
8728 return;
8729
8730 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8731 return;
8732
8733 op->mark_started();
8734
8735 for (auto it = m->get_pg_list().begin();
8736 it != m->get_pg_list().end();
8737 ++it) {
8738 if (it->first.info.pgid.preferred() >= 0) {
8739 dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
8740 continue;
8741 }
8742
8743 handle_pg_peering_evt(
8744 spg_t(it->first.info.pgid.pgid, it->first.to),
8745 it->first.info.history, it->second,
8746 it->first.query_epoch,
8747 PG::CephPeeringEvtRef(
8748 new PG::CephPeeringEvt(
8749 it->first.epoch_sent, it->first.query_epoch,
8750 PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
8751 op->get_req()->get_connection()->get_features())))
8752 );
8753 }
8754 }
8755
8756 void OSD::handle_pg_log(OpRequestRef op)
8757 {
8758 MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
8759 assert(m->get_type() == MSG_OSD_PG_LOG);
8760 dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
8761
8762 if (!require_osd_peer(op->get_req()))
8763 return;
8764
8765 int from = m->get_source().num();
8766 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8767 return;
8768
8769 if (m->info.pgid.preferred() >= 0) {
8770 dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
8771 return;
8772 }
8773
8774 op->mark_started();
8775 handle_pg_peering_evt(
8776 spg_t(m->info.pgid.pgid, m->to),
8777 m->info.history, m->past_intervals, m->get_epoch(),
8778 PG::CephPeeringEvtRef(
8779 new PG::CephPeeringEvt(
8780 m->get_epoch(), m->get_query_epoch(),
8781 PG::MLogRec(pg_shard_t(from, m->from), m)))
8782 );
8783 }
8784
8785 void OSD::handle_pg_info(OpRequestRef op)
8786 {
8787 const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
8788 assert(m->get_type() == MSG_OSD_PG_INFO);
8789 dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
8790
8791 if (!require_osd_peer(op->get_req()))
8792 return;
8793
8794 int from = m->get_source().num();
8795 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8796 return;
8797
8798 op->mark_started();
8799
8800 for (auto p = m->pg_list.begin();
8801 p != m->pg_list.end();
8802 ++p) {
8803 if (p->first.info.pgid.preferred() >= 0) {
8804 dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
8805 continue;
8806 }
8807
8808 handle_pg_peering_evt(
8809 spg_t(p->first.info.pgid.pgid, p->first.to),
8810 p->first.info.history, p->second, p->first.epoch_sent,
8811 PG::CephPeeringEvtRef(
8812 new PG::CephPeeringEvt(
8813 p->first.epoch_sent, p->first.query_epoch,
8814 PG::MInfoRec(
8815 pg_shard_t(
8816 from, p->first.from), p->first.info, p->first.epoch_sent)))
8817 );
8818 }
8819 }
8820
8821 void OSD::handle_pg_trim(OpRequestRef op)
8822 {
8823 const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
8824 assert(m->get_type() == MSG_OSD_PG_TRIM);
8825
8826 dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
8827
8828 if (!require_osd_peer(op->get_req()))
8829 return;
8830
8831 int from = m->get_source().num();
8832 if (!require_same_or_newer_map(op, m->epoch, false))
8833 return;
8834
8835 if (m->pgid.preferred() >= 0) {
8836 dout(10) << "ignoring localized pg " << m->pgid << dendl;
8837 return;
8838 }
8839
8840 op->mark_started();
8841
8842 PG *pg = _lookup_lock_pg(m->pgid);
8843 if(!pg) {
8844 dout(10) << " don't have pg " << m->pgid << dendl;
8845 return;
8846 }
8847
8848 if (m->epoch < pg->info.history.same_interval_since) {
8849 dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
8850 pg->unlock();
8851 return;
8852 }
8853
8854 if (pg->is_primary()) {
8855 // peer is informing us of their last_complete_ondisk
8856 dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
8857 pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
8858 m->trim_to;
8859 // trim log when the pg is recovered
8860 pg->calc_min_last_complete_ondisk();
8861 } else {
8862 // primary is instructing us to trim
8863 ObjectStore::Transaction t;
8864 pg->pg_log.trim(m->trim_to, pg->info);
8865 pg->dirty_info = true;
8866 pg->write_if_dirty(t);
8867 int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
8868 assert(tr == 0);
8869 }
8870 pg->unlock();
8871 }
8872
8873 void OSD::handle_pg_backfill_reserve(OpRequestRef op)
8874 {
8875 const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
8876 assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);
8877
8878 if (!require_osd_peer(op->get_req()))
8879 return;
8880 if (!require_same_or_newer_map(op, m->query_epoch, false))
8881 return;
8882
8883 PG::CephPeeringEvtRef evt;
8884 if (m->type == MBackfillReserve::REQUEST) {
8885 evt = PG::CephPeeringEvtRef(
8886 new PG::CephPeeringEvt(
8887 m->query_epoch,
8888 m->query_epoch,
8889 PG::RequestBackfillPrio(m->priority)));
8890 } else if (m->type == MBackfillReserve::GRANT) {
8891 evt = PG::CephPeeringEvtRef(
8892 new PG::CephPeeringEvt(
8893 m->query_epoch,
8894 m->query_epoch,
8895 PG::RemoteBackfillReserved()));
8896 } else if (m->type == MBackfillReserve::REJECT) {
8897 evt = PG::CephPeeringEvtRef(
8898 new PG::CephPeeringEvt(
8899 m->query_epoch,
8900 m->query_epoch,
8901 PG::RemoteReservationRejected()));
8902 } else {
8903 ceph_abort();
8904 }
8905
8906 if (service.splitting(m->pgid)) {
8907 peering_wait_for_split[m->pgid].push_back(evt);
8908 return;
8909 }
8910
8911 PG *pg = _lookup_lock_pg(m->pgid);
8912 if (!pg) {
8913 dout(10) << " don't have pg " << m->pgid << dendl;
8914 return;
8915 }
8916
8917 pg->queue_peering_event(evt);
8918 pg->unlock();
8919 }
8920
8921 void OSD::handle_pg_recovery_reserve(OpRequestRef op)
8922 {
8923 const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
8924 assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);
8925
8926 if (!require_osd_peer(op->get_req()))
8927 return;
8928 if (!require_same_or_newer_map(op, m->query_epoch, false))
8929 return;
8930
8931 PG::CephPeeringEvtRef evt;
8932 if (m->type == MRecoveryReserve::REQUEST) {
8933 evt = PG::CephPeeringEvtRef(
8934 new PG::CephPeeringEvt(
8935 m->query_epoch,
8936 m->query_epoch,
8937 PG::RequestRecovery()));
8938 } else if (m->type == MRecoveryReserve::GRANT) {
8939 evt = PG::CephPeeringEvtRef(
8940 new PG::CephPeeringEvt(
8941 m->query_epoch,
8942 m->query_epoch,
8943 PG::RemoteRecoveryReserved()));
8944 } else if (m->type == MRecoveryReserve::RELEASE) {
8945 evt = PG::CephPeeringEvtRef(
8946 new PG::CephPeeringEvt(
8947 m->query_epoch,
8948 m->query_epoch,
8949 PG::RecoveryDone()));
8950 } else {
8951 ceph_abort();
8952 }
8953
8954 if (service.splitting(m->pgid)) {
8955 peering_wait_for_split[m->pgid].push_back(evt);
8956 return;
8957 }
8958
8959 PG *pg = _lookup_lock_pg(m->pgid);
8960 if (!pg) {
8961 dout(10) << " don't have pg " << m->pgid << dendl;
8962 return;
8963 }
8964
8965 pg->queue_peering_event(evt);
8966 pg->unlock();
8967 }
8968
8969 void OSD::handle_force_recovery(Message *m)
8970 {
8971 MOSDForceRecovery *msg = static_cast<MOSDForceRecovery*>(m);
8972 assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY);
8973
8974 vector<PGRef> local_pgs;
8975 local_pgs.reserve(msg->forced_pgs.size());
8976
8977 {
8978 RWLock::RLocker l(pg_map_lock);
8979 for (auto& i : msg->forced_pgs) {
8980 spg_t locpg;
8981 if (osdmap->get_primary_shard(i, &locpg)) {
8982 auto pg_map_entry = pg_map.find(locpg);
8983 if (pg_map_entry != pg_map.end()) {
8984 local_pgs.push_back(pg_map_entry->second);
8985 }
8986 }
8987 }
8988 }
8989
8990 if (local_pgs.size()) {
8991 service.adjust_pg_priorities(local_pgs, msg->options);
8992 }
8993
8994 msg->put();
8995 }
8996
8997 /** PGQuery
8998 * from primary to replica | stray
8999 * NOTE: called with opqueue active.
9000 */
9001 void OSD::handle_pg_query(OpRequestRef op)
9002 {
9003 assert(osd_lock.is_locked());
9004
9005 const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
9006 assert(m->get_type() == MSG_OSD_PG_QUERY);
9007
9008 if (!require_osd_peer(op->get_req()))
9009 return;
9010
9011 dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
9012 int from = m->get_source().num();
9013
9014 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9015 return;
9016
9017 op->mark_started();
9018
9019 map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
9020
9021 for (auto it = m->pg_list.begin();
9022 it != m->pg_list.end();
9023 ++it) {
9024 spg_t pgid = it->first;
9025
9026 if (pgid.preferred() >= 0) {
9027 dout(10) << "ignoring localized pg " << pgid << dendl;
9028 continue;
9029 }
9030
9031 if (service.splitting(pgid)) {
9032 peering_wait_for_split[pgid].push_back(
9033 PG::CephPeeringEvtRef(
9034 new PG::CephPeeringEvt(
9035 it->second.epoch_sent, it->second.epoch_sent,
9036 PG::MQuery(pg_shard_t(from, it->second.from),
9037 it->second, it->second.epoch_sent))));
9038 continue;
9039 }
9040
9041 {
9042 RWLock::RLocker l(pg_map_lock);
9043 if (pg_map.count(pgid)) {
9044 PG *pg = 0;
9045 pg = _lookup_lock_pg_with_map_lock_held(pgid);
9046 pg->queue_query(
9047 it->second.epoch_sent, it->second.epoch_sent,
9048 pg_shard_t(from, it->second.from), it->second);
9049 pg->unlock();
9050 continue;
9051 }
9052 }
9053
9054 if (!osdmap->have_pg_pool(pgid.pool()))
9055 continue;
9056
9057 // get active crush mapping
9058 int up_primary, acting_primary;
9059 vector<int> up, acting;
9060 osdmap->pg_to_up_acting_osds(
9061 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9062
9063 // same primary?
9064 pg_history_t history = it->second.history;
9065 bool valid_history = project_pg_history(
9066 pgid, history, it->second.epoch_sent,
9067 up, up_primary, acting, acting_primary);
9068
9069 if (!valid_history ||
9070 it->second.epoch_sent < history.same_interval_since) {
9071 dout(10) << " pg " << pgid << " dne, and pg has changed in "
9072 << history.same_interval_since
9073 << " (msg from " << it->second.epoch_sent << ")" << dendl;
9074 continue;
9075 }
9076
9077 dout(10) << " pg " << pgid << " dne" << dendl;
9078 pg_info_t empty(spg_t(pgid.pgid, it->second.to));
9079 /* This is racy, but that should be ok: if we complete the deletion
9080 * before the pg is recreated, we'll just start it off backfilling
9081 * instead of just empty */
9082 if (service.deleting_pgs.lookup(pgid))
9083 empty.set_last_backfill(hobject_t());
9084 if (it->second.type == pg_query_t::LOG ||
9085 it->second.type == pg_query_t::FULLLOG) {
9086 ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
9087 if (con) {
9088 MOSDPGLog *mlog = new MOSDPGLog(
9089 it->second.from, it->second.to,
9090 osdmap->get_epoch(), empty,
9091 it->second.epoch_sent);
9092 service.share_map_peer(from, con.get(), osdmap);
9093 con->send_message(mlog);
9094 }
9095 } else {
9096 notify_list[from].push_back(
9097 make_pair(
9098 pg_notify_t(
9099 it->second.from, it->second.to,
9100 it->second.epoch_sent,
9101 osdmap->get_epoch(),
9102 empty),
9103 PastIntervals(
9104 osdmap->get_pools().at(pgid.pool()).ec_pool(),
9105 *osdmap)));
9106 }
9107 }
9108 do_notifies(notify_list, osdmap);
9109 }
9110
9111
9112 void OSD::handle_pg_remove(OpRequestRef op)
9113 {
9114 const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
9115 assert(m->get_type() == MSG_OSD_PG_REMOVE);
9116 assert(osd_lock.is_locked());
9117
9118 if (!require_osd_peer(op->get_req()))
9119 return;
9120
9121 dout(7) << "handle_pg_remove from " << m->get_source() << " on "
9122 << m->pg_list.size() << " pgs" << dendl;
9123
9124 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9125 return;
9126
9127 op->mark_started();
9128
9129 for (auto it = m->pg_list.begin();
9130 it != m->pg_list.end();
9131 ++it) {
9132 spg_t pgid = *it;
9133 if (pgid.preferred() >= 0) {
9134 dout(10) << "ignoring localized pg " << pgid << dendl;
9135 continue;
9136 }
9137
9138 RWLock::WLocker l(pg_map_lock);
9139 if (pg_map.count(pgid) == 0) {
9140 dout(10) << " don't have pg " << pgid << dendl;
9141 continue;
9142 }
9143 dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
9144 PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
9145 pg_history_t history = pg->info.history;
9146 int up_primary, acting_primary;
9147 vector<int> up, acting;
9148 osdmap->pg_to_up_acting_osds(
9149 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9150 bool valid_history = project_pg_history(
9151 pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
9152 up, up_primary, acting, acting_primary);
9153 if (valid_history &&
9154 history.same_interval_since <= m->get_epoch()) {
9155 assert(pg->get_primary().osd == m->get_source().num());
9156 PGRef _pg(pg);
9157 _remove_pg(pg);
9158 pg->unlock();
9159 } else {
9160 dout(10) << *pg << " ignoring remove request, pg changed in epoch "
9161 << history.same_interval_since
9162 << " > " << m->get_epoch() << dendl;
9163 pg->unlock();
9164 }
9165 }
9166 }
9167
9168 void OSD::_remove_pg(PG *pg)
9169 {
9170 ObjectStore::Transaction rmt ;
9171
9172 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
9173 // the pg_map must be done together without unlocking the pg lock,
9174 // to avoid racing with watcher cleanup in ms_handle_reset
9175 // and handle_notify_timeout
9176 pg->on_removal(&rmt);
9177
9178 service.cancel_pending_splits_for_parent(pg->info.pgid);
9179 int tr = store->queue_transaction(
9180 pg->osr.get(), std::move(rmt), NULL,
9181 new ContainerContext<
9182 SequencerRef>(pg->osr));
9183 assert(tr == 0);
9184
9185 DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
9186 pg->info.pgid,
9187 make_pair(
9188 pg->info.pgid,
9189 PGRef(pg))
9190 );
9191 remove_wq.queue(make_pair(PGRef(pg), deleting));
9192
9193 service.pg_remove_epoch(pg->info.pgid);
9194
9195 // dereference from op_wq
9196 op_shardedwq.clear_pg_pointer(pg->info.pgid);
9197
9198 // remove from map
9199 pg_map.erase(pg->info.pgid);
9200 pg->put("PGMap"); // since we've taken it out of map
9201 }
9202
9203
9204 // =========================================================
9205 // RECOVERY
9206
9207 void OSDService::_maybe_queue_recovery() {
9208 assert(recovery_lock.is_locked_by_me());
9209 uint64_t available_pushes;
9210 while (!awaiting_throttle.empty() &&
9211 _recover_now(&available_pushes)) {
9212 uint64_t to_start = MIN(
9213 available_pushes,
9214 cct->_conf->osd_recovery_max_single_start);
9215 _queue_for_recovery(awaiting_throttle.front(), to_start);
9216 awaiting_throttle.pop_front();
9217 recovery_ops_reserved += to_start;
9218 }
9219 }
9220
9221 bool OSDService::_recover_now(uint64_t *available_pushes)
9222 {
9223 if (available_pushes)
9224 *available_pushes = 0;
9225
9226 if (ceph_clock_now() < defer_recovery_until) {
9227 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9228 return false;
9229 }
9230
9231 if (recovery_paused) {
9232 dout(15) << __func__ << " paused" << dendl;
9233 return false;
9234 }
9235
9236 uint64_t max = cct->_conf->osd_recovery_max_active;
9237 if (max <= recovery_ops_active + recovery_ops_reserved) {
9238 dout(15) << __func__ << " active " << recovery_ops_active
9239 << " + reserved " << recovery_ops_reserved
9240 << " >= max " << max << dendl;
9241 return false;
9242 }
9243
9244 if (available_pushes)
9245 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9246
9247 return true;
9248 }
9249
9250
9251 void OSDService::adjust_pg_priorities(const vector<PGRef>& pgs, int newflags)
9252 {
9253 if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY)))
9254 return;
9255 int newstate = 0;
9256
9257 if (newflags & OFR_BACKFILL) {
9258 newstate = PG_STATE_FORCED_BACKFILL;
9259 } else if (newflags & OFR_RECOVERY) {
9260 newstate = PG_STATE_FORCED_RECOVERY;
9261 }
9262
9263 // debug output here may get large, don't generate it if debug level is below
9264 // 10 and use abbreviated pg ids otherwise
9265 if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) {
9266 stringstream ss;
9267
9268 for (auto& i : pgs) {
9269 ss << i->get_pgid() << " ";
9270 }
9271
9272 dout(10) << __func__ << " working on " << ss.str() << dendl;
9273 }
9274
9275 if (newflags & OFR_CANCEL) {
9276 for (auto& i : pgs) {
9277 i->lock();
9278 i->_change_recovery_force_mode(newstate, true);
9279 i->unlock();
9280 }
9281 } else {
9282 for (auto& i : pgs) {
9283 // make sure the PG is in correct state before forcing backfill or recovery, or
9284 // else we'll make PG keeping FORCE_* flag forever, requiring osds restart
9285 // or forcing somehow recovery/backfill.
9286 i->lock();
9287 int pgstate = i->get_state();
9288 if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
9289 ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL))) )
9290 i->_change_recovery_force_mode(newstate, false);
9291 i->unlock();
9292 }
9293 }
9294 }
9295
9296 void OSD::do_recovery(
9297 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9298 ThreadPool::TPHandle &handle)
9299 {
9300 uint64_t started = 0;
9301
9302 /*
9303 * When the value of osd_recovery_sleep is set greater than zero, recovery
9304 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9305 * recovery event's schedule time. This is done by adding a
9306 * recovery_requeue_callback event, which re-queues the recovery op using
9307 * queue_recovery_after_sleep.
9308 */
9309 float recovery_sleep = get_osd_recovery_sleep();
9310 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9311 PGRef pgref(pg);
9312 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9313 dout(20) << "do_recovery wake up at "
9314 << ceph_clock_now()
9315 << ", re-queuing recovery" << dendl;
9316 service.recovery_needs_sleep = false;
9317 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9318 });
9319 Mutex::Locker l(service.recovery_sleep_lock);
9320
9321 // This is true for the first recovery op and when the previous recovery op
9322 // has been scheduled in the past. The next recovery op is scheduled after
9323 // completing the sleep from now.
9324 if (service.recovery_schedule_time < ceph_clock_now()) {
9325 service.recovery_schedule_time = ceph_clock_now();
9326 }
9327 service.recovery_schedule_time += recovery_sleep;
9328 service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
9329 recovery_requeue_callback);
9330 dout(20) << "Recovery event scheduled at "
9331 << service.recovery_schedule_time << dendl;
9332 return;
9333 }
9334
9335 {
9336 service.recovery_needs_sleep = true;
9337 if (pg->pg_has_reset_since(queued)) {
9338 goto out;
9339 }
9340
9341 assert(!pg->deleting);
9342 assert(pg->is_peered() && pg->is_primary());
9343
9344 assert(pg->recovery_queued);
9345 pg->recovery_queued = false;
9346
9347 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9348 #ifdef DEBUG_RECOVERY_OIDS
9349 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
9350 #endif
9351
9352 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
9353 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9354 << " on " << *pg << dendl;
9355
9356 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
9357 if (!started && (more || !pg->have_unfound())) {
9358 goto out;
9359 }
9360
9361 PG::RecoveryCtx rctx = create_context();
9362 rctx.handle = &handle;
9363
9364 /*
9365 * if we couldn't start any recovery ops and things are still
9366 * unfound, see if we can discover more missing object locations.
9367 * It may be that our initial locations were bad and we errored
9368 * out while trying to pull.
9369 */
9370 if (!more && pg->have_unfound()) {
9371 pg->discover_all_missing(*rctx.query_map);
9372 if (rctx.query_map->empty()) {
9373 string action;
9374 if (pg->state_test(PG_STATE_BACKFILL)) {
9375 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9376 queued,
9377 queued,
9378 PG::CancelBackfill()));
9379 pg->queue_peering_event(evt);
9380 action = "in backfill";
9381 } else if (pg->state_test(PG_STATE_RECOVERING)) {
9382 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9383 queued,
9384 queued,
9385 PG::CancelRecovery()));
9386 pg->queue_peering_event(evt);
9387 action = "in recovery";
9388 } else {
9389 action = "already out of recovery/backfill";
9390 }
9391 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
9392 } else {
9393 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
9394 pg->queue_recovery();
9395 }
9396 }
9397
9398 pg->write_if_dirty(*rctx.transaction);
9399 OSDMapRef curmap = pg->get_osdmap();
9400 dispatch_context(rctx, pg, curmap);
9401 }
9402
9403 out:
9404 assert(started <= reserved_pushes);
9405 service.release_reserved_pushes(reserved_pushes);
9406 }
9407
9408 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9409 {
9410 Mutex::Locker l(recovery_lock);
9411 dout(10) << "start_recovery_op " << *pg << " " << soid
9412 << " (" << recovery_ops_active << "/"
9413 << cct->_conf->osd_recovery_max_active << " rops)"
9414 << dendl;
9415 recovery_ops_active++;
9416
9417 #ifdef DEBUG_RECOVERY_OIDS
9418 dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
9419 assert(recovery_oids[pg->info.pgid].count(soid) == 0);
9420 recovery_oids[pg->info.pgid].insert(soid);
9421 #endif
9422 }
9423
9424 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9425 {
9426 Mutex::Locker l(recovery_lock);
9427 dout(10) << "finish_recovery_op " << *pg << " " << soid
9428 << " dequeue=" << dequeue
9429 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9430 << dendl;
9431
9432 // adjust count
9433 assert(recovery_ops_active > 0);
9434 recovery_ops_active--;
9435
9436 #ifdef DEBUG_RECOVERY_OIDS
9437 dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
9438 assert(recovery_oids[pg->info.pgid].count(soid));
9439 recovery_oids[pg->info.pgid].erase(soid);
9440 #endif
9441
9442 _maybe_queue_recovery();
9443 }
9444
9445 bool OSDService::is_recovery_active()
9446 {
9447 Mutex::Locker l(recovery_lock);
9448 return recovery_ops_active > 0;
9449 }
9450
9451 // =========================================================
9452 // OPS
9453
9454 bool OSD::op_is_discardable(const MOSDOp *op)
9455 {
9456 // drop client request if they are not connected and can't get the
9457 // reply anyway.
9458 if (!op->get_connection()->is_connected()) {
9459 return true;
9460 }
9461 return false;
9462 }
9463
9464 void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
9465 {
9466 utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
9467 dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
9468 << " cost " << op->get_req()->get_cost()
9469 << " latency " << latency
9470 << " epoch " << epoch
9471 << " " << *(op->get_req()) << dendl;
9472 op->osd_trace.event("enqueue op");
9473 op->osd_trace.keyval("priority", op->get_req()->get_priority());
9474 op->osd_trace.keyval("cost", op->get_req()->get_cost());
9475 op->mark_queued_for_pg();
9476 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9477 op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
9478 }
9479
9480
9481
9482 /*
9483 * NOTE: dequeue called in worker thread, with pg lock
9484 */
9485 void OSD::dequeue_op(
9486 PGRef pg, OpRequestRef op,
9487 ThreadPool::TPHandle &handle)
9488 {
9489 FUNCTRACE();
9490 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9491
9492 utime_t now = ceph_clock_now();
9493 op->set_dequeued_time(now);
9494 utime_t latency = now - op->get_req()->get_recv_stamp();
9495 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9496 << " cost " << op->get_req()->get_cost()
9497 << " latency " << latency
9498 << " " << *(op->get_req())
9499 << " pg " << *pg << dendl;
9500
9501 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9502
9503 Session *session = static_cast<Session *>(
9504 op->get_req()->get_connection()->get_priv());
9505 if (session) {
9506 maybe_share_map(session, op, pg->get_osdmap());
9507 session->put();
9508 }
9509
9510 if (pg->deleting)
9511 return;
9512
9513 op->mark_reached_pg();
9514 op->osd_trace.event("dequeue_op");
9515
9516 pg->do_request(op, handle);
9517
9518 // finish
9519 dout(10) << "dequeue_op " << op << " finish" << dendl;
9520 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
9521 }
9522
9523
9524 struct C_CompleteSplits : public Context {
9525 OSD *osd;
9526 set<PGRef> pgs;
9527 C_CompleteSplits(OSD *osd, const set<PGRef> &in)
9528 : osd(osd), pgs(in) {}
9529 void finish(int r) override {
9530 Mutex::Locker l(osd->osd_lock);
9531 if (osd->is_stopping())
9532 return;
9533 PG::RecoveryCtx rctx = osd->create_context();
9534 for (set<PGRef>::iterator i = pgs.begin();
9535 i != pgs.end();
9536 ++i) {
9537 osd->pg_map_lock.get_write();
9538 (*i)->lock();
9539 PG *pg = i->get();
9540 osd->add_newly_split_pg(pg, &rctx);
9541 if (!((*i)->deleting)) {
9542 set<spg_t> to_complete;
9543 to_complete.insert((*i)->info.pgid);
9544 osd->service.complete_split(to_complete);
9545 }
9546 osd->pg_map_lock.put_write();
9547 osd->dispatch_context_transaction(rctx, pg);
9548 osd->wake_pg_waiters(*i);
9549 (*i)->unlock();
9550 }
9551
9552 osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
9553 }
9554 };
9555
9556 void OSD::process_peering_events(
9557 const list<PG*> &pgs,
9558 ThreadPool::TPHandle &handle
9559 )
9560 {
9561 bool need_up_thru = false;
9562 epoch_t same_interval_since = 0;
9563 OSDMapRef curmap;
9564 PG::RecoveryCtx rctx = create_context();
9565 rctx.handle = &handle;
9566 for (list<PG*>::const_iterator i = pgs.begin();
9567 i != pgs.end();
9568 ++i) {
9569 set<PGRef> split_pgs;
9570 PG *pg = *i;
9571 pg->lock_suspend_timeout(handle);
9572 curmap = service.get_osdmap();
9573 if (pg->deleting) {
9574 pg->unlock();
9575 continue;
9576 }
9577 if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
9578 // we need to requeue the PG explicitly since we didn't actually
9579 // handle an event
9580 peering_wq.queue(pg);
9581 } else {
9582 assert(!pg->peering_queue.empty());
9583 PG::CephPeeringEvtRef evt = pg->peering_queue.front();
9584 pg->peering_queue.pop_front();
9585 pg->handle_peering_event(evt, &rctx);
9586 }
9587 need_up_thru = pg->need_up_thru || need_up_thru;
9588 same_interval_since = MAX(pg->info.history.same_interval_since,
9589 same_interval_since);
9590 pg->write_if_dirty(*rctx.transaction);
9591 if (!split_pgs.empty()) {
9592 rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
9593 split_pgs.clear();
9594 }
9595 dispatch_context_transaction(rctx, pg, &handle);
9596 pg->unlock();
9597 }
9598 if (need_up_thru)
9599 queue_want_up_thru(same_interval_since);
9600 dispatch_context(rctx, 0, curmap, &handle);
9601
9602 service.send_pg_temp();
9603 }
9604
9605 // --------------------------------
9606
9607 const char** OSD::get_tracked_conf_keys() const
9608 {
9609 static const char* KEYS[] = {
9610 "osd_max_backfills",
9611 "osd_min_recovery_priority",
9612 "osd_max_trimming_pgs",
9613 "osd_op_complaint_time",
9614 "osd_op_log_threshold",
9615 "osd_op_history_size",
9616 "osd_op_history_duration",
9617 "osd_op_history_slow_op_size",
9618 "osd_op_history_slow_op_threshold",
9619 "osd_enable_op_tracker",
9620 "osd_map_cache_size",
9621 "osd_map_max_advance",
9622 "osd_pg_epoch_persisted_max_stale",
9623 "osd_disk_thread_ioprio_class",
9624 "osd_disk_thread_ioprio_priority",
9625 // clog & admin clog
9626 "clog_to_monitors",
9627 "clog_to_syslog",
9628 "clog_to_syslog_facility",
9629 "clog_to_syslog_level",
9630 "osd_objectstore_fuse",
9631 "clog_to_graylog",
9632 "clog_to_graylog_host",
9633 "clog_to_graylog_port",
9634 "host",
9635 "fsid",
9636 "osd_recovery_delay_start",
9637 "osd_client_message_size_cap",
9638 "osd_client_message_cap",
9639 "osd_heartbeat_min_size",
9640 "osd_heartbeat_interval",
9641 NULL
9642 };
9643 return KEYS;
9644 }
9645
9646 void OSD::handle_conf_change(const struct md_config_t *conf,
9647 const std::set <std::string> &changed)
9648 {
9649 if (changed.count("osd_max_backfills")) {
9650 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9651 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9652 }
9653 if (changed.count("osd_min_recovery_priority")) {
9654 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9655 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9656 }
9657 if (changed.count("osd_max_trimming_pgs")) {
9658 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9659 }
9660 if (changed.count("osd_op_complaint_time") ||
9661 changed.count("osd_op_log_threshold")) {
9662 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9663 cct->_conf->osd_op_log_threshold);
9664 }
9665 if (changed.count("osd_op_history_size") ||
9666 changed.count("osd_op_history_duration")) {
9667 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9668 cct->_conf->osd_op_history_duration);
9669 }
9670 if (changed.count("osd_op_history_slow_op_size") ||
9671 changed.count("osd_op_history_slow_op_threshold")) {
9672 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9673 cct->_conf->osd_op_history_slow_op_threshold);
9674 }
9675 if (changed.count("osd_enable_op_tracker")) {
9676 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9677 }
9678 if (changed.count("osd_disk_thread_ioprio_class") ||
9679 changed.count("osd_disk_thread_ioprio_priority")) {
9680 set_disk_tp_priority();
9681 }
9682 if (changed.count("osd_map_cache_size")) {
9683 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9684 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9685 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9686 }
9687 if (changed.count("clog_to_monitors") ||
9688 changed.count("clog_to_syslog") ||
9689 changed.count("clog_to_syslog_level") ||
9690 changed.count("clog_to_syslog_facility") ||
9691 changed.count("clog_to_graylog") ||
9692 changed.count("clog_to_graylog_host") ||
9693 changed.count("clog_to_graylog_port") ||
9694 changed.count("host") ||
9695 changed.count("fsid")) {
9696 update_log_config();
9697 }
9698
9699 #ifdef HAVE_LIBFUSE
9700 if (changed.count("osd_objectstore_fuse")) {
9701 if (store) {
9702 enable_disable_fuse(false);
9703 }
9704 }
9705 #endif
9706
9707 if (changed.count("osd_recovery_delay_start")) {
9708 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9709 service.kick_recovery_queue();
9710 }
9711
9712 if (changed.count("osd_client_message_cap")) {
9713 uint64_t newval = cct->_conf->osd_client_message_cap;
9714 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9715 if (pol.throttler_messages && newval > 0) {
9716 pol.throttler_messages->reset_max(newval);
9717 }
9718 }
9719 if (changed.count("osd_client_message_size_cap")) {
9720 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9721 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9722 if (pol.throttler_bytes && newval > 0) {
9723 pol.throttler_bytes->reset_max(newval);
9724 }
9725 }
9726
9727 check_config();
9728 }
9729
9730 void OSD::update_log_config()
9731 {
9732 map<string,string> log_to_monitors;
9733 map<string,string> log_to_syslog;
9734 map<string,string> log_channel;
9735 map<string,string> log_prio;
9736 map<string,string> log_to_graylog;
9737 map<string,string> log_to_graylog_host;
9738 map<string,string> log_to_graylog_port;
9739 uuid_d fsid;
9740 string host;
9741
9742 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9743 log_channel, log_prio, log_to_graylog,
9744 log_to_graylog_host, log_to_graylog_port,
9745 fsid, host) == 0)
9746 clog->update_config(log_to_monitors, log_to_syslog,
9747 log_channel, log_prio, log_to_graylog,
9748 log_to_graylog_host, log_to_graylog_port,
9749 fsid, host);
9750 derr << "log_to_monitors " << log_to_monitors << dendl;
9751 }
9752
9753 void OSD::check_config()
9754 {
9755 // some sanity checks
9756 if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
9757 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9758 << " is not > osd_map_max_advance ("
9759 << cct->_conf->osd_map_max_advance << ")";
9760 }
9761 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9762 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9763 << " is not > osd_pg_epoch_persisted_max_stale ("
9764 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
9765 }
9766 }
9767
9768 void OSD::set_disk_tp_priority()
9769 {
9770 dout(10) << __func__
9771 << " class " << cct->_conf->osd_disk_thread_ioprio_class
9772 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
9773 << dendl;
9774 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
9775 cct->_conf->osd_disk_thread_ioprio_priority < 0)
9776 return;
9777 int cls =
9778 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
9779 if (cls < 0)
9780 derr << __func__ << cpp_strerror(cls) << ": "
9781 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
9782 << " but only the following values are allowed: idle, be or rt" << dendl;
9783 else
9784 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
9785 }
9786
9787 // --------------------------------
9788
9789 void OSD::get_latest_osdmap()
9790 {
9791 dout(10) << __func__ << " -- start" << dendl;
9792
9793 C_SaferCond cond;
9794 service.objecter->wait_for_latest_osdmap(&cond);
9795 cond.wait();
9796
9797 dout(10) << __func__ << " -- finish" << dendl;
9798 }
9799
9800 // --------------------------------
9801
9802 int OSD::init_op_flags(OpRequestRef& op)
9803 {
9804 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
9805 vector<OSDOp>::const_iterator iter;
9806
9807 // client flags have no bearing on whether an op is a read, write, etc.
9808 op->rmw_flags = 0;
9809
9810 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
9811 op->set_force_rwordered();
9812 }
9813
9814 // set bits based on op codes, called methods.
9815 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
9816 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
9817 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
9818 /* This is a bit odd. PING isn't actually a write. It can't
9819 * result in an update to the object_info. PINGs also aren't
9820 * resent, so there's no reason to write out a log entry
9821 *
9822 * However, we pipeline them behind writes, so let's force
9823 * the write_ordered flag.
9824 */
9825 op->set_force_rwordered();
9826 } else {
9827 if (ceph_osd_op_mode_modify(iter->op.op))
9828 op->set_write();
9829 }
9830 if (ceph_osd_op_mode_read(iter->op.op))
9831 op->set_read();
9832
9833 // set READ flag if there are src_oids
9834 if (iter->soid.oid.name.length())
9835 op->set_read();
9836
9837 // set PGOP flag if there are PG ops
9838 if (ceph_osd_op_type_pg(iter->op.op))
9839 op->set_pg_op();
9840
9841 if (ceph_osd_op_mode_cache(iter->op.op))
9842 op->set_cache();
9843
9844 // check for ec base pool
9845 int64_t poolid = m->get_pg().pool();
9846 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
9847 if (pool && pool->is_tier()) {
9848 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
9849 if (base_pool && base_pool->require_rollback()) {
9850 if ((iter->op.op != CEPH_OSD_OP_READ) &&
9851 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
9852 (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
9853 (iter->op.op != CEPH_OSD_OP_STAT) &&
9854 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
9855 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
9856 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
9857 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
9858 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
9859 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
9860 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
9861 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
9862 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
9863 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
9864 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
9865 (iter->op.op != CEPH_OSD_OP_CREATE) &&
9866 (iter->op.op != CEPH_OSD_OP_DELETE) &&
9867 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
9868 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
9869 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
9870 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
9871 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
9872 op->set_promote();
9873 }
9874 }
9875 }
9876
9877 switch (iter->op.op) {
9878 case CEPH_OSD_OP_CALL:
9879 {
9880 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
9881 int is_write, is_read;
9882 string cname, mname;
9883 bp.copy(iter->op.cls.class_len, cname);
9884 bp.copy(iter->op.cls.method_len, mname);
9885
9886 ClassHandler::ClassData *cls;
9887 int r = class_handler->open_class(cname, &cls);
9888 if (r) {
9889 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
9890 if (r == -ENOENT)
9891 r = -EOPNOTSUPP;
9892 else if (r != -EPERM) // propagate permission errors
9893 r = -EIO;
9894 return r;
9895 }
9896 int flags = cls->get_method_flags(mname.c_str());
9897 if (flags < 0) {
9898 if (flags == -ENOENT)
9899 r = -EOPNOTSUPP;
9900 else
9901 r = flags;
9902 return r;
9903 }
9904 is_read = flags & CLS_METHOD_RD;
9905 is_write = flags & CLS_METHOD_WR;
9906 bool is_promote = flags & CLS_METHOD_PROMOTE;
9907
9908 dout(10) << "class " << cname << " method " << mname << " "
9909 << "flags=" << (is_read ? "r" : "")
9910 << (is_write ? "w" : "")
9911 << (is_promote ? "p" : "")
9912 << dendl;
9913 if (is_read)
9914 op->set_class_read();
9915 if (is_write)
9916 op->set_class_write();
9917 if (is_promote)
9918 op->set_promote();
9919 op->add_class(cname, is_read, is_write, cls->whitelisted);
9920 break;
9921 }
9922
9923 case CEPH_OSD_OP_WATCH:
9924 // force the read bit for watch since it depends on previous
9925 // watch state (and may return early if the watch exists) or, in
9926 // the case of ping, is simply a read op.
9927 op->set_read();
9928 // fall through
9929 case CEPH_OSD_OP_NOTIFY:
9930 case CEPH_OSD_OP_NOTIFY_ACK:
9931 {
9932 op->set_promote();
9933 break;
9934 }
9935
9936 case CEPH_OSD_OP_DELETE:
9937 // if we get a delete with FAILOK we can skip handle cache. without
9938 // FAILOK we still need to promote (or do something smarter) to
9939 // determine whether to return ENOENT or 0.
9940 if (iter == m->ops.begin() &&
9941 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
9942 op->set_skip_handle_cache();
9943 }
9944 // skip promotion when proxying a delete op
9945 if (m->ops.size() == 1) {
9946 op->set_skip_promote();
9947 }
9948 break;
9949
9950 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
9951 case CEPH_OSD_OP_CACHE_FLUSH:
9952 case CEPH_OSD_OP_CACHE_EVICT:
9953 // If try_flush/flush/evict is the only op, can skip handle cache.
9954 if (m->ops.size() == 1) {
9955 op->set_skip_handle_cache();
9956 }
9957 break;
9958
9959 case CEPH_OSD_OP_READ:
9960 case CEPH_OSD_OP_SYNC_READ:
9961 case CEPH_OSD_OP_SPARSE_READ:
9962 case CEPH_OSD_OP_CHECKSUM:
9963 case CEPH_OSD_OP_WRITEFULL:
9964 if (m->ops.size() == 1 &&
9965 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
9966 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
9967 op->set_skip_promote();
9968 }
9969 break;
9970
9971 // force promotion when pinning an object in the cache tier
9972 case CEPH_OSD_OP_CACHE_PIN:
9973 op->set_promote();
9974 break;
9975
9976 default:
9977 break;
9978 }
9979 }
9980
9981 if (op->rmw_flags == 0)
9982 return -EINVAL;
9983
9984 return 0;
9985 }
9986
9987 void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
9988 for (list<PG*>::iterator i = peering_queue.begin();
9989 i != peering_queue.end() &&
9990 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
9991 ) {
9992 if (in_use.count(*i)) {
9993 ++i;
9994 } else {
9995 out->push_back(*i);
9996 peering_queue.erase(i++);
9997 }
9998 }
9999 in_use.insert(out->begin(), out->end());
10000 }
10001
10002
10003 // =============================================================
10004
10005 #undef dout_context
10006 #define dout_context osd->cct
10007 #undef dout_prefix
10008 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10009
10010 void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
10011 {
10012 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10013 auto sdata = shard_list[shard_index];
10014 bool queued = false;
10015 unsigned pushes_to_free = 0;
10016 {
10017 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10018 auto p = sdata->pg_slots.find(pgid);
10019 if (p != sdata->pg_slots.end()) {
10020 dout(20) << __func__ << " " << pgid
10021 << " to_process " << p->second.to_process
10022 << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
10023 for (auto i = p->second.to_process.rbegin();
10024 i != p->second.to_process.rend();
10025 ++i) {
10026 sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
10027 }
10028 for (auto& q : p->second.to_process) {
10029 pushes_to_free += q.get_reserved_pushes();
10030 }
10031 p->second.to_process.clear();
10032 p->second.waiting_for_pg = false;
10033 ++p->second.requeue_seq;
10034 queued = true;
10035 }
10036 }
10037 if (pushes_to_free > 0) {
10038 osd->service.release_reserved_pushes(pushes_to_free);
10039 }
10040 if (queued) {
10041 sdata->sdata_lock.Lock();
10042 sdata->sdata_cond.SignalOne();
10043 sdata->sdata_lock.Unlock();
10044 }
10045 }
10046
10047 void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
10048 {
10049 unsigned pushes_to_free = 0;
10050 for (auto sdata : shard_list) {
10051 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10052 sdata->waiting_for_pg_osdmap = osdmap;
10053 auto p = sdata->pg_slots.begin();
10054 while (p != sdata->pg_slots.end()) {
10055 ShardData::pg_slot& slot = p->second;
10056 if (!slot.to_process.empty() && slot.num_running == 0) {
10057 if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
10058 dout(20) << __func__ << " " << p->first << " maps to us, keeping"
10059 << dendl;
10060 ++p;
10061 continue;
10062 }
10063 while (!slot.to_process.empty() &&
10064 slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
10065 auto& qi = slot.to_process.front();
10066 dout(20) << __func__ << " " << p->first
10067 << " item " << qi
10068 << " epoch " << qi.get_map_epoch()
10069 << " <= " << osdmap->get_epoch()
10070 << ", stale, dropping" << dendl;
10071 pushes_to_free += qi.get_reserved_pushes();
10072 slot.to_process.pop_front();
10073 }
10074 }
10075 if (slot.to_process.empty() &&
10076 slot.num_running == 0 &&
10077 !slot.pg) {
10078 dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
10079 p = sdata->pg_slots.erase(p);
10080 } else {
10081 ++p;
10082 }
10083 }
10084 }
10085 if (pushes_to_free > 0) {
10086 osd->service.release_reserved_pushes(pushes_to_free);
10087 }
10088 }
10089
10090 void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
10091 {
10092 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10093 auto sdata = shard_list[shard_index];
10094 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10095 auto p = sdata->pg_slots.find(pgid);
10096 if (p != sdata->pg_slots.end()) {
10097 auto& slot = p->second;
10098 dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
10099 assert(!slot.pg || slot.pg->deleting);
10100 slot.pg = nullptr;
10101 }
10102 }
10103
10104 void OSD::ShardedOpWQ::clear_pg_slots()
10105 {
10106 for (auto sdata : shard_list) {
10107 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10108 sdata->pg_slots.clear();
10109 sdata->waiting_for_pg_osdmap.reset();
10110 // don't bother with reserved pushes; we are shutting down
10111 }
10112 }
10113
10114 #undef dout_prefix
10115 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10116
// Process one work item for the shard selected by thread_index % num_shards.
//
// Outline:
//   1. Under sdata_op_ordering_lock, dequeue one (spg_t, PGQueueable) pair
//      from the shard's pqueue (after one timed wait if the queue is empty)
//      and append it to the pg slot's to_process list.
//   2. Drop the ordering lock and lock the PG, looking it up first when the
//      slot has no cached pointer.
//   3. Re-take the ordering lock, bail out if the slot changed underneath us
//      (emptied, requeued, or marked waiting_for_pg), otherwise pop the front
//      item of to_process.
//   4. If there is no PG: park the item (waiting_for_pg) or drop it, based on
//      sdata->waiting_for_pg_osdmap.  Otherwise run the item with the PG
//      locked and unlock the PG afterwards.
//
// thread_index: used only to choose the shard.
// hb: heartbeat handle; passed to reset_timeout() while idle and to the
//     TPHandle given to the item's run().
10117 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10118 {
10119 uint32_t shard_index = thread_index % num_shards;
10120 ShardData *sdata = shard_list[shard_index];
10121 assert(NULL != sdata);
10122
10123 // peek at spg_t
10124 sdata->sdata_op_ordering_lock.Lock();
10125 if (sdata->pqueue->empty()) {
10126 dout(20) << __func__ << " empty q, waiting" << dendl;
10127 // optimistically sleep a moment; maybe another work item will come along.
10128 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10129 osd->cct->_conf->threadpool_default_timeout, 0);
// sdata_lock is taken before the ordering lock is released; presumably so
// that an enqueuer (which signals under sdata_lock after releasing the
// ordering lock) cannot signal before we are waiting -- confirm.
10130 sdata->sdata_lock.Lock();
10131 sdata->sdata_op_ordering_lock.Unlock();
10132 sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
10133 utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
10134 sdata->sdata_lock.Unlock();
10135 sdata->sdata_op_ordering_lock.Lock();
// still nothing queued after the timed wait: give up for this call
10136 if (sdata->pqueue->empty()) {
10137 sdata->sdata_op_ordering_lock.Unlock();
10138 return;
10139 }
10140 }
10141 pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
10142 if (osd->is_stopping()) {
10143 sdata->sdata_op_ordering_lock.Unlock();
10144 return; // OSD shutdown, discard.
10145 }
10146 PGRef pg;
10147 uint64_t requeue_seq;
10148 {
// operator[] creates the slot if this pg has none yet
10149 auto& slot = sdata->pg_slots[item.first];
10150 dout(30) << __func__ << " " << item.first
10151 << " to_process " << slot.to_process
10152 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10153 slot.to_process.push_back(item.second);
10154 // note the requeue seq now...
// (wake_pg_waiters increments requeue_seq; the value is compared again below
// once the ordering lock has been re-taken)
10155 requeue_seq = slot.requeue_seq;
10156 if (slot.waiting_for_pg) {
10157 // save ourselves a bit of effort
10158 dout(20) << __func__ << " " << item.first << " item " << item.second
10159 << " queued, waiting_for_pg" << dendl;
10160 sdata->sdata_op_ordering_lock.Unlock();
10161 return;
10162 }
10163 pg = slot.pg;
10164 dout(20) << __func__ << " " << item.first << " item " << item.second
10165 << " queued" << dendl;
// mark the slot busy while the ordering lock is dropped; prune_pg_waiters
// only drops items from, or erases, slots whose num_running is 0
10166 ++slot.num_running;
10167 }
10168 sdata->sdata_op_ordering_lock.Unlock();
10169
10170 osd->service.maybe_inject_dispatch_delay();
10171
10172 // [lookup +] lock pg (if we have it)
// pg may still be null after the lookup; that case is handled further down
10173 if (!pg) {
10174 pg = osd->_lookup_lock_pg(item.first);
10175 } else {
10176 pg->lock();
10177 }
10178
10179 osd->service.maybe_inject_dispatch_delay();
10180
10181 boost::optional<PGQueueable> qi;
10182
10183 // we don't use a Mutex::Locker here because of the
10184 // osd->service.release_reserved_pushes() call below
10185 sdata->sdata_op_ordering_lock.Lock();
10186
10187 auto q = sdata->pg_slots.find(item.first);
10188 assert(q != sdata->pg_slots.end());
10189 auto& slot = q->second;
10190 --slot.num_running;
10191
// From here on, every early return must unlock the pg (if we hold one) and
// release the ordering lock.
10192 if (slot.to_process.empty()) {
10193 // raced with wake_pg_waiters or prune_pg_waiters
10194 dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
10195 if (pg) {
10196 pg->unlock();
10197 }
10198 sdata->sdata_op_ordering_lock.Unlock();
10199 return;
10200 }
10201 if (requeue_seq != slot.requeue_seq) {
10202 dout(20) << __func__ << " " << item.first
10203 << " requeue_seq " << slot.requeue_seq << " > our "
10204 << requeue_seq << ", we raced with wake_pg_waiters"
10205 << dendl;
10206 if (pg) {
10207 pg->unlock();
10208 }
10209 sdata->sdata_op_ordering_lock.Unlock();
10210 return;
10211 }
// cache the pg pointer in the slot so later calls can skip the lookup;
// a pg marked deleting is never cached
10212 if (pg && !slot.pg && !pg->deleting) {
10213 dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
10214 slot.pg = pg;
10215 }
10216 dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
10217 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10218
10219 // make sure we're not already waiting for this pg
10220 if (slot.waiting_for_pg) {
10221 dout(20) << __func__ << " " << item.first << " item " << item.second
10222 << " slot is waiting_for_pg" << dendl;
10223 if (pg) {
10224 pg->unlock();
10225 }
10226 sdata->sdata_op_ordering_lock.Unlock();
10227 return;
10228 }
10229
10230 // take next item
// note: this is the front of to_process, which is not necessarily the item
// this call dequeued from pqueue (that one was appended at the back)
10231 qi = slot.to_process.front();
10232 slot.to_process.pop_front();
10233 dout(20) << __func__ << " " << item.first << " item " << *qi
10234 << " pg " << pg << dendl;
10235
// No pg: every path through this branch returns, so pg is non-null for the
// remainder of the function.
10236 if (!pg) {
10237 // should this pg shard exist on this osd in this (or a later) epoch?
10238 OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
10239 if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
10240 dout(20) << __func__ << " " << item.first
10241 << " no pg, should exist, will wait" << " on " << *qi << dendl;
10242 slot.to_process.push_front(*qi);
10243 slot.waiting_for_pg = true;
10244 } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
10245 dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
10246 << qi->get_map_epoch() << " > " << osdmap->get_epoch()
10247 << ", will wait on " << *qi << dendl;
10248 slot.to_process.push_front(*qi);
10249 slot.waiting_for_pg = true;
10250 } else {
10251 dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
10252 << " dropping " << *qi << dendl;
10253 // share map with client?
10254 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10255 Session *session = static_cast<Session *>(
10256 (*_op)->get_req()->get_connection()->get_priv());
10257 if (session) {
10258 osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
// balances the get_priv() above -- presumably get_priv() returns a
// referenced pointer; confirm
10259 session->put();
10260 }
10261 }
10262 unsigned pushes_to_free = qi->get_reserved_pushes();
10263 if (pushes_to_free > 0) {
// release the ordering lock before calling release_reserved_pushes()
10264 sdata->sdata_op_ordering_lock.Unlock();
10265 osd->service.release_reserved_pushes(pushes_to_free);
10266 return;
10267 }
10268 }
10269 sdata->sdata_op_ordering_lock.Unlock();
10270 return;
10271 }
10272 sdata->sdata_op_ordering_lock.Unlock();
10273
10274
10275 // osd_opwq_process marks the point at which an operation has been dequeued
10276 // and will begin to be handled by a worker thread.
// NOTE(review): reqid is only declared under WITH_LTTNG; presumably
// tracepoint() expands to nothing in builds without it -- confirm.
10277 {
10278 #ifdef WITH_LTTNG
10279 osd_reqid_t reqid;
10280 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10281 reqid = (*_op)->get_reqid();
10282 }
10283 #endif
10284 tracepoint(osd, opwq_process_start, reqid.name._type,
10285 reqid.name._num, reqid.tid, reqid.inc);
10286 }
10287
// Level-30 log statement that dumps the queue state as JSON.
// NOTE(review): the statements from here to the dendl below appear to form a
// single log statement (they write to _dout, which is not declared in this
// function); presumably the macro opens a scope that dendl closes, so the
// Formatter is only created when this level is gathered -- confirm against
// the logging macro definitions.
10288 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10289 Formatter *f = Formatter::create("json");
10290 f->open_object_section("q");
10291 dump(f);
10292 f->close_section();
10293 f->flush(*_dout);
10294 delete f;
10295 *_dout << dendl;
10296
// run the item with the pg locked and the ordering lock released
10297 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10298 suicide_interval);
10299 qi->run(osd, pg, tp_handle);
10300
10301 {
10302 #ifdef WITH_LTTNG
10303 osd_reqid_t reqid;
10304 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10305 reqid = (*_op)->get_reqid();
10306 }
10307 #endif
10308 tracepoint(osd, opwq_process_finish, reqid.name._type,
10309 reqid.name._num, reqid.tid, reqid.inc);
10310 }
10311
10312 pg->unlock();
10313 }
10314
10315 void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
10316 uint32_t shard_index =
10317 item.first.hash_to_shard(shard_list.size());
10318
10319 ShardData* sdata = shard_list[shard_index];
10320 assert (NULL != sdata);
10321 unsigned priority = item.second.get_priority();
10322 unsigned cost = item.second.get_cost();
10323 sdata->sdata_op_ordering_lock.Lock();
10324
10325 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10326 if (priority >= osd->op_prio_cutoff)
10327 sdata->pqueue->enqueue_strict(
10328 item.second.get_owner(), priority, item);
10329 else
10330 sdata->pqueue->enqueue(
10331 item.second.get_owner(),
10332 priority, cost, item);
10333 sdata->sdata_op_ordering_lock.Unlock();
10334
10335 sdata->sdata_lock.Lock();
10336 sdata->sdata_cond.SignalOne();
10337 sdata->sdata_lock.Unlock();
10338
10339 }
10340
10341 void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
10342 {
10343 uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
10344 ShardData* sdata = shard_list[shard_index];
10345 assert (NULL != sdata);
10346 sdata->sdata_op_ordering_lock.Lock();
10347 auto p = sdata->pg_slots.find(item.first);
10348 if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
10349 // we may be racing with _process, which has dequeued a new item
10350 // from pqueue, put it on to_process, and is now busy taking the
10351 // pg lock. ensure this old requeued item is ordered before any
10352 // such newer item in to_process.
10353 p->second.to_process.push_front(item.second);
10354 item.second = p->second.to_process.back();
10355 p->second.to_process.pop_back();
10356 dout(20) << __func__ << " " << item.first
10357 << " " << p->second.to_process.front()
10358 << " shuffled w/ " << item.second << dendl;
10359 } else {
10360 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10361 }
10362 sdata->_enqueue_front(item, osd->op_prio_cutoff);
10363 sdata->sdata_op_ordering_lock.Unlock();
10364 sdata->sdata_lock.Lock();
10365 sdata->sdata_cond.SignalOne();
10366 sdata->sdata_lock.Unlock();
10367 }
10368
10369 namespace ceph {
10370 namespace osd_cmds {
10371
10372 int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
10373 {
10374 if (!ceph_using_tcmalloc()) {
10375 os << "could not issue heap profiler command -- not using tcmalloc!";
10376 return -EOPNOTSUPP;
10377 }
10378
10379 string cmd;
10380 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
10381 os << "unable to get value for command \"" << cmd << "\"";
10382 return -EINVAL;
10383 }
10384
10385 std::vector<std::string> cmd_vec;
10386 get_str_vec(cmd, cmd_vec);
10387
10388 ceph_heap_profiler_handle_command(cmd_vec, os);
10389
10390 return 0;
10391 }
10392
10393 }} // namespace ceph::osd_cmds
10394
10395
10396 std::ostream& operator<<(std::ostream& out, const OSD::io_queue& q) {
10397 switch(q) {
10398 case OSD::io_queue::prioritized:
10399 out << "prioritized";
10400 break;
10401 case OSD::io_queue::weightedpriority:
10402 out << "weightedpriority";
10403 break;
10404 case OSD::io_queue::mclock_opclass:
10405 out << "mclock_opclass";
10406 break;
10407 case OSD::io_queue::mclock_client:
10408 out << "mclock_client";
10409 break;
10410 }
10411 return out;
10412 }