// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
#include "acconfig.h"

#include <fstream>
#include <iostream>
#include <errno.h>
#include <sys/stat.h>
#include <signal.h>
#include <ctype.h>
#include <boost/scoped_ptr.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"

#include "include/types.h"
#include "include/compat.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/io_priority.h"
#include "common/pick_address.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"


#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDRepScrub.h"

#include "messages/MMonCommand.h"
#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"
#include "messages/MPGStatsAck.h"

#include "messages/MWatchNotify.h"
#include "messages/MOSDPGPush.h"
#include "messages/MOSDPGPushReply.h"
#include "messages/MOSDPGPull.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"
#include "common/errno.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())


const double OSD::OSD_TICK_INTERVAL = 1.0;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}

// Initial features in a new superblock.
// Features here are also automatically upgraded.
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}

// Features that this OSD supports are added here.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in the initial superblock.
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}

OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  meta_osr(new ObjectStore::Sequencer("meta")),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  peering_wq(osd->peering_wq),
  recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
                  &osd->disk_tp),
  class_handler(osd->class_handler),
  pg_epoch_lock("OSDService::pg_epoch_lock"),
  publish_lock("OSDService::publish_lock"),
  pre_publish_lock("OSDService::pre_publish_lock"),
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
  scrubs_active(0),
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  objecter_finisher(osd->client_messenger->cct),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  recovery_sleep_lock("OSDService::recovery_sleep_lock"),
  recovery_sleep_timer(cct, recovery_sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_sleep_lock("OSDService::snap_sleep_lock"),
  snap_sleep_timer(
    osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
  scrub_sleep_lock("OSDService::scrub_sleep_lock"),
  scrub_sleep_timer(
    osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
  snap_reserver(&reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  in_progress_split_lock("OSDService::in_progress_split_lock"),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();
}

OSDService::~OSDService()
{
  delete objecter;
}



#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg){
  Mutex::Locker l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  Mutex::Locker l(pgid_lock);
  assert(pgid_tracker.count(pgid));
  assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  Mutex::Locker l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif

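// Split tracking: a child PG is first recorded in pending_splits
// (child -> parent) and rev_pending_splits (parent -> children), then
// moved to in_progress_splits once the split actually starts.  By
// convention, the underscore-prefixed helpers assume the caller already
// holds in_progress_split_lock.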
void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
{
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    dout(10) << __func__ << ": Starting split on pg " << *i
             << ", parent=" << parent << dendl;
    assert(!pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.insert(make_pair(*i, parent));

    assert(!rev_pending_splits[parent].count(*i));
    rev_pending_splits[parent].insert(*i);
  }
}

void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
{
  Mutex::Locker l(in_progress_split_lock);
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  assert(piter != rev_pending_splits.end());
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    assert(piter->second.count(*i));
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    assert(pending_splits[*i] == parent);

    pending_splits.erase(*i);
    piter->second.erase(*i);
    in_progress_splits.insert(*i);
  }
  if (piter->second.empty())
    rev_pending_splits.erase(piter);
}

void OSDService::cancel_pending_splits_for_parent(spg_t parent)
{
  Mutex::Locker l(in_progress_split_lock);
  _cancel_pending_splits_for_parent(parent);
}

void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
{
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  if (piter == rev_pending_splits.end())
    return;

  for (set<spg_t>::iterator i = piter->second.begin();
       i != piter->second.end();
       ++i) {
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.erase(*i);
    dout(10) << __func__ << ": Completing split on pg " << *i
             << " for parent: " << parent << dendl;
    _cancel_pending_splits_for_parent(*i);
  }
  rev_pending_splits.erase(piter);
}

void OSDService::_maybe_split_pgid(OSDMapRef old_map,
                                   OSDMapRef new_map,
                                   spg_t pgid)
{
  assert(old_map->have_pg_pool(pgid.pool()));
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
    set<spg_t> children;
    if (pgid.is_split(old_pgnum,
                      new_map->get_pg_num(pgid.pool()), &children)) {
      _start_split(pgid, children);
    }
  } else {
    assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
  }
}

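// Example (hypothetical pool numbers): if a pool's pg_num goes from 4 to
// 8 somewhere between frommap and tomap, pg 1.1 gains the child 1.5
// (ps + old_pg_num).  init_splits_between() walks every intervening
// epoch so that splits of splits are discovered as well.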
void OSDService::init_splits_between(spg_t pgid,
                                     OSDMapRef frommap,
                                     OSDMapRef tomap)
{
  // First, check whether we can avoid this potentially expensive check
  if (tomap->have_pg_pool(pgid.pool()) &&
      pgid.is_split(
        frommap->get_pg_num(pgid.pool()),
        tomap->get_pg_num(pgid.pool()),
        NULL)) {
    // Ok, a split happened, so we need to walk the osdmaps
    set<spg_t> new_pgs; // pgs to scan on each map
    new_pgs.insert(pgid);
    OSDMapRef curmap(get_map(frommap->get_epoch()));
    for (epoch_t e = frommap->get_epoch() + 1;
         e <= tomap->get_epoch();
         ++e) {
      OSDMapRef nextmap(try_get_map(e));
      if (!nextmap)
        continue;
      set<spg_t> even_newer_pgs; // pgs added in this loop
      for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
        set<spg_t> split_pgs;
        if (i->is_split(curmap->get_pg_num(i->pool()),
                        nextmap->get_pg_num(i->pool()),
                        &split_pgs)) {
          start_split(*i, split_pgs);
          even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
        }
      }
      new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
      curmap = nextmap;
    }
    assert(curmap == tomap); // we must have had both frommap and tomap
  }
}

void OSDService::expand_pg_num(OSDMapRef old_map,
                               OSDMapRef new_map)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::iterator i = in_progress_splits.begin();
       i != in_progress_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->pool())) {
      in_progress_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, *i);
      ++i;
    }
  }
  for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
       i != pending_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->first.pool())) {
      rev_pending_splits.erase(i->second);
      pending_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, i->first);
      ++i;
    }
  }
}

bool OSDService::splitting(spg_t pgid)
{
  Mutex::Locker l(in_progress_split_lock);
  return in_progress_splits.count(pgid) ||
         pending_splits.count(pgid);
}

void OSDService::complete_split(const set<spg_t> &pgs)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::const_iterator i = pgs.begin();
       i != pgs.end();
       ++i) {
    dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
    assert(!pending_splits.count(*i));
    assert(in_progress_splits.count(*i));
    in_progress_splits.erase(*i);
  }
}

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

void OSDService::pg_stat_queue_enqueue(PG *pg)
{
  osd->pg_stat_queue_enqueue(pg);
}

void OSDService::pg_stat_queue_dequeue(PG *pg)
{
  osd->pg_stat_queue_dequeue(pg);
}

void OSDService::start_shutdown()
{
  {
    Mutex::Locker l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    Mutex::Locker l(recovery_sleep_lock);
    recovery_sleep_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  {
    Mutex::Locker l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  {
    Mutex::Locker l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }

  {
    Mutex::Locker l(snap_sleep_lock);
    snap_sleep_timer.shutdown();
  }

  {
    Mutex::Locker l(scrub_sleep_lock);
    scrub_sleep_timer.shutdown();
  }

  osdmap = OSDMapRef();
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  objecter_finisher.start();
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  snap_sleep_timer.init();
  scrub_sleep_timer.init();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  agent_lock.Lock();
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.Signal();
  agent_lock.Unlock();
}

class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

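// Tiering agent worker thread: repeatedly picks the highest-priority
// tier in agent_queue, lets one of its PGs do a unit of flush/evict work
// via agent_work(), and sleeps on agent_cond whenever the queue is
// empty, the agent is inactive, or no op quota is left.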
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->get_pgid()
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}

void OSDService::agent_stop()
{
  {
    Mutex::Locker l(agent_lock);

    // By this time all ops should be cancelled
    assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
      assert(0 == "agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}

// -------------------------------------

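// Worked example for the probability calculation below (hypothetical
// numbers): with osd_tier_promote_max_objects_sec = 1000, dur = 1.0s and
// attempts = 4000 since the last recalibration,
//   po = 1000 * 1.0 * 1000 / 4000 = 250
// i.e. a promote probability of 250/1000 = 25%.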
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
           << target_obj_sec << " obj/sec or "
           << pretty_si_t(target_bytes_sec) << " bytes/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = MAX(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = MIN(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = MAX(new_prob, min_prob);
  new_prob = MIN(new_prob, 1000);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = MAX(prob, min_prob);
  prob = MIN(prob, 1000);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
}

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

void OSDService::check_full_status(float ratio)
{
  Mutex::Locker l(full_status_lock);

  cur_ratio = ratio;

  // The OSDMap ratios take precedence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    cur_state = NONE;
    return;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  string inject;
  s_names new_state;
  if (injectfull_state > NONE && injectfull) {
    new_state = injectfull_state;
    inject = "(Injected)";
  } else if (ratio > failsafe_ratio) {
    new_state = FAILSAFE;
  } else if (ratio > full_ratio) {
    new_state = FULL;
  } else if (ratio > backfillfull_ratio) {
    new_state = BACKFILLFULL;
  } else if (ratio > nearfull_ratio) {
    new_state = NEARFULL;
  } else {
    new_state = NONE;
  }
  dout(20) << __func__ << " cur ratio " << ratio
           << ", nearfull_ratio " << nearfull_ratio
           << ", backfillfull_ratio " << backfillfull_ratio
           << ", full_ratio " << full_ratio
           << ", failsafe_ratio " << failsafe_ratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
                    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_full(s_names type, ostream &ss) const
{
  Mutex::Locker l(full_status_lock);

  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return
    // failsafe full, or, if -1, a request to always return full
    if (injectfull > 0)
      --injectfull;
    ss << "Injected " << get_full_state_name(type) << " OSD ("
       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
    return true;
  }

  ss << "current usage is " << cur_ratio;
  return cur_state >= type;
}

bool OSDService::check_failsafe_full(ostream &ss) const
{
  return _check_full(FAILSAFE, ss);
}

bool OSDService::check_full(ostream &ss) const
{
  return _check_full(FULL, ss);
}

bool OSDService::check_backfill_full(ostream &ss) const
{
  return _check_full(BACKFILLFULL, ss);
}

bool OSDService::check_nearfull(ostream &ss) const
{
  return _check_full(NEARFULL, ss);
}

bool OSDService::is_failsafe_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  Mutex::Locker l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

osd_stat_t OSDService::set_osd_stat(const struct store_statfs_t &stbuf,
                                    vector<int>& hb_peers,
                                    int num_pgs)
{
  uint64_t bytes = stbuf.total;
  uint64_t used = bytes - stbuf.available;
  uint64_t avail = stbuf.available;

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  {
    Mutex::Locker l(stat_lock);
    osd_stat.hb_peers.swap(hb_peers);
    osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
    osd_stat.kb = bytes >> 10;
    osd_stat.kb_used = used >> 10;
    osd_stat.kb_avail = avail >> 10;
    osd_stat.num_pgs = num_pgs;
    return osd_stat;
  }
}

void OSDService::update_osd_stat(vector<int>& hb_peers)
{
  // load osd stats first
  struct store_statfs_t stbuf;
  int r = osd->store->statfs(&stbuf);
  if (r < 0) {
    derr << "statfs() failed: " << cpp_strerror(r) << dendl;
    return;
  }

  auto new_stat = set_osd_stat(stbuf, hb_peers, osd->get_num_pgs());
  dout(20) << "update_osd_stat " << new_stat << dendl;
  assert(new_stat.kb);
  float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
  check_full_status(ratio);
}

bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
{
  OSDMapRef osdmap = get_osdmap();
  for (auto shard : missing_on) {
    if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
      return true;
  }
  return false;
}

void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
  ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
  share_map_peer(peer, peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
  if (next_map->get_hb_front_addr(peer) != entity_addr_t())
    ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
  release_map(next_map);
  return ret;
}

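// pg_temp handling: pg_temp_wanted holds the mappings we still need to
// send to the mon; once sent (via _sent_pg_temp) they move to
// pg_temp_pending until remove_want_pg_temp clears them, and
// requeue_pg_temp folds pending back into wanted after a mon reconnect.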
void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
{
  Mutex::Locker l(pg_temp_lock);
  map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second != want) {
    pg_temp_wanted[pgid] = want;
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  Mutex::Locker l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

void OSDService::_sent_pg_temp()
{
  for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
       p != pg_temp_wanted.end();
       ++p)
    pg_temp_pending[p->first] = p->second;
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  // wanted overrides pending. note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

void OSDService::send_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
  m->pg_temp = pg_temp_wanted;
  monc->send_mon_message(m);
  _sent_pg_temp();
}

void OSDService::send_pg_created(pg_t pgid)
{
  dout(20) << __func__ << dendl;
  if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
    monc->send_mon_message(new MOSDPGCreated(pgid));
  }
}

// --------------------------------------
// dispatch

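// peer_map_epoch tracks the newest OSDMap epoch each peer OSD is known
// to have, so we can avoid resending map updates a peer already holds.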
epoch_t OSDService::get_peer_epoch(int peer)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p == peer_map_epoch.end())
    return 0;
  return p->second;
}

epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second < e) {
      dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
      p->second = e;
    } else {
      dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
    }
    return p->second;
  } else {
    dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
    peer_map_epoch[peer] = e;
    return e;
  }
}

void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second <= as_of) {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
               << " had " << p->second << dendl;
      peer_map_epoch.erase(p);
    } else {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
               << " has " << p->second << " - not forgetting" << dendl;
    }
  }
}

bool OSDService::should_share_map(entity_name_t name, Connection *con,
                                  epoch_t epoch, const OSDMapRef& osdmap,
                                  const epoch_t *sent_epoch_p)
{
  dout(20) << "should_share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  // does client have old map?
  if (name.is_client()) {
    bool message_sendmap = epoch < osdmap->get_epoch();
    if (message_sendmap && sent_epoch_p) {
      dout(20) << "client session last_sent_epoch: "
               << *sent_epoch_p
               << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
      if (*sent_epoch_p < osdmap->get_epoch()) {
        return true;
      } // else we don't need to send it out again
    }
  }

  if (con->get_messenger() == osd->cluster_messenger &&
      con != osd->cluster_messenger->get_loopback_connection() &&
      osdmap->is_up(name.num()) &&
      (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
       osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
    // remember
    epoch_t has = MAX(get_peer_epoch(name.num()), epoch);

    // share?
    if (has < osdmap->get_epoch()) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      return true;
    }
  }

  return false;
}

void OSDService::share_map(
  entity_name_t name,
  Connection *con,
  epoch_t epoch,
  OSDMapRef& osdmap,
  epoch_t *sent_epoch_p)
{
  dout(20) << "share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  if (!osd->is_active()) {
    /* it is safe not to proceed, as the OSD is not in a healthy state */
    return;
  }

  bool want_shared = should_share_map(name, con, epoch,
                                      osdmap, sent_epoch_p);

  if (want_shared) {
    if (name.is_client()) {
      dout(10) << name << " has old map " << epoch
               << " < " << osdmap->get_epoch() << dendl;
      // we know the Session is valid or we wouldn't be sending
      if (sent_epoch_p) {
        *sent_epoch_p = osdmap->get_epoch();
      }
      send_incremental_map(epoch, con, osdmap);
    } else if (con->get_messenger() == osd->cluster_messenger &&
               osdmap->is_up(name.num()) &&
               (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
                osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      note_peer_epoch(name.num(), osdmap->get_epoch());
      send_incremental_map(epoch, con, osdmap);
    }
  }
}

void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
{
  if (!map)
    map = get_osdmap();

  // send map?
  epoch_t pe = get_peer_epoch(peer);
  if (pe) {
    if (pe < map->get_epoch()) {
      send_incremental_map(pe, con, map);
      note_peer_epoch(peer, map->get_epoch());
    } else
      dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
  } else {
    dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
    // no idea about peer's epoch.
    // ??? send recent ???
    // do nothing.
  }
}

bool OSDService::can_inc_scrubs_pending()
{
  bool can_inc = false;
  Mutex::Locker l(sched_scrub_lock);

  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
    can_inc = true;
  } else {
    dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
  }

  return can_inc;
}

bool OSDService::inc_scrubs_pending()
{
  bool result = false;

  sched_scrub_lock.Lock();
  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
    dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
    result = true;
    ++scrubs_pending;
  } else {
    dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  sched_scrub_lock.Unlock();

  return result;
}

void OSDService::dec_scrubs_pending()
{
  sched_scrub_lock.Lock();
  dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
  --scrubs_pending;
  assert(scrubs_pending >= 0);
  sched_scrub_lock.Unlock();
}

void OSDService::inc_scrubs_active(bool reserved)
{
  sched_scrub_lock.Lock();
  ++(scrubs_active);
  if (reserved) {
    --(scrubs_pending);
    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
             << " (max " << cct->_conf->osd_max_scrubs
             << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
    assert(scrubs_pending >= 0);
  } else {
    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
             << " (max " << cct->_conf->osd_max_scrubs
             << ", pending " << scrubs_pending << ")" << dendl;
  }
  sched_scrub_lock.Unlock();
}

void OSDService::dec_scrubs_active()
{
  sched_scrub_lock.Lock();
  dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
  --scrubs_active;
  assert(scrubs_active >= 0);
  sched_scrub_lock.Unlock();
}

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  Mutex::Locker l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  Mutex::Locker l(epoch_lock);
  if (_boot_epoch) {
    assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

bool OSDService::prepare_to_stop()
{
  Mutex::Locker l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
                                              osdmap->get_inst(whoami),
                                              osdmap->get_epoch(),
                                              true  // request ack
                                              ));
    utime_t now = ceph_clock_now();
    utime_t timeout;
    timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
    while ((ceph_clock_now() < timeout) &&
           (get_state() != STOPPING)) {
      is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
    }
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  Mutex::Locker l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.Signal();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

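// Builds an MOSDMap carrying the maps for (since, to]: walk backwards
// from "to", preferring incrementals; if one is missing, fall back to a
// full map for that epoch and stop.  Returns NULL if an epoch can be
// loaded neither as an incremental nor as a full map.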
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  for (epoch_t e = to; e > since; e--) {
    bufferlist bl;
    if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else if (get_map_bl(e, bl)) {
      m->maps[e].claim(bl);
      break;
    } else {
      derr << "since " << since << " to " << to
           << " oldest " << m->oldest_map << " newest " << m->newest_map
           << dendl;
      m->put();
      m = NULL;
      break;
    }
  }
  return m;
}

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
      to = since + cct->_conf->osd_map_message_max;
    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

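// Two LRU caches hold encoded map buffers: map_bl_cache for full maps
// and map_bl_inc_cache for incrementals.  On a miss the buffer is read
// back from the store's meta collection; pin/clear_pinned let callers
// keep the buffers for specific epochs resident while they are in use.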
bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(coll_t::meta(),
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  Mutex::Locker l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(coll_t::meta(),
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
{
  Mutex::Locker l(map_cache_lock);
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  map_bl_inc_cache.pin(e, bl);
}

void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
{
  Mutex::Locker l(map_cache_lock);
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  map_bl_cache.pin(e, bl);
}

void OSDService::clear_map_bl_cache_pins(epoch_t e)
{
  Mutex::Locker l(map_cache_lock);
  map_bl_inc_cache.clear_pinned(e);
  map_bl_cache.clear_pinned(e);
}

OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}

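// try_get_map(): look the decoded OSDMap up in map_cache first; on a
// miss, load the encoded buffer from the store and decode it.  Returns
// an empty OSDMapRef if the epoch cannot be loaded.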
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  Mutex::Locker l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " - cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}

// ops


void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0);
}

void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
                                version_t uv)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
                                       true);
  reply->set_reply_versions(v, uv);
  m->get_connection()->send_message(reply);
}

void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);

  assert(m->get_map_epoch() >= pg->info.history.same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting. The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->info.pgid.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->acting
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}

void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
{
  osd->op_shardedwq.queue(make_pair(pgid, qi));
}

void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
{
  osd->op_shardedwq.queue_front(make_pair(pgid, qi));
}

void OSDService::queue_for_peering(PG *pg)
{
  peering_wq.queue(pg);
}

void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  osd->op_shardedwq.queue(
    make_pair(
      pg->info.pgid,
      PGQueueable(
        PGSnapTrim(pg->get_osdmap()->get_epoch()),
        cct->_conf->osd_snap_trim_cost,
        cct->_conf->osd_snap_trim_priority,
        ceph_clock_now(),
        entity_inst_t(),
        pg->get_osdmap()->get_epoch())));
}


// ====================================================================
// OSD

#undef dout_prefix
#define dout_prefix *_dout

// Commands shared between OSD's console and admin console:
namespace ceph {
namespace osd_cmds {

int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

}} // namespace ceph::osd_cmds

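// mkfs: one-time initialization of a fresh OSD data store.  Sets the
// backend fsid, runs the ObjectStore's own mkfs, mounts it, then either
// validates an existing superblock or writes a new one, and finally
// records the identity metadata (magic, whoami, ceph_fsid) via write_meta.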
int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
              uuid_d fsid, int whoami)
{
  int ret;

  ceph::shared_ptr<ObjectStore::Sequencer> osr(
    new ObjectStore::Sequencer("mkfs"));
  OSDSuperblock sb;
  bufferlist sbbl;
  C_SaferCond waiter;

  // if we are fed a uuid for this osd, use it.
  store->set_fsid(cct->_conf->osd_uuid);

  ret = store->mkfs();
  if (ret) {
    derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  store->set_cache_shards(1);  // doesn't matter for mkfs!

  ret = store->mount();
  if (ret) {
    derr << "OSD::mkfs: couldn't mount ObjectStore: error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
  if (ret >= 0) {
    /* if we already have a superblock, check its contents */
    dout(0) << " have superblock" << dendl;
    bufferlist::iterator p;
    p = sbbl.begin();
    ::decode(sb, p);
    if (whoami != sb.whoami) {
      derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
           << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
    if (fsid != sb.cluster_fsid) {
      derr << "provided cluster fsid " << fsid
           << " != superblock's " << sb.cluster_fsid << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
  } else {
    // create superblock
    sb.cluster_fsid = fsid;
    sb.osd_fsid = store->get_fsid();
    sb.whoami = whoami;
    sb.compat_features = get_osd_initial_compat_set();

    bufferlist bl;
    ::encode(sb, bl);

    ObjectStore::Transaction t;
    t.create_collection(coll_t::meta(), 0);
    t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
    ret = store->apply_transaction(osr.get(), std::move(t));
    if (ret) {
      derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
           << "apply_transaction returned " << cpp_strerror(ret) << dendl;
      goto umount_store;
    }
  }

  if (!osr->flush_commit(&waiter)) {
    waiter.wait();
  }

  ret = write_meta(store, sb.cluster_fsid, sb.osd_fsid, whoami);
  if (ret) {
    derr << "OSD::mkfs: failed to write fsid file: error "
         << cpp_strerror(ret) << dendl;
    goto umount_store;
  }

umount_store:
  store->umount();
free_store:
  delete store;
  return ret;
}

int OSD::write_meta(ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
{
  char val[80];
  int r;

  snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
  r = store->write_meta("magic", val);
  if (r < 0)
    return r;

  snprintf(val, sizeof(val), "%d", whoami);
  r = store->write_meta("whoami", val);
  if (r < 0)
    return r;

  cluster_fsid.print(val);
  r = store->write_meta("ceph_fsid", val);
  if (r < 0)
    return r;

  r = store->write_meta("ready", "ready");
  if (r < 0)
    return r;

  return 0;
}

int OSD::peek_meta(ObjectStore *store, std::string& magic,
                   uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
{
  string val;

  int r = store->read_meta("magic", &val);
  if (r < 0)
    return r;
  magic = val;

  r = store->read_meta("whoami", &val);
  if (r < 0)
    return r;
  whoami = atoi(val.c_str());

  r = store->read_meta("ceph_fsid", &val);
  if (r < 0)
    return r;
  r = cluster_fsid.parse(val.c_str());
  if (!r)
    return -EINVAL;

  r = store->read_meta("fsid", &val);
  if (r < 0) {
    osd_fsid = uuid_d();
  } else {
    r = osd_fsid.parse(val.c_str());
    if (!r)
      return -EINVAL;
  }

  return 0;
}


#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())

// cons/des

OSD::OSD(CephContext *cct_, ObjectStore *store_,
         int id,
         Messenger *internal_messenger,
         Messenger *external_messenger,
         Messenger *hb_client_front,
         Messenger *hb_client_back,
         Messenger *hb_front_serverm,
         Messenger *hb_back_serverm,
         Messenger *osdc_messenger,
         MonClient *mc,
         const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  osd_lock("OSD::osd_lock"),
  tick_timer(cct, osd_lock),
  tick_timer_lock("OSD::tick_timer_lock"),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
    cct->_conf->auth_supported.empty() ?
    cct->_conf->auth_cluster_required :
    cct->_conf->auth_supported)),
  authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
    cct->_conf->auth_supported.empty() ?
    cct->_conf->auth_service_required :
    cct->_conf->auth_supported)),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  osd_compat(get_osd_compat_set()),
  peering_tp(cct, "OSD::peering_tp", "tp_peering",
             cct->_conf->osd_peering_wq_threads,
             "osd_peering_tp_threads"),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
            get_num_op_threads()),
  disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
  command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
  session_waiting_lock("OSD::session_waiting_lock"),
  heartbeat_lock("OSD::heartbeat_lock"),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
             cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_queue(get_io_queue()),
  op_prio_cutoff(get_io_prio_cut()),
  op_shardedwq(
    get_num_op_shards(),
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  peering_wq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &peering_tp),
  map_lock("OSD::map_lock"),
  pg_map_lock("OSD::pg_map_lock"),
1944 last_pg_create_epoch(0),
1945 mon_report_lock("OSD::mon_report_lock"),
1946 stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
1947 up_thru_wanted(0),
1948 requested_full_first(0),
1949 requested_full_last(0),
1950 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
1951 osd_stat_updated(false),
1952 pg_stat_tid(0), pg_stat_tid_flushed(0),
1953 command_wq(
1954 this,
1955 cct->_conf->osd_command_thread_timeout,
1956 cct->_conf->osd_command_thread_suicide_timeout,
1957 &command_tp),
1958 remove_wq(
1959 cct,
1960 store,
1961 cct->_conf->osd_remove_thread_timeout,
1962 cct->_conf->osd_remove_thread_suicide_timeout,
1963 &disk_tp),
1964 service(this)
1965 {
1966 monc->set_messenger(client_messenger);
1967 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
1968 cct->_conf->osd_op_log_threshold);
1969 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
1970 cct->_conf->osd_op_history_duration);
1971 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
1972 cct->_conf->osd_op_history_slow_op_threshold);
1973 #ifdef WITH_BLKIN
1974 std::stringstream ss;
1975 ss << "osd." << whoami;
1976 trace_endpoint.copy_name(ss.str());
1977 #endif
1978 }
1979
1980 OSD::~OSD()
1981 {
1982 delete authorize_handler_cluster_registry;
1983 delete authorize_handler_service_registry;
1984 delete class_handler;
1985 cct->get_perfcounters_collection()->remove(recoverystate_perf);
1986 cct->get_perfcounters_collection()->remove(logger);
1987 delete recoverystate_perf;
1988 delete logger;
1989 delete store;
1990 }
1991
1992 void cls_initialize(ClassHandler *ch);
1993
1994 void OSD::handle_signal(int signum)
1995 {
1996 assert(signum == SIGINT || signum == SIGTERM);
1997 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
1998 shutdown();
1999 }
2000
2001 int OSD::pre_init()
2002 {
2003 Mutex::Locker lock(osd_lock);
2004 if (is_stopping())
2005 return 0;
2006
2007 if (store->test_mount_in_use()) {
2008 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2009 << "currently in use. (Is ceph-osd already running?)" << dendl;
2010 return -EBUSY;
2011 }
2012
2013 cct->_conf->add_observer(this);
2014 return 0;
2015 }
2016
2017 // asok
2018
2019 class OSDSocketHook : public AdminSocketHook {
2020 OSD *osd;
2021 public:
2022 explicit OSDSocketHook(OSD *o) : osd(o) {}
2023 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2024 bufferlist& out) override {
2025 stringstream ss;
2026 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
2027 out.append(ss);
2028 return r;
2029 }
2030 };
2031
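// The commands handled below are reached through the admin socket; a typical
// invocation (the osd id and socket path are illustrative) looks like:
//   ceph daemon osd.0 status
//   ceph daemon osd.0 dump_ops_in_flight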
2032 bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
2033 ostream& ss)
2034 {
2035 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2036 if (admin_command == "status") {
2037 f->open_object_section("status");
2038 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2039 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2040 f->dump_unsigned("whoami", superblock.whoami);
2041 f->dump_string("state", get_state_name(get_state()));
2042 f->dump_unsigned("oldest_map", superblock.oldest_map);
2043 f->dump_unsigned("newest_map", superblock.newest_map);
2044 {
2045 RWLock::RLocker l(pg_map_lock);
2046 f->dump_unsigned("num_pgs", pg_map.size());
2047 }
2048 f->close_section();
2049 } else if (admin_command == "flush_journal") {
2050 store->flush_journal();
2051 } else if (admin_command == "dump_ops_in_flight" ||
2052 admin_command == "ops" ||
2053 admin_command == "dump_blocked_ops" ||
2054 admin_command == "dump_historic_ops" ||
2055 admin_command == "dump_historic_ops_by_duration" ||
2056 admin_command == "dump_historic_slow_ops") {
2057
2058 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2059 even those that are stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2060 will start to track new ops received afterwards.";
2061
2062 set<string> filters;
2063 vector<string> filter_str;
2064 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2065 copy(filter_str.begin(), filter_str.end(),
2066 inserter(filters, filters.end()));
2067 }
2068
2069 if (admin_command == "dump_ops_in_flight" ||
2070 admin_command == "ops") {
2071 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2072 ss << error_str;
2073 }
2074 }
2075 if (admin_command == "dump_blocked_ops") {
2076 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2077 ss << error_str;
2078 }
2079 }
2080 if (admin_command == "dump_historic_ops") {
2081 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2082 ss << error_str;
2083 }
2084 }
2085 if (admin_command == "dump_historic_ops_by_duration") {
2086 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2087 ss << error_str;
2088 }
2089 }
2090 if (admin_command == "dump_historic_slow_ops") {
2091 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2092 ss << error_str;
2093 }
2094 }
2095 } else if (admin_command == "dump_op_pq_state") {
2096 f->open_object_section("pq");
2097 op_shardedwq.dump(f);
2098 f->close_section();
2099 } else if (admin_command == "dump_blacklist") {
2100 list<pair<entity_addr_t,utime_t> > bl;
2101 OSDMapRef curmap = service.get_osdmap();
2102
2103 f->open_array_section("blacklist");
2104 curmap->get_blacklist(&bl);
2105 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2106 it != bl.end(); ++it) {
2107 f->open_object_section("entry");
2108 f->open_object_section("entity_addr_t");
2109 it->first.dump(f);
2110 f->close_section(); //entity_addr_t
2111 it->second.localtime(f->dump_stream("expire_time"));
2112 f->close_section(); //entry
2113 }
2114 f->close_section(); //blacklist
2115 } else if (admin_command == "dump_watchers") {
2116 list<obj_watch_item_t> watchers;
2117 // scan pg's
2118 {
2119 Mutex::Locker l(osd_lock);
2120 RWLock::RLocker l2(pg_map_lock);
2121 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2122 it != pg_map.end();
2123 ++it) {
2124
2125 list<obj_watch_item_t> pg_watchers;
2126 PG *pg = it->second;
2127 pg->lock();
2128 pg->get_watchers(pg_watchers);
2129 pg->unlock();
2130 watchers.splice(watchers.end(), pg_watchers);
2131 }
2132 }
2133
2134 f->open_array_section("watchers");
2135 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2136 it != watchers.end(); ++it) {
2137
2138 f->open_object_section("watch");
2139
2140 f->dump_string("namespace", it->obj.nspace);
2141 f->dump_string("object", it->obj.oid.name);
2142
2143 f->open_object_section("entity_name");
2144 it->wi.name.dump(f);
2145 f->close_section(); //entity_name_t
2146
2147 f->dump_unsigned("cookie", it->wi.cookie);
2148 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2149
2150 f->open_object_section("entity_addr_t");
2151 it->wi.addr.dump(f);
2152 f->close_section(); //entity_addr_t
2153
2154 f->close_section(); //watch
2155 }
2156
2157 f->close_section(); //watchers
2158 } else if (admin_command == "dump_reservations") {
2159 f->open_object_section("reservations");
2160 f->open_object_section("local_reservations");
2161 service.local_reserver.dump(f);
2162 f->close_section();
2163 f->open_object_section("remote_reservations");
2164 service.remote_reserver.dump(f);
2165 f->close_section();
2166 f->close_section();
2167 } else if (admin_command == "get_latest_osdmap") {
2168 get_latest_osdmap();
2169 } else if (admin_command == "heap") {
2170 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2171
2172 // Note: Failed heap profile commands won't necessarily trigger an error:
2173 f->open_object_section("result");
2174 f->dump_string("error", cpp_strerror(result));
2175 f->dump_bool("success", result >= 0);
2176 f->close_section();
2177 } else if (admin_command == "set_heap_property") {
2178 string property;
2179 int64_t value = 0;
2180 string error;
2181 bool success = false;
2182 if (!cmd_getval(cct, cmdmap, "property", property)) {
2183 error = "unable to get property";
2184 success = false;
2185 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2186 error = "unable to get value";
2187 success = false;
2188 } else if (value < 0) {
2189 error = "negative value not allowed";
2190 success = false;
2191 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2192 error = "invalid property";
2193 success = false;
2194 } else {
2195 success = true;
2196 }
2197 f->open_object_section("result");
2198 f->dump_string("error", error);
2199 f->dump_bool("success", success);
2200 f->close_section();
2201 } else if (admin_command == "get_heap_property") {
2202 string property;
2203 size_t value = 0;
2204 string error;
2205 bool success = false;
2206 if (!cmd_getval(cct, cmdmap, "property", property)) {
2207 error = "unable to get property";
2208 success = false;
2209 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2210 error = "invalid property";
2211 success = false;
2212 } else {
2213 success = true;
2214 }
2215 f->open_object_section("result");
2216 f->dump_string("error", error);
2217 f->dump_bool("success", success);
2218 f->dump_int("value", value);
2219 f->close_section();
2220 } else if (admin_command == "dump_objectstore_kv_stats") {
2221 store->get_db_statistics(f);
2222 } else if (admin_command == "dump_scrubs") {
2223 service.dumps_scrub(f);
2224 } else if (admin_command == "calc_objectstore_db_histogram") {
2225 store->generate_db_histogram(f);
2226 } else if (admin_command == "flush_store_cache") {
2227 store->flush_cache();
2228 } else if (admin_command == "dump_pgstate_history") {
2229 f->open_object_section("pgstate_history");
2230 RWLock::RLocker l2(pg_map_lock);
2231 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2232 it != pg_map.end();
2233 ++it) {
2234
2235 PG *pg = it->second;
2236 f->dump_stream("pg") << pg->get_pgid();
2237 pg->lock();
2238 pg->pgstate_history.dump(f);
2239 pg->unlock();
2240 }
2241 f->close_section();
2242 } else if (admin_command == "compact") {
2243 dout(1) << "triggering manual compaction" << dendl;
2244 auto start = ceph::coarse_mono_clock::now();
2245 store->compact();
2246 auto end = ceph::coarse_mono_clock::now();
2247 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
2248 dout(1) << "finished manual compaction in "
2249 << time_span.count()
2250 << " seconds" << dendl;
2251 f->open_object_section("compact_result");
2252 f->dump_float("elapsed_time", time_span.count());
2253 f->close_section();
2254 } else {
2255 assert(0 == "broken asok registration");
2256 }
2257 f->flush(ss);
2258 delete f;
2259 return true;
2260 }
2261
2262 class TestOpsSocketHook : public AdminSocketHook {
2263 OSDService *service;
2264 ObjectStore *store;
2265 public:
2266 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2267 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2268 bufferlist& out) override {
2269 stringstream ss;
2270 test_ops(service, store, command, cmdmap, ss);
2271 out.append(ss);
2272 return true;
2273 }
2274 void test_ops(OSDService *service, ObjectStore *store,
2275 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2276
2277 };
2278
2279 class OSD::C_Tick : public Context {
2280 OSD *osd;
2281 public:
2282 explicit C_Tick(OSD *o) : osd(o) {}
2283 void finish(int r) override {
2284 osd->tick();
2285 }
2286 };
2287
2288 class OSD::C_Tick_WithoutOSDLock : public Context {
2289 OSD *osd;
2290 public:
2291 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2292 void finish(int r) override {
2293 osd->tick_without_osd_lock();
2294 }
2295 };
2296
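// enable_disable_fuse: reconciles the optional FUSE view of the object store
// (mounted at $osd_data/fuse) with the osd_objectstore_fuse setting; pass
// stop=true to force-unmount regardless of configuration.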
2297 int OSD::enable_disable_fuse(bool stop)
2298 {
2299 #ifdef HAVE_LIBFUSE
2300 int r;
2301 string mntpath = cct->_conf->osd_data + "/fuse";
2302 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2303 dout(1) << __func__ << " disabling" << dendl;
2304 fuse_store->stop();
2305 delete fuse_store;
2306 fuse_store = NULL;
2307 r = ::rmdir(mntpath.c_str());
2308 if (r < 0) {
2309 r = -errno;
2310 derr << __func__ << " failed to rmdir " << mntpath << ": "
2311 << cpp_strerror(r) << dendl;
2312 return r;
2313 }
2314 return 0;
2315 }
2316 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2317 dout(1) << __func__ << " enabling" << dendl;
2318 r = ::mkdir(mntpath.c_str(), 0700);
2319 if (r < 0)
2320 r = -errno;
2321 if (r < 0 && r != -EEXIST) {
2322 derr << __func__ << " unable to create " << mntpath << ": "
2323 << cpp_strerror(r) << dendl;
2324 return r;
2325 }
2326 fuse_store = new FuseStore(store, mntpath);
2327 r = fuse_store->start();
2328 if (r < 0) {
2329 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2330 delete fuse_store;
2331 fuse_store = NULL;
2332 return r;
2333 }
2334 }
2335 #endif // HAVE_LIBFUSE
2336 return 0;
2337 }
2338
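// Shard/thread sizing: an explicit osd_op_num_shards (or
// osd_op_num_threads_per_shard) wins; otherwise the hdd/ssd variant of the
// option is chosen based on whether the store is rotational.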
2339 int OSD::get_num_op_shards()
2340 {
2341 if (cct->_conf->osd_op_num_shards)
2342 return cct->_conf->osd_op_num_shards;
2343 if (store_is_rotational)
2344 return cct->_conf->osd_op_num_shards_hdd;
2345 else
2346 return cct->_conf->osd_op_num_shards_ssd;
2347 }
2348
2349 int OSD::get_num_op_threads()
2350 {
2351 if (cct->_conf->osd_op_num_threads_per_shard)
2352 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2353 if (store_is_rotational)
2354 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2355 else
2356 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2357 }
2358
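// Recovery throttling sleep: an explicit osd_recovery_sleep wins; otherwise
// pick the ssd value (neither store nor journal rotational), the hybrid
// value (rotational store, non-rotational journal), or the hdd value.
// Example (illustrative): with osd_recovery_sleep=0 on an hdd store with an
// ssd journal, osd_recovery_sleep_hybrid is used.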
2359 float OSD::get_osd_recovery_sleep()
2360 {
2361 if (cct->_conf->osd_recovery_sleep)
2362 return cct->_conf->osd_recovery_sleep;
2363 if (!store_is_rotational && !journal_is_rotational)
2364 return cct->_conf->osd_recovery_sleep_ssd;
2365 else if (store_is_rotational && !journal_is_rotational)
2366 return cct->_conf->get_val<double>("osd_recovery_sleep_hybrid");
2367 else
2368 return cct->_conf->osd_recovery_sleep_hdd;
2369 }
2370
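// init: bring the OSD up after pre_init(): mount the store, sanity-check
// long object names, read and (if needed) upgrade the superblock, load the
// current osdmap and PGs, start the threadpools/heartbeat/tick timers,
// authenticate with the monitors, and finally kick off start_boot().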
2371 int OSD::init()
2372 {
2373 CompatSet initial, diff;
2374 Mutex::Locker lock(osd_lock);
2375 if (is_stopping())
2376 return 0;
2377
2378 tick_timer.init();
2379 tick_timer_without_osd_lock.init();
2380 service.recovery_request_timer.init();
2381 service.recovery_sleep_timer.init();
2382
2383 // mount.
2384 dout(2) << "init " << dev_path
2385 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2386 << dendl;
2387 dout(2) << "journal " << journal_path << dendl;
2388 assert(store); // call pre_init() first!
2389
2390 store->set_cache_shards(get_num_op_shards());
2391
2392 int r = store->mount();
2393 if (r < 0) {
2394 derr << "OSD:init: unable to mount object store" << dendl;
2395 return r;
2396 }
2397 journal_is_rotational = store->is_journal_rotational();
2398 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
2399 << dendl;
2400
2401 enable_disable_fuse(false);
2402
2403 dout(2) << "boot" << dendl;
2404
2405 // initialize the daily loadavg with current 15min loadavg
2406 double loadavgs[3];
2407 if (getloadavg(loadavgs, 3) == 3) {
2408 daily_loadavg = loadavgs[2];
2409 } else {
2410 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
2411 daily_loadavg = 1.0;
2412 }
2413
2414 int rotating_auth_attempts = 0;
2415
2416 // sanity check long object name handling
2417 {
2418 hobject_t l;
2419 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2420 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2421 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2422 r = store->validate_hobject_key(l);
2423 if (r < 0) {
2424 derr << "backend (" << store->get_type() << ") is unable to support max "
2425 << "object name[space] len" << dendl;
2426 derr << " osd max object name len = "
2427 << cct->_conf->osd_max_object_name_len << dendl;
2428 derr << " osd max object namespace len = "
2429 << cct->_conf->osd_max_object_namespace_len << dendl;
2430 derr << cpp_strerror(r) << dendl;
2431 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2432 goto out;
2433 }
2434 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2435 << dendl;
2436 } else {
2437 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
2438 }
2439 }
2440
2441 // read superblock
2442 r = read_superblock();
2443 if (r < 0) {
2444 derr << "OSD::init() : unable to read osd superblock" << dendl;
2445 r = -EINVAL;
2446 goto out;
2447 }
2448
2449 if (osd_compat.compare(superblock.compat_features) < 0) {
2450 derr << "The disk uses features unsupported by the executable." << dendl;
2451 derr << " ondisk features " << superblock.compat_features << dendl;
2452 derr << " daemon features " << osd_compat << dendl;
2453
2454 if (osd_compat.writeable(superblock.compat_features)) {
2455 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2456 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2457 r = -EOPNOTSUPP;
2458 goto out;
2459 }
2460 else {
2461 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2462 derr << "Cannot write to disk! Missing features: " << diff << dendl;
2463 r = -EOPNOTSUPP;
2464 goto out;
2465 }
2466 }
2467
2468 assert_warn(whoami == superblock.whoami);
2469 if (whoami != superblock.whoami) {
2470 derr << "OSD::init: superblock says osd"
2471 << superblock.whoami << " but I am osd." << whoami << dendl;
2472 r = -EINVAL;
2473 goto out;
2474 }
2475
2476 initial = get_osd_initial_compat_set();
2477 diff = superblock.compat_features.unsupported(initial);
2478 if (superblock.compat_features.merge(initial)) {
2479 // We need to persist the new compat_set before we
2480 // do anything else
2481 dout(5) << "Upgrading superblock adding: " << diff << dendl;
2482 ObjectStore::Transaction t;
2483 write_superblock(t);
2484 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2485 if (r < 0)
2486 goto out;
2487 }
2488
2489 // make sure snap mapper object exists
2490 if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2491 dout(10) << "init creating/touching snapmapper object" << dendl;
2492 ObjectStore::Transaction t;
2493 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2494 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2495 if (r < 0)
2496 goto out;
2497 }
2498
2499 class_handler = new ClassHandler(cct);
2500 cls_initialize(class_handler);
2501
2502 if (cct->_conf->osd_open_classes_on_start) {
2503 int r = class_handler->open_all_classes();
2504 if (r)
2505 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
2506 }
2507
2508 // load up "current" osdmap
2509 assert_warn(!osdmap);
2510 if (osdmap) {
2511 derr << "OSD::init: unable to read current osdmap" << dendl;
2512 r = -EINVAL;
2513 goto out;
2514 }
2515 osdmap = get_map(superblock.current_epoch);
2516 check_osdmap_features(store);
2517
2518 create_recoverystate_perf();
2519
2520 {
2521 epoch_t bind_epoch = osdmap->get_epoch();
2522 service.set_epochs(NULL, NULL, &bind_epoch);
2523 }
2524
2525 clear_temp_objects();
2526
2527 // initialize osdmap references in sharded wq
2528 op_shardedwq.prune_pg_waiters(osdmap, whoami);
2529
2530 // load up pgs (as they previously existed)
2531 load_pgs();
2532
2533 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
2534 dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
2535 op_prio_cutoff << "." << dendl;
2536
2537 create_logger();
2538
2539 // i'm ready!
2540 client_messenger->add_dispatcher_head(this);
2541 cluster_messenger->add_dispatcher_head(this);
2542
2543 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2544 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2545 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2546 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2547
2548 objecter_messenger->add_dispatcher_head(service.objecter);
2549
2550 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
2551 | CEPH_ENTITY_TYPE_MGR);
2552 r = monc->init();
2553 if (r < 0)
2554 goto out;
2555
2556 /**
2557 * FIXME: this is a placeholder implementation that unconditionally
2558 * sends every is_primary PG's stats every time we're called, unlike
2559 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2560 * This has equivalent cost to the existing worst case where all
2561 * PGs are busy and their stats are always enqueued for sending.
2562 */
2563 mgrc.set_pgstats_cb([this](){
2564 RWLock::RLocker l(map_lock);
2565
2566 utime_t had_for = ceph_clock_now() - had_map_since;
2567 osd_stat_t cur_stat = service.get_osd_stat();
2568 cur_stat.os_perf_stat = store->get_cur_stats();
2569
2570 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
2571 m->osd_stat = cur_stat;
2572
2573 Mutex::Locker lec{min_last_epoch_clean_lock};
2574 min_last_epoch_clean = osdmap->get_epoch();
2575 min_last_epoch_clean_pgs.clear();
2576 RWLock::RLocker lpg(pg_map_lock);
2577 for (const auto &i : pg_map) {
2578 PG *pg = i.second;
2579 if (!pg->is_primary()) {
2580 continue;
2581 }
2582
2583 pg->pg_stats_publish_lock.Lock();
2584 if (pg->pg_stats_publish_valid) {
2585 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
2586 const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
2587 min_last_epoch_clean = min(min_last_epoch_clean, lec);
2588 min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
2589 }
2590 pg->pg_stats_publish_lock.Unlock();
2591 }
2592
2593 return m;
2594 });
2595
2596 mgrc.init();
2597 client_messenger->add_dispatcher_head(&mgrc);
2598
2599 // tell monc about log_client so it will know about mon session resets
2600 monc->set_log_client(&log_client);
2601 update_log_config();
2602
2603 peering_tp.start();
2604 osd_op_tp.start();
2605 disk_tp.start();
2606 command_tp.start();
2607
2608 set_disk_tp_priority();
2609
2610 // start the heartbeat
2611 heartbeat_thread.create("osd_srv_heartbt");
2612
2613 // tick
2614 tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
2615 {
2616 Mutex::Locker l(tick_timer_lock);
2617 tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
2618 }
2619
2620 service.init();
2621 service.publish_map(osdmap);
2622 service.publish_superblock(superblock);
2623 service.max_oldest_map = superblock.oldest_map;
2624
2625 osd_lock.Unlock();
2626
2627 r = monc->authenticate();
2628 if (r < 0) {
2629 derr << __func__ << " authentication failed: " << cpp_strerror(r)
2630 << dendl;
2631 osd_lock.Lock(); // locker is going to unlock this on function exit
2632 if (is_stopping())
2633 r = 0;
2634 goto monout;
2635 }
2636
2637 while (monc->wait_auth_rotating(30.0) < 0) {
2638 derr << "unable to obtain rotating service keys; retrying" << dendl;
2639 ++rotating_auth_attempts;
2640 if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
2641 derr << __func__ << " wait_auth_rotating timed out" << dendl;
2642 osd_lock.Lock(); // make locker happy
2643 if (!is_stopping()) {
2644 r = -ETIMEDOUT;
2645 }
2646 goto monout;
2647 }
2648 }
2649
2650 r = update_crush_device_class();
2651 if (r < 0) {
2652 derr << __func__ << " unable to update_crush_device_class: "
2653 << cpp_strerror(r) << dendl;
2654 osd_lock.Lock();
2655 goto monout;
2656 }
2657
2658 r = update_crush_location();
2659 if (r < 0) {
2660 derr << __func__ << " unable to update_crush_location: "
2661 << cpp_strerror(r) << dendl;
2662 osd_lock.Lock();
2663 goto monout;
2664 }
2665
2666 osd_lock.Lock();
2667 if (is_stopping())
2668 return 0;
2669
2670 // start objecter *after* we have authenticated, so that we don't ignore
2671 // the OSDMaps it requests.
2672 service.final_init();
2673
2674 check_config();
2675
2676 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
2677 consume_map();
2678 peering_wq.drain();
2679
2680 dout(0) << "done with init, starting boot process" << dendl;
2681
2682 // subscribe to any pg creations
2683 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
2684
2685 // MgrClient needs this (it doesn't have MonClient reference itself)
2686 monc->sub_want("mgrmap", 0, 0);
2687
2688 // we don't need to ask for an osdmap here; the objecter will do it for us
2689 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2690
2691 monc->renew_subs();
2692
2693 start_boot();
2694
2695 return 0;
2696 monout:
2697 exit(1);
2698
2699 out:
2700 enable_disable_fuse(true);
2701 store->umount();
2702 delete store;
2703 store = NULL;
2704 return r;
2705 }
2706
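// final_init: registers the admin socket commands served by
// OSDSocketHook::call() above, plus the TestOpsSocketHook commands used for
// fault injection and testing.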
2707 void OSD::final_init()
2708 {
2709 AdminSocket *admin_socket = cct->get_admin_socket();
2710 asok_hook = new OSDSocketHook(this);
2711 int r = admin_socket->register_command("status", "status", asok_hook,
2712 "high-level status of OSD");
2713 assert(r == 0);
2714 r = admin_socket->register_command("flush_journal", "flush_journal",
2715 asok_hook,
2716 "flush the journal to permanent store");
2717 assert(r == 0);
2718 r = admin_socket->register_command("dump_ops_in_flight",
2719 "dump_ops_in_flight " \
2720 "name=filterstr,type=CephString,n=N,req=false",
2721 asok_hook,
2722 "show the ops currently in flight");
2723 assert(r == 0);
2724 r = admin_socket->register_command("ops",
2725 "ops " \
2726 "name=filterstr,type=CephString,n=N,req=false",
2727 asok_hook,
2728 "show the ops currently in flight");
2729 assert(r == 0);
2730 r = admin_socket->register_command("dump_blocked_ops",
2731 "dump_blocked_ops " \
2732 "name=filterstr,type=CephString,n=N,req=false",
2733 asok_hook,
2734 "show the blocked ops currently in flight");
2735 assert(r == 0);
2736 r = admin_socket->register_command("dump_historic_ops",
2737 "dump_historic_ops " \
2738 "name=filterstr,type=CephString,n=N,req=false",
2739 asok_hook,
2740 "show recent ops");
2741 assert(r == 0);
2742 r = admin_socket->register_command("dump_historic_slow_ops",
2743 "dump_historic_slow_ops " \
2744 "name=filterstr,type=CephString,n=N,req=false",
2745 asok_hook,
2746 "show slowest recent ops");
2747 assert(r == 0);
2748 r = admin_socket->register_command("dump_historic_ops_by_duration",
2749 "dump_historic_ops_by_duration " \
2750 "name=filterstr,type=CephString,n=N,req=false",
2751 asok_hook,
2752 "show slowest recent ops, sorted by duration");
2753 assert(r == 0);
2754 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2755 asok_hook,
2756 "dump op priority queue state");
2757 assert(r == 0);
2758 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2759 asok_hook,
2760 "dump blacklisted clients and times");
2761 assert(r == 0);
2762 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2763 asok_hook,
2764 "show clients which have active watches,"
2765 " and on which objects");
2766 assert(r == 0);
2767 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2768 asok_hook,
2769 "show recovery reservations");
2770 assert(r == 0);
2771 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2772 asok_hook,
2773 "force osd to update the latest map from "
2774 "the mon");
2775 assert(r == 0);
2776
2777 r = admin_socket->register_command( "heap",
2778 "heap " \
2779 "name=heapcmd,type=CephString",
2780 asok_hook,
2781 "show heap usage info (available only if "
2782 "compiled with tcmalloc)");
2783 assert(r == 0);
2784
2785 r = admin_socket->register_command("set_heap_property",
2786 "set_heap_property " \
2787 "name=property,type=CephString " \
2788 "name=value,type=CephInt",
2789 asok_hook,
2790 "update malloc extension heap property");
2791 assert(r == 0);
2792
2793 r = admin_socket->register_command("get_heap_property",
2794 "get_heap_property " \
2795 "name=property,type=CephString",
2796 asok_hook,
2797 "get malloc extension heap property");
2798 assert(r == 0);
2799
2800 r = admin_socket->register_command("dump_objectstore_kv_stats",
2801 "dump_objectstore_kv_stats",
2802 asok_hook,
2803 "print statistics of kvdb which used by bluestore");
2804 assert(r == 0);
2805
2806 r = admin_socket->register_command("dump_scrubs",
2807 "dump_scrubs",
2808 asok_hook,
2809 "print scheduled scrubs");
2810 assert(r == 0);
2811
2812 r = admin_socket->register_command("calc_objectstore_db_histogram",
2813 "calc_objectstore_db_histogram",
2814 asok_hook,
2815 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2816 assert(r == 0);
2817
2818 r = admin_socket->register_command("flush_store_cache",
2819 "flush_store_cache",
2820 asok_hook,
2821 "Flush bluestore internal cache");
2822 assert(r == 0);
2823 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2824 asok_hook,
2825 "show recent state history");
2826 assert(r == 0);
2827
2828 r = admin_socket->register_command("compact", "compact",
2829 asok_hook,
2830 "Commpact object store's omap."
2831 " WARNING: Compaction probably slows your requests");
2832 assert(r == 0);
2833
2834 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2835 // Note: pools are CephString instead of CephPoolname because
2836 // these commands traditionally support both pool names and numbers
2837 r = admin_socket->register_command(
2838 "setomapval",
2839 "setomapval " \
2840 "name=pool,type=CephString " \
2841 "name=objname,type=CephObjectname " \
2842 "name=key,type=CephString "\
2843 "name=val,type=CephString",
2844 test_ops_hook,
2845 "set omap key");
2846 assert(r == 0);
2847 r = admin_socket->register_command(
2848 "rmomapkey",
2849 "rmomapkey " \
2850 "name=pool,type=CephString " \
2851 "name=objname,type=CephObjectname " \
2852 "name=key,type=CephString",
2853 test_ops_hook,
2854 "remove omap key");
2855 assert(r == 0);
2856 r = admin_socket->register_command(
2857 "setomapheader",
2858 "setomapheader " \
2859 "name=pool,type=CephString " \
2860 "name=objname,type=CephObjectname " \
2861 "name=header,type=CephString",
2862 test_ops_hook,
2863 "set omap header");
2864 assert(r == 0);
2865
2866 r = admin_socket->register_command(
2867 "getomap",
2868 "getomap " \
2869 "name=pool,type=CephString " \
2870 "name=objname,type=CephObjectname",
2871 test_ops_hook,
2872 "output entire object map");
2873 assert(r == 0);
2874
2875 r = admin_socket->register_command(
2876 "truncobj",
2877 "truncobj " \
2878 "name=pool,type=CephString " \
2879 "name=objname,type=CephObjectname " \
2880 "name=len,type=CephInt",
2881 test_ops_hook,
2882 "truncate object to length");
2883 assert(r == 0);
2884
2885 r = admin_socket->register_command(
2886 "injectdataerr",
2887 "injectdataerr " \
2888 "name=pool,type=CephString " \
2889 "name=objname,type=CephObjectname " \
2890 "name=shardid,type=CephInt,req=false,range=0|255",
2891 test_ops_hook,
2892 "inject data error to an object");
2893 assert(r == 0);
2894
2895 r = admin_socket->register_command(
2896 "injectmdataerr",
2897 "injectmdataerr " \
2898 "name=pool,type=CephString " \
2899 "name=objname,type=CephObjectname " \
2900 "name=shardid,type=CephInt,req=false,range=0|255",
2901 test_ops_hook,
2902 "inject metadata error to an object");
2903 assert(r == 0);
2904 r = admin_socket->register_command(
2905 "set_recovery_delay",
2906 "set_recovery_delay " \
2907 "name=utime,type=CephInt,req=false",
2908 test_ops_hook,
2909 "Delay osd recovery by specified seconds");
2910 assert(r == 0);
2911 r = admin_socket->register_command(
2912 "trigger_scrub",
2913 "trigger_scrub " \
2914 "name=pgid,type=CephString ",
2915 test_ops_hook,
2916 "Trigger a scheduled scrub ");
2917 assert(r == 0);
2918 r = admin_socket->register_command(
2919 "injectfull",
2920 "injectfull " \
2921 "name=type,type=CephString,req=false " \
2922 "name=count,type=CephInt,req=false ",
2923 test_ops_hook,
2924 "Inject a full disk (optional count times)");
2925 assert(r == 0);
2926 }
2927
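// create_logger: builds the "osd" PerfCounters group (op counts, sizes,
// latencies, cache and tiering stats) and registers it with the cct's
// perf counters collection; the counters are updated from the op paths.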
2928 void OSD::create_logger()
2929 {
2930 dout(10) << "create_logger" << dendl;
2931
2932 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
2933
2934 // Latency axis configuration for op histograms, values are in nanoseconds
2935 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
2936 "Latency (usec)",
2937 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
2938 0, ///< Start at 0
2939 100000, ///< Quantization unit is 100usec
2940 32, ///< Enough to cover much longer than slow requests
2941 };
2942
2943 // Op size axis configuration for op histograms, values are in bytes
2944 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
2945 "Request size (bytes)",
2946 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
2947 0, ///< Start at 0
2948 512, ///< Quantization unit is 512 bytes
2949 32, ///< Enough to cover requests larger than GB
2950 };
2951
2952
2953 osd_plb.add_u64(
2954 l_osd_op_wip, "op_wip",
2955 "Replication operations currently being processed (primary)");
2956 osd_plb.add_u64_counter(
2957 l_osd_op, "op",
2958 "Client operations",
2959 "ops", PerfCountersBuilder::PRIO_CRITICAL);
2960 osd_plb.add_u64_counter(
2961 l_osd_op_inb, "op_in_bytes",
2962 "Client operations total write size",
2963 "wr", PerfCountersBuilder::PRIO_INTERESTING);
2964 osd_plb.add_u64_counter(
2965 l_osd_op_outb, "op_out_bytes",
2966 "Client operations total read size",
2967 "rd", PerfCountersBuilder::PRIO_INTERESTING);
2968 osd_plb.add_time_avg(
2969 l_osd_op_lat, "op_latency",
2970 "Latency of client operations (including queue time)",
2971 "l", 9);
2972 osd_plb.add_time_avg(
2973 l_osd_op_process_lat, "op_process_latency",
2974 "Latency of client operations (excluding queue time)");
2975 osd_plb.add_time_avg(
2976 l_osd_op_prepare_lat, "op_prepare_latency",
2977 "Latency of client operations (excluding queue time and wait for finished)");
2978
2979 osd_plb.add_u64_counter(
2980 l_osd_op_r, "op_r", "Client read operations");
2981 osd_plb.add_u64_counter(
2982 l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
2983 osd_plb.add_time_avg(
2984 l_osd_op_r_lat, "op_r_latency",
2985 "Latency of read operation (including queue time)");
2986 osd_plb.add_u64_counter_histogram(
2987 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
2988 op_hist_x_axis_config, op_hist_y_axis_config,
2989 "Histogram of operation latency (including queue time) + data read");
2990 osd_plb.add_time_avg(
2991 l_osd_op_r_process_lat, "op_r_process_latency",
2992 "Latency of read operation (excluding queue time)");
2993 osd_plb.add_time_avg(
2994 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
2995 "Latency of read operations (excluding queue time and wait for finished)");
2996 osd_plb.add_u64_counter(
2997 l_osd_op_w, "op_w", "Client write operations");
2998 osd_plb.add_u64_counter(
2999 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3000 osd_plb.add_time_avg(
3001 l_osd_op_w_lat, "op_w_latency",
3002 "Latency of write operation (including queue time)");
3003 osd_plb.add_u64_counter_histogram(
3004 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3005 op_hist_x_axis_config, op_hist_y_axis_config,
3006 "Histogram of operation latency (including queue time) + data written");
3007 osd_plb.add_time_avg(
3008 l_osd_op_w_process_lat, "op_w_process_latency",
3009 "Latency of write operation (excluding queue time)");
3010 osd_plb.add_time_avg(
3011 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3012 "Latency of write operations (excluding queue time and wait for finished)");
3013 osd_plb.add_u64_counter(
3014 l_osd_op_rw, "op_rw",
3015 "Client read-modify-write operations");
3016 osd_plb.add_u64_counter(
3017 l_osd_op_rw_inb, "op_rw_in_bytes",
3018 "Client read-modify-write operations, bytes written");
3019 osd_plb.add_u64_counter(
3020 l_osd_op_rw_outb, "op_rw_out_bytes",
3021 "Client read-modify-write operations, bytes read");
3022 osd_plb.add_time_avg(
3023 l_osd_op_rw_lat, "op_rw_latency",
3024 "Latency of read-modify-write operation (including queue time)");
3025 osd_plb.add_u64_counter_histogram(
3026 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3027 op_hist_x_axis_config, op_hist_y_axis_config,
3028 "Histogram of rw operation latency (including queue time) + data written");
3029 osd_plb.add_u64_counter_histogram(
3030 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3031 op_hist_x_axis_config, op_hist_y_axis_config,
3032 "Histogram of rw operation latency (including queue time) + data read");
3033 osd_plb.add_time_avg(
3034 l_osd_op_rw_process_lat, "op_rw_process_latency",
3035 "Latency of read-modify-write operation (excluding queue time)");
3036 osd_plb.add_time_avg(
3037 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3038 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3039
3040 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3041 "Latency of IO before calling queue (before actually being queued into the ShardedOpWQ)"); // client io latency before queueing into op_wq
3042 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3043 "Latency of IO before calling dequeue_op (already dequeued and holding the PG lock)"); // client io latency before dequeue_op
3044
3045 osd_plb.add_u64_counter(
3046 l_osd_sop, "subop", "Suboperations");
3047 osd_plb.add_u64_counter(
3048 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
3049 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3050
3051 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3052 osd_plb.add_u64_counter(
3053 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
3054 osd_plb.add_time_avg(
3055 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3056 osd_plb.add_u64_counter(
3057 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3058 osd_plb.add_time_avg(
3059 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3060 osd_plb.add_u64_counter(
3061 l_osd_sop_push, "subop_push", "Suboperations push messages");
3062 osd_plb.add_u64_counter(
3063 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
3064 osd_plb.add_time_avg(
3065 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3066
3067 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3068 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3069 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
3070
3071 osd_plb.add_u64_counter(
3072 l_osd_rop, "recovery_ops",
3073 "Started recovery operations",
3074 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3075
3076 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3077 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
3078 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
3079 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3080 osd_plb.add_u64(
3081 l_osd_cached_crc, "cached_crc", "Total number of crc lookups served from crc_cache");
3082 osd_plb.add_u64(
3083 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3084 "Total number of crc lookups served from crc_cache with adjustment");
3085 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3086 "Total number of crc cache misses");
3087
3088 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3089 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3090 osd_plb.add_u64(
3091 l_osd_pg_primary, "numpg_primary",
3092 "Placement groups for which this osd is primary");
3093 osd_plb.add_u64(
3094 l_osd_pg_replica, "numpg_replica",
3095 "Placement groups for which this osd is replica");
3096 osd_plb.add_u64(
3097 l_osd_pg_stray, "numpg_stray",
3098 "Placement groups ready to be deleted from this osd");
3099 osd_plb.add_u64(
3100 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3101 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3102 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3103 osd_plb.add_u64_counter(
3104 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3105 osd_plb.add_u64_counter(
3106 l_osd_waiting_for_map, "messages_delayed_for_map",
3107 "Operations waiting for OSD map");
3108
3109 osd_plb.add_u64_counter(
3110 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3111 osd_plb.add_u64_counter(
3112 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3113 osd_plb.add_u64_counter(
3114 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3115 "osdmap cache miss below cache lower bound");
3116 osd_plb.add_u64_avg(
3117 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3118 "osdmap cache miss, avg distance below cache lower bound");
3119 osd_plb.add_u64_counter(
3120 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3121 "OSDMap buffer cache hits");
3122 osd_plb.add_u64_counter(
3123 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3124 "OSDMap buffer cache misses");
3125
3126 osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
3127 osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
3128 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
3129
3130 osd_plb.add_u64_counter(
3131 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3132
3133 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3134 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3135 osd_plb.add_u64_counter(
3136 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3137 osd_plb.add_u64_counter(
3138 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3139 osd_plb.add_u64_counter(
3140 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3141 "Failed tier flush attempts");
3142 osd_plb.add_u64_counter(
3143 l_osd_tier_evict, "tier_evict", "Tier evictions");
3144 osd_plb.add_u64_counter(
3145 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3146 osd_plb.add_u64_counter(
3147 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3148 osd_plb.add_u64_counter(
3149 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3150 osd_plb.add_u64_counter(
3151 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3152 osd_plb.add_u64_counter(
3153 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3154 osd_plb.add_u64_counter(
3155 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3156
3157 osd_plb.add_u64_counter(
3158 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3159 osd_plb.add_u64_counter(
3160 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3161 osd_plb.add_u64_counter(
3162 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3163 osd_plb.add_u64_counter(
3164 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3165
3166 osd_plb.add_u64_counter(
3167 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3168 osd_plb.add_u64_counter(
3169 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3170
3171 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3172 osd_plb.add_time_avg(
3173 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3174 osd_plb.add_time_avg(
3175 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3176 osd_plb.add_time_avg(
3177 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3178
3179 osd_plb.add_u64_counter(
3180 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3181 osd_plb.add_u64_counter(
3182 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3183 "PG updated its info using fastinfo attr");
3184 osd_plb.add_u64_counter(
3185 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3186
3187 logger = osd_plb.create_perf_counters();
3188 cct->get_perfcounters_collection()->add(logger);
3189 }
3190
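// create_recoverystate_perf: one time-averaged counter per peering/recovery
// state machine state, recording how long PGs spend in each state.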
3191 void OSD::create_recoverystate_perf()
3192 {
3193 dout(10) << "create_recoverystate_perf" << dendl;
3194
3195 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3196
3197 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3198 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3199 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3200 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3201 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3202 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3203 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3204 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3205 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3206 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3207 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3208 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3209 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3210 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3211 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3212 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3213 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3214 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3215 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3216 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3217 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3218 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3219 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3220 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3221 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3222 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3223 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3224 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3225 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3226 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3227 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3228
3229 recoverystate_perf = rs_perf.create_perf_counters();
3230 cct->get_perfcounters_collection()->add(recoverystate_perf);
3231 }
3232
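// shutdown: tears down in roughly the reverse order of init(): raise debug
// levels, stop mgr/client work, drain the op queue, kick PGs, unregister
// asok commands, stop heartbeat and threadpools, persist a clean superblock,
// drop PG refs, then unmount the store and shut down the messengers.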
3233 int OSD::shutdown()
3234 {
3235 if (!service.prepare_to_stop())
3236 return 0; // already shutting down
3237 osd_lock.Lock();
3238 if (is_stopping()) {
3239 osd_lock.Unlock();
3240 return 0;
3241 }
3242 derr << "shutdown" << dendl;
3243
3244 set_state(STATE_STOPPING);
3245
3246 // Debugging
3247 cct->_conf->set_val("debug_osd", "100");
3248 cct->_conf->set_val("debug_journal", "100");
3249 cct->_conf->set_val("debug_filestore", "100");
3250 cct->_conf->set_val("debug_ms", "100");
3251 cct->_conf->apply_changes(NULL);
3252
3253 // stop MgrClient earlier as it's more like an internal consumer of OSD
3254 mgrc.shutdown();
3255
3256 service.start_shutdown();
3257
3258 // stop sending work to pgs. this just prevents any new work in _process
3259 // from racing with on_shutdown and potentially entering the pg after.
3260 op_shardedwq.drain();
3261
3262 // Shutdown PGs
3263 {
3264 RWLock::RLocker l(pg_map_lock);
3265 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3266 p != pg_map.end();
3267 ++p) {
3268 dout(20) << " kicking pg " << p->first << dendl;
3269 p->second->lock();
3270 p->second->on_shutdown();
3271 p->second->unlock();
3272 p->second->osr->flush();
3273 }
3274 }
3275 clear_pg_stat_queue();
3276
3277 // drain op queue again (in case PGs requeued something)
3278 op_shardedwq.drain();
3279 {
3280 finished.clear(); // zap waiters (bleh, this is messy)
3281 }
3282
3283 op_shardedwq.clear_pg_slots();
3284
3285 // unregister commands
3286 cct->get_admin_socket()->unregister_command("status");
3287 cct->get_admin_socket()->unregister_command("flush_journal");
3288 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3289 cct->get_admin_socket()->unregister_command("ops");
3290 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3291 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3292 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3293 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3294 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3295 cct->get_admin_socket()->unregister_command("dump_blacklist");
3296 cct->get_admin_socket()->unregister_command("dump_watchers");
3297 cct->get_admin_socket()->unregister_command("dump_reservations");
3298 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
3299 cct->get_admin_socket()->unregister_command("heap");
3300 cct->get_admin_socket()->unregister_command("set_heap_property");
3301 cct->get_admin_socket()->unregister_command("get_heap_property");
3302 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
3303 cct->get_admin_socket()->unregister_command("dump_scrubs");
3304 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3305 cct->get_admin_socket()->unregister_command("flush_store_cache");
3306 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
3307 cct->get_admin_socket()->unregister_command("compact");
3308 delete asok_hook;
3309 asok_hook = NULL;
3310
3311 cct->get_admin_socket()->unregister_command("setomapval");
3312 cct->get_admin_socket()->unregister_command("rmomapkey");
3313 cct->get_admin_socket()->unregister_command("setomapheader");
3314 cct->get_admin_socket()->unregister_command("getomap");
3315 cct->get_admin_socket()->unregister_command("truncobj");
3316 cct->get_admin_socket()->unregister_command("injectdataerr");
3317 cct->get_admin_socket()->unregister_command("injectmdataerr");
3318 cct->get_admin_socket()->unregister_command("set_recovery_delay");
3319 cct->get_admin_socket()->unregister_command("trigger_scrub");
3320 cct->get_admin_socket()->unregister_command("injectfull");
3321 delete test_ops_hook;
3322 test_ops_hook = NULL;
3323
3324 osd_lock.Unlock();
3325
3326 heartbeat_lock.Lock();
3327 heartbeat_stop = true;
3328 heartbeat_cond.Signal();
3329 heartbeat_lock.Unlock();
3330 heartbeat_thread.join();
3331
3332 peering_tp.drain();
3333 peering_wq.clear();
3334 peering_tp.stop();
3335 dout(10) << "osd tp stopped" << dendl;
3336
3337 osd_op_tp.drain();
3338 osd_op_tp.stop();
3339 dout(10) << "op sharded tp stopped" << dendl;
3340
3341 command_tp.drain();
3342 command_tp.stop();
3343 dout(10) << "command tp stopped" << dendl;
3344
3345 disk_tp.drain();
3346 disk_tp.stop();
3347 dout(10) << "disk tp paused (new)" << dendl;
3348
3349 dout(10) << "stopping agent" << dendl;
3350 service.agent_stop();
3351
3352 osd_lock.Lock();
3353
3354 reset_heartbeat_peers();
3355
3356 tick_timer.shutdown();
3357
3358 {
3359 Mutex::Locker l(tick_timer_lock);
3360 tick_timer_without_osd_lock.shutdown();
3361 }
3362
3363 // note unmount epoch
3364 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3365 superblock.mounted = service.get_boot_epoch();
3366 superblock.clean_thru = osdmap->get_epoch();
3367 ObjectStore::Transaction t;
3368 write_superblock(t);
3369 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3370 if (r) {
3371 derr << "OSD::shutdown: error writing superblock: "
3372 << cpp_strerror(r) << dendl;
3373 }
3374
3375
3376 {
3377 Mutex::Locker l(pg_stat_queue_lock);
3378 assert(pg_stat_queue.empty());
3379 }
3380
3381 service.shutdown_reserver();
3382
3383 // Remove PGs
3384 #ifdef PG_DEBUG_REFS
3385 service.dump_live_pgids();
3386 #endif
3387 {
3388 RWLock::RLocker l(pg_map_lock);
3389 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3390 p != pg_map.end();
3391 ++p) {
3392 dout(20) << " kicking pg " << p->first << dendl;
3393 p->second->lock();
3394 if (p->second->ref != 1) {
3395 derr << "pgid " << p->first << " has ref count of "
3396 << p->second->ref << dendl;
3397 #ifdef PG_DEBUG_REFS
3398 p->second->dump_live_ids();
3399 #endif
3400 if (cct->_conf->osd_shutdown_pgref_assert) {
3401 ceph_abort();
3402 }
3403 }
3404 p->second->unlock();
3405 p->second->put("PGMap");
3406 }
3407 pg_map.clear();
3408 }
3409 #ifdef PG_DEBUG_REFS
3410 service.dump_live_pgids();
3411 #endif
3412 cct->_conf->remove_observer(this);
3413
3414 dout(10) << "syncing store" << dendl;
3415 enable_disable_fuse(true);
3416
3417 if (cct->_conf->osd_journal_flush_on_shutdown) {
3418 dout(10) << "flushing journal" << dendl;
3419 store->flush_journal();
3420 }
3421
3422 store->umount();
3423 delete store;
3424 store = 0;
3425 dout(10) << "Store synced" << dendl;
3426
3427 monc->shutdown();
3428 osd_lock.Unlock();
3429
3430 osdmap = OSDMapRef();
3431 service.shutdown();
3432 op_tracker.on_shutdown();
3433
3434 class_handler->shutdown();
3435 client_messenger->shutdown();
3436 cluster_messenger->shutdown();
3437 hb_front_client_messenger->shutdown();
3438 hb_back_client_messenger->shutdown();
3439 objecter_messenger->shutdown();
3440 hb_front_server_messenger->shutdown();
3441 hb_back_server_messenger->shutdown();
3442
3443 peering_wq.clear();
3444
3445 return r;
3446 }
3447
3448 int OSD::mon_cmd_maybe_osd_create(string &cmd)
3449 {
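  // Send the command to the mon; on -ENOENT the OSD id is not yet in the
  // osdmap, so issue an "osd create" for ourselves (at most once) and then
  // retry the original command. Illustrative shape of the create command
  // (id/uuid are placeholders; the real values come from whoami and
  // superblock.osd_fsid below):
  //   {"prefix": "osd create", "id": 3, "uuid": "<osd_fsid>"}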
3450 bool created = false;
3451 while (true) {
3452 dout(10) << __func__ << " cmd: " << cmd << dendl;
3453 vector<string> vcmd{cmd};
3454 bufferlist inbl;
3455 C_SaferCond w;
3456 string outs;
3457 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3458 int r = w.wait();
3459 if (r < 0) {
3460 if (r == -ENOENT && !created) {
3461 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3462 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3463 vector<string> vnewcmd{newcmd};
3464 bufferlist inbl;
3465 C_SaferCond w;
3466 string outs;
3467 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3468 int r = w.wait();
3469 if (r < 0) {
3470 derr << __func__ << " fail: osd does not exist and create failed: "
3471 << cpp_strerror(r) << dendl;
3472 return r;
3473 }
3474 created = true;
3475 continue;
3476 }
3477 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3478 return r;
3479 }
3480 break;
3481 }
3482
3483 return 0;
3484 }
3485
3486 int OSD::update_crush_location()
3487 {
3488 if (!cct->_conf->osd_crush_update_on_start) {
3489 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3490 return 0;
3491 }
3492
3493 char weight[32];
3494 if (cct->_conf->osd_crush_initial_weight >= 0) {
3495 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3496 } else {
3497 struct store_statfs_t st;
3498 int r = store->statfs(&st);
3499 if (r < 0) {
3500 derr << "statfs: " << cpp_strerror(r) << dendl;
3501 return r;
3502 }
3503 snprintf(weight, sizeof(weight), "%.4lf",
3504 MAX((double).00001,
3505 (double)(st.total) /
3506 (double)(1ull << 40 /* TB */)));
3507 }
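  // e.g. (illustrative): a 2 TB device reports st.total ~= 2e12 bytes,
  // which yields a weight of about 1.8190 (bytes / 2^40), floored at
  // 0.00001 so a tiny or empty device still gets a nonzero CRUSH weight.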
3508
3509 std::multimap<string,string> loc = cct->crush_location.get_location();
3510 dout(10) << __func__ << " crush location is " << loc << dendl;
3511
3512 string cmd =
3513 string("{\"prefix\": \"osd crush create-or-move\", ") +
3514 string("\"id\": ") + stringify(whoami) + string(", ") +
3515 string("\"weight\":") + weight + string(", ") +
3516 string("\"args\": [");
3517 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3518 if (p != loc.begin())
3519 cmd += ", ";
3520 cmd += "\"" + p->first + "=" + p->second + "\"";
3521 }
3522 cmd += "]}";
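  // The assembled command looks like, e.g. (hypothetical values):
  //   {"prefix": "osd crush create-or-move", "id": 3, "weight":1.8190,
  //    "args": ["host=node1", "root=default"]}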
3523
3524 return mon_cmd_maybe_osd_create(cmd);
3525 }
3526
3527 int OSD::update_crush_device_class()
3528 {
3529 if (!cct->_conf->osd_class_update_on_start) {
3530 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
3531 return 0;
3532 }
3533
3534 string device_class;
3535 int r = store->read_meta("crush_device_class", &device_class);
3536 if (r < 0 || device_class.empty()) {
3537 device_class = store->get_default_device_class();
3538 }
3539
3540 if (device_class.empty()) {
3541 dout(20) << __func__ << " no device class stored locally" << dendl;
3542 return 0;
3543 }
3544
3545 string cmd =
3546 string("{\"prefix\": \"osd crush set-device-class\", ") +
3547 string("\"class\": \"") + device_class + string("\", ") +
3548 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
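  // e.g. (illustrative): {"prefix": "osd crush set-device-class",
  //                        "class": "ssd", "ids": ["3"]}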
3549
3550 r = mon_cmd_maybe_osd_create(cmd);
3551 // the above cmd can fail for various reasons, e.g.:
3552 // (1) we are connecting to a pre-luminous monitor
3553 // (2) the user manually specified a class other than the one
3554 // given to 'ceph-disk prepare --crush-device-class'
3555 // simply skip result-checking for now
3556 return 0;
3557 }
3558
3559 void OSD::write_superblock(ObjectStore::Transaction& t)
3560 {
3561 dout(10) << "write_superblock " << superblock << dendl;
3562
3563 //hack: at minimum it's using the baseline feature set
3564 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3565 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3566
3567 bufferlist bl;
3568 ::encode(superblock, bl);
3569 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3570 }
3571
3572 int OSD::read_superblock()
3573 {
3574 bufferlist bl;
3575 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3576 if (r < 0)
3577 return r;
3578
3579 bufferlist::iterator p = bl.begin();
3580 ::decode(superblock, p);
3581
3582 dout(10) << "read_superblock " << superblock << dendl;
3583
3584 return 0;
3585 }
3586
3587 void OSD::clear_temp_objects()
3588 {
3589 dout(10) << __func__ << dendl;
3590 vector<coll_t> ls;
3591 store->list_collections(ls);
3592 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
3593 spg_t pgid;
3594 if (!p->is_pg(&pgid))
3595 continue;
3596
3597 // list temp objects
3598 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
3599
3600 vector<ghobject_t> temps;
3601 ghobject_t next;
3602 while (1) {
3603 vector<ghobject_t> objects;
3604 store->collection_list(*p, next, ghobject_t::get_max(),
3605 store->get_ideal_list_max(),
3606 &objects, &next);
3607 if (objects.empty())
3608 break;
3609 vector<ghobject_t>::iterator q;
3610 for (q = objects.begin(); q != objects.end(); ++q) {
3611 // Hammer set the pool for temp objects to -1, so check for those during clean-up
3612 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
3613 temps.push_back(*q);
3614 } else {
3615 break;
3616 }
3617 }
3618 // If we saw a non-temp object and hit the break above we can
3619 // break out of the while loop too.
3620 if (q != objects.end())
3621 break;
3622 }
3623 if (!temps.empty()) {
3624 ObjectStore::Transaction t;
3625 int removed = 0;
3626 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
3627 dout(20) << " removing " << *p << " object " << *q << dendl;
3628 t.remove(*p, *q);
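        // apply in bounded batches so a pg full of temp objects cannot
        // accumulate into one huge transaction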
3629 if (++removed > cct->_conf->osd_target_transaction_size) {
3630 store->apply_transaction(service.meta_osr.get(), std::move(t));
3631 t = ObjectStore::Transaction();
3632 removed = 0;
3633 }
3634 }
3635 if (removed) {
3636 store->apply_transaction(service.meta_osr.get(), std::move(t));
3637 }
3638 }
3639 }
3640 }
3641
3642 void OSD::recursive_remove_collection(CephContext* cct,
3643 ObjectStore *store, spg_t pgid,
3644 coll_t tmp)
3645 {
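  // Remove every object in 'tmp' (including its SnapMapper entries) in
  // bounded batches, then remove the collection itself and wait for the
  // final transaction to commit before returning.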
3646 OSDriver driver(
3647 store,
3648 coll_t(),
3649 make_snapmapper_oid());
3650
3651 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3652 ObjectStore::Sequencer>("rm"));
3653 ObjectStore::Transaction t;
3654 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
3655
3656 vector<ghobject_t> objects;
3657 store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
3658 INT_MAX, &objects, 0);
3659 generic_dout(10) << __func__ << " " << objects << dendl;
3660 // delete them.
3661 int removed = 0;
3662 for (vector<ghobject_t>::iterator p = objects.begin();
3663 p != objects.end();
3664 ++p, removed++) {
3665 OSDriver::OSTransaction _t(driver.get_transaction(&t));
3666 int r = mapper.remove_oid(p->hobj, &_t);
3667 if (r != 0 && r != -ENOENT)
3668 ceph_abort();
3669 t.remove(tmp, *p);
3670 if (removed > cct->_conf->osd_target_transaction_size) {
3671 int r = store->apply_transaction(osr.get(), std::move(t));
3672 assert(r == 0);
3673 t = ObjectStore::Transaction();
3674 removed = 0;
3675 }
3676 }
3677 t.remove_collection(tmp);
3678 int r = store->apply_transaction(osr.get(), std::move(t));
3679 assert(r == 0);
3680
3681 C_SaferCond waiter;
3682 if (!osr->flush_commit(&waiter)) {
3683 waiter.wait();
3684 }
3685 }
3686
3687
3688 // ======================================================
3689 // PG's
3690
3691 PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3692 {
3693 if (!createmap->have_pg_pool(id)) {
3694 dout(5) << __func__ << ": the OSDMap does not contain a PG pool with id = "
3695 << id << dendl;
3696 ceph_abort();
3697 }
3698
3699 PGPool p = PGPool(cct, createmap, id);
3700
3701 dout(10) << "_get_pool " << p.id << dendl;
3702 return p;
3703 }
3704
3705 PG *OSD::_open_lock_pg(
3706 OSDMapRef createmap,
3707 spg_t pgid, bool no_lockdep_check)
3708 {
3709 assert(osd_lock.is_locked());
3710
3711 PG* pg = _make_pg(createmap, pgid);
3712 {
3713 RWLock::WLocker l(pg_map_lock);
3714 pg->lock(no_lockdep_check);
3715 pg_map[pgid] = pg;
3716 pg->get("PGMap"); // because it's in pg_map
3717 service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
3718 }
3719 return pg;
3720 }
3721
3722 PG* OSD::_make_pg(
3723 OSDMapRef createmap,
3724 spg_t pgid)
3725 {
3726 dout(10) << "_make_pg " << pgid << dendl;
3727 PGPool pool = _get_pool(pgid.pool(), createmap);
3728
3729 // create
3730 PG *pg;
3731 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3732 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3733 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3734 else
3735 ceph_abort();
3736
3737 return pg;
3738 }
3739
3740
3741 void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
3742 {
3743 epoch_t e(service.get_osdmap()->get_epoch());
3744 pg->get("PGMap"); // For pg_map
3745 pg_map[pg->info.pgid] = pg;
3746 service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
3747
3748 dout(10) << "Adding newly split pg " << *pg << dendl;
3749 pg->handle_loaded(rctx);
3750 pg->write_if_dirty(*(rctx->transaction));
3751 pg->queue_null(e, e);
3752 map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
3753 peering_wait_for_split.find(pg->info.pgid);
3754 if (to_wake != peering_wait_for_split.end()) {
3755 for (list<PG::CephPeeringEvtRef>::iterator i =
3756 to_wake->second.begin();
3757 i != to_wake->second.end();
3758 ++i) {
3759 pg->queue_peering_event(*i);
3760 }
3761 peering_wait_for_split.erase(to_wake);
3762 }
3763 if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
3764 _remove_pg(pg);
3765 }
3766
3767 OSD::res_result OSD::_try_resurrect_pg(
3768 OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
3769 {
3770 assert(resurrected);
3771 assert(old_pg_state);
3772 // find nearest ancestor
3773 DeletingStateRef df;
3774 spg_t cur(pgid);
3775 while (true) {
3776 df = service.deleting_pgs.lookup(cur);
3777 if (df)
3778 break;
3779 if (!cur.ps())
3780 break;
3781 cur = cur.get_parent();
3782 }
3783 if (!df)
3784 return RES_NONE; // good to go
3785
3786 df->old_pg_state->lock();
3787 OSDMapRef create_map = df->old_pg_state->get_osdmap();
3788 df->old_pg_state->unlock();
3789
3790 set<spg_t> children;
3791 if (cur == pgid) {
3792 if (df->try_stop_deletion()) {
3793 dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
3794 *resurrected = cur;
3795 *old_pg_state = df->old_pg_state;
3796 service.deleting_pgs.remove(pgid); // PG is no longer being removed!
3797 return RES_SELF;
3798 } else {
3799 // raced, ensure we don't see DeletingStateRef when we try to
3800 // delete this pg
3801 service.deleting_pgs.remove(pgid);
3802 return RES_NONE;
3803 }
3804 } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
3805 curmap->get_pg_num(cur.pool()),
3806 &children) &&
3807 children.count(pgid)) {
3808 if (df->try_stop_deletion()) {
3809 dout(10) << __func__ << ": halted deletion on ancestor pg " << cur
3810 << dendl;
3811 *resurrected = cur;
3812 *old_pg_state = df->old_pg_state;
3813 service.deleting_pgs.remove(cur); // PG is no longer being removed!
3814 return RES_PARENT;
3815 } else {
3816 /* this is not a problem, failing to cancel proves that all objects
3817 * have been removed, so no hobject_t overlap is possible
3818 */
3819 return RES_NONE;
3820 }
3821 }
3822 return RES_NONE;
3823 }
3824
3825 PG *OSD::_create_lock_pg(
3826 OSDMapRef createmap,
3827 spg_t pgid,
3828 bool hold_map_lock,
3829 bool backfill,
3830 int role,
3831 vector<int>& up, int up_primary,
3832 vector<int>& acting, int acting_primary,
3833 pg_history_t history,
3834 const PastIntervals& pi,
3835 ObjectStore::Transaction& t)
3836 {
3837 assert(osd_lock.is_locked());
3838 dout(20) << "_create_lock_pg pgid " << pgid << dendl;
3839
3840 PG *pg = _open_lock_pg(createmap, pgid, true);
3841
3842 service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
3843
3844 pg->init(
3845 role,
3846 up,
3847 up_primary,
3848 acting,
3849 acting_primary,
3850 history,
3851 pi,
3852 backfill,
3853 &t);
3854
3855 dout(7) << "_create_lock_pg " << *pg << dendl;
3856 return pg;
3857 }
3858
3859 PG *OSD::_lookup_lock_pg(spg_t pgid)
3860 {
3861 RWLock::RLocker l(pg_map_lock);
3862
3863 auto pg_map_entry = pg_map.find(pgid);
3864 if (pg_map_entry == pg_map.end())
3865 return nullptr;
3866 PG *pg = pg_map_entry->second;
3867 pg->lock();
3868 return pg;
3869 }
3870
3871 PG *OSD::lookup_lock_pg(spg_t pgid)
3872 {
3873 return _lookup_lock_pg(pgid);
3874 }
3875
3876 PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3877 {
3878 assert(pg_map.count(pgid));
3879 PG *pg = pg_map[pgid];
3880 pg->lock();
3881 return pg;
3882 }
3883
3884 void OSD::load_pgs()
3885 {
3886 assert(osd_lock.is_locked());
3887 dout(0) << "load_pgs" << dendl;
3888 {
3889 RWLock::RLocker l(pg_map_lock);
3890 assert(pg_map.empty());
3891 }
3892
3893 vector<coll_t> ls;
3894 int r = store->list_collections(ls);
3895 if (r < 0) {
3896 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
3897 }
3898
3899 bool has_upgraded = false;
3900
3901 for (vector<coll_t>::iterator it = ls.begin();
3902 it != ls.end();
3903 ++it) {
3904 spg_t pgid;
3905 if (it->is_temp(&pgid) ||
3906 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
3907 dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
3908 recursive_remove_collection(cct, store, pgid, *it);
3909 continue;
3910 }
3911
3912 if (!it->is_pg(&pgid)) {
3913 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
3914 continue;
3915 }
3916
3917 if (pgid.preferred() >= 0) {
3918 dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
3919 // FIXME: delete it too, eventually
3920 continue;
3921 }
3922
3923 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
3924 bufferlist bl;
3925 epoch_t map_epoch = 0;
3926 int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
3927 if (r < 0) {
3928 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
3929 << dendl;
3930 continue;
3931 }
3932
3933 PG *pg = NULL;
3934 if (map_epoch > 0) {
3935 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
3936 if (!pgosdmap) {
3937 if (!osdmap->have_pg_pool(pgid.pool())) {
3938 derr << __func__ << ": could not find map for epoch " << map_epoch
3939 << " on pg " << pgid << ", but the pool is not present in the "
3940 << "current map, so this is probably a result of bug 10617. "
3941 << "Skipping the pg for now, you can use ceph-objectstore-tool "
3942 << "to clean it up later." << dendl;
3943 continue;
3944 } else {
3945 derr << __func__ << ": have pgid " << pgid << " at epoch "
3946 << map_epoch << ", but missing map. Crashing."
3947 << dendl;
3948 assert(0 == "Missing map in load_pgs");
3949 }
3950 }
3951 pg = _open_lock_pg(pgosdmap, pgid);
3952 } else {
3953 pg = _open_lock_pg(osdmap, pgid);
3954 }
3955 // there can be no waiters here, so we don't call wake_pg_waiters
3956
3957 pg->ch = store->open_collection(pg->coll);
3958
3959 // read pg state, log
3960 pg->read_state(store, bl);
3961
3962 if (pg->must_upgrade()) {
3963 if (!pg->can_upgrade()) {
3964 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
3965 << " an older version first." << dendl;
3966 assert(0 == "PG too old to upgrade");
3967 }
3968 if (!has_upgraded) {
3969 derr << "PGs are upgrading" << dendl;
3970 has_upgraded = true;
3971 }
3972 dout(10) << "PG " << pg->info.pgid
3973 << " must upgrade..." << dendl;
3974 pg->upgrade(store);
3975 }
3976
3977 service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
3978
3979 // generate state for PG's current mapping
3980 int primary, up_primary;
3981 vector<int> acting, up;
3982 pg->get_osdmap()->pg_to_up_acting_osds(
3983 pgid.pgid, &up, &up_primary, &acting, &primary);
3984 pg->init_primary_up_acting(
3985 up,
3986 acting,
3987 up_primary,
3988 primary);
3989 int role = OSDMap::calc_pg_role(whoami, pg->acting);
3990 if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
3991 pg->set_role(role);
3992 else
3993 pg->set_role(-1);
3994
3995 pg->reg_next_scrub();
3996
3997 PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
3998 pg->handle_loaded(&rctx);
3999
4000 dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
4001 if (pg->pg_log.is_dirty()) {
4002 ObjectStore::Transaction t;
4003 pg->write_if_dirty(t);
4004 store->apply_transaction(pg->osr.get(), std::move(t));
4005 }
4006 pg->unlock();
4007 }
4008 {
4009 RWLock::RLocker l(pg_map_lock);
4010 dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
4011 }
4012
4013 // clean up old infos object?
4014 if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
4015 dout(1) << __func__ << " removing legacy infos object" << dendl;
4016 ObjectStore::Transaction t;
4017 t.remove(coll_t::meta(), OSD::make_infos_oid());
4018 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
4019 if (r != 0) {
4020 derr << __func__ << ": apply_transaction returned "
4021 << cpp_strerror(r) << dendl;
4022 ceph_abort();
4023 }
4024 }
4025
4026 build_past_intervals_parallel();
4027 }
4028
4029
4030 /*
4031 * build past_intervals efficiently on old, degraded, and buried
4032 * clusters. this is important for efficiently catching up osds that
4033 * are way behind on maps to the current cluster state.
4034 *
4035 * this is a parallel version of PG::generate_past_intervals().
4036 * follow the same logic, but do all pgs at the same time so that we
4037 * can make a single pass across the osdmap history.
4038 */
4039 void OSD::build_past_intervals_parallel()
4040 {
4041 struct pistate {
4042 epoch_t start, end;
4043 vector<int> old_acting, old_up;
4044 epoch_t same_interval_since;
4045 int primary;
4046 int up_primary;
4047 };
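  // pistate tracks, per PG, the epoch range we still need to cover
  // (start..end) and the interval membership (up/acting + primaries) seen
  // at same_interval_since, so each new epoch's mapping can be compared
  // against the previous one in a single forward pass over the maps.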
4048 map<PG*,pistate> pis;
4049
4050 // calculate the union of the map epoch ranges we need to scan
4051 epoch_t end_epoch = superblock.oldest_map;
4052 epoch_t cur_epoch = superblock.newest_map;
4053 {
4054 RWLock::RLocker l(pg_map_lock);
4055 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4056 i != pg_map.end();
4057 ++i) {
4058 PG *pg = i->second;
4059
4060 auto rpib = pg->get_required_past_interval_bounds(
4061 pg->info,
4062 superblock.oldest_map);
4063 if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
4064 if (pg->info.history.same_interval_since == 0) {
4065 pg->info.history.same_interval_since = rpib.second;
4066 }
4067 continue;
4068 } else {
4069 auto apib = pg->past_intervals.get_bounds();
4070 if (apib.second >= rpib.second &&
4071 apib.first <= rpib.first) {
4072 if (pg->info.history.same_interval_since == 0) {
4073 pg->info.history.same_interval_since = rpib.second;
4074 }
4075 continue;
4076 }
4077 }
4078
4079 dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
4080 << rpib.second << dendl;
4081 pistate& p = pis[pg];
4082 p.start = rpib.first;
4083 p.end = rpib.second;
4084 p.same_interval_since = 0;
4085
4086 if (rpib.first < cur_epoch)
4087 cur_epoch = rpib.first;
4088 if (rpib.second > end_epoch)
4089 end_epoch = rpib.second;
4090 }
4091 }
4092 if (pis.empty()) {
4093 dout(10) << __func__ << " nothing to build" << dendl;
4094 return;
4095 }
4096
4097 dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
4098 assert(cur_epoch <= end_epoch);
4099
4100 OSDMapRef cur_map, last_map;
4101 for ( ; cur_epoch <= end_epoch; cur_epoch++) {
4102 dout(10) << __func__ << " epoch " << cur_epoch << dendl;
4103 last_map = cur_map;
4104 cur_map = get_map(cur_epoch);
4105
4106 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4107 PG *pg = i->first;
4108 pistate& p = i->second;
4109
4110 if (cur_epoch < p.start || cur_epoch > p.end)
4111 continue;
4112
4113 vector<int> acting, up;
4114 int up_primary;
4115 int primary;
4116 pg_t pgid = pg->info.pgid.pgid;
4117 if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
4118 pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
4119 cur_map->pg_to_up_acting_osds(
4120 pgid, &up, &up_primary, &acting, &primary);
4121
4122 if (p.same_interval_since == 0) {
4123 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4124 << " first map, acting " << acting
4125 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
4126 p.same_interval_since = cur_epoch;
4127 p.old_up = up;
4128 p.old_acting = acting;
4129 p.primary = primary;
4130 p.up_primary = up_primary;
4131 continue;
4132 }
4133 assert(last_map);
4134
4135 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
4136 pg->get_is_recoverable_predicate());
4137 std::stringstream debug;
4138 bool new_interval = PastIntervals::check_new_interval(
4139 p.primary,
4140 primary,
4141 p.old_acting, acting,
4142 p.up_primary,
4143 up_primary,
4144 p.old_up, up,
4145 p.same_interval_since,
4146 pg->info.history.last_epoch_clean,
4147 cur_map, last_map,
4148 pgid,
4149 recoverable.get(),
4150 &pg->past_intervals,
4151 &debug);
4152 if (new_interval) {
4153 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4154 << " " << debug.str() << dendl;
4155 p.old_up = up;
4156 p.old_acting = acting;
4157 p.primary = primary;
4158 p.up_primary = up_primary;
4159 p.same_interval_since = cur_epoch;
4160 }
4161 }
4162 }
4163
4164 // Now that past_intervals have been recomputed let's fix the same_interval_since
4165 // if it was cleared by import.
4166 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4167 PG *pg = i->first;
4168 pistate& p = i->second;
4169
4170 if (pg->info.history.same_interval_since == 0) {
4171 assert(p.same_interval_since);
4172 dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
4173 dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
4174 // Fix it
4175 pg->info.history.same_interval_since = p.same_interval_since;
4176 }
4177 }
4178
4179 // write info only at the end. this is necessary because we check
4180 // whether the past_intervals go far enough back or forward in time,
4181 // but we don't check for holes. we could avoid it by discarding
4182 // the previous past_intervals and rebuilding from scratch, or we
4183 // can just do this and commit all our work at the end.
4184 ObjectStore::Transaction t;
4185 int num = 0;
4186 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4187 PG *pg = i->first;
4188 pg->lock();
4189 pg->dirty_big_info = true;
4190 pg->dirty_info = true;
4191 pg->write_if_dirty(t);
4192 pg->unlock();
4193
4194 // don't let the transaction get too big
4195 if (++num >= cct->_conf->osd_target_transaction_size) {
4196 store->apply_transaction(service.meta_osr.get(), std::move(t));
4197 t = ObjectStore::Transaction();
4198 num = 0;
4199 }
4200 }
4201 if (!t.empty())
4202 store->apply_transaction(service.meta_osr.get(), std::move(t));
4203 }
4204
4205 /*
4206 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4207 * hasn't changed since the given epoch and we are the primary.
4208 */
4209 int OSD::handle_pg_peering_evt(
4210 spg_t pgid,
4211 const pg_history_t& orig_history,
4212 const PastIntervals& pi,
4213 epoch_t epoch,
4214 PG::CephPeeringEvtRef evt)
4215 {
4216 if (service.splitting(pgid)) {
4217 peering_wait_for_split[pgid].push_back(evt);
4218 return -EEXIST;
4219 }
4220
4221 PG *pg = _lookup_lock_pg(pgid);
4222 if (!pg) {
4223 // same primary?
4224 if (!osdmap->have_pg_pool(pgid.pool()))
4225 return -EINVAL;
4226 int up_primary, acting_primary;
4227 vector<int> up, acting;
4228 osdmap->pg_to_up_acting_osds(
4229 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4230
4231 pg_history_t history = orig_history;
4232 bool valid_history = project_pg_history(
4233 pgid, history, epoch, up, up_primary, acting, acting_primary);
4234
4235 if (!valid_history || epoch < history.same_interval_since) {
4236 dout(10) << __func__ << " " << pgid << " acting changed in "
4237 << history.same_interval_since << " (msg from " << epoch << ")"
4238 << dendl;
4239 return -EINVAL;
4240 }
4241
4242 if (service.splitting(pgid)) {
4243 ceph_abort();
4244 }
4245
4246 // do we need to resurrect a deleting pg?
4247 spg_t resurrected;
4248 PGRef old_pg_state;
4249 res_result result = _try_resurrect_pg(
4250 service.get_osdmap(),
4251 pgid,
4252 &resurrected,
4253 &old_pg_state);
4254
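    // RES_NONE: nothing being deleted; create a brand new PG below.
    // RES_SELF: this pgid was mid-deletion and we halted it, so recreate
    //   the PG from its old state (backfill=true).
    // RES_PARENT: an ancestor was resurrected instead; recreate the parent
    //   and queue this event until the split completes.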
4255 PG::RecoveryCtx rctx = create_context();
4256 switch (result) {
4257 case RES_NONE: {
4258 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4259 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4260 store->get_type() != "bluestore") {
4261 clog->warn() << "pg " << pgid
4262 << " is at risk of silent data corruption: "
4263 << "the pool allows ec overwrites but is not stored in "
4264 << "bluestore, so deep scrubbing will not detect bitrot";
4265 }
4266 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4267 PG::_init(*rctx.transaction, pgid, pp);
4268
4269 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4270 if (!pp->is_replicated() && role != pgid.shard)
4271 role = -1;
4272
4273 pg = _create_lock_pg(
4274 get_map(epoch),
4275 pgid, false, false,
4276 role,
4277 up, up_primary,
4278 acting, acting_primary,
4279 history, pi,
4280 *rctx.transaction);
4281 pg->handle_create(&rctx);
4282 pg->write_if_dirty(*rctx.transaction);
4283 dispatch_context(rctx, pg, osdmap);
4284
4285 dout(10) << *pg << " is new" << dendl;
4286
4287 pg->queue_peering_event(evt);
4288 wake_pg_waiters(pg);
4289 pg->unlock();
4290 return 0;
4291 }
4292 case RES_SELF: {
4293 old_pg_state->lock();
4294 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4295 int old_role = old_pg_state->role;
4296 vector<int> old_up = old_pg_state->up;
4297 int old_up_primary = old_pg_state->up_primary.osd;
4298 vector<int> old_acting = old_pg_state->acting;
4299 int old_primary = old_pg_state->primary.osd;
4300 pg_history_t old_history = old_pg_state->info.history;
4301 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4302 old_pg_state->unlock();
4303 pg = _create_lock_pg(
4304 old_osd_map,
4305 resurrected,
4306 false,
4307 true,
4308 old_role,
4309 old_up,
4310 old_up_primary,
4311 old_acting,
4312 old_primary,
4313 old_history,
4314 old_past_intervals,
4315 *rctx.transaction);
4316 pg->handle_create(&rctx);
4317 pg->write_if_dirty(*rctx.transaction);
4318 dispatch_context(rctx, pg, osdmap);
4319
4320 dout(10) << *pg << " is new (resurrected)" << dendl;
4321
4322 pg->queue_peering_event(evt);
4323 wake_pg_waiters(pg);
4324 pg->unlock();
4325 return 0;
4326 }
4327 case RES_PARENT: {
4328 assert(old_pg_state);
4329 old_pg_state->lock();
4330 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4331 int old_role = old_pg_state->role;
4332 vector<int> old_up = old_pg_state->up;
4333 int old_up_primary = old_pg_state->up_primary.osd;
4334 vector<int> old_acting = old_pg_state->acting;
4335 int old_primary = old_pg_state->primary.osd;
4336 pg_history_t old_history = old_pg_state->info.history;
4337 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4338 old_pg_state->unlock();
4339 PG *parent = _create_lock_pg(
4340 old_osd_map,
4341 resurrected,
4342 false,
4343 true,
4344 old_role,
4345 old_up,
4346 old_up_primary,
4347 old_acting,
4348 old_primary,
4349 old_history,
4350 old_past_intervals,
4351 *rctx.transaction
4352 );
4353 parent->handle_create(&rctx);
4354 parent->write_if_dirty(*rctx.transaction);
4355 dispatch_context(rctx, parent, osdmap);
4356
4357 dout(10) << *parent << " is new" << dendl;
4358
4359 assert(service.splitting(pgid));
4360 peering_wait_for_split[pgid].push_back(evt);
4361
4362 //parent->queue_peering_event(evt);
4363 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4364 wake_pg_waiters(parent);
4365 parent->unlock();
4366 return 0;
4367 }
4368 default:
4369 assert(0);
4370 return 0;
4371 }
4372 } else {
4373 // already had it. did the mapping change?
4374 if (epoch < pg->info.history.same_interval_since) {
4375 dout(10) << *pg << " " << __func__ << " acting changed in "
4376 << pg->info.history.same_interval_since
4377 << " (msg from " << epoch << ")" << dendl;
4378 } else {
4379 pg->queue_peering_event(evt);
4380 }
4381 pg->unlock();
4382 return -EEXIST;
4383 }
4384 }
4385
4386
4387 void OSD::build_initial_pg_history(
4388 spg_t pgid,
4389 epoch_t created,
4390 utime_t created_stamp,
4391 pg_history_t *h,
4392 PastIntervals *pi)
4393 {
4394 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4395 h->epoch_created = created;
4396 h->epoch_pool_created = created;
4397 h->same_interval_since = created;
4398 h->same_up_since = created;
4399 h->same_primary_since = created;
4400 h->last_scrub_stamp = created_stamp;
4401 h->last_deep_scrub_stamp = created_stamp;
4402 h->last_clean_scrub_stamp = created_stamp;
4403
4404 OSDMapRef lastmap = service.get_map(created);
4405 int up_primary, acting_primary;
4406 vector<int> up, acting;
4407 lastmap->pg_to_up_acting_osds(
4408 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4409
4410 ostringstream debug;
4411 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4412 OSDMapRef osdmap = service.get_map(e);
4413 int new_up_primary, new_acting_primary;
4414 vector<int> new_up, new_acting;
4415 osdmap->pg_to_up_acting_osds(
4416 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4417
4418 // this is a bit imprecise, but sufficient?
4419 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4420 const pg_pool_t *pi;
4421 bool operator()(const set<pg_shard_t> &have) const {
4422 return have.size() >= pi->min_size;
4423 }
4424 min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4425 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4426
4427 bool new_interval = PastIntervals::check_new_interval(
4428 acting_primary,
4429 new_acting_primary,
4430 acting, new_acting,
4431 up_primary,
4432 new_up_primary,
4433 up, new_up,
4434 h->same_interval_since,
4435 h->last_epoch_clean,
4436 osdmap,
4437 lastmap,
4438 pgid.pgid,
4439 &min_size_predicate,
4440 pi,
4441 &debug);
4442 if (new_interval) {
4443 h->same_interval_since = e;
4444 }
4445 if (up != new_up) {
4446 h->same_up_since = e;
4447 }
4448 if (acting_primary != new_acting_primary) {
4449 h->same_primary_since = e;
4450 }
4451 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4452 osdmap->get_pg_num(pgid.pgid.pool()),
4453 nullptr)) {
4454 h->last_epoch_split = e;
4455 }
4456 lastmap = osdmap;
4457 }
4458 dout(20) << __func__ << " " << debug.str() << dendl;
4459 dout(10) << __func__ << " " << *h << " " << *pi
4460 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4461 pi->get_bounds()) << ")"
4462 << dendl;
4463 }
4464
4465 /**
4466 * Fill in the passed history so you know same_interval_since, same_up_since,
4467 * and same_primary_since; returns false if an intermediate osdmap is missing.
4468 */
4469 bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
4470 const vector<int>& currentup,
4471 int currentupprimary,
4472 const vector<int>& currentacting,
4473 int currentactingprimary)
4474 {
4475 dout(15) << "project_pg_history " << pgid
4476 << " from " << from << " to " << osdmap->get_epoch()
4477 << ", start " << h
4478 << dendl;
4479
4480 epoch_t e;
4481 for (e = osdmap->get_epoch();
4482 e > from;
4483 e--) {
4484 // verify during intermediate epoch (e-1)
4485 OSDMapRef oldmap = service.try_get_map(e-1);
4486 if (!oldmap) {
4487 dout(15) << __func__ << ": found map gap, returning false" << dendl;
4488 return false;
4489 }
4490 assert(oldmap->have_pg_pool(pgid.pool()));
4491
4492 int upprimary, actingprimary;
4493 vector<int> up, acting;
4494 oldmap->pg_to_up_acting_osds(
4495 pgid.pgid,
4496 &up,
4497 &upprimary,
4498 &acting,
4499 &actingprimary);
4500
4501 // acting set change?
4502 if ((actingprimary != currentactingprimary ||
4503 upprimary != currentupprimary ||
4504 acting != currentacting ||
4505 up != currentup) && e > h.same_interval_since) {
4506 dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
4507 << " from " << acting << "/" << up
4508 << " " << actingprimary << "/" << upprimary
4509 << " -> " << currentacting << "/" << currentup
4510 << " " << currentactingprimary << "/" << currentupprimary
4511 << dendl;
4512 h.same_interval_since = e;
4513 }
4514 // split?
4515 if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
4516 osdmap->get_pg_num(pgid.pool()),
4517 0) && e > h.same_interval_since) {
4518 h.same_interval_since = e;
4519 }
4520 // up set change?
4521 if ((up != currentup || upprimary != currentupprimary)
4522 && e > h.same_up_since) {
4523 dout(15) << "project_pg_history " << pgid << " up changed in " << e
4524 << " from " << up << " " << upprimary
4525 << " -> " << currentup << " " << currentupprimary << dendl;
4526 h.same_up_since = e;
4527 }
4528
4529 // primary change?
4530 if (OSDMap::primary_changed(
4531 actingprimary,
4532 acting,
4533 currentactingprimary,
4534 currentacting) &&
4535 e > h.same_primary_since) {
4536 dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
4537 h.same_primary_since = e;
4538 }
4539
4540 if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
4541 break;
4542 }
4543
4544 // base case: these floors should be the pg creation epoch if we didn't
4545 // find any changes.
4546 if (e == h.epoch_created) {
4547 if (!h.same_interval_since)
4548 h.same_interval_since = e;
4549 if (!h.same_up_since)
4550 h.same_up_since = e;
4551 if (!h.same_primary_since)
4552 h.same_primary_since = e;
4553 }
4554
4555 dout(15) << "project_pg_history end " << h << dendl;
4556 return true;
4557 }
4558
4559
4560
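// Heartbeat peers are tracked with up to two connections each: con_back on
// the cluster (back) network and, when configured, con_front on the public
// (front) network, so reachability problems on either network are detected
// independently.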
4561 void OSD::_add_heartbeat_peer(int p)
4562 {
4563 if (p == whoami)
4564 return;
4565 HeartbeatInfo *hi;
4566
4567 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4568 if (i == heartbeat_peers.end()) {
4569 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4570 if (!cons.first)
4571 return;
4572 hi = &heartbeat_peers[p];
4573 hi->peer = p;
4574 HeartbeatSession *s = new HeartbeatSession(p);
4575 hi->con_back = cons.first.get();
4576 hi->con_back->set_priv(s->get());
4577 if (cons.second) {
4578 hi->con_front = cons.second.get();
4579 hi->con_front->set_priv(s->get());
4580 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4581 << " " << hi->con_back->get_peer_addr()
4582 << " " << hi->con_front->get_peer_addr()
4583 << dendl;
4584 } else {
4585 hi->con_front.reset(NULL);
4586 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4587 << " " << hi->con_back->get_peer_addr()
4588 << dendl;
4589 }
4590 s->put();
4591 } else {
4592 hi = &i->second;
4593 }
4594 hi->epoch = osdmap->get_epoch();
4595 }
4596
4597 void OSD::_remove_heartbeat_peer(int n)
4598 {
4599 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4600 assert(q != heartbeat_peers.end());
4601 dout(20) << " removing heartbeat peer osd." << n
4602 << " " << q->second.con_back->get_peer_addr()
4603 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4604 << dendl;
4605 q->second.con_back->mark_down();
4606 if (q->second.con_front) {
4607 q->second.con_front->mark_down();
4608 }
4609 heartbeat_peers.erase(q);
4610 }
4611
4612 void OSD::need_heartbeat_peer_update()
4613 {
4614 if (is_stopping())
4615 return;
4616 dout(20) << "need_heartbeat_peer_update" << dendl;
4617 heartbeat_set_peers_need_update();
4618 }
4619
4620 void OSD::maybe_update_heartbeat_peers()
4621 {
4622 assert(osd_lock.is_locked());
4623
4624 if (is_waiting_for_healthy()) {
4625 utime_t now = ceph_clock_now();
4626 if (last_heartbeat_resample == utime_t()) {
4627 last_heartbeat_resample = now;
4628 heartbeat_set_peers_need_update();
4629 } else if (!heartbeat_peers_need_update()) {
4630 utime_t dur = now - last_heartbeat_resample;
4631 if (dur > cct->_conf->osd_heartbeat_grace) {
4632 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4633 heartbeat_set_peers_need_update();
4634 last_heartbeat_resample = now;
4635 reset_heartbeat_peers(); // we want *new* peers!
4636 }
4637 }
4638 }
4639
4640 if (!heartbeat_peers_need_update())
4641 return;
4642 heartbeat_clear_peers_need_update();
4643
4644 Mutex::Locker l(heartbeat_lock);
4645
4646 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
4647
4648
4649 // build heartbeat from set
4650 if (is_active()) {
4651 RWLock::RLocker l(pg_map_lock);
4652 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4653 i != pg_map.end();
4654 ++i) {
4655 PG *pg = i->second;
4656 pg->heartbeat_peer_lock.Lock();
4657 dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
4658 for (set<int>::iterator p = pg->heartbeat_peers.begin();
4659 p != pg->heartbeat_peers.end();
4660 ++p)
4661 if (osdmap->is_up(*p))
4662 _add_heartbeat_peer(*p);
4663 for (set<int>::iterator p = pg->probe_targets.begin();
4664 p != pg->probe_targets.end();
4665 ++p)
4666 if (osdmap->is_up(*p))
4667 _add_heartbeat_peer(*p);
4668 pg->heartbeat_peer_lock.Unlock();
4669 }
4670 }
4671
4672 // include next and previous up osds to ensure we have a fully-connected set
4673 set<int> want, extras;
4674 int next = osdmap->get_next_up_osd_after(whoami);
4675 if (next >= 0)
4676 want.insert(next);
4677 int prev = osdmap->get_previous_up_osd_before(whoami);
4678 if (prev >= 0 && prev != next)
4679 want.insert(prev);
4680
4681 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4682 dout(10) << " adding neighbor peer osd." << *p << dendl;
4683 extras.insert(*p);
4684 _add_heartbeat_peer(*p);
4685 }
4686
4687 // remove down peers; enumerate extras
4688 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4689 while (p != heartbeat_peers.end()) {
4690 if (!osdmap->is_up(p->first)) {
4691 int o = p->first;
4692 ++p;
4693 _remove_heartbeat_peer(o);
4694 continue;
4695 }
4696 if (p->second.epoch < osdmap->get_epoch()) {
4697 extras.insert(p->first);
4698 }
4699 ++p;
4700 }
4701
4702 // too few?
4703 int start = osdmap->get_next_up_osd_after(whoami);
4704 for (int n = start; n >= 0; ) {
4705 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4706 break;
4707 if (!extras.count(n) && !want.count(n) && n != whoami) {
4708 dout(10) << " adding random peer osd." << n << dendl;
4709 extras.insert(n);
4710 _add_heartbeat_peer(n);
4711 }
4712 n = osdmap->get_next_up_osd_after(n);
4713 if (n == start)
4714 break; // came full circle; stop
4715 }
4716
4717 // too many?
4718 for (set<int>::iterator p = extras.begin();
4719 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4720 ++p) {
4721 if (want.count(*p))
4722 continue;
4723 _remove_heartbeat_peer(*p);
4724 }
4725
4726 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
4727 }
4728
4729 void OSD::reset_heartbeat_peers()
4730 {
4731 assert(osd_lock.is_locked());
4732 dout(10) << "reset_heartbeat_peers" << dendl;
4733 Mutex::Locker l(heartbeat_lock);
4734 while (!heartbeat_peers.empty()) {
4735 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4736 hi.con_back->mark_down();
4737 if (hi.con_front) {
4738 hi.con_front->mark_down();
4739 }
4740 heartbeat_peers.erase(heartbeat_peers.begin());
4741 }
4742 failure_queue.clear();
4743 }
4744
4745 void OSD::handle_osd_ping(MOSDPing *m)
4746 {
4747 if (superblock.cluster_fsid != m->fsid) {
4748 dout(20) << "handle_osd_ping from " << m->get_source_inst()
4749 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
4750 m->put();
4751 return;
4752 }
4753
4754 int from = m->get_source().num();
4755
4756 heartbeat_lock.Lock();
4757 if (is_stopping()) {
4758 heartbeat_lock.Unlock();
4759 m->put();
4760 return;
4761 }
4762
4763 OSDMapRef curmap = service.get_osdmap();
4764 if (!curmap) {
4765 heartbeat_lock.Unlock();
4766 m->put();
4767 return;
4768 }
4769
4770 switch (m->op) {
4771
4772 case MOSDPing::PING:
4773 {
4774 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
4775 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
4776 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
4777 if (heartbeat_drop->second == 0) {
4778 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
4779 } else {
4780 --heartbeat_drop->second;
4781 dout(5) << "Dropping heartbeat from " << from
4782 << ", " << heartbeat_drop->second
4783 << " remaining to drop" << dendl;
4784 break;
4785 }
4786 } else if (cct->_conf->osd_debug_drop_ping_probability >
4787 ((((double)(rand()%100))/100.0))) {
4788 heartbeat_drop =
4789 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
4790 cct->_conf->osd_debug_drop_ping_duration)).first;
4791 dout(5) << "Dropping heartbeat from " << from
4792 << ", " << heartbeat_drop->second
4793 << " remaining to drop" << dendl;
4794 break;
4795 }
4796 }
4797
4798 if (!cct->get_heartbeat_map()->is_healthy()) {
4799 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
4800 break;
4801 }
4802
4803 Message *r = new MOSDPing(monc->get_fsid(),
4804 curmap->get_epoch(),
4805 MOSDPing::PING_REPLY, m->stamp,
4806 cct->_conf->osd_heartbeat_min_size);
4807 m->get_connection()->send_message(r);
4808
4809 if (curmap->is_up(from)) {
4810 service.note_peer_epoch(from, m->map_epoch);
4811 if (is_active()) {
4812 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4813 if (con) {
4814 service.share_map_peer(from, con.get());
4815 }
4816 }
4817 } else if (!curmap->exists(from) ||
4818 curmap->get_down_at(from) > m->map_epoch) {
4819 // tell them they have died
4820 Message *r = new MOSDPing(monc->get_fsid(),
4821 curmap->get_epoch(),
4822 MOSDPing::YOU_DIED,
4823 m->stamp,
4824 cct->_conf->osd_heartbeat_min_size);
4825 m->get_connection()->send_message(r);
4826 }
4827 }
4828 break;
4829
4830 case MOSDPing::PING_REPLY:
4831 {
4832 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
4833 if (i != heartbeat_peers.end()) {
4834 if (m->get_connection() == i->second.con_back) {
4835 dout(25) << "handle_osd_ping got reply from osd." << from
4836 << " first_tx " << i->second.first_tx
4837 << " last_tx " << i->second.last_tx
4838 << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
4839 << " last_rx_front " << i->second.last_rx_front
4840 << dendl;
4841 i->second.last_rx_back = m->stamp;
4842 // if there is no front con, set both stamps.
4843 if (i->second.con_front == NULL)
4844 i->second.last_rx_front = m->stamp;
4845 } else if (m->get_connection() == i->second.con_front) {
4846 dout(25) << "handle_osd_ping got reply from osd." << from
4847 << " first_tx " << i->second.first_tx
4848 << " last_tx " << i->second.last_tx
4849 << " last_rx_back " << i->second.last_rx_back
4850 << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
4851 << dendl;
4852 i->second.last_rx_front = m->stamp;
4853 }
4854
4855 utime_t cutoff = ceph_clock_now();
4856 cutoff -= cct->_conf->osd_heartbeat_grace;
4857 if (i->second.is_healthy(cutoff)) {
4858 // Cancel false reports
4859 auto failure_queue_entry = failure_queue.find(from);
4860 if (failure_queue_entry != failure_queue.end()) {
4861 dout(10) << "handle_osd_ping canceling queued "
4862 << "failure report for osd." << from << dendl;
4863 failure_queue.erase(failure_queue_entry);
4864 }
4865
4866 auto failure_pending_entry = failure_pending.find(from);
4867 if (failure_pending_entry != failure_pending.end()) {
4868 dout(10) << "handle_osd_ping canceling in-flight "
4869 << "failure report for osd." << from << dendl;
4870 send_still_alive(curmap->get_epoch(),
4871 failure_pending_entry->second.second);
4872 failure_pending.erase(failure_pending_entry);
4873 }
4874 }
4875 }
4876
4877 if (m->map_epoch &&
4878 curmap->is_up(from)) {
4879 service.note_peer_epoch(from, m->map_epoch);
4880 if (is_active()) {
4881 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4882 if (con) {
4883 service.share_map_peer(from, con.get());
4884 }
4885 }
4886 }
4887 }
4888 break;
4889
4890 case MOSDPing::YOU_DIED:
4891 dout(10) << "handle_osd_ping " << m->get_source_inst()
4892 << " says i am down in " << m->map_epoch << dendl;
4893 osdmap_subscribe(curmap->get_epoch()+1, false);
4894 break;
4895 }
4896
4897 heartbeat_lock.Unlock();
4898 m->put();
4899 }
4900
4901 void OSD::heartbeat_entry()
4902 {
4903 Mutex::Locker l(heartbeat_lock);
4904 if (is_stopping())
4905 return;
4906 while (!heartbeat_stop) {
4907 heartbeat();
4908
4909 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
4910 utime_t w;
4911 w.set_from_double(wait);
4912 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
4913 heartbeat_cond.WaitInterval(heartbeat_lock, w);
4914 if (is_stopping())
4915 return;
4916 dout(30) << "heartbeat_entry woke up" << dendl;
4917 }
4918 }
4919
4920 void OSD::heartbeat_check()
4921 {
4922 assert(heartbeat_lock.is_locked());
4923 utime_t now = ceph_clock_now();
4924
4925 // check for heartbeat replies (move me elsewhere?)
4926 utime_t cutoff = now;
4927 cutoff -= cct->_conf->osd_heartbeat_grace;
4928 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4929 p != heartbeat_peers.end();
4930 ++p) {
4931
4932 if (p->second.first_tx == utime_t()) {
4933 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
4934 << " yet, skipping" << dendl;
4935 continue;
4936 }
4937
4938 dout(25) << "heartbeat_check osd." << p->first
4939 << " first_tx " << p->second.first_tx
4940 << " last_tx " << p->second.last_tx
4941 << " last_rx_back " << p->second.last_rx_back
4942 << " last_rx_front " << p->second.last_rx_front
4943 << dendl;
4944 if (p->second.is_unhealthy(cutoff)) {
4945 if (p->second.last_rx_back == utime_t() ||
4946 p->second.last_rx_front == utime_t()) {
4947 derr << "heartbeat_check: no reply from " << p->second.con_back->get_peer_addr().get_sockaddr()
4948 << " osd." << p->first << " ever on either front or back, first ping sent "
4949 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
4950 // fail
4951 failure_queue[p->first] = p->second.last_tx;
4952 } else {
4953 derr << "heartbeat_check: no reply from " << p->second.con_back->get_peer_addr().get_sockaddr()
4954 << " osd." << p->first << " since back " << p->second.last_rx_back
4955 << " front " << p->second.last_rx_front
4956 << " (cutoff " << cutoff << ")" << dendl;
4957 // fail
4958 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
4959 }
4960 }
4961 }
4962 }
4963
4964 void OSD::heartbeat()
4965 {
4966 dout(30) << "heartbeat" << dendl;
4967
4968 // get CPU load avg
4969 double loadavgs[1];
4970 int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
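  // daily_loadavg below is an exponential moving average with weight
  // 1/n_samples, i.e. roughly one day's worth of heartbeat-interval
  // samples (e.g. 14400 samples, assuming the default 6s interval).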
4971 if (getloadavg(loadavgs, 1) == 1) {
4972 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
4973 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
4974 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
4975 }
4976
4977 dout(30) << "heartbeat checking stats" << dendl;
4978
4979 // refresh stats?
4980 vector<int> hb_peers;
4981 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4982 p != heartbeat_peers.end();
4983 ++p)
4984 hb_peers.push_back(p->first);
4985 service.update_osd_stat(hb_peers);
4986
4987 dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;
4988
4989 utime_t now = ceph_clock_now();
4990
4991 // send heartbeats
4992 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
4993 i != heartbeat_peers.end();
4994 ++i) {
4995 int peer = i->first;
4996 i->second.last_tx = now;
4997 if (i->second.first_tx == utime_t())
4998 i->second.first_tx = now;
4999 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5000 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5001 service.get_osdmap()->get_epoch(),
5002 MOSDPing::PING, now,
5003 cct->_conf->osd_heartbeat_min_size));
5004
5005 if (i->second.con_front)
5006 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5007 service.get_osdmap()->get_epoch(),
5008 MOSDPing::PING, now,
5009 cct->_conf->osd_heartbeat_min_size));
5010 }
5011
5012 logger->set(l_osd_hb_to, heartbeat_peers.size());
5013
5014 // hmm.. am i all alone?
5015 dout(30) << "heartbeat lonely?" << dendl;
5016 if (heartbeat_peers.empty()) {
5017 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5018 last_mon_heartbeat = now;
5019 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5020 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5021 }
5022 }
5023
5024 dout(30) << "heartbeat done" << dendl;
5025 }
5026
5027 bool OSD::heartbeat_reset(Connection *con)
5028 {
5029 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
5030 if (s) {
5031 heartbeat_lock.Lock();
5032 if (is_stopping()) {
5033 heartbeat_lock.Unlock();
5034 s->put();
5035 return true;
5036 }
5037 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
5038 if (p != heartbeat_peers.end() &&
5039 (p->second.con_back == con ||
5040 p->second.con_front == con)) {
5041 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5042 << ", reopening" << dendl;
5043 if (con != p->second.con_back) {
5044 p->second.con_back->mark_down();
5045 }
5046 p->second.con_back.reset(NULL);
5047 if (p->second.con_front && con != p->second.con_front) {
5048 p->second.con_front->mark_down();
5049 }
5050 p->second.con_front.reset(NULL);
5051 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5052 if (newcon.first) {
5053 p->second.con_back = newcon.first.get();
5054 p->second.con_back->set_priv(s->get());
5055 if (newcon.second) {
5056 p->second.con_front = newcon.second.get();
5057 p->second.con_front->set_priv(s->get());
5058 }
5059 } else {
5060 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5061 << ", raced with osdmap update, closing out peer" << dendl;
5062 heartbeat_peers.erase(p);
5063 }
5064 } else {
5065 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5066 }
5067 heartbeat_lock.Unlock();
5068 s->put();
5069 }
5070 return true;
5071 }
5072
5073
5074
5075 // =========================================
5076
5077 void OSD::tick()
5078 {
5079 assert(osd_lock.is_locked());
5080 dout(10) << "tick" << dendl;
5081
5082 if (is_active() || is_waiting_for_healthy()) {
5083 maybe_update_heartbeat_peers();
5084 }
5085
5086 if (is_waiting_for_healthy()) {
5087 start_boot();
5088 } else if (is_preboot() &&
5089 waiting_for_luminous_mons &&
5090 monc->monmap.get_required_features().contains_all(
5091 ceph::features::mon::FEATURE_LUMINOUS)) {
5092 // mon upgrade finished!
5093 start_boot();
5094 }
5095
5096 do_waiters();
5097
5098 tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
5099 }
5100
5101 void OSD::tick_without_osd_lock()
5102 {
5103 assert(tick_timer_lock.is_locked());
5104 dout(10) << "tick_without_osd_lock" << dendl;
5105
5106 logger->set(l_osd_buf, buffer::get_total_alloc());
5107 logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5108 logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
5109 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5110 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5111 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5112
5113 // osd_lock is not being held, which means the OSD state
5114 // might change when doing the monitor report
5115 if (is_active() || is_waiting_for_healthy()) {
5116 heartbeat_lock.Lock();
5117 heartbeat_check();
5118 heartbeat_lock.Unlock();
5119
5120 map_lock.get_read();
5121 Mutex::Locker l(mon_report_lock);
5122
5123 // mon report?
5124 bool reset = false;
5125 bool report = false;
5126 utime_t now = ceph_clock_now();
5127 pg_stat_queue_lock.Lock();
5128 double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
5129 double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
5130 // note: we shouldn't adjust max because it must remain < the
5131 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5132 // value).
5133 double max = cct->_conf->osd_mon_report_interval_max;
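    // e.g. (illustrative, assuming osd_mon_ack_timeout=30): after one ack
    // timeout stats_ack_timeout grows to 60, backoff becomes 2, and the
    // effective minimum report interval doubles until the mon catches up.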
5134 if (!outstanding_pg_stats.empty() &&
5135 (now - stats_ack_timeout) > last_pg_stats_ack) {
5136 dout(1) << __func__ << " mon hasn't acked PGStats in "
5137 << now - last_pg_stats_ack
5138 << " seconds, reconnecting elsewhere" << dendl;
5139 reset = true;
5140 last_pg_stats_ack = now; // reset clock
5141 last_pg_stats_sent = utime_t();
5142 stats_ack_timeout =
5143 MAX(cct->_conf->osd_mon_ack_timeout,
5144 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
5145 outstanding_pg_stats.clear();
5146 }
5147 if (now - last_pg_stats_sent > max) {
5148 osd_stat_updated = true;
5149 report = true;
5150 } else if (service.need_fullness_update()) {
5151 report = true;
5152 } else if ((int)outstanding_pg_stats.size() >=
5153 cct->_conf->osd_mon_report_max_in_flight) {
5154 dout(20) << __func__ << " have max " << outstanding_pg_stats
5155 << " stats updates in flight" << dendl;
5156 } else {
5157 if (now - last_mon_report > adjusted_min) {
5158 dout(20) << __func__ << " stats backoff " << backoff
5159 << " adjusted_min " << adjusted_min << " - sending report"
5160 << dendl;
5161 osd_stat_updated = true;
5162 report = true;
5163 }
5164 }
5165 pg_stat_queue_lock.Unlock();
5166
5167 if (reset) {
5168 monc->reopen_session();
5169 } else if (report) {
5170 last_mon_report = now;
5171
5172 // do any pending reports
5173 send_full_update();
5174 send_failures();
5175 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5176 send_pg_stats(now);
5177 }
5178 }
5179 map_lock.put_read();
5180 }
5181
5182 if (is_active()) {
5183 if (!scrub_random_backoff()) {
5184 sched_scrub();
5185 }
5186 service.promote_throttle_recalibrate();
5187 bool need_send_beacon = false;
5188 const auto now = ceph::coarse_mono_clock::now();
5189 {
5190 // borrow lec lock to protect last_sent_beacon from changing
5191 Mutex::Locker l{min_last_epoch_clean_lock};
5192 const auto elapsed = now - last_sent_beacon;
5193 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5194 cct->_conf->osd_beacon_report_interval) {
5195 need_send_beacon = true;
5196 }
5197 }
5198 if (need_send_beacon) {
5199 send_beacon(now);
5200 }
5201 }
5202
5203 check_ops_in_flight();
5204 service.kick_recovery_queue();
5205 tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
5206 }
5207
5208 void OSD::check_ops_in_flight()
5209 {
5210 vector<string> warnings;
5211 if (op_tracker.check_ops_in_flight(warnings)) {
5212 for (vector<string>::iterator i = warnings.begin();
5213 i != warnings.end();
5214 ++i) {
5215 clog->warn() << *i;
5216 }
5217 }
5218 }
5219
5220 // Usage:
5221 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5222 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5223 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5224 // getomap <pool-id> [namespace/]<obj-name>
5225 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5226 // injectmdataerr [namespace/]<obj-name> [shardid]
5227 // injectdataerr [namespace/]<obj-name> [shardid]
5228 //
5229 // set_recovery_delay [utime]
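// These are reached through the OSD admin socket, e.g. (illustrative):
//   ceph daemon osd.0 setomapval <pool-id> <obj-name> <key> <val>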
5230 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5231 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5232 {
5233 // Test support
5234 // Support changing the omap on a single osd by using the admin socket to
5235 // directly request that the osd make a change.
5236 if (command == "setomapval" || command == "rmomapkey" ||
5237 command == "setomapheader" || command == "getomap" ||
5238 command == "truncobj" || command == "injectmdataerr" ||
5239 command == "injectdataerr"
5240 ) {
5241 pg_t rawpg;
5242 int64_t pool;
5243 OSDMapRef curmap = service->get_osdmap();
5244 int r = -1;
5245
5246 string poolstr;
5247
5248 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5249 pool = curmap->lookup_pg_pool_name(poolstr);
5250 // If we can't find it by name, then maybe an id was specified
5251 if (pool < 0 && isdigit(poolstr[0]))
5252 pool = atoll(poolstr.c_str());
5253 if (pool < 0) {
5254 ss << "Invalid pool '" << poolstr << "''";
5255 return;
5256 }
5257
5258 string objname, nspace;
5259 cmd_getval(service->cct, cmdmap, "objname", objname);
5260 std::size_t found = objname.find_first_of('/');
5261 if (found != string::npos) {
5262 nspace = objname.substr(0, found);
5263 objname = objname.substr(found+1);
5264 }
5265 object_locator_t oloc(pool, nspace);
5266 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5267
5268 if (r < 0) {
5269 ss << "Invalid namespace/objname";
5270 return;
5271 }
5272
5273 int64_t shardid;
5274 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5275 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5276 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5277 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5278 if (curmap->pg_is_ec(rawpg)) {
5279 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5280 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5281 return;
5282 }
5283 }
5284
5285 ObjectStore::Transaction t;
5286
5287 if (command == "setomapval") {
5288 map<string, bufferlist> newattrs;
5289 bufferlist val;
5290 string key, valstr;
5291 cmd_getval(service->cct, cmdmap, "key", key);
5292 cmd_getval(service->cct, cmdmap, "val", valstr);
5293
5294 val.append(valstr);
5295 newattrs[key] = val;
5296 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5297 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5298 if (r < 0)
5299 ss << "error=" << r;
5300 else
5301 ss << "ok";
5302 } else if (command == "rmomapkey") {
5303 string key;
5304 set<string> keys;
5305 cmd_getval(service->cct, cmdmap, "key", key);
5306
5307 keys.insert(key);
5308 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5309 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5310 if (r < 0)
5311 ss << "error=" << r;
5312 else
5313 ss << "ok";
5314 } else if (command == "setomapheader") {
5315 bufferlist newheader;
5316 string headerstr;
5317
5318 cmd_getval(service->cct, cmdmap, "header", headerstr);
5319 newheader.append(headerstr);
5320 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5321 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5322 if (r < 0)
5323 ss << "error=" << r;
5324 else
5325 ss << "ok";
5326 } else if (command == "getomap") {
5327 // Debug: output the entire omap
5328 bufferlist hdrbl;
5329 map<string, bufferlist> keyvals;
5330 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5331 if (r >= 0) {
5332 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5333 for (map<string, bufferlist>::iterator it = keyvals.begin();
5334 it != keyvals.end(); ++it)
5335 ss << " key=" << (*it).first << " val="
5336 << string((*it).second.c_str(), (*it).second.length());
5337 } else {
5338 ss << "error=" << r;
5339 }
5340 } else if (command == "truncobj") {
5341 int64_t trunclen;
5342 cmd_getval(service->cct, cmdmap, "len", trunclen);
5343 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5344 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5345 if (r < 0)
5346 ss << "error=" << r;
5347 else
5348 ss << "ok";
5349 } else if (command == "injectdataerr") {
5350 store->inject_data_error(gobj);
5351 ss << "ok";
5352 } else if (command == "injectmdataerr") {
5353 store->inject_mdata_error(gobj);
5354 ss << "ok";
5355 }
5356 return;
5357 }
5358 if (command == "set_recovery_delay") {
5359 int64_t delay;
5360 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5361 ostringstream oss;
5362 oss << delay;
5363 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5364 oss.str().c_str());
5365 if (r != 0) {
5366 ss << "set_recovery_delay: error setting "
5367 << "osd_recovery_delay_start to '" << delay << "': error "
5368 << r;
5369 return;
5370 }
5371 service->cct->_conf->apply_changes(NULL);
5372 ss << "set_recovery_delay: set osd_recovery_delay_start "
5373 << "to " << service->cct->_conf->osd_recovery_delay_start;
5374 return;
5375 }
5376 if (command == "trigger_scrub") {
5377 spg_t pgid;
5378 OSDMapRef curmap = service->get_osdmap();
5379
5380 string pgidstr;
5381
5382 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5383 if (!pgid.parse(pgidstr.c_str())) {
5384 ss << "Invalid pgid specified";
5385 return;
5386 }
5387
5388 PG *pg = service->osd->_lookup_lock_pg(pgid);
5389 if (pg == nullptr) {
5390 ss << "Can't find pg " << pgid;
5391 return;
5392 }
5393
5394 if (pg->is_primary()) {
5395 pg->unreg_next_scrub();
5396 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5397 double pool_scrub_max_interval = 0;
5398 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5399 double scrub_max_interval = pool_scrub_max_interval > 0 ?
5400 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5401 // Instead of marking must_scrub, force a scheduled scrub by backdating the last scrub stamp
5402 utime_t stamp = ceph_clock_now();
5403 stamp -= scrub_max_interval;
5404 stamp -= 100.0; // push back last scrub more for good measure
5405 pg->info.history.last_scrub_stamp = stamp;
5406 pg->reg_next_scrub();
5407 ss << "ok";
5408 } else {
5409 ss << "Not primary";
5410 }
5411 pg->unlock();
5412 return;
5413 }
5414 if (command == "injectfull") {
5415 int64_t count;
5416 string type;
5417 OSDService::s_names state;
5418 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5419 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5420 if (type == "none" || count == 0) {
5421 type = "none";
5422 count = 0;
5423 }
5424 state = service->get_full_state(type);
5425 if (state == OSDService::s_names::INVALID) {
5426 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5427 return;
5428 }
5429 service->set_injectfull(state, count);
5430 return;
5431 }
5432 ss << "Internal error - command=" << command;
5433 }
5434
5435 // =========================================
5436 bool remove_dir(
5437 CephContext *cct,
5438 ObjectStore *store, SnapMapper *mapper,
5439 OSDriver *osdriver,
5440 ObjectStore::Sequencer *osr,
5441 coll_t coll, DeletingStateRef dstate,
5442 bool *finished,
5443 ThreadPool::TPHandle &handle)
5444 {
5445 vector<ghobject_t> olist;
5446 int64_t num = 0;
5447 ObjectStore::Transaction t;
5448 ghobject_t next;
5449 handle.reset_tp_timeout();
5450 store->collection_list(
5451 coll,
5452 next,
5453 ghobject_t::get_max(),
5454 store->get_ideal_list_max(),
5455 &olist,
5456 &next);
5457 generic_dout(10) << __func__ << " " << olist << dendl;
5458 // Default cont to true; this is safe because the caller (OSD::RemoveWQ::_process())
5459 // will recheck the answer before it really goes on.
5460 bool cont = true;
5461 for (vector<ghobject_t>::iterator i = olist.begin();
5462 i != olist.end();
5463 ++i) {
5464 if (i->is_pgmeta())
5465 continue;
5466 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5467 int r = mapper->remove_oid(i->hobj, &_t);
5468 if (r != 0 && r != -ENOENT) {
5469 ceph_abort();
5470 }
5471 t.remove(coll, *i);
5472 if (++num >= cct->_conf->osd_target_transaction_size) {
5473 C_SaferCond waiter;
5474 store->queue_transaction(osr, std::move(t), &waiter);
5475 cont = dstate->pause_clearing();
5476 handle.suspend_tp_timeout();
5477 waiter.wait();
5478 handle.reset_tp_timeout();
5479 if (cont)
5480 cont = dstate->resume_clearing();
5481 if (!cont)
5482 return false;
5483 t = ObjectStore::Transaction();
5484 num = 0;
5485 }
5486 }
5487 if (num) {
5488 C_SaferCond waiter;
5489 store->queue_transaction(osr, std::move(t), &waiter);
5490 cont = dstate->pause_clearing();
5491 handle.suspend_tp_timeout();
5492 waiter.wait();
5493 handle.reset_tp_timeout();
5494 if (cont)
5495 cont = dstate->resume_clearing();
5496 }
5497 // finished is true when no objects remain to remove in the collection
5498 *finished = next.is_max();
5499 return cont;
5500 }
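// Batching example, assuming the default osd_target_transaction_size of 30:
// a collection holding 100 objects is removed in four queued transactions of
// 30/30/30/10 removes, pausing via dstate->pause_clearing() around each
// commit so the teardown can yield to (or be cancelled by) a competing PG
// recreation.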
5501
5502 void OSD::RemoveWQ::_process(
5503 pair<PGRef, DeletingStateRef> item,
5504 ThreadPool::TPHandle &handle)
5505 {
5506 FUNCTRACE();
5507 PGRef pg(item.first);
5508 SnapMapper &mapper = pg->snap_mapper;
5509 OSDriver &driver = pg->osdriver;
5510 coll_t coll = coll_t(pg->info.pgid);
5511 pg->osr->flush();
5512 bool finished = false;
5513
5514 if (!item.second->start_or_resume_clearing())
5515 return;
5516
5517 bool cont = remove_dir(
5518 pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
5519 &finished, handle);
5520 if (!cont)
5521 return;
5522 if (!finished) {
5523 if (item.second->pause_clearing())
5524 queue_front(item);
5525 return;
5526 }
5527
5528 if (!item.second->start_deleting())
5529 return;
5530
5531 ObjectStore::Transaction t;
5532 PGLog::clear_info_log(pg->info.pgid, &t);
5533
5534 if (cct->_conf->osd_inject_failure_on_pg_removal) {
5535 generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
5536 _exit(1);
5537 }
5538 t.remove_collection(coll);
5539
5540 // We need the sequencer to stick around until the op is complete
5541 store->queue_transaction(
5542 pg->osr.get(),
5543 std::move(t),
5544 0, // onapplied
5545 0, // oncommit
5546 0, // onreadable sync
5547 new ContainerContext<PGRef>(pg),
5548 TrackedOpRef());
5549
5550 item.second->finish_deleting();
5551 }
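// Lifecycle sketch of a PG removal: start_or_resume_clearing() ->
// remove_dir() in batches (requeued via pause_clearing() while objects
// remain) -> start_deleting() -> clear_info_log() + remove_collection() ->
// finish_deleting(). If the PG is resurrected part-way through, one of these
// transitions returns false and the removal is abandoned.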
5552 // =========================================
5553
5554 void OSD::ms_handle_connect(Connection *con)
5555 {
5556 dout(10) << __func__ << " con " << con << dendl;
5557 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
5558 Mutex::Locker l(osd_lock);
5559 if (is_stopping())
5560 return;
5561 dout(10) << __func__ << " on mon" << dendl;
5562
5563 if (is_preboot()) {
5564 start_boot();
5565 } else if (is_booting()) {
5566 _send_boot(); // resend boot message
5567 } else {
5568 map_lock.get_read();
5569 Mutex::Locker l2(mon_report_lock);
5570
5571 utime_t now = ceph_clock_now();
5572 last_mon_report = now;
5573
5574 // resend everything, it's a new session
5575 send_full_update();
5576 send_alive();
5577 service.requeue_pg_temp();
5578 service.send_pg_temp();
5579 requeue_failures();
5580 send_failures();
5581 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5582 send_pg_stats(now);
5583 }
5584
5585 map_lock.put_read();
5586 if (is_active()) {
5587 send_beacon(ceph::coarse_mono_clock::now());
5588 }
5589 }
5590
5591 // full map requests may happen while active or pre-boot
5592 if (requested_full_first) {
5593 rerequest_full_maps();
5594 }
5595 }
5596 }
5597
5598 void OSD::ms_handle_fast_connect(Connection *con)
5599 {
5600 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5601 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5602 Session *s = static_cast<Session*>(con->get_priv());
5603 if (!s) {
5604 s = new Session(cct);
5605 con->set_priv(s->get());
5606 s->con = con;
5607 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5608 << " addr=" << s->con->get_peer_addr() << dendl;
5609 // we don't connect to clients
5610 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5611 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5612 }
5613 s->put();
5614 }
5615 }
5616
5617 void OSD::ms_handle_fast_accept(Connection *con)
5618 {
5619 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5620 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5621 Session *s = static_cast<Session*>(con->get_priv());
5622 if (!s) {
5623 s = new Session(cct);
5624 con->set_priv(s->get());
5625 s->con = con;
5626 dout(10) << "new session (incoming)" << s << " con=" << con
5627 << " addr=" << con->get_peer_addr()
5628 << " must have raced with connect" << dendl;
5629 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5630 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5631 }
5632 s->put();
5633 }
5634 }
5635
5636 bool OSD::ms_handle_reset(Connection *con)
5637 {
5638 Session *session = static_cast<Session*>(con->get_priv());
5639 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5640 if (!session)
5641 return false;
5642 session->wstate.reset(con);
5643 session->con.reset(NULL); // break con <-> session ref cycle
5644 // note that we break session->con *before* the session_handle_reset
5645 // cleanup below. this avoids a race between us and
5646 // PG::add_backoff, Session::check_backoff, etc.
5647 session_handle_reset(session);
5648 session->put();
5649 return true;
5650 }
5651
5652 bool OSD::ms_handle_refused(Connection *con)
5653 {
5654 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5655 return false;
5656
5657 Session *session = static_cast<Session*>(con->get_priv());
5658 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5659 if (!session)
5660 return false;
5661 int type = con->get_peer_type();
5662 // handle only OSD failures here
5663 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5664 OSDMapRef osdmap = get_osdmap();
5665 if (osdmap) {
5666 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5667 if (id >= 0 && osdmap->is_up(id)) {
5668 // We are bypassing the mon's heartbeat grace logic here, because we know the
5669 // peer is not coming back on its own. The +1 keeps us clear of any boundary case.
5670 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5671 osdmap->get_inst(id),
5672 cct->_conf->osd_heartbeat_grace + 1,
5673 osdmap->get_epoch(),
5674 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5675 ));
5676 }
5677 }
5678 }
5679 session->put();
5680 return true;
5681 }
5682
5683 struct C_OSD_GetVersion : public Context {
5684 OSD *osd;
5685 uint64_t oldest, newest;
5686 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5687 void finish(int r) override {
5688 if (r >= 0)
5689 osd->_got_mon_epochs(oldest, newest);
5690 }
5691 };
5692
5693 void OSD::start_boot()
5694 {
5695 if (!_is_healthy()) {
5696 // if we are not healthy, do not mark ourselves up (yet)
5697 dout(1) << "not healthy; waiting to boot" << dendl;
5698 if (!is_waiting_for_healthy())
5699 start_waiting_for_healthy();
5700 // send pings sooner rather than later
5701 heartbeat_kick();
5702 return;
5703 }
5704 dout(1) << __func__ << dendl;
5705 set_state(STATE_PREBOOT);
5706 waiting_for_luminous_mons = false;
5707 dout(10) << "start_boot - have maps " << superblock.oldest_map
5708 << ".." << superblock.newest_map << dendl;
5709 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5710 monc->get_version("osdmap", &c->newest, &c->oldest, c);
5711 }
5712
5713 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5714 {
5715 Mutex::Locker l(osd_lock);
5716 if (is_preboot()) {
5717 _preboot(oldest, newest);
5718 }
5719 }
5720
5721 void OSD::_preboot(epoch_t oldest, epoch_t newest)
5722 {
5723 assert(is_preboot());
5724 dout(10) << __func__ << " mon has osdmaps "
5725 << oldest << ".." << newest << dendl;
5726
5727 // ensure our local fullness awareness is accurate
5728 heartbeat();
5729
5730 // if our map is within recent history, try to add ourselves to the osdmap.
5731 if (osdmap->get_epoch() == 0) {
5732 derr << "waiting for initial osdmap" << dendl;
5733 } else if (osdmap->is_destroyed(whoami)) {
5734 derr << "osdmap says I am destroyed, exiting" << dendl;
5735 exit(0);
5736 } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
5737 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5738 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5739 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5740 << dendl;
5741 } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
5742 derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
5743 << dendl;
5744 } else if (!monc->monmap.get_required_features().contains_all(
5745 ceph::features::mon::FEATURE_LUMINOUS)) {
5746 derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5747 << "Luminous or later before Luminous OSDs will boot" << dendl;
5748 waiting_for_luminous_mons = true;
5749 } else if (service.need_fullness_update()) {
5750 derr << "osdmap fullness state needs update" << dendl;
5751 send_full_update();
5752 } else if (osdmap->get_epoch() >= oldest - 1 &&
5753 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
5754 _send_boot();
5755 return;
5756 }
5757
5758 // get all the latest maps
5759 if (osdmap->get_epoch() + 1 >= oldest)
5760 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5761 else
5762 osdmap_subscribe(oldest - 1, true);
5763 }
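// Worked example (epoch numbers hypothetical): with our local map at epoch
// 100 and the mon holding 90..200, 100+1 >= 90, so we subscribe starting at
// epoch 101 and catch up via incrementals. At local epoch 50, 50+1 < 90, so
// we subscribe at 89 with force and the mon starts us off with a full map
// instead of incrementals it no longer has.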
5764
5765 void OSD::send_full_update()
5766 {
5767 if (!service.need_fullness_update())
5768 return;
5769 unsigned state = 0;
5770 if (service.is_full()) {
5771 state = CEPH_OSD_FULL;
5772 } else if (service.is_backfillfull()) {
5773 state = CEPH_OSD_BACKFILLFULL;
5774 } else if (service.is_nearfull()) {
5775 state = CEPH_OSD_NEARFULL;
5776 }
5777 set<string> s;
5778 OSDMap::calc_state_set(state, s);
5779 dout(10) << __func__ << " want state " << s << dendl;
5780 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
5781 }
5782
5783 void OSD::start_waiting_for_healthy()
5784 {
5785 dout(1) << "start_waiting_for_healthy" << dendl;
5786 set_state(STATE_WAITING_FOR_HEALTHY);
5787 last_heartbeat_resample = utime_t();
5788 }
5789
5790 bool OSD::_is_healthy()
5791 {
5792 if (!cct->get_heartbeat_map()->is_healthy()) {
5793 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
5794 return false;
5795 }
5796
5797 if (is_waiting_for_healthy()) {
5798 Mutex::Locker l(heartbeat_lock);
5799 utime_t cutoff = ceph_clock_now();
5800 cutoff -= cct->_conf->osd_heartbeat_grace;
5801 int num = 0, up = 0;
5802 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5803 p != heartbeat_peers.end();
5804 ++p) {
5805 if (p->second.is_healthy(cutoff))
5806 ++up;
5807 ++num;
5808 }
5809 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
5810 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
5811 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
5812 return false;
5813 }
5814 }
5815
5816 return true;
5817 }
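// Threshold arithmetic for illustration, assuming the default
// osd_heartbeat_min_healthy_ratio of 0.33: with 12 heartbeat peers the test
// is up < 12 * 0.33 = 3.96, so at least 4 peers must have responded within
// the grace window; with only 3 healthy peers the OSD keeps waiting to boot.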
5818
5819 void OSD::_send_boot()
5820 {
5821 dout(10) << "_send_boot" << dendl;
5822 entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
5823 Connection *local_connection = cluster_messenger->get_loopback_connection().get();
5824 if (cluster_addr.is_blank_ip()) {
5825 int port = cluster_addr.get_port();
5826 cluster_addr = client_messenger->get_myaddr();
5827 cluster_addr.set_port(port);
5828 cluster_messenger->set_addr_unknowns(cluster_addr);
5829 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
5830 } else {
5831 Session *s = static_cast<Session*>(local_connection->get_priv());
5832 if (s)
5833 s->put();
5834 else
5835 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
5836 }
5837
5838 entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
5839 local_connection = hb_back_server_messenger->get_loopback_connection().get();
5840 if (hb_back_addr.is_blank_ip()) {
5841 int port = hb_back_addr.get_port();
5842 hb_back_addr = cluster_addr;
5843 hb_back_addr.set_port(port);
5844 hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
5845 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
5846 } else {
5847 Session *s = static_cast<Session*>(local_connection->get_priv());
5848 if (s)
5849 s->put();
5850 else
5851 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5852 }
5853
5854 entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
5855 local_connection = hb_front_server_messenger->get_loopback_connection().get();
5856 if (hb_front_addr.is_blank_ip()) {
5857 int port = hb_front_addr.get_port();
5858 hb_front_addr = client_messenger->get_myaddr();
5859 hb_front_addr.set_port(port);
5860 hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
5861 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
5862 } else {
5863 Session *s = static_cast<Session*>(local_connection->get_priv());
5864 if (s)
5865 s->put();
5866 else
5867 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5868 }
5869
5870 MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
5871 hb_back_addr, hb_front_addr, cluster_addr,
5872 CEPH_FEATURES_ALL);
5873 dout(10) << " client_addr " << client_messenger->get_myaddr()
5874 << ", cluster_addr " << cluster_addr
5875 << ", hb_back_addr " << hb_back_addr
5876 << ", hb_front_addr " << hb_front_addr
5877 << dendl;
5878 _collect_metadata(&mboot->metadata);
5879 monc->send_mon_message(mboot);
5880 set_state(STATE_BOOTING);
5881 }
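// Address fallback illustration (addresses hypothetical): if the cluster
// messenger bound to the blank address 0.0.0.0:6802 while the client
// messenger bound to 10.0.0.5:6800, the boot message advertises cluster_addr
// 10.0.0.5:6802 -- the client IP with the cluster port -- and blank heartbeat
// addresses are filled in the same way from cluster_addr/client_addr.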
5882
5883 void OSD::_collect_metadata(map<string,string> *pm)
5884 {
5885 // config info
5886 (*pm)["osd_data"] = dev_path;
5887 if (store->get_type() == "filestore") {
5888 // not applicable for bluestore
5889 (*pm)["osd_journal"] = journal_path;
5890 }
5891 (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
5892 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
5893 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
5894 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());
5895
5896 // backend
5897 (*pm)["osd_objectstore"] = store->get_type();
5898 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
5899 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
5900 (*pm)["default_device_class"] = store->get_default_device_class();
5901 store->collect_metadata(pm);
5902
5903 collect_sys_info(pm, cct);
5904
5905 std::string front_iface, back_iface;
5906 /*
5907 pick_iface(cct,
5908 CEPH_PICK_ADDRESS_PUBLIC | CEPH_PICK_ADDRESS_CLUSTER,
5909 &front_iface, &back_iface);
5910 */
5911 (*pm)["front_iface"] = pick_iface(cct,
5912 client_messenger->get_myaddr().get_sockaddr_storage());
5913 (*pm)["back_iface"] = pick_iface(cct,
5914 cluster_messenger->get_myaddr().get_sockaddr_storage());
5915
5916 dout(10) << __func__ << " " << *pm << dendl;
5917 }
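// Sketch of the resulting metadata map (values purely illustrative):
// {"osd_data": "/var/lib/ceph/osd/ceph-0", "osd_objectstore": "bluestore",
// "rotational": "0", "front_addr": "10.0.0.5:6800/1234", ...}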
5918
5919 void OSD::queue_want_up_thru(epoch_t want)
5920 {
5921 map_lock.get_read();
5922 epoch_t cur = osdmap->get_up_thru(whoami);
5923 Mutex::Locker l(mon_report_lock);
5924 if (want > up_thru_wanted) {
5925 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
5926 << ", currently " << cur
5927 << dendl;
5928 up_thru_wanted = want;
5929 send_alive();
5930 } else {
5931 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
5932 << ", currently " << cur
5933 << dendl;
5934 }
5935 map_lock.put_read();
5936 }
5937
5938 void OSD::send_alive()
5939 {
5940 assert(mon_report_lock.is_locked());
5941 if (!osdmap->exists(whoami))
5942 return;
5943 epoch_t up_thru = osdmap->get_up_thru(whoami);
5944 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
5945 if (up_thru_wanted > up_thru) {
5946 dout(10) << "send_alive want " << up_thru_wanted << dendl;
5947 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
5948 }
5949 }
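// For example (epochs hypothetical): if peering needs our up_thru to reach
// epoch 120 but the map still records 115, queue_want_up_thru(120) stores the
// want and send_alive() sends an MOSDAlive so the mon records a newer up_thru
// for us in a subsequent map; if the map already shows >= 120, nothing is sent.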
5950
5951 void OSD::request_full_map(epoch_t first, epoch_t last)
5952 {
5953 dout(10) << __func__ << " " << first << ".." << last
5954 << ", previously requested "
5955 << requested_full_first << ".." << requested_full_last << dendl;
5956 assert(osd_lock.is_locked());
5957 assert(first > 0 && last > 0);
5958 assert(first <= last);
5959 assert(first >= requested_full_first); // we shouldn't ever ask for older maps
5960 if (requested_full_first == 0) {
5961 // first request
5962 requested_full_first = first;
5963 requested_full_last = last;
5964 } else if (last <= requested_full_last) {
5965 // dup
5966 return;
5967 } else {
5968 // additional request
5969 first = requested_full_last + 1;
5970 requested_full_last = last;
5971 }
5972 MMonGetOSDMap *req = new MMonGetOSDMap;
5973 req->request_full(first, last);
5974 monc->send_mon_message(req);
5975 }
5976
5977 void OSD::got_full_map(epoch_t e)
5978 {
5979 assert(requested_full_first <= requested_full_last);
5980 assert(osd_lock.is_locked());
5981 if (requested_full_first == 0) {
5982 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
5983 return;
5984 }
5985 if (e < requested_full_first) {
5986 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5987 << ".." << requested_full_last
5988 << ", ignoring" << dendl;
5989 return;
5990 }
5991 if (e >= requested_full_last) {
5992 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5993 << ".." << requested_full_last << ", resetting" << dendl;
5994 requested_full_first = requested_full_last = 0;
5995 return;
5996 }
5997
5998 requested_full_first = e + 1;
5999
6000 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6001 << ".." << requested_full_last
6002 << ", still need more" << dendl;
6003 }
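// Window example (epochs hypothetical): after request_full_map(10, 20) the
// window is 10..20. got_full_map(14) advances it to 15..20; got_full_map(20)
// (>= last) resets it to 0..0, i.e. nothing further outstanding.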
6004
6005 void OSD::requeue_failures()
6006 {
6007 Mutex::Locker l(heartbeat_lock);
6008 unsigned old_queue = failure_queue.size();
6009 unsigned old_pending = failure_pending.size();
6010 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
6011 failure_pending.begin();
6012 p != failure_pending.end(); ) {
6013 failure_queue[p->first] = p->second.first;
6014 failure_pending.erase(p++);
6015 }
6016 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6017 << failure_queue.size() << dendl;
6018 }
6019
6020 void OSD::send_failures()
6021 {
6022 assert(map_lock.is_locked());
6023 assert(mon_report_lock.is_locked());
6024 Mutex::Locker l(heartbeat_lock);
6025 utime_t now = ceph_clock_now();
6026 while (!failure_queue.empty()) {
6027 int osd = failure_queue.begin()->first;
6028 if (!failure_pending.count(osd)) {
6029 entity_inst_t i = osdmap->get_inst(osd);
6030 int failed_for = (int)(double)(now - failure_queue.begin()->second); // whole seconds since the failure was first reported
6031 monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
6032 osdmap->get_epoch()));
6033 failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
6034 }
6035 failure_queue.erase(osd);
6036 }
6037 }
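// Flow note: a report moves from failure_queue to failure_pending once sent;
// requeue_failures() moves it back when the mon session resets so the new mon
// hears about it, and send_still_alive() retracts it if the peer turns out to
// be alive.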
6038
6039 void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
6040 {
6041 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
6042 monc->send_mon_message(m);
6043 }
6044
6045 void OSD::send_pg_stats(const utime_t &now)
6046 {
6047 assert(map_lock.is_locked());
6048 assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
6049 dout(20) << "send_pg_stats" << dendl;
6050
6051 osd_stat_t cur_stat = service.get_osd_stat();
6052
6053 cur_stat.os_perf_stat = store->get_cur_stats();
6054
6055 pg_stat_queue_lock.Lock();
6056
6057 if (osd_stat_updated || !pg_stat_queue.empty()) {
6058 last_pg_stats_sent = now;
6059 osd_stat_updated = false;
6060
6061 dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;
6062
6063 utime_t had_for(now);
6064 had_for -= had_map_since;
6065
6066 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
6067
6068 uint64_t tid = ++pg_stat_tid;
6069 m->set_tid(tid);
6070 m->osd_stat = cur_stat;
6071
6072 xlist<PG*>::iterator p = pg_stat_queue.begin();
6073 while (!p.end()) {
6074 PG *pg = *p;
6075 ++p;
6076 if (!pg->is_primary()) { // we hold map_lock; role is stable.
6077 pg->stat_queue_item.remove_myself();
6078 pg->put("pg_stat_queue");
6079 continue;
6080 }
6081 pg->pg_stats_publish_lock.Lock();
6082 if (pg->pg_stats_publish_valid) {
6083 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
6084 dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6085 << pg->pg_stats_publish.reported_seq << dendl;
6086 } else {
6087 dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6088 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
6089 }
6090 pg->pg_stats_publish_lock.Unlock();
6091 }
6092
6093 if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
6094 last_pg_stats_ack = ceph_clock_now();
6095 }
6096 outstanding_pg_stats.insert(tid);
6097 dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;
6098
6099 monc->send_mon_message(m);
6100 }
6101
6102 pg_stat_queue_lock.Unlock();
6103 }
6104
6105 void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
6106 {
6107 dout(10) << "handle_pg_stats_ack " << dendl;
6108
6109 if (!require_mon_peer(ack)) {
6110 ack->put();
6111 return;
6112 }
6113
6114 // NOTE: we may get replies from a previous mon even while
6115 // outstanding_pg_stats is empty if reconnecting races with replies
6116 // in flight.
6117
6118 pg_stat_queue_lock.Lock();
6119
6120 last_pg_stats_ack = ceph_clock_now();
6121
6122 // decay timeout slowly (analogous to TCP)
6123 stats_ack_timeout =
6124 MAX(cct->_conf->osd_mon_ack_timeout,
6125 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
6126 dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
6127
6128 if (ack->get_tid() > pg_stat_tid_flushed) {
6129 pg_stat_tid_flushed = ack->get_tid();
6130 pg_stat_queue_cond.Signal();
6131 }
6132
6133 xlist<PG*>::iterator p = pg_stat_queue.begin();
6134 while (!p.end()) {
6135 PG *pg = *p;
6136 PGRef _pg(pg);
6137 ++p;
6138
6139 auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
6140 if (acked != ack->pg_stat.end()) {
6141 pg->pg_stats_publish_lock.Lock();
6142 if (acked->second.first == pg->pg_stats_publish.reported_seq &&
6143 acked->second.second == pg->pg_stats_publish.reported_epoch) {
6144 dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6145 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6146 pg->stat_queue_item.remove_myself();
6147 pg->put("pg_stat_queue");
6148 } else {
6149 dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6150 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
6151 << acked->second << dendl;
6152 }
6153 pg->pg_stats_publish_lock.Unlock();
6154 } else {
6155 dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6156 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6157 }
6158 }
6159
6160 outstanding_pg_stats.erase(ack->get_tid());
6161 dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;
6162
6163 pg_stat_queue_lock.Unlock();
6164
6165 ack->put();
6166 }
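// Matching illustration (numbers hypothetical): if a PG last published
// reported_epoch:reported_seq = 120:45 and the ack carries (45, 120) for that
// pgid, the PG drops off pg_stat_queue; an older ack such as (44, 120) leaves
// it queued until a later ack covers the most recent publish.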
6167
6168 void OSD::flush_pg_stats()
6169 {
6170 dout(10) << "flush_pg_stats" << dendl;
6171 osd_lock.Unlock();
6172 utime_t now = ceph_clock_now();
6173 map_lock.get_read();
6174 mon_report_lock.Lock();
6175 send_pg_stats(now);
6176 mon_report_lock.Unlock();
6177 map_lock.put_read();
6178
6179
6180 pg_stat_queue_lock.Lock();
6181 uint64_t tid = pg_stat_tid;
6182 dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
6183 while (tid > pg_stat_tid_flushed)
6184 pg_stat_queue_cond.Wait(pg_stat_queue_lock);
6185 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
6186 pg_stat_queue_lock.Unlock();
6187
6188 osd_lock.Lock();
6189 }
6190
6191 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6192 {
6193 const auto& monmap = monc->monmap;
6194 // We may be called right after connecting to the mon, before the monmap has
6195 // been initialized; only send the beacon once we know the mons support it.
6196 if (monmap.epoch > 0 &&
6197 monmap.get_required_features().contains_all(
6198 ceph::features::mon::FEATURE_LUMINOUS)) {
6199 dout(20) << __func__ << " sending" << dendl;
6200 MOSDBeacon* beacon = nullptr;
6201 {
6202 Mutex::Locker l{min_last_epoch_clean_lock};
6203 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6204 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
6205 last_sent_beacon = now;
6206 }
6207 monc->send_mon_message(beacon);
6208 } else {
6209 dout(20) << __func__ << " not sending" << dendl;
6210 }
6211 }
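// Timing illustration, assuming the default osd_beacon_report_interval of
// 300: tick_without_osd_lock() compares the time since last_sent_beacon
// against that interval, so an active OSD reports its min_last_epoch_clean
// roughly every five minutes; pre-luminous mons are never sent beacons.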
6212
6213 void OSD::handle_command(MMonCommand *m)
6214 {
6215 if (!require_mon_peer(m)) {
6216 m->put();
6217 return;
6218 }
6219
6220 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6221 command_wq.queue(c);
6222 m->put();
6223 }
6224
6225 void OSD::handle_command(MCommand *m)
6226 {
6227 ConnectionRef con = m->get_connection();
6228 Session *session = static_cast<Session *>(con->get_priv());
6229 if (!session) {
6230 con->send_message(new MCommandReply(m, -EPERM));
6231 m->put();
6232 return;
6233 }
6234
6235 OSDCap& caps = session->caps;
6236 session->put();
6237
6238 if (!caps.allow_all() || m->get_source().is_mon()) {
6239 con->send_message(new MCommandReply(m, -EPERM));
6240 m->put();
6241 return;
6242 }
6243
6244 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6245 command_wq.queue(c);
6246
6247 m->put();
6248 }
6249
6250 struct OSDCommand {
6251 string cmdstring;
6252 string helpstring;
6253 string module;
6254 string perm;
6255 string availability;
6256 } osd_commands[] = {
6257
6258 #define COMMAND(parsesig, helptext, module, perm, availability) \
6259 {parsesig, helptext, module, perm, availability},
6260
6261 // yes, these are really pg commands, but there's a limit to how
6262 // much work it's worth. The OSD returns all of them. Make this
6263 // form (pg <pgid> <cmd>) valid only for the CLI;
6264 // the REST interface uses "tell <pgid> <cmd>".
6265
6266 COMMAND("pg " \
6267 "name=pgid,type=CephPgid " \
6268 "name=cmd,type=CephChoices,strings=query", \
6269 "show details of a specific pg", "osd", "r", "cli")
6270 COMMAND("pg " \
6271 "name=pgid,type=CephPgid " \
6272 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6273 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6274 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6275 "osd", "rw", "cli")
6276 COMMAND("pg " \
6277 "name=pgid,type=CephPgid " \
6278 "name=cmd,type=CephChoices,strings=list_missing " \
6279 "name=offset,type=CephString,req=false",
6280 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6281 "osd", "r", "cli")
6282
6283 // new form: tell <pgid> <cmd> for both cli and rest
6284
6285 COMMAND("query",
6286 "show details of a specific pg", "osd", "r", "cli,rest")
6287 COMMAND("mark_unfound_lost " \
6288 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6289 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6290 "osd", "rw", "cli,rest")
6291 COMMAND("list_missing " \
6292 "name=offset,type=CephString,req=false",
6293 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6294 "osd", "r", "cli,rest")
6295 COMMAND("perf histogram dump "
6296 "name=logger,type=CephString,req=false "
6297 "name=counter,type=CephString,req=false",
6298 "Get histogram data",
6299 "osd", "r", "cli,rest")
6300
6301 // tell <osd.n> commands. Validation of osd.n must be special-cased in client
6302 COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
6303 COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
6304 COMMAND("injectargs " \
6305 "name=injected_args,type=CephString,n=N",
6306 "inject configuration arguments into running OSD",
6307 "osd", "rw", "cli,rest")
6308 COMMAND("config set " \
6309 "name=key,type=CephString name=value,type=CephString",
6310 "Set a configuration option at runtime (not persistent)",
6311 "osd", "rw", "cli,rest")
6312 COMMAND("cluster_log " \
6313 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6314 "name=message,type=CephString,n=N",
6315 "log a message to the cluster log",
6316 "osd", "rw", "cli,rest")
6317 COMMAND("bench " \
6318 "name=count,type=CephInt,req=false " \
6319 "name=size,type=CephInt,req=false " \
6320 "name=object_size,type=CephInt,req=false " \
6321 "name=object_num,type=CephInt,req=false ", \
6322 "OSD benchmark: write <count> <size>-byte objects, " \
6323 "(default 1G size 4MB). Results in log.",
6324 "osd", "rw", "cli,rest")
6325 COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
6326 COMMAND("heap " \
6327 "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
6328 "show heap usage info (available only if compiled with tcmalloc)", \
6329 "osd", "rw", "cli,rest")
6330 COMMAND("debug dump_missing " \
6331 "name=filename,type=CephFilepath",
6332 "dump missing objects to a named file", "osd", "r", "cli,rest")
6333 COMMAND("debug kick_recovery_wq " \
6334 "name=delay,type=CephInt,range=0",
6335 "set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
6336 COMMAND("cpu_profiler " \
6337 "name=arg,type=CephChoices,strings=status|flush",
6338 "run cpu profiling on daemon", "osd", "rw", "cli,rest")
6339 COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6340 "osd", "r", "cli,rest")
6341 COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6342 "osd", "rw", "cli,rest")
6343 COMMAND("compact",
6344 "compact object store's omap. "
6345 "WARNING: Compaction probably slows your requests",
6346 "osd", "rw", "cli,rest")
6347 };
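// Usage illustration (daemon id and values hypothetical) -- these descriptions
// back commands issued as, e.g.:
// ceph tell osd.0 version
// ceph tell osd.0 injectargs '--debug_osd 20'
// ceph tell osd.0 bench 1073741824 4194304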
6348
6349 void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6350 {
6351 int r = 0;
6352 stringstream ss, ds;
6353 string rs;
6354 bufferlist odata;
6355
6356 dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6357
6358 map<string, cmd_vartype> cmdmap;
6359 string prefix;
6360 string format;
6361 string pgidstr;
6362 boost::scoped_ptr<Formatter> f;
6363
6364 if (cmd.empty()) {
6365 ss << "no command given";
6366 goto out;
6367 }
6368
6369 if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6370 r = -EINVAL;
6371 goto out;
6372 }
6373
6374 cmd_getval(cct, cmdmap, "prefix", prefix);
6375
6376 if (prefix == "get_command_descriptions") {
6377 int cmdnum = 0;
6378 JSONFormatter *f = new JSONFormatter();
6379 f->open_object_section("command_descriptions");
6380 for (OSDCommand *cp = osd_commands;
6381 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {
6382
6383 ostringstream secname;
6384 secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
6385 dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
6386 cp->module, cp->perm, cp->availability, 0);
6387 cmdnum++;
6388 }
6389 f->close_section(); // command_descriptions
6390
6391 f->flush(ds);
6392 delete f;
6393 goto out;
6394 }
6395
6396 cmd_getval(cct, cmdmap, "format", format);
6397 f.reset(Formatter::create(format));
6398
6399 if (prefix == "version") {
6400 if (f) {
6401 f->open_object_section("version");
6402 f->dump_string("version", pretty_version_to_str());
6403 f->close_section();
6404 f->flush(ds);
6405 } else {
6406 ds << pretty_version_to_str();
6407 }
6408 goto out;
6409 }
6410 else if (prefix == "injectargs") {
6411 vector<string> argsvec;
6412 cmd_getval(cct, cmdmap, "injected_args", argsvec);
6413
6414 if (argsvec.empty()) {
6415 r = -EINVAL;
6416 ss << "ignoring empty injectargs";
6417 goto out;
6418 }
6419 string args = argsvec.front();
6420 for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
6421 args += " " + *a;
6422 osd_lock.Unlock();
6423 r = cct->_conf->injectargs(args, &ss);
6424 osd_lock.Lock();
6425 }
6426 else if (prefix == "config set") {
6427 std::string key;
6428 std::string val;
6429 cmd_getval(cct, cmdmap, "key", key);
6430 cmd_getval(cct, cmdmap, "value", val);
6431 osd_lock.Unlock();
6432 r = cct->_conf->set_val(key, val, true, &ss);
6433 if (r == 0) {
6434 cct->_conf->apply_changes(nullptr);
6435 }
6436 osd_lock.Lock();
6437 }
6438 else if (prefix == "cluster_log") {
6439 vector<string> msg;
6440 cmd_getval(cct, cmdmap, "message", msg);
6441 if (msg.empty()) {
6442 r = -EINVAL;
6443 ss << "ignoring empty log message";
6444 goto out;
6445 }
6446 string message = msg.front();
6447 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
6448 message += " " + *a;
6449 string lvl;
6450 cmd_getval(cct, cmdmap, "level", lvl);
6451 clog_type level = string_to_clog_type(lvl);
6452 if (level < 0) {
6453 r = -EINVAL;
6454 ss << "unknown level '" << lvl << "'";
6455 goto out;
6456 }
6457 clog->do_log(level, message);
6458 }
6459
6460 // either 'pg <pgid> <command>' or
6461 // 'tell <pgid>' (which comes in without any of that prefix)?
6462
6463 else if (prefix == "pg" ||
6464 prefix == "query" ||
6465 prefix == "mark_unfound_lost" ||
6466 prefix == "list_missing"
6467 ) {
6468 pg_t pgid;
6469
6470 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
6471 ss << "no pgid specified";
6472 r = -EINVAL;
6473 } else if (!pgid.parse(pgidstr.c_str())) {
6474 ss << "couldn't parse pgid '" << pgidstr << "'";
6475 r = -EINVAL;
6476 } else {
6477 spg_t pcand;
6478 PG *pg = nullptr;
6479 if (osdmap->get_primary_shard(pgid, &pcand) &&
6480 (pg = _lookup_lock_pg(pcand))) {
6481 if (pg->is_primary()) {
6482 // simulate pg <pgid> cmd= for pg->do-command
6483 if (prefix != "pg")
6484 cmd_putval(cct, cmdmap, "cmd", prefix);
6485 r = pg->do_command(cmdmap, ss, data, odata, con, tid);
6486 if (r == -EAGAIN) {
6487 pg->unlock();
6488 // don't reply, pg will do so async
6489 return;
6490 }
6491 } else {
6492 ss << "not primary for pgid " << pgid;
6493
6494 // send them the latest diff to ensure they realize the mapping
6495 // has changed.
6496 service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);
6497
6498 // do not reply; they will get newer maps and realize they
6499 // need to resend.
6500 pg->unlock();
6501 return;
6502 }
6503 pg->unlock();
6504 } else {
6505 ss << "i don't have pgid " << pgid;
6506 r = -ENOENT;
6507 }
6508 }
6509 }
6510
6511 else if (prefix == "bench") {
6512 int64_t count;
6513 int64_t bsize;
6514 int64_t osize, onum;
6515 // default count 1G, size 4MB
6516 cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
6517 cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
6518 cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
6519 cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
6520
6521 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
6522 ObjectStore::Sequencer>("bench"));
6523
6524 uint32_t duration = cct->_conf->osd_bench_duration;
6525
6526 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
6527 // Limit the block size, because the checks below rely on it having a
6528 // sane value. If we allowed an arbitrary block size to be set, things
6529 // could still go sideways.
6530 ss << "block 'size' values are capped at "
6531 << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
6532 << " a higher value, please adjust 'osd_bench_max_block_size'";
6533 r = -EINVAL;
6534 goto out;
6535 } else if (bsize < (int64_t) (1 << 20)) {
6536 // Entering the realm of small block sizes: limit the count to a sane
6537 // value, assuming a configurable amount of IOPS and a fixed duration,
6538 // so that the OSD doesn't get hung up on the benchmark and internal
6539 // timeouts don't fire.
6540 int64_t max_count =
6541 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
6542 if (count > max_count) {
6543 ss << "'count' values greater than " << max_count
6544 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6545 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
6546 << " for " << duration << " seconds,"
6547 << " can cause ill effects on osd. "
6548 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6549 << " value if you wish to use a higher 'count'.";
6550 r = -EINVAL;
6551 goto out;
6552 }
6553 } else {
6554 // Block sizes of 1MB and up are big enough that we get more work done per op.
6555 // However, to keep the OSD from getting hung up on this and having
6556 // timers triggered, we are going to limit the count assuming
6557 // a configurable throughput and duration.
6558 // NOTE: max_count is the total amount of bytes that we believe we
6559 // will be able to write during 'duration' for the given
6560 // throughput. The block size hardly impacts this unless it's
6561 // way too big. Given we already check how big the block size
6562 // is, it's safe to assume everything will check out.
6563 int64_t max_count =
6564 cct->_conf->osd_bench_large_size_max_throughput * duration;
6565 if (count > max_count) {
6566 ss << "'count' values greater than " << max_count
6567 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6568 << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
6569 << " for " << duration << " seconds,"
6570 << " can cause ill effects on osd. "
6571 << " Please adjust 'osd_bench_large_size_max_throughput'"
6572 << " with a higher value if you wish to use a higher 'count'.";
6573 r = -EINVAL;
6574 goto out;
6575 }
6576 }
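// Cap arithmetic for illustration, assuming the defaults osd_bench_duration =
// 30 s, osd_bench_small_size_max_iops = 100, and
// osd_bench_large_size_max_throughput = 100 MB/s: with bsize = 4 KB the small
// branch caps count at 4096 * 30 * 100 ~= 11.7 MB, while any bsize >= 1 MB is
// capped by the large branch at 100 MB/s * 30 s = 3000 MB.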
6577
6578 if (osize && bsize > osize)
6579 bsize = osize;
6580
6581 dout(1) << " bench count " << count
6582 << " bsize " << prettybyte_t(bsize) << dendl;
6583
6584 ObjectStore::Transaction cleanupt;
6585
6586 if (osize && onum) {
6587 bufferlist bl;
6588 bufferptr bp(osize);
6589 bp.zero();
6590 bl.push_back(std::move(bp));
6591 bl.rebuild_page_aligned();
6592 for (int i=0; i<onum; ++i) {
6593 char nm[30];
6594 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
6595 object_t oid(nm);
6596 hobject_t soid(sobject_t(oid, 0));
6597 ObjectStore::Transaction t;
6598 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
6599 store->queue_transaction(osr.get(), std::move(t), NULL);
6600 cleanupt.remove(coll_t(), ghobject_t(soid));
6601 }
6602 }
6603
6604 bufferlist bl;
6605 bufferptr bp(bsize);
6606 bp.zero();
6607 bl.push_back(std::move(bp));
6608 bl.rebuild_page_aligned();
6609
6610 {
6611 C_SaferCond waiter;
6612 if (!osr->flush_commit(&waiter)) {
6613 waiter.wait();
6614 }
6615 }
6616
6617 utime_t start = ceph_clock_now();
6618 for (int64_t pos = 0; pos < count; pos += bsize) {
6619 char nm[30];
6620 unsigned offset = 0;
6621 if (onum && osize) {
6622 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
6623 offset = rand() % (osize / bsize) * bsize;
6624 } else {
6625 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
6626 }
6627 object_t oid(nm);
6628 hobject_t soid(sobject_t(oid, 0));
6629 ObjectStore::Transaction t;
6630 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
6631 store->queue_transaction(osr.get(), std::move(t), NULL);
6632 if (!onum || !osize)
6633 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
6634 }
6635
6636 {
6637 C_SaferCond waiter;
6638 if (!osr->flush_commit(&waiter)) {
6639 waiter.wait();
6640 }
6641 }
6642 utime_t end = ceph_clock_now();
6643
6644 // clean up
6645 store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
6646 {
6647 C_SaferCond waiter;
6648 if (!osr->flush_commit(&waiter)) {
6649 waiter.wait();
6650 }
6651 }
6652
6653 uint64_t rate = (double)count / (end - start);
6654 if (f) {
6655 f->open_object_section("osd_bench_results");
6656 f->dump_int("bytes_written", count);
6657 f->dump_int("blocksize", bsize);
6658 f->dump_unsigned("bytes_per_sec", rate);
6659 f->close_section();
6660 f->flush(ss);
6661 } else {
6662 ss << "bench: wrote " << prettybyte_t(count)
6663 << " in blocks of " << prettybyte_t(bsize) << " in "
6664 << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
6665 }
6666 }
6667
6668 else if (prefix == "flush_pg_stats") {
6669 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6670 mgrc.send_pgstats();
6671 ds << service.get_osd_stat_seq() << "\n";
6672 } else {
6673 flush_pg_stats();
6674 }
6675 }
6676
6677 else if (prefix == "heap") {
6678 r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
6679 }
6680
6681 else if (prefix == "debug dump_missing") {
6682 string file_name;
6683 cmd_getval(cct, cmdmap, "filename", file_name);
6684 std::ofstream fout(file_name.c_str());
6685 if (!fout.is_open()) {
6686 ss << "failed to open file '" << file_name << "'";
6687 r = -EINVAL;
6688 goto out;
6689 }
6690
6691 fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
6692 RWLock::RLocker l(pg_map_lock);
6693 for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
6694 pg_map_e != pg_map.end(); ++pg_map_e) {
6695 PG *pg = pg_map_e->second;
6696 pg->lock();
6697
6698 fout << *pg << std::endl;
6699 std::map<hobject_t, pg_missing_item>::const_iterator mend =
6700 pg->pg_log.get_missing().get_items().end();
6701 std::map<hobject_t, pg_missing_item>::const_iterator mi =
6702 pg->pg_log.get_missing().get_items().begin();
6703 for (; mi != mend; ++mi) {
6704 fout << mi->first << " -> " << mi->second << std::endl;
6705 if (!pg->missing_loc.needs_recovery(mi->first))
6706 continue;
6707 if (pg->missing_loc.is_unfound(mi->first))
6708 fout << " unfound ";
6709 const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
6710 if (mls.empty())
6711 continue;
6712 fout << "missing_loc: " << mls << std::endl;
6713 }
6714 pg->unlock();
6715 fout << std::endl;
6716 }
6717
6718 fout.close();
6719 }
6720 else if (prefix == "debug kick_recovery_wq") {
6721 int64_t delay;
6722 cmd_getval(cct, cmdmap, "delay", delay);
6723 ostringstream oss;
6724 oss << delay;
6725 r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
6726 if (r != 0) {
6727 ss << "kick_recovery_wq: error setting "
6728 << "osd_recovery_delay_start to '" << delay << "': error "
6729 << r;
6730 goto out;
6731 }
6732 cct->_conf->apply_changes(NULL);
6733 ss << "kicking recovery queue. set osd_recovery_delay_start "
6734 << "to " << cct->_conf->osd_recovery_delay_start;
6735 }
6736
6737 else if (prefix == "cpu_profiler") {
6738 string arg;
6739 cmd_getval(cct, cmdmap, "arg", arg);
6740 vector<string> argvec;
6741 get_str_vec(arg, argvec);
6742 cpu_profiler_handle_command(argvec, ds);
6743 }
6744
6745 else if (prefix == "dump_pg_recovery_stats") {
6746 stringstream s;
6747 if (f) {
6748 pg_recovery_stats.dump_formatted(f.get());
6749 f->flush(ds);
6750 } else {
6751 pg_recovery_stats.dump(s);
6752 ds << "dump pg recovery stats: " << s.str();
6753 }
6754 }
6755
6756 else if (prefix == "reset_pg_recovery_stats") {
6757 ss << "reset pg recovery stats";
6758 pg_recovery_stats.reset();
6759 }
6760
6761 else if (prefix == "perf histogram dump") {
6762 std::string logger;
6763 std::string counter;
6764 cmd_getval(cct, cmdmap, "logger", logger);
6765 cmd_getval(cct, cmdmap, "counter", counter);
6766 if (f) {
6767 cct->get_perfcounters_collection()->dump_formatted_histograms(
6768 f.get(), false, logger, counter);
6769 f->flush(ds);
6770 }
6771 }
6772
6773 else if (prefix == "compact") {
6774 dout(1) << "triggering manual compaction" << dendl;
6775 auto start = ceph::coarse_mono_clock::now();
6776 store->compact();
6777 auto end = ceph::coarse_mono_clock::now();
6778 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
6779 dout(1) << "finished manual compaction in "
6780 << time_span.count()
6781 << " seconds" << dendl;
6782 ss << "compacted omap in " << time_span.count() << " seconds";
6783 }
6784
6785 else {
6786 ss << "unrecognized command! " << cmd;
6787 r = -EINVAL;
6788 }
6789
6790 out:
6791 rs = ss.str();
6792 odata.append(ds);
6793 dout(0) << "do_command r=" << r << " " << rs << dendl;
6794 clog->info() << rs;
6795 if (con) {
6796 MCommandReply *reply = new MCommandReply(r, rs);
6797 reply->set_tid(tid);
6798 reply->set_data(odata);
6799 con->send_message(reply);
6800 }
6801 }
6802
6803 bool OSD::heartbeat_dispatch(Message *m)
6804 {
6805 dout(30) << "heartbeat_dispatch " << m << dendl;
6806 switch (m->get_type()) {
6807
6808 case CEPH_MSG_PING:
6809 dout(10) << "ping from " << m->get_source_inst() << dendl;
6810 m->put();
6811 break;
6812
6813 case MSG_OSD_PING:
6814 handle_osd_ping(static_cast<MOSDPing*>(m));
6815 break;
6816
6817 default:
6818 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6819 m->put();
6820 }
6821
6822 return true;
6823 }
6824
6825 bool OSD::ms_dispatch(Message *m)
6826 {
6827 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6828 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6829 service.got_stop_ack();
6830 m->put();
6831 return true;
6832 }
6833
6834 // lock!
6835
6836 osd_lock.Lock();
6837 if (is_stopping()) {
6838 osd_lock.Unlock();
6839 m->put();
6840 return true;
6841 }
6842
6843 do_waiters();
6844 _dispatch(m);
6845
6846 osd_lock.Unlock();
6847
6848 return true;
6849 }
6850
6851 void OSD::maybe_share_map(
6852 Session *session,
6853 OpRequestRef op,
6854 OSDMapRef osdmap)
6855 {
6856 if (!op->check_send_map) {
6857 return;
6858 }
6859 epoch_t last_sent_epoch = 0;
6860
6861 session->sent_epoch_lock.lock();
6862 last_sent_epoch = session->last_sent_epoch;
6863 session->sent_epoch_lock.unlock();
6864
6865 const Message *m = op->get_req();
6866 service.share_map(
6867 m->get_source(),
6868 m->get_connection().get(),
6869 op->sent_epoch,
6870 osdmap,
6871 session ? &last_sent_epoch : NULL);
6872
6873 session->sent_epoch_lock.lock();
6874 if (session->last_sent_epoch < last_sent_epoch) {
6875 session->last_sent_epoch = last_sent_epoch;
6876 }
6877 session->sent_epoch_lock.unlock();
6878
6879 op->check_send_map = false;
6880 }
6881
6882 void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
6883 {
6884 assert(session->session_dispatch_lock.is_locked());
6885
6886 auto i = session->waiting_on_map.begin();
6887 while (i != session->waiting_on_map.end()) {
6888 OpRequestRef op = &(*i);
6889 assert(ms_can_fast_dispatch(op->get_req()));
6890 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
6891 op->get_req());
6892 if (m->get_min_epoch() > osdmap->get_epoch()) {
6893 break;
6894 }
6895 session->waiting_on_map.erase(i++);
6896 op->put();
6897
6898 spg_t pgid;
6899 if (m->get_type() == CEPH_MSG_OSD_OP) {
6900 pg_t actual_pgid = osdmap->raw_pg_to_pg(
6901 static_cast<const MOSDOp*>(m)->get_pg());
6902 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
6903 continue;
6904 }
6905 } else {
6906 pgid = m->get_spg();
6907 }
6908 enqueue_op(pgid, op, m->get_map_epoch());
6909 }
6910
6911 if (session->waiting_on_map.empty()) {
6912 clear_session_waiting_on_map(session);
6913 } else {
6914 register_session_waiting_on_map(session);
6915 }
6916 }
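// Ordering sketch: a legacy client (no RESEND_ON_SPLIT feature) sends MOSDOps
// that carry only a raw pg_t. ms_fast_dispatch() appends each op to the
// session's waiting_on_map list, and this function drains that list in FIFO
// order against the reserved next map, resolving the raw pg to an spg_t, so
// per-session delivery order survives the mapping step.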
6917
6918 void OSD::ms_fast_dispatch(Message *m)
6919 {
6920 FUNCTRACE();
6921 if (service.is_stopping()) {
6922 m->put();
6923 return;
6924 }
6925 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
6926 {
6927 #ifdef WITH_LTTNG
6928 osd_reqid_t reqid = op->get_reqid();
6929 #endif
6930 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
6931 reqid.name._num, reqid.tid, reqid.inc);
6932 }
6933
6934 if (m->trace)
6935 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
6936
6937 // note sender epoch, min req'd epoch
6938 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
6939 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
6940 assert(op->min_epoch <= op->sent_epoch); // sanity check!
6941
6942 service.maybe_inject_dispatch_delay();
6943
6944 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
6945 m->get_type() != CEPH_MSG_OSD_OP) {
6946 // queue it directly
6947 enqueue_op(
6948 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
6949 op,
6950 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
6951 } else {
6952 // legacy client, and this is an MOSDOp (the *only* fast dispatch
6953 // message that didn't have an explicit spg_t); we need to map
6954 // them to an spg_t while preserving delivery order.
6955 Session *session = static_cast<Session*>(m->get_connection()->get_priv());
6956 if (session) {
6957 {
6958 Mutex::Locker l(session->session_dispatch_lock);
6959 op->get();
6960 session->waiting_on_map.push_back(*op);
6961 OSDMapRef nextmap = service.get_nextmap_reserved();
6962 dispatch_session_waiting(session, nextmap);
6963 service.release_map(nextmap);
6964 }
6965 session->put();
6966 }
6967 }
6968 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
6969 }
6970
6971 void OSD::ms_fast_preprocess(Message *m)
6972 {
6973 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
6974 if (m->get_type() == CEPH_MSG_OSD_MAP) {
6975 MOSDMap *mm = static_cast<MOSDMap*>(m);
6976 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
6977 if (s) {
6978 s->received_map_lock.lock();
6979 s->received_map_epoch = mm->get_last();
6980 s->received_map_lock.unlock();
6981 s->put();
6982 }
6983 }
6984 }
6985 }
6986
6987 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
6988 {
6989 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
6990
6991 if (is_stopping()) {
6992 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
6993 return false;
6994 }
6995
6996 if (dest_type == CEPH_ENTITY_TYPE_MON)
6997 return true;
6998
6999 if (force_new) {
7000 /* the MonClient checks keys every tick(), so we should just wait for that cycle
7001 to get through */
7002 if (monc->wait_auth_rotating(10) < 0) {
7003 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
7004 return false;
7005 }
7006 }
7007
7008 *authorizer = monc->build_authorizer(dest_type);
7009 return *authorizer != NULL;
7010 }
7011
7012
7013 bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
7014 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
7015 bool& isvalid, CryptoKey& session_key)
7016 {
7017 AuthAuthorizeHandler *authorize_handler = 0;
7018 switch (peer_type) {
7019 case CEPH_ENTITY_TYPE_MDS:
7020 /*
7021 * note: mds is technically a client from our perspective, but
7022 * this makes the 'cluster' consistent w/ monitor's usage.
7023 */
7024 case CEPH_ENTITY_TYPE_OSD:
7025 case CEPH_ENTITY_TYPE_MGR:
7026 authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
7027 break;
7028 default:
7029 authorize_handler = authorize_handler_service_registry->get_handler(protocol);
7030 }
7031 if (!authorize_handler) {
7032 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
7033 isvalid = false;
7034 return true;
7035 }
7036
7037 AuthCapsInfo caps_info;
7038 EntityName name;
7039 uint64_t global_id;
7040 uint64_t auid = CEPH_AUTH_UID_DEFAULT;
7041
7042 RotatingKeyRing *keys = monc->rotating_secrets.get();
7043 if (keys) {
7044 isvalid = authorize_handler->verify_authorizer(
7045 cct, keys,
7046 authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
7047 &auid);
7048 } else {
7049 dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
7050 isvalid = false;
7051 }
7052
7053 if (isvalid) {
7054 Session *s = static_cast<Session *>(con->get_priv());
7055 if (!s) {
7056 s = new Session(cct);
7057 con->set_priv(s->get());
7058 s->con = con;
7059 dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
7060 }
7061
7062 s->entity_name = name;
7063 if (caps_info.allow_all)
7064 s->caps.set_allow_all();
7065 s->auid = auid;
7066
7067 if (caps_info.caps.length() > 0) {
7068 bufferlist::iterator p = caps_info.caps.begin();
7069 string str;
7070 try {
7071 ::decode(str, p);
7072 }
7073 catch (buffer::error& e) {
7074 }
7075 bool success = s->caps.parse(str);
7076 if (success)
7077 dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
7078 else
7079 dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
7080 }
7081
7082 s->put();
7083 }
7084 return true;
7085 }
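// The caps blob carried by the authorizer is just an encoded string
// handed to OSDCap::parse(); a client key might carry, e.g.,
//
//   "allow rwx pool=rbd"
//
// A failed decode or parse is not fatal here: the session is still
// created, it just ends up with empty caps, and the failure is only
// logged at debug level.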
7086
7087 void OSD::do_waiters()
7088 {
7089 assert(osd_lock.is_locked());
7090
7091 dout(10) << "do_waiters -- start" << dendl;
7092 while (!finished.empty()) {
7093 OpRequestRef next = finished.front();
7094 finished.pop_front();
7095 dispatch_op(next);
7096 }
7097 dout(10) << "do_waiters -- finish" << dendl;
7098 }
7099
7100 void OSD::dispatch_op(OpRequestRef op)
7101 {
7102 switch (op->get_req()->get_type()) {
7103
7104 case MSG_OSD_PG_CREATE:
7105 handle_pg_create(op);
7106 break;
7107 case MSG_OSD_PG_NOTIFY:
7108 handle_pg_notify(op);
7109 break;
7110 case MSG_OSD_PG_QUERY:
7111 handle_pg_query(op);
7112 break;
7113 case MSG_OSD_PG_LOG:
7114 handle_pg_log(op);
7115 break;
7116 case MSG_OSD_PG_REMOVE:
7117 handle_pg_remove(op);
7118 break;
7119 case MSG_OSD_PG_INFO:
7120 handle_pg_info(op);
7121 break;
7122 case MSG_OSD_PG_TRIM:
7123 handle_pg_trim(op);
7124 break;
7125 case MSG_OSD_BACKFILL_RESERVE:
7126 handle_pg_backfill_reserve(op);
7127 break;
7128 case MSG_OSD_RECOVERY_RESERVE:
7129 handle_pg_recovery_reserve(op);
7130 break;
7131 }
7132 }
7133
7134 void OSD::_dispatch(Message *m)
7135 {
7136 assert(osd_lock.is_locked());
7137 dout(20) << "_dispatch " << m << " " << *m << dendl;
7138
7139 switch (m->get_type()) {
7140
7141 // -- don't need lock --
7142 case CEPH_MSG_PING:
7143 dout(10) << "ping from " << m->get_source() << dendl;
7144 m->put();
7145 break;
7146
7147 // -- don't need OSDMap --
7148
7149 // map and replication
7150 case CEPH_MSG_OSD_MAP:
7151 handle_osd_map(static_cast<MOSDMap*>(m));
7152 break;
7153
7154 // osd
7155 case MSG_PGSTATSACK:
7156 handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
7157 break;
7158
7159 case MSG_MON_COMMAND:
7160 handle_command(static_cast<MMonCommand*>(m));
7161 break;
7162 case MSG_COMMAND:
7163 handle_command(static_cast<MCommand*>(m));
7164 break;
7165
7166 case MSG_OSD_SCRUB:
7167 handle_scrub(static_cast<MOSDScrub*>(m));
7168 break;
7169
7170 case MSG_OSD_FORCE_RECOVERY:
7171 handle_force_recovery(m);
7172 break;
7173
7174 // -- need OSDMap --
7175
7176 case MSG_OSD_PG_CREATE:
7177 case MSG_OSD_PG_NOTIFY:
7178 case MSG_OSD_PG_QUERY:
7179 case MSG_OSD_PG_LOG:
7180 case MSG_OSD_PG_REMOVE:
7181 case MSG_OSD_PG_INFO:
7182 case MSG_OSD_PG_TRIM:
7183 case MSG_OSD_BACKFILL_RESERVE:
7184 case MSG_OSD_RECOVERY_RESERVE:
7185 {
7186 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7187 if (m->trace)
7188 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7189 // no map? starting up?
7190 if (!osdmap) {
7191 dout(7) << "no OSDMap, not booted" << dendl;
7192 logger->inc(l_osd_waiting_for_map);
7193 waiting_for_osdmap.push_back(op);
7194 op->mark_delayed("no osdmap");
7195 break;
7196 }
7197
7198 // need OSDMap
7199 dispatch_op(op);
7200 }
7201 }
7202 }
7203
7204 void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7205 {
7206 pg->lock();
7207 if (pg->is_primary()) {
7208 pg->unreg_next_scrub();
7209 pg->scrubber.must_scrub = true;
7210 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7211 pg->scrubber.must_repair = m->repair;
7212 pg->reg_next_scrub();
7213 dout(10) << "marking " << *pg << " for scrub" << dendl;
7214 }
7215 pg->unlock();
7216 }
7217
7218 void OSD::handle_scrub(MOSDScrub *m)
7219 {
7220 dout(10) << "handle_scrub " << *m << dendl;
7221 if (!require_mon_or_mgr_peer(m)) {
7222 m->put();
7223 return;
7224 }
7225 if (m->fsid != monc->get_fsid()) {
7226 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7227 m->put();
7228 return;
7229 }
7230
7231 RWLock::RLocker l(pg_map_lock);
7232 if (m->scrub_pgs.empty()) {
7233 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7234 p != pg_map.end();
7235 ++p)
7236 handle_pg_scrub(m, p->second);
7237 } else {
7238 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7239 p != m->scrub_pgs.end();
7240 ++p) {
7241 spg_t pcand;
7242 if (osdmap->get_primary_shard(*p, &pcand)) {
7243 auto pg_map_entry = pg_map.find(pcand);
7244 if (pg_map_entry != pg_map.end()) {
7245 handle_pg_scrub(m, pg_map_entry->second);
7246 }
7247 }
7248 }
7249 }
7250
7251 m->put();
7252 }
7253
7254 bool OSD::scrub_random_backoff()
7255 {
7256 bool coin_flip = (rand() / (double)RAND_MAX >=
7257 cct->_conf->osd_scrub_backoff_ratio);
7258 if (!coin_flip) {
7259 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7260 return true;
7261 }
7262 return false;
7263 }
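// The backoff fires with probability osd_scrub_backoff_ratio:
// rand()/RAND_MAX is approximately uniform on [0,1], so the >=
// comparison succeeds with probability 1 - ratio.  Equivalent
// standalone sketch:
//
//   #include <cstdlib>
//   bool backoff(double ratio) {             // e.g. ratio = 0.66
//     double r = rand() / (double)RAND_MAX;  // ~uniform in [0,1]
//     return r < ratio;                      // back off ~ratio of the time
//   }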
7264
7265 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7266 const spg_t& pg, const utime_t& timestamp,
7267 double pool_scrub_min_interval,
7268 double pool_scrub_max_interval, bool must)
7269 : cct(cct),
7270 pgid(pg),
7271 sched_time(timestamp),
7272 deadline(timestamp)
7273 {
7274 // if not explicitly requested, postpone the scrub with a random delay
7275 if (!must) {
7276 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7277 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7278 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7279 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7280
7281 sched_time += scrub_min_interval;
7282 double r = rand() / (double)RAND_MAX;
7283 sched_time +=
7284 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7285 deadline += scrub_max_interval;
7286 }
7287 }
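// For a non-forced scrub the constructor spreads scheduling across a
// window, using the pool's intervals when set and the global config
// otherwise:
//
//   sched_time = timestamp + min_interval * (1 + randomize_ratio * r),
//                with r ~ U[0,1]
//   deadline   = timestamp + max_interval
//
// e.g. with min_interval = 86400 s and randomize_ratio = 0.5, the PG is
// scheduled uniformly within [timestamp + 1d, timestamp + 1.5d].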
7288
7289 bool OSDService::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7290 if (sched_time < rhs.sched_time)
7291 return true;
7292 if (sched_time > rhs.sched_time)
7293 return false;
7294 return pgid < rhs.pgid;
7295 }
7296
7297 bool OSD::scrub_time_permit(utime_t now)
7298 {
7299 struct tm bdt;
7300 time_t tt = now.sec();
7301 localtime_r(&tt, &bdt);
7302 bool time_permit = false;
7303 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7304 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7305 time_permit = true;
7306 }
7307 } else {
7308 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7309 time_permit = true;
7310 }
7311 }
7312 if (!time_permit) {
7313 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7314 << " - " << cct->_conf->osd_scrub_end_hour
7315 << " now " << bdt.tm_hour << " = no" << dendl;
7316 } else {
7317 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7318 << " - " << cct->_conf->osd_scrub_end_hour
7319 << " now " << bdt.tm_hour << " = yes" << dendl;
7320 }
7321 return time_permit;
7322 }
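// The permitted window may wrap past midnight.  With begin=6, end=23
// the test is (hour >= 6 && hour < 23); with begin=23, end=6 it becomes
// (hour >= 23 || hour < 6), i.e. 23:00-05:59.  Standalone sketch of the
// same check:
//
//   bool in_window(int hour, int begin, int end) {
//     return begin < end ? (hour >= begin && hour < end)
//                        : (hour >= begin || hour < end);
//   }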
7323
7324 bool OSD::scrub_load_below_threshold()
7325 {
7326 double loadavgs[3];
7327 if (getloadavg(loadavgs, 3) != 3) {
7328 dout(10) << __func__ << " couldn't read loadavgs" << dendl;
7329 return false;
7330 }
7331
7332 // allow scrub if below configured threshold
7333 if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
7334 dout(20) << __func__ << " loadavg " << loadavgs[0]
7335 << " < max " << cct->_conf->osd_scrub_load_threshold
7336 << " = yes" << dendl;
7337 return true;
7338 }
7339
7340 // allow scrub if below daily avg and currently decreasing
7341 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7342 dout(20) << __func__ << " loadavg " << loadavgs[0]
7343 << " < daily_loadavg " << daily_loadavg
7344 << " and < 15m avg " << loadavgs[2]
7345 << " = yes" << dendl;
7346 return true;
7347 }
7348
7349 dout(20) << __func__ << " loadavg " << loadavgs[0]
7350 << " >= max " << cct->_conf->osd_scrub_load_threshold
7351 << " and ( >= daily_loadavg " << daily_loadavg
7352 << " or >= 15m avg " << loadavgs[2]
7353 << ") = no" << dendl;
7354 return false;
7355 }
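// Two independent ways to pass: either the 1-minute load is under the
// fixed threshold, or it is below both the rolling daily average and
// the 15-minute average (i.e. modest and falling).  For example, with
// threshold=0.5, daily_loadavg=1.2 and loadavgs={0.9, 1.0, 1.1}, the
// first test fails (0.9 >= 0.5) but the second passes (0.9 < 1.2 and
// 0.9 < 1.1), so scrubbing is allowed.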
7356
7357 void OSD::sched_scrub()
7358 {
7359 // if not permitted, fail fast
7360 if (!service.can_inc_scrubs_pending()) {
7361 return;
7362 }
7363 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7364 dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
7365 return;
7366 }
7367
7368
7369 utime_t now = ceph_clock_now();
7370 bool time_permit = scrub_time_permit(now);
7371 bool load_is_low = scrub_load_below_threshold();
7372 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7373
7374 OSDService::ScrubJob scrub;
7375 if (service.first_scrub_stamp(&scrub)) {
7376 do {
7377 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7378
7379 if (scrub.sched_time > now) {
7380 // save ourselves some effort
7381 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7382 << " > " << now << dendl;
7383 break;
7384 }
7385
7386 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7387 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7388 << (!time_permit ? "time not permitted" : "high load") << dendl;
7389 continue;
7390 }
7391
7392 PG *pg = _lookup_lock_pg(scrub.pgid);
7393 if (!pg)
7394 continue;
7395 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7396 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7397 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7398 (load_is_low ? ", load_is_low" : " deadline < now"))
7399 << dendl;
7400 if (pg->sched_scrub()) {
7401 pg->unlock();
7402 break;
7403 }
7404 }
7405 pg->unlock();
7406 } while (service.next_scrub_stamp(scrub, &scrub));
7407 }
7408 dout(20) << "sched_scrub done" << dendl;
7409 }
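// Note the deadline override in the loop above: a job whose deadline
// has already passed (deadline < now) proceeds even outside the
// configured hours or under load; otherwise both scrub_time_permit()
// and scrub_load_below_threshold() must hold.  E.g. at now=1000, a job
// {sched=900, deadline=1200} still needs time and load to be OK, while
// {sched=900, deadline=950} is overdue and is attempted regardless.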
7410
7411
7412
7413 // =====================================================
7414 // MAP
7415
7416 void OSD::wait_for_new_map(OpRequestRef op)
7417 {
7418 // ask?
7419 if (waiting_for_osdmap.empty()) {
7420 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7421 }
7422
7423 logger->inc(l_osd_waiting_for_map);
7424 waiting_for_osdmap.push_back(op);
7425 op->mark_delayed("wait for new map");
7426 }
7427
7428
7429 /** update_map
7430 * assimilate new OSDMap(s). scan pgs, etc.
7431 */
7432
7433 void OSD::note_down_osd(int peer)
7434 {
7435 assert(osd_lock.is_locked());
7436 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7437
7438 heartbeat_lock.Lock();
7439 failure_queue.erase(peer);
7440 failure_pending.erase(peer);
7441 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7442 if (p != heartbeat_peers.end()) {
7443 p->second.con_back->mark_down();
7444 if (p->second.con_front) {
7445 p->second.con_front->mark_down();
7446 }
7447 heartbeat_peers.erase(p);
7448 }
7449 heartbeat_lock.Unlock();
7450 }
7451
7452 void OSD::note_up_osd(int peer)
7453 {
7454 service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7455 heartbeat_set_peers_need_update();
7456 }
7457
7458 struct C_OnMapCommit : public Context {
7459 OSD *osd;
7460 epoch_t first, last;
7461 MOSDMap *msg;
7462 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7463 : osd(o), first(f), last(l), msg(m) {}
7464 void finish(int r) override {
7465 osd->_committed_osd_maps(first, last, msg);
7466 msg->put();
7467 }
7468 };
7469
7470 struct C_OnMapApply : public Context {
7471 OSDService *service;
7472 list<OSDMapRef> pinned_maps;
7473 epoch_t e;
7474 C_OnMapApply(OSDService *service,
7475 const list<OSDMapRef> &pinned_maps,
7476 epoch_t e)
7477 : service(service), pinned_maps(pinned_maps), e(e) {}
7478 void finish(int r) override {
7479 service->clear_map_bl_cache_pins(e);
7480 }
7481 };
7482
7483 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7484 {
7485 OSDMapRef osdmap = service.get_osdmap();
7486 if (osdmap->get_epoch() >= epoch)
7487 return;
7488
7489 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7490 force_request) {
7491 monc->renew_subs();
7492 }
7493 }
7494
7495 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7496 {
7497 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7498 if (min <= superblock.oldest_map)
7499 return;
7500
7501 int num = 0;
7502 ObjectStore::Transaction t;
7503 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7504 dout(20) << " removing old osdmap epoch " << e << dendl;
7505 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7506 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7507 superblock.oldest_map = e + 1;
7508 num++;
7509 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7510 service.publish_superblock(superblock);
7511 write_superblock(t);
7512 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7513 assert(tr == 0);
7514 num = 0;
7515 if (!skip_maps) {
7516 // skip_maps leaves us with a range of old maps if we fail to remove all
7517 // of them before moving superblock.oldest_map forward to the first map
7518 // in the incoming MOSDMap msg. so we should continue removing them in
7519 // this case, even though that means doing a huge series of delete
7520 // transactions all at once.
7521 break;
7522 }
7523 }
7524 }
7525 if (num > 0) {
7526 service.publish_superblock(superblock);
7527 write_superblock(t);
7528 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7529 assert(tr == 0);
7530 }
7531 // we should not remove the cached maps
7532 assert(min <= service.map_cache.cached_key_lower_bound());
7533 }
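// Removal is batched: each time `num` reaches both
// osd_target_transaction_size and `nreceived`, the superblock is
// persisted and the batch queued.  E.g. with oldest_map=100, min=1100,
// nreceived=10 and a target size of 30, one 30-map batch is queued and
// (unless skip_maps left a gap behind) the loop breaks, leaving the
// remaining maps for later calls so trimming never outruns map ingest
// by much.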
7534
7535 void OSD::handle_osd_map(MOSDMap *m)
7536 {
7537 assert(osd_lock.is_locked());
7538 // Keep a ref in the list until we get the newly received map written
7539 // onto disk. This is important because as long as the refs are alive,
7540 // the OSDMaps will be pinned in the cache and we won't try to read it
7541 // off of disk. Otherwise these maps will probably not stay in the cache,
7542 // and reading those OSDMaps before they are actually written can result
7543 // in a crash.
7544 list<OSDMapRef> pinned_maps;
7545 if (m->fsid != monc->get_fsid()) {
7546 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7547 << monc->get_fsid() << dendl;
7548 m->put();
7549 return;
7550 }
7551 if (is_initializing()) {
7552 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7553 m->put();
7554 return;
7555 }
7556
7557 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7558 if (session && !(session->entity_name.is_mon() ||
7559 session->entity_name.is_osd())) {
7560 //not enough perms!
7561 dout(10) << "got osd map from Session " << session
7562 << " which we can't take maps from (not a mon or osd)" << dendl;
7563 m->put();
7564 session->put();
7565 return;
7566 }
7567 if (session)
7568 session->put();
7569
7570 // share with the objecter
7571 if (!is_preboot())
7572 service.objecter->handle_osd_map(m);
7573
7574 epoch_t first = m->get_first();
7575 epoch_t last = m->get_last();
7576 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7577 << superblock.newest_map
7578 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7579 << dendl;
7580
7581 logger->inc(l_osd_map);
7582 logger->inc(l_osd_mape, last - first + 1);
7583 if (first <= superblock.newest_map)
7584 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7585 if (service.max_oldest_map < m->oldest_map) {
7586 service.max_oldest_map = m->oldest_map;
7587 assert(service.max_oldest_map >= superblock.oldest_map);
7588 }
7589
7590 // make sure there is something new, here, before we bother flushing
7591 // the queues and such
7592 if (last <= superblock.newest_map) {
7593 dout(10) << " no new maps here, dropping" << dendl;
7594 m->put();
7595 return;
7596 }
7597
7598 // missing some?
7599 bool skip_maps = false;
7600 if (first > superblock.newest_map + 1) {
7601 dout(10) << "handle_osd_map message skips epochs "
7602 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7603 if (m->oldest_map <= superblock.newest_map + 1) {
7604 osdmap_subscribe(superblock.newest_map + 1, false);
7605 m->put();
7606 return;
7607 }
7608 // always try to get the full range of maps--as many as we can. this
7609 // 1- is good to have
7610 // 2- is at present the only way to ensure that we get a *full* map as
7611 // the first map!
7612 if (m->oldest_map < first) {
7613 osdmap_subscribe(m->oldest_map - 1, true);
7614 m->put();
7615 return;
7616 }
7617 skip_maps = true;
7618 }
7619
7620 ObjectStore::Transaction t;
7621 uint64_t txn_size = 0;
7622
7623 // store new maps: queue for disk and put in the osdmap cache
7624 epoch_t start = MAX(superblock.newest_map + 1, first);
7625 for (epoch_t e = start; e <= last; e++) {
7626 if (txn_size >= t.get_num_bytes()) {
7627 derr << __func__ << " transaction size overflowed" << dendl;
7628 assert(txn_size < t.get_num_bytes());
7629 }
7630 txn_size = t.get_num_bytes();
7631 map<epoch_t,bufferlist>::iterator p;
7632 p = m->maps.find(e);
7633 if (p != m->maps.end()) {
7634 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7635 OSDMap *o = new OSDMap;
7636 bufferlist& bl = p->second;
7637
7638 o->decode(bl);
7639
7640 ghobject_t fulloid = get_osdmap_pobject_name(e);
7641 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7642 pin_map_bl(e, bl);
7643 pinned_maps.push_back(add_map(o));
7644
7645 got_full_map(e);
7646 continue;
7647 }
7648
7649 p = m->incremental_maps.find(e);
7650 if (p != m->incremental_maps.end()) {
7651 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7652 bufferlist& bl = p->second;
7653 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7654 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7655 pin_map_inc_bl(e, bl);
7656
7657 OSDMap *o = new OSDMap;
7658 if (e > 1) {
7659 bufferlist obl;
7660 bool got = get_map_bl(e - 1, obl);
7661 assert(got);
7662 o->decode(obl);
7663 }
7664
7665 OSDMap::Incremental inc;
7666 bufferlist::iterator p = bl.begin();
7667 inc.decode(p);
7668 if (o->apply_incremental(inc) < 0) {
7669 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
7670 assert(0 == "bad fsid");
7671 }
7672
7673 bufferlist fbl;
7674 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7675
7676 bool injected_failure = false;
7677 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7678 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7679 derr << __func__ << " injecting map crc failure" << dendl;
7680 injected_failure = true;
7681 }
7682
7683 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7684 dout(2) << "got incremental " << e
7685 << " but failed to encode full with correct crc; requesting"
7686 << dendl;
7687 clog->warn() << "failed to encode map e" << e << " with expected crc";
7688 dout(20) << "my encoded map was:\n";
7689 fbl.hexdump(*_dout);
7690 *_dout << dendl;
7691 delete o;
7692 request_full_map(e, last);
7693 last = e - 1;
7694 break;
7695 }
7696 got_full_map(e);
7697
7698 ghobject_t fulloid = get_osdmap_pobject_name(e);
7699 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7700 pin_map_bl(e, fbl);
7701 pinned_maps.push_back(add_map(o));
7702 continue;
7703 }
7704
7705 assert(0 == "MOSDMap lied about what maps it had?");
7706 }
7707
7708 // even if this map isn't from a mon, we may have satisfied our subscription
7709 monc->sub_got("osdmap", last);
7710
7711 if (!m->maps.empty() && requested_full_first) {
7712 dout(10) << __func__ << " still missing full maps " << requested_full_first
7713 << ".." << requested_full_last << dendl;
7714 rerequest_full_maps();
7715 }
7716
7717 if (superblock.oldest_map) {
7718 // make sure we at least keep pace with incoming maps
7719 trim_maps(m->oldest_map, last - first + 1, skip_maps);
7720 }
7721
7722 if (!superblock.oldest_map || skip_maps)
7723 superblock.oldest_map = first;
7724 superblock.newest_map = last;
7725 superblock.current_epoch = last;
7726
7727 // note in the superblock that we were clean thru the prior epoch
7728 epoch_t boot_epoch = service.get_boot_epoch();
7729 if (boot_epoch && boot_epoch >= superblock.mounted) {
7730 superblock.mounted = boot_epoch;
7731 superblock.clean_thru = last;
7732 }
7733
7734 // superblock and commit
7735 write_superblock(t);
7736 store->queue_transaction(
7737 service.meta_osr.get(),
7738 std::move(t),
7739 new C_OnMapApply(&service, pinned_maps, last),
7740 new C_OnMapCommit(this, start, last, m), 0);
7741 service.publish_superblock(superblock);
7742 }
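// Epochs that arrive only as incrementals are expanded inline: load the
// previous full map from disk, apply the incremental, re-encode, and
// check the result against the CRC the mon advertised.  Sketch of that
// flow (simplified: `inc_bl` stands for the received incremental and
// return codes are elided):
//
//   OSDMap o;
//   bufferlist prev;
//   get_map_bl(e - 1, prev);        // full map for the prior epoch
//   o.decode(prev);
//   OSDMap::Incremental inc;
//   bufferlist::iterator it = inc_bl.begin();
//   inc.decode(it);
//   o.apply_incremental(inc);       // o is now the full map for e
//   if (inc.have_crc && o.get_crc() != inc.full_crc)
//     ;  // mismatch: fall back to requesting the full map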
7743
7744 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
7745 {
7746 dout(10) << __func__ << " " << first << ".." << last << dendl;
7747 if (is_stopping()) {
7748 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7749 return;
7750 }
7751 Mutex::Locker l(osd_lock);
7752 if (is_stopping()) {
7753 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7754 return;
7755 }
7756 map_lock.get_write();
7757
7758 bool do_shutdown = false;
7759 bool do_restart = false;
7760 bool network_error = false;
7761
7762 // advance through the new maps
7763 for (epoch_t cur = first; cur <= last; cur++) {
7764 dout(10) << " advance to epoch " << cur
7765 << " (<= last " << last
7766 << " <= newest_map " << superblock.newest_map
7767 << ")" << dendl;
7768
7769 OSDMapRef newmap = get_map(cur);
7770 assert(newmap); // we just cached it above!
7771
7772 // start blacklisting messages sent to peers that go down.
7773 service.pre_publish_map(newmap);
7774
7775 // kill connections to newly down osds
7776 bool waited_for_reservations = false;
7777 set<int> old;
7778 osdmap->get_all_osds(old);
7779 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
7780 if (*p != whoami &&
7781 osdmap->is_up(*p) && // in old map
7782 newmap->is_down(*p)) { // but not the new one
7783 if (!waited_for_reservations) {
7784 service.await_reserved_maps();
7785 waited_for_reservations = true;
7786 }
7787 note_down_osd(*p);
7788 } else if (*p != whoami &&
7789 osdmap->is_down(*p) &&
7790 newmap->is_up(*p)) {
7791 note_up_osd(*p);
7792 }
7793 }
7794
7795 if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
7796 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
7797 (osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
7798 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
7799 << dendl;
7800 if (is_booting()) {
7801 // this captures the case where we sent the boot message while
7802 // NOUP was being set on the mon and our boot request was
7803 // dropped, and then later it is cleared. it imperfectly
7804 // handles the case where our original boot message was not
7805 // dropped and we restart even though we might have booted, but
7806 // that is harmless (boot will just take slightly longer).
7807 do_restart = true;
7808 }
7809 }
7810 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
7811 newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
7812 dout(10) << __func__ << " require_osd_release reached luminous in "
7813 << newmap->get_epoch() << dendl;
7814 clear_pg_stat_queue();
7815 clear_outstanding_pg_stats();
7816 }
7817
7818 osdmap = newmap;
7819 epoch_t up_epoch;
7820 epoch_t boot_epoch;
7821 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
7822 if (!up_epoch &&
7823 osdmap->is_up(whoami) &&
7824 osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
7825 up_epoch = osdmap->get_epoch();
7826 dout(10) << "up_epoch is " << up_epoch << dendl;
7827 if (!boot_epoch) {
7828 boot_epoch = osdmap->get_epoch();
7829 dout(10) << "boot_epoch is " << boot_epoch << dendl;
7830 }
7831 service.set_epochs(&boot_epoch, &up_epoch, NULL);
7832 }
7833 }
7834
7835 had_map_since = ceph_clock_now();
7836
7837 epoch_t _bind_epoch = service.get_bind_epoch();
7838 if (osdmap->is_up(whoami) &&
7839 osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
7840 _bind_epoch < osdmap->get_up_from(whoami)) {
7841
7842 if (is_booting()) {
7843 dout(1) << "state: booting -> active" << dendl;
7844 set_state(STATE_ACTIVE);
7845
7846 // set incarnation so that osd_reqid_t's we generate for our
7847 // objecter requests are unique across restarts.
7848 service.objecter->set_client_incarnation(osdmap->get_epoch());
7849 }
7850 }
7851
7852 if (osdmap->get_epoch() > 0 &&
7853 is_active()) {
7854 if (!osdmap->exists(whoami)) {
7855 dout(0) << "map says i do not exist. shutting down." << dendl;
7856 do_shutdown = true; // don't call shutdown() while we have
7857 // everything paused
7858 } else if (!osdmap->is_up(whoami) ||
7859 !osdmap->get_addr(whoami).probably_equals(
7860 client_messenger->get_myaddr()) ||
7861 !osdmap->get_cluster_addr(whoami).probably_equals(
7862 cluster_messenger->get_myaddr()) ||
7863 !osdmap->get_hb_back_addr(whoami).probably_equals(
7864 hb_back_server_messenger->get_myaddr()) ||
7865 (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7866 !osdmap->get_hb_front_addr(whoami).probably_equals(
7867 hb_front_server_messenger->get_myaddr()))) {
7868 if (!osdmap->is_up(whoami)) {
7869 if (service.is_preparing_to_stop() || service.is_stopping()) {
7870 service.got_stop_ack();
7871 } else {
7872 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
7873 "but it is still running";
7874 clog->debug() << "map e" << osdmap->get_epoch()
7875 << " wrongly marked me down at e"
7876 << osdmap->get_down_at(whoami);
7877 }
7878 } else if (!osdmap->get_addr(whoami).probably_equals(
7879 client_messenger->get_myaddr())) {
7880 clog->error() << "map e" << osdmap->get_epoch()
7881 << " had wrong client addr (" << osdmap->get_addr(whoami)
7882 << " != my " << client_messenger->get_myaddr() << ")";
7883 } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
7884 cluster_messenger->get_myaddr())) {
7885 clog->error() << "map e" << osdmap->get_epoch()
7886 << " had wrong cluster addr ("
7887 << osdmap->get_cluster_addr(whoami)
7888 << " != my " << cluster_messenger->get_myaddr() << ")";
7889 } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
7890 hb_back_server_messenger->get_myaddr())) {
7891 clog->error() << "map e" << osdmap->get_epoch()
7892 << " had wrong heartbeat back addr ("
7893 << osdmap->get_hb_back_addr(whoami)
7894 << " != my " << hb_back_server_messenger->get_myaddr()
7895 << ")";
7896 } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7897 !osdmap->get_hb_front_addr(whoami).probably_equals(
7898 hb_front_server_messenger->get_myaddr())) {
7899 clog->error() << "map e" << osdmap->get_epoch()
7900 << " had wrong heartbeat front addr ("
7901 << osdmap->get_hb_front_addr(whoami)
7902 << " != my " << hb_front_server_messenger->get_myaddr()
7903 << ")";
7904 }
7905
7906 if (!service.is_stopping()) {
7907 epoch_t up_epoch = 0;
7908 epoch_t bind_epoch = osdmap->get_epoch();
7909 service.set_epochs(NULL,&up_epoch, &bind_epoch);
7910 do_restart = true;
7911
7912 //add markdown log
7913 utime_t now = ceph_clock_now();
7914 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
7915 osd_markdown_log.push_back(now);
7916 //clear all out-of-date log
7917 while (!osd_markdown_log.empty() &&
7918 osd_markdown_log.front() + grace < now)
7919 osd_markdown_log.pop_front();
7920 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
7921 dout(0) << __func__ << " marked down "
7922 << osd_markdown_log.size()
7923 << " > osd_max_markdown_count "
7924 << cct->_conf->osd_max_markdown_count
7925 << " in last " << grace << " seconds, shutting down"
7926 << dendl;
7927 do_restart = false;
7928 do_shutdown = true;
7929 }
7930
7931 start_waiting_for_healthy();
7932
7933 set<int> avoid_ports;
7934 #if defined(__FreeBSD__)
7935 // prevent FreeBSD from grabbing the client_messenger port during
7936 // rebinding; otherwise the cluster_messenger may also connect to
7937 // the same port
7938 avoid_ports.insert(client_messenger->get_myaddr().get_port());
7939 #endif
7940 avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
7941 avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
7942 avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
7943
7944 int r = cluster_messenger->rebind(avoid_ports);
7945 if (r != 0) {
7946 do_shutdown = true; // FIXME: do_restart?
7947 network_error = true;
7948 dout(0) << __func__ << " marked down:"
7949 << " rebind cluster_messenger failed" << dendl;
7950 }
7951
7952 r = hb_back_server_messenger->rebind(avoid_ports);
7953 if (r != 0) {
7954 do_shutdown = true; // FIXME: do_restart?
7955 network_error = true;
7956 dout(0) << __func__ << " marked down:"
7957 << " rebind hb_back_server_messenger failed" << dendl;
7958 }
7959
7960 r = hb_front_server_messenger->rebind(avoid_ports);
7961 if (r != 0) {
7962 do_shutdown = true; // FIXME: do_restart?
7963 network_error = true;
7964 dout(0) << __func__ << " marked down:"
7965 << " rebind hb_front_server_messenger failed" << dendl;
7966 }
7967
7968 hb_front_client_messenger->mark_down_all();
7969 hb_back_client_messenger->mark_down_all();
7970
7971 reset_heartbeat_peers();
7972 }
7973 }
7974 }
7975
7976 map_lock.put_write();
7977
7978 check_osdmap_features(store);
7979
7980 // yay!
7981 consume_map();
7982
7983 if (is_active() || is_waiting_for_healthy())
7984 maybe_update_heartbeat_peers();
7985
7986 if (!is_active()) {
7987 dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
7988 peering_wq.drain();
7989 } else {
7990 activate_map();
7991 }
7992
7993 if (do_shutdown) {
7994 if (network_error) {
7995 Mutex::Locker l(heartbeat_lock);
7996 map<int,pair<utime_t,entity_inst_t>>::iterator it =
7997 failure_pending.begin();
7998 while (it != failure_pending.end()) {
7999 dout(10) << __func__ << " canceling in-flight failure report for osd."
8000 << it->first << dendl;
8001 send_still_alive(osdmap->get_epoch(), it->second.second);
8002 failure_pending.erase(it++);
8003 }
8004 }
8005 // trigger shutdown in a different thread
8006 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8007 queue_async_signal(SIGINT);
8008 }
8009 else if (m->newest_map && m->newest_map > last) {
8010 dout(10) << " msg says newest map is " << m->newest_map
8011 << ", requesting more" << dendl;
8012 osdmap_subscribe(osdmap->get_epoch()+1, false);
8013 }
8014 else if (is_preboot()) {
8015 if (m->get_source().is_mon())
8016 _preboot(m->oldest_map, m->newest_map);
8017 else
8018 start_boot();
8019 }
8020 else if (do_restart)
8021 start_boot();
8022
8023 }
8024
8025 void OSD::check_osdmap_features(ObjectStore *fs)
8026 {
8027 // adjust required feature bits?
8028
8029 // we have to be a bit careful here, because we are accessing the
8030 // Policy structures without taking any lock. in particular, only
8031 // modify integer values that can safely be read by a racing CPU.
8032 // since we are only accessing existing Policy structures at their
8033 // current memory location, and setting or clearing bits in integer
8034 // fields, and we are the only writer, this is not a problem.
8035
8036 {
8037 Messenger::Policy p = client_messenger->get_default_policy();
8038 uint64_t mask;
8039 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8040 if ((p.features_required & mask) != features) {
8041 dout(0) << "crush map has features " << features
8042 << ", adjusting msgr requires for clients" << dendl;
8043 p.features_required = (p.features_required & ~mask) | features;
8044 client_messenger->set_default_policy(p);
8045 }
8046 }
8047 {
8048 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8049 uint64_t mask;
8050 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8051 if ((p.features_required & mask) != features) {
8052 dout(0) << "crush map has features " << features
8053 << " was " << p.features_required
8054 << ", adjusting msgr requires for mons" << dendl;
8055 p.features_required = (p.features_required & ~mask) | features;
8056 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8057 }
8058 }
8059 {
8060 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8061 uint64_t mask;
8062 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8063
8064 if ((p.features_required & mask) != features) {
8065 dout(0) << "crush map has features " << features
8066 << ", adjusting msgr requires for osds" << dendl;
8067 p.features_required = (p.features_required & ~mask) | features;
8068 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8069 }
8070
8071 if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
8072 !superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8073 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8074 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8075 ObjectStore::Transaction t;
8076 write_superblock(t);
8077 int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
8078 assert(err == 0);
8079 }
8080 }
8081 }
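// Each required-features update is a masked overwrite: bits outside
// `mask` are preserved, bits inside it are set to exactly `features`:
//
//   required = (required & ~mask) | features;
//   // e.g. required=0b1010, mask=0b0110, features=0b0100
//   //      (0b1010 & 0b1001) | 0b0100 = 0b1000 | 0b0100 = 0b1100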
8082
8083 bool OSD::advance_pg(
8084 epoch_t osd_epoch, PG *pg,
8085 ThreadPool::TPHandle &handle,
8086 PG::RecoveryCtx *rctx,
8087 set<PGRef> *new_pgs)
8088 {
8089 assert(pg->is_locked());
8090 epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
8091 OSDMapRef lastmap = pg->get_osdmap();
8092
8093 if (lastmap->get_epoch() == osd_epoch)
8094 return true;
8095 assert(lastmap->get_epoch() < osd_epoch);
8096
8097 epoch_t min_epoch = service.get_min_pg_epoch();
8098 epoch_t max;
8099 if (min_epoch) {
8100 max = min_epoch + cct->_conf->osd_map_max_advance;
8101 } else {
8102 max = next_epoch + cct->_conf->osd_map_max_advance;
8103 }
8104
8105 for (;
8106 next_epoch <= osd_epoch && next_epoch <= max;
8107 ++next_epoch) {
8108 OSDMapRef nextmap = service.try_get_map(next_epoch);
8109 if (!nextmap) {
8110 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8111 // make sure max is bumped up so that we can get past any
8112 // gap in maps
8113 max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
8114 continue;
8115 }
8116
8117 vector<int> newup, newacting;
8118 int up_primary, acting_primary;
8119 nextmap->pg_to_up_acting_osds(
8120 pg->info.pgid.pgid,
8121 &newup, &up_primary,
8122 &newacting, &acting_primary);
8123 pg->handle_advance_map(
8124 nextmap, lastmap, newup, up_primary,
8125 newacting, acting_primary, rctx);
8126
8127 // Check for split!
8128 set<spg_t> children;
8129 spg_t parent(pg->info.pgid);
8130 if (parent.is_split(
8131 lastmap->get_pg_num(pg->pool.id),
8132 nextmap->get_pg_num(pg->pool.id),
8133 &children)) {
8134 service.mark_split_in_progress(pg->info.pgid, children);
8135 split_pgs(
8136 pg, children, new_pgs, lastmap, nextmap,
8137 rctx);
8138 }
8139
8140 lastmap = nextmap;
8141 handle.reset_tp_timeout();
8142 }
8143 service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
8144 pg->handle_activate_map(rctx);
8145 if (next_epoch <= osd_epoch) {
8146 dout(10) << __func__ << " advanced to max " << max
8147 << " past min epoch " << min_epoch
8148 << " ... will requeue " << *pg << dendl;
8149 return false;
8150 }
8151 return true;
8152 }
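// The advance is bounded so a lagging PG cannot pin the whole map
// cache: e.g. with min_pg_epoch=500, osd_map_max_advance=150 and
// osd_epoch=900, this pass stops after epoch 650 and returns false, and
// the caller requeues the PG to continue once the slowest PGs move on.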
8153
8154 void OSD::consume_map()
8155 {
8156 assert(osd_lock.is_locked());
8157 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8158
8159 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8160 list<PGRef> to_remove;
8161
8162 // scan pg's
8163 {
8164 RWLock::RLocker l(pg_map_lock);
8165 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8166 it != pg_map.end();
8167 ++it) {
8168 PG *pg = it->second;
8169 pg->lock();
8170 if (pg->is_primary())
8171 num_pg_primary++;
8172 else if (pg->is_replica())
8173 num_pg_replica++;
8174 else
8175 num_pg_stray++;
8176
8177 if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
8178 //pool is deleted!
8179 to_remove.push_back(PGRef(pg));
8180 } else {
8181 service.init_splits_between(it->first, service.get_osdmap(), osdmap);
8182 }
8183
8184 pg->unlock();
8185 }
8186 }
8187
8188 for (list<PGRef>::iterator i = to_remove.begin();
8189 i != to_remove.end();
8190 to_remove.erase(i++)) {
8191 RWLock::WLocker locker(pg_map_lock);
8192 (*i)->lock();
8193 _remove_pg(&**i);
8194 (*i)->unlock();
8195 }
8196
8197 service.expand_pg_num(service.get_osdmap(), osdmap);
8198
8199 service.pre_publish_map(osdmap);
8200 service.await_reserved_maps();
8201 service.publish_map(osdmap);
8202
8203 service.maybe_inject_dispatch_delay();
8204
8205 dispatch_sessions_waiting_on_map();
8206
8207 service.maybe_inject_dispatch_delay();
8208
8209 // remove any PGs which we no longer host from the session waiting_for_pg lists
8210 dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8211 op_shardedwq.prune_pg_waiters(osdmap, whoami);
8212
8213 service.maybe_inject_dispatch_delay();
8214
8215 // scan pg's
8216 {
8217 RWLock::RLocker l(pg_map_lock);
8218 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8219 it != pg_map.end();
8220 ++it) {
8221 PG *pg = it->second;
8222 pg->lock();
8223 pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8224 pg->unlock();
8225 }
8226
8227 logger->set(l_osd_pg, pg_map.size());
8228 }
8229 logger->set(l_osd_pg_primary, num_pg_primary);
8230 logger->set(l_osd_pg_replica, num_pg_replica);
8231 logger->set(l_osd_pg_stray, num_pg_stray);
8232 }
8233
8234 void OSD::activate_map()
8235 {
8236 assert(osd_lock.is_locked());
8237
8238 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8239
8240 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8241 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8242 ceph_abort();
8243 }
8244
8245 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8246 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8247 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8248 }
8249
8250 // norecover?
8251 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8252 if (!service.recovery_is_paused()) {
8253 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8254 service.pause_recovery();
8255 }
8256 } else {
8257 if (service.recovery_is_paused()) {
8258 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8259 service.unpause_recovery();
8260 }
8261 }
8262
8263 service.activate_map();
8264
8265 // process waiters
8266 take_waiters(waiting_for_osdmap);
8267 }
8268
8269 bool OSD::require_mon_peer(const Message *m)
8270 {
8271 if (!m->get_connection()->peer_is_mon()) {
8272 dout(0) << "require_mon_peer received from non-mon "
8273 << m->get_connection()->get_peer_addr()
8274 << " " << *m << dendl;
8275 return false;
8276 }
8277 return true;
8278 }
8279
8280 bool OSD::require_mon_or_mgr_peer(const Message *m)
8281 {
8282 if (!m->get_connection()->peer_is_mon() &&
8283 !m->get_connection()->peer_is_mgr()) {
8284 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8285 << m->get_connection()->get_peer_addr()
8286 << " " << *m << dendl;
8287 return false;
8288 }
8289 return true;
8290 }
8291
8292 bool OSD::require_osd_peer(const Message *m)
8293 {
8294 if (!m->get_connection()->peer_is_osd()) {
8295 dout(0) << "require_osd_peer received from non-osd "
8296 << m->get_connection()->get_peer_addr()
8297 << " " << *m << dendl;
8298 return false;
8299 }
8300 return true;
8301 }
8302
8303 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8304 {
8305 epoch_t up_epoch = service.get_up_epoch();
8306 if (epoch < up_epoch) {
8307 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8308 return false;
8309 }
8310
8311 if (!is_active()) {
8312 dout(7) << "still in boot state, dropping message " << *m << dendl;
8313 return false;
8314 }
8315
8316 return true;
8317 }
8318
8319 bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8320 bool is_fast_dispatch)
8321 {
8322 int from = m->get_source().num();
8323
8324 if (map->is_down(from) ||
8325 (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8326 dout(5) << "from dead osd." << from << ", marking down, "
8327 << " msg was " << m->get_source_inst().addr
8328 << " expected " << (map->is_up(from) ?
8329 map->get_cluster_addr(from) : entity_addr_t())
8330 << dendl;
8331 ConnectionRef con = m->get_connection();
8332 con->mark_down();
8333 Session *s = static_cast<Session*>(con->get_priv());
8334 if (s) {
8335 if (!is_fast_dispatch)
8336 s->session_dispatch_lock.Lock();
8337 clear_session_waiting_on_map(s);
8338 con->set_priv(NULL); // break ref <-> session cycle, if any
8339 if (!is_fast_dispatch)
8340 s->session_dispatch_lock.Unlock();
8341 s->put();
8342 }
8343 return false;
8344 }
8345 return true;
8346 }
8347
8348
8349 /*
8350 * require that we have same (or newer) map, and that
8351 * the source is the pg primary.
8352 */
8353 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8354 bool is_fast_dispatch)
8355 {
8356 const Message *m = op->get_req();
8357 dout(15) << "require_same_or_newer_map " << epoch
8358 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8359
8360 assert(osd_lock.is_locked());
8361
8362 // do they have a newer map?
8363 if (epoch > osdmap->get_epoch()) {
8364 dout(7) << "waiting for newer map epoch " << epoch
8365 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8366 wait_for_new_map(op);
8367 return false;
8368 }
8369
8370 if (!require_self_aliveness(op->get_req(), epoch)) {
8371 return false;
8372 }
8373
8374 // ok, our map is same or newer.. do they still exist?
8375 if (m->get_connection()->get_messenger() == cluster_messenger &&
8376 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8377 return false;
8378 }
8379
8380 return true;
8381 }
8382
8383
8384
8385
8386
8387 // ----------------------------------------
8388 // pg creation
8389
8390 void OSD::split_pgs(
8391 PG *parent,
8392 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
8393 OSDMapRef curmap,
8394 OSDMapRef nextmap,
8395 PG::RecoveryCtx *rctx)
8396 {
8397 unsigned pg_num = nextmap->get_pg_num(
8398 parent->pool.id);
8399 parent->update_snap_mapper_bits(
8400 parent->info.pgid.get_split_bits(pg_num)
8401 );
8402
8403 vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
8404 parent->info.stats.stats.sum.split(updated_stats);
8405
8406 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8407 for (set<spg_t>::const_iterator i = childpgids.begin();
8408 i != childpgids.end();
8409 ++i, ++stat_iter) {
8410 assert(stat_iter != updated_stats.end());
8411 dout(10) << "Splitting " << *parent << " into " << *i << dendl;
8412 assert(service.splitting(*i));
8413 PG* child = _make_pg(nextmap, *i);
8414 child->lock(true);
8415 out_pgs->insert(child);
8416 rctx->created_pgs.insert(child);
8417
8418 unsigned split_bits = i->get_split_bits(pg_num);
8419 dout(10) << "pg_num is " << pg_num << dendl;
8420 dout(10) << "m_seed " << i->ps() << dendl;
8421 dout(10) << "split_bits is " << split_bits << dendl;
8422
8423 parent->split_colls(
8424 *i,
8425 split_bits,
8426 i->ps(),
8427 &child->pool.info,
8428 rctx->transaction);
8429 parent->split_into(
8430 i->pgid,
8431 child,
8432 split_bits);
8433 child->info.stats.stats.sum = *stat_iter;
8434
8435 child->write_if_dirty(*(rctx->transaction));
8436 child->unlock();
8437 }
8438 assert(stat_iter != updated_stats.end());
8439 parent->info.stats.stats.sum = *stat_iter;
8440 parent->write_if_dirty(*(rctx->transaction));
8441 }
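// Split example: if a pool's pg_num grows from 8 to 16, parent 1.3
// (seed 0x3) gets one child with seed 0x3 + 8 = 0xb, i.e. pg 1.b.
// split_colls()/split_into() rehash objects onto the extra hash bit,
// and object_stat_sum_t::split() above apportions the parent's stats
// across parent and children until the next stat refresh corrects them.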
8442
8443 /*
8444 * holding osd_lock
8445 */
8446 void OSD::handle_pg_create(OpRequestRef op)
8447 {
8448 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
8449 assert(m->get_type() == MSG_OSD_PG_CREATE);
8450
8451 dout(10) << "handle_pg_create " << *m << dendl;
8452
8453 if (!require_mon_peer(op->get_req())) {
8454 return;
8455 }
8456
8457 if (!require_same_or_newer_map(op, m->epoch, false))
8458 return;
8459
8460 op->mark_started();
8461
8462 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8463 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8464 p != m->mkpg.end();
8465 ++p, ++ci) {
8466 assert(ci != m->ctimes.end() && ci->first == p->first);
8467 epoch_t created = p->second.created;
8468 if (p->second.split_bits) // Skip split pgs
8469 continue;
8470 pg_t on = p->first;
8471
8472 if (on.preferred() >= 0) {
8473 dout(20) << "ignoring localized pg " << on << dendl;
8474 continue;
8475 }
8476
8477 if (!osdmap->have_pg_pool(on.pool())) {
8478 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8479 continue;
8480 }
8481
8482 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8483
8484 // is it still ours?
8485 vector<int> up, acting;
8486 int up_primary = -1;
8487 int acting_primary = -1;
8488 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
8489 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
8490
8491 if (acting_primary != whoami) {
8492 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8493 << "), my role=" << role << ", skipping" << dendl;
8494 continue;
8495 }
8496
8497 spg_t pgid;
8498 bool mapped = osdmap->get_primary_shard(on, &pgid);
8499 assert(mapped);
8500
8501 PastIntervals pi(
8502 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8503 *osdmap);
8504 pg_history_t history;
8505 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
8506
8507 // The mon won't resend unless the primary changed, so
8508 // we ignore same_interval_since. We'll pass this history
8509 // to handle_pg_peering_evt with the current epoch as the
8510 // event -- the project_pg_history check in
8511 // handle_pg_peering_evt will be a noop.
8512 if (history.same_primary_since > m->epoch) {
8513 dout(10) << __func__ << ": got obsolete pg create on pgid "
8514 << pgid << " from epoch " << m->epoch
8515 << ", primary changed in " << history.same_primary_since
8516 << dendl;
8517 continue;
8518 }
8519
8520 if (handle_pg_peering_evt(
8521 pgid,
8522 history,
8523 pi,
8524 osdmap->get_epoch(),
8525 PG::CephPeeringEvtRef(
8526 new PG::CephPeeringEvt(
8527 osdmap->get_epoch(),
8528 osdmap->get_epoch(),
8529 PG::NullEvt()))
8530 ) == -EEXIST) {
8531 service.send_pg_created(pgid.pgid);
8532 }
8533 }
8534 last_pg_create_epoch = m->epoch;
8535
8536 maybe_update_heartbeat_peers();
8537 }
8538
8539
8540 // ----------------------------------------
8541 // peering and recovery
8542
8543 PG::RecoveryCtx OSD::create_context()
8544 {
8545 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8546 C_Contexts *on_applied = new C_Contexts(cct);
8547 C_Contexts *on_safe = new C_Contexts(cct);
8548 map<int, map<spg_t,pg_query_t> > *query_map =
8549 new map<int, map<spg_t, pg_query_t> >;
8550 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8551 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8552 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8553 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8554 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8555 on_applied, on_safe, t);
8556 return rctx;
8557 }
8558
8559 struct C_OpenPGs : public Context {
8560 set<PGRef> pgs;
8561 ObjectStore *store;
8562 OSD *osd;
8563 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8564 pgs.swap(p);
8565 }
8566 void finish(int r) override {
8567 RWLock::RLocker l(osd->pg_map_lock);
8568 for (auto p : pgs) {
8569 if (osd->pg_map.count(p->info.pgid)) {
8570 p->ch = store->open_collection(p->coll);
8571 assert(p->ch);
8572 }
8573 }
8574 }
8575 };
8576
8577 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
8578 ThreadPool::TPHandle *handle)
8579 {
8580 if (!ctx.transaction->empty()) {
8581 if (!ctx.created_pgs.empty()) {
8582 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8583 }
8584 int tr = store->queue_transaction(
8585 pg->osr.get(),
8586 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
8587 TrackedOpRef(), handle);
8588 delete (ctx.transaction);
8589 assert(tr == 0);
8590 ctx.transaction = new ObjectStore::Transaction;
8591 ctx.on_applied = new C_Contexts(cct);
8592 ctx.on_safe = new C_Contexts(cct);
8593 }
8594 }
8595
8596 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
8597 ThreadPool::TPHandle *handle)
8598 {
8599 if (service.get_osdmap()->is_up(whoami) &&
8600 is_active()) {
8601 do_notifies(*ctx.notify_list, curmap);
8602 do_queries(*ctx.query_map, curmap);
8603 do_infos(*ctx.info_map, curmap);
8604 }
8605 delete ctx.notify_list;
8606 delete ctx.query_map;
8607 delete ctx.info_map;
8608 if ((ctx.on_applied->empty() &&
8609 ctx.on_safe->empty() &&
8610 ctx.transaction->empty() &&
8611 ctx.created_pgs.empty()) || !pg) {
8612 delete ctx.transaction;
8613 delete ctx.on_applied;
8614 delete ctx.on_safe;
8615 assert(ctx.created_pgs.empty());
8616 } else {
8617 if (!ctx.created_pgs.empty()) {
8618 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8619 }
8620 int tr = store->queue_transaction(
8621 pg->osr.get(),
8622 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
8623 handle);
8624 delete (ctx.transaction);
8625 assert(tr == 0);
8626 }
8627 }
8628
8629 /** do_notifies
8630 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8631 * content for, and they are primary for.
8632 */
8633
8634 void OSD::do_notifies(
8635 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8636 OSDMapRef curmap)
8637 {
8638 for (map<int,
8639 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8640 notify_list.begin();
8641 it != notify_list.end();
8642 ++it) {
8643 if (!curmap->is_up(it->first)) {
8644 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8645 continue;
8646 }
8647 ConnectionRef con = service.get_con_osd_cluster(
8648 it->first, curmap->get_epoch());
8649 if (!con) {
8650 dout(20) << __func__ << " skipping osd." << it->first
8651 << " (NULL con)" << dendl;
8652 continue;
8653 }
8654 service.share_map_peer(it->first, con.get(), curmap);
8655 dout(7) << __func__ << " osd " << it->first
8656 << " on " << it->second.size() << " PGs" << dendl;
8657 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8658 it->second);
8659 con->send_message(m);
8660 }
8661 }
8662
8663
8664 /** do_queries
8665 * send out pending queries for info | summaries
8666 */
8667 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8668 OSDMapRef curmap)
8669 {
8670 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8671 pit != query_map.end();
8672 ++pit) {
8673 if (!curmap->is_up(pit->first)) {
8674 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8675 continue;
8676 }
8677 int who = pit->first;
8678 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8679 if (!con) {
8680 dout(20) << __func__ << " skipping osd." << who
8681 << " (NULL con)" << dendl;
8682 continue;
8683 }
8684 service.share_map_peer(who, con.get(), curmap);
8685 dout(7) << __func__ << " querying osd." << who
8686 << " on " << pit->second.size() << " PGs" << dendl;
8687 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8688 con->send_message(m);
8689 }
8690 }
8691
8692
8693 void OSD::do_infos(map<int,
8694 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8695 OSDMapRef curmap)
8696 {
8697 for (map<int,
8698 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
8699 info_map.begin();
8700 p != info_map.end();
8701 ++p) {
8702 if (!curmap->is_up(p->first)) {
8703 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
8704 continue;
8705 }
8706 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
8707 i != p->second.end();
8708 ++i) {
8709 dout(20) << __func__ << " sending info " << i->first.info
8710 << " to shard " << p->first << dendl;
8711 }
8712 ConnectionRef con = service.get_con_osd_cluster(
8713 p->first, curmap->get_epoch());
8714 if (!con) {
8715 dout(20) << __func__ << " skipping osd." << p->first
8716 << " (NULL con)" << dendl;
8717 continue;
8718 }
8719 service.share_map_peer(p->first, con.get(), curmap);
8720 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
8721 m->pg_list = p->second;
8722 con->send_message(m);
8723 }
8724 info_map.clear();
8725 }
8726
8727
8728 /** PGNotify
8729 * from non-primary to primary
8730 * includes pg_info_t.
8731 * NOTE: called with opqueue active.
8732 */
8733 void OSD::handle_pg_notify(OpRequestRef op)
8734 {
8735 const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
8736 assert(m->get_type() == MSG_OSD_PG_NOTIFY);
8737
8738 dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
8739 int from = m->get_source().num();
8740
8741 if (!require_osd_peer(op->get_req()))
8742 return;
8743
8744 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8745 return;
8746
8747 op->mark_started();
8748
8749 for (auto it = m->get_pg_list().begin();
8750 it != m->get_pg_list().end();
8751 ++it) {
8752 if (it->first.info.pgid.preferred() >= 0) {
8753 dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
8754 continue;
8755 }
8756
8757 handle_pg_peering_evt(
8758 spg_t(it->first.info.pgid.pgid, it->first.to),
8759 it->first.info.history, it->second,
8760 it->first.query_epoch,
8761 PG::CephPeeringEvtRef(
8762 new PG::CephPeeringEvt(
8763 it->first.epoch_sent, it->first.query_epoch,
8764 PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
8765 op->get_req()->get_connection()->get_features())))
8766 );
8767 }
8768 }
8769
8770 void OSD::handle_pg_log(OpRequestRef op)
8771 {
8772 MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
8773 assert(m->get_type() == MSG_OSD_PG_LOG);
8774 dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
8775
8776 if (!require_osd_peer(op->get_req()))
8777 return;
8778
8779 int from = m->get_source().num();
8780 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8781 return;
8782
8783 if (m->info.pgid.preferred() >= 0) {
8784 dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
8785 return;
8786 }
8787
8788 op->mark_started();
8789 handle_pg_peering_evt(
8790 spg_t(m->info.pgid.pgid, m->to),
8791 m->info.history, m->past_intervals, m->get_epoch(),
8792 PG::CephPeeringEvtRef(
8793 new PG::CephPeeringEvt(
8794 m->get_epoch(), m->get_query_epoch(),
8795 PG::MLogRec(pg_shard_t(from, m->from), m)))
8796 );
8797 }
8798
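/** PGInfo
 * wraps each pg_info in the message in an MInfoRec peering event and
 * hands it to handle_pg_peering_evt.
 */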
8799 void OSD::handle_pg_info(OpRequestRef op)
8800 {
8801 const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
8802 assert(m->get_type() == MSG_OSD_PG_INFO);
8803 dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
8804
8805 if (!require_osd_peer(op->get_req()))
8806 return;
8807
8808 int from = m->get_source().num();
8809 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8810 return;
8811
8812 op->mark_started();
8813
8814 for (auto p = m->pg_list.begin();
8815 p != m->pg_list.end();
8816 ++p) {
8817 if (p->first.info.pgid.preferred() >= 0) {
8818 dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
8819 continue;
8820 }
8821
8822 handle_pg_peering_evt(
8823 spg_t(p->first.info.pgid.pgid, p->first.to),
8824 p->first.info.history, p->second, p->first.epoch_sent,
8825 PG::CephPeeringEvtRef(
8826 new PG::CephPeeringEvt(
8827 p->first.epoch_sent, p->first.query_epoch,
8828 PG::MInfoRec(
8829 pg_shard_t(
8830 from, p->first.from), p->first.info, p->first.epoch_sent)))
8831 );
8832 }
8833 }
8834
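/** PGTrim
 * role depends on which side we are: on the primary the message carries a
 * replica's last_complete_ondisk (used to recompute the trim bound); on a
 * replica it is the primary instructing us to trim our copy of the log.
 */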
8835 void OSD::handle_pg_trim(OpRequestRef op)
8836 {
8837 const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
8838 assert(m->get_type() == MSG_OSD_PG_TRIM);
8839
8840 dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
8841
8842 if (!require_osd_peer(op->get_req()))
8843 return;
8844
8845 int from = m->get_source().num();
8846 if (!require_same_or_newer_map(op, m->epoch, false))
8847 return;
8848
8849 if (m->pgid.preferred() >= 0) {
8850 dout(10) << "ignoring localized pg " << m->pgid << dendl;
8851 return;
8852 }
8853
8854 op->mark_started();
8855
8856 PG *pg = _lookup_lock_pg(m->pgid);
8857 if (!pg) {
8858 dout(10) << " don't have pg " << m->pgid << dendl;
8859 return;
8860 }
8861
8862 if (m->epoch < pg->info.history.same_interval_since) {
8863 dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
8864 pg->unlock();
8865 return;
8866 }
8867
8868 if (pg->is_primary()) {
8869 // peer is informing us of their last_complete_ondisk
8870 dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
8871 pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
8872 m->trim_to;
8873 // trim log when the pg is recovered
8874 pg->calc_min_last_complete_ondisk();
8875 } else {
8876 // primary is instructing us to trim
8877 ObjectStore::Transaction t;
8878 pg->pg_log.trim(m->trim_to, pg->info);
8879 pg->dirty_info = true;
8880 pg->write_if_dirty(t);
8881 int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
8882 assert(tr == 0);
8883 }
8884 pg->unlock();
8885 }
8886
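/** BackfillReserve
 * translates MBackfillReserve REQUEST/GRANT/REJECT into the matching
 * peering event (RequestBackfillPrio, RemoteBackfillReserved,
 * RemoteReservationRejected) and queues it on the PG, parking the event in
 * peering_wait_for_split if the PG is mid-split.
 */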
8887 void OSD::handle_pg_backfill_reserve(OpRequestRef op)
8888 {
8889 const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
8890 assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);
8891
8892 if (!require_osd_peer(op->get_req()))
8893 return;
8894 if (!require_same_or_newer_map(op, m->query_epoch, false))
8895 return;
8896
8897 PG::CephPeeringEvtRef evt;
8898 if (m->type == MBackfillReserve::REQUEST) {
8899 evt = PG::CephPeeringEvtRef(
8900 new PG::CephPeeringEvt(
8901 m->query_epoch,
8902 m->query_epoch,
8903 PG::RequestBackfillPrio(m->priority)));
8904 } else if (m->type == MBackfillReserve::GRANT) {
8905 evt = PG::CephPeeringEvtRef(
8906 new PG::CephPeeringEvt(
8907 m->query_epoch,
8908 m->query_epoch,
8909 PG::RemoteBackfillReserved()));
8910 } else if (m->type == MBackfillReserve::REJECT) {
8911 evt = PG::CephPeeringEvtRef(
8912 new PG::CephPeeringEvt(
8913 m->query_epoch,
8914 m->query_epoch,
8915 PG::RemoteReservationRejected()));
8916 } else {
8917 ceph_abort();
8918 }
8919
8920 if (service.splitting(m->pgid)) {
8921 peering_wait_for_split[m->pgid].push_back(evt);
8922 return;
8923 }
8924
8925 PG *pg = _lookup_lock_pg(m->pgid);
8926 if (!pg) {
8927 dout(10) << " don't have pg " << m->pgid << dendl;
8928 return;
8929 }
8930
8931 pg->queue_peering_event(evt);
8932 pg->unlock();
8933 }
8934
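/** RecoveryReserve
 * translates MRecoveryReserve REQUEST/GRANT/RELEASE into the matching
 * peering event (RequestRecovery, RemoteRecoveryReserved, RecoveryDone)
 * and queues it on the PG, as with backfill reservations above.
 */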
8935 void OSD::handle_pg_recovery_reserve(OpRequestRef op)
8936 {
8937 const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
8938 assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);
8939
8940 if (!require_osd_peer(op->get_req()))
8941 return;
8942 if (!require_same_or_newer_map(op, m->query_epoch, false))
8943 return;
8944
8945 PG::CephPeeringEvtRef evt;
8946 if (m->type == MRecoveryReserve::REQUEST) {
8947 evt = PG::CephPeeringEvtRef(
8948 new PG::CephPeeringEvt(
8949 m->query_epoch,
8950 m->query_epoch,
8951 PG::RequestRecovery()));
8952 } else if (m->type == MRecoveryReserve::GRANT) {
8953 evt = PG::CephPeeringEvtRef(
8954 new PG::CephPeeringEvt(
8955 m->query_epoch,
8956 m->query_epoch,
8957 PG::RemoteRecoveryReserved()));
8958 } else if (m->type == MRecoveryReserve::RELEASE) {
8959 evt = PG::CephPeeringEvtRef(
8960 new PG::CephPeeringEvt(
8961 m->query_epoch,
8962 m->query_epoch,
8963 PG::RecoveryDone()));
8964 } else {
8965 ceph_abort();
8966 }
8967
8968 if (service.splitting(m->pgid)) {
8969 peering_wait_for_split[m->pgid].push_back(evt);
8970 return;
8971 }
8972
8973 PG *pg = _lookup_lock_pg(m->pgid);
8974 if (!pg) {
8975 dout(10) << " don't have pg " << m->pgid << dendl;
8976 return;
8977 }
8978
8979 pg->queue_peering_event(evt);
8980 pg->unlock();
8981 }
8982
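/** ForceRecovery
 * looks up the locally present primary shards of the PGs named in the
 * message and asks the service to adjust their forced recovery/backfill
 * state accordingly.
 */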
8983 void OSD::handle_force_recovery(Message *m)
8984 {
8985 MOSDForceRecovery *msg = static_cast<MOSDForceRecovery*>(m);
8986 assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY);
8987
8988 vector<PGRef> local_pgs;
8989 local_pgs.reserve(msg->forced_pgs.size());
8990
8991 {
8992 RWLock::RLocker l(pg_map_lock);
8993 for (auto& i : msg->forced_pgs) {
8994 spg_t locpg;
8995 if (osdmap->get_primary_shard(i, &locpg)) {
8996 auto pg_map_entry = pg_map.find(locpg);
8997 if (pg_map_entry != pg_map.end()) {
8998 local_pgs.push_back(pg_map_entry->second);
8999 }
9000 }
9001 }
9002 }
9003
9004 if (local_pgs.size()) {
9005 service.adjust_pg_priorities(local_pgs, msg->options);
9006 }
9007
9008 msg->put();
9009 }
9010
9011 /** PGQuery
9012 * from primary to replica | stray
9013 * NOTE: called with opqueue active.
9014 */
9015 void OSD::handle_pg_query(OpRequestRef op)
9016 {
9017 assert(osd_lock.is_locked());
9018
9019 const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
9020 assert(m->get_type() == MSG_OSD_PG_QUERY);
9021
9022 if (!require_osd_peer(op->get_req()))
9023 return;
9024
9025 dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
9026 int from = m->get_source().num();
9027
9028 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9029 return;
9030
9031 op->mark_started();
9032
9033 map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
9034
9035 for (auto it = m->pg_list.begin();
9036 it != m->pg_list.end();
9037 ++it) {
9038 spg_t pgid = it->first;
9039
9040 if (pgid.preferred() >= 0) {
9041 dout(10) << "ignoring localized pg " << pgid << dendl;
9042 continue;
9043 }
9044
9045 if (service.splitting(pgid)) {
9046 peering_wait_for_split[pgid].push_back(
9047 PG::CephPeeringEvtRef(
9048 new PG::CephPeeringEvt(
9049 it->second.epoch_sent, it->second.epoch_sent,
9050 PG::MQuery(pg_shard_t(from, it->second.from),
9051 it->second, it->second.epoch_sent))));
9052 continue;
9053 }
9054
9055 {
9056 RWLock::RLocker l(pg_map_lock);
9057 if (pg_map.count(pgid)) {
9058 PG *pg = 0;
9059 pg = _lookup_lock_pg_with_map_lock_held(pgid);
9060 pg->queue_query(
9061 it->second.epoch_sent, it->second.epoch_sent,
9062 pg_shard_t(from, it->second.from), it->second);
9063 pg->unlock();
9064 continue;
9065 }
9066 }
9067
9068 if (!osdmap->have_pg_pool(pgid.pool()))
9069 continue;
9070
9071 // get active crush mapping
9072 int up_primary, acting_primary;
9073 vector<int> up, acting;
9074 osdmap->pg_to_up_acting_osds(
9075 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9076
9077 // same primary?
9078 pg_history_t history = it->second.history;
9079 bool valid_history = project_pg_history(
9080 pgid, history, it->second.epoch_sent,
9081 up, up_primary, acting, acting_primary);
9082
9083 if (!valid_history ||
9084 it->second.epoch_sent < history.same_interval_since) {
9085 dout(10) << " pg " << pgid << " dne, and pg has changed in "
9086 << history.same_interval_since
9087 << " (msg from " << it->second.epoch_sent << ")" << dendl;
9088 continue;
9089 }
9090
9091 dout(10) << " pg " << pgid << " dne" << dendl;
9092 pg_info_t empty(spg_t(pgid.pgid, it->second.to));
9093 /* This is racy, but that should be ok: if we complete the deletion
9094 * before the pg is recreated, we'll just start it off backfilling
9095 * instead of just empty */
9096 if (service.deleting_pgs.lookup(pgid))
9097 empty.set_last_backfill(hobject_t());
9098 if (it->second.type == pg_query_t::LOG ||
9099 it->second.type == pg_query_t::FULLLOG) {
9100 ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
9101 if (con) {
9102 MOSDPGLog *mlog = new MOSDPGLog(
9103 it->second.from, it->second.to,
9104 osdmap->get_epoch(), empty,
9105 it->second.epoch_sent);
9106 service.share_map_peer(from, con.get(), osdmap);
9107 con->send_message(mlog);
9108 }
9109 } else {
9110 notify_list[from].push_back(
9111 make_pair(
9112 pg_notify_t(
9113 it->second.from, it->second.to,
9114 it->second.epoch_sent,
9115 osdmap->get_epoch(),
9116 empty),
9117 PastIntervals(
9118 osdmap->get_pools().at(pgid.pool()).ec_pool(),
9119 *osdmap)));
9120 }
9121 }
9122 do_notifies(notify_list, osdmap);
9123 }
9124
9125
9126 void OSD::handle_pg_remove(OpRequestRef op)
9127 {
9128 const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
9129 assert(m->get_type() == MSG_OSD_PG_REMOVE);
9130 assert(osd_lock.is_locked());
9131
9132 if (!require_osd_peer(op->get_req()))
9133 return;
9134
9135 dout(7) << "handle_pg_remove from " << m->get_source() << " on "
9136 << m->pg_list.size() << " pgs" << dendl;
9137
9138 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9139 return;
9140
9141 op->mark_started();
9142
9143 for (auto it = m->pg_list.begin();
9144 it != m->pg_list.end();
9145 ++it) {
9146 spg_t pgid = *it;
9147 if (pgid.preferred() >= 0) {
9148 dout(10) << "ignoring localized pg " << pgid << dendl;
9149 continue;
9150 }
9151
9152 RWLock::WLocker l(pg_map_lock);
9153 if (pg_map.count(pgid) == 0) {
9154 dout(10) << " don't have pg " << pgid << dendl;
9155 continue;
9156 }
9157 dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
9158 PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
9159 pg_history_t history = pg->info.history;
9160 int up_primary, acting_primary;
9161 vector<int> up, acting;
9162 osdmap->pg_to_up_acting_osds(
9163 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9164 bool valid_history = project_pg_history(
9165 pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
9166 up, up_primary, acting, acting_primary);
9167 if (valid_history &&
9168 history.same_interval_since <= m->get_epoch()) {
9169 assert(pg->get_primary().osd == m->get_source().num());
9170 PGRef _pg(pg);
9171 _remove_pg(pg);
9172 pg->unlock();
9173 } else {
9174 dout(10) << *pg << " ignoring remove request, pg changed in epoch "
9175 << history.same_interval_since
9176 << " > " << m->get_epoch() << dendl;
9177 pg->unlock();
9178 }
9179 }
9180 }
9181
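/* Tear down a PG being removed: queue the on_removal transaction, hand the
 * PG to remove_wq for asynchronous deletion, and drop it from pg_map and
 * the op queue. Called from handle_pg_remove with the pg lock and
 * pg_map_lock (write) held.
 */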
9182 void OSD::_remove_pg(PG *pg)
9183 {
9184 ObjectStore::Transaction rmt;
9185
9186 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
9187 // the pg_map must be done together without unlocking the pg lock,
9188 // to avoid racing with watcher cleanup in ms_handle_reset
9189 // and handle_notify_timeout
9190 pg->on_removal(&rmt);
9191
9192 service.cancel_pending_splits_for_parent(pg->info.pgid);
9193 int tr = store->queue_transaction(
9194 pg->osr.get(), std::move(rmt), NULL,
9195 new ContainerContext<
9196 SequencerRef>(pg->osr));
9197 assert(tr == 0);
9198
9199 DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
9200 pg->info.pgid,
9201 make_pair(
9202 pg->info.pgid,
9203 PGRef(pg))
9204 );
9205 remove_wq.queue(make_pair(PGRef(pg), deleting));
9206
9207 service.pg_remove_epoch(pg->info.pgid);
9208
9209 // dereference from op_wq
9210 op_shardedwq.clear_pg_pointer(pg->info.pgid);
9211
9212 // remove from map
9213 pg_map.erase(pg->info.pgid);
9214 pg->put("PGMap"); // since we've taken it out of map
9215 }
9216
9217
9218 // =========================================================
9219 // RECOVERY
9220
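// Drain awaiting_throttle while _recover_now reports available push
// capacity, starting at most osd_recovery_max_single_start ops per PG and
// accounting for them in recovery_ops_reserved.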
9221 void OSDService::_maybe_queue_recovery() {
9222 assert(recovery_lock.is_locked_by_me());
9223 uint64_t available_pushes;
9224 while (!awaiting_throttle.empty() &&
9225 _recover_now(&available_pushes)) {
9226 uint64_t to_start = MIN(
9227 available_pushes,
9228 cct->_conf->osd_recovery_max_single_start);
9229 _queue_for_recovery(awaiting_throttle.front(), to_start);
9230 awaiting_throttle.pop_front();
9231 recovery_ops_reserved += to_start;
9232 }
9233 }
9234
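/* Decide whether more recovery ops may start now. Recovery is gated on
 * defer_recovery_until and recovery_paused; otherwise the capacity
 * reported via available_pushes is
 *
 *   osd_recovery_max_active - recovery_ops_active - recovery_ops_reserved
 *
 * e.g. with osd_recovery_max_active = 3, one op active and one reserved,
 * *available_pushes comes out as 1 (illustrative numbers).
 */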
9235 bool OSDService::_recover_now(uint64_t *available_pushes)
9236 {
9237 if (available_pushes)
9238 *available_pushes = 0;
9239
9240 if (ceph_clock_now() < defer_recovery_until) {
9241 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9242 return false;
9243 }
9244
9245 if (recovery_paused) {
9246 dout(15) << __func__ << " paused" << dendl;
9247 return false;
9248 }
9249
9250 uint64_t max = cct->_conf->osd_recovery_max_active;
9251 if (max <= recovery_ops_active + recovery_ops_reserved) {
9252 dout(15) << __func__ << " active " << recovery_ops_active
9253 << " + reserved " << recovery_ops_reserved
9254 << " >= max " << max << dendl;
9255 return false;
9256 }
9257
9258 if (available_pushes)
9259 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9260
9261 return true;
9262 }
9263
9264
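/* Map OFR_BACKFILL/OFR_RECOVERY onto PG_STATE_FORCED_BACKFILL/
 * PG_STATE_FORCED_RECOVERY and set (or, with OFR_CANCEL, clear) that
 * state on each of the given PGs.
 */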
9265 void OSDService::adjust_pg_priorities(const vector<PGRef>& pgs, int newflags)
9266 {
9267 if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY)))
9268 return;
9269 int newstate = 0;
9270
9271 if (newflags & OFR_BACKFILL) {
9272 newstate = PG_STATE_FORCED_BACKFILL;
9273 } else if (newflags & OFR_RECOVERY) {
9274 newstate = PG_STATE_FORCED_RECOVERY;
9275 }
9276
9277 // this debug output may get large; don't generate it unless the debug
9278 // level is at least 10, and use abbreviated pg ids when we do
9279 if (cct->_conf->subsys.should_gather(ceph_subsys_osd, 10)) {
9280 stringstream ss;
9281
9282 for (auto& i : pgs) {
9283 ss << i->get_pgid() << " ";
9284 }
9285
9286 dout(10) << __func__ << " working on " << ss.str() << dendl;
9287 }
9288
9289 if (newflags & OFR_CANCEL) {
9290 for (auto& i : pgs) {
9291 i->lock();
9292 i->_change_recovery_force_mode(newstate, true);
9293 i->unlock();
9294 }
9295 } else {
9296 for (auto& i : pgs) {
9297 // make sure the PG is in the correct state before forcing backfill or
9298 // recovery; otherwise the PG would keep the FORCE_* flag forever, and
9299 // clearing it would require an OSD restart or another forced recovery/backfill
9300 i->lock();
9301 int pgstate = i->get_state();
9302 if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
9303 ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL))) )
9304 i->_change_recovery_force_mode(newstate, false);
9305 i->unlock();
9306 }
9307 }
9308 }
9309
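/* Worker-thread entry point for one batch of recovery on a PG: either
 * re-queue the PG after an osd_recovery_sleep delay, or start up to
 * reserved_pushes recovery ops; if nothing could start and objects are
 * still unfound, try to discover more locations before releasing the
 * reserved pushes.
 */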
9310 void OSD::do_recovery(
9311 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9312 ThreadPool::TPHandle &handle)
9313 {
9314 uint64_t started = 0;
9315
9316 /*
9317 * When the value of osd_recovery_sleep is set greater than zero, recovery
9318 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9319 * recovery event's schedule time. This is done by adding a
9320 * recovery_requeue_callback event, which re-queues the recovery op using
9321 * queue_recovery_after_sleep.
9322 */
9323 float recovery_sleep = get_osd_recovery_sleep();
9324 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
9325 PGRef pgref(pg);
9326 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9327 dout(20) << "do_recovery wake up at "
9328 << ceph_clock_now()
9329 << ", re-queuing recovery" << dendl;
9330 service.recovery_needs_sleep = false;
9331 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9332 });
9333 Mutex::Locker l(service.recovery_sleep_lock);
9334
9335 // This is true for the first recovery op, and also when the previous
9336 // recovery op was scheduled in the past; in that case the next recovery
9337 // op is scheduled to run after sleeping from now.
9338 if (service.recovery_schedule_time < ceph_clock_now()) {
9339 service.recovery_schedule_time = ceph_clock_now();
9340 }
9341 service.recovery_schedule_time += recovery_sleep;
9342 service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
9343 recovery_requeue_callback);
9344 dout(20) << "Recovery event scheduled at "
9345 << service.recovery_schedule_time << dendl;
9346 return;
9347 }
9348
9349 {
9350 service.recovery_needs_sleep = true;
9351 if (pg->pg_has_reset_since(queued)) {
9352 goto out;
9353 }
9354
9355 assert(!pg->deleting);
9356 assert(pg->is_peered() && pg->is_primary());
9357
9358 assert(pg->recovery_queued);
9359 pg->recovery_queued = false;
9360
9361 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9362 #ifdef DEBUG_RECOVERY_OIDS
9363 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
9364 #endif
9365
9366 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
9367 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9368 << " on " << *pg << dendl;
9369
9370 // If no recovery op was started and no unfound discovery is needed, skip the RecoveryCtx
9371 if (!started && (more || !pg->have_unfound())) {
9372 goto out;
9373 }
9374
9375 PG::RecoveryCtx rctx = create_context();
9376 rctx.handle = &handle;
9377
9378 /*
9379 * if we couldn't start any recovery ops and things are still
9380 * unfound, see if we can discover more missing object locations.
9381 * It may be that our initial locations were bad and we errored
9382 * out while trying to pull.
9383 */
9384 if (!more && pg->have_unfound()) {
9385 pg->discover_all_missing(*rctx.query_map);
9386 if (rctx.query_map->empty()) {
9387 string action;
9388 if (pg->state_test(PG_STATE_BACKFILL)) {
9389 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9390 queued,
9391 queued,
9392 PG::CancelBackfill()));
9393 pg->queue_peering_event(evt);
9394 action = "in backfill";
9395 } else if (pg->state_test(PG_STATE_RECOVERING)) {
9396 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9397 queued,
9398 queued,
9399 PG::CancelRecovery()));
9400 pg->queue_peering_event(evt);
9401 action = "in recovery";
9402 } else {
9403 action = "already out of recovery/backfill";
9404 }
9405 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
9406 } else {
9407 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
9408 pg->queue_recovery();
9409 }
9410 }
9411
9412 pg->write_if_dirty(*rctx.transaction);
9413 OSDMapRef curmap = pg->get_osdmap();
9414 dispatch_context(rctx, pg, curmap);
9415 }
9416
9417 out:
9418 assert(started <= reserved_pushes);
9419 service.release_reserved_pushes(reserved_pushes);
9420 }
9421
9422 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9423 {
9424 Mutex::Locker l(recovery_lock);
9425 dout(10) << "start_recovery_op " << *pg << " " << soid
9426 << " (" << recovery_ops_active << "/"
9427 << cct->_conf->osd_recovery_max_active << " rops)"
9428 << dendl;
9429 recovery_ops_active++;
9430
9431 #ifdef DEBUG_RECOVERY_OIDS
9432 dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
9433 assert(recovery_oids[pg->info.pgid].count(soid) == 0);
9434 recovery_oids[pg->info.pgid].insert(soid);
9435 #endif
9436 }
9437
9438 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9439 {
9440 Mutex::Locker l(recovery_lock);
9441 dout(10) << "finish_recovery_op " << *pg << " " << soid
9442 << " dequeue=" << dequeue
9443 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9444 << dendl;
9445
9446 // adjust count
9447 assert(recovery_ops_active > 0);
9448 recovery_ops_active--;
9449
9450 #ifdef DEBUG_RECOVERY_OIDS
9451 dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
9452 assert(recovery_oids[pg->info.pgid].count(soid));
9453 recovery_oids[pg->info.pgid].erase(soid);
9454 #endif
9455
9456 _maybe_queue_recovery();
9457 }
9458
9459 bool OSDService::is_recovery_active()
9460 {
9461 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9462 }
9463
9464 // =========================================================
9465 // OPS
9466
9467 bool OSD::op_is_discardable(const MOSDOp *op)
9468 {
9469 // drop the client request if the client is no longer connected and
9470 // can't get the reply anyway.
9471 if (!op->get_connection()->is_connected()) {
9472 return true;
9473 }
9474 return false;
9475 }
9476
9477 void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
9478 {
9479 utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
9480 dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
9481 << " cost " << op->get_req()->get_cost()
9482 << " latency " << latency
9483 << " epoch " << epoch
9484 << " " << *(op->get_req()) << dendl;
9485 op->osd_trace.event("enqueue op");
9486 op->osd_trace.keyval("priority", op->get_req()->get_priority());
9487 op->osd_trace.keyval("cost", op->get_req()->get_cost());
9488 op->mark_queued_for_pg();
9489 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9490 op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
9491 }
9492
9493
9494
9495 /*
9496 * NOTE: dequeue called in worker thread, with pg lock
9497 */
9498 void OSD::dequeue_op(
9499 PGRef pg, OpRequestRef op,
9500 ThreadPool::TPHandle &handle)
9501 {
9502 FUNCTRACE();
9503 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9504
9505 utime_t now = ceph_clock_now();
9506 op->set_dequeued_time(now);
9507 utime_t latency = now - op->get_req()->get_recv_stamp();
9508 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9509 << " cost " << op->get_req()->get_cost()
9510 << " latency " << latency
9511 << " " << *(op->get_req())
9512 << " pg " << *pg << dendl;
9513
9514 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9515
9516 Session *session = static_cast<Session *>(
9517 op->get_req()->get_connection()->get_priv());
9518 if (session) {
9519 maybe_share_map(session, op, pg->get_osdmap());
9520 session->put();
9521 }
9522
9523 if (pg->deleting)
9524 return;
9525
9526 op->mark_reached_pg();
9527 op->osd_trace.event("dequeue_op");
9528
9529 pg->do_request(op, handle);
9530
9531 // finish
9532 dout(10) << "dequeue_op " << op << " finish" << dendl;
9533 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
9534 }
9535
9536
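/* Completion run once a peering transaction carrying split children has
 * been applied: registers each newly split PG with the OSD, marks the
 * split complete, and wakes any ops waiting on those PGs.
 */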
9537 struct C_CompleteSplits : public Context {
9538 OSD *osd;
9539 set<PGRef> pgs;
9540 C_CompleteSplits(OSD *osd, const set<PGRef> &in)
9541 : osd(osd), pgs(in) {}
9542 void finish(int r) override {
9543 Mutex::Locker l(osd->osd_lock);
9544 if (osd->is_stopping())
9545 return;
9546 PG::RecoveryCtx rctx = osd->create_context();
9547 for (set<PGRef>::iterator i = pgs.begin();
9548 i != pgs.end();
9549 ++i) {
9550 osd->pg_map_lock.get_write();
9551 (*i)->lock();
9552 PG *pg = i->get();
9553 osd->add_newly_split_pg(pg, &rctx);
9554 if (!((*i)->deleting)) {
9555 set<spg_t> to_complete;
9556 to_complete.insert((*i)->info.pgid);
9557 osd->service.complete_split(to_complete);
9558 }
9559 osd->pg_map_lock.put_write();
9560 osd->dispatch_context_transaction(rctx, pg);
9561 osd->wake_pg_waiters(*i);
9562 (*i)->unlock();
9563 }
9564
9565 osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
9566 }
9567 };
9568
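/* Batch handler for the peering workqueue: advance each PG to the current
 * map and handle one queued peering event per PG, accumulating the results
 * in a shared RecoveryCtx that is dispatched at the end.
 */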
9569 void OSD::process_peering_events(
9570 const list<PG*> &pgs,
9571 ThreadPool::TPHandle &handle
9572 )
9573 {
9574 bool need_up_thru = false;
9575 epoch_t same_interval_since = 0;
9576 OSDMapRef curmap;
9577 PG::RecoveryCtx rctx = create_context();
9578 rctx.handle = &handle;
9579 for (list<PG*>::const_iterator i = pgs.begin();
9580 i != pgs.end();
9581 ++i) {
9582 set<PGRef> split_pgs;
9583 PG *pg = *i;
9584 pg->lock_suspend_timeout(handle);
9585 curmap = service.get_osdmap();
9586 if (pg->deleting) {
9587 pg->unlock();
9588 continue;
9589 }
9590 if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
9591 // we need to requeue the PG explicitly since we didn't actually
9592 // handle an event
9593 peering_wq.queue(pg);
9594 } else {
9595 assert(!pg->peering_queue.empty());
9596 PG::CephPeeringEvtRef evt = pg->peering_queue.front();
9597 pg->peering_queue.pop_front();
9598 pg->handle_peering_event(evt, &rctx);
9599 }
9600 need_up_thru = pg->need_up_thru || need_up_thru;
9601 same_interval_since = MAX(pg->info.history.same_interval_since,
9602 same_interval_since);
9603 pg->write_if_dirty(*rctx.transaction);
9604 if (!split_pgs.empty()) {
9605 rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
9606 split_pgs.clear();
9607 }
9608 dispatch_context_transaction(rctx, pg, &handle);
9609 pg->unlock();
9610 }
9611 if (need_up_thru)
9612 queue_want_up_thru(same_interval_since);
9613 dispatch_context(rctx, 0, curmap, &handle);
9614
9615 service.send_pg_temp();
9616 }
9617
9618 // --------------------------------
9619
9620 const char** OSD::get_tracked_conf_keys() const
9621 {
9622 static const char* KEYS[] = {
9623 "osd_max_backfills",
9624 "osd_min_recovery_priority",
9625 "osd_max_trimming_pgs",
9626 "osd_op_complaint_time",
9627 "osd_op_log_threshold",
9628 "osd_op_history_size",
9629 "osd_op_history_duration",
9630 "osd_op_history_slow_op_size",
9631 "osd_op_history_slow_op_threshold",
9632 "osd_enable_op_tracker",
9633 "osd_map_cache_size",
9634 "osd_map_max_advance",
9635 "osd_pg_epoch_persisted_max_stale",
9636 "osd_disk_thread_ioprio_class",
9637 "osd_disk_thread_ioprio_priority",
9638 // clog & admin clog
9639 "clog_to_monitors",
9640 "clog_to_syslog",
9641 "clog_to_syslog_facility",
9642 "clog_to_syslog_level",
9643 "osd_objectstore_fuse",
9644 "clog_to_graylog",
9645 "clog_to_graylog_host",
9646 "clog_to_graylog_port",
9647 "host",
9648 "fsid",
9649 "osd_recovery_delay_start",
9650 "osd_client_message_size_cap",
9651 "osd_client_message_cap",
9652 "osd_heartbeat_min_size",
9653 "osd_heartbeat_interval",
9654 NULL
9655 };
9656 return KEYS;
9657 }
9658
9659 void OSD::handle_conf_change(const struct md_config_t *conf,
9660 const std::set <std::string> &changed)
9661 {
9662 if (changed.count("osd_max_backfills")) {
9663 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9664 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9665 }
9666 if (changed.count("osd_min_recovery_priority")) {
9667 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9668 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9669 }
9670 if (changed.count("osd_max_trimming_pgs")) {
9671 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9672 }
9673 if (changed.count("osd_op_complaint_time") ||
9674 changed.count("osd_op_log_threshold")) {
9675 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9676 cct->_conf->osd_op_log_threshold);
9677 }
9678 if (changed.count("osd_op_history_size") ||
9679 changed.count("osd_op_history_duration")) {
9680 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9681 cct->_conf->osd_op_history_duration);
9682 }
9683 if (changed.count("osd_op_history_slow_op_size") ||
9684 changed.count("osd_op_history_slow_op_threshold")) {
9685 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9686 cct->_conf->osd_op_history_slow_op_threshold);
9687 }
9688 if (changed.count("osd_enable_op_tracker")) {
9689 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9690 }
9691 if (changed.count("osd_disk_thread_ioprio_class") ||
9692 changed.count("osd_disk_thread_ioprio_priority")) {
9693 set_disk_tp_priority();
9694 }
9695 if (changed.count("osd_map_cache_size")) {
9696 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9697 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9698 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9699 }
9700 if (changed.count("clog_to_monitors") ||
9701 changed.count("clog_to_syslog") ||
9702 changed.count("clog_to_syslog_level") ||
9703 changed.count("clog_to_syslog_facility") ||
9704 changed.count("clog_to_graylog") ||
9705 changed.count("clog_to_graylog_host") ||
9706 changed.count("clog_to_graylog_port") ||
9707 changed.count("host") ||
9708 changed.count("fsid")) {
9709 update_log_config();
9710 }
9711
9712 #ifdef HAVE_LIBFUSE
9713 if (changed.count("osd_objectstore_fuse")) {
9714 if (store) {
9715 enable_disable_fuse(false);
9716 }
9717 }
9718 #endif
9719
9720 if (changed.count("osd_recovery_delay_start")) {
9721 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9722 service.kick_recovery_queue();
9723 }
9724
9725 if (changed.count("osd_client_message_cap")) {
9726 uint64_t newval = cct->_conf->osd_client_message_cap;
9727 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9728 if (pol.throttler_messages && newval > 0) {
9729 pol.throttler_messages->reset_max(newval);
9730 }
9731 }
9732 if (changed.count("osd_client_message_size_cap")) {
9733 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9734 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9735 if (pol.throttler_bytes && newval > 0) {
9736 pol.throttler_bytes->reset_max(newval);
9737 }
9738 }
9739
9740 check_config();
9741 }
9742
9743 void OSD::update_log_config()
9744 {
9745 map<string,string> log_to_monitors;
9746 map<string,string> log_to_syslog;
9747 map<string,string> log_channel;
9748 map<string,string> log_prio;
9749 map<string,string> log_to_graylog;
9750 map<string,string> log_to_graylog_host;
9751 map<string,string> log_to_graylog_port;
9752 uuid_d fsid;
9753 string host;
9754
9755 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9756 log_channel, log_prio, log_to_graylog,
9757 log_to_graylog_host, log_to_graylog_port,
9758 fsid, host) == 0)
9759 clog->update_config(log_to_monitors, log_to_syslog,
9760 log_channel, log_prio, log_to_graylog,
9761 log_to_graylog_host, log_to_graylog_port,
9762 fsid, host);
9763 derr << "log_to_monitors " << log_to_monitors << dendl;
9764 }
9765
9766 void OSD::check_config()
9767 {
9768 // some sanity checks
9769 if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
9770 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9771 << " is not > osd_map_max_advance ("
9772 << cct->_conf->osd_map_max_advance << ")";
9773 }
9774 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9775 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9776 << " is not > osd_pg_epoch_persisted_max_stale ("
9777 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
9778 }
9779 }
9780
9781 void OSD::set_disk_tp_priority()
9782 {
9783 dout(10) << __func__
9784 << " class " << cct->_conf->osd_disk_thread_ioprio_class
9785 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
9786 << dendl;
9787 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
9788 cct->_conf->osd_disk_thread_ioprio_priority < 0)
9789 return;
9790 int cls =
9791 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
9792 if (cls < 0)
9793 derr << __func__ << cpp_strerror(cls) << ": "
9794 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
9795 << " but only the following values are allowed: idle, be or rt" << dendl;
9796 else
9797 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
9798 }
9799
9800 // --------------------------------
9801
9802 void OSD::get_latest_osdmap()
9803 {
9804 dout(10) << __func__ << " -- start" << dendl;
9805
9806 C_SaferCond cond;
9807 service.objecter->wait_for_latest_osdmap(&cond);
9808 cond.wait();
9809
9810 dout(10) << __func__ << " -- finish" << dendl;
9811 }
9812
9813 // --------------------------------
9814
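/* Derive op->rmw_flags (read/write/cache/pg-op/promote, ...) from the op
 * codes and any called class methods; returns -EINVAL if no flags were
 * set, 0 otherwise.
 */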
9815 int OSD::init_op_flags(OpRequestRef& op)
9816 {
9817 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
9818 vector<OSDOp>::const_iterator iter;
9819
9820 // client flags have no bearing on whether an op is a read, write, etc.
9821 op->rmw_flags = 0;
9822
9823 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
9824 op->set_force_rwordered();
9825 }
9826
9827 // set bits based on op codes, called methods.
9828 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
9829 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
9830 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
9831 /* This is a bit odd. PING isn't actually a write. It can't
9832 * result in an update to the object_info. PINGs also aren't
9833 * resent, so there's no reason to write out a log entry.
9834 *
9835 * However, we pipeline them behind writes, so let's force
9836 * the write_ordered flag.
9837 */
9838 op->set_force_rwordered();
9839 } else {
9840 if (ceph_osd_op_mode_modify(iter->op.op))
9841 op->set_write();
9842 }
9843 if (ceph_osd_op_mode_read(iter->op.op))
9844 op->set_read();
9845
9846 // set READ flag if there are src_oids
9847 if (iter->soid.oid.name.length())
9848 op->set_read();
9849
9850 // set PGOP flag if there are PG ops
9851 if (ceph_osd_op_type_pg(iter->op.op))
9852 op->set_pg_op();
9853
9854 if (ceph_osd_op_mode_cache(iter->op.op))
9855 op->set_cache();
9856
9857 // check for ec base pool
9858 int64_t poolid = m->get_pg().pool();
9859 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
9860 if (pool && pool->is_tier()) {
9861 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
9862 if (base_pool && base_pool->require_rollback()) {
9863 if ((iter->op.op != CEPH_OSD_OP_READ) &&
9864 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
9865 (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
9866 (iter->op.op != CEPH_OSD_OP_STAT) &&
9867 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
9868 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
9869 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
9870 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
9871 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
9872 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
9873 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
9874 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
9875 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
9876 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
9877 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
9878 (iter->op.op != CEPH_OSD_OP_CREATE) &&
9879 (iter->op.op != CEPH_OSD_OP_DELETE) &&
9880 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
9881 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
9882 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
9883 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
9884 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
9885 op->set_promote();
9886 }
9887 }
9888 }
9889
9890 switch (iter->op.op) {
9891 case CEPH_OSD_OP_CALL:
9892 {
9893 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
9894 int is_write, is_read;
9895 string cname, mname;
9896 bp.copy(iter->op.cls.class_len, cname);
9897 bp.copy(iter->op.cls.method_len, mname);
9898
9899 ClassHandler::ClassData *cls;
9900 int r = class_handler->open_class(cname, &cls);
9901 if (r) {
9902 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
9903 if (r == -ENOENT)
9904 r = -EOPNOTSUPP;
9905 else if (r != -EPERM) // propagate permission errors
9906 r = -EIO;
9907 return r;
9908 }
9909 int flags = cls->get_method_flags(mname.c_str());
9910 if (flags < 0) {
9911 if (flags == -ENOENT)
9912 r = -EOPNOTSUPP;
9913 else
9914 r = flags;
9915 return r;
9916 }
9917 is_read = flags & CLS_METHOD_RD;
9918 is_write = flags & CLS_METHOD_WR;
9919 bool is_promote = flags & CLS_METHOD_PROMOTE;
9920
9921 dout(10) << "class " << cname << " method " << mname << " "
9922 << "flags=" << (is_read ? "r" : "")
9923 << (is_write ? "w" : "")
9924 << (is_promote ? "p" : "")
9925 << dendl;
9926 if (is_read)
9927 op->set_class_read();
9928 if (is_write)
9929 op->set_class_write();
9930 if (is_promote)
9931 op->set_promote();
9932 op->add_class(cname, is_read, is_write, cls->whitelisted);
9933 break;
9934 }
9935
9936 case CEPH_OSD_OP_WATCH:
9937 // force the read bit for watch since it depends on previous
9938 // watch state (and may return early if the watch exists) or, in
9939 // the case of ping, is simply a read op.
9940 op->set_read();
9941 // fall through
9942 case CEPH_OSD_OP_NOTIFY:
9943 case CEPH_OSD_OP_NOTIFY_ACK:
9944 {
9945 op->set_promote();
9946 break;
9947 }
9948
9949 case CEPH_OSD_OP_DELETE:
9950 // if we get a delete with FAILOK we can skip handle cache. without
9951 // FAILOK we still need to promote (or do something smarter) to
9952 // determine whether to return ENOENT or 0.
9953 if (iter == m->ops.begin() &&
9954 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
9955 op->set_skip_handle_cache();
9956 }
9957 // skip promotion when proxying a delete op
9958 if (m->ops.size() == 1) {
9959 op->set_skip_promote();
9960 }
9961 break;
9962
9963 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
9964 case CEPH_OSD_OP_CACHE_FLUSH:
9965 case CEPH_OSD_OP_CACHE_EVICT:
9966 // If try_flush/flush/evict is the only op, can skip handle cache.
9967 if (m->ops.size() == 1) {
9968 op->set_skip_handle_cache();
9969 }
9970 break;
9971
9972 case CEPH_OSD_OP_READ:
9973 case CEPH_OSD_OP_SYNC_READ:
9974 case CEPH_OSD_OP_SPARSE_READ:
9975 case CEPH_OSD_OP_CHECKSUM:
9976 case CEPH_OSD_OP_WRITEFULL:
9977 if (m->ops.size() == 1 &&
9978 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
9979 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
9980 op->set_skip_promote();
9981 }
9982 break;
9983
9984 // force promotion when pin an object in cache tier
9985 case CEPH_OSD_OP_CACHE_PIN:
9986 op->set_promote();
9987 break;
9988
9989 default:
9990 break;
9991 }
9992 }
9993
9994 if (op->rmw_flags == 0)
9995 return -EINVAL;
9996
9997 return 0;
9998 }
9999
10000 void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
10001 for (list<PG*>::iterator i = peering_queue.begin();
10002 i != peering_queue.end() &&
10003 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
10004 ) {
10005 if (in_use.count(*i)) {
10006 ++i;
10007 } else {
10008 out->push_back(*i);
10009 peering_queue.erase(i++);
10010 }
10011 }
10012 in_use.insert(out->begin(), out->end());
10013 }
10014
10015
10016 // =============================================================
10017
10018 #undef dout_context
10019 #define dout_context osd->cct
10020 #undef dout_prefix
10021 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10022
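/* Requeue, at the front of the shard queue, everything parked in the pg's
 * slot (releasing any pushes those items had reserved), clear
 * waiting_for_pg, and bump requeue_seq so that racing _process calls can
 * detect the requeue.
 */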
10023 void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
10024 {
10025 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10026 auto sdata = shard_list[shard_index];
10027 bool queued = false;
10028 unsigned pushes_to_free = 0;
10029 {
10030 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10031 auto p = sdata->pg_slots.find(pgid);
10032 if (p != sdata->pg_slots.end()) {
10033 dout(20) << __func__ << " " << pgid
10034 << " to_process " << p->second.to_process
10035 << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
10036 for (auto i = p->second.to_process.rbegin();
10037 i != p->second.to_process.rend();
10038 ++i) {
10039 sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
10040 }
10041 for (auto& q : p->second.to_process) {
10042 pushes_to_free += q.get_reserved_pushes();
10043 }
10044 p->second.to_process.clear();
10045 p->second.waiting_for_pg = false;
10046 ++p->second.requeue_seq;
10047 queued = true;
10048 }
10049 }
10050 if (pushes_to_free > 0) {
10051 osd->service.release_reserved_pushes(pushes_to_free);
10052 }
10053 if (queued) {
10054 sdata->sdata_lock.Lock();
10055 sdata->sdata_cond.SignalOne();
10056 sdata->sdata_lock.Unlock();
10057 }
10058 }
10059
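/* Called as the map advances: for idle slots whose PG no longer maps to
 * this OSD, drop queued items that are stale relative to the new map
 * (releasing their reserved pushes) and erase slots that are left empty.
 */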
10060 void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
10061 {
10062 unsigned pushes_to_free = 0;
10063 for (auto sdata : shard_list) {
10064 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10065 sdata->waiting_for_pg_osdmap = osdmap;
10066 auto p = sdata->pg_slots.begin();
10067 while (p != sdata->pg_slots.end()) {
10068 ShardData::pg_slot& slot = p->second;
10069 if (!slot.to_process.empty() && slot.num_running == 0) {
10070 if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
10071 dout(20) << __func__ << " " << p->first << " maps to us, keeping"
10072 << dendl;
10073 ++p;
10074 continue;
10075 }
10076 while (!slot.to_process.empty() &&
10077 slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
10078 auto& qi = slot.to_process.front();
10079 dout(20) << __func__ << " " << p->first
10080 << " item " << qi
10081 << " epoch " << qi.get_map_epoch()
10082 << " <= " << osdmap->get_epoch()
10083 << ", stale, dropping" << dendl;
10084 pushes_to_free += qi.get_reserved_pushes();
10085 slot.to_process.pop_front();
10086 }
10087 }
10088 if (slot.to_process.empty() &&
10089 slot.num_running == 0 &&
10090 !slot.pg) {
10091 dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
10092 p = sdata->pg_slots.erase(p);
10093 } else {
10094 ++p;
10095 }
10096 }
10097 }
10098 if (pushes_to_free > 0) {
10099 osd->service.release_reserved_pushes(pushes_to_free);
10100 }
10101 }
10102
10103 void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
10104 {
10105 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10106 auto sdata = shard_list[shard_index];
10107 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10108 auto p = sdata->pg_slots.find(pgid);
10109 if (p != sdata->pg_slots.end()) {
10110 auto& slot = p->second;
10111 dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
10112 assert(!slot.pg || slot.pg->deleting);
10113 slot.pg = nullptr;
10114 }
10115 }
10116
10117 void OSD::ShardedOpWQ::clear_pg_slots()
10118 {
10119 for (auto sdata : shard_list) {
10120 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10121 sdata->pg_slots.clear();
10122 sdata->waiting_for_pg_osdmap.reset();
10123 // don't bother with reserved pushes; we are shutting down
10124 }
10125 }
10126
10127 #undef dout_prefix
10128 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10129
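/* Worker-thread body: dequeue one (pg, item) pair, park the item in the
 * pg's slot, then take the PG lock and re-check the slot; requeue_seq and
 * waiting_for_pg detect races with wake_pg_waiters/prune_pg_waiters, and
 * items for PG shards that should not exist on this OSD are dropped
 * (sharing our map with the client where appropriate).
 */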
10130 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10131 {
10132 uint32_t shard_index = thread_index % num_shards;
10133 ShardData *sdata = shard_list[shard_index];
10134 assert(NULL != sdata);
10135
10136 // peek at spg_t
10137 sdata->sdata_op_ordering_lock.Lock();
10138 if (sdata->pqueue->empty()) {
10139 dout(20) << __func__ << " empty q, waiting" << dendl;
10140 // optimistically sleep a moment; maybe another work item will come along.
10141 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10142 osd->cct->_conf->threadpool_default_timeout, 0);
10143 sdata->sdata_lock.Lock();
10144 sdata->sdata_op_ordering_lock.Unlock();
10145 sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
10146 utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
10147 sdata->sdata_lock.Unlock();
10148 sdata->sdata_op_ordering_lock.Lock();
10149 if (sdata->pqueue->empty()) {
10150 sdata->sdata_op_ordering_lock.Unlock();
10151 return;
10152 }
10153 }
10154 pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
10155 if (osd->is_stopping()) {
10156 sdata->sdata_op_ordering_lock.Unlock();
10157 return; // OSD shutdown, discard.
10158 }
10159 PGRef pg;
10160 uint64_t requeue_seq;
10161 {
10162 auto& slot = sdata->pg_slots[item.first];
10163 dout(30) << __func__ << " " << item.first
10164 << " to_process " << slot.to_process
10165 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10166 slot.to_process.push_back(item.second);
10167 // note the requeue seq now...
10168 requeue_seq = slot.requeue_seq;
10169 if (slot.waiting_for_pg) {
10170 // save ourselves a bit of effort
10171 dout(20) << __func__ << " " << item.first << " item " << item.second
10172 << " queued, waiting_for_pg" << dendl;
10173 sdata->sdata_op_ordering_lock.Unlock();
10174 return;
10175 }
10176 pg = slot.pg;
10177 dout(20) << __func__ << " " << item.first << " item " << item.second
10178 << " queued" << dendl;
10179 ++slot.num_running;
10180 }
10181 sdata->sdata_op_ordering_lock.Unlock();
10182
10183 osd->service.maybe_inject_dispatch_delay();
10184
10185 // [lookup +] lock pg (if we have it)
10186 if (!pg) {
10187 pg = osd->_lookup_lock_pg(item.first);
10188 } else {
10189 pg->lock();
10190 }
10191
10192 osd->service.maybe_inject_dispatch_delay();
10193
10194 boost::optional<PGQueueable> qi;
10195
10196 // we don't use a Mutex::Locker here because of the
10197 // osd->service.release_reserved_pushes() call below
10198 sdata->sdata_op_ordering_lock.Lock();
10199
10200 auto q = sdata->pg_slots.find(item.first);
10201 assert(q != sdata->pg_slots.end());
10202 auto& slot = q->second;
10203 --slot.num_running;
10204
10205 if (slot.to_process.empty()) {
10206 // raced with wake_pg_waiters or prune_pg_waiters
10207 dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
10208 if (pg) {
10209 pg->unlock();
10210 }
10211 sdata->sdata_op_ordering_lock.Unlock();
10212 return;
10213 }
10214 if (requeue_seq != slot.requeue_seq) {
10215 dout(20) << __func__ << " " << item.first
10216 << " requeue_seq " << slot.requeue_seq << " > our "
10217 << requeue_seq << ", we raced with wake_pg_waiters"
10218 << dendl;
10219 if (pg) {
10220 pg->unlock();
10221 }
10222 sdata->sdata_op_ordering_lock.Unlock();
10223 return;
10224 }
10225 if (pg && !slot.pg && !pg->deleting) {
10226 dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
10227 slot.pg = pg;
10228 }
10229 dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
10230 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10231
10232 // make sure we're not already waiting for this pg
10233 if (slot.waiting_for_pg) {
10234 dout(20) << __func__ << " " << item.first << " item " << item.second
10235 << " slot is waiting_for_pg" << dendl;
10236 if (pg) {
10237 pg->unlock();
10238 }
10239 sdata->sdata_op_ordering_lock.Unlock();
10240 return;
10241 }
10242
10243 // take next item
10244 qi = slot.to_process.front();
10245 slot.to_process.pop_front();
10246 dout(20) << __func__ << " " << item.first << " item " << *qi
10247 << " pg " << pg << dendl;
10248
10249 if (!pg) {
10250 // should this pg shard exist on this osd in this (or a later) epoch?
10251 OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
10252 if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
10253 dout(20) << __func__ << " " << item.first
10254 << " no pg, should exist, will wait" << " on " << *qi << dendl;
10255 slot.to_process.push_front(*qi);
10256 slot.waiting_for_pg = true;
10257 } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
10258 dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
10259 << qi->get_map_epoch() << " > " << osdmap->get_epoch()
10260 << ", will wait on " << *qi << dendl;
10261 slot.to_process.push_front(*qi);
10262 slot.waiting_for_pg = true;
10263 } else {
10264 dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
10265 << " dropping " << *qi << dendl;
10266 // share map with client?
10267 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10268 Session *session = static_cast<Session *>(
10269 (*_op)->get_req()->get_connection()->get_priv());
10270 if (session) {
10271 osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
10272 session->put();
10273 }
10274 }
10275 unsigned pushes_to_free = qi->get_reserved_pushes();
10276 if (pushes_to_free > 0) {
10277 sdata->sdata_op_ordering_lock.Unlock();
10278 osd->service.release_reserved_pushes(pushes_to_free);
10279 return;
10280 }
10281 }
10282 sdata->sdata_op_ordering_lock.Unlock();
10283 return;
10284 }
10285 sdata->sdata_op_ordering_lock.Unlock();
10286
10287
10288 // osd_opwq_process marks the point at which an operation has been dequeued
10289 // and will begin to be handled by a worker thread.
10290 {
10291 #ifdef WITH_LTTNG
10292 osd_reqid_t reqid;
10293 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10294 reqid = (*_op)->get_reqid();
10295 }
10296 #endif
10297 tracepoint(osd, opwq_process_start, reqid.name._type,
10298 reqid.name._num, reqid.tid, reqid.inc);
10299 }
10300
10301 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10302 Formatter *f = Formatter::create("json");
10303 f->open_object_section("q");
10304 dump(f);
10305 f->close_section();
10306 f->flush(*_dout);
10307 delete f;
10308 *_dout << dendl;
10309
10310 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10311 suicide_interval);
10312 qi->run(osd, pg, tp_handle);
10313
10314 {
10315 #ifdef WITH_LTTNG
10316 osd_reqid_t reqid;
10317 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10318 reqid = (*_op)->get_reqid();
10319 }
10320 #endif
10321 tracepoint(osd, opwq_process_finish, reqid.name._type,
10322 reqid.name._num, reqid.tid, reqid.inc);
10323 }
10324
10325 pg->unlock();
10326 }
10327
10328 void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
10329 uint32_t shard_index =
10330 item.first.hash_to_shard(shard_list.size());
10331
10332 ShardData* sdata = shard_list[shard_index];
10333 assert(NULL != sdata);
10334 unsigned priority = item.second.get_priority();
10335 unsigned cost = item.second.get_cost();
10336 sdata->sdata_op_ordering_lock.Lock();
10337
10338 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10339 if (priority >= osd->op_prio_cutoff)
10340 sdata->pqueue->enqueue_strict(
10341 item.second.get_owner(), priority, item);
10342 else
10343 sdata->pqueue->enqueue(
10344 item.second.get_owner(),
10345 priority, cost, item);
10346 sdata->sdata_op_ordering_lock.Unlock();
10347
10348 sdata->sdata_lock.Lock();
10349 sdata->sdata_cond.SignalOne();
10350 sdata->sdata_lock.Unlock();
10351
10352 }
10353
10354 void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
10355 {
10356 uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
10357 ShardData* sdata = shard_list[shard_index];
10358 assert(NULL != sdata);
10359 sdata->sdata_op_ordering_lock.Lock();
10360 auto p = sdata->pg_slots.find(item.first);
10361 if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
10362 // we may be racing with _process, which has dequeued a new item
10363 // from pqueue, put it on to_process, and is now busy taking the
10364 // pg lock. ensure this old requeued item is ordered before any
10365 // such newer item in to_process.
10366 p->second.to_process.push_front(item.second);
10367 item.second = p->second.to_process.back();
10368 p->second.to_process.pop_back();
10369 dout(20) << __func__ << " " << item.first
10370 << " " << p->second.to_process.front()
10371 << " shuffled w/ " << item.second << dendl;
10372 } else {
10373 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10374 }
10375 sdata->_enqueue_front(item, osd->op_prio_cutoff);
10376 sdata->sdata_op_ordering_lock.Unlock();
10377 sdata->sdata_lock.Lock();
10378 sdata->sdata_cond.SignalOne();
10379 sdata->sdata_lock.Unlock();
10380 }
10381
10382 namespace ceph {
10383 namespace osd_cmds {
10384
10385 int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
10386 {
10387 if (!ceph_using_tcmalloc()) {
10388 os << "could not issue heap profiler command -- not using tcmalloc!";
10389 return -EOPNOTSUPP;
10390 }
10391
10392 string cmd;
10393 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
10394 os << "unable to get value for command \"" << cmd << "\"";
10395 return -EINVAL;
10396 }
10397
10398 std::vector<std::string> cmd_vec;
10399 get_str_vec(cmd, cmd_vec);
10400
10401 ceph_heap_profiler_handle_command(cmd_vec, os);
10402
10403 return 0;
10404 }
10405
10406 }} // namespace ceph::osd_cmds
10407
10408
10409 std::ostream& operator<<(std::ostream& out, const OSD::io_queue& q) {
10410 switch(q) {
10411 case OSD::io_queue::prioritized:
10412 out << "prioritized";
10413 break;
10414 case OSD::io_queue::weightedpriority:
10415 out << "weightedpriority";
10416 break;
10417 case OSD::io_queue::mclock_opclass:
10418 out << "mclock_opclass";
10419 break;
10420 case OSD::io_queue::mclock_client:
10421 out << "mclock_client";
10422 break;
10423 }
10424 return out;
10425 }