// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#include "acconfig.h"

#include <fstream>
#include <iostream>
#include <errno.h>
#include <sys/stat.h>
#include <signal.h>
#include <ctype.h>
#include <boost/scoped_ptr.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"

#include "include/types.h"
#include "include/compat.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/ceph_time.h"
#include "common/version.h"
#include "common/io_priority.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"


#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDRepScrub.h"

#include "messages/MMonCommand.h"
#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"
#include "messages/MPGStatsAck.h"

#include "messages/MWatchNotify.h"
#include "messages/MOSDPGPush.h"
#include "messages/MOSDPGPushReply.h"
#include "messages/MOSDPGPull.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"
#include "common/errno.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())


const double OSD::OSD_TICK_INTERVAL = 1.0;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}

// Initial features in new superblock.
// Features here are also automatically upgraded
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}

// Features are added here that this OSD supports.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}

OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  meta_osr(new ObjectStore::Sequencer("meta")),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  peering_wq(osd->peering_wq),
  recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
                  &osd->disk_tp),
  class_handler(osd->class_handler),
  pg_epoch_lock("OSDService::pg_epoch_lock"),
  publish_lock("OSDService::publish_lock"),
  pre_publish_lock("OSDService::pre_publish_lock"),
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
  scrubs_active(0),
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  objecter_finisher(osd->client_messenger->cct),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  recovery_sleep_lock("OSDService::recovery_sleep_lock"),
  recovery_sleep_timer(cct, recovery_sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_sleep_lock("OSDService::snap_sleep_lock"),
  snap_sleep_timer(
    osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
  scrub_sleep_lock("OSDService::scrub_sleep_lock"),
  scrub_sleep_timer(
    osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
  snap_reserver(&reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  in_progress_split_lock("OSDService::in_progress_split_lock"),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();
}

OSDService::~OSDService()
{
  delete objecter;
}



#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  Mutex::Locker l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  Mutex::Locker l(pgid_lock);
  assert(pgid_tracker.count(pgid));
  assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  Mutex::Locker l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif

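// PG split tracking.  A child pgid advances through two states here:
// it is first recorded in pending_splits/rev_pending_splits (the split
// is implied by a pg_num change but the child PG object does not exist
// yet), then moved to in_progress_splits by mark_split_in_progress(),
// and finally erased by complete_split().  splitting() reports true
// while a pgid is in either state so callers can hold off on it.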
void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
{
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    dout(10) << __func__ << ": Starting split on pg " << *i
             << ", parent=" << parent << dendl;
    assert(!pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.insert(make_pair(*i, parent));

    assert(!rev_pending_splits[parent].count(*i));
    rev_pending_splits[parent].insert(*i);
  }
}

void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
{
  Mutex::Locker l(in_progress_split_lock);
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  assert(piter != rev_pending_splits.end());
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    assert(piter->second.count(*i));
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    assert(pending_splits[*i] == parent);

    pending_splits.erase(*i);
    piter->second.erase(*i);
    in_progress_splits.insert(*i);
  }
  if (piter->second.empty())
    rev_pending_splits.erase(piter);
}

void OSDService::cancel_pending_splits_for_parent(spg_t parent)
{
  Mutex::Locker l(in_progress_split_lock);
  _cancel_pending_splits_for_parent(parent);
}

void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
{
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  if (piter == rev_pending_splits.end())
    return;

  for (set<spg_t>::iterator i = piter->second.begin();
       i != piter->second.end();
       ++i) {
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.erase(*i);
    dout(10) << __func__ << ": Cancelling pending split on pg " << *i
             << " for parent: " << parent << dendl;
    _cancel_pending_splits_for_parent(*i);
  }
  rev_pending_splits.erase(piter);
}

void OSDService::_maybe_split_pgid(OSDMapRef old_map,
                                   OSDMapRef new_map,
                                   spg_t pgid)
{
  assert(old_map->have_pg_pool(pgid.pool()));
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
    set<spg_t> children;
    if (pgid.is_split(old_pgnum,
                      new_map->get_pg_num(pgid.pool()), &children)) {
      _start_split(pgid, children);
    }
  } else {
    assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
  }
}

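// Register any splits of pgid that occurred between frommap and tomap,
// walking every intermediate epoch because a PG can split repeatedly
// across the span, and children of children must be picked up too.
// Illustrative example (hypothetical pool 1 with pg_num 8 -> 16 -> 32):
// 1.7 splits into 1.f at pg_num 16, and then 1.7 and 1.f split again
// into 1.17 and 1.1f at pg_num 32; all three children get registered.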
void OSDService::init_splits_between(spg_t pgid,
                                     OSDMapRef frommap,
                                     OSDMapRef tomap)
{
  // First, check whether we can avoid this potentially expensive check
  if (tomap->have_pg_pool(pgid.pool()) &&
      pgid.is_split(
        frommap->get_pg_num(pgid.pool()),
        tomap->get_pg_num(pgid.pool()),
        NULL)) {
    // Ok, a split happened, so we need to walk the osdmaps
    set<spg_t> new_pgs; // pgs to scan on each map
    new_pgs.insert(pgid);
    OSDMapRef curmap(get_map(frommap->get_epoch()));
    for (epoch_t e = frommap->get_epoch() + 1;
         e <= tomap->get_epoch();
         ++e) {
      OSDMapRef nextmap(try_get_map(e));
      if (!nextmap)
        continue;
      set<spg_t> even_newer_pgs; // pgs added in this loop
      for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
        set<spg_t> split_pgs;
        if (i->is_split(curmap->get_pg_num(i->pool()),
                        nextmap->get_pg_num(i->pool()),
                        &split_pgs)) {
          start_split(*i, split_pgs);
          even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
        }
      }
      new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
      curmap = nextmap;
    }
    assert(curmap == tomap); // we must have had both frommap and tomap
  }
}

void OSDService::expand_pg_num(OSDMapRef old_map,
                               OSDMapRef new_map)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::iterator i = in_progress_splits.begin();
       i != in_progress_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->pool())) {
      in_progress_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, *i);
      ++i;
    }
  }
  for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
       i != pending_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->first.pool())) {
      rev_pending_splits.erase(i->second);
      pending_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, i->first);
      ++i;
    }
  }
}

bool OSDService::splitting(spg_t pgid)
{
  Mutex::Locker l(in_progress_split_lock);
  return in_progress_splits.count(pgid) ||
    pending_splits.count(pgid);
}

void OSDService::complete_split(const set<spg_t> &pgs)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::const_iterator i = pgs.begin();
       i != pgs.end();
       ++i) {
    dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
    assert(!pending_splits.count(*i));
    assert(in_progress_splits.count(*i));
    in_progress_splits.erase(*i);
  }
}

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

void OSDService::pg_stat_queue_enqueue(PG *pg)
{
  osd->pg_stat_queue_enqueue(pg);
}

void OSDService::pg_stat_queue_dequeue(PG *pg)
{
  osd->pg_stat_queue_dequeue(pg);
}

void OSDService::start_shutdown()
{
  {
    Mutex::Locker l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    Mutex::Locker l(recovery_sleep_lock);
    recovery_sleep_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  {
    Mutex::Locker l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  {
    Mutex::Locker l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }

  {
    Mutex::Locker l(snap_sleep_lock);
    snap_sleep_timer.shutdown();
  }

  {
    Mutex::Locker l(scrub_sleep_lock);
    scrub_sleep_timer.shutdown();
  }

  osdmap = OSDMapRef();
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  objecter_finisher.start();
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  snap_sleep_timer.init();
  scrub_sleep_timer.init();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  agent_lock.Lock();
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.Signal();
  agent_lock.Unlock();
}

class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

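// Tiering agent main loop.  agent_queue maps an urgency level to the
// set of PGs wanting agent work at that level; each pass services the
// highest level (rbegin), walking its PGs round-robin via
// agent_queue_pos.  The per-PG flush quota is osd_agent_max_ops (or
// osd_agent_max_low_ops while no PG is in high-speed flush mode) minus
// the agent ops already in flight; if the quota is exhausted or the
// agent is disabled, the thread simply waits on agent_cond.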
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->get_pgid()
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}

void OSDService::agent_stop()
{
  {
    Mutex::Locker l(agent_lock);

    // By this time all ops should be cancelled
    assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
      assert(0 == "agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}

// -------------------------------------

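// Recalibrate promote_probability_millis, the probability (in
// thousandths) that an eligible read/write also promotes the object
// into the cache tier, aiming promotion traffic at the configured
// osd_tier_promote_max_objects_sec / osd_tier_promote_max_bytes_sec
// rates.  Illustrative numbers: with a 25 obj/sec target, dur = 1 sec
// and 500 promotion attempts in the interval, po = 25 * 1 * 1000 / 500
// = 50, i.e. a 5% probability would have hit the object target; pb is
// computed the same way from the byte target and the average promoted
// object size, and the lower of the two wins.  The result is then
// corrected for observed skew and averaged with the previous value to
// damp oscillation.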
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
           << target_obj_sec << " obj/sec or "
           << pretty_si_t(target_bytes_sec) << " bytes/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = MAX(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = MIN(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = MAX(new_prob, min_prob);
  new_prob = MIN(new_prob, 1000);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = MAX(prob, min_prob);
  prob = MIN(prob, 1000);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
}

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

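// Map the current usage ratio onto the fullness ladder
//   NONE < NEARFULL < BACKFILLFULL < FULL < FAILSAFE
// using thresholds from the OSDMap, with each threshold clamped to be
// at least the one below it so the states stay ordered.  FAILSAFE is
// the local last-ditch limit at which this OSD drops updates outright.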
void OSDService::check_full_status(float ratio)
{
  Mutex::Locker l(full_status_lock);

  cur_ratio = ratio;

  // The OSDMap ratios take precedence.  So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too.  (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    cur_state = NONE;
    return;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag.  ick.  the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  string inject;
  s_names new_state;
  if (injectfull_state > NONE && injectfull) {
    new_state = injectfull_state;
    inject = "(Injected)";
  } else if (ratio > failsafe_ratio) {
    new_state = FAILSAFE;
  } else if (ratio > full_ratio) {
    new_state = FULL;
  } else if (ratio > backfillfull_ratio) {
    new_state = BACKFILLFULL;
  } else if (ratio > nearfull_ratio) {
    new_state = NEARFULL;
  } else {
    new_state = NONE;
  }
  dout(20) << __func__ << " cur ratio " << ratio
           << ", nearfull_ratio " << nearfull_ratio
           << ", backfillfull_ratio " << backfillfull_ratio
           << ", full_ratio " << full_ratio
           << ", failsafe_ratio " << failsafe_ratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "failsafe disengaged, no longer dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_full(s_names type, ostream &ss) const
{
  Mutex::Locker l(full_status_lock);

  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ss << "Injected " << get_full_state_name(type) << " OSD ("
       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
    return true;
  }

  ss << "current usage is " << cur_ratio;
  return cur_state >= type;
}

bool OSDService::check_failsafe_full(ostream &ss) const
{
  return _check_full(FAILSAFE, ss);
}

bool OSDService::check_full(ostream &ss) const
{
  return _check_full(FULL, ss);
}

bool OSDService::check_backfill_full(ostream &ss) const
{
  return _check_full(BACKFILLFULL, ss);
}

bool OSDService::check_nearfull(ostream &ss) const
{
  return _check_full(NEARFULL, ss);
}

bool OSDService::is_failsafe_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  Mutex::Locker l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

osd_stat_t OSDService::set_osd_stat(const struct store_statfs_t &stbuf,
                                    vector<int>& hb_peers)
{
  uint64_t bytes = stbuf.total;
  uint64_t used = bytes - stbuf.available;
  uint64_t avail = stbuf.available;

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  {
    Mutex::Locker l(stat_lock);
    osd_stat.hb_peers.swap(hb_peers);
    osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
    osd_stat.kb = bytes >> 10;
    osd_stat.kb_used = used >> 10;
    osd_stat.kb_avail = avail >> 10;
    return osd_stat;
  }
}

void OSDService::update_osd_stat(vector<int>& hb_peers)
{
  // load osd stats first
  struct store_statfs_t stbuf;
  int r = osd->store->statfs(&stbuf);
  if (r < 0) {
    derr << "statfs() failed: " << cpp_strerror(r) << dendl;
    return;
  }

  auto new_stat = set_osd_stat(stbuf, hb_peers);
  dout(20) << "update_osd_stat " << new_stat << dendl;
  assert(new_stat.kb);
  float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
  check_full_status(ratio);
}

bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
{
  OSDMapRef osdmap = get_osdmap();
  for (auto shard : missing_on) {
    if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
      return true;
  }
  return false;
}

void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
  ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
  share_map_peer(peer, peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
  if (next_map->get_hb_front_addr(peer) != entity_addr_t())
    ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
  release_map(next_map);
  return ret;
}

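// pg_temp requests to the mon move through two maps: pg_temp_wanted
// (queued locally, not yet sent) and pg_temp_pending (sent, awaiting
// the resulting osdmap).  send_pg_temp() batches everything wanted
// into a single MOSDPGTemp and shifts it to pending; requeue_pg_temp()
// folds pending back into wanted so it is resent (e.g. after the mon
// session resets), with wanted entries taking precedence.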
void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
{
  Mutex::Locker l(pg_temp_lock);
  map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second != want) {
    pg_temp_wanted[pgid] = want;
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  Mutex::Locker l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

void OSDService::_sent_pg_temp()
{
  for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
       p != pg_temp_wanted.end();
       ++p)
    pg_temp_pending[p->first] = p->second;
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  // wanted overrides pending.  note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

void OSDService::send_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
  m->pg_temp = pg_temp_wanted;
  monc->send_mon_message(m);
  _sent_pg_temp();
}

void OSDService::send_pg_created(pg_t pgid)
{
  dout(20) << __func__ << dendl;
  monc->send_mon_message(new MOSDPGCreated(pgid));
}

// --------------------------------------
// dispatch

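// peer_map_epoch caches the newest OSDMap epoch each peer OSD is
// known to possess, so we can decide cheaply whether to piggyback
// incremental maps on cluster traffic.  note_peer_epoch only ever
// advances an entry; forget_peer_epoch drops entries not newer than
// as_of once a peer is seen to go down, since its state is then stale.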
epoch_t OSDService::get_peer_epoch(int peer)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p == peer_map_epoch.end())
    return 0;
  return p->second;
}

epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second < e) {
      dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
      p->second = e;
    } else {
      dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
    }
    return p->second;
  } else {
    dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
    peer_map_epoch[peer] = e;
    return e;
  }
}

void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second <= as_of) {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
               << " had " << p->second << dendl;
      peer_map_epoch.erase(p);
    } else {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
               << " has " << p->second << " - not forgetting" << dendl;
    }
  }
}

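// Decide whether the peer behind `con` should be sent a newer map.
// Clients are judged against the session's last_sent_epoch (passed in
// as sent_epoch_p); peer OSDs reached over the cluster messenger are
// judged against the cached peer_map_epoch.  This is advisory only:
// share_map() below re-evaluates and performs the actual send.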
bool OSDService::should_share_map(entity_name_t name, Connection *con,
                                  epoch_t epoch, const OSDMapRef& osdmap,
                                  const epoch_t *sent_epoch_p)
{
  dout(20) << "should_share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  // does client have old map?
  if (name.is_client()) {
    bool message_sendmap = epoch < osdmap->get_epoch();
    if (message_sendmap && sent_epoch_p) {
      dout(20) << "client session last_sent_epoch: "
               << *sent_epoch_p
               << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
      if (*sent_epoch_p < osdmap->get_epoch()) {
        return true;
      } // else we don't need to send it out again
    }
  }

  if (con->get_messenger() == osd->cluster_messenger &&
      con != osd->cluster_messenger->get_loopback_connection() &&
      osdmap->is_up(name.num()) &&
      (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
       osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
    // remember
    epoch_t has = MAX(get_peer_epoch(name.num()), epoch);

    // share?
    if (has < osdmap->get_epoch()) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      return true;
    }
  }

  return false;
}

void OSDService::share_map(
  entity_name_t name,
  Connection *con,
  epoch_t epoch,
  OSDMapRef& osdmap,
  epoch_t *sent_epoch_p)
{
  dout(20) << "share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  if (!osd->is_active()) {
    /* It is safe not to proceed, as the OSD is not in a healthy state. */
    return;
  }

  bool want_shared = should_share_map(name, con, epoch,
                                      osdmap, sent_epoch_p);

  if (want_shared) {
    if (name.is_client()) {
      dout(10) << name << " has old map " << epoch
               << " < " << osdmap->get_epoch() << dendl;
      // we know the Session is valid or we wouldn't be sending
      if (sent_epoch_p) {
        *sent_epoch_p = osdmap->get_epoch();
      }
      send_incremental_map(epoch, con, osdmap);
    } else if (con->get_messenger() == osd->cluster_messenger &&
               osdmap->is_up(name.num()) &&
               (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
                osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      note_peer_epoch(name.num(), osdmap->get_epoch());
      send_incremental_map(epoch, con, osdmap);
    }
  }
}

void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
{
  if (!map)
    map = get_osdmap();

  // send map?
  epoch_t pe = get_peer_epoch(peer);
  if (pe) {
    if (pe < map->get_epoch()) {
      send_incremental_map(pe, con, map);
      note_peer_epoch(peer, map->get_epoch());
    } else
      dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
  } else {
    dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
    // no idea about peer's epoch.
    // ??? send recent ???
    // do nothing.
  }
}

bool OSDService::can_inc_scrubs_pending()
{
  bool can_inc = false;
  Mutex::Locker l(sched_scrub_lock);

  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
    can_inc = true;
  } else {
    dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active
             << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
  }

  return can_inc;
}

bool OSDService::inc_scrubs_pending()
{
  bool result = false;

  sched_scrub_lock.Lock();
  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
    dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
    result = true;
    ++scrubs_pending;
  } else {
    dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  sched_scrub_lock.Unlock();

  return result;
}

void OSDService::dec_scrubs_pending()
{
  sched_scrub_lock.Lock();
  dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
  --scrubs_pending;
  assert(scrubs_pending >= 0);
  sched_scrub_lock.Unlock();
}

void OSDService::inc_scrubs_active(bool reserved)
{
  sched_scrub_lock.Lock();
  ++(scrubs_active);
  if (reserved) {
    --(scrubs_pending);
    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
             << " (max " << cct->_conf->osd_max_scrubs
             << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
    assert(scrubs_pending >= 0);
  } else {
    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
             << " (max " << cct->_conf->osd_max_scrubs
             << ", pending " << scrubs_pending << ")" << dendl;
  }
  sched_scrub_lock.Unlock();
}

void OSDService::dec_scrubs_active()
{
  sched_scrub_lock.Lock();
  dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
  --scrubs_active;
  assert(scrubs_active >= 0);
  sched_scrub_lock.Unlock();
}

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  Mutex::Locker l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  Mutex::Locker l(epoch_lock);
  if (_boot_epoch) {
    assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

bool OSDService::prepare_to_stop()
{
  Mutex::Locker l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
                                              osdmap->get_inst(whoami),
                                              osdmap->get_epoch(),
                                              true  // request ack
                                              ));
    utime_t now = ceph_clock_now();
    utime_t timeout;
    timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
    while ((ceph_clock_now() < timeout) &&
           (get_state() != STOPPING)) {
      is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
    }
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  Mutex::Locker l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.Signal();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

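// Build an MOSDMap carrying the incrementals in (since, to].  We walk
// from `to` downward; if an incremental blob is unavailable we fall
// back to a full map at that epoch and stop (older deltas are then
// unnecessary), and if neither can be read we give up and return NULL
// so the caller can retry with refreshed bounds.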
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  for (epoch_t e = to; e > since; e--) {
    bufferlist bl;
    if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else if (get_map_bl(e, bl)) {
      m->maps[e].claim(bl);
      break;
    } else {
      derr << "since " << since << " to " << to
           << " oldest " << m->oldest_map << " newest " << m->newest_map
           << dendl;
      m->put();
      m = NULL;
      break;
    }
  }
  return m;
}

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << "  " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
      to = since + cct->_conf->osd_map_message_max;
    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(coll_t::meta(),
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  Mutex::Locker l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(coll_t::meta(),
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
{
  Mutex::Locker l(map_cache_lock);
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  map_bl_inc_cache.pin(e, bl);
}

void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
{
  Mutex::Locker l(map_cache_lock);
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  map_bl_cache.pin(e, bl);
}

void OSDService::clear_map_bl_cache_pins(epoch_t e)
{
  Mutex::Locker l(map_cache_lock);
  map_bl_inc_cache.clear_pinned(e);
  map_bl_cache.clear_pinned(e);
}

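// Insert a freshly decoded OSDMap into the shared map cache.  With
// osd_map_dedup enabled we first share substructures with a cached map
// from a nearby epoch, which can save significant memory when many
// epochs are held in cache.  If another thread raced us in, the
// existing entry wins and our copy is deleted.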
OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}

OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  Mutex::Locker l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}

// ops


void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0);
}

void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
                                version_t uv)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
                                       true);
  reply->set_reply_versions(v, uv);
  m->get_connection()->send_message(reply);
}

void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);

  assert(m->get_map_epoch() >= pg->info.history.same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap.  With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->info.pgid.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->acting
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}

void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
{
  osd->op_shardedwq.queue(make_pair(pgid, qi));
}

void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
{
  osd->op_shardedwq.queue_front(make_pair(pgid, qi));
}

void OSDService::queue_for_peering(PG *pg)
{
  peering_wq.queue(pg);
}

void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  osd->op_shardedwq.queue(
    make_pair(
      pg->info.pgid,
      PGQueueable(
        PGSnapTrim(pg->get_osdmap()->get_epoch()),
        cct->_conf->osd_snap_trim_cost,
        cct->_conf->osd_snap_trim_priority,
        ceph_clock_now(),
        entity_inst_t(),
        pg->get_osdmap()->get_epoch())));
}


// ====================================================================
// OSD

#undef dout_prefix
#define dout_prefix *_dout

// Commands shared between OSD's console and admin console:
namespace ceph {
namespace osd_cmds {

int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

}} // namespace ceph::osd_cmds

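// One-time initialization of an OSD's backing store: mkfs + mount the
// ObjectStore, then either validate an existing OSDSuperblock (the osd
// id and cluster fsid must match what we were handed) or create and
// persist a fresh one, and finally write the meta records (magic,
// whoami, ceph_fsid, ready) that are read back at startup.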
int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
              uuid_d fsid, int whoami)
{
  int ret;

  ceph::shared_ptr<ObjectStore::Sequencer> osr(
    new ObjectStore::Sequencer("mkfs"));
  OSDSuperblock sb;
  bufferlist sbbl;
  C_SaferCond waiter;

  // if we are fed a uuid for this osd, use it.
  store->set_fsid(cct->_conf->osd_uuid);

  ret = store->mkfs();
  if (ret) {
    derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  store->set_cache_shards(1);  // doesn't matter for mkfs!

  ret = store->mount();
  if (ret) {
    derr << "OSD::mkfs: couldn't mount ObjectStore: error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
  if (ret >= 0) {
    /* if we already have superblock, check content of superblock */
    dout(0) << " have superblock" << dendl;
    bufferlist::iterator p;
    p = sbbl.begin();
    ::decode(sb, p);
    if (whoami != sb.whoami) {
      derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
           << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
    if (fsid != sb.cluster_fsid) {
      derr << "provided cluster fsid " << fsid
           << " != superblock's " << sb.cluster_fsid << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
  } else {
    // create superblock
    sb.cluster_fsid = fsid;
    sb.osd_fsid = store->get_fsid();
    sb.whoami = whoami;
    sb.compat_features = get_osd_initial_compat_set();

    bufferlist bl;
    ::encode(sb, bl);

    ObjectStore::Transaction t;
    t.create_collection(coll_t::meta(), 0);
    t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
    ret = store->apply_transaction(osr.get(), std::move(t));
    if (ret) {
      derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
           << "apply_transaction returned " << cpp_strerror(ret) << dendl;
      goto umount_store;
    }
  }

  if (!osr->flush_commit(&waiter)) {
    waiter.wait();
  }

  ret = write_meta(store, sb.cluster_fsid, sb.osd_fsid, whoami);
  if (ret) {
    derr << "OSD::mkfs: failed to write fsid file: error "
         << cpp_strerror(ret) << dendl;
    goto umount_store;
  }

umount_store:
  store->umount();
free_store:
  delete store;
  return ret;
}

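// write_meta/peek_meta persist and recover the small key/value records
// (via ObjectStore::write_meta/read_meta) that identify an OSD without
// mounting the store: the on-disk magic string, the osd id, the
// cluster fsid, and a "ready" marker.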
1794 int OSD::write_meta(ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
1795 {
1796 char val[80];
1797 int r;
1798
1799 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
1800 r = store->write_meta("magic", val);
1801 if (r < 0)
1802 return r;
1803
1804 snprintf(val, sizeof(val), "%d", whoami);
1805 r = store->write_meta("whoami", val);
1806 if (r < 0)
1807 return r;
1808
1809 cluster_fsid.print(val);
1810 r = store->write_meta("ceph_fsid", val);
1811 if (r < 0)
1812 return r;
1813
1814 r = store->write_meta("ready", "ready");
1815 if (r < 0)
1816 return r;
1817
1818 return 0;
1819 }
1820
1821 int OSD::peek_meta(ObjectStore *store, std::string& magic,
1822 uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
1823 {
1824 string val;
1825
1826 int r = store->read_meta("magic", &val);
1827 if (r < 0)
1828 return r;
1829 magic = val;
1830
1831 r = store->read_meta("whoami", &val);
1832 if (r < 0)
1833 return r;
1834 whoami = atoi(val.c_str());
1835
1836 r = store->read_meta("ceph_fsid", &val);
1837 if (r < 0)
1838 return r;
1839 r = cluster_fsid.parse(val.c_str());
1840 if (!r)
1841 return -EINVAL;
1842
1843 r = store->read_meta("fsid", &val);
1844 if (r < 0) {
1845 osd_fsid = uuid_d();
1846 } else {
1847 r = osd_fsid.parse(val.c_str());
1848 if (!r)
1849 return -EINVAL;
1850 }
1851
1852 return 0;
1853 }
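
// Sketch of the meta key/value pairs involved above (values illustrative):
//
//   "magic"     -> CEPH_OSD_ONDISK_MAGIC (e.g. "ceph osd volume v026")
//   "whoami"    -> "3"                    // this OSD's id
//   "ceph_fsid" -> "a7f64266-0894-...."   // cluster fsid
//   "ready"     -> "ready"                // written last by write_meta()
//   "fsid"      -> per-OSD fsid, typically written by the ObjectStore's
//                  own mkfs; peek_meta() treats it as optional and falls
//                  back to the zero uuid when it is absent.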
1854
1855
1856 #undef dout_prefix
1857 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1858
1859 // cons/des
1860
1861 OSD::OSD(CephContext *cct_, ObjectStore *store_,
1862 int id,
1863 Messenger *internal_messenger,
1864 Messenger *external_messenger,
1865 Messenger *hb_client_front,
1866 Messenger *hb_client_back,
1867 Messenger *hb_front_serverm,
1868 Messenger *hb_back_serverm,
1869 Messenger *osdc_messenger,
1870 MonClient *mc,
1871 const std::string &dev, const std::string &jdev) :
1872 Dispatcher(cct_),
1873 osd_lock("OSD::osd_lock"),
1874 tick_timer(cct, osd_lock),
1875 tick_timer_lock("OSD::tick_timer_lock"),
1876 tick_timer_without_osd_lock(cct, tick_timer_lock),
1877 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
1878 cct->_conf->auth_supported.empty() ?
1879 cct->_conf->auth_cluster_required :
1880 cct->_conf->auth_supported)),
1881 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
1882 cct->_conf->auth_supported.empty() ?
1883 cct->_conf->auth_service_required :
1884 cct->_conf->auth_supported)),
1885 cluster_messenger(internal_messenger),
1886 client_messenger(external_messenger),
1887 objecter_messenger(osdc_messenger),
1888 monc(mc),
1889 mgrc(cct_, client_messenger),
1890 logger(NULL),
1891 recoverystate_perf(NULL),
1892 store(store_),
1893 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
1894 clog(log_client.create_channel()),
1895 whoami(id),
1896 dev_path(dev), journal_path(jdev),
1897 store_is_rotational(store->is_rotational()),
1898 trace_endpoint("0.0.0.0", 0, "osd"),
1899 asok_hook(NULL),
1900 osd_compat(get_osd_compat_set()),
1901 peering_tp(cct, "OSD::peering_tp", "tp_peering",
1902 cct->_conf->osd_peering_wq_threads,
1903 "osd_peering_tp_threads"),
1904 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
1905 get_num_op_threads()),
1906 disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
1907 command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
1908 session_waiting_lock("OSD::session_waiting_lock"),
1909 heartbeat_lock("OSD::heartbeat_lock"),
1910 heartbeat_stop(false),
1911 heartbeat_need_update(true),
1912 hb_front_client_messenger(hb_client_front),
1913 hb_back_client_messenger(hb_client_back),
1914 hb_front_server_messenger(hb_front_serverm),
1915 hb_back_server_messenger(hb_back_serverm),
1916 daily_loadavg(0.0),
1917 heartbeat_thread(this),
1918 heartbeat_dispatcher(this),
1919 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
1920 cct->_conf->osd_num_op_tracker_shard),
1921 test_ops_hook(NULL),
1922 op_queue(get_io_queue()),
1923 op_prio_cutoff(get_io_prio_cut()),
1924 op_shardedwq(
1925 get_num_op_shards(),
1926 this,
1927 cct->_conf->osd_op_thread_timeout,
1928 cct->_conf->osd_op_thread_suicide_timeout,
1929 &osd_op_tp),
1930 peering_wq(
1931 this,
1932 cct->_conf->osd_op_thread_timeout,
1933 cct->_conf->osd_op_thread_suicide_timeout,
1934 &peering_tp),
1935 map_lock("OSD::map_lock"),
1936 pg_map_lock("OSD::pg_map_lock"),
1937 last_pg_create_epoch(0),
1938 mon_report_lock("OSD::mon_report_lock"),
1939 stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
1940 up_thru_wanted(0),
1941 requested_full_first(0),
1942 requested_full_last(0),
1943 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
1944 osd_stat_updated(false),
1945 pg_stat_tid(0), pg_stat_tid_flushed(0),
1946 command_wq(
1947 this,
1948 cct->_conf->osd_command_thread_timeout,
1949 cct->_conf->osd_command_thread_suicide_timeout,
1950 &command_tp),
1951 remove_wq(
1952 cct,
1953 store,
1954 cct->_conf->osd_remove_thread_timeout,
1955 cct->_conf->osd_remove_thread_suicide_timeout,
1956 &disk_tp),
1957 service(this)
1958 {
1959 monc->set_messenger(client_messenger);
1960 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
1961 cct->_conf->osd_op_log_threshold);
1962 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
1963 cct->_conf->osd_op_history_duration);
1964 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
1965 cct->_conf->osd_op_history_slow_op_threshold);
1966 #ifdef WITH_BLKIN
1967 std::stringstream ss;
1968 ss << "osd." << whoami;
1969 trace_endpoint.copy_name(ss.str());
1970 #endif
1971 }
1972
1973 OSD::~OSD()
1974 {
1975 delete authorize_handler_cluster_registry;
1976 delete authorize_handler_service_registry;
1977 delete class_handler;
1978 cct->get_perfcounters_collection()->remove(recoverystate_perf);
1979 cct->get_perfcounters_collection()->remove(logger);
1980 delete recoverystate_perf;
1981 delete logger;
1982 delete store;
1983 }
1984
1985 void cls_initialize(ClassHandler *ch);
1986
1987 void OSD::handle_signal(int signum)
1988 {
1989 assert(signum == SIGINT || signum == SIGTERM);
1990 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
1991 shutdown();
1992 }
1993
1994 int OSD::pre_init()
1995 {
1996 Mutex::Locker lock(osd_lock);
1997 if (is_stopping())
1998 return 0;
1999
2000 if (store->test_mount_in_use()) {
2001 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2002 << "currently in use. (Is ceph-osd already running?)" << dendl;
2003 return -EBUSY;
2004 }
2005
2006 cct->_conf->add_observer(this);
2007 return 0;
2008 }
2009
2010 // asok
2011
2012 class OSDSocketHook : public AdminSocketHook {
2013 OSD *osd;
2014 public:
2015 explicit OSDSocketHook(OSD *o) : osd(o) {}
2016 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2017 bufferlist& out) override {
2018 stringstream ss;
2019 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
2020 out.append(ss);
2021 return r;
2022 }
2023 };
2024
2025 bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
2026 ostream& ss)
2027 {
2028 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2029 if (admin_command == "status") {
2030 f->open_object_section("status");
2031 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2032 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2033 f->dump_unsigned("whoami", superblock.whoami);
2034 f->dump_string("state", get_state_name(get_state()));
2035 f->dump_unsigned("oldest_map", superblock.oldest_map);
2036 f->dump_unsigned("newest_map", superblock.newest_map);
2037 {
2038 RWLock::RLocker l(pg_map_lock);
2039 f->dump_unsigned("num_pgs", pg_map.size());
2040 }
2041 f->close_section();
2042 } else if (admin_command == "flush_journal") {
2043 store->flush_journal();
2044 } else if (admin_command == "dump_ops_in_flight" ||
2045 admin_command == "ops") {
2046 if (!op_tracker.dump_ops_in_flight(f)) {
2047 ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those that get stuck. \
2048 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
2049 }
2050 } else if (admin_command == "dump_blocked_ops") {
2051 if (!op_tracker.dump_ops_in_flight(f, true)) {
2052 ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those that get stuck. \
2053 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
2054 }
2055 } else if (admin_command == "dump_historic_ops") {
2056 if (!op_tracker.dump_historic_ops(f, false)) {
2057 ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those that get stuck. \
2058 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
2059 }
2060 } else if (admin_command == "dump_historic_ops_by_duration") {
2061 if (!op_tracker.dump_historic_ops(f, true)) {
2062 ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those that get stuck. \
2063 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
2064 }
2065 } else if (admin_command == "dump_historic_slow_ops") {
2066 if (!op_tracker.dump_historic_slow_ops(f)) {
2067 ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those that get stuck. \
2068 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
2069 }
2070 } else if (admin_command == "dump_op_pq_state") {
2071 f->open_object_section("pq");
2072 op_shardedwq.dump(f);
2073 f->close_section();
2074 } else if (admin_command == "dump_blacklist") {
2075 list<pair<entity_addr_t,utime_t> > bl;
2076 OSDMapRef curmap = service.get_osdmap();
2077
2078 f->open_array_section("blacklist");
2079 curmap->get_blacklist(&bl);
2080 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2081 it != bl.end(); ++it) {
2082 f->open_object_section("entry");
2083 f->open_object_section("entity_addr_t");
2084 it->first.dump(f);
2085 f->close_section(); //entity_addr_t
2086 it->second.localtime(f->dump_stream("expire_time"));
2087 f->close_section(); //entry
2088 }
2089 f->close_section(); //blacklist
2090 } else if (admin_command == "dump_watchers") {
2091 list<obj_watch_item_t> watchers;
2092 // scan pg's
2093 {
2094 Mutex::Locker l(osd_lock);
2095 RWLock::RLocker l2(pg_map_lock);
2096 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2097 it != pg_map.end();
2098 ++it) {
2099
2100 list<obj_watch_item_t> pg_watchers;
2101 PG *pg = it->second;
2102 pg->lock();
2103 pg->get_watchers(pg_watchers);
2104 pg->unlock();
2105 watchers.splice(watchers.end(), pg_watchers);
2106 }
2107 }
2108
2109 f->open_array_section("watchers");
2110 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2111 it != watchers.end(); ++it) {
2112
2113 f->open_object_section("watch");
2114
2115 f->dump_string("namespace", it->obj.nspace);
2116 f->dump_string("object", it->obj.oid.name);
2117
2118 f->open_object_section("entity_name");
2119 it->wi.name.dump(f);
2120 f->close_section(); //entity_name_t
2121
2122 f->dump_unsigned("cookie", it->wi.cookie);
2123 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2124
2125 f->open_object_section("entity_addr_t");
2126 it->wi.addr.dump(f);
2127 f->close_section(); //entity_addr_t
2128
2129 f->close_section(); //watch
2130 }
2131
2132 f->close_section(); //watchers
2133 } else if (admin_command == "dump_reservations") {
2134 f->open_object_section("reservations");
2135 f->open_object_section("local_reservations");
2136 service.local_reserver.dump(f);
2137 f->close_section();
2138 f->open_object_section("remote_reservations");
2139 service.remote_reserver.dump(f);
2140 f->close_section();
2141 f->close_section();
2142 } else if (admin_command == "get_latest_osdmap") {
2143 get_latest_osdmap();
2144 } else if (admin_command == "heap") {
2145 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2146
2147 // Note: Failed heap profile commands won't necessarily trigger an error:
2148 f->open_object_section("result");
2149 f->dump_string("error", cpp_strerror(result));
2150 f->dump_bool("success", result >= 0);
2151 f->close_section();
2152 } else if (admin_command == "set_heap_property") {
2153 string property;
2154 int64_t value = 0;
2155 string error;
2156 bool success = false;
2157 if (!cmd_getval(cct, cmdmap, "property", property)) {
2158 error = "unable to get property";
2159 success = false;
2160 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2161 error = "unable to get value";
2162 success = false;
2163 } else if (value < 0) {
2164 error = "negative value not allowed";
2165 success = false;
2166 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2167 error = "invalid property";
2168 success = false;
2169 } else {
2170 success = true;
2171 }
2172 f->open_object_section("result");
2173 f->dump_string("error", error);
2174 f->dump_bool("success", success);
2175 f->close_section();
2176 } else if (admin_command == "get_heap_property") {
2177 string property;
2178 size_t value = 0;
2179 string error;
2180 bool success = false;
2181 if (!cmd_getval(cct, cmdmap, "property", property)) {
2182 error = "unable to get property";
2183 success = false;
2184 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2185 error = "invalid property";
2186 success = false;
2187 } else {
2188 success = true;
2189 }
2190 f->open_object_section("result");
2191 f->dump_string("error", error);
2192 f->dump_bool("success", success);
2193 f->dump_int("value", value);
2194 f->close_section();
2195 } else if (admin_command == "dump_objectstore_kv_stats") {
2196 store->get_db_statistics(f);
2197 } else if (admin_command == "dump_scrubs") {
2198 service.dumps_scrub(f);
2199 } else if (admin_command == "calc_objectstore_db_histogram") {
2200 store->generate_db_histogram(f);
2201 } else if (admin_command == "flush_store_cache") {
2202 store->flush_cache();
2203 } else if (admin_command == "dump_pgstate_history") {
2204 f->open_object_section("pgstate_history");
2205 RWLock::RLocker l2(pg_map_lock);
2206 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2207 it != pg_map.end();
2208 ++it) {
2209
2210 PG *pg = it->second;
2211 f->dump_stream("pg") << pg->get_pgid();
2212 pg->lock();
2213 pg->pgstate_history.dump(f);
2214 pg->unlock();
2215 }
2216 f->close_section();
2217 } else if (admin_command == "compact") {
2218 dout(1) << "triggering manual compaction" << dendl;
2219 auto start = ceph::coarse_mono_clock::now();
2220 store->compact();
2221 auto end = ceph::coarse_mono_clock::now();
2222 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
2223 dout(1) << "finished manual compaction in "
2224 << time_span.count()
2225 << " seconds" << dendl;
2226 f->open_object_section("compact_result");
2227 f->dump_float("elapsed_time", time_span.count());
2228 f->close_section();
2229 } else {
2230 assert(0 == "broken asok registration");
2231 }
2232 f->flush(ss);
2233 delete f;
2234 return true;
2235 }
2236
2237 class TestOpsSocketHook : public AdminSocketHook {
2238 OSDService *service;
2239 ObjectStore *store;
2240 public:
2241 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2242 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2243 bufferlist& out) override {
2244 stringstream ss;
2245 test_ops(service, store, command, cmdmap, ss);
2246 out.append(ss);
2247 return true;
2248 }
2249 void test_ops(OSDService *service, ObjectStore *store,
2250 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2251
2252 };
2253
2254 class OSD::C_Tick : public Context {
2255 OSD *osd;
2256 public:
2257 explicit C_Tick(OSD *o) : osd(o) {}
2258 void finish(int r) override {
2259 osd->tick();
2260 }
2261 };
2262
2263 class OSD::C_Tick_WithoutOSDLock : public Context {
2264 OSD *osd;
2265 public:
2266 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2267 void finish(int r) override {
2268 osd->tick_without_osd_lock();
2269 }
2270 };
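
// Two tick contexts exist because the OSD runs two timers: tick_timer
// fires with osd_lock held, while tick_timer_without_osd_lock only takes
// tick_timer_lock, so periodic work that does not need osd_lock cannot
// stall behind it. Both re-arm themselves every osd_heartbeat_interval
// seconds once init() schedules the first event.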
2271
2272 int OSD::enable_disable_fuse(bool stop)
2273 {
2274 #ifdef HAVE_LIBFUSE
2275 int r;
2276 string mntpath = cct->_conf->osd_data + "/fuse";
2277 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2278 dout(1) << __func__ << " disabling" << dendl;
2279 fuse_store->stop();
2280 delete fuse_store;
2281 fuse_store = NULL;
2282 r = ::rmdir(mntpath.c_str());
2283 if (r < 0)
2284 r = -errno;
2285 if (r < 0) {
2286 derr << __func__ << " failed to rmdir " << mntpath << dendl;
2287 return r;
2288 }
2289 return 0;
2290 }
2291 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2292 dout(1) << __func__ << " enabling" << dendl;
2293 r = ::mkdir(mntpath.c_str(), 0700);
2294 if (r < 0)
2295 r = -errno;
2296 if (r < 0 && r != -EEXIST) {
2297 derr << __func__ << " unable to create " << mntpath << ": "
2298 << cpp_strerror(r) << dendl;
2299 return r;
2300 }
2301 fuse_store = new FuseStore(store, mntpath);
2302 r = fuse_store->start();
2303 if (r < 0) {
2304 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2305 delete fuse_store;
2306 fuse_store = NULL;
2307 return r;
2308 }
2309 }
2310 #endif // HAVE_LIBFUSE
2311 return 0;
2312 }
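
// The FuseStore mount under $osd_data/fuse is a debugging aid that
// exposes the ObjectStore contents as a filesystem while the OSD runs;
// init() calls enable_disable_fuse(false) to honor osd_objectstore_fuse,
// and the shutdown/error paths call enable_disable_fuse(true) to force
// the mount down before unmounting the store.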
2313
2314 int OSD::get_num_op_shards()
2315 {
2316 if (cct->_conf->osd_op_num_shards)
2317 return cct->_conf->osd_op_num_shards;
2318 if (store_is_rotational)
2319 return cct->_conf->osd_op_num_shards_hdd;
2320 else
2321 return cct->_conf->osd_op_num_shards_ssd;
2322 }
2323
2324 int OSD::get_num_op_threads()
2325 {
2326 if (cct->_conf->osd_op_num_threads_per_shard)
2327 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2328 if (store_is_rotational)
2329 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2330 else
2331 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2332 }
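
// Worked example, assuming the hdd/ssd defaults of this release
// (osd_op_num_shards_hdd=5, osd_op_num_shards_ssd=8,
// osd_op_num_threads_per_shard_hdd=1, osd_op_num_threads_per_shard_ssd=2):
//
//   rotational store:  5 shards * 1 thread/shard  =  5 op threads
//   solid-state store: 8 shards * 2 threads/shard = 16 op threads
//
// Setting osd_op_num_shards or osd_op_num_threads_per_shard to a nonzero
// value overrides the media-specific defaults in both functions above.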
2333
2334 int OSD::init()
2335 {
2336 CompatSet initial, diff;
2337 Mutex::Locker lock(osd_lock);
2338 if (is_stopping())
2339 return 0;
2340
2341 tick_timer.init();
2342 tick_timer_without_osd_lock.init();
2343 service.recovery_request_timer.init();
2344 service.recovery_sleep_timer.init();
2345
2346 // mount.
2347 dout(2) << "init " << dev_path
2348 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2349 << dendl;
2350 assert(store); // call pre_init() first!
2351
2352 store->set_cache_shards(get_num_op_shards());
2353
2354 int r = store->mount();
2355 if (r < 0) {
2356 derr << "OSD:init: unable to mount object store" << dendl;
2357 return r;
2358 }
2359
2360 enable_disable_fuse(false);
2361
2362 dout(2) << "boot" << dendl;
2363
2364 // initialize the daily loadavg with current 15min loadavg
2365 double loadavgs[3];
2366 if (getloadavg(loadavgs, 3) == 3) {
2367 daily_loadavg = loadavgs[2];
2368 } else {
2369 derr << "OSD::init() : couldn't read loadavgs" << dendl;
2370 daily_loadavg = 1.0;
2371 }
2372
2373 int rotating_auth_attempts = 0;
2374
2375 // sanity check long object name handling
2376 {
2377 hobject_t l;
2378 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2379 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2380 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2381 r = store->validate_hobject_key(l);
2382 if (r < 0) {
2383 derr << "backend (" << store->get_type() << ") is unable to support max "
2384 << "object name[space] len" << dendl;
2385 derr << " osd max object name len = "
2386 << cct->_conf->osd_max_object_name_len << dendl;
2387 derr << " osd max object namespace len = "
2388 << cct->_conf->osd_max_object_namespace_len << dendl;
2389 derr << cpp_strerror(r) << dendl;
2390 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2391 goto out;
2392 }
2393 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2394 << dendl;
2395 } else {
2396 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
2397 }
2398 }
2399
2400 // read superblock
2401 r = read_superblock();
2402 if (r < 0) {
2403 derr << "OSD::init() : unable to read osd superblock" << dendl;
2404 r = -EINVAL;
2405 goto out;
2406 }
2407
2408 if (osd_compat.compare(superblock.compat_features) < 0) {
2409 derr << "The disk uses features unsupported by the executable." << dendl;
2410 derr << " ondisk features " << superblock.compat_features << dendl;
2411 derr << " daemon features " << osd_compat << dendl;
2412
2413 if (osd_compat.writeable(superblock.compat_features)) {
2414 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2415 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2416 r = -EOPNOTSUPP;
2417 goto out;
2418 }
2419 else {
2420 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2421 derr << "Cannot write to disk! Missing features: " << diff << dendl;
2422 r = -EOPNOTSUPP;
2423 goto out;
2424 }
2425 }
2426
2427 assert_warn(whoami == superblock.whoami);
2428 if (whoami != superblock.whoami) {
2429 derr << "OSD::init: superblock says osd."
2430 << superblock.whoami << " but I am osd." << whoami << dendl;
2431 r = -EINVAL;
2432 goto out;
2433 }
2434
2435 initial = get_osd_initial_compat_set();
2436 diff = superblock.compat_features.unsupported(initial);
2437 if (superblock.compat_features.merge(initial)) {
2438 // We need to persist the new compat_set before we
2439 // do anything else
2440 dout(5) << "Upgrading superblock adding: " << diff << dendl;
2441 ObjectStore::Transaction t;
2442 write_superblock(t);
2443 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2444 if (r < 0)
2445 goto out;
2446 }
2447
2448 // make sure snap mapper object exists
2449 if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2450 dout(10) << "init creating/touching snapmapper object" << dendl;
2451 ObjectStore::Transaction t;
2452 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2453 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2454 if (r < 0)
2455 goto out;
2456 }
2457
2458 class_handler = new ClassHandler(cct);
2459 cls_initialize(class_handler);
2460
2461 if (cct->_conf->osd_open_classes_on_start) {
2462 int r = class_handler->open_all_classes();
2463 if (r)
2464 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
2465 }
2466
2467 // load up "current" osdmap
2468 assert_warn(!osdmap);
2469 if (osdmap) {
2470 derr << "OSD::init: unable to read current osdmap" << dendl;
2471 r = -EINVAL;
2472 goto out;
2473 }
2474 osdmap = get_map(superblock.current_epoch);
2475 check_osdmap_features(store);
2476
2477 create_recoverystate_perf();
2478
2479 {
2480 epoch_t bind_epoch = osdmap->get_epoch();
2481 service.set_epochs(NULL, NULL, &bind_epoch);
2482 }
2483
2484 clear_temp_objects();
2485
2486 // load up pgs (as they previously existed)
2487 load_pgs();
2488
2489 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
2490 dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
2491 op_prio_cutoff << "." << dendl;
2492
2493 create_logger();
2494
2495 // i'm ready!
2496 client_messenger->add_dispatcher_head(this);
2497 cluster_messenger->add_dispatcher_head(this);
2498
2499 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2500 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2501 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2502 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2503
2504 objecter_messenger->add_dispatcher_head(service.objecter);
2505
2506 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
2507 | CEPH_ENTITY_TYPE_MGR);
2508 r = monc->init();
2509 if (r < 0)
2510 goto out;
2511
2512 /**
2513 * FIXME: this is a placeholder implementation that unconditionally
2514 * sends every is_primary PG's stats every time we're called, unlike
2515 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2516 * This has equivalent cost to the existing worst case where all
2517 * PGs are busy and their stats are always enqueued for sending.
2518 */
2519 mgrc.set_pgstats_cb([this](){
2520 RWLock::RLocker l(map_lock);
2521
2522 utime_t had_for = ceph_clock_now() - had_map_since;
2523 osd_stat_t cur_stat = service.get_osd_stat();
2524 cur_stat.os_perf_stat = store->get_cur_stats();
2525
2526 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
2527 m->osd_stat = cur_stat;
2528
2529 Mutex::Locker lec{min_last_epoch_clean_lock};
2530 min_last_epoch_clean = osdmap->get_epoch();
2531 min_last_epoch_clean_pgs.clear();
2532 RWLock::RLocker lpg(pg_map_lock);
2533 for (const auto &i : pg_map) {
2534 PG *pg = i.second;
2535 if (!pg->is_primary()) {
2536 continue;
2537 }
2538
2539 pg->pg_stats_publish_lock.Lock();
2540 if (pg->pg_stats_publish_valid) {
2541 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
2542 const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
2543 min_last_epoch_clean = min(min_last_epoch_clean, lec);
2544 min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
2545 }
2546 pg->pg_stats_publish_lock.Unlock();
2547 }
2548
2549 return m;
2550 });
2551
2552 mgrc.init();
2553 client_messenger->add_dispatcher_head(&mgrc);
2554
2555 // tell monc about log_client so it will know about mon session resets
2556 monc->set_log_client(&log_client);
2557 update_log_config();
2558
2559 peering_tp.start();
2560 osd_op_tp.start();
2561 disk_tp.start();
2562 command_tp.start();
2563
2564 set_disk_tp_priority();
2565
2566 // start the heartbeat
2567 heartbeat_thread.create("osd_srv_heartbt");
2568
2569 // tick
2570 tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
2571 {
2572 Mutex::Locker l(tick_timer_lock);
2573 tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
2574 }
2575
2576 service.init();
2577 service.publish_map(osdmap);
2578 service.publish_superblock(superblock);
2579 service.max_oldest_map = superblock.oldest_map;
2580
2581 osd_lock.Unlock();
2582
2583 r = monc->authenticate();
2584 if (r < 0) {
2585 osd_lock.Lock(); // locker is going to unlock this on function exit
2586 if (is_stopping())
2587 r = 0;
2588 goto monout;
2589 }
2590
2591 while (monc->wait_auth_rotating(30.0) < 0) {
2592 derr << "unable to obtain rotating service keys; retrying" << dendl;
2593 ++rotating_auth_attempts;
2594 if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
2595 osd_lock.Lock(); // make locker happy
2596 if (!is_stopping()) {
2597 r = -ETIMEDOUT;
2598 }
2599 goto monout;
2600 }
2601 }
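
// Rotating service keys are the time-limited shared secrets OSDs use to
// authenticate with their peers; wait_auth_rotating() blocks until a
// usable set has arrived from the monitors, so we retry (up to
// max_rotating_auth_attempts) rather than boot unable to talk to other
// OSDs.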
2602
2603 r = update_crush_device_class();
2604 if (r < 0) {
2605 osd_lock.Lock();
2606 goto monout;
2607 }
2608
2609 r = update_crush_location();
2610 if (r < 0) {
2611 osd_lock.Lock();
2612 goto monout;
2613 }
2614
2615 osd_lock.Lock();
2616 if (is_stopping())
2617 return 0;
2618
2619 // start objecter *after* we have authenticated, so that we don't ignore
2620 // the OSDMaps it requests.
2621 service.final_init();
2622
2623 check_config();
2624
2625 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
2626 consume_map();
2627 peering_wq.drain();
2628
2629 dout(0) << "done with init, starting boot process" << dendl;
2630
2631 // subscribe to any pg creations
2632 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
2633
2634 // MgrClient needs this (it doesn't have MonClient reference itself)
2635 monc->sub_want("mgrmap", 0, 0);
2636
2637 // we don't need to ask for an osdmap here; the objecter will request one
2638 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2639
2640 monc->renew_subs();
2641
2642 start_boot();
2643
2644 return 0;
2645 monout:
2646 mgrc.shutdown();
2647 monc->shutdown();
2648
2649 out:
2650 enable_disable_fuse(true);
2651 store->umount();
2652 delete store;
2653 store = NULL;
2654 return r;
2655 }
2656
2657 void OSD::final_init()
2658 {
2659 AdminSocket *admin_socket = cct->get_admin_socket();
2660 asok_hook = new OSDSocketHook(this);
2661 int r = admin_socket->register_command("status", "status", asok_hook,
2662 "high-level status of OSD");
2663 assert(r == 0);
2664 r = admin_socket->register_command("flush_journal", "flush_journal",
2665 asok_hook,
2666 "flush the journal to permanent store");
2667 assert(r == 0);
2668 r = admin_socket->register_command("dump_ops_in_flight",
2669 "dump_ops_in_flight", asok_hook,
2670 "show the ops currently in flight");
2671 assert(r == 0);
2672 r = admin_socket->register_command("ops",
2673 "ops", asok_hook,
2674 "show the ops currently in flight");
2675 assert(r == 0);
2676 r = admin_socket->register_command("dump_blocked_ops",
2677 "dump_blocked_ops", asok_hook,
2678 "show the blocked ops currently in flight");
2679 assert(r == 0);
2680 r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops",
2681 asok_hook,
2682 "show recent ops");
2683 assert(r == 0);
2684 r = admin_socket->register_command("dump_historic_slow_ops", "dump_historic_slow_ops",
2685 asok_hook,
2686 "show slowest recent ops");
2687 assert(r == 0);
2688 r = admin_socket->register_command("dump_historic_ops_by_duration", "dump_historic_ops_by_duration",
2689 asok_hook,
2690 "show slowest recent ops, sorted by duration");
2691 assert(r == 0);
2692 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2693 asok_hook,
2694 "dump op priority queue state");
2695 assert(r == 0);
2696 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2697 asok_hook,
2698 "dump blacklisted clients and times");
2699 assert(r == 0);
2700 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2701 asok_hook,
2702 "show clients which have active watches,"
2703 " and on which objects");
2704 assert(r == 0);
2705 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2706 asok_hook,
2707 "show recovery reservations");
2708 assert(r == 0);
2709 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2710 asok_hook,
2711 "force osd to update the latest map from "
2712 "the mon");
2713 assert(r == 0);
2714
2715 r = admin_socket->register_command( "heap",
2716 "heap " \
2717 "name=heapcmd,type=CephString",
2718 asok_hook,
2719 "show heap usage info (available only if "
2720 "compiled with tcmalloc)");
2721 assert(r == 0);
2722
2723 r = admin_socket->register_command("set_heap_property",
2724 "set_heap_property " \
2725 "name=property,type=CephString " \
2726 "name=value,type=CephInt",
2727 asok_hook,
2728 "update malloc extension heap property");
2729 assert(r == 0);
2730
2731 r = admin_socket->register_command("get_heap_property",
2732 "get_heap_property " \
2733 "name=property,type=CephString",
2734 asok_hook,
2735 "get malloc extension heap property");
2736 assert(r == 0);
2737
2738 r = admin_socket->register_command("dump_objectstore_kv_stats",
2739 "dump_objectstore_kv_stats",
2740 asok_hook,
2741 "print statistics of kvdb which used by bluestore");
2742 assert(r == 0);
2743
2744 r = admin_socket->register_command("dump_scrubs",
2745 "dump_scrubs",
2746 asok_hook,
2747 "print scheduled scrubs");
2748 assert(r == 0);
2749
2750 r = admin_socket->register_command("calc_objectstore_db_histogram",
2751 "calc_objectstore_db_histogram",
2752 asok_hook,
2753 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2754 assert(r == 0);
2755
2756 r = admin_socket->register_command("flush_store_cache",
2757 "flush_store_cache",
2758 asok_hook,
2759 "Flush bluestore internal cache");
2760 assert(r == 0);
2761 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2762 asok_hook,
2763 "show recent state history");
2764 assert(r == 0);
2765
2766 r = admin_socket->register_command("compact", "compact",
2767 asok_hook,
2768 "Commpact object store's omap."
2769 " WARNING: Compaction probably slows your requests");
2770 assert(r == 0);
2771
2772 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2773 // Note: pools are CephString instead of CephPoolname because
2774 // these commands traditionally support both pool names and numbers
2775 r = admin_socket->register_command(
2776 "setomapval",
2777 "setomapval " \
2778 "name=pool,type=CephString " \
2779 "name=objname,type=CephObjectname " \
2780 "name=key,type=CephString "\
2781 "name=val,type=CephString",
2782 test_ops_hook,
2783 "set omap key");
2784 assert(r == 0);
2785 r = admin_socket->register_command(
2786 "rmomapkey",
2787 "rmomapkey " \
2788 "name=pool,type=CephString " \
2789 "name=objname,type=CephObjectname " \
2790 "name=key,type=CephString",
2791 test_ops_hook,
2792 "remove omap key");
2793 assert(r == 0);
2794 r = admin_socket->register_command(
2795 "setomapheader",
2796 "setomapheader " \
2797 "name=pool,type=CephString " \
2798 "name=objname,type=CephObjectname " \
2799 "name=header,type=CephString",
2800 test_ops_hook,
2801 "set omap header");
2802 assert(r == 0);
2803
2804 r = admin_socket->register_command(
2805 "getomap",
2806 "getomap " \
2807 "name=pool,type=CephString " \
2808 "name=objname,type=CephObjectname",
2809 test_ops_hook,
2810 "output entire object map");
2811 assert(r == 0);
2812
2813 r = admin_socket->register_command(
2814 "truncobj",
2815 "truncobj " \
2816 "name=pool,type=CephString " \
2817 "name=objname,type=CephObjectname " \
2818 "name=len,type=CephInt",
2819 test_ops_hook,
2820 "truncate object to length");
2821 assert(r == 0);
2822
2823 r = admin_socket->register_command(
2824 "injectdataerr",
2825 "injectdataerr " \
2826 "name=pool,type=CephString " \
2827 "name=objname,type=CephObjectname " \
2828 "name=shardid,type=CephInt,req=false,range=0|255",
2829 test_ops_hook,
2830 "inject data error to an object");
2831 assert(r == 0);
2832
2833 r = admin_socket->register_command(
2834 "injectmdataerr",
2835 "injectmdataerr " \
2836 "name=pool,type=CephString " \
2837 "name=objname,type=CephObjectname " \
2838 "name=shardid,type=CephInt,req=false,range=0|255",
2839 test_ops_hook,
2840 "inject metadata error to an object");
2841 assert(r == 0);
2842 r = admin_socket->register_command(
2843 "set_recovery_delay",
2844 "set_recovery_delay " \
2845 "name=utime,type=CephInt,req=false",
2846 test_ops_hook,
2847 "Delay osd recovery by specified seconds");
2848 assert(r == 0);
2849 r = admin_socket->register_command(
2850 "trigger_scrub",
2851 "trigger_scrub " \
2852 "name=pgid,type=CephString ",
2853 test_ops_hook,
2854 "Trigger a scheduled scrub ");
2855 assert(r == 0);
2856 r = admin_socket->register_command(
2857 "injectfull",
2858 "injectfull " \
2859 "name=type,type=CephString,req=false " \
2860 "name=count,type=CephInt,req=false ",
2861 test_ops_hook,
2862 "Inject a full disk (optional count times)");
2863 assert(r == 0);
2864 }
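
// Once registered, all of the commands above are reachable through the
// daemon's admin socket; for example (illustrative daemon id):
//
//   ceph daemon osd.3 help                 # lists every registered command
//   ceph daemon osd.3 status
//   ceph daemon osd.3 dump_historic_ops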
2865
2866 void OSD::create_logger()
2867 {
2868 dout(10) << "create_logger" << dendl;
2869
2870 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
2871
2872 // Latency axis configuration for op histograms, values are in nanoseconds
2873 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
2874 "Latency (usec)",
2875 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
2876 0, ///< Start at 0
2877 100000, ///< Quantization unit is 100usec
2878 32, ///< Enough to cover much longer than slow requests
2879 };
2880
2881 // Op size axis configuration for op histograms, values are in bytes
2882 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
2883 "Request size (bytes)",
2884 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
2885 0, ///< Start at 0
2886 512, ///< Quantization unit is 512 bytes
2887 32, ///< Enough to cover requests larger than GB
2888 };
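
// Reading these axes: with SCALE_LOG2 the first bucket spans one
// quantization unit and each subsequent bucket roughly doubles, so the
// latency axis covers approximately
//   [0, 100us) [100us, 200us) [200us, 400us) ... across its 32 buckets,
// and the size axis
//   [0, 512B) [512B, 1KiB) [1KiB, 2KiB) ... up to well beyond a GB.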
2889
2890
2891 osd_plb.add_u64(
2892 l_osd_op_wip, "op_wip",
2893 "Replication operations currently being processed (primary)");
2894 osd_plb.add_u64_counter(
2895 l_osd_op, "op",
2896 "Client operations",
2897 "ops", PerfCountersBuilder::PRIO_CRITICAL);
2898 osd_plb.add_u64_counter(
2899 l_osd_op_inb, "op_in_bytes",
2900 "Client operations total write size",
2901 "wr", PerfCountersBuilder::PRIO_INTERESTING);
2902 osd_plb.add_u64_counter(
2903 l_osd_op_outb, "op_out_bytes",
2904 "Client operations total read size",
2905 "rd", PerfCountersBuilder::PRIO_INTERESTING);
2906 osd_plb.add_time_avg(
2907 l_osd_op_lat, "op_latency",
2908 "Latency of client operations (including queue time)",
2909 "l", 9);
2910 osd_plb.add_time_avg(
2911 l_osd_op_process_lat, "op_process_latency",
2912 "Latency of client operations (excluding queue time)");
2913 osd_plb.add_time_avg(
2914 l_osd_op_prepare_lat, "op_prepare_latency",
2915 "Latency of client operations (excluding queue time and wait for finished)");
2916
2917 osd_plb.add_u64_counter(
2918 l_osd_op_r, "op_r", "Client read operations");
2919 osd_plb.add_u64_counter(
2920 l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
2921 osd_plb.add_time_avg(
2922 l_osd_op_r_lat, "op_r_latency",
2923 "Latency of read operation (including queue time)");
2924 osd_plb.add_u64_counter_histogram(
2925 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
2926 op_hist_x_axis_config, op_hist_y_axis_config,
2927 "Histogram of operation latency (including queue time) + data read");
2928 osd_plb.add_time_avg(
2929 l_osd_op_r_process_lat, "op_r_process_latency",
2930 "Latency of read operation (excluding queue time)");
2931 osd_plb.add_time_avg(
2932 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
2933 "Latency of read operations (excluding queue time and wait for finished)");
2934 osd_plb.add_u64_counter(
2935 l_osd_op_w, "op_w", "Client write operations");
2936 osd_plb.add_u64_counter(
2937 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
2938 osd_plb.add_time_avg(
2939 l_osd_op_w_lat, "op_w_latency",
2940 "Latency of write operation (including queue time)");
2941 osd_plb.add_u64_counter_histogram(
2942 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
2943 op_hist_x_axis_config, op_hist_y_axis_config,
2944 "Histogram of operation latency (including queue time) + data written");
2945 osd_plb.add_time_avg(
2946 l_osd_op_w_process_lat, "op_w_process_latency",
2947 "Latency of write operation (excluding queue time)");
2948 osd_plb.add_time_avg(
2949 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
2950 "Latency of write operations (excluding queue time and wait for finished)");
2951 osd_plb.add_u64_counter(
2952 l_osd_op_rw, "op_rw",
2953 "Client read-modify-write operations");
2954 osd_plb.add_u64_counter(
2955 l_osd_op_rw_inb, "op_rw_in_bytes",
2956 "Client read-modify-write operations write in");
2957 osd_plb.add_u64_counter(
2958 l_osd_op_rw_outb, "op_rw_out_bytes",
2959 "Client read-modify-write operations read out");
2960 osd_plb.add_time_avg(
2961 l_osd_op_rw_lat, "op_rw_latency",
2962 "Latency of read-modify-write operation (including queue time)");
2963 osd_plb.add_u64_counter_histogram(
2964 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
2965 op_hist_x_axis_config, op_hist_y_axis_config,
2966 "Histogram of rw operation latency (including queue time) + data written");
2967 osd_plb.add_u64_counter_histogram(
2968 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
2969 op_hist_x_axis_config, op_hist_y_axis_config,
2970 "Histogram of rw operation latency (including queue time) + data read");
2971 osd_plb.add_time_avg(
2972 l_osd_op_rw_process_lat, "op_rw_process_latency",
2973 "Latency of read-modify-write operation (excluding queue time)");
2974 osd_plb.add_time_avg(
2975 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
2976 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
2977
2978 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
2979 "Latency of IO before calling queue (before actually being queued into ShardedOpWq)"); // client io before queue op_wq latency
2980 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
2981 "Latency of IO before calling dequeue_op (already dequeued and PG lock acquired)"); // client io before dequeue_op latency
2982
2983 osd_plb.add_u64_counter(
2984 l_osd_sop, "subop", "Suboperations");
2985 osd_plb.add_u64_counter(
2986 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
2987 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
2988
2989 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
2990 osd_plb.add_u64_counter(
2991 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
2992 osd_plb.add_time_avg(
2993 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
2994 osd_plb.add_u64_counter(
2995 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
2996 osd_plb.add_time_avg(
2997 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
2998 osd_plb.add_u64_counter(
2999 l_osd_sop_push, "subop_push", "Suboperations push messages");
3000 osd_plb.add_u64_counter(
3001 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
3002 osd_plb.add_time_avg(
3003 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3004
3005 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3006 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3007 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
3008
3009 osd_plb.add_u64_counter(
3010 l_osd_rop, "recovery_ops",
3011 "Started recovery operations",
3012 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3013
3014 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3015 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
3016 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
3017 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3018 osd_plb.add_u64(
3019 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3020 osd_plb.add_u64(
3021 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3022 "Total number getting crc from crc_cache with adjusting");
3023 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3024 "Total number of crc cache misses");
3025
3026 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3027 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3028 osd_plb.add_u64(
3029 l_osd_pg_primary, "numpg_primary",
3030 "Placement groups for which this osd is primary");
3031 osd_plb.add_u64(
3032 l_osd_pg_replica, "numpg_replica",
3033 "Placement groups for which this osd is replica");
3034 osd_plb.add_u64(
3035 l_osd_pg_stray, "numpg_stray",
3036 "Placement groups ready to be deleted from this osd");
3037 osd_plb.add_u64(
3038 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3039 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3040 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3041 osd_plb.add_u64_counter(
3042 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3043 osd_plb.add_u64_counter(
3044 l_osd_waiting_for_map, "messages_delayed_for_map",
3045 "Operations waiting for OSD map");
3046
3047 osd_plb.add_u64_counter(
3048 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3049 osd_plb.add_u64_counter(
3050 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3051 osd_plb.add_u64_counter(
3052 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3053 "osdmap cache miss below cache lower bound");
3054 osd_plb.add_u64_avg(
3055 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3056 "osdmap cache miss, avg distance below cache lower bound");
3057 osd_plb.add_u64_counter(
3058 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3059 "OSDMap buffer cache hits");
3060 osd_plb.add_u64_counter(
3061 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3062 "OSDMap buffer cache misses");
3063
3064 osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
3065 osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
3066 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
3067
3068 osd_plb.add_u64_counter(
3069 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3070
3071 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3072 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3073 osd_plb.add_u64_counter(
3074 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3075 osd_plb.add_u64_counter(
3076 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3077 osd_plb.add_u64_counter(
3078 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3079 "Failed tier flush attempts");
3080 osd_plb.add_u64_counter(
3081 l_osd_tier_evict, "tier_evict", "Tier evictions");
3082 osd_plb.add_u64_counter(
3083 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3084 osd_plb.add_u64_counter(
3085 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3086 osd_plb.add_u64_counter(
3087 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3088 osd_plb.add_u64_counter(
3089 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3090 osd_plb.add_u64_counter(
3091 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3092 osd_plb.add_u64_counter(
3093 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3094
3095 osd_plb.add_u64_counter(
3096 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3097 osd_plb.add_u64_counter(
3098 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3099 osd_plb.add_u64_counter(
3100 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3101 osd_plb.add_u64_counter(
3102 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3103
3104 osd_plb.add_u64_counter(
3105 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3106 osd_plb.add_u64_counter(
3107 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3108
3109 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3110 osd_plb.add_time_avg(
3111 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3112 osd_plb.add_time_avg(
3113 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3114 osd_plb.add_time_avg(
3115 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3116
3117 osd_plb.add_u64_counter(
3118 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3119 osd_plb.add_u64_counter(
3120 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3121 "PG updated its info using fastinfo attr");
3122 osd_plb.add_u64_counter(
3123 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3124
3125 logger = osd_plb.create_perf_counters();
3126 cct->get_perfcounters_collection()->add(logger);
3127 }
3128
3129 void OSD::create_recoverystate_perf()
3130 {
3131 dout(10) << "create_recoverystate_perf" << dendl;
3132
3133 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3134
3135 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3136 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3137 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3138 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3139 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3140 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3141 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3142 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3143 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3144 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3145 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3146 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3147 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3148 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3149 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3150 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3151 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3152 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3153 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3154 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3155 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3156 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3157 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3158 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3159 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3160 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3161 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3162 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3163 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3164 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3165 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3166
3167 recoverystate_perf = rs_perf.create_perf_counters();
3168 cct->get_perfcounters_collection()->add(recoverystate_perf);
3169 }
3170
3171 int OSD::shutdown()
3172 {
3173 if (!service.prepare_to_stop())
3174 return 0; // already shutting down
3175 osd_lock.Lock();
3176 if (is_stopping()) {
3177 osd_lock.Unlock();
3178 return 0;
3179 }
3180 derr << "shutdown" << dendl;
3181
3182 set_state(STATE_STOPPING);
3183
3184 // Debugging
3185 cct->_conf->set_val("debug_osd", "100");
3186 cct->_conf->set_val("debug_journal", "100");
3187 cct->_conf->set_val("debug_filestore", "100");
3188 cct->_conf->set_val("debug_ms", "100");
3189 cct->_conf->apply_changes(NULL);
3190
3191 // stop MgrClient earlier as it's more like an internal consumer of OSD
3192 mgrc.shutdown();
3193
3194 service.start_shutdown();
3195
3196 // stop sending work to pgs. this just prevents any new work in _process
3197 // from racing with on_shutdown and potentially entering the pg after.
3198 op_shardedwq.drain();
3199
3200 // Shutdown PGs
3201 {
3202 RWLock::RLocker l(pg_map_lock);
3203 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3204 p != pg_map.end();
3205 ++p) {
3206 dout(20) << " kicking pg " << p->first << dendl;
3207 p->second->lock();
3208 p->second->on_shutdown();
3209 p->second->unlock();
3210 p->second->osr->flush();
3211 }
3212 }
3213 clear_pg_stat_queue();
3214
3215 // drain op queue again (in case PGs requeued something)
3216 op_shardedwq.drain();
3217 {
3218 finished.clear(); // zap waiters (bleh, this is messy)
3219 }
3220
3221 op_shardedwq.clear_pg_slots();
3222
3223 // unregister commands
3224 cct->get_admin_socket()->unregister_command("status");
3225 cct->get_admin_socket()->unregister_command("flush_journal");
3226 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3227 cct->get_admin_socket()->unregister_command("ops");
3228 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3229 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3230 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3231 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3232 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3233 cct->get_admin_socket()->unregister_command("dump_blacklist");
3234 cct->get_admin_socket()->unregister_command("dump_watchers");
3235 cct->get_admin_socket()->unregister_command("dump_reservations");
3236 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
3237 cct->get_admin_socket()->unregister_command("heap");
3238 cct->get_admin_socket()->unregister_command("set_heap_property");
3239 cct->get_admin_socket()->unregister_command("get_heap_property");
3240 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
3241 cct->get_admin_socket()->unregister_command("dump_scrubs");
3242 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3243 cct->get_admin_socket()->unregister_command("flush_store_cache");
3244 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
3245 cct->get_admin_socket()->unregister_command("compact");
3246 delete asok_hook;
3247 asok_hook = NULL;
3248
3249 cct->get_admin_socket()->unregister_command("setomapval");
3250 cct->get_admin_socket()->unregister_command("rmomapkey");
3251 cct->get_admin_socket()->unregister_command("setomapheader");
3252 cct->get_admin_socket()->unregister_command("getomap");
3253 cct->get_admin_socket()->unregister_command("truncobj");
3254 cct->get_admin_socket()->unregister_command("injectdataerr");
3255 cct->get_admin_socket()->unregister_command("injectmdataerr");
3256 cct->get_admin_socket()->unregister_command("set_recovery_delay");
3257 cct->get_admin_socket()->unregister_command("trigger_scrub");
3258 cct->get_admin_socket()->unregister_command("injectfull");
3259 delete test_ops_hook;
3260 test_ops_hook = NULL;
3261
3262 osd_lock.Unlock();
3263
3264 heartbeat_lock.Lock();
3265 heartbeat_stop = true;
3266 heartbeat_cond.Signal();
3267 heartbeat_lock.Unlock();
3268 heartbeat_thread.join();
3269
3270 peering_tp.drain();
3271 peering_wq.clear();
3272 peering_tp.stop();
3273 dout(10) << "osd tp stopped" << dendl;
3274
3275 osd_op_tp.drain();
3276 osd_op_tp.stop();
3277 dout(10) << "op sharded tp stopped" << dendl;
3278
3279 command_tp.drain();
3280 command_tp.stop();
3281 dout(10) << "command tp stopped" << dendl;
3282
3283 disk_tp.drain();
3284 disk_tp.stop();
3285 dout(10) << "disk tp paused (new)" << dendl;
3286
3287 dout(10) << "stopping agent" << dendl;
3288 service.agent_stop();
3289
3290 osd_lock.Lock();
3291
3292 reset_heartbeat_peers();
3293
3294 tick_timer.shutdown();
3295
3296 {
3297 Mutex::Locker l(tick_timer_lock);
3298 tick_timer_without_osd_lock.shutdown();
3299 }
3300
3301 // note unmount epoch
3302 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3303 superblock.mounted = service.get_boot_epoch();
3304 superblock.clean_thru = osdmap->get_epoch();
3305 ObjectStore::Transaction t;
3306 write_superblock(t);
3307 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3308 if (r) {
3309 derr << "OSD::shutdown: error writing superblock: "
3310 << cpp_strerror(r) << dendl;
3311 }
3312
3313
3314 {
3315 Mutex::Locker l(pg_stat_queue_lock);
3316 assert(pg_stat_queue.empty());
3317 }
3318
3319 service.shutdown_reserver();
3320
3321 // Remove PGs
3322 #ifdef PG_DEBUG_REFS
3323 service.dump_live_pgids();
3324 #endif
3325 {
3326 RWLock::RLocker l(pg_map_lock);
3327 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3328 p != pg_map.end();
3329 ++p) {
3330 dout(20) << " kicking pg " << p->first << dendl;
3331 p->second->lock();
3332 if (p->second->ref != 1) {
3333 derr << "pgid " << p->first << " has ref count of "
3334 << p->second->ref << dendl;
3335 #ifdef PG_DEBUG_REFS
3336 p->second->dump_live_ids();
3337 #endif
3338 if (cct->_conf->osd_shutdown_pgref_assert) {
3339 ceph_abort();
3340 }
3341 }
3342 p->second->unlock();
3343 p->second->put("PGMap");
3344 }
3345 pg_map.clear();
3346 }
3347 #ifdef PG_DEBUG_REFS
3348 service.dump_live_pgids();
3349 #endif
3350 cct->_conf->remove_observer(this);
3351
3352 dout(10) << "syncing store" << dendl;
3353 enable_disable_fuse(true);
3354
3355 if (cct->_conf->osd_journal_flush_on_shutdown) {
3356 dout(10) << "flushing journal" << dendl;
3357 store->flush_journal();
3358 }
3359
3360 store->umount();
3361 delete store;
3362 store = NULL;
3363 dout(10) << "Store synced" << dendl;
3364
3365 monc->shutdown();
3366 osd_lock.Unlock();
3367
3368 osdmap = OSDMapRef();
3369 service.shutdown();
3370 op_tracker.on_shutdown();
3371
3372 class_handler->shutdown();
3373 client_messenger->shutdown();
3374 cluster_messenger->shutdown();
3375 hb_front_client_messenger->shutdown();
3376 hb_back_client_messenger->shutdown();
3377 objecter_messenger->shutdown();
3378 hb_front_server_messenger->shutdown();
3379 hb_back_server_messenger->shutdown();
3380
3381 peering_wq.clear();
3382
3383 return r;
3384 }
3385
3386 int OSD::mon_cmd_maybe_osd_create(string &cmd)
3387 {
3388 bool created = false;
3389 while (true) {
3390 dout(10) << __func__ << " cmd: " << cmd << dendl;
3391 vector<string> vcmd{cmd};
3392 bufferlist inbl;
3393 C_SaferCond w;
3394 string outs;
3395 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3396 int r = w.wait();
3397 if (r < 0) {
3398 if (r == -ENOENT && !created) {
3399 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3400 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3401 vector<string> vnewcmd{newcmd};
3402 bufferlist inbl;
3403 C_SaferCond w;
3404 string outs;
3405 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3406 int r = w.wait();
3407 if (r < 0) {
3408 derr << __func__ << " fail: osd does not exist and create failed: "
3409 << cpp_strerror(r) << dendl;
3410 return r;
3411 }
3412 created = true;
3413 continue;
3414 }
3415 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3416 return r;
3417 }
3418 break;
3419 }
3420
3421 return 0;
3422 }
3423
3424 int OSD::update_crush_location()
3425 {
3426 if (!cct->_conf->osd_crush_update_on_start) {
3427 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3428 return 0;
3429 }
3430
3431 char weight[32];
3432 if (cct->_conf->osd_crush_initial_weight >= 0) {
3433 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3434 } else {
3435 struct store_statfs_t st;
3436 int r = store->statfs(&st);
3437 if (r < 0) {
3438 derr << "statfs: " << cpp_strerror(r) << dendl;
3439 return r;
3440 }
3441 snprintf(weight, sizeof(weight), "%.4lf",
3442 MAX((double).00001,
3443 (double)(st.total) /
3444 (double)(1ull << 40 /* TB */)));
3445 }
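// Illustrative arithmetic (commentary, not from the source): the weight is
// the store size in TiB, so a 4 TB (4e12-byte) device yields roughly
// 4e12 / 2^40 ~= 3.6380, and anything under ~11 MB hits the 0.00001 floor.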
3446
3447 std::multimap<string,string> loc = cct->crush_location.get_location();
3448 dout(10) << __func__ << " crush location is " << loc << dendl;
3449
3450 string cmd =
3451 string("{\"prefix\": \"osd crush create-or-move\", ") +
3452 string("\"id\": ") + stringify(whoami) + string(", ") +
3453 string("\"weight\":") + weight + string(", ") +
3454 string("\"args\": [");
3455 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3456 if (p != loc.begin())
3457 cmd += ", ";
3458 cmd += "\"" + p->first + "=" + p->second + "\"";
3459 }
3460 cmd += "]}";
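// With illustrative values, the command built above looks like:
//   {"prefix": "osd crush create-or-move", "id": 3, "weight":1.8190,
//    "args": ["host=node1", "rack=r1"]}
// (the host/rack names here are hypothetical, not from the source)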
3461
3462 return mon_cmd_maybe_osd_create(cmd);
3463 }
3464
3465 int OSD::update_crush_device_class()
3466 {
3467 if (!cct->_conf->osd_class_update_on_start) {
3468 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
3469 return 0;
3470 }
3471
3472 string device_class;
3473 int r = store->read_meta("crush_device_class", &device_class);
3474 if (r < 0 || device_class.empty()) {
3475 device_class = store->get_default_device_class();
3476 }
3477
3478 if (device_class.empty()) {
3479 return 0;
3480 }
3481
3482 string cmd =
3483 string("{\"prefix\": \"osd crush set-device-class\", ") +
3484 string("\"class\": \"") + device_class + string("\", ") +
3485 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
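// With illustrative values, the command built above looks like:
//   {"prefix": "osd crush set-device-class", "class": "hdd", "ids": ["3"]}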
3486
3487 r = mon_cmd_maybe_osd_create(cmd);
3488 if (r == -EPERM) {
3489 r = 0;
3490 }
3491
3492 return r;
3493 }
3494
3495 void OSD::write_superblock(ObjectStore::Transaction& t)
3496 {
3497 dout(10) << "write_superblock " << superblock << dendl;
3498
3499 //hack: at minimum it's using the baseline feature set
3500 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3501 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3502
3503 bufferlist bl;
3504 ::encode(superblock, bl);
3505 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3506 }
3507
3508 int OSD::read_superblock()
3509 {
3510 bufferlist bl;
3511 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3512 if (r < 0)
3513 return r;
3514
3515 bufferlist::iterator p = bl.begin();
3516 ::decode(superblock, p);
3517
3518 dout(10) << "read_superblock " << superblock << dendl;
3519
3520 return 0;
3521 }
3522
3523 void OSD::clear_temp_objects()
3524 {
3525 dout(10) << __func__ << dendl;
3526 vector<coll_t> ls;
3527 store->list_collections(ls);
3528 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
3529 spg_t pgid;
3530 if (!p->is_pg(&pgid))
3531 continue;
3532
3533 // list temp objects
3534 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
3535
3536 vector<ghobject_t> temps;
3537 ghobject_t next;
3538 while (1) {
3539 vector<ghobject_t> objects;
3540 store->collection_list(*p, next, ghobject_t::get_max(),
3541 store->get_ideal_list_max(),
3542 &objects, &next);
3543 if (objects.empty())
3544 break;
3545 vector<ghobject_t>::iterator q;
3546 for (q = objects.begin(); q != objects.end(); ++q) {
3547 // Hammer set the pool for temp objects to -1, so check those for clean-up too
3548 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
3549 temps.push_back(*q);
3550 } else {
3551 break;
3552 }
3553 }
3554 // If we saw a non-temp object and hit the break above we can
3555 // break out of the while loop too.
3556 if (q != objects.end())
3557 break;
3558 }
3559 if (!temps.empty()) {
3560 ObjectStore::Transaction t;
3561 int removed = 0;
3562 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
3563 dout(20) << " removing " << *p << " object " << *q << dendl;
3564 t.remove(*p, *q);
3565 if (++removed > cct->_conf->osd_target_transaction_size) {
3566 store->apply_transaction(service.meta_osr.get(), std::move(t));
3567 t = ObjectStore::Transaction();
3568 removed = 0;
3569 }
3570 }
3571 if (removed) {
3572 store->apply_transaction(service.meta_osr.get(), std::move(t));
3573 }
3574 }
3575 }
3576 }
3577
3578 void OSD::recursive_remove_collection(CephContext* cct,
3579 ObjectStore *store, spg_t pgid,
3580 coll_t tmp)
3581 {
3582 OSDriver driver(
3583 store,
3584 coll_t(),
3585 make_snapmapper_oid());
3586
3587 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3588 ObjectStore::Sequencer>("rm"));
3589 ObjectStore::Transaction t;
3590 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
3591
3592 vector<ghobject_t> objects;
3593 store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
3594 INT_MAX, &objects, 0);
3595 generic_dout(10) << __func__ << " " << objects << dendl;
3596 // delete them.
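// Commentary (not from the source): removals are batched; once a
// transaction accumulates more than osd_target_transaction_size removes,
// it is applied and a fresh transaction is started, bounding the size of
// any single transaction.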
3597 int removed = 0;
3598 for (vector<ghobject_t>::iterator p = objects.begin();
3599 p != objects.end();
3600 ++p, removed++) {
3601 OSDriver::OSTransaction _t(driver.get_transaction(&t));
3602 int r = mapper.remove_oid(p->hobj, &_t);
3603 if (r != 0 && r != -ENOENT)
3604 ceph_abort();
3605 t.remove(tmp, *p);
3606 if (removed > cct->_conf->osd_target_transaction_size) {
3607 int r = store->apply_transaction(osr.get(), std::move(t));
3608 assert(r == 0);
3609 t = ObjectStore::Transaction();
3610 removed = 0;
3611 }
3612 }
3613 t.remove_collection(tmp);
3614 int r = store->apply_transaction(osr.get(), std::move(t));
3615 assert(r == 0);
3616
3617 C_SaferCond waiter;
3618 if (!osr->flush_commit(&waiter)) {
3619 waiter.wait();
3620 }
3621 }
3622
3623
3624 // ======================================================
3625 // PGs
3626
3627 PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3628 {
3629 if (!createmap->have_pg_pool(id)) {
3630 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3631 << id << dendl;
3632 ceph_abort();
3633 }
3634
3635 PGPool p = PGPool(cct, createmap, id);
3636
3637 dout(10) << "_get_pool " << p.id << dendl;
3638 return p;
3639 }
3640
3641 PG *OSD::_open_lock_pg(
3642 OSDMapRef createmap,
3643 spg_t pgid, bool no_lockdep_check)
3644 {
3645 assert(osd_lock.is_locked());
3646
3647 PG* pg = _make_pg(createmap, pgid);
3648 {
3649 RWLock::WLocker l(pg_map_lock);
3650 pg->lock(no_lockdep_check);
3651 pg_map[pgid] = pg;
3652 pg->get("PGMap"); // because it's in pg_map
3653 service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
3654 }
3655 return pg;
3656 }
3657
3658 PG* OSD::_make_pg(
3659 OSDMapRef createmap,
3660 spg_t pgid)
3661 {
3662 dout(10) << "_make_pg " << pgid << dendl;
3663 PGPool pool = _get_pool(pgid.pool(), createmap);
3664
3665 // create
3666 PG *pg;
3667 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3668 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3669 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3670 else
3671 ceph_abort();
3672
3673 return pg;
3674 }
3675
3676
3677 void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
3678 {
3679 epoch_t e(service.get_osdmap()->get_epoch());
3680 pg->get("PGMap"); // For pg_map
3681 pg_map[pg->info.pgid] = pg;
3682 service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
3683
3684 dout(10) << "Adding newly split pg " << *pg << dendl;
3685 pg->handle_loaded(rctx);
3686 pg->write_if_dirty(*(rctx->transaction));
3687 pg->queue_null(e, e);
3688 map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
3689 peering_wait_for_split.find(pg->info.pgid);
3690 if (to_wake != peering_wait_for_split.end()) {
3691 for (list<PG::CephPeeringEvtRef>::iterator i =
3692 to_wake->second.begin();
3693 i != to_wake->second.end();
3694 ++i) {
3695 pg->queue_peering_event(*i);
3696 }
3697 peering_wait_for_split.erase(to_wake);
3698 }
3699 if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
3700 _remove_pg(pg);
3701 }
3702
3703 OSD::res_result OSD::_try_resurrect_pg(
3704 OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
3705 {
3706 assert(resurrected);
3707 assert(old_pg_state);
3708 // find nearest ancestor
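// e.g. (illustrative): for pgid 1.1a the loop below checks 1.1a, then its
// ancestors 1.a, 1.2, and 1.0 (each get_parent() call clears the top bit
// of the placement seed), stopping at the first entry in deleting_pgs.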
3709 DeletingStateRef df;
3710 spg_t cur(pgid);
3711 while (true) {
3712 df = service.deleting_pgs.lookup(cur);
3713 if (df)
3714 break;
3715 if (!cur.ps())
3716 break;
3717 cur = cur.get_parent();
3718 }
3719 if (!df)
3720 return RES_NONE; // good to go
3721
3722 df->old_pg_state->lock();
3723 OSDMapRef create_map = df->old_pg_state->get_osdmap();
3724 df->old_pg_state->unlock();
3725
3726 set<spg_t> children;
3727 if (cur == pgid) {
3728 if (df->try_stop_deletion()) {
3729 dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
3730 *resurrected = cur;
3731 *old_pg_state = df->old_pg_state;
3732 service.deleting_pgs.remove(pgid); // PG is no longer being removed!
3733 return RES_SELF;
3734 } else {
3735 // raced, ensure we don't see DeletingStateRef when we try to
3736 // delete this pg
3737 service.deleting_pgs.remove(pgid);
3738 return RES_NONE;
3739 }
3740 } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
3741 curmap->get_pg_num(cur.pool()),
3742 &children) &&
3743 children.count(pgid)) {
3744 if (df->try_stop_deletion()) {
3745 dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
3746 << dendl;
3747 *resurrected = cur;
3748 *old_pg_state = df->old_pg_state;
3749 service.deleting_pgs.remove(cur); // PG is no longer being removed!
3750 return RES_PARENT;
3751 } else {
3752 /* this is not a problem, failing to cancel proves that all objects
3753 * have been removed, so no hobject_t overlap is possible
3754 */
3755 return RES_NONE;
3756 }
3757 }
3758 return RES_NONE;
3759 }
3760
3761 PG *OSD::_create_lock_pg(
3762 OSDMapRef createmap,
3763 spg_t pgid,
3764 bool hold_map_lock,
3765 bool backfill,
3766 int role,
3767 vector<int>& up, int up_primary,
3768 vector<int>& acting, int acting_primary,
3769 pg_history_t history,
3770 const PastIntervals& pi,
3771 ObjectStore::Transaction& t)
3772 {
3773 assert(osd_lock.is_locked());
3774 dout(20) << "_create_lock_pg pgid " << pgid << dendl;
3775
3776 PG *pg = _open_lock_pg(createmap, pgid, true);
3777
3778 service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
3779
3780 pg->init(
3781 role,
3782 up,
3783 up_primary,
3784 acting,
3785 acting_primary,
3786 history,
3787 pi,
3788 backfill,
3789 &t);
3790
3791 dout(7) << "_create_lock_pg " << *pg << dendl;
3792 return pg;
3793 }
3794
3795 PG *OSD::_lookup_lock_pg(spg_t pgid)
3796 {
3797 RWLock::RLocker l(pg_map_lock);
3798
3799 auto pg_map_entry = pg_map.find(pgid);
3800 if (pg_map_entry == pg_map.end())
3801 return nullptr;
3802 PG *pg = pg_map_entry->second;
3803 pg->lock();
3804 return pg;
3805 }
3806
3807 PG *OSD::lookup_lock_pg(spg_t pgid)
3808 {
3809 return _lookup_lock_pg(pgid);
3810 }
3811
3812 PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3813 {
3814 assert(pg_map.count(pgid));
3815 PG *pg = pg_map[pgid];
3816 pg->lock();
3817 return pg;
3818 }
3819
3820 void OSD::load_pgs()
3821 {
3822 assert(osd_lock.is_locked());
3823 dout(0) << "load_pgs" << dendl;
3824 {
3825 RWLock::RLocker l(pg_map_lock);
3826 assert(pg_map.empty());
3827 }
3828
3829 vector<coll_t> ls;
3830 int r = store->list_collections(ls);
3831 if (r < 0) {
3832 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
3833 }
3834
3835 bool has_upgraded = false;
3836
3837 for (vector<coll_t>::iterator it = ls.begin();
3838 it != ls.end();
3839 ++it) {
3840 spg_t pgid;
3841 if (it->is_temp(&pgid) ||
3842 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
3843 dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
3844 recursive_remove_collection(cct, store, pgid, *it);
3845 continue;
3846 }
3847
3848 if (!it->is_pg(&pgid)) {
3849 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
3850 continue;
3851 }
3852
3853 if (pgid.preferred() >= 0) {
3854 dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
3855 // FIXME: delete it too, eventually
3856 continue;
3857 }
3858
3859 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
3860 bufferlist bl;
3861 epoch_t map_epoch = 0;
3862 int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
3863 if (r < 0) {
3864 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
3865 << dendl;
3866 continue;
3867 }
3868
3869 PG *pg = NULL;
3870 if (map_epoch > 0) {
3871 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
3872 if (!pgosdmap) {
3873 if (!osdmap->have_pg_pool(pgid.pool())) {
3874 derr << __func__ << ": could not find map for epoch " << map_epoch
3875 << " on pg " << pgid << ", but the pool is not present in the "
3876 << "current map, so this is probably a result of bug 10617. "
3877 << "Skipping the pg for now, you can use ceph-objectstore-tool "
3878 << "to clean it up later." << dendl;
3879 continue;
3880 } else {
3881 derr << __func__ << ": have pgid " << pgid << " at epoch "
3882 << map_epoch << ", but missing map. Crashing."
3883 << dendl;
3884 assert(0 == "Missing map in load_pgs");
3885 }
3886 }
3887 pg = _open_lock_pg(pgosdmap, pgid);
3888 } else {
3889 pg = _open_lock_pg(osdmap, pgid);
3890 }
3891 // there can be no waiters here, so we don't call wake_pg_waiters
3892
3893 pg->ch = store->open_collection(pg->coll);
3894
3895 // read pg state, log
3896 pg->read_state(store, bl);
3897
3898 if (pg->must_upgrade()) {
3899 if (!pg->can_upgrade()) {
3900 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
3901 << " an intermediate (older) release first." << dendl;
3902 assert(0 == "PG too old to upgrade");
3903 }
3904 if (!has_upgraded) {
3905 derr << "PGs are upgrading" << dendl;
3906 has_upgraded = true;
3907 }
3908 dout(10) << "PG " << pg->info.pgid
3909 << " must upgrade..." << dendl;
3910 pg->upgrade(store);
3911 }
3912
3913 service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
3914
3915 // generate state for PG's current mapping
3916 int primary, up_primary;
3917 vector<int> acting, up;
3918 pg->get_osdmap()->pg_to_up_acting_osds(
3919 pgid.pgid, &up, &up_primary, &acting, &primary);
3920 pg->init_primary_up_acting(
3921 up,
3922 acting,
3923 up_primary,
3924 primary);
3925 int role = OSDMap::calc_pg_role(whoami, pg->acting);
3926 if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
3927 pg->set_role(role);
3928 else
3929 pg->set_role(-1);
3930
3931 pg->reg_next_scrub();
3932
3933 PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
3934 pg->handle_loaded(&rctx);
3935
3936 dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
3937 if (pg->pg_log.is_dirty()) {
3938 ObjectStore::Transaction t;
3939 pg->write_if_dirty(t);
3940 store->apply_transaction(pg->osr.get(), std::move(t));
3941 }
3942 pg->unlock();
3943 }
3944 {
3945 RWLock::RLocker l(pg_map_lock);
3946 dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
3947 }
3948
3949 // clean up old infos object?
3950 if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
3951 dout(1) << __func__ << " removing legacy infos object" << dendl;
3952 ObjectStore::Transaction t;
3953 t.remove(coll_t::meta(), OSD::make_infos_oid());
3954 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3955 if (r != 0) {
3956 derr << __func__ << ": apply_transaction returned "
3957 << cpp_strerror(r) << dendl;
3958 ceph_abort();
3959 }
3960 }
3961
3962 build_past_intervals_parallel();
3963 }
3964
3965
3966 /*
3967 * build past_intervals efficiently on old, degraded, and buried
3968 * clusters. this is important for efficiently catching up osds that
3969 * are way behind on maps to the current cluster state.
3970 *
3971 * this is a parallel version of PG::generate_past_intervals().
3972 * follow the same logic, but do all pgs at the same time so that we
3973 * can make a single pass across the osdmap history.
3974 */
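// Rough sketch of the pass below (commentary, not from the source):
//   1) per PG, compute the epoch range [start, end] whose past intervals
//      are missing;
//   2) walk the union of those ranges once, loading each OSDMap a single
//      time and feeding consecutive (last_map, cur_map) pairs to
//      PastIntervals::check_new_interval();
//   3) at the end, mark the infos dirty and commit them in batched
//      transactions.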
3975 void OSD::build_past_intervals_parallel()
3976 {
3977 struct pistate {
3978 epoch_t start, end;
3979 vector<int> old_acting, old_up;
3980 epoch_t same_interval_since;
3981 int primary;
3982 int up_primary;
3983 };
3984 map<PG*,pistate> pis;
3985
3986 // calculate junction of map range
3987 epoch_t end_epoch = superblock.oldest_map;
3988 epoch_t cur_epoch = superblock.newest_map;
3989 {
3990 RWLock::RLocker l(pg_map_lock);
3991 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
3992 i != pg_map.end();
3993 ++i) {
3994 PG *pg = i->second;
3995
3996 auto rpib = pg->get_required_past_interval_bounds(
3997 pg->info,
3998 superblock.oldest_map);
3999 if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
4000 if (pg->info.history.same_interval_since == 0) {
4001 pg->info.history.same_interval_since = rpib.second;
4002 }
4003 continue;
4004 } else {
4005 auto apib = pg->past_intervals.get_bounds();
4006 if (apib.second >= rpib.second &&
4007 apib.first <= rpib.first) {
4008 if (pg->info.history.same_interval_since == 0) {
4009 pg->info.history.same_interval_since = rpib.second;
4010 }
4011 continue;
4012 }
4013 }
4014
4015 dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
4016 << rpib.second << dendl;
4017 pistate& p = pis[pg];
4018 p.start = rpib.first;
4019 p.end = rpib.second;
4020 p.same_interval_since = 0;
4021
4022 if (rpib.first < cur_epoch)
4023 cur_epoch = rpib.first;
4024 if (rpib.second > end_epoch)
4025 end_epoch = rpib.second;
4026 }
4027 }
4028 if (pis.empty()) {
4029 dout(10) << __func__ << " nothing to build" << dendl;
4030 return;
4031 }
4032
4033 dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
4034 assert(cur_epoch <= end_epoch);
4035
4036 OSDMapRef cur_map, last_map;
4037 for ( ; cur_epoch <= end_epoch; cur_epoch++) {
4038 dout(10) << __func__ << " epoch " << cur_epoch << dendl;
4039 last_map = cur_map;
4040 cur_map = get_map(cur_epoch);
4041
4042 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4043 PG *pg = i->first;
4044 pistate& p = i->second;
4045
4046 if (cur_epoch < p.start || cur_epoch > p.end)
4047 continue;
4048
4049 vector<int> acting, up;
4050 int up_primary;
4051 int primary;
4052 pg_t pgid = pg->info.pgid.pgid;
4053 if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
4054 pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
4055 cur_map->pg_to_up_acting_osds(
4056 pgid, &up, &up_primary, &acting, &primary);
4057
4058 if (p.same_interval_since == 0) {
4059 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4060 << " first map, acting " << acting
4061 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
4062 p.same_interval_since = cur_epoch;
4063 p.old_up = up;
4064 p.old_acting = acting;
4065 p.primary = primary;
4066 p.up_primary = up_primary;
4067 continue;
4068 }
4069 assert(last_map);
4070
4071 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
4072 pg->get_is_recoverable_predicate());
4073 std::stringstream debug;
4074 bool new_interval = PastIntervals::check_new_interval(
4075 p.primary,
4076 primary,
4077 p.old_acting, acting,
4078 p.up_primary,
4079 up_primary,
4080 p.old_up, up,
4081 p.same_interval_since,
4082 pg->info.history.last_epoch_clean,
4083 cur_map, last_map,
4084 pgid,
4085 recoverable.get(),
4086 &pg->past_intervals,
4087 &debug);
4088 if (new_interval) {
4089 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4090 << " " << debug.str() << dendl;
4091 p.old_up = up;
4092 p.old_acting = acting;
4093 p.primary = primary;
4094 p.up_primary = up_primary;
4095 p.same_interval_since = cur_epoch;
4096 }
4097 }
4098 }
4099
4100 // Now that past_intervals have been recomputed let's fix the same_interval_since
4101 // if it was cleared by import.
4102 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4103 PG *pg = i->first;
4104 pistate& p = i->second;
4105
4106 if (pg->info.history.same_interval_since == 0) {
4107 assert(p.same_interval_since);
4108 dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
4109 dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
4110 // Fix it
4111 pg->info.history.same_interval_since = p.same_interval_since;
4112 }
4113 }
4114
4115 // write info only at the end. this is necessary because we check
4116 // whether the past_intervals go far enough back or forward in time,
4117 // but we don't check for holes. we could avoid it by discarding
4118 // the previous past_intervals and rebuilding from scratch, or we
4119 // can just do this and commit all our work at the end.
4120 ObjectStore::Transaction t;
4121 int num = 0;
4122 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4123 PG *pg = i->first;
4124 pg->lock();
4125 pg->dirty_big_info = true;
4126 pg->dirty_info = true;
4127 pg->write_if_dirty(t);
4128 pg->unlock();
4129
4130 // don't let the transaction get too big
4131 if (++num >= cct->_conf->osd_target_transaction_size) {
4132 store->apply_transaction(service.meta_osr.get(), std::move(t));
4133 t = ObjectStore::Transaction();
4134 num = 0;
4135 }
4136 }
4137 if (!t.empty())
4138 store->apply_transaction(service.meta_osr.get(), std::move(t));
4139 }
4140
4141 /*
4142 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4143 * hasn't changed since the given epoch and we are the primary.
4144 */
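// Commentary (not from the source): when the PG is absent we may also need
// to "resurrect" one that is still being deleted. _try_resurrect_pg above
// distinguishes three outcomes: RES_NONE (create a fresh PG), RES_SELF
// (revive the half-deleted PG itself), and RES_PARENT (revive an ancestor
// so the child can later be split out of it).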
4145 int OSD::handle_pg_peering_evt(
4146 spg_t pgid,
4147 const pg_history_t& orig_history,
4148 const PastIntervals& pi,
4149 epoch_t epoch,
4150 PG::CephPeeringEvtRef evt)
4151 {
4152 if (service.splitting(pgid)) {
4153 peering_wait_for_split[pgid].push_back(evt);
4154 return -EEXIST;
4155 }
4156
4157 PG *pg = _lookup_lock_pg(pgid);
4158 if (!pg) {
4159 // same primary?
4160 if (!osdmap->have_pg_pool(pgid.pool()))
4161 return -EINVAL;
4162 int up_primary, acting_primary;
4163 vector<int> up, acting;
4164 osdmap->pg_to_up_acting_osds(
4165 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4166
4167 pg_history_t history = orig_history;
4168 bool valid_history = project_pg_history(
4169 pgid, history, epoch, up, up_primary, acting, acting_primary);
4170
4171 if (!valid_history || epoch < history.same_interval_since) {
4172 dout(10) << __func__ << " " << pgid << " acting changed in "
4173 << history.same_interval_since << " (msg from " << epoch << ")"
4174 << dendl;
4175 return -EINVAL;
4176 }
4177
4178 if (service.splitting(pgid)) {
4179 ceph_abort();
4180 }
4181
4182 // do we need to resurrect a deleting pg?
4183 spg_t resurrected;
4184 PGRef old_pg_state;
4185 res_result result = _try_resurrect_pg(
4186 service.get_osdmap(),
4187 pgid,
4188 &resurrected,
4189 &old_pg_state);
4190
4191 PG::RecoveryCtx rctx = create_context();
4192 switch (result) {
4193 case RES_NONE: {
4194 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4195 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4196 store->get_type() != "bluestore") {
4197 clog->warn() << "pg " << pgid
4198 << " is at risk of silent data corruption: "
4199 << "the pool allows ec overwrites but is not stored in "
4200 << "bluestore, so deep scrubbing will not detect bitrot";
4201 }
4202 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4203 PG::_init(*rctx.transaction, pgid, pp);
4204
4205 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4206 if (!pp->is_replicated() && role != pgid.shard)
4207 role = -1;
4208
4209 pg = _create_lock_pg(
4210 get_map(epoch),
4211 pgid, false, false,
4212 role,
4213 up, up_primary,
4214 acting, acting_primary,
4215 history, pi,
4216 *rctx.transaction);
4217 pg->handle_create(&rctx);
4218 pg->write_if_dirty(*rctx.transaction);
4219 dispatch_context(rctx, pg, osdmap);
4220
4221 dout(10) << *pg << " is new" << dendl;
4222
4223 pg->queue_peering_event(evt);
4224 wake_pg_waiters(pg);
4225 pg->unlock();
4226 return 0;
4227 }
4228 case RES_SELF: {
4229 old_pg_state->lock();
4230 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4231 int old_role = old_pg_state->role;
4232 vector<int> old_up = old_pg_state->up;
4233 int old_up_primary = old_pg_state->up_primary.osd;
4234 vector<int> old_acting = old_pg_state->acting;
4235 int old_primary = old_pg_state->primary.osd;
4236 pg_history_t old_history = old_pg_state->info.history;
4237 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4238 old_pg_state->unlock();
4239 pg = _create_lock_pg(
4240 old_osd_map,
4241 resurrected,
4242 false,
4243 true,
4244 old_role,
4245 old_up,
4246 old_up_primary,
4247 old_acting,
4248 old_primary,
4249 old_history,
4250 old_past_intervals,
4251 *rctx.transaction);
4252 pg->handle_create(&rctx);
4253 pg->write_if_dirty(*rctx.transaction);
4254 dispatch_context(rctx, pg, osdmap);
4255
4256 dout(10) << *pg << " is new (resurrected)" << dendl;
4257
4258 pg->queue_peering_event(evt);
4259 wake_pg_waiters(pg);
4260 pg->unlock();
4261 return 0;
4262 }
4263 case RES_PARENT: {
4264 assert(old_pg_state);
4265 old_pg_state->lock();
4266 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4267 int old_role = old_pg_state->role;
4268 vector<int> old_up = old_pg_state->up;
4269 int old_up_primary = old_pg_state->up_primary.osd;
4270 vector<int> old_acting = old_pg_state->acting;
4271 int old_primary = old_pg_state->primary.osd;
4272 pg_history_t old_history = old_pg_state->info.history;
4273 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4274 old_pg_state->unlock();
4275 PG *parent = _create_lock_pg(
4276 old_osd_map,
4277 resurrected,
4278 false,
4279 true,
4280 old_role,
4281 old_up,
4282 old_up_primary,
4283 old_acting,
4284 old_primary,
4285 old_history,
4286 old_past_intervals,
4287 *rctx.transaction
4288 );
4289 parent->handle_create(&rctx);
4290 parent->write_if_dirty(*rctx.transaction);
4291 dispatch_context(rctx, parent, osdmap);
4292
4293 dout(10) << *parent << " is new" << dendl;
4294
4295 assert(service.splitting(pgid));
4296 peering_wait_for_split[pgid].push_back(evt);
4297
4298 //parent->queue_peering_event(evt);
4299 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4300 wake_pg_waiters(parent);
4301 parent->unlock();
4302 return 0;
4303 }
4304 default:
4305 assert(0);
4306 return 0;
4307 }
4308 } else {
4309 // already had it. did the mapping change?
4310 if (epoch < pg->info.history.same_interval_since) {
4311 dout(10) << *pg << " " << __func__ << " acting changed in "
4312 << pg->info.history.same_interval_since
4313 << " (msg from " << epoch << ")" << dendl;
4314 } else {
4315 pg->queue_peering_event(evt);
4316 }
4317 pg->unlock();
4318 return -EEXIST;
4319 }
4320 }
4321
4322
4323 void OSD::build_initial_pg_history(
4324 spg_t pgid,
4325 epoch_t created,
4326 utime_t created_stamp,
4327 pg_history_t *h,
4328 PastIntervals *pi)
4329 {
4330 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4331 h->epoch_created = created;
4332 h->epoch_pool_created = created;
4333 h->same_interval_since = created;
4334 h->same_up_since = created;
4335 h->same_primary_since = created;
4336 h->last_scrub_stamp = created_stamp;
4337 h->last_deep_scrub_stamp = created_stamp;
4338 h->last_clean_scrub_stamp = created_stamp;
4339
4340 OSDMapRef lastmap = service.get_map(created);
4341 int up_primary, acting_primary;
4342 vector<int> up, acting;
4343 lastmap->pg_to_up_acting_osds(
4344 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4345
4346 ostringstream debug;
4347 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4348 OSDMapRef osdmap = service.get_map(e);
4349 int new_up_primary, new_acting_primary;
4350 vector<int> new_up, new_acting;
4351 osdmap->pg_to_up_acting_osds(
4352 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4353
4354 // this is a bit imprecise, but sufficient?
4355 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4356 const pg_pool_t *pi;
4357 bool operator()(const set<pg_shard_t> &have) const {
4358 return have.size() >= pi->min_size;
4359 }
4360 min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4361 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4362
4363 bool new_interval = PastIntervals::check_new_interval(
4364 acting_primary,
4365 new_acting_primary,
4366 acting, new_acting,
4367 up_primary,
4368 new_up_primary,
4369 up, new_up,
4370 h->same_interval_since,
4371 h->last_epoch_clean,
4372 osdmap,
4373 lastmap,
4374 pgid.pgid,
4375 &min_size_predicate,
4376 pi,
4377 &debug);
4378 if (new_interval) {
4379 h->same_interval_since = e;
4380 }
4381 if (up != new_up) {
4382 h->same_up_since = e;
4383 }
4384 if (acting_primary != new_acting_primary) {
4385 h->same_primary_since = e;
4386 }
4387 lastmap = osdmap;
4388 }
4389 dout(20) << __func__ << " " << debug.str() << dendl;
4390 dout(10) << __func__ << " " << *h << " " << *pi
4391 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4392 pi->get_bounds()) << ")"
4393 << dendl;
4394 }
4395
4396 /**
4397 * Fill in the passed history so you know same_interval_since, same_up_since,
4398 * and same_primary_since.
4399 */
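// Commentary (not from the source): the loop walks backwards from the
// current osdmap epoch toward 'from', comparing each older mapping with
// the current one, and can stop early once all three same_*_since floors
// are at or above the epoch under examination.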
4400 bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
4401 const vector<int>& currentup,
4402 int currentupprimary,
4403 const vector<int>& currentacting,
4404 int currentactingprimary)
4405 {
4406 dout(15) << "project_pg_history " << pgid
4407 << " from " << from << " to " << osdmap->get_epoch()
4408 << ", start " << h
4409 << dendl;
4410
4411 epoch_t e;
4412 for (e = osdmap->get_epoch();
4413 e > from;
4414 e--) {
4415 // verify during intermediate epoch (e-1)
4416 OSDMapRef oldmap = service.try_get_map(e-1);
4417 if (!oldmap) {
4418 dout(15) << __func__ << ": found map gap, returning false" << dendl;
4419 return false;
4420 }
4421 assert(oldmap->have_pg_pool(pgid.pool()));
4422
4423 int upprimary, actingprimary;
4424 vector<int> up, acting;
4425 oldmap->pg_to_up_acting_osds(
4426 pgid.pgid,
4427 &up,
4428 &upprimary,
4429 &acting,
4430 &actingprimary);
4431
4432 // acting set change?
4433 if ((actingprimary != currentactingprimary ||
4434 upprimary != currentupprimary ||
4435 acting != currentacting ||
4436 up != currentup) && e > h.same_interval_since) {
4437 dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
4438 << " from " << acting << "/" << up
4439 << " " << actingprimary << "/" << upprimary
4440 << " -> " << currentacting << "/" << currentup
4441 << " " << currentactingprimary << "/" << currentupprimary
4442 << dendl;
4443 h.same_interval_since = e;
4444 }
4445 // split?
4446 if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
4447 osdmap->get_pg_num(pgid.pool()),
4448 0) && e > h.same_interval_since) {
4449 h.same_interval_since = e;
4450 }
4451 // up set change?
4452 if ((up != currentup || upprimary != currentupprimary)
4453 && e > h.same_up_since) {
4454 dout(15) << "project_pg_history " << pgid << " up changed in " << e
4455 << " from " << up << " " << upprimary
4456 << " -> " << currentup << " " << currentupprimary << dendl;
4457 h.same_up_since = e;
4458 }
4459
4460 // primary change?
4461 if (OSDMap::primary_changed(
4462 actingprimary,
4463 acting,
4464 currentactingprimary,
4465 currentacting) &&
4466 e > h.same_primary_since) {
4467 dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
4468 h.same_primary_since = e;
4469 }
4470
4471 if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
4472 break;
4473 }
4474
4475 // base case: these floors should be the pg creation epoch if we didn't
4476 // find any changes.
4477 if (e == h.epoch_created) {
4478 if (!h.same_interval_since)
4479 h.same_interval_since = e;
4480 if (!h.same_up_since)
4481 h.same_up_since = e;
4482 if (!h.same_primary_since)
4483 h.same_primary_since = e;
4484 }
4485
4486 dout(15) << "project_pg_history end " << h << dendl;
4487 return true;
4488 }
4489
4490
4491
4492 void OSD::_add_heartbeat_peer(int p)
4493 {
4494 if (p == whoami)
4495 return;
4496 HeartbeatInfo *hi;
4497
4498 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4499 if (i == heartbeat_peers.end()) {
4500 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4501 if (!cons.first)
4502 return;
4503 hi = &heartbeat_peers[p];
4504 hi->peer = p;
4505 HeartbeatSession *s = new HeartbeatSession(p);
4506 hi->con_back = cons.first.get();
4507 hi->con_back->set_priv(s->get());
4508 if (cons.second) {
4509 hi->con_front = cons.second.get();
4510 hi->con_front->set_priv(s->get());
4511 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4512 << " " << hi->con_back->get_peer_addr()
4513 << " " << hi->con_front->get_peer_addr()
4514 << dendl;
4515 } else {
4516 hi->con_front.reset(NULL);
4517 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4518 << " " << hi->con_back->get_peer_addr()
4519 << dendl;
4520 }
4521 s->put();
4522 } else {
4523 hi = &i->second;
4524 }
4525 hi->epoch = osdmap->get_epoch();
4526 }
4527
4528 void OSD::_remove_heartbeat_peer(int n)
4529 {
4530 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4531 assert(q != heartbeat_peers.end());
4532 dout(20) << " removing heartbeat peer osd." << n
4533 << " " << q->second.con_back->get_peer_addr()
4534 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4535 << dendl;
4536 q->second.con_back->mark_down();
4537 if (q->second.con_front) {
4538 q->second.con_front->mark_down();
4539 }
4540 heartbeat_peers.erase(q);
4541 }
4542
4543 void OSD::need_heartbeat_peer_update()
4544 {
4545 if (is_stopping())
4546 return;
4547 dout(20) << "need_heartbeat_peer_update" << dendl;
4548 heartbeat_set_peers_need_update();
4549 }
4550
4551 void OSD::maybe_update_heartbeat_peers()
4552 {
4553 assert(osd_lock.is_locked());
4554
4555 if (is_waiting_for_healthy()) {
4556 utime_t now = ceph_clock_now();
4557 if (last_heartbeat_resample == utime_t()) {
4558 last_heartbeat_resample = now;
4559 heartbeat_set_peers_need_update();
4560 } else if (!heartbeat_peers_need_update()) {
4561 utime_t dur = now - last_heartbeat_resample;
4562 if (dur > cct->_conf->osd_heartbeat_grace) {
4563 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4564 heartbeat_set_peers_need_update();
4565 last_heartbeat_resample = now;
4566 reset_heartbeat_peers(); // we want *new* peers!
4567 }
4568 }
4569 }
4570
4571 if (!heartbeat_peers_need_update())
4572 return;
4573 heartbeat_clear_peers_need_update();
4574
4575 Mutex::Locker l(heartbeat_lock);
4576
4577 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
4578
4579
4580 // build heartbeat from set
4581 if (is_active()) {
4582 RWLock::RLocker l(pg_map_lock);
4583 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4584 i != pg_map.end();
4585 ++i) {
4586 PG *pg = i->second;
4587 pg->heartbeat_peer_lock.Lock();
4588 dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
4589 for (set<int>::iterator p = pg->heartbeat_peers.begin();
4590 p != pg->heartbeat_peers.end();
4591 ++p)
4592 if (osdmap->is_up(*p))
4593 _add_heartbeat_peer(*p);
4594 for (set<int>::iterator p = pg->probe_targets.begin();
4595 p != pg->probe_targets.end();
4596 ++p)
4597 if (osdmap->is_up(*p))
4598 _add_heartbeat_peer(*p);
4599 pg->heartbeat_peer_lock.Unlock();
4600 }
4601 }
4602
4603 // include next and previous up osds to ensure we have a fully-connected set
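// e.g. (illustrative): on osd.3 in a five-OSD cluster with everything up,
// this adds osd.4 (next) and osd.2 (previous) to the want set.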
4604 set<int> want, extras;
4605 int next = osdmap->get_next_up_osd_after(whoami);
4606 if (next >= 0)
4607 want.insert(next);
4608 int prev = osdmap->get_previous_up_osd_before(whoami);
4609 if (prev >= 0 && prev != next)
4610 want.insert(prev);
4611
4612 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4613 dout(10) << " adding neighbor peer osd." << *p << dendl;
4614 extras.insert(*p);
4615 _add_heartbeat_peer(*p);
4616 }
4617
4618 // remove down peers; enumerate extras
4619 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4620 while (p != heartbeat_peers.end()) {
4621 if (!osdmap->is_up(p->first)) {
4622 int o = p->first;
4623 ++p;
4624 _remove_heartbeat_peer(o);
4625 continue;
4626 }
4627 if (p->second.epoch < osdmap->get_epoch()) {
4628 extras.insert(p->first);
4629 }
4630 ++p;
4631 }
4632
4633 // too few?
4634 int start = osdmap->get_next_up_osd_after(whoami);
4635 for (int n = start; n >= 0; ) {
4636 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4637 break;
4638 if (!extras.count(n) && !want.count(n) && n != whoami) {
4639 dout(10) << " adding random peer osd." << n << dendl;
4640 extras.insert(n);
4641 _add_heartbeat_peer(n);
4642 }
4643 n = osdmap->get_next_up_osd_after(n);
4644 if (n == start)
4645 break; // came full circle; stop
4646 }
4647
4648 // too many?
4649 for (set<int>::iterator p = extras.begin();
4650 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4651 ++p) {
4652 if (want.count(*p))
4653 continue;
4654 _remove_heartbeat_peer(*p);
4655 }
4656
4657 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
4658 }
4659
4660 void OSD::reset_heartbeat_peers()
4661 {
4662 assert(osd_lock.is_locked());
4663 dout(10) << "reset_heartbeat_peers" << dendl;
4664 Mutex::Locker l(heartbeat_lock);
4665 while (!heartbeat_peers.empty()) {
4666 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4667 hi.con_back->mark_down();
4668 if (hi.con_front) {
4669 hi.con_front->mark_down();
4670 }
4671 heartbeat_peers.erase(heartbeat_peers.begin());
4672 }
4673 failure_queue.clear();
4674 }
4675
4676 void OSD::handle_osd_ping(MOSDPing *m)
4677 {
4678 if (superblock.cluster_fsid != m->fsid) {
4679 dout(20) << "handle_osd_ping from " << m->get_source_inst()
4680 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
4681 m->put();
4682 return;
4683 }
4684
4685 int from = m->get_source().num();
4686
4687 heartbeat_lock.Lock();
4688 if (is_stopping()) {
4689 heartbeat_lock.Unlock();
4690 m->put();
4691 return;
4692 }
4693
4694 OSDMapRef curmap = service.get_osdmap();
4695 assert(curmap);
4696
4697 switch (m->op) {
4698
4699 case MOSDPing::PING:
4700 {
4701 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
4702 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
4703 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
4704 if (heartbeat_drop->second == 0) {
4705 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
4706 } else {
4707 --heartbeat_drop->second;
4708 dout(5) << "Dropping heartbeat from " << from
4709 << ", " << heartbeat_drop->second
4710 << " remaining to drop" << dendl;
4711 break;
4712 }
4713 } else if (cct->_conf->osd_debug_drop_ping_probability >
4714 ((((double)(rand()%100))/100.0))) {
4715 heartbeat_drop =
4716 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
4717 cct->_conf->osd_debug_drop_ping_duration)).first;
4718 dout(5) << "Dropping heartbeat from " << from
4719 << ", " << heartbeat_drop->second
4720 << " remaining to drop" << dendl;
4721 break;
4722 }
4723 }
4724
4725 if (!cct->get_heartbeat_map()->is_healthy()) {
4726 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
4727 break;
4728 }
4729
4730 Message *r = new MOSDPing(monc->get_fsid(),
4731 curmap->get_epoch(),
4732 MOSDPing::PING_REPLY, m->stamp,
4733 cct->_conf->osd_heartbeat_min_size);
4734 m->get_connection()->send_message(r);
4735
4736 if (curmap->is_up(from)) {
4737 service.note_peer_epoch(from, m->map_epoch);
4738 if (is_active()) {
4739 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4740 if (con) {
4741 service.share_map_peer(from, con.get());
4742 }
4743 }
4744 } else if (!curmap->exists(from) ||
4745 curmap->get_down_at(from) > m->map_epoch) {
4746 // tell them they have died
4747 Message *r = new MOSDPing(monc->get_fsid(),
4748 curmap->get_epoch(),
4749 MOSDPing::YOU_DIED,
4750 m->stamp,
4751 cct->_conf->osd_heartbeat_min_size);
4752 m->get_connection()->send_message(r);
4753 }
4754 }
4755 break;
4756
4757 case MOSDPing::PING_REPLY:
4758 {
4759 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
4760 if (i != heartbeat_peers.end()) {
4761 if (m->get_connection() == i->second.con_back) {
4762 dout(25) << "handle_osd_ping got reply from osd." << from
4763 << " first_tx " << i->second.first_tx
4764 << " last_tx " << i->second.last_tx
4765 << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
4766 << " last_rx_front " << i->second.last_rx_front
4767 << dendl;
4768 i->second.last_rx_back = m->stamp;
4769 // if there is no front con, set both stamps.
4770 if (i->second.con_front == NULL)
4771 i->second.last_rx_front = m->stamp;
4772 } else if (m->get_connection() == i->second.con_front) {
4773 dout(25) << "handle_osd_ping got reply from osd." << from
4774 << " first_tx " << i->second.first_tx
4775 << " last_tx " << i->second.last_tx
4776 << " last_rx_back " << i->second.last_rx_back
4777 << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
4778 << dendl;
4779 i->second.last_rx_front = m->stamp;
4780 }
4781
4782 utime_t cutoff = ceph_clock_now();
4783 cutoff -= cct->_conf->osd_heartbeat_grace;
4784 if (i->second.is_healthy(cutoff)) {
4785 // Cancel false reports
4786 auto failure_queue_entry = failure_queue.find(from);
4787 if (failure_queue_entry != failure_queue.end()) {
4788 dout(10) << "handle_osd_ping canceling queued "
4789 << "failure report for osd." << from << dendl;
4790 failure_queue.erase(failure_queue_entry);
4791 }
4792
4793 auto failure_pending_entry = failure_pending.find(from);
4794 if (failure_pending_entry != failure_pending.end()) {
4795 dout(10) << "handle_osd_ping canceling in-flight "
4796 << "failure report for osd." << from << dendl;
4797 send_still_alive(curmap->get_epoch(),
4798 failure_pending_entry->second.second);
4799 failure_pending.erase(failure_pending_entry);
4800 }
4801 }
4802 }
4803
4804 if (m->map_epoch &&
4805 curmap->is_up(from)) {
4806 service.note_peer_epoch(from, m->map_epoch);
4807 if (is_active()) {
4808 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4809 if (con) {
4810 service.share_map_peer(from, con.get());
4811 }
4812 }
4813 }
4814 }
4815 break;
4816
4817 case MOSDPing::YOU_DIED:
4818 dout(10) << "handle_osd_ping " << m->get_source_inst()
4819 << " says i am down in " << m->map_epoch << dendl;
4820 osdmap_subscribe(curmap->get_epoch()+1, false);
4821 break;
4822 }
4823
4824 heartbeat_lock.Unlock();
4825 m->put();
4826 }
4827
4828 void OSD::heartbeat_entry()
4829 {
4830 Mutex::Locker l(heartbeat_lock);
4831 if (is_stopping())
4832 return;
4833 while (!heartbeat_stop) {
4834 heartbeat();
4835
4836 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
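// Commentary (not from the source): wait is 0.5s plus a random multiple
// of one tenth of osd_heartbeat_interval; with the (assumed) default 6s
// interval, that is a sleep of 0.5 to 5.9 seconds.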
4837 utime_t w;
4838 w.set_from_double(wait);
4839 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
4840 heartbeat_cond.WaitInterval(heartbeat_lock, w);
4841 if (is_stopping())
4842 return;
4843 dout(30) << "heartbeat_entry woke up" << dendl;
4844 }
4845 }
4846
4847 void OSD::heartbeat_check()
4848 {
4849 assert(heartbeat_lock.is_locked());
4850 utime_t now = ceph_clock_now();
4851
4852 // check for heartbeat replies (move me elsewhere?)
4853 utime_t cutoff = now;
4854 cutoff -= cct->_conf->osd_heartbeat_grace;
4855 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4856 p != heartbeat_peers.end();
4857 ++p) {
4858
4859 if (p->second.first_tx == utime_t()) {
4860 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
4861 << " yet, skipping" << dendl;
4862 continue;
4863 }
4864
4865 dout(25) << "heartbeat_check osd." << p->first
4866 << " first_tx " << p->second.first_tx
4867 << " last_tx " << p->second.last_tx
4868 << " last_rx_back " << p->second.last_rx_back
4869 << " last_rx_front " << p->second.last_rx_front
4870 << dendl;
4871 if (p->second.is_unhealthy(cutoff)) {
4872 if (p->second.last_rx_back == utime_t() ||
4873 p->second.last_rx_front == utime_t()) {
4874 derr << "heartbeat_check: no reply from " << p->second.con_back->get_peer_addr().get_sockaddr() // use con_back: con_front may be null
4875 << " osd." << p->first << " ever on either front or back, first ping sent "
4876 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
4877 // fail
4878 failure_queue[p->first] = p->second.last_tx;
4879 } else {
4880 derr << "heartbeat_check: no reply from " << p->second.con_back->get_peer_addr().get_sockaddr()
4881 << " osd." << p->first << " since back " << p->second.last_rx_back
4882 << " front " << p->second.last_rx_front
4883 << " (cutoff " << cutoff << ")" << dendl;
4884 // fail
4885 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
4886 }
4887 }
4888 }
4889 }
4890
4891 void OSD::heartbeat()
4892 {
4893 dout(30) << "heartbeat" << dendl;
4894
4895 // get CPU load avg
4896 double loadavgs[1];
4897 int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
4898 if (getloadavg(loadavgs, 1) == 1) {
4899 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
4900 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
4901 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
4902 }
4903
4904 dout(30) << "heartbeat checking stats" << dendl;
4905
4906 // refresh stats?
4907 vector<int> hb_peers;
4908 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4909 p != heartbeat_peers.end();
4910 ++p)
4911 hb_peers.push_back(p->first);
4912 service.update_osd_stat(hb_peers);
4913
4914 dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;
4915
4916 utime_t now = ceph_clock_now();
4917
4918 // send heartbeats
4919 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
4920 i != heartbeat_peers.end();
4921 ++i) {
4922 int peer = i->first;
4923 i->second.last_tx = now;
4924 if (i->second.first_tx == utime_t())
4925 i->second.first_tx = now;
4926 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
4927 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
4928 service.get_osdmap()->get_epoch(),
4929 MOSDPing::PING, now,
4930 cct->_conf->osd_heartbeat_min_size));
4931
4932 if (i->second.con_front)
4933 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
4934 service.get_osdmap()->get_epoch(),
4935 MOSDPing::PING, now,
4936 cct->_conf->osd_heartbeat_min_size));
4937 }
4938
4939 logger->set(l_osd_hb_to, heartbeat_peers.size());
4940
4941 // hmm.. am i all alone?
4942 dout(30) << "heartbeat lonely?" << dendl;
4943 if (heartbeat_peers.empty()) {
4944 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
4945 last_mon_heartbeat = now;
4946 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
4947 osdmap_subscribe(osdmap->get_epoch() + 1, false);
4948 }
4949 }
4950
4951 dout(30) << "heartbeat done" << dendl;
4952 }
4953
4954 bool OSD::heartbeat_reset(Connection *con)
4955 {
4956 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
4957 if (s) {
4958 heartbeat_lock.Lock();
4959 if (is_stopping()) {
4960 heartbeat_lock.Unlock();
4961 s->put();
4962 return true;
4963 }
4964 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
4965 if (p != heartbeat_peers.end() &&
4966 (p->second.con_back == con ||
4967 p->second.con_front == con)) {
4968 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
4969 << ", reopening" << dendl;
4970 if (con != p->second.con_back) {
4971 p->second.con_back->mark_down();
4972 }
4973 p->second.con_back.reset(NULL);
4974 if (p->second.con_front && con != p->second.con_front) {
4975 p->second.con_front->mark_down();
4976 }
4977 p->second.con_front.reset(NULL);
4978 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
4979 if (newcon.first) {
4980 p->second.con_back = newcon.first.get();
4981 p->second.con_back->set_priv(s->get());
4982 if (newcon.second) {
4983 p->second.con_front = newcon.second.get();
4984 p->second.con_front->set_priv(s->get());
4985 }
4986 } else {
4987 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
4988 << ", raced with osdmap update, closing out peer" << dendl;
4989 heartbeat_peers.erase(p);
4990 }
4991 } else {
4992 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
4993 }
4994 heartbeat_lock.Unlock();
4995 s->put();
4996 }
4997 return true;
4998 }
4999
5000
5001
5002 // =========================================
5003
5004 void OSD::tick()
5005 {
5006 assert(osd_lock.is_locked());
5007 dout(10) << "tick" << dendl;
5008
5009 if (is_active() || is_waiting_for_healthy()) {
5010 maybe_update_heartbeat_peers();
5011 }
5012
5013 if (is_waiting_for_healthy()) {
5014 start_boot();
5015 } else if (is_preboot() &&
5016 waiting_for_luminous_mons &&
5017 monc->monmap.get_required_features().contains_all(
5018 ceph::features::mon::FEATURE_LUMINOUS)) {
5019 // mon upgrade finished!
5020 start_boot();
5021 }
5022
5023 do_waiters();
5024
5025 tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
5026 }
5027
5028 void OSD::tick_without_osd_lock()
5029 {
5030 assert(tick_timer_lock.is_locked());
5031 dout(10) << "tick_without_osd_lock" << dendl;
5032
5033 logger->set(l_osd_buf, buffer::get_total_alloc());
5034 logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5035 logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
5036 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5037 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5038 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5039
5040 // osd_lock is not being held, which means the OSD state
5041 // might change when doing the monitor report
5042 if (is_active() || is_waiting_for_healthy()) {
5043 heartbeat_lock.Lock();
5044 heartbeat_check();
5045 heartbeat_lock.Unlock();
5046
5047 map_lock.get_read();
5048 Mutex::Locker l(mon_report_lock);
5049
5050 // mon report?
5051 bool reset = false;
5052 bool report = false;
5053 utime_t now = ceph_clock_now();
5054 pg_stat_queue_lock.Lock();
5055 double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
5056 double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
5057 // note: we shouldn't adjust max because it must remain < the
5058 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5059 // value).
5060 double max = cct->_conf->osd_mon_report_interval_max;
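// Illustrative (assumes defaults): if stats acks have lagged enough for
// stats_ack_timeout to double relative to osd_mon_ack_timeout, backoff is
// 2.0 and the effective minimum report interval becomes
// 2 * osd_mon_report_interval_min.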
5061 if (!outstanding_pg_stats.empty() &&
5062 (now - stats_ack_timeout) > last_pg_stats_ack) {
5063 dout(1) << __func__ << " mon hasn't acked PGStats in "
5064 << now - last_pg_stats_ack
5065 << " seconds, reconnecting elsewhere" << dendl;
5066 reset = true;
5067 last_pg_stats_ack = now; // reset clock
5068 last_pg_stats_sent = utime_t();
5069 stats_ack_timeout =
5070 MAX(cct->_conf->osd_mon_ack_timeout,
5071 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
5072 outstanding_pg_stats.clear();
5073 }
5074 if (now - last_pg_stats_sent > max) {
5075 osd_stat_updated = true;
5076 report = true;
5077 } else if (service.need_fullness_update()) {
5078 report = true;
5079 } else if ((int)outstanding_pg_stats.size() >=
5080 cct->_conf->osd_mon_report_max_in_flight) {
5081 dout(20) << __func__ << " have max " << outstanding_pg_stats
5082 << " stats updates in flight" << dendl;
5083 } else {
5084 if (now - last_mon_report > adjusted_min) {
5085 dout(20) << __func__ << " stats backoff " << backoff
5086 << " adjusted_min " << adjusted_min << " - sending report"
5087 << dendl;
5088 osd_stat_updated = true;
5089 report = true;
5090 }
5091 }
5092 pg_stat_queue_lock.Unlock();
5093
5094 if (reset) {
5095 monc->reopen_session();
5096 } else if (report) {
5097 last_mon_report = now;
5098
5099 // do any pending reports
5100 send_full_update();
5101 send_failures();
5102 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5103 send_pg_stats(now);
5104 }
5105 }
5106 map_lock.put_read();
5107 }
5108
5109 if (is_active()) {
5110 if (!scrub_random_backoff()) {
5111 sched_scrub();
5112 }
5113 service.promote_throttle_recalibrate();
5114 bool need_send_beacon = false;
5115 const auto now = ceph::coarse_mono_clock::now();
5116 {
5117 // borrow the min_last_epoch_clean lock to protect last_sent_beacon from changing
5118 Mutex::Locker l{min_last_epoch_clean_lock};
5119 const auto elapsed = now - last_sent_beacon;
5120 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5121 cct->_conf->osd_beacon_report_interval) {
5122 need_send_beacon = true;
5123 }
5124 }
5125 if (need_send_beacon) {
5126 send_beacon(now);
5127 }
5128 }
5129
5130 check_ops_in_flight();
5131 service.kick_recovery_queue();
5132 tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
5133 }
5134
5135 void OSD::check_ops_in_flight()
5136 {
5137 vector<string> warnings;
5138 if (op_tracker.check_ops_in_flight(warnings)) {
5139 for (vector<string>::iterator i = warnings.begin();
5140 i != warnings.end();
5141 ++i) {
5142 clog->warn() << *i;
5143 }
5144 }
5145 }
5146
5147 // Usage:
5148 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5149 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5150 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5151 // getomap <pool> [namespace/]<obj-name>
5152 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5153 // injectmdataerr [namespace/]<obj-name> [shardid]
5154 // injectdataerr [namespace/]<obj-name> [shardid]
5155 //
5156 // set_recovery_delay [utime]
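// Example invocations via the admin socket (illustrative; the object and
// key names are hypothetical):
//   ceph daemon osd.0 setomapval 1 myobj mykey myval
//   ceph daemon osd.0 getomap 1 myobj
//   ceph daemon osd.0 truncobj 1 myobj 4096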
5157 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5158 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5159 {
5160 //Test support
5161 //Support changing the omap on a single osd by using the Admin Socket to
5162 //directly request the osd make a change.
5163 if (command == "setomapval" || command == "rmomapkey" ||
5164 command == "setomapheader" || command == "getomap" ||
5165 command == "truncobj" || command == "injectmdataerr" ||
5166 command == "injectdataerr"
5167 ) {
5168 pg_t rawpg;
5169 int64_t pool;
5170 OSDMapRef curmap = service->get_osdmap();
5171 int r = -1;
5172
5173 string poolstr;
5174
5175 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5176 pool = curmap->lookup_pg_pool_name(poolstr);
5177 // If we can't find it by name, then maybe an id was specified
5178 if (pool < 0 && isdigit(poolstr[0]))
5179 pool = atoll(poolstr.c_str());
5180 if (pool < 0) {
5181 ss << "Invalid pool " << poolstr;
5182 return;
5183 }
5184
5185 string objname, nspace;
5186 cmd_getval(service->cct, cmdmap, "objname", objname);
5187 std::size_t found = objname.find_first_of('/');
5188 if (found != string::npos) {
5189 nspace = objname.substr(0, found);
5190 objname = objname.substr(found+1);
5191 }
5192 object_locator_t oloc(pool, nspace);
5193 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5194
5195 if (r < 0) {
5196 ss << "Invalid namespace/objname";
5197 return;
5198 }
5199
5200 int64_t shardid;
5201 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5202 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5203 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5204 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5205 if (curmap->pg_is_ec(rawpg)) {
5206 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5207 ss << "Must not call on an ec pool, except for injectdataerr or injectmdataerr";
5208 return;
5209 }
5210 }
5211
5212 ObjectStore::Transaction t;
5213
5214 if (command == "setomapval") {
5215 map<string, bufferlist> newattrs;
5216 bufferlist val;
5217 string key, valstr;
5218 cmd_getval(service->cct, cmdmap, "key", key);
5219 cmd_getval(service->cct, cmdmap, "val", valstr);
5220
5221 val.append(valstr);
5222 newattrs[key] = val;
5223 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5224 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5225 if (r < 0)
5226 ss << "error=" << r;
5227 else
5228 ss << "ok";
5229 } else if (command == "rmomapkey") {
5230 string key;
5231 set<string> keys;
5232 cmd_getval(service->cct, cmdmap, "key", key);
5233
5234 keys.insert(key);
5235 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5236 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5237 if (r < 0)
5238 ss << "error=" << r;
5239 else
5240 ss << "ok";
5241 } else if (command == "setomapheader") {
5242 bufferlist newheader;
5243 string headerstr;
5244
5245 cmd_getval(service->cct, cmdmap, "header", headerstr);
5246 newheader.append(headerstr);
5247 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5248 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5249 if (r < 0)
5250 ss << "error=" << r;
5251 else
5252 ss << "ok";
5253 } else if (command == "getomap") {
5254 // Debug: output the entire omap
5255 bufferlist hdrbl;
5256 map<string, bufferlist> keyvals;
5257 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5258 if (r >= 0) {
5259 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5260 for (map<string, bufferlist>::iterator it = keyvals.begin();
5261 it != keyvals.end(); ++it)
5262 ss << " key=" << (*it).first << " val="
5263 << string((*it).second.c_str(), (*it).second.length());
5264 } else {
5265 ss << "error=" << r;
5266 }
5267 } else if (command == "truncobj") {
5268 int64_t trunclen;
5269 cmd_getval(service->cct, cmdmap, "len", trunclen);
5270 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5271 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5272 if (r < 0)
5273 ss << "error=" << r;
5274 else
5275 ss << "ok";
5276 } else if (command == "injectdataerr") {
5277 store->inject_data_error(gobj);
5278 ss << "ok";
5279 } else if (command == "injectmdataerr") {
5280 store->inject_mdata_error(gobj);
5281 ss << "ok";
5282 }
5283 return;
5284 }
5285 if (command == "set_recovery_delay") {
5286 int64_t delay;
5287 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5288 ostringstream oss;
5289 oss << delay;
5290 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5291 oss.str().c_str());
5292 if (r != 0) {
5293 ss << "set_recovery_delay: error setting "
5294 << "osd_recovery_delay_start to '" << delay << "': error "
5295 << r;
5296 return;
5297 }
5298 service->cct->_conf->apply_changes(NULL);
5299 ss << "set_recovery_delay: set osd_recovery_delay_start "
5300 << "to " << service->cct->_conf->osd_recovery_delay_start;
5301 return;
5302 }
5303 if (command == "trigger_scrub") {
5304 spg_t pgid;
5305 OSDMapRef curmap = service->get_osdmap();
5306
5307 string pgidstr;
5308
5309 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5310 if (!pgid.parse(pgidstr.c_str())) {
5311 ss << "Invalid pgid specified";
5312 return;
5313 }
5314
5315 PG *pg = service->osd->_lookup_lock_pg(pgid);
5316 if (pg == nullptr) {
5317 ss << "Can't find pg " << pgid;
5318 return;
5319 }
5320
5321 if (pg->is_primary()) {
5322 pg->unreg_next_scrub();
5323 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5324 double pool_scrub_max_interval = 0;
5325 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5326 double scrub_max_interval = pool_scrub_max_interval > 0 ?
5327 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5328 // Instead of marking must_scrub, force a scheduled scrub
5329 utime_t stamp = ceph_clock_now();
5330 stamp -= scrub_max_interval;
5331 stamp -= 100.0; // push back last scrub more for good measure
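// e.g. with a one-week scrub_max_interval the stamp lands a week and
// 100 seconds in the past, so the scheduler now sees the PG as overdue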
5332 pg->info.history.last_scrub_stamp = stamp;
5333 pg->reg_next_scrub();
5334 ss << "ok";
5335 } else {
5336 ss << "Not primary";
5337 }
5338 pg->unlock();
5339 return;
5340 }
5341 if (command == "injectfull") {
5342 int64_t count;
5343 string type;
5344 OSDService::s_names state;
5345 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5346 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5347 if (type == "none" || count == 0) {
5348 type = "none";
5349 count = 0;
5350 }
5351 state = service->get_full_state(type);
5352 if (state == OSDService::s_names::INVALID) {
5353 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5354 return;
5355 }
5356 service->set_injectfull(state, count);
5357 return;
5358 }
5359 ss << "Internal error - command=" << command;
5360 }
5361
5362 // =========================================
5363 bool remove_dir(
5364 CephContext *cct,
5365 ObjectStore *store, SnapMapper *mapper,
5366 OSDriver *osdriver,
5367 ObjectStore::Sequencer *osr,
5368 coll_t coll, DeletingStateRef dstate,
5369 bool *finished,
5370 ThreadPool::TPHandle &handle)
5371 {
5372 vector<ghobject_t> olist;
5373 int64_t num = 0;
5374 ObjectStore::Transaction t;
5375 ghobject_t next;
5376 handle.reset_tp_timeout();
5377 store->collection_list(
5378 coll,
5379 next,
5380 ghobject_t::get_max(),
5381 store->get_ideal_list_max(),
5382 &olist,
5383 &next);
5384 generic_dout(10) << __func__ << " " << olist << dendl;
5385 // default cont to true; this is safe because the caller (OSD::RemoveWQ::_process())
5386 // will recheck the answer before it really goes on.
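// removals are batched: e.g. with osd_target_transaction_size = 30, a
// 100-object collection is cleared in four transactions (30+30+30+10),
// pausing between batches so other work can proceed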
5387 bool cont = true;
5388 for (vector<ghobject_t>::iterator i = olist.begin();
5389 i != olist.end();
5390 ++i) {
5391 if (i->is_pgmeta())
5392 continue;
5393 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5394 int r = mapper->remove_oid(i->hobj, &_t);
5395 if (r != 0 && r != -ENOENT) {
5396 ceph_abort();
5397 }
5398 t.remove(coll, *i);
5399 if (++num >= cct->_conf->osd_target_transaction_size) {
5400 C_SaferCond waiter;
5401 store->queue_transaction(osr, std::move(t), &waiter);
5402 cont = dstate->pause_clearing();
5403 handle.suspend_tp_timeout();
5404 waiter.wait();
5405 handle.reset_tp_timeout();
5406 if (cont)
5407 cont = dstate->resume_clearing();
5408 if (!cont)
5409 return false;
5410 t = ObjectStore::Transaction();
5411 num = 0;
5412 }
5413 }
5414 if (num) {
5415 C_SaferCond waiter;
5416 store->queue_transaction(osr, std::move(t), &waiter);
5417 cont = dstate->pause_clearing();
5418 handle.suspend_tp_timeout();
5419 waiter.wait();
5420 handle.reset_tp_timeout();
5421 if (cont)
5422 cont = dstate->resume_clearing();
5423 }
5424 // finished when the listing reached the end of the collection (no objects left)
5425 *finished = next.is_max();
5426 return cont;
5427 }
5428
5429 void OSD::RemoveWQ::_process(
5430 pair<PGRef, DeletingStateRef> item,
5431 ThreadPool::TPHandle &handle)
5432 {
5433 FUNCTRACE();
5434 PGRef pg(item.first);
5435 SnapMapper &mapper = pg->snap_mapper;
5436 OSDriver &driver = pg->osdriver;
5437 coll_t coll = coll_t(pg->info.pgid);
5438 pg->osr->flush();
5439 bool finished = false;
5440
5441 if (!item.second->start_or_resume_clearing())
5442 return;
5443
5444 bool cont = remove_dir(
5445 pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
5446 &finished, handle);
5447 if (!cont)
5448 return;
5449 if (!finished) {
5450 if (item.second->pause_clearing())
5451 queue_front(item);
5452 return;
5453 }
5454
5455 if (!item.second->start_deleting())
5456 return;
5457
5458 ObjectStore::Transaction t;
5459 PGLog::clear_info_log(pg->info.pgid, &t);
5460
5461 if (cct->_conf->osd_inject_failure_on_pg_removal) {
5462 generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
5463 _exit(1);
5464 }
5465 t.remove_collection(coll);
5466
5467 // We need the sequencer to stick around until the op is complete
5468 store->queue_transaction(
5469 pg->osr.get(),
5470 std::move(t),
5471 0, // onapplied
5472 0, // oncommit
5473 0, // onreadable sync
5474 new ContainerContext<PGRef>(pg),
5475 TrackedOpRef());
5476
5477 item.second->finish_deleting();
5478 }
5479 // =========================================
5480
5481 void OSD::ms_handle_connect(Connection *con)
5482 {
5483 dout(10) << __func__ << " con " << con << dendl;
5484 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
5485 Mutex::Locker l(osd_lock);
5486 if (is_stopping())
5487 return;
5488 dout(10) << __func__ << " on mon" << dendl;
5489
5490 if (is_preboot()) {
5491 start_boot();
5492 } else if (is_booting()) {
5493 _send_boot(); // resend boot message
5494 } else {
5495 map_lock.get_read();
5496 Mutex::Locker l2(mon_report_lock);
5497
5498 utime_t now = ceph_clock_now();
5499 last_mon_report = now;
5500
5501 // resend everything, it's a new session
5502 send_full_update();
5503 send_alive();
5504 service.requeue_pg_temp();
5505 service.send_pg_temp();
5506 requeue_failures();
5507 send_failures();
5508 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5509 send_pg_stats(now);
5510 }
5511
5512 map_lock.put_read();
5513 if (is_active()) {
5514 send_beacon(ceph::coarse_mono_clock::now());
5515 }
5516 }
5517
5518 // full map requests may happen while active or pre-boot
5519 if (requested_full_first) {
5520 rerequest_full_maps();
5521 }
5522 }
5523 }
5524
5525 void OSD::ms_handle_fast_connect(Connection *con)
5526 {
5527 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5528 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5529 Session *s = static_cast<Session*>(con->get_priv());
5530 if (!s) {
5531 s = new Session(cct);
5532 con->set_priv(s->get());
5533 s->con = con;
5534 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5535 << " addr=" << s->con->get_peer_addr() << dendl;
5536 // we don't connect to clients
5537 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5538 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5539 }
5540 s->put();
5541 }
5542 }
5543
5544 void OSD::ms_handle_fast_accept(Connection *con)
5545 {
5546 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5547 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5548 Session *s = static_cast<Session*>(con->get_priv());
5549 if (!s) {
5550 s = new Session(cct);
5551 con->set_priv(s->get());
5552 s->con = con;
5553 dout(10) << "new session (incoming)" << s << " con=" << con
5554 << " addr=" << con->get_peer_addr()
5555 << " must have raced with connect" << dendl;
5556 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5557 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5558 }
5559 s->put();
5560 }
5561 }
5562
5563 bool OSD::ms_handle_reset(Connection *con)
5564 {
5565 Session *session = static_cast<Session*>(con->get_priv());
5566 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5567 if (!session)
5568 return false;
5569 session->wstate.reset(con);
5570 session->con.reset(NULL); // break con <-> session ref cycle
5571 // note that we break session->con *before* the session_handle_reset
5572 // cleanup below. this avoids a race between us and
5573 // PG::add_backoff, Session::check_backoff, etc.
5574 session_handle_reset(session);
5575 session->put();
5576 return true;
5577 }
5578
5579 bool OSD::ms_handle_refused(Connection *con)
5580 {
5581 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5582 return false;
5583
5584 Session *session = static_cast<Session*>(con->get_priv());
5585 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5586 if (!session)
5587 return false;
5588 int type = con->get_peer_type();
5589 // handle only OSD failures here
5590 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5591 OSDMapRef osdmap = get_osdmap();
5592 if (osdmap) {
5593 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5594 if (id >= 0 && osdmap->is_up(id)) {
5595 // Cheat the mon heartbeat grace logic: we know the peer is not
5596 // going to respawn on its own. +1 so we won't hit any boundary case.
5597 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5598 osdmap->get_inst(id),
5599 cct->_conf->osd_heartbeat_grace + 1,
5600 osdmap->get_epoch(),
5601 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5602 ));
5603 }
5604 }
5605 }
5606 session->put();
5607 return true;
5608 }
5609
5610 struct C_OSD_GetVersion : public Context {
5611 OSD *osd;
5612 uint64_t oldest, newest;
5613 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5614 void finish(int r) override {
5615 if (r >= 0)
5616 osd->_got_mon_epochs(oldest, newest);
5617 }
5618 };
5619
5620 void OSD::start_boot()
5621 {
5622 if (!_is_healthy()) {
5623 // if we are not healthy, do not mark ourselves up (yet)
5624 dout(1) << "not healthy; waiting to boot" << dendl;
5625 if (!is_waiting_for_healthy())
5626 start_waiting_for_healthy();
5627 // send pings sooner rather than later
5628 heartbeat_kick();
5629 return;
5630 }
5631 dout(1) << __func__ << dendl;
5632 set_state(STATE_PREBOOT);
5633 waiting_for_luminous_mons = false;
5634 dout(10) << "start_boot - have maps " << superblock.oldest_map
5635 << ".." << superblock.newest_map << dendl;
5636 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5637 monc->get_version("osdmap", &c->newest, &c->oldest, c);
5638 }
5639
5640 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5641 {
5642 Mutex::Locker l(osd_lock);
5643 if (is_preboot()) {
5644 _preboot(oldest, newest);
5645 }
5646 }
5647
5648 void OSD::_preboot(epoch_t oldest, epoch_t newest)
5649 {
5650 assert(is_preboot());
5651 dout(10) << __func__ << " _preboot mon has osdmaps "
5652 << oldest << ".." << newest << dendl;
5653
5654 // ensure our local fullness awareness is accurate
5655 heartbeat();
5656
5657 // if our map is within recent history, try to add ourselves to the osdmap.
5658 if (osdmap->get_epoch() == 0) {
5659 derr << "waiting for initial osdmap" << dendl;
5660 } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
5661 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5662 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5663 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5664 << dendl;
5665 } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
5666 derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
5667 << dendl;
5668 } else if (!monc->monmap.get_required_features().contains_all(
5669 ceph::features::mon::FEATURE_LUMINOUS)) {
5670 derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5671 << "Luminous or later before Luminous OSDs will boot" << dendl;
5672 waiting_for_luminous_mons = true;
5673 } else if (service.need_fullness_update()) {
5674 derr << "osdmap fullness state needs update" << dendl;
5675 send_full_update();
5676 } else if (osdmap->get_epoch() >= oldest - 1 &&
5677 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
5678 _send_boot();
5679 return;
5680 }
5681
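// e.g. local epoch 100, mon range 90..120: we subscribe from 101 and
// catch up via incrementals; if the mon's oldest map were already newer
// than our next epoch, we would subscribe from oldest - 1, for which
// the mon must send a full map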
5682 // get all the latest maps
5683 if (osdmap->get_epoch() + 1 >= oldest)
5684 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5685 else
5686 osdmap_subscribe(oldest - 1, true);
5687 }
5688
5689 void OSD::send_full_update()
5690 {
5691 if (!service.need_fullness_update())
5692 return;
5693 unsigned state = 0;
5694 if (service.is_full()) {
5695 state = CEPH_OSD_FULL;
5696 } else if (service.is_backfillfull()) {
5697 state = CEPH_OSD_BACKFILLFULL;
5698 } else if (service.is_nearfull()) {
5699 state = CEPH_OSD_NEARFULL;
5700 }
5701 set<string> s;
5702 OSDMap::calc_state_set(state, s);
5703 dout(10) << __func__ << " want state " << s << dendl;
5704 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
5705 }
5706
5707 void OSD::start_waiting_for_healthy()
5708 {
5709 dout(1) << "start_waiting_for_healthy" << dendl;
5710 set_state(STATE_WAITING_FOR_HEALTHY);
5711 last_heartbeat_resample = utime_t();
5712 }
5713
5714 bool OSD::_is_healthy()
5715 {
5716 if (!cct->get_heartbeat_map()->is_healthy()) {
5717 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
5718 return false;
5719 }
5720
5721 if (is_waiting_for_healthy()) {
5722 Mutex::Locker l(heartbeat_lock);
5723 utime_t cutoff = ceph_clock_now();
5724 cutoff -= cct->_conf->osd_heartbeat_grace;
5725 int num = 0, up = 0;
5726 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5727 p != heartbeat_peers.end();
5728 ++p) {
5729 if (p->second.is_healthy(cutoff))
5730 ++up;
5731 ++num;
5732 }
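// e.g. with osd_heartbeat_min_healthy_ratio = 0.33 and 9 peers, at
// least 3 must have responded within the grace period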
5733 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
5734 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
5735 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
5736 return false;
5737 }
5738 }
5739
5740 return true;
5741 }
5742
5743 void OSD::_send_boot()
5744 {
5745 dout(10) << "_send_boot" << dendl;
5746 entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
5747 Connection *local_connection = cluster_messenger->get_loopback_connection().get();
5748 if (cluster_addr.is_blank_ip()) {
5749 int port = cluster_addr.get_port();
5750 cluster_addr = client_messenger->get_myaddr();
5751 cluster_addr.set_port(port);
5752 cluster_messenger->set_addr_unknowns(cluster_addr);
5753 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
5754 } else {
5755 Session *s = static_cast<Session*>(local_connection->get_priv());
5756 if (s)
5757 s->put();
5758 else
5759 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
5760 }
5761
5762 entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
5763 local_connection = hb_back_server_messenger->get_loopback_connection().get();
5764 if (hb_back_addr.is_blank_ip()) {
5765 int port = hb_back_addr.get_port();
5766 hb_back_addr = cluster_addr;
5767 hb_back_addr.set_port(port);
5768 hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
5769 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
5770 } else {
5771 Session *s = static_cast<Session*>(local_connection->get_priv());
5772 if (s)
5773 s->put();
5774 else
5775 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5776 }
5777
5778 entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
5779 local_connection = hb_front_server_messenger->get_loopback_connection().get();
5780 if (hb_front_addr.is_blank_ip()) {
5781 int port = hb_front_addr.get_port();
5782 hb_front_addr = client_messenger->get_myaddr();
5783 hb_front_addr.set_port(port);
5784 hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
5785 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
5786 } else {
5787 Session *s = static_cast<Session*>(local_connection->get_priv());
5788 if (s)
5789 s->put();
5790 else
5791 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5792 }
5793
5794 MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
5795 hb_back_addr, hb_front_addr, cluster_addr,
5796 CEPH_FEATURES_ALL);
5797 dout(10) << " client_addr " << client_messenger->get_myaddr()
5798 << ", cluster_addr " << cluster_addr
5799 << ", hb_back_addr " << hb_back_addr
5800 << ", hb_front_addr " << hb_front_addr
5801 << dendl;
5802 _collect_metadata(&mboot->metadata);
5803 monc->send_mon_message(mboot);
5804 set_state(STATE_BOOTING);
5805 }
5806
5807 void OSD::_collect_metadata(map<string,string> *pm)
5808 {
5809 // config info
5810 (*pm)["osd_data"] = dev_path;
5811 (*pm)["osd_journal"] = journal_path;
5812 (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
5813 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
5814 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
5815 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());
5816
5817 // backend
5818 (*pm)["osd_objectstore"] = store->get_type();
5819 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
5820 store->collect_metadata(pm);
5821
5822 collect_sys_info(pm, cct);
5823
5824 dout(10) << __func__ << " " << *pm << dendl;
5825 }
5826
5827 void OSD::queue_want_up_thru(epoch_t want)
5828 {
5829 map_lock.get_read();
5830 epoch_t cur = osdmap->get_up_thru(whoami);
5831 Mutex::Locker l(mon_report_lock);
5832 if (want > up_thru_wanted) {
5833 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
5834 << ", currently " << cur
5835 << dendl;
5836 up_thru_wanted = want;
5837 send_alive();
5838 } else {
5839 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
5840 << ", currently " << cur
5841 << dendl;
5842 }
5843 map_lock.put_read();
5844 }
5845
5846 void OSD::send_alive()
5847 {
5848 assert(mon_report_lock.is_locked());
5849 if (!osdmap->exists(whoami))
5850 return;
5851 epoch_t up_thru = osdmap->get_up_thru(whoami);
5852 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
5853 if (up_thru_wanted > up_thru) {
5854 dout(10) << "send_alive want " << up_thru_wanted << dendl;
5855 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
5856 }
5857 }
5858
5859 void OSD::request_full_map(epoch_t first, epoch_t last)
5860 {
5861 dout(10) << __func__ << " " << first << ".." << last
5862 << ", previously requested "
5863 << requested_full_first << ".." << requested_full_last << dendl;
5864 assert(osd_lock.is_locked());
5865 assert(first > 0 && last > 0);
5866 assert(first <= last);
5867 assert(first >= requested_full_first); // we shouldn't ever ask for older maps
5868 if (requested_full_first == 0) {
5869 // first request
5870 requested_full_first = first;
5871 requested_full_last = last;
5872 } else if (last <= requested_full_last) {
5873 // dup
5874 return;
5875 } else {
5876 // additional request
5877 first = requested_full_last + 1;
5878 requested_full_last = last;
5879 }
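// e.g. having requested 10..20 earlier, a new request for 15..25 only
// asks the mon for 21..25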
5880 MMonGetOSDMap *req = new MMonGetOSDMap;
5881 req->request_full(first, last);
5882 monc->send_mon_message(req);
5883 }
5884
5885 void OSD::got_full_map(epoch_t e)
5886 {
5887 assert(requested_full_first <= requested_full_last);
5888 assert(osd_lock.is_locked());
5889 if (requested_full_first == 0) {
5890 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
5891 return;
5892 }
5893 if (e < requested_full_first) {
5894 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5895 << ".." << requested_full_last
5896 << ", ignoring" << dendl;
5897 return;
5898 }
5899 if (e >= requested_full_last) {
5900 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5901 << ".." << requested_full_last << ", resetting" << dendl;
5902 requested_full_first = requested_full_last = 0;
5903 return;
5904 }
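// e.g. with 10..20 outstanding, receiving epoch 14 narrows the range
// to 15..20; receiving 20 or beyond clears it entirely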
5905
5906 requested_full_first = e + 1;
5907
5908 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5909 << ".." << requested_full_last
5910 << ", still need more" << dendl;
5911 }
5912
5913 void OSD::requeue_failures()
5914 {
5915 Mutex::Locker l(heartbeat_lock);
5916 unsigned old_queue = failure_queue.size();
5917 unsigned old_pending = failure_pending.size();
5918 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
5919 failure_pending.begin();
5920 p != failure_pending.end(); ) {
5921 failure_queue[p->first] = p->second.first;
5922 failure_pending.erase(p++);
5923 }
5924 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
5925 << failure_queue.size() << dendl;
5926 }
5927
5928 void OSD::send_failures()
5929 {
5930 assert(map_lock.is_locked());
5931 assert(mon_report_lock.is_locked());
5932 Mutex::Locker l(heartbeat_lock);
5933 utime_t now = ceph_clock_now();
5934 while (!failure_queue.empty()) {
5935 int osd = failure_queue.begin()->first;
5936 if (!failure_pending.count(osd)) {
5937 entity_inst_t i = osdmap->get_inst(osd);
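// failed_for: whole seconds since this failure was first queued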
5938 int failed_for = (int)(double)(now - failure_queue.begin()->second);
5939 monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
5940 osdmap->get_epoch()));
5941 failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
5942 }
5943 failure_queue.erase(osd);
5944 }
5945 }
5946
5947 void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
5948 {
5949 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
5950 monc->send_mon_message(m);
5951 }
5952
5953 void OSD::send_pg_stats(const utime_t &now)
5954 {
5955 assert(map_lock.is_locked());
5956 assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
5957 dout(20) << "send_pg_stats" << dendl;
5958
5959 osd_stat_t cur_stat = service.get_osd_stat();
5960
5961 cur_stat.os_perf_stat = store->get_cur_stats();
5962
5963 pg_stat_queue_lock.Lock();
5964
5965 if (osd_stat_updated || !pg_stat_queue.empty()) {
5966 last_pg_stats_sent = now;
5967 osd_stat_updated = false;
5968
5969 dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;
5970
5971 utime_t had_for(now);
5972 had_for -= had_map_since;
5973
5974 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
5975
5976 uint64_t tid = ++pg_stat_tid;
5977 m->set_tid(tid);
5978 m->osd_stat = cur_stat;
5979
5980 xlist<PG*>::iterator p = pg_stat_queue.begin();
5981 while (!p.end()) {
5982 PG *pg = *p;
5983 ++p;
5984 if (!pg->is_primary()) { // we hold map_lock; role is stable.
5985 pg->stat_queue_item.remove_myself();
5986 pg->put("pg_stat_queue");
5987 continue;
5988 }
5989 pg->pg_stats_publish_lock.Lock();
5990 if (pg->pg_stats_publish_valid) {
5991 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
5992 dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
5993 << pg->pg_stats_publish.reported_seq << dendl;
5994 } else {
5995 dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
5996 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
5997 }
5998 pg->pg_stats_publish_lock.Unlock();
5999 }
6000
6001 if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
6002 last_pg_stats_ack = ceph_clock_now();
6003 }
6004 outstanding_pg_stats.insert(tid);
6005 dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;
6006
6007 monc->send_mon_message(m);
6008 }
6009
6010 pg_stat_queue_lock.Unlock();
6011 }
6012
6013 void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
6014 {
6015 dout(10) << "handle_pg_stats_ack " << dendl;
6016
6017 if (!require_mon_peer(ack)) {
6018 ack->put();
6019 return;
6020 }
6021
6022 // NOTE: we may get replies from a previous mon even while
6023 // outstanding_pg_stats is empty if reconnecting races with replies
6024 // in flight.
6025
6026 pg_stat_queue_lock.Lock();
6027
6028 last_pg_stats_ack = ceph_clock_now();
6029
6030 // decay timeout slowly (analogous to TCP)
6031 stats_ack_timeout =
6032 MAX(cct->_conf->osd_mon_ack_timeout,
6033 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
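// e.g. with osd_mon_ack_timeout = 30 and a decay factor of 0.9, a
// 100 s timeout shrinks to 90 s on this ack and never drops below 30 s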
6034 dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
6035
6036 if (ack->get_tid() > pg_stat_tid_flushed) {
6037 pg_stat_tid_flushed = ack->get_tid();
6038 pg_stat_queue_cond.Signal();
6039 }
6040
6041 xlist<PG*>::iterator p = pg_stat_queue.begin();
6042 while (!p.end()) {
6043 PG *pg = *p;
6044 PGRef _pg(pg);
6045 ++p;
6046
6047 auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
6048 if (acked != ack->pg_stat.end()) {
6049 pg->pg_stats_publish_lock.Lock();
6050 if (acked->second.first == pg->pg_stats_publish.reported_seq &&
6051 acked->second.second == pg->pg_stats_publish.reported_epoch) {
6052 dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6053 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6054 pg->stat_queue_item.remove_myself();
6055 pg->put("pg_stat_queue");
6056 } else {
6057 dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6058 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
6059 << acked->second << dendl;
6060 }
6061 pg->pg_stats_publish_lock.Unlock();
6062 } else {
6063 dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6064 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6065 }
6066 }
6067
6068 outstanding_pg_stats.erase(ack->get_tid());
6069 dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;
6070
6071 pg_stat_queue_lock.Unlock();
6072
6073 ack->put();
6074 }
6075
6076 void OSD::flush_pg_stats()
6077 {
6078 dout(10) << "flush_pg_stats" << dendl;
6079 osd_lock.Unlock();
6080 utime_t now = ceph_clock_now();
6081 map_lock.get_read();
6082 mon_report_lock.Lock();
6083 send_pg_stats(now);
6084 mon_report_lock.Unlock();
6085 map_lock.put_read();
6086
6087
6088 pg_stat_queue_lock.Lock();
6089 uint64_t tid = pg_stat_tid;
6090 dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
6091 while (tid > pg_stat_tid_flushed)
6092 pg_stat_queue_cond.Wait(pg_stat_queue_lock);
6093 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
6094 pg_stat_queue_lock.Unlock();
6095
6096 osd_lock.Lock();
6097 }
6098
6099 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6100 {
6101 const auto& monmap = monc->monmap;
6102 // we may be called just after connecting, before the monmap is
6103 // initialized; only send once the monmap is known and all mons are luminous.
6104 if (monmap.epoch > 0 &&
6105 monmap.get_required_features().contains_all(
6106 ceph::features::mon::FEATURE_LUMINOUS)) {
6107 dout(20) << __func__ << " sending" << dendl;
6108 MOSDBeacon* beacon = nullptr;
6109 {
6110 Mutex::Locker l{min_last_epoch_clean_lock};
6111 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6112 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
6113 last_sent_beacon = now;
6114 }
6115 monc->send_mon_message(beacon);
6116 } else {
6117 dout(20) << __func__ << " not sending" << dendl;
6118 }
6119 }
6120
6121 void OSD::handle_command(MMonCommand *m)
6122 {
6123 if (!require_mon_peer(m)) {
6124 m->put();
6125 return;
6126 }
6127
6128 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6129 command_wq.queue(c);
6130 m->put();
6131 }
6132
6133 void OSD::handle_command(MCommand *m)
6134 {
6135 ConnectionRef con = m->get_connection();
6136 Session *session = static_cast<Session *>(con->get_priv());
6137 if (!session) {
6138 con->send_message(new MCommandReply(m, -EPERM));
6139 m->put();
6140 return;
6141 }
6142
6143 OSDCap& caps = session->caps;
6144 session->put();
6145
6146 if (!caps.allow_all() || m->get_source().is_mon()) {
6147 con->send_message(new MCommandReply(m, -EPERM));
6148 m->put();
6149 return;
6150 }
6151
6152 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6153 command_wq.queue(c);
6154
6155 m->put();
6156 }
6157
6158 struct OSDCommand {
6159 string cmdstring;
6160 string helpstring;
6161 string module;
6162 string perm;
6163 string availability;
6164 } osd_commands[] = {
6165
6166 #define COMMAND(parsesig, helptext, module, perm, availability) \
6167 {parsesig, helptext, module, perm, availability},
6168
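// e.g. COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
// expands to the initializer
// {"version", "report version of OSD", "osd", "r", "cli,rest"},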
6169 // yes, these are really pg commands, but there's a limit to how
6170 // much work it's worth. The OSD returns all of them. Make this
6171 // form (pg <pgid> <cmd>) valid only for the cli.
6172 // REST uses "tell <pgid> <cmd>"
6173
6174 COMMAND("pg " \
6175 "name=pgid,type=CephPgid " \
6176 "name=cmd,type=CephChoices,strings=query", \
6177 "show details of a specific pg", "osd", "r", "cli")
6178 COMMAND("pg " \
6179 "name=pgid,type=CephPgid " \
6180 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6181 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6182 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6183 "osd", "rw", "cli")
6184 COMMAND("pg " \
6185 "name=pgid,type=CephPgid " \
6186 "name=cmd,type=CephChoices,strings=list_missing " \
6187 "name=offset,type=CephString,req=false",
6188 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6189 "osd", "r", "cli")
6190
6191 // new form: tell <pgid> <cmd> for both cli and rest
6192
6193 COMMAND("query",
6194 "show details of a specific pg", "osd", "r", "cli,rest")
6195 COMMAND("mark_unfound_lost " \
6196 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6197 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6198 "osd", "rw", "cli,rest")
6199 COMMAND("list_missing " \
6200 "name=offset,type=CephString,req=false",
6201 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6202 "osd", "r", "cli,rest")
6203 COMMAND("perf histogram dump "
6204 "name=logger,type=CephString,req=false "
6205 "name=counter,type=CephString,req=false",
6206 "Get histogram data",
6207 "osd", "r", "cli,rest")
6208
6209 // tell <osd.n> commands. Validation of osd.n must be special-cased in client
6210 COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
6211 COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
6212 COMMAND("injectargs " \
6213 "name=injected_args,type=CephString,n=N",
6214 "inject configuration arguments into running OSD",
6215 "osd", "rw", "cli,rest")
6216 COMMAND("cluster_log " \
6217 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6218 "name=message,type=CephString,n=N",
6219 "log a message to the cluster log",
6220 "osd", "rw", "cli,rest")
6221 COMMAND("bench " \
6222 "name=count,type=CephInt,req=false " \
6223 "name=size,type=CephInt,req=false " \
6224 "name=object_size,type=CephInt,req=false " \
6225 "name=object_num,type=CephInt,req=false ", \
6226 "OSD benchmark: write <count> <size>-byte objects, " \
6227 "(default 1G size 4MB). Results in log.",
6228 "osd", "rw", "cli,rest")
6229 COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
6230 COMMAND("heap " \
6231 "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
6232 "show heap usage info (available only if compiled with tcmalloc)", \
6233 "osd", "rw", "cli,rest")
6234 COMMAND("debug dump_missing " \
6235 "name=filename,type=CephFilepath",
6236 "dump missing objects to a named file", "osd", "r", "cli,rest")
6237 COMMAND("debug kick_recovery_wq " \
6238 "name=delay,type=CephInt,range=0",
6239 "set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
6240 COMMAND("cpu_profiler " \
6241 "name=arg,type=CephChoices,strings=status|flush",
6242 "run cpu profiling on daemon", "osd", "rw", "cli,rest")
6243 COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6244 "osd", "r", "cli,rest")
6245 COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6246 "osd", "rw", "cli,rest")
6247 COMMAND("compact",
6248 "compact object store's omap. "
6249 "WARNING: Compaction probably slows your requests",
6250 "osd", "rw", "cli,rest")
6251 };
6252
6253 void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6254 {
6255 int r = 0;
6256 stringstream ss, ds;
6257 string rs;
6258 bufferlist odata;
6259
6260 dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6261
6262 map<string, cmd_vartype> cmdmap;
6263 string prefix;
6264 string format;
6265 string pgidstr;
6266 boost::scoped_ptr<Formatter> f;
6267
6268 if (cmd.empty()) {
6269 ss << "no command given";
6270 goto out;
6271 }
6272
6273 if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6274 r = -EINVAL;
6275 goto out;
6276 }
6277
6278 cmd_getval(cct, cmdmap, "prefix", prefix);
6279
6280 if (prefix == "get_command_descriptions") {
6281 int cmdnum = 0;
6282 JSONFormatter *f = new JSONFormatter();
6283 f->open_object_section("command_descriptions");
6284 for (OSDCommand *cp = osd_commands;
6285 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {
6286
6287 ostringstream secname;
6288 secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
6289 dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
6290 cp->module, cp->perm, cp->availability, 0);
6291 cmdnum++;
6292 }
6293 f->close_section(); // command_descriptions
6294
6295 f->flush(ds);
6296 delete f;
6297 goto out;
6298 }
6299
6300 cmd_getval(cct, cmdmap, "format", format);
6301 f.reset(Formatter::create(format));
6302
6303 if (prefix == "version") {
6304 if (f) {
6305 f->open_object_section("version");
6306 f->dump_string("version", pretty_version_to_str());
6307 f->close_section();
6308 f->flush(ds);
6309 } else {
6310 ds << pretty_version_to_str();
6311 }
6312 goto out;
6313 }
6314 else if (prefix == "injectargs") {
6315 vector<string> argsvec;
6316 cmd_getval(cct, cmdmap, "injected_args", argsvec);
6317
6318 if (argsvec.empty()) {
6319 r = -EINVAL;
6320 ss << "ignoring empty injectargs";
6321 goto out;
6322 }
6323 string args = argsvec.front();
6324 for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
6325 args += " " + *a;
6326 osd_lock.Unlock();
6327 r = cct->_conf->injectargs(args, &ss);
6328 osd_lock.Lock();
6329 }
6330 else if (prefix == "cluster_log") {
6331 vector<string> msg;
6332 cmd_getval(cct, cmdmap, "message", msg);
6333 if (msg.empty()) {
6334 r = -EINVAL;
6335 ss << "ignoring empty log message";
6336 goto out;
6337 }
6338 string message = msg.front();
6339 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
6340 message += " " + *a;
6341 string lvl;
6342 cmd_getval(cct, cmdmap, "level", lvl);
6343 clog_type level = string_to_clog_type(lvl);
6344 if (level < 0) {
6345 r = -EINVAL;
6346 ss << "unknown level '" << lvl << "'";
6347 goto out;
6348 }
6349 clog->do_log(level, message);
6350 }
6351
6352 // either 'pg <pgid> <command>' or
6353 // 'tell <pgid>' (which comes in without any of that prefix)?
6354
6355 else if (prefix == "pg" ||
6356 prefix == "query" ||
6357 prefix == "mark_unfound_lost" ||
6358 prefix == "list_missing"
6359 ) {
6360 pg_t pgid;
6361
6362 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
6363 ss << "no pgid specified";
6364 r = -EINVAL;
6365 } else if (!pgid.parse(pgidstr.c_str())) {
6366 ss << "couldn't parse pgid '" << pgidstr << "'";
6367 r = -EINVAL;
6368 } else {
6369 spg_t pcand;
6370 PG *pg = nullptr;
6371 if (osdmap->get_primary_shard(pgid, &pcand) &&
6372 (pg = _lookup_lock_pg(pcand))) {
6373 if (pg->is_primary()) {
6374 // simulate pg <pgid> cmd= for pg->do-command
6375 if (prefix != "pg")
6376 cmd_putval(cct, cmdmap, "cmd", prefix);
6377 r = pg->do_command(cmdmap, ss, data, odata, con, tid);
6378 if (r == -EAGAIN) {
6379 pg->unlock();
6380 // don't reply, pg will do so async
6381 return;
6382 }
6383 } else {
6384 ss << "not primary for pgid " << pgid;
6385
6386 // send them the latest diff to ensure they realize the mapping
6387 // has changed.
6388 service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);
6389
6390 // do not reply; they will get newer maps and realize they
6391 // need to resend.
6392 pg->unlock();
6393 return;
6394 }
6395 pg->unlock();
6396 } else {
6397 ss << "i don't have pgid " << pgid;
6398 r = -ENOENT;
6399 }
6400 }
6401 }
6402
6403 else if (prefix == "bench") {
6404 int64_t count;
6405 int64_t bsize;
6406 int64_t osize, onum;
6407 // default count 1G, size 4MB
6408 cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
6409 cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
6410 cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
6411 cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
6412
6413 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
6414 ObjectStore::Sequencer>("bench"));
6415
6416 uint32_t duration = cct->_conf->osd_bench_duration;
6417
6418 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
6419 // limit the block size, because the checks below rely on it having
6420 // a sane value; allowing an arbitrary block size could still let
6421 // things go sideways.
6422 ss << "block 'size' values are capped at "
6423 << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
6424 << " a higher value, please adjust 'osd_bench_max_block_size'";
6425 r = -EINVAL;
6426 goto out;
6427 } else if (bsize < (int64_t) (1 << 20)) {
6428 // entering the realm of small block sizes.
6429 // limit the count to a sane value, assuming a configurable amount of
6430 // IOPS and duration, so that the OSD doesn't get hung up on this,
6431 // preventing timeouts from going off
6432 int64_t max_count =
6433 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
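// e.g. bsize = 4 KB, duration = 30 s, 100 IOPS allowed:
// max_count = 4096 * 30 * 100 ~= 12 MB of writes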
6434 if (count > max_count) {
6435 ss << "'count' values greater than " << max_count
6436 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6437 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
6438 << " for " << duration << " seconds,"
6439 << " can cause ill effects on osd. "
6440 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6441 << " value if you wish to use a higher 'count'.";
6442 r = -EINVAL;
6443 goto out;
6444 }
6445 } else {
6446 // 1MB block sizes are big enough so that we get more stuff done.
6447 // However, to keep the OSD from getting hung up on this and having
6448 // timers triggered, we are going to limit the count assuming
6449 // a configurable throughput and duration.
6450 // NOTE: max_count is the total amount of bytes that we believe we
6451 // will be able to write during 'duration' for the given
6452 // throughput. The block size hardly impacts this unless it's
6453 // way too big. Given we already check how big the block size
6454 // is, it's safe to assume everything will check out.
6455 int64_t max_count =
6456 cct->_conf->osd_bench_large_size_max_throughput * duration;
6457 if (count > max_count) {
6458 ss << "'count' values greater than " << max_count
6459 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6460 << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
6461 << " for " << duration << " seconds,"
6462 << " can cause ill effects on osd. "
6463 << " Please adjust 'osd_bench_large_size_max_throughput'"
6464 << " with a higher value if you wish to use a higher 'count'.";
6465 r = -EINVAL;
6466 goto out;
6467 }
6468 }
6469
6470 if (osize && bsize > osize)
6471 bsize = osize;
6472
6473 dout(1) << " bench count " << count
6474 << " bsize " << prettybyte_t(bsize) << dendl;
6475
6476 ObjectStore::Transaction cleanupt;
6477
6478 if (osize && onum) {
6479 bufferlist bl;
6480 bufferptr bp(osize);
6481 bp.zero();
6482 bl.push_back(std::move(bp));
6483 bl.rebuild_page_aligned();
6484 for (int i=0; i<onum; ++i) {
6485 char nm[30];
6486 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
6487 object_t oid(nm);
6488 hobject_t soid(sobject_t(oid, 0));
6489 ObjectStore::Transaction t;
6490 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
6491 store->queue_transaction(osr.get(), std::move(t), NULL);
6492 cleanupt.remove(coll_t(), ghobject_t(soid));
6493 }
6494 }
6495
6496 bufferlist bl;
6497 bufferptr bp(bsize);
6498 bp.zero();
6499 bl.push_back(std::move(bp));
6500 bl.rebuild_page_aligned();
6501
6502 {
6503 C_SaferCond waiter;
6504 if (!osr->flush_commit(&waiter)) {
6505 waiter.wait();
6506 }
6507 }
6508
6509 utime_t start = ceph_clock_now();
6510 for (int64_t pos = 0; pos < count; pos += bsize) {
6511 char nm[30];
6512 unsigned offset = 0;
6513 if (onum && osize) {
6514 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
6515 offset = rand() % (osize / bsize) * bsize;
6516 } else {
6517 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
6518 }
6519 object_t oid(nm);
6520 hobject_t soid(sobject_t(oid, 0));
6521 ObjectStore::Transaction t;
6522 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
6523 store->queue_transaction(osr.get(), std::move(t), NULL);
6524 if (!onum || !osize)
6525 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
6526 }
6527
6528 {
6529 C_SaferCond waiter;
6530 if (!osr->flush_commit(&waiter)) {
6531 waiter.wait();
6532 }
6533 }
6534 utime_t end = ceph_clock_now();
6535
6536 // clean up
6537 store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
6538 {
6539 C_SaferCond waiter;
6540 if (!osr->flush_commit(&waiter)) {
6541 waiter.wait();
6542 }
6543 }
6544
6545 uint64_t rate = (double)count / (end - start);
6546 if (f) {
6547 f->open_object_section("osd_bench_results");
6548 f->dump_int("bytes_written", count);
6549 f->dump_int("blocksize", bsize);
6550 f->dump_unsigned("bytes_per_sec", rate);
6551 f->close_section();
6552 f->flush(ss);
6553 } else {
6554 ss << "bench: wrote " << prettybyte_t(count)
6555 << " in blocks of " << prettybyte_t(bsize) << " in "
6556 << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
6557 }
6558 }
6559
6560 else if (prefix == "flush_pg_stats") {
6561 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6562 mgrc.send_pgstats();
6563 ds << service.get_osd_stat_seq() << "\n";
6564 } else {
6565 flush_pg_stats();
6566 }
6567 }
6568
6569 else if (prefix == "heap") {
6570 r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
6571 }
6572
6573 else if (prefix == "debug dump_missing") {
6574 string file_name;
6575 cmd_getval(cct, cmdmap, "filename", file_name);
6576 std::ofstream fout(file_name.c_str());
6577 if (!fout.is_open()) {
6578 ss << "failed to open file '" << file_name << "'";
6579 r = -EINVAL;
6580 goto out;
6581 }
6582
6583 fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
6584 RWLock::RLocker l(pg_map_lock);
6585 for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
6586 pg_map_e != pg_map.end(); ++pg_map_e) {
6587 PG *pg = pg_map_e->second;
6588 pg->lock();
6589
6590 fout << *pg << std::endl;
6591 std::map<hobject_t, pg_missing_item>::const_iterator mend =
6592 pg->pg_log.get_missing().get_items().end();
6593 std::map<hobject_t, pg_missing_item>::const_iterator mi =
6594 pg->pg_log.get_missing().get_items().begin();
6595 for (; mi != mend; ++mi) {
6596 fout << mi->first << " -> " << mi->second << std::endl;
6597 if (!pg->missing_loc.needs_recovery(mi->first))
6598 continue;
6599 if (pg->missing_loc.is_unfound(mi->first))
6600 fout << " unfound ";
6601 const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
6602 if (mls.empty())
6603 continue;
6604 fout << "missing_loc: " << mls << std::endl;
6605 }
6606 pg->unlock();
6607 fout << std::endl;
6608 }
6609
6610 fout.close();
6611 }
6612 else if (prefix == "debug kick_recovery_wq") {
6613 int64_t delay;
6614 cmd_getval(cct, cmdmap, "delay", delay);
6615 ostringstream oss;
6616 oss << delay;
6617 r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
6618 if (r != 0) {
6619 ss << "kick_recovery_wq: error setting "
6620 << "osd_recovery_delay_start to '" << delay << "': error "
6621 << r;
6622 goto out;
6623 }
6624 cct->_conf->apply_changes(NULL);
6625 ss << "kicking recovery queue. set osd_recovery_delay_start "
6626 << "to " << cct->_conf->osd_recovery_delay_start;
6627 }
6628
6629 else if (prefix == "cpu_profiler") {
6630 string arg;
6631 cmd_getval(cct, cmdmap, "arg", arg);
6632 vector<string> argvec;
6633 get_str_vec(arg, argvec);
6634 cpu_profiler_handle_command(argvec, ds);
6635 }
6636
6637 else if (prefix == "dump_pg_recovery_stats") {
6638 stringstream s;
6639 if (f) {
6640 pg_recovery_stats.dump_formatted(f.get());
6641 f->flush(ds);
6642 } else {
6643 pg_recovery_stats.dump(s);
6644 ds << "dump pg recovery stats: " << s.str();
6645 }
6646 }
6647
6648 else if (prefix == "reset_pg_recovery_stats") {
6649 ss << "reset pg recovery stats";
6650 pg_recovery_stats.reset();
6651 }
6652
6653 else if (prefix == "perf histogram dump") {
6654 std::string logger;
6655 std::string counter;
6656 cmd_getval(cct, cmdmap, "logger", logger);
6657 cmd_getval(cct, cmdmap, "counter", counter);
6658 if (f) {
6659 cct->get_perfcounters_collection()->dump_formatted_histograms(
6660 f.get(), false, logger, counter);
6661 f->flush(ds);
6662 }
6663 }
6664
6665 else if (prefix == "compact") {
6666 dout(1) << "triggering manual compaction" << dendl;
6667 auto start = ceph::coarse_mono_clock::now();
6668 store->compact();
6669 auto end = ceph::coarse_mono_clock::now();
6670 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
6671 dout(1) << "finished manual compaction in "
6672 << time_span.count()
6673 << " seconds" << dendl;
6674 ss << "compacted omap in " << time_span.count() << " seconds";
6675 }
6676
6677 else {
6678 ss << "unrecognized command! " << cmd;
6679 r = -EINVAL;
6680 }
6681
6682 out:
6683 rs = ss.str();
6684 odata.append(ds);
6685 dout(0) << "do_command r=" << r << " " << rs << dendl;
6686 clog->info() << rs;
6687 if (con) {
6688 MCommandReply *reply = new MCommandReply(r, rs);
6689 reply->set_tid(tid);
6690 reply->set_data(odata);
6691 con->send_message(reply);
6692 }
6693 }
6694
6695 bool OSD::heartbeat_dispatch(Message *m)
6696 {
6697 dout(30) << "heartbeat_dispatch " << m << dendl;
6698 switch (m->get_type()) {
6699
6700 case CEPH_MSG_PING:
6701 dout(10) << "ping from " << m->get_source_inst() << dendl;
6702 m->put();
6703 break;
6704
6705 case MSG_OSD_PING:
6706 handle_osd_ping(static_cast<MOSDPing*>(m));
6707 break;
6708
6709 default:
6710 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6711 m->put();
6712 }
6713
6714 return true;
6715 }
6716
6717 bool OSD::ms_dispatch(Message *m)
6718 {
6719 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6720 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6721 service.got_stop_ack();
6722 m->put();
6723 return true;
6724 }
6725
6726 // lock!
6727
6728 osd_lock.Lock();
6729 if (is_stopping()) {
6730 osd_lock.Unlock();
6731 m->put();
6732 return true;
6733 }
6734
6735 do_waiters();
6736 _dispatch(m);
6737
6738 osd_lock.Unlock();
6739
6740 return true;
6741 }
6742
6743 void OSD::maybe_share_map(
6744 Session *session,
6745 OpRequestRef op,
6746 OSDMapRef osdmap)
6747 {
6748 if (!op->check_send_map) {
6749 return;
6750 }
6751 epoch_t last_sent_epoch = 0;
6752
6753 session->sent_epoch_lock.lock();
6754 last_sent_epoch = session->last_sent_epoch;
6755 session->sent_epoch_lock.unlock();
6756
6757 const Message *m = op->get_req();
6758 service.share_map(
6759 m->get_source(),
6760 m->get_connection().get(),
6761 op->sent_epoch,
6762 osdmap,
6763 session ? &last_sent_epoch : NULL);
6764
6765 session->sent_epoch_lock.lock();
6766 if (session->last_sent_epoch < last_sent_epoch) {
6767 session->last_sent_epoch = last_sent_epoch;
6768 }
6769 session->sent_epoch_lock.unlock();
6770
6771 op->check_send_map = false;
6772 }
6773
6774 void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
6775 {
6776 assert(session->session_dispatch_lock.is_locked());
6777
6778 auto i = session->waiting_on_map.begin();
6779 while (i != session->waiting_on_map.end()) {
6780 OpRequestRef op = &(*i);
6781 assert(ms_can_fast_dispatch(op->get_req()));
6782 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
6783 op->get_req());
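// an op whose min_epoch is ahead of our map stays queued, e.g. an op
// with min_epoch 120 waits here until we have map 120 or newer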
6784 if (m->get_min_epoch() > osdmap->get_epoch()) {
6785 break;
6786 }
6787 session->waiting_on_map.erase(i++);
6788 op->put();
6789
6790 spg_t pgid;
6791 if (m->get_type() == CEPH_MSG_OSD_OP) {
6792 pg_t actual_pgid = osdmap->raw_pg_to_pg(
6793 static_cast<const MOSDOp*>(m)->get_pg());
6794 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
6795 continue;
6796 }
6797 } else {
6798 pgid = m->get_spg();
6799 }
6800 enqueue_op(pgid, op, m->get_map_epoch());
6801 }
6802
6803 if (session->waiting_on_map.empty()) {
6804 clear_session_waiting_on_map(session);
6805 } else {
6806 register_session_waiting_on_map(session);
6807 }
6808 }
6809
6810 void OSD::ms_fast_dispatch(Message *m)
6811 {
6812 FUNCTRACE();
6813 if (service.is_stopping()) {
6814 m->put();
6815 return;
6816 }
6817 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
6818 {
6819 #ifdef WITH_LTTNG
6820 osd_reqid_t reqid = op->get_reqid();
6821 #endif
6822 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
6823 reqid.name._num, reqid.tid, reqid.inc);
6824 }
6825
6826 if (m->trace)
6827 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
6828
6829 // note sender epoch, min req'd epoch
6830 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
6831 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
6832 assert(op->min_epoch <= op->sent_epoch); // sanity check!
6833
6834 service.maybe_inject_dispatch_delay();
6835
6836 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
6837 m->get_type() != CEPH_MSG_OSD_OP) {
6838 // queue it directly
6839 enqueue_op(
6840 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
6841 op,
6842 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
6843 } else {
6844 // legacy client, and this is an MOSDOp (the *only* fast dispatch
6845 // message that didn't have an explicit spg_t); we need to map
6846 // it to an spg_t while preserving delivery order.
6847 Session *session = static_cast<Session*>(m->get_connection()->get_priv());
6848 if (session) {
6849 {
6850 Mutex::Locker l(session->session_dispatch_lock);
6851 op->get();
6852 session->waiting_on_map.push_back(*op);
6853 OSDMapRef nextmap = service.get_nextmap_reserved();
6854 dispatch_session_waiting(session, nextmap);
6855 service.release_map(nextmap);
6856 }
6857 session->put();
6858 }
6859 }
6860 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
6861 }
6862
6863 void OSD::ms_fast_preprocess(Message *m)
6864 {
6865 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
6866 if (m->get_type() == CEPH_MSG_OSD_MAP) {
6867 MOSDMap *mm = static_cast<MOSDMap*>(m);
6868 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
6869 if (s) {
6870 s->received_map_lock.lock();
6871 s->received_map_epoch = mm->get_last();
6872 s->received_map_lock.unlock();
6873 s->put();
6874 }
6875 }
6876 }
6877 }
6878
6879 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
6880 {
6881 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
6882
6883 if (is_stopping()) {
6884 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
6885 return false;
6886 }
6887
6888 if (dest_type == CEPH_ENTITY_TYPE_MON)
6889 return true;
6890
6891 if (force_new) {
6892 /* the MonClient checks keys every tick(), so we should just wait for that cycle
6893 to get through */
6894 if (monc->wait_auth_rotating(10) < 0) {
6895 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
6896 return false;
6897 }
6898 }
6899
6900 *authorizer = monc->build_authorizer(dest_type);
6901 return *authorizer != NULL;
6902 }
6903
6904
6905 bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
6906 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
6907 bool& isvalid, CryptoKey& session_key)
6908 {
6909 AuthAuthorizeHandler *authorize_handler = 0;
6910 switch (peer_type) {
6911 case CEPH_ENTITY_TYPE_MDS:
6912 /*
6913 * note: mds is technically a client from our perspective, but
6914 * this makes the 'cluster' consistent w/ monitor's usage.
6915 */
6916 case CEPH_ENTITY_TYPE_OSD:
6917 case CEPH_ENTITY_TYPE_MGR:
6918 authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
6919 break;
6920 default:
6921 authorize_handler = authorize_handler_service_registry->get_handler(protocol);
6922 }
6923 if (!authorize_handler) {
6924 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
6925 isvalid = false;
6926 return true;
6927 }
6928
6929 AuthCapsInfo caps_info;
6930 EntityName name;
6931 uint64_t global_id;
6932 uint64_t auid = CEPH_AUTH_UID_DEFAULT;
6933
6934 isvalid = authorize_handler->verify_authorizer(
6935 cct, monc->rotating_secrets.get(),
6936 authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
6937 &auid);
6938
6939 if (isvalid) {
6940 Session *s = static_cast<Session *>(con->get_priv());
6941 if (!s) {
6942 s = new Session(cct);
6943 con->set_priv(s->get());
6944 s->con = con;
6945 dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
6946 }
6947
6948 s->entity_name = name;
6949 if (caps_info.allow_all)
6950 s->caps.set_allow_all();
6951 s->auid = auid;
6952
6953 if (caps_info.caps.length() > 0) {
6954 bufferlist::iterator p = caps_info.caps.begin();
6955 string str;
6956 try {
6957 ::decode(str, p);
6958 }
6959 catch (buffer::error& e) {
6960 }
6961 bool success = s->caps.parse(str);
6962 if (success)
6963 dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
6964 else
6965 dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
6966 }
6967
6968 s->put();
6969 }
6970 return true;
6971 }
6972
6973 void OSD::do_waiters()
6974 {
6975 assert(osd_lock.is_locked());
6976
6977 dout(10) << "do_waiters -- start" << dendl;
6978 while (!finished.empty()) {
6979 OpRequestRef next = finished.front();
6980 finished.pop_front();
6981 dispatch_op(next);
6982 }
6983 dout(10) << "do_waiters -- finish" << dendl;
6984 }
6985
6986 void OSD::dispatch_op(OpRequestRef op)
6987 {
6988 switch (op->get_req()->get_type()) {
6989
6990 case MSG_OSD_PG_CREATE:
6991 handle_pg_create(op);
6992 break;
6993 case MSG_OSD_PG_NOTIFY:
6994 handle_pg_notify(op);
6995 break;
6996 case MSG_OSD_PG_QUERY:
6997 handle_pg_query(op);
6998 break;
6999 case MSG_OSD_PG_LOG:
7000 handle_pg_log(op);
7001 break;
7002 case MSG_OSD_PG_REMOVE:
7003 handle_pg_remove(op);
7004 break;
7005 case MSG_OSD_PG_INFO:
7006 handle_pg_info(op);
7007 break;
7008 case MSG_OSD_PG_TRIM:
7009 handle_pg_trim(op);
7010 break;
7011 case MSG_OSD_BACKFILL_RESERVE:
7012 handle_pg_backfill_reserve(op);
7013 break;
7014 case MSG_OSD_RECOVERY_RESERVE:
7015 handle_pg_recovery_reserve(op);
7016 break;
7017 }
7018 }
7019
7020 void OSD::_dispatch(Message *m)
7021 {
7022 assert(osd_lock.is_locked());
7023 dout(20) << "_dispatch " << m << " " << *m << dendl;
7024
7025 switch (m->get_type()) {
7026
7027 // -- don't need lock --
7028 case CEPH_MSG_PING:
7029 dout(10) << "ping from " << m->get_source() << dendl;
7030 m->put();
7031 break;
7032
7033 // -- don't need OSDMap --
7034
7035 // map and replication
7036 case CEPH_MSG_OSD_MAP:
7037 handle_osd_map(static_cast<MOSDMap*>(m));
7038 break;
7039
7040 // osd
7041 case MSG_PGSTATSACK:
7042 handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
7043 break;
7044
7045 case MSG_MON_COMMAND:
7046 handle_command(static_cast<MMonCommand*>(m));
7047 break;
7048 case MSG_COMMAND:
7049 handle_command(static_cast<MCommand*>(m));
7050 break;
7051
7052 case MSG_OSD_SCRUB:
7053 handle_scrub(static_cast<MOSDScrub*>(m));
7054 break;
7055
7056 // -- need OSDMap --
7057
7058 case MSG_OSD_PG_CREATE:
7059 case MSG_OSD_PG_NOTIFY:
7060 case MSG_OSD_PG_QUERY:
7061 case MSG_OSD_PG_LOG:
7062 case MSG_OSD_PG_REMOVE:
7063 case MSG_OSD_PG_INFO:
7064 case MSG_OSD_PG_TRIM:
7065 case MSG_OSD_BACKFILL_RESERVE:
7066 case MSG_OSD_RECOVERY_RESERVE:
7067 {
7068 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7069 if (m->trace)
7070 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7071 // no map? starting up?
7072 if (!osdmap) {
7073 dout(7) << "no OSDMap, not booted" << dendl;
7074 logger->inc(l_osd_waiting_for_map);
7075 waiting_for_osdmap.push_back(op);
7076 op->mark_delayed("no osdmap");
7077 break;
7078 }
7079
7080 // need OSDMap
7081 dispatch_op(op);
7082 }
7083 }
7084 }
7085
7086 void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7087 {
7088 pg->lock();
7089 if (pg->is_primary()) {
7090 pg->unreg_next_scrub();
7091 pg->scrubber.must_scrub = true;
7092 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7093 pg->scrubber.must_repair = m->repair;
7094 pg->reg_next_scrub();
7095 dout(10) << "marking " << *pg << " for scrub" << dendl;
7096 }
7097 pg->unlock();
7098 }
7099
7100 void OSD::handle_scrub(MOSDScrub *m)
7101 {
7102 dout(10) << "handle_scrub " << *m << dendl;
7103 if (!require_mon_or_mgr_peer(m)) {
7104 m->put();
7105 return;
7106 }
7107 if (m->fsid != monc->get_fsid()) {
7108 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7109 m->put();
7110 return;
7111 }
7112
7113 RWLock::RLocker l(pg_map_lock);
7114 if (m->scrub_pgs.empty()) {
7115 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7116 p != pg_map.end();
7117 ++p)
7118 handle_pg_scrub(m, p->second);
7119 } else {
7120 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7121 p != m->scrub_pgs.end();
7122 ++p) {
7123 spg_t pcand;
7124 if (osdmap->get_primary_shard(*p, &pcand)) {
7125 auto pg_map_entry = pg_map.find(pcand);
7126 if (pg_map_entry != pg_map.end()) {
7127 handle_pg_scrub(m, pg_map_entry->second);
7128 }
7129 }
7130 }
7131 }
7132
7133 m->put();
7134 }
7135
7136 bool OSD::scrub_random_backoff()
7137 {
7138 bool coin_flip = (rand() / (double)RAND_MAX >=
7139 cct->_conf->osd_scrub_backoff_ratio);
7140 if (!coin_flip) {
7141 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7142 return true;
7143 }
7144 return false;
7145 }
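/*
 * The backoff above is a plain Bernoulli trial: the tick proceeds to
 * scheduling only when a uniform draw lands at or above
 * osd_scrub_backoff_ratio, i.e. we back off with probability equal to the
 * ratio itself. A minimal standalone sketch of the same math (illustrative
 * only; the 0.66 default is an assumption, check your build's config):
 *
 *   #include <cstdlib>
 *
 *   // true with probability backoff_ratio (~66% for 0.66): skip this tick
 *   bool would_back_off(double backoff_ratio) {
 *     double r = rand() / (double)RAND_MAX;  // uniform in [0,1]
 *     return r < backoff_ratio;              // "lost" the coin flip
 *   }
 */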
7146
7147 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7148 const spg_t& pg, const utime_t& timestamp,
7149 double pool_scrub_min_interval,
7150 double pool_scrub_max_interval, bool must)
7151 : cct(cct),
7152 pgid(pg),
7153 sched_time(timestamp),
7154 deadline(timestamp)
7155 {
7156 // if not explicitly requested, postpone the scrub with a random delay
7157 if (!must) {
7158 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7159 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7160 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7161 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7162
7163 sched_time += scrub_min_interval;
7164 double r = rand() / (double)RAND_MAX;
7165 sched_time +=
7166 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7167 deadline += scrub_max_interval;
7168 }
7169 }
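/*
 * So for a non-forced job the scheduled time is pushed out by the min
 * interval plus a uniformly random fraction of it:
 *
 *   sched_time = stamp + min + min * osd_scrub_interval_randomize_ratio * r
 *   deadline   = stamp + max,  with r ~ U[0,1]
 *
 * e.g. with a 1-day min interval and a randomize ratio of 0.5 (assumed
 * defaults), scrubs land uniformly in [stamp + 1d, stamp + 1.5d], which
 * spreads scrub load instead of firing every PG at the same instant.
 */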
7170
7171 bool OSDService::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7172 if (sched_time < rhs.sched_time)
7173 return true;
7174 if (sched_time > rhs.sched_time)
7175 return false;
7176 return pgid < rhs.pgid;
7177 }
7178
7179 bool OSD::scrub_time_permit(utime_t now)
7180 {
7181 struct tm bdt;
7182 time_t tt = now.sec();
7183 localtime_r(&tt, &bdt);
7184 bool time_permit = false;
7185 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7186 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7187 time_permit = true;
7188 }
7189 } else {
7190 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7191 time_permit = true;
7192 }
7193 }
7194 if (!time_permit) {
7195 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7196 << " - " << cct->_conf->osd_scrub_end_hour
7197 << " now " << bdt.tm_hour << " = no" << dendl;
7198 } else {
7199 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7200 << " - " << cct->_conf->osd_scrub_end_hour
7201 << " now " << bdt.tm_hour << " = yes" << dendl;
7202 }
7203 return time_permit;
7204 }
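/*
 * The two branches above form a daily window that may wrap past midnight:
 * begin=1, end=5 permits 01:00-04:59, while begin=23, end=6 permits
 * 23:00-05:59. A standalone sketch of the same predicate:
 *
 *   bool hour_in_window(int hour, int begin, int end) {
 *     if (begin < end)
 *       return hour >= begin && hour < end;  // same-day window
 *     return hour >= begin || hour < end;    // window wraps past midnight
 *   }
 */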
7205
7206 bool OSD::scrub_load_below_threshold()
7207 {
7208 double loadavgs[3];
7209 if (getloadavg(loadavgs, 3) != 3) {
7210 dout(10) << __func__ << " couldn't read loadavgs" << dendl;
7211 return false;
7212 }
7213
7214 // allow scrub if below configured threshold
7215 if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
7216 dout(20) << __func__ << " loadavg " << loadavgs[0]
7217 << " < max " << cct->_conf->osd_scrub_load_threshold
7218 << " = yes" << dendl;
7219 return true;
7220 }
7221
7222 // allow scrub if below daily avg and currently decreasing
7223 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7224 dout(20) << __func__ << " loadavg " << loadavgs[0]
7225 << " < daily_loadavg " << daily_loadavg
7226 << " and < 15m avg " << loadavgs[2]
7227 << " = yes" << dendl;
7228 return true;
7229 }
7230
7231 dout(20) << __func__ << " loadavg " << loadavgs[0]
7232 << " >= max " << cct->_conf->osd_scrub_load_threshold
7233 << " and ( >= daily_loadavg " << daily_loadavg
7234 << " or >= 15m avg " << loadavgs[2]
7235 << ") = no" << dendl;
7236 return false;
7237 }
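/*
 * Summarizing the policy: scrub when the 1-minute load is below the
 * configured ceiling, or when it is below the rolling daily average and
 * also below the 15-minute average (i.e. load is trending down). Sketch,
 * using the getloadavg(3) ordering of (1m, 5m, 15m):
 *
 *   bool load_ok(const double la[3], double threshold, double daily_avg) {
 *     return la[0] < threshold || (la[0] < daily_avg && la[0] < la[2]);
 *   }
 */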
7238
7239 void OSD::sched_scrub()
7240 {
7241 // if not permitted, fail fast
7242 if (!service.can_inc_scrubs_pending()) {
7243 return;
7244 }
7245
7246 utime_t now = ceph_clock_now();
7247 bool time_permit = scrub_time_permit(now);
7248 bool load_is_low = scrub_load_below_threshold();
7249 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7250
7251 OSDService::ScrubJob scrub;
7252 if (service.first_scrub_stamp(&scrub)) {
7253 do {
7254 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7255
7256 if (scrub.sched_time > now) {
7257 // save ourselves some effort
7258 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7259 << " > " << now << dendl;
7260 break;
7261 }
7262
7263 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7264 dout(10) << __func__ << " not scheduling scrub of " << scrub.pgid << " due to active recovery ops" << dendl;
7265 break;
7266 }
7267
7268 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7269 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7270 << (!time_permit ? "time not permitted" : "high load") << dendl;
7271 continue;
7272 }
7273
7274 PG *pg = _lookup_lock_pg(scrub.pgid);
7275 if (!pg)
7276 continue;
7277 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7278 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7279 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7280 (load_is_low ? ", load_is_low" : ", deadline < now"))
7281 << dendl;
7282 if (pg->sched_scrub()) {
7283 pg->unlock();
7284 break;
7285 }
7286 }
7287 pg->unlock();
7288 } while (service.next_scrub_stamp(scrub, &scrub));
7289 }
7290 dout(20) << "sched_scrub done" << dendl;
7291 }
7292
7293
7294
7295 // =====================================================
7296 // MAP
7297
7298 void OSD::wait_for_new_map(OpRequestRef op)
7299 {
7300 // ask?
7301 if (waiting_for_osdmap.empty()) {
7302 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7303 }
7304
7305 logger->inc(l_osd_waiting_for_map);
7306 waiting_for_osdmap.push_back(op);
7307 op->mark_delayed("wait for new map");
7308 }
7309
7310
7311 /** update_map
7312 * assimilate new OSDMap(s). scan pgs, etc.
7313 */
7314
7315 void OSD::note_down_osd(int peer)
7316 {
7317 assert(osd_lock.is_locked());
7318 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7319
7320 heartbeat_lock.Lock();
7321 failure_queue.erase(peer);
7322 failure_pending.erase(peer);
7323 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7324 if (p != heartbeat_peers.end()) {
7325 p->second.con_back->mark_down();
7326 if (p->second.con_front) {
7327 p->second.con_front->mark_down();
7328 }
7329 heartbeat_peers.erase(p);
7330 }
7331 heartbeat_lock.Unlock();
7332 }
7333
7334 void OSD::note_up_osd(int peer)
7335 {
7336 service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7337 heartbeat_set_peers_need_update();
7338 }
7339
7340 struct C_OnMapCommit : public Context {
7341 OSD *osd;
7342 epoch_t first, last;
7343 MOSDMap *msg;
7344 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7345 : osd(o), first(f), last(l), msg(m) {}
7346 void finish(int r) override {
7347 osd->_committed_osd_maps(first, last, msg);
7348 msg->put();
7349 }
7350 };
7351
7352 struct C_OnMapApply : public Context {
7353 OSDService *service;
7354 list<OSDMapRef> pinned_maps;
7355 epoch_t e;
7356 C_OnMapApply(OSDService *service,
7357 const list<OSDMapRef> &pinned_maps,
7358 epoch_t e)
7359 : service(service), pinned_maps(pinned_maps), e(e) {}
7360 void finish(int r) override {
7361 service->clear_map_bl_cache_pins(e);
7362 }
7363 };
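/*
 * Both helpers follow the standard ObjectStore completion idiom: a Context
 * subclass captures whatever finish() will need, and queue_transaction()
 * invokes it once the transaction is applied (readable) or committed
 * (durable). A minimal sketch of the shape, with hypothetical names:
 *
 *   struct C_Done : public Context {
 *     epoch_t e;
 *     explicit C_Done(epoch_t e) : e(e) {}
 *     void finish(int r) override {
 *       // r is the completion code; 0 on success
 *     }
 *   };
 */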
7364
7365 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7366 {
7367 OSDMapRef osdmap = service.get_osdmap();
7368 if (osdmap->get_epoch() >= epoch)
7369 return;
7370
7371 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7372 force_request) {
7373 monc->renew_subs();
7374 }
7375 }
7376
7377 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7378 {
7379 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7380 if (min <= superblock.oldest_map)
7381 return;
7382
7383 int num = 0;
7384 ObjectStore::Transaction t;
7385 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7386 dout(20) << " removing old osdmap epoch " << e << dendl;
7387 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7388 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7389 superblock.oldest_map = e + 1;
7390 num++;
7391 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7392 service.publish_superblock(superblock);
7393 write_superblock(t);
7394 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7395 assert(tr == 0);
7396 num = 0;
7397 if (!skip_maps) {
7398 // skip_maps leaves us with a range of old maps if we fail to remove all
7399 // of them before moving superblock.oldest_map forward to the first map
7400 // in the incoming MOSDMap msg. So we should continue removing them in
7401 // that case, even though it may mean a huge series of delete
7402 // transactions all at once.
7403 break;
7404 }
7405 }
7406 }
7407 if (num > 0) {
7408 service.publish_superblock(superblock);
7409 write_superblock(t);
7410 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7411 assert(tr == 0);
7412 }
7413 // we should not remove the cached maps
7414 assert(min <= service.map_cache.cached_key_lower_bound());
7415 }
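/*
 * The flush pattern above bounds transaction size: removals accumulate
 * until the batch reaches osd_target_transaction_size ops, and each batch
 * carries the updated superblock.oldest_map in the same transaction, so
 * the recorded range stays consistent with what is actually on disk. A
 * simplified skeleton of the batching (hypothetical helper names; the
 * real loop also considers nreceived and skip_maps):
 *
 *   unsigned n = 0;
 *   for (epoch_t e = oldest; e < min; ++e) {
 *     txn_remove_map(e);                  // queue one epoch's removal
 *     if (++n >= target_txn_size) {
 *       persist_superblock_and_flush();   // checkpoint progress
 *       n = 0;
 *     }
 *   }
 *   if (n > 0)
 *     persist_superblock_and_flush();     // final partial batch
 */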
7416
7417 void OSD::handle_osd_map(MOSDMap *m)
7418 {
7419 assert(osd_lock.is_locked());
7420 // Keep a ref in the list until we get the newly received map written
7421 // onto disk. This is important because as long as the refs are alive,
7422 // the OSDMaps will be pinned in the cache and we won't try to read them
7423 // off disk. Otherwise these maps will probably not stay in the cache,
7424 // and reading those OSDMaps before they are actually written can result
7425 // in a crash.
7426 list<OSDMapRef> pinned_maps;
7427 if (m->fsid != monc->get_fsid()) {
7428 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7429 << monc->get_fsid() << dendl;
7430 m->put();
7431 return;
7432 }
7433 if (is_initializing()) {
7434 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7435 m->put();
7436 return;
7437 }
7438
7439 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7440 if (session && !(session->entity_name.is_mon() ||
7441 session->entity_name.is_osd())) {
7442 //not enough perms!
7443 dout(10) << "got osd map from Session " << session
7444 << " which we can't take maps from (not a mon or osd)" << dendl;
7445 m->put();
7446 session->put();
7447 return;
7448 }
7449 if (session)
7450 session->put();
7451
7452 // share with the objecter
7453 if (!is_preboot())
7454 service.objecter->handle_osd_map(m);
7455
7456 epoch_t first = m->get_first();
7457 epoch_t last = m->get_last();
7458 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7459 << superblock.newest_map
7460 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7461 << dendl;
7462
7463 logger->inc(l_osd_map);
7464 logger->inc(l_osd_mape, last - first + 1);
7465 if (first <= superblock.newest_map)
7466 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7467 if (service.max_oldest_map < m->oldest_map) {
7468 service.max_oldest_map = m->oldest_map;
7469 assert(service.max_oldest_map >= superblock.oldest_map);
7470 }
7471
7472 // make sure there is something new, here, before we bother flushing
7473 // the queues and such
7474 if (last <= superblock.newest_map) {
7475 dout(10) << " no new maps here, dropping" << dendl;
7476 m->put();
7477 return;
7478 }
7479
7480 // missing some?
7481 bool skip_maps = false;
7482 if (first > superblock.newest_map + 1) {
7483 dout(10) << "handle_osd_map message skips epochs "
7484 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7485 if (m->oldest_map <= superblock.newest_map + 1) {
7486 osdmap_subscribe(superblock.newest_map + 1, false);
7487 m->put();
7488 return;
7489 }
7490 // always try to get the full range of maps--as many as we can. this
7491 // 1- is good to have
7492 // 2- is at present the only way to ensure that we get a *full* map as
7493 // the first map!
7494 if (m->oldest_map < first) {
7495 osdmap_subscribe(m->oldest_map - 1, true);
7496 m->put();
7497 return;
7498 }
7499 skip_maps = true;
7500 }
7501
7502 ObjectStore::Transaction t;
7503 uint64_t txn_size = 0;
7504
7505 // store new maps: queue for disk and put in the osdmap cache
7506 epoch_t start = MAX(superblock.newest_map + 1, first);
7507 for (epoch_t e = start; e <= last; e++) {
7508 if (txn_size >= t.get_num_bytes()) {
7509 derr << __func__ << " transaction size overflowed" << dendl;
7510 assert(txn_size < t.get_num_bytes());
7511 }
7512 txn_size = t.get_num_bytes();
7513 map<epoch_t,bufferlist>::iterator p;
7514 p = m->maps.find(e);
7515 if (p != m->maps.end()) {
7516 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7517 OSDMap *o = new OSDMap;
7518 bufferlist& bl = p->second;
7519
7520 o->decode(bl);
7521
7522 ghobject_t fulloid = get_osdmap_pobject_name(e);
7523 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7524 pin_map_bl(e, bl);
7525 pinned_maps.push_back(add_map(o));
7526
7527 got_full_map(e);
7528 continue;
7529 }
7530
7531 p = m->incremental_maps.find(e);
7532 if (p != m->incremental_maps.end()) {
7533 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7534 bufferlist& bl = p->second;
7535 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7536 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7537 pin_map_inc_bl(e, bl);
7538
7539 OSDMap *o = new OSDMap;
7540 if (e > 1) {
7541 bufferlist obl;
7542 bool got = get_map_bl(e - 1, obl);
7543 assert(got);
7544 o->decode(obl);
7545 }
7546
7547 OSDMap::Incremental inc;
7548 bufferlist::iterator p = bl.begin();
7549 inc.decode(p);
7550 if (o->apply_incremental(inc) < 0) {
7551 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
7552 assert(0 == "bad fsid");
7553 }
7554
7555 bufferlist fbl;
7556 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7557
7558 bool injected_failure = false;
7559 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7560 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7561 derr << __func__ << " injecting map crc failure" << dendl;
7562 injected_failure = true;
7563 }
7564
7565 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7566 dout(2) << "got incremental " << e
7567 << " but failed to encode full with correct crc; requesting"
7568 << dendl;
7569 clog->warn() << "failed to encode map e" << e << " with expected crc";
7570 dout(20) << "my encoded map was:\n";
7571 fbl.hexdump(*_dout);
7572 *_dout << dendl;
7573 delete o;
7574 request_full_map(e, last);
7575 last = e - 1;
7576 break;
7577 }
7578 got_full_map(e);
7579
7580 ghobject_t fulloid = get_osdmap_pobject_name(e);
7581 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7582 pin_map_bl(e, fbl);
7583 pinned_maps.push_back(add_map(o));
7584 continue;
7585 }
7586
7587 assert(0 == "MOSDMap lied about what maps it had?");
7588 }
7589
7590 // even if this map isn't from a mon, we may have satisfied our subscription
7591 monc->sub_got("osdmap", last);
7592
7593 if (!m->maps.empty() && requested_full_first) {
7594 dout(10) << __func__ << " still missing full maps " << requested_full_first
7595 << ".." << requested_full_last << dendl;
7596 rerequest_full_maps();
7597 }
7598
7599 if (superblock.oldest_map) {
7600 // make sure we at least keep pace with incoming maps
7601 trim_maps(m->oldest_map, last - first + 1, skip_maps);
7602 }
7603
7604 if (!superblock.oldest_map || skip_maps)
7605 superblock.oldest_map = first;
7606 superblock.newest_map = last;
7607 superblock.current_epoch = last;
7608
7609 // note in the superblock that we were clean thru the prior epoch
7610 epoch_t boot_epoch = service.get_boot_epoch();
7611 if (boot_epoch && boot_epoch >= superblock.mounted) {
7612 superblock.mounted = boot_epoch;
7613 superblock.clean_thru = last;
7614 }
7615
7616 // superblock and commit
7617 write_superblock(t);
7618 store->queue_transaction(
7619 service.meta_osr.get(),
7620 std::move(t),
7621 new C_OnMapApply(&service, pinned_maps, last),
7622 new C_OnMapCommit(this, start, last, m), 0);
7623 service.publish_superblock(superblock);
7624 }
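/*
 * To recap the steady-state path above: for each epoch e we either store
 * the full map the mon sent, or reconstruct it from the previous full map
 * plus the incremental, then verify the re-encoded CRC:
 *
 *   full(e) = apply_incremental(full(e-1), inc(e))
 *   crc(encode(full(e))) must equal inc(e).full_crc,
 *
 * and on a mismatch the locally built map is discarded and a full map is
 * requested from the mon instead (request_full_map).
 */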
7625
7626 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
7627 {
7628 dout(10) << __func__ << " " << first << ".." << last << dendl;
7629 if (is_stopping()) {
7630 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7631 return;
7632 }
7633 Mutex::Locker l(osd_lock);
7634 if (is_stopping()) {
7635 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7636 return;
7637 }
7638 map_lock.get_write();
7639
7640 bool do_shutdown = false;
7641 bool do_restart = false;
7642 bool network_error = false;
7643
7644 // advance through the new maps
7645 for (epoch_t cur = first; cur <= last; cur++) {
7646 dout(10) << " advance to epoch " << cur
7647 << " (<= last " << last
7648 << " <= newest_map " << superblock.newest_map
7649 << ")" << dendl;
7650
7651 OSDMapRef newmap = get_map(cur);
7652 assert(newmap); // we just cached it above!
7653
7654 // start blacklisting messages sent to peers that go down.
7655 service.pre_publish_map(newmap);
7656
7657 // kill connections to newly down osds
7658 bool waited_for_reservations = false;
7659 set<int> old;
7660 osdmap->get_all_osds(old);
7661 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
7662 if (*p != whoami &&
7663 osdmap->is_up(*p) && // in old map
7664 newmap->is_down(*p)) { // but not the new one
7665 if (!waited_for_reservations) {
7666 service.await_reserved_maps();
7667 waited_for_reservations = true;
7668 }
7669 note_down_osd(*p);
7670 } else if (*p != whoami &&
7671 osdmap->is_down(*p) &&
7672 newmap->is_up(*p)) {
7673 note_up_osd(*p);
7674 }
7675 }
7676
7677 if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
7678 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
7679 (osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
7680 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
7681 << dendl;
7682 if (is_booting()) {
7683 // this captures the case where we sent the boot message while
7684 // NOUP was being set on the mon and our boot request was
7685 // dropped, and then later it is cleared. it imperfectly
7686 // handles the case where our original boot message was not
7687 // dropped and we restart even though we might have booted, but
7688 // that is harmless (boot will just take slightly longer).
7689 do_restart = true;
7690 }
7691 }
7692 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
7693 newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
7694 dout(10) << __func__ << " require_osd_release reached luminous in "
7695 << newmap->get_epoch() << dendl;
7696 clear_pg_stat_queue();
7697 clear_outstanding_pg_stats();
7698 }
7699
7700 osdmap = newmap;
7701 epoch_t up_epoch;
7702 epoch_t boot_epoch;
7703 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
7704 if (!up_epoch &&
7705 osdmap->is_up(whoami) &&
7706 osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
7707 up_epoch = osdmap->get_epoch();
7708 dout(10) << "up_epoch is " << up_epoch << dendl;
7709 if (!boot_epoch) {
7710 boot_epoch = osdmap->get_epoch();
7711 dout(10) << "boot_epoch is " << boot_epoch << dendl;
7712 }
7713 service.set_epochs(&boot_epoch, &up_epoch, NULL);
7714 }
7715 }
7716
7717 had_map_since = ceph_clock_now();
7718
7719 epoch_t _bind_epoch = service.get_bind_epoch();
7720 if (osdmap->is_up(whoami) &&
7721 osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
7722 _bind_epoch < osdmap->get_up_from(whoami)) {
7723
7724 if (is_booting()) {
7725 dout(1) << "state: booting -> active" << dendl;
7726 set_state(STATE_ACTIVE);
7727
7728 // set incarnation so that osd_reqid_t's we generate for our
7729 // objecter requests are unique across restarts.
7730 service.objecter->set_client_incarnation(osdmap->get_epoch());
7731 }
7732 }
7733
7734 if (osdmap->get_epoch() > 0 &&
7735 is_active()) {
7736 if (!osdmap->exists(whoami)) {
7737 dout(0) << "map says i do not exist. shutting down." << dendl;
7738 do_shutdown = true; // don't call shutdown() while we have
7739 // everything paused
7740 } else if (!osdmap->is_up(whoami) ||
7741 !osdmap->get_addr(whoami).probably_equals(
7742 client_messenger->get_myaddr()) ||
7743 !osdmap->get_cluster_addr(whoami).probably_equals(
7744 cluster_messenger->get_myaddr()) ||
7745 !osdmap->get_hb_back_addr(whoami).probably_equals(
7746 hb_back_server_messenger->get_myaddr()) ||
7747 (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7748 !osdmap->get_hb_front_addr(whoami).probably_equals(
7749 hb_front_server_messenger->get_myaddr()))) {
7750 if (!osdmap->is_up(whoami)) {
7751 if (service.is_preparing_to_stop() || service.is_stopping()) {
7752 service.got_stop_ack();
7753 } else {
7754 clog->warn() << "map e" << osdmap->get_epoch()
7755 << " wrongly marked me down at e"
7756 << osdmap->get_down_at(whoami);
7757 }
7758 } else if (!osdmap->get_addr(whoami).probably_equals(
7759 client_messenger->get_myaddr())) {
7760 clog->error() << "map e" << osdmap->get_epoch()
7761 << " had wrong client addr (" << osdmap->get_addr(whoami)
7762 << " != my " << client_messenger->get_myaddr() << ")";
7763 } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
7764 cluster_messenger->get_myaddr())) {
7765 clog->error() << "map e" << osdmap->get_epoch()
7766 << " had wrong cluster addr ("
7767 << osdmap->get_cluster_addr(whoami)
7768 << " != my " << cluster_messenger->get_myaddr() << ")";
7769 } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
7770 hb_back_server_messenger->get_myaddr())) {
7771 clog->error() << "map e" << osdmap->get_epoch()
7772 << " had wrong hb back addr ("
7773 << osdmap->get_hb_back_addr(whoami)
7774 << " != my " << hb_back_server_messenger->get_myaddr()
7775 << ")";
7776 } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7777 !osdmap->get_hb_front_addr(whoami).probably_equals(
7778 hb_front_server_messenger->get_myaddr())) {
7779 clog->error() << "map e" << osdmap->get_epoch()
7780 << " had wrong hb front addr ("
7781 << osdmap->get_hb_front_addr(whoami)
7782 << " != my " << hb_front_server_messenger->get_myaddr()
7783 << ")";
7784 }
7785
7786 if (!service.is_stopping()) {
7787 epoch_t up_epoch = 0;
7788 epoch_t bind_epoch = osdmap->get_epoch();
7789 service.set_epochs(NULL,&up_epoch, &bind_epoch);
7790 do_restart = true;
7791
7792 //add markdown log
7793 utime_t now = ceph_clock_now();
7794 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
7795 osd_markdown_log.push_back(now);
7796 //clear all out-of-date log
7797 while (!osd_markdown_log.empty() &&
7798 osd_markdown_log.front() + grace < now)
7799 osd_markdown_log.pop_front();
7800 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
7801 dout(0) << __func__ << " marked down "
7802 << osd_markdown_log.size()
7803 << " > osd_max_markdown_count "
7804 << cct->_conf->osd_max_markdown_count
7805 << " in last " << grace << " seconds, shutting down"
7806 << dendl;
7807 do_restart = false;
7808 do_shutdown = true;
7809 }
7810
7811 start_waiting_for_healthy();
7812
7813 set<int> avoid_ports;
7814 #if defined(__FreeBSD__)
7815 // prevent FreeBSD from grabbing the client_messenger port during
7816 // rebinding; otherwise a cluster_messenger might also connect
7817 // to the same port
7818 avoid_ports.insert(client_messenger->get_myaddr().get_port());
7819 #endif
7820 avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
7821 avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
7822 avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
7823
7824 int r = cluster_messenger->rebind(avoid_ports);
7825 if (r != 0) {
7826 do_shutdown = true; // FIXME: do_restart?
7827 network_error = true;
7828 dout(0) << __func__ << " marked down:"
7829 << " rebind cluster_messenger failed" << dendl;
7830 }
7831
7832 r = hb_back_server_messenger->rebind(avoid_ports);
7833 if (r != 0) {
7834 do_shutdown = true; // FIXME: do_restart?
7835 network_error = true;
7836 dout(0) << __func__ << " marked down:"
7837 << " rebind hb_back_server_messenger failed" << dendl;
7838 }
7839
7840 r = hb_front_server_messenger->rebind(avoid_ports);
7841 if (r != 0) {
7842 do_shutdown = true; // FIXME: do_restart?
7843 network_error = true;
7844 dout(0) << __func__ << " marked down:"
7845 << " rebind hb_front_server_messenger failed" << dendl;
7846 }
7847
7848 hb_front_client_messenger->mark_down_all();
7849 hb_back_client_messenger->mark_down_all();
7850
7851 reset_heartbeat_peers();
7852 }
7853 }
7854 }
7855
7856 map_lock.put_write();
7857
7858 check_osdmap_features(store);
7859
7860 // yay!
7861 consume_map();
7862
7863 if (is_active() || is_waiting_for_healthy())
7864 maybe_update_heartbeat_peers();
7865
7866 if (!is_active()) {
7867 dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
7868 peering_wq.drain();
7869 } else {
7870 activate_map();
7871 }
7872
7873 if (do_shutdown) {
7874 if (network_error) {
7875 Mutex::Locker l(heartbeat_lock);
7876 map<int,pair<utime_t,entity_inst_t>>::iterator it =
7877 failure_pending.begin();
7878 while (it != failure_pending.end()) {
7879 dout(10) << __func__ << " canceling in-flight failure report for osd."
7880 << it->first << dendl;
7881 send_still_alive(osdmap->get_epoch(), it->second.second);
7882 failure_pending.erase(it++);
7883 }
7884 }
7885 // trigger shutdown in a different thread
7886 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
7887 queue_async_signal(SIGINT);
7888 }
7889 else if (m->newest_map && m->newest_map > last) {
7890 dout(10) << " msg says newest map is " << m->newest_map
7891 << ", requesting more" << dendl;
7892 osdmap_subscribe(osdmap->get_epoch()+1, false);
7893 }
7894 else if (is_preboot()) {
7895 if (m->get_source().is_mon())
7896 _preboot(m->oldest_map, m->newest_map);
7897 else
7898 start_boot();
7899 }
7900 else if (do_restart)
7901 start_boot();
7902
7903 }
7904
7905 void OSD::check_osdmap_features(ObjectStore *fs)
7906 {
7907 // adjust required feature bits?
7908
7909 // we have to be a bit careful here, because we are accessing the
7910 // Policy structures without taking any lock. in particular, only
7911 // modify integer values that can safely be read by a racing CPU.
7912 // since we are only accessing existing Policy structures at their
7913 // current memory location, and setting or clearing bits in integer
7914 // fields, and we are the only writer, this is not a problem.
7915
7916 {
7917 Messenger::Policy p = client_messenger->get_default_policy();
7918 uint64_t mask;
7919 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
7920 if ((p.features_required & mask) != features) {
7921 dout(0) << "crush map has features " << features
7922 << ", adjusting msgr requires for clients" << dendl;
7923 p.features_required = (p.features_required & ~mask) | features;
7924 client_messenger->set_default_policy(p);
7925 }
7926 }
7927 {
7928 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
7929 uint64_t mask;
7930 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
7931 if ((p.features_required & mask) != features) {
7932 dout(0) << "crush map has features " << features
7933 << " was " << p.features_required
7934 << ", adjusting msgr requires for mons" << dendl;
7935 p.features_required = (p.features_required & ~mask) | features;
7936 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
7937 }
7938 }
7939 {
7940 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
7941 uint64_t mask;
7942 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
7943
7944 if ((p.features_required & mask) != features) {
7945 dout(0) << "crush map has features " << features
7946 << ", adjusting msgr requires for osds" << dendl;
7947 p.features_required = (p.features_required & ~mask) | features;
7948 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
7949 }
7950
7951 if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
7952 !superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
7953 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
7954 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
7955 ObjectStore::Transaction t;
7956 write_superblock(t);
7957 int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
7958 assert(err == 0);
7959 }
7960 }
7961 }
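/*
 * The mask arithmetic above swaps in only the osdmap-governed feature
 * bits and leaves the rest of the policy intact. Worked example with
 * made-up 8-bit values:
 *
 *   features_required = 0b10110001
 *   mask              = 0b00001111   // bits the map may dictate
 *   features          = 0b00000110
 *   (features_required & ~mask) | features == 0b10110110
 */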
7962
7963 bool OSD::advance_pg(
7964 epoch_t osd_epoch, PG *pg,
7965 ThreadPool::TPHandle &handle,
7966 PG::RecoveryCtx *rctx,
7967 set<PGRef> *new_pgs)
7968 {
7969 assert(pg->is_locked());
7970 epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
7971 OSDMapRef lastmap = pg->get_osdmap();
7972
7973 if (lastmap->get_epoch() == osd_epoch)
7974 return true;
7975 assert(lastmap->get_epoch() < osd_epoch);
7976
7977 epoch_t min_epoch = service.get_min_pg_epoch();
7978 epoch_t max;
7979 if (min_epoch) {
7980 max = min_epoch + cct->_conf->osd_map_max_advance;
7981 } else {
7982 max = next_epoch + cct->_conf->osd_map_max_advance;
7983 }
7984
7985 for (;
7986 next_epoch <= osd_epoch && next_epoch <= max;
7987 ++next_epoch) {
7988 OSDMapRef nextmap = service.try_get_map(next_epoch);
7989 if (!nextmap) {
7990 dout(20) << __func__ << " missing map " << next_epoch << dendl;
7991 // make sure max is bumped up so that we can get past any
7992 // gap in maps
7993 max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
7994 continue;
7995 }
7996
7997 vector<int> newup, newacting;
7998 int up_primary, acting_primary;
7999 nextmap->pg_to_up_acting_osds(
8000 pg->info.pgid.pgid,
8001 &newup, &up_primary,
8002 &newacting, &acting_primary);
8003 pg->handle_advance_map(
8004 nextmap, lastmap, newup, up_primary,
8005 newacting, acting_primary, rctx);
8006
8007 // Check for split!
8008 set<spg_t> children;
8009 spg_t parent(pg->info.pgid);
8010 if (parent.is_split(
8011 lastmap->get_pg_num(pg->pool.id),
8012 nextmap->get_pg_num(pg->pool.id),
8013 &children)) {
8014 service.mark_split_in_progress(pg->info.pgid, children);
8015 split_pgs(
8016 pg, children, new_pgs, lastmap, nextmap,
8017 rctx);
8018 }
8019
8020 lastmap = nextmap;
8021 handle.reset_tp_timeout();
8022 }
8023 service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
8024 pg->handle_activate_map(rctx);
8025 if (next_epoch <= osd_epoch) {
8026 dout(10) << __func__ << " advanced to max " << max
8027 << " past min epoch " << min_epoch
8028 << " ... will requeue " << *pg << dendl;
8029 return false;
8030 }
8031 return true;
8032 }
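/*
 * Worked example of the clamping above: with the slowest PG at
 * min_epoch = 1000, osd_epoch = 1400 and osd_map_max_advance = 150 (an
 * arbitrary example value, not a claimed default), one pass stops at
 * epoch 1150 and returns false, so the caller requeues the PG rather
 * than letting one PG monopolize the thread pool handle.
 */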
8033
8034 void OSD::consume_map()
8035 {
8036 assert(osd_lock.is_locked());
8037 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8038
8039 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8040 list<PGRef> to_remove;
8041
8042 // scan pg's
8043 {
8044 RWLock::RLocker l(pg_map_lock);
8045 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8046 it != pg_map.end();
8047 ++it) {
8048 PG *pg = it->second;
8049 pg->lock();
8050 if (pg->is_primary())
8051 num_pg_primary++;
8052 else if (pg->is_replica())
8053 num_pg_replica++;
8054 else
8055 num_pg_stray++;
8056
8057 if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
8058 //pool is deleted!
8059 to_remove.push_back(PGRef(pg));
8060 } else {
8061 service.init_splits_between(it->first, service.get_osdmap(), osdmap);
8062 }
8063
8064 pg->unlock();
8065 }
8066 }
8067
8068 for (list<PGRef>::iterator i = to_remove.begin();
8069 i != to_remove.end();
8070 to_remove.erase(i++)) {
8071 RWLock::WLocker locker(pg_map_lock);
8072 (*i)->lock();
8073 _remove_pg(&**i);
8074 (*i)->unlock();
8075 }
8076
8077 service.expand_pg_num(service.get_osdmap(), osdmap);
8078
8079 service.pre_publish_map(osdmap);
8080 service.await_reserved_maps();
8081 service.publish_map(osdmap);
8082
8083 service.maybe_inject_dispatch_delay();
8084
8085 dispatch_sessions_waiting_on_map();
8086
8087 service.maybe_inject_dispatch_delay();
8088
8089 // remove any PGs which we no longer host from the session waiting_for_pg lists
8090 dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8091 op_shardedwq.prune_pg_waiters(osdmap, whoami);
8092
8093 service.maybe_inject_dispatch_delay();
8094
8095 // scan pg's
8096 {
8097 RWLock::RLocker l(pg_map_lock);
8098 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8099 it != pg_map.end();
8100 ++it) {
8101 PG *pg = it->second;
8102 pg->lock();
8103 pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8104 pg->unlock();
8105 }
8106
8107 logger->set(l_osd_pg, pg_map.size());
8108 }
8109 logger->set(l_osd_pg_primary, num_pg_primary);
8110 logger->set(l_osd_pg_replica, num_pg_replica);
8111 logger->set(l_osd_pg_stray, num_pg_stray);
8112 }
8113
8114 void OSD::activate_map()
8115 {
8116 assert(osd_lock.is_locked());
8117
8118 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8119
8120 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8121 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8122 ceph_abort();
8123 }
8124
8125 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8126 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8127 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8128 }
8129
8130 // norecover?
8131 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8132 if (!service.recovery_is_paused()) {
8133 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8134 service.pause_recovery();
8135 }
8136 } else {
8137 if (service.recovery_is_paused()) {
8138 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8139 service.unpause_recovery();
8140 }
8141 }
8142
8143 service.activate_map();
8144
8145 // process waiters
8146 take_waiters(waiting_for_osdmap);
8147 }
8148
8149 bool OSD::require_mon_peer(const Message *m)
8150 {
8151 if (!m->get_connection()->peer_is_mon()) {
8152 dout(0) << "require_mon_peer received from non-mon "
8153 << m->get_connection()->get_peer_addr()
8154 << " " << *m << dendl;
8155 return false;
8156 }
8157 return true;
8158 }
8159
8160 bool OSD::require_mon_or_mgr_peer(const Message *m)
8161 {
8162 if (!m->get_connection()->peer_is_mon() &&
8163 !m->get_connection()->peer_is_mgr()) {
8164 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8165 << m->get_connection()->get_peer_addr()
8166 << " " << *m << dendl;
8167 return false;
8168 }
8169 return true;
8170 }
8171
8172 bool OSD::require_osd_peer(const Message *m)
8173 {
8174 if (!m->get_connection()->peer_is_osd()) {
8175 dout(0) << "require_osd_peer received from non-osd "
8176 << m->get_connection()->get_peer_addr()
8177 << " " << *m << dendl;
8178 return false;
8179 }
8180 return true;
8181 }
8182
8183 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8184 {
8185 epoch_t up_epoch = service.get_up_epoch();
8186 if (epoch < up_epoch) {
8187 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8188 return false;
8189 }
8190
8191 if (!is_active()) {
8192 dout(7) << "still in boot state, dropping message " << *m << dendl;
8193 return false;
8194 }
8195
8196 return true;
8197 }
8198
8199 bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8200 bool is_fast_dispatch)
8201 {
8202 int from = m->get_source().num();
8203
8204 if (map->is_down(from) ||
8205 (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8206 dout(5) << "from dead osd." << from << ", marking down, "
8207 << " msg was " << m->get_source_inst().addr
8208 << " expected " << (map->is_up(from) ?
8209 map->get_cluster_addr(from) : entity_addr_t())
8210 << dendl;
8211 ConnectionRef con = m->get_connection();
8212 con->mark_down();
8213 Session *s = static_cast<Session*>(con->get_priv());
8214 if (s) {
8215 if (!is_fast_dispatch)
8216 s->session_dispatch_lock.Lock();
8217 clear_session_waiting_on_map(s);
8218 con->set_priv(NULL); // break ref <-> session cycle, if any
8219 if (!is_fast_dispatch)
8220 s->session_dispatch_lock.Unlock();
8221 s->put();
8222 }
8223 return false;
8224 }
8225 return true;
8226 }
8227
8228
8229 /*
8230 * require that we have same (or newer) map, and that
8231 * the source is the pg primary.
8232 */
8233 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8234 bool is_fast_dispatch)
8235 {
8236 const Message *m = op->get_req();
8237 dout(15) << "require_same_or_newer_map " << epoch
8238 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8239
8240 assert(osd_lock.is_locked());
8241
8242 // do they have a newer map?
8243 if (epoch > osdmap->get_epoch()) {
8244 dout(7) << "waiting for newer map epoch " << epoch
8245 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8246 wait_for_new_map(op);
8247 return false;
8248 }
8249
8250 if (!require_self_aliveness(op->get_req(), epoch)) {
8251 return false;
8252 }
8253
8254 // ok, our map is same or newer.. do they still exist?
8255 if (m->get_connection()->get_messenger() == cluster_messenger &&
8256 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8257 return false;
8258 }
8259
8260 return true;
8261 }
8262
8263
8264
8265
8266
8267 // ----------------------------------------
8268 // pg creation
8269
8270 void OSD::split_pgs(
8271 PG *parent,
8272 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
8273 OSDMapRef curmap,
8274 OSDMapRef nextmap,
8275 PG::RecoveryCtx *rctx)
8276 {
8277 unsigned pg_num = nextmap->get_pg_num(
8278 parent->pool.id);
8279 parent->update_snap_mapper_bits(
8280 parent->info.pgid.get_split_bits(pg_num)
8281 );
8282
8283 vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
8284 parent->info.stats.stats.sum.split(updated_stats);
8285
8286 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8287 for (set<spg_t>::const_iterator i = childpgids.begin();
8288 i != childpgids.end();
8289 ++i, ++stat_iter) {
8290 assert(stat_iter != updated_stats.end());
8291 dout(10) << "Splitting " << *parent << " into " << *i << dendl;
8292 assert(service.splitting(*i));
8293 PG* child = _make_pg(nextmap, *i);
8294 child->lock(true);
8295 out_pgs->insert(child);
8296 rctx->created_pgs.insert(child);
8297
8298 unsigned split_bits = i->get_split_bits(pg_num);
8299 dout(10) << "pg_num is " << pg_num << dendl;
8300 dout(10) << "m_seed " << i->ps() << dendl;
8301 dout(10) << "split_bits is " << split_bits << dendl;
8302
8303 parent->split_colls(
8304 *i,
8305 split_bits,
8306 i->ps(),
8307 &child->pool.info,
8308 rctx->transaction);
8309 parent->split_into(
8310 i->pgid,
8311 child,
8312 split_bits);
8313 child->info.stats.stats.sum = *stat_iter;
8314
8315 child->write_if_dirty(*(rctx->transaction));
8316 child->unlock();
8317 }
8318 assert(stat_iter != updated_stats.end());
8319 parent->info.stats.stats.sum = *stat_iter;
8320 parent->write_if_dirty(*(rctx->transaction));
8321 }
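/*
 * A worked doubling example for the split handled above: growing a pool
 * from pg_num 8 to 16 gives pg 3 one child with the new high bit set in
 * its seed (3 + 8 = 11); split_colls() then divides the parent's objects
 * between the two collections by re-hashing on the low split_bits of each
 * object's hash, and the stat sums are divided via split() as above.
 */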
8322
8323 /*
8324 * holding osd_lock
8325 */
8326 void OSD::handle_pg_create(OpRequestRef op)
8327 {
8328 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
8329 assert(m->get_type() == MSG_OSD_PG_CREATE);
8330
8331 dout(10) << "handle_pg_create " << *m << dendl;
8332
8333 if (!require_mon_peer(op->get_req())) {
8334 return;
8335 }
8336
8337 if (!require_same_or_newer_map(op, m->epoch, false))
8338 return;
8339
8340 op->mark_started();
8341
8342 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8343 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8344 p != m->mkpg.end();
8345 ++p, ++ci) {
8346 assert(ci != m->ctimes.end() && ci->first == p->first);
8347 epoch_t created = p->second.created;
8348 if (p->second.split_bits) // Skip split pgs
8349 continue;
8350 pg_t on = p->first;
8351
8352 if (on.preferred() >= 0) {
8353 dout(20) << "ignoring localized pg " << on << dendl;
8354 continue;
8355 }
8356
8357 if (!osdmap->have_pg_pool(on.pool())) {
8358 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8359 continue;
8360 }
8361
8362 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8363
8364 // is it still ours?
8365 vector<int> up, acting;
8366 int up_primary = -1;
8367 int acting_primary = -1;
8368 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
8369 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
8370
8371 if (acting_primary != whoami) {
8372 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8373 << "), my role=" << role << ", skipping" << dendl;
8374 continue;
8375 }
8376
8377 spg_t pgid;
8378 bool mapped = osdmap->get_primary_shard(on, &pgid);
8379 assert(mapped);
8380
8381 PastIntervals pi(
8382 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8383 *osdmap);
8384 pg_history_t history;
8385 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
8386
8387 // The mon won't resend unless the primary changed, so
8388 // we ignore same_interval_since. We'll pass this history
8389 // to handle_pg_peering_evt with the current epoch as the
8390 // event -- the project_pg_history check in
8391 // handle_pg_peering_evt will be a noop.
8392 if (history.same_primary_since > m->epoch) {
8393 dout(10) << __func__ << ": got obsolete pg create on pgid "
8394 << pgid << " from epoch " << m->epoch
8395 << ", primary changed in " << history.same_primary_since
8396 << dendl;
8397 continue;
8398 }
8399
8400 if (handle_pg_peering_evt(
8401 pgid,
8402 history,
8403 pi,
8404 osdmap->get_epoch(),
8405 PG::CephPeeringEvtRef(
8406 new PG::CephPeeringEvt(
8407 osdmap->get_epoch(),
8408 osdmap->get_epoch(),
8409 PG::NullEvt()))
8410 ) == -EEXIST) {
8411 service.send_pg_created(pgid.pgid);
8412 }
8413 }
8414 last_pg_create_epoch = m->epoch;
8415
8416 maybe_update_heartbeat_peers();
8417 }
8418
8419
8420 // ----------------------------------------
8421 // peering and recovery
8422
8423 PG::RecoveryCtx OSD::create_context()
8424 {
8425 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8426 C_Contexts *on_applied = new C_Contexts(cct);
8427 C_Contexts *on_safe = new C_Contexts(cct);
8428 map<int, map<spg_t,pg_query_t> > *query_map =
8429 new map<int, map<spg_t, pg_query_t> >;
8430 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8431 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8432 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8433 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8434 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8435 on_applied, on_safe, t);
8436 return rctx;
8437 }
8438
8439 struct C_OpenPGs : public Context {
8440 set<PGRef> pgs;
8441 ObjectStore *store;
8442 OSD *osd;
8443 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8444 pgs.swap(p);
8445 }
8446 void finish(int r) override {
8447 RWLock::RLocker l(osd->pg_map_lock);
8448 for (auto p : pgs) {
8449 if (osd->pg_map.count(p->info.pgid)) {
8450 p->ch = store->open_collection(p->coll);
8451 assert(p->ch);
8452 }
8453 }
8454 }
8455 };
8456
8457 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
8458 ThreadPool::TPHandle *handle)
8459 {
8460 if (!ctx.transaction->empty()) {
8461 if (!ctx.created_pgs.empty()) {
8462 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8463 }
8464 int tr = store->queue_transaction(
8465 pg->osr.get(),
8466 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
8467 TrackedOpRef(), handle);
8468 delete (ctx.transaction);
8469 assert(tr == 0);
8470 ctx.transaction = new ObjectStore::Transaction;
8471 ctx.on_applied = new C_Contexts(cct);
8472 ctx.on_safe = new C_Contexts(cct);
8473 }
8474 }
8475
8476 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
8477 ThreadPool::TPHandle *handle)
8478 {
8479 if (service.get_osdmap()->is_up(whoami) &&
8480 is_active()) {
8481 do_notifies(*ctx.notify_list, curmap);
8482 do_queries(*ctx.query_map, curmap);
8483 do_infos(*ctx.info_map, curmap);
8484 }
8485 delete ctx.notify_list;
8486 delete ctx.query_map;
8487 delete ctx.info_map;
8488 if ((ctx.on_applied->empty() &&
8489 ctx.on_safe->empty() &&
8490 ctx.transaction->empty() &&
8491 ctx.created_pgs.empty()) || !pg) {
8492 delete ctx.transaction;
8493 delete ctx.on_applied;
8494 delete ctx.on_safe;
8495 assert(ctx.created_pgs.empty());
8496 } else {
8497 if (!ctx.created_pgs.empty()) {
8498 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8499 }
8500 int tr = store->queue_transaction(
8501 pg->osr.get(),
8502 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
8503 handle);
8504 delete (ctx.transaction);
8505 assert(tr == 0);
8506 }
8507 }
8508
8509 /** do_notifies
8510 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8511 * content for, and they are primary for.
8512 */
8513
8514 void OSD::do_notifies(
8515 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8516 OSDMapRef curmap)
8517 {
8518 for (map<int,
8519 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8520 notify_list.begin();
8521 it != notify_list.end();
8522 ++it) {
8523 if (!curmap->is_up(it->first)) {
8524 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8525 continue;
8526 }
8527 ConnectionRef con = service.get_con_osd_cluster(
8528 it->first, curmap->get_epoch());
8529 if (!con) {
8530 dout(20) << __func__ << " skipping osd." << it->first
8531 << " (NULL con)" << dendl;
8532 continue;
8533 }
8534 service.share_map_peer(it->first, con.get(), curmap);
8535 dout(7) << __func__ << " osd " << it->first
8536 << " on " << it->second.size() << " PGs" << dendl;
8537 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8538 it->second);
8539 con->send_message(m);
8540 }
8541 }
8542
8543
8544 /** do_queries
8545 * send out pending queries for info | summaries
8546 */
8547 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8548 OSDMapRef curmap)
8549 {
8550 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8551 pit != query_map.end();
8552 ++pit) {
8553 if (!curmap->is_up(pit->first)) {
8554 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8555 continue;
8556 }
8557 int who = pit->first;
8558 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8559 if (!con) {
8560 dout(20) << __func__ << " skipping osd." << who
8561 << " (NULL con)" << dendl;
8562 continue;
8563 }
8564 service.share_map_peer(who, con.get(), curmap);
8565 dout(7) << __func__ << " querying osd." << who
8566 << " on " << pit->second.size() << " PGs" << dendl;
8567 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8568 con->send_message(m);
8569 }
8570 }
8571
8572
8573 void OSD::do_infos(map<int,
8574 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8575 OSDMapRef curmap)
8576 {
8577 for (map<int,
8578 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
8579 info_map.begin();
8580 p != info_map.end();
8581 ++p) {
8582 if (!curmap->is_up(p->first)) {
8583 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
8584 continue;
8585 }
8586 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
8587 i != p->second.end();
8588 ++i) {
8589 dout(20) << __func__ << " sending info " << i->first.info
8590 << " to shard " << p->first << dendl;
8591 }
8592 ConnectionRef con = service.get_con_osd_cluster(
8593 p->first, curmap->get_epoch());
8594 if (!con) {
8595 dout(20) << __func__ << " skipping osd." << p->first
8596 << " (NULL con)" << dendl;
8597 continue;
8598 }
8599 service.share_map_peer(p->first, con.get(), curmap);
8600 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
8601 m->pg_list = p->second;
8602 con->send_message(m);
8603 }
8604 info_map.clear();
8605 }
8606
8607
8608 /** PGNotify
8609 * from non-primary to primary
8610 * includes pg_info_t.
8611 * NOTE: called with opqueue active.
8612 */
8613 void OSD::handle_pg_notify(OpRequestRef op)
8614 {
8615 const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
8616 assert(m->get_type() == MSG_OSD_PG_NOTIFY);
8617
8618 dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
8619 int from = m->get_source().num();
8620
8621 if (!require_osd_peer(op->get_req()))
8622 return;
8623
8624 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8625 return;
8626
8627 op->mark_started();
8628
8629 for (auto it = m->get_pg_list().begin();
8630 it != m->get_pg_list().end();
8631 ++it) {
8632 if (it->first.info.pgid.preferred() >= 0) {
8633 dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
8634 continue;
8635 }
8636
8637 handle_pg_peering_evt(
8638 spg_t(it->first.info.pgid.pgid, it->first.to),
8639 it->first.info.history, it->second,
8640 it->first.query_epoch,
8641 PG::CephPeeringEvtRef(
8642 new PG::CephPeeringEvt(
8643 it->first.epoch_sent, it->first.query_epoch,
8644 PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
8645 op->get_req()->get_connection()->get_features())))
8646 );
8647 }
8648 }
8649
8650 void OSD::handle_pg_log(OpRequestRef op)
8651 {
8652 MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
8653 assert(m->get_type() == MSG_OSD_PG_LOG);
8654 dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
8655
8656 if (!require_osd_peer(op->get_req()))
8657 return;
8658
8659 int from = m->get_source().num();
8660 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8661 return;
8662
8663 if (m->info.pgid.preferred() >= 0) {
8664 dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
8665 return;
8666 }
8667
8668 op->mark_started();
8669 handle_pg_peering_evt(
8670 spg_t(m->info.pgid.pgid, m->to),
8671 m->info.history, m->past_intervals, m->get_epoch(),
8672 PG::CephPeeringEvtRef(
8673 new PG::CephPeeringEvt(
8674 m->get_epoch(), m->get_query_epoch(),
8675 PG::MLogRec(pg_shard_t(from, m->from), m)))
8676 );
8677 }
8678
8679 void OSD::handle_pg_info(OpRequestRef op)
8680 {
8681 const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
8682 assert(m->get_type() == MSG_OSD_PG_INFO);
8683 dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
8684
8685 if (!require_osd_peer(op->get_req()))
8686 return;
8687
8688 int from = m->get_source().num();
8689 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8690 return;
8691
8692 op->mark_started();
8693
8694 for (auto p = m->pg_list.begin();
8695 p != m->pg_list.end();
8696 ++p) {
8697 if (p->first.info.pgid.preferred() >= 0) {
8698 dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
8699 continue;
8700 }
8701
8702 handle_pg_peering_evt(
8703 spg_t(p->first.info.pgid.pgid, p->first.to),
8704 p->first.info.history, p->second, p->first.epoch_sent,
8705 PG::CephPeeringEvtRef(
8706 new PG::CephPeeringEvt(
8707 p->first.epoch_sent, p->first.query_epoch,
8708 PG::MInfoRec(
8709 pg_shard_t(
8710 from, p->first.from), p->first.info, p->first.epoch_sent)))
8711 );
8712 }
8713 }
8714
8715 void OSD::handle_pg_trim(OpRequestRef op)
8716 {
8717 const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
8718 assert(m->get_type() == MSG_OSD_PG_TRIM);
8719
8720 dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
8721
8722 if (!require_osd_peer(op->get_req()))
8723 return;
8724
8725 int from = m->get_source().num();
8726 if (!require_same_or_newer_map(op, m->epoch, false))
8727 return;
8728
8729 if (m->pgid.preferred() >= 0) {
8730 dout(10) << "ignoring localized pg " << m->pgid << dendl;
8731 return;
8732 }
8733
8734 op->mark_started();
8735
8736 PG *pg = _lookup_lock_pg(m->pgid);
8737 if (!pg) {
8738 dout(10) << " don't have pg " << m->pgid << dendl;
8739 return;
8740 }
8741
8742 if (m->epoch < pg->info.history.same_interval_since) {
8743 dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
8744 pg->unlock();
8745 return;
8746 }
8747
8748 if (pg->is_primary()) {
8749 // peer is informing us of their last_complete_ondisk
8750 dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
8751 pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
8752 m->trim_to;
8753 // trim log when the pg is recovered
8754 pg->calc_min_last_complete_ondisk();
8755 } else {
8756 // primary is instructing us to trim
8757 ObjectStore::Transaction t;
8758 pg->pg_log.trim(m->trim_to, pg->info);
8759 pg->dirty_info = true;
8760 pg->write_if_dirty(t);
8761 int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
8762 assert(tr == 0);
8763 }
8764 pg->unlock();
8765 }
8766
8767 void OSD::handle_pg_backfill_reserve(OpRequestRef op)
8768 {
8769 const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
8770 assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);
8771
8772 if (!require_osd_peer(op->get_req()))
8773 return;
8774 if (!require_same_or_newer_map(op, m->query_epoch, false))
8775 return;
8776
8777 PG::CephPeeringEvtRef evt;
8778 if (m->type == MBackfillReserve::REQUEST) {
8779 evt = PG::CephPeeringEvtRef(
8780 new PG::CephPeeringEvt(
8781 m->query_epoch,
8782 m->query_epoch,
8783 PG::RequestBackfillPrio(m->priority)));
8784 } else if (m->type == MBackfillReserve::GRANT) {
8785 evt = PG::CephPeeringEvtRef(
8786 new PG::CephPeeringEvt(
8787 m->query_epoch,
8788 m->query_epoch,
8789 PG::RemoteBackfillReserved()));
8790 } else if (m->type == MBackfillReserve::REJECT) {
8791 evt = PG::CephPeeringEvtRef(
8792 new PG::CephPeeringEvt(
8793 m->query_epoch,
8794 m->query_epoch,
8795 PG::RemoteReservationRejected()));
8796 } else {
8797 ceph_abort();
8798 }
8799
8800 if (service.splitting(m->pgid)) {
8801 peering_wait_for_split[m->pgid].push_back(evt);
8802 return;
8803 }
8804
8805 PG *pg = _lookup_lock_pg(m->pgid);
8806 if (!pg) {
8807 dout(10) << " don't have pg " << m->pgid << dendl;
8808 return;
8809 }
8810
8811 pg->queue_peering_event(evt);
8812 pg->unlock();
8813 }
8814
8815 void OSD::handle_pg_recovery_reserve(OpRequestRef op)
8816 {
8817 const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
8818 assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);
8819
8820 if (!require_osd_peer(op->get_req()))
8821 return;
8822 if (!require_same_or_newer_map(op, m->query_epoch, false))
8823 return;
8824
8825 PG::CephPeeringEvtRef evt;
8826 if (m->type == MRecoveryReserve::REQUEST) {
8827 evt = PG::CephPeeringEvtRef(
8828 new PG::CephPeeringEvt(
8829 m->query_epoch,
8830 m->query_epoch,
8831 PG::RequestRecovery()));
8832 } else if (m->type == MRecoveryReserve::GRANT) {
8833 evt = PG::CephPeeringEvtRef(
8834 new PG::CephPeeringEvt(
8835 m->query_epoch,
8836 m->query_epoch,
8837 PG::RemoteRecoveryReserved()));
8838 } else if (m->type == MRecoveryReserve::RELEASE) {
8839 evt = PG::CephPeeringEvtRef(
8840 new PG::CephPeeringEvt(
8841 m->query_epoch,
8842 m->query_epoch,
8843 PG::RecoveryDone()));
8844 } else {
8845 ceph_abort();
8846 }
8847
8848 if (service.splitting(m->pgid)) {
8849 peering_wait_for_split[m->pgid].push_back(evt);
8850 return;
8851 }
8852
8853 PG *pg = _lookup_lock_pg(m->pgid);
8854 if (!pg) {
8855 dout(10) << " don't have pg " << m->pgid << dendl;
8856 return;
8857 }
8858
8859 pg->queue_peering_event(evt);
8860 pg->unlock();
8861 }
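// [Editor's note] Both reserve handlers above follow the same shape: map the
// wire message type onto a PG state-machine event, park the event in
// peering_wait_for_split if the pg is mid-split, and otherwise deliver it
// under the pg lock via queue_peering_event().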
8862
8863
8864 /** PGQuery
8865 * from primary to replica | stray
8866 * NOTE: called with opqueue active.
8867 */
8868 void OSD::handle_pg_query(OpRequestRef op)
8869 {
8870 assert(osd_lock.is_locked());
8871
8872 const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
8873 assert(m->get_type() == MSG_OSD_PG_QUERY);
8874
8875 if (!require_osd_peer(op->get_req()))
8876 return;
8877
8878 dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
8879 int from = m->get_source().num();
8880
8881 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8882 return;
8883
8884 op->mark_started();
8885
8886 map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
8887
8888 for (auto it = m->pg_list.begin();
8889 it != m->pg_list.end();
8890 ++it) {
8891 spg_t pgid = it->first;
8892
8893 if (pgid.preferred() >= 0) {
8894 dout(10) << "ignoring localized pg " << pgid << dendl;
8895 continue;
8896 }
8897
8898 if (service.splitting(pgid)) {
8899 peering_wait_for_split[pgid].push_back(
8900 PG::CephPeeringEvtRef(
8901 new PG::CephPeeringEvt(
8902 it->second.epoch_sent, it->second.epoch_sent,
8903 PG::MQuery(pg_shard_t(from, it->second.from),
8904 it->second, it->second.epoch_sent))));
8905 continue;
8906 }
8907
8908 {
8909 RWLock::RLocker l(pg_map_lock);
8910 if (pg_map.count(pgid)) {
8911 PG *pg = 0;
8912 pg = _lookup_lock_pg_with_map_lock_held(pgid);
8913 pg->queue_query(
8914 it->second.epoch_sent, it->second.epoch_sent,
8915 pg_shard_t(from, it->second.from), it->second);
8916 pg->unlock();
8917 continue;
8918 }
8919 }
8920
8921 if (!osdmap->have_pg_pool(pgid.pool()))
8922 continue;
8923
8924 // get active crush mapping
8925 int up_primary, acting_primary;
8926 vector<int> up, acting;
8927 osdmap->pg_to_up_acting_osds(
8928 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
8929
8930 // same primary?
8931 pg_history_t history = it->second.history;
8932 bool valid_history = project_pg_history(
8933 pgid, history, it->second.epoch_sent,
8934 up, up_primary, acting, acting_primary);
8935
8936 if (!valid_history ||
8937 it->second.epoch_sent < history.same_interval_since) {
8938 dout(10) << " pg " << pgid << " dne, and pg has changed in "
8939 << history.same_interval_since
8940 << " (msg from " << it->second.epoch_sent << ")" << dendl;
8941 continue;
8942 }
8943
8944 dout(10) << " pg " << pgid << " dne" << dendl;
8945 pg_info_t empty(spg_t(pgid.pgid, it->second.to));
8946 /* This is racy, but that should be ok: if we complete the deletion
8947 * before the pg is recreated, we'll just start it off backfilling
8948 * instead of just empty */
8949 if (service.deleting_pgs.lookup(pgid))
8950 empty.set_last_backfill(hobject_t());
8951 if (it->second.type == pg_query_t::LOG ||
8952 it->second.type == pg_query_t::FULLLOG) {
8953 ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
8954 if (con) {
8955 MOSDPGLog *mlog = new MOSDPGLog(
8956 it->second.from, it->second.to,
8957 osdmap->get_epoch(), empty,
8958 it->second.epoch_sent);
8959 service.share_map_peer(from, con.get(), osdmap);
8960 con->send_message(mlog);
8961 }
8962 } else {
8963 notify_list[from].push_back(
8964 make_pair(
8965 pg_notify_t(
8966 it->second.from, it->second.to,
8967 it->second.epoch_sent,
8968 osdmap->get_epoch(),
8969 empty),
8970 PastIntervals(
8971 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8972 *osdmap)));
8973 }
8974 }
8975 do_notifies(notify_list, osdmap);
8976 }
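// [Editor's note] Summary of the "pg does not exist" replies above: LOG and
// FULLLOG queries get an immediate empty MOSDPGLog back on the cluster
// connection, while every other query type is batched into notify_list and
// answered through a single do_notifies() call at the end.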
8977
8978
8979 void OSD::handle_pg_remove(OpRequestRef op)
8980 {
8981 const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
8982 assert(m->get_type() == MSG_OSD_PG_REMOVE);
8983 assert(osd_lock.is_locked());
8984
8985 if (!require_osd_peer(op->get_req()))
8986 return;
8987
8988 dout(7) << "handle_pg_remove from " << m->get_source() << " on "
8989 << m->pg_list.size() << " pgs" << dendl;
8990
8991 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8992 return;
8993
8994 op->mark_started();
8995
8996 for (auto it = m->pg_list.begin();
8997 it != m->pg_list.end();
8998 ++it) {
8999 spg_t pgid = *it;
9000 if (pgid.preferred() >= 0) {
9001 dout(10) << "ignoring localized pg " << pgid << dendl;
9002 continue;
9003 }
9004
9005 RWLock::WLocker l(pg_map_lock);
9006 if (pg_map.count(pgid) == 0) {
9007 dout(10) << " don't have pg " << pgid << dendl;
9008 continue;
9009 }
9010 dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
9011 PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
9012 pg_history_t history = pg->info.history;
9013 int up_primary, acting_primary;
9014 vector<int> up, acting;
9015 osdmap->pg_to_up_acting_osds(
9016 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9017 bool valid_history = project_pg_history(
9018 pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
9019 up, up_primary, acting, acting_primary);
9020 if (valid_history &&
9021 history.same_interval_since <= m->get_epoch()) {
9022 assert(pg->get_primary().osd == m->get_source().num());
9023 PGRef _pg(pg);
9024 _remove_pg(pg);
9025 pg->unlock();
9026 } else {
9027 dout(10) << *pg << " ignoring remove request, pg changed in epoch "
9028 << history.same_interval_since
9029 << " > " << m->get_epoch() << dendl;
9030 pg->unlock();
9031 }
9032 }
9033 }
9034
9035 void OSD::_remove_pg(PG *pg)
9036 {
9037 ObjectStore::Transaction rmt;
9038
9039 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
9040 // the pg_map must be done together without unlocking the pg lock,
9041 // to avoid racing with watcher cleanup in ms_handle_reset
9042 // and handle_notify_timeout
9043 pg->on_removal(&rmt);
9044
9045 service.cancel_pending_splits_for_parent(pg->info.pgid);
9046 int tr = store->queue_transaction(
9047 pg->osr.get(), std::move(rmt), NULL,
9048 new ContainerContext<
9049 SequencerRef>(pg->osr));
9050 assert(tr == 0);
9051
9052 DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
9053 pg->info.pgid,
9054 make_pair(
9055 pg->info.pgid,
9056 PGRef(pg))
9057 );
9058 remove_wq.queue(make_pair(PGRef(pg), deleting));
9059
9060 service.pg_remove_epoch(pg->info.pgid);
9061
9062 // dereference from op_wq
9063 op_shardedwq.clear_pg_pointer(pg->info.pgid);
9064
9065 // remove from map
9066 pg_map.erase(pg->info.pgid);
9067 pg->put("PGMap"); // since we've taken it out of map
9068 }
9069
9070
9071 // =========================================================
9072 // RECOVERY
9073
9074 void OSDService::_maybe_queue_recovery() {
9075 assert(recovery_lock.is_locked_by_me());
9076 uint64_t available_pushes;
9077 while (!awaiting_throttle.empty() &&
9078 _recover_now(&available_pushes)) {
9079 uint64_t to_start = MIN(
9080 available_pushes,
9081 cct->_conf->osd_recovery_max_single_start);
9082 _queue_for_recovery(awaiting_throttle.front(), to_start);
9083 awaiting_throttle.pop_front();
9084 recovery_ops_reserved += to_start;
9085 }
9086 }
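// [Editor's note] Worked example of the clamp above, assuming
// osd_recovery_max_single_start = 1 and _recover_now() reports 3 available
// pushes: to_start = MIN(3, 1) = 1, so the front PG is queued with one
// reserved push, recovery_ops_reserved grows by 1, and the loop repeats
// while the throttle still has headroom.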
9087
9088 bool OSDService::_recover_now(uint64_t *available_pushes)
9089 {
9090 if (available_pushes)
9091 *available_pushes = 0;
9092
9093 if (ceph_clock_now() < defer_recovery_until) {
9094 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9095 return false;
9096 }
9097
9098 if (recovery_paused) {
9099 dout(15) << __func__ << " paused" << dendl;
9100 return false;
9101 }
9102
9103 uint64_t max = cct->_conf->osd_recovery_max_active;
9104 if (max <= recovery_ops_active + recovery_ops_reserved) {
9105 dout(15) << __func__ << " active " << recovery_ops_active
9106 << " + reserved " << recovery_ops_reserved
9107 << " >= max " << max << dendl;
9108 return false;
9109 }
9110
9111 if (available_pushes)
9112 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9113
9114 return true;
9115 }
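// [Editor's note] Worked example of the budget above, assuming
// osd_recovery_max_active = 3, recovery_ops_active = 1 and
// recovery_ops_reserved = 1: 3 > 1 + 1, so recovery may proceed and
// *available_pushes = 3 - 1 - 1 = 1.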
9116
9117 void OSD::do_recovery(
9118 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9119 ThreadPool::TPHandle &handle)
9120 {
9121 uint64_t started = 0;
9122
9123 /*
9124 * When osd_recovery_sleep is greater than zero, recovery ops are
9125 * scheduled osd_recovery_sleep seconds after the previous recovery
9126 * event's schedule time. This is done by adding a
9127 * recovery_requeue_callback event, which re-queues the recovery op using
9128 * queue_recovery_after_sleep.
9129 */
9130 if (cct->_conf->osd_recovery_sleep > 0 && service.recovery_needs_sleep) {
9131 PGRef pgref(pg);
9132 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9133 dout(20) << "do_recovery wake up at "
9134 << ceph_clock_now()
9135 << ", re-queuing recovery" << dendl;
9136 service.recovery_needs_sleep = false;
9137 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9138 });
9139 Mutex::Locker l(service.recovery_sleep_lock);
9140
9141 // This is true for the first recovery op, and when the previous recovery op
9142 // was scheduled in the past. In either case the next recovery op is
9143 // scheduled to run after completing the sleep interval from now.
9144 if (service.recovery_schedule_time < ceph_clock_now()) {
9145 service.recovery_schedule_time = ceph_clock_now();
9146 }
9147 service.recovery_schedule_time += cct->_conf->osd_recovery_sleep;
9148 service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
9149 recovery_requeue_callback);
9150 dout(20) << "Recovery event scheduled at "
9151 << service.recovery_schedule_time << dendl;
9152 return;
9153 }
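// [Editor's note] Worked example of the scheduling above, assuming
// osd_recovery_sleep = 0.1s: if the previous event was scheduled in the
// past, the next one fires at now + 0.1s; if it is still pending at some
// future time T, the next one fires at T + 0.1s, keeping successive
// recovery ops at least 0.1s apart.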
9154
9155 {
9156 service.recovery_needs_sleep = true;
9157 if (pg->pg_has_reset_since(queued)) {
9158 goto out;
9159 }
9160
9161 assert(!pg->deleting);
9162 assert(pg->is_peered() && pg->is_primary());
9163
9164 assert(pg->recovery_queued);
9165 pg->recovery_queued = false;
9166
9167 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9168 #ifdef DEBUG_RECOVERY_OIDS
9169 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
9170 #endif
9171
9172 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
9173 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9174 << " on " << *pg << dendl;
9175
9176 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
9177 if (!started && (more || !pg->have_unfound())) {
9178 goto out;
9179 }
9180
9181 PG::RecoveryCtx rctx = create_context();
9182 rctx.handle = &handle;
9183
9184 /*
9185 * if we couldn't start any recovery ops and things are still
9186 * unfound, see if we can discover more missing object locations.
9187 * It may be that our initial locations were bad and we errored
9188 * out while trying to pull.
9189 */
9190 if (!more && pg->have_unfound()) {
9191 pg->discover_all_missing(*rctx.query_map);
9192 if (rctx.query_map->empty()) {
9193 string action;
9194 if (pg->state_test(PG_STATE_BACKFILL)) {
9195 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9196 queued,
9197 queued,
9198 PG::CancelBackfill()));
9199 pg->queue_peering_event(evt);
9200 action = "in backfill";
9201 } else if (pg->state_test(PG_STATE_RECOVERING)) {
9202 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9203 queued,
9204 queued,
9205 PG::CancelRecovery()));
9206 pg->queue_peering_event(evt);
9207 action = "in recovery";
9208 } else {
9209 action = "already out of recovery/backfill";
9210 }
9211 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
9212 } else {
9213 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
9214 pg->queue_recovery();
9215 }
9216 }
9217
9218 pg->write_if_dirty(*rctx.transaction);
9219 OSDMapRef curmap = pg->get_osdmap();
9220 dispatch_context(rctx, pg, curmap);
9221 }
9222
9223 out:
9224 assert(started <= reserved_pushes);
9225 service.release_reserved_pushes(reserved_pushes);
9226 }
9227
9228 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9229 {
9230 Mutex::Locker l(recovery_lock);
9231 dout(10) << "start_recovery_op " << *pg << " " << soid
9232 << " (" << recovery_ops_active << "/"
9233 << cct->_conf->osd_recovery_max_active << " rops)"
9234 << dendl;
9235 recovery_ops_active++;
9236
9237 #ifdef DEBUG_RECOVERY_OIDS
9238 dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
9239 assert(recovery_oids[pg->info.pgid].count(soid) == 0);
9240 recovery_oids[pg->info.pgid].insert(soid);
9241 #endif
9242 }
9243
9244 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9245 {
9246 Mutex::Locker l(recovery_lock);
9247 dout(10) << "finish_recovery_op " << *pg << " " << soid
9248 << " dequeue=" << dequeue
9249 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9250 << dendl;
9251
9252 // adjust count
9253 assert(recovery_ops_active > 0);
9254 recovery_ops_active--;
9255
9256 #ifdef DEBUG_RECOVERY_OIDS
9257 dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
9258 assert(recovery_oids[pg->info.pgid].count(soid));
9259 recovery_oids[pg->info.pgid].erase(soid);
9260 #endif
9261
9262 _maybe_queue_recovery();
9263 }
9264
9265 bool OSDService::is_recovery_active()
9266 {
9267 Mutex::Locker l(recovery_lock);
9268 return recovery_ops_active > 0;
9269 }
9270
9271 // =========================================================
9272 // OPS
9273
9274 bool OSD::op_is_discardable(const MOSDOp *op)
9275 {
9276 // drop the client request if the client is no longer connected and
9277 // can't receive the reply anyway.
9278 if (!op->get_connection()->is_connected()) {
9279 return true;
9280 }
9281 return false;
9282 }
9283
9284 void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
9285 {
9286 utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
9287 dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
9288 << " cost " << op->get_req()->get_cost()
9289 << " latency " << latency
9290 << " epoch " << epoch
9291 << " " << *(op->get_req()) << dendl;
9292 op->osd_trace.event("enqueue op");
9293 op->osd_trace.keyval("priority", op->get_req()->get_priority());
9294 op->osd_trace.keyval("cost", op->get_req()->get_cost());
9295 op->mark_queued_for_pg();
9296 logger->tinc(l_osd_op_before_queue_op_lat, latency);
9297 op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
9298 }
9299
9300
9301
9302 /*
9303 * NOTE: dequeue called in worker thread, with pg lock
9304 */
9305 void OSD::dequeue_op(
9306 PGRef pg, OpRequestRef op,
9307 ThreadPool::TPHandle &handle)
9308 {
9309 FUNCTRACE();
9310 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9311
9312 utime_t now = ceph_clock_now();
9313 op->set_dequeued_time(now);
9314 utime_t latency = now - op->get_req()->get_recv_stamp();
9315 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9316 << " cost " << op->get_req()->get_cost()
9317 << " latency " << latency
9318 << " " << *(op->get_req())
9319 << " pg " << *pg << dendl;
9320
9321 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9322
9323 Session *session = static_cast<Session *>(
9324 op->get_req()->get_connection()->get_priv());
9325 if (session) {
9326 maybe_share_map(session, op, pg->get_osdmap());
9327 session->put();
9328 }
9329
9330 if (pg->deleting)
9331 return;
9332
9333 op->mark_reached_pg();
9334 op->osd_trace.event("dequeue_op");
9335
9336 pg->do_request(op, handle);
9337
9338 // finish
9339 dout(10) << "dequeue_op " << op << " finish" << dendl;
9340 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
9341 }
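// [Editor's note] get_priv() above returns a referenced Session*, so the
// paired session->put() is what drops that reference once maybe_share_map()
// has run; skipping it would leak the session for every dequeued op.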
9342
9343
9344 struct C_CompleteSplits : public Context {
9345 OSD *osd;
9346 set<PGRef> pgs;
9347 C_CompleteSplits(OSD *osd, const set<PGRef> &in)
9348 : osd(osd), pgs(in) {}
9349 void finish(int r) override {
9350 Mutex::Locker l(osd->osd_lock);
9351 if (osd->is_stopping())
9352 return;
9353 PG::RecoveryCtx rctx = osd->create_context();
9354 for (set<PGRef>::iterator i = pgs.begin();
9355 i != pgs.end();
9356 ++i) {
9357 osd->pg_map_lock.get_write();
9358 (*i)->lock();
9359 PG *pg = i->get();
9360 osd->add_newly_split_pg(pg, &rctx);
9361 if (!((*i)->deleting)) {
9362 set<spg_t> to_complete;
9363 to_complete.insert((*i)->info.pgid);
9364 osd->service.complete_split(to_complete);
9365 }
9366 osd->pg_map_lock.put_write();
9367 osd->dispatch_context_transaction(rctx, pg);
9368 osd->wake_pg_waiters(*i);
9369 (*i)->unlock();
9370 }
9371
9372 osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
9373 }
9374 };
9375
9376 void OSD::process_peering_events(
9377 const list<PG*> &pgs,
9378 ThreadPool::TPHandle &handle
9379 )
9380 {
9381 bool need_up_thru = false;
9382 epoch_t same_interval_since = 0;
9383 OSDMapRef curmap;
9384 PG::RecoveryCtx rctx = create_context();
9385 rctx.handle = &handle;
9386 for (list<PG*>::const_iterator i = pgs.begin();
9387 i != pgs.end();
9388 ++i) {
9389 set<PGRef> split_pgs;
9390 PG *pg = *i;
9391 pg->lock_suspend_timeout(handle);
9392 curmap = service.get_osdmap();
9393 if (pg->deleting) {
9394 pg->unlock();
9395 continue;
9396 }
9397 if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
9398 // we need to requeue the PG explicitly since we didn't actually
9399 // handle an event
9400 peering_wq.queue(pg);
9401 } else {
9402 assert(!pg->peering_queue.empty());
9403 PG::CephPeeringEvtRef evt = pg->peering_queue.front();
9404 pg->peering_queue.pop_front();
9405 pg->handle_peering_event(evt, &rctx);
9406 }
9407 need_up_thru = pg->need_up_thru || need_up_thru;
9408 same_interval_since = MAX(pg->info.history.same_interval_since,
9409 same_interval_since);
9410 pg->write_if_dirty(*rctx.transaction);
9411 if (!split_pgs.empty()) {
9412 rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
9413 split_pgs.clear();
9414 }
9415 dispatch_context_transaction(rctx, pg, &handle);
9416 pg->unlock();
9417 }
9418 if (need_up_thru)
9419 queue_want_up_thru(same_interval_since);
9420 dispatch_context(rctx, 0, curmap, &handle);
9421
9422 service.send_pg_temp();
9423 }
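// [Editor's note] need_up_thru and same_interval_since are deliberately
// aggregated across the whole batch above so that one queue_want_up_thru()
// call covers the newest interval seen, instead of one monitor request per
// PG.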
9424
9425 // --------------------------------
9426
9427 const char** OSD::get_tracked_conf_keys() const
9428 {
9429 static const char* KEYS[] = {
9430 "osd_max_backfills",
9431 "osd_min_recovery_priority",
9432 "osd_max_trimming_pgs",
9433 "osd_op_complaint_time",
9434 "osd_op_log_threshold",
9435 "osd_op_history_size",
9436 "osd_op_history_duration",
9437 "osd_op_history_slow_op_size",
9438 "osd_op_history_slow_op_threshold",
9439 "osd_enable_op_tracker",
9440 "osd_map_cache_size",
9441 "osd_map_max_advance",
9442 "osd_pg_epoch_persisted_max_stale",
9443 "osd_disk_thread_ioprio_class",
9444 "osd_disk_thread_ioprio_priority",
9445 // clog & admin clog
9446 "clog_to_monitors",
9447 "clog_to_syslog",
9448 "clog_to_syslog_facility",
9449 "clog_to_syslog_level",
9450 "osd_objectstore_fuse",
9451 "clog_to_graylog",
9452 "clog_to_graylog_host",
9453 "clog_to_graylog_port",
9454 "host",
9455 "fsid",
9456 "osd_recovery_delay_start",
9457 "osd_client_message_size_cap",
9458 "osd_client_message_cap",
9459 "osd_heartbeat_min_size",
9460 "osd_heartbeat_interval",
9461 NULL
9462 };
9463 return KEYS;
9464 }
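// [Editor's note] Every key listed above is re-applied at runtime through
// handle_conf_change() below; a typical luminous-era way to exercise that
// path (hedged example, not defined in this file):
//
//   ceph tell osd.0 injectargs '--osd-max-backfills 2'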
9465
9466 void OSD::handle_conf_change(const struct md_config_t *conf,
9467 const std::set <std::string> &changed)
9468 {
9469 if (changed.count("osd_max_backfills")) {
9470 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9471 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9472 }
9473 if (changed.count("osd_min_recovery_priority")) {
9474 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9475 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9476 }
9477 if (changed.count("osd_max_trimming_pgs")) {
9478 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9479 }
9480 if (changed.count("osd_op_complaint_time") ||
9481 changed.count("osd_op_log_threshold")) {
9482 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9483 cct->_conf->osd_op_log_threshold);
9484 }
9485 if (changed.count("osd_op_history_size") ||
9486 changed.count("osd_op_history_duration")) {
9487 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9488 cct->_conf->osd_op_history_duration);
9489 }
9490 if (changed.count("osd_op_history_slow_op_size") ||
9491 changed.count("osd_op_history_slow_op_threshold")) {
9492 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9493 cct->_conf->osd_op_history_slow_op_threshold);
9494 }
9495 if (changed.count("osd_enable_op_tracker")) {
9496 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9497 }
9498 if (changed.count("osd_disk_thread_ioprio_class") ||
9499 changed.count("osd_disk_thread_ioprio_priority")) {
9500 set_disk_tp_priority();
9501 }
9502 if (changed.count("osd_map_cache_size")) {
9503 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9504 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9505 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9506 }
9507 if (changed.count("clog_to_monitors") ||
9508 changed.count("clog_to_syslog") ||
9509 changed.count("clog_to_syslog_level") ||
9510 changed.count("clog_to_syslog_facility") ||
9511 changed.count("clog_to_graylog") ||
9512 changed.count("clog_to_graylog_host") ||
9513 changed.count("clog_to_graylog_port") ||
9514 changed.count("host") ||
9515 changed.count("fsid")) {
9516 update_log_config();
9517 }
9518
9519 #ifdef HAVE_LIBFUSE
9520 if (changed.count("osd_objectstore_fuse")) {
9521 if (store) {
9522 enable_disable_fuse(false);
9523 }
9524 }
9525 #endif
9526
9527 if (changed.count("osd_recovery_delay_start")) {
9528 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9529 service.kick_recovery_queue();
9530 }
9531
9532 if (changed.count("osd_client_message_cap")) {
9533 uint64_t newval = cct->_conf->osd_client_message_cap;
9534 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9535 if (pol.throttler_messages && newval > 0) {
9536 pol.throttler_messages->reset_max(newval);
9537 }
9538 }
9539 if (changed.count("osd_client_message_size_cap")) {
9540 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9541 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9542 if (pol.throttler_bytes && newval > 0) {
9543 pol.throttler_bytes->reset_max(newval);
9544 }
9545 }
9546
9547 check_config();
9548 }
9549
9550 void OSD::update_log_config()
9551 {
9552 map<string,string> log_to_monitors;
9553 map<string,string> log_to_syslog;
9554 map<string,string> log_channel;
9555 map<string,string> log_prio;
9556 map<string,string> log_to_graylog;
9557 map<string,string> log_to_graylog_host;
9558 map<string,string> log_to_graylog_port;
9559 uuid_d fsid;
9560 string host;
9561
9562 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9563 log_channel, log_prio, log_to_graylog,
9564 log_to_graylog_host, log_to_graylog_port,
9565 fsid, host) == 0)
9566 clog->update_config(log_to_monitors, log_to_syslog,
9567 log_channel, log_prio, log_to_graylog,
9568 log_to_graylog_host, log_to_graylog_port,
9569 fsid, host);
9570 derr << "log_to_monitors " << log_to_monitors << dendl;
9571 }
9572
9573 void OSD::check_config()
9574 {
9575 // some sanity checks
9576 if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
9577 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9578 << " is not > osd_map_max_advance ("
9579 << cct->_conf->osd_map_max_advance << ")";
9580 }
9581 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9582 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9583 << " is not > osd_pg_epoch_persisted_max_stale ("
9584 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
9585 }
9586 }
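// [Editor's note] Worked example of the sanity checks above: with
// osd_map_max_advance = 200, osd_map_cache_size must be at least 203
// (i.e. strictly greater than 200 + 2) to avoid the first warning; the
// same +2 margin applies against osd_pg_epoch_persisted_max_stale.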
9587
9588 void OSD::set_disk_tp_priority()
9589 {
9590 dout(10) << __func__
9591 << " class " << cct->_conf->osd_disk_thread_ioprio_class
9592 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
9593 << dendl;
9594 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
9595 cct->_conf->osd_disk_thread_ioprio_priority < 0)
9596 return;
9597 int cls =
9598 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
9599 if (cls < 0)
9600 derr << __func__ << " " << cpp_strerror(cls) << ": "
9601 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
9602 << " but only the following values are allowed: idle, be or rt" << dendl;
9603 else
9604 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
9605 }
9606
9607 // --------------------------------
9608
9609 void OSD::get_latest_osdmap()
9610 {
9611 dout(10) << __func__ << " -- start" << dendl;
9612
9613 C_SaferCond cond;
9614 service.objecter->wait_for_latest_osdmap(&cond);
9615 cond.wait();
9616
9617 dout(10) << __func__ << " -- finish" << dendl;
9618 }
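// [Editor's note] A minimal sketch of the C_SaferCond rendezvous used
// above; start_async_op is a hypothetical asynchronous call that takes a
// Context* to fire on completion:
//
//   C_SaferCond cond;
//   start_async_op(&cond);  // completion path calls cond.complete(r)
//   int r = cond.wait();    // blocks until then, returns the r passed in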
9619
9620 // --------------------------------
9621
9622 int OSD::init_op_flags(OpRequestRef& op)
9623 {
9624 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
9625 vector<OSDOp>::const_iterator iter;
9626
9627 // client flags have no bearing on whether an op is a read, write, etc.
9628 op->rmw_flags = 0;
9629
9630 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
9631 op->set_force_rwordered();
9632 }
9633
9634 // set bits based on op codes, called methods.
9635 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
9636 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
9637 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
9638 /* This is a bit odd. PING isn't actually a write. It can't
9639 * result in an update to the object_info. PINGs also aren't
9640 * resent, so there's no reason to write out a log entry.
9641 *
9642 * However, we pipeline them behind writes, so let's force
9643 * the write_ordered flag.
9644 */
9645 op->set_force_rwordered();
9646 } else {
9647 if (ceph_osd_op_mode_modify(iter->op.op))
9648 op->set_write();
9649 }
9650 if (ceph_osd_op_mode_read(iter->op.op))
9651 op->set_read();
9652
9653 // set READ flag if there are src_oids
9654 if (iter->soid.oid.name.length())
9655 op->set_read();
9656
9657 // set PGOP flag if there are PG ops
9658 if (ceph_osd_op_type_pg(iter->op.op))
9659 op->set_pg_op();
9660
9661 if (ceph_osd_op_mode_cache(iter->op.op))
9662 op->set_cache();
9663
9664 // check for ec base pool
9665 int64_t poolid = m->get_pg().pool();
9666 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
9667 if (pool && pool->is_tier()) {
9668 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
9669 if (base_pool && base_pool->require_rollback()) {
9670 if ((iter->op.op != CEPH_OSD_OP_READ) &&
9671 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
9672 (iter->op.op != CEPH_OSD_OP_STAT) &&
9673 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
9674 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
9675 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
9676 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
9677 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
9678 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
9679 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
9680 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
9681 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
9682 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
9683 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
9684 (iter->op.op != CEPH_OSD_OP_CREATE) &&
9685 (iter->op.op != CEPH_OSD_OP_DELETE) &&
9686 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
9687 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
9688 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
9689 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
9690 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
9691 op->set_promote();
9692 }
9693 }
9694 }
9695
9696 switch (iter->op.op) {
9697 case CEPH_OSD_OP_CALL:
9698 {
9699 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
9700 int is_write, is_read;
9701 string cname, mname;
9702 bp.copy(iter->op.cls.class_len, cname);
9703 bp.copy(iter->op.cls.method_len, mname);
9704
9705 ClassHandler::ClassData *cls;
9706 int r = class_handler->open_class(cname, &cls);
9707 if (r) {
9708 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
9709 if (r == -ENOENT)
9710 r = -EOPNOTSUPP;
9711 else if (r != -EPERM) // propagate permission errors
9712 r = -EIO;
9713 return r;
9714 }
9715 int flags = cls->get_method_flags(mname.c_str());
9716 if (flags < 0) {
9717 if (flags == -ENOENT)
9718 r = -EOPNOTSUPP;
9719 else
9720 r = flags;
9721 return r;
9722 }
9723 is_read = flags & CLS_METHOD_RD;
9724 is_write = flags & CLS_METHOD_WR;
9725 bool is_promote = flags & CLS_METHOD_PROMOTE;
9726
9727 dout(10) << "class " << cname << " method " << mname << " "
9728 << "flags=" << (is_read ? "r" : "")
9729 << (is_write ? "w" : "")
9730 << (is_promote ? "p" : "")
9731 << dendl;
9732 if (is_read)
9733 op->set_class_read();
9734 if (is_write)
9735 op->set_class_write();
9736 if (is_promote)
9737 op->set_promote();
9738 op->add_class(cname, is_read, is_write, cls->whitelisted);
9739 break;
9740 }
9741
9742 case CEPH_OSD_OP_WATCH:
9743 // force the read bit for watch since it depends on previous
9744 // watch state (and may return early if the watch exists) or, in
9745 // the case of ping, is simply a read op.
9746 op->set_read();
9747 // fall through
9748 case CEPH_OSD_OP_NOTIFY:
9749 case CEPH_OSD_OP_NOTIFY_ACK:
9750 {
9751 op->set_promote();
9752 break;
9753 }
9754
9755 case CEPH_OSD_OP_DELETE:
9756 // if we get a delete with FAILOK we can skip handle cache. Without
9757 // FAILOK we still need to promote (or do something smarter) to
9758 // determine whether to return ENOENT or 0.
9759 if (iter == m->ops.begin() &&
9760 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
9761 op->set_skip_handle_cache();
9762 }
9763 // skip promotion when proxying a delete op
9764 if (m->ops.size() == 1) {
9765 op->set_skip_promote();
9766 }
9767 break;
9768
9769 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
9770 case CEPH_OSD_OP_CACHE_FLUSH:
9771 case CEPH_OSD_OP_CACHE_EVICT:
9772 // If try_flush/flush/evict is the only op, can skip handle cache.
9773 if (m->ops.size() == 1) {
9774 op->set_skip_handle_cache();
9775 }
9776 break;
9777
9778 case CEPH_OSD_OP_READ:
9779 case CEPH_OSD_OP_SYNC_READ:
9780 case CEPH_OSD_OP_SPARSE_READ:
9781 case CEPH_OSD_OP_CHECKSUM:
9782 case CEPH_OSD_OP_WRITEFULL:
9783 if (m->ops.size() == 1 &&
9784 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
9785 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
9786 op->set_skip_promote();
9787 }
9788 break;
9789
9790 // force promotion when pinning an object in the cache tier
9791 case CEPH_OSD_OP_CACHE_PIN:
9792 op->set_promote();
9793 break;
9794
9795 default:
9796 break;
9797 }
9798 }
9799
9800 if (op->rmw_flags == 0)
9801 return -EINVAL;
9802
9803 return 0;
9804 }
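// [Editor's note] A minimal trace of init_op_flags(), assuming a client
// MOSDOp carrying a single CEPH_OSD_OP_READ: ceph_osd_op_mode_read() is
// true, so op->set_read() runs and rmw_flags becomes non-zero; a message
// whose ops set no recognized bits falls through to the final
// rmw_flags == 0 check and is rejected with -EINVAL.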
9805
9806 void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
9807 for (list<PG*>::iterator i = peering_queue.begin();
9808 i != peering_queue.end() &&
9809 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
9810 ) {
9811 if (in_use.count(*i)) {
9812 ++i;
9813 } else {
9814 out->push_back(*i);
9815 peering_queue.erase(i++);
9816 }
9817 }
9818 in_use.insert(out->begin(), out->end());
9819 }
9820
9821
9822 // =============================================================
9823
9824 #undef dout_context
9825 #define dout_context osd->cct
9826 #undef dout_prefix
9827 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
9828
9829 void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
9830 {
9831 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
9832 auto sdata = shard_list[shard_index];
9833 bool queued = false;
9834 unsigned pushes_to_free = 0;
9835 {
9836 Mutex::Locker l(sdata->sdata_op_ordering_lock);
9837 auto p = sdata->pg_slots.find(pgid);
9838 if (p != sdata->pg_slots.end()) {
9839 dout(20) << __func__ << " " << pgid
9840 << " to_process " << p->second.to_process
9841 << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
9842 for (auto i = p->second.to_process.rbegin();
9843 i != p->second.to_process.rend();
9844 ++i) {
9845 sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
9846 }
9847 for (auto& q : p->second.to_process) {
9848 pushes_to_free += q.get_reserved_pushes();
9849 }
9850 p->second.to_process.clear();
9851 p->second.waiting_for_pg = false;
9852 ++p->second.requeue_seq;
9853 queued = true;
9854 }
9855 }
9856 if (pushes_to_free > 0) {
9857 osd->service.release_reserved_pushes(pushes_to_free);
9858 }
9859 if (queued) {
9860 sdata->sdata_lock.Lock();
9861 sdata->sdata_cond.SignalOne();
9862 sdata->sdata_lock.Unlock();
9863 }
9864 }
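// [Editor's note] Bumping requeue_seq above is what lets a concurrent
// _process() thread detect this requeue: it samples the slot's requeue_seq
// before dropping the ordering lock and bails out on re-acquire if the
// value changed (see the "we raced with wake_pg_waiters" branch in
// _process() below).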
9865
9866 void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
9867 {
9868 unsigned pushes_to_free = 0;
9869 for (auto sdata : shard_list) {
9870 Mutex::Locker l(sdata->sdata_op_ordering_lock);
9871 sdata->waiting_for_pg_osdmap = osdmap;
9872 auto p = sdata->pg_slots.begin();
9873 while (p != sdata->pg_slots.end()) {
9874 ShardData::pg_slot& slot = p->second;
9875 if (!slot.to_process.empty() && slot.num_running == 0) {
9876 if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
9877 dout(20) << __func__ << " " << p->first << " maps to us, keeping"
9878 << dendl;
9879 ++p;
9880 continue;
9881 }
9882 while (!slot.to_process.empty() &&
9883 slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
9884 auto& qi = slot.to_process.front();
9885 dout(20) << __func__ << " " << p->first
9886 << " item " << qi
9887 << " epoch " << qi.get_map_epoch()
9888 << " <= " << osdmap->get_epoch()
9889 << ", stale, dropping" << dendl;
9890 pushes_to_free += qi.get_reserved_pushes();
9891 slot.to_process.pop_front();
9892 }
9893 }
9894 if (slot.to_process.empty() &&
9895 slot.num_running == 0 &&
9896 !slot.pg) {
9897 dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
9898 p = sdata->pg_slots.erase(p);
9899 } else {
9900 ++p;
9901 }
9902 }
9903 }
9904 if (pushes_to_free > 0) {
9905 osd->service.release_reserved_pushes(pushes_to_free);
9906 }
9907 }
9908
9909 void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
9910 {
9911 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
9912 auto sdata = shard_list[shard_index];
9913 Mutex::Locker l(sdata->sdata_op_ordering_lock);
9914 auto p = sdata->pg_slots.find(pgid);
9915 if (p != sdata->pg_slots.end()) {
9916 auto& slot = p->second;
9917 dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
9918 assert(!slot.pg || slot.pg->deleting);
9919 slot.pg = nullptr;
9920 }
9921 }
9922
9923 void OSD::ShardedOpWQ::clear_pg_slots()
9924 {
9925 for (auto sdata : shard_list) {
9926 Mutex::Locker l(sdata->sdata_op_ordering_lock);
9927 sdata->pg_slots.clear();
9928 sdata->waiting_for_pg_osdmap.reset();
9929 // don't bother with reserved pushes; we are shutting down
9930 }
9931 }
9932
9933 #undef dout_prefix
9934 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
9935
9936 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
9937 {
9938 uint32_t shard_index = thread_index % num_shards;
9939 ShardData *sdata = shard_list[shard_index];
9940 assert(NULL != sdata);
9941
9942 // peek at spg_t
9943 sdata->sdata_op_ordering_lock.Lock();
9944 if (sdata->pqueue->empty()) {
9945 dout(20) << __func__ << " empty q, waiting" << dendl;
9946 // optimistically sleep a moment; maybe another work item will come along.
9947 osd->cct->get_heartbeat_map()->reset_timeout(hb,
9948 osd->cct->_conf->threadpool_default_timeout, 0);
9949 sdata->sdata_lock.Lock();
9950 sdata->sdata_op_ordering_lock.Unlock();
9951 sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
9952 utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
9953 sdata->sdata_lock.Unlock();
9954 sdata->sdata_op_ordering_lock.Lock();
9955 if (sdata->pqueue->empty()) {
9956 sdata->sdata_op_ordering_lock.Unlock();
9957 return;
9958 }
9959 }
9960 pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
9961 if (osd->is_stopping()) {
9962 sdata->sdata_op_ordering_lock.Unlock();
9963 return; // OSD shutdown, discard.
9964 }
9965 PGRef pg;
9966 uint64_t requeue_seq;
9967 {
9968 auto& slot = sdata->pg_slots[item.first];
9969 dout(30) << __func__ << " " << item.first
9970 << " to_process " << slot.to_process
9971 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
9972 slot.to_process.push_back(item.second);
9973 // note the requeue seq now...
9974 requeue_seq = slot.requeue_seq;
9975 if (slot.waiting_for_pg) {
9976 // save ourselves a bit of effort
9977 dout(20) << __func__ << " " << item.first << " item " << item.second
9978 << " queued, waiting_for_pg" << dendl;
9979 sdata->sdata_op_ordering_lock.Unlock();
9980 return;
9981 }
9982 pg = slot.pg;
9983 dout(20) << __func__ << " " << item.first << " item " << item.second
9984 << " queued" << dendl;
9985 ++slot.num_running;
9986 }
9987 sdata->sdata_op_ordering_lock.Unlock();
9988
9989 osd->service.maybe_inject_dispatch_delay();
9990
9991 // [lookup +] lock pg (if we have it)
9992 if (!pg) {
9993 pg = osd->_lookup_lock_pg(item.first);
9994 } else {
9995 pg->lock();
9996 }
9997
9998 osd->service.maybe_inject_dispatch_delay();
9999
10000 boost::optional<PGQueueable> qi;
10001
10002 // we don't use a Mutex::Locker here because of the
10003 // osd->service.release_reserved_pushes() call below
10004 sdata->sdata_op_ordering_lock.Lock();
10005
10006 auto q = sdata->pg_slots.find(item.first);
10007 assert(q != sdata->pg_slots.end());
10008 auto& slot = q->second;
10009 --slot.num_running;
10010
10011 if (slot.to_process.empty()) {
10012 // raced with wake_pg_waiters or prune_pg_waiters
10013 dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
10014 if (pg) {
10015 pg->unlock();
10016 }
10017 sdata->sdata_op_ordering_lock.Unlock();
10018 return;
10019 }
10020 if (requeue_seq != slot.requeue_seq) {
10021 dout(20) << __func__ << " " << item.first
10022 << " requeue_seq " << slot.requeue_seq << " > our "
10023 << requeue_seq << ", we raced with wake_pg_waiters"
10024 << dendl;
10025 if (pg) {
10026 pg->unlock();
10027 }
10028 sdata->sdata_op_ordering_lock.Unlock();
10029 return;
10030 }
10031 if (pg && !slot.pg && !pg->deleting) {
10032 dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
10033 slot.pg = pg;
10034 }
10035 dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
10036 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10037
10038 // make sure we're not already waiting for this pg
10039 if (slot.waiting_for_pg) {
10040 dout(20) << __func__ << " " << item.first << " item " << item.second
10041 << " slot is waiting_for_pg" << dendl;
10042 if (pg) {
10043 pg->unlock();
10044 }
10045 sdata->sdata_op_ordering_lock.Unlock();
10046 return;
10047 }
10048
10049 // take next item
10050 qi = slot.to_process.front();
10051 slot.to_process.pop_front();
10052 dout(20) << __func__ << " " << item.first << " item " << *qi
10053 << " pg " << pg << dendl;
10054
10055 if (!pg) {
10056 // should this pg shard exist on this osd in this (or a later) epoch?
10057 OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
10058 if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
10059 dout(20) << __func__ << " " << item.first
10060 << " no pg, should exist, will wait" << " on " << *qi << dendl;
10061 slot.to_process.push_front(*qi);
10062 slot.waiting_for_pg = true;
10063 } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
10064 dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
10065 << qi->get_map_epoch() << " > " << osdmap->get_epoch()
10066 << ", will wait on " << *qi << dendl;
10067 slot.to_process.push_front(*qi);
10068 slot.waiting_for_pg = true;
10069 } else {
10070 dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
10071 << " dropping " << *qi << dendl;
10072 // share map with client?
10073 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10074 Session *session = static_cast<Session *>(
10075 (*_op)->get_req()->get_connection()->get_priv());
10076 if (session) {
10077 osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
10078 session->put();
10079 }
10080 }
10081 unsigned pushes_to_free = qi->get_reserved_pushes();
10082 if (pushes_to_free > 0) {
10083 sdata->sdata_op_ordering_lock.Unlock();
10084 osd->service.release_reserved_pushes(pushes_to_free);
10085 return;
10086 }
10087 }
10088 sdata->sdata_op_ordering_lock.Unlock();
10089 return;
10090 }
10091 sdata->sdata_op_ordering_lock.Unlock();
10092
10093
10094 // osd_opwq_process marks the point at which an operation has been dequeued
10095 // and will begin to be handled by a worker thread.
10096 {
10097 #ifdef WITH_LTTNG
10098 osd_reqid_t reqid;
10099 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10100 reqid = (*_op)->get_reqid();
10101 }
10102 #endif
10103 tracepoint(osd, opwq_process_start, reqid.name._type,
10104 reqid.name._num, reqid.tid, reqid.inc);
10105 }
10106
10107 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10108 Formatter *f = Formatter::create("json");
10109 f->open_object_section("q");
10110 dump(f);
10111 f->close_section();
10112 f->flush(*_dout);
10113 delete f;
10114 *_dout << dendl;
10115
10116 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10117 suicide_interval);
10118 qi->run(osd, pg, tp_handle);
10119
10120 {
10121 #ifdef WITH_LTTNG
10122 osd_reqid_t reqid;
10123 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10124 reqid = (*_op)->get_reqid();
10125 }
10126 #endif
10127 tracepoint(osd, opwq_process_finish, reqid.name._type,
10128 reqid.name._num, reqid.tid, reqid.inc);
10129 }
10130
10131 pg->unlock();
10132 }
10133
10134 void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
10135 uint32_t shard_index =
10136 item.first.hash_to_shard(shard_list.size());
10137
10138 ShardData* sdata = shard_list[shard_index];
10139 assert (NULL != sdata);
10140 unsigned priority = item.second.get_priority();
10141 unsigned cost = item.second.get_cost();
10142 sdata->sdata_op_ordering_lock.Lock();
10143
10144 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10145 if (priority >= osd->op_prio_cutoff)
10146 sdata->pqueue->enqueue_strict(
10147 item.second.get_owner(), priority, item);
10148 else
10149 sdata->pqueue->enqueue(
10150 item.second.get_owner(),
10151 priority, cost, item);
10152 sdata->sdata_op_ordering_lock.Unlock();
10153
10154 sdata->sdata_lock.Lock();
10155 sdata->sdata_cond.SignalOne();
10156 sdata->sdata_lock.Unlock();
10157
10158 }
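// [Editor's note] Illustration of the shard fan-out above, assuming
// hash_to_shard() reduces the pg hash modulo the shard count: with
// shard_list.size() == 5, a pg whose hash is 7 always maps to shard 2, so
// every item for a given pg funnels through one ordered shard queue.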
10159
10160 void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
10161 {
10162 uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
10163 ShardData* sdata = shard_list[shard_index];
10164 assert (NULL != sdata);
10165 sdata->sdata_op_ordering_lock.Lock();
10166 auto p = sdata->pg_slots.find(item.first);
10167 if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
10168 // we may be racing with _process, which has dequeued a new item
10169 // from pqueue, put it on to_process, and is now busy taking the
10170 // pg lock. ensure this old requeued item is ordered before any
10171 // such newer item in to_process.
10172 p->second.to_process.push_front(item.second);
10173 item.second = p->second.to_process.back();
10174 p->second.to_process.pop_back();
10175 dout(20) << __func__ << " " << item.first
10176 << " " << p->second.to_process.front()
10177 << " shuffled w/ " << item.second << dendl;
10178 } else {
10179 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10180 }
10181 sdata->_enqueue_front(item, osd->op_prio_cutoff);
10182 sdata->sdata_op_ordering_lock.Unlock();
10183 sdata->sdata_lock.Lock();
10184 sdata->sdata_cond.SignalOne();
10185 sdata->sdata_lock.Unlock();
10186 }
10187
10188 namespace ceph {
10189 namespace osd_cmds {
10190
10191 int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
10192 {
10193 if (!ceph_using_tcmalloc()) {
10194 os << "could not issue heap profiler command -- not using tcmalloc!";
10195 return -EOPNOTSUPP;
10196 }
10197
10198 string cmd;
10199 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
10200 os << "unable to get value for command \"" << cmd << "\"";
10201 return -EINVAL;
10202 }
10203
10204 std::vector<std::string> cmd_vec;
10205 get_str_vec(cmd, cmd_vec);
10206
10207 ceph_heap_profiler_handle_command(cmd_vec, os);
10208
10209 return 0;
10210 }
10211
10212 }} // namespace ceph::osd_cmds
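// [Editor's note] Typical invocation path for the handler above (hedged,
// standard ceph admin usage rather than anything defined in this file):
//
//   ceph tell osd.0 heap stats
//   ceph tell osd.0 heap start_profiler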
10213
10214
10215 std::ostream& operator<<(std::ostream& out, const OSD::io_queue& q) {
10216 switch (q) {
10217 case OSD::io_queue::prioritized:
10218 out << "prioritized";
10219 break;
10220 case OSD::io_queue::weightedpriority:
10221 out << "weightedpriority";
10222 break;
10223 case OSD::io_queue::mclock_opclass:
10224 out << "mclock_opclass";
10225 break;
10226 case OSD::io_queue::mclock_client:
10227 out << "mclock_client";
10228 break;
10229 }
10230 return out;
10231 }