// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2017 OVH
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#include "acconfig.h"

#include <fstream>
#include <iostream>
#include <errno.h>
#include <sys/stat.h>
#include <signal.h>
#include <ctype.h>
#include <boost/scoped_ptr.hpp>

#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>
#endif

#ifdef HAVE_SYS_MOUNT_H
#include <sys/mount.h>
#endif

#include "osd/PG.h"

#include "include/types.h"
#include "include/compat.h"

#include "OSD.h"
#include "OSDMap.h"
#include "Watch.h"
#include "osdc/Objecter.h"

#include "common/errno.h"
#include "common/ceph_argparse.h"
#include "common/version.h"
#include "common/io_priority.h"

#include "os/ObjectStore.h"
#ifdef HAVE_LIBFUSE
#include "os/FuseStore.h"
#endif

#include "PrimaryLogPG.h"


#include "msg/Messenger.h"
#include "msg/Message.h"

#include "mon/MonClient.h"

#include "messages/MLog.h"

#include "messages/MGenericMessage.h"
#include "messages/MPing.h"
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
#include "messages/MOSDMarkMeDown.h"
#include "messages/MOSDFull.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDOpReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDBeacon.h"
#include "messages/MOSDRepOp.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDBoot.h"
#include "messages/MOSDPGTemp.h"

#include "messages/MOSDMap.h"
#include "messages/MMonGetOSDMap.h"
#include "messages/MOSDPGNotify.h"
#include "messages/MOSDPGQuery.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGRemove.h"
#include "messages/MOSDPGInfo.h"
#include "messages/MOSDPGCreate.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
#include "messages/MOSDECSubOpReadReply.h"
#include "messages/MOSDPGCreated.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"

#include "messages/MOSDAlive.h"

#include "messages/MOSDScrub.h"
#include "messages/MOSDScrubReserve.h"
#include "messages/MOSDRepScrub.h"

#include "messages/MMonCommand.h"
#include "messages/MCommand.h"
#include "messages/MCommandReply.h"

#include "messages/MPGStats.h"
#include "messages/MPGStatsAck.h"

#include "messages/MWatchNotify.h"
#include "messages/MOSDPGPush.h"
#include "messages/MOSDPGPushReply.h"
#include "messages/MOSDPGPull.h"

#include "common/perf_counters.h"
#include "common/Timer.h"
#include "common/LogClient.h"
#include "common/AsyncReserver.h"
#include "common/HeartbeatMap.h"
#include "common/admin_socket.h"
#include "common/ceph_context.h"

#include "global/signal_handler.h"
#include "global/pidfile.h"

#include "include/color.h"
#include "perfglue/cpu_profiler.h"
#include "perfglue/heap_profiler.h"

#include "osd/OpRequest.h"

#include "auth/AuthAuthorizeHandler.h"
#include "auth/RotatingKeyRing.h"
#include "common/errno.h"

#include "objclass/objclass.h"

#include "common/cmdparse.h"
#include "include/str_list.h"
#include "include/util.h"

#include "include/assert.h"
#include "common/config.h"
#include "common/EventTrace.h"

#ifdef WITH_LTTNG
#define TRACEPOINT_DEFINE
#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#include "tracing/osd.h"
#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
#undef TRACEPOINT_DEFINE
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#undef dout_prefix
#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())

const double OSD::OSD_TICK_INTERVAL = 1.0;

static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
  return *_dout << "osd." << whoami << " " << epoch << " ";
}

void PGQueueable::RunVis::operator()(const OpRequestRef &op) {
  return osd->dequeue_op(pg, op, handle);
}

void PGQueueable::RunVis::operator()(const PGSnapTrim &op) {
  return pg->snap_trimmer(op.epoch_queued);
}

void PGQueueable::RunVis::operator()(const PGScrub &op) {
  return pg->scrub(op.epoch_queued, handle);
}

void PGQueueable::RunVis::operator()(const PGRecovery &op) {
  return osd->do_recovery(pg.get(), op.epoch_queued, op.reserved_pushes, handle);
}

// Initial features in new superblock.
// Features here are also automatically upgraded.
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
                   ceph_osd_feature_incompat);
}

// Features are added here that this OSD supports.
CompatSet OSD::get_osd_compat_set() {
  CompatSet compat = get_osd_initial_compat_set();
  // Any features here can be set in code, but not in the initial superblock
  compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
  return compat;
}

OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  meta_osr(new ObjectStore::Sequencer("meta")),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  peering_wq(osd->peering_wq),
  recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
                  &osd->disk_tp),
  class_handler(osd->class_handler),
  pg_epoch_lock("OSDService::pg_epoch_lock"),
  publish_lock("OSDService::publish_lock"),
  pre_publish_lock("OSDService::pre_publish_lock"),
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
  scrubs_active(0),
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  objecter_finisher(osd->client_messenger->cct),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  recovery_sleep_lock("OSDService::recovery_sleep_lock"),
  recovery_sleep_timer(cct, recovery_sleep_lock, false),
  reserver_finisher(cct),
  local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
                 cct->_conf->osd_min_recovery_priority),
  remote_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
                  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_sleep_lock("OSDService::snap_sleep_lock"),
  snap_sleep_timer(
    osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
  scrub_sleep_lock("OSDService::scrub_sleep_lock"),
  scrub_sleep_timer(
    osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
  snap_reserver(&reserver_finisher,
                cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  in_progress_split_lock("OSDService::in_progress_split_lock"),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  objecter->init();
}

OSDService::~OSDService()
{
  delete objecter;
}



#ifdef PG_DEBUG_REFS
void OSDService::add_pgid(spg_t pgid, PG *pg) {
  Mutex::Locker l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  Mutex::Locker l(pgid_lock);
  assert(pgid_tracker.count(pgid));
  assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
void OSDService::dump_live_pgids()
{
  Mutex::Locker l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif


void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
{
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    dout(10) << __func__ << ": Starting split on pg " << *i
             << ", parent=" << parent << dendl;
    assert(!pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.insert(make_pair(*i, parent));

    assert(!rev_pending_splits[parent].count(*i));
    rev_pending_splits[parent].insert(*i);
  }
}

void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
{
  Mutex::Locker l(in_progress_split_lock);
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  assert(piter != rev_pending_splits.end());
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    assert(piter->second.count(*i));
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    assert(pending_splits[*i] == parent);

    pending_splits.erase(*i);
    piter->second.erase(*i);
    in_progress_splits.insert(*i);
  }
  if (piter->second.empty())
    rev_pending_splits.erase(piter);
}

void OSDService::cancel_pending_splits_for_parent(spg_t parent)
{
  Mutex::Locker l(in_progress_split_lock);
  _cancel_pending_splits_for_parent(parent);
}

void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
{
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  if (piter == rev_pending_splits.end())
    return;

  for (set<spg_t>::iterator i = piter->second.begin();
       i != piter->second.end();
       ++i) {
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.erase(*i);
    dout(10) << __func__ << ": Completing split on pg " << *i
             << " for parent: " << parent << dendl;
    _cancel_pending_splits_for_parent(*i);
  }
  rev_pending_splits.erase(piter);
}

void OSDService::_maybe_split_pgid(OSDMapRef old_map,
                                   OSDMapRef new_map,
                                   spg_t pgid)
{
  assert(old_map->have_pg_pool(pgid.pool()));
  int old_pgnum = old_map->get_pg_num(pgid.pool());
  if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
    set<spg_t> children;
    if (pgid.is_split(old_pgnum,
                      new_map->get_pg_num(pgid.pool()), &children)) {
      _start_split(pgid, children);
    }
  } else {
    assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
  }
}

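// Determine whether pgid (or any child it splits into) splits between
// frommap and tomap, walking the intervening osdmaps one epoch at a time
// and registering each discovered split as pending.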
void OSDService::init_splits_between(spg_t pgid,
                                     OSDMapRef frommap,
                                     OSDMapRef tomap)
{
  // First, check whether we can avoid this potentially expensive check
  if (tomap->have_pg_pool(pgid.pool()) &&
      pgid.is_split(
        frommap->get_pg_num(pgid.pool()),
        tomap->get_pg_num(pgid.pool()),
        NULL)) {
    // Ok, a split happened, so we need to walk the osdmaps
    set<spg_t> new_pgs; // pgs to scan on each map
    new_pgs.insert(pgid);
    OSDMapRef curmap(get_map(frommap->get_epoch()));
    for (epoch_t e = frommap->get_epoch() + 1;
         e <= tomap->get_epoch();
         ++e) {
      OSDMapRef nextmap(try_get_map(e));
      if (!nextmap)
        continue;
      set<spg_t> even_newer_pgs; // pgs added in this loop
      for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
        set<spg_t> split_pgs;
        if (i->is_split(curmap->get_pg_num(i->pool()),
                        nextmap->get_pg_num(i->pool()),
                        &split_pgs)) {
          start_split(*i, split_pgs);
          even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
        }
      }
      new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
      curmap = nextmap;
    }
    assert(curmap == tomap); // we must have had both frommap and tomap
  }
}

void OSDService::expand_pg_num(OSDMapRef old_map,
                               OSDMapRef new_map)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::iterator i = in_progress_splits.begin();
       i != in_progress_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->pool())) {
      in_progress_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, *i);
      ++i;
    }
  }
  for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
       i != pending_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->first.pool())) {
      rev_pending_splits.erase(i->second);
      pending_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, i->first);
      ++i;
    }
  }
}

bool OSDService::splitting(spg_t pgid)
{
  Mutex::Locker l(in_progress_split_lock);
  return in_progress_splits.count(pgid) ||
         pending_splits.count(pgid);
}

void OSDService::complete_split(const set<spg_t> &pgs)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::const_iterator i = pgs.begin();
       i != pgs.end();
       ++i) {
    dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
    assert(!pending_splits.count(*i));
    assert(in_progress_splits.count(*i));
    in_progress_splits.erase(*i);
  }
}

void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

void OSDService::pg_stat_queue_enqueue(PG *pg)
{
  osd->pg_stat_queue_enqueue(pg);
}

void OSDService::pg_stat_queue_dequeue(PG *pg)
{
  osd->pg_stat_queue_dequeue(pg);
}

void OSDService::start_shutdown()
{
  {
    Mutex::Locker l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    Mutex::Locker l(recovery_sleep_lock);
    recovery_sleep_timer.shutdown();
  }
}

void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}

void OSDService::shutdown()
{
  {
    Mutex::Locker l(watch_lock);
    watch_timer.shutdown();
  }

  objecter->shutdown();
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  {
    Mutex::Locker l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }

  {
    Mutex::Locker l(snap_sleep_lock);
    snap_sleep_timer.shutdown();
  }

  {
    Mutex::Locker l(scrub_sleep_lock);
    scrub_sleep_timer.shutdown();
  }

  osdmap = OSDMapRef();
  next_osdmap = OSDMapRef();
}

void OSDService::init()
{
  reserver_finisher.start();
  objecter_finisher.start();
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  snap_sleep_timer.init();
  scrub_sleep_timer.init();

  agent_thread.create("osd_srv_agent");

  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}

void OSDService::final_init()
{
  objecter->start(osdmap.get());
}

void OSDService::activate_map()
{
  // wake/unwake the tiering agent
  agent_lock.Lock();
  agent_active =
    !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
    osd->is_active();
  agent_cond.Signal();
  agent_lock.Unlock();
}

class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};

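// Tiering agent worker thread: repeatedly picks the highest-priority tier
// in agent_queue and asks one of its PGs to do flush/evict work, honoring
// the osd_agent_max_ops / osd_agent_max_low_ops quotas.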
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
             << " tiers " << agent_queue.size()
             << ", top is " << level
             << " with pgs " << top.size()
             << ", ops " << agent_ops << "/"
             << cct->_conf->osd_agent_max_ops
             << (agent_active ? " active" : " NOT ACTIVE")
             << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
             << " agent_ops " << agent_ops
             << " flush_quota " << agent_flush_quota << dendl;
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->get_pgid()
               << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
               << " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg after
      // osd_agent_delay_time seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}

void OSDService::agent_stop()
{
  {
    Mutex::Locker l(agent_lock);

    // By this time all ops should be cancelled
    assert(agent_ops == 0);
    // By this time all PGs are shut down and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
      assert(0 == "agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}

// -------------------------------------

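// Recompute the promotion probability (expressed in parts per thousand)
// from the promotions observed since the last tick, moving halfway toward
// the value implied by the configured obj/sec and bytes/sec targets, then
// refresh the per-interval hard caps.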
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
           << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
           << target_obj_sec << " obj/sec or "
           << pretty_si_t(target_bytes_sec) << " bytes/sec"
           << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = MAX(bytes / obj, 1);
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
             << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = MIN(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = MAX(new_prob, min_prob);
  new_prob = MIN(new_prob, 1000);

  // adjust
  prob = (prob + new_prob) / 2;
  prob = MAX(prob, min_prob);
  prob = MIN(prob, 1000);
  dout(10) << __func__ << " actual " << actual
           << ", actual/prob ratio " << ratio
           << ", adjusted new_prob " << new_prob
           << ", prob " << promote_probability_millis << " -> " << prob
           << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
}

// -------------------------------------

float OSDService::get_failsafe_full_ratio()
{
  float full_ratio = cct->_conf->osd_failsafe_full_ratio;
  if (full_ratio > 1.0) full_ratio /= 100.0;
  return full_ratio;
}

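// Derive this OSD's fullness state (NONE .. FAILSAFE) from the current
// usage ratio, clamping the OSDMap ratios so that
// nearfull <= backfillfull <= full <= failsafe always holds.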
void OSDService::check_full_status(const osd_stat_t &osd_stat)
{
  Mutex::Locker l(full_status_lock);

  float ratio = ((float)osd_stat.kb_used) / ((float)osd_stat.kb);
  cur_ratio = ratio;

  // The OSDMap ratios take precedence.  So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too.  (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    cur_state = NONE;
    return;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
             backfillfull_ratio <= 0 ||
             nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use the failsafe ratio; the monitor did something wrong or the user
    // misconfigured the ratios.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  string inject;
  s_names new_state;
  if (injectfull_state > NONE && injectfull) {
    new_state = injectfull_state;
    inject = "(Injected)";
  } else if (ratio > failsafe_ratio) {
    new_state = FAILSAFE;
  } else if (ratio > full_ratio) {
    new_state = FULL;
  } else if (ratio > backfillfull_ratio) {
    new_state = BACKFILLFULL;
  } else if (ratio > nearfull_ratio) {
    new_state = NEARFULL;
  } else {
    new_state = NONE;
  }
  dout(20) << __func__ << " cur ratio " << ratio
           << ", nearfull_ratio " << nearfull_ratio
           << ", backfillfull_ratio " << backfillfull_ratio
           << ", full_ratio " << full_ratio
           << ", failsafe_ratio " << failsafe_ratio
           << ", new state " << get_full_state_name(new_state)
           << " " << inject
           << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
             << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "failsafe engaged, dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "failsafe disengaged, no longer dropping updates, now "
                    << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}

bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}

bool OSDService::_check_full(s_names type, ostream &ss) const
{
  Mutex::Locker l(full_status_lock);

  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return full,
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ss << "Injected " << get_full_state_name(type) << " OSD ("
       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
    return true;
  }

  ss << "current usage is " << cur_ratio;
  return cur_state >= type;
}

bool OSDService::check_failsafe_full(ostream &ss) const
{
  return _check_full(FAILSAFE, ss);
}

bool OSDService::check_full(ostream &ss) const
{
  return _check_full(FULL, ss);
}

bool OSDService::check_backfill_full(ostream &ss) const
{
  return _check_full(BACKFILLFULL, ss);
}

bool OSDService::check_nearfull(ostream &ss) const
{
  return _check_full(NEARFULL, ss);
}

bool OSDService::is_failsafe_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= NEARFULL;
}

void OSDService::set_injectfull(s_names type, int64_t count)
{
  Mutex::Locker l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}

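// Refresh osd_stat from the object store's statfs() and the op tracker,
// update the perf counters, and re-evaluate the fullness state.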
void OSDService::update_osd_stat(vector<int>& hb_peers)
{
  Mutex::Locker lock(stat_lock);

  osd_stat.hb_peers.swap(hb_peers);

  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);

  // fill in osd stats too
  struct store_statfs_t stbuf;
  int r = osd->store->statfs(&stbuf);
  if (r < 0) {
    derr << "statfs() failed: " << cpp_strerror(r) << dendl;
    return;
  }

  uint64_t bytes = stbuf.total;
  uint64_t used = bytes - stbuf.available;
  uint64_t avail = stbuf.available;

  osd_stat.kb = bytes >> 10;
  osd_stat.kb_used = used >> 10;
  osd_stat.kb_avail = avail >> 10;

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  dout(20) << "update_osd_stat " << osd_stat << dendl;

  check_full_status(osd_stat);
}

bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
{
  OSDMapRef osdmap = get_osdmap();
  for (auto shard : missing_on) {
    if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
      return true;
  }
  return false;
}

void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    m->put();
    release_map(next_map);
    return;
  }
  const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
  ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
  share_map_peer(peer, peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}

ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
  release_map(next_map);
  return con;
}

pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
  if (next_map->get_hb_front_addr(peer) != entity_addr_t())
    ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
  release_map(next_map);
  return ret;
}


void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
{
  Mutex::Locker l(pg_temp_lock);
  map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
  if (p == pg_temp_pending.end() ||
      p->second != want) {
    pg_temp_wanted[pgid] = want;
  }
}

void OSDService::remove_want_pg_temp(pg_t pgid)
{
  Mutex::Locker l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}

void OSDService::_sent_pg_temp()
{
  for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
       p != pg_temp_wanted.end();
       ++p)
    pg_temp_pending[p->first] = p->second;
  pg_temp_wanted.clear();
}

void OSDService::requeue_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  // wanted overrides pending.  note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
           << pg_temp_wanted.size() << dendl;
}

void OSDService::send_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
  m->pg_temp = pg_temp_wanted;
  monc->send_mon_message(m);
  _sent_pg_temp();
}

void OSDService::send_pg_created(pg_t pgid)
{
  dout(20) << __func__ << dendl;
  monc->send_mon_message(new MOSDPGCreated(pgid));
}

// --------------------------------------
// dispatch

epoch_t OSDService::get_peer_epoch(int peer)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p == peer_map_epoch.end())
    return 0;
  return p->second;
}

epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second < e) {
      dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
      p->second = e;
    } else {
      dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
    }
    return p->second;
  } else {
    dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
    peer_map_epoch[peer] = e;
    return e;
  }
}

void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
{
  Mutex::Locker l(peer_map_epoch_lock);
  map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
  if (p != peer_map_epoch.end()) {
    if (p->second <= as_of) {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
               << " had " << p->second << dendl;
      peer_map_epoch.erase(p);
    } else {
      dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
               << " has " << p->second << " - not forgetting" << dendl;
    }
  }
}

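// Decide whether the peer behind this connection should be sent a newer
// osdmap: clients are checked against the epoch we last sent them, fellow
// OSDs against the epoch we last heard they had.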
bool OSDService::should_share_map(entity_name_t name, Connection *con,
                                  epoch_t epoch, const OSDMapRef& osdmap,
                                  const epoch_t *sent_epoch_p)
{
  dout(20) << "should_share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  // does client have old map?
  if (name.is_client()) {
    bool message_sendmap = epoch < osdmap->get_epoch();
    if (message_sendmap && sent_epoch_p) {
      dout(20) << "client session last_sent_epoch: "
               << *sent_epoch_p
               << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
      if (*sent_epoch_p < osdmap->get_epoch()) {
        return true;
      } // else we don't need to send it out again
    }
  }

  if (con->get_messenger() == osd->cluster_messenger &&
      con != osd->cluster_messenger->get_loopback_connection() &&
      osdmap->is_up(name.num()) &&
      (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
       osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
    // remember
    epoch_t has = MAX(get_peer_epoch(name.num()), epoch);

    // share?
    if (has < osdmap->get_epoch()) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      return true;
    }
  }

  return false;
}

void OSDService::share_map(
    entity_name_t name,
    Connection *con,
    epoch_t epoch,
    OSDMapRef& osdmap,
    epoch_t *sent_epoch_p)
{
  dout(20) << "share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  if (!osd->is_active()) {
    /* It is safe not to proceed as the OSD is not in a healthy state */
    return;
  }

  bool want_shared = should_share_map(name, con, epoch,
                                      osdmap, sent_epoch_p);

  if (want_shared) {
    if (name.is_client()) {
      dout(10) << name << " has old map " << epoch
               << " < " << osdmap->get_epoch() << dendl;
      // we know the Session is valid or we wouldn't be sending
      if (sent_epoch_p) {
        *sent_epoch_p = osdmap->get_epoch();
      }
      send_incremental_map(epoch, con, osdmap);
    } else if (con->get_messenger() == osd->cluster_messenger &&
               osdmap->is_up(name.num()) &&
               (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
                osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      note_peer_epoch(name.num(), osdmap->get_epoch());
      send_incremental_map(epoch, con, osdmap);
    }
  }
}

void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
{
  if (!map)
    map = get_osdmap();

  // send map?
  epoch_t pe = get_peer_epoch(peer);
  if (pe) {
    if (pe < map->get_epoch()) {
      send_incremental_map(pe, con, map);
      note_peer_epoch(peer, map->get_epoch());
    } else
      dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
  } else {
    dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
    // no idea about peer's epoch.
    // ??? send recent ???
    // do nothing.
  }
}

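// Scrub reservation accounting: the sum of pending and active scrub
// reservations on this OSD is bounded by osd_max_scrubs.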
bool OSDService::can_inc_scrubs_pending()
{
  bool can_inc = false;
  Mutex::Locker l(sched_scrub_lock);

  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
    dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
    can_inc = true;
  } else {
    dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
  }

  return can_inc;
}

bool OSDService::inc_scrubs_pending()
{
  bool result = false;

  sched_scrub_lock.Lock();
  if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
    dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
             << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
    result = true;
    ++scrubs_pending;
  } else {
    dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
  }
  sched_scrub_lock.Unlock();

  return result;
}

void OSDService::dec_scrubs_pending()
{
  sched_scrub_lock.Lock();
  dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
  --scrubs_pending;
  assert(scrubs_pending >= 0);
  sched_scrub_lock.Unlock();
}

void OSDService::inc_scrubs_active(bool reserved)
{
  sched_scrub_lock.Lock();
  ++(scrubs_active);
  if (reserved) {
    --(scrubs_pending);
    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
             << " (max " << cct->_conf->osd_max_scrubs
             << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
    assert(scrubs_pending >= 0);
  } else {
    dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
             << " (max " << cct->_conf->osd_max_scrubs
             << ", pending " << scrubs_pending << ")" << dendl;
  }
  sched_scrub_lock.Unlock();
}

void OSDService::dec_scrubs_active()
{
  sched_scrub_lock.Lock();
  dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
           << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
  --scrubs_active;
  assert(scrubs_active >= 0);
  sched_scrub_lock.Unlock();
}

void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
                                 epoch_t *_bind_epoch) const
{
  Mutex::Locker l(epoch_lock);
  if (_boot_epoch)
    *_boot_epoch = boot_epoch;
  if (_up_epoch)
    *_up_epoch = up_epoch;
  if (_bind_epoch)
    *_bind_epoch = bind_epoch;
}

void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
                            const epoch_t *_bind_epoch)
{
  Mutex::Locker l(epoch_lock);
  if (_boot_epoch) {
    assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
    boot_epoch = *_boot_epoch;
  }
  if (_up_epoch) {
    assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
    up_epoch = *_up_epoch;
  }
  if (_bind_epoch) {
    assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
    bind_epoch = *_bind_epoch;
  }
}

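// Ask the monitor to mark us down before shutting down, and wait up to
// osd_mon_shutdown_timeout seconds for the acknowledgement delivered via
// got_stop_ack().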
bool OSDService::prepare_to_stop()
{
  Mutex::Locker l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
                                              osdmap->get_inst(whoami),
                                              osdmap->get_epoch(),
                                              true  // request ack
                                              ));
    utime_t now = ceph_clock_now();
    utime_t timeout;
    timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
    while ((ceph_clock_now() < timeout) &&
           (get_state() != STOPPING)) {
      is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
    }
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}

void OSDService::got_stop_ack()
{
  Mutex::Locker l(is_stopping_lock);
  if (get_state() == PREPARING_TO_STOP) {
    dout(0) << __func__ << " starting shutdown" << dendl;
    set_state(STOPPING);
    is_stopping_cond.Signal();
  } else {
    dout(10) << __func__ << " ignoring msg" << dendl;
  }
}

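// Build an MOSDMap covering (since, to], walking backwards from `to` and
// preferring incremental maps; fall back to a full map where an incremental
// is unavailable, or return NULL if neither can be loaded.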
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  for (epoch_t e = to; e > since; e--) {
    bufferlist bl;
    if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else if (get_map_bl(e, bl)) {
      m->maps[e].claim(bl);
      break;
    } else {
      derr << "since " << since << " to " << to
           << " oldest " << m->oldest_map << " newest " << m->newest_map
           << dendl;
      m->put();
      m = NULL;
      break;
    }
  }
  return m;
}

void OSDService::send_map(MOSDMap *m, Connection *con)
{
  con->send_message(m);
}

void OSDService::send_incremental_map(epoch_t since, Connection *con,
                                      OSDMapRef& osdmap)
{
  epoch_t to = osdmap->get_epoch();
  dout(10) << "send_incremental_map " << since << " -> " << to
           << " to " << con << " " << con->get_peer_addr() << dendl;

  MOSDMap *m = NULL;
  while (!m) {
    OSDSuperblock sblock(get_superblock());
    if (since < sblock.oldest_map) {
      // just send latest full map
      MOSDMap *m = new MOSDMap(monc->get_fsid());
      m->oldest_map = max_oldest_map;
      m->newest_map = sblock.newest_map;
      get_map_bl(to, m->maps[to]);
      send_map(m, con);
      return;
    }

    if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
      dout(10) << "  " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
               << ", only sending most recent" << dendl;
      since = to - cct->_conf->osd_map_share_max_epochs;
    }

    if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
      to = since + cct->_conf->osd_map_message_max;
    m = build_incremental_map_msg(since, to, sblock);
  }
  send_map(m, con);
}

bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
{
  bool found = map_bl_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(coll_t::meta(),
                      OSD::get_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_bl(e, bl);
  }
  return found;
}

bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
{
  Mutex::Locker l(map_cache_lock);
  bool found = map_bl_inc_cache.lookup(e, &bl);
  if (found) {
    if (logger)
      logger->inc(l_osd_map_bl_cache_hit);
    return true;
  }
  if (logger)
    logger->inc(l_osd_map_bl_cache_miss);
  found = store->read(coll_t::meta(),
                      OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
                      CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
  if (found) {
    _add_map_inc_bl(e, bl);
  }
  return found;
}

void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_cache.add(e, bl);
}

void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
{
  dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
  map_bl_inc_cache.add(e, bl);
}

void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
{
  Mutex::Locker l(map_cache_lock);
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  map_bl_inc_cache.pin(e, bl);
}

void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
{
  Mutex::Locker l(map_cache_lock);
  // cache a contiguous buffer
  if (bl.get_num_buffers() > 1) {
    bl.rebuild();
  }
  map_bl_cache.pin(e, bl);
}

void OSDService::clear_map_bl_cache_pins(epoch_t e)
{
  Mutex::Locker l(map_cache_lock);
  map_bl_inc_cache.clear_pinned(e);
  map_bl_cache.clear_pinned(e);
}

OSDMapRef OSDService::_add_map(OSDMap *o)
{
  epoch_t e = o->get_epoch();

  if (cct->_conf->osd_map_dedup) {
    // Dedup against an existing map at a nearby epoch
    OSDMapRef for_dedup = map_cache.lower_bound(e);
    if (for_dedup) {
      OSDMap::dedup(for_dedup.get(), o);
    }
  }
  bool existed;
  OSDMapRef l = map_cache.add(e, o, &existed);
  if (existed) {
    delete o;
  }
  return l;
}

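// Return the OSDMap for the given epoch, consulting the map cache first and
// falling back to loading and decoding it from the store; returns a null
// ref if the map cannot be loaded.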
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  Mutex::Locker l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " - cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  return _add_map(map);
}

// ops


void OSDService::reply_op_error(OpRequestRef op, int err)
{
  reply_op_error(op, err, eversion_t(), 0);
}

void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
                                version_t uv)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  int flags;
  flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);

  MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
                                       true);
  reply->set_reply_versions(v, uv);
  m->get_connection()->send_message(reply);
}

void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);

  assert(m->get_map_epoch() >= pg->info.history.same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
        pgid.shard != pg->info.pgid.shard) {
      dout(7) << __func__ << ": " << *pg << " primary changed since "
              << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
               << " pg " << m->get_raw_pg()
               << " to osd." << whoami
               << " not " << pg->acting
               << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}

void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
{
  osd->op_shardedwq.queue(make_pair(pgid, qi));
}

void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
{
  osd->op_shardedwq.queue_front(make_pair(pgid, qi));
}

void OSDService::queue_for_peering(PG *pg)
{
  peering_wq.queue(pg);
}

void OSDService::queue_for_snap_trim(PG *pg)
{
  dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
  osd->op_shardedwq.queue(
    make_pair(
      pg->info.pgid,
      PGQueueable(
        PGSnapTrim(pg->get_osdmap()->get_epoch()),
        cct->_conf->osd_snap_trim_cost,
        cct->_conf->osd_snap_trim_priority,
        ceph_clock_now(),
        entity_inst_t(),
        pg->get_osdmap()->get_epoch())));
}


// ====================================================================
// OSD

#undef dout_prefix
#define dout_prefix *_dout

// Commands shared between OSD's console and admin console:
namespace ceph {
namespace osd_cmds {

int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);

}} // namespace ceph::osd_cmds

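// One-time creation of an OSD's backing store: mkfs and mount the
// ObjectStore, write (or validate) the OSD superblock, and persist the
// identity metadata via write_meta().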
int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
              uuid_d fsid, int whoami)
{
  int ret;

  ceph::shared_ptr<ObjectStore::Sequencer> osr(
    new ObjectStore::Sequencer("mkfs"));
  OSDSuperblock sb;
  bufferlist sbbl;
  C_SaferCond waiter;

  // if we are fed a uuid for this osd, use it.
  store->set_fsid(cct->_conf->osd_uuid);

  ret = store->mkfs();
  if (ret) {
    derr << "OSD::mkfs: ObjectStore::mkfs failed with error " << ret << dendl;
    goto free_store;
  }

  store->set_cache_shards(1);  // doesn't matter for mkfs!

  ret = store->mount();
  if (ret) {
    derr << "OSD::mkfs: couldn't mount ObjectStore: error " << ret << dendl;
    goto free_store;
  }

  ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
  if (ret >= 0) {
    /* if we already have superblock, check content of superblock */
    dout(0) << " have superblock" << dendl;
    bufferlist::iterator p;
    p = sbbl.begin();
    ::decode(sb, p);
    if (whoami != sb.whoami) {
      derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
           << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
    if (fsid != sb.cluster_fsid) {
      derr << "provided cluster fsid " << fsid
           << " != superblock's " << sb.cluster_fsid << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
  } else {
    // create superblock
    sb.cluster_fsid = fsid;
    sb.osd_fsid = store->get_fsid();
    sb.whoami = whoami;
    sb.compat_features = get_osd_initial_compat_set();

    bufferlist bl;
    ::encode(sb, bl);

    ObjectStore::Transaction t;
    t.create_collection(coll_t::meta(), 0);
    t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
    ret = store->apply_transaction(osr.get(), std::move(t));
    if (ret) {
      derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
           << "apply_transaction returned " << ret << dendl;
      goto umount_store;
    }
  }

  if (!osr->flush_commit(&waiter)) {
    waiter.wait();
  }

  ret = write_meta(store, sb.cluster_fsid, sb.osd_fsid, whoami);
  if (ret) {
    derr << "OSD::mkfs: failed to write fsid file: error " << ret << dendl;
    goto umount_store;
  }

umount_store:
  store->umount();
free_store:
  delete store;
  return ret;
}

int OSD::write_meta(ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
{
  char val[80];
  int r;

  snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
  r = store->write_meta("magic", val);
  if (r < 0)
    return r;

  snprintf(val, sizeof(val), "%d", whoami);
  r = store->write_meta("whoami", val);
  if (r < 0)
    return r;

  cluster_fsid.print(val);
  r = store->write_meta("ceph_fsid", val);
  if (r < 0)
    return r;

  r = store->write_meta("ready", "ready");
  if (r < 0)
    return r;

  return 0;
}

int OSD::peek_meta(ObjectStore *store, std::string& magic,
                   uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
{
  string val;

  int r = store->read_meta("magic", &val);
  if (r < 0)
    return r;
  magic = val;

  r = store->read_meta("whoami", &val);
  if (r < 0)
    return r;
  whoami = atoi(val.c_str());

  r = store->read_meta("ceph_fsid", &val);
  if (r < 0)
    return r;
  r = cluster_fsid.parse(val.c_str());
  if (!r)
    return -EINVAL;

  r = store->read_meta("fsid", &val);
  if (r < 0) {
    osd_fsid = uuid_d();
  } else {
    r = osd_fsid.parse(val.c_str());
    if (!r)
      return -EINVAL;
  }

  return 0;
}


1863 #undef dout_prefix
1864 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1865
1866 // cons/des
1867
1868 OSD::OSD(CephContext *cct_, ObjectStore *store_,
1869 int id,
1870 Messenger *internal_messenger,
1871 Messenger *external_messenger,
1872 Messenger *hb_client_front,
1873 Messenger *hb_client_back,
1874 Messenger *hb_front_serverm,
1875 Messenger *hb_back_serverm,
1876 Messenger *osdc_messenger,
1877 MonClient *mc,
1878 const std::string &dev, const std::string &jdev) :
1879 Dispatcher(cct_),
1880 osd_lock("OSD::osd_lock"),
1881 tick_timer(cct, osd_lock),
1882 tick_timer_lock("OSD::tick_timer_lock"),
1883 tick_timer_without_osd_lock(cct, tick_timer_lock),
1884 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
1885 cct->_conf->auth_supported.empty() ?
1886 cct->_conf->auth_cluster_required :
1887 cct->_conf->auth_supported)),
1888 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
1889 cct->_conf->auth_supported.empty() ?
1890 cct->_conf->auth_service_required :
1891 cct->_conf->auth_supported)),
1892 cluster_messenger(internal_messenger),
1893 client_messenger(external_messenger),
1894 objecter_messenger(osdc_messenger),
1895 monc(mc),
1896 mgrc(cct_, client_messenger),
1897 logger(NULL),
1898 recoverystate_perf(NULL),
1899 store(store_),
1900 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
1901 clog(log_client.create_channel()),
1902 whoami(id),
1903 dev_path(dev), journal_path(jdev),
1904 store_is_rotational(store->is_rotational()),
1905 trace_endpoint("0.0.0.0", 0, "osd"),
1906 asok_hook(NULL),
1907 osd_compat(get_osd_compat_set()),
1908 peering_tp(cct, "OSD::peering_tp", "tp_peering",
1909 cct->_conf->osd_peering_wq_threads,
1910 "osd_peering_tp_threads"),
1911 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
1912 get_num_op_threads()),
1913 disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
1914 command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
1915 session_waiting_lock("OSD::session_waiting_lock"),
1916 heartbeat_lock("OSD::heartbeat_lock"),
1917 heartbeat_stop(false),
1918 heartbeat_need_update(true),
1919 hb_front_client_messenger(hb_client_front),
1920 hb_back_client_messenger(hb_client_back),
1921 hb_front_server_messenger(hb_front_serverm),
1922 hb_back_server_messenger(hb_back_serverm),
1923 daily_loadavg(0.0),
1924 heartbeat_thread(this),
1925 heartbeat_dispatcher(this),
1926 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
1927 cct->_conf->osd_num_op_tracker_shard),
1928 test_ops_hook(NULL),
1929 op_queue(get_io_queue()),
1930 op_prio_cutoff(get_io_prio_cut()),
1931 op_shardedwq(
1932 get_num_op_shards(),
1933 this,
1934 cct->_conf->osd_op_thread_timeout,
1935 cct->_conf->osd_op_thread_suicide_timeout,
1936 &osd_op_tp),
1937 peering_wq(
1938 this,
1939 cct->_conf->osd_op_thread_timeout,
1940 cct->_conf->osd_op_thread_suicide_timeout,
1941 &peering_tp),
1942 map_lock("OSD::map_lock"),
1943 pg_map_lock("OSD::pg_map_lock"),
1944 last_pg_create_epoch(0),
1945 mon_report_lock("OSD::mon_report_lock"),
1946 stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
1947 up_thru_wanted(0),
1948 requested_full_first(0),
1949 requested_full_last(0),
1950 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
1951 osd_stat_updated(false),
1952 pg_stat_tid(0), pg_stat_tid_flushed(0),
1953 command_wq(
1954 this,
1955 cct->_conf->osd_command_thread_timeout,
1956 cct->_conf->osd_command_thread_suicide_timeout,
1957 &command_tp),
1958 remove_wq(
1959 cct,
1960 store,
1961 cct->_conf->osd_remove_thread_timeout,
1962 cct->_conf->osd_remove_thread_suicide_timeout,
1963 &disk_tp),
1964 service(this)
1965 {
1966 monc->set_messenger(client_messenger);
1967 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
1968 cct->_conf->osd_op_log_threshold);
1969 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
1970 cct->_conf->osd_op_history_duration);
1971 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
1972 cct->_conf->osd_op_history_slow_op_threshold);
1973 #ifdef WITH_BLKIN
1974 std::stringstream ss;
1975 ss << "osd." << whoami;
1976 trace_endpoint.copy_name(ss.str());
1977 #endif
1978 }
1979
1980 OSD::~OSD()
1981 {
1982 delete authorize_handler_cluster_registry;
1983 delete authorize_handler_service_registry;
1984 delete class_handler;
1985 cct->get_perfcounters_collection()->remove(recoverystate_perf);
1986 cct->get_perfcounters_collection()->remove(logger);
1987 delete recoverystate_perf;
1988 delete logger;
1989 delete store;
1990 }
1991
1992 void cls_initialize(ClassHandler *ch);
1993
1994 void OSD::handle_signal(int signum)
1995 {
1996 assert(signum == SIGINT || signum == SIGTERM);
1997 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
1998 shutdown();
1999 }
2000
2001 int OSD::pre_init()
2002 {
2003 Mutex::Locker lock(osd_lock);
2004 if (is_stopping())
2005 return 0;
2006
2007 if (store->test_mount_in_use()) {
2008 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2009 << "currently in use. (Is ceph-osd already running?)" << dendl;
2010 return -EBUSY;
2011 }
2012
2013 cct->_conf->add_observer(this);
2014 return 0;
2015 }
2016
2017 // asok
2018
2019 class OSDSocketHook : public AdminSocketHook {
2020 OSD *osd;
2021 public:
2022 explicit OSDSocketHook(OSD *o) : osd(o) {}
2023 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2024 bufferlist& out) override {
2025 stringstream ss;
2026 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
2027 out.append(ss);
2028 return r;
2029 }
2030 };
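// A minimal usage sketch (assumed CLI invocation): each admin_command
// handled below is reachable through the daemon's admin socket, e.g.
//   ceph daemon osd.0 status
//   ceph daemon osd.0 dump_ops_in_flight
// The command names must match the register_command() calls in
// OSD::final_init() below.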
2031
2032 bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
2033 ostream& ss)
2034 {
2035 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2036 if (admin_command == "status") {
2037 f->open_object_section("status");
2038 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2039 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2040 f->dump_unsigned("whoami", superblock.whoami);
2041 f->dump_string("state", get_state_name(get_state()));
2042 f->dump_unsigned("oldest_map", superblock.oldest_map);
2043 f->dump_unsigned("newest_map", superblock.newest_map);
2044 {
2045 RWLock::RLocker l(pg_map_lock);
2046 f->dump_unsigned("num_pgs", pg_map.size());
2047 }
2048 f->close_section();
2049 } else if (admin_command == "flush_journal") {
2050 store->flush_journal();
2051 } else if (admin_command == "dump_ops_in_flight" ||
2052 admin_command == "ops") {
2053 if (!op_tracker.dump_ops_in_flight(f)) {
2054 ss << "op_tracker tracking is not enabled, so no ops are currently tracked, not even stuck ones. \
2055 Please enable \"osd_enable_op_tracker\", and the tracker will start tracking new ops received afterwards.";
2056 }
2057 } else if (admin_command == "dump_blocked_ops") {
2058 if (!op_tracker.dump_ops_in_flight(f, true)) {
2059 ss << "op_tracker tracking is not enabled, so no ops are currently tracked, not even stuck ones. \
2060 Please enable \"osd_enable_op_tracker\", and the tracker will start tracking new ops received afterwards.";
2061 }
2062 } else if (admin_command == "dump_historic_ops") {
2063 if (!op_tracker.dump_historic_ops(f, false)) {
2064 ss << "op_tracker tracking is not enabled, so no ops are currently tracked, not even stuck ones. \
2065 Please enable \"osd_enable_op_tracker\", and the tracker will start tracking new ops received afterwards.";
2066 }
2067 } else if (admin_command == "dump_historic_ops_by_duration") {
2068 if (!op_tracker.dump_historic_ops(f, true)) {
2069 ss << "op_tracker tracking is not enabled, so no ops are currently tracked, not even stuck ones. \
2070 Please enable \"osd_enable_op_tracker\", and the tracker will start tracking new ops received afterwards.";
2071 }
2072 } else if (admin_command == "dump_historic_slow_ops") {
2073 if (!op_tracker.dump_historic_slow_ops(f)) {
2074 ss << "op_tracker tracking is not enabled, so no ops are currently tracked, not even stuck ones. \
2075 Please enable \"osd_enable_op_tracker\", and the tracker will start tracking new ops received afterwards.";
2076 }
2077 } else if (admin_command == "dump_op_pq_state") {
2078 f->open_object_section("pq");
2079 op_shardedwq.dump(f);
2080 f->close_section();
2081 } else if (admin_command == "dump_blacklist") {
2082 list<pair<entity_addr_t,utime_t> > bl;
2083 OSDMapRef curmap = service.get_osdmap();
2084
2085 f->open_array_section("blacklist");
2086 curmap->get_blacklist(&bl);
2087 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2088 it != bl.end(); ++it) {
2089 f->open_array_section("entry");
2090 f->open_object_section("entity_addr_t");
2091 it->first.dump(f);
2092 f->close_section(); //entity_addr_t
2093 it->second.localtime(f->dump_stream("expire_time"));
2094 f->close_section(); //entry
2095 }
2096 f->close_section(); //blacklist
2097 } else if (admin_command == "dump_watchers") {
2098 list<obj_watch_item_t> watchers;
2099 // scan PGs
2100 {
2101 Mutex::Locker l(osd_lock);
2102 RWLock::RLocker l2(pg_map_lock);
2103 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2104 it != pg_map.end();
2105 ++it) {
2106
2107 list<obj_watch_item_t> pg_watchers;
2108 PG *pg = it->second;
2109 pg->lock();
2110 pg->get_watchers(pg_watchers);
2111 pg->unlock();
2112 watchers.splice(watchers.end(), pg_watchers);
2113 }
2114 }
2115
2116 f->open_array_section("watchers");
2117 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2118 it != watchers.end(); ++it) {
2119
2120 f->open_array_section("watch");
2121
2122 f->dump_string("namespace", it->obj.nspace);
2123 f->dump_string("object", it->obj.oid.name);
2124
2125 f->open_object_section("entity_name");
2126 it->wi.name.dump(f);
2127 f->close_section(); //entity_name_t
2128
2129 f->dump_int("cookie", it->wi.cookie);
2130 f->dump_int("timeout", it->wi.timeout_seconds);
2131
2132 f->open_object_section("entity_addr_t");
2133 it->wi.addr.dump(f);
2134 f->close_section(); //entity_addr_t
2135
2136 f->close_section(); //watch
2137 }
2138
2139 f->close_section(); //watchers
2140 } else if (admin_command == "dump_reservations") {
2141 f->open_object_section("reservations");
2142 f->open_object_section("local_reservations");
2143 service.local_reserver.dump(f);
2144 f->close_section();
2145 f->open_object_section("remote_reservations");
2146 service.remote_reserver.dump(f);
2147 f->close_section();
2148 f->close_section();
2149 } else if (admin_command == "get_latest_osdmap") {
2150 get_latest_osdmap();
2151 } else if (admin_command == "heap") {
2152 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2153
2154 // Note: Failed heap profile commands won't necessarily trigger an error:
2155 f->open_object_section("result");
2156 f->dump_string("error", cpp_strerror(result));
2157 f->dump_bool("success", result >= 0);
2158 f->close_section();
2159 } else if (admin_command == "set_heap_property") {
2160 string property;
2161 int64_t value = 0;
2162 string error;
2163 bool success = false;
2164 if (!cmd_getval(cct, cmdmap, "property", property)) {
2165 error = "unable to get property";
2166 success = false;
2167 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2168 error = "unable to get value";
2169 success = false;
2170 } else if (value < 0) {
2171 error = "negative value not allowed";
2172 success = false;
2173 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2174 error = "invalid property";
2175 success = false;
2176 } else {
2177 success = true;
2178 }
2179 f->open_object_section("result");
2180 f->dump_string("error", error);
2181 f->dump_bool("success", success);
2182 f->close_section();
2183 } else if (admin_command == "get_heap_property") {
2184 string property;
2185 size_t value = 0;
2186 string error;
2187 bool success = false;
2188 if (!cmd_getval(cct, cmdmap, "property", property)) {
2189 error = "unable to get property";
2190 success = false;
2191 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2192 error = "invalid property";
2193 success = false;
2194 } else {
2195 success = true;
2196 }
2197 f->open_object_section("result");
2198 f->dump_string("error", error);
2199 f->dump_bool("success", success);
2200 f->dump_int("value", value);
2201 f->close_section();
2202 } else if (admin_command == "dump_objectstore_kv_stats") {
2203 store->get_db_statistics(f);
2204 } else if (admin_command == "dump_scrubs") {
2205 service.dumps_scrub(f);
2206 } else if (admin_command == "calc_objectstore_db_histogram") {
2207 store->generate_db_histogram(f);
2208 } else if (admin_command == "flush_store_cache") {
2209 store->flush_cache();
2210 } else if (admin_command == "dump_pgstate_history") {
2211 f->open_object_section("pgstate_history");
2212 RWLock::RLocker l2(pg_map_lock);
2213 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2214 it != pg_map.end();
2215 ++it) {
2216
2217 PG *pg = it->second;
2218 f->dump_stream("pg") << pg->get_pgid();
2219 pg->lock();
2220 pg->pgstate_history.dump(f);
2221 pg->unlock();
2222 }
2223 f->close_section();
2224 } else {
2225 assert(0 == "broken asok registration");
2226 }
2227 f->flush(ss);
2228 delete f;
2229 return true;
2230 }
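// Example "status" output (values are illustrative, not from a real run):
//   {
//       "cluster_fsid": "a7f64266-0894-4f1e-a635-d0aeaca0e993",
//       "osd_fsid": "88ab9018-6d6c-4ee4-8f6e-fbcf047a2ca4",
//       "whoami": 0,
//       "state": "active",
//       "oldest_map": 1,
//       "newest_map": 42,
//       "num_pgs": 128
//   }
// The heap property commands follow the same result shape: a "result"
// object carrying "error", "success", and (for get) "value".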
2231
2232 class TestOpsSocketHook : public AdminSocketHook {
2233 OSDService *service;
2234 ObjectStore *store;
2235 public:
2236 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2237 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2238 bufferlist& out) override {
2239 stringstream ss;
2240 test_ops(service, store, command, cmdmap, ss);
2241 out.append(ss);
2242 return true;
2243 }
2244 void test_ops(OSDService *service, ObjectStore *store,
2245 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2246
2247 };
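// Usage sketch (assumed CLI invocation): the test ops share the same admin
// socket transport, e.g.
//   ceph daemon osd.0 setomapval <pool> <objname> <key> <val>
//   ceph daemon osd.0 injectdataerr <pool> <objname>
// See the register_command() calls against test_ops_hook in final_init().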
2248
2249 class OSD::C_Tick : public Context {
2250 OSD *osd;
2251 public:
2252 explicit C_Tick(OSD *o) : osd(o) {}
2253 void finish(int r) override {
2254 osd->tick();
2255 }
2256 };
2257
2258 class OSD::C_Tick_WithoutOSDLock : public Context {
2259 OSD *osd;
2260 public:
2261 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2262 void finish(int r) override {
2263 osd->tick_without_osd_lock();
2264 }
2265 };
2266
2267 int OSD::enable_disable_fuse(bool stop)
2268 {
2269 #ifdef HAVE_LIBFUSE
2270 int r;
2271 string mntpath = cct->_conf->osd_data + "/fuse";
2272 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2273 dout(1) << __func__ << " disabling" << dendl;
2274 fuse_store->stop();
2275 delete fuse_store;
2276 fuse_store = NULL;
2277 r = ::rmdir(mntpath.c_str());
2278 if (r < 0)
2279 r = -errno;
2280 if (r < 0) {
2281 derr << __func__ << " failed to rmdir " << mntpath << dendl;
2282 return r;
2283 }
2284 return 0;
2285 }
2286 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2287 dout(1) << __func__ << " enabling" << dendl;
2288 r = ::mkdir(mntpath.c_str(), 0700);
2289 if (r < 0)
2290 r = -errno;
2291 if (r < 0 && r != -EEXIST) {
2292 derr << __func__ << " unable to create " << mntpath << ": "
2293 << cpp_strerror(r) << dendl;
2294 return r;
2295 }
2296 fuse_store = new FuseStore(store, mntpath);
2297 r = fuse_store->start();
2298 if (r < 0) {
2299 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2300 delete fuse_store;
2301 fuse_store = NULL;
2302 return r;
2303 }
2304 }
2305 #endif // HAVE_LIBFUSE
2306 return 0;
2307 }
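// Behaviour summary: with osd_objectstore_fuse = true, the object store is
// exposed as a FUSE filesystem at <osd_data>/fuse for debugging; flipping
// the option off (or passing stop = true during shutdown) stops the
// FuseStore and removes the mountpoint again. The mountpoint path is an
// implementation detail, not a stable interface.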
2308
2309 int OSD::get_num_op_shards()
2310 {
2311 if (cct->_conf->osd_op_num_shards)
2312 return cct->_conf->osd_op_num_shards;
2313 if (store_is_rotational)
2314 return cct->_conf->osd_op_num_shards_hdd;
2315 else
2316 return cct->_conf->osd_op_num_shards_ssd;
2317 }
2318
2319 int OSD::get_num_op_threads()
2320 {
2321 if (cct->_conf->osd_op_num_threads_per_shard)
2322 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2323 if (store_is_rotational)
2324 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2325 else
2326 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2327 }
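// Worked example (assuming luminous-era defaults: osd_op_num_shards_hdd = 5,
// osd_op_num_threads_per_shard_hdd = 1, osd_op_num_shards_ssd = 8,
// osd_op_num_threads_per_shard_ssd = 2): a rotational store runs
// 5 shards * 1 thread = 5 op worker threads, an SSD store 8 * 2 = 16.
// Setting osd_op_num_shards / osd_op_num_threads_per_shard explicitly
// overrides the media-specific values.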
2328
2329 int OSD::init()
2330 {
2331 CompatSet initial, diff;
2332 Mutex::Locker lock(osd_lock);
2333 if (is_stopping())
2334 return 0;
2335
2336 tick_timer.init();
2337 tick_timer_without_osd_lock.init();
2338 service.recovery_request_timer.init();
2339 service.recovery_sleep_timer.init();
2340
2341 // mount.
2342 dout(2) << "init " << dev_path
2343 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2344 << dendl;
2345 assert(store); // call pre_init() first!
2346
2347 store->set_cache_shards(get_num_op_shards());
2348
2349 int r = store->mount();
2350 if (r < 0) {
2351 derr << "OSD::init: unable to mount object store" << dendl;
2352 return r;
2353 }
2354
2355 enable_disable_fuse(false);
2356
2357 dout(2) << "boot" << dendl;
2358
2359 // initialize the daily loadavg with current 15min loadavg
2360 double loadavgs[3];
2361 if (getloadavg(loadavgs, 3) == 3) {
2362 daily_loadavg = loadavgs[2];
2363 } else {
2364 derr << "OSD::init(): couldn't read loadavgs" << dendl;
2365 daily_loadavg = 1.0;
2366 }
2367
2368 int rotating_auth_attempts = 0;
2369
2370 // sanity check long object name handling
2371 {
2372 hobject_t l;
2373 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2374 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2375 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2376 r = store->validate_hobject_key(l);
2377 if (r < 0) {
2378 derr << "backend (" << store->get_type() << ") is unable to support max "
2379 << "object name[space] len" << dendl;
2380 derr << " osd max object name len = "
2381 << cct->_conf->osd_max_object_name_len << dendl;
2382 derr << " osd max object namespace len = "
2383 << cct->_conf->osd_max_object_namespace_len << dendl;
2384 derr << cpp_strerror(r) << dendl;
2385 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2386 goto out;
2387 }
2388 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2389 << dendl;
2390 } else {
2391 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
2392 }
2393 }
2394
2395 // read superblock
2396 r = read_superblock();
2397 if (r < 0) {
2398 derr << "OSD::init(): unable to read osd superblock" << dendl;
2399 r = -EINVAL;
2400 goto out;
2401 }
2402
2403 if (osd_compat.compare(superblock.compat_features) < 0) {
2404 derr << "The disk uses features unsupported by the executable." << dendl;
2405 derr << " ondisk features " << superblock.compat_features << dendl;
2406 derr << " daemon features " << osd_compat << dendl;
2407
2408 if (osd_compat.writeable(superblock.compat_features)) {
2409 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2410 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2411 r = -EOPNOTSUPP;
2412 goto out;
2413 }
2414 else {
2415 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2416 derr << "Cannot write to disk! Missing features: " << diff << dendl;
2417 r = -EOPNOTSUPP;
2418 goto out;
2419 }
2420 }
2421
2422 assert_warn(whoami == superblock.whoami);
2423 if (whoami != superblock.whoami) {
2424 derr << "OSD::init: superblock says osd."
2425 << superblock.whoami << " but I am osd." << whoami << dendl;
2426 r = -EINVAL;
2427 goto out;
2428 }
2429
2430 initial = get_osd_initial_compat_set();
2431 diff = superblock.compat_features.unsupported(initial);
2432 if (superblock.compat_features.merge(initial)) {
2433 // We need to persist the new compat_set before we
2434 // do anything else
2435 dout(5) << "Upgrading superblock adding: " << diff << dendl;
2436 ObjectStore::Transaction t;
2437 write_superblock(t);
2438 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2439 if (r < 0)
2440 goto out;
2441 }
2442
2443 // make sure snap mapper object exists
2444 if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2445 dout(10) << "init creating/touching snapmapper object" << dendl;
2446 ObjectStore::Transaction t;
2447 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2448 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2449 if (r < 0)
2450 goto out;
2451 }
2452
2453 class_handler = new ClassHandler(cct);
2454 cls_initialize(class_handler);
2455
2456 if (cct->_conf->osd_open_classes_on_start) {
2457 int r = class_handler->open_all_classes();
2458 if (r)
2459 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
2460 }
2461
2462 // load up "current" osdmap
2463 assert_warn(!osdmap);
2464 if (osdmap) {
2465 derr << "OSD::init: unable to read current osdmap" << dendl;
2466 r = -EINVAL;
2467 goto out;
2468 }
2469 osdmap = get_map(superblock.current_epoch);
2470 check_osdmap_features(store);
2471
2472 create_recoverystate_perf();
2473
2474 {
2475 epoch_t bind_epoch = osdmap->get_epoch();
2476 service.set_epochs(NULL, NULL, &bind_epoch);
2477 }
2478
2479 clear_temp_objects();
2480
2481 // load up pgs (as they previously existed)
2482 load_pgs();
2483
2484 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
2485 dout(0) << "using " << op_queue << " op queue with priority op cutoff at " <<
2486 op_prio_cutoff << "." << dendl;
2487
2488 create_logger();
2489
2490 // i'm ready!
2491 client_messenger->add_dispatcher_head(this);
2492 cluster_messenger->add_dispatcher_head(this);
2493
2494 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2495 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2496 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2497 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2498
2499 objecter_messenger->add_dispatcher_head(service.objecter);
2500
2501 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
2502 | CEPH_ENTITY_TYPE_MGR);
2503 r = monc->init();
2504 if (r < 0)
2505 goto out;
2506
2507 /**
2508 * FIXME: this is a placeholder implementation that unconditionally
2509 * sends every is_primary PG's stats every time we're called, unlike
2510 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2511 * This has equivalent cost to the existing worst case where all
2512 * PGs are busy and their stats are always enqueued for sending.
2513 */
2514 mgrc.set_pgstats_cb([this](){
2515 RWLock::RLocker l(map_lock);
2516
2517 utime_t had_for = ceph_clock_now() - had_map_since;
2518 osd_stat_t cur_stat = service.get_osd_stat();
2519 cur_stat.os_perf_stat = store->get_cur_stats();
2520
2521 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
2522 m->osd_stat = cur_stat;
2523
2524 Mutex::Locker lec{min_last_epoch_clean_lock};
2525 min_last_epoch_clean = osdmap->get_epoch();
2526 min_last_epoch_clean_pgs.clear();
2527 RWLock::RLocker lpg(pg_map_lock);
2528 for (const auto &i : pg_map) {
2529 PG *pg = i.second;
2530 if (!pg->is_primary()) {
2531 continue;
2532 }
2533
2534 pg->pg_stats_publish_lock.Lock();
2535 if (pg->pg_stats_publish_valid) {
2536 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
2537 const auto pg_lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
2538 min_last_epoch_clean = min(min_last_epoch_clean, pg_lec);
2539 min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
2540 }
2541 pg->pg_stats_publish_lock.Unlock();
2542 }
2543
2544 return m;
2545 });
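// Note (behaviour assumed from the callback contract): MgrClient invokes
// this callback on each periodic stats report, so the lambda builds and
// returns a fresh MPGStats message every time; the invoking MgrClient is
// responsible for sending and releasing it.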
2546
2547 mgrc.init();
2548 client_messenger->add_dispatcher_head(&mgrc);
2549
2550 // tell monc about log_client so it will know about mon session resets
2551 monc->set_log_client(&log_client);
2552 update_log_config();
2553
2554 peering_tp.start();
2555 osd_op_tp.start();
2556 disk_tp.start();
2557 command_tp.start();
2558
2559 set_disk_tp_priority();
2560
2561 // start the heartbeat
2562 heartbeat_thread.create("osd_srv_heartbt");
2563
2564 // tick
2565 tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
2566 {
2567 Mutex::Locker l(tick_timer_lock);
2568 tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
2569 }
2570
2571 service.init();
2572 service.publish_map(osdmap);
2573 service.publish_superblock(superblock);
2574 service.max_oldest_map = superblock.oldest_map;
2575
2576 osd_lock.Unlock();
2577
2578 r = monc->authenticate();
2579 if (r < 0) {
2580 osd_lock.Lock(); // locker is going to unlock this on function exit
2581 if (is_stopping())
2582 r = 0;
2583 goto monout;
2584 }
2585
2586 while (monc->wait_auth_rotating(30.0) < 0) {
2587 derr << "unable to obtain rotating service keys; retrying" << dendl;
2588 ++rotating_auth_attempts;
2589 if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
2590 osd_lock.Lock(); // make locker happy
2591 if (!is_stopping()) {
2592 r = -ETIMEDOUT;
2593 }
2594 goto monout;
2595 }
2596 }
2597
2598 r = update_crush_device_class();
2599 if (r < 0) {
2600 osd_lock.Lock();
2601 goto monout;
2602 }
2603
2604 r = update_crush_location();
2605 if (r < 0) {
2606 osd_lock.Lock();
2607 goto monout;
2608 }
2609
2610 osd_lock.Lock();
2611 if (is_stopping())
2612 return 0;
2613
2614 // start objecter *after* we have authenticated, so that we don't ignore
2615 // the OSDMaps it requests.
2616 service.final_init();
2617
2618 check_config();
2619
2620 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
2621 consume_map();
2622 peering_wq.drain();
2623
2624 dout(0) << "done with init, starting boot process" << dendl;
2625
2626 // subscribe to any pg creations
2627 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
2628
2629 // MgrClient needs this (it doesn't have a MonClient reference itself)
2630 monc->sub_want("mgrmap", 0, 0);
2631
2632 // we don't need to ask for an osdmap here; the objecter will do that
2633 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2634
2635 monc->renew_subs();
2636
2637 start_boot();
2638
2639 return 0;
2640 monout:
2641 mgrc.shutdown();
2642 monc->shutdown();
2643
2644 out:
2645 enable_disable_fuse(true);
2646 store->umount();
2647 delete store;
2648 store = NULL;
2649 return r;
2650 }
2651
2652 void OSD::final_init()
2653 {
2654 AdminSocket *admin_socket = cct->get_admin_socket();
2655 asok_hook = new OSDSocketHook(this);
2656 int r = admin_socket->register_command("status", "status", asok_hook,
2657 "high-level status of OSD");
2658 assert(r == 0);
2659 r = admin_socket->register_command("flush_journal", "flush_journal",
2660 asok_hook,
2661 "flush the journal to permanent store");
2662 assert(r == 0);
2663 r = admin_socket->register_command("dump_ops_in_flight",
2664 "dump_ops_in_flight", asok_hook,
2665 "show the ops currently in flight");
2666 assert(r == 0);
2667 r = admin_socket->register_command("ops",
2668 "ops", asok_hook,
2669 "show the ops currently in flight");
2670 assert(r == 0);
2671 r = admin_socket->register_command("dump_blocked_ops",
2672 "dump_blocked_ops", asok_hook,
2673 "show the blocked ops currently in flight");
2674 assert(r == 0);
2675 r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops",
2676 asok_hook,
2677 "show recent ops");
2678 assert(r == 0);
2679 r = admin_socket->register_command("dump_historic_slow_ops", "dump_historic_slow_ops",
2680 asok_hook,
2681 "show slowest recent ops");
2682 assert(r == 0);
2683 r = admin_socket->register_command("dump_historic_ops_by_duration", "dump_historic_ops_by_duration",
2684 asok_hook,
2685 "show slowest recent ops, sorted by duration");
2686 assert(r == 0);
2687 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2688 asok_hook,
2689 "dump op priority queue state");
2690 assert(r == 0);
2691 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2692 asok_hook,
2693 "dump blacklisted clients and times");
2694 assert(r == 0);
2695 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2696 asok_hook,
2697 "show clients which have active watches,"
2698 " and on which objects");
2699 assert(r == 0);
2700 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2701 asok_hook,
2702 "show recovery reservations");
2703 assert(r == 0);
2704 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2705 asok_hook,
2706 "force osd to update the latest map from "
2707 "the mon");
2708 assert(r == 0);
2709
2710 r = admin_socket->register_command( "heap",
2711 "heap " \
2712 "name=heapcmd,type=CephString",
2713 asok_hook,
2714 "show heap usage info (available only if "
2715 "compiled with tcmalloc)");
2716 assert(r == 0);
2717
2718 r = admin_socket->register_command("set_heap_property",
2719 "set_heap_property " \
2720 "name=property,type=CephString " \
2721 "name=value,type=CephInt",
2722 asok_hook,
2723 "update malloc extension heap property");
2724 assert(r == 0);
2725
2726 r = admin_socket->register_command("get_heap_property",
2727 "get_heap_property " \
2728 "name=property,type=CephString",
2729 asok_hook,
2730 "get malloc extension heap property");
2731 assert(r == 0);
2732
2733 r = admin_socket->register_command("dump_objectstore_kv_stats",
2734 "dump_objectstore_kv_stats",
2735 asok_hook,
2736 "print statistics of kvdb which used by bluestore");
2737 assert(r == 0);
2738
2739 r = admin_socket->register_command("dump_scrubs",
2740 "dump_scrubs",
2741 asok_hook,
2742 "print scheduled scrubs");
2743 assert(r == 0);
2744
2745 r = admin_socket->register_command("calc_objectstore_db_histogram",
2746 "calc_objectstore_db_histogram",
2747 asok_hook,
2748 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2749 assert(r == 0);
2750
2751 r = admin_socket->register_command("flush_store_cache",
2752 "flush_store_cache",
2753 asok_hook,
2754 "Flush bluestore internal cache");
2755 assert(r == 0);
2756 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2757 asok_hook,
2758 "show recent state history");
2759 assert(r == 0);
2760
2761 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2762 // Note: pools are CephString instead of CephPoolname because
2763 // these commands traditionally support both pool names and numbers
2764 r = admin_socket->register_command(
2765 "setomapval",
2766 "setomapval " \
2767 "name=pool,type=CephString " \
2768 "name=objname,type=CephObjectname " \
2769 "name=key,type=CephString "\
2770 "name=val,type=CephString",
2771 test_ops_hook,
2772 "set omap key");
2773 assert(r == 0);
2774 r = admin_socket->register_command(
2775 "rmomapkey",
2776 "rmomapkey " \
2777 "name=pool,type=CephString " \
2778 "name=objname,type=CephObjectname " \
2779 "name=key,type=CephString",
2780 test_ops_hook,
2781 "remove omap key");
2782 assert(r == 0);
2783 r = admin_socket->register_command(
2784 "setomapheader",
2785 "setomapheader " \
2786 "name=pool,type=CephString " \
2787 "name=objname,type=CephObjectname " \
2788 "name=header,type=CephString",
2789 test_ops_hook,
2790 "set omap header");
2791 assert(r == 0);
2792
2793 r = admin_socket->register_command(
2794 "getomap",
2795 "getomap " \
2796 "name=pool,type=CephString " \
2797 "name=objname,type=CephObjectname",
2798 test_ops_hook,
2799 "output entire object map");
2800 assert(r == 0);
2801
2802 r = admin_socket->register_command(
2803 "truncobj",
2804 "truncobj " \
2805 "name=pool,type=CephString " \
2806 "name=objname,type=CephObjectname " \
2807 "name=len,type=CephInt",
2808 test_ops_hook,
2809 "truncate object to length");
2810 assert(r == 0);
2811
2812 r = admin_socket->register_command(
2813 "injectdataerr",
2814 "injectdataerr " \
2815 "name=pool,type=CephString " \
2816 "name=objname,type=CephObjectname " \
2817 "name=shardid,type=CephInt,req=false,range=0|255",
2818 test_ops_hook,
2819 "inject data error to an object");
2820 assert(r == 0);
2821
2822 r = admin_socket->register_command(
2823 "injectmdataerr",
2824 "injectmdataerr " \
2825 "name=pool,type=CephString " \
2826 "name=objname,type=CephObjectname " \
2827 "name=shardid,type=CephInt,req=false,range=0|255",
2828 test_ops_hook,
2829 "inject metadata error to an object");
2830 assert(r == 0);
2831 r = admin_socket->register_command(
2832 "set_recovery_delay",
2833 "set_recovery_delay " \
2834 "name=utime,type=CephInt,req=false",
2835 test_ops_hook,
2836 "Delay osd recovery by specified seconds");
2837 assert(r == 0);
2838 r = admin_socket->register_command(
2839 "trigger_scrub",
2840 "trigger_scrub " \
2841 "name=pgid,type=CephString ",
2842 test_ops_hook,
2843 "Trigger a scheduled scrub ");
2844 assert(r == 0);
2845 r = admin_socket->register_command(
2846 "injectfull",
2847 "injectfull " \
2848 "name=type,type=CephString,req=false " \
2849 "name=count,type=CephInt,req=false ",
2850 test_ops_hook,
2851 "Inject a full disk (optional count times)");
2852 assert(r == 0);
2853 }
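// For reference (assumed admin-socket behaviour): everything registered
// above is discoverable at runtime through the socket's built-in help
// listing, e.g.
//   ceph daemon osd.<id> help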
2854
2855 void OSD::create_logger()
2856 {
2857 dout(10) << "create_logger" << dendl;
2858
2859 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
2860
2861 // Latency axis configuration for op histograms, values are in nanoseconds
2862 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
2863 "Latency (usec)",
2864 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
2865 0, ///< Start at 0
2866 100000, ///< Quantization unit is 100usec
2867 32, ///< Enough buckets to cover latencies well beyond slow-request thresholds
2868 };
2869
2870 // Op size axis configuration for op histograms, values are in bytes
2871 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
2872 "Request size (bytes)",
2873 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
2874 0, ///< Start at 0
2875 512, ///< Quantization unit is 512 bytes
2876 32, ///< Enough buckets to cover requests larger than a GB
2877 };
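// Example bucketing sketch (assuming SCALE_LOG2 doubles the bucket width at
// each step): with a 100000 ns quantization unit, the latency axis covers
// roughly [0, 100us), [100us, 200us), [200us, 400us), ... across its 32
// buckets; the size axis does the same starting from 512-byte granularity.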
2878
2879
2880 osd_plb.add_u64(
2881 l_osd_op_wip, "op_wip",
2882 "Replication operations currently being processed (primary)");
2883 osd_plb.add_u64_counter(
2884 l_osd_op, "op",
2885 "Client operations",
2886 "ops", PerfCountersBuilder::PRIO_CRITICAL);
2887 osd_plb.add_u64_counter(
2888 l_osd_op_inb, "op_in_bytes",
2889 "Client operations total write size",
2890 "wr", PerfCountersBuilder::PRIO_INTERESTING);
2891 osd_plb.add_u64_counter(
2892 l_osd_op_outb, "op_out_bytes",
2893 "Client operations total read size",
2894 "rd", PerfCountersBuilder::PRIO_INTERESTING);
2895 osd_plb.add_time_avg(
2896 l_osd_op_lat, "op_latency",
2897 "Latency of client operations (including queue time)",
2898 "l", 9);
2899 osd_plb.add_time_avg(
2900 l_osd_op_process_lat, "op_process_latency",
2901 "Latency of client operations (excluding queue time)");
2902 osd_plb.add_time_avg(
2903 l_osd_op_prepare_lat, "op_prepare_latency",
2904 "Latency of client operations (excluding queue time and wait for finished)");
2905
2906 osd_plb.add_u64_counter(
2907 l_osd_op_r, "op_r", "Client read operations");
2908 osd_plb.add_u64_counter(
2909 l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
2910 osd_plb.add_time_avg(
2911 l_osd_op_r_lat, "op_r_latency",
2912 "Latency of read operation (including queue time)");
2913 osd_plb.add_u64_counter_histogram(
2914 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
2915 op_hist_x_axis_config, op_hist_y_axis_config,
2916 "Histogram of operation latency (including queue time) + data read");
2917 osd_plb.add_time_avg(
2918 l_osd_op_r_process_lat, "op_r_process_latency",
2919 "Latency of read operation (excluding queue time)");
2920 osd_plb.add_time_avg(
2921 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
2922 "Latency of read operations (excluding queue time and wait for finished)");
2923 osd_plb.add_u64_counter(
2924 l_osd_op_w, "op_w", "Client write operations");
2925 osd_plb.add_u64_counter(
2926 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
2927 osd_plb.add_time_avg(
2928 l_osd_op_w_lat, "op_w_latency",
2929 "Latency of write operation (including queue time)");
2930 osd_plb.add_u64_counter_histogram(
2931 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
2932 op_hist_x_axis_config, op_hist_y_axis_config,
2933 "Histogram of operation latency (including queue time) + data written");
2934 osd_plb.add_time_avg(
2935 l_osd_op_w_process_lat, "op_w_process_latency",
2936 "Latency of write operation (excluding queue time)");
2937 osd_plb.add_time_avg(
2938 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
2939 "Latency of write operations (excluding queue time and wait for finished)");
2940 osd_plb.add_u64_counter(
2941 l_osd_op_rw, "op_rw",
2942 "Client read-modify-write operations");
2943 osd_plb.add_u64_counter(
2944 l_osd_op_rw_inb, "op_rw_in_bytes",
2945 "Client read-modify-write operations write in");
2946 osd_plb.add_u64_counter(
2947 l_osd_op_rw_outb, "op_rw_out_bytes",
2948 "Client read-modify-write operations read out");
2949 osd_plb.add_time_avg(
2950 l_osd_op_rw_lat, "op_rw_latency",
2951 "Latency of read-modify-write operation (including queue time)");
2952 osd_plb.add_u64_counter_histogram(
2953 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
2954 op_hist_x_axis_config, op_hist_y_axis_config,
2955 "Histogram of rw operation latency (including queue time) + data written");
2956 osd_plb.add_u64_counter_histogram(
2957 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
2958 op_hist_x_axis_config, op_hist_y_axis_config,
2959 "Histogram of rw operation latency (including queue time) + data read");
2960 osd_plb.add_time_avg(
2961 l_osd_op_rw_process_lat, "op_rw_process_latency",
2962 "Latency of read-modify-write operation (excluding queue time)");
2963 osd_plb.add_time_avg(
2964 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
2965 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
2966
2967 osd_plb.add_u64_counter(
2968 l_osd_sop, "subop", "Suboperations");
2969 osd_plb.add_u64_counter(
2970 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
2971 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
2972
2973 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
2974 osd_plb.add_u64_counter(
2975 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
2976 osd_plb.add_time_avg(
2977 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
2978 osd_plb.add_u64_counter(
2979 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
2980 osd_plb.add_time_avg(
2981 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
2982 osd_plb.add_u64_counter(
2983 l_osd_sop_push, "subop_push", "Suboperations push messages");
2984 osd_plb.add_u64_counter(
2985 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
2986 osd_plb.add_time_avg(
2987 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
2988
2989 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
2990 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
2991 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
2992
2993 osd_plb.add_u64_counter(
2994 l_osd_rop, "recovery_ops",
2995 "Started recovery operations",
2996 "rop", PerfCountersBuilder::PRIO_INTERESTING);
2997
2998 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
2999 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
3000 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
3001 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3002 osd_plb.add_u64(
3003 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3004 osd_plb.add_u64(
3005 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3006 "Total number getting crc from crc_cache with adjusting");
3007 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3008 "Total number of crc cache misses");
3009
3010 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3011 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3012 osd_plb.add_u64(
3013 l_osd_pg_primary, "numpg_primary",
3014 "Placement groups for which this osd is primary");
3015 osd_plb.add_u64(
3016 l_osd_pg_replica, "numpg_replica",
3017 "Placement groups for which this osd is replica");
3018 osd_plb.add_u64(
3019 l_osd_pg_stray, "numpg_stray",
3020 "Placement groups ready to be deleted from this osd");
3021 osd_plb.add_u64(
3022 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3023 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3024 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3025 osd_plb.add_u64_counter(
3026 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3027 osd_plb.add_u64_counter(
3028 l_osd_waiting_for_map, "messages_delayed_for_map",
3029 "Operations waiting for OSD map");
3030
3031 osd_plb.add_u64_counter(
3032 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3033 osd_plb.add_u64_counter(
3034 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3035 osd_plb.add_u64_counter(
3036 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3037 "osdmap cache miss below cache lower bound");
3038 osd_plb.add_u64_avg(
3039 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3040 "osdmap cache miss, avg distance below cache lower bound");
3041 osd_plb.add_u64_counter(
3042 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3043 "OSDMap buffer cache hits");
3044 osd_plb.add_u64_counter(
3045 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3046 "OSDMap buffer cache misses");
3047
3048 osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
3049 osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
3050 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
3051
3052 osd_plb.add_u64_counter(
3053 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3054
3055 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3056 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3057 osd_plb.add_u64_counter(
3058 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3059 osd_plb.add_u64_counter(
3060 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3061 osd_plb.add_u64_counter(
3062 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3063 "Failed tier flush attempts");
3064 osd_plb.add_u64_counter(
3065 l_osd_tier_evict, "tier_evict", "Tier evictions");
3066 osd_plb.add_u64_counter(
3067 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3068 osd_plb.add_u64_counter(
3069 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3070 osd_plb.add_u64_counter(
3071 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3072 osd_plb.add_u64_counter(
3073 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3074 osd_plb.add_u64_counter(
3075 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3076 osd_plb.add_u64_counter(
3077 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3078
3079 osd_plb.add_u64_counter(
3080 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3081 osd_plb.add_u64_counter(
3082 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3083 osd_plb.add_u64_counter(
3084 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3085 osd_plb.add_u64_counter(
3086 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3087
3088 osd_plb.add_u64_counter(
3089 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3090 osd_plb.add_u64_counter(
3091 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3092
3093 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3094 osd_plb.add_time_avg(
3095 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3096 osd_plb.add_time_avg(
3097 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3098 osd_plb.add_time_avg(
3099 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3100
3101 osd_plb.add_u64_counter(
3102 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3103 osd_plb.add_u64_counter(
3104 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3105 "PG updated its info using fastinfo attr");
3106 osd_plb.add_u64_counter(
3107 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3108
3109 logger = osd_plb.create_perf_counters();
3110 cct->get_perfcounters_collection()->add(logger);
3111 }
3112
3113 void OSD::create_recoverystate_perf()
3114 {
3115 dout(10) << "create_recoverystate_perf" << dendl;
3116
3117 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3118
3119 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3120 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3121 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3122 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3123 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3124 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3125 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3126 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3127 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3128 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3129 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3130 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3131 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3132 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3133 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3134 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3135 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3136 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3137 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3138 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3139 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3140 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3141 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3142 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3143 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3144 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3145 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3146 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3147 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3148 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3149 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3150
3151 recoverystate_perf = rs_perf.create_perf_counters();
3152 cct->get_perfcounters_collection()->add(recoverystate_perf);
3153 }
3154
3155 int OSD::shutdown()
3156 {
3157 if (!service.prepare_to_stop())
3158 return 0; // already shutting down
3159 osd_lock.Lock();
3160 if (is_stopping()) {
3161 osd_lock.Unlock();
3162 return 0;
3163 }
3164 derr << "shutdown" << dendl;
3165
3166 set_state(STATE_STOPPING);
3167
3168 // Debugging
3169 cct->_conf->set_val("debug_osd", "100");
3170 cct->_conf->set_val("debug_journal", "100");
3171 cct->_conf->set_val("debug_filestore", "100");
3172 cct->_conf->set_val("debug_ms", "100");
3173 cct->_conf->apply_changes(NULL);
3174
3175 // stop MgrClient earlier as it's more like an internal consumer of OSD
3176 mgrc.shutdown();
3177
3178 service.start_shutdown();
3179
3180 // stop sending work to pgs. this just prevents any new work in _process
3181 // from racing with on_shutdown and potentially entering the pg after.
3182 op_shardedwq.drain();
3183
3184 // Shutdown PGs
3185 {
3186 RWLock::RLocker l(pg_map_lock);
3187 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3188 p != pg_map.end();
3189 ++p) {
3190 dout(20) << " kicking pg " << p->first << dendl;
3191 p->second->lock();
3192 p->second->on_shutdown();
3193 p->second->unlock();
3194 p->second->osr->flush();
3195 }
3196 }
3197 clear_pg_stat_queue();
3198
3199 // drain op queue again (in case PGs requeued something)
3200 op_shardedwq.drain();
3201 {
3202 finished.clear(); // zap waiters (bleh, this is messy)
3203 }
3204
3205 op_shardedwq.clear_pg_slots();
3206
3207 // unregister commands
3208 cct->get_admin_socket()->unregister_command("status");
3209 cct->get_admin_socket()->unregister_command("flush_journal");
3210 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3211 cct->get_admin_socket()->unregister_command("ops");
3212 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3213 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3214 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3215 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3216 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3217 cct->get_admin_socket()->unregister_command("dump_blacklist");
3218 cct->get_admin_socket()->unregister_command("dump_watchers");
3219 cct->get_admin_socket()->unregister_command("dump_reservations");
3220 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
3221 cct->get_admin_socket()->unregister_command("set_heap_property");
3222 cct->get_admin_socket()->unregister_command("get_heap_property");
3223 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
3224 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3225 cct->get_admin_socket()->unregister_command("flush_store_cache");
3226 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
3227 delete asok_hook;
3228 asok_hook = NULL;
3229
3230 cct->get_admin_socket()->unregister_command("setomapval");
3231 cct->get_admin_socket()->unregister_command("rmomapkey");
3232 cct->get_admin_socket()->unregister_command("setomapheader");
3233 cct->get_admin_socket()->unregister_command("getomap");
3234 cct->get_admin_socket()->unregister_command("truncobj");
3235 cct->get_admin_socket()->unregister_command("injectdataerr");
3236 cct->get_admin_socket()->unregister_command("injectmdataerr");
3237 cct->get_admin_socket()->unregister_command("set_recovery_delay");
3238 delete test_ops_hook;
3239 test_ops_hook = NULL;
3240
3241 osd_lock.Unlock();
3242
3243 heartbeat_lock.Lock();
3244 heartbeat_stop = true;
3245 heartbeat_cond.Signal();
3246 heartbeat_lock.Unlock();
3247 heartbeat_thread.join();
3248
3249 peering_tp.drain();
3250 peering_wq.clear();
3251 peering_tp.stop();
3252 dout(10) << "osd tp stopped" << dendl;
3253
3254 osd_op_tp.drain();
3255 osd_op_tp.stop();
3256 dout(10) << "op sharded tp stopped" << dendl;
3257
3258 command_tp.drain();
3259 command_tp.stop();
3260 dout(10) << "command tp stopped" << dendl;
3261
3262 disk_tp.drain();
3263 disk_tp.stop();
3264 dout(10) << "disk tp stopped" << dendl;
3265
3266 dout(10) << "stopping agent" << dendl;
3267 service.agent_stop();
3268
3269 osd_lock.Lock();
3270
3271 reset_heartbeat_peers();
3272
3273 tick_timer.shutdown();
3274
3275 {
3276 Mutex::Locker l(tick_timer_lock);
3277 tick_timer_without_osd_lock.shutdown();
3278 }
3279
3280 // note unmount epoch
3281 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3282 superblock.mounted = service.get_boot_epoch();
3283 superblock.clean_thru = osdmap->get_epoch();
3284 ObjectStore::Transaction t;
3285 write_superblock(t);
3286 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3287 if (r) {
3288 derr << "OSD::shutdown: error writing superblock: "
3289 << cpp_strerror(r) << dendl;
3290 }
3291
3292
3293 {
3294 Mutex::Locker l(pg_stat_queue_lock);
3295 assert(pg_stat_queue.empty());
3296 }
3297
3298 service.shutdown_reserver();
3299
3300 // Remove PGs
3301 #ifdef PG_DEBUG_REFS
3302 service.dump_live_pgids();
3303 #endif
3304 {
3305 RWLock::RLocker l(pg_map_lock);
3306 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3307 p != pg_map.end();
3308 ++p) {
3309 dout(20) << " kicking pg " << p->first << dendl;
3310 p->second->lock();
3311 if (p->second->ref != 1) {
3312 derr << "pgid " << p->first << " has ref count of "
3313 << p->second->ref << dendl;
3314 #ifdef PG_DEBUG_REFS
3315 p->second->dump_live_ids();
3316 #endif
3317 if (cct->_conf->osd_shutdown_pgref_assert) {
3318 ceph_abort();
3319 }
3320 }
3321 p->second->unlock();
3322 p->second->put("PGMap");
3323 }
3324 pg_map.clear();
3325 }
3326 #ifdef PG_DEBUG_REFS
3327 service.dump_live_pgids();
3328 #endif
3329 cct->_conf->remove_observer(this);
3330
3331 dout(10) << "syncing store" << dendl;
3332 enable_disable_fuse(true);
3333
3334 if (cct->_conf->osd_journal_flush_on_shutdown) {
3335 dout(10) << "flushing journal" << dendl;
3336 store->flush_journal();
3337 }
3338
3339 store->umount();
3340 delete store;
3341 store = NULL;
3342 dout(10) << "Store synced" << dendl;
3343
3344 monc->shutdown();
3345 osd_lock.Unlock();
3346
3347 osdmap = OSDMapRef();
3348 service.shutdown();
3349 op_tracker.on_shutdown();
3350
3351 class_handler->shutdown();
3352 client_messenger->shutdown();
3353 cluster_messenger->shutdown();
3354 hb_front_client_messenger->shutdown();
3355 hb_back_client_messenger->shutdown();
3356 objecter_messenger->shutdown();
3357 hb_front_server_messenger->shutdown();
3358 hb_back_server_messenger->shutdown();
3359
3360 peering_wq.clear();
3361
3362 return r;
3363 }
3364
3365 int OSD::mon_cmd_maybe_osd_create(string &cmd)
3366 {
3367 bool created = false;
3368 while (true) {
3369 dout(10) << __func__ << " cmd: " << cmd << dendl;
3370 vector<string> vcmd{cmd};
3371 bufferlist inbl;
3372 C_SaferCond w;
3373 string outs;
3374 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3375 int r = w.wait();
3376 if (r < 0) {
3377 if (r == -ENOENT && !created) {
3378 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3379 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3380 vector<string> vnewcmd{newcmd};
3381 bufferlist inbl;
3382 C_SaferCond w;
3383 string outs;
3384 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3385 int r = w.wait();
3386 if (r < 0) {
3387 derr << __func__ << " fail: osd does not exist and create failed: "
3388 << cpp_strerror(r) << dendl;
3389 return r;
3390 }
3391 created = true;
3392 continue;
3393 }
3394 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3395 return r;
3396 }
3397 break;
3398 }
3399
3400 return 0;
3401 }
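// For illustration, the fallback "osd create" command built above looks
// like (hypothetical id/uuid):
//   {"prefix": "osd create", "id": 0,
//    "uuid": "88ab9018-6d6c-4ee4-8f6e-fbcf047a2ca4"}
// and is attempted at most once: 'created' guards against looping if the
// mon keeps answering -ENOENT.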
3402
3403 int OSD::update_crush_location()
3404 {
3405 if (!cct->_conf->osd_crush_update_on_start) {
3406 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3407 return 0;
3408 }
3409
3410 char weight[32];
3411 if (cct->_conf->osd_crush_initial_weight >= 0) {
3412 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3413 } else {
3414 struct store_statfs_t st;
3415 int r = store->statfs(&st);
3416 if (r < 0) {
3417 derr << "statfs: " << cpp_strerror(r) << dendl;
3418 return r;
3419 }
3420 snprintf(weight, sizeof(weight), "%.4lf",
3421 MAX((double).00001,
3422 (double)(st.total) /
3423 (double)(1ull << 40 /* TB */)));
3424 }
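
// Example of the weight computation above: a 4 TB device reports
// st.total = 4e12 bytes, and 4e12 / 2^40 ~= 3.6380, so the initial
// CRUSH weight string becomes "3.6380" (weights are in TiB units).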
3425
3426 std::multimap<string,string> loc = cct->crush_location.get_location();
3427 dout(10) << __func__ << " crush location is " << loc << dendl;
3428
3429 string cmd =
3430 string("{\"prefix\": \"osd crush create-or-move\", ") +
3431 string("\"id\": ") + stringify(whoami) + string(", ") +
3432 string("\"weight\":") + weight + string(", ") +
3433 string("\"args\": [");
3434 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3435 if (p != loc.begin())
3436 cmd += ", ";
3437 cmd += "\"" + p->first + "=" + p->second + "\"";
3438 }
3439 cmd += "]}";
3440
3441 return mon_cmd_maybe_osd_create(cmd);
3442 }
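
// For illustration, with whoami = 3, a computed weight of "3.6380", and a
// crush location of {host=node1, root=default} (hypothetical values), the
// command built above is
//
//   {"prefix": "osd crush create-or-move", "id": 3, "weight":3.6380,
//    "args": ["host=node1", "root=default"]}
//
// with the args order following the multimap's key order.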
3443
3444 int OSD::update_crush_device_class()
3445 {
3446 string device_class;
3447 int r = store->read_meta("crush_device_class", &device_class);
3448 if (r < 0)
3449 return 0;
3450
3451 string cmd =
3452 string("{\"prefix\": \"osd crush set-device-class\", ") +
3453 string("\"id\": ") + stringify(whoami) + string(", ") +
3454 string("\"class\": \"") + device_class + string("\"}");
3455
3456 return mon_cmd_maybe_osd_create(cmd);
3457 }
3458
3459 void OSD::write_superblock(ObjectStore::Transaction& t)
3460 {
3461 dout(10) << "write_superblock " << superblock << dendl;
3462
3463 // hack: at minimum, ensure the baseline feature set is present
3464 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3465 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3466
3467 bufferlist bl;
3468 ::encode(superblock, bl);
3469 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3470 }
3471
3472 int OSD::read_superblock()
3473 {
3474 bufferlist bl;
3475 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3476 if (r < 0)
3477 return r;
3478
3479 bufferlist::iterator p = bl.begin();
3480 ::decode(superblock, p);
3481
3482 dout(10) << "read_superblock " << superblock << dendl;
3483
3484 return 0;
3485 }
3486
3487 void OSD::clear_temp_objects()
3488 {
3489 dout(10) << __func__ << dendl;
3490 vector<coll_t> ls;
3491 store->list_collections(ls);
3492 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
3493 spg_t pgid;
3494 if (!p->is_pg(&pgid))
3495 continue;
3496
3497 // list temp objects
3498 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
3499
3500 vector<ghobject_t> temps;
3501 ghobject_t next;
3502 while (1) {
3503 vector<ghobject_t> objects;
3504 store->collection_list(*p, next, ghobject_t::get_max(),
3505 store->get_ideal_list_max(),
3506 &objects, &next);
3507 if (objects.empty())
3508 break;
3509 vector<ghobject_t>::iterator q;
3510 for (q = objects.begin(); q != objects.end(); ++q) {
3511 // Hammer set pool for temps to -1, so check for clean-up
3512 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
3513 temps.push_back(*q);
3514 } else {
3515 break;
3516 }
3517 }
3518 // If we saw a non-temp object and hit the break above we can
3519 // break out of the while loop too.
3520 if (q != objects.end())
3521 break;
3522 }
3523 if (!temps.empty()) {
3524 ObjectStore::Transaction t;
3525 int removed = 0;
3526 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
3527 dout(20) << " removing " << *p << " object " << *q << dendl;
3528 t.remove(*p, *q);
3529 if (++removed > cct->_conf->osd_target_transaction_size) {
3530 store->apply_transaction(service.meta_osr.get(), std::move(t));
3531 t = ObjectStore::Transaction();
3532 removed = 0;
3533 }
3534 }
3535 if (removed) {
3536 store->apply_transaction(service.meta_osr.get(), std::move(t));
3537 }
3538 }
3539 }
3540 }
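
// The loop above uses the transaction-batching pattern that recurs in this
// file (see recursive_remove_collection() and
// build_past_intervals_parallel()). A minimal sketch, assuming
// osd_target_transaction_size at its usual default of 30:
//
//   ObjectStore::Transaction t;
//   int n = 0;
//   for (auto& obj : doomed) {
//     t.remove(coll, obj);
//     if (++n > 30) {   // flush before the transaction grows unbounded
//       store->apply_transaction(osr, std::move(t));
//       t = ObjectStore::Transaction();
//       n = 0;
//     }
//   }
//   if (n)
//     store->apply_transaction(osr, std::move(t));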
3541
3542 void OSD::recursive_remove_collection(CephContext* cct,
3543 ObjectStore *store, spg_t pgid,
3544 coll_t tmp)
3545 {
3546 OSDriver driver(
3547 store,
3548 coll_t(),
3549 make_snapmapper_oid());
3550
3551 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3552 ObjectStore::Sequencer>("rm"));
3553 ObjectStore::Transaction t;
3554 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
3555
3556 vector<ghobject_t> objects;
3557 store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
3558 INT_MAX, &objects, 0);
3559 generic_dout(10) << __func__ << " " << objects << dendl;
3560 // delete them.
3561 int removed = 0;
3562 for (vector<ghobject_t>::iterator p = objects.begin();
3563 p != objects.end();
3564 ++p, removed++) {
3565 OSDriver::OSTransaction _t(driver.get_transaction(&t));
3566 int r = mapper.remove_oid(p->hobj, &_t);
3567 if (r != 0 && r != -ENOENT)
3568 ceph_abort();
3569 t.remove(tmp, *p);
3570 if (removed > cct->_conf->osd_target_transaction_size) {
3571 int r = store->apply_transaction(osr.get(), std::move(t));
3572 assert(r == 0);
3573 t = ObjectStore::Transaction();
3574 removed = 0;
3575 }
3576 }
3577 t.remove_collection(tmp);
3578 int r = store->apply_transaction(osr.get(), std::move(t));
3579 assert(r == 0);
3580
3581 C_SaferCond waiter;
3582 if (!osr->flush_commit(&waiter)) {
3583 waiter.wait();
3584 }
3585 }
3586
3587
3588 // ======================================================
3589 // PGs
3590
3591 PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3592 {
3593 if (!createmap->have_pg_pool(id)) {
3594 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3595 << id << dendl;
3596 ceph_abort();
3597 }
3598
3599 PGPool p = PGPool(cct, createmap, id);
3600
3601 dout(10) << "_get_pool " << p.id << dendl;
3602 return p;
3603 }
3604
3605 PG *OSD::_open_lock_pg(
3606 OSDMapRef createmap,
3607 spg_t pgid, bool no_lockdep_check)
3608 {
3609 assert(osd_lock.is_locked());
3610
3611 PG* pg = _make_pg(createmap, pgid);
3612 {
3613 RWLock::WLocker l(pg_map_lock);
3614 pg->lock(no_lockdep_check);
3615 pg_map[pgid] = pg;
3616 pg->get("PGMap"); // because it's in pg_map
3617 service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
3618 }
3619 return pg;
3620 }
3621
3622 PG* OSD::_make_pg(
3623 OSDMapRef createmap,
3624 spg_t pgid)
3625 {
3626 dout(10) << "_open_lock_pg " << pgid << dendl;
3627 PGPool pool = _get_pool(pgid.pool(), createmap);
3628
3629 // create
3630 PG *pg;
3631 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3632 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3633 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3634 else
3635 ceph_abort();
3636
3637 return pg;
3638 }
3639
3640
3641 void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
3642 {
3643 epoch_t e(service.get_osdmap()->get_epoch());
3644 pg->get("PGMap"); // For pg_map
3645 pg_map[pg->info.pgid] = pg;
3646 service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
3647
3648 dout(10) << "Adding newly split pg " << *pg << dendl;
3649 pg->handle_loaded(rctx);
3650 pg->write_if_dirty(*(rctx->transaction));
3651 pg->queue_null(e, e);
3652 map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
3653 peering_wait_for_split.find(pg->info.pgid);
3654 if (to_wake != peering_wait_for_split.end()) {
3655 for (list<PG::CephPeeringEvtRef>::iterator i =
3656 to_wake->second.begin();
3657 i != to_wake->second.end();
3658 ++i) {
3659 pg->queue_peering_event(*i);
3660 }
3661 peering_wait_for_split.erase(to_wake);
3662 }
3663 if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
3664 _remove_pg(pg);
3665 }
3666
3667 OSD::res_result OSD::_try_resurrect_pg(
3668 OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
3669 {
3670 assert(resurrected);
3671 assert(old_pg_state);
3672 // find nearest ancestor
3673 DeletingStateRef df;
3674 spg_t cur(pgid);
3675 while (true) {
3676 df = service.deleting_pgs.lookup(cur);
3677 if (df)
3678 break;
3679 if (!cur.ps())
3680 break;
3681 cur = cur.get_parent();
3682 }
3683 if (!df)
3684 return RES_NONE; // good to go
3685
3686 df->old_pg_state->lock();
3687 OSDMapRef create_map = df->old_pg_state->get_osdmap();
3688 df->old_pg_state->unlock();
3689
3690 set<spg_t> children;
3691 if (cur == pgid) {
3692 if (df->try_stop_deletion()) {
3693 dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
3694 *resurrected = cur;
3695 *old_pg_state = df->old_pg_state;
3696 service.deleting_pgs.remove(pgid); // PG is no longer being removed!
3697 return RES_SELF;
3698 } else {
3699 // raced, ensure we don't see DeletingStateRef when we try to
3700 // delete this pg
3701 service.deleting_pgs.remove(pgid);
3702 return RES_NONE;
3703 }
3704 } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
3705 curmap->get_pg_num(cur.pool()),
3706 &children) &&
3707 children.count(pgid)) {
3708 if (df->try_stop_deletion()) {
3709 dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
3710 << dendl;
3711 *resurrected = cur;
3712 *old_pg_state = df->old_pg_state;
3713 service.deleting_pgs.remove(cur); // PG is no longer being removed!
3714 return RES_PARENT;
3715 } else {
3716 /* this is not a problem, failing to cancel proves that all objects
3717 * have been removed, so no hobject_t overlap is possible
3718 */
3719 return RES_NONE;
3720 }
3721 }
3722 return RES_NONE;
3723 }
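
// Example of the ancestor walk above (illustrative): a lookup for pg 1.1b
// probes 1.1b, then 1.b, 1.3, 1.1, and finally 1.0, since each
// get_parent() call clears the highest set bit of the placement seed; the
// walk stops at the first pgid with a registered DeletingStateRef, or when
// ps() reaches 0.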
3724
3725 PG *OSD::_create_lock_pg(
3726 OSDMapRef createmap,
3727 spg_t pgid,
3728 bool hold_map_lock,
3729 bool backfill,
3730 int role,
3731 vector<int>& up, int up_primary,
3732 vector<int>& acting, int acting_primary,
3733 pg_history_t history,
3734 const PastIntervals& pi,
3735 ObjectStore::Transaction& t)
3736 {
3737 assert(osd_lock.is_locked());
3738 dout(20) << "_create_lock_pg pgid " << pgid << dendl;
3739
3740 PG *pg = _open_lock_pg(createmap, pgid, true);
3741
3742 service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
3743
3744 pg->init(
3745 role,
3746 up,
3747 up_primary,
3748 acting,
3749 acting_primary,
3750 history,
3751 pi,
3752 backfill,
3753 &t);
3754
3755 dout(7) << "_create_lock_pg " << *pg << dendl;
3756 return pg;
3757 }
3758
3759 PG *OSD::_lookup_lock_pg(spg_t pgid)
3760 {
3761 RWLock::RLocker l(pg_map_lock);
3762
3763 auto pg_map_entry = pg_map.find(pgid);
3764 if (pg_map_entry == pg_map.end())
3765 return nullptr;
3766 PG *pg = pg_map_entry->second;
3767 pg->lock();
3768 return pg;
3769 }
3770
3771 PG *OSD::lookup_lock_pg(spg_t pgid)
3772 {
3773 return _lookup_lock_pg(pgid);
3774 }
3775
3776 PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3777 {
3778 assert(pg_map.count(pgid));
3779 PG *pg = pg_map[pgid];
3780 pg->lock();
3781 return pg;
3782 }
3783
3784 void OSD::load_pgs()
3785 {
3786 assert(osd_lock.is_locked());
3787 dout(0) << "load_pgs" << dendl;
3788 {
3789 RWLock::RLocker l(pg_map_lock);
3790 assert(pg_map.empty());
3791 }
3792
3793 vector<coll_t> ls;
3794 int r = store->list_collections(ls);
3795 if (r < 0) {
3796 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
3797 }
3798
3799 bool has_upgraded = false;
3800
3801 for (vector<coll_t>::iterator it = ls.begin();
3802 it != ls.end();
3803 ++it) {
3804 spg_t pgid;
3805 if (it->is_temp(&pgid) ||
3806 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
3807 dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
3808 recursive_remove_collection(cct, store, pgid, *it);
3809 continue;
3810 }
3811
3812 if (!it->is_pg(&pgid)) {
3813 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
3814 continue;
3815 }
3816
3817 if (pgid.preferred() >= 0) {
3818 dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
3819 // FIXME: delete it too, eventually
3820 continue;
3821 }
3822
3823 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
3824 bufferlist bl;
3825 epoch_t map_epoch = 0;
3826 int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
3827 if (r < 0) {
3828 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
3829 << dendl;
3830 continue;
3831 }
3832
3833 PG *pg = NULL;
3834 if (map_epoch > 0) {
3835 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
3836 if (!pgosdmap) {
3837 if (!osdmap->have_pg_pool(pgid.pool())) {
3838 derr << __func__ << ": could not find map for epoch " << map_epoch
3839 << " on pg " << pgid << ", but the pool is not present in the "
3840 << "current map, so this is probably a result of bug 10617. "
3841 << "Skipping the pg for now, you can use ceph-objectstore-tool "
3842 << "to clean it up later." << dendl;
3843 continue;
3844 } else {
3845 derr << __func__ << ": have pgid " << pgid << " at epoch "
3846 << map_epoch << ", but missing map. Crashing."
3847 << dendl;
3848 assert(0 == "Missing map in load_pgs");
3849 }
3850 }
3851 pg = _open_lock_pg(pgosdmap, pgid);
3852 } else {
3853 pg = _open_lock_pg(osdmap, pgid);
3854 }
3855 // there can be no waiters here, so we don't call wake_pg_waiters
3856
3857 pg->ch = store->open_collection(pg->coll);
3858
3859 // read pg state, log
3860 pg->read_state(store, bl);
3861
3862 if (pg->must_upgrade()) {
3863 if (!pg->can_upgrade()) {
3864 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
3865 << " an older version first." << dendl;
3866 assert(0 == "PG too old to upgrade");
3867 }
3868 if (!has_upgraded) {
3869 derr << "PGs are upgrading" << dendl;
3870 has_upgraded = true;
3871 }
3872 dout(10) << "PG " << pg->info.pgid
3873 << " must upgrade..." << dendl;
3874 pg->upgrade(store);
3875 }
3876
3877 service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
3878
3879 // generate state for PG's current mapping
3880 int primary, up_primary;
3881 vector<int> acting, up;
3882 pg->get_osdmap()->pg_to_up_acting_osds(
3883 pgid.pgid, &up, &up_primary, &acting, &primary);
3884 pg->init_primary_up_acting(
3885 up,
3886 acting,
3887 up_primary,
3888 primary);
3889 int role = OSDMap::calc_pg_role(whoami, pg->acting);
3890 if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
3891 pg->set_role(role);
3892 else
3893 pg->set_role(-1);
3894
3895 pg->reg_next_scrub();
3896
3897 PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
3898 pg->handle_loaded(&rctx);
3899
3900 dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
3901 if (pg->pg_log.is_dirty()) {
3902 ObjectStore::Transaction t;
3903 pg->write_if_dirty(t);
3904 store->apply_transaction(pg->osr.get(), std::move(t));
3905 }
3906 pg->unlock();
3907 }
3908 {
3909 RWLock::RLocker l(pg_map_lock);
3910 dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
3911 }
3912
3913 // clean up old infos object?
3914 if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
3915 dout(1) << __func__ << " removing legacy infos object" << dendl;
3916 ObjectStore::Transaction t;
3917 t.remove(coll_t::meta(), OSD::make_infos_oid());
3918 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3919 if (r != 0) {
3920 derr << __func__ << ": apply_transaction returned "
3921 << cpp_strerror(r) << dendl;
3922 ceph_abort();
3923 }
3924 }
3925
3926 build_past_intervals_parallel();
3927 }
3928
3929
3930 /*
3931 * build past_intervals efficiently on old, degraded, and buried
3932 * clusters. this is important for efficiently catching up osds that
3933 * are way behind on maps to the current cluster state.
3934 *
3935 * this is a parallel version of PG::generate_past_intervals().
3936 * follow the same logic, but do all pgs at the same time so that we
3937 * can make a single pass across the osdmap history.
3938 */
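// In other words (a sketch of the pass below, not additional behavior):
//
//   for e in [oldest needed epoch .. newest needed epoch]:
//     load map(e) once
//     for each pg whose required range covers e:
//       compare (up, acting, primaries) against the previous epoch and
//       record a new past interval when they differ
//
// so N pgs cost one map load per epoch instead of N.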
3939 void OSD::build_past_intervals_parallel()
3940 {
3941 struct pistate {
3942 epoch_t start, end;
3943 vector<int> old_acting, old_up;
3944 epoch_t same_interval_since;
3945 int primary;
3946 int up_primary;
3947 };
3948 map<PG*,pistate> pis;
3949
3950 // calculate the union of the needed map ranges
3951 epoch_t end_epoch = superblock.oldest_map;
3952 epoch_t cur_epoch = superblock.newest_map;
3953 {
3954 RWLock::RLocker l(pg_map_lock);
3955 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
3956 i != pg_map.end();
3957 ++i) {
3958 PG *pg = i->second;
3959
3960 auto rpib = pg->get_required_past_interval_bounds(
3961 pg->info,
3962 superblock.oldest_map);
3963 if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
3964 if (pg->info.history.same_interval_since == 0) {
3965 pg->info.history.same_interval_since = rpib.second;
3966 }
3967 continue;
3968 } else {
3969 auto apib = pg->past_intervals.get_bounds();
3970 if (apib.second >= rpib.second &&
3971 apib.first <= rpib.first) {
3972 if (pg->info.history.same_interval_since == 0) {
3973 pg->info.history.same_interval_since = rpib.second;
3974 }
3975 continue;
3976 }
3977 }
3978
3979 dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
3980 << rpib.second << dendl;
3981 pistate& p = pis[pg];
3982 p.start = rpib.first;
3983 p.end = rpib.second;
3984 p.same_interval_since = 0;
3985
3986 if (rpib.first < cur_epoch)
3987 cur_epoch = rpib.first;
3988 if (rpib.second > end_epoch)
3989 end_epoch = rpib.second;
3990 }
3991 }
3992 if (pis.empty()) {
3993 dout(10) << __func__ << " nothing to build" << dendl;
3994 return;
3995 }
3996
3997 dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
3998 assert(cur_epoch <= end_epoch);
3999
4000 OSDMapRef cur_map, last_map;
4001 for ( ; cur_epoch <= end_epoch; cur_epoch++) {
4002 dout(10) << __func__ << " epoch " << cur_epoch << dendl;
4003 last_map = cur_map;
4004 cur_map = get_map(cur_epoch);
4005
4006 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4007 PG *pg = i->first;
4008 pistate& p = i->second;
4009
4010 if (cur_epoch < p.start || cur_epoch > p.end)
4011 continue;
4012
4013 vector<int> acting, up;
4014 int up_primary;
4015 int primary;
4016 pg_t pgid = pg->info.pgid.pgid;
4017 if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
4018 pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
4019 cur_map->pg_to_up_acting_osds(
4020 pgid, &up, &up_primary, &acting, &primary);
4021
4022 if (p.same_interval_since == 0) {
4023 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4024 << " first map, acting " << acting
4025 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
4026 p.same_interval_since = cur_epoch;
4027 p.old_up = up;
4028 p.old_acting = acting;
4029 p.primary = primary;
4030 p.up_primary = up_primary;
4031 continue;
4032 }
4033 assert(last_map);
4034
4035 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
4036 pg->get_is_recoverable_predicate());
4037 std::stringstream debug;
4038 bool new_interval = PastIntervals::check_new_interval(
4039 p.primary,
4040 primary,
4041 p.old_acting, acting,
4042 p.up_primary,
4043 up_primary,
4044 p.old_up, up,
4045 p.same_interval_since,
4046 pg->info.history.last_epoch_clean,
4047 cur_map, last_map,
4048 pgid,
4049 recoverable.get(),
4050 &pg->past_intervals,
4051 &debug);
4052 if (new_interval) {
4053 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4054 << " " << debug.str() << dendl;
4055 p.old_up = up;
4056 p.old_acting = acting;
4057 p.primary = primary;
4058 p.up_primary = up_primary;
4059 p.same_interval_since = cur_epoch;
4060 }
4061 }
4062 }
4063
4064 // Now that past_intervals have been recomputed, let's fix same_interval_since
4065 // if it was cleared by an import.
4066 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4067 PG *pg = i->first;
4068 pistate& p = i->second;
4069
4070 if (pg->info.history.same_interval_since == 0) {
4071 assert(p.same_interval_since);
4072 dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
4073 dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
4074 // Fix it
4075 pg->info.history.same_interval_since = p.same_interval_since;
4076 }
4077 }
4078
4079 // write info only at the end. this is necessary because we check
4080 // whether the past_intervals go far enough back or forward in time,
4081 // but we don't check for holes. we could avoid it by discarding
4082 // the previous past_intervals and rebuilding from scratch, or we
4083 // can just do this and commit all our work at the end.
4084 ObjectStore::Transaction t;
4085 int num = 0;
4086 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4087 PG *pg = i->first;
4088 pg->lock();
4089 pg->dirty_big_info = true;
4090 pg->dirty_info = true;
4091 pg->write_if_dirty(t);
4092 pg->unlock();
4093
4094 // don't let the transaction get too big
4095 if (++num >= cct->_conf->osd_target_transaction_size) {
4096 store->apply_transaction(service.meta_osr.get(), std::move(t));
4097 t = ObjectStore::Transaction();
4098 num = 0;
4099 }
4100 }
4101 if (!t.empty())
4102 store->apply_transaction(service.meta_osr.get(), std::move(t));
4103 }
4104
4105 /*
4106 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4107 * hasn't changed since the given epoch and we are the primary.
4108 */
4109 int OSD::handle_pg_peering_evt(
4110 spg_t pgid,
4111 const pg_history_t& orig_history,
4112 const PastIntervals& pi,
4113 epoch_t epoch,
4114 PG::CephPeeringEvtRef evt)
4115 {
4116 if (service.splitting(pgid)) {
4117 peering_wait_for_split[pgid].push_back(evt);
4118 return -EEXIST;
4119 }
4120
4121 PG *pg = _lookup_lock_pg(pgid);
4122 if (!pg) {
4123 // same primary?
4124 if (!osdmap->have_pg_pool(pgid.pool()))
4125 return -EINVAL;
4126 int up_primary, acting_primary;
4127 vector<int> up, acting;
4128 osdmap->pg_to_up_acting_osds(
4129 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4130
4131 pg_history_t history = orig_history;
4132 bool valid_history = project_pg_history(
4133 pgid, history, epoch, up, up_primary, acting, acting_primary);
4134
4135 if (!valid_history || epoch < history.same_interval_since) {
4136 dout(10) << __func__ << " " << pgid << " acting changed in "
4137 << history.same_interval_since << " (msg from " << epoch << ")"
4138 << dendl;
4139 return -EINVAL;
4140 }
4141
4142 if (service.splitting(pgid)) {
4143 ceph_abort();
4144 }
4145
4146 // do we need to resurrect a deleting pg?
4147 spg_t resurrected;
4148 PGRef old_pg_state;
4149 res_result result = _try_resurrect_pg(
4150 service.get_osdmap(),
4151 pgid,
4152 &resurrected,
4153 &old_pg_state);
4154
4155 PG::RecoveryCtx rctx = create_context();
4156 switch (result) {
4157 case RES_NONE: {
4158 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4159 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4160 store->get_type() != "bluestore") {
4161 clog->warn() << "pg " << pgid
4162 << " is at risk of silent data corruption: "
4163 << "the pool allows ec overwrites but is not stored in "
4164 << "bluestore, so deep scrubbing will not detect bitrot";
4165 }
4166 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4167 PG::_init(*rctx.transaction, pgid, pp);
4168
4169 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4170 if (!pp->is_replicated() && role != pgid.shard)
4171 role = -1;
4172
4173 pg = _create_lock_pg(
4174 get_map(epoch),
4175 pgid, false, false,
4176 role,
4177 up, up_primary,
4178 acting, acting_primary,
4179 history, pi,
4180 *rctx.transaction);
4181 pg->handle_create(&rctx);
4182 pg->write_if_dirty(*rctx.transaction);
4183 dispatch_context(rctx, pg, osdmap);
4184
4185 dout(10) << *pg << " is new" << dendl;
4186
4187 pg->queue_peering_event(evt);
4188 wake_pg_waiters(pg);
4189 pg->unlock();
4190 return 0;
4191 }
4192 case RES_SELF: {
4193 old_pg_state->lock();
4194 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4195 int old_role = old_pg_state->role;
4196 vector<int> old_up = old_pg_state->up;
4197 int old_up_primary = old_pg_state->up_primary.osd;
4198 vector<int> old_acting = old_pg_state->acting;
4199 int old_primary = old_pg_state->primary.osd;
4200 pg_history_t old_history = old_pg_state->info.history;
4201 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4202 old_pg_state->unlock();
4203 pg = _create_lock_pg(
4204 old_osd_map,
4205 resurrected,
4206 false,
4207 true,
4208 old_role,
4209 old_up,
4210 old_up_primary,
4211 old_acting,
4212 old_primary,
4213 old_history,
4214 old_past_intervals,
4215 *rctx.transaction);
4216 pg->handle_create(&rctx);
4217 pg->write_if_dirty(*rctx.transaction);
4218 dispatch_context(rctx, pg, osdmap);
4219
4220 dout(10) << *pg << " is new (resurrected)" << dendl;
4221
4222 pg->queue_peering_event(evt);
4223 wake_pg_waiters(pg);
4224 pg->unlock();
4225 return 0;
4226 }
4227 case RES_PARENT: {
4228 assert(old_pg_state);
4229 old_pg_state->lock();
4230 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4231 int old_role = old_pg_state->role;
4232 vector<int> old_up = old_pg_state->up;
4233 int old_up_primary = old_pg_state->up_primary.osd;
4234 vector<int> old_acting = old_pg_state->acting;
4235 int old_primary = old_pg_state->primary.osd;
4236 pg_history_t old_history = old_pg_state->info.history;
4237 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4238 old_pg_state->unlock();
4239 PG *parent = _create_lock_pg(
4240 old_osd_map,
4241 resurrected,
4242 false,
4243 true,
4244 old_role,
4245 old_up,
4246 old_up_primary,
4247 old_acting,
4248 old_primary,
4249 old_history,
4250 old_past_intervals,
4251 *rctx.transaction
4252 );
4253 parent->handle_create(&rctx);
4254 parent->write_if_dirty(*rctx.transaction);
4255 dispatch_context(rctx, parent, osdmap);
4256
4257 dout(10) << *parent << " is new" << dendl;
4258
4259 assert(service.splitting(pgid));
4260 peering_wait_for_split[pgid].push_back(evt);
4261
4262 //parent->queue_peering_event(evt);
4263 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4264 wake_pg_waiters(parent);
4265 parent->unlock();
4266 return 0;
4267 }
4268 default:
4269 assert(0);
4270 return 0;
4271 }
4272 } else {
4273 // already had it. did the mapping change?
4274 if (epoch < pg->info.history.same_interval_since) {
4275 dout(10) << *pg << " " << __func__ << " acting changed in "
4276 << pg->info.history.same_interval_since
4277 << " (msg from " << epoch << ")" << dendl;
4278 } else {
4279 pg->queue_peering_event(evt);
4280 }
4281 pg->unlock();
4282 return -EEXIST;
4283 }
4284 }
4285
4286
4287 void OSD::build_initial_pg_history(
4288 spg_t pgid,
4289 epoch_t created,
4290 utime_t created_stamp,
4291 pg_history_t *h,
4292 PastIntervals *pi)
4293 {
4294 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4295 h->epoch_created = created;
4296 h->epoch_pool_created = created;
4297 h->same_interval_since = created;
4298 h->same_up_since = created;
4299 h->same_primary_since = created;
4300 h->last_scrub_stamp = created_stamp;
4301 h->last_deep_scrub_stamp = created_stamp;
4302 h->last_clean_scrub_stamp = created_stamp;
4303
4304 OSDMapRef lastmap = service.get_map(created);
4305 int up_primary, acting_primary;
4306 vector<int> up, acting;
4307 lastmap->pg_to_up_acting_osds(
4308 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4309
4310 ostringstream debug;
4311 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4312 OSDMapRef osdmap = service.get_map(e);
4313 int new_up_primary, new_acting_primary;
4314 vector<int> new_up, new_acting;
4315 osdmap->pg_to_up_acting_osds(
4316 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4317
4318 // this is a bit imprecise, but sufficient?
4319 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4320 const pg_pool_t *pi;
4321 bool operator()(const set<pg_shard_t> &have) const {
4322 return have.size() >= pi->min_size;
4323 }
4324 min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4325 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4326
4327 bool new_interval = PastIntervals::check_new_interval(
4328 acting_primary,
4329 new_acting_primary,
4330 acting, new_acting,
4331 up_primary,
4332 new_up_primary,
4333 up, new_up,
4334 h->same_interval_since,
4335 h->last_epoch_clean,
4336 osdmap,
4337 lastmap,
4338 pgid.pgid,
4339 &min_size_predicate,
4340 pi,
4341 &debug);
4342 if (new_interval) {
4343 h->same_interval_since = e;
4344 }
4345 if (up != new_up) {
4346 h->same_up_since = e;
4347 }
4348 if (acting_primary != new_acting_primary) {
4349 h->same_primary_since = e;
4350 }
4351 lastmap = osdmap;
4352 }
4353 dout(20) << __func__ << " " << debug.str() << dendl;
4354 dout(10) << __func__ << " " << *h << " " << *pi
4355 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4356 pi->get_bounds()) << ")"
4357 << dendl;
4358 }
4359
4360 /**
4361 * Fill in the passed history so you know same_interval_since, same_up_since,
4362 * and same_primary_since.
4363 */
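// For example (illustrative epochs): if the current map is epoch 150, the
// caller passes from = 100, and the up/acting sets last changed in epoch
// 120, the backward scan below settles all three same_* floors at 120, and
// the early break at the bottom of the loop avoids loading maps 100..118.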
4364 bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
4365 const vector<int>& currentup,
4366 int currentupprimary,
4367 const vector<int>& currentacting,
4368 int currentactingprimary)
4369 {
4370 dout(15) << "project_pg_history " << pgid
4371 << " from " << from << " to " << osdmap->get_epoch()
4372 << ", start " << h
4373 << dendl;
4374
4375 epoch_t e;
4376 for (e = osdmap->get_epoch();
4377 e > from;
4378 e--) {
4379 // verify during intermediate epoch (e-1)
4380 OSDMapRef oldmap = service.try_get_map(e-1);
4381 if (!oldmap) {
4382 dout(15) << __func__ << ": found map gap, returning false" << dendl;
4383 return false;
4384 }
4385 assert(oldmap->have_pg_pool(pgid.pool()));
4386
4387 int upprimary, actingprimary;
4388 vector<int> up, acting;
4389 oldmap->pg_to_up_acting_osds(
4390 pgid.pgid,
4391 &up,
4392 &upprimary,
4393 &acting,
4394 &actingprimary);
4395
4396 // acting set change?
4397 if ((actingprimary != currentactingprimary ||
4398 upprimary != currentupprimary ||
4399 acting != currentacting ||
4400 up != currentup) && e > h.same_interval_since) {
4401 dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
4402 << " from " << acting << "/" << up
4403 << " " << actingprimary << "/" << upprimary
4404 << " -> " << currentacting << "/" << currentup
4405 << " " << currentactingprimary << "/" << currentupprimary
4406 << dendl;
4407 h.same_interval_since = e;
4408 }
4409 // split?
4410 if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
4411 osdmap->get_pg_num(pgid.pool()),
4412 0) && e > h.same_interval_since) {
4413 h.same_interval_since = e;
4414 }
4415 // up set change?
4416 if ((up != currentup || upprimary != currentupprimary)
4417 && e > h.same_up_since) {
4418 dout(15) << "project_pg_history " << pgid << " up changed in " << e
4419 << " from " << up << " " << upprimary
4420 << " -> " << currentup << " " << currentupprimary << dendl;
4421 h.same_up_since = e;
4422 }
4423
4424 // primary change?
4425 if (OSDMap::primary_changed(
4426 actingprimary,
4427 acting,
4428 currentactingprimary,
4429 currentacting) &&
4430 e > h.same_primary_since) {
4431 dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
4432 h.same_primary_since = e;
4433 }
4434
4435 if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
4436 break;
4437 }
4438
4439 // base case: these floors should be the pg creation epoch if we didn't
4440 // find any changes.
4441 if (e == h.epoch_created) {
4442 if (!h.same_interval_since)
4443 h.same_interval_since = e;
4444 if (!h.same_up_since)
4445 h.same_up_since = e;
4446 if (!h.same_primary_since)
4447 h.same_primary_since = e;
4448 }
4449
4450 dout(15) << "project_pg_history end " << h << dendl;
4451 return true;
4452 }
4453
4454
4455
4456 void OSD::_add_heartbeat_peer(int p)
4457 {
4458 if (p == whoami)
4459 return;
4460 HeartbeatInfo *hi;
4461
4462 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4463 if (i == heartbeat_peers.end()) {
4464 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4465 if (!cons.first)
4466 return;
4467 hi = &heartbeat_peers[p];
4468 hi->peer = p;
4469 HeartbeatSession *s = new HeartbeatSession(p);
4470 hi->con_back = cons.first.get();
4471 hi->con_back->set_priv(s->get());
4472 if (cons.second) {
4473 hi->con_front = cons.second.get();
4474 hi->con_front->set_priv(s->get());
4475 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4476 << " " << hi->con_back->get_peer_addr()
4477 << " " << hi->con_front->get_peer_addr()
4478 << dendl;
4479 } else {
4480 hi->con_front.reset(NULL);
4481 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4482 << " " << hi->con_back->get_peer_addr()
4483 << dendl;
4484 }
4485 s->put();
4486 } else {
4487 hi = &i->second;
4488 }
4489 hi->epoch = osdmap->get_epoch();
4490 }
4491
4492 void OSD::_remove_heartbeat_peer(int n)
4493 {
4494 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4495 assert(q != heartbeat_peers.end());
4496 dout(20) << " removing heartbeat peer osd." << n
4497 << " " << q->second.con_back->get_peer_addr()
4498 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4499 << dendl;
4500 q->second.con_back->mark_down();
4501 if (q->second.con_front) {
4502 q->second.con_front->mark_down();
4503 }
4504 heartbeat_peers.erase(q);
4505 }
4506
4507 void OSD::need_heartbeat_peer_update()
4508 {
4509 if (is_stopping())
4510 return;
4511 dout(20) << "need_heartbeat_peer_update" << dendl;
4512 heartbeat_set_peers_need_update();
4513 }
4514
4515 void OSD::maybe_update_heartbeat_peers()
4516 {
4517 assert(osd_lock.is_locked());
4518
4519 if (is_waiting_for_healthy()) {
4520 utime_t now = ceph_clock_now();
4521 if (last_heartbeat_resample == utime_t()) {
4522 last_heartbeat_resample = now;
4523 heartbeat_set_peers_need_update();
4524 } else if (!heartbeat_peers_need_update()) {
4525 utime_t dur = now - last_heartbeat_resample;
4526 if (dur > cct->_conf->osd_heartbeat_grace) {
4527 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4528 heartbeat_set_peers_need_update();
4529 last_heartbeat_resample = now;
4530 reset_heartbeat_peers(); // we want *new* peers!
4531 }
4532 }
4533 }
4534
4535 if (!heartbeat_peers_need_update())
4536 return;
4537 heartbeat_clear_peers_need_update();
4538
4539 Mutex::Locker l(heartbeat_lock);
4540
4541 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
4542
4543
4544 // build the heartbeat peer set from our pgs
4545 if (is_active()) {
4546 RWLock::RLocker l(pg_map_lock);
4547 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4548 i != pg_map.end();
4549 ++i) {
4550 PG *pg = i->second;
4551 pg->heartbeat_peer_lock.Lock();
4552 dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
4553 for (set<int>::iterator p = pg->heartbeat_peers.begin();
4554 p != pg->heartbeat_peers.end();
4555 ++p)
4556 if (osdmap->is_up(*p))
4557 _add_heartbeat_peer(*p);
4558 for (set<int>::iterator p = pg->probe_targets.begin();
4559 p != pg->probe_targets.end();
4560 ++p)
4561 if (osdmap->is_up(*p))
4562 _add_heartbeat_peer(*p);
4563 pg->heartbeat_peer_lock.Unlock();
4564 }
4565 }
4566
4567 // include next and previous up osds to ensure we have a fully-connected set
4568 set<int> want, extras;
4569 int next = osdmap->get_next_up_osd_after(whoami);
4570 if (next >= 0)
4571 want.insert(next);
4572 int prev = osdmap->get_previous_up_osd_before(whoami);
4573 if (prev >= 0 && prev != next)
4574 want.insert(prev);
4575
4576 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4577 dout(10) << " adding neighbor peer osd." << *p << dendl;
4578 extras.insert(*p);
4579 _add_heartbeat_peer(*p);
4580 }
4581
4582 // remove down peers; enumerate extras
4583 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4584 while (p != heartbeat_peers.end()) {
4585 if (!osdmap->is_up(p->first)) {
4586 int o = p->first;
4587 ++p;
4588 _remove_heartbeat_peer(o);
4589 continue;
4590 }
4591 if (p->second.epoch < osdmap->get_epoch()) {
4592 extras.insert(p->first);
4593 }
4594 ++p;
4595 }
4596
4597 // too few?
4598 int start = osdmap->get_next_up_osd_after(whoami);
4599 for (int n = start; n >= 0; ) {
4600 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4601 break;
4602 if (!extras.count(n) && !want.count(n) && n != whoami) {
4603 dout(10) << " adding random peer osd." << n << dendl;
4604 extras.insert(n);
4605 _add_heartbeat_peer(n);
4606 }
4607 n = osdmap->get_next_up_osd_after(n);
4608 if (n == start)
4609 break; // came full circle; stop
4610 }
4611
4612 // too many?
4613 for (set<int>::iterator p = extras.begin();
4614 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4615 ++p) {
4616 if (want.count(*p))
4617 continue;
4618 _remove_heartbeat_peer(*p);
4619 }
4620
4621 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
4622 }
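
// Illustrative example of the selection above: with up OSDs {0,1,2,3,4}
// and whoami = 2, the neighbor 'want' set is {3, 1}; if the peer count is
// still below osd_heartbeat_min_peers (default 10 at this time), the final
// loop keeps adding the next up OSDs (4, 0, ...) until it wraps around or
// the minimum is met, and the 'too many?' pass then trims surplus extras.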
4623
4624 void OSD::reset_heartbeat_peers()
4625 {
4626 assert(osd_lock.is_locked());
4627 dout(10) << "reset_heartbeat_peers" << dendl;
4628 Mutex::Locker l(heartbeat_lock);
4629 while (!heartbeat_peers.empty()) {
4630 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4631 hi.con_back->mark_down();
4632 if (hi.con_front) {
4633 hi.con_front->mark_down();
4634 }
4635 heartbeat_peers.erase(heartbeat_peers.begin());
4636 }
4637 failure_queue.clear();
4638 }
4639
4640 void OSD::handle_osd_ping(MOSDPing *m)
4641 {
4642 if (superblock.cluster_fsid != m->fsid) {
4643 dout(20) << "handle_osd_ping from " << m->get_source_inst()
4644 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
4645 m->put();
4646 return;
4647 }
4648
4649 int from = m->get_source().num();
4650
4651 heartbeat_lock.Lock();
4652 if (is_stopping()) {
4653 heartbeat_lock.Unlock();
4654 m->put();
4655 return;
4656 }
4657
4658 OSDMapRef curmap = service.get_osdmap();
4659 assert(curmap);
4660
4661 switch (m->op) {
4662
4663 case MOSDPing::PING:
4664 {
4665 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
4666 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
4667 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
4668 if (heartbeat_drop->second == 0) {
4669 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
4670 } else {
4671 --heartbeat_drop->second;
4672 dout(5) << "Dropping heartbeat from " << from
4673 << ", " << heartbeat_drop->second
4674 << " remaining to drop" << dendl;
4675 break;
4676 }
4677 } else if (cct->_conf->osd_debug_drop_ping_probability >
4678 ((((double)(rand()%100))/100.0))) {
4679 heartbeat_drop =
4680 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
4681 cct->_conf->osd_debug_drop_ping_duration)).first;
4682 dout(5) << "Dropping heartbeat from " << from
4683 << ", " << heartbeat_drop->second
4684 << " remaining to drop" << dendl;
4685 break;
4686 }
4687 }
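// Net effect of the debug hook above (illustrative numbers): with
// osd_debug_drop_ping_probability = 0.3 and
// osd_debug_drop_ping_duration = 5, roughly 30% of pings start a streak
// in which the triggering ping and the next 5 pings from that peer are
// all dropped.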
4688
4689 if (!cct->get_heartbeat_map()->is_healthy()) {
4690 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
4691 break;
4692 }
4693
4694 Message *r = new MOSDPing(monc->get_fsid(),
4695 curmap->get_epoch(),
4696 MOSDPing::PING_REPLY, m->stamp,
4697 cct->_conf->osd_heartbeat_min_size);
4698 m->get_connection()->send_message(r);
4699
4700 if (curmap->is_up(from)) {
4701 service.note_peer_epoch(from, m->map_epoch);
4702 if (is_active()) {
4703 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4704 if (con) {
4705 service.share_map_peer(from, con.get());
4706 }
4707 }
4708 } else if (!curmap->exists(from) ||
4709 curmap->get_down_at(from) > m->map_epoch) {
4710 // tell them they have died
4711 Message *r = new MOSDPing(monc->get_fsid(),
4712 curmap->get_epoch(),
4713 MOSDPing::YOU_DIED,
4714 m->stamp,
4715 cct->_conf->osd_heartbeat_min_size);
4716 m->get_connection()->send_message(r);
4717 }
4718 }
4719 break;
4720
4721 case MOSDPing::PING_REPLY:
4722 {
4723 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
4724 if (i != heartbeat_peers.end()) {
4725 if (m->get_connection() == i->second.con_back) {
4726 dout(25) << "handle_osd_ping got reply from osd." << from
4727 << " first_tx " << i->second.first_tx
4728 << " last_tx " << i->second.last_tx
4729 << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
4730 << " last_rx_front " << i->second.last_rx_front
4731 << dendl;
4732 i->second.last_rx_back = m->stamp;
4733 // if there is no front con, set both stamps.
4734 if (i->second.con_front == NULL)
4735 i->second.last_rx_front = m->stamp;
4736 } else if (m->get_connection() == i->second.con_front) {
4737 dout(25) << "handle_osd_ping got reply from osd." << from
4738 << " first_tx " << i->second.first_tx
4739 << " last_tx " << i->second.last_tx
4740 << " last_rx_back " << i->second.last_rx_back
4741 << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
4742 << dendl;
4743 i->second.last_rx_front = m->stamp;
4744 }
4745
4746 utime_t cutoff = ceph_clock_now();
4747 cutoff -= cct->_conf->osd_heartbeat_grace;
4748 if (i->second.is_healthy(cutoff)) {
4749 // Cancel false reports
4750 auto failure_queue_entry = failure_queue.find(from);
4751 if (failure_queue_entry != failure_queue.end()) {
4752 dout(10) << "handle_osd_ping canceling queued "
4753 << "failure report for osd." << from << dendl;
4754 failure_queue.erase(failure_queue_entry);
4755 }
4756
4757 auto failure_pending_entry = failure_pending.find(from);
4758 if (failure_pending_entry != failure_pending.end()) {
4759 dout(10) << "handle_osd_ping canceling in-flight "
4760 << "failure report for osd." << from << dendl;
4761 send_still_alive(curmap->get_epoch(),
4762 failure_pending_entry->second.second);
4763 failure_pending.erase(failure_pending_entry);
4764 }
4765 }
4766 }
4767
4768 if (m->map_epoch &&
4769 curmap->is_up(from)) {
4770 service.note_peer_epoch(from, m->map_epoch);
4771 if (is_active()) {
4772 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4773 if (con) {
4774 service.share_map_peer(from, con.get());
4775 }
4776 }
4777 }
4778 }
4779 break;
4780
4781 case MOSDPing::YOU_DIED:
4782 dout(10) << "handle_osd_ping " << m->get_source_inst()
4783 << " says i am down in " << m->map_epoch << dendl;
4784 osdmap_subscribe(curmap->get_epoch()+1, false);
4785 break;
4786 }
4787
4788 heartbeat_lock.Unlock();
4789 m->put();
4790 }
4791
4792 void OSD::heartbeat_entry()
4793 {
4794 Mutex::Locker l(heartbeat_lock);
4795 if (is_stopping())
4796 return;
4797 while (!heartbeat_stop) {
4798 heartbeat();
4799
4800 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
4801 utime_t w;
4802 w.set_from_double(wait);
4803 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
4804 heartbeat_cond.WaitInterval(heartbeat_lock, w);
4805 if (is_stopping())
4806 return;
4807 dout(30) << "heartbeat_entry woke up" << dendl;
4808 }
4809 }
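
// The randomized wait in heartbeat_entry() above spreads ping traffic:
// wait = 0.5 + (rand() % 10)/10.0 * osd_heartbeat_interval, so with the
// usual 6-second interval each cycle sleeps somewhere in [0.5, 5.9]
// seconds (illustrative default).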
4810
4811 void OSD::heartbeat_check()
4812 {
4813 assert(heartbeat_lock.is_locked());
4814 utime_t now = ceph_clock_now();
4815
4816 // check for heartbeat replies (move me elsewhere?)
4817 utime_t cutoff = now;
4818 cutoff -= cct->_conf->osd_heartbeat_grace;
4819 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4820 p != heartbeat_peers.end();
4821 ++p) {
4822
4823 if (p->second.first_tx == utime_t()) {
4824 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
4825 << "yet, skipping" << dendl;
4826 continue;
4827 }
4828
4829 dout(25) << "heartbeat_check osd." << p->first
4830 << " first_tx " << p->second.first_tx
4831 << " last_tx " << p->second.last_tx
4832 << " last_rx_back " << p->second.last_rx_back
4833 << " last_rx_front " << p->second.last_rx_front
4834 << dendl;
4835 if (p->second.is_unhealthy(cutoff)) {
4836 if (p->second.last_rx_back == utime_t() ||
4837 p->second.last_rx_front == utime_t()) {
4838 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
4839 << " osd." << p->first << " ever on either front or back, first ping sent "
4840 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
4841 // fail
4842 failure_queue[p->first] = p->second.last_tx;
4843 } else {
4844 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
4845 << " osd." << p->first << " since back " << p->second.last_rx_back
4846 << " front " << p->second.last_rx_front
4847 << " (cutoff " << cutoff << ")" << dendl;
4848 // fail
4849 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
4850 }
4851 }
4852 }
4853 }
4854
4855 void OSD::heartbeat()
4856 {
4857 dout(30) << "heartbeat" << dendl;
4858
4859 // get CPU load avg
4860 double loadavgs[1];
4861 int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
4862 if (getloadavg(loadavgs, 1) == 1) {
4863 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
4864 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
4865 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
4866 }
4867
4868 dout(30) << "heartbeat checking stats" << dendl;
4869
4870 // refresh stats?
4871 vector<int> hb_peers;
4872 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4873 p != heartbeat_peers.end();
4874 ++p)
4875 hb_peers.push_back(p->first);
4876 service.update_osd_stat(hb_peers);
4877
4878 dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;
4879
4880 utime_t now = ceph_clock_now();
4881
4882 // send heartbeats
4883 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
4884 i != heartbeat_peers.end();
4885 ++i) {
4886 int peer = i->first;
4887 i->second.last_tx = now;
4888 if (i->second.first_tx == utime_t())
4889 i->second.first_tx = now;
4890 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
4891 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
4892 service.get_osdmap()->get_epoch(),
4893 MOSDPing::PING, now,
4894 cct->_conf->osd_heartbeat_min_size));
4895
4896 if (i->second.con_front)
4897 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
4898 service.get_osdmap()->get_epoch(),
4899 MOSDPing::PING, now,
4900 cct->_conf->osd_heartbeat_min_size));
4901 }
4902
4903 logger->set(l_osd_hb_to, heartbeat_peers.size());
4904
4905 // hmm.. am I all alone?
4906 dout(30) << "heartbeat lonely?" << dendl;
4907 if (heartbeat_peers.empty()) {
4908 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
4909 last_mon_heartbeat = now;
4910 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
4911 osdmap_subscribe(osdmap->get_epoch() + 1, false);
4912 }
4913 }
4914
4915 dout(30) << "heartbeat done" << dendl;
4916 }
4917
4918 bool OSD::heartbeat_reset(Connection *con)
4919 {
4920 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
4921 if (s) {
4922 heartbeat_lock.Lock();
4923 if (is_stopping()) {
4924 heartbeat_lock.Unlock();
4925 s->put();
4926 return true;
4927 }
4928 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
4929 if (p != heartbeat_peers.end() &&
4930 (p->second.con_back == con ||
4931 p->second.con_front == con)) {
4932 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
4933 << ", reopening" << dendl;
4934 if (con != p->second.con_back) {
4935 p->second.con_back->mark_down();
4936 }
4937 p->second.con_back.reset(NULL);
4938 if (p->second.con_front && con != p->second.con_front) {
4939 p->second.con_front->mark_down();
4940 }
4941 p->second.con_front.reset(NULL);
4942 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
4943 if (newcon.first) {
4944 p->second.con_back = newcon.first.get();
4945 p->second.con_back->set_priv(s->get());
4946 if (newcon.second) {
4947 p->second.con_front = newcon.second.get();
4948 p->second.con_front->set_priv(s->get());
4949 }
4950 } else {
4951 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
4952 << ", raced with osdmap update, closing out peer" << dendl;
4953 heartbeat_peers.erase(p);
4954 }
4955 } else {
4956 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
4957 }
4958 heartbeat_lock.Unlock();
4959 s->put();
4960 }
4961 return true;
4962 }
4963
4964
4965
4966 // =========================================
4967
4968 void OSD::tick()
4969 {
4970 assert(osd_lock.is_locked());
4971 dout(10) << "tick" << dendl;
4972
4973 if (is_active() || is_waiting_for_healthy()) {
4974 maybe_update_heartbeat_peers();
4975 }
4976
4977 if (is_waiting_for_healthy()) {
4978 start_boot();
4979 }
4980
4981 do_waiters();
4982
4983 tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
4984
4985 if (is_active()) {
4986 const auto now = ceph::coarse_mono_clock::now();
4987 const auto elapsed = now - last_sent_beacon;
4988 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
4989 cct->_conf->osd_beacon_report_interval) {
4990 send_beacon(now);
4991 }
4992 }
4993 }
4994
4995 void OSD::tick_without_osd_lock()
4996 {
4997 assert(tick_timer_lock.is_locked());
4998 dout(10) << "tick_without_osd_lock" << dendl;
4999
5000 logger->set(l_osd_buf, buffer::get_total_alloc());
5001 logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5002 logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
5003 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5004 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5005 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5006
5007 // osd_lock is not being held, which means the OSD state
5008 // might change when doing the monitor report
5009 if (is_active() || is_waiting_for_healthy()) {
5010 heartbeat_lock.Lock();
5011 heartbeat_check();
5012 heartbeat_lock.Unlock();
5013
5014 map_lock.get_read();
5015 Mutex::Locker l(mon_report_lock);
5016
5017 // mon report?
5018 bool reset = false;
5019 bool report = false;
5020 utime_t now = ceph_clock_now();
5021 pg_stat_queue_lock.Lock();
5022 double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
5023 double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
5024 // note: we shouldn't adjust max because it must remain < the
5025 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5026 // value).
5027 double max = cct->_conf->osd_mon_report_interval_max;
5028 if (!outstanding_pg_stats.empty() &&
5029 (now - stats_ack_timeout) > last_pg_stats_ack) {
5030 dout(1) << __func__ << " mon hasn't acked PGStats in "
5031 << now - last_pg_stats_ack
5032 << " seconds, reconnecting elsewhere" << dendl;
5033 reset = true;
5034 last_pg_stats_ack = now; // reset clock
5035 last_pg_stats_sent = utime_t();
5036 stats_ack_timeout =
5037 MAX(cct->_conf->osd_mon_ack_timeout,
5038 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
5039 outstanding_pg_stats.clear();
5040 }
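// e.g., assuming osd_mon_ack_timeout = 30 and
// osd_stats_ack_timeout_factor = 2 (illustrative values): the first
// missed ack raises stats_ack_timeout to 60, which doubles 'backoff' and
// therefore 'adjusted_min' above on subsequent ticks, throttling reports
// to a mon that is not keeping up.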
5041 if (now - last_pg_stats_sent > max) {
5042 osd_stat_updated = true;
5043 report = true;
5044 } else if (service.need_fullness_update()) {
5045 report = true;
5046 } else if ((int)outstanding_pg_stats.size() >=
5047 cct->_conf->osd_mon_report_max_in_flight) {
5048 dout(20) << __func__ << " have max " << outstanding_pg_stats
5049 << " stats updates in flight" << dendl;
5050 } else {
5051 if (now - last_mon_report > adjusted_min) {
5052 dout(20) << __func__ << " stats backoff " << backoff
5053 << " adjusted_min " << adjusted_min << " - sending report"
5054 << dendl;
5055 osd_stat_updated = true;
5056 report = true;
5057 }
5058 }
5059 pg_stat_queue_lock.Unlock();
5060
5061 if (reset) {
5062 monc->reopen_session();
5063 } else if (report) {
5064 last_mon_report = now;
5065
5066 // do any pending reports
5067 send_full_update();
5068 send_failures();
5069 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5070 send_pg_stats(now);
5071 }
5072 }
5073 map_lock.put_read();
5074 }
5075
5076 if (is_active()) {
5077 if (!scrub_random_backoff()) {
5078 sched_scrub();
5079 }
5080 service.promote_throttle_recalibrate();
5081 }
5082
5083 check_ops_in_flight();
5084 service.kick_recovery_queue();
5085 tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
5086 }
5087
5088 void OSD::check_ops_in_flight()
5089 {
5090 vector<string> warnings;
5091 if (op_tracker.check_ops_in_flight(warnings)) {
5092 for (vector<string>::iterator i = warnings.begin();
5093 i != warnings.end();
5094 ++i) {
5095 clog->warn() << *i;
5096 }
5097 }
5098 }
5099
5100 // Usage:
5101 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5102 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5103 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5104 // getomap <pool> [namespace/]<obj-name>
5105 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5106 // injectmdataerr [namespace/]<obj-name> [shardid]
5107 // injectdataerr [namespace/]<obj-name> [shardid]
5108 //
5109 // set_recovery_delay [utime]
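//
// Hypothetical invocations via the OSD admin socket (exact plumbing
// depends on how this hook is registered):
//   ceph daemon osd.0 setomapval 1 myns/myobject mykey myvalue
//   ceph daemon osd.0 truncobj 1 myobject 4096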
5110 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5111 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5112 {
5113 // Test support
5114 // Support changing the omap on a single osd by using the admin socket to
5115 // directly request that the osd make a change.
5116 if (command == "setomapval" || command == "rmomapkey" ||
5117 command == "setomapheader" || command == "getomap" ||
5118 command == "truncobj" || command == "injectmdataerr" ||
5119 command == "injectdataerr"
5120 ) {
5121 pg_t rawpg;
5122 int64_t pool;
5123 OSDMapRef curmap = service->get_osdmap();
5124 int r = -1;
5125
5126 string poolstr;
5127
5128 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5129 pool = curmap->lookup_pg_pool_name(poolstr);
5130 // If we can't find it by name, then maybe an id was specified
5131 if (pool < 0 && isdigit(poolstr[0]))
5132 pool = atoll(poolstr.c_str());
5133 if (pool < 0) {
5134 ss << "Invalid pool" << poolstr;
5135 return;
5136 }
5137
5138 string objname, nspace;
5139 cmd_getval(service->cct, cmdmap, "objname", objname);
5140 std::size_t found = objname.find_first_of('/');
5141 if (found != string::npos) {
5142 nspace = objname.substr(0, found);
5143 objname = objname.substr(found+1);
5144 }
5145 object_locator_t oloc(pool, nspace);
5146 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5147
5148 if (r < 0) {
5149 ss << "Invalid namespace/objname";
5150 return;
5151 }
5152
5153 int64_t shardid;
5154 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5155 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5156 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5157 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5158 if (curmap->pg_is_ec(rawpg)) {
5159 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5160 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5161 return;
5162 }
5163 }
5164
5165 ObjectStore::Transaction t;
5166
5167 if (command == "setomapval") {
5168 map<string, bufferlist> newattrs;
5169 bufferlist val;
5170 string key, valstr;
5171 cmd_getval(service->cct, cmdmap, "key", key);
5172 cmd_getval(service->cct, cmdmap, "val", valstr);
5173
5174 val.append(valstr);
5175 newattrs[key] = val;
5176 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5177 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5178 if (r < 0)
5179 ss << "error=" << r;
5180 else
5181 ss << "ok";
5182 } else if (command == "rmomapkey") {
5183 string key;
5184 set<string> keys;
5185 cmd_getval(service->cct, cmdmap, "key", key);
5186
5187 keys.insert(key);
5188 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5189 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5190 if (r < 0)
5191 ss << "error=" << r;
5192 else
5193 ss << "ok";
5194 } else if (command == "setomapheader") {
5195 bufferlist newheader;
5196 string headerstr;
5197
5198 cmd_getval(service->cct, cmdmap, "header", headerstr);
5199 newheader.append(headerstr);
5200 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5201 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5202 if (r < 0)
5203 ss << "error=" << r;
5204 else
5205 ss << "ok";
5206 } else if (command == "getomap") {
5207 // Debug: output the entire omap
5208 bufferlist hdrbl;
5209 map<string, bufferlist> keyvals;
5210 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5211 if (r >= 0) {
5212 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5213 for (map<string, bufferlist>::iterator it = keyvals.begin();
5214 it != keyvals.end(); ++it)
5215 ss << " key=" << (*it).first << " val="
5216 << string((*it).second.c_str(), (*it).second.length());
5217 } else {
5218 ss << "error=" << r;
5219 }
5220 } else if (command == "truncobj") {
5221 int64_t trunclen;
5222 cmd_getval(service->cct, cmdmap, "len", trunclen);
5223 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5224 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5225 if (r < 0)
5226 ss << "error=" << r;
5227 else
5228 ss << "ok";
5229 } else if (command == "injectdataerr") {
5230 store->inject_data_error(gobj);
5231 ss << "ok";
5232 } else if (command == "injectmdataerr") {
5233 store->inject_mdata_error(gobj);
5234 ss << "ok";
5235 }
5236 return;
5237 }
5238 if (command == "set_recovery_delay") {
5239 int64_t delay;
5240 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5241 ostringstream oss;
5242 oss << delay;
5243 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5244 oss.str().c_str());
5245 if (r != 0) {
5246 ss << "set_recovery_delay: error setting "
5247 << "osd_recovery_delay_start to '" << delay << "': error "
5248 << r;
5249 return;
5250 }
5251 service->cct->_conf->apply_changes(NULL);
5252 ss << "set_recovery_delay: set osd_recovery_delay_start "
5253 << "to " << service->cct->_conf->osd_recovery_delay_start;
5254 return;
5255 }
5256 if (command == "trigger_scrub") {
5257 spg_t pgid;
5258 OSDMapRef curmap = service->get_osdmap();
5259
5260 string pgidstr;
5261
5262 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5263 if (!pgid.parse(pgidstr.c_str())) {
5264 ss << "Invalid pgid specified";
5265 return;
5266 }
5267
5268 PG *pg = service->osd->_lookup_lock_pg(pgid);
5269 if (pg == nullptr) {
5270 ss << "Can't find pg " << pgid;
5271 return;
5272 }
5273
5274 if (pg->is_primary()) {
5275 pg->unreg_next_scrub();
5276 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5277 double pool_scrub_max_interval = 0;
5278 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5279 double scrub_max_interval = pool_scrub_max_interval > 0 ?
5280 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5281 // Instead of marking must_scrub, force a scheduled scrub by backdating the last scrub stamp
5282 utime_t stamp = ceph_clock_now();
5283 stamp -= scrub_max_interval;
5284 stamp -= 100.0; // push back last scrub more for good measure
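      // e.g. with the default osd_scrub_max_interval of one week this makes
      // last_scrub_stamp roughly (now - 604900s), so reg_next_scrub() below
      // queues the PG as already overdue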
5285 pg->info.history.last_scrub_stamp = stamp;
5286 pg->reg_next_scrub();
5287 ss << "ok";
5288 } else {
5289 ss << "Not primary";
5290 }
5291 pg->unlock();
5292 return;
5293 }
5294 if (command == "injectfull") {
5295 int64_t count;
5296 string type;
5297 OSDService::s_names state;
5298 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5299 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5300 if (type == "none" || count == 0) {
5301 type = "none";
5302 count = 0;
5303 }
5304 state = service->get_full_state(type);
5305 if (state == OSDService::s_names::INVALID) {
5306 ss << "Invalid type; use one of none, nearfull, backfillfull, full, failsafe";
5307 return;
5308 }
5309 service->set_injectfull(state, count);
5310 return;
5311 }
5312 ss << "Internal error - command=" << command;
5313 }
5314
5315 // =========================================
5316 bool remove_dir(
5317 CephContext *cct,
5318 ObjectStore *store, SnapMapper *mapper,
5319 OSDriver *osdriver,
5320 ObjectStore::Sequencer *osr,
5321 coll_t coll, DeletingStateRef dstate,
5322 bool *finished,
5323 ThreadPool::TPHandle &handle)
5324 {
5325 vector<ghobject_t> olist;
5326 int64_t num = 0;
5327 ObjectStore::Transaction t;
5328 ghobject_t next;
5329 handle.reset_tp_timeout();
5330 store->collection_list(
5331 coll,
5332 next,
5333 ghobject_t::get_max(),
5334 store->get_ideal_list_max(),
5335 &olist,
5336 &next);
5337 generic_dout(10) << __func__ << " " << olist << dendl;
5338 // default cont to true: this is safe because the caller (OSD::RemoveWQ::_process())
5339 // will recheck the answer before acting on it.
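  // objects are removed in batches of at most osd_target_transaction_size
  // (30 by default) entries; we wait for each batch's transaction to commit
  // before building the next one, so the removal never pins one huge
  // transaction in memory and can be paused between batches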
5340 bool cont = true;
5341 for (vector<ghobject_t>::iterator i = olist.begin();
5342 i != olist.end();
5343 ++i) {
5344 if (i->is_pgmeta())
5345 continue;
5346 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5347 int r = mapper->remove_oid(i->hobj, &_t);
5348 if (r != 0 && r != -ENOENT) {
5349 ceph_abort();
5350 }
5351 t.remove(coll, *i);
5352 if (++num >= cct->_conf->osd_target_transaction_size) {
5353 C_SaferCond waiter;
5354 store->queue_transaction(osr, std::move(t), &waiter);
5355 cont = dstate->pause_clearing();
5356 handle.suspend_tp_timeout();
5357 waiter.wait();
5358 handle.reset_tp_timeout();
5359 if (cont)
5360 cont = dstate->resume_clearing();
5361 if (!cont)
5362 return false;
5363 t = ObjectStore::Transaction();
5364 num = 0;
5365 }
5366 }
5367 if (num) {
5368 C_SaferCond waiter;
5369 store->queue_transaction(osr, std::move(t), &waiter);
5370 cont = dstate->pause_clearing();
5371 handle.suspend_tp_timeout();
5372 waiter.wait();
5373 handle.reset_tp_timeout();
5374 if (cont)
5375 cont = dstate->resume_clearing();
5376 }
5377 // finished is set once the collection listing has been exhausted
5378 *finished = next.is_max();
5379 return cont;
5380 }
5381
5382 void OSD::RemoveWQ::_process(
5383 pair<PGRef, DeletingStateRef> item,
5384 ThreadPool::TPHandle &handle)
5385 {
5386 FUNCTRACE();
5387 PGRef pg(item.first);
5388 SnapMapper &mapper = pg->snap_mapper;
5389 OSDriver &driver = pg->osdriver;
5390 coll_t coll = coll_t(pg->info.pgid);
5391 pg->osr->flush();
5392 bool finished = false;
5393
5394 if (!item.second->start_or_resume_clearing())
5395 return;
5396
5397 bool cont = remove_dir(
5398 pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
5399 &finished, handle);
5400 if (!cont)
5401 return;
5402 if (!finished) {
5403 if (item.second->pause_clearing())
5404 queue_front(item);
5405 return;
5406 }
5407
5408 if (!item.second->start_deleting())
5409 return;
5410
5411 ObjectStore::Transaction t;
5412 PGLog::clear_info_log(pg->info.pgid, &t);
5413
5414 if (cct->_conf->osd_inject_failure_on_pg_removal) {
5415 generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
5416 _exit(1);
5417 }
5418 t.remove_collection(coll);
5419
5420 // We need the sequencer to stick around until the op is complete
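  // the ContainerContext<PGRef> passed below keeps the PG (and with it
  // pg->osr) referenced until the transaction completes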
5421 store->queue_transaction(
5422 pg->osr.get(),
5423 std::move(t),
5424 0, // onapplied
5425 0, // oncommit
5426 0, // onreadable sync
5427 new ContainerContext<PGRef>(pg),
5428 TrackedOpRef());
5429
5430 item.second->finish_deleting();
5431 }
5432 // =========================================
5433
5434 void OSD::ms_handle_connect(Connection *con)
5435 {
5436 dout(10) << __func__ << " con " << con << dendl;
5437 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
5438 Mutex::Locker l(osd_lock);
5439 if (is_stopping())
5440 return;
5441 dout(10) << __func__ << " on mon" << dendl;
5442
5443 if (is_preboot()) {
5444 start_boot();
5445 } else if (is_booting()) {
5446 _send_boot(); // resend boot message
5447 } else {
5448 map_lock.get_read();
5449 Mutex::Locker l2(mon_report_lock);
5450
5451 utime_t now = ceph_clock_now();
5452 last_mon_report = now;
5453
5454 // resend everything, it's a new session
5455 send_full_update();
5456 send_alive();
5457 service.requeue_pg_temp();
5458 service.send_pg_temp();
5459 requeue_failures();
5460 send_failures();
5461 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5462 send_pg_stats(now);
5463 }
5464
5465 map_lock.put_read();
5466 if (is_active()) {
5467 send_beacon(ceph::coarse_mono_clock::now());
5468 }
5469 }
5470
5471 // full map requests may happen while active or pre-boot
5472 if (requested_full_first) {
5473 rerequest_full_maps();
5474 }
5475 }
5476 }
5477
5478 void OSD::ms_handle_fast_connect(Connection *con)
5479 {
5480 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5481 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5482 Session *s = static_cast<Session*>(con->get_priv());
5483 if (!s) {
5484 s = new Session(cct);
5485 con->set_priv(s->get());
5486 s->con = con;
5487 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5488 << " addr=" << s->con->get_peer_addr() << dendl;
5489 // we don't connect to clients
5490 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5491 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5492 }
5493 s->put();
5494 }
5495 }
5496
5497 void OSD::ms_handle_fast_accept(Connection *con)
5498 {
5499 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5500 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5501 Session *s = static_cast<Session*>(con->get_priv());
5502 if (!s) {
5503 s = new Session(cct);
5504 con->set_priv(s->get());
5505 s->con = con;
5506 dout(10) << "new session (incoming) " << s << " con=" << con
5507 << " addr=" << con->get_peer_addr()
5508 << " must have raced with connect" << dendl;
5509 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5510 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5511 }
5512 s->put();
5513 }
5514 }
5515
5516 bool OSD::ms_handle_reset(Connection *con)
5517 {
5518 Session *session = static_cast<Session*>(con->get_priv());
5519 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5520 if (!session)
5521 return false;
5522 session->wstate.reset(con);
5523 session->con.reset(NULL); // break con <-> session ref cycle
5524 // note that we break session->con *before* the session_handle_reset
5525 // cleanup below. this avoids a race between us and
5526 // PG::add_backoff, Session::check_backoff, etc.
5527 session_handle_reset(session);
5528 session->put();
5529 return true;
5530 }
5531
5532 bool OSD::ms_handle_refused(Connection *con)
5533 {
5534 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5535 return false;
5536
5537 Session *session = static_cast<Session*>(con->get_priv());
5538 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5539 if (!session)
5540 return false;
5541 int type = con->get_peer_type();
5542 // handle only OSD failures here
5543 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5544 OSDMapRef osdmap = get_osdmap();
5545 if (osdmap) {
5546 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5547 if (id >= 0 && osdmap->is_up(id)) {
5548 // We bypass the mon heartbeat grace logic here because we know the peer
5549 // is not going to respawn on its own. +1 so we won't hit any boundary case.
5550 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5551 osdmap->get_inst(id),
5552 cct->_conf->osd_heartbeat_grace + 1,
5553 osdmap->get_epoch(),
5554 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5555 ));
5556 }
5557 }
5558 }
5559 session->put();
5560 return true;
5561 }
5562
5563 struct C_OSD_GetVersion : public Context {
5564 OSD *osd;
5565 uint64_t oldest, newest;
5566 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5567 void finish(int r) override {
5568 if (r >= 0)
5569 osd->_got_mon_epochs(oldest, newest);
5570 }
5571 };
5572
5573 void OSD::start_boot()
5574 {
5575 if (!_is_healthy()) {
5576 // if we are not healthy, do not mark ourselves up (yet)
5577 dout(1) << "not healthy; waiting to boot" << dendl;
5578 if (!is_waiting_for_healthy())
5579 start_waiting_for_healthy();
5580 // send pings sooner rather than later
5581 heartbeat_kick();
5582 return;
5583 }
5584 dout(1) << __func__ << dendl;
5585 set_state(STATE_PREBOOT);
5586 dout(10) << "start_boot - have maps " << superblock.oldest_map
5587 << ".." << superblock.newest_map << dendl;
5588 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5589 monc->get_version("osdmap", &c->newest, &c->oldest, c);
5590 }
5591
5592 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5593 {
5594 Mutex::Locker l(osd_lock);
5595 if (is_preboot()) {
5596 _preboot(oldest, newest);
5597 }
5598 }
5599
5600 void OSD::_preboot(epoch_t oldest, epoch_t newest)
5601 {
5602 assert(is_preboot());
5603 dout(10) << __func__ << " mon has osdmaps "
5604 << oldest << ".." << newest << dendl;
5605
5606 // ensure our local fullness awareness is accurate
5607 heartbeat();
5608
5609 // if our map is within recent history, try to add ourselves to the osdmap.
5610 if (osdmap->get_epoch() == 0) {
5611 derr << "waiting for initial osdmap" << dendl;
5612 } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
5613 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5614 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5615 derr << "osdmap SORTBITWISE flag is NOT set; please set it"
5616 << dendl;
5617 } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
5618 derr << "osdmap REQUIRE_JEWEL flag is NOT set; please set it"
5619 << dendl;
5620 } else if (!monc->monmap.get_required_features().contains_all(
5621 ceph::features::mon::FEATURE_LUMINOUS)) {
5622 derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5623 << "Luminous or later before Luminous OSDs will boot" << dendl;
5624 } else if (service.need_fullness_update()) {
5625 derr << "osdmap fullness state needs update" << dendl;
5626 send_full_update();
5627 } else if (osdmap->get_epoch() >= oldest - 1 &&
5628 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
5629 _send_boot();
5630 return;
5631 }
5632
5633 // get all the latest maps
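  // if our next epoch is at least as new as the oldest map the mon still
  // has, we can catch up with incremental maps; otherwise our map is too
  // stale and we must start over from a full map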
5634 if (osdmap->get_epoch() + 1 >= oldest)
5635 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5636 else
5637 osdmap_subscribe(oldest - 1, true);
5638 }
5639
5640 void OSD::send_full_update()
5641 {
5642 if (!service.need_fullness_update())
5643 return;
5644 unsigned state = 0;
5645 if (service.is_full()) {
5646 state = CEPH_OSD_FULL;
5647 } else if (service.is_backfillfull()) {
5648 state = CEPH_OSD_BACKFILLFULL;
5649 } else if (service.is_nearfull()) {
5650 state = CEPH_OSD_NEARFULL;
5651 }
5652 set<string> s;
5653 OSDMap::calc_state_set(state, s);
5654 dout(10) << __func__ << " want state " << s << dendl;
5655 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
5656 }
5657
5658 void OSD::start_waiting_for_healthy()
5659 {
5660 dout(1) << "start_waiting_for_healthy" << dendl;
5661 set_state(STATE_WAITING_FOR_HEALTHY);
5662 last_heartbeat_resample = utime_t();
5663 }
5664
5665 bool OSD::_is_healthy()
5666 {
5667 if (!cct->get_heartbeat_map()->is_healthy()) {
5668 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
5669 return false;
5670 }
5671
5672 if (is_waiting_for_healthy()) {
5673 Mutex::Locker l(heartbeat_lock);
5674 utime_t cutoff = ceph_clock_now();
5675 cutoff -= cct->_conf->osd_heartbeat_grace;
5676 int num = 0, up = 0;
5677 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5678 p != heartbeat_peers.end();
5679 ++p) {
5680 if (p->second.is_healthy(cutoff))
5681 ++up;
5682 ++num;
5683 }
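    // e.g. with the default osd_heartbeat_min_healthy_ratio of 1/3, an OSD
    // with 9 heartbeat peers must have heard from at least 3 of them within
    // the grace period to be considered healthy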
5684 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
5685 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
5686 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
5687 return false;
5688 }
5689 }
5690
5691 return true;
5692 }
5693
5694 void OSD::_send_boot()
5695 {
5696 dout(10) << "_send_boot" << dendl;
5697 entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
5698 Connection *local_connection = cluster_messenger->get_loopback_connection().get();
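  // a blank (0.0.0.0) address means no cluster network was explicitly
  // bound; assume it shares the client network's IP and keep the port we
  // actually bound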
5699 if (cluster_addr.is_blank_ip()) {
5700 int port = cluster_addr.get_port();
5701 cluster_addr = client_messenger->get_myaddr();
5702 cluster_addr.set_port(port);
5703 cluster_messenger->set_addr_unknowns(cluster_addr);
5704 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
5705 } else {
5706 Session *s = static_cast<Session*>(local_connection->get_priv());
5707 if (s)
5708 s->put();
5709 else
5710 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
5711 }
5712
5713 entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
5714 local_connection = hb_back_server_messenger->get_loopback_connection().get();
5715 if (hb_back_addr.is_blank_ip()) {
5716 int port = hb_back_addr.get_port();
5717 hb_back_addr = cluster_addr;
5718 hb_back_addr.set_port(port);
5719 hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
5720 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
5721 } else {
5722 Session *s = static_cast<Session*>(local_connection->get_priv());
5723 if (s)
5724 s->put();
5725 else
5726 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5727 }
5728
5729 entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
5730 local_connection = hb_front_server_messenger->get_loopback_connection().get();
5731 if (hb_front_addr.is_blank_ip()) {
5732 int port = hb_front_addr.get_port();
5733 hb_front_addr = client_messenger->get_myaddr();
5734 hb_front_addr.set_port(port);
5735 hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
5736 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
5737 } else {
5738 Session *s = static_cast<Session*>(local_connection->get_priv());
5739 if (s)
5740 s->put();
5741 else
5742 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5743 }
5744
5745 MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
5746 hb_back_addr, hb_front_addr, cluster_addr,
5747 CEPH_FEATURES_ALL);
5748 dout(10) << " client_addr " << client_messenger->get_myaddr()
5749 << ", cluster_addr " << cluster_addr
5750 << ", hb_back_addr " << hb_back_addr
5751 << ", hb_front_addr " << hb_front_addr
5752 << dendl;
5753 _collect_metadata(&mboot->metadata);
5754 monc->send_mon_message(mboot);
5755 set_state(STATE_BOOTING);
5756 }
5757
5758 void OSD::_collect_metadata(map<string,string> *pm)
5759 {
5760 // config info
5761 (*pm)["osd_data"] = dev_path;
5762 (*pm)["osd_journal"] = journal_path;
5763 (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
5764 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
5765 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
5766 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());
5767
5768 // backend
5769 (*pm)["osd_objectstore"] = store->get_type();
5770 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
5771 store->collect_metadata(pm);
5772
5773 collect_sys_info(pm, cct);
5774
5775 dout(10) << __func__ << " " << *pm << dendl;
5776 }
5777
5778 void OSD::queue_want_up_thru(epoch_t want)
5779 {
5780 map_lock.get_read();
5781 epoch_t cur = osdmap->get_up_thru(whoami);
5782 Mutex::Locker l(mon_report_lock);
5783 if (want > up_thru_wanted) {
5784 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
5785 << ", currently " << cur
5786 << dendl;
5787 up_thru_wanted = want;
5788 send_alive();
5789 } else {
5790 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
5791 << ", currently " << cur
5792 << dendl;
5793 }
5794 map_lock.put_read();
5795 }
5796
5797 void OSD::send_alive()
5798 {
5799 assert(mon_report_lock.is_locked());
5800 if (!osdmap->exists(whoami))
5801 return;
5802 epoch_t up_thru = osdmap->get_up_thru(whoami);
5803 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
5804 if (up_thru_wanted > up_thru) {
5805 dout(10) << "send_alive want " << up_thru_wanted << dendl;
5806 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
5807 }
5808 }
5809
5810 void OSD::request_full_map(epoch_t first, epoch_t last)
5811 {
5812 dout(10) << __func__ << " " << first << ".." << last
5813 << ", previously requested "
5814 << requested_full_first << ".." << requested_full_last << dendl;
5815 assert(osd_lock.is_locked());
5816 assert(first > 0 && last > 0);
5817 assert(first <= last);
5818 assert(first >= requested_full_first); // we shouldn't ever ask for older maps
5819 if (requested_full_first == 0) {
5820 // first request
5821 requested_full_first = first;
5822 requested_full_last = last;
5823 } else if (last <= requested_full_last) {
5824 // dup
5825 return;
5826 } else {
5827 // additional request
5828 first = requested_full_last + 1;
5829 requested_full_last = last;
5830 }
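  // e.g. if we previously requested 10..20 and are now asked for 15..30,
  // only 21..30 is actually requested from the mon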
5831 MMonGetOSDMap *req = new MMonGetOSDMap;
5832 req->request_full(first, last);
5833 monc->send_mon_message(req);
5834 }
5835
5836 void OSD::got_full_map(epoch_t e)
5837 {
5838 assert(requested_full_first <= requested_full_last);
5839 assert(osd_lock.is_locked());
5840 if (requested_full_first == 0) {
5841 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
5842 return;
5843 }
5844 if (e < requested_full_first) {
5845 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5846 << ".." << requested_full_last
5847 << ", ignoring" << dendl;
5848 return;
5849 }
5850 if (e >= requested_full_last) {
5851 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5852 << ".." << requested_full_last << ", resetting" << dendl;
5853 requested_full_first = requested_full_last = 0;
5854 return;
5855 }
5856
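  // e.g. if we requested 10..20 and just received 15, epochs 16..20 are
  // still outstanding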
5857 requested_full_first = e + 1;
5858
5859 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5860 << ".." << requested_full_last
5861 << ", still need more" << dendl;
5862 }
5863
5864 void OSD::requeue_failures()
5865 {
5866 Mutex::Locker l(heartbeat_lock);
5867 unsigned old_queue = failure_queue.size();
5868 unsigned old_pending = failure_pending.size();
5869 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
5870 failure_pending.begin();
5871 p != failure_pending.end(); ) {
5872 failure_queue[p->first] = p->second.first;
5873 failure_pending.erase(p++);
5874 }
5875 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
5876 << failure_queue.size() << dendl;
5877 }
5878
5879 void OSD::send_failures()
5880 {
5881 assert(map_lock.is_locked());
5882 assert(mon_report_lock.is_locked());
5883 Mutex::Locker l(heartbeat_lock);
5884 utime_t now = ceph_clock_now();
5885 while (!failure_queue.empty()) {
5886 int osd = failure_queue.begin()->first;
5887 if (!failure_pending.count(osd)) {
5888 entity_inst_t i = osdmap->get_inst(osd);
5889 int failed_for = (int)(double)(now - failure_queue.begin()->second);
5890 monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
5891 osdmap->get_epoch()));
5892 failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
5893 }
5894 failure_queue.erase(osd);
5895 }
5896 }
5897
5898 void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
5899 {
5900 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
5901 monc->send_mon_message(m);
5902 }
5903
5904 void OSD::send_pg_stats(const utime_t &now)
5905 {
5906 assert(map_lock.is_locked());
5907 assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
5908 dout(20) << "send_pg_stats" << dendl;
5909
5910 osd_stat_t cur_stat = service.get_osd_stat();
5911
5912 cur_stat.os_perf_stat = store->get_cur_stats();
5913
5914 pg_stat_queue_lock.Lock();
5915
5916 if (osd_stat_updated || !pg_stat_queue.empty()) {
5917 last_pg_stats_sent = now;
5918 osd_stat_updated = false;
5919
5920 dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;
5921
5922 utime_t had_for(now);
5923 had_for -= had_map_since;
5924
5925 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
5926
5927 uint64_t tid = ++pg_stat_tid;
5928 m->set_tid(tid);
5929 m->osd_stat = cur_stat;
5930
5931 xlist<PG*>::iterator p = pg_stat_queue.begin();
5932 while (!p.end()) {
5933 PG *pg = *p;
5934 ++p;
5935 if (!pg->is_primary()) { // we hold map_lock; role is stable.
5936 pg->stat_queue_item.remove_myself();
5937 pg->put("pg_stat_queue");
5938 continue;
5939 }
5940 pg->pg_stats_publish_lock.Lock();
5941 if (pg->pg_stats_publish_valid) {
5942 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
5943 dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
5944 << pg->pg_stats_publish.reported_seq << dendl;
5945 } else {
5946 dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
5947 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
5948 }
5949 pg->pg_stats_publish_lock.Unlock();
5950 }
5951
5952 if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
5953 last_pg_stats_ack = ceph_clock_now();
5954 }
5955 outstanding_pg_stats.insert(tid);
5956 dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;
5957
5958 monc->send_mon_message(m);
5959 }
5960
5961 pg_stat_queue_lock.Unlock();
5962 }
5963
5964 void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
5965 {
5966 dout(10) << "handle_pg_stats_ack " << dendl;
5967
5968 if (!require_mon_peer(ack)) {
5969 ack->put();
5970 return;
5971 }
5972
5973 // NOTE: we may get replies from a previous mon even while
5974 // outstanding_pg_stats is empty if reconnecting races with replies
5975 // in flight.
5976
5977 pg_stat_queue_lock.Lock();
5978
5979 last_pg_stats_ack = ceph_clock_now();
5980
5981 // decay timeout slowly (analogous to TCP)
5982 stats_ack_timeout =
5983 MAX(cct->_conf->osd_mon_ack_timeout,
5984 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
5985 dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
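  // e.g. with osd_stats_ack_timeout_decay = 0.9 (the default) each ack
  // shrinks the timeout by 10%, but never below osd_mon_ack_timeout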
5986
5987 if (ack->get_tid() > pg_stat_tid_flushed) {
5988 pg_stat_tid_flushed = ack->get_tid();
5989 pg_stat_queue_cond.Signal();
5990 }
5991
5992 xlist<PG*>::iterator p = pg_stat_queue.begin();
5993 while (!p.end()) {
5994 PG *pg = *p;
5995 PGRef _pg(pg);
5996 ++p;
5997
5998 auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
5999 if (acked != ack->pg_stat.end()) {
6000 pg->pg_stats_publish_lock.Lock();
6001 if (acked->second.first == pg->pg_stats_publish.reported_seq &&
6002 acked->second.second == pg->pg_stats_publish.reported_epoch) {
6003 dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6004 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6005 pg->stat_queue_item.remove_myself();
6006 pg->put("pg_stat_queue");
6007 } else {
6008 dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6009 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
6010 << acked->second << dendl;
6011 }
6012 pg->pg_stats_publish_lock.Unlock();
6013 } else {
6014 dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6015 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6016 }
6017 }
6018
6019 outstanding_pg_stats.erase(ack->get_tid());
6020 dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;
6021
6022 pg_stat_queue_lock.Unlock();
6023
6024 ack->put();
6025 }
6026
6027 void OSD::flush_pg_stats()
6028 {
6029 dout(10) << "flush_pg_stats" << dendl;
6030 osd_lock.Unlock();
6031 utime_t now = ceph_clock_now();
6032 map_lock.get_read();
6033 mon_report_lock.Lock();
6034 send_pg_stats(now);
6035 mon_report_lock.Unlock();
6036 map_lock.put_read();
6037
6038
6039 pg_stat_queue_lock.Lock();
6040 uint64_t tid = pg_stat_tid;
6041 dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
6042 while (tid > pg_stat_tid_flushed)
6043 pg_stat_queue_cond.Wait(pg_stat_queue_lock);
6044 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
6045 pg_stat_queue_lock.Unlock();
6046
6047 osd_lock.Lock();
6048 }
6049
6050 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6051 {
6052 const auto& monmap = monc->monmap;
6053 // we may be invoked right after (re)connecting, before the monmap has
6054 // been received; in that case (epoch == 0) do not send a beacon yet.
6055 if (monmap.epoch > 0 &&
6056 monmap.get_required_features().contains_all(
6057 ceph::features::mon::FEATURE_LUMINOUS)) {
6058 dout(20) << __func__ << " sending" << dendl;
6059 last_sent_beacon = now;
6060 MOSDBeacon* beacon = nullptr;
6061 {
6062 Mutex::Locker l{min_last_epoch_clean_lock};
6063 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6064 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
6065 }
6066 monc->send_mon_message(beacon);
6067 } else {
6068 dout(20) << __func__ << " not sending" << dendl;
6069 }
6070 }
6071
6072 void OSD::handle_command(MMonCommand *m)
6073 {
6074 if (!require_mon_peer(m)) {
6075 m->put();
6076 return;
6077 }
6078
6079 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6080 command_wq.queue(c);
6081 m->put();
6082 }
6083
6084 void OSD::handle_command(MCommand *m)
6085 {
6086 ConnectionRef con = m->get_connection();
6087 Session *session = static_cast<Session *>(con->get_priv());
6088 if (!session) {
6089 con->send_message(new MCommandReply(m, -EPERM));
6090 m->put();
6091 return;
6092 }
6093
6094 OSDCap& caps = session->caps;
6095 session->put();
6096
6097 if (!caps.allow_all() || m->get_source().is_mon()) {
6098 con->send_message(new MCommandReply(m, -EPERM));
6099 m->put();
6100 return;
6101 }
6102
6103 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6104 command_wq.queue(c);
6105
6106 m->put();
6107 }
6108
6109 struct OSDCommand {
6110 string cmdstring;
6111 string helpstring;
6112 string module;
6113 string perm;
6114 string availability;
6115 } osd_commands[] = {
6116
6117 #define COMMAND(parsesig, helptext, module, perm, availability) \
6118 {parsesig, helptext, module, perm, availability},
6119
6120 // yes, these are really pg commands, but there's a limit to how
6121 // much work it's worth. The OSD returns all of them. Make this
6122 // form (pg <pgid> <cmd>) valid only for the cli.
6123 // The REST interface uses "tell <pgid> <cmd>".
6124
6125 COMMAND("pg " \
6126 "name=pgid,type=CephPgid " \
6127 "name=cmd,type=CephChoices,strings=query", \
6128 "show details of a specific pg", "osd", "r", "cli")
6129 COMMAND("pg " \
6130 "name=pgid,type=CephPgid " \
6131 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6132 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6133 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6134 "osd", "rw", "cli")
6135 COMMAND("pg " \
6136 "name=pgid,type=CephPgid " \
6137 "name=cmd,type=CephChoices,strings=list_missing " \
6138 "name=offset,type=CephString,req=false",
6139 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6140 "osd", "r", "cli")
6141
6142 // new form: tell <pgid> <cmd> for both cli and rest
6143
6144 COMMAND("query",
6145 "show details of a specific pg", "osd", "r", "cli,rest")
6146 COMMAND("mark_unfound_lost " \
6147 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6148 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6149 "osd", "rw", "cli,rest")
6150 COMMAND("list_missing " \
6151 "name=offset,type=CephString,req=false",
6152 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6153 "osd", "r", "cli,rest")
6154 COMMAND("perf histogram dump "
6155 "name=logger,type=CephString,req=false "
6156 "name=counter,type=CephString,req=false",
6157 "Get histogram data",
6158 "osd", "r", "cli,rest")
6159
6160 // tell <osd.n> commands. Validation of osd.n must be special-cased in client
6161 COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
6162 COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
6163 COMMAND("injectargs " \
6164 "name=injected_args,type=CephString,n=N",
6165 "inject configuration arguments into running OSD",
6166 "osd", "rw", "cli,rest")
6167 COMMAND("cluster_log " \
6168 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6169 "name=message,type=CephString,n=N",
6170 "log a message to the cluster log",
6171 "osd", "rw", "cli,rest")
6172 COMMAND("bench " \
6173 "name=count,type=CephInt,req=false " \
6174 "name=size,type=CephInt,req=false " \
6175 "name=object_size,type=CephInt,req=false " \
6176 "name=object_num,type=CephInt,req=false ", \
6177 "OSD benchmark: write <count> <size>-byte objects, " \
6178 "(default 1G size 4MB). Results in log.",
6179 "osd", "rw", "cli,rest")
6180 COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
6181 COMMAND("heap " \
6182 "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
6183 "show heap usage info (available only if compiled with tcmalloc)", \
6184 "osd", "rw", "cli,rest")
6185 COMMAND("debug dump_missing " \
6186 "name=filename,type=CephFilepath",
6187 "dump missing objects to a named file", "osd", "r", "cli,rest")
6188 COMMAND("debug kick_recovery_wq " \
6189 "name=delay,type=CephInt,range=0",
6190 "set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
6191 COMMAND("cpu_profiler " \
6192 "name=arg,type=CephChoices,strings=status|flush",
6193 "run cpu profiling on daemon", "osd", "rw", "cli,rest")
6194 COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6195 "osd", "r", "cli,rest")
6196 COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6197 "osd", "rw", "cli,rest")
6198 };
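// These descriptions are what "ceph tell osd.<id> <command>" is validated
// against; for example (against a hypothetical osd.0):
//   ceph tell osd.0 version
//   ceph tell osd.0 bench 1073741824 4194304
//   ceph tell osd.0 injectargs '--debug-osd 20'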
6199
6200 void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6201 {
6202 int r = 0;
6203 stringstream ss, ds;
6204 string rs;
6205 bufferlist odata;
6206
6207 dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6208
6209 map<string, cmd_vartype> cmdmap;
6210 string prefix;
6211 string format;
6212 string pgidstr;
6213 boost::scoped_ptr<Formatter> f;
6214
6215 if (cmd.empty()) {
6216 ss << "no command given";
6217 goto out;
6218 }
6219
6220 if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6221 r = -EINVAL;
6222 goto out;
6223 }
6224
6225 cmd_getval(cct, cmdmap, "prefix", prefix);
6226
6227 if (prefix == "get_command_descriptions") {
6228 int cmdnum = 0;
6229 JSONFormatter *f = new JSONFormatter();
6230 f->open_object_section("command_descriptions");
6231 for (OSDCommand *cp = osd_commands;
6232 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {
6233
6234 ostringstream secname;
6235 secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
6236 dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
6237 cp->module, cp->perm, cp->availability, 0);
6238 cmdnum++;
6239 }
6240 f->close_section(); // command_descriptions
6241
6242 f->flush(ds);
6243 delete f;
6244 goto out;
6245 }
6246
6247 cmd_getval(cct, cmdmap, "format", format);
6248 f.reset(Formatter::create(format));
6249
6250 if (prefix == "version") {
6251 if (f) {
6252 f->open_object_section("version");
6253 f->dump_string("version", pretty_version_to_str());
6254 f->close_section();
6255 f->flush(ds);
6256 } else {
6257 ds << pretty_version_to_str();
6258 }
6259 goto out;
6260 }
6261 else if (prefix == "injectargs") {
6262 vector<string> argsvec;
6263 cmd_getval(cct, cmdmap, "injected_args", argsvec);
6264
6265 if (argsvec.empty()) {
6266 r = -EINVAL;
6267 ss << "ignoring empty injectargs";
6268 goto out;
6269 }
6270 string args = argsvec.front();
6271 for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
6272 args += " " + *a;
6273 osd_lock.Unlock();
6274 r = cct->_conf->injectargs(args, &ss);
6275 osd_lock.Lock();
6276 }
6277 else if (prefix == "cluster_log") {
6278 vector<string> msg;
6279 cmd_getval(cct, cmdmap, "message", msg);
6280 if (msg.empty()) {
6281 r = -EINVAL;
6282 ss << "ignoring empty log message";
6283 goto out;
6284 }
6285 string message = msg.front();
6286 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
6287 message += " " + *a;
6288 string lvl;
6289 cmd_getval(cct, cmdmap, "level", lvl);
6290 clog_type level = string_to_clog_type(lvl);
6291 if (level < 0) {
6292 r = -EINVAL;
6293 ss << "unknown level '" << lvl << "'";
6294 goto out;
6295 }
6296 clog->do_log(level, message);
6297 }
6298
6299 // either 'pg <pgid> <command>' or
6300 // 'tell <pgid>' (which comes in without any of that prefix)?
6301
6302 else if (prefix == "pg" ||
6303 prefix == "query" ||
6304 prefix == "mark_unfound_lost" ||
6305 prefix == "list_missing"
6306 ) {
6307 pg_t pgid;
6308
6309 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
6310 ss << "no pgid specified";
6311 r = -EINVAL;
6312 } else if (!pgid.parse(pgidstr.c_str())) {
6313 ss << "couldn't parse pgid '" << pgidstr << "'";
6314 r = -EINVAL;
6315 } else {
6316 spg_t pcand;
6317 PG *pg = nullptr;
6318 if (osdmap->get_primary_shard(pgid, &pcand) &&
6319 (pg = _lookup_lock_pg(pcand))) {
6320 if (pg->is_primary()) {
6321 // simulate pg <pgid> cmd= for pg->do-command
6322 if (prefix != "pg")
6323 cmd_putval(cct, cmdmap, "cmd", prefix);
6324 r = pg->do_command(cmdmap, ss, data, odata, con, tid);
6325 if (r == -EAGAIN) {
6326 pg->unlock();
6327 // don't reply, pg will do so async
6328 return;
6329 }
6330 } else {
6331 ss << "not primary for pgid " << pgid;
6332
6333 // send them the latest diff to ensure they realize the mapping
6334 // has changed.
6335 service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);
6336
6337 // do not reply; they will get newer maps and realize they
6338 // need to resend.
6339 pg->unlock();
6340 return;
6341 }
6342 pg->unlock();
6343 } else {
6344 ss << "I don't have pgid " << pgid;
6345 r = -ENOENT;
6346 }
6347 }
6348 }
6349
6350 else if (prefix == "bench") {
6351 int64_t count;
6352 int64_t bsize;
6353 int64_t osize, onum;
6354 // default count 1G, size 4MB
6355 cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
6356 cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
6357 cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
6358 cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
6359
6360 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
6361 ObjectStore::Sequencer>("bench"));
6362
6363 uint32_t duration = cct->_conf->osd_bench_duration;
6364
6365 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
6366 // let us limit the block size because the next checks rely on it
6367 // having a sane value. If we allow any block size to be set things
6368 // can still go sideways.
6369 ss << "block 'size' values are capped at "
6370 << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
6371 << " a higher value, please adjust 'osd_bench_max_block_size'";
6372 r = -EINVAL;
6373 goto out;
6374 } else if (bsize < (int64_t) (1 << 20)) {
6375 // entering the realm of small block sizes.
6376 // limit the count to a sane value, assuming a configurable amount of
6377 // IOPS and duration, so that the OSD doesn't get hung up on this and
6378 // trip its internal heartbeat timeouts
6379 int64_t max_count =
6380 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
6381 if (count > max_count) {
6382 ss << "'count' values greater than " << max_count
6383 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6384 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
6385 << " for " << duration << " seconds,"
6386 << " can cause ill effects on osd. "
6387 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6388 << " value if you wish to use a higher 'count'.";
6389 r = -EINVAL;
6390 goto out;
6391 }
6392 } else {
6393 // 1MB block sizes are big enough so that we get more stuff done.
6394 // However, to avoid the osd from getting hung on this and having
6395 // timers being triggered, we are going to limit the count assuming
6396 // a configurable throughput and duration.
6397 // NOTE: max_count is the total amount of bytes that we believe we
6398 // will be able to write during 'duration' for the given
6399 // throughput. The block size hardly impacts this unless it's
6400 // way too big. Given we already check how big the block size
6401 // is, it's safe to assume everything will check out.
6402 int64_t max_count =
6403 cct->_conf->osd_bench_large_size_max_throughput * duration;
6404 if (count > max_count) {
6405 ss << "'count' values greater than " << max_count
6406 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6407 << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
6408 << " for " << duration << " seconds,"
6409 << " can cause ill effects on osd. "
6410 << " Please adjust 'osd_bench_large_size_max_throughput'"
6411 << " with a higher value if you wish to use a higher 'count'.";
6412 r = -EINVAL;
6413 goto out;
6414 }
6415 }
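    // e.g. with the default limits (osd_bench_duration = 30s,
    // osd_bench_small_size_max_iops = 100) a 4KB block caps 'count' at
    // 4096 * 30 * 100 bytes (~11.7MB); the large-block path with
    // osd_bench_large_size_max_throughput = 100MB/s caps it at ~3GB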
6416
6417 if (osize && bsize > osize)
6418 bsize = osize;
6419
6420 dout(1) << " bench count " << count
6421 << " bsize " << prettybyte_t(bsize) << dendl;
6422
6423 ObjectStore::Transaction cleanupt;
6424
6425 if (osize && onum) {
6426 bufferlist bl;
6427 bufferptr bp(osize);
6428 bp.zero();
6429 bl.push_back(std::move(bp));
6430 bl.rebuild_page_aligned();
6431 for (int i=0; i<onum; ++i) {
6432 char nm[30];
6433 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
6434 object_t oid(nm);
6435 hobject_t soid(sobject_t(oid, 0));
6436 ObjectStore::Transaction t;
6437 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
6438 store->queue_transaction(osr.get(), std::move(t), NULL);
6439 cleanupt.remove(coll_t(), ghobject_t(soid));
6440 }
6441 }
6442
6443 bufferlist bl;
6444 bufferptr bp(bsize);
6445 bp.zero();
6446 bl.push_back(std::move(bp));
6447 bl.rebuild_page_aligned();
6448
6449 {
6450 C_SaferCond waiter;
6451 if (!osr->flush_commit(&waiter)) {
6452 waiter.wait();
6453 }
6454 }
6455
6456 utime_t start = ceph_clock_now();
6457 for (int64_t pos = 0; pos < count; pos += bsize) {
6458 char nm[30];
6459 unsigned offset = 0;
6460 if (onum && osize) {
6461 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
6462 offset = rand() % (osize / bsize) * bsize;
6463 } else {
6464 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
6465 }
6466 object_t oid(nm);
6467 hobject_t soid(sobject_t(oid, 0));
6468 ObjectStore::Transaction t;
6469 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
6470 store->queue_transaction(osr.get(), std::move(t), NULL);
6471 if (!onum || !osize)
6472 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
6473 }
6474
6475 {
6476 C_SaferCond waiter;
6477 if (!osr->flush_commit(&waiter)) {
6478 waiter.wait();
6479 }
6480 }
6481 utime_t end = ceph_clock_now();
6482
6483 // clean up
6484 store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
6485 {
6486 C_SaferCond waiter;
6487 if (!osr->flush_commit(&waiter)) {
6488 waiter.wait();
6489 }
6490 }
6491
6492 uint64_t rate = (double)count / (end - start);
6493 if (f) {
6494 f->open_object_section("osd_bench_results");
6495 f->dump_int("bytes_written", count);
6496 f->dump_int("blocksize", bsize);
6497 f->dump_unsigned("bytes_per_sec", rate);
6498 f->close_section();
6499 f->flush(ss);
6500 } else {
6501 ss << "bench: wrote " << prettybyte_t(count)
6502 << " in blocks of " << prettybyte_t(bsize) << " in "
6503 << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
6504 }
6505 }
6506
6507 else if (prefix == "flush_pg_stats") {
6508 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6509 mgrc.send_pgstats();
6510 ds << service.get_osd_stat_seq() << "\n";
6511 } else {
6512 flush_pg_stats();
6513 }
6514 }
6515
6516 else if (prefix == "heap") {
6517 r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
6518 }
6519
6520 else if (prefix == "debug dump_missing") {
6521 string file_name;
6522 cmd_getval(cct, cmdmap, "filename", file_name);
6523 std::ofstream fout(file_name.c_str());
6524 if (!fout.is_open()) {
6525 ss << "failed to open file '" << file_name << "'";
6526 r = -EINVAL;
6527 goto out;
6528 }
6529
6530 fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
6531 RWLock::RLocker l(pg_map_lock);
6532 for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
6533 pg_map_e != pg_map.end(); ++pg_map_e) {
6534 PG *pg = pg_map_e->second;
6535 pg->lock();
6536
6537 fout << *pg << std::endl;
6538 std::map<hobject_t, pg_missing_item>::const_iterator mend =
6539 pg->pg_log.get_missing().get_items().end();
6540 std::map<hobject_t, pg_missing_item>::const_iterator mi =
6541 pg->pg_log.get_missing().get_items().begin();
6542 for (; mi != mend; ++mi) {
6543 fout << mi->first << " -> " << mi->second << std::endl;
6544 if (!pg->missing_loc.needs_recovery(mi->first))
6545 continue;
6546 if (pg->missing_loc.is_unfound(mi->first))
6547 fout << " unfound ";
6548 const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
6549 if (mls.empty())
6550 continue;
6551 fout << "missing_loc: " << mls << std::endl;
6552 }
6553 pg->unlock();
6554 fout << std::endl;
6555 }
6556
6557 fout.close();
6558 }
6559 else if (prefix == "debug kick_recovery_wq") {
6560 int64_t delay;
6561 cmd_getval(cct, cmdmap, "delay", delay);
6562 ostringstream oss;
6563 oss << delay;
6564 r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
6565 if (r != 0) {
6566 ss << "kick_recovery_wq: error setting "
6567 << "osd_recovery_delay_start to '" << delay << "': error "
6568 << r;
6569 goto out;
6570 }
6571 cct->_conf->apply_changes(NULL);
6572 ss << "kicking recovery queue. set osd_recovery_delay_start "
6573 << "to " << cct->_conf->osd_recovery_delay_start;
6574 }
6575
6576 else if (prefix == "cpu_profiler") {
6577 string arg;
6578 cmd_getval(cct, cmdmap, "arg", arg);
6579 vector<string> argvec;
6580 get_str_vec(arg, argvec);
6581 cpu_profiler_handle_command(argvec, ds);
6582 }
6583
6584 else if (prefix == "dump_pg_recovery_stats") {
6585 stringstream s;
6586 if (f) {
6587 pg_recovery_stats.dump_formatted(f.get());
6588 f->flush(ds);
6589 } else {
6590 pg_recovery_stats.dump(s);
6591 ds << "dump pg recovery stats: " << s.str();
6592 }
6593 }
6594
6595 else if (prefix == "reset_pg_recovery_stats") {
6596 ss << "reset pg recovery stats";
6597 pg_recovery_stats.reset();
6598 }
6599
6600 else if (prefix == "perf histogram dump") {
6601 std::string logger;
6602 std::string counter;
6603 cmd_getval(cct, cmdmap, "logger", logger);
6604 cmd_getval(cct, cmdmap, "counter", counter);
6605 if (f) {
6606 cct->get_perfcounters_collection()->dump_formatted_histograms(
6607 f.get(), false, logger, counter);
6608 f->flush(ds);
6609 }
6610 }
6611
6612 else {
6613 ss << "unrecognized command: " << cmd;
6614 r = -EINVAL;
6615 }
6616
6617 out:
6618 rs = ss.str();
6619 odata.append(ds);
6620 dout(0) << "do_command r=" << r << " " << rs << dendl;
6621 clog->info() << rs;
6622 if (con) {
6623 MCommandReply *reply = new MCommandReply(r, rs);
6624 reply->set_tid(tid);
6625 reply->set_data(odata);
6626 con->send_message(reply);
6627 }
6628 }
6629
6630 bool OSD::heartbeat_dispatch(Message *m)
6631 {
6632 dout(30) << "heartbeat_dispatch " << m << dendl;
6633 switch (m->get_type()) {
6634
6635 case CEPH_MSG_PING:
6636 dout(10) << "ping from " << m->get_source_inst() << dendl;
6637 m->put();
6638 break;
6639
6640 case MSG_OSD_PING:
6641 handle_osd_ping(static_cast<MOSDPing*>(m));
6642 break;
6643
6644 default:
6645 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6646 m->put();
6647 }
6648
6649 return true;
6650 }
6651
6652 bool OSD::ms_dispatch(Message *m)
6653 {
6654 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6655 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6656 service.got_stop_ack();
6657 m->put();
6658 return true;
6659 }
6660
6661 // lock!
6662
6663 osd_lock.Lock();
6664 if (is_stopping()) {
6665 osd_lock.Unlock();
6666 m->put();
6667 return true;
6668 }
6669
6670 do_waiters();
6671 _dispatch(m);
6672
6673 osd_lock.Unlock();
6674
6675 return true;
6676 }
6677
6678 void OSD::maybe_share_map(
6679 Session *session,
6680 OpRequestRef op,
6681 OSDMapRef osdmap)
6682 {
6683 if (!op->check_send_map) {
6684 return;
6685 }
6686 epoch_t last_sent_epoch = 0;
6687
6688 session->sent_epoch_lock.lock();
6689 last_sent_epoch = session->last_sent_epoch;
6690 session->sent_epoch_lock.unlock();
6691
6692 const Message *m = op->get_req();
6693 service.share_map(
6694 m->get_source(),
6695 m->get_connection().get(),
6696 op->sent_epoch,
6697 osdmap,
6698 session ? &last_sent_epoch : NULL);
6699
6700 session->sent_epoch_lock.lock();
6701 if (session->last_sent_epoch < last_sent_epoch) {
6702 session->last_sent_epoch = last_sent_epoch;
6703 }
6704 session->sent_epoch_lock.unlock();
6705
6706 op->check_send_map = false;
6707 }
6708
6709 void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
6710 {
6711 assert(session->session_dispatch_lock.is_locked());
6712
6713 auto i = session->waiting_on_map.begin();
6714 while (i != session->waiting_on_map.end()) {
6715 OpRequestRef op = &(*i);
6716 assert(ms_can_fast_dispatch(op->get_req()));
6717 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
6718 op->get_req());
6719 if (m->get_min_epoch() > osdmap->get_epoch()) {
6720 break;
6721 }
6722 session->waiting_on_map.erase(i++);
6723 op->put();
6724
6725 spg_t pgid;
6726 if (m->get_type() == CEPH_MSG_OSD_OP) {
6727 pg_t actual_pgid = osdmap->raw_pg_to_pg(
6728 static_cast<const MOSDOp*>(m)->get_pg());
6729 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
6730 continue;
6731 }
6732 } else {
6733 pgid = m->get_spg();
6734 }
6735 enqueue_op(pgid, op, m->get_map_epoch());
6736 }
6737
6738 if (session->waiting_on_map.empty()) {
6739 clear_session_waiting_on_map(session);
6740 } else {
6741 register_session_waiting_on_map(session);
6742 }
6743 }
6744
6745 void OSD::ms_fast_dispatch(Message *m)
6746 {
6747 FUNCTRACE();
6748 if (service.is_stopping()) {
6749 m->put();
6750 return;
6751 }
6752 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
6753 {
6754 #ifdef WITH_LTTNG
6755 osd_reqid_t reqid = op->get_reqid();
6756 #endif
6757 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
6758 reqid.name._num, reqid.tid, reqid.inc);
6759 }
6760
6761 if (m->trace)
6762 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
6763
6764 // note sender epoch, min req'd epoch
6765 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
6766 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
6767 assert(op->min_epoch <= op->sent_epoch); // sanity check!
6768
6769 service.maybe_inject_dispatch_delay();
6770
6771 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
6772 m->get_type() != CEPH_MSG_OSD_OP) {
6773 // queue it directly
6774 enqueue_op(
6775 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
6776 op,
6777 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
6778 } else {
6779 // legacy client, and this is an MOSDOp (the *only* fast dispatch
6780 // message that didn't have an explicit spg_t); we need to map
6781 // them to an spg_t while preserving delivery order.
6782 Session *session = static_cast<Session*>(m->get_connection()->get_priv());
6783 if (session) {
6784 {
6785 Mutex::Locker l(session->session_dispatch_lock);
6786 op->get();
6787 session->waiting_on_map.push_back(*op);
6788 OSDMapRef nextmap = service.get_nextmap_reserved();
6789 dispatch_session_waiting(session, nextmap);
6790 service.release_map(nextmap);
6791 }
6792 session->put();
6793 }
6794 }
6795 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
6796 }
6797
6798 void OSD::ms_fast_preprocess(Message *m)
6799 {
6800 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
6801 if (m->get_type() == CEPH_MSG_OSD_MAP) {
6802 MOSDMap *mm = static_cast<MOSDMap*>(m);
6803 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
6804 if (s) {
6805 s->received_map_lock.lock();
6806 s->received_map_epoch = mm->get_last();
6807 s->received_map_lock.unlock();
6808 s->put();
6809 }
6810 }
6811 }
6812 }
6813
6814 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
6815 {
6816 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
6817
6818 if (is_stopping()) {
6819 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
6820 return false;
6821 }
6822
6823 if (dest_type == CEPH_ENTITY_TYPE_MON)
6824 return true;
6825
6826 if (force_new) {
6827 /* the MonClient checks keys every tick(), so we should just wait for that cycle
6828 to get through */
6829 if (monc->wait_auth_rotating(10) < 0) {
6830 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
6831 return false;
6832 }
6833 }
6834
6835 *authorizer = monc->build_authorizer(dest_type);
6836 return *authorizer != NULL;
6837 }
6838
6839
6840 bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
6841 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
6842 bool& isvalid, CryptoKey& session_key)
6843 {
6844 AuthAuthorizeHandler *authorize_handler = 0;
6845 switch (peer_type) {
6846 case CEPH_ENTITY_TYPE_MDS:
6847 /*
6848 * note: mds is technically a client from our perspective, but
6849 * this makes the 'cluster' handler choice consistent with the monitor's usage.
6850 */
6851 case CEPH_ENTITY_TYPE_OSD:
6852 case CEPH_ENTITY_TYPE_MGR:
6853 authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
6854 break;
6855 default:
6856 authorize_handler = authorize_handler_service_registry->get_handler(protocol);
6857 }
6858 if (!authorize_handler) {
6859 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
6860 isvalid = false;
6861 return true;
6862 }
6863
6864 AuthCapsInfo caps_info;
6865 EntityName name;
6866 uint64_t global_id;
6867 uint64_t auid = CEPH_AUTH_UID_DEFAULT;
6868
6869 isvalid = authorize_handler->verify_authorizer(
6870 cct, monc->rotating_secrets.get(),
6871 authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
6872 &auid);
6873
6874 if (isvalid) {
6875 Session *s = static_cast<Session *>(con->get_priv());
6876 if (!s) {
6877 s = new Session(cct);
6878 con->set_priv(s->get());
6879 s->con = con;
6880 dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
6881 }
6882
6883 s->entity_name = name;
6884 if (caps_info.allow_all)
6885 s->caps.set_allow_all();
6886 s->auid = auid;
6887
6888 if (caps_info.caps.length() > 0) {
6889 bufferlist::iterator p = caps_info.caps.begin();
6890 string str;
6891 try {
6892 ::decode(str, p);
6893 }
6894 catch (buffer::error& e) {
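      // a decode failure leaves str empty; caps.parse() below will then
      // reject it and log the failure, so the exception can be swallowed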
6895 }
6896 bool success = s->caps.parse(str);
6897 if (success)
6898 dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
6899 else
6900 dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
6901 }
6902
6903 s->put();
6904 }
6905 return true;
6906 }
6907
6908 void OSD::do_waiters()
6909 {
6910 assert(osd_lock.is_locked());
6911
6912 dout(10) << "do_waiters -- start" << dendl;
6913 while (!finished.empty()) {
6914 OpRequestRef next = finished.front();
6915 finished.pop_front();
6916 dispatch_op(next);
6917 }
6918 dout(10) << "do_waiters -- finish" << dendl;
6919 }
6920
6921 void OSD::dispatch_op(OpRequestRef op)
6922 {
6923 switch (op->get_req()->get_type()) {
6924
6925 case MSG_OSD_PG_CREATE:
6926 handle_pg_create(op);
6927 break;
6928 case MSG_OSD_PG_NOTIFY:
6929 handle_pg_notify(op);
6930 break;
6931 case MSG_OSD_PG_QUERY:
6932 handle_pg_query(op);
6933 break;
6934 case MSG_OSD_PG_LOG:
6935 handle_pg_log(op);
6936 break;
6937 case MSG_OSD_PG_REMOVE:
6938 handle_pg_remove(op);
6939 break;
6940 case MSG_OSD_PG_INFO:
6941 handle_pg_info(op);
6942 break;
6943 case MSG_OSD_PG_TRIM:
6944 handle_pg_trim(op);
6945 break;
6946 case MSG_OSD_BACKFILL_RESERVE:
6947 handle_pg_backfill_reserve(op);
6948 break;
6949 case MSG_OSD_RECOVERY_RESERVE:
6950 handle_pg_recovery_reserve(op);
6951 break;
6952 }
6953 }
6954
6955 void OSD::_dispatch(Message *m)
6956 {
6957 assert(osd_lock.is_locked());
6958 dout(20) << "_dispatch " << m << " " << *m << dendl;
6959
6960 switch (m->get_type()) {
6961
6962 // -- don't need lock --
6963 case CEPH_MSG_PING:
6964 dout(10) << "ping from " << m->get_source() << dendl;
6965 m->put();
6966 break;
6967
6968 // -- don't need OSDMap --
6969
6970 // map and replication
6971 case CEPH_MSG_OSD_MAP:
6972 handle_osd_map(static_cast<MOSDMap*>(m));
6973 break;
6974
6975 // osd
6976 case MSG_PGSTATSACK:
6977 handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
6978 break;
6979
6980 case MSG_MON_COMMAND:
6981 handle_command(static_cast<MMonCommand*>(m));
6982 break;
6983 case MSG_COMMAND:
6984 handle_command(static_cast<MCommand*>(m));
6985 break;
6986
6987 case MSG_OSD_SCRUB:
6988 handle_scrub(static_cast<MOSDScrub*>(m));
6989 break;
6990
6991 // -- need OSDMap --
6992
6993 case MSG_OSD_PG_CREATE:
6994 case MSG_OSD_PG_NOTIFY:
6995 case MSG_OSD_PG_QUERY:
6996 case MSG_OSD_PG_LOG:
6997 case MSG_OSD_PG_REMOVE:
6998 case MSG_OSD_PG_INFO:
6999 case MSG_OSD_PG_TRIM:
7000 case MSG_OSD_BACKFILL_RESERVE:
7001 case MSG_OSD_RECOVERY_RESERVE:
7002 {
7003 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7004 if (m->trace)
7005 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7006 // no map? starting up?
7007 if (!osdmap) {
7008 dout(7) << "no OSDMap, not booted" << dendl;
7009 logger->inc(l_osd_waiting_for_map);
7010 waiting_for_osdmap.push_back(op);
7011 op->mark_delayed("no osdmap");
7012 break;
7013 }
7014
7015 // need OSDMap
7016 dispatch_op(op);
7017 }
7018 }
7019 }
7020
7021 void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7022 {
7023 pg->lock();
7024 if (pg->is_primary()) {
7025 pg->unreg_next_scrub();
7026 pg->scrubber.must_scrub = true;
7027 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7028 pg->scrubber.must_repair = m->repair;
7029 pg->reg_next_scrub();
7030 dout(10) << "marking " << *pg << " for scrub" << dendl;
7031 }
7032 pg->unlock();
7033 }
7034
7035 void OSD::handle_scrub(MOSDScrub *m)
7036 {
7037 dout(10) << "handle_scrub " << *m << dendl;
7038 if (!require_mon_or_mgr_peer(m)) {
7039 m->put();
7040 return;
7041 }
7042 if (m->fsid != monc->get_fsid()) {
7043 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7044 m->put();
7045 return;
7046 }
7047
7048 RWLock::RLocker l(pg_map_lock);
7049 if (m->scrub_pgs.empty()) {
7050 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7051 p != pg_map.end();
7052 ++p)
7053 handle_pg_scrub(m, p->second);
7054 } else {
7055 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7056 p != m->scrub_pgs.end();
7057 ++p) {
7058 spg_t pcand;
7059 if (osdmap->get_primary_shard(*p, &pcand)) {
7060 auto pg_map_entry = pg_map.find(pcand);
7061 if (pg_map_entry != pg_map.end()) {
7062 handle_pg_scrub(m, pg_map_entry->second);
7063 }
7064 }
7065 }
7066 }
7067
7068 m->put();
7069 }
7070
7071 bool OSD::scrub_random_backoff()
7072 {
7073 bool coin_flip = (rand() / (double)RAND_MAX >=
7074 cct->_conf->osd_scrub_backoff_ratio);
7075 if (!coin_flip) {
7076 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7077 return true;
7078 }
7079 return false;
7080 }
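
// Illustration (the ratio value is an example, not an asserted default):
// with osd_scrub_backoff_ratio = 0.66, the comparison above succeeds for
// roughly 34% of ticks, so about two out of every three scheduling passes
// back off before even consulting the scrub queue. A ratio of 0 never
// backs off; a ratio of 1 essentially always does.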
7081
7082 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7083 const spg_t& pg, const utime_t& timestamp,
7084 double pool_scrub_min_interval,
7085 double pool_scrub_max_interval, bool must)
7086 : cct(cct),
7087 pgid(pg),
7088 sched_time(timestamp),
7089 deadline(timestamp)
7090 {
7091 // if not explicitly requested, postpone the scrub with a random delay
7092 if (!must) {
7093 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7094 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7095 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7096 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7097
7098 sched_time += scrub_min_interval;
7099 double r = rand() / (double)RAND_MAX;
7100 sched_time +=
7101 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7102 deadline += scrub_max_interval;
7103 }
7104 }
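
// Worked example (all numbers illustrative): with an effective
// scrub_min_interval of 86400s (1 day), scrub_max_interval of 604800s
// (1 week), osd_scrub_interval_randomize_ratio = 0.5 and a random draw
// r = 0.4, a PG stamped at time T is scheduled at
//   sched_time = T + 86400 + 86400 * 0.5 * 0.4 = T + 103680   (~1.2 days)
// with deadline = T + 604800. Smearing sched_time across
// [min, min * (1 + ratio)] keeps PGs created together from all becoming
// scrub-eligible in the same tick.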
7105
7106 bool OSDService::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7107 if (sched_time < rhs.sched_time)
7108 return true;
7109 if (sched_time > rhs.sched_time)
7110 return false;
7111 return pgid < rhs.pgid;
7112 }
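
// This strict weak ordering -- earliest sched_time first, pgid as the
// tie-breaker -- is what lets scrub jobs sit in an ordered container that
// sched_scrub() walks from the front via first_scrub_stamp() /
// next_scrub_stamp().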
7113
7114 bool OSD::scrub_time_permit(utime_t now)
7115 {
7116 struct tm bdt;
7117 time_t tt = now.sec();
7118 localtime_r(&tt, &bdt);
7119 bool time_permit = false;
7120 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7121 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7122 time_permit = true;
7123 }
7124 } else {
7125 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7126 time_permit = true;
7127 }
7128 }
7129 if (!time_permit) {
7130 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7131 << " - " << cct->_conf->osd_scrub_end_hour
7132 << " now " << bdt.tm_hour << " = no" << dendl;
7133 } else {
7134 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7135 << " - " << cct->_conf->osd_scrub_end_hour
7136 << " now " << bdt.tm_hour << " = yes" << dendl;
7137 }
7138 return time_permit;
7139 }
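
// Examples (hour values illustrative): begin=8, end=17 permits
// 08:00-16:59 local time; begin=22, end=6 takes the wrap-around branch
// and permits 22:00-05:59; and since (h >= X || h < X) is a tautology,
// begin == end permits every hour, just like the usual 0..24 "always
// allowed" configuration.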
7140
7141 bool OSD::scrub_load_below_threshold()
7142 {
7143 double loadavgs[3];
7144 if (getloadavg(loadavgs, 3) != 3) {
7145 dout(10) << __func__ << " couldn't read loadavgs" << dendl;
7146 return false;
7147 }
7148
7149 // allow scrub if below configured threshold
7150 if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
7151 dout(20) << __func__ << " loadavg " << loadavgs[0]
7152 << " < max " << cct->_conf->osd_scrub_load_threshold
7153 << " = yes" << dendl;
7154 return true;
7155 }
7156
7157 // allow scrub if below daily avg and currently decreasing
7158 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7159 dout(20) << __func__ << " loadavg " << loadavgs[0]
7160 << " < daily_loadavg " << daily_loadavg
7161 << " and < 15m avg " << loadavgs[2]
7162 << " = yes" << dendl;
7163 return true;
7164 }
7165
7166 dout(20) << __func__ << " loadavg " << loadavgs[0]
7167 << " >= max " << cct->_conf->osd_scrub_load_threshold
7168 << " and ( >= daily_loadavg " << daily_loadavg
7169 << " or >= 15m avg " << loadavgs[2]
7170 << ") = no" << dendl;
7171 return false;
7172 }
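
// Numeric example (illustrative): with osd_scrub_load_threshold = 0.5,
// loadavgs = {0.8, 0.9, 1.2} and daily_loadavg = 1.0, the first test
// fails (0.8 >= 0.5) but the second passes (0.8 < 1.0 and 0.8 < 1.2), so
// scrubbing is still allowed: load is below this host's daily average
// and trending downward.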
7173
7174 void OSD::sched_scrub()
7175 {
7176 // if not permitted, fail fast
7177 if (!service.can_inc_scrubs_pending()) {
7178 return;
7179 }
7180
7181 utime_t now = ceph_clock_now();
7182 bool time_permit = scrub_time_permit(now);
7183 bool load_is_low = scrub_load_below_threshold();
7184 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7185
7186 OSDService::ScrubJob scrub;
7187 if (service.first_scrub_stamp(&scrub)) {
7188 do {
7189 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7190
7191 if (scrub.sched_time > now) {
7192 // save ourselves some effort
7193 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7194 << " > " << now << dendl;
7195 break;
7196 }
7197
7198 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7199 dout(10) << __func__ << " not scheduling scrub of " << scrub.pgid << " due to active recovery ops" << dendl;
7200 break;
7201 }
7202
7203 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7204 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7205 << (!time_permit ? "time not permitted" : "high load") << dendl;
7206 continue;
7207 }
7208
7209 PG *pg = _lookup_lock_pg(scrub.pgid);
7210 if (!pg)
7211 continue;
7212 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7213 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7214 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7215 (load_is_low ? ", load_is_low" : ", deadline < now"))
7216 << dendl;
7217 if (pg->sched_scrub()) {
7218 pg->unlock();
7219 break;
7220 }
7221 }
7222 pg->unlock();
7223 } while (service.next_scrub_stamp(scrub, &scrub));
7224 }
7225 dout(20) << "sched_scrub done" << dendl;
7226 }
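
// The decision ladder above, in order: global scrub slots
// (can_inc_scrubs_pending), per-job sched_time vs. now, active recovery,
// then -- only while the job's deadline has not yet passed -- the time
// window and load checks, and finally PG-level gating in
// PG::sched_scrub(). A job whose deadline is already behind us
// (deadline < now) skips the time/load checks entirely.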
7227
7228
7229
7230 // =====================================================
7231 // MAP
7232
7233 void OSD::wait_for_new_map(OpRequestRef op)
7234 {
7235 // ask the mon for the next map? (only the first waiter subscribes)
7236 if (waiting_for_osdmap.empty()) {
7237 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7238 }
7239
7240 logger->inc(l_osd_waiting_for_map);
7241 waiting_for_osdmap.push_back(op);
7242 op->mark_delayed("wait for new map");
7243 }
7244
7245
7246 /** update_map
7247 * assimilate new OSDMap(s). scan pgs, etc.
7248 */
7249
7250 void OSD::note_down_osd(int peer)
7251 {
7252 assert(osd_lock.is_locked());
7253 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7254
7255 heartbeat_lock.Lock();
7256 failure_queue.erase(peer);
7257 failure_pending.erase(peer);
7258 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7259 if (p != heartbeat_peers.end()) {
7260 p->second.con_back->mark_down();
7261 if (p->second.con_front) {
7262 p->second.con_front->mark_down();
7263 }
7264 heartbeat_peers.erase(p);
7265 }
7266 heartbeat_lock.Unlock();
7267 }
7268
7269 void OSD::note_up_osd(int peer)
7270 {
7271 service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7272 heartbeat_set_peers_need_update();
7273 }
7274
7275 struct C_OnMapCommit : public Context {
7276 OSD *osd;
7277 epoch_t first, last;
7278 MOSDMap *msg;
7279 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7280 : osd(o), first(f), last(l), msg(m) {}
7281 void finish(int r) override {
7282 osd->_committed_osd_maps(first, last, msg);
7283 msg->put();
7284 }
7285 };
7286
7287 struct C_OnMapApply : public Context {
7288 OSDService *service;
7289 list<OSDMapRef> pinned_maps;
7290 epoch_t e;
7291 C_OnMapApply(OSDService *service,
7292 const list<OSDMapRef> &pinned_maps,
7293 epoch_t e)
7294 : service(service), pinned_maps(pinned_maps), e(e) {}
7295 void finish(int r) override {
7296 service->clear_map_bl_cache_pins(e);
7297 }
7298 };
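
// These two callbacks bracket a stored map batch: C_OnMapApply fires
// when the transaction is readable and drops the bufferlist pins taken
// in handle_osd_map() (the maps can now be read back from disk), while
// C_OnMapCommit fires on commit and runs _committed_osd_maps(), which
// actually advances the OSD's published map state.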
7299
7300 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7301 {
7302 OSDMapRef osdmap = service.get_osdmap();
7303 if (osdmap->get_epoch() >= epoch)
7304 return;
7305
7306 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7307 force_request) {
7308 monc->renew_subs();
7309 }
7310 }
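
// Usage sketch (epoch numbers made up): holding map 100 and hearing that
// 120 exists, osdmap_subscribe(101, false) asks the monitor for epochs
// from 101 onward as a one-shot subscription. sub_want_increment() only
// reports a change when 101 is newer than what was already requested, so
// repeated callers do not keep re-sending renew_subs() unless
// force_request is set.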
7311
7312 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7313 {
7314 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7315 if (min <= superblock.oldest_map)
7316 return;
7317
7318 int num = 0;
7319 ObjectStore::Transaction t;
7320 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7321 dout(20) << " removing old osdmap epoch " << e << dendl;
7322 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7323 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7324 superblock.oldest_map = e + 1;
7325 num++;
7326 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7327 service.publish_superblock(superblock);
7328 write_superblock(t);
7329 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7330 assert(tr == 0);
7331 num = 0;
7332 if (!skip_maps) {
7333 // skip_maps leaves us with a range of old maps if we fail to remove all
7334 // of them before moving superblock.oldest_map forward to the first map
7335 // in the incoming MOSDMap msg. so in that case we should keep removing
7336 // them, even though it may mean issuing a huge series of delete
7337 // transactions all at once.
7338 break;
7339 }
7340 }
7341 }
7342 if (num > 0) {
7343 service.publish_superblock(superblock);
7344 write_superblock(t);
7345 store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7346 }
7347 // we should not remove the cached maps
7348 assert(min <= service.map_cache.cached_key_lower_bound());
7349 }
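
// Walk-through (numbers illustrative): with superblock.oldest_map = 50,
// oldest = 70 and a cached lower bound of 80, min = 70 and epochs 50..69
// are queued for deletion. The deletes are batched: once at least
// osd_target_transaction_size (and at least nreceived) removals have
// accumulated, the superblock is persisted and the transaction queued,
// and -- unless skip_maps demands a full catch-up -- trimming stops for
// this round so it merely keeps pace with map ingest.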
7350
7351 void OSD::handle_osd_map(MOSDMap *m)
7352 {
7353 assert(osd_lock.is_locked());
7354 // Keep a ref in the list until we get the newly received map written
7355 // onto disk. This is important because as long as the refs are alive,
7356 // the OSDMaps will be pinned in the cache and we won't try to read them
7357 // off of disk. Otherwise these maps will probably not stay in the cache,
7358 // and reading those OSDMaps before they are actually written can result
7359 // in a crash.
7360 list<OSDMapRef> pinned_maps;
7361 if (m->fsid != monc->get_fsid()) {
7362 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7363 << monc->get_fsid() << dendl;
7364 m->put();
7365 return;
7366 }
7367 if (is_initializing()) {
7368 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7369 m->put();
7370 return;
7371 }
7372
7373 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7374 if (session && !(session->entity_name.is_mon() ||
7375 session->entity_name.is_osd())) {
7376 // not enough perms!
7377 dout(10) << "got osd map from Session " << session
7378 << " which we can't take maps from (not a mon or osd)" << dendl;
7379 m->put();
7380 session->put();
7381 return;
7382 }
7383 if (session)
7384 session->put();
7385
7386 // share with the objecter
7387 if (!is_preboot())
7388 service.objecter->handle_osd_map(m);
7389
7390 epoch_t first = m->get_first();
7391 epoch_t last = m->get_last();
7392 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7393 << superblock.newest_map
7394 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7395 << dendl;
7396
7397 logger->inc(l_osd_map);
7398 logger->inc(l_osd_mape, last - first + 1);
7399 if (first <= superblock.newest_map)
7400 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7401 if (service.max_oldest_map < m->oldest_map) {
7402 service.max_oldest_map = m->oldest_map;
7403 assert(service.max_oldest_map >= superblock.oldest_map);
7404 }
7405
7406 // make sure there is something new, here, before we bother flushing
7407 // the queues and such
7408 if (last <= superblock.newest_map) {
7409 dout(10) << " no new maps here, dropping" << dendl;
7410 m->put();
7411 return;
7412 }
7413
7414 // missing some?
7415 bool skip_maps = false;
7416 if (first > superblock.newest_map + 1) {
7417 dout(10) << "handle_osd_map message skips epochs "
7418 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7419 if (m->oldest_map <= superblock.newest_map + 1) {
7420 osdmap_subscribe(superblock.newest_map + 1, false);
7421 m->put();
7422 return;
7423 }
7424 // always try to get the full range of maps--as many as we can. this
7425 // 1- is good to have
7426 // 2- is at present the only way to ensure that we get a *full* map as
7427 // the first map!
7428 if (m->oldest_map < first) {
7429 osdmap_subscribe(m->oldest_map - 1, true);
7430 m->put();
7431 return;
7432 }
7433 skip_maps = true;
7434 }
7435
7436 ObjectStore::Transaction t;
7437 uint64_t txn_size = 0;
7438
7439 // store new maps: queue for disk and put in the osdmap cache
7440 epoch_t start = MAX(superblock.newest_map + 1, first);
7441 for (epoch_t e = start; e <= last; e++) {
7442 if (txn_size >= t.get_num_bytes()) {
7443 derr << __func__ << " transaction size overflowed" << dendl;
7444 assert(txn_size < t.get_num_bytes());
7445 }
7446 txn_size = t.get_num_bytes();
7447 map<epoch_t,bufferlist>::iterator p;
7448 p = m->maps.find(e);
7449 if (p != m->maps.end()) {
7450 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7451 OSDMap *o = new OSDMap;
7452 bufferlist& bl = p->second;
7453
7454 o->decode(bl);
7455
7456 ghobject_t fulloid = get_osdmap_pobject_name(e);
7457 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7458 pin_map_bl(e, bl);
7459 pinned_maps.push_back(add_map(o));
7460
7461 got_full_map(e);
7462 continue;
7463 }
7464
7465 p = m->incremental_maps.find(e);
7466 if (p != m->incremental_maps.end()) {
7467 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7468 bufferlist& bl = p->second;
7469 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7470 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7471 pin_map_inc_bl(e, bl);
7472
7473 OSDMap *o = new OSDMap;
7474 if (e > 1) {
7475 bufferlist obl;
7476 bool got = get_map_bl(e - 1, obl);
7477 assert(got);
7478 o->decode(obl);
7479 }
7480
7481 OSDMap::Incremental inc;
7482 bufferlist::iterator p = bl.begin();
7483 inc.decode(p);
7484 if (o->apply_incremental(inc) < 0) {
7485 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
7486 assert(0 == "bad fsid");
7487 }
7488
7489 bufferlist fbl;
7490 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7491
7492 bool injected_failure = false;
7493 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7494 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7495 derr << __func__ << " injecting map crc failure" << dendl;
7496 injected_failure = true;
7497 }
7498
7499 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7500 dout(2) << "got incremental " << e
7501 << " but failed to encode full with correct crc; requesting"
7502 << dendl;
7503 clog->warn() << "failed to encode map e" << e << " with expected crc";
7504 dout(20) << "my encoded map was:\n";
7505 fbl.hexdump(*_dout);
7506 *_dout << dendl;
7507 delete o;
7508 request_full_map(e, last);
7509 last = e - 1;
7510 break;
7511 }
7512 got_full_map(e);
7513
7514 ghobject_t fulloid = get_osdmap_pobject_name(e);
7515 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7516 pin_map_bl(e, fbl);
7517 pinned_maps.push_back(add_map(o));
7518 continue;
7519 }
7520
7521 assert(0 == "MOSDMap lied about what maps it had?");
7522 }
7523
7524 // even if this map isn't from a mon, we may have satisfied our subscription
7525 monc->sub_got("osdmap", last);
7526
7527 if (!m->maps.empty() && requested_full_first) {
7528 dout(10) << __func__ << " still missing full maps " << requested_full_first
7529 << ".." << requested_full_last << dendl;
7530 rerequest_full_maps();
7531 }
7532
7533 if (superblock.oldest_map) {
7534 // make sure we at least keep pace with incoming maps
7535 trim_maps(m->oldest_map, last - first + 1, skip_maps);
7536 }
7537
7538 if (!superblock.oldest_map || skip_maps)
7539 superblock.oldest_map = first;
7540 superblock.newest_map = last;
7541 superblock.current_epoch = last;
7542
7543 // note in the superblock that we were clean thru the prior epoch
7544 epoch_t boot_epoch = service.get_boot_epoch();
7545 if (boot_epoch && boot_epoch >= superblock.mounted) {
7546 superblock.mounted = boot_epoch;
7547 superblock.clean_thru = last;
7548 }
7549
7550 // superblock and commit
7551 write_superblock(t);
7552 store->queue_transaction(
7553 service.meta_osr.get(),
7554 std::move(t),
7555 new C_OnMapApply(&service, pinned_maps, last),
7556 new C_OnMapCommit(this, start, last, m), 0);
7557 service.publish_superblock(superblock);
7558 }
7559
7560 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
7561 {
7562 dout(10) << __func__ << " " << first << ".." << last << dendl;
7563 if (is_stopping()) {
7564 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7565 return;
7566 }
7567 Mutex::Locker l(osd_lock);
7568 if (is_stopping()) {
7569 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7570 return;
7571 }
7572 map_lock.get_write();
7573
7574 bool do_shutdown = false;
7575 bool do_restart = false;
7576 bool network_error = false;
7577
7578 // advance through the new maps
7579 for (epoch_t cur = first; cur <= last; cur++) {
7580 dout(10) << " advance to epoch " << cur
7581 << " (<= last " << last
7582 << " <= newest_map " << superblock.newest_map
7583 << ")" << dendl;
7584
7585 OSDMapRef newmap = get_map(cur);
7586 assert(newmap); // we just cached it above!
7587
7588 // start blacklisting messages sent to peers that go down.
7589 service.pre_publish_map(newmap);
7590
7591 // kill connections to newly down osds
7592 bool waited_for_reservations = false;
7593 set<int> old;
7594 osdmap->get_all_osds(old);
7595 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
7596 if (*p != whoami &&
7597 osdmap->is_up(*p) && // in old map
7598 newmap->is_down(*p)) { // but down in the new one
7599 if (!waited_for_reservations) {
7600 service.await_reserved_maps();
7601 waited_for_reservations = true;
7602 }
7603 note_down_osd(*p);
7604 } else if (*p != whoami &&
7605 osdmap->is_down(*p) &&
7606 newmap->is_up(*p)) {
7607 note_up_osd(*p);
7608 }
7609 }
7610
7611 if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
7612 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
7613 (osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
7614 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
7615 << dendl;
7616 if (is_booting()) {
7617 // this captures the case where we sent the boot message while
7618 // NOUP was being set on the mon and our boot request was
7619 // dropped, and then later it is cleared. it imperfectly
7620 // handles the case where our original boot message was not
7621 // dropped and we restart even though we might have booted, but
7622 // that is harmless (boot will just take slightly longer).
7623 do_restart = true;
7624 }
7625 }
7626 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
7627 newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
7628 dout(10) << __func__ << " require_osd_release reached luminous in "
7629 << newmap->get_epoch() << dendl;
7630 clear_pg_stat_queue();
7631 outstanding_pg_stats.clear();
7632 }
7633
7634 osdmap = newmap;
7635 epoch_t up_epoch;
7636 epoch_t boot_epoch;
7637 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
7638 if (!up_epoch &&
7639 osdmap->is_up(whoami) &&
7640 osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
7641 up_epoch = osdmap->get_epoch();
7642 dout(10) << "up_epoch is " << up_epoch << dendl;
7643 if (!boot_epoch) {
7644 boot_epoch = osdmap->get_epoch();
7645 dout(10) << "boot_epoch is " << boot_epoch << dendl;
7646 }
7647 service.set_epochs(&boot_epoch, &up_epoch, NULL);
7648 }
7649 }
7650
7651 had_map_since = ceph_clock_now();
7652
7653 epoch_t _bind_epoch = service.get_bind_epoch();
7654 if (osdmap->is_up(whoami) &&
7655 osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
7656 _bind_epoch < osdmap->get_up_from(whoami)) {
7657
7658 if (is_booting()) {
7659 dout(1) << "state: booting -> active" << dendl;
7660 set_state(STATE_ACTIVE);
7661
7662 // set incarnation so that osd_reqid_t's we generate for our
7663 // objecter requests are unique across restarts.
7664 service.objecter->set_client_incarnation(osdmap->get_epoch());
7665 }
7666 }
7667
7668 if (osdmap->get_epoch() > 0 &&
7669 is_active()) {
7670 if (!osdmap->exists(whoami)) {
7671 dout(0) << "map says i do not exist. shutting down." << dendl;
7672 do_shutdown = true; // don't call shutdown() while we have
7673 // everything paused
7674 } else if (!osdmap->is_up(whoami) ||
7675 !osdmap->get_addr(whoami).probably_equals(
7676 client_messenger->get_myaddr()) ||
7677 !osdmap->get_cluster_addr(whoami).probably_equals(
7678 cluster_messenger->get_myaddr()) ||
7679 !osdmap->get_hb_back_addr(whoami).probably_equals(
7680 hb_back_server_messenger->get_myaddr()) ||
7681 (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7682 !osdmap->get_hb_front_addr(whoami).probably_equals(
7683 hb_front_server_messenger->get_myaddr()))) {
7684 if (!osdmap->is_up(whoami)) {
7685 if (service.is_preparing_to_stop() || service.is_stopping()) {
7686 service.got_stop_ack();
7687 } else {
7688 clog->warn() << "map e" << osdmap->get_epoch()
7689 << " wrongly marked me down at e"
7690 << osdmap->get_down_at(whoami);
7691 }
7692 } else if (!osdmap->get_addr(whoami).probably_equals(
7693 client_messenger->get_myaddr())) {
7694 clog->error() << "map e" << osdmap->get_epoch()
7695 << " had wrong client addr (" << osdmap->get_addr(whoami)
7696 << " != my " << client_messenger->get_myaddr() << ")";
7697 } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
7698 cluster_messenger->get_myaddr())) {
7699 clog->error() << "map e" << osdmap->get_epoch()
7700 << " had wrong cluster addr ("
7701 << osdmap->get_cluster_addr(whoami)
7702 << " != my " << cluster_messenger->get_myaddr() << ")";
7703 } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
7704 hb_back_server_messenger->get_myaddr())) {
7705 clog->error() << "map e" << osdmap->get_epoch()
7706 << " had wrong hb back addr ("
7707 << osdmap->get_hb_back_addr(whoami)
7708 << " != my " << hb_back_server_messenger->get_myaddr()
7709 << ")";
7710 } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7711 !osdmap->get_hb_front_addr(whoami).probably_equals(
7712 hb_front_server_messenger->get_myaddr())) {
7713 clog->error() << "map e" << osdmap->get_epoch()
7714 << " had wrong hb front addr ("
7715 << osdmap->get_hb_front_addr(whoami)
7716 << " != my " << hb_front_server_messenger->get_myaddr()
7717 << ")";
7718 }
7719
7720 if (!service.is_stopping()) {
7721 epoch_t up_epoch = 0;
7722 epoch_t bind_epoch = osdmap->get_epoch();
7723 service.set_epochs(NULL,&up_epoch, &bind_epoch);
7724 do_restart = true;
7725
7726 // record this mark-down in osd_markdown_log
7727 utime_t now = ceph_clock_now();
7728 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
7729 osd_markdown_log.push_back(now);
7730 // drop entries older than the grace period
7731 while (!osd_markdown_log.empty() &&
7732 osd_markdown_log.front() + grace < now)
7733 osd_markdown_log.pop_front();
7734 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
7735 dout(0) << __func__ << " marked down "
7736 << osd_markdown_log.size()
7737 << " > osd_max_markdown_count "
7738 << cct->_conf->osd_max_markdown_count
7739 << " in last " << grace << " seconds, shutting down"
7740 << dendl;
7741 do_restart = false;
7742 do_shutdown = true;
7743 }
7744
7745 start_waiting_for_healthy();
7746
7747 set<int> avoid_ports;
7748 #if defined(__FreeBSD__)
7749 // prevent FreeBSD from grabbing the client_messenger port during
7750 // rebinding; otherwise the cluster_messenger might end up connecting
7751 // to the same port
7752 avoid_ports.insert(client_messenger->get_myaddr().get_port());
7753 #endif
7754 avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
7755 avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
7756 avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
7757
7758 int r = cluster_messenger->rebind(avoid_ports);
7759 if (r != 0) {
7760 do_shutdown = true; // FIXME: do_restart?
7761 network_error = true;
7762 dout(0) << __func__ << " marked down:"
7763 << " rebind cluster_messenger failed" << dendl;
7764 }
7765
7766 r = hb_back_server_messenger->rebind(avoid_ports);
7767 if (r != 0) {
7768 do_shutdown = true; // FIXME: do_restart?
7769 network_error = true;
7770 dout(0) << __func__ << " marked down:"
7771 << " rebind hb_back_server_messenger failed" << dendl;
7772 }
7773
7774 r = hb_front_server_messenger->rebind(avoid_ports);
7775 if (r != 0) {
7776 do_shutdown = true; // FIXME: do_restart?
7777 network_error = true;
7778 dout(0) << __func__ << " marked down:"
7779 << " rebind hb_front_server_messenger failed" << dendl;
7780 }
7781
7782 hb_front_client_messenger->mark_down_all();
7783 hb_back_client_messenger->mark_down_all();
7784
7785 reset_heartbeat_peers();
7786 }
7787 }
7788 }
7789
7790 map_lock.put_write();
7791
7792 check_osdmap_features(store);
7793
7794 // yay!
7795 consume_map();
7796
7797 if (is_active() || is_waiting_for_healthy())
7798 maybe_update_heartbeat_peers();
7799
7800 if (!is_active()) {
7801 dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
7802 peering_wq.drain();
7803 } else {
7804 activate_map();
7805 }
7806
7807 if (do_shutdown) {
7808 if (network_error) {
7809 Mutex::Locker l(heartbeat_lock);
7810 map<int,pair<utime_t,entity_inst_t>>::iterator it =
7811 failure_pending.begin();
7812 while (it != failure_pending.end()) {
7813 dout(10) << __func__ << " canceling in-flight failure report for osd."
7814 << it->first << dendl;
7815 send_still_alive(osdmap->get_epoch(), it->second.second);
7816 failure_pending.erase(it++);
7817 }
7818 }
7819 // trigger shutdown in a different thread
7820 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
7821 queue_async_signal(SIGINT);
7822 }
7823 else if (m->newest_map && m->newest_map > last) {
7824 dout(10) << " msg says newest map is " << m->newest_map
7825 << ", requesting more" << dendl;
7826 osdmap_subscribe(osdmap->get_epoch()+1, false);
7827 }
7828 else if (is_preboot()) {
7829 if (m->get_source().is_mon())
7830 _preboot(m->oldest_map, m->newest_map);
7831 else
7832 start_boot();
7833 }
7834 else if (do_restart)
7835 start_boot();
7836
7837 }
7838
7839 void OSD::check_osdmap_features(ObjectStore *fs)
7840 {
7841 // adjust required feature bits?
7842
7843 // we have to be a bit careful here, because we are accessing the
7844 // Policy structures without taking any lock. in particular, only
7845 // modify integer values that can safely be read by a racing CPU.
7846 // since we are only accessing existing Policy structures at their
7847 // current memory location, and setting or clearing bits in integer
7848 // fields, and we are the only writer, this is not a problem.
7849
7850 {
7851 Messenger::Policy p = client_messenger->get_default_policy();
7852 uint64_t mask;
7853 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
7854 if ((p.features_required & mask) != features) {
7855 dout(0) << "crush map has features " << features
7856 << ", adjusting msgr requires for clients" << dendl;
7857 p.features_required = (p.features_required & ~mask) | features;
7858 client_messenger->set_default_policy(p);
7859 }
7860 }
7861 {
7862 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
7863 uint64_t mask;
7864 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
7865 if ((p.features_required & mask) != features) {
7866 dout(0) << "crush map has features " << features
7867 << " was " << p.features_required
7868 << ", adjusting msgr requires for mons" << dendl;
7869 p.features_required = (p.features_required & ~mask) | features;
7870 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
7871 }
7872 }
7873 {
7874 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
7875 uint64_t mask;
7876 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
7877
7878 if ((p.features_required & mask) != features) {
7879 dout(0) << "crush map has features " << features
7880 << ", adjusting msgr requires for osds" << dendl;
7881 p.features_required = (p.features_required & ~mask) | features;
7882 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
7883 }
7884
7885 if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
7886 !superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
7887 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
7888 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
7889 ObjectStore::Transaction t;
7890 write_superblock(t);
7891 int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
7892 assert(err == 0);
7893 }
7894 }
7895 }
7896
7897 bool OSD::advance_pg(
7898 epoch_t osd_epoch, PG *pg,
7899 ThreadPool::TPHandle &handle,
7900 PG::RecoveryCtx *rctx,
7901 set<PGRef> *new_pgs)
7902 {
7903 assert(pg->is_locked());
7904 epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
7905 OSDMapRef lastmap = pg->get_osdmap();
7906
7907 if (lastmap->get_epoch() == osd_epoch)
7908 return true;
7909 assert(lastmap->get_epoch() < osd_epoch);
7910
7911 epoch_t min_epoch = service.get_min_pg_epoch();
7912 epoch_t max;
7913 if (min_epoch) {
7914 max = min_epoch + cct->_conf->osd_map_max_advance;
7915 } else {
7916 max = next_epoch + cct->_conf->osd_map_max_advance;
7917 }
7918
7919 for (;
7920 next_epoch <= osd_epoch && next_epoch <= max;
7921 ++next_epoch) {
7922 OSDMapRef nextmap = service.try_get_map(next_epoch);
7923 if (!nextmap) {
7924 dout(20) << __func__ << " missing map " << next_epoch << dendl;
7925 // make sure max is bumped up so that we can get past any
7926 // gap in maps
7927 max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
7928 continue;
7929 }
7930
7931 vector<int> newup, newacting;
7932 int up_primary, acting_primary;
7933 nextmap->pg_to_up_acting_osds(
7934 pg->info.pgid.pgid,
7935 &newup, &up_primary,
7936 &newacting, &acting_primary);
7937 pg->handle_advance_map(
7938 nextmap, lastmap, newup, up_primary,
7939 newacting, acting_primary, rctx);
7940
7941 // Check for split!
7942 set<spg_t> children;
7943 spg_t parent(pg->info.pgid);
7944 if (parent.is_split(
7945 lastmap->get_pg_num(pg->pool.id),
7946 nextmap->get_pg_num(pg->pool.id),
7947 &children)) {
7948 service.mark_split_in_progress(pg->info.pgid, children);
7949 split_pgs(
7950 pg, children, new_pgs, lastmap, nextmap,
7951 rctx);
7952 }
7953
7954 lastmap = nextmap;
7955 handle.reset_tp_timeout();
7956 }
7957 service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
7958 pg->handle_activate_map(rctx);
7959 if (next_epoch <= osd_epoch) {
7960 dout(10) << __func__ << " advanced to max " << max
7961 << " past min epoch " << min_epoch
7962 << " ... will requeue " << *pg << dendl;
7963 return false;
7964 }
7965 return true;
7966 }
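
// Example (illustrative): with osd_map_max_advance = 200, a PG at epoch
// 1000 being advanced toward osd_epoch = 2000 consumes maps up to
// roughly 1200 in one pass, then returns false so the caller requeues
// it. This bounds how long a single PG lock is held and lets the
// service-wide min_pg_epoch move forward in steps.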
7967
7968 void OSD::consume_map()
7969 {
7970 assert(osd_lock.is_locked());
7971 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
7972
7973 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
7974 list<PGRef> to_remove;
7975
7976 // scan pg's
7977 {
7978 RWLock::RLocker l(pg_map_lock);
7979 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
7980 it != pg_map.end();
7981 ++it) {
7982 PG *pg = it->second;
7983 pg->lock();
7984 if (pg->is_primary())
7985 num_pg_primary++;
7986 else if (pg->is_replica())
7987 num_pg_replica++;
7988 else
7989 num_pg_stray++;
7990
7991 if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
7992 //pool is deleted!
7993 to_remove.push_back(PGRef(pg));
7994 } else {
7995 service.init_splits_between(it->first, service.get_osdmap(), osdmap);
7996 }
7997
7998 pg->unlock();
7999 }
8000 }
8001
8002 for (list<PGRef>::iterator i = to_remove.begin();
8003 i != to_remove.end();
8004 to_remove.erase(i++)) {
8005 RWLock::WLocker locker(pg_map_lock);
8006 (*i)->lock();
8007 _remove_pg(&**i);
8008 (*i)->unlock();
8009 }
8010
8011 service.expand_pg_num(service.get_osdmap(), osdmap);
8012
8013 service.pre_publish_map(osdmap);
8014 service.await_reserved_maps();
8015 service.publish_map(osdmap);
8016
8017 service.maybe_inject_dispatch_delay();
8018
8019 dispatch_sessions_waiting_on_map();
8020
8021 service.maybe_inject_dispatch_delay();
8022
8023 // remove any PGs which we no longer host from the session waiting_for_pg lists
8024 dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8025 op_shardedwq.prune_pg_waiters(osdmap, whoami);
8026
8027 service.maybe_inject_dispatch_delay();
8028
8029 // scan pg's
8030 {
8031 RWLock::RLocker l(pg_map_lock);
8032 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8033 it != pg_map.end();
8034 ++it) {
8035 PG *pg = it->second;
8036 pg->lock();
8037 pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8038 pg->unlock();
8039 }
8040
8041 logger->set(l_osd_pg, pg_map.size());
8042 }
8043 logger->set(l_osd_pg_primary, num_pg_primary);
8044 logger->set(l_osd_pg_replica, num_pg_replica);
8045 logger->set(l_osd_pg_stray, num_pg_stray);
8046 }
8047
8048 void OSD::activate_map()
8049 {
8050 assert(osd_lock.is_locked());
8051
8052 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8053
8054 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8055 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8056 ceph_abort();
8057 }
8058
8059 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8060 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8061 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8062 }
8063
8064 // norecover?
8065 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8066 if (!service.recovery_is_paused()) {
8067 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8068 service.pause_recovery();
8069 }
8070 } else {
8071 if (service.recovery_is_paused()) {
8072 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8073 service.unpause_recovery();
8074 }
8075 }
8076
8077 service.activate_map();
8078
8079 // process waiters
8080 take_waiters(waiting_for_osdmap);
8081 }
8082
8083 bool OSD::require_mon_peer(const Message *m)
8084 {
8085 if (!m->get_connection()->peer_is_mon()) {
8086 dout(0) << "require_mon_peer received from non-mon "
8087 << m->get_connection()->get_peer_addr()
8088 << " " << *m << dendl;
8089 return false;
8090 }
8091 return true;
8092 }
8093
8094 bool OSD::require_mon_or_mgr_peer(const Message *m)
8095 {
8096 if (!m->get_connection()->peer_is_mon() &&
8097 !m->get_connection()->peer_is_mgr()) {
8098 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8099 << m->get_connection()->get_peer_addr()
8100 << " " << *m << dendl;
8101 return false;
8102 }
8103 return true;
8104 }
8105
8106 bool OSD::require_osd_peer(const Message *m)
8107 {
8108 if (!m->get_connection()->peer_is_osd()) {
8109 dout(0) << "require_osd_peer received from non-osd "
8110 << m->get_connection()->get_peer_addr()
8111 << " " << *m << dendl;
8112 return false;
8113 }
8114 return true;
8115 }
8116
8117 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8118 {
8119 epoch_t up_epoch = service.get_up_epoch();
8120 if (epoch < up_epoch) {
8121 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8122 return false;
8123 }
8124
8125 if (!is_active()) {
8126 dout(7) << "still in boot state, dropping message " << *m << dendl;
8127 return false;
8128 }
8129
8130 return true;
8131 }
8132
8133 bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8134 bool is_fast_dispatch)
8135 {
8136 int from = m->get_source().num();
8137
8138 if (map->is_down(from) ||
8139 (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8140 dout(5) << "from dead osd." << from << ", marking down, "
8141 << " msg was " << m->get_source_inst().addr
8142 << " expected " << (map->is_up(from) ?
8143 map->get_cluster_addr(from) : entity_addr_t())
8144 << dendl;
8145 ConnectionRef con = m->get_connection();
8146 con->mark_down();
8147 Session *s = static_cast<Session*>(con->get_priv());
8148 if (s) {
8149 if (!is_fast_dispatch)
8150 s->session_dispatch_lock.Lock();
8151 clear_session_waiting_on_map(s);
8152 con->set_priv(NULL); // break ref <-> session cycle, if any
8153 if (!is_fast_dispatch)
8154 s->session_dispatch_lock.Unlock();
8155 s->put();
8156 }
8157 return false;
8158 }
8159 return true;
8160 }
8161
8162
8163 /*
8164 * require that we have same (or newer) map, and that
8165 * the source is the pg primary.
8166 */
8167 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8168 bool is_fast_dispatch)
8169 {
8170 const Message *m = op->get_req();
8171 dout(15) << "require_same_or_newer_map " << epoch
8172 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8173
8174 assert(osd_lock.is_locked());
8175
8176 // do they have a newer map?
8177 if (epoch > osdmap->get_epoch()) {
8178 dout(7) << "waiting for newer map epoch " << epoch
8179 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8180 wait_for_new_map(op);
8181 return false;
8182 }
8183
8184 if (!require_self_aliveness(op->get_req(), epoch)) {
8185 return false;
8186 }
8187
8188 // ok, our map is same or newer.. do they still exist?
8189 if (m->get_connection()->get_messenger() == cluster_messenger &&
8190 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8191 return false;
8192 }
8193
8194 return true;
8195 }
8196
8197
8198
8199
8200
8201 // ----------------------------------------
8202 // pg creation
8203
8204 void OSD::split_pgs(
8205 PG *parent,
8206 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
8207 OSDMapRef curmap,
8208 OSDMapRef nextmap,
8209 PG::RecoveryCtx *rctx)
8210 {
8211 unsigned pg_num = nextmap->get_pg_num(
8212 parent->pool.id);
8213 parent->update_snap_mapper_bits(
8214 parent->info.pgid.get_split_bits(pg_num)
8215 );
8216
8217 vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
8218 parent->info.stats.stats.sum.split(updated_stats);
8219
8220 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8221 for (set<spg_t>::const_iterator i = childpgids.begin();
8222 i != childpgids.end();
8223 ++i, ++stat_iter) {
8224 assert(stat_iter != updated_stats.end());
8225 dout(10) << "Splitting " << *parent << " into " << *i << dendl;
8226 assert(service.splitting(*i));
8227 PG* child = _make_pg(nextmap, *i);
8228 child->lock(true);
8229 out_pgs->insert(child);
8230 rctx->created_pgs.insert(child);
8231
8232 unsigned split_bits = i->get_split_bits(pg_num);
8233 dout(10) << "pg_num is " << pg_num << dendl;
8234 dout(10) << "m_seed " << i->ps() << dendl;
8235 dout(10) << "split_bits is " << split_bits << dendl;
8236
8237 parent->split_colls(
8238 *i,
8239 split_bits,
8240 i->ps(),
8241 &child->pool.info,
8242 rctx->transaction);
8243 parent->split_into(
8244 i->pgid,
8245 child,
8246 split_bits);
8247 child->info.stats.stats.sum = *stat_iter;
8248
8249 child->write_if_dirty(*(rctx->transaction));
8250 child->unlock();
8251 }
8252 assert(stat_iter != updated_stats.end());
8253 parent->info.stats.stats.sum = *stat_iter;
8254 parent->write_if_dirty(*(rctx->transaction));
8255 }
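
// Concrete example (illustrative, power-of-two pg_num): when a pool's
// pg_num doubles from 8 to 16, pg 1.3 (ps = 3) acquires one child, 1.b
// (ps = 3 + 8). split_bits evaluates to 4, so split_colls() carves out
// the objects whose low 4 hash bits equal 0xb into the child's
// collection, and the parent's stat sums are apportioned between parent
// and child via split() above.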
8256
8257 /*
8258 * holding osd_lock
8259 */
8260 void OSD::handle_pg_create(OpRequestRef op)
8261 {
8262 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
8263 assert(m->get_type() == MSG_OSD_PG_CREATE);
8264
8265 dout(10) << "handle_pg_create " << *m << dendl;
8266
8267 if (!require_mon_peer(op->get_req())) {
8268 return;
8269 }
8270
8271 if (!require_same_or_newer_map(op, m->epoch, false))
8272 return;
8273
8274 op->mark_started();
8275
8276 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8277 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8278 p != m->mkpg.end();
8279 ++p, ++ci) {
8280 assert(ci != m->ctimes.end() && ci->first == p->first);
8281 epoch_t created = p->second.created;
8282 if (p->second.split_bits) // Skip split pgs
8283 continue;
8284 pg_t on = p->first;
8285
8286 if (on.preferred() >= 0) {
8287 dout(20) << "ignoring localized pg " << on << dendl;
8288 continue;
8289 }
8290
8291 if (!osdmap->have_pg_pool(on.pool())) {
8292 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8293 continue;
8294 }
8295
8296 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8297
8298 // is it still ours?
8299 vector<int> up, acting;
8300 int up_primary = -1;
8301 int acting_primary = -1;
8302 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
8303 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
8304
8305 if (acting_primary != whoami) {
8306 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8307 << "), my role=" << role << ", skipping" << dendl;
8308 continue;
8309 }
8310
8311 spg_t pgid;
8312 bool mapped = osdmap->get_primary_shard(on, &pgid);
8313 assert(mapped);
8314
8315 PastIntervals pi(
8316 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8317 *osdmap);
8318 pg_history_t history;
8319 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
8320
8321 // The mon won't resend unless the primary changed, so
8322 // we ignore same_interval_since. We'll pass this history
8323 // to handle_pg_peering_evt with the current epoch as the
8324 // event -- the project_pg_history check in
8325 // handle_pg_peering_evt will be a noop.
8326 if (history.same_primary_since > m->epoch) {
8327 dout(10) << __func__ << ": got obsolete pg create on pgid "
8328 << pgid << " from epoch " << m->epoch
8329 << ", primary changed in " << history.same_primary_since
8330 << dendl;
8331 continue;
8332 }
8333
8334 if (handle_pg_peering_evt(
8335 pgid,
8336 history,
8337 pi,
8338 osdmap->get_epoch(),
8339 PG::CephPeeringEvtRef(
8340 new PG::CephPeeringEvt(
8341 osdmap->get_epoch(),
8342 osdmap->get_epoch(),
8343 PG::NullEvt()))
8344 ) == -EEXIST) {
8345 service.send_pg_created(pgid.pgid);
8346 }
8347 }
8348 last_pg_create_epoch = m->epoch;
8349
8350 maybe_update_heartbeat_peers();
8351 }
8352
8353
8354 // ----------------------------------------
8355 // peering and recovery
8356
8357 PG::RecoveryCtx OSD::create_context()
8358 {
8359 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8360 C_Contexts *on_applied = new C_Contexts(cct);
8361 C_Contexts *on_safe = new C_Contexts(cct);
8362 map<int, map<spg_t,pg_query_t> > *query_map =
8363 new map<int, map<spg_t, pg_query_t> >;
8364 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8365 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8366 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8367 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8368 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8369 on_applied, on_safe, t);
8370 return rctx;
8371 }
8372
8373 struct C_OpenPGs : public Context {
8374 set<PGRef> pgs;
8375 ObjectStore *store;
8376 OSD *osd;
8377 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8378 pgs.swap(p);
8379 }
8380 void finish(int r) override {
8381 RWLock::RLocker l(osd->pg_map_lock);
8382 for (auto p : pgs) {
8383 if (osd->pg_map.count(p->info.pgid)) {
8384 p->ch = store->open_collection(p->coll);
8385 assert(p->ch);
8386 }
8387 }
8388 }
8389 };
8390
8391 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
8392 ThreadPool::TPHandle *handle)
8393 {
8394 if (!ctx.transaction->empty()) {
8395 if (!ctx.created_pgs.empty()) {
8396 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8397 }
8398 int tr = store->queue_transaction(
8399 pg->osr.get(),
8400 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
8401 TrackedOpRef(), handle);
8402 delete (ctx.transaction);
8403 assert(tr == 0);
8404 ctx.transaction = new ObjectStore::Transaction;
8405 ctx.on_applied = new C_Contexts(cct);
8406 ctx.on_safe = new C_Contexts(cct);
8407 }
8408 }
8409
8410 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
8411 ThreadPool::TPHandle *handle)
8412 {
8413 if (service.get_osdmap()->is_up(whoami) &&
8414 is_active()) {
8415 do_notifies(*ctx.notify_list, curmap);
8416 do_queries(*ctx.query_map, curmap);
8417 do_infos(*ctx.info_map, curmap);
8418 }
8419 delete ctx.notify_list;
8420 delete ctx.query_map;
8421 delete ctx.info_map;
8422 if ((ctx.on_applied->empty() &&
8423 ctx.on_safe->empty() &&
8424 ctx.transaction->empty() &&
8425 ctx.created_pgs.empty()) || !pg) {
8426 delete ctx.transaction;
8427 delete ctx.on_applied;
8428 delete ctx.on_safe;
8429 assert(ctx.created_pgs.empty());
8430 } else {
8431 if (!ctx.created_pgs.empty()) {
8432 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8433 }
8434 int tr = store->queue_transaction(
8435 pg->osr.get(),
8436 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
8437 handle);
8438 delete (ctx.transaction);
8439 assert(tr == 0);
8440 }
8441 }
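
// Ownership note: dispatch_context() consumes the ctx. The
// notify/query/info maps are always deleted here; the transaction either
// rides into the objectstore together with on_applied / on_safe, or all
// three are deleted when there is nothing to flush. Callers must not
// touch ctx afterwards; dispatch_context_transaction() above is the
// variant that flushes and then re-arms the same ctx for further use.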
8442
8443 /** do_notifies
8444 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8445 * content for, and they are primary for.
8446 */
8447
8448 void OSD::do_notifies(
8449 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8450 OSDMapRef curmap)
8451 {
8452 for (map<int,
8453 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8454 notify_list.begin();
8455 it != notify_list.end();
8456 ++it) {
8457 if (!curmap->is_up(it->first)) {
8458 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8459 continue;
8460 }
8461 ConnectionRef con = service.get_con_osd_cluster(
8462 it->first, curmap->get_epoch());
8463 if (!con) {
8464 dout(20) << __func__ << " skipping osd." << it->first
8465 << " (NULL con)" << dendl;
8466 continue;
8467 }
8468 service.share_map_peer(it->first, con.get(), curmap);
8469 dout(7) << __func__ << " osd " << it->first
8470 << " on " << it->second.size() << " PGs" << dendl;
8471 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8472 it->second);
8473 con->send_message(m);
8474 }
8475 }
8476
8477
8478 /** do_queries
8479 * send out pending queries for info | summaries
8480 */
8481 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8482 OSDMapRef curmap)
8483 {
8484 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8485 pit != query_map.end();
8486 ++pit) {
8487 if (!curmap->is_up(pit->first)) {
8488 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8489 continue;
8490 }
8491 int who = pit->first;
8492 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8493 if (!con) {
8494 dout(20) << __func__ << " skipping osd." << who
8495 << " (NULL con)" << dendl;
8496 continue;
8497 }
8498 service.share_map_peer(who, con.get(), curmap);
8499 dout(7) << __func__ << " querying osd." << who
8500 << " on " << pit->second.size() << " PGs" << dendl;
8501 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8502 con->send_message(m);
8503 }
8504 }
8505
8506
8507 void OSD::do_infos(map<int,
8508 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8509 OSDMapRef curmap)
8510 {
8511 for (map<int,
8512 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
8513 info_map.begin();
8514 p != info_map.end();
8515 ++p) {
8516 if (!curmap->is_up(p->first)) {
8517 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
8518 continue;
8519 }
8520 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
8521 i != p->second.end();
8522 ++i) {
8523 dout(20) << __func__ << " sending info " << i->first.info
8524 << " to shard " << p->first << dendl;
8525 }
8526 ConnectionRef con = service.get_con_osd_cluster(
8527 p->first, curmap->get_epoch());
8528 if (!con) {
8529 dout(20) << __func__ << " skipping osd." << p->first
8530 << " (NULL con)" << dendl;
8531 continue;
8532 }
8533 service.share_map_peer(p->first, con.get(), curmap);
8534 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
8535 m->pg_list = p->second;
8536 con->send_message(m);
8537 }
8538 info_map.clear();
8539 }
8540
8541
8542 /** PGNotify
8543 * from non-primary to primary
8544 * includes pg_info_t.
8545 * NOTE: called with opqueue active.
8546 */
8547 void OSD::handle_pg_notify(OpRequestRef op)
8548 {
8549 const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
8550 assert(m->get_type() == MSG_OSD_PG_NOTIFY);
8551
8552 dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
8553 int from = m->get_source().num();
8554
8555 if (!require_osd_peer(op->get_req()))
8556 return;
8557
8558 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8559 return;
8560
8561 op->mark_started();
8562
8563 for (auto it = m->get_pg_list().begin();
8564 it != m->get_pg_list().end();
8565 ++it) {
8566 if (it->first.info.pgid.preferred() >= 0) {
8567 dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
8568 continue;
8569 }
8570
8571 handle_pg_peering_evt(
8572 spg_t(it->first.info.pgid.pgid, it->first.to),
8573 it->first.info.history, it->second,
8574 it->first.query_epoch,
8575 PG::CephPeeringEvtRef(
8576 new PG::CephPeeringEvt(
8577 it->first.epoch_sent, it->first.query_epoch,
8578 PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
8579 op->get_req()->get_connection()->get_features())))
8580 );
8581 }
8582 }
8583
8584 void OSD::handle_pg_log(OpRequestRef op)
8585 {
8586 MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
8587 assert(m->get_type() == MSG_OSD_PG_LOG);
8588 dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
8589
8590 if (!require_osd_peer(op->get_req()))
8591 return;
8592
8593 int from = m->get_source().num();
8594 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8595 return;
8596
8597 if (m->info.pgid.preferred() >= 0) {
8598 dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
8599 return;
8600 }
8601
8602 op->mark_started();
8603 handle_pg_peering_evt(
8604 spg_t(m->info.pgid.pgid, m->to),
8605 m->info.history, m->past_intervals, m->get_epoch(),
8606 PG::CephPeeringEvtRef(
8607 new PG::CephPeeringEvt(
8608 m->get_epoch(), m->get_query_epoch(),
8609 PG::MLogRec(pg_shard_t(from, m->from), m)))
8610 );
8611 }
8612
8613 void OSD::handle_pg_info(OpRequestRef op)
8614 {
8615 const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
8616 assert(m->get_type() == MSG_OSD_PG_INFO);
8617 dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
8618
8619 if (!require_osd_peer(op->get_req()))
8620 return;
8621
8622 int from = m->get_source().num();
8623 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8624 return;
8625
8626 op->mark_started();
8627
8628 for (auto p = m->pg_list.begin();
8629 p != m->pg_list.end();
8630 ++p) {
8631 if (p->first.info.pgid.preferred() >= 0) {
8632 dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
8633 continue;
8634 }
8635
8636 handle_pg_peering_evt(
8637 spg_t(p->first.info.pgid.pgid, p->first.to),
8638 p->first.info.history, p->second, p->first.epoch_sent,
8639 PG::CephPeeringEvtRef(
8640 new PG::CephPeeringEvt(
8641 p->first.epoch_sent, p->first.query_epoch,
8642 PG::MInfoRec(
8643 pg_shard_t(
8644 from, p->first.from), p->first.info, p->first.epoch_sent)))
8645 );
8646 }
8647 }
8648
8649 void OSD::handle_pg_trim(OpRequestRef op)
8650 {
8651 const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
8652 assert(m->get_type() == MSG_OSD_PG_TRIM);
8653
8654 dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
8655
8656 if (!require_osd_peer(op->get_req()))
8657 return;
8658
8659 int from = m->get_source().num();
8660 if (!require_same_or_newer_map(op, m->epoch, false))
8661 return;
8662
8663 if (m->pgid.preferred() >= 0) {
8664 dout(10) << "ignoring localized pg " << m->pgid << dendl;
8665 return;
8666 }
8667
8668 op->mark_started();
8669
8670 PG *pg = _lookup_lock_pg(m->pgid);
8671 if (!pg) {
8672 dout(10) << " don't have pg " << m->pgid << dendl;
8673 return;
8674 }
8675
8676 if (m->epoch < pg->info.history.same_interval_since) {
8677 dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
8678 pg->unlock();
8679 return;
8680 }
8681
8682 if (pg->is_primary()) {
8683 // peer is informing us of their last_complete_ondisk
8684 dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
8685 pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
8686 m->trim_to;
8687 // trim log when the pg is recovered
8688 pg->calc_min_last_complete_ondisk();
8689 } else {
8690 // primary is instructing us to trim
8691 ObjectStore::Transaction t;
8692 pg->pg_log.trim(m->trim_to, pg->info);
8693 pg->dirty_info = true;
8694 pg->write_if_dirty(t);
8695 int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
8696 assert(tr == 0);
8697 }
8698 pg->unlock();
8699 }
8700
8701 void OSD::handle_pg_backfill_reserve(OpRequestRef op)
8702 {
8703 const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
8704 assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);
8705
8706 if (!require_osd_peer(op->get_req()))
8707 return;
8708 if (!require_same_or_newer_map(op, m->query_epoch, false))
8709 return;
8710
8711 PG::CephPeeringEvtRef evt;
8712 if (m->type == MBackfillReserve::REQUEST) {
8713 evt = PG::CephPeeringEvtRef(
8714 new PG::CephPeeringEvt(
8715 m->query_epoch,
8716 m->query_epoch,
8717 PG::RequestBackfillPrio(m->priority)));
8718 } else if (m->type == MBackfillReserve::GRANT) {
8719 evt = PG::CephPeeringEvtRef(
8720 new PG::CephPeeringEvt(
8721 m->query_epoch,
8722 m->query_epoch,
8723 PG::RemoteBackfillReserved()));
8724 } else if (m->type == MBackfillReserve::REJECT) {
8725 evt = PG::CephPeeringEvtRef(
8726 new PG::CephPeeringEvt(
8727 m->query_epoch,
8728 m->query_epoch,
8729 PG::RemoteReservationRejected()));
8730 } else {
8731 ceph_abort();
8732 }
8733
8734 if (service.splitting(m->pgid)) {
8735 peering_wait_for_split[m->pgid].push_back(evt);
8736 return;
8737 }
8738
8739 PG *pg = _lookup_lock_pg(m->pgid);
8740 if (!pg) {
8741 dout(10) << " don't have pg " << m->pgid << dendl;
8742 return;
8743 }
8744
8745 pg->queue_peering_event(evt);
8746 pg->unlock();
8747 }
8748
8749 void OSD::handle_pg_recovery_reserve(OpRequestRef op)
8750 {
8751 const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
8752 assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);
8753
8754 if (!require_osd_peer(op->get_req()))
8755 return;
8756 if (!require_same_or_newer_map(op, m->query_epoch, false))
8757 return;
8758
8759 PG::CephPeeringEvtRef evt;
8760 if (m->type == MRecoveryReserve::REQUEST) {
8761 evt = PG::CephPeeringEvtRef(
8762 new PG::CephPeeringEvt(
8763 m->query_epoch,
8764 m->query_epoch,
8765 PG::RequestRecovery()));
8766 } else if (m->type == MRecoveryReserve::GRANT) {
8767 evt = PG::CephPeeringEvtRef(
8768 new PG::CephPeeringEvt(
8769 m->query_epoch,
8770 m->query_epoch,
8771 PG::RemoteRecoveryReserved()));
8772 } else if (m->type == MRecoveryReserve::RELEASE) {
8773 evt = PG::CephPeeringEvtRef(
8774 new PG::CephPeeringEvt(
8775 m->query_epoch,
8776 m->query_epoch,
8777 PG::RecoveryDone()));
8778 } else {
8779 ceph_abort();
8780 }
8781
8782 if (service.splitting(m->pgid)) {
8783 peering_wait_for_split[m->pgid].push_back(evt);
8784 return;
8785 }
8786
8787 PG *pg = _lookup_lock_pg(m->pgid);
8788 if (!pg) {
8789 dout(10) << " don't have pg " << m->pgid << dendl;
8790 return;
8791 }
8792
8793 pg->queue_peering_event(evt);
8794 pg->unlock();
8795 }
8796
8797
8798 /** PGQuery
8799 * from primary to replica | stray
8800 * NOTE: called with opqueue active.
8801 */
8802 void OSD::handle_pg_query(OpRequestRef op)
8803 {
8804 assert(osd_lock.is_locked());
8805
8806 const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
8807 assert(m->get_type() == MSG_OSD_PG_QUERY);
8808
8809 if (!require_osd_peer(op->get_req()))
8810 return;
8811
8812 dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
8813 int from = m->get_source().num();
8814
8815 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8816 return;
8817
8818 op->mark_started();
8819
8820 map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
8821
8822 for (auto it = m->pg_list.begin();
8823 it != m->pg_list.end();
8824 ++it) {
8825 spg_t pgid = it->first;
8826
8827 if (pgid.preferred() >= 0) {
8828 dout(10) << "ignoring localized pg " << pgid << dendl;
8829 continue;
8830 }
8831
8832 if (service.splitting(pgid)) {
8833 peering_wait_for_split[pgid].push_back(
8834 PG::CephPeeringEvtRef(
8835 new PG::CephPeeringEvt(
8836 it->second.epoch_sent, it->second.epoch_sent,
8837 PG::MQuery(pg_shard_t(from, it->second.from),
8838 it->second, it->second.epoch_sent))));
8839 continue;
8840 }
8841
8842 {
8843 RWLock::RLocker l(pg_map_lock);
8844 if (pg_map.count(pgid)) {
8845 PG *pg =
8846 _lookup_lock_pg_with_map_lock_held(pgid);
8847 pg->queue_query(
8848 it->second.epoch_sent, it->second.epoch_sent,
8849 pg_shard_t(from, it->second.from), it->second);
8850 pg->unlock();
8851 continue;
8852 }
8853 }
8854
8855 if (!osdmap->have_pg_pool(pgid.pool()))
8856 continue;
8857
8858 // get active crush mapping
8859 int up_primary, acting_primary;
8860 vector<int> up, acting;
8861 osdmap->pg_to_up_acting_osds(
8862 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
8863
8864 // same primary?
8865 pg_history_t history = it->second.history;
8866 bool valid_history = project_pg_history(
8867 pgid, history, it->second.epoch_sent,
8868 up, up_primary, acting, acting_primary);
8869
8870 if (!valid_history ||
8871 it->second.epoch_sent < history.same_interval_since) {
8872 dout(10) << " pg " << pgid << " dne, and pg has changed in "
8873 << history.same_interval_since
8874 << " (msg from " << it->second.epoch_sent << ")" << dendl;
8875 continue;
8876 }
8877
8878 dout(10) << " pg " << pgid << " dne" << dendl;
8879 pg_info_t empty(spg_t(pgid.pgid, it->second.to));
8880 /* This is racy, but that should be ok: if we complete the deletion
8881 * before the pg is recreated, we'll just start it off backfilling
8882 * instead of just empty */
8883 if (service.deleting_pgs.lookup(pgid))
8884 empty.set_last_backfill(hobject_t());
8885 if (it->second.type == pg_query_t::LOG ||
8886 it->second.type == pg_query_t::FULLLOG) {
8887 ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
8888 if (con) {
8889 MOSDPGLog *mlog = new MOSDPGLog(
8890 it->second.from, it->second.to,
8891 osdmap->get_epoch(), empty,
8892 it->second.epoch_sent);
8893 service.share_map_peer(from, con.get(), osdmap);
8894 con->send_message(mlog);
8895 }
8896 } else {
8897 notify_list[from].push_back(
8898 make_pair(
8899 pg_notify_t(
8900 it->second.from, it->second.to,
8901 it->second.epoch_sent,
8902 osdmap->get_epoch(),
8903 empty),
8904 PastIntervals(
8905 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8906 *osdmap)));
8907 }
8908 }
8909 do_notifies(notify_list, osdmap);
8910 }
8911
8912
8913 void OSD::handle_pg_remove(OpRequestRef op)
8914 {
8915 const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
8916 assert(m->get_type() == MSG_OSD_PG_REMOVE);
8917 assert(osd_lock.is_locked());
8918
8919 if (!require_osd_peer(op->get_req()))
8920 return;
8921
8922 dout(7) << "handle_pg_remove from " << m->get_source() << " on "
8923 << m->pg_list.size() << " pgs" << dendl;
8924
8925 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8926 return;
8927
8928 op->mark_started();
8929
8930 for (auto it = m->pg_list.begin();
8931 it != m->pg_list.end();
8932 ++it) {
8933 spg_t pgid = *it;
8934 if (pgid.preferred() >= 0) {
8935 dout(10) << "ignoring localized pg " << pgid << dendl;
8936 continue;
8937 }
8938
8939 RWLock::WLocker l(pg_map_lock);
8940 if (pg_map.count(pgid) == 0) {
8941 dout(10) << " don't have pg " << pgid << dendl;
8942 continue;
8943 }
8944 dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
8945 PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
8946 pg_history_t history = pg->info.history;
8947 int up_primary, acting_primary;
8948 vector<int> up, acting;
8949 osdmap->pg_to_up_acting_osds(
8950 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
8951 bool valid_history = project_pg_history(
8952 pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
8953 up, up_primary, acting, acting_primary);
8954 if (valid_history &&
8955 history.same_interval_since <= m->get_epoch()) {
8956 assert(pg->get_primary().osd == m->get_source().num());
8957 PGRef _pg(pg);
8958 _remove_pg(pg);
8959 pg->unlock();
8960 } else {
8961 dout(10) << *pg << " ignoring remove request, pg changed in epoch "
8962 << history.same_interval_since
8963 << " > " << m->get_epoch() << dendl;
8964 pg->unlock();
8965 }
8966 }
8967 }
8968
8969 void OSD::_remove_pg(PG *pg)
8970 {
8971 ObjectStore::Transaction rmt;
8972
8973 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
8974 // the pg_map must be done together without unlocking the pg lock,
8975 // to avoid racing with watcher cleanup in ms_handle_reset
8976 // and handle_notify_timeout
8977 pg->on_removal(&rmt);
8978
8979 service.cancel_pending_splits_for_parent(pg->info.pgid);
8980 int tr = store->queue_transaction(
8981 pg->osr.get(), std::move(rmt), NULL,
8982 new ContainerContext<
8983 SequencerRef>(pg->osr));
8984 assert(tr == 0);
8985
8986 DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
8987 pg->info.pgid,
8988 make_pair(
8989 pg->info.pgid,
8990 PGRef(pg))
8991 );
8992 remove_wq.queue(make_pair(PGRef(pg), deleting));
8993
8994 service.pg_remove_epoch(pg->info.pgid);
8995
8996 // dereference from op_wq
8997 op_shardedwq.clear_pg_pointer(pg->info.pgid);
8998
8999 // remove from map
9000 pg_map.erase(pg->info.pgid);
9001 pg->put("PGMap"); // since we've taken it out of map
9002 }
9003
9004
9005 // =========================================================
9006 // RECOVERY
9007
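// Called with recovery_lock held: drain awaiting_throttle while _recover_now
// grants push budget, reserving up to osd_recovery_max_single_start pushes
// per PG.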
9008 void OSDService::_maybe_queue_recovery() {
9009 assert(recovery_lock.is_locked_by_me());
9010 uint64_t available_pushes;
9011 while (!awaiting_throttle.empty() &&
9012 _recover_now(&available_pushes)) {
9013 uint64_t to_start = MIN(
9014 available_pushes,
9015 cct->_conf->osd_recovery_max_single_start);
9016 _queue_for_recovery(awaiting_throttle.front(), to_start);
9017 awaiting_throttle.pop_front();
9018 recovery_ops_reserved += to_start;
9019 }
9020 }
9021
9022 bool OSDService::_recover_now(uint64_t *available_pushes)
9023 {
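// gate recovery on the deferral window, the pause flag, and the
// osd_recovery_max_active budget; on success, *available_pushes reports
// how much of the budget remains.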
9024 if (available_pushes)
9025 *available_pushes = 0;
9026
9027 if (ceph_clock_now() < defer_recovery_until) {
9028 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9029 return false;
9030 }
9031
9032 if (recovery_paused) {
9033 dout(15) << __func__ << " paused" << dendl;
9034 return false;
9035 }
9036
9037 uint64_t max = cct->_conf->osd_recovery_max_active;
9038 if (max <= recovery_ops_active + recovery_ops_reserved) {
9039 dout(15) << __func__ << " active " << recovery_ops_active
9040 << " + reserved " << recovery_ops_reserved
9041 << " >= max " << max << dendl;
9042 return false;
9043 }
9044
9045 if (available_pushes)
9046 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9047
9048 return true;
9049 }
9050
9051 void OSD::do_recovery(
9052 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9053 ThreadPool::TPHandle &handle)
9054 {
9055 uint64_t started = 0;
9056
9057 /*
9058 * When the value of osd_recovery_sleep is set greater than zero, recovery
9059 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9060 * recovery event's schedule time. This is done by adding a
9061 * recovery_requeue_callback event, which re-queues the recovery op using
9062 * queue_recovery_after_sleep.
9063 */
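// For example, with osd_recovery_sleep = 0.1 (an illustrative value, not a
// recommendation), consecutive recovery ops are spaced at least 100ms apart.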
9064 if (cct->_conf->osd_recovery_sleep > 0 && service.recovery_needs_sleep) {
9065 PGRef pgref(pg);
9066 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9067 dout(20) << "do_recovery wake up at "
9068 << ceph_clock_now()
9069 << ", re-queuing recovery" << dendl;
9070 service.recovery_needs_sleep = false;
9071 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9072 });
9073 Mutex::Locker l(service.recovery_sleep_lock);
9074
9075 // This is true for the first recovery op and when the previous recovery op
9076 // has been scheduled in the past. The next recovery op is scheduled after
9077 // completing the sleep from now.
9078 if (service.recovery_schedule_time < ceph_clock_now()) {
9079 service.recovery_schedule_time = ceph_clock_now();
9080 }
9081 service.recovery_schedule_time += cct->_conf->osd_recovery_sleep;
9082 service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
9083 recovery_requeue_callback);
9084 dout(20) << "Recovery event scheduled at "
9085 << service.recovery_schedule_time << dendl;
9086 return;
9087 }
9088
9089 {
9090 service.recovery_needs_sleep = true;
9091 if (pg->pg_has_reset_since(queued)) {
9092 goto out;
9093 }
9094
9095 assert(!pg->deleting);
9096 assert(pg->is_peered() && pg->is_primary());
9097
9098 assert(pg->recovery_queued);
9099 pg->recovery_queued = false;
9100
9101 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9102 #ifdef DEBUG_RECOVERY_OIDS
9103 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
9104 #endif
9105
9106 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
9107 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9108 << " on " << *pg << dendl;
9109
9110 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
9111 if (!started && (more || !pg->have_unfound())) {
9112 goto out;
9113 }
9114
9115 PG::RecoveryCtx rctx = create_context();
9116 rctx.handle = &handle;
9117
9118 /*
9119 * if we couldn't start any recovery ops and things are still
9120 * unfound, see if we can discover more missing object locations.
9121 * It may be that our initial locations were bad and we errored
9122 * out while trying to pull.
9123 */
9124 if (!more && pg->have_unfound()) {
9125 pg->discover_all_missing(*rctx.query_map);
9126 if (rctx.query_map->empty()) {
9127 dout(10) << "do_recovery no luck, giving up on this pg for now" << dendl;
9128 } else {
9129 dout(10) << "do_recovery sent missing-location queries, requeueing" << dendl;
9130 pg->queue_recovery();
9131 }
9132 }
9133
9134 pg->write_if_dirty(*rctx.transaction);
9135 OSDMapRef curmap = pg->get_osdmap();
9136 dispatch_context(rctx, pg, curmap);
9137 }
9138
9139 out:
9140 assert(started <= reserved_pushes);
9141 service.release_reserved_pushes(reserved_pushes);
9142 }
9143
9144 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9145 {
9146 Mutex::Locker l(recovery_lock);
9147 dout(10) << "start_recovery_op " << *pg << " " << soid
9148 << " (" << recovery_ops_active << "/"
9149 << cct->_conf->osd_recovery_max_active << " rops)"
9150 << dendl;
9151 recovery_ops_active++;
9152
9153 #ifdef DEBUG_RECOVERY_OIDS
9154 dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
9155 assert(recovery_oids[pg->info.pgid].count(soid) == 0);
9156 recovery_oids[pg->info.pgid].insert(soid);
9157 #endif
9158 }
9159
9160 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9161 {
9162 Mutex::Locker l(recovery_lock);
9163 dout(10) << "finish_recovery_op " << *pg << " " << soid
9164 << " dequeue=" << dequeue
9165 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9166 << dendl;
9167
9168 // adjust count
9169 assert(recovery_ops_active > 0);
9170 recovery_ops_active--;
9171
9172 #ifdef DEBUG_RECOVERY_OIDS
9173 dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
9174 assert(recovery_oids[pg->info.pgid].count(soid));
9175 recovery_oids[pg->info.pgid].erase(soid);
9176 #endif
9177
9178 _maybe_queue_recovery();
9179 }
9180
9181 bool OSDService::is_recovery_active()
9182 {
9183 if (recovery_ops_active > 0)
9184 return true;
9185
9186 return false;
9187 }
9188
9189 // =========================================================
9190 // OPS
9191
9192 bool OSD::op_is_discardable(const MOSDOp *op)
9193 {
9194 // drop the client request if the client is no longer connected and
9195 // can't receive the reply anyway.
9196 if (!op->get_connection()->is_connected()) {
9197 return true;
9198 }
9199 return false;
9200 }
9201
9202 void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
9203 {
9204 utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
9205 dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
9206 << " cost " << op->get_req()->get_cost()
9207 << " latency " << latency
9208 << " epoch " << epoch
9209 << " " << *(op->get_req()) << dendl;
9210 op->osd_trace.event("enqueue op");
9211 op->osd_trace.keyval("priority", op->get_req()->get_priority());
9212 op->osd_trace.keyval("cost", op->get_req()->get_cost());
9213 op->mark_queued_for_pg();
9214 op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
9215 }
9216
9217
9218
9219 /*
9220 * NOTE: dequeue called in worker thread, with pg lock
9221 */
9222 void OSD::dequeue_op(
9223 PGRef pg, OpRequestRef op,
9224 ThreadPool::TPHandle &handle)
9225 {
9226 FUNCTRACE();
9227 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9228
9229 utime_t now = ceph_clock_now();
9230 op->set_dequeued_time(now);
9231 utime_t latency = now - op->get_req()->get_recv_stamp();
9232 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9233 << " cost " << op->get_req()->get_cost()
9234 << " latency " << latency
9235 << " " << *(op->get_req())
9236 << " pg " << *pg << dendl;
9237
9238 Session *session = static_cast<Session *>(
9239 op->get_req()->get_connection()->get_priv());
9240 if (session) {
9241 maybe_share_map(session, op, pg->get_osdmap());
9242 session->put();
9243 }
9244
9245 if (pg->deleting)
9246 return;
9247
9248 op->mark_reached_pg();
9249 op->osd_trace.event("dequeue_op");
9250
9251 pg->do_request(op, handle);
9252
9253 // finish
9254 dout(10) << "dequeue_op " << op << " finish" << dendl;
9255 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
9256 }
9257
9258
9259 struct C_CompleteSplits : public Context {
9260 OSD *osd;
9261 set<PGRef> pgs;
9262 C_CompleteSplits(OSD *osd, const set<PGRef> &in)
9263 : osd(osd), pgs(in) {}
9264 void finish(int r) override {
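// takes osd_lock, then pg_map_lock (write) plus each child pg's lock in
// turn while registering the newly split PGs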
9265 Mutex::Locker l(osd->osd_lock);
9266 if (osd->is_stopping())
9267 return;
9268 PG::RecoveryCtx rctx = osd->create_context();
9269 for (set<PGRef>::iterator i = pgs.begin();
9270 i != pgs.end();
9271 ++i) {
9272 osd->pg_map_lock.get_write();
9273 (*i)->lock();
9274 PG *pg = i->get();
9275 osd->add_newly_split_pg(pg, &rctx);
9276 if (!((*i)->deleting)) {
9277 set<spg_t> to_complete;
9278 to_complete.insert((*i)->info.pgid);
9279 osd->service.complete_split(to_complete);
9280 }
9281 osd->pg_map_lock.put_write();
9282 osd->dispatch_context_transaction(rctx, pg);
9283 osd->wake_pg_waiters(*i);
9284 (*i)->unlock();
9285 }
9286
9287 osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
9288 }
9289 };
9290
9291 void OSD::process_peering_events(
9292 const list<PG*> &pgs,
9293 ThreadPool::TPHandle &handle
9294 )
9295 {
9296 bool need_up_thru = false;
9297 epoch_t same_interval_since = 0;
9298 OSDMapRef curmap;
9299 PG::RecoveryCtx rctx = create_context();
9300 rctx.handle = &handle;
9301 for (list<PG*>::const_iterator i = pgs.begin();
9302 i != pgs.end();
9303 ++i) {
9304 set<PGRef> split_pgs;
9305 PG *pg = *i;
9306 pg->lock_suspend_timeout(handle);
9307 curmap = service.get_osdmap();
9308 if (pg->deleting) {
9309 pg->unlock();
9310 continue;
9311 }
9312 if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
9313 // we need to requeue the PG explicitly since we didn't actually
9314 // handle an event
9315 peering_wq.queue(pg);
9316 } else {
9317 assert(!pg->peering_queue.empty());
9318 PG::CephPeeringEvtRef evt = pg->peering_queue.front();
9319 pg->peering_queue.pop_front();
9320 pg->handle_peering_event(evt, &rctx);
9321 }
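// aggregate across the batch; a single up_thru request covering the newest
// interval is issued once the loop completes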
9322 need_up_thru = pg->need_up_thru || need_up_thru;
9323 same_interval_since = MAX(pg->info.history.same_interval_since,
9324 same_interval_since);
9325 pg->write_if_dirty(*rctx.transaction);
9326 if (!split_pgs.empty()) {
9327 rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
9328 split_pgs.clear();
9329 }
9330 dispatch_context_transaction(rctx, pg, &handle);
9331 pg->unlock();
9332 }
9333 if (need_up_thru)
9334 queue_want_up_thru(same_interval_since);
9335 dispatch_context(rctx, 0, curmap, &handle);
9336
9337 service.send_pg_temp();
9338 }
9339
9340 // --------------------------------
9341
9342 const char** OSD::get_tracked_conf_keys() const
9343 {
9344 static const char* KEYS[] = {
9345 "osd_max_backfills",
9346 "osd_min_recovery_priority",
9347 "osd_op_complaint_time", "osd_op_log_threshold",
9348 "osd_op_history_size", "osd_op_history_duration",
9349 "osd_enable_op_tracker",
9350 "osd_map_cache_size",
9351 "osd_map_max_advance",
9352 "osd_pg_epoch_persisted_max_stale",
9353 "osd_disk_thread_ioprio_class",
9354 "osd_disk_thread_ioprio_priority",
9355 // clog & admin clog
9356 "clog_to_monitors",
9357 "clog_to_syslog",
9358 "clog_to_syslog_facility",
9359 "clog_to_syslog_level",
9360 "osd_objectstore_fuse",
9361 "clog_to_graylog",
9362 "clog_to_graylog_host",
9363 "clog_to_graylog_port",
9364 "host",
9365 "fsid",
9366 "osd_recovery_delay_start",
9367 "osd_client_message_size_cap",
9368 "osd_client_message_cap",
9369 "osd_heartbeat_min_size",
9370 "osd_heartbeat_interval",
9371 NULL
9372 };
9373 return KEYS;
9374 }
9375
9376 void OSD::handle_conf_change(const struct md_config_t *conf,
9377 const std::set <std::string> &changed)
9378 {
9379 if (changed.count("osd_max_backfills")) {
9380 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9381 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9382 }
9383 if (changed.count("osd_min_recovery_priority")) {
9384 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9385 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9386 }
9387 if (changed.count("osd_max_trimming_pgs")) {
9388 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9389 }
9390 if (changed.count("osd_op_complaint_time") ||
9391 changed.count("osd_op_log_threshold")) {
9392 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9393 cct->_conf->osd_op_log_threshold);
9394 }
9395 if (changed.count("osd_op_history_size") ||
9396 changed.count("osd_op_history_duration")) {
9397 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9398 cct->_conf->osd_op_history_duration);
9399 }
9400 if (changed.count("osd_op_history_slow_op_size") ||
9401 changed.count("osd_op_history_slow_op_threshold")) {
9402 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9403 cct->_conf->osd_op_history_slow_op_threshold);
9404 }
9405 if (changed.count("osd_enable_op_tracker")) {
9406 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9407 }
9408 if (changed.count("osd_disk_thread_ioprio_class") ||
9409 changed.count("osd_disk_thread_ioprio_priority")) {
9410 set_disk_tp_priority();
9411 }
9412 if (changed.count("osd_map_cache_size")) {
9413 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9414 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9415 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9416 }
9417 if (changed.count("clog_to_monitors") ||
9418 changed.count("clog_to_syslog") ||
9419 changed.count("clog_to_syslog_level") ||
9420 changed.count("clog_to_syslog_facility") ||
9421 changed.count("clog_to_graylog") ||
9422 changed.count("clog_to_graylog_host") ||
9423 changed.count("clog_to_graylog_port") ||
9424 changed.count("host") ||
9425 changed.count("fsid")) {
9426 update_log_config();
9427 }
9428
9429 #ifdef HAVE_LIBFUSE
9430 if (changed.count("osd_objectstore_fuse")) {
9431 if (store) {
9432 enable_disable_fuse(false);
9433 }
9434 }
9435 #endif
9436
9437 if (changed.count("osd_recovery_delay_start")) {
9438 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9439 service.kick_recovery_queue();
9440 }
9441
9442 if (changed.count("osd_client_message_cap")) {
9443 uint64_t newval = cct->_conf->osd_client_message_cap;
9444 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9445 if (pol.throttler_messages && newval > 0) {
9446 pol.throttler_messages->reset_max(newval);
9447 }
9448 }
9449 if (changed.count("osd_client_message_size_cap")) {
9450 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9451 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9452 if (pol.throttler_bytes && newval > 0) {
9453 pol.throttler_bytes->reset_max(newval);
9454 }
9455 }
9456
9457 check_config();
9458 }
9459
9460 void OSD::update_log_config()
9461 {
9462 map<string,string> log_to_monitors;
9463 map<string,string> log_to_syslog;
9464 map<string,string> log_channel;
9465 map<string,string> log_prio;
9466 map<string,string> log_to_graylog;
9467 map<string,string> log_to_graylog_host;
9468 map<string,string> log_to_graylog_port;
9469 uuid_d fsid;
9470 string host;
9471
9472 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9473 log_channel, log_prio, log_to_graylog,
9474 log_to_graylog_host, log_to_graylog_port,
9475 fsid, host) == 0)
9476 clog->update_config(log_to_monitors, log_to_syslog,
9477 log_channel, log_prio, log_to_graylog,
9478 log_to_graylog_host, log_to_graylog_port,
9479 fsid, host);
9480 derr << "log_to_monitors " << log_to_monitors << dendl;
9481 }
9482
9483 void OSD::check_config()
9484 {
9485 // some sanity checks
9486 if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
9487 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9488 << " is not > osd_map_max_advance ("
9489 << cct->_conf->osd_map_max_advance << ")";
9490 }
9491 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9492 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9493 << " is not > osd_pg_epoch_persisted_max_stale ("
9494 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
9495 }
9496 }
9497
9498 void OSD::set_disk_tp_priority()
9499 {
9500 dout(10) << __func__
9501 << " class " << cct->_conf->osd_disk_thread_ioprio_class
9502 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
9503 << dendl;
9504 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
9505 cct->_conf->osd_disk_thread_ioprio_priority < 0)
9506 return;
9507 int cls =
9508 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
9509 if (cls < 0)
9510 derr << __func__ << " " << cpp_strerror(cls) << ": "
9511 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
9512 << " but only the following values are allowed: idle, be or rt" << dendl;
9513 else
9514 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
9515 }
9516
9517 // --------------------------------
9518
9519 void OSD::get_latest_osdmap()
9520 {
9521 dout(10) << __func__ << " -- start" << dendl;
9522
9523 C_SaferCond cond;
9524 service.objecter->wait_for_latest_osdmap(&cond);
9525 cond.wait();
9526
9527 dout(10) << __func__ << " -- finish" << dendl;
9528 }
9529
9530 // --------------------------------
9531
9532 int OSD::init_op_flags(OpRequestRef& op)
9533 {
9534 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
9535 vector<OSDOp>::const_iterator iter;
9536
9537 // client flags have no bearing on whether an op is a read, write, etc.
9538 op->rmw_flags = 0;
9539
9540 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
9541 op->set_force_rwordered();
9542 }
9543
9544 // set bits based on op codes, called methods.
9545 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
9546 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
9547 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
9548 /* This is a bit odd. PING isn't actually a write. It can't
9549 * result in an update to the object_info. PINGs also aren't
9550 * resent, so there's no reason to write out a log entry
9551 *
9552 * However, we pipeline them behind writes, so let's force
9553 * the write_ordered flag.
9554 */
9555 op->set_force_rwordered();
9556 } else {
9557 if (ceph_osd_op_mode_modify(iter->op.op))
9558 op->set_write();
9559 }
9560 if (ceph_osd_op_mode_read(iter->op.op))
9561 op->set_read();
9562
9563 // set READ flag if there are src_oids
9564 if (iter->soid.oid.name.length())
9565 op->set_read();
9566
9567 // set PGOP flag if there are PG ops
9568 if (ceph_osd_op_type_pg(iter->op.op))
9569 op->set_pg_op();
9570
9571 if (ceph_osd_op_mode_cache(iter->op.op))
9572 op->set_cache();
9573
9574 // check for ec base pool
9575 int64_t poolid = m->get_pg().pool();
9576 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
9577 if (pool && pool->is_tier()) {
9578 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
9579 if (base_pool && base_pool->require_rollback()) {
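// ops outside the whitelist below can't be applied directly to a
// rollback-capable (e.g. erasure-coded) base pool, so force promotion
// into the cache tier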
9580 if ((iter->op.op != CEPH_OSD_OP_READ) &&
9581 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
9582 (iter->op.op != CEPH_OSD_OP_STAT) &&
9583 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
9584 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
9585 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
9586 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
9587 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
9588 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
9589 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
9590 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
9591 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
9592 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
9593 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
9594 (iter->op.op != CEPH_OSD_OP_CREATE) &&
9595 (iter->op.op != CEPH_OSD_OP_DELETE) &&
9596 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
9597 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
9598 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
9599 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
9600 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
9601 op->set_promote();
9602 }
9603 }
9604 }
9605
9606 switch (iter->op.op) {
9607 case CEPH_OSD_OP_CALL:
9608 {
9609 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
9610 int is_write, is_read;
9611 string cname, mname;
9612 bp.copy(iter->op.cls.class_len, cname);
9613 bp.copy(iter->op.cls.method_len, mname);
9614
9615 ClassHandler::ClassData *cls;
9616 int r = class_handler->open_class(cname, &cls);
9617 if (r) {
9618 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
9619 if (r == -ENOENT)
9620 r = -EOPNOTSUPP;
9621 else if (r != -EPERM) // propagate permission errors
9622 r = -EIO;
9623 return r;
9624 }
9625 int flags = cls->get_method_flags(mname.c_str());
9626 if (flags < 0) {
9627 if (flags == -ENOENT)
9628 r = -EOPNOTSUPP;
9629 else
9630 r = flags;
9631 return r;
9632 }
9633 is_read = flags & CLS_METHOD_RD;
9634 is_write = flags & CLS_METHOD_WR;
9635 bool is_promote = flags & CLS_METHOD_PROMOTE;
9636
9637 dout(10) << "class " << cname << " method " << mname << " "
9638 << "flags=" << (is_read ? "r" : "")
9639 << (is_write ? "w" : "")
9640 << (is_promote ? "p" : "")
9641 << dendl;
9642 if (is_read)
9643 op->set_class_read();
9644 if (is_write)
9645 op->set_class_write();
9646 if (is_promote)
9647 op->set_promote();
9648 op->add_class(cname, is_read, is_write, cls->whitelisted);
9649 break;
9650 }
9651
9652 case CEPH_OSD_OP_WATCH:
9653 // force the read bit for watch since it depends on previous
9654 // watch state (and may return early if the watch exists) or, in
9655 // the case of ping, is simply a read op.
9656 op->set_read();
9657 // fall through
9658 case CEPH_OSD_OP_NOTIFY:
9659 case CEPH_OSD_OP_NOTIFY_ACK:
9660 {
9661 op->set_promote();
9662 break;
9663 }
9664
9665 case CEPH_OSD_OP_DELETE:
9666 // if we get a delete with FAILOK we can skip handle cache. without
9667 // FAILOK we still need to promote (or do something smarter) to
9668 // determine whether to return ENOENT or 0.
9669 if (iter == m->ops.begin() &&
9670 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
9671 op->set_skip_handle_cache();
9672 }
9673 // skip promotion when proxying a delete op
9674 if (m->ops.size() == 1) {
9675 op->set_skip_promote();
9676 }
9677 break;
9678
9679 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
9680 case CEPH_OSD_OP_CACHE_FLUSH:
9681 case CEPH_OSD_OP_CACHE_EVICT:
9682 // If try_flush/flush/evict is the only op, can skip handle cache.
9683 if (m->ops.size() == 1) {
9684 op->set_skip_handle_cache();
9685 }
9686 break;
9687
9688 case CEPH_OSD_OP_READ:
9689 case CEPH_OSD_OP_SYNC_READ:
9690 case CEPH_OSD_OP_SPARSE_READ:
9691 case CEPH_OSD_OP_CHECKSUM:
9692 case CEPH_OSD_OP_WRITEFULL:
9693 if (m->ops.size() == 1 &&
9694 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
9695 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
9696 op->set_skip_promote();
9697 }
9698 break;
9699
9700 // force promotion when pinning an object in the cache tier
9701 case CEPH_OSD_OP_CACHE_PIN:
9702 op->set_promote();
9703 break;
9704
9705 default:
9706 break;
9707 }
9708 }
9709
9710 if (op->rmw_flags == 0)
9711 return -EINVAL;
9712
9713 return 0;
9714 }
9715
9716 void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
9717 for (list<PG*>::iterator i = peering_queue.begin();
9718 i != peering_queue.end() &&
9719 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
9720 ) {
9721 if (in_use.count(*i)) {
9722 ++i;
9723 } else {
9724 out->push_back(*i);
9725 peering_queue.erase(i++);
9726 }
9727 }
9728 in_use.insert(out->begin(), out->end());
9729 }
9730
9731 // =============================================================
9732
9733 #undef dout_context
9734 #define dout_context osd->cct
9735 #undef dout_prefix
9736 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
9737
9738 void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
9739 {
9740 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
9741 auto sdata = shard_list[shard_index];
9742 bool queued = false;
9743 unsigned pushes_to_free = 0;
9744 {
9745 Mutex::Locker l(sdata->sdata_op_ordering_lock);
9746 auto p = sdata->pg_slots.find(pgid);
9747 if (p != sdata->pg_slots.end()) {
9748 dout(20) << __func__ << " " << pgid
9749 << " to_process " << p->second.to_process
9750 << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
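// requeue in reverse so that _enqueue_front restores the original order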
9751 for (auto i = p->second.to_process.rbegin();
9752 i != p->second.to_process.rend();
9753 ++i) {
9754 sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
9755 }
9756 for (auto& q : p->second.to_process) {
9757 pushes_to_free += q.get_reserved_pushes();
9758 }
9759 p->second.to_process.clear();
9760 p->second.waiting_for_pg = false;
9761 ++p->second.requeue_seq;
9762 queued = true;
9763 }
9764 }
9765 if (pushes_to_free > 0) {
9766 osd->service.release_reserved_pushes(pushes_to_free);
9767 }
9768 if (queued) {
9769 sdata->sdata_lock.Lock();
9770 sdata->sdata_cond.SignalOne();
9771 sdata->sdata_lock.Unlock();
9772 }
9773 }
9774
9775 void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
9776 {
9777 unsigned pushes_to_free = 0;
9778 for (auto sdata : shard_list) {
9779 Mutex::Locker l(sdata->sdata_op_ordering_lock);
9780 sdata->waiting_for_pg_osdmap = osdmap;
9781 auto p = sdata->pg_slots.begin();
9782 while (p != sdata->pg_slots.end()) {
9783 ShardData::pg_slot& slot = p->second;
9784 if (!slot.to_process.empty() && slot.num_running == 0) {
9785 if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
9786 dout(20) << __func__ << " " << p->first << " maps to us, keeping"
9787 << dendl;
9788 ++p;
9789 continue;
9790 }
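// only items at or below the new map's epoch are provably stale; anything
// newer may still become valid once we advance further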
9791 while (!slot.to_process.empty() &&
9792 slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
9793 auto& qi = slot.to_process.front();
9794 dout(20) << __func__ << " " << p->first
9795 << " item " << qi
9796 << " epoch " << qi.get_map_epoch()
9797 << " <= " << osdmap->get_epoch()
9798 << ", stale, dropping" << dendl;
9799 pushes_to_free += qi.get_reserved_pushes();
9800 slot.to_process.pop_front();
9801 }
9802 }
9803 if (slot.to_process.empty() &&
9804 slot.num_running == 0 &&
9805 !slot.pg) {
9806 dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
9807 p = sdata->pg_slots.erase(p);
9808 } else {
9809 ++p;
9810 }
9811 }
9812 }
9813 if (pushes_to_free > 0) {
9814 osd->service.release_reserved_pushes(pushes_to_free);
9815 }
9816 }
9817
9818 void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
9819 {
9820 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
9821 auto sdata = shard_list[shard_index];
9822 Mutex::Locker l(sdata->sdata_op_ordering_lock);
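// only called for PGs being removed (hence the assert below); dropping the
// cached pointer lets prune_pg_waiters erase the empty slot later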
9823 auto p = sdata->pg_slots.find(pgid);
9824 if (p != sdata->pg_slots.end()) {
9825 auto& slot = p->second;
9826 dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
9827 assert(!slot.pg || slot.pg->deleting);
9828 slot.pg = nullptr;
9829 }
9830 }
9831
9832 void OSD::ShardedOpWQ::clear_pg_slots()
9833 {
9834 for (auto sdata : shard_list) {
9835 Mutex::Locker l(sdata->sdata_op_ordering_lock);
9836 sdata->pg_slots.clear();
9837 sdata->waiting_for_pg_osdmap.reset();
9838 // don't bother with reserved pushes; we are shutting down
9839 }
9840 }
9841
9842 #undef dout_prefix
9843 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
9844
9845 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
9846 {
9847 uint32_t shard_index = thread_index % num_shards;
9848 ShardData *sdata = shard_list[shard_index];
9849 assert(NULL != sdata);
9850
9851 // peek at spg_t
9852 sdata->sdata_op_ordering_lock.Lock();
9853 if (sdata->pqueue->empty()) {
9854 dout(20) << __func__ << " empty q, waiting" << dendl;
9855 // optimistically sleep a moment; maybe another work item will come along.
9856 sdata->sdata_op_ordering_lock.Unlock();
9857 osd->cct->get_heartbeat_map()->reset_timeout(hb,
9858 osd->cct->_conf->threadpool_default_timeout, 0);
9859 sdata->sdata_lock.Lock();
9860 sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
9861 utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
9862 sdata->sdata_lock.Unlock();
9863 sdata->sdata_op_ordering_lock.Lock();
9864 if (sdata->pqueue->empty()) {
9865 sdata->sdata_op_ordering_lock.Unlock();
9866 return;
9867 }
9868 }
9869 pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
9870 if (osd->is_stopping()) {
9871 sdata->sdata_op_ordering_lock.Unlock();
9872 return; // OSD shutdown, discard.
9873 }
9874 PGRef pg;
9875 uint64_t requeue_seq;
9876 {
9877 auto& slot = sdata->pg_slots[item.first];
9878 dout(30) << __func__ << " " << item.first
9879 << " to_process " << slot.to_process
9880 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
9881 slot.to_process.push_back(item.second);
9882 // note the requeue seq now...
9883 requeue_seq = slot.requeue_seq;
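// if wake_pg_waiters bumps requeue_seq while we drop the lock below, we
// must retry rather than process a stale item (re-checked after relocking)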
9884 if (slot.waiting_for_pg) {
9885 // save ourselves a bit of effort
9886 dout(20) << __func__ << " " << item.first << " item " << item.second
9887 << " queued, waiting_for_pg" << dendl;
9888 sdata->sdata_op_ordering_lock.Unlock();
9889 return;
9890 }
9891 pg = slot.pg;
9892 dout(20) << __func__ << " " << item.first << " item " << item.second
9893 << " queued" << dendl;
9894 ++slot.num_running;
9895 }
9896 sdata->sdata_op_ordering_lock.Unlock();
9897
9898 osd->service.maybe_inject_dispatch_delay();
9899
9900 // [lookup +] lock pg (if we have it)
9901 if (!pg) {
9902 pg = osd->_lookup_lock_pg(item.first);
9903 } else {
9904 pg->lock();
9905 }
9906
9907 osd->service.maybe_inject_dispatch_delay();
9908
9909 boost::optional<PGQueueable> qi;
9910
9911 // we don't use a Mutex::Locker here because of the
9912 // osd->service.release_reserved_pushes() call below
9913 sdata->sdata_op_ordering_lock.Lock();
9914
9915 auto q = sdata->pg_slots.find(item.first);
9916 assert(q != sdata->pg_slots.end());
9917 auto& slot = q->second;
9918 --slot.num_running;
9919
9920 if (slot.to_process.empty()) {
9921 // raced with wake_pg_waiters or prune_pg_waiters
9922 dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
9923 if (pg) {
9924 pg->unlock();
9925 }
9926 sdata->sdata_op_ordering_lock.Unlock();
9927 return;
9928 }
9929 if (requeue_seq != slot.requeue_seq) {
9930 dout(20) << __func__ << " " << item.first
9931 << " requeue_seq " << slot.requeue_seq << " > our "
9932 << requeue_seq << ", we raced with wake_pg_waiters"
9933 << dendl;
9934 if (pg) {
9935 pg->unlock();
9936 }
9937 sdata->sdata_op_ordering_lock.Unlock();
9938 return;
9939 }
9940 if (pg && !slot.pg && !pg->deleting) {
9941 dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
9942 slot.pg = pg;
9943 }
9944 dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
9945 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
9946
9947 // make sure we're not already waiting for this pg
9948 if (slot.waiting_for_pg) {
9949 dout(20) << __func__ << " " << item.first << " item " << item.second
9950 << " slot is waiting_for_pg" << dendl;
9951 if (pg) {
9952 pg->unlock();
9953 }
9954 sdata->sdata_op_ordering_lock.Unlock();
9955 return;
9956 }
9957
9958 // take next item
9959 qi = slot.to_process.front();
9960 slot.to_process.pop_front();
9961 dout(20) << __func__ << " " << item.first << " item " << *qi
9962 << " pg " << pg << dendl;
9963
9964 if (!pg) {
9965 // should this pg shard exist on this osd in this (or a later) epoch?
9966 OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
9967 if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
9968 dout(20) << __func__ << " " << item.first
9969 << " no pg, should exist, will wait on " << *qi << dendl;
9970 slot.to_process.push_front(*qi);
9971 slot.waiting_for_pg = true;
9972 } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
9973 dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
9974 << qi->get_map_epoch() << " > " << osdmap->get_epoch()
9975 << ", will wait on " << *qi << dendl;
9976 slot.to_process.push_front(*qi);
9977 slot.waiting_for_pg = true;
9978 } else {
9979 dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
9980 << " dropping " << *qi << dendl;
9981 // share map with client?
9982 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
9983 Session *session = static_cast<Session *>(
9984 (*_op)->get_req()->get_connection()->get_priv());
9985 if (session) {
9986 osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
9987 session->put();
9988 }
9989 }
9990 unsigned pushes_to_free = qi->get_reserved_pushes();
9991 if (pushes_to_free > 0) {
9992 sdata->sdata_op_ordering_lock.Unlock();
9993 osd->service.release_reserved_pushes(pushes_to_free);
9994 return;
9995 }
9996 }
9997 sdata->sdata_op_ordering_lock.Unlock();
9998 return;
9999 }
10000 sdata->sdata_op_ordering_lock.Unlock();
10001
10002
10003 // osd_opwq_process marks the point at which an operation has been dequeued
10004 // and will begin to be handled by a worker thread.
10005 {
10006 #ifdef WITH_LTTNG
10007 osd_reqid_t reqid;
10008 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10009 reqid = (*_op)->get_reqid();
10010 }
10011 #endif
10012 tracepoint(osd, opwq_process_start, reqid.name._type,
10013 reqid.name._num, reqid.tid, reqid.inc);
10014 }
10015
10016 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10017 Formatter *f = Formatter::create("json");
10018 f->open_object_section("q");
10019 dump(f);
10020 f->close_section();
10021 f->flush(*_dout);
10022 delete f;
10023 *_dout << dendl;
10024
10025 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10026 suicide_interval);
10027 qi->run(osd, pg, tp_handle);
10028
10029 {
10030 #ifdef WITH_LTTNG
10031 osd_reqid_t reqid;
10032 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10033 reqid = (*_op)->get_reqid();
10034 }
10035 #endif
10036 tracepoint(osd, opwq_process_finish, reqid.name._type,
10037 reqid.name._num, reqid.tid, reqid.inc);
10038 }
10039
10040 pg->unlock();
10041 }
10042
10043 void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
10044 uint32_t shard_index =
10045 item.first.hash_to_shard(shard_list.size());
10046
10047 ShardData* sdata = shard_list[shard_index];
10048 assert (NULL != sdata);
10049 unsigned priority = item.second.get_priority();
10050 unsigned cost = item.second.get_cost();
10051 sdata->sdata_op_ordering_lock.Lock();
10052
10053 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
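// items at or above the cutoff bypass cost-based queueing (strict
// priority); the rest are dequeued in cost-weighted priority order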
10054 if (priority >= osd->op_prio_cutoff)
10055 sdata->pqueue->enqueue_strict(
10056 item.second.get_owner(), priority, item);
10057 else
10058 sdata->pqueue->enqueue(
10059 item.second.get_owner(),
10060 priority, cost, item);
10061 sdata->sdata_op_ordering_lock.Unlock();
10062
10063 sdata->sdata_lock.Lock();
10064 sdata->sdata_cond.SignalOne();
10065 sdata->sdata_lock.Unlock();
10066
10067 }
10068
10069 void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
10070 {
10071 uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
10072 ShardData* sdata = shard_list[shard_index];
10073 assert (NULL != sdata);
10074 sdata->sdata_op_ordering_lock.Lock();
10075 auto p = sdata->pg_slots.find(item.first);
10076 if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
10077 // we may be racing with _process, which has dequeued a new item
10078 // from pqueue, put it on to_process, and is now busy taking the
10079 // pg lock. ensure this old requeued item is ordered before any
10080 // such newer item in to_process.
10081 p->second.to_process.push_front(item.second);
10082 item.second = p->second.to_process.back();
10083 p->second.to_process.pop_back();
10084 dout(20) << __func__ << " " << item.first
10085 << " " << p->second.to_process.front()
10086 << " shuffled w/ " << item.second << dendl;
10087 } else {
10088 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10089 }
10090 sdata->_enqueue_front(item, osd->op_prio_cutoff);
10091 sdata->sdata_op_ordering_lock.Unlock();
10092 sdata->sdata_lock.Lock();
10093 sdata->sdata_cond.SignalOne();
10094 sdata->sdata_lock.Unlock();
10095 }
10096
10097 namespace ceph {
10098 namespace osd_cmds {
10099
10100 int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
10101 {
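// backs the OSD's "heap <heapcmd>" admin command (e.g. "ceph tell osd.N
// heap stats"); only meaningful when linked against tcmalloc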
10102 if (!ceph_using_tcmalloc()) {
10103 os << "could not issue heap profiler command -- not using tcmalloc!";
10104 return -EOPNOTSUPP;
10105 }
10106
10107 string cmd;
10108 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
10109 os << "unable to get value for parameter \"heapcmd\"";
10110 return -EINVAL;
10111 }
10112
10113 std::vector<std::string> cmd_vec;
10114 get_str_vec(cmd, cmd_vec);
10115
10116 ceph_heap_profiler_handle_command(cmd_vec, os);
10117
10118 return 0;
10119 }
10120
10121 }} // namespace ceph::osd_cmds
10122