]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
bump version to 12.1.4-pve1
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15#include "acconfig.h"
16
17#include <fstream>
18#include <iostream>
19#include <errno.h>
20#include <sys/stat.h>
21#include <signal.h>
22#include <ctype.h>
23#include <boost/scoped_ptr.hpp>
24
25#ifdef HAVE_SYS_PARAM_H
26#include <sys/param.h>
27#endif
28
29#ifdef HAVE_SYS_MOUNT_H
30#include <sys/mount.h>
31#endif
32
33#include "osd/PG.h"
34
35#include "include/types.h"
36#include "include/compat.h"
37
38#include "OSD.h"
39#include "OSDMap.h"
40#include "Watch.h"
41#include "osdc/Objecter.h"
42
43#include "common/errno.h"
44#include "common/ceph_argparse.h"
224ce89b 45#include "common/ceph_time.h"
7c673cae
FG
46#include "common/version.h"
47#include "common/io_priority.h"
48
49#include "os/ObjectStore.h"
50#ifdef HAVE_LIBFUSE
51#include "os/FuseStore.h"
52#endif
53
54#include "PrimaryLogPG.h"
55
56
57#include "msg/Messenger.h"
58#include "msg/Message.h"
59
60#include "mon/MonClient.h"
61
62#include "messages/MLog.h"
63
64#include "messages/MGenericMessage.h"
7c673cae
FG
65#include "messages/MOSDPing.h"
66#include "messages/MOSDFailure.h"
67#include "messages/MOSDMarkMeDown.h"
68#include "messages/MOSDFull.h"
69#include "messages/MOSDOp.h"
70#include "messages/MOSDOpReply.h"
71#include "messages/MOSDBackoff.h"
72#include "messages/MOSDBeacon.h"
73#include "messages/MOSDRepOp.h"
74#include "messages/MOSDRepOpReply.h"
75#include "messages/MOSDBoot.h"
76#include "messages/MOSDPGTemp.h"
77
78#include "messages/MOSDMap.h"
79#include "messages/MMonGetOSDMap.h"
80#include "messages/MOSDPGNotify.h"
81#include "messages/MOSDPGQuery.h"
82#include "messages/MOSDPGLog.h"
83#include "messages/MOSDPGRemove.h"
84#include "messages/MOSDPGInfo.h"
85#include "messages/MOSDPGCreate.h"
86#include "messages/MOSDPGTrim.h"
87#include "messages/MOSDPGScan.h"
88#include "messages/MOSDPGBackfill.h"
89#include "messages/MBackfillReserve.h"
90#include "messages/MRecoveryReserve.h"
c07f9fc5 91#include "messages/MOSDForceRecovery.h"
7c673cae
FG
92#include "messages/MOSDECSubOpWrite.h"
93#include "messages/MOSDECSubOpWriteReply.h"
94#include "messages/MOSDECSubOpRead.h"
95#include "messages/MOSDECSubOpReadReply.h"
96#include "messages/MOSDPGCreated.h"
97#include "messages/MOSDPGUpdateLogMissing.h"
98#include "messages/MOSDPGUpdateLogMissingReply.h"
99
100#include "messages/MOSDAlive.h"
101
102#include "messages/MOSDScrub.h"
103#include "messages/MOSDScrubReserve.h"
104#include "messages/MOSDRepScrub.h"
105
106#include "messages/MMonCommand.h"
107#include "messages/MCommand.h"
108#include "messages/MCommandReply.h"
109
110#include "messages/MPGStats.h"
111#include "messages/MPGStatsAck.h"
112
113#include "messages/MWatchNotify.h"
114#include "messages/MOSDPGPush.h"
115#include "messages/MOSDPGPushReply.h"
116#include "messages/MOSDPGPull.h"
117
118#include "common/perf_counters.h"
119#include "common/Timer.h"
120#include "common/LogClient.h"
121#include "common/AsyncReserver.h"
122#include "common/HeartbeatMap.h"
123#include "common/admin_socket.h"
124#include "common/ceph_context.h"
125
126#include "global/signal_handler.h"
127#include "global/pidfile.h"
128
129#include "include/color.h"
130#include "perfglue/cpu_profiler.h"
131#include "perfglue/heap_profiler.h"
132
133#include "osd/OpRequest.h"
134
135#include "auth/AuthAuthorizeHandler.h"
136#include "auth/RotatingKeyRing.h"
137#include "common/errno.h"
138
139#include "objclass/objclass.h"
140
141#include "common/cmdparse.h"
142#include "include/str_list.h"
143#include "include/util.h"
144
145#include "include/assert.h"
146#include "common/config.h"
147#include "common/EventTrace.h"
148
149#ifdef WITH_LTTNG
150#define TRACEPOINT_DEFINE
151#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
152#include "tracing/osd.h"
153#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
154#undef TRACEPOINT_DEFINE
155#else
156#define tracepoint(...)
157#endif
158
159#define dout_context cct
160#define dout_subsys ceph_subsys_osd
161#undef dout_prefix
162#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
163
224ce89b 164
7c673cae
FG
165const double OSD::OSD_TICK_INTERVAL = 1.0;
166
167static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
168 return *_dout << "osd." << whoami << " " << epoch << " ";
169}
170
7c673cae
FG
171//Initial features in new superblock.
172//Features here are also automatically upgraded
173CompatSet OSD::get_osd_initial_compat_set() {
174 CompatSet::FeatureSet ceph_osd_feature_compat;
175 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
176 CompatSet::FeatureSet ceph_osd_feature_incompat;
177 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
178 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
179 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
180 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
181 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
182 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
183 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
184 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
185 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
186 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
c07f9fc5 191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
7c673cae
FG
192 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
193 ceph_osd_feature_incompat);
194}
195
196//Features are added here that this OSD supports.
197CompatSet OSD::get_osd_compat_set() {
198 CompatSet compat = get_osd_initial_compat_set();
199 //Any features here can be set in code, but not in initial superblock
200 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
201 return compat;
202}
203
204OSDService::OSDService(OSD *osd) :
205 osd(osd),
206 cct(osd->cct),
207 meta_osr(new ObjectStore::Sequencer("meta")),
208 whoami(osd->whoami), store(osd->store),
209 log_client(osd->log_client), clog(osd->clog),
210 pg_recovery_stats(osd->pg_recovery_stats),
211 cluster_messenger(osd->cluster_messenger),
212 client_messenger(osd->client_messenger),
213 logger(osd->logger),
214 recoverystate_perf(osd->recoverystate_perf),
215 monc(osd->monc),
216 peering_wq(osd->peering_wq),
217 recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
218 &osd->disk_tp),
219 class_handler(osd->class_handler),
220 pg_epoch_lock("OSDService::pg_epoch_lock"),
221 publish_lock("OSDService::publish_lock"),
222 pre_publish_lock("OSDService::pre_publish_lock"),
223 max_oldest_map(0),
224 peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
225 sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
226 scrubs_active(0),
227 agent_lock("OSDService::agent_lock"),
228 agent_valid_iterator(false),
229 agent_ops(0),
230 flush_mode_high_count(0),
231 agent_active(true),
232 agent_thread(this),
233 agent_stop_flag(false),
234 agent_timer_lock("OSDService::agent_timer_lock"),
235 agent_timer(osd->client_messenger->cct, agent_timer_lock),
236 last_recalibrate(ceph_clock_now()),
237 promote_max_objects(0),
238 promote_max_bytes(0),
239 objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
240 objecter_finisher(osd->client_messenger->cct),
241 watch_lock("OSDService::watch_lock"),
242 watch_timer(osd->client_messenger->cct, watch_lock),
243 next_notif_id(0),
244 recovery_request_lock("OSDService::recovery_request_lock"),
245 recovery_request_timer(cct, recovery_request_lock, false),
31f18b77
FG
246 recovery_sleep_lock("OSDService::recovery_sleep_lock"),
247 recovery_sleep_timer(cct, recovery_sleep_lock, false),
7c673cae
FG
248 reserver_finisher(cct),
249 local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
250 cct->_conf->osd_min_recovery_priority),
251 remote_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
252 cct->_conf->osd_min_recovery_priority),
253 pg_temp_lock("OSDService::pg_temp_lock"),
254 snap_sleep_lock("OSDService::snap_sleep_lock"),
255 snap_sleep_timer(
256 osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
31f18b77
FG
257 scrub_sleep_lock("OSDService::scrub_sleep_lock"),
258 scrub_sleep_timer(
259 osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
7c673cae
FG
260 snap_reserver(&reserver_finisher,
261 cct->_conf->osd_max_trimming_pgs),
262 recovery_lock("OSDService::recovery_lock"),
263 recovery_ops_active(0),
264 recovery_ops_reserved(0),
265 recovery_paused(false),
266 map_cache_lock("OSDService::map_cache_lock"),
267 map_cache(cct, cct->_conf->osd_map_cache_size),
268 map_bl_cache(cct->_conf->osd_map_cache_size),
269 map_bl_inc_cache(cct->_conf->osd_map_cache_size),
270 in_progress_split_lock("OSDService::in_progress_split_lock"),
271 stat_lock("OSDService::stat_lock"),
272 full_status_lock("OSDService::full_status_lock"),
273 cur_state(NONE),
274 cur_ratio(0),
275 epoch_lock("OSDService::epoch_lock"),
276 boot_epoch(0), up_epoch(0), bind_epoch(0),
277 is_stopping_lock("OSDService::is_stopping_lock")
278#ifdef PG_DEBUG_REFS
279 , pgid_lock("OSDService::pgid_lock")
280#endif
281{
282 objecter->init();
283}
284
285OSDService::~OSDService()
286{
287 delete objecter;
288}
289
31f18b77
FG
290
291
292#ifdef PG_DEBUG_REFS
293void OSDService::add_pgid(spg_t pgid, PG *pg){
294 Mutex::Locker l(pgid_lock);
295 if (!pgid_tracker.count(pgid)) {
296 live_pgs[pgid] = pg;
297 }
298 pgid_tracker[pgid]++;
299}
300void OSDService::remove_pgid(spg_t pgid, PG *pg)
301{
302 Mutex::Locker l(pgid_lock);
303 assert(pgid_tracker.count(pgid));
304 assert(pgid_tracker[pgid] > 0);
305 pgid_tracker[pgid]--;
306 if (pgid_tracker[pgid] == 0) {
307 pgid_tracker.erase(pgid);
308 live_pgs.erase(pgid);
309 }
310}
311void OSDService::dump_live_pgids()
312{
313 Mutex::Locker l(pgid_lock);
314 derr << "live pgids:" << dendl;
315 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
316 i != pgid_tracker.cend();
317 ++i) {
318 derr << "\t" << *i << dendl;
319 live_pgs[i->first]->dump_live_ids();
320 }
321}
322#endif
323
324
7c673cae
FG
325void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
326{
327 for (set<spg_t>::const_iterator i = children.begin();
328 i != children.end();
329 ++i) {
330 dout(10) << __func__ << ": Starting split on pg " << *i
331 << ", parent=" << parent << dendl;
332 assert(!pending_splits.count(*i));
333 assert(!in_progress_splits.count(*i));
334 pending_splits.insert(make_pair(*i, parent));
335
336 assert(!rev_pending_splits[parent].count(*i));
337 rev_pending_splits[parent].insert(*i);
338 }
339}
340
341void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
342{
343 Mutex::Locker l(in_progress_split_lock);
344 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
345 assert(piter != rev_pending_splits.end());
346 for (set<spg_t>::const_iterator i = children.begin();
347 i != children.end();
348 ++i) {
349 assert(piter->second.count(*i));
350 assert(pending_splits.count(*i));
351 assert(!in_progress_splits.count(*i));
352 assert(pending_splits[*i] == parent);
353
354 pending_splits.erase(*i);
355 piter->second.erase(*i);
356 in_progress_splits.insert(*i);
357 }
358 if (piter->second.empty())
359 rev_pending_splits.erase(piter);
360}
361
362void OSDService::cancel_pending_splits_for_parent(spg_t parent)
363{
364 Mutex::Locker l(in_progress_split_lock);
365 _cancel_pending_splits_for_parent(parent);
366}
367
368void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
369{
370 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
371 if (piter == rev_pending_splits.end())
372 return;
373
374 for (set<spg_t>::iterator i = piter->second.begin();
375 i != piter->second.end();
376 ++i) {
377 assert(pending_splits.count(*i));
378 assert(!in_progress_splits.count(*i));
379 pending_splits.erase(*i);
380 dout(10) << __func__ << ": Completing split on pg " << *i
381 << " for parent: " << parent << dendl;
382 _cancel_pending_splits_for_parent(*i);
383 }
384 rev_pending_splits.erase(piter);
385}
386
387void OSDService::_maybe_split_pgid(OSDMapRef old_map,
388 OSDMapRef new_map,
389 spg_t pgid)
390{
391 assert(old_map->have_pg_pool(pgid.pool()));
392 int old_pgnum = old_map->get_pg_num(pgid.pool());
393 if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
394 set<spg_t> children;
395 if (pgid.is_split(old_pgnum,
396 new_map->get_pg_num(pgid.pool()), &children)) {
397 _start_split(pgid, children); }
398 } else {
399 assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
400 }
401}
402
403void OSDService::init_splits_between(spg_t pgid,
404 OSDMapRef frommap,
405 OSDMapRef tomap)
406{
407 // First, check whether we can avoid this potentially expensive check
408 if (tomap->have_pg_pool(pgid.pool()) &&
409 pgid.is_split(
410 frommap->get_pg_num(pgid.pool()),
411 tomap->get_pg_num(pgid.pool()),
412 NULL)) {
413 // Ok, a split happened, so we need to walk the osdmaps
414 set<spg_t> new_pgs; // pgs to scan on each map
415 new_pgs.insert(pgid);
416 OSDMapRef curmap(get_map(frommap->get_epoch()));
417 for (epoch_t e = frommap->get_epoch() + 1;
418 e <= tomap->get_epoch();
419 ++e) {
420 OSDMapRef nextmap(try_get_map(e));
421 if (!nextmap)
422 continue;
423 set<spg_t> even_newer_pgs; // pgs added in this loop
424 for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
425 set<spg_t> split_pgs;
426 if (i->is_split(curmap->get_pg_num(i->pool()),
427 nextmap->get_pg_num(i->pool()),
428 &split_pgs)) {
429 start_split(*i, split_pgs);
430 even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
431 }
432 }
433 new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
434 curmap = nextmap;
435 }
436 assert(curmap == tomap); // we must have had both frommap and tomap
437 }
438}
439
440void OSDService::expand_pg_num(OSDMapRef old_map,
441 OSDMapRef new_map)
442{
443 Mutex::Locker l(in_progress_split_lock);
444 for (set<spg_t>::iterator i = in_progress_splits.begin();
445 i != in_progress_splits.end();
446 ) {
447 if (!new_map->have_pg_pool(i->pool())) {
448 in_progress_splits.erase(i++);
449 } else {
450 _maybe_split_pgid(old_map, new_map, *i);
451 ++i;
452 }
453 }
454 for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
455 i != pending_splits.end();
456 ) {
457 if (!new_map->have_pg_pool(i->first.pool())) {
458 rev_pending_splits.erase(i->second);
459 pending_splits.erase(i++);
460 } else {
461 _maybe_split_pgid(old_map, new_map, i->first);
462 ++i;
463 }
464 }
465}
466
467bool OSDService::splitting(spg_t pgid)
468{
469 Mutex::Locker l(in_progress_split_lock);
470 return in_progress_splits.count(pgid) ||
471 pending_splits.count(pgid);
472}
473
474void OSDService::complete_split(const set<spg_t> &pgs)
475{
476 Mutex::Locker l(in_progress_split_lock);
477 for (set<spg_t>::const_iterator i = pgs.begin();
478 i != pgs.end();
479 ++i) {
480 dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
481 assert(!pending_splits.count(*i));
482 assert(in_progress_splits.count(*i));
483 in_progress_splits.erase(*i);
484 }
485}
486
487void OSDService::need_heartbeat_peer_update()
488{
489 osd->need_heartbeat_peer_update();
490}
491
492void OSDService::pg_stat_queue_enqueue(PG *pg)
493{
494 osd->pg_stat_queue_enqueue(pg);
495}
496
497void OSDService::pg_stat_queue_dequeue(PG *pg)
498{
499 osd->pg_stat_queue_dequeue(pg);
500}
501
502void OSDService::start_shutdown()
503{
504 {
505 Mutex::Locker l(agent_timer_lock);
506 agent_timer.shutdown();
507 }
31f18b77
FG
508
509 {
510 Mutex::Locker l(recovery_sleep_lock);
511 recovery_sleep_timer.shutdown();
512 }
7c673cae
FG
513}
514
31f18b77 515void OSDService::shutdown_reserver()
7c673cae
FG
516{
517 reserver_finisher.wait_for_empty();
518 reserver_finisher.stop();
31f18b77
FG
519}
520
521void OSDService::shutdown()
522{
7c673cae
FG
523 {
524 Mutex::Locker l(watch_lock);
525 watch_timer.shutdown();
526 }
527
528 objecter->shutdown();
529 objecter_finisher.wait_for_empty();
530 objecter_finisher.stop();
531
532 {
533 Mutex::Locker l(recovery_request_lock);
534 recovery_request_timer.shutdown();
535 }
536
537 {
538 Mutex::Locker l(snap_sleep_lock);
539 snap_sleep_timer.shutdown();
540 }
541
31f18b77
FG
542 {
543 Mutex::Locker l(scrub_sleep_lock);
544 scrub_sleep_timer.shutdown();
545 }
546
7c673cae
FG
547 osdmap = OSDMapRef();
548 next_osdmap = OSDMapRef();
549}
550
551void OSDService::init()
552{
553 reserver_finisher.start();
554 objecter_finisher.start();
555 objecter->set_client_incarnation(0);
556
557 // deprioritize objecter in daemonperf output
558 objecter->get_logger()->set_prio_adjust(-3);
559
560 watch_timer.init();
561 agent_timer.init();
562 snap_sleep_timer.init();
31f18b77 563 scrub_sleep_timer.init();
7c673cae
FG
564
565 agent_thread.create("osd_srv_agent");
566
567 if (cct->_conf->osd_recovery_delay_start)
568 defer_recovery(cct->_conf->osd_recovery_delay_start);
569}
570
571void OSDService::final_init()
572{
573 objecter->start(osdmap.get());
574}
575
576void OSDService::activate_map()
577{
578 // wake/unwake the tiering agent
579 agent_lock.Lock();
580 agent_active =
581 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
582 osd->is_active();
583 agent_cond.Signal();
584 agent_lock.Unlock();
585}
586
587class AgentTimeoutCB : public Context {
588 PGRef pg;
589public:
590 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
591 void finish(int) override {
592 pg->agent_choose_mode_restart();
593 }
594};
595
596void OSDService::agent_entry()
597{
598 dout(10) << __func__ << " start" << dendl;
599 agent_lock.Lock();
600
601 while (!agent_stop_flag) {
602 if (agent_queue.empty()) {
603 dout(20) << __func__ << " empty queue" << dendl;
604 agent_cond.Wait(agent_lock);
605 continue;
606 }
607 uint64_t level = agent_queue.rbegin()->first;
608 set<PGRef>& top = agent_queue.rbegin()->second;
609 dout(10) << __func__
610 << " tiers " << agent_queue.size()
611 << ", top is " << level
612 << " with pgs " << top.size()
613 << ", ops " << agent_ops << "/"
614 << cct->_conf->osd_agent_max_ops
615 << (agent_active ? " active" : " NOT ACTIVE")
616 << dendl;
617 dout(20) << __func__ << " oids " << agent_oids << dendl;
618 int max = cct->_conf->osd_agent_max_ops - agent_ops;
619 int agent_flush_quota = max;
620 if (!flush_mode_high_count)
621 agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
622 if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
623 agent_cond.Wait(agent_lock);
624 continue;
625 }
626
627 if (!agent_valid_iterator || agent_queue_pos == top.end()) {
628 agent_queue_pos = top.begin();
629 agent_valid_iterator = true;
630 }
631 PGRef pg = *agent_queue_pos;
632 dout(10) << "high_count " << flush_mode_high_count
633 << " agent_ops " << agent_ops
634 << " flush_quota " << agent_flush_quota << dendl;
635 agent_lock.Unlock();
636 if (!pg->agent_work(max, agent_flush_quota)) {
637 dout(10) << __func__ << " " << pg->get_pgid()
638 << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
639 << " seconds" << dendl;
640
641 osd->logger->inc(l_osd_tier_delay);
642 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
643 agent_timer_lock.Lock();
644 Context *cb = new AgentTimeoutCB(pg);
645 agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
646 agent_timer_lock.Unlock();
647 }
648 agent_lock.Lock();
649 }
650 agent_lock.Unlock();
651 dout(10) << __func__ << " finish" << dendl;
652}
653
654void OSDService::agent_stop()
655{
656 {
657 Mutex::Locker l(agent_lock);
658
659 // By this time all ops should be cancelled
660 assert(agent_ops == 0);
661 // By this time all PGs are shutdown and dequeued
662 if (!agent_queue.empty()) {
663 set<PGRef>& top = agent_queue.rbegin()->second;
664 derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
665 assert(0 == "agent queue not empty");
666 }
667
668 agent_stop_flag = true;
669 agent_cond.Signal();
670 }
671 agent_thread.join();
672}
673
674// -------------------------------------
675
676void OSDService::promote_throttle_recalibrate()
677{
678 utime_t now = ceph_clock_now();
679 double dur = now - last_recalibrate;
680 last_recalibrate = now;
681 unsigned prob = promote_probability_millis;
682
683 uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
684 uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
685
686 unsigned min_prob = 1;
687
688 uint64_t attempts, obj, bytes;
689 promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
690 dout(10) << __func__ << " " << attempts << " attempts, promoted "
691 << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
692 << target_obj_sec << " obj/sec or "
693 << pretty_si_t(target_bytes_sec) << " bytes/sec"
694 << dendl;
695
696 // calculate what the probability *should* be, given the targets
697 unsigned new_prob;
698 if (attempts && dur > 0) {
699 uint64_t avg_size = 1;
700 if (obj)
701 avg_size = MAX(bytes / obj, 1);
702 unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
703 unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
704 / (double)attempts;
705 dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
706 << avg_size << dendl;
707 if (target_obj_sec && target_bytes_sec)
708 new_prob = MIN(po, pb);
709 else if (target_obj_sec)
710 new_prob = po;
711 else if (target_bytes_sec)
712 new_prob = pb;
713 else
714 new_prob = 1000;
715 } else {
716 new_prob = 1000;
717 }
718 dout(20) << __func__ << " new_prob " << new_prob << dendl;
719
720 // correct for persistent skew between target rate and actual rate, adjust
721 double ratio = 1.0;
722 unsigned actual = 0;
723 if (attempts && obj) {
724 actual = obj * 1000 / attempts;
725 ratio = (double)actual / (double)prob;
726 new_prob = (double)new_prob / ratio;
727 }
728 new_prob = MAX(new_prob, min_prob);
729 new_prob = MIN(new_prob, 1000);
730
731 // adjust
732 prob = (prob + new_prob) / 2;
733 prob = MAX(prob, min_prob);
734 prob = MIN(prob, 1000);
735 dout(10) << __func__ << " actual " << actual
736 << ", actual/prob ratio " << ratio
737 << ", adjusted new_prob " << new_prob
738 << ", prob " << promote_probability_millis << " -> " << prob
739 << dendl;
740 promote_probability_millis = prob;
741
742 // set hard limits for this interval to mitigate stampedes
743 promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
744 promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
745}
746
747// -------------------------------------
748
749float OSDService::get_failsafe_full_ratio()
750{
751 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
752 if (full_ratio > 1.0) full_ratio /= 100.0;
753 return full_ratio;
754}
755
224ce89b 756void OSDService::check_full_status(float ratio)
7c673cae
FG
757{
758 Mutex::Locker l(full_status_lock);
759
7c673cae
FG
760 cur_ratio = ratio;
761
762 // The OSDMap ratios take precendence. So if the failsafe is .95 and
763 // the admin sets the cluster full to .96, the failsafe moves up to .96
764 // too. (Not that having failsafe == full is ideal, but it's better than
765 // dropping writes before the clusters appears full.)
766 OSDMapRef osdmap = get_osdmap();
767 if (!osdmap || osdmap->get_epoch() == 0) {
768 cur_state = NONE;
769 return;
770 }
771 float nearfull_ratio = osdmap->get_nearfull_ratio();
772 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
773 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
774 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
775
31f18b77 776 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae
FG
777 // use the failsafe for nearfull and full; the mon isn't using the
778 // flags anyway because we're mid-upgrade.
779 full_ratio = failsafe_ratio;
780 backfillfull_ratio = failsafe_ratio;
781 nearfull_ratio = failsafe_ratio;
782 } else if (full_ratio <= 0 ||
783 backfillfull_ratio <= 0 ||
784 nearfull_ratio <= 0) {
785 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
786 // use failsafe flag. ick. the monitor did something wrong or the user
787 // did something stupid.
788 full_ratio = failsafe_ratio;
789 backfillfull_ratio = failsafe_ratio;
790 nearfull_ratio = failsafe_ratio;
791 }
792
793 string inject;
794 s_names new_state;
795 if (injectfull_state > NONE && injectfull) {
796 new_state = injectfull_state;
797 inject = "(Injected)";
798 } else if (ratio > failsafe_ratio) {
799 new_state = FAILSAFE;
800 } else if (ratio > full_ratio) {
801 new_state = FULL;
802 } else if (ratio > backfillfull_ratio) {
803 new_state = BACKFILLFULL;
804 } else if (ratio > nearfull_ratio) {
805 new_state = NEARFULL;
806 } else {
807 new_state = NONE;
808 }
809 dout(20) << __func__ << " cur ratio " << ratio
810 << ". nearfull_ratio " << nearfull_ratio
811 << ". backfillfull_ratio " << backfillfull_ratio
812 << ", full_ratio " << full_ratio
813 << ", failsafe_ratio " << failsafe_ratio
814 << ", new state " << get_full_state_name(new_state)
815 << " " << inject
816 << dendl;
817
818 // warn
819 if (cur_state != new_state) {
820 dout(10) << __func__ << " " << get_full_state_name(cur_state)
821 << " -> " << get_full_state_name(new_state) << dendl;
822 if (new_state == FAILSAFE) {
c07f9fc5 823 clog->error() << "full status failsafe engaged, dropping updates, now "
7c673cae
FG
824 << (int)roundf(ratio * 100) << "% full";
825 } else if (cur_state == FAILSAFE) {
c07f9fc5
FG
826 clog->error() << "full status failsafe disengaged, no longer dropping "
827 << "updates, now " << (int)roundf(ratio * 100) << "% full";
7c673cae
FG
828 }
829 cur_state = new_state;
830 }
831}
832
833bool OSDService::need_fullness_update()
834{
835 OSDMapRef osdmap = get_osdmap();
836 s_names cur = NONE;
837 if (osdmap->exists(whoami)) {
838 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
839 cur = FULL;
840 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
841 cur = BACKFILLFULL;
842 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
843 cur = NEARFULL;
844 }
845 }
846 s_names want = NONE;
847 if (is_full())
848 want = FULL;
849 else if (is_backfillfull())
850 want = BACKFILLFULL;
851 else if (is_nearfull())
852 want = NEARFULL;
853 return want != cur;
854}
855
856bool OSDService::_check_full(s_names type, ostream &ss) const
857{
858 Mutex::Locker l(full_status_lock);
859
860 if (injectfull && injectfull_state >= type) {
861 // injectfull is either a count of the number of times to return failsafe full
862 // or if -1 then always return full
863 if (injectfull > 0)
864 --injectfull;
865 ss << "Injected " << get_full_state_name(type) << " OSD ("
866 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
867 return true;
868 }
869
870 ss << "current usage is " << cur_ratio;
871 return cur_state >= type;
872}
873
874bool OSDService::check_failsafe_full(ostream &ss) const
875{
876 return _check_full(FAILSAFE, ss);
877}
878
879bool OSDService::check_full(ostream &ss) const
880{
881 return _check_full(FULL, ss);
882}
883
884bool OSDService::check_backfill_full(ostream &ss) const
885{
886 return _check_full(BACKFILLFULL, ss);
887}
888
889bool OSDService::check_nearfull(ostream &ss) const
890{
891 return _check_full(NEARFULL, ss);
892}
893
894bool OSDService::is_failsafe_full() const
895{
896 Mutex::Locker l(full_status_lock);
897 return cur_state == FAILSAFE;
898}
899
900bool OSDService::is_full() const
901{
902 Mutex::Locker l(full_status_lock);
903 return cur_state >= FULL;
904}
905
906bool OSDService::is_backfillfull() const
907{
908 Mutex::Locker l(full_status_lock);
909 return cur_state >= BACKFILLFULL;
910}
911
912bool OSDService::is_nearfull() const
913{
914 Mutex::Locker l(full_status_lock);
915 return cur_state >= NEARFULL;
916}
917
918void OSDService::set_injectfull(s_names type, int64_t count)
919{
920 Mutex::Locker l(full_status_lock);
921 injectfull_state = type;
922 injectfull = count;
923}
924
224ce89b 925osd_stat_t OSDService::set_osd_stat(const struct store_statfs_t &stbuf,
35e4c445
FG
926 vector<int>& hb_peers,
927 int num_pgs)
7c673cae 928{
224ce89b
WB
929 uint64_t bytes = stbuf.total;
930 uint64_t used = bytes - stbuf.available;
931 uint64_t avail = stbuf.available;
7c673cae 932
224ce89b
WB
933 osd->logger->set(l_osd_stat_bytes, bytes);
934 osd->logger->set(l_osd_stat_bytes_used, used);
935 osd->logger->set(l_osd_stat_bytes_avail, avail);
7c673cae 936
224ce89b
WB
937 {
938 Mutex::Locker l(stat_lock);
939 osd_stat.hb_peers.swap(hb_peers);
940 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
941 osd_stat.kb = bytes >> 10;
942 osd_stat.kb_used = used >> 10;
943 osd_stat.kb_avail = avail >> 10;
35e4c445 944 osd_stat.num_pgs = num_pgs;
224ce89b
WB
945 return osd_stat;
946 }
947}
7c673cae 948
224ce89b
WB
949void OSDService::update_osd_stat(vector<int>& hb_peers)
950{
951 // load osd stats first
7c673cae
FG
952 struct store_statfs_t stbuf;
953 int r = osd->store->statfs(&stbuf);
954 if (r < 0) {
955 derr << "statfs() failed: " << cpp_strerror(r) << dendl;
956 return;
957 }
958
35e4c445 959 auto new_stat = set_osd_stat(stbuf, hb_peers, osd->get_num_pgs());
224ce89b
WB
960 dout(20) << "update_osd_stat " << new_stat << dendl;
961 assert(new_stat.kb);
962 float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
963 check_full_status(ratio);
7c673cae
FG
964}
965
966bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
967{
968 OSDMapRef osdmap = get_osdmap();
969 for (auto shard : missing_on) {
970 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
971 return true;
972 }
973 return false;
974}
975
976void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
977{
978 OSDMapRef next_map = get_nextmap_reserved();
979 // service map is always newer/newest
980 assert(from_epoch <= next_map->get_epoch());
981
982 if (next_map->is_down(peer) ||
983 next_map->get_info(peer).up_from > from_epoch) {
984 m->put();
985 release_map(next_map);
986 return;
987 }
988 const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
989 ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
990 share_map_peer(peer, peer_con.get(), next_map);
991 peer_con->send_message(m);
992 release_map(next_map);
993}
994
995ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
996{
997 OSDMapRef next_map = get_nextmap_reserved();
998 // service map is always newer/newest
999 assert(from_epoch <= next_map->get_epoch());
1000
1001 if (next_map->is_down(peer) ||
1002 next_map->get_info(peer).up_from > from_epoch) {
1003 release_map(next_map);
1004 return NULL;
1005 }
1006 ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
1007 release_map(next_map);
1008 return con;
1009}
1010
1011pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1012{
1013 OSDMapRef next_map = get_nextmap_reserved();
1014 // service map is always newer/newest
1015 assert(from_epoch <= next_map->get_epoch());
1016
1017 pair<ConnectionRef,ConnectionRef> ret;
1018 if (next_map->is_down(peer) ||
1019 next_map->get_info(peer).up_from > from_epoch) {
1020 release_map(next_map);
1021 return ret;
1022 }
1023 ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
1024 if (next_map->get_hb_front_addr(peer) != entity_addr_t())
1025 ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
1026 release_map(next_map);
1027 return ret;
1028}
1029
1030
1031void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
1032{
1033 Mutex::Locker l(pg_temp_lock);
1034 map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
1035 if (p == pg_temp_pending.end() ||
1036 p->second != want) {
1037 pg_temp_wanted[pgid] = want;
1038 }
1039}
1040
1041void OSDService::remove_want_pg_temp(pg_t pgid)
1042{
1043 Mutex::Locker l(pg_temp_lock);
1044 pg_temp_wanted.erase(pgid);
1045 pg_temp_pending.erase(pgid);
1046}
1047
1048void OSDService::_sent_pg_temp()
1049{
1050 for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
1051 p != pg_temp_wanted.end();
1052 ++p)
1053 pg_temp_pending[p->first] = p->second;
1054 pg_temp_wanted.clear();
1055}
1056
1057void OSDService::requeue_pg_temp()
1058{
1059 Mutex::Locker l(pg_temp_lock);
1060 // wanted overrides pending. note that remove_want_pg_temp
1061 // clears the item out of both.
1062 unsigned old_wanted = pg_temp_wanted.size();
1063 unsigned old_pending = pg_temp_pending.size();
1064 _sent_pg_temp();
1065 pg_temp_wanted.swap(pg_temp_pending);
1066 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1067 << pg_temp_wanted.size() << dendl;
1068}
1069
1070void OSDService::send_pg_temp()
1071{
1072 Mutex::Locker l(pg_temp_lock);
1073 if (pg_temp_wanted.empty())
1074 return;
1075 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
1076 MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
1077 m->pg_temp = pg_temp_wanted;
1078 monc->send_mon_message(m);
1079 _sent_pg_temp();
1080}
1081
1082void OSDService::send_pg_created(pg_t pgid)
1083{
1084 dout(20) << __func__ << dendl;
c07f9fc5
FG
1085 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1086 monc->send_mon_message(new MOSDPGCreated(pgid));
1087 }
7c673cae
FG
1088}
1089
1090// --------------------------------------
1091// dispatch
1092
1093epoch_t OSDService::get_peer_epoch(int peer)
1094{
1095 Mutex::Locker l(peer_map_epoch_lock);
1096 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1097 if (p == peer_map_epoch.end())
1098 return 0;
1099 return p->second;
1100}
1101
1102epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1103{
1104 Mutex::Locker l(peer_map_epoch_lock);
1105 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1106 if (p != peer_map_epoch.end()) {
1107 if (p->second < e) {
1108 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1109 p->second = e;
1110 } else {
1111 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1112 }
1113 return p->second;
1114 } else {
1115 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1116 peer_map_epoch[peer] = e;
1117 return e;
1118 }
1119}
1120
1121void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1122{
1123 Mutex::Locker l(peer_map_epoch_lock);
1124 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1125 if (p != peer_map_epoch.end()) {
1126 if (p->second <= as_of) {
1127 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1128 << " had " << p->second << dendl;
1129 peer_map_epoch.erase(p);
1130 } else {
1131 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1132 << " has " << p->second << " - not forgetting" << dendl;
1133 }
1134 }
1135}
1136
1137bool OSDService::should_share_map(entity_name_t name, Connection *con,
1138 epoch_t epoch, const OSDMapRef& osdmap,
1139 const epoch_t *sent_epoch_p)
1140{
1141 dout(20) << "should_share_map "
1142 << name << " " << con->get_peer_addr()
1143 << " " << epoch << dendl;
1144
1145 // does client have old map?
1146 if (name.is_client()) {
1147 bool message_sendmap = epoch < osdmap->get_epoch();
1148 if (message_sendmap && sent_epoch_p) {
1149 dout(20) << "client session last_sent_epoch: "
1150 << *sent_epoch_p
1151 << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
1152 if (*sent_epoch_p < osdmap->get_epoch()) {
1153 return true;
1154 } // else we don't need to send it out again
1155 }
1156 }
1157
1158 if (con->get_messenger() == osd->cluster_messenger &&
1159 con != osd->cluster_messenger->get_loopback_connection() &&
1160 osdmap->is_up(name.num()) &&
1161 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1162 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1163 // remember
1164 epoch_t has = MAX(get_peer_epoch(name.num()), epoch);
1165
1166 // share?
1167 if (has < osdmap->get_epoch()) {
1168 dout(10) << name << " " << con->get_peer_addr()
1169 << " has old map " << epoch << " < "
1170 << osdmap->get_epoch() << dendl;
1171 return true;
1172 }
1173 }
1174
1175 return false;
1176}
1177
1178void OSDService::share_map(
1179 entity_name_t name,
1180 Connection *con,
1181 epoch_t epoch,
1182 OSDMapRef& osdmap,
1183 epoch_t *sent_epoch_p)
1184{
1185 dout(20) << "share_map "
1186 << name << " " << con->get_peer_addr()
1187 << " " << epoch << dendl;
1188
1189 if (!osd->is_active()) {
1190 /*It is safe not to proceed as OSD is not in healthy state*/
1191 return;
1192 }
1193
1194 bool want_shared = should_share_map(name, con, epoch,
1195 osdmap, sent_epoch_p);
1196
1197 if (want_shared){
1198 if (name.is_client()) {
1199 dout(10) << name << " has old map " << epoch
1200 << " < " << osdmap->get_epoch() << dendl;
1201 // we know the Session is valid or we wouldn't be sending
1202 if (sent_epoch_p) {
1203 *sent_epoch_p = osdmap->get_epoch();
1204 }
1205 send_incremental_map(epoch, con, osdmap);
1206 } else if (con->get_messenger() == osd->cluster_messenger &&
1207 osdmap->is_up(name.num()) &&
1208 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1209 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1210 dout(10) << name << " " << con->get_peer_addr()
1211 << " has old map " << epoch << " < "
1212 << osdmap->get_epoch() << dendl;
1213 note_peer_epoch(name.num(), osdmap->get_epoch());
1214 send_incremental_map(epoch, con, osdmap);
1215 }
1216 }
1217}
1218
1219void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1220{
1221 if (!map)
1222 map = get_osdmap();
1223
1224 // send map?
1225 epoch_t pe = get_peer_epoch(peer);
1226 if (pe) {
1227 if (pe < map->get_epoch()) {
1228 send_incremental_map(pe, con, map);
1229 note_peer_epoch(peer, map->get_epoch());
1230 } else
1231 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1232 } else {
1233 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1234 // no idea about peer's epoch.
1235 // ??? send recent ???
1236 // do nothing.
1237 }
1238}
1239
1240bool OSDService::can_inc_scrubs_pending()
1241{
1242 bool can_inc = false;
1243 Mutex::Locker l(sched_scrub_lock);
1244
1245 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1246 dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
1247 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1248 can_inc = true;
1249 } else {
1250 dout(20) << __func__ << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1251 }
1252
1253 return can_inc;
1254}
1255
1256bool OSDService::inc_scrubs_pending()
1257{
1258 bool result = false;
1259
1260 sched_scrub_lock.Lock();
1261 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1262 dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
1263 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1264 result = true;
1265 ++scrubs_pending;
1266 } else {
1267 dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1268 }
1269 sched_scrub_lock.Unlock();
1270
1271 return result;
1272}
1273
1274void OSDService::dec_scrubs_pending()
1275{
1276 sched_scrub_lock.Lock();
1277 dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
1278 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1279 --scrubs_pending;
1280 assert(scrubs_pending >= 0);
1281 sched_scrub_lock.Unlock();
1282}
1283
1284void OSDService::inc_scrubs_active(bool reserved)
1285{
1286 sched_scrub_lock.Lock();
1287 ++(scrubs_active);
1288 if (reserved) {
1289 --(scrubs_pending);
1290 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1291 << " (max " << cct->_conf->osd_max_scrubs
1292 << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
1293 assert(scrubs_pending >= 0);
1294 } else {
1295 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1296 << " (max " << cct->_conf->osd_max_scrubs
1297 << ", pending " << scrubs_pending << ")" << dendl;
1298 }
1299 sched_scrub_lock.Unlock();
1300}
1301
1302void OSDService::dec_scrubs_active()
1303{
1304 sched_scrub_lock.Lock();
1305 dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
1306 << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
1307 --scrubs_active;
1308 assert(scrubs_active >= 0);
1309 sched_scrub_lock.Unlock();
1310}
1311
1312void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1313 epoch_t *_bind_epoch) const
1314{
1315 Mutex::Locker l(epoch_lock);
1316 if (_boot_epoch)
1317 *_boot_epoch = boot_epoch;
1318 if (_up_epoch)
1319 *_up_epoch = up_epoch;
1320 if (_bind_epoch)
1321 *_bind_epoch = bind_epoch;
1322}
1323
1324void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1325 const epoch_t *_bind_epoch)
1326{
1327 Mutex::Locker l(epoch_lock);
1328 if (_boot_epoch) {
1329 assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1330 boot_epoch = *_boot_epoch;
1331 }
1332 if (_up_epoch) {
1333 assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1334 up_epoch = *_up_epoch;
1335 }
1336 if (_bind_epoch) {
1337 assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1338 bind_epoch = *_bind_epoch;
1339 }
1340}
1341
1342bool OSDService::prepare_to_stop()
1343{
1344 Mutex::Locker l(is_stopping_lock);
1345 if (get_state() != NOT_STOPPING)
1346 return false;
1347
1348 OSDMapRef osdmap = get_osdmap();
1349 if (osdmap && osdmap->is_up(whoami)) {
1350 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
1351 set_state(PREPARING_TO_STOP);
1352 monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
1353 osdmap->get_inst(whoami),
1354 osdmap->get_epoch(),
1355 true // request ack
1356 ));
1357 utime_t now = ceph_clock_now();
1358 utime_t timeout;
1359 timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
1360 while ((ceph_clock_now() < timeout) &&
1361 (get_state() != STOPPING)) {
1362 is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
1363 }
1364 }
1365 dout(0) << __func__ << " starting shutdown" << dendl;
1366 set_state(STOPPING);
1367 return true;
1368}
1369
1370void OSDService::got_stop_ack()
1371{
1372 Mutex::Locker l(is_stopping_lock);
1373 if (get_state() == PREPARING_TO_STOP) {
1374 dout(0) << __func__ << " starting shutdown" << dendl;
1375 set_state(STOPPING);
1376 is_stopping_cond.Signal();
1377 } else {
1378 dout(10) << __func__ << " ignoring msg" << dendl;
1379 }
1380}
1381
1382MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1383 OSDSuperblock& sblock)
1384{
1385 MOSDMap *m = new MOSDMap(monc->get_fsid());
1386 m->oldest_map = max_oldest_map;
1387 m->newest_map = sblock.newest_map;
1388
1389 for (epoch_t e = to; e > since; e--) {
1390 bufferlist bl;
1391 if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
1392 m->incremental_maps[e].claim(bl);
1393 } else if (get_map_bl(e, bl)) {
1394 m->maps[e].claim(bl);
1395 break;
1396 } else {
1397 derr << "since " << since << " to " << to
1398 << " oldest " << m->oldest_map << " newest " << m->newest_map
1399 << dendl;
1400 m->put();
1401 m = NULL;
1402 break;
1403 }
1404 }
1405 return m;
1406}
1407
1408void OSDService::send_map(MOSDMap *m, Connection *con)
1409{
1410 con->send_message(m);
1411}
1412
1413void OSDService::send_incremental_map(epoch_t since, Connection *con,
1414 OSDMapRef& osdmap)
1415{
1416 epoch_t to = osdmap->get_epoch();
1417 dout(10) << "send_incremental_map " << since << " -> " << to
1418 << " to " << con << " " << con->get_peer_addr() << dendl;
1419
1420 MOSDMap *m = NULL;
1421 while (!m) {
1422 OSDSuperblock sblock(get_superblock());
1423 if (since < sblock.oldest_map) {
1424 // just send latest full map
1425 MOSDMap *m = new MOSDMap(monc->get_fsid());
1426 m->oldest_map = max_oldest_map;
1427 m->newest_map = sblock.newest_map;
1428 get_map_bl(to, m->maps[to]);
1429 send_map(m, con);
1430 return;
1431 }
1432
1433 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1434 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1435 << ", only sending most recent" << dendl;
1436 since = to - cct->_conf->osd_map_share_max_epochs;
1437 }
1438
1439 if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
1440 to = since + cct->_conf->osd_map_message_max;
1441 m = build_incremental_map_msg(since, to, sblock);
1442 }
1443 send_map(m, con);
1444}
1445
1446bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1447{
1448 bool found = map_bl_cache.lookup(e, &bl);
31f18b77
FG
1449 if (found) {
1450 if (logger)
1451 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1452 return true;
31f18b77
FG
1453 }
1454 if (logger)
1455 logger->inc(l_osd_map_bl_cache_miss);
7c673cae 1456 found = store->read(coll_t::meta(),
31f18b77
FG
1457 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1458 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1459 if (found) {
7c673cae 1460 _add_map_bl(e, bl);
31f18b77 1461 }
7c673cae
FG
1462 return found;
1463}
1464
1465bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1466{
1467 Mutex::Locker l(map_cache_lock);
1468 bool found = map_bl_inc_cache.lookup(e, &bl);
31f18b77
FG
1469 if (found) {
1470 if (logger)
1471 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1472 return true;
31f18b77
FG
1473 }
1474 if (logger)
1475 logger->inc(l_osd_map_bl_cache_miss);
7c673cae 1476 found = store->read(coll_t::meta(),
31f18b77
FG
1477 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1478 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1479 if (found) {
7c673cae 1480 _add_map_inc_bl(e, bl);
31f18b77 1481 }
7c673cae
FG
1482 return found;
1483}
1484
1485void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1486{
1487 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1488 // cache a contiguous buffer
1489 if (bl.get_num_buffers() > 1) {
1490 bl.rebuild();
1491 }
1492 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1493 map_bl_cache.add(e, bl);
1494}
1495
1496void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1497{
1498 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1499 // cache a contiguous buffer
1500 if (bl.get_num_buffers() > 1) {
1501 bl.rebuild();
1502 }
1503 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1504 map_bl_inc_cache.add(e, bl);
1505}
1506
1507void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
1508{
1509 Mutex::Locker l(map_cache_lock);
31f18b77
FG
1510 // cache a contiguous buffer
1511 if (bl.get_num_buffers() > 1) {
1512 bl.rebuild();
1513 }
7c673cae
FG
1514 map_bl_inc_cache.pin(e, bl);
1515}
1516
1517void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
1518{
1519 Mutex::Locker l(map_cache_lock);
31f18b77
FG
1520 // cache a contiguous buffer
1521 if (bl.get_num_buffers() > 1) {
1522 bl.rebuild();
1523 }
7c673cae
FG
1524 map_bl_cache.pin(e, bl);
1525}
1526
1527void OSDService::clear_map_bl_cache_pins(epoch_t e)
1528{
1529 Mutex::Locker l(map_cache_lock);
1530 map_bl_inc_cache.clear_pinned(e);
1531 map_bl_cache.clear_pinned(e);
1532}
1533
1534OSDMapRef OSDService::_add_map(OSDMap *o)
1535{
1536 epoch_t e = o->get_epoch();
1537
1538 if (cct->_conf->osd_map_dedup) {
1539 // Dedup against an existing map at a nearby epoch
1540 OSDMapRef for_dedup = map_cache.lower_bound(e);
1541 if (for_dedup) {
1542 OSDMap::dedup(for_dedup.get(), o);
1543 }
1544 }
1545 bool existed;
1546 OSDMapRef l = map_cache.add(e, o, &existed);
1547 if (existed) {
1548 delete o;
1549 }
1550 return l;
1551}
1552
1553OSDMapRef OSDService::try_get_map(epoch_t epoch)
1554{
1555 Mutex::Locker l(map_cache_lock);
1556 OSDMapRef retval = map_cache.lookup(epoch);
1557 if (retval) {
1558 dout(30) << "get_map " << epoch << " -cached" << dendl;
1559 if (logger) {
1560 logger->inc(l_osd_map_cache_hit);
1561 }
1562 return retval;
1563 }
1564 if (logger) {
1565 logger->inc(l_osd_map_cache_miss);
1566 epoch_t lb = map_cache.cached_key_lower_bound();
1567 if (epoch < lb) {
1568 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1569 logger->inc(l_osd_map_cache_miss_low);
1570 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1571 }
1572 }
1573
1574 OSDMap *map = new OSDMap;
1575 if (epoch > 0) {
1576 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1577 bufferlist bl;
1578 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1579 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1580 delete map;
1581 return OSDMapRef();
1582 }
1583 map->decode(bl);
1584 } else {
1585 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1586 }
1587 return _add_map(map);
1588}
1589
1590// ops
1591
1592
1593void OSDService::reply_op_error(OpRequestRef op, int err)
1594{
1595 reply_op_error(op, err, eversion_t(), 0);
1596}
1597
1598void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1599 version_t uv)
1600{
1601 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1602 assert(m->get_type() == CEPH_MSG_OSD_OP);
1603 int flags;
1604 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1605
1606 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1607 true);
1608 reply->set_reply_versions(v, uv);
1609 m->get_connection()->send_message(reply);
1610}
1611
1612void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1613{
31f18b77
FG
1614 if (!cct->_conf->osd_debug_misdirected_ops) {
1615 return;
1616 }
1617
7c673cae
FG
1618 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1619 assert(m->get_type() == CEPH_MSG_OSD_OP);
1620
1621 assert(m->get_map_epoch() >= pg->info.history.same_primary_since);
1622
1623 if (pg->is_ec_pg()) {
1624 /**
1625 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1626 * can get this result:
1627 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1628 * [CRUSH_ITEM_NONE, 2, 3]/3
1629 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1630 * [3, 2, 3]/3
1631 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1632 * -- misdirected op
1633 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1634 * it and fulfils it
1635 *
1636 * We can't compute the op target based on the sending map epoch due to
1637 * splitting. The simplest thing is to detect such cases here and drop
1638 * them without an error (the client will resend anyway).
1639 */
1640 assert(m->get_map_epoch() <= superblock.newest_map);
1641 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1642 if (!opmap) {
1643 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1644 << m->get_map_epoch() << ", dropping" << dendl;
1645 return;
1646 }
1647 pg_t _pgid = m->get_raw_pg();
1648 spg_t pgid;
1649 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1650 _pgid = opmap->raw_pg_to_pg(_pgid);
1651 if (opmap->get_primary_shard(_pgid, &pgid) &&
1652 pgid.shard != pg->info.pgid.shard) {
1653 dout(7) << __func__ << ": " << *pg << " primary changed since "
1654 << m->get_map_epoch() << ", dropping" << dendl;
1655 return;
1656 }
1657 }
1658
1659 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1660 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1661 << " pg " << m->get_raw_pg()
1662 << " to osd." << whoami
1663 << " not " << pg->acting
1664 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
7c673cae
FG
1665}
1666
1667void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
1668{
1669 osd->op_shardedwq.queue(make_pair(pgid, qi));
1670}
1671
1672void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
1673{
1674 osd->op_shardedwq.queue_front(make_pair(pgid, qi));
1675}
1676
1677void OSDService::queue_for_peering(PG *pg)
1678{
1679 peering_wq.queue(pg);
1680}
1681
1682void OSDService::queue_for_snap_trim(PG *pg)
1683{
1684 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1685 osd->op_shardedwq.queue(
1686 make_pair(
1687 pg->info.pgid,
1688 PGQueueable(
1689 PGSnapTrim(pg->get_osdmap()->get_epoch()),
1690 cct->_conf->osd_snap_trim_cost,
1691 cct->_conf->osd_snap_trim_priority,
1692 ceph_clock_now(),
1693 entity_inst_t(),
1694 pg->get_osdmap()->get_epoch())));
1695}
1696
1697
1698// ====================================================================
1699// OSD
1700
1701#undef dout_prefix
1702#define dout_prefix *_dout
1703
1704// Commands shared between OSD's console and admin console:
1705namespace ceph {
1706namespace osd_cmds {
1707
1708int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
1709
1710}} // namespace ceph::osd_cmds
1711
1712int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
1713 uuid_d fsid, int whoami)
1714{
1715 int ret;
1716
1717 ceph::shared_ptr<ObjectStore::Sequencer> osr(
1718 new ObjectStore::Sequencer("mkfs"));
1719 OSDSuperblock sb;
1720 bufferlist sbbl;
1721 C_SaferCond waiter;
1722
1723 // if we are fed a uuid for this osd, use it.
1724 store->set_fsid(cct->_conf->osd_uuid);
1725
1726 ret = store->mkfs();
1727 if (ret) {
224ce89b
WB
1728 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1729 << cpp_strerror(ret) << dendl;
7c673cae
FG
1730 goto free_store;
1731 }
1732
31f18b77 1733 store->set_cache_shards(1); // doesn't matter for mkfs!
7c673cae
FG
1734
1735 ret = store->mount();
1736 if (ret) {
224ce89b
WB
1737 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1738 << cpp_strerror(ret) << dendl;
7c673cae
FG
1739 goto free_store;
1740 }
1741
1742 ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1743 if (ret >= 0) {
1744 /* if we already have superblock, check content of superblock */
1745 dout(0) << " have superblock" << dendl;
1746 bufferlist::iterator p;
1747 p = sbbl.begin();
1748 ::decode(sb, p);
1749 if (whoami != sb.whoami) {
1750 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1751 << dendl;
1752 ret = -EINVAL;
1753 goto umount_store;
1754 }
1755 if (fsid != sb.cluster_fsid) {
1756 derr << "provided cluster fsid " << fsid
1757 << " != superblock's " << sb.cluster_fsid << dendl;
1758 ret = -EINVAL;
1759 goto umount_store;
1760 }
1761 } else {
1762 // create superblock
1763 sb.cluster_fsid = fsid;
1764 sb.osd_fsid = store->get_fsid();
1765 sb.whoami = whoami;
1766 sb.compat_features = get_osd_initial_compat_set();
1767
1768 bufferlist bl;
1769 ::encode(sb, bl);
1770
1771 ObjectStore::Transaction t;
1772 t.create_collection(coll_t::meta(), 0);
1773 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
1774 ret = store->apply_transaction(osr.get(), std::move(t));
1775 if (ret) {
1776 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
224ce89b 1777 << "apply_transaction returned " << cpp_strerror(ret) << dendl;
7c673cae
FG
1778 goto umount_store;
1779 }
1780 }
1781
1782 if (!osr->flush_commit(&waiter)) {
1783 waiter.wait();
1784 }
1785
1786 ret = write_meta(store, sb.cluster_fsid, sb.osd_fsid, whoami);
1787 if (ret) {
224ce89b
WB
1788 derr << "OSD::mkfs: failed to write fsid file: error "
1789 << cpp_strerror(ret) << dendl;
7c673cae
FG
1790 goto umount_store;
1791 }
1792
1793umount_store:
1794 store->umount();
1795free_store:
1796 delete store;
1797 return ret;
1798}
1799
1800int OSD::write_meta(ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
1801{
1802 char val[80];
1803 int r;
1804
1805 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
1806 r = store->write_meta("magic", val);
1807 if (r < 0)
1808 return r;
1809
1810 snprintf(val, sizeof(val), "%d", whoami);
1811 r = store->write_meta("whoami", val);
1812 if (r < 0)
1813 return r;
1814
1815 cluster_fsid.print(val);
1816 r = store->write_meta("ceph_fsid", val);
1817 if (r < 0)
1818 return r;
1819
1820 r = store->write_meta("ready", "ready");
1821 if (r < 0)
1822 return r;
1823
1824 return 0;
1825}
1826
1827int OSD::peek_meta(ObjectStore *store, std::string& magic,
1828 uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
1829{
1830 string val;
1831
1832 int r = store->read_meta("magic", &val);
1833 if (r < 0)
1834 return r;
1835 magic = val;
1836
1837 r = store->read_meta("whoami", &val);
1838 if (r < 0)
1839 return r;
1840 whoami = atoi(val.c_str());
1841
1842 r = store->read_meta("ceph_fsid", &val);
1843 if (r < 0)
1844 return r;
1845 r = cluster_fsid.parse(val.c_str());
1846 if (!r)
1847 return -EINVAL;
1848
1849 r = store->read_meta("fsid", &val);
1850 if (r < 0) {
1851 osd_fsid = uuid_d();
1852 } else {
1853 r = osd_fsid.parse(val.c_str());
1854 if (!r)
1855 return -EINVAL;
1856 }
1857
1858 return 0;
1859}
1860
1861
1862#undef dout_prefix
1863#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1864
1865// cons/des
1866
1867OSD::OSD(CephContext *cct_, ObjectStore *store_,
1868 int id,
1869 Messenger *internal_messenger,
1870 Messenger *external_messenger,
1871 Messenger *hb_client_front,
1872 Messenger *hb_client_back,
1873 Messenger *hb_front_serverm,
1874 Messenger *hb_back_serverm,
1875 Messenger *osdc_messenger,
1876 MonClient *mc,
1877 const std::string &dev, const std::string &jdev) :
1878 Dispatcher(cct_),
1879 osd_lock("OSD::osd_lock"),
1880 tick_timer(cct, osd_lock),
1881 tick_timer_lock("OSD::tick_timer_lock"),
1882 tick_timer_without_osd_lock(cct, tick_timer_lock),
1883 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
1884 cct->_conf->auth_supported.empty() ?
1885 cct->_conf->auth_cluster_required :
1886 cct->_conf->auth_supported)),
1887 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
1888 cct->_conf->auth_supported.empty() ?
1889 cct->_conf->auth_service_required :
1890 cct->_conf->auth_supported)),
1891 cluster_messenger(internal_messenger),
1892 client_messenger(external_messenger),
1893 objecter_messenger(osdc_messenger),
1894 monc(mc),
1895 mgrc(cct_, client_messenger),
1896 logger(NULL),
1897 recoverystate_perf(NULL),
1898 store(store_),
1899 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
1900 clog(log_client.create_channel()),
1901 whoami(id),
1902 dev_path(dev), journal_path(jdev),
31f18b77 1903 store_is_rotational(store->is_rotational()),
7c673cae
FG
1904 trace_endpoint("0.0.0.0", 0, "osd"),
1905 asok_hook(NULL),
1906 osd_compat(get_osd_compat_set()),
31f18b77
FG
1907 peering_tp(cct, "OSD::peering_tp", "tp_peering",
1908 cct->_conf->osd_peering_wq_threads,
1909 "osd_peering_tp_threads"),
7c673cae 1910 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
31f18b77 1911 get_num_op_threads()),
7c673cae
FG
1912 disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
1913 command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
1914 session_waiting_lock("OSD::session_waiting_lock"),
1915 heartbeat_lock("OSD::heartbeat_lock"),
1916 heartbeat_stop(false),
1917 heartbeat_need_update(true),
1918 hb_front_client_messenger(hb_client_front),
1919 hb_back_client_messenger(hb_client_back),
1920 hb_front_server_messenger(hb_front_serverm),
1921 hb_back_server_messenger(hb_back_serverm),
1922 daily_loadavg(0.0),
1923 heartbeat_thread(this),
1924 heartbeat_dispatcher(this),
1925 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
1926 cct->_conf->osd_num_op_tracker_shard),
1927 test_ops_hook(NULL),
1928 op_queue(get_io_queue()),
1929 op_prio_cutoff(get_io_prio_cut()),
1930 op_shardedwq(
31f18b77 1931 get_num_op_shards(),
7c673cae
FG
1932 this,
1933 cct->_conf->osd_op_thread_timeout,
1934 cct->_conf->osd_op_thread_suicide_timeout,
1935 &osd_op_tp),
1936 peering_wq(
1937 this,
1938 cct->_conf->osd_op_thread_timeout,
1939 cct->_conf->osd_op_thread_suicide_timeout,
31f18b77 1940 &peering_tp),
7c673cae
FG
1941 map_lock("OSD::map_lock"),
1942 pg_map_lock("OSD::pg_map_lock"),
1943 last_pg_create_epoch(0),
1944 mon_report_lock("OSD::mon_report_lock"),
1945 stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
1946 up_thru_wanted(0),
1947 requested_full_first(0),
1948 requested_full_last(0),
1949 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
1950 osd_stat_updated(false),
1951 pg_stat_tid(0), pg_stat_tid_flushed(0),
1952 command_wq(
1953 this,
1954 cct->_conf->osd_command_thread_timeout,
1955 cct->_conf->osd_command_thread_suicide_timeout,
1956 &command_tp),
1957 remove_wq(
1958 cct,
1959 store,
1960 cct->_conf->osd_remove_thread_timeout,
1961 cct->_conf->osd_remove_thread_suicide_timeout,
1962 &disk_tp),
1963 service(this)
1964{
1965 monc->set_messenger(client_messenger);
1966 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
1967 cct->_conf->osd_op_log_threshold);
1968 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
1969 cct->_conf->osd_op_history_duration);
1970 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
1971 cct->_conf->osd_op_history_slow_op_threshold);
1972#ifdef WITH_BLKIN
1973 std::stringstream ss;
1974 ss << "osd." << whoami;
1975 trace_endpoint.copy_name(ss.str());
1976#endif
1977}
1978
1979OSD::~OSD()
1980{
1981 delete authorize_handler_cluster_registry;
1982 delete authorize_handler_service_registry;
1983 delete class_handler;
1984 cct->get_perfcounters_collection()->remove(recoverystate_perf);
1985 cct->get_perfcounters_collection()->remove(logger);
1986 delete recoverystate_perf;
1987 delete logger;
1988 delete store;
1989}
1990
1991void cls_initialize(ClassHandler *ch);
1992
1993void OSD::handle_signal(int signum)
1994{
1995 assert(signum == SIGINT || signum == SIGTERM);
1996 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
1997 shutdown();
1998}
1999
2000int OSD::pre_init()
2001{
2002 Mutex::Locker lock(osd_lock);
2003 if (is_stopping())
2004 return 0;
2005
2006 if (store->test_mount_in_use()) {
2007 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2008 << "currently in use. (Is ceph-osd already running?)" << dendl;
2009 return -EBUSY;
2010 }
2011
2012 cct->_conf->add_observer(this);
2013 return 0;
2014}
2015
2016// asok
2017
2018class OSDSocketHook : public AdminSocketHook {
2019 OSD *osd;
2020public:
2021 explicit OSDSocketHook(OSD *o) : osd(o) {}
2022 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2023 bufferlist& out) override {
2024 stringstream ss;
2025 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
2026 out.append(ss);
2027 return r;
2028 }
2029};
2030
2031bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
2032 ostream& ss)
2033{
2034 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2035 if (admin_command == "status") {
2036 f->open_object_section("status");
2037 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2038 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2039 f->dump_unsigned("whoami", superblock.whoami);
2040 f->dump_string("state", get_state_name(get_state()));
2041 f->dump_unsigned("oldest_map", superblock.oldest_map);
2042 f->dump_unsigned("newest_map", superblock.newest_map);
2043 {
2044 RWLock::RLocker l(pg_map_lock);
2045 f->dump_unsigned("num_pgs", pg_map.size());
2046 }
2047 f->close_section();
2048 } else if (admin_command == "flush_journal") {
2049 store->flush_journal();
2050 } else if (admin_command == "dump_ops_in_flight" ||
c07f9fc5
FG
2051 admin_command == "ops" ||
2052 admin_command == "dump_blocked_ops" ||
2053 admin_command == "dump_historic_ops" ||
2054 admin_command == "dump_historic_ops_by_duration" ||
2055 admin_command == "dump_historic_slow_ops") {
2056
2057 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2058even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2059will start to track new ops received afterwards.";
2060
2061 set<string> filters;
2062 vector<string> filter_str;
2063 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2064 copy(filter_str.begin(), filter_str.end(),
2065 inserter(filters, filters.end()));
2066 }
2067
2068 if (admin_command == "dump_ops_in_flight" ||
2069 admin_command == "ops") {
2070 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2071 ss << error_str;
2072 }
2073 }
2074 if (admin_command == "dump_blocked_ops") {
2075 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2076 ss << error_str;
2077 }
2078 }
2079 if (admin_command == "dump_historic_ops") {
2080 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2081 ss << error_str;
2082 }
2083 }
2084 if (admin_command == "dump_historic_ops_by_duration") {
2085 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2086 ss << error_str;
2087 }
2088 }
2089 if (admin_command == "dump_historic_slow_ops") {
2090 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2091 ss << error_str;
2092 }
7c673cae
FG
2093 }
2094 } else if (admin_command == "dump_op_pq_state") {
2095 f->open_object_section("pq");
2096 op_shardedwq.dump(f);
2097 f->close_section();
2098 } else if (admin_command == "dump_blacklist") {
2099 list<pair<entity_addr_t,utime_t> > bl;
2100 OSDMapRef curmap = service.get_osdmap();
2101
2102 f->open_array_section("blacklist");
2103 curmap->get_blacklist(&bl);
2104 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2105 it != bl.end(); ++it) {
224ce89b 2106 f->open_object_section("entry");
7c673cae
FG
2107 f->open_object_section("entity_addr_t");
2108 it->first.dump(f);
2109 f->close_section(); //entity_addr_t
2110 it->second.localtime(f->dump_stream("expire_time"));
2111 f->close_section(); //entry
2112 }
2113 f->close_section(); //blacklist
2114 } else if (admin_command == "dump_watchers") {
2115 list<obj_watch_item_t> watchers;
2116 // scan pg's
2117 {
2118 Mutex::Locker l(osd_lock);
2119 RWLock::RLocker l2(pg_map_lock);
2120 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2121 it != pg_map.end();
2122 ++it) {
2123
2124 list<obj_watch_item_t> pg_watchers;
2125 PG *pg = it->second;
2126 pg->lock();
2127 pg->get_watchers(pg_watchers);
2128 pg->unlock();
2129 watchers.splice(watchers.end(), pg_watchers);
2130 }
2131 }
2132
2133 f->open_array_section("watchers");
2134 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2135 it != watchers.end(); ++it) {
2136
224ce89b 2137 f->open_object_section("watch");
7c673cae
FG
2138
2139 f->dump_string("namespace", it->obj.nspace);
2140 f->dump_string("object", it->obj.oid.name);
2141
2142 f->open_object_section("entity_name");
2143 it->wi.name.dump(f);
2144 f->close_section(); //entity_name_t
2145
224ce89b
WB
2146 f->dump_unsigned("cookie", it->wi.cookie);
2147 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2148
2149 f->open_object_section("entity_addr_t");
2150 it->wi.addr.dump(f);
2151 f->close_section(); //entity_addr_t
2152
2153 f->close_section(); //watch
2154 }
2155
2156 f->close_section(); //watchers
2157 } else if (admin_command == "dump_reservations") {
2158 f->open_object_section("reservations");
2159 f->open_object_section("local_reservations");
2160 service.local_reserver.dump(f);
2161 f->close_section();
2162 f->open_object_section("remote_reservations");
2163 service.remote_reserver.dump(f);
2164 f->close_section();
2165 f->close_section();
2166 } else if (admin_command == "get_latest_osdmap") {
2167 get_latest_osdmap();
2168 } else if (admin_command == "heap") {
2169 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2170
2171 // Note: Failed heap profile commands won't necessarily trigger an error:
2172 f->open_object_section("result");
2173 f->dump_string("error", cpp_strerror(result));
2174 f->dump_bool("success", result >= 0);
2175 f->close_section();
2176 } else if (admin_command == "set_heap_property") {
2177 string property;
2178 int64_t value = 0;
2179 string error;
2180 bool success = false;
2181 if (!cmd_getval(cct, cmdmap, "property", property)) {
2182 error = "unable to get property";
2183 success = false;
2184 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2185 error = "unable to get value";
2186 success = false;
2187 } else if (value < 0) {
2188 error = "negative value not allowed";
2189 success = false;
2190 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2191 error = "invalid property";
2192 success = false;
2193 } else {
2194 success = true;
2195 }
2196 f->open_object_section("result");
2197 f->dump_string("error", error);
2198 f->dump_bool("success", success);
2199 f->close_section();
2200 } else if (admin_command == "get_heap_property") {
2201 string property;
2202 size_t value = 0;
2203 string error;
2204 bool success = false;
2205 if (!cmd_getval(cct, cmdmap, "property", property)) {
2206 error = "unable to get property";
2207 success = false;
2208 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2209 error = "invalid property";
2210 success = false;
2211 } else {
2212 success = true;
2213 }
2214 f->open_object_section("result");
2215 f->dump_string("error", error);
2216 f->dump_bool("success", success);
2217 f->dump_int("value", value);
2218 f->close_section();
2219 } else if (admin_command == "dump_objectstore_kv_stats") {
2220 store->get_db_statistics(f);
2221 } else if (admin_command == "dump_scrubs") {
2222 service.dumps_scrub(f);
2223 } else if (admin_command == "calc_objectstore_db_histogram") {
2224 store->generate_db_histogram(f);
2225 } else if (admin_command == "flush_store_cache") {
2226 store->flush_cache();
2227 } else if (admin_command == "dump_pgstate_history") {
2228 f->open_object_section("pgstate_history");
2229 RWLock::RLocker l2(pg_map_lock);
2230 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2231 it != pg_map.end();
2232 ++it) {
2233
2234 PG *pg = it->second;
2235 f->dump_stream("pg") << pg->get_pgid();
2236 pg->lock();
2237 pg->pgstate_history.dump(f);
2238 pg->unlock();
2239 }
2240 f->close_section();
224ce89b
WB
2241 } else if (admin_command == "compact") {
2242 dout(1) << "triggering manual compaction" << dendl;
2243 auto start = ceph::coarse_mono_clock::now();
2244 store->compact();
2245 auto end = ceph::coarse_mono_clock::now();
2246 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
2247 dout(1) << "finished manual compaction in "
2248 << time_span.count()
2249 << " seconds" << dendl;
2250 f->open_object_section("compact_result");
2251 f->dump_float("elapsed_time", time_span.count());
2252 f->close_section();
7c673cae
FG
2253 } else {
2254 assert(0 == "broken asok registration");
2255 }
2256 f->flush(ss);
2257 delete f;
2258 return true;
2259}
2260
2261class TestOpsSocketHook : public AdminSocketHook {
2262 OSDService *service;
2263 ObjectStore *store;
2264public:
2265 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2266 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2267 bufferlist& out) override {
2268 stringstream ss;
2269 test_ops(service, store, command, cmdmap, ss);
2270 out.append(ss);
2271 return true;
2272 }
2273 void test_ops(OSDService *service, ObjectStore *store,
2274 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2275
2276};
2277
2278class OSD::C_Tick : public Context {
2279 OSD *osd;
2280 public:
2281 explicit C_Tick(OSD *o) : osd(o) {}
2282 void finish(int r) override {
2283 osd->tick();
2284 }
2285};
2286
2287class OSD::C_Tick_WithoutOSDLock : public Context {
2288 OSD *osd;
2289 public:
2290 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2291 void finish(int r) override {
2292 osd->tick_without_osd_lock();
2293 }
2294};
2295
2296int OSD::enable_disable_fuse(bool stop)
2297{
2298#ifdef HAVE_LIBFUSE
2299 int r;
2300 string mntpath = cct->_conf->osd_data + "/fuse";
2301 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2302 dout(1) << __func__ << " disabling" << dendl;
2303 fuse_store->stop();
2304 delete fuse_store;
2305 fuse_store = NULL;
2306 r = ::rmdir(mntpath.c_str());
7c673cae 2307 if (r < 0) {
c07f9fc5
FG
2308 r = -errno;
2309 derr << __func__ << " failed to rmdir " << mntpath << ": "
2310 << cpp_strerror(r) << dendl;
7c673cae
FG
2311 return r;
2312 }
2313 return 0;
2314 }
2315 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2316 dout(1) << __func__ << " enabling" << dendl;
2317 r = ::mkdir(mntpath.c_str(), 0700);
2318 if (r < 0)
2319 r = -errno;
2320 if (r < 0 && r != -EEXIST) {
2321 derr << __func__ << " unable to create " << mntpath << ": "
2322 << cpp_strerror(r) << dendl;
2323 return r;
2324 }
2325 fuse_store = new FuseStore(store, mntpath);
2326 r = fuse_store->start();
2327 if (r < 0) {
2328 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2329 delete fuse_store;
2330 fuse_store = NULL;
2331 return r;
2332 }
2333 }
2334#endif // HAVE_LIBFUSE
2335 return 0;
2336}
2337
31f18b77
FG
2338int OSD::get_num_op_shards()
2339{
2340 if (cct->_conf->osd_op_num_shards)
2341 return cct->_conf->osd_op_num_shards;
2342 if (store_is_rotational)
2343 return cct->_conf->osd_op_num_shards_hdd;
2344 else
2345 return cct->_conf->osd_op_num_shards_ssd;
2346}
2347
2348int OSD::get_num_op_threads()
2349{
2350 if (cct->_conf->osd_op_num_threads_per_shard)
2351 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2352 if (store_is_rotational)
2353 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2354 else
2355 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2356}
2357
c07f9fc5
FG
2358float OSD::get_osd_recovery_sleep()
2359{
2360 if (cct->_conf->osd_recovery_sleep)
2361 return cct->_conf->osd_recovery_sleep;
d2e6a577 2362 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 2363 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577
FG
2364 else if (store_is_rotational && !journal_is_rotational)
2365 return cct->_conf->get_val<double>("osd_recovery_sleep_hybrid");
2366 else
2367 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
2368}
2369
7c673cae
FG
/**
 * Main OSD initialization.
 *
 * Runs with osd_lock held (via a scoped locker) except for the monitor
 * authentication / crush update phase, during which the lock is released
 * manually and re-taken before leaving.
 *
 * Rough sequence, as visible below: init timers, mount the object store,
 * validate object-name limits and the superblock, upgrade compat features,
 * make sure the snapmapper object exists, load classes, load the current
 * osdmap and the pgs, register dispatchers, init the mon and mgr clients,
 * start thread pools / heartbeat / tick timers, authenticate, update crush
 * class and location, then start the boot process.
 *
 * Error handling:
 *   - failures before the monitor phase jump to `out`, which disables fuse,
 *     unmounts and deletes the store and returns the error code;
 *   - failures during the monitor phase jump to `monout`, which calls
 *     exit(1).
 *
 * @return 0 on success or if the OSD is already stopping, negative errno
 *         otherwise
 */
int OSD::init()
{
  CompatSet initial, diff;
  Mutex::Locker lock(osd_lock);
  if (is_stopping())
    return 0;

  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();
  service.recovery_sleep_timer.init();

  // mount.
  dout(2) << "init " << dev_path
	  << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
	  << dendl;
  dout(2) << "journal " << journal_path << dendl;
  assert(store);  // call pre_init() first!

  store->set_cache_shards(get_num_op_shards());

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }
  // the journal media type is queried from the store once it is mounted
  journal_is_rotational = store->is_journal_rotational();
  dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
          << dendl;

  // NOTE(review): the return value is ignored here, so a fuse failure does
  // not abort init -- confirm this is intentional.
  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  // counts failed wait_auth_rotating() attempts in the monitor phase below
  int rotating_auth_attempts = 0;

  // sanity check long object name handling
  {
    // build an object with maximum-length name, key and namespace and ask
    // the backend whether it can store such a key
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << "   osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << "   osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      // only fatal when the startup check is enabled
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    // the specific error from read_superblock() is replaced by -EINVAL
    r = -EINVAL;
    goto out;
  }

  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    // both branches fail with -EOPNOTSUPP; only the log message differs
    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->apply_transaction(service.meta_osr.get(), std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->apply_transaction(service.meta_osr.get(), std::move(t));
    if (r < 0)
      goto out;
  }

  class_handler = new ClassHandler(cct);
  cls_initialize(class_handler);

  if (cct->_conf->osd_open_classes_on_start) {
    // this r shadows the function-level r; a class loading error is only
    // logged and does not affect the result of init()
    int r = class_handler->open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  // load up "current" osdmap
  assert_warn(!osdmap);
  if (osdmap) {
    // an osdmap being present already at this point is treated as an error
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);
  check_osdmap_features(store);

  create_recoverystate_perf();

  {
    // only the third epoch argument is supplied: the loaded map's epoch
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // initialize osdmap references in sharded wq
  op_shardedwq.prune_pg_waiters(osdmap, whoami);

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
  dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
    op_prio_cutoff << "." << dendl;

  create_logger();

  // i'm ready!
  client_messenger->add_dispatcher_head(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
                      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  /**
   * FIXME: this is a placeholder implementation that unconditionally
   * sends every is_primary PG's stats every time we're called, unlike
   * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
   * This has equivalent cost to the existing worst case where all
   * PGs are busy and their stats are always enqueued for sending.
   */
  // The callback builds and returns a new MPGStats message.  It takes
  // map_lock (read), min_last_epoch_clean_lock and pg_map_lock (read), in
  // that order, and as a side effect recomputes min_last_epoch_clean and
  // min_last_epoch_clean_pgs from the primary pgs with valid stats.
  mgrc.set_pgstats_cb([this](){
      RWLock::RLocker l(map_lock);

      utime_t had_for = ceph_clock_now() - had_map_since;
      osd_stat_t cur_stat = service.get_osd_stat();
      cur_stat.os_perf_stat = store->get_cur_stats();

      MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
      m->osd_stat = cur_stat;

      Mutex::Locker lec{min_last_epoch_clean_lock};
      min_last_epoch_clean = osdmap->get_epoch();
      min_last_epoch_clean_pgs.clear();
      RWLock::RLocker lpg(pg_map_lock);
      for (const auto &i : pg_map) {
        PG *pg = i.second;
        if (!pg->is_primary()) {
          continue;
        }

        pg->pg_stats_publish_lock.Lock();
        if (pg->pg_stats_publish_valid) {
          m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
	  // note: this lec shadows the Mutex::Locker named lec above
	  const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
	  min_last_epoch_clean = min(min_last_epoch_clean, lec);
	  min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
        }
        pg->pg_stats_publish_lock.Unlock();
      }

      return m;
  });

  mgrc.init();
  client_messenger->add_dispatcher_head(&mgrc);

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  peering_tp.start();
  osd_op_tp.start();
  disk_tp.start();
  command_tp.start();

  set_disk_tp_priority();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
  {
    // the second timer is armed under tick_timer_lock
    Mutex::Locker l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
  }

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // osd_lock is dropped for the monitor phase.  The scoped locker declared
  // at the top still unlocks on function exit, so every path that leaves
  // this phase re-takes the lock first.
  osd_lock.Unlock();

  r = monc->authenticate();
  if (r < 0) {
    derr << __func__ << " authentication failed: " << cpp_strerror(r)
         << dendl;
    osd_lock.Lock(); // locker is going to unlock this on function exit
    // NOTE(review): r is cleared when stopping, but monout calls exit(1)
    // regardless, so the value is not observed.
    if (is_stopping())
      r = 0;
    goto monout;
  }

  while (monc->wait_auth_rotating(30.0) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
        derr << __func__ << " wait_auth_rotating timed out" << dendl;
      osd_lock.Lock(); // make locker happy
      if (!is_stopping()) {
        r = -ETIMEDOUT;
      }
      goto monout;
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_device_class: "
	 << cpp_strerror(r) << dendl;
    osd_lock.Lock();
    goto monout;
  }

  r = update_crush_location();
  if (r < 0) {
    derr << __func__ << " unable to update_crush_location: "
         << cpp_strerror(r) << dendl;
    osd_lock.Lock();
    goto monout;
  }

  // end of the monitor phase: take osd_lock back
  osd_lock.Lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();
  peering_wq.drain();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;
monout:
  // failures of the monitor phase terminate the process outright; control
  // does not fall through into the `out` cleanup below
  exit(1);

out:
  // early-failure cleanup: tear down fuse, unmount and drop the store.
  // store is reset to NULL after being deleted here.
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
2705
2706void OSD::final_init()
2707{
2708 AdminSocket *admin_socket = cct->get_admin_socket();
2709 asok_hook = new OSDSocketHook(this);
2710 int r = admin_socket->register_command("status", "status", asok_hook,
2711 "high-level status of OSD");
2712 assert(r == 0);
2713 r = admin_socket->register_command("flush_journal", "flush_journal",
2714 asok_hook,
2715 "flush the journal to permanent store");
2716 assert(r == 0);
2717 r = admin_socket->register_command("dump_ops_in_flight",
c07f9fc5
FG
2718 "dump_ops_in_flight " \
2719 "name=filterstr,type=CephString,n=N,req=false",
2720 asok_hook,
7c673cae
FG
2721 "show the ops currently in flight");
2722 assert(r == 0);
2723 r = admin_socket->register_command("ops",
c07f9fc5
FG
2724 "ops " \
2725 "name=filterstr,type=CephString,n=N,req=false",
2726 asok_hook,
7c673cae
FG
2727 "show the ops currently in flight");
2728 assert(r == 0);
2729 r = admin_socket->register_command("dump_blocked_ops",
c07f9fc5
FG
2730 "dump_blocked_ops " \
2731 "name=filterstr,type=CephString,n=N,req=false",
2732 asok_hook,
7c673cae
FG
2733 "show the blocked ops currently in flight");
2734 assert(r == 0);
c07f9fc5
FG
2735 r = admin_socket->register_command("dump_historic_ops",
2736 "dump_historic_ops " \
2737 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2738 asok_hook,
2739 "show recent ops");
2740 assert(r == 0);
c07f9fc5
FG
2741 r = admin_socket->register_command("dump_historic_slow_ops",
2742 "dump_historic_slow_ops " \
2743 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2744 asok_hook,
2745 "show slowest recent ops");
2746 assert(r == 0);
c07f9fc5
FG
2747 r = admin_socket->register_command("dump_historic_ops_by_duration",
2748 "dump_historic_ops_by_duration " \
2749 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2750 asok_hook,
2751 "show slowest recent ops, sorted by duration");
2752 assert(r == 0);
2753 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2754 asok_hook,
2755 "dump op priority queue state");
2756 assert(r == 0);
2757 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2758 asok_hook,
2759 "dump blacklisted clients and times");
2760 assert(r == 0);
2761 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2762 asok_hook,
2763 "show clients which have active watches,"
2764 " and on which objects");
2765 assert(r == 0);
2766 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2767 asok_hook,
2768 "show recovery reservations");
2769 assert(r == 0);
2770 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2771 asok_hook,
2772 "force osd to update the latest map from "
2773 "the mon");
2774 assert(r == 0);
2775
2776 r = admin_socket->register_command( "heap",
2777 "heap " \
2778 "name=heapcmd,type=CephString",
2779 asok_hook,
2780 "show heap usage info (available only if "
2781 "compiled with tcmalloc)");
2782 assert(r == 0);
2783
2784 r = admin_socket->register_command("set_heap_property",
2785 "set_heap_property " \
2786 "name=property,type=CephString " \
2787 "name=value,type=CephInt",
2788 asok_hook,
2789 "update malloc extension heap property");
2790 assert(r == 0);
2791
2792 r = admin_socket->register_command("get_heap_property",
2793 "get_heap_property " \
2794 "name=property,type=CephString",
2795 asok_hook,
2796 "get malloc extension heap property");
2797 assert(r == 0);
2798
2799 r = admin_socket->register_command("dump_objectstore_kv_stats",
2800 "dump_objectstore_kv_stats",
2801 asok_hook,
2802 "print statistics of kvdb which used by bluestore");
2803 assert(r == 0);
2804
2805 r = admin_socket->register_command("dump_scrubs",
2806 "dump_scrubs",
2807 asok_hook,
2808 "print scheduled scrubs");
2809 assert(r == 0);
2810
2811 r = admin_socket->register_command("calc_objectstore_db_histogram",
2812 "calc_objectstore_db_histogram",
2813 asok_hook,
2814 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2815 assert(r == 0);
2816
2817 r = admin_socket->register_command("flush_store_cache",
2818 "flush_store_cache",
2819 asok_hook,
2820 "Flush bluestore internal cache");
2821 assert(r == 0);
2822 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2823 asok_hook,
2824 "show recent state history");
2825 assert(r == 0);
2826
224ce89b
WB
2827 r = admin_socket->register_command("compact", "compact",
2828 asok_hook,
2829 "Commpact object store's omap."
2830 " WARNING: Compaction probably slows your requests");
2831 assert(r == 0);
2832
7c673cae
FG
2833 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2834 // Note: pools are CephString instead of CephPoolname because
2835 // these commands traditionally support both pool names and numbers
2836 r = admin_socket->register_command(
2837 "setomapval",
2838 "setomapval " \
2839 "name=pool,type=CephString " \
2840 "name=objname,type=CephObjectname " \
2841 "name=key,type=CephString "\
2842 "name=val,type=CephString",
2843 test_ops_hook,
2844 "set omap key");
2845 assert(r == 0);
2846 r = admin_socket->register_command(
2847 "rmomapkey",
2848 "rmomapkey " \
2849 "name=pool,type=CephString " \
2850 "name=objname,type=CephObjectname " \
2851 "name=key,type=CephString",
2852 test_ops_hook,
2853 "remove omap key");
2854 assert(r == 0);
2855 r = admin_socket->register_command(
2856 "setomapheader",
2857 "setomapheader " \
2858 "name=pool,type=CephString " \
2859 "name=objname,type=CephObjectname " \
2860 "name=header,type=CephString",
2861 test_ops_hook,
2862 "set omap header");
2863 assert(r == 0);
2864
2865 r = admin_socket->register_command(
2866 "getomap",
2867 "getomap " \
2868 "name=pool,type=CephString " \
2869 "name=objname,type=CephObjectname",
2870 test_ops_hook,
2871 "output entire object map");
2872 assert(r == 0);
2873
2874 r = admin_socket->register_command(
2875 "truncobj",
2876 "truncobj " \
2877 "name=pool,type=CephString " \
2878 "name=objname,type=CephObjectname " \
2879 "name=len,type=CephInt",
2880 test_ops_hook,
2881 "truncate object to length");
2882 assert(r == 0);
2883
2884 r = admin_socket->register_command(
2885 "injectdataerr",
2886 "injectdataerr " \
2887 "name=pool,type=CephString " \
2888 "name=objname,type=CephObjectname " \
2889 "name=shardid,type=CephInt,req=false,range=0|255",
2890 test_ops_hook,
2891 "inject data error to an object");
2892 assert(r == 0);
2893
2894 r = admin_socket->register_command(
2895 "injectmdataerr",
2896 "injectmdataerr " \
2897 "name=pool,type=CephString " \
2898 "name=objname,type=CephObjectname " \
2899 "name=shardid,type=CephInt,req=false,range=0|255",
2900 test_ops_hook,
2901 "inject metadata error to an object");
2902 assert(r == 0);
2903 r = admin_socket->register_command(
2904 "set_recovery_delay",
2905 "set_recovery_delay " \
2906 "name=utime,type=CephInt,req=false",
2907 test_ops_hook,
2908 "Delay osd recovery by specified seconds");
2909 assert(r == 0);
2910 r = admin_socket->register_command(
2911 "trigger_scrub",
2912 "trigger_scrub " \
2913 "name=pgid,type=CephString ",
2914 test_ops_hook,
2915 "Trigger a scheduled scrub ");
2916 assert(r == 0);
2917 r = admin_socket->register_command(
2918 "injectfull",
2919 "injectfull " \
2920 "name=type,type=CephString,req=false " \
2921 "name=count,type=CephInt,req=false ",
2922 test_ops_hook,
2923 "Inject a full disk (optional count times)");
2924 assert(r == 0);
2925}
2926
2927void OSD::create_logger()
2928{
2929 dout(10) << "create_logger" << dendl;
2930
2931 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
2932
2933 // Latency axis configuration for op histograms, values are in nanoseconds
2934 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
2935 "Latency (usec)",
2936 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
2937 0, ///< Start at 0
2938 100000, ///< Quantization unit is 100usec
2939 32, ///< Enough to cover much longer than slow requests
2940 };
2941
2942 // Op size axis configuration for op histograms, values are in bytes
2943 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
2944 "Request size (bytes)",
2945 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
2946 0, ///< Start at 0
2947 512, ///< Quantization unit is 512 bytes
2948 32, ///< Enough to cover requests larger than GB
2949 };
2950
2951
2952 osd_plb.add_u64(
2953 l_osd_op_wip, "op_wip",
2954 "Replication operations currently being processed (primary)");
2955 osd_plb.add_u64_counter(
2956 l_osd_op, "op",
2957 "Client operations",
2958 "ops", PerfCountersBuilder::PRIO_CRITICAL);
2959 osd_plb.add_u64_counter(
2960 l_osd_op_inb, "op_in_bytes",
2961 "Client operations total write size",
2962 "wr", PerfCountersBuilder::PRIO_INTERESTING);
2963 osd_plb.add_u64_counter(
2964 l_osd_op_outb, "op_out_bytes",
2965 "Client operations total read size",
2966 "rd", PerfCountersBuilder::PRIO_INTERESTING);
2967 osd_plb.add_time_avg(
2968 l_osd_op_lat, "op_latency",
2969 "Latency of client operations (including queue time)",
2970 "l", 9);
2971 osd_plb.add_time_avg(
2972 l_osd_op_process_lat, "op_process_latency",
2973 "Latency of client operations (excluding queue time)");
2974 osd_plb.add_time_avg(
2975 l_osd_op_prepare_lat, "op_prepare_latency",
2976 "Latency of client operations (excluding queue time and wait for finished)");
2977
2978 osd_plb.add_u64_counter(
2979 l_osd_op_r, "op_r", "Client read operations");
2980 osd_plb.add_u64_counter(
2981 l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
2982 osd_plb.add_time_avg(
2983 l_osd_op_r_lat, "op_r_latency",
2984 "Latency of read operation (including queue time)");
31f18b77 2985 osd_plb.add_u64_counter_histogram(
7c673cae
FG
2986 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
2987 op_hist_x_axis_config, op_hist_y_axis_config,
2988 "Histogram of operation latency (including queue time) + data read");
2989 osd_plb.add_time_avg(
2990 l_osd_op_r_process_lat, "op_r_process_latency",
2991 "Latency of read operation (excluding queue time)");
2992 osd_plb.add_time_avg(
2993 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
2994 "Latency of read operations (excluding queue time and wait for finished)");
2995 osd_plb.add_u64_counter(
2996 l_osd_op_w, "op_w", "Client write operations");
2997 osd_plb.add_u64_counter(
2998 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
2999 osd_plb.add_time_avg(
3000 l_osd_op_w_lat, "op_w_latency",
3001 "Latency of write operation (including queue time)");
31f18b77 3002 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3003 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3004 op_hist_x_axis_config, op_hist_y_axis_config,
3005 "Histogram of operation latency (including queue time) + data written");
3006 osd_plb.add_time_avg(
3007 l_osd_op_w_process_lat, "op_w_process_latency",
3008 "Latency of write operation (excluding queue time)");
3009 osd_plb.add_time_avg(
3010 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3011 "Latency of write operations (excluding queue time and wait for finished)");
3012 osd_plb.add_u64_counter(
3013 l_osd_op_rw, "op_rw",
3014 "Client read-modify-write operations");
3015 osd_plb.add_u64_counter(
3016 l_osd_op_rw_inb, "op_rw_in_bytes",
3017 "Client read-modify-write operations write in");
3018 osd_plb.add_u64_counter(
3019 l_osd_op_rw_outb,"op_rw_out_bytes",
3020 "Client read-modify-write operations read out ");
3021 osd_plb.add_time_avg(
3022 l_osd_op_rw_lat, "op_rw_latency",
3023 "Latency of read-modify-write operation (including queue time)");
31f18b77 3024 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3025 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3026 op_hist_x_axis_config, op_hist_y_axis_config,
3027 "Histogram of rw operation latency (including queue time) + data written");
31f18b77 3028 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3029 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3030 op_hist_x_axis_config, op_hist_y_axis_config,
3031 "Histogram of rw operation latency (including queue time) + data read");
3032 osd_plb.add_time_avg(
3033 l_osd_op_rw_process_lat, "op_rw_process_latency",
3034 "Latency of read-modify-write operation (excluding queue time)");
3035 osd_plb.add_time_avg(
3036 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3037 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3038
224ce89b
WB
3039 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3040 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3041 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3042 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3043
7c673cae
FG
3044 osd_plb.add_u64_counter(
3045 l_osd_sop, "subop", "Suboperations");
3046 osd_plb.add_u64_counter(
3047 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
3048 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3049
3050 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3051 osd_plb.add_u64_counter(
3052 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
3053 osd_plb.add_time_avg(
3054 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3055 osd_plb.add_u64_counter(
3056 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3057 osd_plb.add_time_avg(
3058 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3059 osd_plb.add_u64_counter(
3060 l_osd_sop_push, "subop_push", "Suboperations push messages");
3061 osd_plb.add_u64_counter(
3062 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
3063 osd_plb.add_time_avg(
3064 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3065
3066 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3067 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3068 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
3069
3070 osd_plb.add_u64_counter(
3071 l_osd_rop, "recovery_ops",
3072 "Started recovery operations",
3073 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3074
3075 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3076 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
3077 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
3078 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3079 osd_plb.add_u64(
3080 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3081 osd_plb.add_u64(
3082 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3083 "Total number getting crc from crc_cache with adjusting");
3084 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3085 "Total number of crc cache misses");
3086
3087 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3088 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3089 osd_plb.add_u64(
3090 l_osd_pg_primary, "numpg_primary",
3091 "Placement groups for which this osd is primary");
3092 osd_plb.add_u64(
3093 l_osd_pg_replica, "numpg_replica",
3094 "Placement groups for which this osd is replica");
3095 osd_plb.add_u64(
3096 l_osd_pg_stray, "numpg_stray",
3097 "Placement groups ready to be deleted from this osd");
3098 osd_plb.add_u64(
3099 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3100 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3101 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3102 osd_plb.add_u64_counter(
3103 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3104 osd_plb.add_u64_counter(
3105 l_osd_waiting_for_map, "messages_delayed_for_map",
3106 "Operations waiting for OSD map");
31f18b77 3107
7c673cae
FG
3108 osd_plb.add_u64_counter(
3109 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3110 osd_plb.add_u64_counter(
3111 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3112 osd_plb.add_u64_counter(
3113 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3114 "osdmap cache miss below cache lower bound");
3115 osd_plb.add_u64_avg(
3116 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3117 "osdmap cache miss, avg distance below cache lower bound");
31f18b77
FG
3118 osd_plb.add_u64_counter(
3119 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3120 "OSDMap buffer cache hits");
3121 osd_plb.add_u64_counter(
3122 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3123 "OSDMap buffer cache misses");
7c673cae
FG
3124
3125 osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
3126 osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
3127 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
3128
3129 osd_plb.add_u64_counter(
3130 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3131
3132 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3133 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3134 osd_plb.add_u64_counter(
3135 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3136 osd_plb.add_u64_counter(
3137 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3138 osd_plb.add_u64_counter(
3139 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3140 "Failed tier flush attempts");
3141 osd_plb.add_u64_counter(
3142 l_osd_tier_evict, "tier_evict", "Tier evictions");
3143 osd_plb.add_u64_counter(
3144 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3145 osd_plb.add_u64_counter(
3146 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3147 osd_plb.add_u64_counter(
3148 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3149 osd_plb.add_u64_counter(
3150 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3151 osd_plb.add_u64_counter(
3152 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3153 osd_plb.add_u64_counter(
3154 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3155
3156 osd_plb.add_u64_counter(
3157 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3158 osd_plb.add_u64_counter(
3159 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3160 osd_plb.add_u64_counter(
3161 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3162 osd_plb.add_u64_counter(
3163 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3164
3165 osd_plb.add_u64_counter(
3166 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3167 osd_plb.add_u64_counter(
3168 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3169
3170 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3171 osd_plb.add_time_avg(
3172 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3173 osd_plb.add_time_avg(
3174 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3175 osd_plb.add_time_avg(
3176 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3177
3178 osd_plb.add_u64_counter(
3179 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3180 osd_plb.add_u64_counter(
3181 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3182 "PG updated its info using fastinfo attr");
3183 osd_plb.add_u64_counter(
3184 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3185
3186 logger = osd_plb.create_perf_counters();
3187 cct->get_perfcounters_collection()->add(logger);
3188}
3189
3190void OSD::create_recoverystate_perf()
3191{
3192 dout(10) << "create_recoverystate_perf" << dendl;
3193
3194 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3195
3196 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3197 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3198 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3199 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3200 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3201 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3202 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3203 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3204 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3205 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3206 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3207 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3208 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3209 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3210 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3211 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3212 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3213 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3214 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3215 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3216 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3217 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3218 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3219 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3220 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3221 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3222 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3223 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3224 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3225 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3226 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3227
3228 recoverystate_perf = rs_perf.create_perf_counters();
3229 cct->get_perfcounters_collection()->add(recoverystate_perf);
3230}
3231
3232int OSD::shutdown()
3233{
3234 if (!service.prepare_to_stop())
3235 return 0; // already shutting down
3236 osd_lock.Lock();
3237 if (is_stopping()) {
3238 osd_lock.Unlock();
3239 return 0;
3240 }
3241 derr << "shutdown" << dendl;
3242
3243 set_state(STATE_STOPPING);
3244
3245 // Debugging
3246 cct->_conf->set_val("debug_osd", "100");
3247 cct->_conf->set_val("debug_journal", "100");
3248 cct->_conf->set_val("debug_filestore", "100");
3249 cct->_conf->set_val("debug_ms", "100");
3250 cct->_conf->apply_changes(NULL);
3251
3252 // stop MgrClient earlier as it's more like an internal consumer of OSD
3253 mgrc.shutdown();
3254
3255 service.start_shutdown();
3256
3257 // stop sending work to pgs. this just prevents any new work in _process
3258 // from racing with on_shutdown and potentially entering the pg after.
3259 op_shardedwq.drain();
3260
3261 // Shutdown PGs
3262 {
3263 RWLock::RLocker l(pg_map_lock);
3264 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3265 p != pg_map.end();
3266 ++p) {
3267 dout(20) << " kicking pg " << p->first << dendl;
3268 p->second->lock();
3269 p->second->on_shutdown();
3270 p->second->unlock();
3271 p->second->osr->flush();
3272 }
3273 }
3274 clear_pg_stat_queue();
3275
3276 // drain op queue again (in case PGs requeued something)
3277 op_shardedwq.drain();
3278 {
3279 finished.clear(); // zap waiters (bleh, this is messy)
3280 }
3281
3282 op_shardedwq.clear_pg_slots();
3283
3284 // unregister commands
3285 cct->get_admin_socket()->unregister_command("status");
3286 cct->get_admin_socket()->unregister_command("flush_journal");
3287 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3288 cct->get_admin_socket()->unregister_command("ops");
3289 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3290 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3291 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3292 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3293 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3294 cct->get_admin_socket()->unregister_command("dump_blacklist");
3295 cct->get_admin_socket()->unregister_command("dump_watchers");
3296 cct->get_admin_socket()->unregister_command("dump_reservations");
3297 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
224ce89b 3298 cct->get_admin_socket()->unregister_command("heap");
7c673cae
FG
3299 cct->get_admin_socket()->unregister_command("set_heap_property");
3300 cct->get_admin_socket()->unregister_command("get_heap_property");
3301 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
224ce89b 3302 cct->get_admin_socket()->unregister_command("dump_scrubs");
7c673cae
FG
3303 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3304 cct->get_admin_socket()->unregister_command("flush_store_cache");
3305 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
224ce89b 3306 cct->get_admin_socket()->unregister_command("compact");
7c673cae
FG
3307 delete asok_hook;
3308 asok_hook = NULL;
3309
3310 cct->get_admin_socket()->unregister_command("setomapval");
3311 cct->get_admin_socket()->unregister_command("rmomapkey");
3312 cct->get_admin_socket()->unregister_command("setomapheader");
3313 cct->get_admin_socket()->unregister_command("getomap");
3314 cct->get_admin_socket()->unregister_command("truncobj");
3315 cct->get_admin_socket()->unregister_command("injectdataerr");
3316 cct->get_admin_socket()->unregister_command("injectmdataerr");
3317 cct->get_admin_socket()->unregister_command("set_recovery_delay");
224ce89b
WB
3318 cct->get_admin_socket()->unregister_command("trigger_scrub");
3319 cct->get_admin_socket()->unregister_command("injectfull");
7c673cae
FG
3320 delete test_ops_hook;
3321 test_ops_hook = NULL;
3322
3323 osd_lock.Unlock();
3324
3325 heartbeat_lock.Lock();
3326 heartbeat_stop = true;
3327 heartbeat_cond.Signal();
3328 heartbeat_lock.Unlock();
3329 heartbeat_thread.join();
3330
31f18b77 3331 peering_tp.drain();
7c673cae 3332 peering_wq.clear();
31f18b77 3333 peering_tp.stop();
7c673cae
FG
3334 dout(10) << "osd tp stopped" << dendl;
3335
3336 osd_op_tp.drain();
3337 osd_op_tp.stop();
3338 dout(10) << "op sharded tp stopped" << dendl;
3339
3340 command_tp.drain();
3341 command_tp.stop();
3342 dout(10) << "command tp stopped" << dendl;
3343
3344 disk_tp.drain();
3345 disk_tp.stop();
3346 dout(10) << "disk tp paused (new)" << dendl;
3347
3348 dout(10) << "stopping agent" << dendl;
3349 service.agent_stop();
3350
3351 osd_lock.Lock();
3352
3353 reset_heartbeat_peers();
3354
3355 tick_timer.shutdown();
3356
3357 {
3358 Mutex::Locker l(tick_timer_lock);
3359 tick_timer_without_osd_lock.shutdown();
3360 }
3361
3362 // note unmount epoch
3363 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3364 superblock.mounted = service.get_boot_epoch();
3365 superblock.clean_thru = osdmap->get_epoch();
3366 ObjectStore::Transaction t;
3367 write_superblock(t);
3368 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3369 if (r) {
3370 derr << "OSD::shutdown: error writing superblock: "
3371 << cpp_strerror(r) << dendl;
3372 }
3373
3374
3375 {
3376 Mutex::Locker l(pg_stat_queue_lock);
3377 assert(pg_stat_queue.empty());
3378 }
3379
31f18b77
FG
3380 service.shutdown_reserver();
3381
7c673cae
FG
3382 // Remove PGs
3383#ifdef PG_DEBUG_REFS
3384 service.dump_live_pgids();
3385#endif
3386 {
3387 RWLock::RLocker l(pg_map_lock);
3388 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3389 p != pg_map.end();
3390 ++p) {
3391 dout(20) << " kicking pg " << p->first << dendl;
3392 p->second->lock();
3393 if (p->second->ref != 1) {
3394 derr << "pgid " << p->first << " has ref count of "
3395 << p->second->ref << dendl;
3396#ifdef PG_DEBUG_REFS
3397 p->second->dump_live_ids();
3398#endif
31f18b77
FG
3399 if (cct->_conf->osd_shutdown_pgref_assert) {
3400 ceph_abort();
3401 }
7c673cae
FG
3402 }
3403 p->second->unlock();
3404 p->second->put("PGMap");
3405 }
3406 pg_map.clear();
3407 }
3408#ifdef PG_DEBUG_REFS
3409 service.dump_live_pgids();
3410#endif
3411 cct->_conf->remove_observer(this);
3412
3413 dout(10) << "syncing store" << dendl;
3414 enable_disable_fuse(true);
3415
3416 if (cct->_conf->osd_journal_flush_on_shutdown) {
3417 dout(10) << "flushing journal" << dendl;
3418 store->flush_journal();
3419 }
3420
3421 store->umount();
3422 delete store;
3423 store = 0;
3424 dout(10) << "Store synced" << dendl;
3425
3426 monc->shutdown();
3427 osd_lock.Unlock();
3428
3429 osdmap = OSDMapRef();
3430 service.shutdown();
3431 op_tracker.on_shutdown();
3432
3433 class_handler->shutdown();
3434 client_messenger->shutdown();
3435 cluster_messenger->shutdown();
3436 hb_front_client_messenger->shutdown();
3437 hb_back_client_messenger->shutdown();
3438 objecter_messenger->shutdown();
3439 hb_front_server_messenger->shutdown();
3440 hb_back_server_messenger->shutdown();
3441
3442 peering_wq.clear();
3443
3444 return r;
3445}
3446
3447int OSD::mon_cmd_maybe_osd_create(string &cmd)
3448{
3449 bool created = false;
3450 while (true) {
3451 dout(10) << __func__ << " cmd: " << cmd << dendl;
3452 vector<string> vcmd{cmd};
3453 bufferlist inbl;
3454 C_SaferCond w;
3455 string outs;
3456 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3457 int r = w.wait();
3458 if (r < 0) {
3459 if (r == -ENOENT && !created) {
3460 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3461 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3462 vector<string> vnewcmd{newcmd};
3463 bufferlist inbl;
3464 C_SaferCond w;
3465 string outs;
3466 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3467 int r = w.wait();
3468 if (r < 0) {
3469 derr << __func__ << " fail: osd does not exist and created failed: "
3470 << cpp_strerror(r) << dendl;
3471 return r;
3472 }
3473 created = true;
3474 continue;
3475 }
3476 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3477 return r;
3478 }
3479 break;
3480 }
3481
3482 return 0;
3483}
3484
3485int OSD::update_crush_location()
3486{
3487 if (!cct->_conf->osd_crush_update_on_start) {
3488 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3489 return 0;
3490 }
3491
3492 char weight[32];
3493 if (cct->_conf->osd_crush_initial_weight >= 0) {
3494 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3495 } else {
3496 struct store_statfs_t st;
3497 int r = store->statfs(&st);
3498 if (r < 0) {
3499 derr << "statfs: " << cpp_strerror(r) << dendl;
3500 return r;
3501 }
3502 snprintf(weight, sizeof(weight), "%.4lf",
3503 MAX((double).00001,
3504 (double)(st.total) /
3505 (double)(1ull << 40 /* TB */)));
3506 }
3507
3508 std::multimap<string,string> loc = cct->crush_location.get_location();
3509 dout(10) << __func__ << " crush location is " << loc << dendl;
3510
3511 string cmd =
3512 string("{\"prefix\": \"osd crush create-or-move\", ") +
3513 string("\"id\": ") + stringify(whoami) + string(", ") +
3514 string("\"weight\":") + weight + string(", ") +
3515 string("\"args\": [");
3516 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3517 if (p != loc.begin())
3518 cmd += ", ";
3519 cmd += "\"" + p->first + "=" + p->second + "\"";
3520 }
3521 cmd += "]}";
3522
3523 return mon_cmd_maybe_osd_create(cmd);
3524}
3525
3526int OSD::update_crush_device_class()
3527{
224ce89b
WB
3528 if (!cct->_conf->osd_class_update_on_start) {
3529 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
3530 return 0;
3531 }
3532
7c673cae
FG
3533 string device_class;
3534 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
3535 if (r < 0 || device_class.empty()) {
3536 device_class = store->get_default_device_class();
3537 }
3538
3539 if (device_class.empty()) {
d2e6a577 3540 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 3541 return 0;
224ce89b 3542 }
7c673cae
FG
3543
3544 string cmd =
3545 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
3546 string("\"class\": \"") + device_class + string("\", ") +
3547 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 3548
224ce89b 3549 r = mon_cmd_maybe_osd_create(cmd);
d2e6a577
FG
3550 // the above cmd can fail for various reasons, e.g.:
3551 // (1) we are connecting to a pre-luminous monitor
3552 // (2) user manually specify a class other than
3553 // 'ceph-disk prepare --crush-device-class'
3554 // simply skip result-checking for now
3555 return 0;
7c673cae
FG
3556}
3557
3558void OSD::write_superblock(ObjectStore::Transaction& t)
3559{
3560 dout(10) << "write_superblock " << superblock << dendl;
3561
3562 //hack: at minimum it's using the baseline feature set
3563 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3564 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3565
3566 bufferlist bl;
3567 ::encode(superblock, bl);
3568 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3569}
3570
3571int OSD::read_superblock()
3572{
3573 bufferlist bl;
3574 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3575 if (r < 0)
3576 return r;
3577
3578 bufferlist::iterator p = bl.begin();
3579 ::decode(superblock, p);
3580
3581 dout(10) << "read_superblock " << superblock << dendl;
3582
3583 return 0;
3584}
3585
3586void OSD::clear_temp_objects()
3587{
3588 dout(10) << __func__ << dendl;
3589 vector<coll_t> ls;
3590 store->list_collections(ls);
3591 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
3592 spg_t pgid;
3593 if (!p->is_pg(&pgid))
3594 continue;
3595
3596 // list temp objects
3597 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
3598
3599 vector<ghobject_t> temps;
3600 ghobject_t next;
3601 while (1) {
3602 vector<ghobject_t> objects;
3603 store->collection_list(*p, next, ghobject_t::get_max(),
3604 store->get_ideal_list_max(),
3605 &objects, &next);
3606 if (objects.empty())
3607 break;
3608 vector<ghobject_t>::iterator q;
3609 for (q = objects.begin(); q != objects.end(); ++q) {
3610 // Hammer set pool for temps to -1, so check for clean-up
3611 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
3612 temps.push_back(*q);
3613 } else {
3614 break;
3615 }
3616 }
3617 // If we saw a non-temp object and hit the break above we can
3618 // break out of the while loop too.
3619 if (q != objects.end())
3620 break;
3621 }
3622 if (!temps.empty()) {
3623 ObjectStore::Transaction t;
3624 int removed = 0;
3625 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
3626 dout(20) << " removing " << *p << " object " << *q << dendl;
3627 t.remove(*p, *q);
3628 if (++removed > cct->_conf->osd_target_transaction_size) {
3629 store->apply_transaction(service.meta_osr.get(), std::move(t));
3630 t = ObjectStore::Transaction();
3631 removed = 0;
3632 }
3633 }
3634 if (removed) {
3635 store->apply_transaction(service.meta_osr.get(), std::move(t));
3636 }
3637 }
3638 }
3639}
3640
3641void OSD::recursive_remove_collection(CephContext* cct,
3642 ObjectStore *store, spg_t pgid,
3643 coll_t tmp)
3644{
3645 OSDriver driver(
3646 store,
3647 coll_t(),
3648 make_snapmapper_oid());
3649
3650 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3651 ObjectStore::Sequencer>("rm"));
3652 ObjectStore::Transaction t;
3653 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
3654
3655 vector<ghobject_t> objects;
3656 store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
3657 INT_MAX, &objects, 0);
3658 generic_dout(10) << __func__ << " " << objects << dendl;
3659 // delete them.
3660 int removed = 0;
3661 for (vector<ghobject_t>::iterator p = objects.begin();
3662 p != objects.end();
3663 ++p, removed++) {
3664 OSDriver::OSTransaction _t(driver.get_transaction(&t));
3665 int r = mapper.remove_oid(p->hobj, &_t);
3666 if (r != 0 && r != -ENOENT)
3667 ceph_abort();
3668 t.remove(tmp, *p);
3669 if (removed > cct->_conf->osd_target_transaction_size) {
3670 int r = store->apply_transaction(osr.get(), std::move(t));
3671 assert(r == 0);
3672 t = ObjectStore::Transaction();
3673 removed = 0;
3674 }
3675 }
3676 t.remove_collection(tmp);
3677 int r = store->apply_transaction(osr.get(), std::move(t));
3678 assert(r == 0);
3679
3680 C_SaferCond waiter;
3681 if (!osr->flush_commit(&waiter)) {
3682 waiter.wait();
3683 }
3684}
3685
3686
3687// ======================================================
3688// PG's
3689
3690PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3691{
3692 if (!createmap->have_pg_pool(id)) {
3693 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3694 << id << dendl;
3695 ceph_abort();
3696 }
3697
3698 PGPool p = PGPool(cct, createmap, id);
3699
3700 dout(10) << "_get_pool " << p.id << dendl;
3701 return p;
3702}
3703
3704PG *OSD::_open_lock_pg(
3705 OSDMapRef createmap,
3706 spg_t pgid, bool no_lockdep_check)
3707{
3708 assert(osd_lock.is_locked());
3709
3710 PG* pg = _make_pg(createmap, pgid);
3711 {
3712 RWLock::WLocker l(pg_map_lock);
3713 pg->lock(no_lockdep_check);
3714 pg_map[pgid] = pg;
3715 pg->get("PGMap"); // because it's in pg_map
3716 service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
3717 }
3718 return pg;
3719}
3720
3721PG* OSD::_make_pg(
3722 OSDMapRef createmap,
3723 spg_t pgid)
3724{
3725 dout(10) << "_open_lock_pg " << pgid << dendl;
3726 PGPool pool = _get_pool(pgid.pool(), createmap);
3727
3728 // create
3729 PG *pg;
3730 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3731 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3732 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3733 else
3734 ceph_abort();
3735
3736 return pg;
3737}
3738
3739
3740void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
3741{
3742 epoch_t e(service.get_osdmap()->get_epoch());
3743 pg->get("PGMap"); // For pg_map
3744 pg_map[pg->info.pgid] = pg;
3745 service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
3746
3747 dout(10) << "Adding newly split pg " << *pg << dendl;
3748 pg->handle_loaded(rctx);
3749 pg->write_if_dirty(*(rctx->transaction));
3750 pg->queue_null(e, e);
3751 map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
3752 peering_wait_for_split.find(pg->info.pgid);
3753 if (to_wake != peering_wait_for_split.end()) {
3754 for (list<PG::CephPeeringEvtRef>::iterator i =
3755 to_wake->second.begin();
3756 i != to_wake->second.end();
3757 ++i) {
3758 pg->queue_peering_event(*i);
3759 }
3760 peering_wait_for_split.erase(to_wake);
3761 }
3762 if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
3763 _remove_pg(pg);
3764}
3765
3766OSD::res_result OSD::_try_resurrect_pg(
3767 OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
3768{
3769 assert(resurrected);
3770 assert(old_pg_state);
3771 // find nearest ancestor
3772 DeletingStateRef df;
3773 spg_t cur(pgid);
3774 while (true) {
3775 df = service.deleting_pgs.lookup(cur);
3776 if (df)
3777 break;
3778 if (!cur.ps())
3779 break;
3780 cur = cur.get_parent();
3781 }
3782 if (!df)
3783 return RES_NONE; // good to go
3784
3785 df->old_pg_state->lock();
3786 OSDMapRef create_map = df->old_pg_state->get_osdmap();
3787 df->old_pg_state->unlock();
3788
3789 set<spg_t> children;
3790 if (cur == pgid) {
3791 if (df->try_stop_deletion()) {
3792 dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
3793 *resurrected = cur;
3794 *old_pg_state = df->old_pg_state;
3795 service.deleting_pgs.remove(pgid); // PG is no longer being removed!
3796 return RES_SELF;
3797 } else {
3798 // raced, ensure we don't see DeletingStateRef when we try to
3799 // delete this pg
3800 service.deleting_pgs.remove(pgid);
3801 return RES_NONE;
3802 }
3803 } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
3804 curmap->get_pg_num(cur.pool()),
3805 &children) &&
3806 children.count(pgid)) {
3807 if (df->try_stop_deletion()) {
3808 dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
3809 << dendl;
3810 *resurrected = cur;
3811 *old_pg_state = df->old_pg_state;
3812 service.deleting_pgs.remove(cur); // PG is no longer being removed!
3813 return RES_PARENT;
3814 } else {
3815 /* this is not a problem, failing to cancel proves that all objects
3816 * have been removed, so no hobject_t overlap is possible
3817 */
3818 return RES_NONE;
3819 }
3820 }
3821 return RES_NONE;
3822}
3823
3824PG *OSD::_create_lock_pg(
3825 OSDMapRef createmap,
3826 spg_t pgid,
3827 bool hold_map_lock,
3828 bool backfill,
3829 int role,
3830 vector<int>& up, int up_primary,
3831 vector<int>& acting, int acting_primary,
3832 pg_history_t history,
3833 const PastIntervals& pi,
3834 ObjectStore::Transaction& t)
3835{
3836 assert(osd_lock.is_locked());
3837 dout(20) << "_create_lock_pg pgid " << pgid << dendl;
3838
3839 PG *pg = _open_lock_pg(createmap, pgid, true);
3840
3841 service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
3842
3843 pg->init(
3844 role,
3845 up,
3846 up_primary,
3847 acting,
3848 acting_primary,
3849 history,
3850 pi,
3851 backfill,
3852 &t);
3853
3854 dout(7) << "_create_lock_pg " << *pg << dendl;
3855 return pg;
3856}
3857
3858PG *OSD::_lookup_lock_pg(spg_t pgid)
3859{
3860 RWLock::RLocker l(pg_map_lock);
3861
3862 auto pg_map_entry = pg_map.find(pgid);
3863 if (pg_map_entry == pg_map.end())
3864 return nullptr;
3865 PG *pg = pg_map_entry->second;
3866 pg->lock();
3867 return pg;
3868}
3869
31f18b77
FG
3870PG *OSD::lookup_lock_pg(spg_t pgid)
3871{
3872 return _lookup_lock_pg(pgid);
3873}
3874
7c673cae
FG
3875PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3876{
3877 assert(pg_map.count(pgid));
3878 PG *pg = pg_map[pgid];
3879 pg->lock();
3880 return pg;
3881}
3882
/*
 * Scan the object store's collections and open every PG found there.
 *
 * The caller must hold osd_lock and pg_map must be empty on entry.
 * For each collection:
 *  - temp collections, and PG collections carrying the removal flag, are
 *    removed;
 *  - collections that are not PGs, and localized PGs, are skipped;
 *  - every other PG is opened on the map epoch recorded on disk (or the
 *    current osdmap if that epoch is 0), its state is read and upgraded if
 *    required, its up/acting mapping and role are set, and it is unlocked.
 * Afterwards the legacy infos object is removed if any PG was upgraded,
 * and build_past_intervals_parallel() is run.
 */
void OSD::load_pgs()
{
  assert(osd_lock.is_locked());
  dout(0) << "load_pgs" << dendl;
  {
    RWLock::RLocker l(pg_map_lock);
    assert(pg_map.empty());
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    // NOTE: the failure is only logged; the loop below then runs over
    // whatever ls contains
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  // set once the first PG needing an on-disk upgrade is seen
  bool has_upgraded = false;

  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // temp collections and PGs flagged for removal are deleted outright
    if (it->is_temp(&pgid) ||
        (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    if (pgid.preferred() >= 0) {
      dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
      // FIXME: delete it too, eventually
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    bufferlist bl;
    epoch_t map_epoch = 0;
    // this r shadows the outer one; it is the result of peeking at the
    // pg's stored map epoch
    int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
           << dendl;
      continue;
    }

    PG *pg = NULL;
    if (map_epoch > 0) {
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
        // the recorded map is missing: tolerable only if the pool no
        // longer exists in the current map, otherwise fatal
        if (!osdmap->have_pg_pool(pgid.pool())) {
          derr << __func__ << ": could not find map for epoch " << map_epoch
               << " on pg " << pgid << ", but the pool is not present in the "
               << "current map, so this is probably a result of bug 10617. "
               << "Skipping the pg for now, you can use ceph-objectstore-tool "
               << "to clean it up later." << dendl;
          continue;
        } else {
          derr << __func__ << ": have pgid " << pgid << " at epoch "
               << map_epoch << ", but missing map. Crashing."
               << dendl;
          assert(0 == "Missing map in load_pgs");
        }
      }
      pg = _open_lock_pg(pgosdmap, pgid);
    } else {
      pg = _open_lock_pg(osdmap, pgid);
    }
    // there can be no waiters here, so we don't call wake_pg_waiters

    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store, bl);

    if (pg->must_upgrade()) {
      if (!pg->can_upgrade()) {
        derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
             << " an older version first." << dendl;
        assert(0 == "PG too old to upgrade");
      }
      if (!has_upgraded) {
        // log the upgrade notice only once per load_pgs run
        derr << "PGs are upgrading" << dendl;
        has_upgraded = true;
      }
      dout(10) << "PG " << pg->info.pgid
               << " must upgrade..." << dendl;
      pg->upgrade(store);
    }

    // register splits between the pg's map and the current osdmap
    service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);

    // generate state for PG's current mapping
    int primary, up_primary;
    vector<int> acting, up;
    pg->get_osdmap()->pg_to_up_acting_osds(
      pgid.pgid, &up, &up_primary, &acting, &primary);
    pg->init_primary_up_acting(
      up,
      acting,
      up_primary,
      primary);
    int role = OSDMap::calc_pg_role(whoami, pg->acting);
    // for non-replicated pools the role only counts if it matches our shard
    if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
      pg->set_role(role);
    else
      pg->set_role(-1);

    pg->reg_next_scrub();

    PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
    pg->handle_loaded(&rctx);

    dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
    if (pg->pg_log.is_dirty()) {
      // persist a dirty log right away; the result of apply_transaction
      // is not checked here
      ObjectStore::Transaction t;
      pg->write_if_dirty(t);
      store->apply_transaction(pg->osr.get(), std::move(t));
    }
    // release the pg obtained from _open_lock_pg()
    pg->unlock();
  }
  {
    RWLock::RLocker l(pg_map_lock);
    dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
  }

  // clean up old infos object?
  if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
    dout(1) << __func__ << " removing legacy infos object" << dendl;
    ObjectStore::Transaction t;
    t.remove(coll_t::meta(), OSD::make_infos_oid());
    int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
    if (r != 0) {
      derr << __func__ << ": apply_transaction returned "
           << cpp_strerror(r) << dendl;
      ceph_abort();
    }
  }

  build_past_intervals_parallel();
}
4027
4028
4029/*
4030 * build past_intervals efficiently on old, degraded, and buried
4031 * clusters. this is important for efficiently catching up osds that
4032 * are way behind on maps to the current cluster state.
4033 *
4034 * this is a parallel version of PG::generate_past_intervals().
4035 * follow the same logic, but do all pgs at the same time so that we
4036 * can make a single pass across the osdmap history.
4037 */
void OSD::build_past_intervals_parallel()
{
  // per-PG scratch state carried across epochs while walking the maps
  struct pistate {
    epoch_t start, end;              // epoch range this pg needs
    vector<int> old_acting, old_up;  // mapping at the start of the interval in progress
    epoch_t same_interval_since;     // 0 until the first map for this pg is seen
    int primary;
    int up_primary;
  };
  map<PG*,pistate> pis;

  // calculate junction of map range
  // (both bounds start inverted and are widened by each pg that needs work)
  epoch_t end_epoch = superblock.oldest_map;
  epoch_t cur_epoch = superblock.newest_map;
  {
    RWLock::RLocker l(pg_map_lock);
    for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
         i != pg_map.end();
         ++i) {
      PG *pg = i->second;

      // rpib: required past-interval bounds for this pg
      auto rpib = pg->get_required_past_interval_bounds(
        pg->info,
        superblock.oldest_map);
      if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
        // nothing required and nothing stored: only patch a zeroed
        // same_interval_since
        if (pg->info.history.same_interval_since == 0) {
          pg->info.history.same_interval_since = rpib.second;
        }
        continue;
      } else {
        // apib: bounds of the past intervals the pg already has
        auto apib = pg->past_intervals.get_bounds();
        if (apib.second >= rpib.second &&
            apib.first <= rpib.first) {
          // existing intervals already cover the required range
          if (pg->info.history.same_interval_since == 0) {
            pg->info.history.same_interval_since = rpib.second;
          }
          continue;
        }
      }

      dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
               << rpib.second << dendl;
      pistate& p = pis[pg];
      p.start = rpib.first;
      p.end = rpib.second;
      p.same_interval_since = 0;

      if (rpib.first < cur_epoch)
        cur_epoch = rpib.first;
      if (rpib.second > end_epoch)
        end_epoch = rpib.second;
    }
  }
  if (pis.empty()) {
    dout(10) << __func__ << " nothing to build" << dendl;
    return;
  }

  dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
  assert(cur_epoch <= end_epoch);

  // single pass over the map history; every pg in pis is advanced per epoch
  OSDMapRef cur_map, last_map;
  for ( ; cur_epoch <= end_epoch; cur_epoch++) {
    dout(10) << __func__ << " epoch " << cur_epoch << dendl;
    last_map = cur_map;
    cur_map = get_map(cur_epoch);

    for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
      PG *pg = i->first;
      pistate& p = i->second;

      // skip pgs whose required range does not include this epoch
      if (cur_epoch < p.start || cur_epoch > p.end)
        continue;

      vector<int> acting, up;
      int up_primary;
      int primary;
      pg_t pgid = pg->info.pgid.pgid;
      // once a first map has been seen for this pg (so last_map is set),
      // map through the ancestor under last_map's pg_num if the pool
      // existed there
      if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
        pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
      cur_map->pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &primary);

      if (p.same_interval_since == 0) {
        // first epoch processed for this pg: just record the baseline
        dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
                 << " first map, acting " << acting
                 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
        p.same_interval_since = cur_epoch;
        p.old_up = up;
        p.old_acting = acting;
        p.primary = primary;
        p.up_primary = up_primary;
        continue;
      }
      assert(last_map);

      boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
        pg->get_is_recoverable_predicate());
      std::stringstream debug;
      // compares the previous mapping with the current one; pg->past_intervals
      // is passed in so the check can record into it
      bool new_interval = PastIntervals::check_new_interval(
        p.primary,
        primary,
        p.old_acting, acting,
        p.up_primary,
        up_primary,
        p.old_up, up,
        p.same_interval_since,
        pg->info.history.last_epoch_clean,
        cur_map, last_map,
        pgid,
        recoverable.get(),
        &pg->past_intervals,
        &debug);
      if (new_interval) {
        dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
                 << " " << debug.str() << dendl;
        p.old_up = up;
        p.old_acting = acting;
        p.primary = primary;
        p.up_primary = up_primary;
        p.same_interval_since = cur_epoch;
      }
    }
  }

  // Now that past_intervals have been recomputed let's fix the same_interval_since
  // if it was cleared by import.
  for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
    PG *pg = i->first;
    pistate& p = i->second;

    if (pg->info.history.same_interval_since == 0) {
      assert(p.same_interval_since);
      dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
      dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
      // Fix it
      pg->info.history.same_interval_since = p.same_interval_since;
    }
  }

  // write info only at the end.  this is necessary because we check
  // whether the past_intervals go far enough back or forward in time,
  // but we don't check for holes.  we could avoid it by discarding
  // the previous past_intervals and rebuilding from scratch, or we
  // can just do this and commit all our work at the end.
  ObjectStore::Transaction t;
  int num = 0;  // pgs accumulated in the current transaction
  for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
    PG *pg = i->first;
    pg->lock();
    // mark both info flags dirty so write_if_dirty() emits them
    pg->dirty_big_info = true;
    pg->dirty_info = true;
    pg->write_if_dirty(t);
    pg->unlock();

    // don't let the transaction get too big
    if (++num >= cct->_conf->osd_target_transaction_size) {
      store->apply_transaction(service.meta_osr.get(), std::move(t));
      t = ObjectStore::Transaction();
      num = 0;
    }
  }
  // flush whatever is left over from the last partial batch
  if (!t.empty())
    store->apply_transaction(service.meta_osr.get(), std::move(t));
}
4203
4204/*
4205 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4206 * hasn't changed since the given epoch and we are the primary.
4207 */
4208int OSD::handle_pg_peering_evt(
4209 spg_t pgid,
4210 const pg_history_t& orig_history,
4211 const PastIntervals& pi,
4212 epoch_t epoch,
4213 PG::CephPeeringEvtRef evt)
4214{
4215 if (service.splitting(pgid)) {
4216 peering_wait_for_split[pgid].push_back(evt);
4217 return -EEXIST;
4218 }
4219
4220 PG *pg = _lookup_lock_pg(pgid);
4221 if (!pg) {
4222 // same primary?
4223 if (!osdmap->have_pg_pool(pgid.pool()))
4224 return -EINVAL;
4225 int up_primary, acting_primary;
4226 vector<int> up, acting;
4227 osdmap->pg_to_up_acting_osds(
4228 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4229
4230 pg_history_t history = orig_history;
4231 bool valid_history = project_pg_history(
4232 pgid, history, epoch, up, up_primary, acting, acting_primary);
4233
4234 if (!valid_history || epoch < history.same_interval_since) {
4235 dout(10) << __func__ << pgid << " acting changed in "
4236 << history.same_interval_since << " (msg from " << epoch << ")"
4237 << dendl;
4238 return -EINVAL;
4239 }
4240
4241 if (service.splitting(pgid)) {
4242 ceph_abort();
4243 }
4244
4245 // do we need to resurrect a deleting pg?
4246 spg_t resurrected;
4247 PGRef old_pg_state;
4248 res_result result = _try_resurrect_pg(
4249 service.get_osdmap(),
4250 pgid,
4251 &resurrected,
4252 &old_pg_state);
4253
4254 PG::RecoveryCtx rctx = create_context();
4255 switch (result) {
4256 case RES_NONE: {
4257 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4258 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4259 store->get_type() != "bluestore") {
4260 clog->warn() << "pg " << pgid
4261 << " is at risk of silent data corruption: "
4262 << "the pool allows ec overwrites but is not stored in "
4263 << "bluestore, so deep scrubbing will not detect bitrot";
4264 }
4265 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4266 PG::_init(*rctx.transaction, pgid, pp);
4267
4268 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4269 if (!pp->is_replicated() && role != pgid.shard)
4270 role = -1;
4271
4272 pg = _create_lock_pg(
4273 get_map(epoch),
4274 pgid, false, false,
4275 role,
4276 up, up_primary,
4277 acting, acting_primary,
4278 history, pi,
4279 *rctx.transaction);
4280 pg->handle_create(&rctx);
4281 pg->write_if_dirty(*rctx.transaction);
4282 dispatch_context(rctx, pg, osdmap);
4283
4284 dout(10) << *pg << " is new" << dendl;
4285
4286 pg->queue_peering_event(evt);
4287 wake_pg_waiters(pg);
4288 pg->unlock();
4289 return 0;
4290 }
4291 case RES_SELF: {
4292 old_pg_state->lock();
4293 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4294 int old_role = old_pg_state->role;
4295 vector<int> old_up = old_pg_state->up;
4296 int old_up_primary = old_pg_state->up_primary.osd;
4297 vector<int> old_acting = old_pg_state->acting;
4298 int old_primary = old_pg_state->primary.osd;
4299 pg_history_t old_history = old_pg_state->info.history;
4300 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4301 old_pg_state->unlock();
4302 pg = _create_lock_pg(
4303 old_osd_map,
4304 resurrected,
4305 false,
4306 true,
4307 old_role,
4308 old_up,
4309 old_up_primary,
4310 old_acting,
4311 old_primary,
4312 old_history,
4313 old_past_intervals,
4314 *rctx.transaction);
4315 pg->handle_create(&rctx);
4316 pg->write_if_dirty(*rctx.transaction);
4317 dispatch_context(rctx, pg, osdmap);
4318
4319 dout(10) << *pg << " is new (resurrected)" << dendl;
4320
4321 pg->queue_peering_event(evt);
4322 wake_pg_waiters(pg);
4323 pg->unlock();
4324 return 0;
4325 }
4326 case RES_PARENT: {
4327 assert(old_pg_state);
4328 old_pg_state->lock();
4329 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4330 int old_role = old_pg_state->role;
4331 vector<int> old_up = old_pg_state->up;
4332 int old_up_primary = old_pg_state->up_primary.osd;
4333 vector<int> old_acting = old_pg_state->acting;
4334 int old_primary = old_pg_state->primary.osd;
4335 pg_history_t old_history = old_pg_state->info.history;
4336 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4337 old_pg_state->unlock();
4338 PG *parent = _create_lock_pg(
4339 old_osd_map,
4340 resurrected,
4341 false,
4342 true,
4343 old_role,
4344 old_up,
4345 old_up_primary,
4346 old_acting,
4347 old_primary,
4348 old_history,
4349 old_past_intervals,
4350 *rctx.transaction
4351 );
4352 parent->handle_create(&rctx);
4353 parent->write_if_dirty(*rctx.transaction);
4354 dispatch_context(rctx, parent, osdmap);
4355
4356 dout(10) << *parent << " is new" << dendl;
4357
4358 assert(service.splitting(pgid));
4359 peering_wait_for_split[pgid].push_back(evt);
4360
4361 //parent->queue_peering_event(evt);
4362 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4363 wake_pg_waiters(parent);
4364 parent->unlock();
4365 return 0;
4366 }
4367 default:
4368 assert(0);
4369 return 0;
4370 }
4371 } else {
4372 // already had it. did the mapping change?
4373 if (epoch < pg->info.history.same_interval_since) {
4374 dout(10) << *pg << __func__ << " acting changed in "
4375 << pg->info.history.same_interval_since
4376 << " (msg from " << epoch << ")" << dendl;
4377 } else {
4378 pg->queue_peering_event(evt);
4379 }
4380 pg->unlock();
4381 return -EEXIST;
4382 }
4383}
4384
4385
4386void OSD::build_initial_pg_history(
4387 spg_t pgid,
4388 epoch_t created,
4389 utime_t created_stamp,
4390 pg_history_t *h,
4391 PastIntervals *pi)
4392{
4393 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4394 h->epoch_created = created;
31f18b77 4395 h->epoch_pool_created = created;
7c673cae
FG
4396 h->same_interval_since = created;
4397 h->same_up_since = created;
4398 h->same_primary_since = created;
4399 h->last_scrub_stamp = created_stamp;
4400 h->last_deep_scrub_stamp = created_stamp;
4401 h->last_clean_scrub_stamp = created_stamp;
4402
4403 OSDMapRef lastmap = service.get_map(created);
4404 int up_primary, acting_primary;
4405 vector<int> up, acting;
4406 lastmap->pg_to_up_acting_osds(
4407 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4408
4409 ostringstream debug;
4410 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4411 OSDMapRef osdmap = service.get_map(e);
4412 int new_up_primary, new_acting_primary;
4413 vector<int> new_up, new_acting;
4414 osdmap->pg_to_up_acting_osds(
4415 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4416
4417 // this is a bit imprecise, but sufficient?
4418 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4419 const pg_pool_t *pi;
4420 bool operator()(const set<pg_shard_t> &have) const {
4421 return have.size() >= pi->min_size;
4422 }
4423 min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4424 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4425
4426 bool new_interval = PastIntervals::check_new_interval(
4427 acting_primary,
4428 new_acting_primary,
4429 acting, new_acting,
4430 up_primary,
4431 new_up_primary,
4432 up, new_up,
4433 h->same_interval_since,
4434 h->last_epoch_clean,
4435 osdmap,
4436 lastmap,
4437 pgid.pgid,
4438 &min_size_predicate,
4439 pi,
4440 &debug);
4441 if (new_interval) {
4442 h->same_interval_since = e;
4443 }
4444 if (up != new_up) {
4445 h->same_up_since = e;
4446 }
4447 if (acting_primary != new_acting_primary) {
4448 h->same_primary_since = e;
4449 }
c07f9fc5
FG
4450 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4451 osdmap->get_pg_num(pgid.pgid.pool()),
4452 nullptr)) {
4453 h->last_epoch_split = e;
4454 }
7c673cae
FG
4455 lastmap = osdmap;
4456 }
4457 dout(20) << __func__ << " " << debug.str() << dendl;
4458 dout(10) << __func__ << " " << *h << " " << *pi
4459 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4460 pi->get_bounds()) << ")"
4461 << dendl;
4462}
4463
4464/**
4465 * Fill in the passed history so you know same_interval_since, same_up_since,
4466 * and same_primary_since.
4467 */
4468bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
4469 const vector<int>& currentup,
4470 int currentupprimary,
4471 const vector<int>& currentacting,
4472 int currentactingprimary)
4473{
4474 dout(15) << "project_pg_history " << pgid
4475 << " from " << from << " to " << osdmap->get_epoch()
4476 << ", start " << h
4477 << dendl;
4478
4479 epoch_t e;
4480 for (e = osdmap->get_epoch();
4481 e > from;
4482 e--) {
4483 // verify during intermediate epoch (e-1)
4484 OSDMapRef oldmap = service.try_get_map(e-1);
4485 if (!oldmap) {
4486 dout(15) << __func__ << ": found map gap, returning false" << dendl;
4487 return false;
4488 }
4489 assert(oldmap->have_pg_pool(pgid.pool()));
4490
4491 int upprimary, actingprimary;
4492 vector<int> up, acting;
4493 oldmap->pg_to_up_acting_osds(
4494 pgid.pgid,
4495 &up,
4496 &upprimary,
4497 &acting,
4498 &actingprimary);
4499
4500 // acting set change?
4501 if ((actingprimary != currentactingprimary ||
4502 upprimary != currentupprimary ||
4503 acting != currentacting ||
4504 up != currentup) && e > h.same_interval_since) {
4505 dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
4506 << " from " << acting << "/" << up
4507 << " " << actingprimary << "/" << upprimary
4508 << " -> " << currentacting << "/" << currentup
4509 << " " << currentactingprimary << "/" << currentupprimary
4510 << dendl;
4511 h.same_interval_since = e;
4512 }
4513 // split?
4514 if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
4515 osdmap->get_pg_num(pgid.pool()),
4516 0) && e > h.same_interval_since) {
4517 h.same_interval_since = e;
4518 }
4519 // up set change?
4520 if ((up != currentup || upprimary != currentupprimary)
4521 && e > h.same_up_since) {
4522 dout(15) << "project_pg_history " << pgid << " up changed in " << e
4523 << " from " << up << " " << upprimary
4524 << " -> " << currentup << " " << currentupprimary << dendl;
4525 h.same_up_since = e;
4526 }
4527
4528 // primary change?
4529 if (OSDMap::primary_changed(
4530 actingprimary,
4531 acting,
4532 currentactingprimary,
4533 currentacting) &&
4534 e > h.same_primary_since) {
4535 dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
4536 h.same_primary_since = e;
4537 }
4538
4539 if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
4540 break;
4541 }
4542
31f18b77 4543 // base case: these floors should be the pg creation epoch if we didn't
7c673cae
FG
4544 // find any changes.
4545 if (e == h.epoch_created) {
4546 if (!h.same_interval_since)
4547 h.same_interval_since = e;
4548 if (!h.same_up_since)
4549 h.same_up_since = e;
4550 if (!h.same_primary_since)
4551 h.same_primary_since = e;
4552 }
4553
4554 dout(15) << "project_pg_history end " << h << dendl;
4555 return true;
4556}
4557
4558
4559
4560void OSD::_add_heartbeat_peer(int p)
4561{
4562 if (p == whoami)
4563 return;
4564 HeartbeatInfo *hi;
4565
4566 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4567 if (i == heartbeat_peers.end()) {
4568 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4569 if (!cons.first)
4570 return;
4571 hi = &heartbeat_peers[p];
4572 hi->peer = p;
4573 HeartbeatSession *s = new HeartbeatSession(p);
4574 hi->con_back = cons.first.get();
4575 hi->con_back->set_priv(s->get());
4576 if (cons.second) {
4577 hi->con_front = cons.second.get();
4578 hi->con_front->set_priv(s->get());
4579 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4580 << " " << hi->con_back->get_peer_addr()
4581 << " " << hi->con_front->get_peer_addr()
4582 << dendl;
4583 } else {
4584 hi->con_front.reset(NULL);
4585 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4586 << " " << hi->con_back->get_peer_addr()
4587 << dendl;
4588 }
4589 s->put();
4590 } else {
4591 hi = &i->second;
4592 }
4593 hi->epoch = osdmap->get_epoch();
4594}
4595
4596void OSD::_remove_heartbeat_peer(int n)
4597{
4598 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4599 assert(q != heartbeat_peers.end());
4600 dout(20) << " removing heartbeat peer osd." << n
4601 << " " << q->second.con_back->get_peer_addr()
4602 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4603 << dendl;
4604 q->second.con_back->mark_down();
4605 if (q->second.con_front) {
4606 q->second.con_front->mark_down();
4607 }
4608 heartbeat_peers.erase(q);
4609}
4610
4611void OSD::need_heartbeat_peer_update()
4612{
4613 if (is_stopping())
4614 return;
4615 dout(20) << "need_heartbeat_peer_update" << dendl;
4616 heartbeat_set_peers_need_update();
4617}
4618
4619void OSD::maybe_update_heartbeat_peers()
4620{
4621 assert(osd_lock.is_locked());
4622
4623 if (is_waiting_for_healthy()) {
4624 utime_t now = ceph_clock_now();
4625 if (last_heartbeat_resample == utime_t()) {
4626 last_heartbeat_resample = now;
4627 heartbeat_set_peers_need_update();
4628 } else if (!heartbeat_peers_need_update()) {
4629 utime_t dur = now - last_heartbeat_resample;
4630 if (dur > cct->_conf->osd_heartbeat_grace) {
4631 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4632 heartbeat_set_peers_need_update();
4633 last_heartbeat_resample = now;
4634 reset_heartbeat_peers(); // we want *new* peers!
4635 }
4636 }
4637 }
4638
4639 if (!heartbeat_peers_need_update())
4640 return;
4641 heartbeat_clear_peers_need_update();
4642
4643 Mutex::Locker l(heartbeat_lock);
4644
4645 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
4646
4647
4648 // build heartbeat from set
4649 if (is_active()) {
4650 RWLock::RLocker l(pg_map_lock);
4651 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4652 i != pg_map.end();
4653 ++i) {
4654 PG *pg = i->second;
4655 pg->heartbeat_peer_lock.Lock();
4656 dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
4657 for (set<int>::iterator p = pg->heartbeat_peers.begin();
4658 p != pg->heartbeat_peers.end();
4659 ++p)
4660 if (osdmap->is_up(*p))
4661 _add_heartbeat_peer(*p);
4662 for (set<int>::iterator p = pg->probe_targets.begin();
4663 p != pg->probe_targets.end();
4664 ++p)
4665 if (osdmap->is_up(*p))
4666 _add_heartbeat_peer(*p);
4667 pg->heartbeat_peer_lock.Unlock();
4668 }
4669 }
4670
4671 // include next and previous up osds to ensure we have a fully-connected set
4672 set<int> want, extras;
4673 int next = osdmap->get_next_up_osd_after(whoami);
4674 if (next >= 0)
4675 want.insert(next);
4676 int prev = osdmap->get_previous_up_osd_before(whoami);
4677 if (prev >= 0 && prev != next)
4678 want.insert(prev);
4679
4680 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4681 dout(10) << " adding neighbor peer osd." << *p << dendl;
4682 extras.insert(*p);
4683 _add_heartbeat_peer(*p);
4684 }
4685
4686 // remove down peers; enumerate extras
4687 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4688 while (p != heartbeat_peers.end()) {
4689 if (!osdmap->is_up(p->first)) {
4690 int o = p->first;
4691 ++p;
4692 _remove_heartbeat_peer(o);
4693 continue;
4694 }
4695 if (p->second.epoch < osdmap->get_epoch()) {
4696 extras.insert(p->first);
4697 }
4698 ++p;
4699 }
4700
4701 // too few?
4702 int start = osdmap->get_next_up_osd_after(whoami);
4703 for (int n = start; n >= 0; ) {
4704 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4705 break;
4706 if (!extras.count(n) && !want.count(n) && n != whoami) {
4707 dout(10) << " adding random peer osd." << n << dendl;
4708 extras.insert(n);
4709 _add_heartbeat_peer(n);
4710 }
4711 n = osdmap->get_next_up_osd_after(n);
4712 if (n == start)
4713 break; // came full circle; stop
4714 }
4715
4716 // too many?
4717 for (set<int>::iterator p = extras.begin();
4718 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4719 ++p) {
4720 if (want.count(*p))
4721 continue;
4722 _remove_heartbeat_peer(*p);
4723 }
4724
4725 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
4726}
4727
4728void OSD::reset_heartbeat_peers()
4729{
4730 assert(osd_lock.is_locked());
4731 dout(10) << "reset_heartbeat_peers" << dendl;
4732 Mutex::Locker l(heartbeat_lock);
4733 while (!heartbeat_peers.empty()) {
4734 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4735 hi.con_back->mark_down();
4736 if (hi.con_front) {
4737 hi.con_front->mark_down();
4738 }
4739 heartbeat_peers.erase(heartbeat_peers.begin());
4740 }
4741 failure_queue.clear();
4742}
4743
/*
 * Handle a heartbeat message from another osd.
 *
 * Messages with a foreign cluster fsid are dropped.  Otherwise, under
 * heartbeat_lock:
 *  - PING:       reply with PING_REPLY (unless debug drop settings or an
 *                unhealthy internal heartbeat say otherwise), note the
 *                sender's epoch if it is up, or tell it YOU_DIED if it
 *                does not exist or was marked down after its map epoch;
 *  - PING_REPLY: record the receive stamp for the matching connection and
 *                cancel queued / in-flight failure reports if the peer is
 *                healthy again;
 *  - YOU_DIED:   subscribe for the next osdmap.
 * The message reference is always released with m->put() before return.
 */
void OSD::handle_osd_ping(MOSDPing *m)
{
  if (superblock.cluster_fsid != m->fsid) {
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
             << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  // heartbeat_lock is taken manually; every exit path below must unlock it
  heartbeat_lock.Lock();
  if (is_stopping()) {
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug knob: optionally drop a run of pings from this sender
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
        auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
        if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
          if (heartbeat_drop->second == 0) {
            // run finished; fall through and handle this ping normally
            debug_heartbeat_drops_remaining.erase(heartbeat_drop);
          } else {
            --heartbeat_drop->second;
            dout(5) << "Dropping heartbeat from " << from
                    << ", " << heartbeat_drop->second
                    << " remaining to drop" << dendl;
            break;
          }
        } else if (cct->_conf->osd_debug_drop_ping_probability >
                   ((((double)(rand()%100))/100.0))) {
          // start a new run of drops for this sender
          heartbeat_drop =
            debug_heartbeat_drops_remaining.insert(std::make_pair(from,
                             cct->_conf->osd_debug_drop_ping_duration)).first;
          dout(5) << "Dropping heartbeat from " << from
                  << ", " << heartbeat_drop->second
                  << " remaining to drop" << dendl;
          break;
        }
      }

      if (!cct->get_heartbeat_map()->is_healthy()) {
        dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
        break;
      }

      // echo the sender's stamp back in the reply
      Message *r = new MOSDPing(monc->get_fsid(),
                                curmap->get_epoch(),
                                MOSDPing::PING_REPLY, m->stamp,
                                cct->_conf->osd_heartbeat_min_size);
      m->get_connection()->send_message(r);

      if (curmap->is_up(from)) {
        service.note_peer_epoch(from, m->map_epoch);
        if (is_active()) {
          ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
          if (con) {
            service.share_map_peer(from, con.get());
          }
        }
      } else if (!curmap->exists(from) ||
                 curmap->get_down_at(from) > m->map_epoch) {
        // tell them they have died
        Message *r = new MOSDPing(monc->get_fsid(),
                                  curmap->get_epoch(),
                                  MOSDPing::YOU_DIED,
                                  m->stamp,
                                  cct->_conf->osd_heartbeat_min_size);
        m->get_connection()->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
        // record the stamp against whichever connection the reply came in on
        if (m->get_connection() == i->second.con_back) {
          dout(25) << "handle_osd_ping got reply from osd." << from
                   << " first_tx " << i->second.first_tx
                   << " last_tx " << i->second.last_tx
                   << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
                   << " last_rx_front " << i->second.last_rx_front
                   << dendl;
          i->second.last_rx_back = m->stamp;
          // if there is no front con, set both stamps.
          if (i->second.con_front == NULL)
            i->second.last_rx_front = m->stamp;
        } else if (m->get_connection() == i->second.con_front) {
          dout(25) << "handle_osd_ping got reply from osd." << from
                   << " first_tx " << i->second.first_tx
                   << " last_tx " << i->second.last_tx
                   << " last_rx_back " << i->second.last_rx_back
                   << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
                   << dendl;
          i->second.last_rx_front = m->stamp;
        }

        // cutoff is now minus the heartbeat grace
        utime_t cutoff = ceph_clock_now();
        cutoff -= cct->_conf->osd_heartbeat_grace;
        if (i->second.is_healthy(cutoff)) {
          // Cancel false reports
          auto failure_queue_entry = failure_queue.find(from);
          if (failure_queue_entry != failure_queue.end()) {
            dout(10) << "handle_osd_ping canceling queued "
                     << "failure report for osd." << from << dendl;
            failure_queue.erase(failure_queue_entry);
          }

          auto failure_pending_entry = failure_pending.find(from);
          if (failure_pending_entry != failure_pending.end()) {
            dout(10) << "handle_osd_ping canceling in-flight "
                     << "failure report for osd." << from << dendl;
            send_still_alive(curmap->get_epoch(),
                             failure_pending_entry->second.second);
            failure_pending.erase(failure_pending_entry);
          }
        }
      }

      // this runs whether or not the sender is a known heartbeat peer
      if (m->map_epoch &&
          curmap->is_up(from)) {
        service.note_peer_epoch(from, m->map_epoch);
        if (is_active()) {
          ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
          if (con) {
            service.share_map_peer(from, con.get());
          }
        }
      }
    }
    break;

  case MOSDPing::YOU_DIED:
    dout(10) << "handle_osd_ping " << m->get_source_inst()
             << " says i am down in " << m->map_epoch << dendl;
    // ask for maps newer than the one we currently have
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.Unlock();
  m->put();
}
4899
4900void OSD::heartbeat_entry()
4901{
4902 Mutex::Locker l(heartbeat_lock);
4903 if (is_stopping())
4904 return;
4905 while (!heartbeat_stop) {
4906 heartbeat();
4907
4908 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
4909 utime_t w;
4910 w.set_from_double(wait);
4911 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
4912 heartbeat_cond.WaitInterval(heartbeat_lock, w);
4913 if (is_stopping())
4914 return;
4915 dout(30) << "heartbeat_entry woke up" << dendl;
4916 }
4917}
4918
4919void OSD::heartbeat_check()
4920{
4921 assert(heartbeat_lock.is_locked());
4922 utime_t now = ceph_clock_now();
4923
4924 // check for heartbeat replies (move me elsewhere?)
4925 utime_t cutoff = now;
4926 cutoff -= cct->_conf->osd_heartbeat_grace;
4927 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4928 p != heartbeat_peers.end();
4929 ++p) {
4930
4931 if (p->second.first_tx == utime_t()) {
4932 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
4933 << "yet, skipping" << dendl;
4934 continue;
4935 }
4936
4937 dout(25) << "heartbeat_check osd." << p->first
4938 << " first_tx " << p->second.first_tx
4939 << " last_tx " << p->second.last_tx
4940 << " last_rx_back " << p->second.last_rx_back
4941 << " last_rx_front " << p->second.last_rx_front
4942 << dendl;
4943 if (p->second.is_unhealthy(cutoff)) {
4944 if (p->second.last_rx_back == utime_t() ||
4945 p->second.last_rx_front == utime_t()) {
4946 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
4947 << " osd." << p->first << " ever on either front or back, first ping sent "
4948 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
4949 // fail
4950 failure_queue[p->first] = p->second.last_tx;
4951 } else {
4952 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
4953 << " osd." << p->first << " since back " << p->second.last_rx_back
4954 << " front " << p->second.last_rx_front
4955 << " (cutoff " << cutoff << ")" << dendl;
4956 // fail
4957 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
4958 }
4959 }
4960 }
4961}
4962
4963void OSD::heartbeat()
4964{
4965 dout(30) << "heartbeat" << dendl;
4966
4967 // get CPU load avg
4968 double loadavgs[1];
4969 int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
4970 if (getloadavg(loadavgs, 1) == 1) {
4971 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
4972 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
4973 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
4974 }
4975
4976 dout(30) << "heartbeat checking stats" << dendl;
4977
4978 // refresh stats?
4979 vector<int> hb_peers;
4980 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4981 p != heartbeat_peers.end();
4982 ++p)
4983 hb_peers.push_back(p->first);
4984 service.update_osd_stat(hb_peers);
4985
4986 dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;
4987
4988 utime_t now = ceph_clock_now();
4989
4990 // send heartbeats
4991 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
4992 i != heartbeat_peers.end();
4993 ++i) {
4994 int peer = i->first;
4995 i->second.last_tx = now;
4996 if (i->second.first_tx == utime_t())
4997 i->second.first_tx = now;
4998 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
4999 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5000 service.get_osdmap()->get_epoch(),
31f18b77
FG
5001 MOSDPing::PING, now,
5002 cct->_conf->osd_heartbeat_min_size));
7c673cae
FG
5003
5004 if (i->second.con_front)
5005 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5006 service.get_osdmap()->get_epoch(),
31f18b77
FG
5007 MOSDPing::PING, now,
5008 cct->_conf->osd_heartbeat_min_size));
7c673cae
FG
5009 }
5010
5011 logger->set(l_osd_hb_to, heartbeat_peers.size());
5012
5013 // hmm.. am i all alone?
5014 dout(30) << "heartbeat lonely?" << dendl;
5015 if (heartbeat_peers.empty()) {
5016 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5017 last_mon_heartbeat = now;
5018 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5019 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5020 }
5021 }
5022
5023 dout(30) << "heartbeat done" << dendl;
5024}
5025
5026bool OSD::heartbeat_reset(Connection *con)
5027{
5028 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
5029 if (s) {
5030 heartbeat_lock.Lock();
5031 if (is_stopping()) {
5032 heartbeat_lock.Unlock();
5033 s->put();
5034 return true;
5035 }
5036 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
5037 if (p != heartbeat_peers.end() &&
5038 (p->second.con_back == con ||
5039 p->second.con_front == con)) {
5040 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5041 << ", reopening" << dendl;
5042 if (con != p->second.con_back) {
5043 p->second.con_back->mark_down();
5044 }
5045 p->second.con_back.reset(NULL);
5046 if (p->second.con_front && con != p->second.con_front) {
5047 p->second.con_front->mark_down();
5048 }
5049 p->second.con_front.reset(NULL);
5050 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5051 if (newcon.first) {
5052 p->second.con_back = newcon.first.get();
5053 p->second.con_back->set_priv(s->get());
5054 if (newcon.second) {
5055 p->second.con_front = newcon.second.get();
5056 p->second.con_front->set_priv(s->get());
5057 }
5058 } else {
5059 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5060 << ", raced with osdmap update, closing out peer" << dendl;
5061 heartbeat_peers.erase(p);
5062 }
5063 } else {
5064 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5065 }
5066 heartbeat_lock.Unlock();
5067 s->put();
5068 }
5069 return true;
5070}
5071
5072
5073
5074// =========================================
5075
5076void OSD::tick()
5077{
5078 assert(osd_lock.is_locked());
5079 dout(10) << "tick" << dendl;
5080
5081 if (is_active() || is_waiting_for_healthy()) {
5082 maybe_update_heartbeat_peers();
5083 }
5084
5085 if (is_waiting_for_healthy()) {
5086 start_boot();
224ce89b
WB
5087 } else if (is_preboot() &&
5088 waiting_for_luminous_mons &&
5089 monc->monmap.get_required_features().contains_all(
5090 ceph::features::mon::FEATURE_LUMINOUS)) {
5091 // mon upgrade finished!
5092 start_boot();
7c673cae
FG
5093 }
5094
5095 do_waiters();
5096
5097 tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
7c673cae
FG
5098}
5099
5100void OSD::tick_without_osd_lock()
5101{
5102 assert(tick_timer_lock.is_locked());
5103 dout(10) << "tick_without_osd_lock" << dendl;
5104
5105 logger->set(l_osd_buf, buffer::get_total_alloc());
5106 logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5107 logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
5108 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5109 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5110 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5111
5112 // osd_lock is not being held, which means the OSD state
5113 // might change when doing the monitor report
5114 if (is_active() || is_waiting_for_healthy()) {
5115 heartbeat_lock.Lock();
5116 heartbeat_check();
5117 heartbeat_lock.Unlock();
5118
5119 map_lock.get_read();
5120 Mutex::Locker l(mon_report_lock);
5121
5122 // mon report?
5123 bool reset = false;
5124 bool report = false;
5125 utime_t now = ceph_clock_now();
5126 pg_stat_queue_lock.Lock();
5127 double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
5128 double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
5129 // note: we shouldn't adjust max because it must remain < the
5130 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5131 // value).
5132 double max = cct->_conf->osd_mon_report_interval_max;
5133 if (!outstanding_pg_stats.empty() &&
5134 (now - stats_ack_timeout) > last_pg_stats_ack) {
5135 dout(1) << __func__ << " mon hasn't acked PGStats in "
5136 << now - last_pg_stats_ack
5137 << " seconds, reconnecting elsewhere" << dendl;
5138 reset = true;
5139 last_pg_stats_ack = now; // reset clock
5140 last_pg_stats_sent = utime_t();
5141 stats_ack_timeout =
5142 MAX(cct->_conf->osd_mon_ack_timeout,
5143 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
5144 outstanding_pg_stats.clear();
5145 }
5146 if (now - last_pg_stats_sent > max) {
5147 osd_stat_updated = true;
5148 report = true;
5149 } else if (service.need_fullness_update()) {
5150 report = true;
5151 } else if ((int)outstanding_pg_stats.size() >=
5152 cct->_conf->osd_mon_report_max_in_flight) {
5153 dout(20) << __func__ << " have max " << outstanding_pg_stats
5154 << " stats updates in flight" << dendl;
5155 } else {
5156 if (now - last_mon_report > adjusted_min) {
5157 dout(20) << __func__ << " stats backoff " << backoff
5158 << " adjusted_min " << adjusted_min << " - sending report"
5159 << dendl;
5160 osd_stat_updated = true;
5161 report = true;
5162 }
5163 }
5164 pg_stat_queue_lock.Unlock();
5165
5166 if (reset) {
5167 monc->reopen_session();
5168 } else if (report) {
5169 last_mon_report = now;
5170
5171 // do any pending reports
5172 send_full_update();
5173 send_failures();
31f18b77
FG
5174 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5175 send_pg_stats(now);
5176 }
7c673cae
FG
5177 }
5178 map_lock.put_read();
5179 }
5180
5181 if (is_active()) {
5182 if (!scrub_random_backoff()) {
5183 sched_scrub();
5184 }
5185 service.promote_throttle_recalibrate();
224ce89b
WB
5186 bool need_send_beacon = false;
5187 const auto now = ceph::coarse_mono_clock::now();
5188 {
5189 // borrow lec lock to pretect last_sent_beacon from changing
5190 Mutex::Locker l{min_last_epoch_clean_lock};
5191 const auto elapsed = now - last_sent_beacon;
5192 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5193 cct->_conf->osd_beacon_report_interval) {
5194 need_send_beacon = true;
5195 }
5196 }
5197 if (need_send_beacon) {
5198 send_beacon(now);
5199 }
7c673cae
FG
5200 }
5201
5202 check_ops_in_flight();
5203 service.kick_recovery_queue();
5204 tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
5205}
5206
5207void OSD::check_ops_in_flight()
5208{
5209 vector<string> warnings;
5210 if (op_tracker.check_ops_in_flight(warnings)) {
5211 for (vector<string>::iterator i = warnings.begin();
5212 i != warnings.end();
5213 ++i) {
5214 clog->warn() << *i;
5215 }
5216 }
5217}
5218
5219// Usage:
5220// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5221// rmomapkey <pool-id> [namespace/]<obj-name> <key>
5222// setomapheader <pool-id> [namespace/]<obj-name> <header>
5223// getomap <pool> [namespace/]<obj-name>
5224// truncobj <pool-id> [namespace/]<obj-name> <newlen>
5225// injectmdataerr [namespace/]<obj-name> [shardid]
5226// injectdataerr [namespace/]<obj-name> [shardid]
5227//
5228// set_recovery_delay [utime]
5229void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5230 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5231{
5232 //Test support
5233 //Support changing the omap on a single osd by using the Admin Socket to
5234 //directly request the osd make a change.
5235 if (command == "setomapval" || command == "rmomapkey" ||
5236 command == "setomapheader" || command == "getomap" ||
5237 command == "truncobj" || command == "injectmdataerr" ||
5238 command == "injectdataerr"
5239 ) {
5240 pg_t rawpg;
5241 int64_t pool;
5242 OSDMapRef curmap = service->get_osdmap();
5243 int r = -1;
5244
5245 string poolstr;
5246
5247 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5248 pool = curmap->lookup_pg_pool_name(poolstr);
5249 //If we can't find it by name then maybe id specified
5250 if (pool < 0 && isdigit(poolstr[0]))
5251 pool = atoll(poolstr.c_str());
5252 if (pool < 0) {
5253 ss << "Invalid pool" << poolstr;
5254 return;
5255 }
5256
5257 string objname, nspace;
5258 cmd_getval(service->cct, cmdmap, "objname", objname);
5259 std::size_t found = objname.find_first_of('/');
5260 if (found != string::npos) {
5261 nspace = objname.substr(0, found);
5262 objname = objname.substr(found+1);
5263 }
5264 object_locator_t oloc(pool, nspace);
5265 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5266
5267 if (r < 0) {
5268 ss << "Invalid namespace/objname";
5269 return;
5270 }
5271
5272 int64_t shardid;
5273 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5274 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5275 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5276 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5277 if (curmap->pg_is_ec(rawpg)) {
5278 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5279 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5280 return;
5281 }
5282 }
5283
5284 ObjectStore::Transaction t;
5285
5286 if (command == "setomapval") {
5287 map<string, bufferlist> newattrs;
5288 bufferlist val;
5289 string key, valstr;
5290 cmd_getval(service->cct, cmdmap, "key", key);
5291 cmd_getval(service->cct, cmdmap, "val", valstr);
5292
5293 val.append(valstr);
5294 newattrs[key] = val;
5295 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5296 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5297 if (r < 0)
5298 ss << "error=" << r;
5299 else
5300 ss << "ok";
5301 } else if (command == "rmomapkey") {
5302 string key;
5303 set<string> keys;
5304 cmd_getval(service->cct, cmdmap, "key", key);
5305
5306 keys.insert(key);
5307 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5308 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5309 if (r < 0)
5310 ss << "error=" << r;
5311 else
5312 ss << "ok";
5313 } else if (command == "setomapheader") {
5314 bufferlist newheader;
5315 string headerstr;
5316
5317 cmd_getval(service->cct, cmdmap, "header", headerstr);
5318 newheader.append(headerstr);
5319 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5320 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5321 if (r < 0)
5322 ss << "error=" << r;
5323 else
5324 ss << "ok";
5325 } else if (command == "getomap") {
5326 //Debug: Output entire omap
5327 bufferlist hdrbl;
5328 map<string, bufferlist> keyvals;
5329 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5330 if (r >= 0) {
5331 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5332 for (map<string, bufferlist>::iterator it = keyvals.begin();
5333 it != keyvals.end(); ++it)
5334 ss << " key=" << (*it).first << " val="
5335 << string((*it).second.c_str(), (*it).second.length());
5336 } else {
5337 ss << "error=" << r;
5338 }
5339 } else if (command == "truncobj") {
5340 int64_t trunclen;
5341 cmd_getval(service->cct, cmdmap, "len", trunclen);
5342 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5343 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5344 if (r < 0)
5345 ss << "error=" << r;
5346 else
5347 ss << "ok";
5348 } else if (command == "injectdataerr") {
5349 store->inject_data_error(gobj);
5350 ss << "ok";
5351 } else if (command == "injectmdataerr") {
5352 store->inject_mdata_error(gobj);
5353 ss << "ok";
5354 }
5355 return;
5356 }
5357 if (command == "set_recovery_delay") {
5358 int64_t delay;
5359 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5360 ostringstream oss;
5361 oss << delay;
5362 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5363 oss.str().c_str());
5364 if (r != 0) {
5365 ss << "set_recovery_delay: error setting "
5366 << "osd_recovery_delay_start to '" << delay << "': error "
5367 << r;
5368 return;
5369 }
5370 service->cct->_conf->apply_changes(NULL);
5371 ss << "set_recovery_delay: set osd_recovery_delay_start "
5372 << "to " << service->cct->_conf->osd_recovery_delay_start;
5373 return;
5374 }
5375 if (command == "trigger_scrub") {
5376 spg_t pgid;
5377 OSDMapRef curmap = service->get_osdmap();
5378
5379 string pgidstr;
5380
5381 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5382 if (!pgid.parse(pgidstr.c_str())) {
5383 ss << "Invalid pgid specified";
5384 return;
5385 }
5386
5387 PG *pg = service->osd->_lookup_lock_pg(pgid);
5388 if (pg == nullptr) {
5389 ss << "Can't find pg " << pgid;
5390 return;
5391 }
5392
5393 if (pg->is_primary()) {
5394 pg->unreg_next_scrub();
5395 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5396 double pool_scrub_max_interval = 0;
5397 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5398 double scrub_max_interval = pool_scrub_max_interval > 0 ?
5399 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5400 // Instead of marking must_scrub force a schedule scrub
5401 utime_t stamp = ceph_clock_now();
5402 stamp -= scrub_max_interval;
5403 stamp -= 100.0; // push back last scrub more for good measure
5404 pg->info.history.last_scrub_stamp = stamp;
5405 pg->reg_next_scrub();
5406 ss << "ok";
5407 } else {
5408 ss << "Not primary";
5409 }
5410 pg->unlock();
5411 return;
5412 }
5413 if (command == "injectfull") {
5414 int64_t count;
5415 string type;
5416 OSDService::s_names state;
5417 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5418 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5419 if (type == "none" || count == 0) {
5420 type = "none";
5421 count = 0;
5422 }
5423 state = service->get_full_state(type);
5424 if (state == OSDService::s_names::INVALID) {
5425 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5426 return;
5427 }
5428 service->set_injectfull(state, count);
5429 return;
5430 }
5431 ss << "Internal error - command=" << command;
5432}
5433
5434// =========================================
5435bool remove_dir(
5436 CephContext *cct,
5437 ObjectStore *store, SnapMapper *mapper,
5438 OSDriver *osdriver,
5439 ObjectStore::Sequencer *osr,
5440 coll_t coll, DeletingStateRef dstate,
5441 bool *finished,
5442 ThreadPool::TPHandle &handle)
5443{
5444 vector<ghobject_t> olist;
5445 int64_t num = 0;
5446 ObjectStore::Transaction t;
5447 ghobject_t next;
5448 handle.reset_tp_timeout();
5449 store->collection_list(
5450 coll,
5451 next,
5452 ghobject_t::get_max(),
5453 store->get_ideal_list_max(),
5454 &olist,
5455 &next);
5456 generic_dout(10) << __func__ << " " << olist << dendl;
5457 // default cont to true, this is safe because caller(OSD::RemoveWQ::_process())
5458 // will recheck the answer before it really goes on.
5459 bool cont = true;
5460 for (vector<ghobject_t>::iterator i = olist.begin();
5461 i != olist.end();
5462 ++i) {
5463 if (i->is_pgmeta())
5464 continue;
5465 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5466 int r = mapper->remove_oid(i->hobj, &_t);
5467 if (r != 0 && r != -ENOENT) {
5468 ceph_abort();
5469 }
5470 t.remove(coll, *i);
5471 if (++num >= cct->_conf->osd_target_transaction_size) {
5472 C_SaferCond waiter;
5473 store->queue_transaction(osr, std::move(t), &waiter);
5474 cont = dstate->pause_clearing();
5475 handle.suspend_tp_timeout();
5476 waiter.wait();
5477 handle.reset_tp_timeout();
5478 if (cont)
5479 cont = dstate->resume_clearing();
5480 if (!cont)
5481 return false;
5482 t = ObjectStore::Transaction();
5483 num = 0;
5484 }
5485 }
5486 if (num) {
5487 C_SaferCond waiter;
5488 store->queue_transaction(osr, std::move(t), &waiter);
5489 cont = dstate->pause_clearing();
5490 handle.suspend_tp_timeout();
5491 waiter.wait();
5492 handle.reset_tp_timeout();
5493 if (cont)
5494 cont = dstate->resume_clearing();
5495 }
5496 // whether there are more objects to remove in the collection
5497 *finished = next.is_max();
5498 return cont;
5499}
5500
5501void OSD::RemoveWQ::_process(
5502 pair<PGRef, DeletingStateRef> item,
5503 ThreadPool::TPHandle &handle)
5504{
5505 FUNCTRACE();
5506 PGRef pg(item.first);
5507 SnapMapper &mapper = pg->snap_mapper;
5508 OSDriver &driver = pg->osdriver;
5509 coll_t coll = coll_t(pg->info.pgid);
5510 pg->osr->flush();
5511 bool finished = false;
5512
5513 if (!item.second->start_or_resume_clearing())
5514 return;
5515
5516 bool cont = remove_dir(
5517 pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
5518 &finished, handle);
5519 if (!cont)
5520 return;
5521 if (!finished) {
5522 if (item.second->pause_clearing())
5523 queue_front(item);
5524 return;
5525 }
5526
5527 if (!item.second->start_deleting())
5528 return;
5529
5530 ObjectStore::Transaction t;
5531 PGLog::clear_info_log(pg->info.pgid, &t);
5532
5533 if (cct->_conf->osd_inject_failure_on_pg_removal) {
5534 generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
5535 _exit(1);
5536 }
5537 t.remove_collection(coll);
5538
5539 // We need the sequencer to stick around until the op is complete
5540 store->queue_transaction(
5541 pg->osr.get(),
5542 std::move(t),
5543 0, // onapplied
5544 0, // oncommit
5545 0, // onreadable sync
5546 new ContainerContext<PGRef>(pg),
5547 TrackedOpRef());
5548
5549 item.second->finish_deleting();
5550}
5551// =========================================
5552
5553void OSD::ms_handle_connect(Connection *con)
5554{
5555 dout(10) << __func__ << " con " << con << dendl;
5556 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
5557 Mutex::Locker l(osd_lock);
5558 if (is_stopping())
5559 return;
5560 dout(10) << __func__ << " on mon" << dendl;
5561
5562 if (is_preboot()) {
5563 start_boot();
5564 } else if (is_booting()) {
5565 _send_boot(); // resend boot message
5566 } else {
5567 map_lock.get_read();
5568 Mutex::Locker l2(mon_report_lock);
5569
5570 utime_t now = ceph_clock_now();
5571 last_mon_report = now;
5572
5573 // resend everything, it's a new session
5574 send_full_update();
5575 send_alive();
5576 service.requeue_pg_temp();
5577 service.send_pg_temp();
5578 requeue_failures();
5579 send_failures();
31f18b77
FG
5580 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5581 send_pg_stats(now);
5582 }
7c673cae
FG
5583
5584 map_lock.put_read();
5585 if (is_active()) {
5586 send_beacon(ceph::coarse_mono_clock::now());
5587 }
5588 }
5589
5590 // full map requests may happen while active or pre-boot
5591 if (requested_full_first) {
5592 rerequest_full_maps();
5593 }
5594 }
5595}
5596
5597void OSD::ms_handle_fast_connect(Connection *con)
5598{
5599 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5600 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5601 Session *s = static_cast<Session*>(con->get_priv());
5602 if (!s) {
5603 s = new Session(cct);
5604 con->set_priv(s->get());
5605 s->con = con;
5606 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5607 << " addr=" << s->con->get_peer_addr() << dendl;
5608 // we don't connect to clients
5609 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5610 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5611 }
5612 s->put();
5613 }
5614}
5615
5616void OSD::ms_handle_fast_accept(Connection *con)
5617{
5618 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5619 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5620 Session *s = static_cast<Session*>(con->get_priv());
5621 if (!s) {
5622 s = new Session(cct);
5623 con->set_priv(s->get());
5624 s->con = con;
5625 dout(10) << "new session (incoming)" << s << " con=" << con
5626 << " addr=" << con->get_peer_addr()
5627 << " must have raced with connect" << dendl;
5628 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5629 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5630 }
5631 s->put();
5632 }
5633}
5634
5635bool OSD::ms_handle_reset(Connection *con)
5636{
5637 Session *session = static_cast<Session*>(con->get_priv());
5638 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5639 if (!session)
5640 return false;
5641 session->wstate.reset(con);
5642 session->con.reset(NULL); // break con <-> session ref cycle
5643 // note that we break session->con *before* the session_handle_reset
5644 // cleanup below. this avoids a race between us and
5645 // PG::add_backoff, Session::check_backoff, etc.
5646 session_handle_reset(session);
5647 session->put();
5648 return true;
5649}
5650
5651bool OSD::ms_handle_refused(Connection *con)
5652{
5653 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5654 return false;
5655
5656 Session *session = static_cast<Session*>(con->get_priv());
5657 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5658 if (!session)
5659 return false;
5660 int type = con->get_peer_type();
5661 // handle only OSD failures here
5662 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5663 OSDMapRef osdmap = get_osdmap();
5664 if (osdmap) {
5665 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5666 if (id >= 0 && osdmap->is_up(id)) {
5667 // I'm cheating mon heartbeat grace logic, because we know it's not going
5668 // to respawn alone. +1 so we won't hit any boundary case.
5669 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5670 osdmap->get_inst(id),
5671 cct->_conf->osd_heartbeat_grace + 1,
5672 osdmap->get_epoch(),
5673 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5674 ));
5675 }
5676 }
5677 }
5678 session->put();
5679 return true;
5680}
5681
5682struct C_OSD_GetVersion : public Context {
5683 OSD *osd;
5684 uint64_t oldest, newest;
5685 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5686 void finish(int r) override {
5687 if (r >= 0)
5688 osd->_got_mon_epochs(oldest, newest);
5689 }
5690};
5691
5692void OSD::start_boot()
5693{
5694 if (!_is_healthy()) {
5695 // if we are not healthy, do not mark ourselves up (yet)
5696 dout(1) << "not healthy; waiting to boot" << dendl;
5697 if (!is_waiting_for_healthy())
5698 start_waiting_for_healthy();
5699 // send pings sooner rather than later
5700 heartbeat_kick();
5701 return;
5702 }
5703 dout(1) << __func__ << dendl;
5704 set_state(STATE_PREBOOT);
224ce89b 5705 waiting_for_luminous_mons = false;
7c673cae
FG
5706 dout(10) << "start_boot - have maps " << superblock.oldest_map
5707 << ".." << superblock.newest_map << dendl;
5708 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5709 monc->get_version("osdmap", &c->newest, &c->oldest, c);
5710}
5711
5712void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5713{
5714 Mutex::Locker l(osd_lock);
5715 if (is_preboot()) {
5716 _preboot(oldest, newest);
5717 }
5718}
5719
5720void OSD::_preboot(epoch_t oldest, epoch_t newest)
5721{
5722 assert(is_preboot());
5723 dout(10) << __func__ << " _preboot mon has osdmaps "
5724 << oldest << ".." << newest << dendl;
5725
5726 // ensure our local fullness awareness is accurate
5727 heartbeat();
5728
5729 // if our map within recent history, try to add ourselves to the osdmap.
31f18b77
FG
5730 if (osdmap->get_epoch() == 0) {
5731 derr << "waiting for initial osdmap" << dendl;
c07f9fc5
FG
5732 } else if (osdmap->is_destroyed(whoami)) {
5733 derr << "osdmap says I am destroyed, exiting" << dendl;
5734 exit(0);
31f18b77 5735 } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
7c673cae
FG
5736 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5737 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5738 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5739 << dendl;
31f18b77 5740 } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
7c673cae
FG
5741 derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
5742 << dendl;
5743 } else if (!monc->monmap.get_required_features().contains_all(
5744 ceph::features::mon::FEATURE_LUMINOUS)) {
5745 derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5746 << "Luminous or later before Luminous OSDs will boot" << dendl;
224ce89b 5747 waiting_for_luminous_mons = true;
7c673cae
FG
5748 } else if (service.need_fullness_update()) {
5749 derr << "osdmap fullness state needs update" << dendl;
5750 send_full_update();
5751 } else if (osdmap->get_epoch() >= oldest - 1 &&
5752 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
5753 _send_boot();
5754 return;
5755 }
5756
5757 // get all the latest maps
5758 if (osdmap->get_epoch() + 1 >= oldest)
5759 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5760 else
5761 osdmap_subscribe(oldest - 1, true);
5762}
5763
5764void OSD::send_full_update()
5765{
5766 if (!service.need_fullness_update())
5767 return;
5768 unsigned state = 0;
5769 if (service.is_full()) {
5770 state = CEPH_OSD_FULL;
5771 } else if (service.is_backfillfull()) {
5772 state = CEPH_OSD_BACKFILLFULL;
5773 } else if (service.is_nearfull()) {
5774 state = CEPH_OSD_NEARFULL;
5775 }
5776 set<string> s;
5777 OSDMap::calc_state_set(state, s);
5778 dout(10) << __func__ << " want state " << s << dendl;
5779 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
5780}
5781
5782void OSD::start_waiting_for_healthy()
5783{
5784 dout(1) << "start_waiting_for_healthy" << dendl;
5785 set_state(STATE_WAITING_FOR_HEALTHY);
5786 last_heartbeat_resample = utime_t();
5787}
5788
5789bool OSD::_is_healthy()
5790{
5791 if (!cct->get_heartbeat_map()->is_healthy()) {
5792 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
5793 return false;
5794 }
5795
5796 if (is_waiting_for_healthy()) {
5797 Mutex::Locker l(heartbeat_lock);
5798 utime_t cutoff = ceph_clock_now();
5799 cutoff -= cct->_conf->osd_heartbeat_grace;
5800 int num = 0, up = 0;
5801 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5802 p != heartbeat_peers.end();
5803 ++p) {
5804 if (p->second.is_healthy(cutoff))
5805 ++up;
5806 ++num;
5807 }
5808 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
5809 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
5810 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
5811 return false;
5812 }
5813 }
5814
5815 return true;
5816}
5817
5818void OSD::_send_boot()
5819{
5820 dout(10) << "_send_boot" << dendl;
5821 entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
5822 Connection *local_connection = cluster_messenger->get_loopback_connection().get();
5823 if (cluster_addr.is_blank_ip()) {
5824 int port = cluster_addr.get_port();
5825 cluster_addr = client_messenger->get_myaddr();
5826 cluster_addr.set_port(port);
5827 cluster_messenger->set_addr_unknowns(cluster_addr);
5828 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
5829 } else {
5830 Session *s = static_cast<Session*>(local_connection->get_priv());
5831 if (s)
5832 s->put();
5833 else
5834 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
5835 }
5836
5837 entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
5838 local_connection = hb_back_server_messenger->get_loopback_connection().get();
5839 if (hb_back_addr.is_blank_ip()) {
5840 int port = hb_back_addr.get_port();
5841 hb_back_addr = cluster_addr;
5842 hb_back_addr.set_port(port);
5843 hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
5844 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
5845 } else {
5846 Session *s = static_cast<Session*>(local_connection->get_priv());
5847 if (s)
5848 s->put();
5849 else
5850 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5851 }
5852
5853 entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
5854 local_connection = hb_front_server_messenger->get_loopback_connection().get();
5855 if (hb_front_addr.is_blank_ip()) {
5856 int port = hb_front_addr.get_port();
5857 hb_front_addr = client_messenger->get_myaddr();
5858 hb_front_addr.set_port(port);
5859 hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
5860 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
5861 } else {
5862 Session *s = static_cast<Session*>(local_connection->get_priv());
5863 if (s)
5864 s->put();
5865 else
5866 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5867 }
5868
5869 MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
5870 hb_back_addr, hb_front_addr, cluster_addr,
5871 CEPH_FEATURES_ALL);
5872 dout(10) << " client_addr " << client_messenger->get_myaddr()
5873 << ", cluster_addr " << cluster_addr
5874 << ", hb_back_addr " << hb_back_addr
5875 << ", hb_front_addr " << hb_front_addr
5876 << dendl;
5877 _collect_metadata(&mboot->metadata);
5878 monc->send_mon_message(mboot);
5879 set_state(STATE_BOOTING);
5880}
5881
5882void OSD::_collect_metadata(map<string,string> *pm)
5883{
5884 // config info
5885 (*pm)["osd_data"] = dev_path;
c07f9fc5
FG
5886 if (store->get_type() == "filestore") {
5887 // not applicable for bluestore
5888 (*pm)["osd_journal"] = journal_path;
5889 }
7c673cae
FG
5890 (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
5891 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
5892 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
5893 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());
5894
5895 // backend
5896 (*pm)["osd_objectstore"] = store->get_type();
31f18b77 5897 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
d2e6a577 5898 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
c07f9fc5 5899 (*pm)["default_device_class"] = store->get_default_device_class();
7c673cae
FG
5900 store->collect_metadata(pm);
5901
5902 collect_sys_info(pm, cct);
5903
5904 dout(10) << __func__ << " " << *pm << dendl;
5905}
5906
5907void OSD::queue_want_up_thru(epoch_t want)
5908{
5909 map_lock.get_read();
5910 epoch_t cur = osdmap->get_up_thru(whoami);
5911 Mutex::Locker l(mon_report_lock);
5912 if (want > up_thru_wanted) {
5913 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
5914 << ", currently " << cur
5915 << dendl;
5916 up_thru_wanted = want;
5917 send_alive();
5918 } else {
5919 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
5920 << ", currently " << cur
5921 << dendl;
5922 }
5923 map_lock.put_read();
5924}
5925
5926void OSD::send_alive()
5927{
5928 assert(mon_report_lock.is_locked());
5929 if (!osdmap->exists(whoami))
5930 return;
5931 epoch_t up_thru = osdmap->get_up_thru(whoami);
5932 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
5933 if (up_thru_wanted > up_thru) {
5934 dout(10) << "send_alive want " << up_thru_wanted << dendl;
5935 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
5936 }
5937}
5938
5939void OSD::request_full_map(epoch_t first, epoch_t last)
5940{
5941 dout(10) << __func__ << " " << first << ".." << last
5942 << ", previously requested "
5943 << requested_full_first << ".." << requested_full_last << dendl;
5944 assert(osd_lock.is_locked());
5945 assert(first > 0 && last > 0);
5946 assert(first <= last);
5947 assert(first >= requested_full_first); // we shouldn't ever ask for older maps
5948 if (requested_full_first == 0) {
5949 // first request
5950 requested_full_first = first;
5951 requested_full_last = last;
5952 } else if (last <= requested_full_last) {
5953 // dup
5954 return;
5955 } else {
5956 // additional request
5957 first = requested_full_last + 1;
5958 requested_full_last = last;
5959 }
5960 MMonGetOSDMap *req = new MMonGetOSDMap;
5961 req->request_full(first, last);
5962 monc->send_mon_message(req);
5963}
5964
5965void OSD::got_full_map(epoch_t e)
5966{
5967 assert(requested_full_first <= requested_full_last);
5968 assert(osd_lock.is_locked());
5969 if (requested_full_first == 0) {
5970 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
5971 return;
5972 }
5973 if (e < requested_full_first) {
5974 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5975 << ".." << requested_full_last
5976 << ", ignoring" << dendl;
5977 return;
5978 }
5979 if (e >= requested_full_last) {
5980 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5981 << ".." << requested_full_last << ", resetting" << dendl;
5982 requested_full_first = requested_full_last = 0;
5983 return;
5984 }
5985
5986 requested_full_first = e + 1;
5987
5988 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
5989 << ".." << requested_full_last
5990 << ", still need more" << dendl;
5991}
5992
5993void OSD::requeue_failures()
5994{
5995 Mutex::Locker l(heartbeat_lock);
5996 unsigned old_queue = failure_queue.size();
5997 unsigned old_pending = failure_pending.size();
5998 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
5999 failure_pending.begin();
6000 p != failure_pending.end(); ) {
6001 failure_queue[p->first] = p->second.first;
6002 failure_pending.erase(p++);
6003 }
6004 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6005 << failure_queue.size() << dendl;
6006}
6007
6008void OSD::send_failures()
6009{
6010 assert(map_lock.is_locked());
6011 assert(mon_report_lock.is_locked());
6012 Mutex::Locker l(heartbeat_lock);
6013 utime_t now = ceph_clock_now();
6014 while (!failure_queue.empty()) {
6015 int osd = failure_queue.begin()->first;
7c673cae 6016 if (!failure_pending.count(osd)) {
31f18b77 6017 entity_inst_t i = osdmap->get_inst(osd);
7c673cae
FG
6018 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6019 monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
6020 osdmap->get_epoch()));
6021 failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
6022 }
6023 failure_queue.erase(osd);
6024 }
6025}
6026
6027void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
6028{
6029 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
6030 monc->send_mon_message(m);
6031}
6032
6033void OSD::send_pg_stats(const utime_t &now)
6034{
6035 assert(map_lock.is_locked());
31f18b77 6036 assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
7c673cae
FG
6037 dout(20) << "send_pg_stats" << dendl;
6038
6039 osd_stat_t cur_stat = service.get_osd_stat();
6040
6041 cur_stat.os_perf_stat = store->get_cur_stats();
6042
6043 pg_stat_queue_lock.Lock();
6044
6045 if (osd_stat_updated || !pg_stat_queue.empty()) {
6046 last_pg_stats_sent = now;
6047 osd_stat_updated = false;
6048
6049 dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;
6050
6051 utime_t had_for(now);
6052 had_for -= had_map_since;
6053
6054 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
6055
6056 uint64_t tid = ++pg_stat_tid;
6057 m->set_tid(tid);
6058 m->osd_stat = cur_stat;
6059
6060 xlist<PG*>::iterator p = pg_stat_queue.begin();
6061 while (!p.end()) {
6062 PG *pg = *p;
6063 ++p;
6064 if (!pg->is_primary()) { // we hold map_lock; role is stable.
6065 pg->stat_queue_item.remove_myself();
6066 pg->put("pg_stat_queue");
6067 continue;
6068 }
6069 pg->pg_stats_publish_lock.Lock();
6070 if (pg->pg_stats_publish_valid) {
6071 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
6072 dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6073 << pg->pg_stats_publish.reported_seq << dendl;
6074 } else {
6075 dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6076 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
6077 }
6078 pg->pg_stats_publish_lock.Unlock();
6079 }
6080
6081 if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
6082 last_pg_stats_ack = ceph_clock_now();
6083 }
6084 outstanding_pg_stats.insert(tid);
6085 dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;
6086
6087 monc->send_mon_message(m);
6088 }
6089
6090 pg_stat_queue_lock.Unlock();
6091}
6092
6093void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
6094{
6095 dout(10) << "handle_pg_stats_ack " << dendl;
6096
6097 if (!require_mon_peer(ack)) {
6098 ack->put();
6099 return;
6100 }
6101
6102 // NOTE: we may get replies from a previous mon even while
6103 // outstanding_pg_stats is empty if reconnecting races with replies
6104 // in flight.
6105
6106 pg_stat_queue_lock.Lock();
6107
6108 last_pg_stats_ack = ceph_clock_now();
6109
6110 // decay timeout slowly (analogous to TCP)
6111 stats_ack_timeout =
6112 MAX(cct->_conf->osd_mon_ack_timeout,
6113 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
6114 dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
6115
6116 if (ack->get_tid() > pg_stat_tid_flushed) {
6117 pg_stat_tid_flushed = ack->get_tid();
6118 pg_stat_queue_cond.Signal();
6119 }
6120
6121 xlist<PG*>::iterator p = pg_stat_queue.begin();
6122 while (!p.end()) {
6123 PG *pg = *p;
6124 PGRef _pg(pg);
6125 ++p;
6126
6127 auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
6128 if (acked != ack->pg_stat.end()) {
6129 pg->pg_stats_publish_lock.Lock();
6130 if (acked->second.first == pg->pg_stats_publish.reported_seq &&
6131 acked->second.second == pg->pg_stats_publish.reported_epoch) {
6132 dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6133 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6134 pg->stat_queue_item.remove_myself();
6135 pg->put("pg_stat_queue");
6136 } else {
6137 dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6138 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
6139 << acked->second << dendl;
6140 }
6141 pg->pg_stats_publish_lock.Unlock();
6142 } else {
6143 dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6144 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6145 }
6146 }
6147
6148 outstanding_pg_stats.erase(ack->get_tid());
6149 dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;
6150
6151 pg_stat_queue_lock.Unlock();
6152
6153 ack->put();
6154}
6155
6156void OSD::flush_pg_stats()
6157{
6158 dout(10) << "flush_pg_stats" << dendl;
6159 osd_lock.Unlock();
6160 utime_t now = ceph_clock_now();
6161 map_lock.get_read();
6162 mon_report_lock.Lock();
6163 send_pg_stats(now);
6164 mon_report_lock.Unlock();
6165 map_lock.put_read();
6166
6167
6168 pg_stat_queue_lock.Lock();
6169 uint64_t tid = pg_stat_tid;
6170 dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
6171 while (tid > pg_stat_tid_flushed)
6172 pg_stat_queue_cond.Wait(pg_stat_queue_lock);
6173 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
6174 pg_stat_queue_lock.Unlock();
6175
6176 osd_lock.Lock();
6177}
6178
6179void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6180{
6181 const auto& monmap = monc->monmap;
6182 // send beacon to mon even if we are just connected, and the monmap is not
6183 // initialized yet by then.
6184 if (monmap.epoch > 0 &&
6185 monmap.get_required_features().contains_all(
6186 ceph::features::mon::FEATURE_LUMINOUS)) {
6187 dout(20) << __func__ << " sending" << dendl;
7c673cae
FG
6188 MOSDBeacon* beacon = nullptr;
6189 {
6190 Mutex::Locker l{min_last_epoch_clean_lock};
6191 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6192 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
224ce89b 6193 last_sent_beacon = now;
7c673cae
FG
6194 }
6195 monc->send_mon_message(beacon);
6196 } else {
6197 dout(20) << __func__ << " not sending" << dendl;
6198 }
6199}
6200
6201void OSD::handle_command(MMonCommand *m)
6202{
6203 if (!require_mon_peer(m)) {
6204 m->put();
6205 return;
6206 }
6207
6208 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6209 command_wq.queue(c);
6210 m->put();
6211}
6212
6213void OSD::handle_command(MCommand *m)
6214{
6215 ConnectionRef con = m->get_connection();
6216 Session *session = static_cast<Session *>(con->get_priv());
6217 if (!session) {
6218 con->send_message(new MCommandReply(m, -EPERM));
6219 m->put();
6220 return;
6221 }
6222
6223 OSDCap& caps = session->caps;
6224 session->put();
6225
6226 if (!caps.allow_all() || m->get_source().is_mon()) {
6227 con->send_message(new MCommandReply(m, -EPERM));
6228 m->put();
6229 return;
6230 }
6231
6232 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6233 command_wq.queue(c);
6234
6235 m->put();
6236}
6237
// Descriptor for one command understood by the OSD.  The table below is
// what "get_command_descriptions" dumps via dump_cmddesc_to_json().
struct OSDCommand {
  string cmdstring;     // command signature: prefix plus argument descriptors
  string helpstring;    // human readable description
  string module;
  string perm;          // "r" or "rw" in the entries below
  string availability;  // "cli" or "cli,rest" in the entries below
} osd_commands[] = {

#define COMMAND(parsesig, helptext, module, perm, availability) \
  {parsesig, helptext, module, perm, availability},

// yes, these are really pg commands, but there's a limit to how
// much work it's worth.  The OSD returns all of them.  Make this
// form (pg <pgid> <cmd>) valid only for the cli.
// Rest uses "tell <pgid> <cmd>"

COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=query", \
	"show details of a specific pg", "osd", "r", "cli")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw", "cli")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=list_missing " \
	"name=offset,type=CephString,req=false",
	"list missing objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r", "cli")

// new form: tell <pgid> <cmd> for both cli and rest

COMMAND("query",
	"show details of a specific pg", "osd", "r", "cli,rest")
COMMAND("mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw", "cli,rest")
COMMAND("list_missing " \
	"name=offset,type=CephString,req=false",
	"list missing objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r", "cli,rest")
COMMAND("perf histogram dump "
        "name=logger,type=CephString,req=false "
        "name=counter,type=CephString,req=false",
	"Get histogram data",
	"osd", "r", "cli,rest")

// tell <osd.n> commands.  Validation of osd.n must be special-cased in client
COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
COMMAND("injectargs " \
	"name=injected_args,type=CephString,n=N",
	"inject configuration arguments into running OSD",
	"osd", "rw", "cli,rest")
COMMAND("config set " \
	"name=key,type=CephString name=value,type=CephString",
	"Set a configuration option at runtime (not persistent)",
	"osd", "rw", "cli,rest")
// CephChoices alternatives are separated by '|', like every other
// CephChoices descriptor in this table; ',' separates descriptor fields.
COMMAND("cluster_log " \
	"name=level,type=CephChoices,strings=error|warning|info|debug " \
	"name=message,type=CephString,n=N",
	"log a message to the cluster log",
	"osd", "rw", "cli,rest")
COMMAND("bench " \
	"name=count,type=CephInt,req=false " \
	"name=size,type=CephInt,req=false " \
	"name=object_size,type=CephInt,req=false " \
	"name=object_num,type=CephInt,req=false ", \
	"OSD benchmark: write <count> <size>-byte objects, " \
	"(default 1G size 4MB). Results in log.",
	"osd", "rw", "cli,rest")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
COMMAND("heap " \
	"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
	"show heap usage info (available only if compiled with tcmalloc)", \
	"osd", "rw", "cli,rest")
COMMAND("debug dump_missing " \
	"name=filename,type=CephFilepath",
	"dump missing objects to a named file", "osd", "r", "cli,rest")
COMMAND("debug kick_recovery_wq " \
	"name=delay,type=CephInt,range=0",
	"set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
COMMAND("cpu_profiler " \
	"name=arg,type=CephChoices,strings=status|flush",
	"run cpu profiling on daemon", "osd", "rw", "cli,rest")
COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
	"osd", "r", "cli,rest")
COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
	"osd", "rw", "cli,rest")
COMMAND("compact",
	"compact object store's omap. "
	"WARNING: Compaction probably slows your requests",
	"osd", "rw", "cli,rest")
};
6336
6337void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6338{
6339 int r = 0;
6340 stringstream ss, ds;
6341 string rs;
6342 bufferlist odata;
6343
6344 dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6345
6346 map<string, cmd_vartype> cmdmap;
6347 string prefix;
6348 string format;
6349 string pgidstr;
6350 boost::scoped_ptr<Formatter> f;
6351
6352 if (cmd.empty()) {
6353 ss << "no command given";
6354 goto out;
6355 }
6356
6357 if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6358 r = -EINVAL;
6359 goto out;
6360 }
6361
6362 cmd_getval(cct, cmdmap, "prefix", prefix);
6363
6364 if (prefix == "get_command_descriptions") {
6365 int cmdnum = 0;
6366 JSONFormatter *f = new JSONFormatter();
6367 f->open_object_section("command_descriptions");
6368 for (OSDCommand *cp = osd_commands;
6369 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {
6370
6371 ostringstream secname;
6372 secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
6373 dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
6374 cp->module, cp->perm, cp->availability, 0);
6375 cmdnum++;
6376 }
6377 f->close_section(); // command_descriptions
6378
6379 f->flush(ds);
6380 delete f;
6381 goto out;
6382 }
6383
6384 cmd_getval(cct, cmdmap, "format", format);
6385 f.reset(Formatter::create(format));
6386
6387 if (prefix == "version") {
6388 if (f) {
6389 f->open_object_section("version");
6390 f->dump_string("version", pretty_version_to_str());
6391 f->close_section();
6392 f->flush(ds);
6393 } else {
6394 ds << pretty_version_to_str();
6395 }
6396 goto out;
6397 }
6398 else if (prefix == "injectargs") {
6399 vector<string> argsvec;
6400 cmd_getval(cct, cmdmap, "injected_args", argsvec);
6401
6402 if (argsvec.empty()) {
6403 r = -EINVAL;
6404 ss << "ignoring empty injectargs";
6405 goto out;
6406 }
6407 string args = argsvec.front();
6408 for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
6409 args += " " + *a;
6410 osd_lock.Unlock();
6411 r = cct->_conf->injectargs(args, &ss);
6412 osd_lock.Lock();
6413 }
c07f9fc5
FG
6414 else if (prefix == "config set") {
6415 std::string key;
6416 std::string val;
6417 cmd_getval(cct, cmdmap, "key", key);
6418 cmd_getval(cct, cmdmap, "value", val);
6419 osd_lock.Unlock();
6420 r = cct->_conf->set_val(key, val, true, &ss);
d2e6a577
FG
6421 if (r == 0) {
6422 cct->_conf->apply_changes(nullptr);
6423 }
c07f9fc5
FG
6424 osd_lock.Lock();
6425 }
7c673cae
FG
6426 else if (prefix == "cluster_log") {
6427 vector<string> msg;
6428 cmd_getval(cct, cmdmap, "message", msg);
6429 if (msg.empty()) {
6430 r = -EINVAL;
6431 ss << "ignoring empty log message";
6432 goto out;
6433 }
6434 string message = msg.front();
6435 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
6436 message += " " + *a;
6437 string lvl;
6438 cmd_getval(cct, cmdmap, "level", lvl);
6439 clog_type level = string_to_clog_type(lvl);
6440 if (level < 0) {
6441 r = -EINVAL;
6442 ss << "unknown level '" << lvl << "'";
6443 goto out;
6444 }
6445 clog->do_log(level, message);
6446 }
6447
6448 // either 'pg <pgid> <command>' or
6449 // 'tell <pgid>' (which comes in without any of that prefix)?
6450
6451 else if (prefix == "pg" ||
6452 prefix == "query" ||
6453 prefix == "mark_unfound_lost" ||
6454 prefix == "list_missing"
6455 ) {
6456 pg_t pgid;
6457
6458 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
6459 ss << "no pgid specified";
6460 r = -EINVAL;
6461 } else if (!pgid.parse(pgidstr.c_str())) {
6462 ss << "couldn't parse pgid '" << pgidstr << "'";
6463 r = -EINVAL;
6464 } else {
6465 spg_t pcand;
6466 PG *pg = nullptr;
6467 if (osdmap->get_primary_shard(pgid, &pcand) &&
6468 (pg = _lookup_lock_pg(pcand))) {
6469 if (pg->is_primary()) {
6470 // simulate pg <pgid> cmd= for pg->do-command
6471 if (prefix != "pg")
6472 cmd_putval(cct, cmdmap, "cmd", prefix);
6473 r = pg->do_command(cmdmap, ss, data, odata, con, tid);
6474 if (r == -EAGAIN) {
6475 pg->unlock();
6476 // don't reply, pg will do so async
6477 return;
6478 }
6479 } else {
6480 ss << "not primary for pgid " << pgid;
6481
6482 // send them the latest diff to ensure they realize the mapping
6483 // has changed.
6484 service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);
6485
6486 // do not reply; they will get newer maps and realize they
6487 // need to resend.
6488 pg->unlock();
6489 return;
6490 }
6491 pg->unlock();
6492 } else {
6493 ss << "i don't have pgid " << pgid;
6494 r = -ENOENT;
6495 }
6496 }
6497 }
6498
6499 else if (prefix == "bench") {
6500 int64_t count;
6501 int64_t bsize;
6502 int64_t osize, onum;
6503 // default count 1G, size 4MB
6504 cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
6505 cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
6506 cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
6507 cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
6508
6509 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
6510 ObjectStore::Sequencer>("bench"));
6511
6512 uint32_t duration = cct->_conf->osd_bench_duration;
6513
6514 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
6515 // let us limit the block size because the next checks rely on it
6516 // having a sane value. If we allow any block size to be set things
6517 // can still go sideways.
6518 ss << "block 'size' values are capped at "
6519 << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
6520 << " a higher value, please adjust 'osd_bench_max_block_size'";
6521 r = -EINVAL;
6522 goto out;
6523 } else if (bsize < (int64_t) (1 << 20)) {
6524 // entering the realm of small block sizes.
6525 // limit the count to a sane value, assuming a configurable amount of
6526 // IOPS and duration, so that the OSD doesn't get hung up on this,
6527 // preventing timeouts from going off
6528 int64_t max_count =
6529 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
6530 if (count > max_count) {
6531 ss << "'count' values greater than " << max_count
6532 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6533 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
6534 << " for " << duration << " seconds,"
6535 << " can cause ill effects on osd. "
6536 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6537 << " value if you wish to use a higher 'count'.";
6538 r = -EINVAL;
6539 goto out;
6540 }
6541 } else {
6542 // 1MB block sizes are big enough so that we get more stuff done.
6543 // However, to avoid the osd from getting hung on this and having
6544 // timers being triggered, we are going to limit the count assuming
6545 // a configurable throughput and duration.
6546 // NOTE: max_count is the total amount of bytes that we believe we
6547 // will be able to write during 'duration' for the given
6548 // throughput. The block size hardly impacts this unless it's
6549 // way too big. Given we already check how big the block size
6550 // is, it's safe to assume everything will check out.
6551 int64_t max_count =
6552 cct->_conf->osd_bench_large_size_max_throughput * duration;
6553 if (count > max_count) {
6554 ss << "'count' values greater than " << max_count
6555 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6556 << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
6557 << " for " << duration << " seconds,"
6558 << " can cause ill effects on osd. "
6559 << " Please adjust 'osd_bench_large_size_max_throughput'"
6560 << " with a higher value if you wish to use a higher 'count'.";
6561 r = -EINVAL;
6562 goto out;
6563 }
6564 }
6565
6566 if (osize && bsize > osize)
6567 bsize = osize;
6568
6569 dout(1) << " bench count " << count
6570 << " bsize " << prettybyte_t(bsize) << dendl;
6571
6572 ObjectStore::Transaction cleanupt;
6573
6574 if (osize && onum) {
6575 bufferlist bl;
6576 bufferptr bp(osize);
6577 bp.zero();
6578 bl.push_back(std::move(bp));
6579 bl.rebuild_page_aligned();
6580 for (int i=0; i<onum; ++i) {
6581 char nm[30];
6582 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
6583 object_t oid(nm);
6584 hobject_t soid(sobject_t(oid, 0));
6585 ObjectStore::Transaction t;
6586 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
6587 store->queue_transaction(osr.get(), std::move(t), NULL);
6588 cleanupt.remove(coll_t(), ghobject_t(soid));
6589 }
6590 }
6591
6592 bufferlist bl;
6593 bufferptr bp(bsize);
6594 bp.zero();
6595 bl.push_back(std::move(bp));
6596 bl.rebuild_page_aligned();
6597
6598 {
6599 C_SaferCond waiter;
6600 if (!osr->flush_commit(&waiter)) {
6601 waiter.wait();
6602 }
6603 }
6604
6605 utime_t start = ceph_clock_now();
6606 for (int64_t pos = 0; pos < count; pos += bsize) {
6607 char nm[30];
6608 unsigned offset = 0;
6609 if (onum && osize) {
6610 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
6611 offset = rand() % (osize / bsize) * bsize;
6612 } else {
6613 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
6614 }
6615 object_t oid(nm);
6616 hobject_t soid(sobject_t(oid, 0));
6617 ObjectStore::Transaction t;
6618 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
6619 store->queue_transaction(osr.get(), std::move(t), NULL);
6620 if (!onum || !osize)
6621 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
6622 }
6623
6624 {
6625 C_SaferCond waiter;
6626 if (!osr->flush_commit(&waiter)) {
6627 waiter.wait();
6628 }
6629 }
6630 utime_t end = ceph_clock_now();
6631
6632 // clean up
6633 store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
6634 {
6635 C_SaferCond waiter;
6636 if (!osr->flush_commit(&waiter)) {
6637 waiter.wait();
6638 }
6639 }
6640
6641 uint64_t rate = (double)count / (end - start);
6642 if (f) {
6643 f->open_object_section("osd_bench_results");
6644 f->dump_int("bytes_written", count);
6645 f->dump_int("blocksize", bsize);
6646 f->dump_unsigned("bytes_per_sec", rate);
6647 f->close_section();
6648 f->flush(ss);
6649 } else {
6650 ss << "bench: wrote " << prettybyte_t(count)
6651 << " in blocks of " << prettybyte_t(bsize) << " in "
6652 << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
6653 }
6654 }
6655
6656 else if (prefix == "flush_pg_stats") {
31f18b77
FG
6657 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6658 mgrc.send_pgstats();
6659 ds << service.get_osd_stat_seq() << "\n";
6660 } else {
6661 flush_pg_stats();
6662 }
7c673cae
FG
6663 }
6664
6665 else if (prefix == "heap") {
6666 r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
6667 }
6668
6669 else if (prefix == "debug dump_missing") {
6670 string file_name;
6671 cmd_getval(cct, cmdmap, "filename", file_name);
6672 std::ofstream fout(file_name.c_str());
6673 if (!fout.is_open()) {
6674 ss << "failed to open file '" << file_name << "'";
6675 r = -EINVAL;
6676 goto out;
6677 }
6678
6679 fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
6680 RWLock::RLocker l(pg_map_lock);
6681 for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
6682 pg_map_e != pg_map.end(); ++pg_map_e) {
6683 PG *pg = pg_map_e->second;
6684 pg->lock();
6685
6686 fout << *pg << std::endl;
6687 std::map<hobject_t, pg_missing_item>::const_iterator mend =
6688 pg->pg_log.get_missing().get_items().end();
6689 std::map<hobject_t, pg_missing_item>::const_iterator mi =
6690 pg->pg_log.get_missing().get_items().begin();
6691 for (; mi != mend; ++mi) {
6692 fout << mi->first << " -> " << mi->second << std::endl;
6693 if (!pg->missing_loc.needs_recovery(mi->first))
6694 continue;
6695 if (pg->missing_loc.is_unfound(mi->first))
6696 fout << " unfound ";
6697 const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
6698 if (mls.empty())
6699 continue;
6700 fout << "missing_loc: " << mls << std::endl;
6701 }
6702 pg->unlock();
6703 fout << std::endl;
6704 }
6705
6706 fout.close();
6707 }
6708 else if (prefix == "debug kick_recovery_wq") {
6709 int64_t delay;
6710 cmd_getval(cct, cmdmap, "delay", delay);
6711 ostringstream oss;
6712 oss << delay;
6713 r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
6714 if (r != 0) {
6715 ss << "kick_recovery_wq: error setting "
6716 << "osd_recovery_delay_start to '" << delay << "': error "
6717 << r;
6718 goto out;
6719 }
6720 cct->_conf->apply_changes(NULL);
6721 ss << "kicking recovery queue. set osd_recovery_delay_start "
6722 << "to " << cct->_conf->osd_recovery_delay_start;
6723 }
6724
6725 else if (prefix == "cpu_profiler") {
6726 string arg;
6727 cmd_getval(cct, cmdmap, "arg", arg);
6728 vector<string> argvec;
6729 get_str_vec(arg, argvec);
6730 cpu_profiler_handle_command(argvec, ds);
6731 }
6732
6733 else if (prefix == "dump_pg_recovery_stats") {
6734 stringstream s;
6735 if (f) {
6736 pg_recovery_stats.dump_formatted(f.get());
6737 f->flush(ds);
6738 } else {
6739 pg_recovery_stats.dump(s);
6740 ds << "dump pg recovery stats: " << s.str();
6741 }
6742 }
6743
6744 else if (prefix == "reset_pg_recovery_stats") {
6745 ss << "reset pg recovery stats";
6746 pg_recovery_stats.reset();
6747 }
6748
31f18b77
FG
6749 else if (prefix == "perf histogram dump") {
6750 std::string logger;
6751 std::string counter;
6752 cmd_getval(cct, cmdmap, "logger", logger);
6753 cmd_getval(cct, cmdmap, "counter", counter);
6754 if (f) {
6755 cct->get_perfcounters_collection()->dump_formatted_histograms(
6756 f.get(), false, logger, counter);
6757 f->flush(ds);
6758 }
6759 }
6760
224ce89b
WB
6761 else if (prefix == "compact") {
6762 dout(1) << "triggering manual compaction" << dendl;
6763 auto start = ceph::coarse_mono_clock::now();
6764 store->compact();
6765 auto end = ceph::coarse_mono_clock::now();
6766 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
6767 dout(1) << "finished manual compaction in "
6768 << time_span.count()
6769 << " seconds" << dendl;
6770 ss << "compacted omap in " << time_span.count() << " seconds";
6771 }
6772
7c673cae
FG
6773 else {
6774 ss << "unrecognized command! " << cmd;
6775 r = -EINVAL;
6776 }
6777
6778 out:
6779 rs = ss.str();
6780 odata.append(ds);
6781 dout(0) << "do_command r=" << r << " " << rs << dendl;
6782 clog->info() << rs;
6783 if (con) {
6784 MCommandReply *reply = new MCommandReply(r, rs);
6785 reply->set_tid(tid);
6786 reply->set_data(odata);
6787 con->send_message(reply);
6788 }
6789}
6790
6791bool OSD::heartbeat_dispatch(Message *m)
6792{
6793 dout(30) << "heartbeat_dispatch " << m << dendl;
6794 switch (m->get_type()) {
6795
6796 case CEPH_MSG_PING:
6797 dout(10) << "ping from " << m->get_source_inst() << dendl;
6798 m->put();
6799 break;
6800
6801 case MSG_OSD_PING:
6802 handle_osd_ping(static_cast<MOSDPing*>(m));
6803 break;
6804
6805 default:
6806 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6807 m->put();
6808 }
6809
6810 return true;
6811}
6812
6813bool OSD::ms_dispatch(Message *m)
6814{
6815 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6816 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6817 service.got_stop_ack();
6818 m->put();
6819 return true;
6820 }
6821
6822 // lock!
6823
6824 osd_lock.Lock();
6825 if (is_stopping()) {
6826 osd_lock.Unlock();
6827 m->put();
6828 return true;
6829 }
6830
6831 do_waiters();
6832 _dispatch(m);
6833
6834 osd_lock.Unlock();
6835
6836 return true;
6837}
6838
6839void OSD::maybe_share_map(
6840 Session *session,
6841 OpRequestRef op,
6842 OSDMapRef osdmap)
6843{
6844 if (!op->check_send_map) {
6845 return;
6846 }
6847 epoch_t last_sent_epoch = 0;
6848
6849 session->sent_epoch_lock.lock();
6850 last_sent_epoch = session->last_sent_epoch;
6851 session->sent_epoch_lock.unlock();
6852
6853 const Message *m = op->get_req();
6854 service.share_map(
6855 m->get_source(),
6856 m->get_connection().get(),
6857 op->sent_epoch,
6858 osdmap,
6859 session ? &last_sent_epoch : NULL);
6860
6861 session->sent_epoch_lock.lock();
6862 if (session->last_sent_epoch < last_sent_epoch) {
6863 session->last_sent_epoch = last_sent_epoch;
6864 }
6865 session->sent_epoch_lock.unlock();
6866
6867 op->check_send_map = false;
6868}
6869
6870void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
6871{
6872 assert(session->session_dispatch_lock.is_locked());
6873
6874 auto i = session->waiting_on_map.begin();
6875 while (i != session->waiting_on_map.end()) {
6876 OpRequestRef op = &(*i);
6877 assert(ms_can_fast_dispatch(op->get_req()));
6878 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
6879 op->get_req());
6880 if (m->get_min_epoch() > osdmap->get_epoch()) {
6881 break;
6882 }
6883 session->waiting_on_map.erase(i++);
6884 op->put();
6885
6886 spg_t pgid;
6887 if (m->get_type() == CEPH_MSG_OSD_OP) {
6888 pg_t actual_pgid = osdmap->raw_pg_to_pg(
6889 static_cast<const MOSDOp*>(m)->get_pg());
6890 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
6891 continue;
6892 }
6893 } else {
6894 pgid = m->get_spg();
6895 }
6896 enqueue_op(pgid, op, m->get_map_epoch());
6897 }
6898
6899 if (session->waiting_on_map.empty()) {
6900 clear_session_waiting_on_map(session);
6901 } else {
6902 register_session_waiting_on_map(session);
6903 }
6904}
6905
6906void OSD::ms_fast_dispatch(Message *m)
6907{
6908 FUNCTRACE();
6909 if (service.is_stopping()) {
6910 m->put();
6911 return;
6912 }
6913 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
6914 {
6915#ifdef WITH_LTTNG
6916 osd_reqid_t reqid = op->get_reqid();
6917#endif
6918 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
6919 reqid.name._num, reqid.tid, reqid.inc);
6920 }
6921
6922 if (m->trace)
6923 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
6924
6925 // note sender epoch, min req'd epoch
6926 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
6927 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
6928 assert(op->min_epoch <= op->sent_epoch); // sanity check!
6929
6930 service.maybe_inject_dispatch_delay();
6931
6932 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
6933 m->get_type() != CEPH_MSG_OSD_OP) {
6934 // queue it directly
6935 enqueue_op(
6936 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
6937 op,
6938 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
6939 } else {
6940 // legacy client, and this is an MOSDOp (the *only* fast dispatch
6941 // message that didn't have an explicit spg_t); we need to map
6942 // them to an spg_t while preserving delivery order.
6943 Session *session = static_cast<Session*>(m->get_connection()->get_priv());
6944 if (session) {
6945 {
6946 Mutex::Locker l(session->session_dispatch_lock);
6947 op->get();
6948 session->waiting_on_map.push_back(*op);
6949 OSDMapRef nextmap = service.get_nextmap_reserved();
6950 dispatch_session_waiting(session, nextmap);
6951 service.release_map(nextmap);
6952 }
6953 session->put();
6954 }
6955 }
6956 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
6957}
6958
6959void OSD::ms_fast_preprocess(Message *m)
6960{
6961 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
6962 if (m->get_type() == CEPH_MSG_OSD_MAP) {
6963 MOSDMap *mm = static_cast<MOSDMap*>(m);
6964 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
6965 if (s) {
6966 s->received_map_lock.lock();
6967 s->received_map_epoch = mm->get_last();
6968 s->received_map_lock.unlock();
6969 s->put();
6970 }
6971 }
6972 }
6973}
6974
6975bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
6976{
6977 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
6978
31f18b77
FG
6979 if (is_stopping()) {
6980 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
6981 return false;
6982 }
6983
7c673cae
FG
6984 if (dest_type == CEPH_ENTITY_TYPE_MON)
6985 return true;
6986
6987 if (force_new) {
6988 /* the MonClient checks keys every tick(), so we should just wait for that cycle
6989 to get through */
6990 if (monc->wait_auth_rotating(10) < 0) {
6991 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
6992 return false;
6993 }
6994 }
6995
6996 *authorizer = monc->build_authorizer(dest_type);
6997 return *authorizer != NULL;
6998}
6999
7000
7001bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
7002 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
7003 bool& isvalid, CryptoKey& session_key)
7004{
7005 AuthAuthorizeHandler *authorize_handler = 0;
7006 switch (peer_type) {
7007 case CEPH_ENTITY_TYPE_MDS:
7008 /*
7009 * note: mds is technically a client from our perspective, but
7010 * this makes the 'cluster' consistent w/ monitor's usage.
7011 */
7012 case CEPH_ENTITY_TYPE_OSD:
7013 case CEPH_ENTITY_TYPE_MGR:
7014 authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
7015 break;
7016 default:
7017 authorize_handler = authorize_handler_service_registry->get_handler(protocol);
7018 }
7019 if (!authorize_handler) {
7020 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
7021 isvalid = false;
7022 return true;
7023 }
7024
7025 AuthCapsInfo caps_info;
7026 EntityName name;
7027 uint64_t global_id;
7028 uint64_t auid = CEPH_AUTH_UID_DEFAULT;
7029
c07f9fc5
FG
7030 RotatingKeyRing *keys = monc->rotating_secrets.get();
7031 if (keys) {
7032 isvalid = authorize_handler->verify_authorizer(
7033 cct, keys,
7034 authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
7035 &auid);
7036 } else {
7037 dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
7038 isvalid = false;
7039 }
7c673cae
FG
7040
7041 if (isvalid) {
7042 Session *s = static_cast<Session *>(con->get_priv());
7043 if (!s) {
7044 s = new Session(cct);
7045 con->set_priv(s->get());
7046 s->con = con;
7047 dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
7048 }
7049
7050 s->entity_name = name;
7051 if (caps_info.allow_all)
7052 s->caps.set_allow_all();
7053 s->auid = auid;
7054
7055 if (caps_info.caps.length() > 0) {
7056 bufferlist::iterator p = caps_info.caps.begin();
7057 string str;
7058 try {
7059 ::decode(str, p);
7060 }
7061 catch (buffer::error& e) {
7062 }
7063 bool success = s->caps.parse(str);
7064 if (success)
7065 dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
7066 else
7067 dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
7068 }
7069
7070 s->put();
7071 }
7072 return true;
7073}
7074
7075void OSD::do_waiters()
7076{
7077 assert(osd_lock.is_locked());
7078
7079 dout(10) << "do_waiters -- start" << dendl;
7080 while (!finished.empty()) {
7081 OpRequestRef next = finished.front();
7082 finished.pop_front();
7083 dispatch_op(next);
7084 }
7085 dout(10) << "do_waiters -- finish" << dendl;
7086}
7087
7088void OSD::dispatch_op(OpRequestRef op)
7089{
7090 switch (op->get_req()->get_type()) {
7091
7092 case MSG_OSD_PG_CREATE:
7093 handle_pg_create(op);
7094 break;
7095 case MSG_OSD_PG_NOTIFY:
7096 handle_pg_notify(op);
7097 break;
7098 case MSG_OSD_PG_QUERY:
7099 handle_pg_query(op);
7100 break;
7101 case MSG_OSD_PG_LOG:
7102 handle_pg_log(op);
7103 break;
7104 case MSG_OSD_PG_REMOVE:
7105 handle_pg_remove(op);
7106 break;
7107 case MSG_OSD_PG_INFO:
7108 handle_pg_info(op);
7109 break;
7110 case MSG_OSD_PG_TRIM:
7111 handle_pg_trim(op);
7112 break;
7113 case MSG_OSD_BACKFILL_RESERVE:
7114 handle_pg_backfill_reserve(op);
7115 break;
7116 case MSG_OSD_RECOVERY_RESERVE:
7117 handle_pg_recovery_reserve(op);
7118 break;
7119 }
7120}
7121
7122void OSD::_dispatch(Message *m)
7123{
7124 assert(osd_lock.is_locked());
7125 dout(20) << "_dispatch " << m << " " << *m << dendl;
7126
7127 switch (m->get_type()) {
7128
7129 // -- don't need lock --
7130 case CEPH_MSG_PING:
7131 dout(10) << "ping from " << m->get_source() << dendl;
7132 m->put();
7133 break;
7134
7135 // -- don't need OSDMap --
7136
7137 // map and replication
7138 case CEPH_MSG_OSD_MAP:
7139 handle_osd_map(static_cast<MOSDMap*>(m));
7140 break;
7141
7142 // osd
7143 case MSG_PGSTATSACK:
7144 handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
7145 break;
7146
7147 case MSG_MON_COMMAND:
7148 handle_command(static_cast<MMonCommand*>(m));
7149 break;
7150 case MSG_COMMAND:
7151 handle_command(static_cast<MCommand*>(m));
7152 break;
7153
7154 case MSG_OSD_SCRUB:
7155 handle_scrub(static_cast<MOSDScrub*>(m));
7156 break;
7157
c07f9fc5
FG
7158 case MSG_OSD_FORCE_RECOVERY:
7159 handle_force_recovery(m);
7160 break;
7161
7c673cae
FG
7162 // -- need OSDMap --
7163
7164 case MSG_OSD_PG_CREATE:
7165 case MSG_OSD_PG_NOTIFY:
7166 case MSG_OSD_PG_QUERY:
7167 case MSG_OSD_PG_LOG:
7168 case MSG_OSD_PG_REMOVE:
7169 case MSG_OSD_PG_INFO:
7170 case MSG_OSD_PG_TRIM:
7171 case MSG_OSD_BACKFILL_RESERVE:
7172 case MSG_OSD_RECOVERY_RESERVE:
7173 {
7174 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7175 if (m->trace)
7176 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7177 // no map? starting up?
7178 if (!osdmap) {
7179 dout(7) << "no OSDMap, not booted" << dendl;
7180 logger->inc(l_osd_waiting_for_map);
7181 waiting_for_osdmap.push_back(op);
7182 op->mark_delayed("no osdmap");
7183 break;
7184 }
7185
7186 // need OSDMap
7187 dispatch_op(op);
7188 }
7189 }
7190}
7191
7192void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7193{
7194 pg->lock();
7195 if (pg->is_primary()) {
7196 pg->unreg_next_scrub();
7197 pg->scrubber.must_scrub = true;
7198 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7199 pg->scrubber.must_repair = m->repair;
7200 pg->reg_next_scrub();
7201 dout(10) << "marking " << *pg << " for scrub" << dendl;
7202 }
7203 pg->unlock();
7204}
7205
7206void OSD::handle_scrub(MOSDScrub *m)
7207{
7208 dout(10) << "handle_scrub " << *m << dendl;
7209 if (!require_mon_or_mgr_peer(m)) {
7210 m->put();
7211 return;
7212 }
7213 if (m->fsid != monc->get_fsid()) {
7214 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7215 m->put();
7216 return;
7217 }
7218
7219 RWLock::RLocker l(pg_map_lock);
7220 if (m->scrub_pgs.empty()) {
7221 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7222 p != pg_map.end();
7223 ++p)
7224 handle_pg_scrub(m, p->second);
7225 } else {
7226 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7227 p != m->scrub_pgs.end();
7228 ++p) {
7229 spg_t pcand;
7230 if (osdmap->get_primary_shard(*p, &pcand)) {
7231 auto pg_map_entry = pg_map.find(pcand);
7232 if (pg_map_entry != pg_map.end()) {
7233 handle_pg_scrub(m, pg_map_entry->second);
7234 }
7235 }
7236 }
7237 }
7238
7239 m->put();
7240}
7241
7242bool OSD::scrub_random_backoff()
7243{
7244 bool coin_flip = (rand() / (double)RAND_MAX >=
7245 cct->_conf->osd_scrub_backoff_ratio);
7246 if (!coin_flip) {
7247 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7248 return true;
7249 }
7250 return false;
7251}
7252
7253OSDService::ScrubJob::ScrubJob(CephContext* cct,
7254 const spg_t& pg, const utime_t& timestamp,
7255 double pool_scrub_min_interval,
7256 double pool_scrub_max_interval, bool must)
7257 : cct(cct),
7258 pgid(pg),
7259 sched_time(timestamp),
7260 deadline(timestamp)
7261{
7262 // if not explicitly requested, postpone the scrub with a random delay
7263 if (!must) {
7264 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7265 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7266 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7267 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7268
7269 sched_time += scrub_min_interval;
7270 double r = rand() / (double)RAND_MAX;
7271 sched_time +=
7272 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7273 deadline += scrub_max_interval;
7274 }
7275}
7276
7277bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7278 if (sched_time < rhs.sched_time)
7279 return true;
7280 if (sched_time > rhs.sched_time)
7281 return false;
7282 return pgid < rhs.pgid;
7283}
7284
7285bool OSD::scrub_time_permit(utime_t now)
7286{
7287 struct tm bdt;
7288 time_t tt = now.sec();
7289 localtime_r(&tt, &bdt);
7290 bool time_permit = false;
7291 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7292 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7293 time_permit = true;
7294 }
7295 } else {
7296 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7297 time_permit = true;
7298 }
7299 }
7300 if (!time_permit) {
7301 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7302 << " - " << cct->_conf->osd_scrub_end_hour
7303 << " now " << bdt.tm_hour << " = no" << dendl;
7304 } else {
7305 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7306 << " - " << cct->_conf->osd_scrub_end_hour
7307 << " now " << bdt.tm_hour << " = yes" << dendl;
7308 }
7309 return time_permit;
7310}
7311
7312bool OSD::scrub_load_below_threshold()
7313{
7314 double loadavgs[3];
7315 if (getloadavg(loadavgs, 3) != 3) {
7316 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7317 return false;
7318 }
7319
7320 // allow scrub if below configured threshold
7321 if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
7322 dout(20) << __func__ << " loadavg " << loadavgs[0]
7323 << " < max " << cct->_conf->osd_scrub_load_threshold
7324 << " = yes" << dendl;
7325 return true;
7326 }
7327
7328 // allow scrub if below daily avg and currently decreasing
7329 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7330 dout(20) << __func__ << " loadavg " << loadavgs[0]
7331 << " < daily_loadavg " << daily_loadavg
7332 << " and < 15m avg " << loadavgs[2]
7333 << " = yes" << dendl;
7334 return true;
7335 }
7336
7337 dout(20) << __func__ << " loadavg " << loadavgs[0]
7338 << " >= max " << cct->_conf->osd_scrub_load_threshold
7339 << " and ( >= daily_loadavg " << daily_loadavg
7340 << " or >= 15m avg " << loadavgs[2]
7341 << ") = no" << dendl;
7342 return false;
7343}
7344
7345void OSD::sched_scrub()
7346{
7347 // if not permitted, fail fast
7348 if (!service.can_inc_scrubs_pending()) {
7349 return;
7350 }
7351
7352 utime_t now = ceph_clock_now();
7353 bool time_permit = scrub_time_permit(now);
7354 bool load_is_low = scrub_load_below_threshold();
7355 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7356
7357 OSDService::ScrubJob scrub;
7358 if (service.first_scrub_stamp(&scrub)) {
7359 do {
7360 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7361
7362 if (scrub.sched_time > now) {
7363 // save ourselves some effort
7364 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7365 << " > " << now << dendl;
7366 break;
7367 }
7368
7369 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7370 dout(10) << __func__ << "not scheduling scrub of " << scrub.pgid << " due to active recovery ops" << dendl;
7371 break;
7372 }
7373
7374 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7375 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7376 << (!time_permit ? "time not permit" : "high load") << dendl;
7377 continue;
7378 }
7379
7380 PG *pg = _lookup_lock_pg(scrub.pgid);
7381 if (!pg)
7382 continue;
7383 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7384 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7385 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7386 (load_is_low ? ", load_is_low" : " deadline < now"))
7387 << dendl;
7388 if (pg->sched_scrub()) {
7389 pg->unlock();
7390 break;
7391 }
7392 }
7393 pg->unlock();
7394 } while (service.next_scrub_stamp(scrub, &scrub));
7395 }
7396 dout(20) << "sched_scrub done" << dendl;
7397}
7398
7399
7400
7401// =====================================================
7402// MAP
7403
7404void OSD::wait_for_new_map(OpRequestRef op)
7405{
7406 // ask?
7407 if (waiting_for_osdmap.empty()) {
7408 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7409 }
7410
7411 logger->inc(l_osd_waiting_for_map);
7412 waiting_for_osdmap.push_back(op);
7413 op->mark_delayed("wait for new map");
7414}
7415
7416
7417/** update_map
7418 * assimilate new OSDMap(s). scan pgs, etc.
7419 */
7420
7421void OSD::note_down_osd(int peer)
7422{
7423 assert(osd_lock.is_locked());
7424 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7425
7426 heartbeat_lock.Lock();
7427 failure_queue.erase(peer);
7428 failure_pending.erase(peer);
7429 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7430 if (p != heartbeat_peers.end()) {
7431 p->second.con_back->mark_down();
7432 if (p->second.con_front) {
7433 p->second.con_front->mark_down();
7434 }
7435 heartbeat_peers.erase(p);
7436 }
7437 heartbeat_lock.Unlock();
7438}
7439
7440void OSD::note_up_osd(int peer)
7441{
7442 service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7443 heartbeat_set_peers_need_update();
7444}
7445
7446struct C_OnMapCommit : public Context {
7447 OSD *osd;
7448 epoch_t first, last;
7449 MOSDMap *msg;
7450 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7451 : osd(o), first(f), last(l), msg(m) {}
7452 void finish(int r) override {
7453 osd->_committed_osd_maps(first, last, msg);
7454 msg->put();
7455 }
7456};
7457
7458struct C_OnMapApply : public Context {
7459 OSDService *service;
7460 list<OSDMapRef> pinned_maps;
7461 epoch_t e;
7462 C_OnMapApply(OSDService *service,
7463 const list<OSDMapRef> &pinned_maps,
7464 epoch_t e)
7465 : service(service), pinned_maps(pinned_maps), e(e) {}
7466 void finish(int r) override {
7467 service->clear_map_bl_cache_pins(e);
7468 }
7469};
7470
7471void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7472{
7473 OSDMapRef osdmap = service.get_osdmap();
7474 if (osdmap->get_epoch() >= epoch)
7475 return;
7476
7477 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7478 force_request) {
7479 monc->renew_subs();
7480 }
7481}
7482
7483void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7484{
7485 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7486 if (min <= superblock.oldest_map)
7487 return;
7488
7489 int num = 0;
7490 ObjectStore::Transaction t;
7491 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7492 dout(20) << " removing old osdmap epoch " << e << dendl;
7493 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7494 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7495 superblock.oldest_map = e + 1;
7496 num++;
7497 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7498 service.publish_superblock(superblock);
7499 write_superblock(t);
7500 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7501 assert(tr == 0);
7502 num = 0;
7503 if (!skip_maps) {
7504 // skip_maps leaves us with a range of old maps if we fail to remove all
7505 // of them before moving superblock.oldest_map forward to the first map
7506 // in the incoming MOSDMap msg. so we should continue removing them in
7507 // this case, even we could do huge series of delete transactions all at
7508 // once.
7509 break;
7510 }
7511 }
7512 }
7513 if (num > 0) {
7514 service.publish_superblock(superblock);
7515 write_superblock(t);
224ce89b
WB
7516 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7517 assert(tr == 0);
7c673cae
FG
7518 }
7519 // we should not remove the cached maps
7520 assert(min <= service.map_cache.cached_key_lower_bound());
7521}
7522
7523void OSD::handle_osd_map(MOSDMap *m)
7524{
7525 assert(osd_lock.is_locked());
7526 // Keep a ref in the list until we get the newly received map written
7527 // onto disk. This is important because as long as the refs are alive,
7528 // the OSDMaps will be pinned in the cache and we won't try to read it
7529 // off of disk. Otherwise these maps will probably not stay in the cache,
7530 // and reading those OSDMaps before they are actually written can result
7531 // in a crash.
7532 list<OSDMapRef> pinned_maps;
7533 if (m->fsid != monc->get_fsid()) {
7534 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7535 << monc->get_fsid() << dendl;
7536 m->put();
7537 return;
7538 }
7539 if (is_initializing()) {
7540 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7541 m->put();
7542 return;
7543 }
7544
7545 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7546 if (session && !(session->entity_name.is_mon() ||
7547 session->entity_name.is_osd())) {
7548 //not enough perms!
7549 dout(10) << "got osd map from Session " << session
7550 << " which we can't take maps from (not a mon or osd)" << dendl;
7551 m->put();
7552 session->put();
7553 return;
7554 }
7555 if (session)
7556 session->put();
7557
7558 // share with the objecter
7559 if (!is_preboot())
7560 service.objecter->handle_osd_map(m);
7561
7562 epoch_t first = m->get_first();
7563 epoch_t last = m->get_last();
7564 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7565 << superblock.newest_map
7566 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7567 << dendl;
7568
7569 logger->inc(l_osd_map);
7570 logger->inc(l_osd_mape, last - first + 1);
7571 if (first <= superblock.newest_map)
7572 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7573 if (service.max_oldest_map < m->oldest_map) {
7574 service.max_oldest_map = m->oldest_map;
7575 assert(service.max_oldest_map >= superblock.oldest_map);
7576 }
7577
7578 // make sure there is something new, here, before we bother flushing
7579 // the queues and such
7580 if (last <= superblock.newest_map) {
7581 dout(10) << " no new maps here, dropping" << dendl;
7582 m->put();
7583 return;
7584 }
7585
7586 // missing some?
7587 bool skip_maps = false;
7588 if (first > superblock.newest_map + 1) {
7589 dout(10) << "handle_osd_map message skips epochs "
7590 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7591 if (m->oldest_map <= superblock.newest_map + 1) {
7592 osdmap_subscribe(superblock.newest_map + 1, false);
7593 m->put();
7594 return;
7595 }
7596 // always try to get the full range of maps--as many as we can. this
7597 // 1- is good to have
7598 // 2- is at present the only way to ensure that we get a *full* map as
7599 // the first map!
7600 if (m->oldest_map < first) {
7601 osdmap_subscribe(m->oldest_map - 1, true);
7602 m->put();
7603 return;
7604 }
7605 skip_maps = true;
7606 }
7607
7608 ObjectStore::Transaction t;
7609 uint64_t txn_size = 0;
7610
7611 // store new maps: queue for disk and put in the osdmap cache
7612 epoch_t start = MAX(superblock.newest_map + 1, first);
7613 for (epoch_t e = start; e <= last; e++) {
7614 if (txn_size >= t.get_num_bytes()) {
7615 derr << __func__ << " transaction size overflowed" << dendl;
7616 assert(txn_size < t.get_num_bytes());
7617 }
7618 txn_size = t.get_num_bytes();
7619 map<epoch_t,bufferlist>::iterator p;
7620 p = m->maps.find(e);
7621 if (p != m->maps.end()) {
7622 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7623 OSDMap *o = new OSDMap;
7624 bufferlist& bl = p->second;
7625
7626 o->decode(bl);
7627
7628 ghobject_t fulloid = get_osdmap_pobject_name(e);
7629 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7630 pin_map_bl(e, bl);
7631 pinned_maps.push_back(add_map(o));
7632
7633 got_full_map(e);
7634 continue;
7635 }
7636
7637 p = m->incremental_maps.find(e);
7638 if (p != m->incremental_maps.end()) {
7639 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7640 bufferlist& bl = p->second;
7641 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7642 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7643 pin_map_inc_bl(e, bl);
7644
7645 OSDMap *o = new OSDMap;
7646 if (e > 1) {
7647 bufferlist obl;
7648 bool got = get_map_bl(e - 1, obl);
7649 assert(got);
7650 o->decode(obl);
7651 }
7652
7653 OSDMap::Incremental inc;
7654 bufferlist::iterator p = bl.begin();
7655 inc.decode(p);
7656 if (o->apply_incremental(inc) < 0) {
7657 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
7658 assert(0 == "bad fsid");
7659 }
7660
7661 bufferlist fbl;
7662 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7663
7664 bool injected_failure = false;
7665 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7666 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7667 derr << __func__ << " injecting map crc failure" << dendl;
7668 injected_failure = true;
7669 }
7670
7671 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7672 dout(2) << "got incremental " << e
7673 << " but failed to encode full with correct crc; requesting"
7674 << dendl;
7675 clog->warn() << "failed to encode map e" << e << " with expected crc";
7676 dout(20) << "my encoded map was:\n";
7677 fbl.hexdump(*_dout);
7678 *_dout << dendl;
7679 delete o;
7680 request_full_map(e, last);
7681 last = e - 1;
7682 break;
7683 }
7684 got_full_map(e);
7685
7686 ghobject_t fulloid = get_osdmap_pobject_name(e);
7687 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7688 pin_map_bl(e, fbl);
7689 pinned_maps.push_back(add_map(o));
7690 continue;
7691 }
7692
7693 assert(0 == "MOSDMap lied about what maps it had?");
7694 }
7695
7696 // even if this map isn't from a mon, we may have satisfied our subscription
7697 monc->sub_got("osdmap", last);
7698
7699 if (!m->maps.empty() && requested_full_first) {
7700 dout(10) << __func__ << " still missing full maps " << requested_full_first
7701 << ".." << requested_full_last << dendl;
7702 rerequest_full_maps();
7703 }
7704
7c673cae
FG
7705 if (superblock.oldest_map) {
7706 // make sure we at least keep pace with incoming maps
7707 trim_maps(m->oldest_map, last - first + 1, skip_maps);
7708 }
7709
7710 if (!superblock.oldest_map || skip_maps)
7711 superblock.oldest_map = first;
7712 superblock.newest_map = last;
7713 superblock.current_epoch = last;
7714
7715 // note in the superblock that we were clean thru the prior epoch
7716 epoch_t boot_epoch = service.get_boot_epoch();
7717 if (boot_epoch && boot_epoch >= superblock.mounted) {
7718 superblock.mounted = boot_epoch;
7719 superblock.clean_thru = last;
7720 }
7721
7722 // superblock and commit
7723 write_superblock(t);
7724 store->queue_transaction(
7725 service.meta_osr.get(),
7726 std::move(t),
7727 new C_OnMapApply(&service, pinned_maps, last),
7728 new C_OnMapCommit(this, start, last, m), 0);
7729 service.publish_superblock(superblock);
7730}
7731
7732void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
7733{
7734 dout(10) << __func__ << " " << first << ".." << last << dendl;
7735 if (is_stopping()) {
7736 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7737 return;
7738 }
7739 Mutex::Locker l(osd_lock);
31f18b77
FG
7740 if (is_stopping()) {
7741 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7742 return;
7743 }
7c673cae
FG
7744 map_lock.get_write();
7745
7746 bool do_shutdown = false;
7747 bool do_restart = false;
7748 bool network_error = false;
7749
7750 // advance through the new maps
7751 for (epoch_t cur = first; cur <= last; cur++) {
7752 dout(10) << " advance to epoch " << cur
7753 << " (<= last " << last
7754 << " <= newest_map " << superblock.newest_map
7755 << ")" << dendl;
7756
7757 OSDMapRef newmap = get_map(cur);
7758 assert(newmap); // we just cached it above!
7759
7760 // start blacklisting messages sent to peers that go down.
7761 service.pre_publish_map(newmap);
7762
7763 // kill connections to newly down osds
7764 bool waited_for_reservations = false;
7765 set<int> old;
7766 osdmap->get_all_osds(old);
7767 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
7768 if (*p != whoami &&
7769 osdmap->is_up(*p) && // in old map
7770 newmap->is_down(*p)) { // but not the new one
7771 if (!waited_for_reservations) {
7772 service.await_reserved_maps();
7773 waited_for_reservations = true;
7774 }
7775 note_down_osd(*p);
7776 } else if (*p != whoami &&
7777 osdmap->is_down(*p) &&
7778 newmap->is_up(*p)) {
7779 note_up_osd(*p);
7780 }
7781 }
7782
31f18b77
FG
7783 if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
7784 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
7785 (osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
7c673cae
FG
7786 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
7787 << dendl;
7788 if (is_booting()) {
7789 // this captures the case where we sent the boot message while
7790 // NOUP was being set on the mon and our boot request was
7791 // dropped, and then later it is cleared. it imperfectly
7792 // handles the case where our original boot message was not
7793 // dropped and we restart even though we might have booted, but
7794 // that is harmless (boot will just take slightly longer).
7795 do_restart = true;
7796 }
7797 }
31f18b77
FG
7798 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
7799 newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
7800 dout(10) << __func__ << " require_osd_release reached luminous in "
7801 << newmap->get_epoch() << dendl;
7802 clear_pg_stat_queue();
224ce89b 7803 clear_outstanding_pg_stats();
31f18b77 7804 }
7c673cae
FG
7805
7806 osdmap = newmap;
7807 epoch_t up_epoch;
7808 epoch_t boot_epoch;
7809 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
7810 if (!up_epoch &&
7811 osdmap->is_up(whoami) &&
7812 osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
7813 up_epoch = osdmap->get_epoch();
7814 dout(10) << "up_epoch is " << up_epoch << dendl;
7815 if (!boot_epoch) {
7816 boot_epoch = osdmap->get_epoch();
7817 dout(10) << "boot_epoch is " << boot_epoch << dendl;
7818 }
7819 service.set_epochs(&boot_epoch, &up_epoch, NULL);
7820 }
7821 }
7822
7823 had_map_since = ceph_clock_now();
7824
7825 epoch_t _bind_epoch = service.get_bind_epoch();
7826 if (osdmap->is_up(whoami) &&
7827 osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
7828 _bind_epoch < osdmap->get_up_from(whoami)) {
7829
7830 if (is_booting()) {
7831 dout(1) << "state: booting -> active" << dendl;
7832 set_state(STATE_ACTIVE);
7833
7834 // set incarnation so that osd_reqid_t's we generate for our
7835 // objecter requests are unique across restarts.
7836 service.objecter->set_client_incarnation(osdmap->get_epoch());
7837 }
7838 }
7839
7840 if (osdmap->get_epoch() > 0 &&
7841 is_active()) {
7842 if (!osdmap->exists(whoami)) {
7843 dout(0) << "map says i do not exist. shutting down." << dendl;
7844 do_shutdown = true; // don't call shutdown() while we have
7845 // everything paused
7846 } else if (!osdmap->is_up(whoami) ||
7847 !osdmap->get_addr(whoami).probably_equals(
7848 client_messenger->get_myaddr()) ||
7849 !osdmap->get_cluster_addr(whoami).probably_equals(
7850 cluster_messenger->get_myaddr()) ||
7851 !osdmap->get_hb_back_addr(whoami).probably_equals(
7852 hb_back_server_messenger->get_myaddr()) ||
7853 (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7854 !osdmap->get_hb_front_addr(whoami).probably_equals(
7855 hb_front_server_messenger->get_myaddr()))) {
7856 if (!osdmap->is_up(whoami)) {
7857 if (service.is_preparing_to_stop() || service.is_stopping()) {
7858 service.got_stop_ack();
7859 } else {
c07f9fc5
FG
7860 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
7861 "but it is still running";
7862 clog->debug() << "map e" << osdmap->get_epoch()
7863 << " wrongly marked me down at e"
7864 << osdmap->get_down_at(whoami);
7c673cae
FG
7865 }
7866 } else if (!osdmap->get_addr(whoami).probably_equals(
7867 client_messenger->get_myaddr())) {
7868 clog->error() << "map e" << osdmap->get_epoch()
7869 << " had wrong client addr (" << osdmap->get_addr(whoami)
7870 << " != my " << client_messenger->get_myaddr() << ")";
7871 } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
7872 cluster_messenger->get_myaddr())) {
7873 clog->error() << "map e" << osdmap->get_epoch()
7874 << " had wrong cluster addr ("
7875 << osdmap->get_cluster_addr(whoami)
7876 << " != my " << cluster_messenger->get_myaddr() << ")";
7877 } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
7878 hb_back_server_messenger->get_myaddr())) {
7879 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 7880 << " had wrong heartbeat back addr ("
7c673cae
FG
7881 << osdmap->get_hb_back_addr(whoami)
7882 << " != my " << hb_back_server_messenger->get_myaddr()
7883 << ")";
7884 } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7885 !osdmap->get_hb_front_addr(whoami).probably_equals(
7886 hb_front_server_messenger->get_myaddr())) {
7887 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 7888 << " had wrong heartbeat front addr ("
7c673cae
FG
7889 << osdmap->get_hb_front_addr(whoami)
7890 << " != my " << hb_front_server_messenger->get_myaddr()
7891 << ")";
7892 }
7893
7894 if (!service.is_stopping()) {
7895 epoch_t up_epoch = 0;
7896 epoch_t bind_epoch = osdmap->get_epoch();
7897 service.set_epochs(NULL,&up_epoch, &bind_epoch);
7898 do_restart = true;
7899
7900 //add markdown log
7901 utime_t now = ceph_clock_now();
7902 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
7903 osd_markdown_log.push_back(now);
7904 //clear all out-of-date log
7905 while (!osd_markdown_log.empty() &&
7906 osd_markdown_log.front() + grace < now)
7907 osd_markdown_log.pop_front();
7908 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
7909 dout(0) << __func__ << " marked down "
7910 << osd_markdown_log.size()
7911 << " > osd_max_markdown_count "
7912 << cct->_conf->osd_max_markdown_count
7913 << " in last " << grace << " seconds, shutting down"
7914 << dendl;
7915 do_restart = false;
7916 do_shutdown = true;
7917 }
7918
7919 start_waiting_for_healthy();
7920
7921 set<int> avoid_ports;
7922#if defined(__FreeBSD__)
7923 // prevent FreeBSD from grabbing the client_messenger port during
7924 // rebinding. In which case a cluster_meesneger will connect also
7925 // to the same port
7926 avoid_ports.insert(client_messenger->get_myaddr().get_port());
7927#endif
7928 avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
7929 avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
7930 avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
7931
7932 int r = cluster_messenger->rebind(avoid_ports);
7933 if (r != 0) {
7934 do_shutdown = true; // FIXME: do_restart?
7935 network_error = true;
7936 dout(0) << __func__ << " marked down:"
7937 << " rebind cluster_messenger failed" << dendl;
7938 }
7939
7940 r = hb_back_server_messenger->rebind(avoid_ports);
7941 if (r != 0) {
7942 do_shutdown = true; // FIXME: do_restart?
7943 network_error = true;
7944 dout(0) << __func__ << " marked down:"
7945 << " rebind hb_back_server_messenger failed" << dendl;
7946 }
7947
7948 r = hb_front_server_messenger->rebind(avoid_ports);
7949 if (r != 0) {
7950 do_shutdown = true; // FIXME: do_restart?
7951 network_error = true;
7952 dout(0) << __func__ << " marked down:"
7953 << " rebind hb_front_server_messenger failed" << dendl;
7954 }
7955
7956 hb_front_client_messenger->mark_down_all();
7957 hb_back_client_messenger->mark_down_all();
7958
7959 reset_heartbeat_peers();
7960 }
7961 }
7962 }
7963
7964 map_lock.put_write();
7965
7966 check_osdmap_features(store);
7967
7968 // yay!
7969 consume_map();
7970
7971 if (is_active() || is_waiting_for_healthy())
7972 maybe_update_heartbeat_peers();
7973
7974 if (!is_active()) {
7975 dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
7976 peering_wq.drain();
7977 } else {
7978 activate_map();
7979 }
7980
31f18b77 7981 if (do_shutdown) {
7c673cae
FG
7982 if (network_error) {
7983 Mutex::Locker l(heartbeat_lock);
7984 map<int,pair<utime_t,entity_inst_t>>::iterator it =
7985 failure_pending.begin();
7986 while (it != failure_pending.end()) {
7987 dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
7988 << it->first << dendl;
7989 send_still_alive(osdmap->get_epoch(), it->second.second);
7990 failure_pending.erase(it++);
7991 }
7992 }
7993 // trigger shutdown in a different thread
7994 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
7995 queue_async_signal(SIGINT);
7996 }
31f18b77
FG
7997 else if (m->newest_map && m->newest_map > last) {
7998 dout(10) << " msg say newest map is " << m->newest_map
7999 << ", requesting more" << dendl;
8000 osdmap_subscribe(osdmap->get_epoch()+1, false);
8001 }
7c673cae
FG
8002 else if (is_preboot()) {
8003 if (m->get_source().is_mon())
8004 _preboot(m->oldest_map, m->newest_map);
8005 else
8006 start_boot();
8007 }
8008 else if (do_restart)
8009 start_boot();
8010
8011}
8012
8013void OSD::check_osdmap_features(ObjectStore *fs)
8014{
8015 // adjust required feature bits?
8016
8017 // we have to be a bit careful here, because we are accessing the
8018 // Policy structures without taking any lock. in particular, only
8019 // modify integer values that can safely be read by a racing CPU.
8020 // since we are only accessing existing Policy structures a their
8021 // current memory location, and setting or clearing bits in integer
8022 // fields, and we are the only writer, this is not a problem.
8023
8024 {
8025 Messenger::Policy p = client_messenger->get_default_policy();
8026 uint64_t mask;
8027 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8028 if ((p.features_required & mask) != features) {
8029 dout(0) << "crush map has features " << features
8030 << ", adjusting msgr requires for clients" << dendl;
8031 p.features_required = (p.features_required & ~mask) | features;
8032 client_messenger->set_default_policy(p);
8033 }
8034 }
8035 {
8036 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8037 uint64_t mask;
8038 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8039 if ((p.features_required & mask) != features) {
8040 dout(0) << "crush map has features " << features
8041 << " was " << p.features_required
8042 << ", adjusting msgr requires for mons" << dendl;
8043 p.features_required = (p.features_required & ~mask) | features;
8044 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8045 }
8046 }
8047 {
8048 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8049 uint64_t mask;
8050 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8051
8052 if ((p.features_required & mask) != features) {
8053 dout(0) << "crush map has features " << features
8054 << ", adjusting msgr requires for osds" << dendl;
8055 p.features_required = (p.features_required & ~mask) | features;
8056 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8057 }
8058
8059 if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
8060 !superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8061 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8062 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8063 ObjectStore::Transaction t;
8064 write_superblock(t);
8065 int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
8066 assert(err == 0);
8067 }
8068 }
8069}
8070
8071bool OSD::advance_pg(
8072 epoch_t osd_epoch, PG *pg,
8073 ThreadPool::TPHandle &handle,
8074 PG::RecoveryCtx *rctx,
31f18b77 8075 set<PGRef> *new_pgs)
7c673cae
FG
8076{
8077 assert(pg->is_locked());
8078 epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
8079 OSDMapRef lastmap = pg->get_osdmap();
8080
8081 if (lastmap->get_epoch() == osd_epoch)
8082 return true;
8083 assert(lastmap->get_epoch() < osd_epoch);
8084
8085 epoch_t min_epoch = service.get_min_pg_epoch();
8086 epoch_t max;
8087 if (min_epoch) {
8088 max = min_epoch + cct->_conf->osd_map_max_advance;
8089 } else {
8090 max = next_epoch + cct->_conf->osd_map_max_advance;
8091 }
8092
8093 for (;
8094 next_epoch <= osd_epoch && next_epoch <= max;
8095 ++next_epoch) {
8096 OSDMapRef nextmap = service.try_get_map(next_epoch);
8097 if (!nextmap) {
8098 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8099 // make sure max is bumped up so that we can get past any
8100 // gap in maps
8101 max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
8102 continue;
8103 }
8104
8105 vector<int> newup, newacting;
8106 int up_primary, acting_primary;
8107 nextmap->pg_to_up_acting_osds(
8108 pg->info.pgid.pgid,
8109 &newup, &up_primary,
8110 &newacting, &acting_primary);
8111 pg->handle_advance_map(
8112 nextmap, lastmap, newup, up_primary,
8113 newacting, acting_primary, rctx);
8114
8115 // Check for split!
8116 set<spg_t> children;
8117 spg_t parent(pg->info.pgid);
8118 if (parent.is_split(
8119 lastmap->get_pg_num(pg->pool.id),
8120 nextmap->get_pg_num(pg->pool.id),
8121 &children)) {
8122 service.mark_split_in_progress(pg->info.pgid, children);
8123 split_pgs(
8124 pg, children, new_pgs, lastmap, nextmap,
8125 rctx);
8126 }
8127
8128 lastmap = nextmap;
8129 handle.reset_tp_timeout();
8130 }
8131 service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
8132 pg->handle_activate_map(rctx);
8133 if (next_epoch <= osd_epoch) {
8134 dout(10) << __func__ << " advanced to max " << max
8135 << " past min epoch " << min_epoch
8136 << " ... will requeue " << *pg << dendl;
8137 return false;
8138 }
8139 return true;
8140}
8141
8142void OSD::consume_map()
8143{
8144 assert(osd_lock.is_locked());
8145 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8146
8147 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8148 list<PGRef> to_remove;
8149
8150 // scan pg's
8151 {
8152 RWLock::RLocker l(pg_map_lock);
8153 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8154 it != pg_map.end();
8155 ++it) {
8156 PG *pg = it->second;
8157 pg->lock();
8158 if (pg->is_primary())
8159 num_pg_primary++;
8160 else if (pg->is_replica())
8161 num_pg_replica++;
8162 else
8163 num_pg_stray++;
8164
8165 if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
8166 //pool is deleted!
8167 to_remove.push_back(PGRef(pg));
8168 } else {
8169 service.init_splits_between(it->first, service.get_osdmap(), osdmap);
8170 }
8171
8172 pg->unlock();
8173 }
8174 }
8175
8176 for (list<PGRef>::iterator i = to_remove.begin();
8177 i != to_remove.end();
8178 to_remove.erase(i++)) {
8179 RWLock::WLocker locker(pg_map_lock);
8180 (*i)->lock();
8181 _remove_pg(&**i);
8182 (*i)->unlock();
8183 }
8184
8185 service.expand_pg_num(service.get_osdmap(), osdmap);
8186
8187 service.pre_publish_map(osdmap);
8188 service.await_reserved_maps();
8189 service.publish_map(osdmap);
8190
8191 service.maybe_inject_dispatch_delay();
8192
8193 dispatch_sessions_waiting_on_map();
8194
8195 service.maybe_inject_dispatch_delay();
8196
8197 // remove any PGs which we no longer host from the session waiting_for_pg lists
8198 dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8199 op_shardedwq.prune_pg_waiters(osdmap, whoami);
8200
8201 service.maybe_inject_dispatch_delay();
8202
8203 // scan pg's
8204 {
8205 RWLock::RLocker l(pg_map_lock);
8206 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8207 it != pg_map.end();
8208 ++it) {
8209 PG *pg = it->second;
8210 pg->lock();
8211 pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8212 pg->unlock();
8213 }
8214
8215 logger->set(l_osd_pg, pg_map.size());
8216 }
8217 logger->set(l_osd_pg_primary, num_pg_primary);
8218 logger->set(l_osd_pg_replica, num_pg_replica);
8219 logger->set(l_osd_pg_stray, num_pg_stray);
8220}
8221
8222void OSD::activate_map()
8223{
8224 assert(osd_lock.is_locked());
8225
8226 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8227
8228 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8229 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8230 ceph_abort();
8231 }
8232
8233 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8234 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8235 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8236 }
8237
8238 // norecover?
8239 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8240 if (!service.recovery_is_paused()) {
8241 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8242 service.pause_recovery();
8243 }
8244 } else {
8245 if (service.recovery_is_paused()) {
8246 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8247 service.unpause_recovery();
8248 }
8249 }
8250
8251 service.activate_map();
8252
8253 // process waiters
8254 take_waiters(waiting_for_osdmap);
8255}
8256
8257bool OSD::require_mon_peer(const Message *m)
8258{
8259 if (!m->get_connection()->peer_is_mon()) {
8260 dout(0) << "require_mon_peer received from non-mon "
8261 << m->get_connection()->get_peer_addr()
8262 << " " << *m << dendl;
8263 return false;
8264 }
8265 return true;
8266}
8267
8268bool OSD::require_mon_or_mgr_peer(const Message *m)
8269{
8270 if (!m->get_connection()->peer_is_mon() &&
8271 !m->get_connection()->peer_is_mgr()) {
8272 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8273 << m->get_connection()->get_peer_addr()
8274 << " " << *m << dendl;
8275 return false;
8276 }
8277 return true;
8278}
8279
8280bool OSD::require_osd_peer(const Message *m)
8281{
8282 if (!m->get_connection()->peer_is_osd()) {
8283 dout(0) << "require_osd_peer received from non-osd "
8284 << m->get_connection()->get_peer_addr()
8285 << " " << *m << dendl;
8286 return false;
8287 }
8288 return true;
8289}
8290
8291bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8292{
8293 epoch_t up_epoch = service.get_up_epoch();
8294 if (epoch < up_epoch) {
8295 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8296 return false;
8297 }
8298
8299 if (!is_active()) {
8300 dout(7) << "still in boot state, dropping message " << *m << dendl;
8301 return false;
8302 }
8303
8304 return true;
8305}
8306
8307bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8308 bool is_fast_dispatch)
8309{
8310 int from = m->get_source().num();
8311
8312 if (map->is_down(from) ||
8313 (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8314 dout(5) << "from dead osd." << from << ", marking down, "
8315 << " msg was " << m->get_source_inst().addr
8316 << " expected " << (map->is_up(from) ?
8317 map->get_cluster_addr(from) : entity_addr_t())
8318 << dendl;
8319 ConnectionRef con = m->get_connection();
8320 con->mark_down();
8321 Session *s = static_cast<Session*>(con->get_priv());
8322 if (s) {
8323 if (!is_fast_dispatch)
8324 s->session_dispatch_lock.Lock();
8325 clear_session_waiting_on_map(s);
8326 con->set_priv(NULL); // break ref <-> session cycle, if any
8327 if (!is_fast_dispatch)
8328 s->session_dispatch_lock.Unlock();
8329 s->put();
8330 }
8331 return false;
8332 }
8333 return true;
8334}
8335
8336
8337/*
8338 * require that we have same (or newer) map, and that
8339 * the source is the pg primary.
8340 */
8341bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8342 bool is_fast_dispatch)
8343{
8344 const Message *m = op->get_req();
8345 dout(15) << "require_same_or_newer_map " << epoch
8346 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8347
8348 assert(osd_lock.is_locked());
8349
8350 // do they have a newer map?
8351 if (epoch > osdmap->get_epoch()) {
8352 dout(7) << "waiting for newer map epoch " << epoch
8353 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8354 wait_for_new_map(op);
8355 return false;
8356 }
8357
8358 if (!require_self_aliveness(op->get_req(), epoch)) {
8359 return false;
8360 }
8361
8362 // ok, our map is same or newer.. do they still exist?
8363 if (m->get_connection()->get_messenger() == cluster_messenger &&
8364 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8365 return false;
8366 }
8367
8368 return true;
8369}
8370
8371
8372
8373
8374
8375// ----------------------------------------
8376// pg creation
8377
8378void OSD::split_pgs(
8379 PG *parent,
31f18b77 8380 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
7c673cae
FG
8381 OSDMapRef curmap,
8382 OSDMapRef nextmap,
8383 PG::RecoveryCtx *rctx)
8384{
8385 unsigned pg_num = nextmap->get_pg_num(
8386 parent->pool.id);
8387 parent->update_snap_mapper_bits(
8388 parent->info.pgid.get_split_bits(pg_num)
8389 );
8390
8391 vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
8392 parent->info.stats.stats.sum.split(updated_stats);
8393
8394 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8395 for (set<spg_t>::const_iterator i = childpgids.begin();
8396 i != childpgids.end();
8397 ++i, ++stat_iter) {
8398 assert(stat_iter != updated_stats.end());
8399 dout(10) << "Splitting " << *parent << " into " << *i << dendl;
8400 assert(service.splitting(*i));
8401 PG* child = _make_pg(nextmap, *i);
8402 child->lock(true);
8403 out_pgs->insert(child);
8404 rctx->created_pgs.insert(child);
8405
8406 unsigned split_bits = i->get_split_bits(pg_num);
8407 dout(10) << "pg_num is " << pg_num << dendl;
8408 dout(10) << "m_seed " << i->ps() << dendl;
8409 dout(10) << "split_bits is " << split_bits << dendl;
8410
8411 parent->split_colls(
8412 *i,
8413 split_bits,
8414 i->ps(),
8415 &child->pool.info,
8416 rctx->transaction);
8417 parent->split_into(
8418 i->pgid,
8419 child,
8420 split_bits);
8421 child->info.stats.stats.sum = *stat_iter;
8422
8423 child->write_if_dirty(*(rctx->transaction));
8424 child->unlock();
8425 }
8426 assert(stat_iter != updated_stats.end());
8427 parent->info.stats.stats.sum = *stat_iter;
8428 parent->write_if_dirty(*(rctx->transaction));
8429}
8430
8431/*
8432 * holding osd_lock
8433 */
8434void OSD::handle_pg_create(OpRequestRef op)
8435{
8436 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
8437 assert(m->get_type() == MSG_OSD_PG_CREATE);
8438
8439 dout(10) << "handle_pg_create " << *m << dendl;
8440
8441 if (!require_mon_peer(op->get_req())) {
8442 return;
8443 }
8444
8445 if (!require_same_or_newer_map(op, m->epoch, false))
8446 return;
8447
8448 op->mark_started();
8449
8450 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8451 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8452 p != m->mkpg.end();
8453 ++p, ++ci) {
8454 assert(ci != m->ctimes.end() && ci->first == p->first);
8455 epoch_t created = p->second.created;
8456 if (p->second.split_bits) // Skip split pgs
8457 continue;
8458 pg_t on = p->first;
8459
8460 if (on.preferred() >= 0) {
8461 dout(20) << "ignoring localized pg " << on << dendl;
8462 continue;
8463 }
8464
8465 if (!osdmap->have_pg_pool(on.pool())) {
8466 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8467 continue;
8468 }
8469
8470 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8471
8472 // is it still ours?
8473 vector<int> up, acting;
8474 int up_primary = -1;
8475 int acting_primary = -1;
8476 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
8477 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
8478
8479 if (acting_primary != whoami) {
8480 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8481 << "), my role=" << role << ", skipping" << dendl;
8482 continue;
8483 }
8484
8485 spg_t pgid;
8486 bool mapped = osdmap->get_primary_shard(on, &pgid);
8487 assert(mapped);
8488
8489 PastIntervals pi(
8490 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8491 *osdmap);
8492 pg_history_t history;
8493 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
8494
8495 // The mon won't resend unless the primary changed, so
8496 // we ignore same_interval_since. We'll pass this history
8497 // to handle_pg_peering_evt with the current epoch as the
8498 // event -- the project_pg_history check in
8499 // handle_pg_peering_evt will be a noop.
8500 if (history.same_primary_since > m->epoch) {
8501 dout(10) << __func__ << ": got obsolete pg create on pgid "
8502 << pgid << " from epoch " << m->epoch
8503 << ", primary changed in " << history.same_primary_since
8504 << dendl;
8505 continue;
8506 }
8507
8508 if (handle_pg_peering_evt(
8509 pgid,
8510 history,
8511 pi,
8512 osdmap->get_epoch(),
8513 PG::CephPeeringEvtRef(
8514 new PG::CephPeeringEvt(
8515 osdmap->get_epoch(),
8516 osdmap->get_epoch(),
8517 PG::NullEvt()))
8518 ) == -EEXIST) {
8519 service.send_pg_created(pgid.pgid);
8520 }
8521 }
8522 last_pg_create_epoch = m->epoch;
8523
8524 maybe_update_heartbeat_peers();
8525}
8526
8527
8528// ----------------------------------------
8529// peering and recovery
8530
8531PG::RecoveryCtx OSD::create_context()
8532{
8533 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8534 C_Contexts *on_applied = new C_Contexts(cct);
8535 C_Contexts *on_safe = new C_Contexts(cct);
8536 map<int, map<spg_t,pg_query_t> > *query_map =
8537 new map<int, map<spg_t, pg_query_t> >;
8538 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8539 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8540 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8541 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8542 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8543 on_applied, on_safe, t);
8544 return rctx;
8545}
8546
8547struct C_OpenPGs : public Context {
8548 set<PGRef> pgs;
8549 ObjectStore *store;
8550 OSD *osd;
8551 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8552 pgs.swap(p);
8553 }
8554 void finish(int r) override {
8555 RWLock::RLocker l(osd->pg_map_lock);
8556 for (auto p : pgs) {
8557 if (osd->pg_map.count(p->info.pgid)) {
8558 p->ch = store->open_collection(p->coll);
8559 assert(p->ch);
8560 }
8561 }
8562 }
8563};
8564
8565void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
8566 ThreadPool::TPHandle *handle)
8567{
8568 if (!ctx.transaction->empty()) {
8569 if (!ctx.created_pgs.empty()) {
8570 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8571 }
8572 int tr = store->queue_transaction(
8573 pg->osr.get(),
8574 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
8575 TrackedOpRef(), handle);
8576 delete (ctx.transaction);
8577 assert(tr == 0);
8578 ctx.transaction = new ObjectStore::Transaction;
8579 ctx.on_applied = new C_Contexts(cct);
8580 ctx.on_safe = new C_Contexts(cct);
8581 }
8582}
8583
8584void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
8585 ThreadPool::TPHandle *handle)
8586{
8587 if (service.get_osdmap()->is_up(whoami) &&
8588 is_active()) {
8589 do_notifies(*ctx.notify_list, curmap);
8590 do_queries(*ctx.query_map, curmap);
8591 do_infos(*ctx.info_map, curmap);
8592 }
8593 delete ctx.notify_list;
8594 delete ctx.query_map;
8595 delete ctx.info_map;
8596 if ((ctx.on_applied->empty() &&
8597 ctx.on_safe->empty() &&
8598 ctx.transaction->empty() &&
8599 ctx.created_pgs.empty()) || !pg) {
8600 delete ctx.transaction;
8601 delete ctx.on_applied;
8602 delete ctx.on_safe;
8603 assert(ctx.created_pgs.empty());
8604 } else {
8605 if (!ctx.created_pgs.empty()) {
8606 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8607 }
8608 int tr = store->queue_transaction(
8609 pg->osr.get(),
8610 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
8611 handle);
8612 delete (ctx.transaction);
8613 assert(tr == 0);
8614 }
8615}
8616
8617/** do_notifies
8618 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8619 * content for, and they are primary for.
8620 */
8621
8622void OSD::do_notifies(
8623 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8624 OSDMapRef curmap)
8625{
8626 for (map<int,
8627 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8628 notify_list.begin();
8629 it != notify_list.end();
8630 ++it) {
8631 if (!curmap->is_up(it->first)) {
8632 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8633 continue;
8634 }
8635 ConnectionRef con = service.get_con_osd_cluster(
8636 it->first, curmap->get_epoch());
8637 if (!con) {
8638 dout(20) << __func__ << " skipping osd." << it->first
8639 << " (NULL con)" << dendl;
8640 continue;
8641 }
8642 service.share_map_peer(it->first, con.get(), curmap);
8643 dout(7) << __func__ << " osd " << it->first
8644 << " on " << it->second.size() << " PGs" << dendl;
8645 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8646 it->second);
8647 con->send_message(m);
8648 }
8649}
8650
8651
8652/** do_queries
8653 * send out pending queries for info | summaries
8654 */
8655void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8656 OSDMapRef curmap)
8657{
8658 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8659 pit != query_map.end();
8660 ++pit) {
8661 if (!curmap->is_up(pit->first)) {
8662 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8663 continue;
8664 }
8665 int who = pit->first;
8666 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8667 if (!con) {
8668 dout(20) << __func__ << " skipping osd." << who
8669 << " (NULL con)" << dendl;
8670 continue;
8671 }
8672 service.share_map_peer(who, con.get(), curmap);
8673 dout(7) << __func__ << " querying osd." << who
8674 << " on " << pit->second.size() << " PGs" << dendl;
8675 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8676 con->send_message(m);
8677 }
8678}
8679
8680
8681void OSD::do_infos(map<int,
8682 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8683 OSDMapRef curmap)
8684{
8685 for (map<int,
8686 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
8687 info_map.begin();
8688 p != info_map.end();
8689 ++p) {
8690 if (!curmap->is_up(p->first)) {
8691 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
8692 continue;
8693 }
8694 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
8695 i != p->second.end();
8696 ++i) {
8697 dout(20) << __func__ << " sending info " << i->first.info
8698 << " to shard " << p->first << dendl;
8699 }
8700 ConnectionRef con = service.get_con_osd_cluster(
8701 p->first, curmap->get_epoch());
8702 if (!con) {
8703 dout(20) << __func__ << " skipping osd." << p->first
8704 << " (NULL con)" << dendl;
8705 continue;
8706 }
8707 service.share_map_peer(p->first, con.get(), curmap);
8708 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
8709 m->pg_list = p->second;
8710 con->send_message(m);
8711 }
8712 info_map.clear();
8713}
8714
8715
8716/** PGNotify
8717 * from non-primary to primary
8718 * includes pg_info_t.
8719 * NOTE: called with opqueue active.
8720 */
8721void OSD::handle_pg_notify(OpRequestRef op)
8722{
8723 const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
8724 assert(m->get_type() == MSG_OSD_PG_NOTIFY);
8725
8726 dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
8727 int from = m->get_source().num();
8728
8729 if (!require_osd_peer(op->get_req()))
8730 return;
8731
8732 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8733 return;
8734
8735 op->mark_started();
8736
8737 for (auto it = m->get_pg_list().begin();
8738 it != m->get_pg_list().end();
8739 ++it) {
8740 if (it->first.info.pgid.preferred() >= 0) {
8741 dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
8742 continue;
8743 }
8744
8745 handle_pg_peering_evt(
8746 spg_t(it->first.info.pgid.pgid, it->first.to),
8747 it->first.info.history, it->second,
8748 it->first.query_epoch,
8749 PG::CephPeeringEvtRef(
8750 new PG::CephPeeringEvt(
8751 it->first.epoch_sent, it->first.query_epoch,
8752 PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
8753 op->get_req()->get_connection()->get_features())))
8754 );
8755 }
8756}
8757
8758void OSD::handle_pg_log(OpRequestRef op)
8759{
8760 MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
8761 assert(m->get_type() == MSG_OSD_PG_LOG);
8762 dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
8763
8764 if (!require_osd_peer(op->get_req()))
8765 return;
8766
8767 int from = m->get_source().num();
8768 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8769 return;
8770
8771 if (m->info.pgid.preferred() >= 0) {
8772 dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
8773 return;
8774 }
8775
8776 op->mark_started();
8777 handle_pg_peering_evt(
8778 spg_t(m->info.pgid.pgid, m->to),
8779 m->info.history, m->past_intervals, m->get_epoch(),
8780 PG::CephPeeringEvtRef(
8781 new PG::CephPeeringEvt(
8782 m->get_epoch(), m->get_query_epoch(),
8783 PG::MLogRec(pg_shard_t(from, m->from), m)))
8784 );
8785}
8786
8787void OSD::handle_pg_info(OpRequestRef op)
8788{
8789 const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
8790 assert(m->get_type() == MSG_OSD_PG_INFO);
8791 dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
8792
8793 if (!require_osd_peer(op->get_req()))
8794 return;
8795
8796 int from = m->get_source().num();
8797 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8798 return;
8799
8800 op->mark_started();
8801
8802 for (auto p = m->pg_list.begin();
8803 p != m->pg_list.end();
8804 ++p) {
8805 if (p->first.info.pgid.preferred() >= 0) {
8806 dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
8807 continue;
8808 }
8809
8810 handle_pg_peering_evt(
8811 spg_t(p->first.info.pgid.pgid, p->first.to),
8812 p->first.info.history, p->second, p->first.epoch_sent,
8813 PG::CephPeeringEvtRef(
8814 new PG::CephPeeringEvt(
8815 p->first.epoch_sent, p->first.query_epoch,
8816 PG::MInfoRec(
8817 pg_shard_t(
8818 from, p->first.from), p->first.info, p->first.epoch_sent)))
8819 );
8820 }
8821}
8822
8823void OSD::handle_pg_trim(OpRequestRef op)
8824{
8825 const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
8826 assert(m->get_type() == MSG_OSD_PG_TRIM);
8827
8828 dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
8829
8830 if (!require_osd_peer(op->get_req()))
8831 return;
8832
8833 int from = m->get_source().num();
8834 if (!require_same_or_newer_map(op, m->epoch, false))
8835 return;
8836
8837 if (m->pgid.preferred() >= 0) {
8838 dout(10) << "ignoring localized pg " << m->pgid << dendl;
8839 return;
8840 }
8841
8842 op->mark_started();
8843
8844 PG *pg = _lookup_lock_pg(m->pgid);
8845 if(!pg) {
8846 dout(10) << " don't have pg " << m->pgid << dendl;
8847 return;
8848 }
8849
8850 if (m->epoch < pg->info.history.same_interval_since) {
8851 dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
8852 pg->unlock();
8853 return;
8854 }
8855
8856 if (pg->is_primary()) {
8857 // peer is informing us of their last_complete_ondisk
8858 dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
8859 pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
8860 m->trim_to;
8861 // trim log when the pg is recovered
8862 pg->calc_min_last_complete_ondisk();
8863 } else {
8864 // primary is instructing us to trim
8865 ObjectStore::Transaction t;
8866 pg->pg_log.trim(m->trim_to, pg->info);
8867 pg->dirty_info = true;
8868 pg->write_if_dirty(t);
8869 int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
8870 assert(tr == 0);
8871 }
8872 pg->unlock();
8873}
8874
8875void OSD::handle_pg_backfill_reserve(OpRequestRef op)
8876{
8877 const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
8878 assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);
8879
8880 if (!require_osd_peer(op->get_req()))
8881 return;
8882 if (!require_same_or_newer_map(op, m->query_epoch, false))
8883 return;
8884
8885 PG::CephPeeringEvtRef evt;
8886 if (m->type == MBackfillReserve::REQUEST) {
8887 evt = PG::CephPeeringEvtRef(
8888 new PG::CephPeeringEvt(
8889 m->query_epoch,
8890 m->query_epoch,
8891 PG::RequestBackfillPrio(m->priority)));
8892 } else if (m->type == MBackfillReserve::GRANT) {
8893 evt = PG::CephPeeringEvtRef(
8894 new PG::CephPeeringEvt(
8895 m->query_epoch,
8896 m->query_epoch,
8897 PG::RemoteBackfillReserved()));
8898 } else if (m->type == MBackfillReserve::REJECT) {
8899 evt = PG::CephPeeringEvtRef(
8900 new PG::CephPeeringEvt(
8901 m->query_epoch,
8902 m->query_epoch,
8903 PG::RemoteReservationRejected()));
8904 } else {
8905 ceph_abort();
8906 }
8907
8908 if (service.splitting(m->pgid)) {
8909 peering_wait_for_split[m->pgid].push_back(evt);
8910 return;
8911 }
8912
8913 PG *pg = _lookup_lock_pg(m->pgid);
8914 if (!pg) {
8915 dout(10) << " don't have pg " << m->pgid << dendl;
8916 return;
8917 }
8918
8919 pg->queue_peering_event(evt);
8920 pg->unlock();
8921}
8922
8923void OSD::handle_pg_recovery_reserve(OpRequestRef op)
8924{
8925 const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
8926 assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);
8927
8928 if (!require_osd_peer(op->get_req()))
8929 return;
8930 if (!require_same_or_newer_map(op, m->query_epoch, false))
8931 return;
8932
8933 PG::CephPeeringEvtRef evt;
8934 if (m->type == MRecoveryReserve::REQUEST) {
8935 evt = PG::CephPeeringEvtRef(
8936 new PG::CephPeeringEvt(
8937 m->query_epoch,
8938 m->query_epoch,
8939 PG::RequestRecovery()));
8940 } else if (m->type == MRecoveryReserve::GRANT) {
8941 evt = PG::CephPeeringEvtRef(
8942 new PG::CephPeeringEvt(
8943 m->query_epoch,
8944 m->query_epoch,
8945 PG::RemoteRecoveryReserved()));
8946 } else if (m->type == MRecoveryReserve::RELEASE) {
8947 evt = PG::CephPeeringEvtRef(
8948 new PG::CephPeeringEvt(
8949 m->query_epoch,
8950 m->query_epoch,
8951 PG::RecoveryDone()));
8952 } else {
8953 ceph_abort();
8954 }
8955
8956 if (service.splitting(m->pgid)) {
8957 peering_wait_for_split[m->pgid].push_back(evt);
8958 return;
8959 }
8960
8961 PG *pg = _lookup_lock_pg(m->pgid);
8962 if (!pg) {
8963 dout(10) << " don't have pg " << m->pgid << dendl;
8964 return;
8965 }
8966
8967 pg->queue_peering_event(evt);
8968 pg->unlock();
8969}
8970
c07f9fc5
FG
8971void OSD::handle_force_recovery(Message *m)
8972{
8973 MOSDForceRecovery *msg = static_cast<MOSDForceRecovery*>(m);
8974 assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY);
c07f9fc5 8975
d2e6a577 8976 vector<PGRef> local_pgs;
c07f9fc5
FG
8977 local_pgs.reserve(msg->forced_pgs.size());
8978
d2e6a577
FG
8979 {
8980 RWLock::RLocker l(pg_map_lock);
8981 for (auto& i : msg->forced_pgs) {
8982 spg_t locpg;
8983 if (osdmap->get_primary_shard(i, &locpg)) {
8984 auto pg_map_entry = pg_map.find(locpg);
8985 if (pg_map_entry != pg_map.end()) {
8986 local_pgs.push_back(pg_map_entry->second);
8987 }
c07f9fc5
FG
8988 }
8989 }
8990 }
8991
8992 if (local_pgs.size()) {
8993 service.adjust_pg_priorities(local_pgs, msg->options);
8994 }
8995
8996 msg->put();
8997}
7c673cae
FG
8998
8999/** PGQuery
9000 * from primary to replica | stray
9001 * NOTE: called with opqueue active.
9002 */
9003void OSD::handle_pg_query(OpRequestRef op)
9004{
9005 assert(osd_lock.is_locked());
9006
9007 const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
9008 assert(m->get_type() == MSG_OSD_PG_QUERY);
9009
9010 if (!require_osd_peer(op->get_req()))
9011 return;
9012
9013 dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
9014 int from = m->get_source().num();
9015
9016 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9017 return;
9018
9019 op->mark_started();
9020
9021 map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
9022
9023 for (auto it = m->pg_list.begin();
9024 it != m->pg_list.end();
9025 ++it) {
9026 spg_t pgid = it->first;
9027
9028 if (pgid.preferred() >= 0) {
9029 dout(10) << "ignoring localized pg " << pgid << dendl;
9030 continue;
9031 }
9032
9033 if (service.splitting(pgid)) {
9034 peering_wait_for_split[pgid].push_back(
9035 PG::CephPeeringEvtRef(
9036 new PG::CephPeeringEvt(
9037 it->second.epoch_sent, it->second.epoch_sent,
9038 PG::MQuery(pg_shard_t(from, it->second.from),
9039 it->second, it->second.epoch_sent))));
9040 continue;
9041 }
9042
9043 {
9044 RWLock::RLocker l(pg_map_lock);
9045 if (pg_map.count(pgid)) {
9046 PG *pg = 0;
9047 pg = _lookup_lock_pg_with_map_lock_held(pgid);
9048 pg->queue_query(
9049 it->second.epoch_sent, it->second.epoch_sent,
9050 pg_shard_t(from, it->second.from), it->second);
9051 pg->unlock();
9052 continue;
9053 }
9054 }
9055
9056 if (!osdmap->have_pg_pool(pgid.pool()))
9057 continue;
9058
9059 // get active crush mapping
9060 int up_primary, acting_primary;
9061 vector<int> up, acting;
9062 osdmap->pg_to_up_acting_osds(
9063 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9064
9065 // same primary?
9066 pg_history_t history = it->second.history;
9067 bool valid_history = project_pg_history(
9068 pgid, history, it->second.epoch_sent,
9069 up, up_primary, acting, acting_primary);
9070
9071 if (!valid_history ||
9072 it->second.epoch_sent < history.same_interval_since) {
9073 dout(10) << " pg " << pgid << " dne, and pg has changed in "
9074 << history.same_interval_since
9075 << " (msg from " << it->second.epoch_sent << ")" << dendl;
9076 continue;
9077 }
9078
9079 dout(10) << " pg " << pgid << " dne" << dendl;
9080 pg_info_t empty(spg_t(pgid.pgid, it->second.to));
9081 /* This is racy, but that should be ok: if we complete the deletion
9082 * before the pg is recreated, we'll just start it off backfilling
9083 * instead of just empty */
9084 if (service.deleting_pgs.lookup(pgid))
9085 empty.set_last_backfill(hobject_t());
9086 if (it->second.type == pg_query_t::LOG ||
9087 it->second.type == pg_query_t::FULLLOG) {
9088 ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
9089 if (con) {
9090 MOSDPGLog *mlog = new MOSDPGLog(
9091 it->second.from, it->second.to,
9092 osdmap->get_epoch(), empty,
9093 it->second.epoch_sent);
9094 service.share_map_peer(from, con.get(), osdmap);
9095 con->send_message(mlog);
9096 }
9097 } else {
9098 notify_list[from].push_back(
9099 make_pair(
9100 pg_notify_t(
9101 it->second.from, it->second.to,
9102 it->second.epoch_sent,
9103 osdmap->get_epoch(),
9104 empty),
9105 PastIntervals(
9106 osdmap->get_pools().at(pgid.pool()).ec_pool(),
9107 *osdmap)));
9108 }
9109 }
9110 do_notifies(notify_list, osdmap);
9111}
9112
9113
9114void OSD::handle_pg_remove(OpRequestRef op)
9115{
9116 const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
9117 assert(m->get_type() == MSG_OSD_PG_REMOVE);
9118 assert(osd_lock.is_locked());
9119
9120 if (!require_osd_peer(op->get_req()))
9121 return;
9122
9123 dout(7) << "handle_pg_remove from " << m->get_source() << " on "
9124 << m->pg_list.size() << " pgs" << dendl;
9125
9126 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9127 return;
9128
9129 op->mark_started();
9130
9131 for (auto it = m->pg_list.begin();
9132 it != m->pg_list.end();
9133 ++it) {
9134 spg_t pgid = *it;
9135 if (pgid.preferred() >= 0) {
9136 dout(10) << "ignoring localized pg " << pgid << dendl;
9137 continue;
9138 }
9139
9140 RWLock::WLocker l(pg_map_lock);
9141 if (pg_map.count(pgid) == 0) {
9142 dout(10) << " don't have pg " << pgid << dendl;
9143 continue;
9144 }
9145 dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
9146 PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
9147 pg_history_t history = pg->info.history;
9148 int up_primary, acting_primary;
9149 vector<int> up, acting;
9150 osdmap->pg_to_up_acting_osds(
9151 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9152 bool valid_history = project_pg_history(
9153 pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
9154 up, up_primary, acting, acting_primary);
9155 if (valid_history &&
9156 history.same_interval_since <= m->get_epoch()) {
9157 assert(pg->get_primary().osd == m->get_source().num());
9158 PGRef _pg(pg);
9159 _remove_pg(pg);
9160 pg->unlock();
9161 } else {
9162 dout(10) << *pg << " ignoring remove request, pg changed in epoch "
9163 << history.same_interval_since
9164 << " > " << m->get_epoch() << dendl;
9165 pg->unlock();
9166 }
9167 }
9168}
9169
9170void OSD::_remove_pg(PG *pg)
9171{
9172 ObjectStore::Transaction rmt ;
9173
9174 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
9175 // the pg_map must be done together without unlocking the pg lock,
9176 // to avoid racing with watcher cleanup in ms_handle_reset
9177 // and handle_notify_timeout
9178 pg->on_removal(&rmt);
9179
9180 service.cancel_pending_splits_for_parent(pg->info.pgid);
9181 int tr = store->queue_transaction(
9182 pg->osr.get(), std::move(rmt), NULL,
9183 new ContainerContext<
9184 SequencerRef>(pg->osr));
9185 assert(tr == 0);
9186
9187 DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
9188 pg->info.pgid,
9189 make_pair(
9190 pg->info.pgid,
9191 PGRef(pg))
9192 );
9193 remove_wq.queue(make_pair(PGRef(pg), deleting));
9194
9195 service.pg_remove_epoch(pg->info.pgid);
9196
9197 // dereference from op_wq
9198 op_shardedwq.clear_pg_pointer(pg->info.pgid);
9199
9200 // remove from map
9201 pg_map.erase(pg->info.pgid);
9202 pg->put("PGMap"); // since we've taken it out of map
9203}
9204
9205
9206// =========================================================
9207// RECOVERY
9208
9209void OSDService::_maybe_queue_recovery() {
9210 assert(recovery_lock.is_locked_by_me());
9211 uint64_t available_pushes;
9212 while (!awaiting_throttle.empty() &&
9213 _recover_now(&available_pushes)) {
9214 uint64_t to_start = MIN(
9215 available_pushes,
9216 cct->_conf->osd_recovery_max_single_start);
9217 _queue_for_recovery(awaiting_throttle.front(), to_start);
9218 awaiting_throttle.pop_front();
9219 recovery_ops_reserved += to_start;
9220 }
9221}
9222
9223bool OSDService::_recover_now(uint64_t *available_pushes)
9224{
9225 if (available_pushes)
9226 *available_pushes = 0;
9227
9228 if (ceph_clock_now() < defer_recovery_until) {
9229 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9230 return false;
9231 }
9232
9233 if (recovery_paused) {
9234 dout(15) << __func__ << " paused" << dendl;
9235 return false;
9236 }
9237
9238 uint64_t max = cct->_conf->osd_recovery_max_active;
9239 if (max <= recovery_ops_active + recovery_ops_reserved) {
9240 dout(15) << __func__ << " active " << recovery_ops_active
9241 << " + reserved " << recovery_ops_reserved
9242 << " >= max " << max << dendl;
9243 return false;
9244 }
9245
9246 if (available_pushes)
9247 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9248
9249 return true;
9250}
9251
c07f9fc5 9252
d2e6a577 9253void OSDService::adjust_pg_priorities(const vector<PGRef>& pgs, int newflags)
c07f9fc5
FG
9254{
9255 if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY)))
9256 return;
9257 int newstate = 0;
9258
c07f9fc5
FG
9259 if (newflags & OFR_BACKFILL) {
9260 newstate = PG_STATE_FORCED_BACKFILL;
9261 } else if (newflags & OFR_RECOVERY) {
9262 newstate = PG_STATE_FORCED_RECOVERY;
9263 }
9264
9265 // debug output here may get large, don't generate it if debug level is below
9266 // 10 and use abbreviated pg ids otherwise
9267 if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) {
9268 stringstream ss;
9269
9270 for (auto& i : pgs) {
9271 ss << i->get_pgid() << " ";
9272 }
9273
9274 dout(10) << __func__ << " working on " << ss.str() << dendl;
9275 }
9276
9277 if (newflags & OFR_CANCEL) {
9278 for (auto& i : pgs) {
d2e6a577
FG
9279 i->lock();
9280 i->_change_recovery_force_mode(newstate, true);
9281 i->unlock();
c07f9fc5
FG
9282 }
9283 } else {
9284 for (auto& i : pgs) {
9285 // make sure the PG is in correct state before forcing backfill or recovery, or
9286 // else we'll make PG keeping FORCE_* flag forever, requiring osds restart
9287 // or forcing somehow recovery/backfill.
d2e6a577 9288 i->lock();
c07f9fc5
FG
9289 int pgstate = i->get_state();
9290 if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
9291 ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL))) )
d2e6a577
FG
9292 i->_change_recovery_force_mode(newstate, false);
9293 i->unlock();
c07f9fc5
FG
9294 }
9295 }
9296}
9297
7c673cae
FG
/**
 * Run one round of recovery for a PG.
 *
 * Either (a) schedules a timer callback that re-queues this recovery work
 * after the configured recovery sleep and returns immediately, or (b) asks
 * the PG to start up to reserved_pushes recovery ops and dispatches whatever
 * peering context that produced.
 *
 * @param pg              PG to recover
 * @param queued          epoch passed to pg_has_reset_since() and used for
 *                        any cancel events queued here
 * @param reserved_pushes number of push slots reserved for this call; they
 *                        are released at the `out` label on every path
 *                        except the early-return sleep path, which hands
 *                        them on to the re-queued work
 * @param handle          thread pool handle forwarded to the PG
 */
9298void OSD::do_recovery(
9299 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9300 ThreadPool::TPHandle &handle)
9301{
9302 uint64_t started = 0;
31f18b77
FG
9303
9304 /*
9305 * When the value of osd_recovery_sleep is set greater than zero, recovery
9306 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9307 * recovery event's schedule time. This is done by adding a
9308 * recovery_requeue_callback event, which re-queues the recovery op using
9309 * queue_recovery_after_sleep.
9310 */
c07f9fc5
FG
9311 float recovery_sleep = get_osd_recovery_sleep();
9312 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
31f18b77
FG
// pgref is captured by value so the callback holds its own PG reference.
9313 PGRef pgref(pg);
9314 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9315 dout(20) << "do_recovery wake up at "
9316 << ceph_clock_now()
9317 << ", re-queuing recovery" << dendl;
// Clearing the flag lets the re-queued call take the non-sleep path below.
9318 service.recovery_needs_sleep = false;
9319 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9320 });
// recovery_schedule_time and the timer are updated under recovery_sleep_lock.
9321 Mutex::Locker l(service.recovery_sleep_lock);
9322
9323 // This is true for the first recovery op and when the previous recovery op
9324 // has been scheduled in the past. The next recovery op is scheduled after
9325 // completing the sleep from now.
9326 if (service.recovery_schedule_time < ceph_clock_now()) {
9327 service.recovery_schedule_time = ceph_clock_now();
9328 }
c07f9fc5 9329 service.recovery_schedule_time += recovery_sleep;
31f18b77
FG
9330 service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
9331 recovery_requeue_callback);
9332 dout(20) << "Recovery event scheduled at "
9333 << service.recovery_schedule_time << dendl;
// Early return: reserved_pushes is NOT released here; it travels with the
// re-queued work via the callback above.
9334 return;
7c673cae
FG
9335 }
9336
// The extra scope lets the `goto out` statements below leave the block
// without jumping across the initialization of locals such as rctx.
9337 {
// Re-arm the sleep for the next call into do_recovery.
31f18b77 9338 service.recovery_needs_sleep = true;
7c673cae
FG
// PG was reset after this work was queued: skip recovery, just release.
9339 if (pg->pg_has_reset_since(queued)) {
9340 goto out;
9341 }
9342
9343 assert(!pg->deleting);
9344 assert(pg->is_peered() && pg->is_primary());
9345
9346 assert(pg->recovery_queued);
9347 pg->recovery_queued = false;
9348
9349 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9350#ifdef DEBUG_RECOVERY_OIDS
9351 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
9352#endif
9353
// `started` receives the number of ops actually started; `more` is the
// PG's return value (presumably "more work remains" -- confirm in PG).
9354 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
9355 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9356 << " on " << *pg << dendl;
9357
9358 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
9359 if (!started && (more || !pg->have_unfound())) {
9360 goto out;
9361 }
9362
9363 PG::RecoveryCtx rctx = create_context();
9364 rctx.handle = &handle;
9365
9366 /*
9367 * if we couldn't start any recovery ops and things are still
9368 * unfound, see if we can discover more missing object locations.
9369 * It may be that our initial locations were bad and we errored
9370 * out while trying to pull.
9371 */
9372 if (!more && pg->have_unfound()) {
9373 pg->discover_all_missing(*rctx.query_map);
// An empty query map means discover_all_missing added no queries: cancel
// backfill/recovery if the PG is in one of those states.
9374 if (rctx.query_map->empty()) {
224ce89b
WB
9375 string action;
9376 if (pg->state_test(PG_STATE_BACKFILL)) {
9377 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9378 queued,
9379 queued,
9380 PG::CancelBackfill()));
9381 pg->queue_peering_event(evt);
9382 action = "in backfill";
9383 } else if (pg->state_test(PG_STATE_RECOVERING)) {
9384 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9385 queued,
9386 queued,
9387 PG::CancelRecovery()));
9388 pg->queue_peering_event(evt);
9389 action = "in recovery";
9390 } else {
9391 action = "already out of recovery/backfill";
9392 }
9393 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
7c673cae 9394 } else {
224ce89b 9395 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
7c673cae
FG
// Queries were generated: put the PG back on the recovery queue.
9396 pg->queue_recovery();
9397 }
9398 }
9399
9400 pg->write_if_dirty(*rctx.transaction);
9401 OSDMapRef curmap = pg->get_osdmap();
9402 dispatch_context(rctx, pg, curmap);
9403 }
9404
// Common exit for every path except the sleep path: give the reserved
// push slots back to the service.
9405 out:
9406 assert(started <= reserved_pushes);
9407 service.release_reserved_pushes(reserved_pushes);
9408}
9409
9410void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9411{
9412 Mutex::Locker l(recovery_lock);
9413 dout(10) << "start_recovery_op " << *pg << " " << soid
9414 << " (" << recovery_ops_active << "/"
9415 << cct->_conf->osd_recovery_max_active << " rops)"
9416 << dendl;
9417 recovery_ops_active++;
9418
9419#ifdef DEBUG_RECOVERY_OIDS
9420 dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
9421 assert(recovery_oids[pg->info.pgid].count(soid) == 0);
9422 recovery_oids[pg->info.pgid].insert(soid);
9423#endif
9424}
9425
9426void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9427{
9428 Mutex::Locker l(recovery_lock);
9429 dout(10) << "finish_recovery_op " << *pg << " " << soid
9430 << " dequeue=" << dequeue
9431 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9432 << dendl;
9433
9434 // adjust count
9435 assert(recovery_ops_active > 0);
9436 recovery_ops_active--;
9437
9438#ifdef DEBUG_RECOVERY_OIDS
9439 dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
9440 assert(recovery_oids[pg->info.pgid].count(soid));
9441 recovery_oids[pg->info.pgid].erase(soid);
9442#endif
9443
9444 _maybe_queue_recovery();
9445}
9446
9447bool OSDService::is_recovery_active()
9448{
224ce89b
WB
9449 Mutex::Locker l(recovery_lock);
9450 return recovery_ops_active > 0;
7c673cae
FG
9451}
9452
9453// =========================================================
9454// OPS
9455
9456bool OSD::op_is_discardable(const MOSDOp *op)
9457{
9458 // drop client request if they are not connected and can't get the
9459 // reply anyway.
9460 if (!op->get_connection()->is_connected()) {
9461 return true;
9462 }
9463 return false;
9464}
9465
9466void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
9467{
9468 utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
9469 dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
9470 << " cost " << op->get_req()->get_cost()
9471 << " latency " << latency
9472 << " epoch " << epoch
9473 << " " << *(op->get_req()) << dendl;
9474 op->osd_trace.event("enqueue op");
9475 op->osd_trace.keyval("priority", op->get_req()->get_priority());
9476 op->osd_trace.keyval("cost", op->get_req()->get_cost());
9477 op->mark_queued_for_pg();
224ce89b 9478 logger->tinc(l_osd_op_before_queue_op_lat, latency);
7c673cae
FG
9479 op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
9480}
9481
9482
9483
9484/*
9485 * NOTE: dequeue called in worker thread, with pg lock
9486 */
9487void OSD::dequeue_op(
9488 PGRef pg, OpRequestRef op,
9489 ThreadPool::TPHandle &handle)
9490{
9491 FUNCTRACE();
9492 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9493
9494 utime_t now = ceph_clock_now();
9495 op->set_dequeued_time(now);
9496 utime_t latency = now - op->get_req()->get_recv_stamp();
9497 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9498 << " cost " << op->get_req()->get_cost()
9499 << " latency " << latency
9500 << " " << *(op->get_req())
9501 << " pg " << *pg << dendl;
9502
224ce89b
WB
9503 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9504
7c673cae
FG
9505 Session *session = static_cast<Session *>(
9506 op->get_req()->get_connection()->get_priv());
9507 if (session) {
9508 maybe_share_map(session, op, pg->get_osdmap());
9509 session->put();
9510 }
9511
9512 if (pg->deleting)
9513 return;
9514
9515 op->mark_reached_pg();
9516 op->osd_trace.event("dequeue_op");
9517
9518 pg->do_request(op, handle);
9519
9520 // finish
9521 dout(10) << "dequeue_op " << op << " finish" << dendl;
9522 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
9523}
9524
9525
9526struct C_CompleteSplits : public Context {
9527 OSD *osd;
31f18b77
FG
9528 set<PGRef> pgs;
9529 C_CompleteSplits(OSD *osd, const set<PGRef> &in)
7c673cae
FG
9530 : osd(osd), pgs(in) {}
9531 void finish(int r) override {
9532 Mutex::Locker l(osd->osd_lock);
9533 if (osd->is_stopping())
9534 return;
9535 PG::RecoveryCtx rctx = osd->create_context();
31f18b77 9536 for (set<PGRef>::iterator i = pgs.begin();
7c673cae
FG
9537 i != pgs.end();
9538 ++i) {
9539 osd->pg_map_lock.get_write();
9540 (*i)->lock();
31f18b77
FG
9541 PG *pg = i->get();
9542 osd->add_newly_split_pg(pg, &rctx);
7c673cae
FG
9543 if (!((*i)->deleting)) {
9544 set<spg_t> to_complete;
9545 to_complete.insert((*i)->info.pgid);
9546 osd->service.complete_split(to_complete);
9547 }
9548 osd->pg_map_lock.put_write();
31f18b77 9549 osd->dispatch_context_transaction(rctx, pg);
7c673cae
FG
9550 osd->wake_pg_waiters(*i);
9551 (*i)->unlock();
9552 }
9553
9554 osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
9555 }
9556};
9557
9558void OSD::process_peering_events(
9559 const list<PG*> &pgs,
9560 ThreadPool::TPHandle &handle
9561 )
9562{
9563 bool need_up_thru = false;
9564 epoch_t same_interval_since = 0;
9565 OSDMapRef curmap;
9566 PG::RecoveryCtx rctx = create_context();
9567 rctx.handle = &handle;
9568 for (list<PG*>::const_iterator i = pgs.begin();
9569 i != pgs.end();
9570 ++i) {
31f18b77 9571 set<PGRef> split_pgs;
7c673cae
FG
9572 PG *pg = *i;
9573 pg->lock_suspend_timeout(handle);
9574 curmap = service.get_osdmap();
9575 if (pg->deleting) {
9576 pg->unlock();
9577 continue;
9578 }
9579 if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
9580 // we need to requeue the PG explicitly since we didn't actually
9581 // handle an event
9582 peering_wq.queue(pg);
9583 } else {
9584 assert(!pg->peering_queue.empty());
9585 PG::CephPeeringEvtRef evt = pg->peering_queue.front();
9586 pg->peering_queue.pop_front();
9587 pg->handle_peering_event(evt, &rctx);
9588 }
9589 need_up_thru = pg->need_up_thru || need_up_thru;
9590 same_interval_since = MAX(pg->info.history.same_interval_since,
9591 same_interval_since);
9592 pg->write_if_dirty(*rctx.transaction);
9593 if (!split_pgs.empty()) {
9594 rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
9595 split_pgs.clear();
9596 }
9597 dispatch_context_transaction(rctx, pg, &handle);
9598 pg->unlock();
9599 }
9600 if (need_up_thru)
9601 queue_want_up_thru(same_interval_since);
9602 dispatch_context(rctx, 0, curmap, &handle);
9603
9604 service.send_pg_temp();
9605}
9606
9607// --------------------------------
9608
9609const char** OSD::get_tracked_conf_keys() const
9610{
9611 static const char* KEYS[] = {
9612 "osd_max_backfills",
9613 "osd_min_recovery_priority",
224ce89b
WB
9614 "osd_max_trimming_pgs",
9615 "osd_op_complaint_time",
9616 "osd_op_log_threshold",
9617 "osd_op_history_size",
9618 "osd_op_history_duration",
9619 "osd_op_history_slow_op_size",
9620 "osd_op_history_slow_op_threshold",
7c673cae
FG
9621 "osd_enable_op_tracker",
9622 "osd_map_cache_size",
9623 "osd_map_max_advance",
9624 "osd_pg_epoch_persisted_max_stale",
9625 "osd_disk_thread_ioprio_class",
9626 "osd_disk_thread_ioprio_priority",
9627 // clog & admin clog
9628 "clog_to_monitors",
9629 "clog_to_syslog",
9630 "clog_to_syslog_facility",
9631 "clog_to_syslog_level",
9632 "osd_objectstore_fuse",
9633 "clog_to_graylog",
9634 "clog_to_graylog_host",
9635 "clog_to_graylog_port",
9636 "host",
9637 "fsid",
9638 "osd_recovery_delay_start",
9639 "osd_client_message_size_cap",
9640 "osd_client_message_cap",
31f18b77
FG
9641 "osd_heartbeat_min_size",
9642 "osd_heartbeat_interval",
7c673cae
FG
9643 NULL
9644 };
9645 return KEYS;
9646}
9647
9648void OSD::handle_conf_change(const struct md_config_t *conf,
9649 const std::set <std::string> &changed)
9650{
9651 if (changed.count("osd_max_backfills")) {
9652 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9653 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9654 }
9655 if (changed.count("osd_min_recovery_priority")) {
9656 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9657 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9658 }
9659 if (changed.count("osd_max_trimming_pgs")) {
9660 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9661 }
9662 if (changed.count("osd_op_complaint_time") ||
9663 changed.count("osd_op_log_threshold")) {
9664 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9665 cct->_conf->osd_op_log_threshold);
9666 }
9667 if (changed.count("osd_op_history_size") ||
9668 changed.count("osd_op_history_duration")) {
9669 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9670 cct->_conf->osd_op_history_duration);
9671 }
9672 if (changed.count("osd_op_history_slow_op_size") ||
9673 changed.count("osd_op_history_slow_op_threshold")) {
9674 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9675 cct->_conf->osd_op_history_slow_op_threshold);
9676 }
9677 if (changed.count("osd_enable_op_tracker")) {
9678 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9679 }
9680 if (changed.count("osd_disk_thread_ioprio_class") ||
9681 changed.count("osd_disk_thread_ioprio_priority")) {
9682 set_disk_tp_priority();
9683 }
9684 if (changed.count("osd_map_cache_size")) {
9685 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9686 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9687 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9688 }
9689 if (changed.count("clog_to_monitors") ||
9690 changed.count("clog_to_syslog") ||
9691 changed.count("clog_to_syslog_level") ||
9692 changed.count("clog_to_syslog_facility") ||
9693 changed.count("clog_to_graylog") ||
9694 changed.count("clog_to_graylog_host") ||
9695 changed.count("clog_to_graylog_port") ||
9696 changed.count("host") ||
9697 changed.count("fsid")) {
9698 update_log_config();
9699 }
9700
9701#ifdef HAVE_LIBFUSE
9702 if (changed.count("osd_objectstore_fuse")) {
9703 if (store) {
9704 enable_disable_fuse(false);
9705 }
9706 }
9707#endif
9708
9709 if (changed.count("osd_recovery_delay_start")) {
9710 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9711 service.kick_recovery_queue();
9712 }
9713
9714 if (changed.count("osd_client_message_cap")) {
9715 uint64_t newval = cct->_conf->osd_client_message_cap;
9716 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9717 if (pol.throttler_messages && newval > 0) {
9718 pol.throttler_messages->reset_max(newval);
9719 }
9720 }
9721 if (changed.count("osd_client_message_size_cap")) {
9722 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9723 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9724 if (pol.throttler_bytes && newval > 0) {
9725 pol.throttler_bytes->reset_max(newval);
9726 }
9727 }
9728
9729 check_config();
9730}
9731
9732void OSD::update_log_config()
9733{
9734 map<string,string> log_to_monitors;
9735 map<string,string> log_to_syslog;
9736 map<string,string> log_channel;
9737 map<string,string> log_prio;
9738 map<string,string> log_to_graylog;
9739 map<string,string> log_to_graylog_host;
9740 map<string,string> log_to_graylog_port;
9741 uuid_d fsid;
9742 string host;
9743
9744 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9745 log_channel, log_prio, log_to_graylog,
9746 log_to_graylog_host, log_to_graylog_port,
9747 fsid, host) == 0)
9748 clog->update_config(log_to_monitors, log_to_syslog,
9749 log_channel, log_prio, log_to_graylog,
9750 log_to_graylog_host, log_to_graylog_port,
9751 fsid, host);
9752 derr << "log_to_monitors " << log_to_monitors << dendl;
9753}
9754
9755void OSD::check_config()
9756{
9757 // some sanity checks
9758 if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
9759 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9760 << " is not > osd_map_max_advance ("
9761 << cct->_conf->osd_map_max_advance << ")";
9762 }
9763 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9764 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9765 << " is not > osd_pg_epoch_persisted_max_stale ("
9766 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
9767 }
9768}
9769
9770void OSD::set_disk_tp_priority()
9771{
9772 dout(10) << __func__
9773 << " class " << cct->_conf->osd_disk_thread_ioprio_class
9774 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
9775 << dendl;
9776 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
9777 cct->_conf->osd_disk_thread_ioprio_priority < 0)
9778 return;
9779 int cls =
9780 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
9781 if (cls < 0)
9782 derr << __func__ << cpp_strerror(cls) << ": "
9783 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
9784 << " but only the following values are allowed: idle, be or rt" << dendl;
9785 else
9786 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
9787}
9788
9789// --------------------------------
9790
9791void OSD::get_latest_osdmap()
9792{
9793 dout(10) << __func__ << " -- start" << dendl;
9794
9795 C_SaferCond cond;
9796 service.objecter->wait_for_latest_osdmap(&cond);
9797 cond.wait();
9798
9799 dout(10) << __func__ << " -- finish" << dendl;
9800}
9801
9802// --------------------------------
9803
9804int OSD::init_op_flags(OpRequestRef& op)
9805{
9806 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
9807 vector<OSDOp>::const_iterator iter;
9808
9809 // client flags have no bearing on whether an op is a read, write, etc.
9810 op->rmw_flags = 0;
9811
9812 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
9813 op->set_force_rwordered();
9814 }
9815
9816 // set bits based on op codes, called methods.
9817 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
9818 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
9819 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
9820 /* This a bit odd. PING isn't actually a write. It can't
9821 * result in an update to the object_info. PINGs also aren'ty
9822 * resent, so there's no reason to write out a log entry
9823 *
9824 * However, we pipeline them behind writes, so let's force
9825 * the write_ordered flag.
9826 */
9827 op->set_force_rwordered();
9828 } else {
9829 if (ceph_osd_op_mode_modify(iter->op.op))
9830 op->set_write();
9831 }
9832 if (ceph_osd_op_mode_read(iter->op.op))
9833 op->set_read();
9834
9835 // set READ flag if there are src_oids
9836 if (iter->soid.oid.name.length())
9837 op->set_read();
9838
9839 // set PGOP flag if there are PG ops
9840 if (ceph_osd_op_type_pg(iter->op.op))
9841 op->set_pg_op();
9842
9843 if (ceph_osd_op_mode_cache(iter->op.op))
9844 op->set_cache();
9845
9846 // check for ec base pool
9847 int64_t poolid = m->get_pg().pool();
9848 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
9849 if (pool && pool->is_tier()) {
9850 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
9851 if (base_pool && base_pool->require_rollback()) {
9852 if ((iter->op.op != CEPH_OSD_OP_READ) &&
9853 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
c07f9fc5 9854 (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
7c673cae
FG
9855 (iter->op.op != CEPH_OSD_OP_STAT) &&
9856 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
9857 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
9858 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
9859 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
9860 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
9861 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
9862 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
9863 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
9864 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
9865 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
9866 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
9867 (iter->op.op != CEPH_OSD_OP_CREATE) &&
9868 (iter->op.op != CEPH_OSD_OP_DELETE) &&
9869 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
9870 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
9871 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
9872 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
9873 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
9874 op->set_promote();
9875 }
9876 }
9877 }
9878
9879 switch (iter->op.op) {
9880 case CEPH_OSD_OP_CALL:
9881 {
9882 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
9883 int is_write, is_read;
9884 string cname, mname;
9885 bp.copy(iter->op.cls.class_len, cname);
9886 bp.copy(iter->op.cls.method_len, mname);
9887
9888 ClassHandler::ClassData *cls;
9889 int r = class_handler->open_class(cname, &cls);
9890 if (r) {
9891 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
9892 if (r == -ENOENT)
9893 r = -EOPNOTSUPP;
9894 else if (r != -EPERM) // propagate permission errors
9895 r = -EIO;
9896 return r;
9897 }
9898 int flags = cls->get_method_flags(mname.c_str());
9899 if (flags < 0) {
9900 if (flags == -ENOENT)
9901 r = -EOPNOTSUPP;
9902 else
9903 r = flags;
9904 return r;
9905 }
9906 is_read = flags & CLS_METHOD_RD;
9907 is_write = flags & CLS_METHOD_WR;
9908 bool is_promote = flags & CLS_METHOD_PROMOTE;
9909
9910 dout(10) << "class " << cname << " method " << mname << " "
9911 << "flags=" << (is_read ? "r" : "")
9912 << (is_write ? "w" : "")
9913 << (is_promote ? "p" : "")
9914 << dendl;
9915 if (is_read)
9916 op->set_class_read();
9917 if (is_write)
9918 op->set_class_write();
9919 if (is_promote)
9920 op->set_promote();
9921 op->add_class(cname, is_read, is_write, cls->whitelisted);
9922 break;
9923 }
9924
9925 case CEPH_OSD_OP_WATCH:
9926 // force the read bit for watch since it is depends on previous
9927 // watch state (and may return early if the watch exists) or, in
9928 // the case of ping, is simply a read op.
9929 op->set_read();
9930 // fall through
9931 case CEPH_OSD_OP_NOTIFY:
9932 case CEPH_OSD_OP_NOTIFY_ACK:
9933 {
9934 op->set_promote();
9935 break;
9936 }
9937
9938 case CEPH_OSD_OP_DELETE:
9939 // if we get a delete with FAILOK we can skip handle cache. without
9940 // FAILOK we still need to promote (or do something smarter) to
9941 // determine whether to return ENOENT or 0.
9942 if (iter == m->ops.begin() &&
9943 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
9944 op->set_skip_handle_cache();
9945 }
9946 // skip promotion when proxying a delete op
9947 if (m->ops.size() == 1) {
9948 op->set_skip_promote();
9949 }
9950 break;
9951
9952 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
9953 case CEPH_OSD_OP_CACHE_FLUSH:
9954 case CEPH_OSD_OP_CACHE_EVICT:
9955 // If try_flush/flush/evict is the only op, can skip handle cache.
9956 if (m->ops.size() == 1) {
9957 op->set_skip_handle_cache();
9958 }
9959 break;
9960
9961 case CEPH_OSD_OP_READ:
9962 case CEPH_OSD_OP_SYNC_READ:
9963 case CEPH_OSD_OP_SPARSE_READ:
9964 case CEPH_OSD_OP_CHECKSUM:
9965 case CEPH_OSD_OP_WRITEFULL:
9966 if (m->ops.size() == 1 &&
9967 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
9968 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
9969 op->set_skip_promote();
9970 }
9971 break;
9972
9973 // force promotion when pin an object in cache tier
9974 case CEPH_OSD_OP_CACHE_PIN:
9975 op->set_promote();
9976 break;
9977
9978 default:
9979 break;
9980 }
9981 }
9982
9983 if (op->rmw_flags == 0)
9984 return -EINVAL;
9985
9986 return 0;
9987}
9988
9989void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
9990 for (list<PG*>::iterator i = peering_queue.begin();
9991 i != peering_queue.end() &&
9992 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
9993 ) {
9994 if (in_use.count(*i)) {
9995 ++i;
9996 } else {
9997 out->push_back(*i);
9998 peering_queue.erase(i++);
9999 }
10000 }
10001 in_use.insert(out->begin(), out->end());
10002}
10003
224ce89b 10004
7c673cae
FG
10005// =============================================================
10006
10007#undef dout_context
10008#define dout_context osd->cct
10009#undef dout_prefix
10010#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10011
10012void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
10013{
10014 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10015 auto sdata = shard_list[shard_index];
10016 bool queued = false;
10017 unsigned pushes_to_free = 0;
10018 {
10019 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10020 auto p = sdata->pg_slots.find(pgid);
10021 if (p != sdata->pg_slots.end()) {
10022 dout(20) << __func__ << " " << pgid
10023 << " to_process " << p->second.to_process
10024 << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
10025 for (auto i = p->second.to_process.rbegin();
10026 i != p->second.to_process.rend();
10027 ++i) {
10028 sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
10029 }
10030 for (auto& q : p->second.to_process) {
10031 pushes_to_free += q.get_reserved_pushes();
10032 }
10033 p->second.to_process.clear();
10034 p->second.waiting_for_pg = false;
10035 ++p->second.requeue_seq;
10036 queued = true;
10037 }
10038 }
10039 if (pushes_to_free > 0) {
10040 osd->service.release_reserved_pushes(pushes_to_free);
10041 }
10042 if (queued) {
10043 sdata->sdata_lock.Lock();
10044 sdata->sdata_cond.SignalOne();
10045 sdata->sdata_lock.Unlock();
10046 }
10047}
10048
10049void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
10050{
10051 unsigned pushes_to_free = 0;
10052 for (auto sdata : shard_list) {
10053 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10054 sdata->waiting_for_pg_osdmap = osdmap;
10055 auto p = sdata->pg_slots.begin();
10056 while (p != sdata->pg_slots.end()) {
10057 ShardData::pg_slot& slot = p->second;
10058 if (!slot.to_process.empty() && slot.num_running == 0) {
10059 if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
10060 dout(20) << __func__ << " " << p->first << " maps to us, keeping"
10061 << dendl;
10062 ++p;
10063 continue;
10064 }
10065 while (!slot.to_process.empty() &&
10066 slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
10067 auto& qi = slot.to_process.front();
10068 dout(20) << __func__ << " " << p->first
10069 << " item " << qi
10070 << " epoch " << qi.get_map_epoch()
10071 << " <= " << osdmap->get_epoch()
10072 << ", stale, dropping" << dendl;
10073 pushes_to_free += qi.get_reserved_pushes();
10074 slot.to_process.pop_front();
10075 }
10076 }
10077 if (slot.to_process.empty() &&
10078 slot.num_running == 0 &&
10079 !slot.pg) {
10080 dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
10081 p = sdata->pg_slots.erase(p);
10082 } else {
10083 ++p;
10084 }
10085 }
10086 }
10087 if (pushes_to_free > 0) {
10088 osd->service.release_reserved_pushes(pushes_to_free);
10089 }
10090}
10091
10092void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
10093{
10094 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10095 auto sdata = shard_list[shard_index];
10096 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10097 auto p = sdata->pg_slots.find(pgid);
10098 if (p != sdata->pg_slots.end()) {
10099 auto& slot = p->second;
10100 dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
10101 assert(!slot.pg || slot.pg->deleting);
10102 slot.pg = nullptr;
10103 }
10104}
10105
10106void OSD::ShardedOpWQ::clear_pg_slots()
10107{
10108 for (auto sdata : shard_list) {
10109 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10110 sdata->pg_slots.clear();
10111 sdata->waiting_for_pg_osdmap.reset();
10112 // don't bother with reserved pushes; we are shutting down
10113 }
10114}
10115
10116#undef dout_prefix
10117#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10118
10119void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
10120{
10121 uint32_t shard_index = thread_index % num_shards;
10122 ShardData *sdata = shard_list[shard_index];
10123 assert(NULL != sdata);
10124
10125 // peek at spg_t
10126 sdata->sdata_op_ordering_lock.Lock();
10127 if (sdata->pqueue->empty()) {
10128 dout(20) << __func__ << " empty q, waiting" << dendl;
10129 // optimistically sleep a moment; maybe another work item will come along.
7c673cae
FG
10130 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10131 osd->cct->_conf->threadpool_default_timeout, 0);
10132 sdata->sdata_lock.Lock();
224ce89b 10133 sdata->sdata_op_ordering_lock.Unlock();
7c673cae
FG
10134 sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
10135 utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
10136 sdata->sdata_lock.Unlock();
10137 sdata->sdata_op_ordering_lock.Lock();
10138 if (sdata->pqueue->empty()) {
10139 sdata->sdata_op_ordering_lock.Unlock();
10140 return;
10141 }
10142 }
10143 pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
10144 if (osd->is_stopping()) {
10145 sdata->sdata_op_ordering_lock.Unlock();
10146 return; // OSD shutdown, discard.
10147 }
10148 PGRef pg;
10149 uint64_t requeue_seq;
10150 {
10151 auto& slot = sdata->pg_slots[item.first];
10152 dout(30) << __func__ << " " << item.first
10153 << " to_process " << slot.to_process
10154 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10155 slot.to_process.push_back(item.second);
10156 // note the requeue seq now...
10157 requeue_seq = slot.requeue_seq;
10158 if (slot.waiting_for_pg) {
10159 // save ourselves a bit of effort
10160 dout(20) << __func__ << " " << item.first << " item " << item.second
10161 << " queued, waiting_for_pg" << dendl;
10162 sdata->sdata_op_ordering_lock.Unlock();
10163 return;
10164 }
10165 pg = slot.pg;
10166 dout(20) << __func__ << " " << item.first << " item " << item.second
10167 << " queued" << dendl;
10168 ++slot.num_running;
10169 }
10170 sdata->sdata_op_ordering_lock.Unlock();
10171
10172 osd->service.maybe_inject_dispatch_delay();
10173
10174 // [lookup +] lock pg (if we have it)
10175 if (!pg) {
10176 pg = osd->_lookup_lock_pg(item.first);
10177 } else {
10178 pg->lock();
10179 }
10180
10181 osd->service.maybe_inject_dispatch_delay();
10182
10183 boost::optional<PGQueueable> qi;
10184
10185 // we don't use a Mutex::Locker here because of the
10186 // osd->service.release_reserved_pushes() call below
10187 sdata->sdata_op_ordering_lock.Lock();
10188
10189 auto q = sdata->pg_slots.find(item.first);
10190 assert(q != sdata->pg_slots.end());
10191 auto& slot = q->second;
10192 --slot.num_running;
10193
10194 if (slot.to_process.empty()) {
10195 // raced with wake_pg_waiters or prune_pg_waiters
10196 dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
10197 if (pg) {
10198 pg->unlock();
10199 }
10200 sdata->sdata_op_ordering_lock.Unlock();
10201 return;
10202 }
10203 if (requeue_seq != slot.requeue_seq) {
10204 dout(20) << __func__ << " " << item.first
10205 << " requeue_seq " << slot.requeue_seq << " > our "
10206 << requeue_seq << ", we raced with wake_pg_waiters"
10207 << dendl;
10208 if (pg) {
10209 pg->unlock();
10210 }
10211 sdata->sdata_op_ordering_lock.Unlock();
10212 return;
10213 }
10214 if (pg && !slot.pg && !pg->deleting) {
10215 dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
10216 slot.pg = pg;
10217 }
10218 dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
10219 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10220
10221 // make sure we're not already waiting for this pg
10222 if (slot.waiting_for_pg) {
10223 dout(20) << __func__ << " " << item.first << " item " << item.second
10224 << " slot is waiting_for_pg" << dendl;
10225 if (pg) {
10226 pg->unlock();
10227 }
10228 sdata->sdata_op_ordering_lock.Unlock();
10229 return;
10230 }
10231
10232 // take next item
10233 qi = slot.to_process.front();
10234 slot.to_process.pop_front();
10235 dout(20) << __func__ << " " << item.first << " item " << *qi
10236 << " pg " << pg << dendl;
10237
10238 if (!pg) {
10239 // should this pg shard exist on this osd in this (or a later) epoch?
10240 OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
10241 if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
10242 dout(20) << __func__ << " " << item.first
10243 << " no pg, should exist, will wait" << " on " << *qi << dendl;
10244 slot.to_process.push_front(*qi);
10245 slot.waiting_for_pg = true;
10246 } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
10247 dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
10248 << qi->get_map_epoch() << " > " << osdmap->get_epoch()
10249 << ", will wait on " << *qi << dendl;
10250 slot.to_process.push_front(*qi);
10251 slot.waiting_for_pg = true;
10252 } else {
10253 dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
10254 << " dropping " << *qi << dendl;
10255 // share map with client?
10256 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10257 Session *session = static_cast<Session *>(
10258 (*_op)->get_req()->get_connection()->get_priv());
10259 if (session) {
10260 osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
10261 session->put();
10262 }
10263 }
10264 unsigned pushes_to_free = qi->get_reserved_pushes();
10265 if (pushes_to_free > 0) {
10266 sdata->sdata_op_ordering_lock.Unlock();
10267 osd->service.release_reserved_pushes(pushes_to_free);
10268 return;
10269 }
10270 }
10271 sdata->sdata_op_ordering_lock.Unlock();
10272 return;
10273 }
10274 sdata->sdata_op_ordering_lock.Unlock();
10275
10276
10277 // osd_opwq_process marks the point at which an operation has been dequeued
10278 // and will begin to be handled by a worker thread.
10279 {
10280#ifdef WITH_LTTNG
10281 osd_reqid_t reqid;
10282 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10283 reqid = (*_op)->get_reqid();
10284 }
10285#endif
10286 tracepoint(osd, opwq_process_start, reqid.name._type,
10287 reqid.name._num, reqid.tid, reqid.inc);
10288 }
10289
10290 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10291 Formatter *f = Formatter::create("json");
10292 f->open_object_section("q");
10293 dump(f);
10294 f->close_section();
10295 f->flush(*_dout);
10296 delete f;
10297 *_dout << dendl;
10298
10299 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10300 suicide_interval);
10301 qi->run(osd, pg, tp_handle);
10302
10303 {
10304#ifdef WITH_LTTNG
10305 osd_reqid_t reqid;
10306 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10307 reqid = (*_op)->get_reqid();
10308 }
10309#endif
10310 tracepoint(osd, opwq_process_finish, reqid.name._type,
10311 reqid.name._num, reqid.tid, reqid.inc);
10312 }
10313
10314 pg->unlock();
10315}
10316
10317void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
10318 uint32_t shard_index =
10319 item.first.hash_to_shard(shard_list.size());
10320
10321 ShardData* sdata = shard_list[shard_index];
10322 assert (NULL != sdata);
10323 unsigned priority = item.second.get_priority();
10324 unsigned cost = item.second.get_cost();
10325 sdata->sdata_op_ordering_lock.Lock();
10326
10327 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10328 if (priority >= osd->op_prio_cutoff)
10329 sdata->pqueue->enqueue_strict(
10330 item.second.get_owner(), priority, item);
10331 else
10332 sdata->pqueue->enqueue(
10333 item.second.get_owner(),
10334 priority, cost, item);
10335 sdata->sdata_op_ordering_lock.Unlock();
10336
10337 sdata->sdata_lock.Lock();
10338 sdata->sdata_cond.SignalOne();
10339 sdata->sdata_lock.Unlock();
10340
10341}
10342
10343void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
10344{
10345 uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
10346 ShardData* sdata = shard_list[shard_index];
10347 assert (NULL != sdata);
10348 sdata->sdata_op_ordering_lock.Lock();
10349 auto p = sdata->pg_slots.find(item.first);
10350 if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
10351 // we may be racing with _process, which has dequeued a new item
10352 // from pqueue, put it on to_process, and is now busy taking the
10353 // pg lock. ensure this old requeued item is ordered before any
10354 // such newer item in to_process.
10355 p->second.to_process.push_front(item.second);
10356 item.second = p->second.to_process.back();
10357 p->second.to_process.pop_back();
10358 dout(20) << __func__ << " " << item.first
10359 << " " << p->second.to_process.front()
10360 << " shuffled w/ " << item.second << dendl;
10361 } else {
10362 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10363 }
10364 sdata->_enqueue_front(item, osd->op_prio_cutoff);
10365 sdata->sdata_op_ordering_lock.Unlock();
10366 sdata->sdata_lock.Lock();
10367 sdata->sdata_cond.SignalOne();
10368 sdata->sdata_lock.Unlock();
10369}
10370
10371namespace ceph {
10372namespace osd_cmds {
10373
10374int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
10375{
10376 if (!ceph_using_tcmalloc()) {
10377 os << "could not issue heap profiler command -- not using tcmalloc!";
10378 return -EOPNOTSUPP;
10379 }
10380
10381 string cmd;
10382 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
10383 os << "unable to get value for command \"" << cmd << "\"";
10384 return -EINVAL;
10385 }
10386
10387 std::vector<std::string> cmd_vec;
10388 get_str_vec(cmd, cmd_vec);
10389
10390 ceph_heap_profiler_handle_command(cmd_vec, os);
10391
10392 return 0;
10393}
10394
10395}} // namespace ceph::osd_cmds
10396
224ce89b
WB
10397
10398std::ostream& operator<<(std::ostream& out, const OSD::io_queue& q) {
10399 switch(q) {
10400 case OSD::io_queue::prioritized:
10401 out << "prioritized";
10402 break;
10403 case OSD::io_queue::weightedpriority:
10404 out << "weightedpriority";
10405 break;
10406 case OSD::io_queue::mclock_opclass:
10407 out << "mclock_opclass";
10408 break;
10409 case OSD::io_queue::mclock_client:
10410 out << "mclock_client";
10411 break;
10412 }
10413 return out;
10414}