]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
bump version to 12.2.12-pve1
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15#include "acconfig.h"
91327a77 16#include <unistd.h>
7c673cae
FG
17#include <fstream>
18#include <iostream>
19#include <errno.h>
20#include <sys/stat.h>
21#include <signal.h>
22#include <ctype.h>
23#include <boost/scoped_ptr.hpp>
91327a77 24#include <random>
7c673cae
FG
25
26#ifdef HAVE_SYS_PARAM_H
27#include <sys/param.h>
28#endif
29
30#ifdef HAVE_SYS_MOUNT_H
31#include <sys/mount.h>
32#endif
33
34#include "osd/PG.h"
35
36#include "include/types.h"
37#include "include/compat.h"
38
39#include "OSD.h"
40#include "OSDMap.h"
41#include "Watch.h"
42#include "osdc/Objecter.h"
43
44#include "common/errno.h"
45#include "common/ceph_argparse.h"
224ce89b 46#include "common/ceph_time.h"
7c673cae
FG
47#include "common/version.h"
48#include "common/io_priority.h"
b5b8bbf5 49#include "common/pick_address.h"
7c673cae
FG
50
51#include "os/ObjectStore.h"
52#ifdef HAVE_LIBFUSE
53#include "os/FuseStore.h"
54#endif
55
56#include "PrimaryLogPG.h"
57
58
59#include "msg/Messenger.h"
60#include "msg/Message.h"
61
62#include "mon/MonClient.h"
63
64#include "messages/MLog.h"
65
66#include "messages/MGenericMessage.h"
7c673cae
FG
67#include "messages/MOSDPing.h"
68#include "messages/MOSDFailure.h"
69#include "messages/MOSDMarkMeDown.h"
70#include "messages/MOSDFull.h"
71#include "messages/MOSDOp.h"
72#include "messages/MOSDOpReply.h"
73#include "messages/MOSDBackoff.h"
74#include "messages/MOSDBeacon.h"
75#include "messages/MOSDRepOp.h"
76#include "messages/MOSDRepOpReply.h"
77#include "messages/MOSDBoot.h"
78#include "messages/MOSDPGTemp.h"
79
80#include "messages/MOSDMap.h"
81#include "messages/MMonGetOSDMap.h"
82#include "messages/MOSDPGNotify.h"
83#include "messages/MOSDPGQuery.h"
84#include "messages/MOSDPGLog.h"
85#include "messages/MOSDPGRemove.h"
86#include "messages/MOSDPGInfo.h"
87#include "messages/MOSDPGCreate.h"
88#include "messages/MOSDPGTrim.h"
89#include "messages/MOSDPGScan.h"
90#include "messages/MOSDPGBackfill.h"
91#include "messages/MBackfillReserve.h"
92#include "messages/MRecoveryReserve.h"
c07f9fc5 93#include "messages/MOSDForceRecovery.h"
7c673cae
FG
94#include "messages/MOSDECSubOpWrite.h"
95#include "messages/MOSDECSubOpWriteReply.h"
96#include "messages/MOSDECSubOpRead.h"
97#include "messages/MOSDECSubOpReadReply.h"
98#include "messages/MOSDPGCreated.h"
99#include "messages/MOSDPGUpdateLogMissing.h"
100#include "messages/MOSDPGUpdateLogMissingReply.h"
101
102#include "messages/MOSDAlive.h"
103
104#include "messages/MOSDScrub.h"
105#include "messages/MOSDScrubReserve.h"
106#include "messages/MOSDRepScrub.h"
107
108#include "messages/MMonCommand.h"
109#include "messages/MCommand.h"
110#include "messages/MCommandReply.h"
111
112#include "messages/MPGStats.h"
113#include "messages/MPGStatsAck.h"
114
115#include "messages/MWatchNotify.h"
116#include "messages/MOSDPGPush.h"
117#include "messages/MOSDPGPushReply.h"
118#include "messages/MOSDPGPull.h"
119
120#include "common/perf_counters.h"
121#include "common/Timer.h"
122#include "common/LogClient.h"
123#include "common/AsyncReserver.h"
124#include "common/HeartbeatMap.h"
125#include "common/admin_socket.h"
126#include "common/ceph_context.h"
127
128#include "global/signal_handler.h"
129#include "global/pidfile.h"
130
131#include "include/color.h"
132#include "perfglue/cpu_profiler.h"
133#include "perfglue/heap_profiler.h"
134
135#include "osd/OpRequest.h"
136
137#include "auth/AuthAuthorizeHandler.h"
138#include "auth/RotatingKeyRing.h"
139#include "common/errno.h"
140
141#include "objclass/objclass.h"
142
143#include "common/cmdparse.h"
144#include "include/str_list.h"
145#include "include/util.h"
146
147#include "include/assert.h"
148#include "common/config.h"
149#include "common/EventTrace.h"
150
151#ifdef WITH_LTTNG
152#define TRACEPOINT_DEFINE
153#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
154#include "tracing/osd.h"
155#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
156#undef TRACEPOINT_DEFINE
157#else
158#define tracepoint(...)
159#endif
160
161#define dout_context cct
162#define dout_subsys ceph_subsys_osd
163#undef dout_prefix
164#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
165
224ce89b 166
7c673cae
FG
167static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
168 return *_dout << "osd." << whoami << " " << epoch << " ";
169}
170
7c673cae
FG
//Initial features in new superblock.
//Features here are also automatically upgraded
CompatSet OSD::get_osd_initial_compat_set() {
  CompatSet::FeatureSet ceph_osd_feature_compat;
  CompatSet::FeatureSet ceph_osd_feature_ro_compat;
  CompatSet::FeatureSet ceph_osd_feature_incompat;
  // Every incompat feature that a freshly created OSD store is stamped
  // with; an OSD refuses to start on a store missing any of these.
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
  ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
  return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
		   ceph_osd_feature_incompat);
}
195
196//Features are added here that this OSD supports.
197CompatSet OSD::get_osd_compat_set() {
198 CompatSet compat = get_osd_initial_compat_set();
199 //Any features here can be set in code, but not in initial superblock
200 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
201 return compat;
202}
203
// OSDService aggregates the shared state and helper machinery (timers,
// throttles, reservers, map caches) that PGs and the owning OSD use.
// Nearly every member is wired up from the parent OSD instance; the
// initialization order below follows member declaration order.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  meta_osr(new ObjectStore::Sequencer("meta")),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  peering_wq(osd->peering_wq),
  recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
		  &osd->recovery_tp),
  class_handler(osd->class_handler),
  pg_epoch_lock("OSDService::pg_epoch_lock"),
  publish_lock("OSDService::publish_lock"),
  pre_publish_lock("OSDService::pre_publish_lock"),
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
  scrubs_active(0),
  // cache-tiering agent thread state
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  // promotion throttle state (see promote_throttle_recalibrate())
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  objecter_finisher(osd->client_messenger->cct),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  recovery_sleep_lock("OSDService::recovery_sleep_lock"),
  recovery_sleep_timer(cct, recovery_sleep_lock, false),
  reserver_finisher(cct),
  // local/remote backfill reservation throttles share one finisher
  local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_sleep_lock("OSDService::snap_sleep_lock"),
  snap_sleep_timer(
    osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
  scrub_sleep_lock("OSDService::scrub_sleep_lock"),
  scrub_sleep_timer(
    osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
  snap_reserver(cct, &reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  // osdmap caches (full maps, full-map buffers, incremental buffers)
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  in_progress_split_lock("OSDService::in_progress_split_lock"),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  // The objecter is created above but needs explicit init; it is not
  // started until final_init(), once an osdmap is available.
  objecter->init();
}
284
// The objecter is the only heap member we own directly; everything else
// is cleaned up by member destructors or by shutdown().
OSDService::~OSDService()
{
  delete objecter;
}
289
31f18b77
FG
290
291
#ifdef PG_DEBUG_REFS
// Debug-only PG reference tracking: pgid_tracker counts outstanding
// references per pgid so leaked PG refs can be reported at shutdown.
void OSDService::add_pgid(spg_t pgid, PG *pg){
  Mutex::Locker l(pgid_lock);
  if (!pgid_tracker.count(pgid)) {
    // first reference for this pgid; remember the PG for later dumps
    live_pgs[pgid] = pg;
  }
  pgid_tracker[pgid]++;
}
// Drop one reference for pgid; forget the PG once the count hits zero.
void OSDService::remove_pgid(spg_t pgid, PG *pg)
{
  Mutex::Locker l(pgid_lock);
  assert(pgid_tracker.count(pgid));
  assert(pgid_tracker[pgid] > 0);
  pgid_tracker[pgid]--;
  if (pgid_tracker[pgid] == 0) {
    pgid_tracker.erase(pgid);
    live_pgs.erase(pgid);
  }
}
// Log every pgid with outstanding refs and ask each PG to dump who
// holds them (used to diagnose PG ref leaks).
void OSDService::dump_live_pgids()
{
  Mutex::Locker l(pgid_lock);
  derr << "live pgids:" << dendl;
  for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
       i != pgid_tracker.cend();
       ++i) {
    derr << "\t" << *i << dendl;
    live_pgs[i->first]->dump_live_ids();
  }
}
#endif
323
324
7c673cae
FG
// Register each child pg of `parent` as a pending split.  Leading
// underscore convention: callers (e.g. expand_pg_num) are expected to
// hold in_progress_split_lock.
void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
{
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    dout(10) << __func__ << ": Starting split on pg " << *i
	     << ", parent=" << parent << dendl;
    // a child must not already be tracked in either split stage
    assert(!pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.insert(make_pair(*i, parent));

    // reverse index: parent -> set of pending children
    assert(!rev_pending_splits[parent].count(*i));
    rev_pending_splits[parent].insert(*i);
  }
}
340
// Move the given children of `parent` from the "pending" stage to the
// "in progress" stage, maintaining both forward and reverse indexes.
void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
{
  Mutex::Locker l(in_progress_split_lock);
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  assert(piter != rev_pending_splits.end());
  for (set<spg_t>::const_iterator i = children.begin();
       i != children.end();
       ++i) {
    // each child must currently be pending under this exact parent
    assert(piter->second.count(*i));
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    assert(pending_splits[*i] == parent);

    pending_splits.erase(*i);
    piter->second.erase(*i);
    in_progress_splits.insert(*i);
  }
  // drop the reverse-index entry once the parent has no pending children
  if (piter->second.empty())
    rev_pending_splits.erase(piter);
}
361
// Public, locking wrapper around _cancel_pending_splits_for_parent().
void OSDService::cancel_pending_splits_for_parent(spg_t parent)
{
  Mutex::Locker l(in_progress_split_lock);
  _cancel_pending_splits_for_parent(parent);
}
367
// Recursively cancel every pending split descending from `parent`
// (children may themselves have pending grandchildren).  Caller must
// hold in_progress_split_lock.
// NOTE(review): the dout message says "Completing" although this is a
// cancellation path — message text kept as-is.
void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
{
  map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
  if (piter == rev_pending_splits.end())
    return;

  for (set<spg_t>::iterator i = piter->second.begin();
       i != piter->second.end();
       ++i) {
    assert(pending_splits.count(*i));
    assert(!in_progress_splits.count(*i));
    pending_splits.erase(*i);
    dout(10) << __func__ << ": Completing split on pg " << *i
	     << " for parent: " << parent << dendl;
    // recurse into this child in case it has pending splits of its own
    _cancel_pending_splits_for_parent(*i);
  }
  rev_pending_splits.erase(piter);
}
386
387void OSDService::_maybe_split_pgid(OSDMapRef old_map,
388 OSDMapRef new_map,
389 spg_t pgid)
390{
391 assert(old_map->have_pg_pool(pgid.pool()));
392 int old_pgnum = old_map->get_pg_num(pgid.pool());
393 if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
394 set<spg_t> children;
395 if (pgid.is_split(old_pgnum,
396 new_map->get_pg_num(pgid.pool()), &children)) {
397 _start_split(pgid, children); }
398 } else {
399 assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
400 }
401}
402
// Walk every osdmap epoch between frommap and tomap and register any
// splits of pgid (or of pgs split off from it along the way).  Splits
// must be discovered epoch-by-epoch because a pg can split repeatedly.
void OSDService::init_splits_between(spg_t pgid,
				     OSDMapRef frommap,
				     OSDMapRef tomap)
{
  // First, check whether we can avoid this potentially expensive check
  if (tomap->have_pg_pool(pgid.pool()) &&
      pgid.is_split(
	frommap->get_pg_num(pgid.pool()),
	tomap->get_pg_num(pgid.pool()),
	NULL)) {
    // Ok, a split happened, so we need to walk the osdmaps
    set<spg_t> new_pgs; // pgs to scan on each map
    new_pgs.insert(pgid);
    OSDMapRef curmap(get_map(frommap->get_epoch()));
    for (epoch_t e = frommap->get_epoch() + 1;
	 e <= tomap->get_epoch();
	 ++e) {
      // a missing intermediate map is fine: is_split() below compares
      // against the last map we did have
      OSDMapRef nextmap(try_get_map(e));
      if (!nextmap)
	continue;
      set<spg_t> even_newer_pgs; // pgs added in this loop
      for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
	set<spg_t> split_pgs;
	if (i->is_split(curmap->get_pg_num(i->pool()),
			nextmap->get_pg_num(i->pool()),
			&split_pgs)) {
	  start_split(*i, split_pgs);
	  even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
	}
      }
      new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
      curmap = nextmap;
    }
    assert(curmap == tomap); // we must have had both frommap and tomap
  }
}
439
// Re-examine all tracked splits against a new map: drop entries whose
// pool no longer exists, and queue further splits for pgs whose pool's
// pg_num grew again.
void OSDService::expand_pg_num(OSDMapRef old_map,
			       OSDMapRef new_map)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::iterator i = in_progress_splits.begin();
       i != in_progress_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->pool())) {
      // pool deleted; erase via post-increment to keep the iterator valid
      in_progress_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, *i);
      ++i;
    }
  }
  for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
       i != pending_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->first.pool())) {
      // pool deleted; drop both the forward and reverse index entries
      rev_pending_splits.erase(i->second);
      pending_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, i->first);
      ++i;
    }
  }
}
466
467bool OSDService::splitting(spg_t pgid)
468{
469 Mutex::Locker l(in_progress_split_lock);
470 return in_progress_splits.count(pgid) ||
471 pending_splits.count(pgid);
472}
473
474void OSDService::complete_split(const set<spg_t> &pgs)
475{
476 Mutex::Locker l(in_progress_split_lock);
477 for (set<spg_t>::const_iterator i = pgs.begin();
478 i != pgs.end();
479 ++i) {
480 dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
481 assert(!pending_splits.count(*i));
482 assert(in_progress_splits.count(*i));
483 in_progress_splits.erase(*i);
484 }
485}
486
// Delegate to the OSD: ask it to refresh its heartbeat peer set.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}
491
// Delegate to the OSD: queue this PG for a stats report to the mon.
void OSDService::pg_stat_queue_enqueue(PG *pg)
{
  osd->pg_stat_queue_enqueue(pg);
}
496
// Delegate to the OSD: remove this PG from the pending stats queue.
void OSDService::pg_stat_queue_dequeue(PG *pg)
{
  osd->pg_stat_queue_dequeue(pg);
}
501
// First phase of shutdown: stop the timers that can schedule new agent
// or recovery-sleep work.  Full teardown happens later in shutdown().
void OSDService::start_shutdown()
{
  {
    Mutex::Locker l(agent_timer_lock);
    agent_timer.shutdown();
  }

  {
    Mutex::Locker l(recovery_sleep_lock);
    recovery_sleep_timer.shutdown();
  }
}
514
// Drain and stop the finisher shared by the backfill/trim reservers.
// Separate from shutdown() so it can be sequenced independently.
void OSDService::shutdown_reserver()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
}
520
// Second phase of shutdown: stop the watch/objecter/recovery/snap/scrub
// machinery, then drop our osdmap references so the maps can be freed.
void OSDService::shutdown()
{
  {
    Mutex::Locker l(watch_lock);
    watch_timer.shutdown();
  }

  // stop the objecter before its finisher so no new completions arrive
  objecter->shutdown();
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  {
    Mutex::Locker l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }

  {
    Mutex::Locker l(snap_sleep_lock);
    snap_sleep_timer.shutdown();
  }

  {
    Mutex::Locker l(scrub_sleep_lock);
    scrub_sleep_timer.shutdown();
  }

  // release our map refs
  osdmap = OSDMapRef();
  next_osdmap = OSDMapRef();
}
550
// Start the service threads and timers.  Counterpart of shutdown();
// the objecter itself is started later, in final_init().
void OSDService::init()
{
  reserver_finisher.start();
  objecter_finisher.start();
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  snap_sleep_timer.init();
  scrub_sleep_timer.init();

  agent_thread.create("osd_srv_agent");

  // optionally hold off recovery for a configured grace period at boot
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
570
// Deferred part of startup: the objecter can only start once we have
// an osdmap to hand it.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
575
576void OSDService::activate_map()
577{
578 // wake/unwake the tiering agent
579 agent_lock.Lock();
580 agent_active =
581 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
582 osd->is_active();
583 agent_cond.Signal();
584 agent_lock.Unlock();
585}
586
181888fb
FG
// Ask the OSD to subscribe to osdmaps starting at epoch e.
// NOTE(review): the second argument looks like a force/continuous flag —
// confirm against OSD::osdmap_subscribe.
void OSDService::request_osdmap_update(epoch_t e)
{
  osd->osdmap_subscribe(e, false);
}
591
7c673cae
FG
// Timer callback used by agent_entry(): after the configured delay,
// ask the PG's tiering agent to re-evaluate its mode and restart.
class AgentTimeoutCB : public Context {
  PGRef pg;  // keeps the PG alive until the timer fires
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};
600
// Body of the cache-tiering agent thread.  Repeatedly picks the
// highest-priority level in agent_queue and gives one of its PGs a
// chance to do flush/evict work, bounded by the configured op quotas.
// agent_lock is held except while a PG is actually doing work.
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    // agent_queue is keyed by priority level; rbegin() is the highest
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // use the low-speed flush quota unless some pg is in high-flush mode
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    // round-robin across the pgs at the top level; the iterator may have
    // been invalidated by queue changes while we slept
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // drop the lock while the pg does (potentially slow) agent work
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->get_pgid()
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}
658
// Stop the tiering agent thread and join it.  All agent ops and PGs
// must already have been drained/dequeued by earlier shutdown steps.
void OSDService::agent_stop()
{
  {
    Mutex::Locker l(agent_lock);

    // By this time all ops should be cancelled
    assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
      assert(0 == "agent queue not empty");
    }

    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}
678
679// -------------------------------------
680
// Periodically adjust promote_probability_millis (the per-mille chance
// that a read triggers a cache-tier promotion) so the observed promote
// rate tracks the configured objects/sec and bytes/sec targets.  Also
// sets hard per-tick caps to dampen promotion stampedes.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << byte_u_t(bytes) << "; target "
	   << target_obj_sec << " obj/sec or "
	   << byte_u_t(target_bytes_sec) << "/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = MAX(bytes / obj, 1);
    // po/pb: probability (per mille) implied by the object/byte targets
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = MIN(po, pb);
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;  // no target configured: allow everything
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = MAX(new_prob, min_prob);
  new_prob = MIN(new_prob, 1000);

  // adjust: move halfway toward the computed value for smoothing
  prob = (prob + new_prob) / 2;
  prob = MAX(prob, min_prob);
  prob = MIN(prob, 1000);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * osd->OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * osd->OSD_TICK_INTERVAL * 2;
}
751
752// -------------------------------------
753
754float OSDService::get_failsafe_full_ratio()
755{
756 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
757 if (full_ratio > 1.0) full_ratio /= 100.0;
758 return full_ratio;
759}
760
// Recompute this OSD's fullness state (NONE..FAILSAFE) from the current
// usage ratio.  Thresholds come from the osdmap, each clamped to be at
// least the next-lower one and bounded above by the local failsafe.
// State transitions involving FAILSAFE are reported to the cluster log.
void OSDService::check_full_status(float ratio)
{
  Mutex::Locker l(full_status_lock);

  cur_ratio = ratio;

  // The OSDMap ratios take precedence. So if the failsafe is .95 and
  // the admin sets the cluster full to .96, the failsafe moves up to .96
  // too. (Not that having failsafe == full is ideal, but it's better than
  // dropping writes before the cluster appears full.)
  OSDMapRef osdmap = get_osdmap();
  if (!osdmap || osdmap->get_epoch() == 0) {
    // no map yet; we cannot evaluate thresholds
    cur_state = NONE;
    return;
  }
  float nearfull_ratio = osdmap->get_nearfull_ratio();
  float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
  float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
  float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);

  if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // use the failsafe for nearfull and full; the mon isn't using the
    // flags anyway because we're mid-upgrade.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  } else if (full_ratio <= 0 ||
	     backfillfull_ratio <= 0 ||
	     nearfull_ratio <= 0) {
    derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
    // use failsafe flag. ick. the monitor did something wrong or the user
    // did something stupid.
    full_ratio = failsafe_ratio;
    backfillfull_ratio = failsafe_ratio;
    nearfull_ratio = failsafe_ratio;
  }

  string inject;
  s_names new_state;
  // classify, highest threshold first; injectfull (debug) overrides all
  if (injectfull_state > NONE && injectfull) {
    new_state = injectfull_state;
    inject = "(Injected)";
  } else if (ratio > failsafe_ratio) {
    new_state = FAILSAFE;
  } else if (ratio > full_ratio) {
    new_state = FULL;
  } else if (ratio > backfillfull_ratio) {
    new_state = BACKFILLFULL;
  } else if (ratio > nearfull_ratio) {
    new_state = NEARFULL;
  } else {
    new_state = NONE;
  }
  dout(20) << __func__ << " cur ratio " << ratio
	   << ". nearfull_ratio " << nearfull_ratio
	   << ". backfillfull_ratio " << backfillfull_ratio
	   << ", full_ratio " << full_ratio
	   << ", failsafe_ratio " << failsafe_ratio
	   << ", new state " << get_full_state_name(new_state)
	   << " " << inject
	   << dendl;

  // warn
  if (cur_state != new_state) {
    dout(10) << __func__ << " " << get_full_state_name(cur_state)
	     << " -> " << get_full_state_name(new_state) << dendl;
    if (new_state == FAILSAFE) {
      clog->error() << "full status failsafe engaged, dropping updates, now "
		    << (int)roundf(ratio * 100) << "% full";
    } else if (cur_state == FAILSAFE) {
      clog->error() << "full status failsafe disengaged, no longer dropping "
		    << "updates, now " << (int)roundf(ratio * 100) << "% full";
    }
    cur_state = new_state;
  }
}
837
// Compare the fullness flags recorded for us in the osdmap with our
// locally computed fullness; true means the mon should be told.
bool OSDService::need_fullness_update()
{
  OSDMapRef osdmap = get_osdmap();
  s_names cur = NONE;
  if (osdmap->exists(whoami)) {
    // what the cluster currently believes about us
    if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
      cur = FULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
      cur = BACKFILLFULL;
    } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
      cur = NEARFULL;
    }
  }
  // what we believe about ourselves
  s_names want = NONE;
  if (is_full())
    want = FULL;
  else if (is_backfillfull())
    want = BACKFILLFULL;
  else if (is_nearfull())
    want = NEARFULL;
  return want != cur;
}
860
// Shared implementation behind check_*full(): true if our fullness
// state is at least `type`.  Honors the injectfull debug knobs and
// appends an explanation to ss either way.
bool OSDService::_check_full(s_names type, ostream &ss) const
{
  Mutex::Locker l(full_status_lock);

  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      // NOTE(review): mutated inside a const method — member is presumably
      // declared mutable in the header; confirm.
      --injectfull;
    ss << "Injected " << get_full_state_name(type) << " OSD ("
       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
    return true;
  }

  ss << "current usage is " << cur_ratio;
  return cur_state >= type;
}
878
// True if usage has reached the failsafe threshold (updates dropped).
bool OSDService::check_failsafe_full(ostream &ss) const
{
  return _check_full(FAILSAFE, ss);
}
883
// True if usage has reached at least the FULL threshold.
bool OSDService::check_full(ostream &ss) const
{
  return _check_full(FULL, ss);
}
888
// True if usage has reached at least the BACKFILLFULL threshold.
bool OSDService::check_backfill_full(ostream &ss) const
{
  return _check_full(BACKFILLFULL, ss);
}
893
// True if usage has reached at least the NEARFULL threshold.
bool OSDService::check_nearfull(ostream &ss) const
{
  return _check_full(NEARFULL, ss);
}
898
// Exact-state query: only true in the FAILSAFE state itself.
bool OSDService::is_failsafe_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state == FAILSAFE;
}
904
// True in FULL or any more-severe state (i.e. FAILSAFE too).
bool OSDService::is_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= FULL;
}
910
// True in BACKFILLFULL or any more-severe state.
bool OSDService::is_backfillfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}
916
// True in NEARFULL or any more-severe state.
bool OSDService::is_nearfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= NEARFULL;
}
922
// Debug hook: force fullness state `type` for the next `count` checks
// (-1 = until cleared).  See _check_full() for how it is consumed.
void OSDService::set_injectfull(s_names type, int64_t count)
{
  Mutex::Locker l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
929
// Refresh the cached osd_stat_t from a statfs result, publish the raw
// usage numbers to perf counters, and return a copy of the new stats.
// Note: hb_peers is consumed (swapped into osd_stat).
osd_stat_t OSDService::set_osd_stat(const struct store_statfs_t &stbuf,
                                    vector<int>& hb_peers,
				    int num_pgs)
{
  uint64_t bytes = stbuf.total;
  uint64_t used = bytes - stbuf.available;
  uint64_t avail = stbuf.available;

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  {
    Mutex::Locker l(stat_lock);
    // swap rather than copy: takes ownership of the caller's vector
    osd_stat.hb_peers.swap(hb_peers);
    osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
    osd_stat.kb = bytes >> 10;
    osd_stat.kb_used = used >> 10;
    osd_stat.kb_avail = avail >> 10;
    osd_stat.num_pgs = num_pgs;
    // return a copy while still under stat_lock
    return osd_stat;
  }
}
7c673cae 953
224ce89b
WB
// Poll the object store for usage, refresh the cached osd_stat, then
// re-evaluate the fullness state machine with the new usage ratio.
void OSDService::update_osd_stat(vector<int>& hb_peers)
{
  // load osd stats first
  struct store_statfs_t stbuf;
  int r = osd->store->statfs(&stbuf);
  if (r < 0) {
    derr << "statfs() failed: " << cpp_strerror(r) << dendl;
    return;
  }

  auto new_stat = set_osd_stat(stbuf, hb_peers, osd->get_num_pgs());
  dout(20) << "update_osd_stat " << new_stat << dendl;
  // a zero-size store would make the ratio below divide by zero
  assert(new_stat.kb);
  float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
  check_full_status(ratio);
}
970
971bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
972{
973 OSDMapRef osdmap = get_osdmap();
974 for (auto shard : missing_on) {
975 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
976 return true;
977 }
978 return false;
979}
980
// Send m to `peer` over the cluster messenger, but only if the peer is
// still up in our newest map and has been up since from_epoch;
// otherwise drop the message.  Also opportunistically shares our map
// with the peer.
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    // peer gone (or restarted since from_epoch): drop our ref, don't send
    m->put();
    release_map(next_map);
    return;
  }
  const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
  ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
  share_map_peer(peer, peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
999
// Get a cluster-messenger connection to `peer`, or NULL if the peer is
// down (or restarted since from_epoch) in our newest map.
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
  release_map(next_map);
  return con;
}
1015
1016pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1017{
1018 OSDMapRef next_map = get_nextmap_reserved();
1019 // service map is always newer/newest
1020 assert(from_epoch <= next_map->get_epoch());
1021
1022 pair<ConnectionRef,ConnectionRef> ret;
1023 if (next_map->is_down(peer) ||
1024 next_map->get_info(peer).up_from > from_epoch) {
1025 release_map(next_map);
1026 return ret;
1027 }
1028 ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
1029 if (next_map->get_hb_front_addr(peer) != entity_addr_t())
1030 ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
1031 release_map(next_map);
1032 return ret;
1033}
1034
1035
94b18763
FG
1036void OSDService::queue_want_pg_temp(pg_t pgid,
1037 const vector<int>& want,
1038 bool forced)
7c673cae
FG
1039{
1040 Mutex::Locker l(pg_temp_lock);
94b18763 1041 auto p = pg_temp_pending.find(pgid);
7c673cae 1042 if (p == pg_temp_pending.end() ||
94b18763
FG
1043 p->second.acting != want ||
1044 forced) {
1045 pg_temp_wanted[pgid] = pg_temp_t{want, forced};
7c673cae
FG
1046 }
1047}
1048
1049void OSDService::remove_want_pg_temp(pg_t pgid)
1050{
1051 Mutex::Locker l(pg_temp_lock);
1052 pg_temp_wanted.erase(pgid);
1053 pg_temp_pending.erase(pgid);
1054}
1055
// Move every entry we just sent from pg_temp_wanted into pg_temp_pending.
// NOTE(review): leading underscore suggests the caller must already hold
// pg_temp_lock (as send_pg_temp/requeue_pg_temp do) -- confirm.
void OSDService::_sent_pg_temp()
{
  // move-insert so the (potentially large) acting vectors are not copied
  pg_temp_pending.insert(make_move_iterator(begin(pg_temp_wanted)),
			 make_move_iterator(end(pg_temp_wanted)));
  pg_temp_wanted.clear();
}
1062
1063void OSDService::requeue_pg_temp()
1064{
1065 Mutex::Locker l(pg_temp_lock);
1066 // wanted overrides pending. note that remove_want_pg_temp
1067 // clears the item out of both.
1068 unsigned old_wanted = pg_temp_wanted.size();
1069 unsigned old_pending = pg_temp_pending.size();
1070 _sent_pg_temp();
1071 pg_temp_wanted.swap(pg_temp_pending);
1072 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1073 << pg_temp_wanted.size() << dendl;
1074}
1075
94b18763
FG
1076std::ostream& operator<<(std::ostream& out,
1077 const OSDService::pg_temp_t& pg_temp)
1078{
1079 out << pg_temp.acting;
1080 if (pg_temp.forced) {
1081 out << " (forced)";
1082 }
1083 return out;
1084}
1085
7c673cae
FG
1086void OSDService::send_pg_temp()
1087{
1088 Mutex::Locker l(pg_temp_lock);
1089 if (pg_temp_wanted.empty())
1090 return;
1091 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
94b18763
FG
1092 MOSDPGTemp *ms[2] = {nullptr, nullptr};
1093 for (auto& pg_temp : pg_temp_wanted) {
1094 auto& m = ms[pg_temp.second.forced];
1095 if (!m) {
1096 m = new MOSDPGTemp(osdmap->get_epoch());
1097 m->forced = pg_temp.second.forced;
1098 }
1099 m->pg_temp.emplace(pg_temp.first,
1100 pg_temp.second.acting);
1101 }
1102 for (auto m : ms) {
1103 if (m) {
1104 monc->send_mon_message(m);
1105 }
1106 }
7c673cae
FG
1107 _sent_pg_temp();
1108}
1109
1110void OSDService::send_pg_created(pg_t pgid)
1111{
1112 dout(20) << __func__ << dendl;
c07f9fc5
FG
1113 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1114 monc->send_mon_message(new MOSDPGCreated(pgid));
1115 }
7c673cae
FG
1116}
1117
1118// --------------------------------------
1119// dispatch
1120
1121epoch_t OSDService::get_peer_epoch(int peer)
1122{
1123 Mutex::Locker l(peer_map_epoch_lock);
1124 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1125 if (p == peer_map_epoch.end())
1126 return 0;
1127 return p->second;
1128}
1129
1130epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1131{
1132 Mutex::Locker l(peer_map_epoch_lock);
1133 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1134 if (p != peer_map_epoch.end()) {
1135 if (p->second < e) {
1136 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1137 p->second = e;
1138 } else {
1139 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1140 }
1141 return p->second;
1142 } else {
1143 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1144 peer_map_epoch[peer] = e;
1145 return e;
1146 }
1147}
1148
1149void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1150{
1151 Mutex::Locker l(peer_map_epoch_lock);
1152 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1153 if (p != peer_map_epoch.end()) {
1154 if (p->second <= as_of) {
1155 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1156 << " had " << p->second << dendl;
1157 peer_map_epoch.erase(p);
1158 } else {
1159 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1160 << " has " << p->second << " - not forgetting" << dendl;
1161 }
1162 }
1163}
1164
1165bool OSDService::should_share_map(entity_name_t name, Connection *con,
1166 epoch_t epoch, const OSDMapRef& osdmap,
1167 const epoch_t *sent_epoch_p)
1168{
1169 dout(20) << "should_share_map "
1170 << name << " " << con->get_peer_addr()
1171 << " " << epoch << dendl;
1172
1173 // does client have old map?
1174 if (name.is_client()) {
1175 bool message_sendmap = epoch < osdmap->get_epoch();
1176 if (message_sendmap && sent_epoch_p) {
1177 dout(20) << "client session last_sent_epoch: "
1178 << *sent_epoch_p
1179 << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
1180 if (*sent_epoch_p < osdmap->get_epoch()) {
1181 return true;
1182 } // else we don't need to send it out again
1183 }
1184 }
1185
1186 if (con->get_messenger() == osd->cluster_messenger &&
1187 con != osd->cluster_messenger->get_loopback_connection() &&
1188 osdmap->is_up(name.num()) &&
1189 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1190 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1191 // remember
1192 epoch_t has = MAX(get_peer_epoch(name.num()), epoch);
1193
1194 // share?
1195 if (has < osdmap->get_epoch()) {
1196 dout(10) << name << " " << con->get_peer_addr()
1197 << " has old map " << epoch << " < "
1198 << osdmap->get_epoch() << dendl;
1199 return true;
1200 }
1201 }
1202
1203 return false;
1204}
1205
1206void OSDService::share_map(
1207 entity_name_t name,
1208 Connection *con,
1209 epoch_t epoch,
1210 OSDMapRef& osdmap,
1211 epoch_t *sent_epoch_p)
1212{
1213 dout(20) << "share_map "
1214 << name << " " << con->get_peer_addr()
1215 << " " << epoch << dendl;
1216
1217 if (!osd->is_active()) {
1218 /*It is safe not to proceed as OSD is not in healthy state*/
1219 return;
1220 }
1221
1222 bool want_shared = should_share_map(name, con, epoch,
1223 osdmap, sent_epoch_p);
1224
1225 if (want_shared){
1226 if (name.is_client()) {
1227 dout(10) << name << " has old map " << epoch
1228 << " < " << osdmap->get_epoch() << dendl;
1229 // we know the Session is valid or we wouldn't be sending
1230 if (sent_epoch_p) {
1231 *sent_epoch_p = osdmap->get_epoch();
1232 }
1233 send_incremental_map(epoch, con, osdmap);
1234 } else if (con->get_messenger() == osd->cluster_messenger &&
1235 osdmap->is_up(name.num()) &&
1236 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1237 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1238 dout(10) << name << " " << con->get_peer_addr()
1239 << " has old map " << epoch << " < "
1240 << osdmap->get_epoch() << dendl;
1241 note_peer_epoch(name.num(), osdmap->get_epoch());
1242 send_incremental_map(epoch, con, osdmap);
1243 }
1244 }
1245}
1246
1247void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1248{
1249 if (!map)
1250 map = get_osdmap();
1251
1252 // send map?
1253 epoch_t pe = get_peer_epoch(peer);
1254 if (pe) {
1255 if (pe < map->get_epoch()) {
1256 send_incremental_map(pe, con, map);
1257 note_peer_epoch(peer, map->get_epoch());
1258 } else
1259 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1260 } else {
1261 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1262 // no idea about peer's epoch.
1263 // ??? send recent ???
1264 // do nothing.
1265 }
1266}
1267
1268bool OSDService::can_inc_scrubs_pending()
1269{
1270 bool can_inc = false;
1271 Mutex::Locker l(sched_scrub_lock);
1272
1273 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1274 dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
28e407b8
AA
1275 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active
1276 << ")" << dendl;
7c673cae
FG
1277 can_inc = true;
1278 } else {
28e407b8
AA
1279 dout(20) << __func__ << " " << scrubs_pending << " + " << scrubs_active
1280 << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
7c673cae
FG
1281 }
1282
1283 return can_inc;
1284}
1285
1286bool OSDService::inc_scrubs_pending()
1287{
1288 bool result = false;
1289
1290 sched_scrub_lock.Lock();
1291 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1292 dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
1293 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1294 result = true;
1295 ++scrubs_pending;
1296 } else {
1297 dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1298 }
1299 sched_scrub_lock.Unlock();
1300
1301 return result;
1302}
1303
1304void OSDService::dec_scrubs_pending()
1305{
1306 sched_scrub_lock.Lock();
1307 dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
1308 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1309 --scrubs_pending;
1310 assert(scrubs_pending >= 0);
1311 sched_scrub_lock.Unlock();
1312}
1313
1314void OSDService::inc_scrubs_active(bool reserved)
1315{
1316 sched_scrub_lock.Lock();
1317 ++(scrubs_active);
1318 if (reserved) {
1319 --(scrubs_pending);
1320 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1321 << " (max " << cct->_conf->osd_max_scrubs
1322 << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
1323 assert(scrubs_pending >= 0);
1324 } else {
1325 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1326 << " (max " << cct->_conf->osd_max_scrubs
1327 << ", pending " << scrubs_pending << ")" << dendl;
1328 }
1329 sched_scrub_lock.Unlock();
1330}
1331
1332void OSDService::dec_scrubs_active()
1333{
1334 sched_scrub_lock.Lock();
1335 dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
1336 << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
1337 --scrubs_active;
1338 assert(scrubs_active >= 0);
1339 sched_scrub_lock.Unlock();
1340}
1341
1342void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1343 epoch_t *_bind_epoch) const
1344{
1345 Mutex::Locker l(epoch_lock);
1346 if (_boot_epoch)
1347 *_boot_epoch = boot_epoch;
1348 if (_up_epoch)
1349 *_up_epoch = up_epoch;
1350 if (_bind_epoch)
1351 *_bind_epoch = bind_epoch;
1352}
1353
1354void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1355 const epoch_t *_bind_epoch)
1356{
1357 Mutex::Locker l(epoch_lock);
1358 if (_boot_epoch) {
1359 assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1360 boot_epoch = *_boot_epoch;
1361 }
1362 if (_up_epoch) {
1363 assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1364 up_epoch = *_up_epoch;
1365 }
1366 if (_bind_epoch) {
1367 assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1368 bind_epoch = *_bind_epoch;
1369 }
1370}
1371
1372bool OSDService::prepare_to_stop()
1373{
1374 Mutex::Locker l(is_stopping_lock);
1375 if (get_state() != NOT_STOPPING)
1376 return false;
1377
1378 OSDMapRef osdmap = get_osdmap();
1379 if (osdmap && osdmap->is_up(whoami)) {
1380 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
1381 set_state(PREPARING_TO_STOP);
1382 monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
1383 osdmap->get_inst(whoami),
1384 osdmap->get_epoch(),
1385 true // request ack
1386 ));
1387 utime_t now = ceph_clock_now();
1388 utime_t timeout;
1389 timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
1390 while ((ceph_clock_now() < timeout) &&
1391 (get_state() != STOPPING)) {
1392 is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
1393 }
1394 }
1395 dout(0) << __func__ << " starting shutdown" << dendl;
1396 set_state(STOPPING);
1397 return true;
1398}
1399
1400void OSDService::got_stop_ack()
1401{
1402 Mutex::Locker l(is_stopping_lock);
1403 if (get_state() == PREPARING_TO_STOP) {
1404 dout(0) << __func__ << " starting shutdown" << dendl;
1405 set_state(STOPPING);
1406 is_stopping_cond.Signal();
1407 } else {
1408 dout(10) << __func__ << " ignoring msg" << dendl;
1409 }
1410}
1411
1412MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1413 OSDSuperblock& sblock)
1414{
28e407b8
AA
1415 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1416 osdmap->get_encoding_features());
7c673cae
FG
1417 m->oldest_map = max_oldest_map;
1418 m->newest_map = sblock.newest_map;
1419
1420 for (epoch_t e = to; e > since; e--) {
1421 bufferlist bl;
1422 if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
1423 m->incremental_maps[e].claim(bl);
1424 } else if (get_map_bl(e, bl)) {
1425 m->maps[e].claim(bl);
1426 break;
1427 } else {
1428 derr << "since " << since << " to " << to
1429 << " oldest " << m->oldest_map << " newest " << m->newest_map
1430 << dendl;
1431 m->put();
1432 m = NULL;
1433 break;
1434 }
1435 }
1436 return m;
1437}
1438
1439void OSDService::send_map(MOSDMap *m, Connection *con)
1440{
1441 con->send_message(m);
1442}
1443
1444void OSDService::send_incremental_map(epoch_t since, Connection *con,
1445 OSDMapRef& osdmap)
1446{
1447 epoch_t to = osdmap->get_epoch();
1448 dout(10) << "send_incremental_map " << since << " -> " << to
1449 << " to " << con << " " << con->get_peer_addr() << dendl;
1450
1451 MOSDMap *m = NULL;
1452 while (!m) {
1453 OSDSuperblock sblock(get_superblock());
1454 if (since < sblock.oldest_map) {
1455 // just send latest full map
28e407b8
AA
1456 MOSDMap *m = new MOSDMap(monc->get_fsid(),
1457 osdmap->get_encoding_features());
7c673cae
FG
1458 m->oldest_map = max_oldest_map;
1459 m->newest_map = sblock.newest_map;
1460 get_map_bl(to, m->maps[to]);
1461 send_map(m, con);
1462 return;
1463 }
1464
1465 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1466 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1467 << ", only sending most recent" << dendl;
1468 since = to - cct->_conf->osd_map_share_max_epochs;
1469 }
1470
1471 if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
1472 to = since + cct->_conf->osd_map_message_max;
1473 m = build_incremental_map_msg(since, to, sblock);
1474 }
1475 send_map(m, con);
1476}
1477
1478bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1479{
1480 bool found = map_bl_cache.lookup(e, &bl);
31f18b77
FG
1481 if (found) {
1482 if (logger)
1483 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1484 return true;
31f18b77
FG
1485 }
1486 if (logger)
1487 logger->inc(l_osd_map_bl_cache_miss);
7c673cae 1488 found = store->read(coll_t::meta(),
31f18b77
FG
1489 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1490 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1491 if (found) {
7c673cae 1492 _add_map_bl(e, bl);
31f18b77 1493 }
7c673cae
FG
1494 return found;
1495}
1496
1497bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1498{
1499 Mutex::Locker l(map_cache_lock);
1500 bool found = map_bl_inc_cache.lookup(e, &bl);
31f18b77
FG
1501 if (found) {
1502 if (logger)
1503 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1504 return true;
31f18b77
FG
1505 }
1506 if (logger)
1507 logger->inc(l_osd_map_bl_cache_miss);
7c673cae 1508 found = store->read(coll_t::meta(),
31f18b77
FG
1509 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1510 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1511 if (found) {
7c673cae 1512 _add_map_inc_bl(e, bl);
31f18b77 1513 }
7c673cae
FG
1514 return found;
1515}
1516
1517void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1518{
1519 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1520 // cache a contiguous buffer
1521 if (bl.get_num_buffers() > 1) {
1522 bl.rebuild();
1523 }
1524 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1525 map_bl_cache.add(e, bl);
1526}
1527
1528void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1529{
1530 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1531 // cache a contiguous buffer
1532 if (bl.get_num_buffers() > 1) {
1533 bl.rebuild();
1534 }
1535 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1536 map_bl_inc_cache.add(e, bl);
1537}
1538
1539void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
1540{
1541 Mutex::Locker l(map_cache_lock);
31f18b77
FG
1542 // cache a contiguous buffer
1543 if (bl.get_num_buffers() > 1) {
1544 bl.rebuild();
1545 }
7c673cae
FG
1546 map_bl_inc_cache.pin(e, bl);
1547}
1548
1549void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
1550{
1551 Mutex::Locker l(map_cache_lock);
31f18b77
FG
1552 // cache a contiguous buffer
1553 if (bl.get_num_buffers() > 1) {
1554 bl.rebuild();
1555 }
7c673cae
FG
1556 map_bl_cache.pin(e, bl);
1557}
1558
1559void OSDService::clear_map_bl_cache_pins(epoch_t e)
1560{
1561 Mutex::Locker l(map_cache_lock);
1562 map_bl_inc_cache.clear_pinned(e);
1563 map_bl_cache.clear_pinned(e);
1564}
1565
1566OSDMapRef OSDService::_add_map(OSDMap *o)
1567{
1568 epoch_t e = o->get_epoch();
1569
1570 if (cct->_conf->osd_map_dedup) {
1571 // Dedup against an existing map at a nearby epoch
1572 OSDMapRef for_dedup = map_cache.lower_bound(e);
1573 if (for_dedup) {
1574 OSDMap::dedup(for_dedup.get(), o);
1575 }
1576 }
1577 bool existed;
1578 OSDMapRef l = map_cache.add(e, o, &existed);
1579 if (existed) {
1580 delete o;
1581 }
1582 return l;
1583}
1584
1585OSDMapRef OSDService::try_get_map(epoch_t epoch)
1586{
1587 Mutex::Locker l(map_cache_lock);
1588 OSDMapRef retval = map_cache.lookup(epoch);
1589 if (retval) {
1590 dout(30) << "get_map " << epoch << " -cached" << dendl;
1591 if (logger) {
1592 logger->inc(l_osd_map_cache_hit);
1593 }
1594 return retval;
1595 }
1596 if (logger) {
1597 logger->inc(l_osd_map_cache_miss);
1598 epoch_t lb = map_cache.cached_key_lower_bound();
1599 if (epoch < lb) {
1600 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1601 logger->inc(l_osd_map_cache_miss_low);
1602 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1603 }
1604 }
1605
1606 OSDMap *map = new OSDMap;
1607 if (epoch > 0) {
1608 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1609 bufferlist bl;
1610 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1611 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1612 delete map;
1613 return OSDMapRef();
1614 }
1615 map->decode(bl);
1616 } else {
1617 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1618 }
1619 return _add_map(map);
1620}
1621
1622// ops
1623
1624
1625void OSDService::reply_op_error(OpRequestRef op, int err)
1626{
1627 reply_op_error(op, err, eversion_t(), 0);
1628}
1629
1630void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1631 version_t uv)
1632{
1633 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1634 assert(m->get_type() == CEPH_MSG_OSD_OP);
1635 int flags;
1636 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1637
1638 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1639 true);
1640 reply->set_reply_versions(v, uv);
1641 m->get_connection()->send_message(reply);
1642}
1643
// Log (to the cluster log) an op that arrived at a PG we are not primary
// for.  For EC pools, first rule out the benign shard-remap race described
// below, which is expected and should be dropped silently.
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  // debugging aid only; disabled by default
  if (!cct->_conf->osd_debug_misdirected_ops) {
    return;
  }

  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);

  assert(m->get_map_epoch() >= pg->info.history.same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    assert(m->get_map_epoch() <= superblock.newest_map);
    // re-derive the target using the map the client actually sent against
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
	pgid.shard != pg->info.pgid.shard) {
      // shard moved between the client's epoch and ours: benign, drop it
      dout(7) << __func__ << ": " << *pg << " primary changed since "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  // genuinely misdirected: make it visible in the cluster log
  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
	       << " pg " << m->get_raw_pg()
	       << " to osd." << whoami
	       << " not " << pg->acting
	       << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
}
1698
1699void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
1700{
1701 osd->op_shardedwq.queue(make_pair(pgid, qi));
1702}
1703
1704void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
1705{
1706 osd->op_shardedwq.queue_front(make_pair(pgid, qi));
1707}
1708
1709void OSDService::queue_for_peering(PG *pg)
1710{
1711 peering_wq.queue(pg);
1712}
1713
1714void OSDService::queue_for_snap_trim(PG *pg)
1715{
1716 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1717 osd->op_shardedwq.queue(
1718 make_pair(
1719 pg->info.pgid,
1720 PGQueueable(
1721 PGSnapTrim(pg->get_osdmap()->get_epoch()),
1722 cct->_conf->osd_snap_trim_cost,
1723 cct->_conf->osd_snap_trim_priority,
1724 ceph_clock_now(),
1725 entity_inst_t(),
1726 pg->get_osdmap()->get_epoch())));
1727}
1728
1729
1730// ====================================================================
1731// OSD
1732
1733#undef dout_prefix
1734#define dout_prefix *_dout
1735
1736// Commands shared between OSD's console and admin console:
1737namespace ceph {
1738namespace osd_cmds {
1739
1740int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
1741
1742}} // namespace ceph::osd_cmds
1743
// Create (or validate) an on-disk OSD instance: mkfs + mount the store,
// write/verify the OSD superblock, and record the meta files (magic,
// whoami, fsid, key, ready).  Takes ownership of `store` and deletes it on
// all paths.  Returns 0 or a negative errno.
int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
	      uuid_d fsid, int whoami)
{
  int ret;

  ceph::shared_ptr<ObjectStore::Sequencer> osr(
    new ObjectStore::Sequencer("mkfs"));
  OSDSuperblock sb;
  bufferlist sbbl;
  C_SaferCond waiter;

  // if we are fed a uuid for this osd, use it.
  store->set_fsid(cct->_conf->osd_uuid);

  ret = store->mkfs();
  if (ret) {
    derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  store->set_cache_shards(1);  // doesn't matter for mkfs!

  ret = store->mount();
  if (ret) {
    derr << "OSD::mkfs: couldn't mount ObjectStore: error "
         << cpp_strerror(ret) << dendl;
    goto free_store;
  }

  ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
  if (ret >= 0) {
    /* if we already have superblock, check content of superblock */
    // re-mkfs over an existing OSD: the identity on disk must match what
    // the caller asked for
    dout(0) << " have superblock" << dendl;
    bufferlist::iterator p;
    p = sbbl.begin();
    ::decode(sb, p);
    if (whoami != sb.whoami) {
      derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
	   << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
    if (fsid != sb.cluster_fsid) {
      derr << "provided cluster fsid " << fsid
	   << " != superblock's " << sb.cluster_fsid << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
  } else {
    // create superblock
    sb.cluster_fsid = fsid;
    sb.osd_fsid = store->get_fsid();
    sb.whoami = whoami;
    sb.compat_features = get_osd_initial_compat_set();

    bufferlist bl;
    ::encode(sb, bl);

    ObjectStore::Transaction t;
    t.create_collection(coll_t::meta(), 0);
    t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
    ret = store->apply_transaction(osr.get(), std::move(t));
    if (ret) {
      derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
	   << "apply_transaction returned " << cpp_strerror(ret) << dendl;
      goto umount_store;
    }
  }

  // make sure the superblock is durable before declaring success
  if (!osr->flush_commit(&waiter)) {
    waiter.wait();
  }

  ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
  if (ret) {
    derr << "OSD::mkfs: failed to write fsid file: error "
         << cpp_strerror(ret) << dendl;
    goto umount_store;
  }

umount_store:
  store->umount();
free_store:
  delete store;
  return ret;
}
1831
3efd9988 1832int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
7c673cae
FG
1833{
1834 char val[80];
1835 int r;
1836
1837 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
1838 r = store->write_meta("magic", val);
1839 if (r < 0)
1840 return r;
1841
1842 snprintf(val, sizeof(val), "%d", whoami);
1843 r = store->write_meta("whoami", val);
1844 if (r < 0)
1845 return r;
1846
1847 cluster_fsid.print(val);
1848 r = store->write_meta("ceph_fsid", val);
1849 if (r < 0)
1850 return r;
1851
3efd9988 1852 string key = cct->_conf->get_val<string>("key");
3efd9988
FG
1853 if (key.size()) {
1854 r = store->write_meta("osd_key", key);
1855 if (r < 0)
1856 return r;
b32b8144
FG
1857 } else {
1858 string keyfile = cct->_conf->get_val<string>("keyfile");
1859 if (!keyfile.empty()) {
1860 bufferlist keybl;
1861 string err;
1862 if (keyfile == "-") {
1863 static_assert(1024 * 1024 >
1864 (sizeof(CryptoKey) - sizeof(bufferptr) +
1865 sizeof(__u16) + 16 /* AES_KEY_LEN */ + 3 - 1) / 3. * 4.,
1866 "1MB should be enough for a base64 encoded CryptoKey");
1867 r = keybl.read_fd(STDIN_FILENO, 1024 * 1024);
1868 } else {
1869 r = keybl.read_file(keyfile.c_str(), &err);
1870 }
1871 if (r < 0) {
1872 derr << __func__ << " failed to read keyfile " << keyfile << ": "
1873 << err << ": " << cpp_strerror(r) << dendl;
1874 return r;
1875 }
1876 r = store->write_meta("osd_key", keybl.to_str());
1877 if (r < 0)
1878 return r;
1879 }
3efd9988
FG
1880 }
1881
7c673cae
FG
1882 r = store->write_meta("ready", "ready");
1883 if (r < 0)
1884 return r;
1885
1886 return 0;
1887}
1888
1889int OSD::peek_meta(ObjectStore *store, std::string& magic,
1890 uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
1891{
1892 string val;
1893
1894 int r = store->read_meta("magic", &val);
1895 if (r < 0)
1896 return r;
1897 magic = val;
1898
1899 r = store->read_meta("whoami", &val);
1900 if (r < 0)
1901 return r;
1902 whoami = atoi(val.c_str());
1903
1904 r = store->read_meta("ceph_fsid", &val);
1905 if (r < 0)
1906 return r;
1907 r = cluster_fsid.parse(val.c_str());
1908 if (!r)
1909 return -EINVAL;
1910
1911 r = store->read_meta("fsid", &val);
1912 if (r < 0) {
1913 osd_fsid = uuid_d();
1914 } else {
1915 r = osd_fsid.parse(val.c_str());
1916 if (!r)
1917 return -EINVAL;
1918 }
1919
1920 return 0;
1921}
1922
1923
1924#undef dout_prefix
1925#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1926
1927// cons/des
1928
// OSD daemon constructor: wires up the messengers, mon/mgr clients, thread
// pools and work queues.  Members are initialized in declaration order, so
// the initializer-list order below must not be rearranged (e.g. osd_op_tp
// must exist before op_shardedwq references it).
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  osd_lock("OSD::osd_lock"),
  tick_timer(cct, osd_lock),
  tick_timer_lock("OSD::tick_timer_lock"),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  // auth handlers: a generic auth_supported list overrides the per-role
  // cluster/service requirements
  authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
								      cct->_conf->auth_supported.empty() ?
								      cct->_conf->auth_cluster_required :
								      cct->_conf->auth_supported)),
  authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
								      cct->_conf->auth_supported.empty() ?
								      cct->_conf->auth_service_required :
								      cct->_conf->auth_supported)),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  store_is_rotational(store->is_rotational()),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  osd_compat(get_osd_compat_set()),
  peering_tp(cct, "OSD::peering_tp", "tp_peering",
	     cct->_conf->osd_peering_wq_threads,
	     "osd_peering_tp_threads"),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    get_num_op_threads()),
  remove_tp(cct, "OSD::remove_tp", "tp_osd_remove", cct->_conf->osd_remove_threads, "osd_remove_threads"),
  recovery_tp(cct, "OSD::recovery_tp", "tp_osd_recovery", cct->_conf->osd_recovery_threads, "osd_recovery_threads"),
  command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
  session_waiting_lock("OSD::session_waiting_lock"),
  osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
  heartbeat_lock("OSD::heartbeat_lock"),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_queue(get_io_queue()),
  op_prio_cutoff(get_io_prio_cut()),
  // work queues bind to the thread pools constructed above
  op_shardedwq(
    get_num_op_shards(),
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  peering_wq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &peering_tp),
  map_lock("OSD::map_lock"),
  pg_map_lock("OSD::pg_map_lock"),
  last_pg_create_epoch(0),
  mon_report_lock("OSD::mon_report_lock"),
  stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
  osd_stat_updated(false),
  pg_stat_tid(0), pg_stat_tid_flushed(0),
  command_wq(
    this,
    cct->_conf->osd_command_thread_timeout,
    cct->_conf->osd_command_thread_suicide_timeout,
    &command_tp),
  remove_wq(
    cct,
    store,
    cct->_conf->osd_remove_thread_timeout,
    cct->_conf->osd_remove_thread_suicide_timeout,
    &remove_tp),
  service(this)
{
  monc->set_messenger(client_messenger);
  // op tracker thresholds come straight from config
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
#ifdef WITH_BLKIN
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif
}
2042
2043OSD::~OSD()
2044{
2045 delete authorize_handler_cluster_registry;
2046 delete authorize_handler_service_registry;
2047 delete class_handler;
2048 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2049 cct->get_perfcounters_collection()->remove(logger);
2050 delete recoverystate_perf;
2051 delete logger;
2052 delete store;
2053}
2054
91327a77
AA
2055double OSD::get_tick_interval() const
2056{
2057 // vary +/- 5% to avoid scrub scheduling livelocks
2058 constexpr auto delta = 0.05;
2059 std::default_random_engine rng{static_cast<unsigned>(whoami)};
2060 return (OSD_TICK_INTERVAL *
2061 std::uniform_real_distribution<>{1.0 - delta, 1.0 + delta}(rng));
2062}
2063
7c673cae
FG
2064void cls_initialize(ClassHandler *ch);
2065
2066void OSD::handle_signal(int signum)
2067{
2068 assert(signum == SIGINT || signum == SIGTERM);
2069 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2070 shutdown();
2071}
2072
2073int OSD::pre_init()
2074{
2075 Mutex::Locker lock(osd_lock);
2076 if (is_stopping())
2077 return 0;
2078
2079 if (store->test_mount_in_use()) {
2080 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2081 << "currently in use. (Is ceph-osd already running?)" << dendl;
2082 return -EBUSY;
2083 }
2084
2085 cct->_conf->add_observer(this);
2086 return 0;
2087}
2088
2089// asok
2090
2091class OSDSocketHook : public AdminSocketHook {
2092 OSD *osd;
2093public:
2094 explicit OSDSocketHook(OSD *o) : osd(o) {}
2095 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2096 bufferlist& out) override {
2097 stringstream ss;
2098 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
2099 out.append(ss);
2100 return r;
2101 }
2102};
2103
2104bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
2105 ostream& ss)
2106{
2107 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2108 if (admin_command == "status") {
2109 f->open_object_section("status");
2110 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2111 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2112 f->dump_unsigned("whoami", superblock.whoami);
2113 f->dump_string("state", get_state_name(get_state()));
2114 f->dump_unsigned("oldest_map", superblock.oldest_map);
2115 f->dump_unsigned("newest_map", superblock.newest_map);
2116 {
2117 RWLock::RLocker l(pg_map_lock);
2118 f->dump_unsigned("num_pgs", pg_map.size());
2119 }
2120 f->close_section();
2121 } else if (admin_command == "flush_journal") {
2122 store->flush_journal();
2123 } else if (admin_command == "dump_ops_in_flight" ||
c07f9fc5
FG
2124 admin_command == "ops" ||
2125 admin_command == "dump_blocked_ops" ||
2126 admin_command == "dump_historic_ops" ||
2127 admin_command == "dump_historic_ops_by_duration" ||
2128 admin_command == "dump_historic_slow_ops") {
2129
2130 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2131even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2132will start to track new ops received afterwards.";
2133
2134 set<string> filters;
2135 vector<string> filter_str;
2136 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2137 copy(filter_str.begin(), filter_str.end(),
2138 inserter(filters, filters.end()));
2139 }
2140
2141 if (admin_command == "dump_ops_in_flight" ||
2142 admin_command == "ops") {
2143 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2144 ss << error_str;
2145 }
2146 }
2147 if (admin_command == "dump_blocked_ops") {
2148 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2149 ss << error_str;
2150 }
2151 }
2152 if (admin_command == "dump_historic_ops") {
2153 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2154 ss << error_str;
2155 }
2156 }
2157 if (admin_command == "dump_historic_ops_by_duration") {
2158 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2159 ss << error_str;
2160 }
2161 }
2162 if (admin_command == "dump_historic_slow_ops") {
2163 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2164 ss << error_str;
2165 }
7c673cae
FG
2166 }
2167 } else if (admin_command == "dump_op_pq_state") {
2168 f->open_object_section("pq");
2169 op_shardedwq.dump(f);
2170 f->close_section();
2171 } else if (admin_command == "dump_blacklist") {
2172 list<pair<entity_addr_t,utime_t> > bl;
2173 OSDMapRef curmap = service.get_osdmap();
2174
2175 f->open_array_section("blacklist");
2176 curmap->get_blacklist(&bl);
2177 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2178 it != bl.end(); ++it) {
224ce89b 2179 f->open_object_section("entry");
7c673cae
FG
2180 f->open_object_section("entity_addr_t");
2181 it->first.dump(f);
2182 f->close_section(); //entity_addr_t
2183 it->second.localtime(f->dump_stream("expire_time"));
2184 f->close_section(); //entry
2185 }
2186 f->close_section(); //blacklist
2187 } else if (admin_command == "dump_watchers") {
2188 list<obj_watch_item_t> watchers;
2189 // scan pg's
2190 {
2191 Mutex::Locker l(osd_lock);
2192 RWLock::RLocker l2(pg_map_lock);
2193 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2194 it != pg_map.end();
2195 ++it) {
2196
2197 list<obj_watch_item_t> pg_watchers;
2198 PG *pg = it->second;
2199 pg->lock();
2200 pg->get_watchers(pg_watchers);
2201 pg->unlock();
2202 watchers.splice(watchers.end(), pg_watchers);
2203 }
2204 }
2205
2206 f->open_array_section("watchers");
2207 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2208 it != watchers.end(); ++it) {
2209
224ce89b 2210 f->open_object_section("watch");
7c673cae
FG
2211
2212 f->dump_string("namespace", it->obj.nspace);
2213 f->dump_string("object", it->obj.oid.name);
2214
2215 f->open_object_section("entity_name");
2216 it->wi.name.dump(f);
2217 f->close_section(); //entity_name_t
2218
224ce89b
WB
2219 f->dump_unsigned("cookie", it->wi.cookie);
2220 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2221
2222 f->open_object_section("entity_addr_t");
2223 it->wi.addr.dump(f);
2224 f->close_section(); //entity_addr_t
2225
2226 f->close_section(); //watch
2227 }
2228
2229 f->close_section(); //watchers
2230 } else if (admin_command == "dump_reservations") {
2231 f->open_object_section("reservations");
2232 f->open_object_section("local_reservations");
2233 service.local_reserver.dump(f);
2234 f->close_section();
2235 f->open_object_section("remote_reservations");
2236 service.remote_reserver.dump(f);
2237 f->close_section();
2238 f->close_section();
2239 } else if (admin_command == "get_latest_osdmap") {
2240 get_latest_osdmap();
2241 } else if (admin_command == "heap") {
2242 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2243
2244 // Note: Failed heap profile commands won't necessarily trigger an error:
2245 f->open_object_section("result");
2246 f->dump_string("error", cpp_strerror(result));
2247 f->dump_bool("success", result >= 0);
2248 f->close_section();
2249 } else if (admin_command == "set_heap_property") {
2250 string property;
2251 int64_t value = 0;
2252 string error;
2253 bool success = false;
2254 if (!cmd_getval(cct, cmdmap, "property", property)) {
2255 error = "unable to get property";
2256 success = false;
2257 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2258 error = "unable to get value";
2259 success = false;
2260 } else if (value < 0) {
2261 error = "negative value not allowed";
2262 success = false;
2263 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2264 error = "invalid property";
2265 success = false;
2266 } else {
2267 success = true;
2268 }
2269 f->open_object_section("result");
2270 f->dump_string("error", error);
2271 f->dump_bool("success", success);
2272 f->close_section();
2273 } else if (admin_command == "get_heap_property") {
2274 string property;
2275 size_t value = 0;
2276 string error;
2277 bool success = false;
2278 if (!cmd_getval(cct, cmdmap, "property", property)) {
2279 error = "unable to get property";
2280 success = false;
2281 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2282 error = "invalid property";
2283 success = false;
2284 } else {
2285 success = true;
2286 }
2287 f->open_object_section("result");
2288 f->dump_string("error", error);
2289 f->dump_bool("success", success);
2290 f->dump_int("value", value);
2291 f->close_section();
2292 } else if (admin_command == "dump_objectstore_kv_stats") {
2293 store->get_db_statistics(f);
2294 } else if (admin_command == "dump_scrubs") {
2295 service.dumps_scrub(f);
2296 } else if (admin_command == "calc_objectstore_db_histogram") {
2297 store->generate_db_histogram(f);
2298 } else if (admin_command == "flush_store_cache") {
2299 store->flush_cache();
2300 } else if (admin_command == "dump_pgstate_history") {
2301 f->open_object_section("pgstate_history");
2302 RWLock::RLocker l2(pg_map_lock);
2303 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2304 it != pg_map.end();
2305 ++it) {
2306
2307 PG *pg = it->second;
2308 f->dump_stream("pg") << pg->get_pgid();
2309 pg->lock();
2310 pg->pgstate_history.dump(f);
2311 pg->unlock();
2312 }
2313 f->close_section();
224ce89b
WB
2314 } else if (admin_command == "compact") {
2315 dout(1) << "triggering manual compaction" << dendl;
2316 auto start = ceph::coarse_mono_clock::now();
2317 store->compact();
2318 auto end = ceph::coarse_mono_clock::now();
2319 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
2320 dout(1) << "finished manual compaction in "
2321 << time_span.count()
2322 << " seconds" << dendl;
2323 f->open_object_section("compact_result");
2324 f->dump_float("elapsed_time", time_span.count());
2325 f->close_section();
7c673cae
FG
2326 } else {
2327 assert(0 == "broken asok registration");
2328 }
2329 f->flush(ss);
2330 delete f;
2331 return true;
2332}
2333
2334class TestOpsSocketHook : public AdminSocketHook {
2335 OSDService *service;
2336 ObjectStore *store;
2337public:
2338 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2339 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2340 bufferlist& out) override {
2341 stringstream ss;
2342 test_ops(service, store, command, cmdmap, ss);
2343 out.append(ss);
2344 return true;
2345 }
2346 void test_ops(OSDService *service, ObjectStore *store,
2347 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2348
2349};
2350
2351class OSD::C_Tick : public Context {
2352 OSD *osd;
2353 public:
2354 explicit C_Tick(OSD *o) : osd(o) {}
2355 void finish(int r) override {
2356 osd->tick();
2357 }
2358};
2359
2360class OSD::C_Tick_WithoutOSDLock : public Context {
2361 OSD *osd;
2362 public:
2363 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2364 void finish(int r) override {
2365 osd->tick_without_osd_lock();
2366 }
2367};
2368
2369int OSD::enable_disable_fuse(bool stop)
2370{
2371#ifdef HAVE_LIBFUSE
2372 int r;
2373 string mntpath = cct->_conf->osd_data + "/fuse";
2374 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2375 dout(1) << __func__ << " disabling" << dendl;
2376 fuse_store->stop();
2377 delete fuse_store;
2378 fuse_store = NULL;
2379 r = ::rmdir(mntpath.c_str());
7c673cae 2380 if (r < 0) {
c07f9fc5
FG
2381 r = -errno;
2382 derr << __func__ << " failed to rmdir " << mntpath << ": "
2383 << cpp_strerror(r) << dendl;
7c673cae
FG
2384 return r;
2385 }
2386 return 0;
2387 }
2388 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2389 dout(1) << __func__ << " enabling" << dendl;
2390 r = ::mkdir(mntpath.c_str(), 0700);
2391 if (r < 0)
2392 r = -errno;
2393 if (r < 0 && r != -EEXIST) {
2394 derr << __func__ << " unable to create " << mntpath << ": "
2395 << cpp_strerror(r) << dendl;
2396 return r;
2397 }
2398 fuse_store = new FuseStore(store, mntpath);
2399 r = fuse_store->start();
2400 if (r < 0) {
2401 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2402 delete fuse_store;
2403 fuse_store = NULL;
2404 return r;
2405 }
2406 }
2407#endif // HAVE_LIBFUSE
2408 return 0;
2409}
2410
31f18b77
FG
2411int OSD::get_num_op_shards()
2412{
2413 if (cct->_conf->osd_op_num_shards)
2414 return cct->_conf->osd_op_num_shards;
2415 if (store_is_rotational)
2416 return cct->_conf->osd_op_num_shards_hdd;
2417 else
2418 return cct->_conf->osd_op_num_shards_ssd;
2419}
2420
2421int OSD::get_num_op_threads()
2422{
2423 if (cct->_conf->osd_op_num_threads_per_shard)
2424 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2425 if (store_is_rotational)
2426 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2427 else
2428 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2429}
2430
c07f9fc5
FG
2431float OSD::get_osd_recovery_sleep()
2432{
2433 if (cct->_conf->osd_recovery_sleep)
2434 return cct->_conf->osd_recovery_sleep;
d2e6a577 2435 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 2436 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577
FG
2437 else if (store_is_rotational && !journal_is_rotational)
2438 return cct->_conf->get_val<double>("osd_recovery_sleep_hybrid");
2439 else
2440 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
2441}
2442
7c673cae
FG
2443int OSD::init()
2444{
2445 CompatSet initial, diff;
2446 Mutex::Locker lock(osd_lock);
2447 if (is_stopping())
2448 return 0;
2449
2450 tick_timer.init();
2451 tick_timer_without_osd_lock.init();
2452 service.recovery_request_timer.init();
31f18b77 2453 service.recovery_sleep_timer.init();
7c673cae
FG
2454
2455 // mount.
31f18b77
FG
2456 dout(2) << "init " << dev_path
2457 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2458 << dendl;
d2e6a577 2459 dout(2) << "journal " << journal_path << dendl;
7c673cae
FG
2460 assert(store); // call pre_init() first!
2461
31f18b77 2462 store->set_cache_shards(get_num_op_shards());
7c673cae
FG
2463
2464 int r = store->mount();
2465 if (r < 0) {
2466 derr << "OSD:init: unable to mount object store" << dendl;
2467 return r;
2468 }
d2e6a577
FG
2469 journal_is_rotational = store->is_journal_rotational();
2470 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
2471 << dendl;
7c673cae
FG
2472
2473 enable_disable_fuse(false);
2474
2475 dout(2) << "boot" << dendl;
2476
2477 // initialize the daily loadavg with current 15min loadavg
2478 double loadavgs[3];
2479 if (getloadavg(loadavgs, 3) == 3) {
2480 daily_loadavg = loadavgs[2];
2481 } else {
2482 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
2483 daily_loadavg = 1.0;
2484 }
2485
2486 int rotating_auth_attempts = 0;
2487
2488 // sanity check long object name handling
2489 {
2490 hobject_t l;
2491 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2492 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2493 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2494 r = store->validate_hobject_key(l);
2495 if (r < 0) {
2496 derr << "backend (" << store->get_type() << ") is unable to support max "
2497 << "object name[space] len" << dendl;
2498 derr << " osd max object name len = "
2499 << cct->_conf->osd_max_object_name_len << dendl;
2500 derr << " osd max object namespace len = "
2501 << cct->_conf->osd_max_object_namespace_len << dendl;
2502 derr << cpp_strerror(r) << dendl;
2503 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2504 goto out;
2505 }
2506 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2507 << dendl;
2508 } else {
2509 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
2510 }
2511 }
2512
2513 // read superblock
2514 r = read_superblock();
2515 if (r < 0) {
2516 derr << "OSD::init() : unable to read osd superblock" << dendl;
2517 r = -EINVAL;
2518 goto out;
2519 }
2520
2521 if (osd_compat.compare(superblock.compat_features) < 0) {
2522 derr << "The disk uses features unsupported by the executable." << dendl;
2523 derr << " ondisk features " << superblock.compat_features << dendl;
2524 derr << " daemon features " << osd_compat << dendl;
2525
2526 if (osd_compat.writeable(superblock.compat_features)) {
2527 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2528 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2529 r = -EOPNOTSUPP;
2530 goto out;
2531 }
2532 else {
2533 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2534 derr << "Cannot write to disk! Missing features: " << diff << dendl;
2535 r = -EOPNOTSUPP;
2536 goto out;
2537 }
2538 }
2539
2540 assert_warn(whoami == superblock.whoami);
2541 if (whoami != superblock.whoami) {
2542 derr << "OSD::init: superblock says osd"
2543 << superblock.whoami << " but I am osd." << whoami << dendl;
2544 r = -EINVAL;
2545 goto out;
2546 }
2547
2548 initial = get_osd_initial_compat_set();
2549 diff = superblock.compat_features.unsupported(initial);
2550 if (superblock.compat_features.merge(initial)) {
2551 // We need to persist the new compat_set before we
2552 // do anything else
2553 dout(5) << "Upgrading superblock adding: " << diff << dendl;
2554 ObjectStore::Transaction t;
2555 write_superblock(t);
2556 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2557 if (r < 0)
2558 goto out;
2559 }
2560
2561 // make sure snap mapper object exists
2562 if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2563 dout(10) << "init creating/touching snapmapper object" << dendl;
2564 ObjectStore::Transaction t;
2565 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2566 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2567 if (r < 0)
2568 goto out;
2569 }
2570
2571 class_handler = new ClassHandler(cct);
2572 cls_initialize(class_handler);
2573
2574 if (cct->_conf->osd_open_classes_on_start) {
2575 int r = class_handler->open_all_classes();
2576 if (r)
2577 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
2578 }
2579
2580 // load up "current" osdmap
2581 assert_warn(!osdmap);
2582 if (osdmap) {
2583 derr << "OSD::init: unable to read current osdmap" << dendl;
2584 r = -EINVAL;
2585 goto out;
2586 }
2587 osdmap = get_map(superblock.current_epoch);
2588 check_osdmap_features(store);
2589
2590 create_recoverystate_perf();
2591
2592 {
2593 epoch_t bind_epoch = osdmap->get_epoch();
2594 service.set_epochs(NULL, NULL, &bind_epoch);
2595 }
2596
2597 clear_temp_objects();
2598
d2e6a577
FG
2599 // initialize osdmap references in sharded wq
2600 op_shardedwq.prune_pg_waiters(osdmap, whoami);
2601
7c673cae
FG
2602 // load up pgs (as they previously existed)
2603 load_pgs();
2604
2605 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
2606 dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
2607 op_prio_cutoff << "." << dendl;
2608
2609 create_logger();
2610
2611 // i'm ready!
2612 client_messenger->add_dispatcher_head(this);
2613 cluster_messenger->add_dispatcher_head(this);
2614
2615 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2616 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2617 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2618 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2619
2620 objecter_messenger->add_dispatcher_head(service.objecter);
2621
2622 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
2623 | CEPH_ENTITY_TYPE_MGR);
2624 r = monc->init();
2625 if (r < 0)
2626 goto out;
2627
2628 /**
2629 * FIXME: this is a placeholder implementation that unconditionally
2630 * sends every is_primary PG's stats every time we're called, unlike
2631 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2632 * This has equivalent cost to the existing worst case where all
2633 * PGs are busy and their stats are always enqueued for sending.
2634 */
2635 mgrc.set_pgstats_cb([this](){
2636 RWLock::RLocker l(map_lock);
2637
2638 utime_t had_for = ceph_clock_now() - had_map_since;
2639 osd_stat_t cur_stat = service.get_osd_stat();
2640 cur_stat.os_perf_stat = store->get_cur_stats();
2641
2642 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
2643 m->osd_stat = cur_stat;
2644
2645 Mutex::Locker lec{min_last_epoch_clean_lock};
2646 min_last_epoch_clean = osdmap->get_epoch();
2647 min_last_epoch_clean_pgs.clear();
2648 RWLock::RLocker lpg(pg_map_lock);
2649 for (const auto &i : pg_map) {
2650 PG *pg = i.second;
2651 if (!pg->is_primary()) {
2652 continue;
2653 }
2654
2655 pg->pg_stats_publish_lock.Lock();
2656 if (pg->pg_stats_publish_valid) {
2657 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
2658 const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
2659 min_last_epoch_clean = min(min_last_epoch_clean, lec);
2660 min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
2661 }
2662 pg->pg_stats_publish_lock.Unlock();
2663 }
2664
2665 return m;
2666 });
2667
2668 mgrc.init();
2669 client_messenger->add_dispatcher_head(&mgrc);
2670
2671 // tell monc about log_client so it will know about mon session resets
2672 monc->set_log_client(&log_client);
2673 update_log_config();
2674
31f18b77 2675 peering_tp.start();
28e407b8
AA
2676
2677 service.init();
2678 service.publish_map(osdmap);
2679 service.publish_superblock(superblock);
2680 service.max_oldest_map = superblock.oldest_map;
2681
7c673cae 2682 osd_op_tp.start();
f64942e4
AA
2683 remove_tp.start();
2684 recovery_tp.start();
7c673cae
FG
2685 command_tp.start();
2686
2687 set_disk_tp_priority();
2688
2689 // start the heartbeat
2690 heartbeat_thread.create("osd_srv_heartbt");
2691
2692 // tick
91327a77
AA
2693 tick_timer.add_event_after(get_tick_interval(),
2694 new C_Tick(this));
7c673cae
FG
2695 {
2696 Mutex::Locker l(tick_timer_lock);
91327a77
AA
2697 tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
2698 new C_Tick_WithoutOSDLock(this));
7c673cae
FG
2699 }
2700
7c673cae
FG
2701 osd_lock.Unlock();
2702
2703 r = monc->authenticate();
2704 if (r < 0) {
c07f9fc5
FG
2705 derr << __func__ << " authentication failed: " << cpp_strerror(r)
2706 << dendl;
7c673cae
FG
2707 osd_lock.Lock(); // locker is going to unlock this on function exit
2708 if (is_stopping())
c07f9fc5 2709 r = 0;
7c673cae
FG
2710 goto monout;
2711 }
2712
2713 while (monc->wait_auth_rotating(30.0) < 0) {
2714 derr << "unable to obtain rotating service keys; retrying" << dendl;
2715 ++rotating_auth_attempts;
2716 if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
c07f9fc5 2717 derr << __func__ << " wait_auth_rotating timed out" << dendl;
7c673cae
FG
2718 osd_lock.Lock(); // make locker happy
2719 if (!is_stopping()) {
c07f9fc5 2720 r = -ETIMEDOUT;
7c673cae
FG
2721 }
2722 goto monout;
2723 }
2724 }
2725
2726 r = update_crush_device_class();
2727 if (r < 0) {
d2e6a577
FG
2728 derr << __func__ << " unable to update_crush_device_class: "
2729 << cpp_strerror(r) << dendl;
7c673cae
FG
2730 osd_lock.Lock();
2731 goto monout;
2732 }
2733
2734 r = update_crush_location();
2735 if (r < 0) {
d2e6a577 2736 derr << __func__ << " unable to update_crush_location: "
c07f9fc5 2737 << cpp_strerror(r) << dendl;
7c673cae
FG
2738 osd_lock.Lock();
2739 goto monout;
2740 }
2741
2742 osd_lock.Lock();
2743 if (is_stopping())
2744 return 0;
2745
2746 // start objecter *after* we have authenticated, so that we don't ignore
2747 // the OSDMaps it requests.
2748 service.final_init();
2749
2750 check_config();
2751
2752 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
2753 consume_map();
2754 peering_wq.drain();
2755
2756 dout(0) << "done with init, starting boot process" << dendl;
2757
2758 // subscribe to any pg creations
2759 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
2760
2761 // MgrClient needs this (it doesn't have MonClient reference itself)
2762 monc->sub_want("mgrmap", 0, 0);
2763
2764 // we don't need to ask for an osdmap here; objecter will
2765 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2766
2767 monc->renew_subs();
2768
2769 start_boot();
2770
2771 return 0;
2772monout:
c07f9fc5 2773 exit(1);
7c673cae
FG
2774
2775out:
2776 enable_disable_fuse(true);
2777 store->umount();
2778 delete store;
2779 store = NULL;
2780 return r;
2781}
2782
2783void OSD::final_init()
2784{
2785 AdminSocket *admin_socket = cct->get_admin_socket();
2786 asok_hook = new OSDSocketHook(this);
2787 int r = admin_socket->register_command("status", "status", asok_hook,
2788 "high-level status of OSD");
2789 assert(r == 0);
2790 r = admin_socket->register_command("flush_journal", "flush_journal",
2791 asok_hook,
2792 "flush the journal to permanent store");
2793 assert(r == 0);
2794 r = admin_socket->register_command("dump_ops_in_flight",
c07f9fc5
FG
2795 "dump_ops_in_flight " \
2796 "name=filterstr,type=CephString,n=N,req=false",
2797 asok_hook,
7c673cae
FG
2798 "show the ops currently in flight");
2799 assert(r == 0);
2800 r = admin_socket->register_command("ops",
c07f9fc5
FG
2801 "ops " \
2802 "name=filterstr,type=CephString,n=N,req=false",
2803 asok_hook,
7c673cae
FG
2804 "show the ops currently in flight");
2805 assert(r == 0);
2806 r = admin_socket->register_command("dump_blocked_ops",
c07f9fc5
FG
2807 "dump_blocked_ops " \
2808 "name=filterstr,type=CephString,n=N,req=false",
2809 asok_hook,
7c673cae
FG
2810 "show the blocked ops currently in flight");
2811 assert(r == 0);
c07f9fc5
FG
2812 r = admin_socket->register_command("dump_historic_ops",
2813 "dump_historic_ops " \
2814 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2815 asok_hook,
2816 "show recent ops");
2817 assert(r == 0);
c07f9fc5
FG
2818 r = admin_socket->register_command("dump_historic_slow_ops",
2819 "dump_historic_slow_ops " \
2820 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2821 asok_hook,
2822 "show slowest recent ops");
2823 assert(r == 0);
c07f9fc5
FG
2824 r = admin_socket->register_command("dump_historic_ops_by_duration",
2825 "dump_historic_ops_by_duration " \
2826 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2827 asok_hook,
2828 "show slowest recent ops, sorted by duration");
2829 assert(r == 0);
2830 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2831 asok_hook,
2832 "dump op priority queue state");
2833 assert(r == 0);
2834 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2835 asok_hook,
2836 "dump blacklisted clients and times");
2837 assert(r == 0);
2838 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2839 asok_hook,
2840 "show clients which have active watches,"
2841 " and on which objects");
2842 assert(r == 0);
2843 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2844 asok_hook,
2845 "show recovery reservations");
2846 assert(r == 0);
2847 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2848 asok_hook,
2849 "force osd to update the latest map from "
2850 "the mon");
2851 assert(r == 0);
2852
2853 r = admin_socket->register_command( "heap",
2854 "heap " \
2855 "name=heapcmd,type=CephString",
2856 asok_hook,
2857 "show heap usage info (available only if "
2858 "compiled with tcmalloc)");
2859 assert(r == 0);
2860
2861 r = admin_socket->register_command("set_heap_property",
2862 "set_heap_property " \
2863 "name=property,type=CephString " \
2864 "name=value,type=CephInt",
2865 asok_hook,
2866 "update malloc extension heap property");
2867 assert(r == 0);
2868
2869 r = admin_socket->register_command("get_heap_property",
2870 "get_heap_property " \
2871 "name=property,type=CephString",
2872 asok_hook,
2873 "get malloc extension heap property");
2874 assert(r == 0);
2875
2876 r = admin_socket->register_command("dump_objectstore_kv_stats",
2877 "dump_objectstore_kv_stats",
2878 asok_hook,
2879 "print statistics of kvdb which used by bluestore");
2880 assert(r == 0);
2881
2882 r = admin_socket->register_command("dump_scrubs",
2883 "dump_scrubs",
2884 asok_hook,
2885 "print scheduled scrubs");
2886 assert(r == 0);
2887
2888 r = admin_socket->register_command("calc_objectstore_db_histogram",
2889 "calc_objectstore_db_histogram",
2890 asok_hook,
2891 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2892 assert(r == 0);
2893
2894 r = admin_socket->register_command("flush_store_cache",
2895 "flush_store_cache",
2896 asok_hook,
2897 "Flush bluestore internal cache");
2898 assert(r == 0);
2899 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2900 asok_hook,
2901 "show recent state history");
2902 assert(r == 0);
2903
224ce89b
WB
2904 r = admin_socket->register_command("compact", "compact",
2905 asok_hook,
2906 "Commpact object store's omap."
2907 " WARNING: Compaction probably slows your requests");
2908 assert(r == 0);
2909
7c673cae
FG
2910 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2911 // Note: pools are CephString instead of CephPoolname because
2912 // these commands traditionally support both pool names and numbers
2913 r = admin_socket->register_command(
2914 "setomapval",
2915 "setomapval " \
2916 "name=pool,type=CephString " \
2917 "name=objname,type=CephObjectname " \
2918 "name=key,type=CephString "\
2919 "name=val,type=CephString",
2920 test_ops_hook,
2921 "set omap key");
2922 assert(r == 0);
2923 r = admin_socket->register_command(
2924 "rmomapkey",
2925 "rmomapkey " \
2926 "name=pool,type=CephString " \
2927 "name=objname,type=CephObjectname " \
2928 "name=key,type=CephString",
2929 test_ops_hook,
2930 "remove omap key");
2931 assert(r == 0);
2932 r = admin_socket->register_command(
2933 "setomapheader",
2934 "setomapheader " \
2935 "name=pool,type=CephString " \
2936 "name=objname,type=CephObjectname " \
2937 "name=header,type=CephString",
2938 test_ops_hook,
2939 "set omap header");
2940 assert(r == 0);
2941
2942 r = admin_socket->register_command(
2943 "getomap",
2944 "getomap " \
2945 "name=pool,type=CephString " \
2946 "name=objname,type=CephObjectname",
2947 test_ops_hook,
2948 "output entire object map");
2949 assert(r == 0);
2950
2951 r = admin_socket->register_command(
2952 "truncobj",
2953 "truncobj " \
2954 "name=pool,type=CephString " \
2955 "name=objname,type=CephObjectname " \
2956 "name=len,type=CephInt",
2957 test_ops_hook,
2958 "truncate object to length");
2959 assert(r == 0);
2960
2961 r = admin_socket->register_command(
2962 "injectdataerr",
2963 "injectdataerr " \
2964 "name=pool,type=CephString " \
2965 "name=objname,type=CephObjectname " \
2966 "name=shardid,type=CephInt,req=false,range=0|255",
2967 test_ops_hook,
2968 "inject data error to an object");
2969 assert(r == 0);
2970
2971 r = admin_socket->register_command(
2972 "injectmdataerr",
2973 "injectmdataerr " \
2974 "name=pool,type=CephString " \
2975 "name=objname,type=CephObjectname " \
2976 "name=shardid,type=CephInt,req=false,range=0|255",
2977 test_ops_hook,
2978 "inject metadata error to an object");
2979 assert(r == 0);
2980 r = admin_socket->register_command(
2981 "set_recovery_delay",
2982 "set_recovery_delay " \
2983 "name=utime,type=CephInt,req=false",
2984 test_ops_hook,
2985 "Delay osd recovery by specified seconds");
2986 assert(r == 0);
2987 r = admin_socket->register_command(
2988 "trigger_scrub",
2989 "trigger_scrub " \
a8e16298
TL
2990 "name=pgid,type=CephString " \
2991 "name=time,type=CephInt,req=false",
7c673cae
FG
2992 test_ops_hook,
2993 "Trigger a scheduled scrub ");
2994 assert(r == 0);
a8e16298
TL
2995 r = admin_socket->register_command(
2996 "trigger_deep_scrub",
2997 "trigger_deep_scrub " \
2998 "name=pgid,type=CephString " \
2999 "name=time,type=CephInt,req=false",
3000 test_ops_hook,
3001 "Trigger a scheduled deep scrub ");
3002 ceph_assert(r == 0);
7c673cae
FG
3003 r = admin_socket->register_command(
3004 "injectfull",
3005 "injectfull " \
3006 "name=type,type=CephString,req=false " \
3007 "name=count,type=CephInt,req=false ",
3008 test_ops_hook,
3009 "Inject a full disk (optional count times)");
3010 assert(r == 0);
3011}
3012
3013void OSD::create_logger()
3014{
3015 dout(10) << "create_logger" << dendl;
3016
3017 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
3018
3019 // Latency axis configuration for op histograms, values are in nanoseconds
3020 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
3021 "Latency (usec)",
3022 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
3023 0, ///< Start at 0
3024 100000, ///< Quantization unit is 100usec
3025 32, ///< Enough to cover much longer than slow requests
3026 };
3027
3028 // Op size axis configuration for op histograms, values are in bytes
3029 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
3030 "Request size (bytes)",
3031 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
3032 0, ///< Start at 0
3033 512, ///< Quantization unit is 512 bytes
3034 32, ///< Enough to cover requests larger than GB
3035 };
3036
3037
3efd9988
FG
3038 // All the basic OSD operation stats are to be considered useful
3039 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
3040
7c673cae
FG
3041 osd_plb.add_u64(
3042 l_osd_op_wip, "op_wip",
3043 "Replication operations currently being processed (primary)");
3044 osd_plb.add_u64_counter(
3045 l_osd_op, "op",
3046 "Client operations",
3047 "ops", PerfCountersBuilder::PRIO_CRITICAL);
3048 osd_plb.add_u64_counter(
3049 l_osd_op_inb, "op_in_bytes",
3050 "Client operations total write size",
1adf2230 3051 "wr", PerfCountersBuilder::PRIO_INTERESTING, unit_t(BYTES));
7c673cae
FG
3052 osd_plb.add_u64_counter(
3053 l_osd_op_outb, "op_out_bytes",
3054 "Client operations total read size",
1adf2230 3055 "rd", PerfCountersBuilder::PRIO_INTERESTING, unit_t(BYTES));
7c673cae
FG
3056 osd_plb.add_time_avg(
3057 l_osd_op_lat, "op_latency",
3058 "Latency of client operations (including queue time)",
3059 "l", 9);
3060 osd_plb.add_time_avg(
3061 l_osd_op_process_lat, "op_process_latency",
3062 "Latency of client operations (excluding queue time)");
3063 osd_plb.add_time_avg(
3064 l_osd_op_prepare_lat, "op_prepare_latency",
3065 "Latency of client operations (excluding queue time and wait for finished)");
3066
3067 osd_plb.add_u64_counter(
3068 l_osd_op_r, "op_r", "Client read operations");
3069 osd_plb.add_u64_counter(
1adf2230 3070 l_osd_op_r_outb, "op_r_out_bytes", "Client data read", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
7c673cae
FG
3071 osd_plb.add_time_avg(
3072 l_osd_op_r_lat, "op_r_latency",
3073 "Latency of read operation (including queue time)");
31f18b77 3074 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3075 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
3076 op_hist_x_axis_config, op_hist_y_axis_config,
3077 "Histogram of operation latency (including queue time) + data read");
3078 osd_plb.add_time_avg(
3079 l_osd_op_r_process_lat, "op_r_process_latency",
3080 "Latency of read operation (excluding queue time)");
3081 osd_plb.add_time_avg(
3082 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3083 "Latency of read operations (excluding queue time and wait for finished)");
3084 osd_plb.add_u64_counter(
3085 l_osd_op_w, "op_w", "Client write operations");
3086 osd_plb.add_u64_counter(
3087 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3088 osd_plb.add_time_avg(
3089 l_osd_op_w_lat, "op_w_latency",
3090 "Latency of write operation (including queue time)");
31f18b77 3091 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3092 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3093 op_hist_x_axis_config, op_hist_y_axis_config,
3094 "Histogram of operation latency (including queue time) + data written");
3095 osd_plb.add_time_avg(
3096 l_osd_op_w_process_lat, "op_w_process_latency",
3097 "Latency of write operation (excluding queue time)");
3098 osd_plb.add_time_avg(
3099 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3100 "Latency of write operations (excluding queue time and wait for finished)");
3101 osd_plb.add_u64_counter(
3102 l_osd_op_rw, "op_rw",
3103 "Client read-modify-write operations");
3104 osd_plb.add_u64_counter(
3105 l_osd_op_rw_inb, "op_rw_in_bytes",
1adf2230 3106 "Client read-modify-write operations write in", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
7c673cae
FG
3107 osd_plb.add_u64_counter(
3108 l_osd_op_rw_outb,"op_rw_out_bytes",
1adf2230 3109 "Client read-modify-write operations read out ", NULL, PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
7c673cae
FG
3110 osd_plb.add_time_avg(
3111 l_osd_op_rw_lat, "op_rw_latency",
3112 "Latency of read-modify-write operation (including queue time)");
31f18b77 3113 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3114 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3115 op_hist_x_axis_config, op_hist_y_axis_config,
3116 "Histogram of rw operation latency (including queue time) + data written");
31f18b77 3117 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3118 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3119 op_hist_x_axis_config, op_hist_y_axis_config,
3120 "Histogram of rw operation latency (including queue time) + data read");
3121 osd_plb.add_time_avg(
3122 l_osd_op_rw_process_lat, "op_rw_process_latency",
3123 "Latency of read-modify-write operation (excluding queue time)");
3124 osd_plb.add_time_avg(
3125 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3126 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3127
3efd9988
FG
3128 // Now we move on to some more obscure stats, revert to assuming things
3129 // are low priority unless otherwise specified.
3130 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3131
224ce89b
WB
3132 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3133 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3134 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3135 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3136
7c673cae
FG
3137 osd_plb.add_u64_counter(
3138 l_osd_sop, "subop", "Suboperations");
3139 osd_plb.add_u64_counter(
1adf2230 3140 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size", NULL, 0, unit_t(BYTES));
7c673cae
FG
3141 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3142
3143 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3144 osd_plb.add_u64_counter(
1adf2230 3145 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size", NULL, 0, unit_t(BYTES));
7c673cae
FG
3146 osd_plb.add_time_avg(
3147 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3148 osd_plb.add_u64_counter(
3149 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3150 osd_plb.add_time_avg(
3151 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3152 osd_plb.add_u64_counter(
3153 l_osd_sop_push, "subop_push", "Suboperations push messages");
3154 osd_plb.add_u64_counter(
1adf2230 3155 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size", NULL, 0, unit_t(BYTES));
7c673cae
FG
3156 osd_plb.add_time_avg(
3157 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3158
3159 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3160 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
1adf2230 3161 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size", NULL, 0, unit_t(BYTES));
7c673cae
FG
3162
3163 osd_plb.add_u64_counter(
3164 l_osd_rop, "recovery_ops",
3165 "Started recovery operations",
3166 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3167
3168 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
1adf2230
AA
3169 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size", NULL, 0, unit_t(BYTES));
3170 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes", NULL, 0, unit_t(BYTES));
7c673cae
FG
3171 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3172 osd_plb.add_u64(
3173 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3174 osd_plb.add_u64(
3175 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3176 "Total number getting crc from crc_cache with adjusting");
3177 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3178 "Total number of crc cache misses");
3179
3180 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3181 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3182 osd_plb.add_u64(
3183 l_osd_pg_primary, "numpg_primary",
3184 "Placement groups for which this osd is primary");
3185 osd_plb.add_u64(
3186 l_osd_pg_replica, "numpg_replica",
3187 "Placement groups for which this osd is replica");
3188 osd_plb.add_u64(
3189 l_osd_pg_stray, "numpg_stray",
3190 "Placement groups ready to be deleted from this osd");
94b18763
FG
3191 osd_plb.add_u64(
3192 l_osd_pg_removing, "numpg_removing",
3193 "Placement groups queued for local deletion", "pgsr",
3194 PerfCountersBuilder::PRIO_USEFUL);
7c673cae
FG
3195 osd_plb.add_u64(
3196 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3197 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3198 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3199 osd_plb.add_u64_counter(
3200 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3201 osd_plb.add_u64_counter(
3202 l_osd_waiting_for_map, "messages_delayed_for_map",
3203 "Operations waiting for OSD map");
31f18b77 3204
7c673cae
FG
3205 osd_plb.add_u64_counter(
3206 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3207 osd_plb.add_u64_counter(
3208 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3209 osd_plb.add_u64_counter(
3210 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3211 "osdmap cache miss below cache lower bound");
3212 osd_plb.add_u64_avg(
3213 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3214 "osdmap cache miss, avg distance below cache lower bound");
31f18b77
FG
3215 osd_plb.add_u64_counter(
3216 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3217 "OSDMap buffer cache hits");
3218 osd_plb.add_u64_counter(
3219 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3220 "OSDMap buffer cache misses");
7c673cae 3221
3efd9988
FG
3222 osd_plb.add_u64(
3223 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
1adf2230 3224 PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
3efd9988
FG
3225 osd_plb.add_u64(
3226 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
1adf2230
AA
3227 PerfCountersBuilder::PRIO_USEFUL, unit_t(BYTES));
3228 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space", NULL, 0, unit_t(BYTES));
7c673cae
FG
3229
3230 osd_plb.add_u64_counter(
3231 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3232
3233 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3234 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3235 osd_plb.add_u64_counter(
3236 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3237 osd_plb.add_u64_counter(
3238 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3239 osd_plb.add_u64_counter(
3240 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3241 "Failed tier flush attempts");
3242 osd_plb.add_u64_counter(
3243 l_osd_tier_evict, "tier_evict", "Tier evictions");
3244 osd_plb.add_u64_counter(
3245 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3246 osd_plb.add_u64_counter(
3247 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3248 osd_plb.add_u64_counter(
3249 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3250 osd_plb.add_u64_counter(
3251 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3252 osd_plb.add_u64_counter(
3253 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3254 osd_plb.add_u64_counter(
3255 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3256
3257 osd_plb.add_u64_counter(
3258 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3259 osd_plb.add_u64_counter(
3260 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3261 osd_plb.add_u64_counter(
3262 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3263 osd_plb.add_u64_counter(
3264 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3265
3266 osd_plb.add_u64_counter(
3267 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3268 osd_plb.add_u64_counter(
3269 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3270
3271 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3272 osd_plb.add_time_avg(
3273 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3274 osd_plb.add_time_avg(
3275 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3276 osd_plb.add_time_avg(
3277 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3278
3279 osd_plb.add_u64_counter(
3280 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3281 osd_plb.add_u64_counter(
3282 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3283 "PG updated its info using fastinfo attr");
3284 osd_plb.add_u64_counter(
3285 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3286
3287 logger = osd_plb.create_perf_counters();
3288 cct->get_perfcounters_collection()->add(logger);
3289}
3290
3291void OSD::create_recoverystate_perf()
3292{
3293 dout(10) << "create_recoverystate_perf" << dendl;
3294
3295 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3296
3297 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3298 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3299 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3300 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3301 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3302 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3303 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3304 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3305 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3306 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3307 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3308 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3309 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3310 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3311 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3312 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3313 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3314 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3315 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3316 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3317 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3318 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3319 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3320 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3321 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3322 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3323 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3324 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3325 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3326 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3327 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3328
3329 recoverystate_perf = rs_perf.create_perf_counters();
3330 cct->get_perfcounters_collection()->add(recoverystate_perf);
3331}
3332
3333int OSD::shutdown()
3334{
3335 if (!service.prepare_to_stop())
3336 return 0; // already shutting down
3337 osd_lock.Lock();
3338 if (is_stopping()) {
3339 osd_lock.Unlock();
3340 return 0;
3341 }
3342 derr << "shutdown" << dendl;
3343
3344 set_state(STATE_STOPPING);
3345
3346 // Debugging
3efd9988
FG
3347 if (cct->_conf->get_val<bool>("osd_debug_shutdown")) {
3348 cct->_conf->set_val("debug_osd", "100");
3349 cct->_conf->set_val("debug_journal", "100");
3350 cct->_conf->set_val("debug_filestore", "100");
3351 cct->_conf->set_val("debug_bluestore", "100");
3352 cct->_conf->set_val("debug_ms", "100");
3353 cct->_conf->apply_changes(NULL);
3354 }
7c673cae
FG
3355
3356 // stop MgrClient earlier as it's more like an internal consumer of OSD
3357 mgrc.shutdown();
3358
3359 service.start_shutdown();
3360
3361 // stop sending work to pgs. this just prevents any new work in _process
3362 // from racing with on_shutdown and potentially entering the pg after.
3363 op_shardedwq.drain();
3364
3365 // Shutdown PGs
3366 {
3367 RWLock::RLocker l(pg_map_lock);
3368 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3369 p != pg_map.end();
3370 ++p) {
3371 dout(20) << " kicking pg " << p->first << dendl;
3372 p->second->lock();
3373 p->second->on_shutdown();
3374 p->second->unlock();
3375 p->second->osr->flush();
3376 }
3377 }
3378 clear_pg_stat_queue();
3379
3380 // drain op queue again (in case PGs requeued something)
3381 op_shardedwq.drain();
3382 {
3383 finished.clear(); // zap waiters (bleh, this is messy)
3384 }
3385
3386 op_shardedwq.clear_pg_slots();
3387
3388 // unregister commands
3389 cct->get_admin_socket()->unregister_command("status");
3390 cct->get_admin_socket()->unregister_command("flush_journal");
3391 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3392 cct->get_admin_socket()->unregister_command("ops");
3393 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3394 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3395 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3396 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3397 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3398 cct->get_admin_socket()->unregister_command("dump_blacklist");
3399 cct->get_admin_socket()->unregister_command("dump_watchers");
3400 cct->get_admin_socket()->unregister_command("dump_reservations");
3401 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
224ce89b 3402 cct->get_admin_socket()->unregister_command("heap");
7c673cae
FG
3403 cct->get_admin_socket()->unregister_command("set_heap_property");
3404 cct->get_admin_socket()->unregister_command("get_heap_property");
3405 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
224ce89b 3406 cct->get_admin_socket()->unregister_command("dump_scrubs");
7c673cae
FG
3407 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3408 cct->get_admin_socket()->unregister_command("flush_store_cache");
3409 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
224ce89b 3410 cct->get_admin_socket()->unregister_command("compact");
7c673cae
FG
3411 delete asok_hook;
3412 asok_hook = NULL;
3413
3414 cct->get_admin_socket()->unregister_command("setomapval");
3415 cct->get_admin_socket()->unregister_command("rmomapkey");
3416 cct->get_admin_socket()->unregister_command("setomapheader");
3417 cct->get_admin_socket()->unregister_command("getomap");
3418 cct->get_admin_socket()->unregister_command("truncobj");
3419 cct->get_admin_socket()->unregister_command("injectdataerr");
3420 cct->get_admin_socket()->unregister_command("injectmdataerr");
3421 cct->get_admin_socket()->unregister_command("set_recovery_delay");
224ce89b
WB
3422 cct->get_admin_socket()->unregister_command("trigger_scrub");
3423 cct->get_admin_socket()->unregister_command("injectfull");
7c673cae
FG
3424 delete test_ops_hook;
3425 test_ops_hook = NULL;
3426
3427 osd_lock.Unlock();
3428
3429 heartbeat_lock.Lock();
3430 heartbeat_stop = true;
3431 heartbeat_cond.Signal();
3432 heartbeat_lock.Unlock();
3433 heartbeat_thread.join();
3434
31f18b77 3435 peering_tp.drain();
7c673cae 3436 peering_wq.clear();
31f18b77 3437 peering_tp.stop();
7c673cae
FG
3438 dout(10) << "osd tp stopped" << dendl;
3439
3440 osd_op_tp.drain();
3441 osd_op_tp.stop();
3442 dout(10) << "op sharded tp stopped" << dendl;
3443
3444 command_tp.drain();
3445 command_tp.stop();
3446 dout(10) << "command tp stopped" << dendl;
3447
f64942e4
AA
3448 remove_tp.drain();
3449 remove_tp.stop();
3450 dout(10) << "remove tp paused (new)" << dendl;
3451
3452 recovery_tp.drain();
3453 recovery_tp.stop();
3454 dout(10) << "recovery tp paused (new)" << dendl;
7c673cae
FG
3455
3456 dout(10) << "stopping agent" << dendl;
3457 service.agent_stop();
3458
3459 osd_lock.Lock();
3460
3461 reset_heartbeat_peers();
3462
3463 tick_timer.shutdown();
3464
3465 {
3466 Mutex::Locker l(tick_timer_lock);
3467 tick_timer_without_osd_lock.shutdown();
3468 }
3469
3470 // note unmount epoch
3471 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3472 superblock.mounted = service.get_boot_epoch();
3473 superblock.clean_thru = osdmap->get_epoch();
3474 ObjectStore::Transaction t;
3475 write_superblock(t);
3476 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3477 if (r) {
3478 derr << "OSD::shutdown: error writing superblock: "
3479 << cpp_strerror(r) << dendl;
3480 }
3481
3482
3483 {
3484 Mutex::Locker l(pg_stat_queue_lock);
3485 assert(pg_stat_queue.empty());
3486 }
3487
31f18b77
FG
3488 service.shutdown_reserver();
3489
7c673cae
FG
3490 // Remove PGs
3491#ifdef PG_DEBUG_REFS
3492 service.dump_live_pgids();
3493#endif
3494 {
3495 RWLock::RLocker l(pg_map_lock);
3496 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3497 p != pg_map.end();
3498 ++p) {
3499 dout(20) << " kicking pg " << p->first << dendl;
3500 p->second->lock();
3501 if (p->second->ref != 1) {
3502 derr << "pgid " << p->first << " has ref count of "
3503 << p->second->ref << dendl;
3504#ifdef PG_DEBUG_REFS
3505 p->second->dump_live_ids();
3506#endif
31f18b77
FG
3507 if (cct->_conf->osd_shutdown_pgref_assert) {
3508 ceph_abort();
3509 }
7c673cae
FG
3510 }
3511 p->second->unlock();
3512 p->second->put("PGMap");
3513 }
3514 pg_map.clear();
3515 }
3516#ifdef PG_DEBUG_REFS
3517 service.dump_live_pgids();
3518#endif
f64942e4
AA
3519
3520 osd_lock.Unlock();
7c673cae 3521 cct->_conf->remove_observer(this);
f64942e4 3522 osd_lock.Lock();
7c673cae
FG
3523
3524 dout(10) << "syncing store" << dendl;
3525 enable_disable_fuse(true);
3526
3527 if (cct->_conf->osd_journal_flush_on_shutdown) {
3528 dout(10) << "flushing journal" << dendl;
3529 store->flush_journal();
3530 }
3531
3532 store->umount();
3533 delete store;
3534 store = 0;
3535 dout(10) << "Store synced" << dendl;
3536
3537 monc->shutdown();
3538 osd_lock.Unlock();
3539
3540 osdmap = OSDMapRef();
3541 service.shutdown();
3542 op_tracker.on_shutdown();
3543
3544 class_handler->shutdown();
3545 client_messenger->shutdown();
3546 cluster_messenger->shutdown();
3547 hb_front_client_messenger->shutdown();
3548 hb_back_client_messenger->shutdown();
3549 objecter_messenger->shutdown();
3550 hb_front_server_messenger->shutdown();
3551 hb_back_server_messenger->shutdown();
3552
3553 peering_wq.clear();
3554
3555 return r;
3556}
3557
3558int OSD::mon_cmd_maybe_osd_create(string &cmd)
3559{
3560 bool created = false;
3561 while (true) {
3562 dout(10) << __func__ << " cmd: " << cmd << dendl;
3563 vector<string> vcmd{cmd};
3564 bufferlist inbl;
3565 C_SaferCond w;
3566 string outs;
3567 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3568 int r = w.wait();
3569 if (r < 0) {
3570 if (r == -ENOENT && !created) {
3571 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3572 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3573 vector<string> vnewcmd{newcmd};
3574 bufferlist inbl;
3575 C_SaferCond w;
3576 string outs;
3577 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3578 int r = w.wait();
3579 if (r < 0) {
3580 derr << __func__ << " fail: osd does not exist and created failed: "
3581 << cpp_strerror(r) << dendl;
3582 return r;
3583 }
3584 created = true;
3585 continue;
3586 }
3587 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3588 return r;
3589 }
3590 break;
3591 }
3592
3593 return 0;
3594}
3595
3596int OSD::update_crush_location()
3597{
3598 if (!cct->_conf->osd_crush_update_on_start) {
3599 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3600 return 0;
3601 }
3602
3603 char weight[32];
3604 if (cct->_conf->osd_crush_initial_weight >= 0) {
3605 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3606 } else {
3607 struct store_statfs_t st;
3608 int r = store->statfs(&st);
3609 if (r < 0) {
3610 derr << "statfs: " << cpp_strerror(r) << dendl;
3611 return r;
3612 }
3613 snprintf(weight, sizeof(weight), "%.4lf",
3614 MAX((double).00001,
3615 (double)(st.total) /
3616 (double)(1ull << 40 /* TB */)));
3617 }
3618
3619 std::multimap<string,string> loc = cct->crush_location.get_location();
3620 dout(10) << __func__ << " crush location is " << loc << dendl;
3621
3622 string cmd =
3623 string("{\"prefix\": \"osd crush create-or-move\", ") +
3624 string("\"id\": ") + stringify(whoami) + string(", ") +
3625 string("\"weight\":") + weight + string(", ") +
3626 string("\"args\": [");
3627 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3628 if (p != loc.begin())
3629 cmd += ", ";
3630 cmd += "\"" + p->first + "=" + p->second + "\"";
3631 }
3632 cmd += "]}";
3633
3634 return mon_cmd_maybe_osd_create(cmd);
3635}
3636
3637int OSD::update_crush_device_class()
3638{
224ce89b
WB
3639 if (!cct->_conf->osd_class_update_on_start) {
3640 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
3641 return 0;
3642 }
3643
7c673cae
FG
3644 string device_class;
3645 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
3646 if (r < 0 || device_class.empty()) {
3647 device_class = store->get_default_device_class();
3648 }
3649
3650 if (device_class.empty()) {
d2e6a577 3651 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 3652 return 0;
224ce89b 3653 }
7c673cae
FG
3654
3655 string cmd =
3656 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
3657 string("\"class\": \"") + device_class + string("\", ") +
3658 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 3659
224ce89b 3660 r = mon_cmd_maybe_osd_create(cmd);
d2e6a577
FG
3661 // the above cmd can fail for various reasons, e.g.:
3662 // (1) we are connecting to a pre-luminous monitor
3663 // (2) user manually specify a class other than
3664 // 'ceph-disk prepare --crush-device-class'
3665 // simply skip result-checking for now
3666 return 0;
7c673cae
FG
3667}
3668
3669void OSD::write_superblock(ObjectStore::Transaction& t)
3670{
3671 dout(10) << "write_superblock " << superblock << dendl;
3672
3673 //hack: at minimum it's using the baseline feature set
3674 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3675 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3676
3677 bufferlist bl;
3678 ::encode(superblock, bl);
3679 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3680}
3681
3682int OSD::read_superblock()
3683{
3684 bufferlist bl;
3685 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3686 if (r < 0)
3687 return r;
3688
3689 bufferlist::iterator p = bl.begin();
3690 ::decode(superblock, p);
3691
3692 dout(10) << "read_superblock " << superblock << dendl;
3693
3694 return 0;
3695}
3696
// Scan every PG collection and remove leftover temp objects (including
// pool==-1 objects left behind by Hammer). Temp objects sort first within
// a collection, so the scan can stop at the first non-temp object. Removals
// are batched into transactions of at most osd_target_transaction_size ops.
void OSD::clear_temp_objects()
{
  dout(10) << __func__ << dendl;
  vector<coll_t> ls;
  store->list_collections(ls);
  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
    spg_t pgid;
    // skip non-PG collections (e.g. meta)
    if (!p->is_pg(&pgid))
      continue;

    // list temp objects
    dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;

    vector<ghobject_t> temps;
    ghobject_t next;
    while (1) {
      vector<ghobject_t> objects;
      store->collection_list(*p, next, ghobject_t::get_max(),
			     store->get_ideal_list_max(),
			     &objects, &next);
      if (objects.empty())
	break;
      vector<ghobject_t>::iterator q;
      for (q = objects.begin(); q != objects.end(); ++q) {
	// Hammer set pool for temps to -1, so check for clean-up
	if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
	  temps.push_back(*q);
	} else {
	  break;
	}
      }
      // If we saw a non-temp object and hit the break above we can
      // break out of the while loop too.
      if (q != objects.end())
	break;
    }
    if (!temps.empty()) {
      ObjectStore::Transaction t;
      int removed = 0;
      for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
	dout(20) << " removing " << *p << " object " << *q << dendl;
	t.remove(*p, *q);
	// flush the transaction once it grows past the configured batch size
	if (++removed > cct->_conf->osd_target_transaction_size) {
	  store->apply_transaction(service.meta_osr.get(), std::move(t));
	  t = ObjectStore::Transaction();
	  removed = 0;
	}
      }
      // apply any remaining removals in the final partial batch
      if (removed) {
	store->apply_transaction(service.meta_osr.get(), std::move(t));
      }
    }
  }
}
3751
// Delete every object in collection 'tmp' (removing each from the snap
// mapper as well), then remove the collection itself. Work is batched into
// transactions of at most osd_target_transaction_size removals, applied on
// a private sequencer which is flushed before returning.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  // driver/mapper pair used to clear snap-mapper entries for each object
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  // private sequencer so this cleanup is ordered independently of other IO
  ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
                                      ObjectStore::Sequencer>("rm"));
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  vector<ghobject_t> objects;
  store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
			 INT_MAX, &objects, 0);
  generic_dout(10) << __func__ << " " << objects << dendl;
  // delete them.
  int removed = 0;
  for (vector<ghobject_t>::iterator p = objects.begin();
       p != objects.end();
       ++p, removed++) {
    OSDriver::OSTransaction _t(driver.get_transaction(&t));
    // drop the snap-mapper entry; ENOENT just means it had none
    int r = mapper.remove_oid(p->hobj, &_t);
    if (r != 0 && r != -ENOENT)
      ceph_abort();
    t.remove(tmp, *p);
    // apply in batches to bound transaction size
    if (removed > cct->_conf->osd_target_transaction_size) {
      int r = store->apply_transaction(osr.get(), std::move(t));
      assert(r == 0);
      t = ObjectStore::Transaction();
      removed = 0;
    }
  }
  // final batch also removes the (now empty) collection
  t.remove_collection(tmp);
  int r = store->apply_transaction(osr.get(), std::move(t));
  assert(r == 0);

  // wait for everything queued on the sequencer to commit
  C_SaferCond waiter;
  if (!osr->flush_commit(&waiter)) {
    waiter.wait();
  }
}
3796
3797
3798// ======================================================
3799// PG's
3800
3801PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3802{
3803 if (!createmap->have_pg_pool(id)) {
3804 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3805 << id << dendl;
3806 ceph_abort();
3807 }
3808
3809 PGPool p = PGPool(cct, createmap, id);
3810
3811 dout(10) << "_get_pool " << p.id << dendl;
3812 return p;
3813}
3814
// Construct the in-memory PG for @pgid, register it in pg_map (taking
// a "PGMap" ref on its behalf), and return it with its own lock held.
// Caller must hold osd_lock.
PG *OSD::_open_lock_pg(
  OSDMapRef createmap,
  spg_t pgid, bool no_lockdep_check)
{
  assert(osd_lock.is_locked());

  PG* pg = _make_pg(createmap, pgid);
  {
    RWLock::WLocker l(pg_map_lock);
    // lock the pg before it becomes visible via pg_map
    pg->lock(no_lockdep_check);
    pg_map[pgid] = pg;
    pg->get("PGMap"); // because it's in pg_map
    service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
  }
  return pg;
}
3831
3832PG* OSD::_make_pg(
3833 OSDMapRef createmap,
3834 spg_t pgid)
3835{
3836 dout(10) << "_open_lock_pg " << pgid << dendl;
3837 PGPool pool = _get_pool(pgid.pool(), createmap);
3838
3839 // create
3840 PG *pg;
3841 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3842 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3843 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3844 else
3845 ceph_abort();
3846
3847 return pg;
3848}
3849
3850
// Register a PG that was just produced by splitting an existing PG:
// install it in pg_map, replay any peering events queued while the
// split was pending, and queue a null event so it advances to the
// current epoch.  If the pool has since been deleted, remove the pg
// again immediately.
// NOTE(review): pg_map is modified here without taking pg_map_lock --
// presumably the caller already holds it; confirm at the call site.
void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
{
  epoch_t e(service.get_osdmap()->get_epoch());
  pg->get("PGMap"); // For pg_map
  pg_map[pg->info.pgid] = pg;
  service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());

  dout(10) << "Adding newly split pg " << *pg << dendl;
  pg->handle_loaded(rctx);
  pg->write_if_dirty(*(rctx->transaction));
  // kick the pg at the current epoch
  pg->queue_null(e, e);
  // deliver peering events that were parked waiting for this split
  map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
    peering_wait_for_split.find(pg->info.pgid);
  if (to_wake != peering_wait_for_split.end()) {
    for (list<PG::CephPeeringEvtRef>::iterator i =
	   to_wake->second.begin();
	 i != to_wake->second.end();
	 ++i) {
      pg->queue_peering_event(*i);
    }
    peering_wait_for_split.erase(to_wake);
  }
  // pool may have vanished while the split was in flight
  if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
    _remove_pg(pg);
}
3876
// Check whether @pgid (or one of its ancestors) is currently being
// deleted, and if so try to cancel that deletion so the old PG state
// can be reused rather than recreating the pg from scratch.
//
// Returns:
//  RES_NONE   - nothing to resurrect (or the deletion already finished,
//               which is equally fine: all objects are gone, so no
//               hobject_t overlap is possible);
//  RES_SELF   - deletion of @pgid itself was halted; *resurrected and
//               *old_pg_state describe the state to reuse;
//  RES_PARENT - deletion of an ancestor that splits into @pgid was
//               halted; *resurrected names that ancestor.
OSD::res_result OSD::_try_resurrect_pg(
  OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
{
  assert(resurrected);
  assert(old_pg_state);
  // find nearest ancestor
  DeletingStateRef df;
  spg_t cur(pgid);
  while (true) {
    df = service.deleting_pgs.lookup(cur);
    if (df)
      break;
    if (!cur.ps())
      break;
    cur = cur.get_parent();
  }
  if (!df)
    return RES_NONE; // good to go

  df->old_pg_state->lock();
  OSDMapRef create_map = df->old_pg_state->get_osdmap();
  df->old_pg_state->unlock();

  set<spg_t> children;
  if (cur == pgid) {
    // the deleting pg is exactly the one being created
    if (df->try_stop_deletion()) {
      dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
      *resurrected = cur;
      *old_pg_state = df->old_pg_state;
      service.deleting_pgs.remove(pgid); // PG is no longer being removed!
      return RES_SELF;
    } else {
      // raced, ensure we don't see DeletingStateRef when we try to
      // delete this pg
      service.deleting_pgs.remove(pgid);
      return RES_NONE;
    }
  } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
			  curmap->get_pg_num(cur.pool()),
			  &children) &&
	     children.count(pgid)) {
    // the deleting ancestor splits into @pgid under the current map
    if (df->try_stop_deletion()) {
      dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
	       << dendl;
      *resurrected = cur;
      *old_pg_state = df->old_pg_state;
      service.deleting_pgs.remove(cur); // PG is no longer being removed!
      return RES_PARENT;
    } else {
      /* this is not a problem, failing to cancel proves that all objects
       * have been removed, so no hobject_t overlap is possible
       */
      return RES_NONE;
    }
  }
  return RES_NONE;
}
3934
// Create a brand-new PG: allocate and register it via _open_lock_pg(),
// record any splits between its creation map and the current map, and
// initialize it (on-disk work goes into @t).  Returns the pg with its
// lock held.  Caller must hold osd_lock.  @backfill and the mapping
// arguments are passed straight through to PG::init().
PG *OSD::_create_lock_pg(
  OSDMapRef createmap,
  spg_t pgid,
  bool hold_map_lock,
  bool backfill,
  int role,
  vector<int>& up, int up_primary,
  vector<int>& acting, int acting_primary,
  pg_history_t history,
  const PastIntervals& pi,
  ObjectStore::Transaction& t)
{
  assert(osd_lock.is_locked());
  dout(20) << "_create_lock_pg pgid " << pgid << dendl;

  PG *pg = _open_lock_pg(createmap, pgid, true);

  // track splits between the pg's creation epoch and now
  service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    history,
    pi,
    backfill,
    &t);

  dout(7) << "_create_lock_pg " << *pg << dendl;
  return pg;
}
3968
3969PG *OSD::_lookup_lock_pg(spg_t pgid)
3970{
3971 RWLock::RLocker l(pg_map_lock);
3972
3973 auto pg_map_entry = pg_map.find(pgid);
3974 if (pg_map_entry == pg_map.end())
3975 return nullptr;
3976 PG *pg = pg_map_entry->second;
3977 pg->lock();
3978 return pg;
3979}
3980
31f18b77
FG
// Public wrapper: return the PG for @pgid with its lock held, or
// nullptr if we do not have it.
PG *OSD::lookup_lock_pg(spg_t pgid)
{
  return _lookup_lock_pg(pgid);
}
3985
7c673cae
FG
3986PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3987{
3988 assert(pg_map.count(pgid));
3989 PG *pg = pg_map[pgid];
3990 pg->lock();
3991 return pg;
3992}
3993
// Startup pass: scan the object store and instantiate an in-memory PG
// for every surviving pg collection.  Temp or removal-flagged
// collections are deleted, on-disk state is read (and upgraded when
// necessary), the pg's current mapping/role is computed, and each pg
// gets a handle_loaded event.  Finishes by rebuilding any missing
// past_intervals.  Caller must hold osd_lock; pg_map must be empty.
void OSD::load_pgs()
{
  assert(osd_lock.is_locked());
  dout(0) << "load_pgs" << dendl;
  {
    RWLock::RLocker l(pg_map_lock);
    assert(pg_map.empty());
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  bool has_upgraded = false;

  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // temp collections and pg collections flagged for removal are
    // leftovers: delete them outright
    if (it->is_temp(&pgid) ||
	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    if (pgid.preferred() >= 0) {
      dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
      // FIXME: delete it too, eventually
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    bufferlist bl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
	   << dendl;
      continue;
    }

    PG *pg = NULL;
    if (map_epoch > 0) {
      // open the pg against the map it was last persisted under
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
	if (!osdmap->have_pg_pool(pgid.pool())) {
	  derr << __func__ << ": could not find map for epoch " << map_epoch
	       << " on pg " << pgid << ", but the pool is not present in the "
	       << "current map, so this is probably a result of bug 10617. "
	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
	       << "to clean it up later." << dendl;
	  continue;
	} else {
	  derr << __func__ << ": have pgid " << pgid << " at epoch "
	       << map_epoch << ", but missing map. Crashing."
	       << dendl;
	  assert(0 == "Missing map in load_pgs");
	}
      }
      pg = _open_lock_pg(pgosdmap, pgid);
    } else {
      pg = _open_lock_pg(osdmap, pgid);
    }
    // there can be no waiters here, so we don't call wake_pg_waiters

    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store, bl);

    if (pg->must_upgrade()) {
      if (!pg->can_upgrade()) {
	derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
	     << " an older version first." << dendl;
	assert(0 == "PG too old to upgrade");
      }
      if (!has_upgraded) {
	derr << "PGs are upgrading" << dendl;
	has_upgraded = true;
      }
      dout(10) << "PG " << pg->info.pgid
	       << " must upgrade..." << dendl;
      pg->upgrade(store);
    }

    if (pg->dne()) {
      // on-disk metadata says the pg does not exist: unregister it and
      // delete its collection
      dout(10) << "load_pgs " << *it << " deleting dne" << dendl;
      pg->ch = nullptr;
      service.pg_remove_epoch(pg->pg_id);
      pg->unlock();
      {
	// Delete pg
	RWLock::WLocker l(pg_map_lock);
	auto p = pg_map.find(pg->get_pgid());
	assert(p != pg_map.end() && p->second == pg);
	dout(20) << __func__ << " removed pg " << pg << " from pg_map" << dendl;
	pg_map.erase(p);
	pg->put("PGMap");
      }
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);

    // generate state for PG's current mapping
    int primary, up_primary;
    vector<int> acting, up;
    pg->get_osdmap()->pg_to_up_acting_osds(
      pgid.pgid, &up, &up_primary, &acting, &primary);
    pg->init_primary_up_acting(
      up,
      acting,
      up_primary,
      primary);
    int role = OSDMap::calc_pg_role(whoami, pg->acting);
    if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
      pg->set_role(role);
    else
      pg->set_role(-1);

    pg->reg_next_scrub();

    PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
    pg->handle_loaded(&rctx);

    dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
    if (pg->pg_log.is_dirty()) {
      ObjectStore::Transaction t;
      pg->write_if_dirty(t);
      store->apply_transaction(pg->osr.get(), std::move(t));
    }
    pg->unlock();
  }
  {
    RWLock::RLocker l(pg_map_lock);
    dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
  }

  // clean up old infos object?
  if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
    dout(1) << __func__ << " removing legacy infos object" << dendl;
    ObjectStore::Transaction t;
    t.remove(coll_t::meta(), OSD::make_infos_oid());
    int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
    if (r != 0) {
      derr << __func__ << ": apply_transaction returned "
	   << cpp_strerror(r) << dendl;
      ceph_abort();
    }
  }

  build_past_intervals_parallel();
}
4156
4157
4158/*
4159 * build past_intervals efficiently on old, degraded, and buried
4160 * clusters. this is important for efficiently catching up osds that
4161 * are way behind on maps to the current cluster state.
4162 *
4163 * this is a parallel version of PG::generate_past_intervals().
4164 * follow the same logic, but do all pgs at the same time so that we
4165 * can make a single pass across the osdmap history.
4166 */
void OSD::build_past_intervals_parallel()
{
  // per-pg progress while we sweep the osdmap history once
  struct pistate {
    epoch_t start, end;             // epoch range this pg still needs
    vector<int> old_acting, old_up; // mapping as of the previous epoch
    epoch_t same_interval_since;
    int primary;
    int up_primary;
  };
  map<PG*,pistate> pis;

  // calculate junction of map range
  epoch_t end_epoch = superblock.oldest_map;
  epoch_t cur_epoch = superblock.newest_map;
  {
    RWLock::RLocker l(pg_map_lock);
    for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
	 i != pg_map.end();
	 ++i) {
      PG *pg = i->second;

      // Ignore PGs only partially created (DNE)
      if (pg->info.dne()) {
	continue;
      }

      auto rpib = pg->get_required_past_interval_bounds(
	pg->info,
	superblock.oldest_map);
      if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
	// nothing required for this pg
	if (pg->info.history.same_interval_since == 0) {
	  pg->info.history.same_interval_since = rpib.second;
	}
	continue;
      } else {
	auto apib = pg->past_intervals.get_bounds();
	if (apib.second >= rpib.second &&
	    apib.first <= rpib.first) {
	  // existing intervals already cover the required range
	  if (pg->info.history.same_interval_since == 0) {
	    pg->info.history.same_interval_since = rpib.second;
	  }
	  continue;
	}
      }

      dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
	       << rpib.second << dendl;
      pistate& p = pis[pg];
      p.start = rpib.first;
      p.end = rpib.second;
      p.same_interval_since = 0;

      // widen the sweep to cover this pg's needs
      if (rpib.first < cur_epoch)
	cur_epoch = rpib.first;
      if (rpib.second > end_epoch)
	end_epoch = rpib.second;
    }
  }
  if (pis.empty()) {
    dout(10) << __func__ << " nothing to build" << dendl;
    return;
  }

  dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
  assert(cur_epoch <= end_epoch);

  // single pass over the map history, advancing every pg in lockstep
  OSDMapRef cur_map, last_map;
  for ( ; cur_epoch <= end_epoch; cur_epoch++) {
    dout(10) << __func__ << " epoch " << cur_epoch << dendl;
    last_map = cur_map;
    cur_map = get_map(cur_epoch);

    for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
      PG *pg = i->first;
      pistate& p = i->second;

      if (cur_epoch < p.start || cur_epoch > p.end)
	continue;

      vector<int> acting, up;
      int up_primary;
      int primary;
      pg_t pgid = pg->info.pgid.pgid;
      // if the pg did not exist yet in the previous map, use its
      // pre-split ancestor's id for the interval check
      if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
	pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
      cur_map->pg_to_up_acting_osds(
	pgid, &up, &up_primary, &acting, &primary);

      if (p.same_interval_since == 0) {
	// first epoch we process for this pg: just record the mapping
	dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
		 << " first map, acting " << acting
		 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
	p.same_interval_since = cur_epoch;
	p.old_up = up;
	p.old_acting = acting;
	p.primary = primary;
	p.up_primary = up_primary;
	continue;
      }
      assert(last_map);

      boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
	pg->get_is_recoverable_predicate());
      std::stringstream debug;
      bool new_interval = PastIntervals::check_new_interval(
	p.primary,
	primary,
	p.old_acting, acting,
	p.up_primary,
	up_primary,
	p.old_up, up,
	p.same_interval_since,
	pg->info.history.last_epoch_clean,
	cur_map, last_map,
	pgid,
	recoverable.get(),
	&pg->past_intervals,
	&debug);
      if (new_interval) {
	dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
		 << " " << debug.str() << dendl;
	p.old_up = up;
	p.old_acting = acting;
	p.primary = primary;
	p.up_primary = up_primary;
	p.same_interval_since = cur_epoch;
      }
    }
  }

  // Now that past_intervals have been recomputed let's fix the same_interval_since
  // if it was cleared by import.
  for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
    PG *pg = i->first;
    pistate& p = i->second;

    if (pg->info.history.same_interval_since == 0) {
      assert(p.same_interval_since);
      dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
      dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
      // Fix it
      pg->info.history.same_interval_since = p.same_interval_since;
    }
  }

  // write info only at the end. this is necessary because we check
  // whether the past_intervals go far enough back or forward in time,
  // but we don't check for holes. we could avoid it by discarding
  // the previous past_intervals and rebuilding from scratch, or we
  // can just do this and commit all our work at the end.
  ObjectStore::Transaction t;
  int num = 0;
  for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
    PG *pg = i->first;
    pg->lock();
    pg->dirty_big_info = true;
    pg->dirty_info = true;
    pg->write_if_dirty(t);
    pg->unlock();

    // don't let the transaction get too big
    if (++num >= cct->_conf->osd_target_transaction_size) {
      store->apply_transaction(service.meta_osr.get(), std::move(t));
      t = ObjectStore::Transaction();
      num = 0;
    }
  }
  if (!t.empty())
    store->apply_transaction(service.meta_osr.get(), std::move(t));
}
4337
4338/*
4339 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4340 * hasn't changed since the given epoch and we are the primary.
4341 */
// Deliver a peering event to the pg @pgid, creating the pg first if we
// do not have it: either brand-new (RES_NONE), resurrected from a
// halted deletion of the same pg (RES_SELF), or via a resurrected
// ancestor that will split into it (RES_PARENT).
// Returns 0 when a pg was created and the event queued, -EEXIST when
// the pg already existed (or the event was parked pending a split),
// -EINVAL when the pool/mapping no longer matches, and -EAGAIN when
// creation was withheld by the pg-per-osd limit.
int OSD::handle_pg_peering_evt(
  spg_t pgid,
  const pg_history_t& orig_history,
  const PastIntervals& pi,
  epoch_t epoch,
  PG::CephPeeringEvtRef evt)
{
  // pg is mid-split: park the event until the split child appears
  if (service.splitting(pgid)) {
    peering_wait_for_split[pgid].push_back(evt);
    return -EEXIST;
  }

  PG *pg = _lookup_lock_pg(pgid);
  if (!pg) {
    // same primary?
    if (!osdmap->have_pg_pool(pgid.pool()))
      return -EINVAL;
    int up_primary, acting_primary;
    vector<int> up, acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &up, &up_primary, &acting, &acting_primary);

    pg_history_t history = orig_history;
    bool valid_history = project_pg_history(
      pgid, history, epoch, up, up_primary, acting, acting_primary);

    // stale message: the interval changed after it was sent
    if (!valid_history || epoch < history.same_interval_since) {
      dout(10) << __func__ << pgid << " acting changed in "
	       << history.same_interval_since << " (msg from " << epoch << ")"
	       << dendl;
      return -EINVAL;
    }

    if (service.splitting(pgid)) {
      ceph_abort();
    }

    // a NullEvt here means the create came from the monitor
    const bool is_mon_create =
      evt->get_event().dynamic_type() == PG::NullEvt::static_type();
    if (maybe_wait_for_max_pg(pgid, is_mon_create)) {
      return -EAGAIN;
    }

    // do we need to resurrect a deleting pg?
    spg_t resurrected;
    PGRef old_pg_state;
    res_result result = _try_resurrect_pg(
      service.get_osdmap(),
      pgid,
      &resurrected,
      &old_pg_state);

    PG::RecoveryCtx rctx = create_context();
    switch (result) {
    case RES_NONE: {
      // plain new pg
      const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
      if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
	  store->get_type() != "bluestore") {
	clog->warn() << "pg " << pgid
		     << " is at risk of silent data corruption: "
		     << "the pool allows ec overwrites but is not stored in "
		     << "bluestore, so deep scrubbing will not detect bitrot";
      }
      PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
      PG::_init(*rctx.transaction, pgid, pp);

      int role = osdmap->calc_pg_role(whoami, acting, acting.size());
      if (!pp->is_replicated() && role != pgid.shard)
	role = -1;

      pg = _create_lock_pg(
	get_map(epoch),
	pgid, false, false,
	role,
	up, up_primary,
	acting, acting_primary,
	history, pi,
	*rctx.transaction);
      pg->handle_create(&rctx);
      pg->write_if_dirty(*rctx.transaction);
      dispatch_context(rctx, pg, osdmap);

      dout(10) << *pg << " is new" << dendl;

      pg->queue_peering_event(evt);
      wake_pg_waiters(pg);
      pg->unlock();
      return 0;
    }
    case RES_SELF: {
      // reuse the old pg's state; created as a backfill pg
      old_pg_state->lock();
      OSDMapRef old_osd_map = old_pg_state->get_osdmap();
      int old_role = old_pg_state->role;
      vector<int> old_up = old_pg_state->up;
      int old_up_primary = old_pg_state->up_primary.osd;
      vector<int> old_acting = old_pg_state->acting;
      int old_primary = old_pg_state->primary.osd;
      pg_history_t old_history = old_pg_state->info.history;
      PastIntervals old_past_intervals = old_pg_state->past_intervals;
      old_pg_state->unlock();
      pg = _create_lock_pg(
	old_osd_map,
	resurrected,
	false,
	true,
	old_role,
	old_up,
	old_up_primary,
	old_acting,
	old_primary,
	old_history,
	old_past_intervals,
	*rctx.transaction);
      pg->handle_create(&rctx);
      pg->write_if_dirty(*rctx.transaction);
      dispatch_context(rctx, pg, osdmap);

      dout(10) << *pg << " is new (resurrected)" << dendl;

      pg->queue_peering_event(evt);
      wake_pg_waiters(pg);
      pg->unlock();
      return 0;
    }
    case RES_PARENT: {
      // resurrect the ancestor; our event waits for the split to finish
      assert(old_pg_state);
      old_pg_state->lock();
      OSDMapRef old_osd_map = old_pg_state->get_osdmap();
      int old_role = old_pg_state->role;
      vector<int> old_up = old_pg_state->up;
      int old_up_primary = old_pg_state->up_primary.osd;
      vector<int> old_acting = old_pg_state->acting;
      int old_primary = old_pg_state->primary.osd;
      pg_history_t old_history = old_pg_state->info.history;
      PastIntervals old_past_intervals = old_pg_state->past_intervals;
      old_pg_state->unlock();
      PG *parent = _create_lock_pg(
	old_osd_map,
	resurrected,
	false,
	true,
	old_role,
	old_up,
	old_up_primary,
	old_acting,
	old_primary,
	old_history,
	old_past_intervals,
	*rctx.transaction
	);
      parent->handle_create(&rctx);
      parent->write_if_dirty(*rctx.transaction);
      dispatch_context(rctx, parent, osdmap);

      dout(10) << *parent << " is new" << dendl;

      assert(service.splitting(pgid));
      peering_wait_for_split[pgid].push_back(evt);

      //parent->queue_peering_event(evt);
      parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
      wake_pg_waiters(parent);
      parent->unlock();
      return 0;
    }
    default:
      assert(0);
      return 0;
    }
  } else {
    // already had it. did the mapping change?
    if (epoch < pg->info.history.same_interval_since) {
      dout(10) << *pg << __func__ << " acting changed in "
	       << pg->info.history.same_interval_since
	       << " (msg from " << epoch << ")" << dendl;
    } else {
      pg->queue_peering_event(evt);
    }
    pg->unlock();
    return -EEXIST;
  }
}
4523
3efd9988
FG
// Return true (and remember the pending creation) if instantiating
// @pgid would push this OSD past its hard pg limit
// (mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio); the creation
// is retried later by resume_creating_pg().  Returns false when the
// pg may be created now.
bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
{
  const auto max_pgs_per_osd =
    (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
     cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));

  RWLock::RLocker pg_map_locker{pg_map_lock};
  if (pg_map.size() < max_pgs_per_osd) {
    return false;
  }
  lock_guard<mutex> pending_creates_locker{pending_creates_lock};
  if (is_mon_create) {
    // mon-driven creates are just counted; they are resolicited later
    pending_creates_from_mon++;
  } else {
    // peer-driven creates are remembered individually so peering can be
    // re-triggered for them later (see resume_creating_pg/twiddle)
    bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
    pending_creates_from_osd.emplace(pgid.pgid, is_primary);
  }
  dout(1) << __func__ << " withhold creation of pg " << pgid
	  << ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
  return true;
}
4545
4546// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4547// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4548// to up set if pg_temp is empty. so an empty pg_temp won't work.
4549static vector<int32_t> twiddle(const vector<int>& acting) {
4550 if (acting.size() > 1) {
4551 return {acting[0]};
4552 } else {
4553 vector<int32_t> twiddled(acting.begin(), acting.end());
4554 twiddled.push_back(-1);
4555 return twiddled;
4556 }
4557}
4558
// Retry pg creations that maybe_wait_for_max_pg() put on hold, now
// that there may again be room under the pg-per-osd limit.  Mon-driven
// creates are resolicited via the "osd_pg_creates" subscription;
// peer-driven creates are re-triggered by sending a twiddled pg_temp.
// Mon subscriptions are renewed as needed.
void OSD::resume_creating_pg()
{
  bool do_sub_pg_creates = false;
  bool have_pending_creates = false;
  {
    const auto max_pgs_per_osd =
      (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
       cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
    RWLock::RLocker l(pg_map_lock);
    if (max_pgs_per_osd <= pg_map.size()) {
      // this could happen if admin decreases this setting before a PG is removed
      return;
    }
    unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
    lock_guard<mutex> pending_creates_locker{pending_creates_lock};
    if (pending_creates_from_mon > 0) {
      // the mon's creates get first claim on the spare slots
      do_sub_pg_creates = true;
      if (pending_creates_from_mon >= spare_pgs) {
	spare_pgs = pending_creates_from_mon = 0;
      } else {
	spare_pgs -= pending_creates_from_mon;
	pending_creates_from_mon = 0;
      }
    }
    // use the remaining slots for peer-driven creates
    auto pg = pending_creates_from_osd.cbegin();
    while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
      dout(20) << __func__ << " pg " << pg->first << dendl;
      vector<int> acting;
      osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
      // a twiddled pg_temp forces the pg to restart peering, which
      // redelivers the create to us (see twiddle() above)
      service.queue_want_pg_temp(pg->first, twiddle(acting), true);
      pg = pending_creates_from_osd.erase(pg);
      do_sub_pg_creates = true;
      spare_pgs--;
    }
    have_pending_creates = (pending_creates_from_mon > 0 ||
			    !pending_creates_from_osd.empty());
  }

  bool do_renew_subs = false;
  if (do_sub_pg_creates) {
    if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
      dout(4) << __func__ << ": resolicit pg creates from mon since "
	      << last_pg_create_epoch << dendl;
      do_renew_subs = true;
    }
  }
  version_t start = osdmap->get_epoch() + 1;
  if (have_pending_creates) {
    // don't miss any new osdmap deleting PGs
    if (monc->sub_want("osdmap", start, 0)) {
      dout(4) << __func__ << ": resolicit osdmap from mon since "
	      << start << dendl;
      do_renew_subs = true;
    }
  } else if (do_sub_pg_creates) {
    // no need to subscribe the osdmap continuously anymore
    // once the pgtemp and/or mon_subscribe(pg_creates) is sent
    if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
      dout(4) << __func__ << ": re-subscribe osdmap(onetime) since"
	      << start << dendl;
      do_renew_subs = true;
    }
  }

  if (do_renew_subs) {
    monc->renew_subs();
  }

  service.send_pg_temp();
}
7c673cae
FG
4629
// Compute the initial pg_history_t (*h) and PastIntervals (*pi) for a
// pg created at epoch @created: seed the history fields from the
// creation epoch/stamp, then walk every osdmap from creation to the
// present, recording interval, up-set, primary and split changes along
// the way.
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  h->epoch_created = created;
  h->epoch_pool_created = created;
  h->same_interval_since = created;
  h->same_up_since = created;
  h->same_primary_since = created;
  h->last_scrub_stamp = created_stamp;
  h->last_deep_scrub_stamp = created_stamp;
  h->last_clean_scrub_stamp = created_stamp;

  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
    // note: shadows the OSD::osdmap member for the rest of the loop body
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
	return have.size() >= pi->min_size;
      }
      min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap,
      lastmap,
      pgid.pgid,
      &min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      h->same_interval_since = e;
      if (up != new_up) {
	h->same_up_since = e;
      }
      if (acting_primary != new_acting_primary) {
	h->same_primary_since = e;
      }
      if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
			     osdmap->get_pg_num(pgid.pgid.pool()),
			     nullptr)) {
	h->last_epoch_split = e;
      }
      // carry the new mapping forward as the comparison baseline
      up = new_up;
      acting = new_acting;
      up_primary = new_up_primary;
      acting_primary = new_acting_primary;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
	   << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
		       pi->get_bounds()) << ")"
	   << dendl;
}
4711
/**
 * Fill in the passed history so you know same_interval_since,
 * same_up_since, and same_primary_since for the mapping described by
 * the current* arguments, by walking the osdmap history backwards from
 * the current epoch down to @p from.
 *
 * Returns false if an intermediate map is no longer available (a map
 * gap), true on success.
 */
bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
			     const vector<int>& currentup,
			     int currentupprimary,
			     const vector<int>& currentacting,
			     int currentactingprimary)
{
  dout(15) << "project_pg_history " << pgid
           << " from " << from << " to " << osdmap->get_epoch()
           << ", start " << h
	   << dendl;

  epoch_t e;
  for (e = osdmap->get_epoch();
       e > from;
       e--) {
    // verify during intermediate epoch (e-1)
    OSDMapRef oldmap = service.try_get_map(e-1);
    if (!oldmap) {
      dout(15) << __func__ << ": found map gap, returning false" << dendl;
      return false;
    }
    assert(oldmap->have_pg_pool(pgid.pool()));

    int upprimary, actingprimary;
    vector<int> up, acting;
    oldmap->pg_to_up_acting_osds(
      pgid.pgid,
      &up,
      &upprimary,
      &acting,
      &actingprimary);

    // acting set change?
    if ((actingprimary != currentactingprimary ||
	 upprimary != currentupprimary ||
	 acting != currentacting ||
	 up != currentup) && e > h.same_interval_since) {
      dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
	       << " from " << acting << "/" << up
	       << " " << actingprimary << "/" << upprimary
	       << " -> " << currentacting << "/" << currentup
	       << " " << currentactingprimary << "/" << currentupprimary
	       << dendl;
      h.same_interval_since = e;
    }
    // split?
    if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
		      osdmap->get_pg_num(pgid.pool()),
		      0) && e > h.same_interval_since) {
      h.same_interval_since = e;
    }
    // up set change?
    if ((up != currentup || upprimary != currentupprimary)
	&& e > h.same_up_since) {
      dout(15) << "project_pg_history " << pgid << " up changed in " << e
	       << " from " << up << " " << upprimary
	       << " -> " << currentup << " " << currentupprimary << dendl;
      h.same_up_since = e;
    }

    // primary change?
    if (OSDMap::primary_changed(
	  actingprimary,
	  acting,
	  currentactingprimary,
	  currentacting) &&
        e > h.same_primary_since) {
      dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
      h.same_primary_since = e;
    }

    // once all three floors are at or above this epoch, no earlier map
    // can change the answer
    if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
      break;
  }

  // base case: these floors should be the pg creation epoch if we didn't
  // find any changes.
  if (e == h.epoch_created) {
    if (!h.same_interval_since)
      h.same_interval_since = e;
    if (!h.same_up_since)
      h.same_up_since = e;
    if (!h.same_primary_since)
      h.same_primary_since = e;
  }

  dout(15) << "project_pg_history end " << h << dendl;
  return true;
}
4805
4806
4807
// Ensure osd.p is in heartbeat_peers, creating back (and optionally front)
// heartbeat connections for it if it is not yet tracked.  Refreshes the
// entry's epoch either way so the peer survives the next staleness sweep.
// Caller must hold heartbeat_lock (all mutators of heartbeat_peers do).
void OSD::_add_heartbeat_peer(int p)
{
  if (p == whoami)
    return;  // never heartbeat ourselves
  HeartbeatInfo *hi;

  map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
  if (i == heartbeat_peers.end()) {
    // not yet a peer: open hb connections at our current map epoch
    pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
    if (!cons.first)
      return;  // peer has no usable hb address in this map; skip
    hi = &heartbeat_peers[p];
    hi->peer = p;
    // one shared session object; each con->set_priv(s->get()) takes a ref
    HeartbeatSession *s = new HeartbeatSession(p);
    hi->con_back = cons.first.get();
    hi->con_back->set_priv(s->get());
    if (cons.second) {
      hi->con_front = cons.second.get();
      hi->con_front->set_priv(s->get());
      dout(10) << "_add_heartbeat_peer: new peer osd." << p
               << " " << hi->con_back->get_peer_addr()
               << " " << hi->con_front->get_peer_addr()
               << dendl;
    } else {
      // no separate front (public) network connection for this peer
      hi->con_front.reset(NULL);
      dout(10) << "_add_heartbeat_peer: new peer osd." << p
               << " " << hi->con_back->get_peer_addr()
               << dendl;
    }
    s->put();  // drop our creation ref; cons now own the session
  } else {
    hi = &i->second;
  }
  // mark the peer as wanted at the current epoch (see staleness check in
  // maybe_update_heartbeat_peers)
  hi->epoch = osdmap->get_epoch();
}
4843
4844void OSD::_remove_heartbeat_peer(int n)
4845{
4846 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4847 assert(q != heartbeat_peers.end());
4848 dout(20) << " removing heartbeat peer osd." << n
4849 << " " << q->second.con_back->get_peer_addr()
4850 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4851 << dendl;
4852 q->second.con_back->mark_down();
4853 if (q->second.con_front) {
4854 q->second.con_front->mark_down();
4855 }
4856 heartbeat_peers.erase(q);
4857}
4858
4859void OSD::need_heartbeat_peer_update()
4860{
4861 if (is_stopping())
4862 return;
4863 dout(20) << "need_heartbeat_peer_update" << dendl;
4864 heartbeat_set_peers_need_update();
4865}
4866
// Recompute the heartbeat peer set when it has been flagged dirty (or force
// a resample while waiting_for_healthy).  Peers come from each PG's
// heartbeat/probe sets, plus our ring neighbors, topped up to
// osd_heartbeat_min_peers and trimmed back down to it.
// Caller must hold osd_lock; heartbeat_lock is taken internally.
void OSD::maybe_update_heartbeat_peers()
{
  assert(osd_lock.is_locked());

  if (is_waiting_for_healthy()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      // first pass in this state: start the resample clock and force an update
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
        // still unhealthy after a full grace period: drop everyone and
        // resample so we aren't stuck probing a bad set
        dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
        heartbeat_set_peers_need_update();
        last_heartbeat_resample = now;
        reset_heartbeat_peers();   // we want *new* peers!
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  heartbeat_clear_peers_need_update();

  Mutex::Locker l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    RWLock::RLocker l(pg_map_lock);
    for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
         i != pg_map.end();
         ++i) {
      PG *pg = i->second;
      pg->heartbeat_peer_lock.Lock();
      dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
      for (set<int>::iterator p = pg->heartbeat_peers.begin();
           p != pg->heartbeat_peers.end();
           ++p)
        if (osdmap->is_up(*p))
          _add_heartbeat_peer(*p);
      for (set<int>::iterator p = pg->probe_targets.begin();
           p != pg->probe_targets.end();
           ++p)
        if (osdmap->is_up(*p))
          _add_heartbeat_peer(*p);
      pg->heartbeat_peer_lock.Unlock();
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  int next = osdmap->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = osdmap->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!osdmap->is_up(p->first)) {
      int o = p->first;
      ++p;  // advance before erase: _remove_heartbeat_peer invalidates the iterator
      _remove_heartbeat_peer(o);
      continue;
    }
    if (p->second.epoch < osdmap->get_epoch()) {
      // not re-added this epoch -> candidate for trimming below
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  int start = osdmap->get_next_up_osd_after(whoami);
  for (int n = start; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = osdmap->get_next_up_osd_after(n);
    if (n == start)
      break;  // came full circle; stop
  }

  // too many?
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;  // never trim ring neighbors
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
}
4975
4976void OSD::reset_heartbeat_peers()
4977{
4978 assert(osd_lock.is_locked());
4979 dout(10) << "reset_heartbeat_peers" << dendl;
4980 Mutex::Locker l(heartbeat_lock);
4981 while (!heartbeat_peers.empty()) {
4982 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4983 hi.con_back->mark_down();
4984 if (hi.con_front) {
4985 hi.con_front->mark_down();
4986 }
4987 heartbeat_peers.erase(heartbeat_peers.begin());
4988 }
4989 failure_queue.clear();
4990}
4991
// Handle an incoming heartbeat message (PING, PING_REPLY, or YOU_DIED).
// Runs with heartbeat_lock held for the whole dispatch; every exit path
// must unlock and m->put() (the early returns above the switch do both).
void OSD::handle_osd_ping(MOSDPing *m)
{
  if (superblock.cluster_fsid != m->fsid) {
    // ping from a different cluster; ignore
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
             << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.Lock();
  if (is_stopping()) {
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  OSDMapRef curmap = service.get_osdmap();
  if (!curmap) {
    // no map yet (early startup); nothing sensible to reply with
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  switch (m->op) {

  case MOSDPing::PING:
    {
      // debug hook: probabilistically drop inbound pings for a configured
      // duration to exercise failure detection
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
        auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
        if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
          if (heartbeat_drop->second == 0) {
            debug_heartbeat_drops_remaining.erase(heartbeat_drop);
          } else {
            --heartbeat_drop->second;
            dout(5) << "Dropping heartbeat from " << from
                    << ", " << heartbeat_drop->second
                    << " remaining to drop" << dendl;
            break;
          }
        } else if (cct->_conf->osd_debug_drop_ping_probability >
                   ((((double)(rand()%100))/100.0))) {
          heartbeat_drop =
            debug_heartbeat_drops_remaining.insert(std::make_pair(from,
                            cct->_conf->osd_debug_drop_ping_duration)).first;
          dout(5) << "Dropping heartbeat from " << from
                  << ", " << heartbeat_drop->second
                  << " remaining to drop" << dendl;
          break;
        }
      }

      // don't answer if our own internal threads are stuck: an unanswered
      // ping is how peers detect that we are unhealthy
      if (!cct->get_heartbeat_map()->is_healthy()) {
        dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
        break;
      }

      // echo the sender's stamp back so it can compute round-trip liveness
      Message *r = new MOSDPing(monc->get_fsid(),
                                curmap->get_epoch(),
                                MOSDPing::PING_REPLY, m->stamp,
                                cct->_conf->osd_heartbeat_min_size);
      m->get_connection()->send_message(r);

      if (curmap->is_up(from)) {
        service.note_peer_epoch(from, m->map_epoch);
        if (is_active()) {
          ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
          if (con) {
            service.share_map_peer(from, con.get());
          }
        }
      } else if (!curmap->exists(from) ||
                 curmap->get_down_at(from) > m->map_epoch) {
        // tell them they have died
        Message *r = new MOSDPing(monc->get_fsid(),
                                  curmap->get_epoch(),
                                  MOSDPing::YOU_DIED,
                                  m->stamp,
                                  cct->_conf->osd_heartbeat_min_size);
        m->get_connection()->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
        // record the stamp against whichever connection carried the reply
        if (m->get_connection() == i->second.con_back) {
          dout(25) << "handle_osd_ping got reply from osd." << from
                   << " first_tx " << i->second.first_tx
                   << " last_tx " << i->second.last_tx
                   << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
                   << " last_rx_front " << i->second.last_rx_front
                   << dendl;
          i->second.last_rx_back = m->stamp;
          // if there is no front con, set both stamps.
          if (i->second.con_front == NULL)
            i->second.last_rx_front = m->stamp;
        } else if (m->get_connection() == i->second.con_front) {
          dout(25) << "handle_osd_ping got reply from osd." << from
                   << " first_tx " << i->second.first_tx
                   << " last_tx " << i->second.last_tx
                   << " last_rx_back " << i->second.last_rx_back
                   << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
                   << dendl;
          i->second.last_rx_front = m->stamp;
        }

        // the peer responded recently enough: retract any failure report
        // we queued or already sent for it
        utime_t cutoff = ceph_clock_now();
        cutoff -= cct->_conf->osd_heartbeat_grace;
        if (i->second.is_healthy(cutoff)) {
          // Cancel false reports
          auto failure_queue_entry = failure_queue.find(from);
          if (failure_queue_entry != failure_queue.end()) {
            dout(10) << "handle_osd_ping canceling queued "
                     << "failure report for osd." << from << dendl;
            failure_queue.erase(failure_queue_entry);
          }

          auto failure_pending_entry = failure_pending.find(from);
          if (failure_pending_entry != failure_pending.end()) {
            dout(10) << "handle_osd_ping canceling in-flight "
                     << "failure report for osd." << from << dendl;
            send_still_alive(curmap->get_epoch(),
                             failure_pending_entry->second.second);
            failure_pending.erase(failure_pending_entry);
          }
        }
      }

      if (m->map_epoch &&
          curmap->is_up(from)) {
        service.note_peer_epoch(from, m->map_epoch);
        if (is_active()) {
          ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
          if (con) {
            service.share_map_peer(from, con.get());
          }
        }
      }
    }
    break;

  case MOSDPing::YOU_DIED:
    // a peer believes we are marked down; fetch newer maps to find out
    dout(10) << "handle_osd_ping " << m->get_source_inst()
             << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.Unlock();
  m->put();
}
5147
5148void OSD::heartbeat_entry()
5149{
5150 Mutex::Locker l(heartbeat_lock);
5151 if (is_stopping())
5152 return;
5153 while (!heartbeat_stop) {
5154 heartbeat();
5155
5156 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5157 utime_t w;
5158 w.set_from_double(wait);
5159 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
5160 heartbeat_cond.WaitInterval(heartbeat_lock, w);
5161 if (is_stopping())
5162 return;
5163 dout(30) << "heartbeat_entry woke up" << dendl;
5164 }
5165}
5166
5167void OSD::heartbeat_check()
5168{
5169 assert(heartbeat_lock.is_locked());
5170 utime_t now = ceph_clock_now();
5171
5172 // check for heartbeat replies (move me elsewhere?)
5173 utime_t cutoff = now;
5174 cutoff -= cct->_conf->osd_heartbeat_grace;
5175 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5176 p != heartbeat_peers.end();
5177 ++p) {
5178
5179 if (p->second.first_tx == utime_t()) {
5180 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5181 << "yet, skipping" << dendl;
5182 continue;
5183 }
5184
5185 dout(25) << "heartbeat_check osd." << p->first
5186 << " first_tx " << p->second.first_tx
5187 << " last_tx " << p->second.last_tx
5188 << " last_rx_back " << p->second.last_rx_back
5189 << " last_rx_front " << p->second.last_rx_front
5190 << dendl;
5191 if (p->second.is_unhealthy(cutoff)) {
5192 if (p->second.last_rx_back == utime_t() ||
5193 p->second.last_rx_front == utime_t()) {
5194 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5195 << " osd." << p->first << " ever on either front or back, first ping sent "
5196 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
5197 // fail
5198 failure_queue[p->first] = p->second.last_tx;
5199 } else {
5200 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5201 << " osd." << p->first << " since back " << p->second.last_rx_back
5202 << " front " << p->second.last_rx_front
5203 << " (cutoff " << cutoff << ")" << dendl;
5204 // fail
5205 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
5206 }
5207 }
5208 }
5209}
5210
// Send one round of heartbeats: refresh load/stat bookkeeping, ping every
// peer on its back (and, if present, front) connection, and fall back to
// polling the monitor for a new map if we have no peers at all.
// Called from heartbeat_entry() with heartbeat_lock held.
void OSD::heartbeat()
{
  dout(30) << "heartbeat" << dendl;

  // get CPU load avg
  double loadavgs[1];
  // number of heartbeat samples in a day, used for the rolling daily average
  int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
  if (getloadavg(loadavgs, 1) == 1) {
    logger->set(l_osd_loadavg, 100 * loadavgs[0]);
    daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
    dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
  }

  dout(30) << "heartbeat checking stats" << dendl;

  // refresh stats?
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);
  service.update_osd_stat(hb_peers);

  dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;

  utime_t now = ceph_clock_now();

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    // record send times so heartbeat_check can tell "never replied" from
    // "stopped replying"
    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;
    i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
                                                  service.get_osdmap()->get_epoch(),
                                                  MOSDPing::PING, now,
                                                  cct->_conf->osd_heartbeat_min_size));

    if (i->second.con_front)
      i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
                                                     service.get_osdmap()->get_epoch(),
                                                     MOSDPing::PING, now,
                                                     cct->_conf->osd_heartbeat_min_size));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    // with no peers we can't learn of map changes via heartbeats, so
    // periodically ask the mon for a newer map instead
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(osdmap->get_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
5273
5274bool OSD::heartbeat_reset(Connection *con)
5275{
f64942e4 5276 Mutex::Locker l(heartbeat_lock);
7c673cae
FG
5277 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
5278 if (s) {
7c673cae 5279 if (is_stopping()) {
7c673cae
FG
5280 s->put();
5281 return true;
5282 }
5283 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
5284 if (p != heartbeat_peers.end() &&
5285 (p->second.con_back == con ||
5286 p->second.con_front == con)) {
5287 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5288 << ", reopening" << dendl;
5289 if (con != p->second.con_back) {
5290 p->second.con_back->mark_down();
5291 }
5292 p->second.con_back.reset(NULL);
5293 if (p->second.con_front && con != p->second.con_front) {
5294 p->second.con_front->mark_down();
5295 }
5296 p->second.con_front.reset(NULL);
5297 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5298 if (newcon.first) {
5299 p->second.con_back = newcon.first.get();
5300 p->second.con_back->set_priv(s->get());
5301 if (newcon.second) {
5302 p->second.con_front = newcon.second.get();
5303 p->second.con_front->set_priv(s->get());
5304 }
5305 } else {
5306 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5307 << ", raced with osdmap update, closing out peer" << dendl;
5308 heartbeat_peers.erase(p);
5309 }
5310 } else {
5311 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5312 }
7c673cae
FG
5313 s->put();
5314 }
5315 return true;
5316}
5317
5318
5319
5320// =========================================
5321
// Periodic timer callback that runs under osd_lock: refresh heartbeat
// peers, retry booting when appropriate, drain deferred waiters, and
// reschedule itself.
void OSD::tick()
{
  assert(osd_lock.is_locked());
  dout(10) << "tick" << dendl;

  if (is_active() || is_waiting_for_healthy()) {
    maybe_update_heartbeat_peers();
  }

  if (is_waiting_for_healthy()) {
    // keep retrying boot until our heartbeats look healthy again
    start_boot();
  } else if (is_preboot() &&
             waiting_for_luminous_mons &&
             monc->monmap.get_required_features().contains_all(
               ceph::features::mon::FEATURE_LUMINOUS)) {
    // mon upgrade finished!
    start_boot();
  }

  do_waiters();

  // re-arm the timer for the next tick
  tick_timer.add_event_after(get_tick_interval(), new C_Tick(this));
}
5345
// Periodic timer callback that deliberately runs WITHOUT osd_lock so it
// can't stall the main OSD state machine: updates perf counters, checks
// heartbeats, throttles/sends monitor reports, schedules scrubs and
// beacons, and reschedules itself.
void OSD::tick_without_osd_lock()
{
  assert(tick_timer_lock.is_locked());
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_buf, buffer::get_total_alloc());
  logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
  logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());
  logger->set(l_osd_pg_removing, remove_wq.get_remove_queue_len());

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    heartbeat_lock.Lock();
    heartbeat_check();
    heartbeat_lock.Unlock();

    map_lock.get_read();
    Mutex::Locker l(mon_report_lock);

    // mon report?
    bool reset = false;
    bool report = false;
    utime_t now = ceph_clock_now();
    pg_stat_queue_lock.Lock();
    // back off the report interval multiplicatively while the mon is slow
    // to ack our PGStats
    double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
    double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
    // note: we shouldn't adjust max because it must remain < the
    // mon's mon_osd_report_timeout (which defaults to 1.5x our
    // value).
    double max = cct->_conf->osd_mon_report_interval_max;
    if (!outstanding_pg_stats.empty() &&
        (now - stats_ack_timeout) > last_pg_stats_ack) {
      // mon is unresponsive: reconnect to (possibly) another mon and
      // start over with a longer ack timeout
      dout(1) << __func__ << " mon hasn't acked PGStats in "
              << now - last_pg_stats_ack
              << " seconds, reconnecting elsewhere" << dendl;
      reset = true;
      last_pg_stats_ack = now;  // reset clock
      last_pg_stats_sent = utime_t();
      stats_ack_timeout =
        MAX(cct->_conf->osd_mon_ack_timeout,
            stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
      outstanding_pg_stats.clear();
    }
    if (now - last_pg_stats_sent > max) {
      osd_stat_updated = true;
      report = true;
    } else if (service.need_fullness_update()) {
      report = true;
    } else if ((int)outstanding_pg_stats.size() >=
               cct->_conf->osd_mon_report_max_in_flight) {
      dout(20) << __func__ << " have max " << outstanding_pg_stats
               << " stats updates in flight" << dendl;
    } else {
      if (now - last_mon_report > adjusted_min) {
        dout(20) << __func__ << " stats backoff " << backoff
                 << " adjusted_min " << adjusted_min << " - sending report"
                 << dendl;
        osd_stat_updated = true;
        report = true;
      }
    }
    pg_stat_queue_lock.Unlock();

    if (reset) {
      monc->reopen_session();
    } else if (report) {
      last_mon_report = now;

      // do any pending reports
      send_full_update();
      send_failures();
      if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
        // pre-luminous mons still receive pg stats directly from OSDs
        send_pg_stats(now);
      }
    }
    map_lock.put_read();
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
    resume_creating_pg();
    bool need_send_beacon = false;
    const auto now = ceph::coarse_mono_clock::now();
    {
      // borrow lec lock to protect last_sent_beacon from changing
      Mutex::Locker l{min_last_epoch_clean_lock};
      const auto elapsed = now - last_sent_beacon;
      if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
          cct->_conf->osd_beacon_report_interval) {
        need_send_beacon = true;
      }
    }
    if (need_send_beacon) {
      // send_beacon takes min_last_epoch_clean_lock itself, so call it
      // outside the scope above
      send_beacon(now);
    }
  }

  check_ops_in_flight();
  mgrc.update_osd_health(get_health_metrics());
  service.kick_recovery_queue();
  // re-arm the timer for the next tick
  tick_timer_without_osd_lock.add_event_after(get_tick_interval(),
                                              new C_Tick_WithoutOSDLock(this));
}
5456
5457void OSD::check_ops_in_flight()
5458{
5459 vector<string> warnings;
5460 if (op_tracker.check_ops_in_flight(warnings)) {
5461 for (vector<string>::iterator i = warnings.begin();
5462 i != warnings.end();
5463 ++i) {
5464 clog->warn() << *i;
5465 }
5466 }
5467}
5468
5469// Usage:
5470// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5471// rmomapkey <pool-id> [namespace/]<obj-name> <key>
5472// setomapheader <pool-id> [namespace/]<obj-name> <header>
5473// getomap <pool> [namespace/]<obj-name>
5474// truncobj <pool-id> [namespace/]<obj-name> <newlen>
5475// injectmdataerr [namespace/]<obj-name> [shardid]
5476// injectdataerr [namespace/]<obj-name> [shardid]
5477//
5478// set_recovery_delay [utime]
5479void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5480 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5481{
5482 //Test support
5483 //Support changing the omap on a single osd by using the Admin Socket to
5484 //directly request the osd make a change.
5485 if (command == "setomapval" || command == "rmomapkey" ||
5486 command == "setomapheader" || command == "getomap" ||
5487 command == "truncobj" || command == "injectmdataerr" ||
5488 command == "injectdataerr"
5489 ) {
5490 pg_t rawpg;
5491 int64_t pool;
5492 OSDMapRef curmap = service->get_osdmap();
5493 int r = -1;
5494
5495 string poolstr;
5496
5497 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5498 pool = curmap->lookup_pg_pool_name(poolstr);
5499 //If we can't find it by name then maybe id specified
5500 if (pool < 0 && isdigit(poolstr[0]))
5501 pool = atoll(poolstr.c_str());
5502 if (pool < 0) {
b5b8bbf5 5503 ss << "Invalid pool '" << poolstr << "''";
7c673cae
FG
5504 return;
5505 }
5506
5507 string objname, nspace;
5508 cmd_getval(service->cct, cmdmap, "objname", objname);
5509 std::size_t found = objname.find_first_of('/');
5510 if (found != string::npos) {
5511 nspace = objname.substr(0, found);
5512 objname = objname.substr(found+1);
5513 }
5514 object_locator_t oloc(pool, nspace);
5515 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5516
5517 if (r < 0) {
5518 ss << "Invalid namespace/objname";
5519 return;
5520 }
5521
5522 int64_t shardid;
5523 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5524 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5525 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5526 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5527 if (curmap->pg_is_ec(rawpg)) {
5528 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5529 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5530 return;
5531 }
5532 }
5533
5534 ObjectStore::Transaction t;
5535
5536 if (command == "setomapval") {
5537 map<string, bufferlist> newattrs;
5538 bufferlist val;
5539 string key, valstr;
5540 cmd_getval(service->cct, cmdmap, "key", key);
5541 cmd_getval(service->cct, cmdmap, "val", valstr);
5542
5543 val.append(valstr);
5544 newattrs[key] = val;
5545 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5546 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5547 if (r < 0)
5548 ss << "error=" << r;
5549 else
5550 ss << "ok";
5551 } else if (command == "rmomapkey") {
5552 string key;
5553 set<string> keys;
5554 cmd_getval(service->cct, cmdmap, "key", key);
5555
5556 keys.insert(key);
5557 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5558 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5559 if (r < 0)
5560 ss << "error=" << r;
5561 else
5562 ss << "ok";
5563 } else if (command == "setomapheader") {
5564 bufferlist newheader;
5565 string headerstr;
5566
5567 cmd_getval(service->cct, cmdmap, "header", headerstr);
5568 newheader.append(headerstr);
5569 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5570 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5571 if (r < 0)
5572 ss << "error=" << r;
5573 else
5574 ss << "ok";
5575 } else if (command == "getomap") {
5576 //Debug: Output entire omap
5577 bufferlist hdrbl;
5578 map<string, bufferlist> keyvals;
5579 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5580 if (r >= 0) {
5581 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5582 for (map<string, bufferlist>::iterator it = keyvals.begin();
5583 it != keyvals.end(); ++it)
5584 ss << " key=" << (*it).first << " val="
5585 << string((*it).second.c_str(), (*it).second.length());
5586 } else {
5587 ss << "error=" << r;
5588 }
5589 } else if (command == "truncobj") {
5590 int64_t trunclen;
5591 cmd_getval(service->cct, cmdmap, "len", trunclen);
5592 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5593 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5594 if (r < 0)
5595 ss << "error=" << r;
5596 else
5597 ss << "ok";
5598 } else if (command == "injectdataerr") {
5599 store->inject_data_error(gobj);
5600 ss << "ok";
5601 } else if (command == "injectmdataerr") {
5602 store->inject_mdata_error(gobj);
5603 ss << "ok";
5604 }
5605 return;
5606 }
5607 if (command == "set_recovery_delay") {
5608 int64_t delay;
5609 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5610 ostringstream oss;
5611 oss << delay;
5612 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5613 oss.str().c_str());
5614 if (r != 0) {
5615 ss << "set_recovery_delay: error setting "
5616 << "osd_recovery_delay_start to '" << delay << "': error "
5617 << r;
5618 return;
5619 }
5620 service->cct->_conf->apply_changes(NULL);
5621 ss << "set_recovery_delay: set osd_recovery_delay_start "
5622 << "to " << service->cct->_conf->osd_recovery_delay_start;
5623 return;
5624 }
a8e16298 5625 if (command == "trigger_scrub" || command == "trigger_deep_scrub") {
7c673cae 5626 spg_t pgid;
a8e16298 5627 bool deep = (command == "trigger_deep_scrub");
7c673cae
FG
5628 OSDMapRef curmap = service->get_osdmap();
5629
5630 string pgidstr;
5631
5632 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5633 if (!pgid.parse(pgidstr.c_str())) {
5634 ss << "Invalid pgid specified";
5635 return;
5636 }
5637
a8e16298
TL
5638 int64_t time;
5639 cmd_getval(service->cct, cmdmap, "time", time, (int64_t)0);
5640
7c673cae
FG
5641 PG *pg = service->osd->_lookup_lock_pg(pgid);
5642 if (pg == nullptr) {
5643 ss << "Can't find pg " << pgid;
5644 return;
5645 }
5646
5647 if (pg->is_primary()) {
5648 pg->unreg_next_scrub();
5649 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5650 double pool_scrub_max_interval = 0;
a8e16298
TL
5651 double scrub_max_interval;
5652 if (deep) {
5653 p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
5654 scrub_max_interval = pool_scrub_max_interval > 0 ?
5655 pool_scrub_max_interval : g_conf->osd_deep_scrub_interval;
5656 } else {
5657 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5658 scrub_max_interval = pool_scrub_max_interval > 0 ?
5659 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5660 }
7c673cae
FG
5661 // Instead of marking must_scrub force a schedule scrub
5662 utime_t stamp = ceph_clock_now();
a8e16298
TL
5663 if (time == 0)
5664 stamp -= scrub_max_interval;
5665 else
5666 stamp -= (float)time;
5667 stamp -= 100.0; // push back last scrub more for good measure
5668 if (deep) {
5669 pg->set_last_deep_scrub_stamp(stamp);
5670 } else {
5671 pg->set_last_scrub_stamp(stamp);
5672 }
7c673cae 5673 pg->reg_next_scrub();
a8e16298
TL
5674 pg->publish_stats_to_osd();
5675 ss << "ok - set" << (deep ? " deep" : "" ) << " stamp " << stamp;
7c673cae
FG
5676 } else {
5677 ss << "Not primary";
5678 }
5679 pg->unlock();
5680 return;
5681 }
5682 if (command == "injectfull") {
5683 int64_t count;
5684 string type;
5685 OSDService::s_names state;
5686 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5687 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5688 if (type == "none" || count == 0) {
5689 type = "none";
5690 count = 0;
5691 }
5692 state = service->get_full_state(type);
5693 if (state == OSDService::s_names::INVALID) {
5694 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5695 return;
5696 }
5697 service->set_injectfull(state, count);
5698 return;
5699 }
5700 ss << "Internal error - command=" << command;
5701}
5702
5703// =========================================
5704bool remove_dir(
5705 CephContext *cct,
5706 ObjectStore *store, SnapMapper *mapper,
5707 OSDriver *osdriver,
5708 ObjectStore::Sequencer *osr,
5709 coll_t coll, DeletingStateRef dstate,
5710 bool *finished,
5711 ThreadPool::TPHandle &handle)
5712{
5713 vector<ghobject_t> olist;
5714 int64_t num = 0;
5715 ObjectStore::Transaction t;
5716 ghobject_t next;
5717 handle.reset_tp_timeout();
5718 store->collection_list(
5719 coll,
5720 next,
5721 ghobject_t::get_max(),
5722 store->get_ideal_list_max(),
5723 &olist,
5724 &next);
5725 generic_dout(10) << __func__ << " " << olist << dendl;
5726 // default cont to true, this is safe because caller(OSD::RemoveWQ::_process())
5727 // will recheck the answer before it really goes on.
5728 bool cont = true;
5729 for (vector<ghobject_t>::iterator i = olist.begin();
5730 i != olist.end();
5731 ++i) {
5732 if (i->is_pgmeta())
5733 continue;
5734 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5735 int r = mapper->remove_oid(i->hobj, &_t);
5736 if (r != 0 && r != -ENOENT) {
5737 ceph_abort();
5738 }
5739 t.remove(coll, *i);
5740 if (++num >= cct->_conf->osd_target_transaction_size) {
5741 C_SaferCond waiter;
5742 store->queue_transaction(osr, std::move(t), &waiter);
5743 cont = dstate->pause_clearing();
5744 handle.suspend_tp_timeout();
5745 waiter.wait();
f64942e4
AA
5746 if (cct->_conf->osd_delete_sleep) {
5747 utime_t t;
5748 t.set_from_double(cct->_conf->osd_delete_sleep);
5749 lgeneric_subdout(cct, osd, 10) << __func__ << " inject delay of " << t << dendl;
5750 t.sleep();
5751 }
7c673cae
FG
5752 handle.reset_tp_timeout();
5753 if (cont)
5754 cont = dstate->resume_clearing();
5755 if (!cont)
5756 return false;
5757 t = ObjectStore::Transaction();
5758 num = 0;
5759 }
5760 }
5761 if (num) {
5762 C_SaferCond waiter;
5763 store->queue_transaction(osr, std::move(t), &waiter);
5764 cont = dstate->pause_clearing();
5765 handle.suspend_tp_timeout();
5766 waiter.wait();
5767 handle.reset_tp_timeout();
5768 if (cont)
5769 cont = dstate->resume_clearing();
5770 }
5771 // whether there are more objects to remove in the collection
5772 *finished = next.is_max();
5773 return cont;
5774}
5775
// Work-queue entry point for asynchronous PG removal.  Incrementally
// clears the PG's collection via remove_dir(); if the collection is not
// empty yet the item is re-queued, otherwise the PG's info/log metadata
// and the collection itself are removed.
void OSD::RemoveWQ::_process(
  pair<PGRef, DeletingStateRef> item,
  ThreadPool::TPHandle &handle)
{
  FUNCTRACE();
  PGRef pg(item.first);
  SnapMapper &mapper = pg->snap_mapper;
  OSDriver &driver = pg->osdriver;
  coll_t coll = coll_t(pg->info.pgid);
  // drain in-flight transactions on this PG's sequencer before deleting
  // underneath them
  pg->osr->flush();
  bool finished = false;

  // bail if the removal was cancelled (e.g. the PG is being re-created)
  if (!item.second->start_or_resume_clearing())
    return;

  bool cont = remove_dir(
    pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
    &finished, handle);
  if (!cont)
    return;
  if (!finished) {
    // more objects remain; requeue at the front so removal continues promptly
    if (item.second->pause_clearing())
      queue_front(item);
    return;
  }

  if (!item.second->start_deleting())
    return;

  ObjectStore::Transaction t;
  PGLog::clear_info_log(pg->info.pgid, &t);

  if (cct->_conf->osd_inject_failure_on_pg_removal) {
    // test hook: die abruptly, without cleanup
    generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
    _exit(1);
  }
  t.remove_collection(coll);

  // We need the sequencer to stick around until the op is complete
  store->queue_transaction(
    pg->osr.get(),
    std::move(t),
    0, // onapplied
    0, // oncommit
    0, // onreadable sync
    new ContainerContext<PGRef>(pg),
    TrackedOpRef());

  item.second->finish_deleting();
}
5826// =========================================
5827
// Messenger callback: a connection we initiated is (re)established.
// Only the mon connection matters here: a new mon session means all
// previously sent reports are lost, so resend everything relevant to the
// current boot state.
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    Mutex::Locker l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // lock order: osd_lock, then map_lock (read), then mon_report_lock
      map_lock.get_read();
      Mutex::Locker l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.send_pg_temp();
      requeue_failures();
      send_failures();
      // pre-luminous mons still expect the OSD to push pg stats directly
      if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
	send_pg_stats(now);
      }

      map_lock.put_read();
      if (is_active()) {
	send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
5871
// Fast-dispatch callback for connections we initiate: make sure a
// Session is attached to OSD<->OSD connections (mon/mgr are excluded).
void OSD::ms_handle_fast_connect(Connection *con)
{
  if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
      con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
    Session *s = static_cast<Session*>(con->get_priv());
    if (!s) {
      s = new Session(cct);
      con->set_priv(s->get());
      s->con = con;
      dout(10) << " new session (outgoing) " << s << " con=" << s->con
	       << " addr=" << s->con->get_peer_addr() << dendl;
      // we don't connect to clients
      assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
      s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
    }
    // drop the ref we hold from get_priv() / get()
    s->put();
  }
}
5890
5891void OSD::ms_handle_fast_accept(Connection *con)
5892{
5893 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5894 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5895 Session *s = static_cast<Session*>(con->get_priv());
5896 if (!s) {
5897 s = new Session(cct);
5898 con->set_priv(s->get());
5899 s->con = con;
5900 dout(10) << "new session (incoming)" << s << " con=" << con
5901 << " addr=" << con->get_peer_addr()
5902 << " must have raced with connect" << dendl;
5903 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5904 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5905 }
5906 s->put();
5907 }
5908}
5909
5910bool OSD::ms_handle_reset(Connection *con)
5911{
5912 Session *session = static_cast<Session*>(con->get_priv());
5913 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5914 if (!session)
5915 return false;
5916 session->wstate.reset(con);
5917 session->con.reset(NULL); // break con <-> session ref cycle
5918 // note that we break session->con *before* the session_handle_reset
5919 // cleanup below. this avoids a race between us and
5920 // PG::add_backoff, Session::check_backoff, etc.
5921 session_handle_reset(session);
5922 session->put();
5923 return true;
5924}
5925
// Messenger callback: a connection attempt was actively refused
// (ECONNREFUSED).  If fast-fail is enabled and the refusing peer is an
// OSD the map still considers up, report it failed immediately rather
// than waiting out the heartbeat grace period.
bool OSD::ms_handle_refused(Connection *con)
{
  if (!cct->_conf->osd_fast_fail_on_connection_refused)
    return false;

  Session *session = static_cast<Session*>(con->get_priv());
  dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
  if (!session)
    return false;
  int type = con->get_peer_type();
  // handle only OSD failures here
  if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
    OSDMapRef osdmap = get_osdmap();
    if (osdmap) {
      int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
      if (id >= 0 && osdmap->is_up(id)) {
	// I'm cheating mon heartbeat grace logic, because we know it's not going
	// to respawn alone. +1 so we won't hit any boundary case.
	monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
						  osdmap->get_inst(id),
						  cct->_conf->osd_heartbeat_grace + 1,
						  osdmap->get_epoch(),
						  MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
						  ));
      }
    }
  }
  session->put();
  return true;
}
5956
// Completion for monc->get_version("osdmap"): on success, feeds the
// mon's oldest/newest osdmap epochs into OSD::_got_mon_epochs().
struct C_OSD_GetVersion : public Context {
  OSD *osd;
  uint64_t oldest, newest;  // filled in by MonClient before finish() runs
  explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
  void finish(int r) override {
    if (r >= 0)
      osd->_got_mon_epochs(oldest, newest);
  }
};
5966
// Begin the boot sequence: if healthy, enter PREBOOT and ask the mon
// which osdmap epochs it holds; _got_mon_epochs() continues from there.
// If unhealthy, wait (pinging peers) instead of marking ourselves up.
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  waiting_for_luminous_mons = false;
  dout(10) << "start_boot - have maps " << superblock.oldest_map
	   << ".." << superblock.newest_map << dendl;
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
5986
// Callback from C_OSD_GetVersion with the mon's osdmap epoch range.
// Only acted on while still in PREBOOT (we may have moved on since).
void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
{
  Mutex::Locker l(osd_lock);
  if (is_preboot()) {
    _preboot(oldest, newest);
  }
}
5994
// Decide whether we can attempt to boot given the mon's osdmap range
// [oldest, newest].  Either sends the boot message (when our map is
// recent enough and no blocking condition applies) or logs why not and
// subscribes for newer maps so a later update retries.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
	   << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  heartbeat();

  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->get_epoch() == 0) {
    derr << "waiting for initial osdmap" << dendl;
  } else if (osdmap->is_destroyed(whoami)) {
    derr << "osdmap says I am destroyed" << dendl;
    // provide a small margin so we don't livelock seeing if we
    // un-destroyed ourselves.
    if (osdmap->get_epoch() > newest - 1) {
      exit(0);
    }
  } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
    derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (!monc->monmap.get_required_features().contains_all(
	       ceph::features::mon::FEATURE_LUMINOUS)) {
    derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
	 << "Luminous or later before Luminous OSDs will boot" << dendl;
    waiting_for_luminous_mons = true;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
	     osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
    // close enough to the mon's newest map: go for it
    _send_boot();
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);  // too far behind; request from oldest
}
6042
6043void OSD::send_full_update()
6044{
6045 if (!service.need_fullness_update())
6046 return;
6047 unsigned state = 0;
6048 if (service.is_full()) {
6049 state = CEPH_OSD_FULL;
6050 } else if (service.is_backfillfull()) {
6051 state = CEPH_OSD_BACKFILLFULL;
6052 } else if (service.is_nearfull()) {
6053 state = CEPH_OSD_NEARFULL;
6054 }
6055 set<string> s;
6056 OSDMap::calc_state_set(state, s);
6057 dout(10) << __func__ << " want state " << s << dendl;
6058 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
6059}
6060
// Enter WAITING_FOR_HEALTHY: defer booting until enough heartbeat peers
// respond (see _is_healthy()).
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  // zeroed so the heartbeat path re-chooses peers promptly
  // NOTE(review): presumed trigger semantics -- confirm against heartbeat code
  last_heartbeat_resample = utime_t();

  // subscribe to osdmap updates, in case our peers really are known to be dead
  osdmap_subscribe(osdmap->get_epoch() + 1, false);
}
6070
6071bool OSD::_is_healthy()
6072{
6073 if (!cct->get_heartbeat_map()->is_healthy()) {
6074 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
6075 return false;
6076 }
6077
6078 if (is_waiting_for_healthy()) {
6079 Mutex::Locker l(heartbeat_lock);
6080 utime_t cutoff = ceph_clock_now();
6081 cutoff -= cct->_conf->osd_heartbeat_grace;
6082 int num = 0, up = 0;
6083 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
6084 p != heartbeat_peers.end();
6085 ++p) {
6086 if (p->second.is_healthy(cutoff))
6087 ++up;
6088 ++num;
6089 }
6090 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
6091 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
6092 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
6093 return false;
6094 }
6095 }
6096
6097 return true;
6098}
6099
// Send MOSDBoot to the mon to request being marked up.  Before sending,
// fill in any still-blank cluster/heartbeat bind addresses from their
// sibling messengers, and make sure each loopback connection has had its
// fast-connect handling run.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
  Connection *local_connection = cluster_messenger->get_loopback_connection().get();
  if (cluster_addr.is_blank_ip()) {
    // cluster messenger bound to a wildcard address: borrow the client
    // messenger's IP, keeping our own port
    int port = cluster_addr.get_port();
    cluster_addr = client_messenger->get_myaddr();
    cluster_addr.set_port(port);
    cluster_messenger->set_addr_unknowns(cluster_addr);
    dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
  } else {
    // address known: ensure the loopback connection has been through
    // fast-connect (drop the extra ref if a Session already exists)
    Session *s = static_cast<Session*>(local_connection->get_priv());
    if (s)
      s->put();
    else
      cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_addr.is_blank_ip()) {
    // back heartbeat falls back to the cluster address's IP
    int port = hb_back_addr.get_port();
    hb_back_addr = cluster_addr;
    hb_back_addr.set_port(port);
    hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
    dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
  } else {
    Session *s = static_cast<Session*>(local_connection->get_priv());
    if (s)
      s->put();
    else
      hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_addr.is_blank_ip()) {
    // front heartbeat falls back to the client address's IP
    int port = hb_front_addr.get_port();
    hb_front_addr = client_messenger->get_myaddr();
    hb_front_addr.set_port(port);
    hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
    dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
  } else {
    Session *s = static_cast<Session*>(local_connection->get_priv());
    if (s)
      s->put();
    else
      hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
				 hb_back_addr, hb_front_addr, cluster_addr,
				 CEPH_FEATURES_ALL);
  dout(10) << " client_addr " << client_messenger->get_myaddr()
	   << ", cluster_addr " << cluster_addr
	   << ", hb_back_addr " << hb_back_addr
	   << ", hb_front_addr " << hb_front_addr
	   << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
6163
// Gather the metadata key/value pairs reported with MOSDBoot: data/journal
// paths, bound addresses, objectstore details, system info and NIC names.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  if (store->get_type() == "filestore") {
    // not applicable for bluestore
    (*pm)["osd_journal"] = journal_path;
  }
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  (*pm)["rotational"] = store_is_rotational ? "1" : "0";
  (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
  (*pm)["default_device_class"] = store->get_default_device_class();
  store->collect_metadata(pm);

  collect_sys_info(pm, cct);

  // network interface names, resolved from the actually-bound addresses
  std::string front_iface, back_iface;
  /*
  pick_iface(cct,
      CEPH_PICK_ADDRESS_PUBLIC | CEPH_PICK_ADDRESS_CLUSTER,
      &front_iface, &back_iface);
  */
  (*pm)["front_iface"] = pick_iface(cct,
      client_messenger->get_myaddr().get_sockaddr_storage());
  (*pm)["back_iface"] = pick_iface(cct,
      cluster_messenger->get_myaddr().get_sockaddr_storage());

  dout(10) << __func__ << " " << *pm << dendl;
}
6199
// Record that we want the mon to bump our up_thru to at least `want`;
// if that raises the previously wanted value, nudge send_alive().
// Takes map_lock (read) then mon_report_lock.
void OSD::queue_want_up_thru(epoch_t want)
{
  map_lock.get_read();
  epoch_t cur = osdmap->get_up_thru(whoami);
  Mutex::Locker l(mon_report_lock);
  if (want > up_thru_wanted) {
    dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
	     << ", currently " << cur
	     << dendl;
    up_thru_wanted = want;
    send_alive();
  } else {
    // already asked for at least this much; nothing to do
    dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
	     << ", currently " << cur
	     << dendl;
  }
  map_lock.put_read();
}
6218
// Send MOSDAlive to request an up_thru bump when the wanted value exceeds
// what the current osdmap records.  Caller must hold mon_report_lock.
void OSD::send_alive()
{
  assert(mon_report_lock.is_locked());
  if (!osdmap->exists(whoami))
    return;
  epoch_t up_thru = osdmap->get_up_thru(whoami);
  dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
  if (up_thru_wanted > up_thru) {
    dout(10) << "send_alive want " << up_thru_wanted << dendl;
    monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
  }
}
6231
// Ask the mon for full (non-incremental) osdmaps covering [first, last],
// merging with any outstanding request window so we never re-request
// epochs already asked for.  Caller must hold osd_lock.
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  dout(10) << __func__ << " " << first << ".." << last
	   << ", previously requested "
	   << requested_full_first << ".." << requested_full_last << dendl;
  assert(osd_lock.is_locked());
  assert(first > 0 && last > 0);
  assert(first <= last);
  assert(first >= requested_full_first);  // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request: extend the window past what's already in flight
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
6257
// Note receipt of full osdmap epoch e, shrinking (or resetting) the
// outstanding full-map request window accordingly.  Caller holds osd_lock.
void OSD::got_full_map(epoch_t e)
{
  assert(requested_full_first <= requested_full_last);
  assert(osd_lock.is_locked());
  if (requested_full_first == 0) {
    // no full-map request outstanding
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // map predates the window we asked for; irrelevant
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last
	     << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    // window fully satisfied; clear request state
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // partially satisfied; advance the window start
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	   << ".." << requested_full_last
	   << ", still need more" << dendl;
}
6285
6286void OSD::requeue_failures()
6287{
6288 Mutex::Locker l(heartbeat_lock);
6289 unsigned old_queue = failure_queue.size();
6290 unsigned old_pending = failure_pending.size();
6291 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
6292 failure_pending.begin();
6293 p != failure_pending.end(); ) {
6294 failure_queue[p->first] = p->second.first;
6295 failure_pending.erase(p++);
6296 }
6297 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6298 << failure_queue.size() << dendl;
6299}
6300
// Report queued peer failures to the mon.  Each reported failure moves
// from failure_queue to failure_pending so it can be requeued if the mon
// session resets.  Caller holds map_lock (read) and mon_report_lock.
void OSD::send_failures()
{
  assert(map_lock.is_locked());
  assert(mon_report_lock.is_locked());
  Mutex::Locker l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    // skip peers we have already reported and not yet requeued
    if (!failure_pending.count(osd)) {
      entity_inst_t i = osdmap->get_inst(osd);
      // seconds since we first noticed the peer failing
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
					     osdmap->get_epoch()));
      failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
    }
    failure_queue.erase(osd);
  }
}
6319
6320void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
6321{
6322 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
6323 monc->send_mon_message(m);
6324}
6325
// Push per-PG stats plus our osd_stat to the mon in one MPGStats message.
// Pre-luminous path only; luminous+ clusters report via the mgr.  Caller
// holds map_lock (read); pg_stat_queue_lock is taken internally.
void OSD::send_pg_stats(const utime_t &now)
{
  assert(map_lock.is_locked());
  assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
  dout(20) << "send_pg_stats" << dendl;

  osd_stat_t cur_stat = service.get_osd_stat();

  cur_stat.os_perf_stat = store->get_cur_stats();

  pg_stat_queue_lock.Lock();

  // only send when something changed or PGs are queued
  if (osd_stat_updated || !pg_stat_queue.empty()) {
    last_pg_stats_sent = now;
    osd_stat_updated = false;

    dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;

    // how long we've had the current map
    utime_t had_for(now);
    had_for -= had_map_since;

    MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);

    // tid lets handle_pg_stats_ack() match the mon's ack to this batch
    uint64_t tid = ++pg_stat_tid;
    m->set_tid(tid);
    m->osd_stat = cur_stat;

    xlist<PG*>::iterator p = pg_stat_queue.begin();
    while (!p.end()) {
      PG *pg = *p;
      ++p;
      if (!pg->is_primary()) { // we hold map_lock; role is stable.
	// non-primaries don't report; drop them from the queue
	pg->stat_queue_item.remove_myself();
	pg->put("pg_stat_queue");
	continue;
      }
      pg->pg_stats_publish_lock.Lock();
      if (pg->pg_stats_publish_valid) {
	m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
	dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
		 << pg->pg_stats_publish.reported_seq << dendl;
      } else {
	dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
		 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
      }
      pg->pg_stats_publish_lock.Unlock();
    }

    // (re)start the ack timer if nothing was outstanding or acks are late
    if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
      last_pg_stats_ack = ceph_clock_now();
    }
    outstanding_pg_stats.insert(tid);
    dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;

    monc->send_mon_message(m);
  }

  pg_stat_queue_lock.Unlock();
}
6385
// Process the mon's ack for a previously sent MPGStats batch: drop PGs
// whose reported seq/epoch were acknowledged from the stat queue, decay
// the ack timeout, and wake anyone waiting in flush_pg_stats().
void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
{
  dout(10) << "handle_pg_stats_ack " << dendl;

  if (!require_mon_peer(ack)) {
    ack->put();
    return;
  }

  // NOTE: we may get replies from a previous mon even while
  // outstanding_pg_stats is empty if reconnecting races with replies
  // in flight.

  pg_stat_queue_lock.Lock();

  last_pg_stats_ack = ceph_clock_now();

  // decay timeout slowly (analogous to TCP)
  stats_ack_timeout =
    MAX(cct->_conf->osd_mon_ack_timeout,
	stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
  dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;

  // wake flush_pg_stats() waiters once this tid is flushed
  if (ack->get_tid() > pg_stat_tid_flushed) {
    pg_stat_tid_flushed = ack->get_tid();
    pg_stat_queue_cond.Signal();
  }

  xlist<PG*>::iterator p = pg_stat_queue.begin();
  while (!p.end()) {
    PG *pg = *p;
    PGRef _pg(pg);  // keep the PG alive while we inspect it
    ++p;

    auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
    if (acked != ack->pg_stat.end()) {
      pg->pg_stats_publish_lock.Lock();
      // only dequeue if the ack matches what we most recently published;
      // otherwise newer stats are still pending
      if (acked->second.first == pg->pg_stats_publish.reported_seq &&
	  acked->second.second == pg->pg_stats_publish.reported_epoch) {
	dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
		 << ":" << pg->pg_stats_publish.reported_seq << dendl;
	pg->stat_queue_item.remove_myself();
	pg->put("pg_stat_queue");
      } else {
	dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
		 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
		 << acked->second << dendl;
      }
      pg->pg_stats_publish_lock.Unlock();
    } else {
      dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
	       << ":" << pg->pg_stats_publish.reported_seq << dendl;
    }
  }

  outstanding_pg_stats.erase(ack->get_tid());
  dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;

  pg_stat_queue_lock.Unlock();

  ack->put();
}
6448
// Synchronously push pg stats and wait until the stats tid current at
// entry has been acknowledged.  osd_lock is dropped for the duration to
// respect lock ordering (map_lock / mon_report_lock are taken inside)
// and re-taken before returning.
void OSD::flush_pg_stats()
{
  dout(10) << "flush_pg_stats" << dendl;
  osd_lock.Unlock();
  utime_t now = ceph_clock_now();
  map_lock.get_read();
  mon_report_lock.Lock();
  send_pg_stats(now);
  mon_report_lock.Unlock();
  map_lock.put_read();


  // wait for handle_pg_stats_ack() to flush up to our tid
  pg_stat_queue_lock.Lock();
  uint64_t tid = pg_stat_tid;
  dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
  while (tid > pg_stat_tid_flushed)
    pg_stat_queue_cond.Wait(pg_stat_queue_lock);
  dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
  pg_stat_queue_lock.Unlock();

  osd_lock.Lock();
}
6471
// Send an MOSDBeacon carrying min_last_epoch_clean (and the contributing
// pgs) to the mon; only done once the monmap is known and requires the
// LUMINOUS feature.
void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
{
  const auto& monmap = monc->monmap;
  // send beacon to mon even if we are just connected, and the monmap is not
  // initialized yet by then.
  if (monmap.epoch > 0 &&
      monmap.get_required_features().contains_all(
        ceph::features::mon::FEATURE_LUMINOUS)) {
    dout(20) << __func__ << " sending" << dendl;
    MOSDBeacon* beacon = nullptr;
    {
      // swap out the accumulated pg list under the lock, build outside it
      Mutex::Locker l{min_last_epoch_clean_lock};
      beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
      std::swap(beacon->pgs, min_last_epoch_clean_pgs);
      last_sent_beacon = now;
    }
    monc->send_mon_message(beacon);
  } else {
    dout(20) << __func__ << " not sending" << dendl;
  }
}
6493
6494void OSD::handle_command(MMonCommand *m)
6495{
6496 if (!require_mon_peer(m)) {
6497 m->put();
6498 return;
6499 }
6500
6501 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6502 command_wq.queue(c);
6503 m->put();
6504}
6505
6506void OSD::handle_command(MCommand *m)
6507{
6508 ConnectionRef con = m->get_connection();
6509 Session *session = static_cast<Session *>(con->get_priv());
6510 if (!session) {
6511 con->send_message(new MCommandReply(m, -EPERM));
6512 m->put();
6513 return;
6514 }
6515
6516 OSDCap& caps = session->caps;
6517 session->put();
6518
6519 if (!caps.allow_all() || m->get_source().is_mon()) {
6520 con->send_message(new MCommandReply(m, -EPERM));
6521 m->put();
6522 return;
6523 }
6524
6525 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6526 command_wq.queue(c);
6527
6528 m->put();
6529}
6530
// Descriptor for one OSD admin command: the parse signature understood by
// cmdmap_from_json, its help text, owning module, required permission
// ("r"/"rw") and where it is available ("cli" and/or "rest").  The table
// below is what get_command_descriptions returns and what do_command
// dispatches on.
struct OSDCommand {
  string cmdstring;
  string helpstring;
  string module;
  string perm;
  string availability;
} osd_commands[] = {

#define COMMAND(parsesig, helptext, module, perm, availability) \
  {parsesig, helptext, module, perm, availability},

// yes, these are really pg commands, but there's a limit to how
// much work it's worth. The OSD returns all of them. Make this
// form (pg <pgid> <cmd>) valid only for the cli.
// Rest uses "tell <pgid> <cmd>"

COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=query", \
	"show details of a specific pg", "osd", "r", "cli")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw", "cli")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=list_missing " \
	"name=offset,type=CephString,req=false",
	"list missing objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r", "cli")

// new form: tell <pgid> <cmd> for both cli and rest

COMMAND("query",
	"show details of a specific pg", "osd", "r", "cli,rest")
COMMAND("mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw", "cli,rest")
COMMAND("list_missing " \
	"name=offset,type=CephString,req=false",
	"list missing objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r", "cli,rest")
COMMAND("perf histogram dump "
	"name=logger,type=CephString,req=false "
	"name=counter,type=CephString,req=false",
	"Get histogram data",
	"osd", "r", "cli,rest")

// tell <osd.n> commands. Validation of osd.n must be special-cased in client
COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
COMMAND("injectargs " \
	"name=injected_args,type=CephString,n=N",
	"inject configuration arguments into running OSD",
	"osd", "rw", "cli,rest")
COMMAND("config set " \
	"name=key,type=CephString name=value,type=CephString",
	"Set a configuration option at runtime (not persistent)",
	"osd", "rw", "cli,rest")
COMMAND("cluster_log " \
	"name=level,type=CephChoices,strings=error,warning,info,debug " \
	"name=message,type=CephString,n=N",
	"log a message to the cluster log",
	"osd", "rw", "cli,rest")
COMMAND("bench " \
	"name=count,type=CephInt,req=false " \
	"name=size,type=CephInt,req=false " \
	"name=object_size,type=CephInt,req=false " \
	"name=object_num,type=CephInt,req=false ", \
	"OSD benchmark: write <count> <size>-byte objects, " \
	"(default 1G size 4MB). Results in log.",
	"osd", "rw", "cli,rest")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
COMMAND("heap " \
	"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
	"show heap usage info (available only if compiled with tcmalloc)", \
	"osd", "rw", "cli,rest")
COMMAND("debug dump_missing " \
	"name=filename,type=CephFilepath",
	"dump missing objects to a named file", "osd", "r", "cli,rest")
COMMAND("debug kick_recovery_wq " \
	"name=delay,type=CephInt,range=0",
	"set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
COMMAND("cpu_profiler " \
	"name=arg,type=CephChoices,strings=status|flush",
	"run cpu profiling on daemon", "osd", "rw", "cli,rest")
COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
	"osd", "r", "cli,rest")
COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
	"osd", "rw", "cli,rest")
COMMAND("compact",
	"compact object store's omap. "
	"WARNING: Compaction probably slows your requests",
	"osd", "rw", "cli,rest")
};
6629
f64942e4
AA
6630namespace {
6631 class unlock_guard {
6632 Mutex& m;
6633 public:
6634 explicit unlock_guard(Mutex& mutex)
6635 : m(mutex)
6636 {
6637 m.Unlock();
6638 }
6639 unlock_guard(unlock_guard&) = delete;
6640 ~unlock_guard() {
6641 m.Lock();
6642 }
6643 };
6644}
6645
7c673cae
FG
// Execute an admin/tell command addressed to this OSD ('ceph tell osd.N ...'
// or a command forwarded via the monitor).
//
// Called with osd_lock held.  If 'con' is non-null a single MCommandReply is
// sent carrying the return code (r), the human-readable status (ss) and the
// command output payload (ds -> odata).  The pg sub-commands may reply
// asynchronously, in which case this function returns early without replying.
void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
{
  int r = 0;
  stringstream ss, ds;   // ss: status text for the reply; ds: command output
  string rs;
  bufferlist odata;

  dout(20) << "do_command tid " << tid << " " << cmd << dendl;

  map<string, cmd_vartype> cmdmap;
  string prefix;
  string format;
  string pgidstr;
  boost::scoped_ptr<Formatter> f;

  if (cmd.empty()) {
    ss << "no command given";
    goto out;
  }

  // parse the JSON command vector into a key -> value map
  if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
    r = -EINVAL;
    goto out;
  }

  cmd_getval(cct, cmdmap, "prefix", prefix);

  if (prefix == "get_command_descriptions") {
    // dump the static osd_commands table as JSON so the CLI can
    // validate/complete commands client-side
    int cmdnum = 0;
    JSONFormatter *f = new JSONFormatter();  // NOTE: shadows the scoped_ptr above
    f->open_object_section("command_descriptions");
    for (OSDCommand *cp = osd_commands;
	 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {

      ostringstream secname;
      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
      dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
			   cp->module, cp->perm, cp->availability, 0);
      cmdnum++;
    }
    f->close_section();	// command_descriptions

    f->flush(ds);
    delete f;
    goto out;
  }

  cmd_getval(cct, cmdmap, "format", format);
  f.reset(Formatter::create(format));

  if (prefix == "version") {
    if (f) {
      f->open_object_section("version");
      f->dump_string("version", pretty_version_to_str());
      f->close_section();
      f->flush(ds);
    } else {
      ds << pretty_version_to_str();
    }
    goto out;
  }
  else if (prefix == "injectargs") {
    vector<string> argsvec;
    cmd_getval(cct, cmdmap, "injected_args", argsvec);

    if (argsvec.empty()) {
      r = -EINVAL;
      ss << "ignoring empty injectargs";
      goto out;
    }
    // re-join the arg vector into a single space-separated string
    string args = argsvec.front();
    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
      args += " " + *a;
    // drop osd_lock while applying config: config observers may take locks
    // that are ordered before osd_lock
    unlock_guard unlock{osd_lock};
    r = cct->_conf->injectargs(args, &ss);
  }
  else if (prefix == "config set") {
    std::string key;
    std::string val;
    cmd_getval(cct, cmdmap, "key", key);
    cmd_getval(cct, cmdmap, "value", val);
    // same lock-ordering concern as injectargs above
    unlock_guard unlock{osd_lock};
    r = cct->_conf->set_val(key, val, true, &ss);
    if (r == 0) {
      cct->_conf->apply_changes(nullptr);
    }
  }
  else if (prefix == "cluster_log") {
    // forward an operator-supplied message into the cluster log
    vector<string> msg;
    cmd_getval(cct, cmdmap, "message", msg);
    if (msg.empty()) {
      r = -EINVAL;
      ss << "ignoring empty log message";
      goto out;
    }
    string message = msg.front();
    for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
      message += " " + *a;
    string lvl;
    cmd_getval(cct, cmdmap, "level", lvl);
    clog_type level = string_to_clog_type(lvl);
    if (level < 0) {
      r = -EINVAL;
      ss << "unknown level '" << lvl << "'";
      goto out;
    }
    clog->do_log(level, message);
  }

  // either 'pg <pgid> <command>' or
  // 'tell <pgid>' (which comes in without any of that prefix)?

  else if (prefix == "pg" ||
	   prefix == "query" ||
	   prefix == "mark_unfound_lost" ||
	   prefix == "list_missing"
	   ) {
    pg_t pgid;

    if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
      ss << "no pgid specified";
      r = -EINVAL;
    } else if (!pgid.parse(pgidstr.c_str())) {
      ss << "couldn't parse pgid '" << pgidstr << "'";
      r = -EINVAL;
    } else {
      spg_t pcand;
      PG *pg = nullptr;
      // map the raw pg to our primary shard and take the PG lock
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  (pg = _lookup_lock_pg(pcand))) {
	if (pg->is_primary()) {
	  // simulate pg <pgid> cmd= for pg->do-command
	  if (prefix != "pg")
	    cmd_putval(cct, cmdmap, "cmd", prefix);
	  r = pg->do_command(cmdmap, ss, data, odata, con, tid);
	  if (r == -EAGAIN) {
	    pg->unlock();
	    // don't reply, pg will do so async
	    return;
	  }
	} else {
	  ss << "not primary for pgid " << pgid;

	  // send them the latest diff to ensure they realize the mapping
	  // has changed.
	  service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);

	  // do not reply; they will get newer maps and realize they
	  // need to resend.
	  pg->unlock();
	  return;
	}
	pg->unlock();
      } else {
	ss << "i don't have pgid " << pgid;
	r = -ENOENT;
      }
    }
  }

  else if (prefix == "bench") {
    // synthetic objectstore write benchmark; guarded by sanity caps so an
    // oversized request cannot wedge the OSD long enough to trip timeouts
    int64_t count;
    int64_t bsize;
    int64_t osize, onum;
    // default count 1G, size 4MB
    cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
    cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
    cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
    cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);

    ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
                                        ObjectStore::Sequencer>("bench"));

    uint32_t duration = cct->_conf->osd_bench_duration;

    if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
      // let us limit the block size because the next checks rely on it
      // having a sane value. If we allow any block size to be set things
      // can still go sideways.
      ss << "block 'size' values are capped at "
         << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
         << " a higher value, please adjust 'osd_bench_max_block_size'";
      r = -EINVAL;
      goto out;
    } else if (bsize < (int64_t) (1 << 20)) {
      // entering the realm of small block sizes.
      // limit the count to a sane value, assuming a configurable amount of
      // IOPS and duration, so that the OSD doesn't get hung up on this,
      // preventing timeouts from going off
      int64_t max_count =
	bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << byte_u_t(bsize) << ", assuming "
	   << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
	   << " value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    } else {
      // 1MB block sizes are big enough so that we get more stuff done.
      // However, to avoid the osd from getting hung on this and having
      // timers being triggered, we are going to limit the count assuming
      // a configurable throughput and duration.
      // NOTE: max_count is the total amount of bytes that we believe we
      //       will be able to write during 'duration' for the given
      //       throughput.  The block size hardly impacts this unless it's
      //       way too big.  Given we already check how big the block size
      //       is, it's safe to assume everything will check out.
      int64_t max_count =
        cct->_conf->osd_bench_large_size_max_throughput * duration;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << byte_u_t(bsize) << ", assuming "
	   << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_large_size_max_throughput'"
	   << " with a higher value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    }

    if (osize && bsize > osize)
      bsize = osize;

    dout(1) << " bench count " << count
            << " bsize " << byte_u_t(bsize) << dendl;

    ObjectStore::Transaction cleanupt;

    if (osize && onum) {
      // pre-create the target objects so the timed loop measures
      // (over)writes rather than allocations
      bufferlist bl;
      bufferptr bp(osize);
      bp.zero();
      bl.push_back(std::move(bp));
      bl.rebuild_page_aligned();
      for (int i=0; i<onum; ++i) {
	char nm[30];
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
	object_t oid(nm);
	hobject_t soid(sobject_t(oid, 0));
	ObjectStore::Transaction t;
	t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
	store->queue_transaction(osr.get(), std::move(t), NULL);
	cleanupt.remove(coll_t(), ghobject_t(soid));
      }
    }

    bufferlist bl;
    bufferptr bp(bsize);
    bp.zero();
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();

    {
      // drain anything already queued before starting the clock
      C_SaferCond waiter;
      if (!osr->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    utime_t start = ceph_clock_now();
    for (int64_t pos = 0; pos < count; pos += bsize) {
      char nm[30];
      unsigned offset = 0;
      if (onum && osize) {
	// random offsets within a fixed set of objects
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
	offset = rand() % (osize / bsize) * bsize;
      } else {
	// one fresh object per write
	snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
      }
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
      store->queue_transaction(osr.get(), std::move(t), NULL);
      if (!onum || !osize)
	cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }

    {
      // wait for all benchmark writes to commit before stopping the clock
      C_SaferCond waiter;
      if (!osr->flush_commit(&waiter)) {
	waiter.wait();
      }
    }
    utime_t end = ceph_clock_now();

    // clean up
    store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
    {
      C_SaferCond waiter;
      if (!osr->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    double elapsed = end - start;
    double rate = count / elapsed;     // bytes per second
    double iops = rate / bsize;
    if (f) {
      f->open_object_section("osd_bench_results");
      f->dump_int("bytes_written", count);
      f->dump_int("blocksize", bsize);
      f->dump_float("elapsed_sec", elapsed);
      f->dump_float("bytes_per_sec", rate);
      f->dump_float("iops", iops);
      f->close_section();
      f->flush(ds);
    } else {
      ds << "bench: wrote " << byte_u_t(count)
	 << " in blocks of " << byte_u_t(bsize) << " in "
	 << elapsed << " sec at " << byte_u_t(rate) << "/sec "
	 << si_u_t(iops) << " IOPS";
    }
  }

  else if (prefix == "flush_pg_stats") {
    // luminous+ reports pg stats through the mgr; otherwise push to the mon
    if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
      mgrc.send_pgstats();
      ds << service.get_osd_stat_seq() << "\n";
    } else {
      flush_pg_stats();
    }
  }

  else if (prefix == "heap") {
    r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
  }

  else if (prefix == "debug dump_missing") {
    // write every PG's missing set (and known locations) to a local file
    string file_name;
    cmd_getval(cct, cmdmap, "filename", file_name);
    std::ofstream fout(file_name.c_str());
    if (!fout.is_open()) {
	ss << "failed to open file '" << file_name << "'";
	r = -EINVAL;
	goto out;
    }

    fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
    RWLock::RLocker l(pg_map_lock);
    for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
	 pg_map_e != pg_map.end(); ++pg_map_e) {
      PG *pg = pg_map_e->second;
      pg->lock();

      fout << *pg << std::endl;
      std::map<hobject_t, pg_missing_item>::const_iterator mend =
	pg->pg_log.get_missing().get_items().end();
      std::map<hobject_t, pg_missing_item>::const_iterator mi =
	pg->pg_log.get_missing().get_items().begin();
      for (; mi != mend; ++mi) {
	fout << mi->first << " -> " << mi->second << std::endl;
	if (!pg->missing_loc.needs_recovery(mi->first))
	  continue;
	if (pg->missing_loc.is_unfound(mi->first))
	  fout << " unfound ";
	const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
	if (mls.empty())
	  continue;
	fout << "missing_loc: " << mls << std::endl;
      }
      pg->unlock();
      fout << std::endl;
    }

    fout.close();
  }
  else if (prefix == "debug kick_recovery_wq") {
    // nudge recovery by (re)setting osd_recovery_delay_start
    int64_t delay;
    cmd_getval(cct, cmdmap, "delay", delay);
    ostringstream oss;
    oss << delay;
    // drop osd_lock around config changes (see injectargs above)
    unlock_guard unlock{osd_lock};
    r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
    if (r != 0) {
      ss << "kick_recovery_wq: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      goto out;
    }
    cct->_conf->apply_changes(NULL);
    ss << "kicking recovery queue. set osd_recovery_delay_start "
       << "to " << cct->_conf->osd_recovery_delay_start;
  }

  else if (prefix == "cpu_profiler") {
    string arg;
    cmd_getval(cct, cmdmap, "arg", arg);
    vector<string> argvec;
    get_str_vec(arg, argvec);
    cpu_profiler_handle_command(argvec, ds);
  }

  else if (prefix == "dump_pg_recovery_stats") {
    stringstream s;
    if (f) {
      pg_recovery_stats.dump_formatted(f.get());
      f->flush(ds);
    } else {
      pg_recovery_stats.dump(s);
      ds << "dump pg recovery stats: " << s.str();
    }
  }

  else if (prefix == "reset_pg_recovery_stats") {
    ss << "reset pg recovery stats";
    pg_recovery_stats.reset();
  }

  else if (prefix == "perf histogram dump") {
    std::string logger;
    std::string counter;
    cmd_getval(cct, cmdmap, "logger", logger);
    cmd_getval(cct, cmdmap, "counter", counter);
    // NOTE: silently produces no output when no formatter was requested
    if (f) {
      cct->get_perfcounters_collection()->dump_formatted_histograms(
          f.get(), false, logger, counter);
      f->flush(ds);
    }
  }

  else if (prefix == "compact") {
    // synchronous objectstore (omap) compaction; can take a while
    dout(1) << "triggering manual compaction" << dendl;
    auto start = ceph::coarse_mono_clock::now();
    store->compact();
    auto end = ceph::coarse_mono_clock::now();
    auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
    dout(1) << "finished manual compaction in "
            << time_span.count()
            << " seconds" << dendl;
    ss << "compacted omap in " << time_span.count() << " seconds";
  }

  else {
    ss << "unrecognized command! " << cmd;
    r = -EINVAL;
  }

 out:
  rs = ss.str();
  odata.append(ds);
  dout(0) << "do_command r=" << r << " " << rs << dendl;
  clog->info() << rs;
  if (con) {
    MCommandReply *reply = new MCommandReply(r, rs);
    reply->set_tid(tid);
    reply->set_data(odata);
    con->send_message(reply);
  }
}
7103
7104bool OSD::heartbeat_dispatch(Message *m)
7105{
7106 dout(30) << "heartbeat_dispatch " << m << dendl;
7107 switch (m->get_type()) {
7108
7109 case CEPH_MSG_PING:
7110 dout(10) << "ping from " << m->get_source_inst() << dendl;
7111 m->put();
7112 break;
7113
7114 case MSG_OSD_PING:
7115 handle_osd_ping(static_cast<MOSDPing*>(m));
7116 break;
7117
7118 default:
7119 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
7120 m->put();
7121 }
7122
7123 return true;
7124}
7125
// Slow-path (non-fast-dispatch) message entry point.
// MARK_ME_DOWN acks are handled lock-free so a stopping OSD can make
// progress; everything else is processed under osd_lock, after first
// draining any queued waiters so ordering is preserved.
bool OSD::ms_dispatch(Message *m)
{
  dout(20) << "OSD::ms_dispatch: " << *m << dendl;
  if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
    // ack for our own mark-me-down request; must not block on osd_lock
    service.got_stop_ack();
    m->put();
    return true;
  }

  // lock!

  osd_lock.Lock();
  if (is_stopping()) {
    osd_lock.Unlock();
    m->put();
    return true;
  }

  // process previously deferred ops before the new message
  do_waiters();
  _dispatch(m);

  osd_lock.Unlock();

  return true;
}
7151
7152void OSD::maybe_share_map(
7153 Session *session,
7154 OpRequestRef op,
7155 OSDMapRef osdmap)
7156{
7157 if (!op->check_send_map) {
7158 return;
7159 }
7160 epoch_t last_sent_epoch = 0;
7161
7162 session->sent_epoch_lock.lock();
7163 last_sent_epoch = session->last_sent_epoch;
7164 session->sent_epoch_lock.unlock();
7165
7166 const Message *m = op->get_req();
7167 service.share_map(
7168 m->get_source(),
7169 m->get_connection().get(),
7170 op->sent_epoch,
7171 osdmap,
7172 session ? &last_sent_epoch : NULL);
7173
7174 session->sent_epoch_lock.lock();
7175 if (session->last_sent_epoch < last_sent_epoch) {
7176 session->last_sent_epoch = last_sent_epoch;
7177 }
7178 session->sent_epoch_lock.unlock();
7179
7180 op->check_send_map = false;
7181}
7182
// Drain a legacy session's ordered queue of fast-dispatch ops, enqueuing
// every op whose min epoch is satisfied by 'osdmap'.  Stops at the first op
// that still needs a newer map (ordering must be preserved).  Caller must
// hold session->session_dispatch_lock.
void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
{
  assert(session->session_dispatch_lock.is_locked());

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);   // take a ref before unlinking below
    assert(ms_can_fast_dispatch(op->get_req()));
    const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
      op->get_req());
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // op needs a newer map; later ops must wait behind it
      break;
    }
    // unlink from the intrusive list and drop the list's reference;
    // 'op' above keeps the request alive for enqueue_op()
    session->waiting_on_map.erase(i++);
    op->put();

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      // legacy MOSDOp carries only a raw pg; resolve to our primary shard
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	// no primary shard in this map; drop the op
	continue;
      }
    } else {
      pgid = m->get_spg();
    }
    enqueue_op(pgid, op, m->get_map_epoch());
  }

  // (de)register for a wakeup when a new map arrives, as appropriate
  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
7218
// Fast-dispatch entry point: wrap the message in an OpRequest and queue it
// to the op shards without taking osd_lock.  Ops with an explicit spg_t go
// straight to enqueue_op(); legacy MOSDOps are funneled through the
// per-session queue so their pg mapping can be resolved in order.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE();
  if (service.is_stopping()) {
    m->put();
    return;
  }
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req'd epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      op,
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    Session *session = static_cast<Session*>(m->get_connection()->get_priv());
    if (session) {
      {
	Mutex::Locker l(session->session_dispatch_lock);
	// extra ref for the intrusive waiting_on_map list
	op->get();
	session->waiting_on_map.push_back(*op);
	// reserve the next map so the mapping is stable while we drain
	OSDMapRef nextmap = service.get_nextmap_reserved();
	dispatch_session_waiting(session, nextmap);
	service.release_map(nextmap);
      }
      session->put();  // drop the ref taken by get_priv()
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
7271
7272void OSD::ms_fast_preprocess(Message *m)
7273{
7274 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
7275 if (m->get_type() == CEPH_MSG_OSD_MAP) {
7276 MOSDMap *mm = static_cast<MOSDMap*>(m);
7277 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
7278 if (s) {
7279 s->received_map_lock.lock();
7280 s->received_map_epoch = mm->get_last();
7281 s->received_map_lock.unlock();
7282 s->put();
7283 }
7284 }
7285 }
7286}
7287
7288bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
7289{
7290 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7291
31f18b77
FG
7292 if (is_stopping()) {
7293 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7294 return false;
7295 }
7296
7c673cae
FG
7297 if (dest_type == CEPH_ENTITY_TYPE_MON)
7298 return true;
7299
7300 if (force_new) {
7301 /* the MonClient checks keys every tick(), so we should just wait for that cycle
7302 to get through */
7303 if (monc->wait_auth_rotating(10) < 0) {
7304 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
7305 return false;
7306 }
7307 }
7308
7309 *authorizer = monc->build_authorizer(dest_type);
7310 return *authorizer != NULL;
7311}
7312
7313
28e407b8
AA
// Messenger callback: verify an incoming connection's authorizer and, on
// success, attach/refresh a Session (entity name, auid, parsed caps) on the
// connection.  'isvalid' carries the verdict; the return value only means
// "we handled the check".
bool OSD::ms_verify_authorizer(
  Connection *con, int peer_type,
  int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
  bool& isvalid, CryptoKey& session_key,
  std::unique_ptr<AuthAuthorizerChallenge> *challenge)
{
  // pick the handler registry: cluster-internal daemons vs. clients
  AuthAuthorizeHandler *authorize_handler = 0;
  switch (peer_type) {
  case CEPH_ENTITY_TYPE_MDS:
    /*
     * note: mds is technically a client from our perspective, but
     * this makes the 'cluster' consistent w/ monitor's usage.
     */
  case CEPH_ENTITY_TYPE_OSD:
  case CEPH_ENTITY_TYPE_MGR:
    authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
    break;
  default:
    authorize_handler = authorize_handler_service_registry->get_handler(protocol);
  }
  if (!authorize_handler) {
    dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
    isvalid = false;
    return true;
  }

  AuthCapsInfo caps_info;
  EntityName name;
  uint64_t global_id;
  uint64_t auid = CEPH_AUTH_UID_DEFAULT;

  RotatingKeyRing *keys = monc->rotating_secrets.get();
  if (keys) {
    isvalid = authorize_handler->verify_authorizer(
      cct, keys,
      authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
      &auid, challenge);
  } else {
    // no rotating service keys yet (still starting up / not authenticated)
    dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
    isvalid = false;
  }

  if (isvalid) {
    // attach a Session to the connection if it doesn't have one yet
    Session *s = static_cast<Session *>(con->get_priv());
    if (!s) {
      s = new Session(cct);
      con->set_priv(s->get());
      s->con = con;
      dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
    }

    s->entity_name = name;
    if (caps_info.allow_all)
      s->caps.set_allow_all();
    s->auid = auid;

    // decode and parse the cap string carried in the ticket, if any
    if (caps_info.caps.length() > 0) {
      bufferlist::iterator p = caps_info.caps.begin();
      string str;
      try {
	::decode(str, p);
      }
      catch (buffer::error& e) {
	// fall through with empty/partial str; parse failure is logged below
      }
      bool success = s->caps.parse(str);
      if (success)
	dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
      else
	dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
    }

    s->put();  // drop the ref from get_priv()/construction
  }
  return true;
}
7389
7390void OSD::do_waiters()
7391{
7392 assert(osd_lock.is_locked());
7393
7394 dout(10) << "do_waiters -- start" << dendl;
7395 while (!finished.empty()) {
7396 OpRequestRef next = finished.front();
7397 finished.pop_front();
7398 dispatch_op(next);
7399 }
7400 dout(10) << "do_waiters -- finish" << dendl;
7401}
7402
7403void OSD::dispatch_op(OpRequestRef op)
7404{
7405 switch (op->get_req()->get_type()) {
7406
7407 case MSG_OSD_PG_CREATE:
7408 handle_pg_create(op);
7409 break;
7410 case MSG_OSD_PG_NOTIFY:
7411 handle_pg_notify(op);
7412 break;
7413 case MSG_OSD_PG_QUERY:
7414 handle_pg_query(op);
7415 break;
7416 case MSG_OSD_PG_LOG:
7417 handle_pg_log(op);
7418 break;
7419 case MSG_OSD_PG_REMOVE:
7420 handle_pg_remove(op);
7421 break;
7422 case MSG_OSD_PG_INFO:
7423 handle_pg_info(op);
7424 break;
7425 case MSG_OSD_PG_TRIM:
7426 handle_pg_trim(op);
7427 break;
7428 case MSG_OSD_BACKFILL_RESERVE:
7429 handle_pg_backfill_reserve(op);
7430 break;
7431 case MSG_OSD_RECOVERY_RESERVE:
7432 handle_pg_recovery_reserve(op);
7433 break;
7434 }
7435}
7436
// Core slow-path dispatch, called under osd_lock.  Messages that need an
// OSDMap are wrapped in an OpRequest; if no map has been loaded yet they
// are parked on waiting_for_osdmap instead of being dispatched.
void OSD::_dispatch(Message *m)
{
  assert(osd_lock.is_locked());
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {

    // -- don't need lock --
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    break;

    // -- don't need OSDMap --

    // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // osd
  case MSG_PGSTATSACK:
    handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
    break;

  case MSG_MON_COMMAND:
    handle_command(static_cast<MMonCommand*>(m));
    break;
  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    break;

  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  case MSG_OSD_FORCE_RECOVERY:
    handle_force_recovery(m);
    break;

    // -- need OSDMap --

  case MSG_OSD_PG_CREATE:
  case MSG_OSD_PG_NOTIFY:
  case MSG_OSD_PG_QUERY:
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_REMOVE:
  case MSG_OSD_PG_INFO:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
    {
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
        op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map? starting up?
      if (!osdmap) {
        dout(7) << "no OSDMap, not booted" << dendl;
	logger->inc(l_osd_waiting_for_map);
        waiting_for_osdmap.push_back(op);
	op->mark_delayed("no osdmap");
        break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
7506
7507void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7508{
7509 pg->lock();
7510 if (pg->is_primary()) {
7511 pg->unreg_next_scrub();
7512 pg->scrubber.must_scrub = true;
7513 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7514 pg->scrubber.must_repair = m->repair;
7515 pg->reg_next_scrub();
7516 dout(10) << "marking " << *pg << " for scrub" << dendl;
7517 }
7518 pg->unlock();
7519}
7520
7521void OSD::handle_scrub(MOSDScrub *m)
7522{
7523 dout(10) << "handle_scrub " << *m << dendl;
7524 if (!require_mon_or_mgr_peer(m)) {
7525 m->put();
7526 return;
7527 }
7528 if (m->fsid != monc->get_fsid()) {
7529 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7530 m->put();
7531 return;
7532 }
7533
7534 RWLock::RLocker l(pg_map_lock);
7535 if (m->scrub_pgs.empty()) {
7536 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7537 p != pg_map.end();
7538 ++p)
7539 handle_pg_scrub(m, p->second);
7540 } else {
7541 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7542 p != m->scrub_pgs.end();
7543 ++p) {
7544 spg_t pcand;
7545 if (osdmap->get_primary_shard(*p, &pcand)) {
7546 auto pg_map_entry = pg_map.find(pcand);
7547 if (pg_map_entry != pg_map.end()) {
7548 handle_pg_scrub(m, pg_map_entry->second);
7549 }
7550 }
7551 }
7552 }
7553
7554 m->put();
7555}
7556
// Randomly back off from scheduling scrubs: returns true (back off) with
// probability osd_scrub_backoff_ratio, false (proceed) otherwise.
bool OSD::scrub_random_backoff()
{
  bool coin_flip = (rand() / (double)RAND_MAX >=
		    cct->_conf->osd_scrub_backoff_ratio);
  if (coin_flip)
    return false;   // won the flip: go ahead and schedule
  dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
  return true;
}
7567
// Build a scrub-schedule entry for a PG.  'timestamp' is the last scrub
// stamp; unless the scrub was explicitly requested ('must'), the scheduled
// time is pushed out by the min interval plus a random fraction of it, and
// the deadline by the max interval.  Per-pool intervals (when > 0) override
// the global config values.
OSDService::ScrubJob::ScrubJob(CephContext* cct,
			       const spg_t& pg, const utime_t& timestamp,
			       double pool_scrub_min_interval,
			       double pool_scrub_max_interval, bool must)
  : cct(cct),
    pgid(pg),
    sched_time(timestamp),
    deadline(timestamp)
{
  if (must)
    return;  // explicitly requested: leave sched_time == deadline == timestamp

  double scrub_min_interval = pool_scrub_min_interval > 0 ?
    pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
  double scrub_max_interval = pool_scrub_max_interval > 0 ?
    pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;

  // spread scrubs out: min interval plus a random slice of it
  double r = rand() / (double)RAND_MAX;
  sched_time += scrub_min_interval;
  sched_time +=
    scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
  deadline += scrub_max_interval;
}
7591
7592bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7593 if (sched_time < rhs.sched_time)
7594 return true;
7595 if (sched_time > rhs.sched_time)
7596 return false;
7597 return pgid < rhs.pgid;
7598}
7599
// Decide whether 'now' falls inside the configured scrub window, checking
// the week-day window first and then the hour-of-day window.  Both windows
// are half-open [begin, end) and may wrap around (begin >= end).
bool OSD::scrub_time_permit(utime_t now)
{
  struct tm bdt;
  time_t tt = now.sec();
  localtime_r(&tt, &bdt);

  // membership test for a possibly wrapping half-open window [b, e)
  auto in_window = [](int v, int64_t b, int64_t e) {
    if (b < e)
      return v >= b && v < e;
    return v >= b || v < e;
  };

  bool day_permit = in_window(bdt.tm_wday,
			      cct->_conf->osd_scrub_begin_week_day,
			      cct->_conf->osd_scrub_end_week_day);
  if (!day_permit) {
    dout(20) << __func__ << " should run between week day " << cct->_conf->osd_scrub_begin_week_day
	     << " - " << cct->_conf->osd_scrub_end_week_day
	     << " now " << bdt.tm_wday << " = no" << dendl;
    return false;
  }

  bool time_permit = in_window(bdt.tm_hour,
			       cct->_conf->osd_scrub_begin_hour,
			       cct->_conf->osd_scrub_end_hour);
  if (!time_permit) {
    dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
	     << " - " << cct->_conf->osd_scrub_end_hour
	     << " now " << bdt.tm_hour << " = no" << dendl;
  } else {
    dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
	     << " - " << cct->_conf->osd_scrub_end_hour
	     << " now " << bdt.tm_hour << " = yes" << dendl;
  }
  return time_permit;
}
7645
// Check whether system load is low enough to scrub: either the 1-minute
// load per CPU is below osd_scrub_load_threshold, or the load is below the
// daily average and trending down.  Returns false if loadavg is unreadable.
bool OSD::scrub_load_below_threshold()
{
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) != 3) {
    dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
    return false;
  }

  // allow scrub if below configured threshold
  const long cpus = sysconf(_SC_NPROCESSORS_ONLN);
  const double loadavg_per_cpu = cpus > 0 ? loadavgs[0] / cpus : loadavgs[0];
  if (loadavg_per_cpu < cct->_conf->osd_scrub_load_threshold) {
    dout(20) << __func__ << " loadavg per cpu " << loadavg_per_cpu
	     << " < max " << cct->_conf->osd_scrub_load_threshold
	     << " = yes" << dendl;
    return true;
  }

  // allow scrub if below daily avg and currently decreasing
  if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
    dout(20) << __func__ << " loadavg " << loadavgs[0]
	     << " < daily_loadavg " << daily_loadavg
	     << " and < 15m avg " << loadavgs[2]
	     << " = yes" << dendl;
    return true;
  }

  dout(20) << __func__ << " loadavg " << loadavgs[0]
	   << " >= max " << cct->_conf->osd_scrub_load_threshold
	   << " and ( >= daily_loadavg " << daily_loadavg
	   << " or >= 15m avg " << loadavgs[2]
	   << ") = no" << dendl;
  return false;
}
7680
// Walk the scrub schedule (ordered by sched_time) and start at most one
// eligible PG scrub.  Bails out early if no scrub slot is available, if
// recovery is active (and scrub-during-recovery is disabled), or when the
// next job's scheduled time is still in the future.
void OSD::sched_scrub()
{
  // if not permitted, fail fast
  if (!service.can_inc_scrubs_pending()) {
    return;
  }
  if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
    dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
    return;
  }


  utime_t now = ceph_clock_now();
  bool time_permit = scrub_time_permit(now);
  bool load_is_low = scrub_load_below_threshold();
  dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;

  OSDService::ScrubJob scrub;
  if (service.first_scrub_stamp(&scrub)) {
    do {
      dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;

      if (scrub.sched_time > now) {
	// save ourselves some effort
	dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
		 << " > " << now << dendl;
	break;
      }

      // a job past its deadline runs regardless of time window / load;
      // otherwise both conditions must permit it
      if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
	dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
		 << (!time_permit ? "time not permit" : "high load") << dendl;
	continue;
      }

      PG *pg = _lookup_lock_pg(scrub.pgid);
      if (!pg)
	continue;   // pg went away; try the next job
      if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
	dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
		 << (pg->scrubber.must_scrub ? ", explicitly requested" :
		     (load_is_low ? ", load_is_low" : " deadline < now"))
		 << dendl;
	if (pg->sched_scrub()) {
	  // one scrub kicked off; we're done for this pass
	  pg->unlock();
	  break;
	}
      }
      pg->unlock();
    } while (service.next_scrub_stamp(scrub, &scrub));
  }
  dout(20) << "sched_scrub done" << dendl;
}
7734
7735
7736
b32b8144
FG
// Report this OSD's health metrics to the mgr.  Currently just the number
// of PGs still pending creation (mon-requested plus the primary subset of
// peer-requested ones).
vector<OSDHealthMetric> OSD::get_health_metrics()
{
  vector<OSDHealthMetric> metrics;
  {
    lock_guard<mutex> pending_creates_locker{pending_creates_lock};
    auto n_primaries = pending_creates_from_mon;
    for (const auto& create : pending_creates_from_osd) {
      if (create.second)
	++n_primaries;   // we are primary for this pending create
    }
    metrics.emplace_back(osd_metric::PENDING_CREATING_PGS, n_primaries);
  }
  return metrics;
}
7750
7c673cae
FG
7751// =====================================================
7752// MAP
7753
7754void OSD::wait_for_new_map(OpRequestRef op)
7755{
7756 // ask?
7757 if (waiting_for_osdmap.empty()) {
7758 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7759 }
7760
7761 logger->inc(l_osd_waiting_for_map);
7762 waiting_for_osdmap.push_back(op);
7763 op->mark_delayed("wait for new map");
7764}
7765
7766
7767/** update_map
7768 * assimilate new OSDMap(s). scan pgs, etc.
7769 */
7770
7771void OSD::note_down_osd(int peer)
7772{
7773 assert(osd_lock.is_locked());
7774 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7775
7776 heartbeat_lock.Lock();
7777 failure_queue.erase(peer);
7778 failure_pending.erase(peer);
7779 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7780 if (p != heartbeat_peers.end()) {
7781 p->second.con_back->mark_down();
7782 if (p->second.con_front) {
7783 p->second.con_front->mark_down();
7784 }
7785 heartbeat_peers.erase(p);
7786 }
7787 heartbeat_lock.Unlock();
7788}
7789
// A peer OSD is (back) up in the newly published map: forget the epoch we
// cached for it (it may have restarted) and flag the heartbeat peer set
// for a refresh.
7790void OSD::note_up_osd(int peer)
7791{
7792  service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7793  heartbeat_set_peers_need_update();
7794}
7795
// Completion context fired once the newly received osdmaps [first,last]
// have been committed to disk by the objectstore; re-enters
// OSD::_committed_osd_maps() and then releases the MOSDMap message
// reference held since handle_osd_map().
7796struct C_OnMapCommit : public Context {
7797  OSD *osd;
7798  epoch_t first, last;
7799  MOSDMap *msg;
7800  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7801    : osd(o), first(f), last(l), msg(m) {}
7802  void finish(int r) override {
7803    osd->_committed_osd_maps(first, last, msg);
7804    msg->put();
7805  }
7806};
7807
// Completion context fired when the map-write transaction has been applied.
// Holding 'pinned_maps' keeps the OSDMap refs (and hence the cached map
// buffers) alive until the maps are durable; on finish it unpins the cached
// map bufferlists up to epoch e.  See the comment in handle_osd_map().
7808struct C_OnMapApply : public Context {
7809  OSDService *service;
7810  list<OSDMapRef> pinned_maps;
7811  epoch_t e;
7812  C_OnMapApply(OSDService *service,
7813	       const list<OSDMapRef> &pinned_maps,
7814	       epoch_t e)
7815    : service(service), pinned_maps(pinned_maps), e(e) {}
7816  void finish(int r) override {
7817    service->clear_map_bl_cache_pins(e);
7818  }
7819};
7820
// Ask the mon for osdmaps starting at 'epoch'.  Duplicate requests are
// suppressed via latest_subscribed_epoch unless force_request is set.
7821void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7822{
181888fb
FG
7823  Mutex::Locker l(osdmap_subscribe_lock);
  // already asked for this (or a newer) epoch and not forcing -> no-op
7824  if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7825    return;
7826
181888fb
FG
7827  latest_subscribed_epoch = MAX(epoch, latest_subscribed_epoch);
7828
7c673cae
FG
  // renew only if the want actually changed, or the caller insists
7829  if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7830      force_request) {
7831    monc->renew_subs();
7832  }
7833}
7834
// Remove stored osdmaps older than 'oldest' (bounded by what the map cache
// still references), advancing superblock.oldest_map as we go.  Deletions
// are batched into transactions capped by osd_target_transaction_size so a
// large backlog does not produce one huge transaction.
7835void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7836{
7837  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7838  if (min <= superblock.oldest_map)
7839    return;
7840
7841  int num = 0;
7842  ObjectStore::Transaction t;
7843  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7844    dout(20) << " removing old osdmap epoch " << e << dendl;
7845    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7846    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7847    superblock.oldest_map = e + 1;
7848    num++;
    // flush a batch once it is large enough; also require num >= nreceived
    // so trimming keeps pace with (at least) the maps we just received
7849    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7850      service.publish_superblock(superblock);
7851      write_superblock(t);
7852      int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7853      assert(tr == 0);
7854      num = 0;
7855      if (!skip_maps) {
7856	// skip_maps leaves us with a range of old maps if we fail to remove all
7857	// of them before moving superblock.oldest_map forward to the first map
7858	// in the incoming MOSDMap msg. so we should continue removing them in
7859	// this case, even we could do huge series of delete transactions all at
7860	// once.
7861	break;
7862      }
7863    }
7864  }
  // flush whatever remains of the final partial batch
7865  if (num > 0) {
7866    service.publish_superblock(superblock);
7867    write_superblock(t);
224ce89b
WB
7868    int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7869    assert(tr == 0);
7c673cae
FG
7870  }
7871  // we should not remove the cached maps
7872  assert(min <= service.map_cache.cached_key_lower_bound());
7873}
7874
// Receive a batch of osdmaps (full and/or incremental) from a mon or peer
// OSD.  Validates the sender, decodes/reconstructs each epoch, writes the
// maps plus the updated superblock in one transaction, and arranges for
// _committed_osd_maps() to run (via C_OnMapCommit) once they are durable.
// Ownership note: 'm' is either put() on every early-return path or handed
// to C_OnMapCommit, which puts it after the commit callback.
7875void OSD::handle_osd_map(MOSDMap *m)
7876{
7877  assert(osd_lock.is_locked());
7878  // Keep a ref in the list until we get the newly received map written
7879  // onto disk. This is important because as long as the refs are alive,
7880  // the OSDMaps will be pinned in the cache and we won't try to read it
7881  // off of disk. Otherwise these maps will probably not stay in the cache,
7882  // and reading those OSDMaps before they are actually written can result
7883  // in a crash.
7884  list<OSDMapRef> pinned_maps;
7885  if (m->fsid != monc->get_fsid()) {
7886    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7887	    << monc->get_fsid() << dendl;
7888    m->put();
7889    return;
7890  }
7891  if (is_initializing()) {
7892    dout(0) << "ignoring osdmap until we have initialized" << dendl;
7893    m->put();
7894    return;
7895  }
7896
  // only mons and osds are trusted sources of maps
7897  Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7898  if (session && !(session->entity_name.is_mon() ||
7899		   session->entity_name.is_osd())) {
7900    //not enough perms!
7901    dout(10) << "got osd map from Session " << session
7902	     << " which we can't take maps from (not a mon or osd)" << dendl;
7903    m->put();
7904    session->put();
7905    return;
7906  }
7907  if (session)
7908    session->put();
7909
7910  // share with the objecter
7911  if (!is_preboot())
7912    service.objecter->handle_osd_map(m);
7913
7914  epoch_t first = m->get_first();
7915  epoch_t last = m->get_last();
7916  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7917	  << superblock.newest_map
7918	  << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7919	  << dendl;
7920
7921  logger->inc(l_osd_map);
7922  logger->inc(l_osd_mape, last - first + 1);
7923  if (first <= superblock.newest_map)
7924    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7925  if (service.max_oldest_map < m->oldest_map) {
7926    service.max_oldest_map = m->oldest_map;
7927    assert(service.max_oldest_map >= superblock.oldest_map);
7928  }
7929
7930  // make sure there is something new, here, before we bother flushing
7931  // the queues and such
7932  if (last <= superblock.newest_map) {
7933    dout(10) << " no new maps here, dropping" << dendl;
7934    m->put();
7935    return;
7936  }
7937
7938  // missing some?
7939  bool skip_maps = false;
7940  if (first > superblock.newest_map + 1) {
7941    dout(10) << "handle_osd_map message skips epochs "
7942	     << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7943    if (m->oldest_map <= superblock.newest_map + 1) {
7944      osdmap_subscribe(superblock.newest_map + 1, false);
7945      m->put();
7946      return;
7947    }
7948    // always try to get the full range of maps--as many as we can.  this
7949    //  1- is good to have
7950    //  2- is at present the only way to ensure that we get a *full* map as
7951    //     the first map!
7952    if (m->oldest_map < first) {
7953      osdmap_subscribe(m->oldest_map - 1, true);
7954      m->put();
7955      return;
7956    }
7957    skip_maps = true;
7958  }
7959
7960  ObjectStore::Transaction t;
7961  uint64_t txn_size = 0;
7962
7963  // store new maps: queue for disk and put in the osdmap cache
7964  epoch_t start = MAX(superblock.newest_map + 1, first);
7965  for (epoch_t e = start; e <= last; e++) {
    // sanity: transaction byte count must grow monotonically
7966    if (txn_size >= t.get_num_bytes()) {
7967      derr << __func__ << " transaction size overflowed" << dendl;
7968      assert(txn_size < t.get_num_bytes());
7969    }
7970    txn_size = t.get_num_bytes();
7971    map<epoch_t,bufferlist>::iterator p;
7972    p = m->maps.find(e);
7973    if (p != m->maps.end()) {
7974      dout(10) << "handle_osd_map  got full map for epoch " << e << dendl;
7975      OSDMap *o = new OSDMap;
7976      bufferlist& bl = p->second;
7977
7978      o->decode(bl);
7979
7980      ghobject_t fulloid = get_osdmap_pobject_name(e);
7981      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7982      pin_map_bl(e, bl);
7983      pinned_maps.push_back(add_map(o));
7984
7985      got_full_map(e);
7986      continue;
7987    }
7988
7989    p = m->incremental_maps.find(e);
7990    if (p != m->incremental_maps.end()) {
7991      dout(10) << "handle_osd_map  got inc map for epoch " << e << dendl;
7992      bufferlist& bl = p->second;
7993      ghobject_t oid = get_inc_osdmap_pobject_name(e);
7994      t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7995      pin_map_inc_bl(e, bl);
7996
      // reconstruct the full map for epoch e: previous full map + increment
7997      OSDMap *o = new OSDMap;
7998      if (e > 1) {
7999	bufferlist obl;
8000	bool got = get_map_bl(e - 1, obl);
8001	assert(got);
8002	o->decode(obl);
8003      }
8004
8005      OSDMap::Incremental inc;
8006      bufferlist::iterator p = bl.begin();
8007      inc.decode(p);
8008      if (o->apply_incremental(inc) < 0) {
8009	derr << "ERROR: bad fsid?  i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
8010	assert(0 == "bad fsid");
8011      }
8012
8013      bufferlist fbl;
8014      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
8015
8016      bool injected_failure = false;
8017      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
8018	  (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
8019	derr << __func__ << " injecting map crc failure" << dendl;
8020	injected_failure = true;
8021      }
8022
      // if our reconstruction does not match the mon's crc, fall back to
      // requesting full maps for the remaining epochs
8023      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
8024	dout(2) << "got incremental " << e
8025		<< " but failed to encode full with correct crc; requesting"
8026		<< dendl;
8027	clog->warn() << "failed to encode map e" << e << " with expected crc";
8028	dout(20) << "my encoded map was:\n";
8029	fbl.hexdump(*_dout);
8030	*_dout << dendl;
8031	delete o;
8032	request_full_map(e, last);
8033	last = e - 1;
8034	break;
8035      }
8036      got_full_map(e);
8037
8038      ghobject_t fulloid = get_osdmap_pobject_name(e);
8039      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
8040      pin_map_bl(e, fbl);
8041      pinned_maps.push_back(add_map(o));
8042      continue;
8043    }
8044
8045    assert(0 == "MOSDMap lied about what maps it had?");
8046  }
8047
8048  // even if this map isn't from a mon, we may have satisfied our subscription
8049  monc->sub_got("osdmap", last);
8050
8051  if (!m->maps.empty() && requested_full_first) {
8052    dout(10) << __func__ << " still missing full maps " << requested_full_first
8053	     << ".." << requested_full_last << dendl;
8054    rerequest_full_maps();
8055  }
8056
7c673cae
FG
8057  if (superblock.oldest_map) {
8058    // make sure we at least keep pace with incoming maps
8059    trim_maps(m->oldest_map, last - first + 1, skip_maps);
8060  }
8061
8062  if (!superblock.oldest_map || skip_maps)
8063    superblock.oldest_map = first;
8064  superblock.newest_map = last;
8065  superblock.current_epoch = last;
8066
8067  // note in the superblock that we were clean thru the prior epoch
8068  epoch_t boot_epoch = service.get_boot_epoch();
8069  if (boot_epoch && boot_epoch >= superblock.mounted) {
8070    superblock.mounted = boot_epoch;
8071    superblock.clean_thru = last;
8072  }
8073
8074  // superblock and commit
8075  write_superblock(t);
8076  store->queue_transaction(
8077    service.meta_osr.get(),
8078    std::move(t),
8079    new C_OnMapApply(&service, pinned_maps, last),
8080    new C_OnMapCommit(this, start, last, m), 0);
8081  service.publish_superblock(superblock);
8082}
8083
// Called (via C_OnMapCommit) once epochs [first,last] are durable on disk.
// Advances the in-memory osdmap epoch by epoch under map_lock, noting peers
// that went up/down, reacting to NOUP / require_osd_release changes, and
// then checks whether the new map agrees with our own addresses/up state.
// Disagreement triggers either a rebind+reboot (do_restart) or, if we were
// marked down too often or rebinding fails, a full shutdown (do_shutdown).
8084void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
8085{
8086  dout(10) << __func__ << " " << first << ".." << last << dendl;
8087  if (is_stopping()) {
8088    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8089    return;
8090  }
8091  Mutex::Locker l(osd_lock);
31f18b77
FG
  // re-check after taking osd_lock: shutdown may have raced with us
8092  if (is_stopping()) {
8093    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
8094    return;
8095  }
7c673cae
FG
8096  map_lock.get_write();
8097
8098  bool do_shutdown = false;
8099  bool do_restart = false;
8100  bool network_error = false;
8101
8102  // advance through the new maps
8103  for (epoch_t cur = first; cur <= last; cur++) {
8104    dout(10) << " advance to epoch " << cur
8105	     << " (<= last " << last
8106	     << " <= newest_map " << superblock.newest_map
8107	     << ")" << dendl;
8108
8109    OSDMapRef newmap = get_map(cur);
8110    assert(newmap);  // we just cached it above!
8111
8112    // start blacklisting messages sent to peers that go down.
8113    service.pre_publish_map(newmap);
8114
8115    // kill connections to newly down osds
8116    bool waited_for_reservations = false;
8117    set<int> old;
8118    osdmap->get_all_osds(old);
8119    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
8120      if (*p != whoami &&
8121	  osdmap->is_up(*p) && // in old map
8122	  newmap->is_down(*p)) {    // but not the new one
8123        if (!waited_for_reservations) {
8124          service.await_reserved_maps();
8125          waited_for_reservations = true;
8126        }
8127	note_down_osd(*p);
8128      } else if (*p != whoami &&
8129		 osdmap->is_down(*p) &&
8130		 newmap->is_up(*p)) {
8131	note_up_osd(*p);
8132      }
8133    }
8134
31f18b77
FG
8135    if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
8136	 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
8137	(osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
7c673cae
FG
8138      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
8139	       << dendl;
8140      if (is_booting()) {
8141	// this captures the case where we sent the boot message while
8142	// NOUP was being set on the mon and our boot request was
8143	// dropped, and then later it is cleared.  it imperfectly
8144	// handles the case where our original boot message was not
8145	// dropped and we restart even though we might have booted, but
8146	// that is harmless (boot will just take slightly longer).
8147	do_restart = true;
8148      }
8149    }
31f18b77
FG
8150    if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
8151	newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
8152      dout(10) << __func__ << " require_osd_release reached luminous in "
8153	       << newmap->get_epoch() << dendl;
8154      clear_pg_stat_queue();
224ce89b 8155      clear_outstanding_pg_stats();
31f18b77 8156    }
7c673cae
FG
8157
8158    osdmap = newmap;
8159    epoch_t up_epoch;
8160    epoch_t boot_epoch;
8161    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
8162    if (!up_epoch &&
8163	osdmap->is_up(whoami) &&
8164	osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
8165      up_epoch = osdmap->get_epoch();
8166      dout(10) << "up_epoch is " << up_epoch << dendl;
8167      if (!boot_epoch) {
8168	boot_epoch = osdmap->get_epoch();
8169	dout(10) << "boot_epoch is " << boot_epoch << dendl;
8170      }
8171      service.set_epochs(&boot_epoch, &up_epoch, NULL);
8172    }
8173  }
8174
8175  had_map_since = ceph_clock_now();
8176
8177  epoch_t _bind_epoch = service.get_bind_epoch();
8178  if (osdmap->is_up(whoami) &&
8179      osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
8180      _bind_epoch < osdmap->get_up_from(whoami)) {
8181
8182    if (is_booting()) {
8183      dout(1) << "state: booting -> active" << dendl;
8184      set_state(STATE_ACTIVE);
8185
8186      // set incarnation so that osd_reqid_t's we generate for our
8187      // objecter requests are unique across restarts.
8188      service.objecter->set_client_incarnation(osdmap->get_epoch());
8189    }
8190  }
8191
8192  if (osdmap->get_epoch() > 0 &&
8193      is_active()) {
8194    if (!osdmap->exists(whoami)) {
8195      dout(0) << "map says i do not exist.  shutting down." << dendl;
8196      do_shutdown = true;   // don't call shutdown() while we have
8197			    // everything paused
8198    } else if (!osdmap->is_up(whoami) ||
8199	       !osdmap->get_addr(whoami).probably_equals(
8200		 client_messenger->get_myaddr()) ||
8201	       !osdmap->get_cluster_addr(whoami).probably_equals(
8202		 cluster_messenger->get_myaddr()) ||
8203	       !osdmap->get_hb_back_addr(whoami).probably_equals(
8204		 hb_back_server_messenger->get_myaddr()) ||
8205	       (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
8206		!osdmap->get_hb_front_addr(whoami).probably_equals(
8207		  hb_front_server_messenger->get_myaddr()))) {
8208      if (!osdmap->is_up(whoami)) {
8209	if (service.is_preparing_to_stop() || service.is_stopping()) {
8210	  service.got_stop_ack();
8211	} else {
c07f9fc5
FG
8212	  clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8213			  "but it is still running";
8214	  clog->debug() << "map e" << osdmap->get_epoch()
8215			<< " wrongly marked me down at e"
8216			<< osdmap->get_down_at(whoami);
7c673cae
FG
8217	}
8218      } else if (!osdmap->get_addr(whoami).probably_equals(
8219		   client_messenger->get_myaddr())) {
8220	clog->error() << "map e" << osdmap->get_epoch()
8221		      << " had wrong client addr (" << osdmap->get_addr(whoami)
8222		      << " != my " << client_messenger->get_myaddr() << ")";
8223      } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
8224		   cluster_messenger->get_myaddr())) {
8225	clog->error() << "map e" << osdmap->get_epoch()
8226		      << " had wrong cluster addr ("
8227		      << osdmap->get_cluster_addr(whoami)
8228		      << " != my " << cluster_messenger->get_myaddr() << ")";
8229      } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
8230		   hb_back_server_messenger->get_myaddr())) {
8231	clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8232		      << " had wrong heartbeat back addr ("
7c673cae
FG
8233		      << osdmap->get_hb_back_addr(whoami)
8234		      << " != my " << hb_back_server_messenger->get_myaddr()
8235		      << ")";
8236      } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
8237		 !osdmap->get_hb_front_addr(whoami).probably_equals(
8238		   hb_front_server_messenger->get_myaddr())) {
8239	clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 8240		      << " had wrong heartbeat front addr ("
7c673cae
FG
8241		      << osdmap->get_hb_front_addr(whoami)
8242		      << " != my " << hb_front_server_messenger->get_myaddr()
8243		      << ")";
8244      }
8245
      // our addresses / up state disagree with the map: rebind and reboot,
      // unless we have been flapping too much, in which case shut down
8246      if (!service.is_stopping()) {
8247        epoch_t up_epoch = 0;
8248        epoch_t bind_epoch = osdmap->get_epoch();
8249        service.set_epochs(NULL,&up_epoch, &bind_epoch);
8250        do_restart = true;
8251
8252        //add markdown log
8253        utime_t now = ceph_clock_now();
8254        utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8255        osd_markdown_log.push_back(now);
8256        //clear all out-of-date log
8257        while (!osd_markdown_log.empty() &&
8258               osd_markdown_log.front() + grace < now)
8259          osd_markdown_log.pop_front();
8260        if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8261	  dout(0) << __func__ << " marked down "
8262		  << osd_markdown_log.size()
8263		  << " > osd_max_markdown_count "
8264		  << cct->_conf->osd_max_markdown_count
8265		  << " in last " << grace << " seconds, shutting down"
8266		  << dendl;
8267	  do_restart = false;
8268	  do_shutdown = true;
8269        }
8270
8271        start_waiting_for_healthy();
8272
8273        set<int> avoid_ports;
8274#if defined(__FreeBSD__)
8275        // prevent FreeBSD from grabbing the client_messenger port during
8276        // rebinding. In which case a cluster_meesneger will connect also
8277        // to the same port
8278        avoid_ports.insert(client_messenger->get_myaddr().get_port());
8279#endif
8280        avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
8281        avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
8282        avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
8283
8284        int r = cluster_messenger->rebind(avoid_ports);
8285        if (r != 0) {
8286          do_shutdown = true;  // FIXME: do_restart?
8287          network_error = true;
8288          dout(0) << __func__ << " marked down:"
8289		  << " rebind cluster_messenger failed" << dendl;
8290        }
8291
8292        r = hb_back_server_messenger->rebind(avoid_ports);
8293        if (r != 0) {
8294          do_shutdown = true;  // FIXME: do_restart?
8295          network_error = true;
8296          dout(0) << __func__ << " marked down:"
8297		  << " rebind hb_back_server_messenger failed" << dendl;
8298        }
8299
8300        r = hb_front_server_messenger->rebind(avoid_ports);
8301        if (r != 0) {
8302          do_shutdown = true;  // FIXME: do_restart?
8303          network_error = true;
8304          dout(0) << __func__ << " marked down:"
8305		  << " rebind hb_front_server_messenger failed" << dendl;
8306        }
8307
8308        hb_front_client_messenger->mark_down_all();
8309        hb_back_client_messenger->mark_down_all();
8310
8311        reset_heartbeat_peers();
8312      }
8313    }
8314  }
8315
8316  map_lock.put_write();
8317
8318  check_osdmap_features(store);
8319
8320  // yay!
8321  consume_map();
8322
8323  if (is_active() || is_waiting_for_healthy())
8324    maybe_update_heartbeat_peers();
8325
8326  if (!is_active()) {
8327    dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
8328    peering_wq.drain();
8329  } else {
8330    activate_map();
8331  }
8332
31f18b77 8333  if (do_shutdown) {
7c673cae
FG
8334    if (network_error) {
      // cancel any in-flight failure reports, since with our network
      // broken we can no longer judge peers' liveness
8335      Mutex::Locker l(heartbeat_lock);
8336      map<int,pair<utime_t,entity_inst_t>>::iterator it =
8337	failure_pending.begin();
8338      while (it != failure_pending.end()) {
8339        dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
8340                 << it->first << dendl;
8341        send_still_alive(osdmap->get_epoch(), it->second.second);
8342        failure_pending.erase(it++);
8343      }
8344    }
8345    // trigger shutdown in a different thread
8346    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8347    queue_async_signal(SIGINT);
8348  }
31f18b77
FG
8349  else if (m->newest_map && m->newest_map > last) {
8350    dout(10) << " msg say newest map is " << m->newest_map
8351	     << ", requesting more" << dendl;
8352    osdmap_subscribe(osdmap->get_epoch()+1, false);
8353  }
7c673cae
FG
8354  else if (is_preboot()) {
8355    if (m->get_source().is_mon())
8356      _preboot(m->oldest_map, m->newest_map);
8357    else
8358      start_boot();
8359  }
8360  else if (do_restart)
8361    start_boot();
8362
8363}
8364
// Adjust messenger required-feature bits (for clients, mons, and peer osds)
// to match what the current osdmap/crush map demands, and persist the
// SHARDS on-disk compat feature once erasure coding is required.
8365void OSD::check_osdmap_features(ObjectStore *fs)
8366{
8367  // adjust required feature bits?
8368
8369  // we have to be a bit careful here, because we are accessing the
8370  // Policy structures without taking any lock.  in particular, only
8371  // modify integer values that can safely be read by a racing CPU.
8372  // since we are only accessing existing Policy structures a their
8373  // current memory location, and setting or clearing bits in integer
8374  // fields, and we are the only writer, this is not a problem.
8375
8376  {
8377    Messenger::Policy p = client_messenger->get_default_policy();
8378    uint64_t mask;
8379    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8380    if ((p.features_required & mask) != features) {
8381      dout(0) << "crush map has features " << features
8382	      << ", adjusting msgr requires for clients" << dendl;
8383      p.features_required = (p.features_required & ~mask) | features;
8384      client_messenger->set_default_policy(p);
8385    }
8386  }
8387  {
    // mon policy lives on the client messenger but is mon-type specific
8388    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8389    uint64_t mask;
8390    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8391    if ((p.features_required & mask) != features) {
8392      dout(0) << "crush map has features " << features
8393	      << " was " << p.features_required
8394	      << ", adjusting msgr requires for mons" << dendl;
8395      p.features_required = (p.features_required & ~mask) | features;
8396      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8397    }
8398  }
8399  {
8400    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8401    uint64_t mask;
8402    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8403
8404    if ((p.features_required & mask) != features) {
8405      dout(0) << "crush map has features " << features
8406	      << ", adjusting msgr requires for osds" << dendl;
8407      p.features_required = (p.features_required & ~mask) | features;
8408      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8409    }
8410
    // one-time on-disk upgrade: record the SHARDS incompat feature when the
    // map first requires erasure-code support
8411    if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
8412	!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8413      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8414      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8415      ObjectStore::Transaction t;
8416      write_superblock(t);
8417      int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
8418      assert(err == 0);
8419    }
8420  }
8421}
8422
// Advance a single (locked) PG from its current map epoch toward osd_epoch,
// feeding it each intermediate map and performing any splits encountered.
// Advancement is bounded by osd_map_max_advance past the cluster's minimum
// PG epoch; returns true when the PG reached osd_epoch, false when it hit
// the bound and must be requeued to continue later.
8423bool OSD::advance_pg(
8424  epoch_t osd_epoch, PG *pg,
8425  ThreadPool::TPHandle &handle,
8426  PG::RecoveryCtx *rctx,
31f18b77 8427  set<PGRef> *new_pgs)
7c673cae
FG
8428{
8429  assert(pg->is_locked());
8430  epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
8431  OSDMapRef lastmap = pg->get_osdmap();
8432
8433  if (lastmap->get_epoch() == osd_epoch)
8434    return true;
8435  assert(lastmap->get_epoch() < osd_epoch);
8436
  // cap how far past the slowest PG we are allowed to advance in one pass
8437  epoch_t min_epoch = service.get_min_pg_epoch();
8438  epoch_t max;
8439  if (min_epoch) {
8440    max = min_epoch + cct->_conf->osd_map_max_advance;
8441  } else {
8442    max = next_epoch + cct->_conf->osd_map_max_advance;
8443  }
8444
8445  for (;
8446       next_epoch <= osd_epoch && next_epoch <= max;
8447       ++next_epoch) {
8448    OSDMapRef nextmap = service.try_get_map(next_epoch);
8449    if (!nextmap) {
8450      dout(20) << __func__ << " missing map " << next_epoch << dendl;
8451      // make sure max is bumped up so that we can get past any
8452      // gap in maps
8453      max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
8454      continue;
8455    }
8456
8457    vector<int> newup, newacting;
8458    int up_primary, acting_primary;
8459    nextmap->pg_to_up_acting_osds(
8460      pg->info.pgid.pgid,
8461      &newup, &up_primary,
8462      &newacting, &acting_primary);
8463    pg->handle_advance_map(
8464      nextmap, lastmap, newup, up_primary,
8465      newacting, acting_primary, rctx);
8466
8467    // Check for split!
8468    set<spg_t> children;
8469    spg_t parent(pg->info.pgid);
8470    if (parent.is_split(
8471	lastmap->get_pg_num(pg->pool.id),
8472	nextmap->get_pg_num(pg->pool.id),
8473	&children)) {
8474      service.mark_split_in_progress(pg->info.pgid, children);
8475      split_pgs(
8476	pg, children, new_pgs, lastmap, nextmap,
8477	rctx);
8478    }
8479
8480    lastmap = nextmap;
8481    handle.reset_tp_timeout();
8482  }
8483  service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
8484  pg->handle_activate_map(rctx);
8485  if (next_epoch <= osd_epoch) {
8486    dout(10) << __func__ << " advanced to max " << max
8487	     << " past min epoch " << min_epoch
8488	     << " ... will requeue " << *pg << dendl;
8489    return false;
8490  }
8491  return true;
8492}
8493
// Publish the freshly advanced osdmap to the rest of the OSD: update PG
// role counters, drop PGs whose pool vanished, prune stale pending-create
// and waiting-op state, and queue a null event on every PG so it notices
// the new epoch.
8494void OSD::consume_map()
8495{
8496  assert(osd_lock.is_locked());
8497  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8498
3efd9988
FG
8499  /** make sure the cluster is speaking in SORTBITWISE, because we don't
8500   * speak the older sorting version any more. Be careful not to force
8501   * a shutdown if we are merely processing old maps, though.
8502   */
8503  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8504    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8505    ceph_abort();
8506  }
8507
7c673cae
FG
8508  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8509  list<PGRef> to_remove;
8510
8511  // scan pg's
8512  {
8513    RWLock::RLocker l(pg_map_lock);
8514    for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8515        it != pg_map.end();
8516        ++it) {
8517      PG *pg = it->second;
8518      pg->lock();
8519      if (pg->is_primary())
8520        num_pg_primary++;
8521      else if (pg->is_replica())
8522        num_pg_replica++;
8523      else
8524        num_pg_stray++;
8525
8526      if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
8527        //pool is deleted!
8528        to_remove.push_back(PGRef(pg));
8529      } else {
8530        service.init_splits_between(it->first, service.get_osdmap(), osdmap);
8531      }
8532
8533      pg->unlock();
8534    }
3efd9988
FG
8535
    // drop pending osd-requested creations for PGs we no longer act for
8536    lock_guard<mutex> pending_creates_locker{pending_creates_lock};
8537    for (auto pg = pending_creates_from_osd.cbegin();
8538	 pg != pending_creates_from_osd.cend();) {
b32b8144 8539      if (osdmap->get_pg_acting_rank(pg->first, whoami) < 0) {
3efd9988
FG
8540	pg = pending_creates_from_osd.erase(pg);
8541      } else {
8542	++pg;
8543      }
8544    }
7c673cae
FG
8545  }
8546
  // remove PGs belonging to deleted pools (needs the write lock)
8547  for (list<PGRef>::iterator i = to_remove.begin();
8548       i != to_remove.end();
8549       to_remove.erase(i++)) {
8550    RWLock::WLocker locker(pg_map_lock);
8551    (*i)->lock();
8552    _remove_pg(&**i);
8553    (*i)->unlock();
8554  }
8555
8556  service.expand_pg_num(service.get_osdmap(), osdmap);
8557
8558  service.pre_publish_map(osdmap);
8559  service.await_reserved_maps();
8560  service.publish_map(osdmap);
8561
8562  service.maybe_inject_dispatch_delay();
8563
8564  dispatch_sessions_waiting_on_map();
8565
8566  service.maybe_inject_dispatch_delay();
8567
8568  // remove any PGs which we no longer host from the session waiting_for_pg lists
8569  dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8570  op_shardedwq.prune_pg_waiters(osdmap, whoami);
8571
8572  service.maybe_inject_dispatch_delay();
8573
8574  // scan pg's
8575  {
8576    RWLock::RLocker l(pg_map_lock);
8577    for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8578        it != pg_map.end();
8579        ++it) {
8580      PG *pg = it->second;
8581      pg->lock();
      // null event: wakes the PG so it observes the new map epoch
8582      pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8583      pg->unlock();
8584    }
8585
8586    logger->set(l_osd_pg, pg_map.size());
8587  }
8588  logger->set(l_osd_pg_primary, num_pg_primary);
8589  logger->set(l_osd_pg_replica, num_pg_replica);
8590  logger->set(l_osd_pg_stray, num_pg_stray);
94b18763 8591  logger->set(l_osd_pg_removing, remove_wq.get_remove_queue_len());
7c673cae
FG
8592}
8593
8594void OSD::activate_map()
8595{
8596 assert(osd_lock.is_locked());
8597
8598 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8599
7c673cae
FG
8600 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8601 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8602 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8603 }
8604
8605 // norecover?
8606 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8607 if (!service.recovery_is_paused()) {
8608 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8609 service.pause_recovery();
8610 }
8611 } else {
8612 if (service.recovery_is_paused()) {
8613 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8614 service.unpause_recovery();
8615 }
8616 }
8617
8618 service.activate_map();
8619
8620 // process waiters
8621 take_waiters(waiting_for_osdmap);
8622}
8623
8624bool OSD::require_mon_peer(const Message *m)
8625{
8626 if (!m->get_connection()->peer_is_mon()) {
8627 dout(0) << "require_mon_peer received from non-mon "
8628 << m->get_connection()->get_peer_addr()
8629 << " " << *m << dendl;
8630 return false;
8631 }
8632 return true;
8633}
8634
8635bool OSD::require_mon_or_mgr_peer(const Message *m)
8636{
8637 if (!m->get_connection()->peer_is_mon() &&
8638 !m->get_connection()->peer_is_mgr()) {
8639 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8640 << m->get_connection()->get_peer_addr()
8641 << " " << *m << dendl;
8642 return false;
8643 }
8644 return true;
8645}
8646
8647bool OSD::require_osd_peer(const Message *m)
8648{
8649 if (!m->get_connection()->peer_is_osd()) {
8650 dout(0) << "require_osd_peer received from non-osd "
8651 << m->get_connection()->get_peer_addr()
8652 << " " << *m << dendl;
8653 return false;
8654 }
8655 return true;
8656}
8657
8658bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8659{
8660 epoch_t up_epoch = service.get_up_epoch();
8661 if (epoch < up_epoch) {
8662 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8663 return false;
8664 }
8665
8666 if (!is_active()) {
8667 dout(7) << "still in boot state, dropping message " << *m << dendl;
8668 return false;
8669 }
8670
8671 return true;
8672}
8673
// Verify that the sending OSD is still the same instance as the one in
// 'map' (up, and at the cluster address we expect).  If not, the peer has
// died or restarted: mark the connection down, detach its session, and
// reject the message.  'is_fast_dispatch' controls whether we may take the
// session dispatch lock (fast dispatch paths must not block on it).
8674bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8675				     bool is_fast_dispatch)
8676{
8677  int from = m->get_source().num();
8678
8679  if (map->is_down(from) ||
8680      (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8681    dout(5) << "from dead osd." << from << ", marking down, "
8682	    << " msg was " << m->get_source_inst().addr
8683	    << " expected " << (map->is_up(from) ?
8684				map->get_cluster_addr(from) : entity_addr_t())
8685	    << dendl;
8686    ConnectionRef con = m->get_connection();
8687    con->mark_down();
8688    Session *s = static_cast<Session*>(con->get_priv());
8689    if (s) {
8690      if (!is_fast_dispatch)
8691	s->session_dispatch_lock.Lock();
8692      clear_session_waiting_on_map(s);
8693      con->set_priv(NULL);   // break ref <-> session cycle, if any
8694      if (!is_fast_dispatch)
8695	s->session_dispatch_lock.Unlock();
      // drop the reference get_priv() took
8696      s->put();
8697    }
8698    return false;
8699  }
8700  return true;
8701}
8702
8703
/*
 * require that we have same (or newer) map, and that
 * the source is the pg primary.
 */
bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
				    bool is_fast_dispatch)
{
  const Message *m = op->get_req();
  dout(15) << "require_same_or_newer_map " << epoch
	   << " (i am " << osdmap->get_epoch() << ") " << m << dendl;

  assert(osd_lock.is_locked());

  // do they have a newer map?
  if (epoch > osdmap->get_epoch()) {
    dout(7) << "waiting for newer map epoch " << epoch
	    << " > my " << osdmap->get_epoch() << " with " << m << dendl;
    // park the op; it is requeued once the newer map arrives
    wait_for_new_map(op);
    return false;
  }

  // drop if the epoch predates our up epoch or we are not active yet
  if (!require_self_aliveness(op->get_req(), epoch)) {
    return false;
  }

  // ok, our map is same or newer.. do they still exist?
  // (peer-instance check only applies to cluster-internal traffic)
  if (m->get_connection()->get_messenger() == cluster_messenger &&
      !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
    return false;
  }

  return true;
}
8737
8738
8739
8740
8741
8742// ----------------------------------------
8743// pg creation
8744
/*
 * Split @parent into the given child pgids under the next map.
 *
 * Stats are divided via object_stat_sum_t::split() into one slice per
 * child plus a final slice the parent keeps.  Each child is created,
 * locked, registered in @out_pgs and @rctx->created_pgs, has its on-disk
 * collection split off and its in-memory state copied, then is unlocked.
 */
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<PGRef> *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PG::RecoveryCtx *rctx)
{
  unsigned pg_num = nextmap->get_pg_num(
    parent->pool.id);
  parent->update_snap_mapper_bits(
    parent->info.pgid.get_split_bits(pg_num)
    );

  // one stat slice per child + one for the parent (the last entry)
  vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
  parent->info.stats.stats.sum.split(updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    assert(stat_iter != updated_stats.end());
    dout(10) << "Splitting " << *parent << " into " << *i << dendl;
    assert(service.splitting(*i));
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    rctx->created_pgs.insert(child);

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << "pg_num is " << pg_num << dendl;
    dout(10) << "m_seed " << i->ps() << dendl;
    dout(10) << "split_bits is " << split_bits << dendl;

    // split the on-disk collection first, then the in-memory PG state
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->pool.info,
      rctx->transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);
    child->info.stats.stats.sum = *stat_iter;

    child->write_if_dirty(*(rctx->transaction));
    child->unlock();
  }
  // the remaining slice is what the parent keeps
  assert(stat_iter != updated_stats.end());
  parent->info.stats.stats.sum = *stat_iter;
  parent->write_if_dirty(*(rctx->transaction));
}
8797
/*
 * Handle an MOSDPGCreate from the monitor: instantiate any pgs we are
 * acting primary for, skipping split/localized/deleted-pool entries.
 *
 * holding osd_lock
 */
void OSD::handle_pg_create(OpRequestRef op)
{
  const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  // only the monitor may instruct us to create pgs
  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  // mkpg and ctimes are parallel collections: one creation time per pg
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (on.preferred() >= 0) {
      dout(20) << "ignoring localized pg " << on << dendl;
      continue;
    }

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(whoami, acting, acting.size());

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
	       << "), my role=" << role << ", skipping" << dendl;
      continue;
    }

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    assert(mapped);

    PastIntervals pi(
      osdmap->get_pools().at(pgid.pool()).ec_pool(),
      *osdmap);
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so
    // we ignore same_interval_since.  We'll pass this history
    // to handle_pg_peering_evt with the current epoch as the
    // event -- the project_pg_history check in
    // handle_pg_peering_evt will be a noop.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
	       << pgid << " from epoch " << m->epoch
	       << ", primary changed in " << history.same_primary_since
	       << dendl;
      continue;
    }
    // -EEXIST means the pg already exists; still ack creation to the mon
    if (handle_pg_peering_evt(
	  pgid,
	  history,
	  pi,
	  osdmap->get_epoch(),
	  PG::CephPeeringEvtRef(
	    new PG::CephPeeringEvt(
	      osdmap->get_epoch(),
	      osdmap->get_epoch(),
	      PG::NullEvt()))
	  ) == -EEXIST) {
      service.send_pg_created(pgid.pgid);
    }
  }

  {
    lock_guard<mutex> pending_creates_locker{pending_creates_lock};
    if (pending_creates_from_mon == 0) {
      last_pg_create_epoch = m->epoch;
    }
  }

  maybe_update_heartbeat_peers();
}
8897
8898
8899// ----------------------------------------
8900// peering and recovery
8901
8902PG::RecoveryCtx OSD::create_context()
8903{
8904 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8905 C_Contexts *on_applied = new C_Contexts(cct);
8906 C_Contexts *on_safe = new C_Contexts(cct);
8907 map<int, map<spg_t,pg_query_t> > *query_map =
8908 new map<int, map<spg_t, pg_query_t> >;
8909 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8910 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8911 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8912 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8913 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8914 on_applied, on_safe, t);
8915 return rctx;
8916}
8917
8918struct C_OpenPGs : public Context {
8919 set<PGRef> pgs;
8920 ObjectStore *store;
8921 OSD *osd;
8922 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8923 pgs.swap(p);
8924 }
8925 void finish(int r) override {
8926 RWLock::RLocker l(osd->pg_map_lock);
8927 for (auto p : pgs) {
8928 if (osd->pg_map.count(p->info.pgid)) {
8929 p->ch = store->open_collection(p->coll);
8930 assert(p->ch);
8931 }
8932 }
8933 }
8934};
8935
/*
 * Flush the accumulated transaction in @ctx to the store (on @pg's
 * sequencer), then re-arm @ctx with fresh transaction/callback objects so
 * the caller can keep accumulating work.  No-op if the transaction is empty.
 */
void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
                                       ThreadPool::TPHandle *handle)
{
  if (!ctx.transaction->empty()) {
    if (!ctx.created_pgs.empty()) {
      // open collections of newly created pgs once the txn is applied
      ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
    }
    int tr = store->queue_transaction(
      pg->osr.get(),
      std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
      TrackedOpRef(), handle);
    delete (ctx.transaction);
    assert(tr == 0);
    // replace the consumed objects; on_applied/on_safe were handed to the store
    ctx.transaction = new ObjectStore::Transaction;
    ctx.on_applied = new C_Contexts(cct);
    ctx.on_safe = new C_Contexts(cct);
  }
}
8954
/*
 * Final dispatch of a RecoveryCtx: send any pending notifies/queries/infos
 * (only while we are up and active), then either discard the empty
 * transaction and callbacks, or queue them to the store on @pg's sequencer.
 * Consumes and frees the ctx's heap members either way.
 */
void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
                           ThreadPool::TPHandle *handle)
{
  if (service.get_osdmap()->is_up(whoami) &&
      is_active()) {
    do_notifies(*ctx.notify_list, curmap);
    do_queries(*ctx.query_map, curmap);
    do_infos(*ctx.info_map, curmap);
  }
  delete ctx.notify_list;
  delete ctx.query_map;
  delete ctx.info_map;
  if ((ctx.on_applied->empty() &&
       ctx.on_safe->empty() &&
       ctx.transaction->empty() &&
       ctx.created_pgs.empty()) || !pg) {
    // nothing to queue (or no pg to queue it on): free everything
    delete ctx.transaction;
    delete ctx.on_applied;
    delete ctx.on_safe;
    assert(ctx.created_pgs.empty());
  } else {
    if (!ctx.created_pgs.empty()) {
      // open collections of newly created pgs once the txn is applied
      ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
    }
    int tr = store->queue_transaction(
      pg->osr.get(),
      std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
      handle);
    delete (ctx.transaction);
    assert(tr == 0);
  }
}
8987
/** do_notifies
 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
 * content for, and they are primary for.
 */

void OSD::do_notifies(
  map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
  OSDMapRef curmap)
{
  for (map<int,
	   vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
	 notify_list.begin();
       it != notify_list.end();
       ++it) {
    // skip targets that are down or unreachable in the current map
    if (!curmap->is_up(it->first)) {
      dout(20) << __func__ << " skipping down osd." << it->first << dendl;
      continue;
    }
    ConnectionRef con = service.get_con_osd_cluster(
      it->first, curmap->get_epoch());
    if (!con) {
      dout(20) << __func__ << " skipping osd." << it->first
	       << " (NULL con)" << dendl;
      continue;
    }
    // make sure the peer has a map at least as new as ours first
    service.share_map_peer(it->first, con.get(), curmap);
    dout(7) << __func__ << " osd." << it->first
	    << " on " << it->second.size() << " PGs" << dendl;
    MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
				       it->second);
    con->send_message(m);
  }
}
9021
9022
9023/** do_queries
9024 * send out pending queries for info | summaries
9025 */
9026void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
9027 OSDMapRef curmap)
9028{
9029 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
9030 pit != query_map.end();
9031 ++pit) {
9032 if (!curmap->is_up(pit->first)) {
9033 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
9034 continue;
9035 }
9036 int who = pit->first;
9037 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
9038 if (!con) {
9039 dout(20) << __func__ << " skipping osd." << who
9040 << " (NULL con)" << dendl;
9041 continue;
9042 }
9043 service.share_map_peer(who, con.get(), curmap);
9044 dout(7) << __func__ << " querying osd." << who
9045 << " on " << pit->second.size() << " PGs" << dendl;
9046 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
9047 con->send_message(m);
9048 }
9049}
9050
9051
/*
 * Send batched MOSDPGInfo messages to peer OSDs.  Skips peers that are
 * down or unreachable; clears @info_map when done.
 */
void OSD::do_infos(map<int,
		       vector<pair<pg_notify_t, PastIntervals> > >& info_map,
		   OSDMapRef curmap)
{
  for (map<int,
	   vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
	 info_map.begin();
       p != info_map.end();
       ++p) {
    if (!curmap->is_up(p->first)) {
      dout(20) << __func__ << " skipping down osd." << p->first << dendl;
      continue;
    }
    // debug-trace each info being sent to this shard
    for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
	 i != p->second.end();
	 ++i) {
      dout(20) << __func__ << " sending info " << i->first.info
	       << " to shard " << p->first << dendl;
    }
    ConnectionRef con = service.get_con_osd_cluster(
      p->first, curmap->get_epoch());
    if (!con) {
      dout(20) << __func__ << " skipping osd." << p->first
	       << " (NULL con)" << dendl;
      continue;
    }
    // make sure the peer has a map at least as new as ours first
    service.share_map_peer(p->first, con.get(), curmap);
    MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
    m->pg_list = p->second;
    con->send_message(m);
  }
  info_map.clear();
}
9085
9086
/** PGNotify
 * from non-primary to primary
 * includes pg_info_t.
 * NOTE: called with opqueue active.
 */
void OSD::handle_pg_notify(OpRequestRef op)
{
  const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_NOTIFY);

  dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
  int from = m->get_source().num();

  if (!require_osd_peer(op->get_req()))
    return;

  if (!require_same_or_newer_map(op, m->get_epoch(), false))
    return;

  op->mark_started();

  // feed each notify into the pg's peering state machine as an MNotifyRec
  for (auto it = m->get_pg_list().begin();
       it != m->get_pg_list().end();
       ++it) {
    if (it->first.info.pgid.preferred() >= 0) {
      dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
      continue;
    }

    handle_pg_peering_evt(
      spg_t(it->first.info.pgid.pgid, it->first.to),
      it->first.info.history, it->second,
      it->first.query_epoch,
      PG::CephPeeringEvtRef(
	new PG::CephPeeringEvt(
	  it->first.epoch_sent, it->first.query_epoch,
	  PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
	  op->get_req()->get_connection()->get_features())))
      );
  }
}
9128
/*
 * Handle an MOSDPGLog from a peer OSD: feed it into the target pg's
 * peering state machine as an MLogRec event.
 */
void OSD::handle_pg_log(OpRequestRef op)
{
  MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
  assert(m->get_type() == MSG_OSD_PG_LOG);
  dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;

  if (!require_osd_peer(op->get_req()))
    return;

  int from = m->get_source().num();
  if (!require_same_or_newer_map(op, m->get_epoch(), false))
    return;

  if (m->info.pgid.preferred() >= 0) {
    dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
    return;
  }

  op->mark_started();
  handle_pg_peering_evt(
    spg_t(m->info.pgid.pgid, m->to),
    m->info.history, m->past_intervals, m->get_epoch(),
    PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->get_epoch(), m->get_query_epoch(),
	PG::MLogRec(pg_shard_t(from, m->from), m)))
    );
}
9157
/*
 * Handle an MOSDPGInfo from a peer OSD: feed each entry into the target
 * pg's peering state machine as an MInfoRec event.
 */
void OSD::handle_pg_info(OpRequestRef op)
{
  const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_INFO);
  dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;

  if (!require_osd_peer(op->get_req()))
    return;

  int from = m->get_source().num();
  if (!require_same_or_newer_map(op, m->get_epoch(), false))
    return;

  op->mark_started();

  for (auto p = m->pg_list.begin();
       p != m->pg_list.end();
       ++p) {
    if (p->first.info.pgid.preferred() >= 0) {
      dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
      continue;
    }

    handle_pg_peering_evt(
      spg_t(p->first.info.pgid.pgid, p->first.to),
      p->first.info.history, p->second, p->first.epoch_sent,
      PG::CephPeeringEvtRef(
	new PG::CephPeeringEvt(
	  p->first.epoch_sent, p->first.query_epoch,
	  PG::MInfoRec(
	    pg_shard_t(
	      from, p->first.from), p->first.info, p->first.epoch_sent)))
      );
  }
}
9193
/*
 * Handle an MOSDPGTrim.  On the primary this records the replica's
 * last_complete_ondisk; on a replica it trims the local pg log as the
 * primary instructed.
 */
void OSD::handle_pg_trim(OpRequestRef op)
{
  const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_TRIM);

  dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;

  if (!require_osd_peer(op->get_req()))
    return;

  int from = m->get_source().num();
  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  if (m->pgid.preferred() >= 0) {
    dout(10) << "ignoring localized pg " << m->pgid << dendl;
    return;
  }

  op->mark_started();

  PG *pg = _lookup_lock_pg(m->pgid);
  if(!pg) {
    dout(10) << " don't have pg " << m->pgid << dendl;
    return;
  }

  // stale message from a previous interval: ignore
  if (m->epoch < pg->info.history.same_interval_since) {
    dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
    pg->unlock();
    return;
  }

  if (pg->is_primary()) {
    // peer is informing us of their last_complete_ondisk
    dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
    pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
      m->trim_to;
    // trim log when the pg is recovered
    pg->calc_min_last_complete_ondisk();
  } else {
    // primary is instructing us to trim
    ObjectStore::Transaction t;
    pg->pg_log.trim(m->trim_to, pg->info);
    pg->dirty_info = true;
    pg->write_if_dirty(t);
    int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
    assert(tr == 0);
  }
  pg->unlock();
}
9245
/*
 * Translate an MBackfillReserve message into the corresponding peering
 * event (request / grant / reject) and queue it on the target pg, or
 * park it if the pg is mid-split.
 */
void OSD::handle_pg_backfill_reserve(OpRequestRef op)
{
  const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
  assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);

  if (!require_osd_peer(op->get_req()))
    return;
  if (!require_same_or_newer_map(op, m->query_epoch, false))
    return;

  PG::CephPeeringEvtRef evt;
  if (m->type == MBackfillReserve::REQUEST) {
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RequestBackfillPrio(m->priority)));
  } else if (m->type == MBackfillReserve::GRANT) {
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RemoteBackfillReserved()));
  } else if (m->type == MBackfillReserve::REJECT) {
    // NOTE: this is replica -> primary "i reject your request"
    //       and also primary -> replica "cancel my previously-granted request"
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RemoteReservationRejected()));
  } else {
    ceph_abort();
  }

  // pg is mid-split: defer the event until the split completes
  if (service.splitting(m->pgid)) {
    peering_wait_for_split[m->pgid].push_back(evt);
    return;
  }

  PG *pg = _lookup_lock_pg(m->pgid);
  if (!pg) {
    dout(10) << " don't have pg " << m->pgid << dendl;
    return;
  }

  pg->queue_peering_event(evt);
  pg->unlock();
}
9295
/*
 * Translate an MRecoveryReserve message into the corresponding peering
 * event (request / grant / release) and queue it on the target pg, or
 * park it if the pg is mid-split.
 */
void OSD::handle_pg_recovery_reserve(OpRequestRef op)
{
  const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
  assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);

  if (!require_osd_peer(op->get_req()))
    return;
  if (!require_same_or_newer_map(op, m->query_epoch, false))
    return;

  PG::CephPeeringEvtRef evt;
  if (m->type == MRecoveryReserve::REQUEST) {
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RequestRecovery()));
  } else if (m->type == MRecoveryReserve::GRANT) {
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RemoteRecoveryReserved()));
  } else if (m->type == MRecoveryReserve::RELEASE) {
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RecoveryDone()));
  } else {
    ceph_abort();
  }

  // pg is mid-split: defer the event until the split completes
  if (service.splitting(m->pgid)) {
    peering_wait_for_split[m->pgid].push_back(evt);
    return;
  }

  PG *pg = _lookup_lock_pg(m->pgid);
  if (!pg) {
    dout(10) << " don't have pg " << m->pgid << dendl;
    return;
  }

  pg->queue_peering_event(evt);
  pg->unlock();
}
9343
c07f9fc5
FG
/*
 * Handle MOSDForceRecovery: collect the locally-hosted pgs named in the
 * message (under pg_map_lock) and apply the requested force-recovery /
 * force-backfill flag changes to them.  Consumes the message ref.
 */
void OSD::handle_force_recovery(Message *m)
{
  MOSDForceRecovery *msg = static_cast<MOSDForceRecovery*>(m);
  assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY);

  vector<PGRef> local_pgs;
  local_pgs.reserve(msg->forced_pgs.size());

  {
    RWLock::RLocker l(pg_map_lock);
    for (auto& i : msg->forced_pgs) {
      spg_t locpg;
      // only pgs we are primary shard for, and which we actually host
      if (osdmap->get_primary_shard(i, &locpg)) {
	auto pg_map_entry = pg_map.find(locpg);
	if (pg_map_entry != pg_map.end()) {
	  local_pgs.push_back(pg_map_entry->second);
	}
      }
    }
  }

  if (local_pgs.size()) {
    service.adjust_pg_priorities(local_pgs, msg->options);
  }

  msg->put();  // drop the message ref taken by the dispatcher
}
7c673cae
FG
9371
/** PGQuery
 * from primary to replica | stray
 * NOTE: called with opqueue active.
 */
void OSD::handle_pg_query(OpRequestRef op)
{
  assert(osd_lock.is_locked());

  const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_QUERY);

  if (!require_osd_peer(op->get_req()))
    return;

  dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
  int from = m->get_source().num();

  if (!require_same_or_newer_map(op, m->get_epoch(), false))
    return;

  op->mark_started();

  // replies for pgs we do not have are batched into a single notify per peer
  map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;

  for (auto it = m->pg_list.begin();
       it != m->pg_list.end();
       ++it) {
    spg_t pgid = it->first;

    if (pgid.preferred() >= 0) {
      dout(10) << "ignoring localized pg " << pgid << dendl;
      continue;
    }

    // pg is mid-split: park the query until the split completes
    if (service.splitting(pgid)) {
      peering_wait_for_split[pgid].push_back(
	PG::CephPeeringEvtRef(
	  new PG::CephPeeringEvt(
	    it->second.epoch_sent, it->second.epoch_sent,
	    PG::MQuery(pg_shard_t(from, it->second.from),
		       it->second, it->second.epoch_sent))));
      continue;
    }

    // if we host the pg, hand the query to it directly
    {
      RWLock::RLocker l(pg_map_lock);
      if (pg_map.count(pgid)) {
	PG *pg = 0;
	pg = _lookup_lock_pg_with_map_lock_held(pgid);
	pg->queue_query(
	  it->second.epoch_sent, it->second.epoch_sent,
	  pg_shard_t(from, it->second.from), it->second);
	pg->unlock();
	continue;
      }
    }

    if (!osdmap->have_pg_pool(pgid.pool()))
      continue;

    // get active crush mapping
    int up_primary, acting_primary;
    vector<int> up, acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &up, &up_primary, &acting, &acting_primary);

    // same primary?
    pg_history_t history = it->second.history;
    bool valid_history = project_pg_history(
      pgid, history, it->second.epoch_sent,
      up, up_primary, acting, acting_primary);

    if (!valid_history ||
	it->second.epoch_sent < history.same_interval_since) {
      dout(10) << " pg " << pgid << " dne, and pg has changed in "
	       << history.same_interval_since
	       << " (msg from " << it->second.epoch_sent << ")" << dendl;
      continue;
    }

    // pg does not exist here; answer with an empty info
    dout(10) << " pg " << pgid << " dne" << dendl;
    pg_info_t empty(spg_t(pgid.pgid, it->second.to));
    /* This is racy, but that should be ok: if we complete the deletion
     * before the pg is recreated, we'll just start it off backfilling
     * instead of just empty */
    if (service.deleting_pgs.lookup(pgid))
      empty.set_last_backfill(hobject_t());
    if (it->second.type == pg_query_t::LOG ||
	it->second.type == pg_query_t::FULLLOG) {
      ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
      if (con) {
	MOSDPGLog *mlog = new MOSDPGLog(
	  it->second.from, it->second.to,
	  osdmap->get_epoch(), empty,
	  it->second.epoch_sent);
	service.share_map_peer(from, con.get(), osdmap);
	con->send_message(mlog);
      }
    } else {
      notify_list[from].push_back(
	make_pair(
	  pg_notify_t(
	    it->second.from, it->second.to,
	    it->second.epoch_sent,
	    osdmap->get_epoch(),
	    empty),
	  PastIntervals(
	    osdmap->get_pools().at(pgid.pool()).ec_pool(),
	    *osdmap)));
    }
  }
  do_notifies(notify_list, osdmap);
}
9485
9486
/*
 * Handle MOSDPGRemove from the (former) primary: queue local deletion of
 * each listed pg, but only if the pg's interval has not changed since the
 * message epoch (otherwise the removal is stale and is ignored).
 */
void OSD::handle_pg_remove(OpRequestRef op)
{
  const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_REMOVE);
  assert(osd_lock.is_locked());

  if (!require_osd_peer(op->get_req()))
    return;

  dout(7) << "handle_pg_remove from " << m->get_source() << " on "
	  << m->pg_list.size() << " pgs" << dendl;

  if (!require_same_or_newer_map(op, m->get_epoch(), false))
    return;

  op->mark_started();

  for (auto it = m->pg_list.begin();
       it != m->pg_list.end();
       ++it) {
    spg_t pgid = *it;
    if (pgid.preferred() >= 0) {
      dout(10) << "ignoring localized pg " << pgid << dendl;
      continue;
    }

    RWLock::WLocker l(pg_map_lock);
    if (pg_map.count(pgid) == 0) {
      dout(10) << " don't have pg " << pgid << dendl;
      continue;
    }
    dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
    PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
    pg_history_t history = pg->info.history;
    int up_primary, acting_primary;
    vector<int> up, acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &up, &up_primary, &acting, &acting_primary);
    // project history forward to verify the interval is unchanged
    bool valid_history = project_pg_history(
      pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
      up, up_primary, acting, acting_primary);
    if (valid_history &&
	history.same_interval_since <= m->get_epoch()) {
      assert(pg->get_primary().osd == m->get_source().num());
      PGRef _pg(pg);   // keep the pg alive across _remove_pg
      _remove_pg(pg);
      pg->unlock();
    } else {
      dout(10) << *pg << " ignoring remove request, pg changed in epoch "
	       << history.same_interval_since
	       << " > " << m->get_epoch() << dendl;
      pg->unlock();
    }
  }
}
9542
/*
 * Tear down a local pg: run its on_removal transaction, hand the actual
 * data deletion to the remove_wq background queue, and deregister the pg
 * from all OSD-level structures.  Caller holds the pg lock and
 * pg_map_lock for write.
 */
void OSD::_remove_pg(PG *pg)
{
  ObjectStore::Transaction rmt ;

  // on_removal, which calls remove_watchers_and_notifies, and the erasure from
  // the pg_map must be done together without unlocking the pg lock,
  // to avoid racing with watcher cleanup in ms_handle_reset
  // and handle_notify_timeout
  pg->on_removal(&rmt);

  service.cancel_pending_splits_for_parent(pg->info.pgid);
  // keep the sequencer alive until the transaction completes
  int tr = store->queue_transaction(
    pg->osr.get(), std::move(rmt), NULL, 
    new ContainerContext<
      SequencerRef>(pg->osr));
  assert(tr == 0);

  // queue the bulk object deletion to the background removal workqueue
  DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
    pg->info.pgid,
    make_pair(
      pg->info.pgid,
      PGRef(pg))
    );
  remove_wq.queue(make_pair(PGRef(pg), deleting));

  service.pg_remove_epoch(pg->info.pgid);

  // dereference from op_wq
  op_shardedwq.clear_pg_pointer(pg->info.pgid);

  // remove from map
  pg_map.erase(pg->info.pgid);
  pg->put("PGMap"); // since we've taken it out of map
}
9577
7c673cae
FG
9578// =========================================================
9579// RECOVERY
9580
/*
 * Drain the recovery throttle queue: while the push budget allows
 * (_recover_now), start queued pgs, reserving up to
 * osd_recovery_max_single_start pushes each.  Caller holds recovery_lock.
 */
void OSDService::_maybe_queue_recovery() {
  assert(recovery_lock.is_locked_by_me());
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
	 _recover_now(&available_pushes)) {
    uint64_t to_start = MIN(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    recovery_ops_reserved += to_start;
  }
}
9594
9595bool OSDService::_recover_now(uint64_t *available_pushes)
9596{
9597 if (available_pushes)
9598 *available_pushes = 0;
9599
9600 if (ceph_clock_now() < defer_recovery_until) {
9601 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9602 return false;
9603 }
9604
9605 if (recovery_paused) {
9606 dout(15) << __func__ << " paused" << dendl;
9607 return false;
9608 }
9609
9610 uint64_t max = cct->_conf->osd_recovery_max_active;
9611 if (max <= recovery_ops_active + recovery_ops_reserved) {
9612 dout(15) << __func__ << " active " << recovery_ops_active
9613 << " + reserved " << recovery_ops_reserved
9614 << " >= max " << max << dendl;
9615 return false;
9616 }
9617
9618 if (available_pushes)
9619 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9620
9621 return true;
9622}
9623
c07f9fc5 9624
/*
 * Apply force-backfill or force-recovery flag changes (per @newflags /
 * OFR_* bits) to the given pgs.  OFR_CANCEL clears the flag instead of
 * setting it.  Logs which pgs actually changed state.
 */
void OSDService::adjust_pg_priorities(const vector<PGRef>& pgs, int newflags)
{
  // nothing to do without pgs or without a backfill/recovery bit set
  if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY))) {
    return;
  }
  set<spg_t> did;
  if (newflags & OFR_BACKFILL) {
    for (auto& pg : pgs) {
      if (pg->set_force_backfill(!(newflags & OFR_CANCEL))) {
	did.insert(pg->pg_id);
      }
    }
  } else if (newflags & OFR_RECOVERY) {
    for (auto& pg : pgs) {
      if (pg->set_force_recovery(!(newflags & OFR_CANCEL))) {
	did.insert(pg->pg_id);
      }
    }
  }
  if (did.empty()) {
    dout(10) << __func__ << " " << ((newflags & OFR_CANCEL) ? "cleared" : "set")
	     << " force_" << ((newflags & OFR_BACKFILL) ? "backfill" : "recovery")
	     << " on no pgs" << dendl;
  } else {
    dout(10) << __func__ << " " << ((newflags & OFR_CANCEL) ? "cleared" : "set")
	     << " force_" << ((newflags & OFR_BACKFILL) ? "backfill" : "recovery")
	     << " on " << did << dendl;
  }
}
9654
7c673cae
FG
/*
 * Run up to @reserved_pushes recovery operations on @pg.  If
 * osd_recovery_sleep is configured and a sleep is pending, the work is
 * instead rescheduled via a timer callback and this call returns without
 * doing anything.  Reserved pushes are always released on exit.
 */
void OSD::do_recovery(
  PG *pg, epoch_t queued, uint64_t reserved_pushes,
  ThreadPool::TPHandle &handle)
{
  uint64_t started = 0;

  /*
   * When the value of osd_recovery_sleep is set greater than zero, recovery
   * ops are scheduled after osd_recovery_sleep amount of time from the previous
   * recovery event's schedule time. This is done by adding a
   * recovery_requeue_callback event, which re-queues the recovery op using
   * queue_recovery_after_sleep.
   */
  float recovery_sleep = get_osd_recovery_sleep();
  {
    Mutex::Locker l(service.recovery_sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
      // timer callback: clear the sleep flag and requeue this same work
      auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
	         << ", re-queuing recovery" << dendl;
	Mutex::Locker l(service.recovery_sleep_lock);
        service.recovery_needs_sleep = false;
        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
      });

      // This is true for the first recovery op and when the previous recovery op
      // has been scheduled in the past. The next recovery op is scheduled after
      // completing the sleep from now.
      if (service.recovery_schedule_time < ceph_clock_now()) {
        service.recovery_schedule_time = ceph_clock_now();
      }
      service.recovery_schedule_time += recovery_sleep;
      service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
	                                        recovery_requeue_callback);
      dout(20) << "Recovery event scheduled at "
               << service.recovery_schedule_time << dendl;
      return;
    }
  }

  {
    {
      Mutex::Locker l(service.recovery_sleep_lock);
      // next invocation must observe the configured sleep again
      service.recovery_needs_sleep = true;
    }

    // pg was reset (new interval) since this work was queued: drop it
    if (pg->pg_has_reset_since(queued)) {
      goto out;
    }

    assert(!pg->deleting);
    assert(pg->is_peered() && pg->is_primary());

    assert(pg->recovery_queued);
    pg->recovery_queued = false;

    dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
#ifdef DEBUG_RECOVERY_OIDS
    dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
#endif

    bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
    dout(10) << "do_recovery started " << started << "/" << reserved_pushes
	     << " on " << *pg << dendl;

    // If no recovery op is started, don't bother to manipulate the RecoveryCtx
    if (!started && (more || !pg->have_unfound())) {
      goto out;
    }

    PG::RecoveryCtx rctx = create_context();
    rctx.handle = &handle;

    /*
     * if we couldn't start any recovery ops and things are still
     * unfound, see if we can discover more missing object locations.
     * It may be that our initial locations were bad and we errored
     * out while trying to pull.
     */
    if (!more && pg->have_unfound()) {
      pg->discover_all_missing(*rctx.query_map);
      if (rctx.query_map->empty()) {
	// nobody left to ask: defer backfill/recovery for a retry interval
	string action;
	if (pg->state_test(PG_STATE_BACKFILLING)) {
	  auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
	    queued,
	    queued,
	    PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
	  pg->queue_peering_event(evt);
	  action = "in backfill";
	} else if (pg->state_test(PG_STATE_RECOVERING)) {
	  auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
	    queued,
	    queued,
	    PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
	  pg->queue_peering_event(evt);
	  action = "in recovery";
	} else {
	  action = "already out of recovery/backfill";
	}
	dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
      } else {
	dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
	pg->queue_recovery();
      }
    }

    pg->write_if_dirty(*rctx.transaction);
    OSDMapRef curmap = pg->get_osdmap();
    dispatch_context(rctx, pg, curmap);
  }

 out:
  assert(started <= reserved_pushes);
  service.release_reserved_pushes(reserved_pushes);
}
9773
// Account the start of one recovery operation on object `soid` for `pg`.
// Bumps the global active-recovery-op counter under recovery_lock; the
// log line intentionally prints the counter *before* the increment.
// With DEBUG_RECOVERY_OIDS, also asserts the oid is not already active.
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  Mutex::Locker l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
	   << " (" << recovery_ops_active << "/"
	   << cct->_conf->osd_recovery_max_active << " rops)"
	   << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
  assert(recovery_oids[pg->info.pgid].count(soid) == 0);
  recovery_oids[pg->info.pgid].insert(soid);
#endif
}
9789
// Account the completion of one recovery operation on `soid` for `pg`.
// Decrements the active-op counter (asserting it was positive) and then
// kicks _maybe_queue_recovery() so a waiting PG can use the freed slot.
// NOTE(review): `dequeue` is only logged here, not otherwise consumed —
// presumably meaningful to callers/log readers; confirm before removing.
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  Mutex::Locker l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
	   << " dequeue=" << dequeue
	   << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
	   << dendl;

  // adjust count
  assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
  assert(recovery_oids[pg->info.pgid].count(soid));
  recovery_oids[pg->info.pgid].erase(soid);
#endif

  // a recovery slot just freed up; let the next queued PG proceed
  _maybe_queue_recovery();
}
9810
9811bool OSDService::is_recovery_active()
9812{
b5b8bbf5 9813 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9814}
9815
9816// =========================================================
9817// OPS
9818
9819bool OSD::op_is_discardable(const MOSDOp *op)
9820{
9821 // drop client request if they are not connected and can't get the
9822 // reply anyway.
9823 if (!op->get_connection()->is_connected()) {
9824 return true;
9825 }
9826 return false;
9827}
9828
// Hand an op off to the sharded op work queue for PG `pg` at map epoch
// `epoch`.  Records the receive-to-enqueue latency in the
// l_osd_op_before_queue_op_lat perf counter and annotates the op trace.
void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
{
  utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
  dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
	   << " cost " << op->get_req()->get_cost()
	   << " latency " << latency
	   << " epoch " << epoch
	   << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", op->get_req()->get_priority());
  op->osd_trace.keyval("cost", op->get_req()->get_cost());
  op->mark_queued_for_pg();
  logger->tinc(l_osd_op_before_queue_op_lat, latency);
  op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
}
9844
9845
9846
/*
 * Dispatch a dequeued op to its PG.
 *
 * NOTE: dequeue called in worker thread, with pg lock
 */
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  FUNCTRACE();
  OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);
  // receive-to-dequeue latency, fed into the perf counter below
  utime_t latency = now - op->get_req()->get_recv_stamp();
  dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
	   << " cost " << op->get_req()->get_cost()
	   << " latency " << latency
	   << " " << *(op->get_req())
	   << " pg " << *pg << dendl;

  logger->tinc(l_osd_op_before_dequeue_op_lat, latency);

  // opportunistically share a newer osdmap with the client's session
  Session *session = static_cast<Session *>(
    op->get_req()->get_connection()->get_priv());
  if (session) {
    maybe_share_map(session, op, pg->get_osdmap());
    session->put();  // drop the reference taken by get_priv()
  }

  // the PG is being removed; silently drop the op
  if (pg->deleting)
    return;

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << op << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
}
9887
9888
// Completion context queued after a PG split: for each new child PG it
// registers the PG with the OSD (add_newly_split_pg), marks the split
// complete, and wakes any ops that were parked waiting for the PG.
// Lock order here is osd_lock -> pg_map_lock -> per-PG lock.
struct C_CompleteSplits : public Context {
  OSD *osd;
  set<PGRef> pgs;   // child PGs produced by the split
  C_CompleteSplits(OSD *osd, const set<PGRef> &in)
    : osd(osd), pgs(in) {}
  void finish(int r) override {
    Mutex::Locker l(osd->osd_lock);
    if (osd->is_stopping())
      return;
    PG::RecoveryCtx rctx = osd->create_context();
    for (set<PGRef>::iterator i = pgs.begin();
	 i != pgs.end();
	 ++i) {
      osd->pg_map_lock.get_write();
      (*i)->lock();
      PG *pg = i->get();
      osd->add_newly_split_pg(pg, &rctx);
      // only mark the split complete if the child isn't being deleted
      if (!((*i)->deleting)) {
	set<spg_t> to_complete;
	to_complete.insert((*i)->info.pgid);
	osd->service.complete_split(to_complete);
      }
      osd->pg_map_lock.put_write();
      osd->dispatch_context_transaction(rctx, pg);
      osd->wake_pg_waiters(*i);
      (*i)->unlock();
    }

    osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
  }
};
9920
// Peering work-queue handler: for each PG in the batch, advance it to
// the current OSDMap and process one queued peering event.  Accumulates
// whether any PG needs an up_thru update and dispatches the shared
// RecoveryCtx at the end.
void OSD::process_peering_events(
  const list<PG*> &pgs,
  ThreadPool::TPHandle &handle
  )
{
  bool need_up_thru = false;
  epoch_t same_interval_since = 0;
  OSDMapRef curmap;
  PG::RecoveryCtx rctx = create_context();
  rctx.handle = &handle;
  for (list<PG*>::const_iterator i = pgs.begin();
       i != pgs.end();
       ++i) {
    set<PGRef> split_pgs;
    PG *pg = *i;
    pg->lock_suspend_timeout(handle);
    curmap = service.get_osdmap();
    if (pg->deleting) {
      pg->unlock();
      continue;
    }
    if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
      // we need to requeue the PG explicitly since we didn't actually
      // handle an event
      peering_wq.queue(pg);
    } else {
      assert(!pg->peering_queue.empty());
      PG::CephPeeringEvtRef evt = pg->peering_queue.front();
      pg->peering_queue.pop_front();
      pg->handle_peering_event(evt, &rctx);
    }
    // track the most recent interval needing an up_thru update
    need_up_thru = pg->need_up_thru || need_up_thru;
    same_interval_since = MAX(pg->info.history.same_interval_since,
			      same_interval_since);
    pg->write_if_dirty(*rctx.transaction);
    if (!split_pgs.empty()) {
      // finish registering split children once the transaction applies
      rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
      split_pgs.clear();
    }
    dispatch_context_transaction(rctx, pg, &handle);
    pg->unlock();
  }
  if (need_up_thru)
    queue_want_up_thru(same_interval_since);
  dispatch_context(rctx, 0, curmap, &handle);

  service.send_pg_temp();
}
9969
9970// --------------------------------
9971
// Config keys this OSD observes at runtime.  handle_conf_change() is
// invoked when any of them changes; keep the two lists in sync.
const char** OSD::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "osd_max_backfills",
    "osd_min_recovery_priority",
    "osd_max_trimming_pgs",
    "osd_op_complaint_time",
    "osd_op_log_threshold",
    "osd_op_history_size",
    "osd_op_history_duration",
    "osd_op_history_slow_op_size",
    "osd_op_history_slow_op_threshold",
    "osd_enable_op_tracker",
    "osd_map_cache_size",
    "osd_map_max_advance",
    "osd_pg_epoch_persisted_max_stale",
    "osd_disk_thread_ioprio_class",
    "osd_disk_thread_ioprio_priority",
    // clog & admin clog
    "clog_to_monitors",
    "clog_to_syslog",
    "clog_to_syslog_facility",
    "clog_to_syslog_level",
    "osd_objectstore_fuse",
    "clog_to_graylog",
    "clog_to_graylog_host",
    "clog_to_graylog_port",
    "host",
    "fsid",
    "osd_recovery_delay_start",
    "osd_client_message_size_cap",
    "osd_client_message_cap",
    "osd_heartbeat_min_size",
    "osd_heartbeat_interval",
    NULL   // sentinel terminating the list
  };
  return KEYS;
}
10010
// Config-observer callback: apply runtime changes for the keys listed in
// get_tracked_conf_keys().  Holds osd_lock for the whole update and
// finishes with a sanity pass in check_config().
void OSD::handle_conf_change(const struct md_config_t *conf,
			     const std::set <std::string> &changed)
{
  Mutex::Locker l(osd_lock);
  // recovery/backfill reservation limits
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  // op tracker tuning
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
    op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  if (changed.count("osd_disk_thread_ioprio_class") ||
      changed.count("osd_disk_thread_ioprio_priority")) {
    set_disk_tp_priority();
  }
  // all three map caches share the same size setting
  if (changed.count("osd_map_cache_size")) {
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  // any clog-related key triggers a full re-parse of log options
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  // resize the client message throttles in place (only if a throttler
  // exists and the new value is positive)
  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }

  check_config();
}
10095
// Re-parse the clog-related config options and, on success, push them
// into the log client.  Called from handle_conf_change() when any
// clog_* / host / fsid key changes.
void OSD::update_log_config()
{
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  // only apply the new settings if parsing succeeded (returns 0)
  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
			       log_channel, log_prio, log_to_graylog,
			       log_to_graylog_host, log_to_graylog_port,
			       fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
			log_channel, log_prio, log_to_graylog,
			log_to_graylog_host, log_to_graylog_port,
			fsid, host);
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
10118
// Sanity-check related config values and emit cluster-log warnings for
// inconsistent combinations; never changes any settings.
// NOTE(review): the condition requires cache_size > value + 2, while
// the warning text only says "is not > value" — slightly imprecise.
void OSD::check_config()
{
  // some sanity checks
  if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
		 << " is not > osd_map_max_advance ("
		 << cct->_conf->osd_map_max_advance << ")";
  }
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
		 << " is not > osd_pg_epoch_persisted_max_stale ("
		 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
}
10133
// Apply osd_disk_thread_ioprio_{class,priority} to the remove and
// recovery thread pools.  Does nothing unless both options are set
// (non-empty class string AND non-negative priority).
void OSD::set_disk_tp_priority()
{
  dout(10) << __func__
	   << " class " << cct->_conf->osd_disk_thread_ioprio_class
	   << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
	   << dendl;
  if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
      cct->_conf->osd_disk_thread_ioprio_priority < 0)
    return;
  int cls =
    ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
  if (cls < 0) {
    // invalid class string: log the error, leave ioprio untouched
    derr << __func__ << cpp_strerror(cls) << ": "
	 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
	 << " but only the following values are allowed: idle, be or rt" << dendl;
  } else {
    remove_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
    recovery_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
  }
}
10154
10155// --------------------------------
10156
// Synchronously fetch the latest OSDMap through the objecter.  Blocks
// the calling thread on a C_SaferCond until the map has arrived.
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();

  dout(10) << __func__ << " -- finish" << dendl;
}
10167
10168// --------------------------------
10169
// Classify an incoming MOSDOp: walk its sub-ops and set the op's rmw
// flags (read/write/cache/pg-op/promote/...), which drive ordering and
// cache-tier handling later on.  Returns 0 on success, or a negative
// errno (e.g. from class-method lookup for CEPH_OSD_OP_CALL); -EINVAL
// if no flag at all could be derived.
int OSD::init_op_flags(OpRequestRef& op)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  vector<OSDOp>::const_iterator iter;

  // client flags have no bearing on whether an op is a read, write, etc.
  op->rmw_flags = 0;

  if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
    op->set_force_rwordered();
  }

  // set bits based on op codes, called methods.
  for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
    if ((iter->op.op == CEPH_OSD_OP_WATCH &&
	 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
      /* This a bit odd.  PING isn't actually a write.  It can't
       * result in an update to the object_info.  PINGs also aren'ty
       * resent, so there's no reason to write out a log entry
       *
       * However, we pipeline them behind writes, so let's force
       * the write_ordered flag.
       */
      op->set_force_rwordered();
    } else {
      if (ceph_osd_op_mode_modify(iter->op.op))
	op->set_write();
    }
    if (ceph_osd_op_mode_read(iter->op.op))
      op->set_read();

    // set READ flag if there are src_oids
    if (iter->soid.oid.name.length())
      op->set_read();

    // set PGOP flag if there are PG ops
    if (ceph_osd_op_type_pg(iter->op.op))
      op->set_pg_op();

    if (ceph_osd_op_mode_cache(iter->op.op))
      op->set_cache();

    // check for ec base pool
    int64_t poolid = m->get_pg().pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
    if (pool && pool->is_tier()) {
      const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
      if (base_pool && base_pool->require_rollback()) {
	// on an EC base pool, anything outside this whitelist of ops
	// must be served from the cache tier, so force a promote
	if ((iter->op.op != CEPH_OSD_OP_READ) &&
	    (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
	    (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
	    (iter->op.op != CEPH_OSD_OP_STAT) &&
	    (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
	    (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
	    (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
	    (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
	    (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
	    (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
	    (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
	    (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
	    (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
	    (iter->op.op != CEPH_OSD_OP_CREATE) &&
	    (iter->op.op != CEPH_OSD_OP_DELETE) &&
	    (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
	    (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
	    (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
	  op->set_promote();
	}
      }
    }

    switch (iter->op.op) {
    case CEPH_OSD_OP_CALL:
      {
	// resolve the object class method and inherit its rd/wr/promote
	// flags; failures here abort flag initialization with an errno
	bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
	int is_write, is_read;
	string cname, mname;
	bp.copy(iter->op.cls.class_len, cname);
	bp.copy(iter->op.cls.method_len, mname);

	ClassHandler::ClassData *cls;
	int r = class_handler->open_class(cname, &cls);
	if (r) {
	  derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
	  if (r == -ENOENT)
	    r = -EOPNOTSUPP;
	  else if (r != -EPERM) // propagate permission errors
	    r = -EIO;
	  return r;
	}
	int flags = cls->get_method_flags(mname.c_str());
	if (flags < 0) {
	  if (flags == -ENOENT)
	    r = -EOPNOTSUPP;
	  else
	    r = flags;
	  return r;
	}
	is_read = flags & CLS_METHOD_RD;
	is_write = flags & CLS_METHOD_WR;
	bool is_promote = flags & CLS_METHOD_PROMOTE;

	dout(10) << "class " << cname << " method " << mname << " "
		 << "flags=" << (is_read ? "r" : "")
                 << (is_write ? "w" : "")
                 << (is_promote ? "p" : "")
                 << dendl;
	if (is_read)
	  op->set_class_read();
	if (is_write)
	  op->set_class_write();
	if (is_promote)
	  op->set_promote();
	op->add_class(cname, is_read, is_write, cls->whitelisted);
	break;
      }

    case CEPH_OSD_OP_WATCH:
      // force the read bit for watch since it is depends on previous
      // watch state (and may return early if the watch exists) or, in
      // the case of ping, is simply a read op.
      op->set_read();
      // fall through
    case CEPH_OSD_OP_NOTIFY:
    case CEPH_OSD_OP_NOTIFY_ACK:
      {
	op->set_promote();
	break;
      }

    case CEPH_OSD_OP_DELETE:
      // if we get a delete with FAILOK we can skip handle cache. without
      // FAILOK we still need to promote (or do something smarter) to
      // determine whether to return ENOENT or 0.
      if (iter == m->ops.begin() &&
	  iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
	op->set_skip_handle_cache();
      }
      // skip promotion when proxying a delete op
      if (m->ops.size() == 1) {
	op->set_skip_promote();
      }
      break;

    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
    case CEPH_OSD_OP_CACHE_FLUSH:
    case CEPH_OSD_OP_CACHE_EVICT:
      // If try_flush/flush/evict is the only op, can skip handle cache.
      if (m->ops.size() == 1) {
	op->set_skip_handle_cache();
      }
      break;

    case CEPH_OSD_OP_READ:
    case CEPH_OSD_OP_SYNC_READ:
    case CEPH_OSD_OP_SPARSE_READ:
    case CEPH_OSD_OP_CHECKSUM:
    case CEPH_OSD_OP_WRITEFULL:
      // a lone fadvise NOCACHE/DONTNEED op shouldn't trigger a promote
      if (m->ops.size() == 1 &&
	  (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
	   iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
	op->set_skip_promote();
      }
      break;

    // force promotion when pin an object in cache tier
    case CEPH_OSD_OP_CACHE_PIN:
      op->set_promote();
      break;

    default:
      break;
    }
  }

  // an op that set no flags at all is malformed
  if (op->rmw_flags == 0)
    return -EINVAL;

  return 0;
}
10354
10355void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
10356 for (list<PG*>::iterator i = peering_queue.begin();
10357 i != peering_queue.end() &&
10358 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
10359 ) {
10360 if (in_use.count(*i)) {
10361 ++i;
10362 } else {
10363 out->push_back(*i);
10364 peering_queue.erase(i++);
10365 }
10366 }
10367 in_use.insert(out->begin(), out->end());
10368}
10369
224ce89b 10370
7c673cae
FG
10371// =============================================================
10372
10373#undef dout_context
10374#define dout_context osd->cct
10375#undef dout_prefix
10376#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10377
// Requeue any ops parked in pgid's slot (they were waiting for the PG
// to exist) onto the front of the shard queue and wake one worker.
void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
{
  uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
  auto sdata = shard_list[shard_index];
  bool queued = false;
  {
    Mutex::Locker l(sdata->sdata_op_ordering_lock);
    auto p = sdata->pg_slots.find(pgid);
    if (p != sdata->pg_slots.end()) {
      dout(20) << __func__ << " " << pgid
	       << " to_process " << p->second.to_process
	       << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
      // iterate in reverse so repeated _enqueue_front preserves order
      for (auto i = p->second.to_process.rbegin();
	   i != p->second.to_process.rend();
	   ++i) {
	sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
      }
      p->second.to_process.clear();
      p->second.waiting_for_pg = false;
      // bump requeue_seq so a racing _process() notices the requeue
      ++p->second.requeue_seq;
      queued = true;
    }
  }
  if (queued) {
    sdata->sdata_lock.Lock();
    sdata->sdata_cond.SignalOne();
    sdata->sdata_lock.Unlock();
  }
}
10407
// A new OSDMap arrived: record it on every shard, drop queued items for
// pg shards that no longer map to this OSD (releasing any reserved
// recovery pushes they held), and erase slots that are now fully empty.
void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
{
  unsigned pushes_to_free = 0;
  for (auto sdata : shard_list) {
    Mutex::Locker l(sdata->sdata_op_ordering_lock);
    sdata->waiting_for_pg_osdmap = osdmap;
    auto p = sdata->pg_slots.begin();
    while (p != sdata->pg_slots.end()) {
      ShardData::pg_slot& slot = p->second;
      // only touch slots with queued work and no in-flight worker
      if (!slot.to_process.empty() && slot.num_running == 0) {
	if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
	  dout(20) << __func__ << " " << p->first << " maps to us, keeping"
		   << dendl;
	  ++p;
	  continue;
	}
	// drop items whose epoch the new map already supersedes
	while (!slot.to_process.empty() &&
	       slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
	  auto& qi = slot.to_process.front();
	  dout(20) << __func__ << " " << p->first
		   << " item " << qi
		   << " epoch " << qi.get_map_epoch()
		   << " <= " << osdmap->get_epoch()
		   << ", stale, dropping" << dendl;
	  pushes_to_free += qi.get_reserved_pushes();
	  slot.to_process.pop_front();
	}
      }
      if (slot.to_process.empty() &&
	  slot.num_running == 0 &&
	  !slot.pg) {
	dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
	p = sdata->pg_slots.erase(p);
      } else {
	++p;
      }
    }
  }
  // release pushes outside the per-shard locks
  if (pushes_to_free > 0) {
    osd->service.release_reserved_pushes(pushes_to_free);
  }
}
10450
// Detach the cached PG pointer from pgid's slot; asserts that any
// cached PG is in the deleting state.  The slot itself is left in place.
void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
{
  uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
  auto sdata = shard_list[shard_index];
  Mutex::Locker l(sdata->sdata_op_ordering_lock);
  auto p = sdata->pg_slots.find(pgid);
  if (p != sdata->pg_slots.end()) {
    auto& slot = p->second;
    dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
    assert(!slot.pg || slot.pg->deleting);
    slot.pg = nullptr;
  }
}
10464
10465void OSD::ShardedOpWQ::clear_pg_slots()
10466{
10467 for (auto sdata : shard_list) {
10468 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10469 sdata->pg_slots.clear();
10470 sdata->waiting_for_pg_osdmap.reset();
10471 // don't bother with reserved pushes; we are shutting down
10472 }
10473}
10474
10475#undef dout_prefix
10476#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10477
// Worker-thread entry point for one scheduling pass on a shard:
// dequeue one item, stage it on the pg slot, take the PG lock, then
// re-check the slot under the ordering lock (requeue_seq detects races
// with wake_pg_waiters/prune_pg_waiters) before running the item.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % num_shards;
  ShardData *sdata = shard_list[shard_index];
  assert(NULL != sdata);

  // peek at spg_t
  sdata->sdata_op_ordering_lock.Lock();
  if (sdata->pqueue->empty()) {
    dout(20) << __func__ << " empty q, waiting" << dendl;
    // optimistically sleep a moment; maybe another work item will come along.
    osd->cct->get_heartbeat_map()->reset_timeout(hb,
      osd->cct->_conf->threadpool_default_timeout, 0);
    // take sdata_lock before dropping the ordering lock so a signal
    // sent in between cannot be missed
    sdata->sdata_lock.Lock();
    sdata->sdata_op_ordering_lock.Unlock();
    sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
      utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
    sdata->sdata_lock.Unlock();
    sdata->sdata_op_ordering_lock.Lock();
    if (sdata->pqueue->empty()) {
      sdata->sdata_op_ordering_lock.Unlock();
      return;
    }
  }
  pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
  if (osd->is_stopping()) {
    sdata->sdata_op_ordering_lock.Unlock();
    return;    // OSD shutdown, discard.
  }
  PGRef pg;
  uint64_t requeue_seq;
  {
    // stage the item on the pg slot so ordering is preserved even if
    // we lose a race below and bail out
    auto& slot = sdata->pg_slots[item.first];
    dout(30) << __func__ << " " << item.first
	     << " to_process " << slot.to_process
	     << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
    slot.to_process.push_back(item.second);
    // note the requeue seq now...
    requeue_seq = slot.requeue_seq;
    if (slot.waiting_for_pg) {
      // save ourselves a bit of effort
      dout(20) << __func__ << " " << item.first << " item " << item.second
	       << " queued, waiting_for_pg" << dendl;
      sdata->sdata_op_ordering_lock.Unlock();
      return;
    }
    pg = slot.pg;
    dout(20) << __func__ << " " << item.first << " item " << item.second
	     << " queued" << dendl;
    ++slot.num_running;
  }
  sdata->sdata_op_ordering_lock.Unlock();

  osd->service.maybe_inject_dispatch_delay();

  // [lookup +] lock pg (if we have it)
  if (!pg) {
    pg = osd->_lookup_lock_pg(item.first);
  } else {
    pg->lock();
  }

  osd->service.maybe_inject_dispatch_delay();

  boost::optional<PGQueueable> qi;

  // we don't use a Mutex::Locker here because of the
  // osd->service.release_reserved_pushes() call below
  sdata->sdata_op_ordering_lock.Lock();

  auto q = sdata->pg_slots.find(item.first);
  assert(q != sdata->pg_slots.end());
  auto& slot = q->second;
  --slot.num_running;

  if (slot.to_process.empty()) {
    // raced with wake_pg_waiters or prune_pg_waiters
    dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  if (requeue_seq != slot.requeue_seq) {
    // the slot was requeued while we were taking the pg lock; the item
    // we staged is already back in the shard queue
    dout(20) << __func__ << " " << item.first
	     << " requeue_seq " << slot.requeue_seq << " > our "
	     << requeue_seq << ", we raced with wake_pg_waiters"
	     << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  if (pg && !slot.pg && !pg->deleting) {
    // cache the looked-up PG on the slot for subsequent items
    dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
    slot.pg = pg;
  }
  dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
	   << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;

  // make sure we're not already waiting for this pg
  if (slot.waiting_for_pg) {
    dout(20) << __func__ << " " << item.first << " item " << item.second
	     << " slot is waiting_for_pg" << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }

  // take next item
  qi = slot.to_process.front();
  slot.to_process.pop_front();
  dout(20) << __func__ << " " << item.first << " item " << *qi
	   << " pg " << pg << dendl;

  if (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
    if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
      dout(20) << __func__ << " " << item.first
	       << " no pg, should exist, will wait" << " on " << *qi << dendl;
      slot.to_process.push_front(*qi);
      slot.waiting_for_pg = true;
    } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
      dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
	       << qi->get_map_epoch() << " > " << osdmap->get_epoch()
	       << ", will wait on " << *qi << dendl;
      slot.to_process.push_front(*qi);
      slot.waiting_for_pg = true;
    } else {
      dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
	       << " dropping " << *qi << dendl;
      // share map with client?
      if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
	Session *session = static_cast<Session *>(
	    (*_op)->get_req()->get_connection()->get_priv());
	if (session) {
	  osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
	  session->put();
	}
      }
      // return any recovery pushes the dropped item had reserved;
      // must be done after unlocking the ordering lock
      unsigned pushes_to_free = qi->get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->sdata_op_ordering_lock.Unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	return;
      }
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  sdata->sdata_op_ordering_lock.Unlock();


  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  // dump the queue state at very high debug levels
  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // run the item with the PG lock held; qi->run() is expected to keep
  // the heartbeat handle alive via tp_handle
  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);
  qi->run(osd, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  pg->unlock();
}
10675
10676void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
10677 uint32_t shard_index =
10678 item.first.hash_to_shard(shard_list.size());
10679
10680 ShardData* sdata = shard_list[shard_index];
10681 assert (NULL != sdata);
10682 unsigned priority = item.second.get_priority();
10683 unsigned cost = item.second.get_cost();
10684 sdata->sdata_op_ordering_lock.Lock();
10685
10686 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10687 if (priority >= osd->op_prio_cutoff)
10688 sdata->pqueue->enqueue_strict(
10689 item.second.get_owner(), priority, item);
10690 else
10691 sdata->pqueue->enqueue(
10692 item.second.get_owner(),
10693 priority, cost, item);
10694 sdata->sdata_op_ordering_lock.Unlock();
10695
10696 sdata->sdata_lock.Lock();
10697 sdata->sdata_cond.SignalOne();
10698 sdata->sdata_lock.Unlock();
10699
10700}
10701
// Requeue an item at the *front* of its shard's queue (used when an op must
// be retried ahead of anything newer for the same pg, e.g. after a failed
// dequeue in _process).  Preserves per-pg ordering even while racing with a
// concurrent _process on the same slot.
void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
{
  // Same shard-selection scheme as _enqueue: hash the pg id.
  uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
  ShardData* sdata = shard_list[shard_index];
  assert (NULL != sdata);
  sdata->sdata_op_ordering_lock.Lock();
  auto p = sdata->pg_slots.find(item.first);
  if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from pqueue, put it on to_process, and is now busy taking the
    // pg lock. ensure this old requeued item is ordered before any
    // such newer item in to_process.
    //
    // Shuffle: place our (older) item at the head of to_process and
    // instead push the slot's (newest) tail item back onto the pqueue,
    // so overall per-pg ordering is maintained.
    p->second.to_process.push_front(item.second);
    item.second = p->second.to_process.back();
    p->second.to_process.pop_back();
    dout(20) << __func__ << " " << item.first
	     << " " << p->second.to_process.front()
	     << " shuffled w/ " << item.second << dendl;
  } else {
    dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
  }
  // Push (possibly swapped) item onto the front of the shard's pqueue.
  sdata->_enqueue_front(item, osd->op_prio_cutoff);
  sdata->sdata_op_ordering_lock.Unlock();
  // Wake a worker to pick up the requeued work.
  sdata->sdata_lock.Lock();
  sdata->sdata_cond.SignalOne();
  sdata->sdata_lock.Unlock();
}
10729
10730namespace ceph {
10731namespace osd_cmds {
10732
10733int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
10734{
10735 if (!ceph_using_tcmalloc()) {
10736 os << "could not issue heap profiler command -- not using tcmalloc!";
10737 return -EOPNOTSUPP;
10738 }
10739
10740 string cmd;
10741 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
10742 os << "unable to get value for command \"" << cmd << "\"";
10743 return -EINVAL;
10744 }
10745
10746 std::vector<std::string> cmd_vec;
10747 get_str_vec(cmd, cmd_vec);
10748
10749 ceph_heap_profiler_handle_command(cmd_vec, os);
10750
10751 return 0;
10752}
10753
10754}} // namespace ceph::osd_cmds
10755
10756
10757std::ostream& operator<<(std::ostream& out, const OSD::io_queue& q) {
10758 switch(q) {
10759 case OSD::io_queue::prioritized:
10760 out << "prioritized";
10761 break;
10762 case OSD::io_queue::weightedpriority:
10763 out << "weightedpriority";
10764 break;
10765 case OSD::io_queue::mclock_opclass:
10766 out << "mclock_opclass";
10767 break;
10768 case OSD::io_queue::mclock_client:
10769 out << "mclock_client";
10770 break;
10771 }
10772 return out;
10773}