]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
bump version to 12.2.1-pve3
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15#include "acconfig.h"
16
17#include <fstream>
18#include <iostream>
19#include <errno.h>
20#include <sys/stat.h>
21#include <signal.h>
22#include <ctype.h>
23#include <boost/scoped_ptr.hpp>
24
25#ifdef HAVE_SYS_PARAM_H
26#include <sys/param.h>
27#endif
28
29#ifdef HAVE_SYS_MOUNT_H
30#include <sys/mount.h>
31#endif
32
33#include "osd/PG.h"
34
35#include "include/types.h"
36#include "include/compat.h"
37
38#include "OSD.h"
39#include "OSDMap.h"
40#include "Watch.h"
41#include "osdc/Objecter.h"
42
43#include "common/errno.h"
44#include "common/ceph_argparse.h"
224ce89b 45#include "common/ceph_time.h"
7c673cae
FG
46#include "common/version.h"
47#include "common/io_priority.h"
b5b8bbf5 48#include "common/pick_address.h"
7c673cae
FG
49
50#include "os/ObjectStore.h"
51#ifdef HAVE_LIBFUSE
52#include "os/FuseStore.h"
53#endif
54
55#include "PrimaryLogPG.h"
56
57
58#include "msg/Messenger.h"
59#include "msg/Message.h"
60
61#include "mon/MonClient.h"
62
63#include "messages/MLog.h"
64
65#include "messages/MGenericMessage.h"
7c673cae
FG
66#include "messages/MOSDPing.h"
67#include "messages/MOSDFailure.h"
68#include "messages/MOSDMarkMeDown.h"
69#include "messages/MOSDFull.h"
70#include "messages/MOSDOp.h"
71#include "messages/MOSDOpReply.h"
72#include "messages/MOSDBackoff.h"
73#include "messages/MOSDBeacon.h"
74#include "messages/MOSDRepOp.h"
75#include "messages/MOSDRepOpReply.h"
76#include "messages/MOSDBoot.h"
77#include "messages/MOSDPGTemp.h"
78
79#include "messages/MOSDMap.h"
80#include "messages/MMonGetOSDMap.h"
81#include "messages/MOSDPGNotify.h"
82#include "messages/MOSDPGQuery.h"
83#include "messages/MOSDPGLog.h"
84#include "messages/MOSDPGRemove.h"
85#include "messages/MOSDPGInfo.h"
86#include "messages/MOSDPGCreate.h"
87#include "messages/MOSDPGTrim.h"
88#include "messages/MOSDPGScan.h"
89#include "messages/MOSDPGBackfill.h"
90#include "messages/MBackfillReserve.h"
91#include "messages/MRecoveryReserve.h"
c07f9fc5 92#include "messages/MOSDForceRecovery.h"
7c673cae
FG
93#include "messages/MOSDECSubOpWrite.h"
94#include "messages/MOSDECSubOpWriteReply.h"
95#include "messages/MOSDECSubOpRead.h"
96#include "messages/MOSDECSubOpReadReply.h"
97#include "messages/MOSDPGCreated.h"
98#include "messages/MOSDPGUpdateLogMissing.h"
99#include "messages/MOSDPGUpdateLogMissingReply.h"
100
101#include "messages/MOSDAlive.h"
102
103#include "messages/MOSDScrub.h"
104#include "messages/MOSDScrubReserve.h"
105#include "messages/MOSDRepScrub.h"
106
107#include "messages/MMonCommand.h"
108#include "messages/MCommand.h"
109#include "messages/MCommandReply.h"
110
111#include "messages/MPGStats.h"
112#include "messages/MPGStatsAck.h"
113
114#include "messages/MWatchNotify.h"
115#include "messages/MOSDPGPush.h"
116#include "messages/MOSDPGPushReply.h"
117#include "messages/MOSDPGPull.h"
118
119#include "common/perf_counters.h"
120#include "common/Timer.h"
121#include "common/LogClient.h"
122#include "common/AsyncReserver.h"
123#include "common/HeartbeatMap.h"
124#include "common/admin_socket.h"
125#include "common/ceph_context.h"
126
127#include "global/signal_handler.h"
128#include "global/pidfile.h"
129
130#include "include/color.h"
131#include "perfglue/cpu_profiler.h"
132#include "perfglue/heap_profiler.h"
133
134#include "osd/OpRequest.h"
135
136#include "auth/AuthAuthorizeHandler.h"
137#include "auth/RotatingKeyRing.h"
138#include "common/errno.h"
139
140#include "objclass/objclass.h"
141
142#include "common/cmdparse.h"
143#include "include/str_list.h"
144#include "include/util.h"
145
146#include "include/assert.h"
147#include "common/config.h"
148#include "common/EventTrace.h"
149
150#ifdef WITH_LTTNG
151#define TRACEPOINT_DEFINE
152#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
153#include "tracing/osd.h"
154#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
155#undef TRACEPOINT_DEFINE
156#else
157#define tracepoint(...)
158#endif
159
160#define dout_context cct
161#define dout_subsys ceph_subsys_osd
162#undef dout_prefix
163#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
164
224ce89b 165
7c673cae
FG
166const double OSD::OSD_TICK_INTERVAL = 1.0;
167
168static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
169 return *_dout << "osd." << whoami << " " << epoch << " ";
170}
171
7c673cae
FG
172//Initial features in new superblock.
173//Features here are also automatically upgraded
174CompatSet OSD::get_osd_initial_compat_set() {
175 CompatSet::FeatureSet ceph_osd_feature_compat;
176 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
177 CompatSet::FeatureSet ceph_osd_feature_incompat;
178 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
179 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
180 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
181 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
182 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
183 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
184 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
185 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
186 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
c07f9fc5 192 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
7c673cae
FG
193 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
194 ceph_osd_feature_incompat);
195}
196
197//Features are added here that this OSD supports.
198CompatSet OSD::get_osd_compat_set() {
199 CompatSet compat = get_osd_initial_compat_set();
200 //Any features here can be set in code, but not in initial superblock
201 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
202 return compat;
203}
204
205OSDService::OSDService(OSD *osd) :
206 osd(osd),
207 cct(osd->cct),
208 meta_osr(new ObjectStore::Sequencer("meta")),
209 whoami(osd->whoami), store(osd->store),
210 log_client(osd->log_client), clog(osd->clog),
211 pg_recovery_stats(osd->pg_recovery_stats),
212 cluster_messenger(osd->cluster_messenger),
213 client_messenger(osd->client_messenger),
214 logger(osd->logger),
215 recoverystate_perf(osd->recoverystate_perf),
216 monc(osd->monc),
217 peering_wq(osd->peering_wq),
218 recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
219 &osd->disk_tp),
220 class_handler(osd->class_handler),
221 pg_epoch_lock("OSDService::pg_epoch_lock"),
222 publish_lock("OSDService::publish_lock"),
223 pre_publish_lock("OSDService::pre_publish_lock"),
224 max_oldest_map(0),
225 peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
226 sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
227 scrubs_active(0),
228 agent_lock("OSDService::agent_lock"),
229 agent_valid_iterator(false),
230 agent_ops(0),
231 flush_mode_high_count(0),
232 agent_active(true),
233 agent_thread(this),
234 agent_stop_flag(false),
235 agent_timer_lock("OSDService::agent_timer_lock"),
236 agent_timer(osd->client_messenger->cct, agent_timer_lock),
237 last_recalibrate(ceph_clock_now()),
238 promote_max_objects(0),
239 promote_max_bytes(0),
240 objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
241 objecter_finisher(osd->client_messenger->cct),
242 watch_lock("OSDService::watch_lock"),
243 watch_timer(osd->client_messenger->cct, watch_lock),
244 next_notif_id(0),
245 recovery_request_lock("OSDService::recovery_request_lock"),
246 recovery_request_timer(cct, recovery_request_lock, false),
31f18b77
FG
247 recovery_sleep_lock("OSDService::recovery_sleep_lock"),
248 recovery_sleep_timer(cct, recovery_sleep_lock, false),
7c673cae
FG
249 reserver_finisher(cct),
250 local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
251 cct->_conf->osd_min_recovery_priority),
252 remote_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
253 cct->_conf->osd_min_recovery_priority),
254 pg_temp_lock("OSDService::pg_temp_lock"),
255 snap_sleep_lock("OSDService::snap_sleep_lock"),
256 snap_sleep_timer(
257 osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
31f18b77
FG
258 scrub_sleep_lock("OSDService::scrub_sleep_lock"),
259 scrub_sleep_timer(
260 osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
7c673cae
FG
261 snap_reserver(&reserver_finisher,
262 cct->_conf->osd_max_trimming_pgs),
263 recovery_lock("OSDService::recovery_lock"),
264 recovery_ops_active(0),
265 recovery_ops_reserved(0),
266 recovery_paused(false),
267 map_cache_lock("OSDService::map_cache_lock"),
268 map_cache(cct, cct->_conf->osd_map_cache_size),
269 map_bl_cache(cct->_conf->osd_map_cache_size),
270 map_bl_inc_cache(cct->_conf->osd_map_cache_size),
271 in_progress_split_lock("OSDService::in_progress_split_lock"),
272 stat_lock("OSDService::stat_lock"),
273 full_status_lock("OSDService::full_status_lock"),
274 cur_state(NONE),
275 cur_ratio(0),
276 epoch_lock("OSDService::epoch_lock"),
277 boot_epoch(0), up_epoch(0), bind_epoch(0),
278 is_stopping_lock("OSDService::is_stopping_lock")
279#ifdef PG_DEBUG_REFS
280 , pgid_lock("OSDService::pgid_lock")
281#endif
282{
283 objecter->init();
284}
285
286OSDService::~OSDService()
287{
288 delete objecter;
289}
290
31f18b77
FG
291
292
293#ifdef PG_DEBUG_REFS
294void OSDService::add_pgid(spg_t pgid, PG *pg){
295 Mutex::Locker l(pgid_lock);
296 if (!pgid_tracker.count(pgid)) {
297 live_pgs[pgid] = pg;
298 }
299 pgid_tracker[pgid]++;
300}
301void OSDService::remove_pgid(spg_t pgid, PG *pg)
302{
303 Mutex::Locker l(pgid_lock);
304 assert(pgid_tracker.count(pgid));
305 assert(pgid_tracker[pgid] > 0);
306 pgid_tracker[pgid]--;
307 if (pgid_tracker[pgid] == 0) {
308 pgid_tracker.erase(pgid);
309 live_pgs.erase(pgid);
310 }
311}
312void OSDService::dump_live_pgids()
313{
314 Mutex::Locker l(pgid_lock);
315 derr << "live pgids:" << dendl;
316 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
317 i != pgid_tracker.cend();
318 ++i) {
319 derr << "\t" << *i << dendl;
320 live_pgs[i->first]->dump_live_ids();
321 }
322}
323#endif
324
325
7c673cae
FG
326void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
327{
328 for (set<spg_t>::const_iterator i = children.begin();
329 i != children.end();
330 ++i) {
331 dout(10) << __func__ << ": Starting split on pg " << *i
332 << ", parent=" << parent << dendl;
333 assert(!pending_splits.count(*i));
334 assert(!in_progress_splits.count(*i));
335 pending_splits.insert(make_pair(*i, parent));
336
337 assert(!rev_pending_splits[parent].count(*i));
338 rev_pending_splits[parent].insert(*i);
339 }
340}
341
342void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
343{
344 Mutex::Locker l(in_progress_split_lock);
345 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
346 assert(piter != rev_pending_splits.end());
347 for (set<spg_t>::const_iterator i = children.begin();
348 i != children.end();
349 ++i) {
350 assert(piter->second.count(*i));
351 assert(pending_splits.count(*i));
352 assert(!in_progress_splits.count(*i));
353 assert(pending_splits[*i] == parent);
354
355 pending_splits.erase(*i);
356 piter->second.erase(*i);
357 in_progress_splits.insert(*i);
358 }
359 if (piter->second.empty())
360 rev_pending_splits.erase(piter);
361}
362
363void OSDService::cancel_pending_splits_for_parent(spg_t parent)
364{
365 Mutex::Locker l(in_progress_split_lock);
366 _cancel_pending_splits_for_parent(parent);
367}
368
369void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
370{
371 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
372 if (piter == rev_pending_splits.end())
373 return;
374
375 for (set<spg_t>::iterator i = piter->second.begin();
376 i != piter->second.end();
377 ++i) {
378 assert(pending_splits.count(*i));
379 assert(!in_progress_splits.count(*i));
380 pending_splits.erase(*i);
381 dout(10) << __func__ << ": Completing split on pg " << *i
382 << " for parent: " << parent << dendl;
383 _cancel_pending_splits_for_parent(*i);
384 }
385 rev_pending_splits.erase(piter);
386}
387
388void OSDService::_maybe_split_pgid(OSDMapRef old_map,
389 OSDMapRef new_map,
390 spg_t pgid)
391{
392 assert(old_map->have_pg_pool(pgid.pool()));
393 int old_pgnum = old_map->get_pg_num(pgid.pool());
394 if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
395 set<spg_t> children;
396 if (pgid.is_split(old_pgnum,
397 new_map->get_pg_num(pgid.pool()), &children)) {
398 _start_split(pgid, children); }
399 } else {
400 assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
401 }
402}
403
404void OSDService::init_splits_between(spg_t pgid,
405 OSDMapRef frommap,
406 OSDMapRef tomap)
407{
408 // First, check whether we can avoid this potentially expensive check
409 if (tomap->have_pg_pool(pgid.pool()) &&
410 pgid.is_split(
411 frommap->get_pg_num(pgid.pool()),
412 tomap->get_pg_num(pgid.pool()),
413 NULL)) {
414 // Ok, a split happened, so we need to walk the osdmaps
415 set<spg_t> new_pgs; // pgs to scan on each map
416 new_pgs.insert(pgid);
417 OSDMapRef curmap(get_map(frommap->get_epoch()));
418 for (epoch_t e = frommap->get_epoch() + 1;
419 e <= tomap->get_epoch();
420 ++e) {
421 OSDMapRef nextmap(try_get_map(e));
422 if (!nextmap)
423 continue;
424 set<spg_t> even_newer_pgs; // pgs added in this loop
425 for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
426 set<spg_t> split_pgs;
427 if (i->is_split(curmap->get_pg_num(i->pool()),
428 nextmap->get_pg_num(i->pool()),
429 &split_pgs)) {
430 start_split(*i, split_pgs);
431 even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
432 }
433 }
434 new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
435 curmap = nextmap;
436 }
437 assert(curmap == tomap); // we must have had both frommap and tomap
438 }
439}
440
441void OSDService::expand_pg_num(OSDMapRef old_map,
442 OSDMapRef new_map)
443{
444 Mutex::Locker l(in_progress_split_lock);
445 for (set<spg_t>::iterator i = in_progress_splits.begin();
446 i != in_progress_splits.end();
447 ) {
448 if (!new_map->have_pg_pool(i->pool())) {
449 in_progress_splits.erase(i++);
450 } else {
451 _maybe_split_pgid(old_map, new_map, *i);
452 ++i;
453 }
454 }
455 for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
456 i != pending_splits.end();
457 ) {
458 if (!new_map->have_pg_pool(i->first.pool())) {
459 rev_pending_splits.erase(i->second);
460 pending_splits.erase(i++);
461 } else {
462 _maybe_split_pgid(old_map, new_map, i->first);
463 ++i;
464 }
465 }
466}
467
468bool OSDService::splitting(spg_t pgid)
469{
470 Mutex::Locker l(in_progress_split_lock);
471 return in_progress_splits.count(pgid) ||
472 pending_splits.count(pgid);
473}
474
475void OSDService::complete_split(const set<spg_t> &pgs)
476{
477 Mutex::Locker l(in_progress_split_lock);
478 for (set<spg_t>::const_iterator i = pgs.begin();
479 i != pgs.end();
480 ++i) {
481 dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
482 assert(!pending_splits.count(*i));
483 assert(in_progress_splits.count(*i));
484 in_progress_splits.erase(*i);
485 }
486}
487
488void OSDService::need_heartbeat_peer_update()
489{
490 osd->need_heartbeat_peer_update();
491}
492
493void OSDService::pg_stat_queue_enqueue(PG *pg)
494{
495 osd->pg_stat_queue_enqueue(pg);
496}
497
498void OSDService::pg_stat_queue_dequeue(PG *pg)
499{
500 osd->pg_stat_queue_dequeue(pg);
501}
502
503void OSDService::start_shutdown()
504{
505 {
506 Mutex::Locker l(agent_timer_lock);
507 agent_timer.shutdown();
508 }
31f18b77
FG
509
510 {
511 Mutex::Locker l(recovery_sleep_lock);
512 recovery_sleep_timer.shutdown();
513 }
7c673cae
FG
514}
515
31f18b77 516void OSDService::shutdown_reserver()
7c673cae
FG
517{
518 reserver_finisher.wait_for_empty();
519 reserver_finisher.stop();
31f18b77
FG
520}
521
522void OSDService::shutdown()
523{
7c673cae
FG
524 {
525 Mutex::Locker l(watch_lock);
526 watch_timer.shutdown();
527 }
528
529 objecter->shutdown();
530 objecter_finisher.wait_for_empty();
531 objecter_finisher.stop();
532
533 {
534 Mutex::Locker l(recovery_request_lock);
535 recovery_request_timer.shutdown();
536 }
537
538 {
539 Mutex::Locker l(snap_sleep_lock);
540 snap_sleep_timer.shutdown();
541 }
542
31f18b77
FG
543 {
544 Mutex::Locker l(scrub_sleep_lock);
545 scrub_sleep_timer.shutdown();
546 }
547
7c673cae
FG
548 osdmap = OSDMapRef();
549 next_osdmap = OSDMapRef();
550}
551
552void OSDService::init()
553{
554 reserver_finisher.start();
555 objecter_finisher.start();
556 objecter->set_client_incarnation(0);
557
558 // deprioritize objecter in daemonperf output
559 objecter->get_logger()->set_prio_adjust(-3);
560
561 watch_timer.init();
562 agent_timer.init();
563 snap_sleep_timer.init();
31f18b77 564 scrub_sleep_timer.init();
7c673cae
FG
565
566 agent_thread.create("osd_srv_agent");
567
568 if (cct->_conf->osd_recovery_delay_start)
569 defer_recovery(cct->_conf->osd_recovery_delay_start);
570}
571
572void OSDService::final_init()
573{
574 objecter->start(osdmap.get());
575}
576
577void OSDService::activate_map()
578{
579 // wake/unwake the tiering agent
580 agent_lock.Lock();
581 agent_active =
582 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
583 osd->is_active();
584 agent_cond.Signal();
585 agent_lock.Unlock();
586}
587
181888fb
FG
588void OSDService::request_osdmap_update(epoch_t e)
589{
590 osd->osdmap_subscribe(e, false);
591}
592
7c673cae
FG
593class AgentTimeoutCB : public Context {
594 PGRef pg;
595public:
596 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
597 void finish(int) override {
598 pg->agent_choose_mode_restart();
599 }
600};
601
602void OSDService::agent_entry()
603{
604 dout(10) << __func__ << " start" << dendl;
605 agent_lock.Lock();
606
607 while (!agent_stop_flag) {
608 if (agent_queue.empty()) {
609 dout(20) << __func__ << " empty queue" << dendl;
610 agent_cond.Wait(agent_lock);
611 continue;
612 }
613 uint64_t level = agent_queue.rbegin()->first;
614 set<PGRef>& top = agent_queue.rbegin()->second;
615 dout(10) << __func__
616 << " tiers " << agent_queue.size()
617 << ", top is " << level
618 << " with pgs " << top.size()
619 << ", ops " << agent_ops << "/"
620 << cct->_conf->osd_agent_max_ops
621 << (agent_active ? " active" : " NOT ACTIVE")
622 << dendl;
623 dout(20) << __func__ << " oids " << agent_oids << dendl;
624 int max = cct->_conf->osd_agent_max_ops - agent_ops;
625 int agent_flush_quota = max;
626 if (!flush_mode_high_count)
627 agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
628 if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
629 agent_cond.Wait(agent_lock);
630 continue;
631 }
632
633 if (!agent_valid_iterator || agent_queue_pos == top.end()) {
634 agent_queue_pos = top.begin();
635 agent_valid_iterator = true;
636 }
637 PGRef pg = *agent_queue_pos;
638 dout(10) << "high_count " << flush_mode_high_count
639 << " agent_ops " << agent_ops
640 << " flush_quota " << agent_flush_quota << dendl;
641 agent_lock.Unlock();
642 if (!pg->agent_work(max, agent_flush_quota)) {
643 dout(10) << __func__ << " " << pg->get_pgid()
644 << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
645 << " seconds" << dendl;
646
647 osd->logger->inc(l_osd_tier_delay);
648 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
649 agent_timer_lock.Lock();
650 Context *cb = new AgentTimeoutCB(pg);
651 agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
652 agent_timer_lock.Unlock();
653 }
654 agent_lock.Lock();
655 }
656 agent_lock.Unlock();
657 dout(10) << __func__ << " finish" << dendl;
658}
659
660void OSDService::agent_stop()
661{
662 {
663 Mutex::Locker l(agent_lock);
664
665 // By this time all ops should be cancelled
666 assert(agent_ops == 0);
667 // By this time all PGs are shutdown and dequeued
668 if (!agent_queue.empty()) {
669 set<PGRef>& top = agent_queue.rbegin()->second;
670 derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
671 assert(0 == "agent queue not empty");
672 }
673
674 agent_stop_flag = true;
675 agent_cond.Signal();
676 }
677 agent_thread.join();
678}
679
680// -------------------------------------
681
682void OSDService::promote_throttle_recalibrate()
683{
684 utime_t now = ceph_clock_now();
685 double dur = now - last_recalibrate;
686 last_recalibrate = now;
687 unsigned prob = promote_probability_millis;
688
689 uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
690 uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
691
692 unsigned min_prob = 1;
693
694 uint64_t attempts, obj, bytes;
695 promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
696 dout(10) << __func__ << " " << attempts << " attempts, promoted "
697 << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
698 << target_obj_sec << " obj/sec or "
699 << pretty_si_t(target_bytes_sec) << " bytes/sec"
700 << dendl;
701
702 // calculate what the probability *should* be, given the targets
703 unsigned new_prob;
704 if (attempts && dur > 0) {
705 uint64_t avg_size = 1;
706 if (obj)
707 avg_size = MAX(bytes / obj, 1);
708 unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
709 unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
710 / (double)attempts;
711 dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
712 << avg_size << dendl;
713 if (target_obj_sec && target_bytes_sec)
714 new_prob = MIN(po, pb);
715 else if (target_obj_sec)
716 new_prob = po;
717 else if (target_bytes_sec)
718 new_prob = pb;
719 else
720 new_prob = 1000;
721 } else {
722 new_prob = 1000;
723 }
724 dout(20) << __func__ << " new_prob " << new_prob << dendl;
725
726 // correct for persistent skew between target rate and actual rate, adjust
727 double ratio = 1.0;
728 unsigned actual = 0;
729 if (attempts && obj) {
730 actual = obj * 1000 / attempts;
731 ratio = (double)actual / (double)prob;
732 new_prob = (double)new_prob / ratio;
733 }
734 new_prob = MAX(new_prob, min_prob);
735 new_prob = MIN(new_prob, 1000);
736
737 // adjust
738 prob = (prob + new_prob) / 2;
739 prob = MAX(prob, min_prob);
740 prob = MIN(prob, 1000);
741 dout(10) << __func__ << " actual " << actual
742 << ", actual/prob ratio " << ratio
743 << ", adjusted new_prob " << new_prob
744 << ", prob " << promote_probability_millis << " -> " << prob
745 << dendl;
746 promote_probability_millis = prob;
747
748 // set hard limits for this interval to mitigate stampedes
749 promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
750 promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
751}
752
753// -------------------------------------
754
755float OSDService::get_failsafe_full_ratio()
756{
757 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
758 if (full_ratio > 1.0) full_ratio /= 100.0;
759 return full_ratio;
760}
761
224ce89b 762void OSDService::check_full_status(float ratio)
7c673cae
FG
763{
764 Mutex::Locker l(full_status_lock);
765
7c673cae
FG
766 cur_ratio = ratio;
767
768 // The OSDMap ratios take precendence. So if the failsafe is .95 and
769 // the admin sets the cluster full to .96, the failsafe moves up to .96
770 // too. (Not that having failsafe == full is ideal, but it's better than
771 // dropping writes before the clusters appears full.)
772 OSDMapRef osdmap = get_osdmap();
773 if (!osdmap || osdmap->get_epoch() == 0) {
774 cur_state = NONE;
775 return;
776 }
777 float nearfull_ratio = osdmap->get_nearfull_ratio();
778 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
779 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
780 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
781
31f18b77 782 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7c673cae
FG
783 // use the failsafe for nearfull and full; the mon isn't using the
784 // flags anyway because we're mid-upgrade.
785 full_ratio = failsafe_ratio;
786 backfillfull_ratio = failsafe_ratio;
787 nearfull_ratio = failsafe_ratio;
788 } else if (full_ratio <= 0 ||
789 backfillfull_ratio <= 0 ||
790 nearfull_ratio <= 0) {
791 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
792 // use failsafe flag. ick. the monitor did something wrong or the user
793 // did something stupid.
794 full_ratio = failsafe_ratio;
795 backfillfull_ratio = failsafe_ratio;
796 nearfull_ratio = failsafe_ratio;
797 }
798
799 string inject;
800 s_names new_state;
801 if (injectfull_state > NONE && injectfull) {
802 new_state = injectfull_state;
803 inject = "(Injected)";
804 } else if (ratio > failsafe_ratio) {
805 new_state = FAILSAFE;
806 } else if (ratio > full_ratio) {
807 new_state = FULL;
808 } else if (ratio > backfillfull_ratio) {
809 new_state = BACKFILLFULL;
810 } else if (ratio > nearfull_ratio) {
811 new_state = NEARFULL;
812 } else {
813 new_state = NONE;
814 }
815 dout(20) << __func__ << " cur ratio " << ratio
816 << ". nearfull_ratio " << nearfull_ratio
817 << ". backfillfull_ratio " << backfillfull_ratio
818 << ", full_ratio " << full_ratio
819 << ", failsafe_ratio " << failsafe_ratio
820 << ", new state " << get_full_state_name(new_state)
821 << " " << inject
822 << dendl;
823
824 // warn
825 if (cur_state != new_state) {
826 dout(10) << __func__ << " " << get_full_state_name(cur_state)
827 << " -> " << get_full_state_name(new_state) << dendl;
828 if (new_state == FAILSAFE) {
c07f9fc5 829 clog->error() << "full status failsafe engaged, dropping updates, now "
7c673cae
FG
830 << (int)roundf(ratio * 100) << "% full";
831 } else if (cur_state == FAILSAFE) {
c07f9fc5
FG
832 clog->error() << "full status failsafe disengaged, no longer dropping "
833 << "updates, now " << (int)roundf(ratio * 100) << "% full";
7c673cae
FG
834 }
835 cur_state = new_state;
836 }
837}
838
839bool OSDService::need_fullness_update()
840{
841 OSDMapRef osdmap = get_osdmap();
842 s_names cur = NONE;
843 if (osdmap->exists(whoami)) {
844 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
845 cur = FULL;
846 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
847 cur = BACKFILLFULL;
848 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
849 cur = NEARFULL;
850 }
851 }
852 s_names want = NONE;
853 if (is_full())
854 want = FULL;
855 else if (is_backfillfull())
856 want = BACKFILLFULL;
857 else if (is_nearfull())
858 want = NEARFULL;
859 return want != cur;
860}
861
862bool OSDService::_check_full(s_names type, ostream &ss) const
863{
864 Mutex::Locker l(full_status_lock);
865
866 if (injectfull && injectfull_state >= type) {
867 // injectfull is either a count of the number of times to return failsafe full
868 // or if -1 then always return full
869 if (injectfull > 0)
870 --injectfull;
871 ss << "Injected " << get_full_state_name(type) << " OSD ("
872 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
873 return true;
874 }
875
876 ss << "current usage is " << cur_ratio;
877 return cur_state >= type;
878}
879
880bool OSDService::check_failsafe_full(ostream &ss) const
881{
882 return _check_full(FAILSAFE, ss);
883}
884
885bool OSDService::check_full(ostream &ss) const
886{
887 return _check_full(FULL, ss);
888}
889
890bool OSDService::check_backfill_full(ostream &ss) const
891{
892 return _check_full(BACKFILLFULL, ss);
893}
894
895bool OSDService::check_nearfull(ostream &ss) const
896{
897 return _check_full(NEARFULL, ss);
898}
899
900bool OSDService::is_failsafe_full() const
901{
902 Mutex::Locker l(full_status_lock);
903 return cur_state == FAILSAFE;
904}
905
906bool OSDService::is_full() const
907{
908 Mutex::Locker l(full_status_lock);
909 return cur_state >= FULL;
910}
911
912bool OSDService::is_backfillfull() const
913{
914 Mutex::Locker l(full_status_lock);
915 return cur_state >= BACKFILLFULL;
916}
917
918bool OSDService::is_nearfull() const
919{
920 Mutex::Locker l(full_status_lock);
921 return cur_state >= NEARFULL;
922}
923
924void OSDService::set_injectfull(s_names type, int64_t count)
925{
926 Mutex::Locker l(full_status_lock);
927 injectfull_state = type;
928 injectfull = count;
929}
930
224ce89b 931osd_stat_t OSDService::set_osd_stat(const struct store_statfs_t &stbuf,
35e4c445
FG
932 vector<int>& hb_peers,
933 int num_pgs)
7c673cae 934{
224ce89b
WB
935 uint64_t bytes = stbuf.total;
936 uint64_t used = bytes - stbuf.available;
937 uint64_t avail = stbuf.available;
7c673cae 938
224ce89b
WB
939 osd->logger->set(l_osd_stat_bytes, bytes);
940 osd->logger->set(l_osd_stat_bytes_used, used);
941 osd->logger->set(l_osd_stat_bytes_avail, avail);
7c673cae 942
224ce89b
WB
943 {
944 Mutex::Locker l(stat_lock);
945 osd_stat.hb_peers.swap(hb_peers);
946 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
947 osd_stat.kb = bytes >> 10;
948 osd_stat.kb_used = used >> 10;
949 osd_stat.kb_avail = avail >> 10;
35e4c445 950 osd_stat.num_pgs = num_pgs;
224ce89b
WB
951 return osd_stat;
952 }
953}
7c673cae 954
224ce89b
WB
955void OSDService::update_osd_stat(vector<int>& hb_peers)
956{
957 // load osd stats first
7c673cae
FG
958 struct store_statfs_t stbuf;
959 int r = osd->store->statfs(&stbuf);
960 if (r < 0) {
961 derr << "statfs() failed: " << cpp_strerror(r) << dendl;
962 return;
963 }
964
35e4c445 965 auto new_stat = set_osd_stat(stbuf, hb_peers, osd->get_num_pgs());
224ce89b
WB
966 dout(20) << "update_osd_stat " << new_stat << dendl;
967 assert(new_stat.kb);
968 float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
969 check_full_status(ratio);
7c673cae
FG
970}
971
972bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
973{
974 OSDMapRef osdmap = get_osdmap();
975 for (auto shard : missing_on) {
976 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
977 return true;
978 }
979 return false;
980}
981
982void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
983{
984 OSDMapRef next_map = get_nextmap_reserved();
985 // service map is always newer/newest
986 assert(from_epoch <= next_map->get_epoch());
987
988 if (next_map->is_down(peer) ||
989 next_map->get_info(peer).up_from > from_epoch) {
990 m->put();
991 release_map(next_map);
992 return;
993 }
994 const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
995 ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
996 share_map_peer(peer, peer_con.get(), next_map);
997 peer_con->send_message(m);
998 release_map(next_map);
999}
1000
1001ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1002{
1003 OSDMapRef next_map = get_nextmap_reserved();
1004 // service map is always newer/newest
1005 assert(from_epoch <= next_map->get_epoch());
1006
1007 if (next_map->is_down(peer) ||
1008 next_map->get_info(peer).up_from > from_epoch) {
1009 release_map(next_map);
1010 return NULL;
1011 }
1012 ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
1013 release_map(next_map);
1014 return con;
1015}
1016
1017pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1018{
1019 OSDMapRef next_map = get_nextmap_reserved();
1020 // service map is always newer/newest
1021 assert(from_epoch <= next_map->get_epoch());
1022
1023 pair<ConnectionRef,ConnectionRef> ret;
1024 if (next_map->is_down(peer) ||
1025 next_map->get_info(peer).up_from > from_epoch) {
1026 release_map(next_map);
1027 return ret;
1028 }
1029 ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
1030 if (next_map->get_hb_front_addr(peer) != entity_addr_t())
1031 ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
1032 release_map(next_map);
1033 return ret;
1034}
1035
1036
1037void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
1038{
1039 Mutex::Locker l(pg_temp_lock);
1040 map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
1041 if (p == pg_temp_pending.end() ||
1042 p->second != want) {
1043 pg_temp_wanted[pgid] = want;
1044 }
1045}
1046
1047void OSDService::remove_want_pg_temp(pg_t pgid)
1048{
1049 Mutex::Locker l(pg_temp_lock);
1050 pg_temp_wanted.erase(pgid);
1051 pg_temp_pending.erase(pgid);
1052}
1053
1054void OSDService::_sent_pg_temp()
1055{
1056 for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
1057 p != pg_temp_wanted.end();
1058 ++p)
1059 pg_temp_pending[p->first] = p->second;
1060 pg_temp_wanted.clear();
1061}
1062
1063void OSDService::requeue_pg_temp()
1064{
1065 Mutex::Locker l(pg_temp_lock);
1066 // wanted overrides pending. note that remove_want_pg_temp
1067 // clears the item out of both.
1068 unsigned old_wanted = pg_temp_wanted.size();
1069 unsigned old_pending = pg_temp_pending.size();
1070 _sent_pg_temp();
1071 pg_temp_wanted.swap(pg_temp_pending);
1072 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1073 << pg_temp_wanted.size() << dendl;
1074}
1075
1076void OSDService::send_pg_temp()
1077{
1078 Mutex::Locker l(pg_temp_lock);
1079 if (pg_temp_wanted.empty())
1080 return;
1081 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
1082 MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
1083 m->pg_temp = pg_temp_wanted;
1084 monc->send_mon_message(m);
1085 _sent_pg_temp();
1086}
1087
1088void OSDService::send_pg_created(pg_t pgid)
1089{
1090 dout(20) << __func__ << dendl;
c07f9fc5
FG
1091 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1092 monc->send_mon_message(new MOSDPGCreated(pgid));
1093 }
7c673cae
FG
1094}
1095
1096// --------------------------------------
1097// dispatch
1098
1099epoch_t OSDService::get_peer_epoch(int peer)
1100{
1101 Mutex::Locker l(peer_map_epoch_lock);
1102 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1103 if (p == peer_map_epoch.end())
1104 return 0;
1105 return p->second;
1106}
1107
1108epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1109{
1110 Mutex::Locker l(peer_map_epoch_lock);
1111 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1112 if (p != peer_map_epoch.end()) {
1113 if (p->second < e) {
1114 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1115 p->second = e;
1116 } else {
1117 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1118 }
1119 return p->second;
1120 } else {
1121 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1122 peer_map_epoch[peer] = e;
1123 return e;
1124 }
1125}
1126
1127void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1128{
1129 Mutex::Locker l(peer_map_epoch_lock);
1130 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1131 if (p != peer_map_epoch.end()) {
1132 if (p->second <= as_of) {
1133 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1134 << " had " << p->second << dendl;
1135 peer_map_epoch.erase(p);
1136 } else {
1137 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1138 << " has " << p->second << " - not forgetting" << dendl;
1139 }
1140 }
1141}
1142
1143bool OSDService::should_share_map(entity_name_t name, Connection *con,
1144 epoch_t epoch, const OSDMapRef& osdmap,
1145 const epoch_t *sent_epoch_p)
1146{
1147 dout(20) << "should_share_map "
1148 << name << " " << con->get_peer_addr()
1149 << " " << epoch << dendl;
1150
1151 // does client have old map?
1152 if (name.is_client()) {
1153 bool message_sendmap = epoch < osdmap->get_epoch();
1154 if (message_sendmap && sent_epoch_p) {
1155 dout(20) << "client session last_sent_epoch: "
1156 << *sent_epoch_p
1157 << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
1158 if (*sent_epoch_p < osdmap->get_epoch()) {
1159 return true;
1160 } // else we don't need to send it out again
1161 }
1162 }
1163
1164 if (con->get_messenger() == osd->cluster_messenger &&
1165 con != osd->cluster_messenger->get_loopback_connection() &&
1166 osdmap->is_up(name.num()) &&
1167 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1168 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1169 // remember
1170 epoch_t has = MAX(get_peer_epoch(name.num()), epoch);
1171
1172 // share?
1173 if (has < osdmap->get_epoch()) {
1174 dout(10) << name << " " << con->get_peer_addr()
1175 << " has old map " << epoch << " < "
1176 << osdmap->get_epoch() << dendl;
1177 return true;
1178 }
1179 }
1180
1181 return false;
1182}
1183
1184void OSDService::share_map(
1185 entity_name_t name,
1186 Connection *con,
1187 epoch_t epoch,
1188 OSDMapRef& osdmap,
1189 epoch_t *sent_epoch_p)
1190{
1191 dout(20) << "share_map "
1192 << name << " " << con->get_peer_addr()
1193 << " " << epoch << dendl;
1194
1195 if (!osd->is_active()) {
1196 /*It is safe not to proceed as OSD is not in healthy state*/
1197 return;
1198 }
1199
1200 bool want_shared = should_share_map(name, con, epoch,
1201 osdmap, sent_epoch_p);
1202
1203 if (want_shared){
1204 if (name.is_client()) {
1205 dout(10) << name << " has old map " << epoch
1206 << " < " << osdmap->get_epoch() << dendl;
1207 // we know the Session is valid or we wouldn't be sending
1208 if (sent_epoch_p) {
1209 *sent_epoch_p = osdmap->get_epoch();
1210 }
1211 send_incremental_map(epoch, con, osdmap);
1212 } else if (con->get_messenger() == osd->cluster_messenger &&
1213 osdmap->is_up(name.num()) &&
1214 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1215 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1216 dout(10) << name << " " << con->get_peer_addr()
1217 << " has old map " << epoch << " < "
1218 << osdmap->get_epoch() << dendl;
1219 note_peer_epoch(name.num(), osdmap->get_epoch());
1220 send_incremental_map(epoch, con, osdmap);
1221 }
1222 }
1223}
1224
1225void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1226{
1227 if (!map)
1228 map = get_osdmap();
1229
1230 // send map?
1231 epoch_t pe = get_peer_epoch(peer);
1232 if (pe) {
1233 if (pe < map->get_epoch()) {
1234 send_incremental_map(pe, con, map);
1235 note_peer_epoch(peer, map->get_epoch());
1236 } else
1237 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1238 } else {
1239 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1240 // no idea about peer's epoch.
1241 // ??? send recent ???
1242 // do nothing.
1243 }
1244}
1245
1246bool OSDService::can_inc_scrubs_pending()
1247{
1248 bool can_inc = false;
1249 Mutex::Locker l(sched_scrub_lock);
1250
1251 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1252 dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
1253 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1254 can_inc = true;
1255 } else {
1256 dout(20) << __func__ << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1257 }
1258
1259 return can_inc;
1260}
1261
1262bool OSDService::inc_scrubs_pending()
1263{
1264 bool result = false;
1265
1266 sched_scrub_lock.Lock();
1267 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1268 dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
1269 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1270 result = true;
1271 ++scrubs_pending;
1272 } else {
1273 dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1274 }
1275 sched_scrub_lock.Unlock();
1276
1277 return result;
1278}
1279
1280void OSDService::dec_scrubs_pending()
1281{
1282 sched_scrub_lock.Lock();
1283 dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
1284 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1285 --scrubs_pending;
1286 assert(scrubs_pending >= 0);
1287 sched_scrub_lock.Unlock();
1288}
1289
1290void OSDService::inc_scrubs_active(bool reserved)
1291{
1292 sched_scrub_lock.Lock();
1293 ++(scrubs_active);
1294 if (reserved) {
1295 --(scrubs_pending);
1296 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1297 << " (max " << cct->_conf->osd_max_scrubs
1298 << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
1299 assert(scrubs_pending >= 0);
1300 } else {
1301 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1302 << " (max " << cct->_conf->osd_max_scrubs
1303 << ", pending " << scrubs_pending << ")" << dendl;
1304 }
1305 sched_scrub_lock.Unlock();
1306}
1307
1308void OSDService::dec_scrubs_active()
1309{
1310 sched_scrub_lock.Lock();
1311 dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
1312 << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
1313 --scrubs_active;
1314 assert(scrubs_active >= 0);
1315 sched_scrub_lock.Unlock();
1316}
1317
1318void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1319 epoch_t *_bind_epoch) const
1320{
1321 Mutex::Locker l(epoch_lock);
1322 if (_boot_epoch)
1323 *_boot_epoch = boot_epoch;
1324 if (_up_epoch)
1325 *_up_epoch = up_epoch;
1326 if (_bind_epoch)
1327 *_bind_epoch = bind_epoch;
1328}
1329
1330void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1331 const epoch_t *_bind_epoch)
1332{
1333 Mutex::Locker l(epoch_lock);
1334 if (_boot_epoch) {
1335 assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1336 boot_epoch = *_boot_epoch;
1337 }
1338 if (_up_epoch) {
1339 assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1340 up_epoch = *_up_epoch;
1341 }
1342 if (_bind_epoch) {
1343 assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1344 bind_epoch = *_bind_epoch;
1345 }
1346}
1347
1348bool OSDService::prepare_to_stop()
1349{
1350 Mutex::Locker l(is_stopping_lock);
1351 if (get_state() != NOT_STOPPING)
1352 return false;
1353
1354 OSDMapRef osdmap = get_osdmap();
1355 if (osdmap && osdmap->is_up(whoami)) {
1356 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
1357 set_state(PREPARING_TO_STOP);
1358 monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
1359 osdmap->get_inst(whoami),
1360 osdmap->get_epoch(),
1361 true // request ack
1362 ));
1363 utime_t now = ceph_clock_now();
1364 utime_t timeout;
1365 timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
1366 while ((ceph_clock_now() < timeout) &&
1367 (get_state() != STOPPING)) {
1368 is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
1369 }
1370 }
1371 dout(0) << __func__ << " starting shutdown" << dendl;
1372 set_state(STOPPING);
1373 return true;
1374}
1375
1376void OSDService::got_stop_ack()
1377{
1378 Mutex::Locker l(is_stopping_lock);
1379 if (get_state() == PREPARING_TO_STOP) {
1380 dout(0) << __func__ << " starting shutdown" << dendl;
1381 set_state(STOPPING);
1382 is_stopping_cond.Signal();
1383 } else {
1384 dout(10) << __func__ << " ignoring msg" << dendl;
1385 }
1386}
1387
1388MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1389 OSDSuperblock& sblock)
1390{
1391 MOSDMap *m = new MOSDMap(monc->get_fsid());
1392 m->oldest_map = max_oldest_map;
1393 m->newest_map = sblock.newest_map;
1394
1395 for (epoch_t e = to; e > since; e--) {
1396 bufferlist bl;
1397 if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
1398 m->incremental_maps[e].claim(bl);
1399 } else if (get_map_bl(e, bl)) {
1400 m->maps[e].claim(bl);
1401 break;
1402 } else {
1403 derr << "since " << since << " to " << to
1404 << " oldest " << m->oldest_map << " newest " << m->newest_map
1405 << dendl;
1406 m->put();
1407 m = NULL;
1408 break;
1409 }
1410 }
1411 return m;
1412}
1413
1414void OSDService::send_map(MOSDMap *m, Connection *con)
1415{
1416 con->send_message(m);
1417}
1418
1419void OSDService::send_incremental_map(epoch_t since, Connection *con,
1420 OSDMapRef& osdmap)
1421{
1422 epoch_t to = osdmap->get_epoch();
1423 dout(10) << "send_incremental_map " << since << " -> " << to
1424 << " to " << con << " " << con->get_peer_addr() << dendl;
1425
1426 MOSDMap *m = NULL;
1427 while (!m) {
1428 OSDSuperblock sblock(get_superblock());
1429 if (since < sblock.oldest_map) {
1430 // just send latest full map
1431 MOSDMap *m = new MOSDMap(monc->get_fsid());
1432 m->oldest_map = max_oldest_map;
1433 m->newest_map = sblock.newest_map;
1434 get_map_bl(to, m->maps[to]);
1435 send_map(m, con);
1436 return;
1437 }
1438
1439 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1440 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1441 << ", only sending most recent" << dendl;
1442 since = to - cct->_conf->osd_map_share_max_epochs;
1443 }
1444
1445 if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
1446 to = since + cct->_conf->osd_map_message_max;
1447 m = build_incremental_map_msg(since, to, sblock);
1448 }
1449 send_map(m, con);
1450}
1451
1452bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1453{
1454 bool found = map_bl_cache.lookup(e, &bl);
31f18b77
FG
1455 if (found) {
1456 if (logger)
1457 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1458 return true;
31f18b77
FG
1459 }
1460 if (logger)
1461 logger->inc(l_osd_map_bl_cache_miss);
7c673cae 1462 found = store->read(coll_t::meta(),
31f18b77
FG
1463 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1464 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1465 if (found) {
7c673cae 1466 _add_map_bl(e, bl);
31f18b77 1467 }
7c673cae
FG
1468 return found;
1469}
1470
1471bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1472{
1473 Mutex::Locker l(map_cache_lock);
1474 bool found = map_bl_inc_cache.lookup(e, &bl);
31f18b77
FG
1475 if (found) {
1476 if (logger)
1477 logger->inc(l_osd_map_bl_cache_hit);
7c673cae 1478 return true;
31f18b77
FG
1479 }
1480 if (logger)
1481 logger->inc(l_osd_map_bl_cache_miss);
7c673cae 1482 found = store->read(coll_t::meta(),
31f18b77
FG
1483 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1484 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1485 if (found) {
7c673cae 1486 _add_map_inc_bl(e, bl);
31f18b77 1487 }
7c673cae
FG
1488 return found;
1489}
1490
1491void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1492{
1493 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1494 // cache a contiguous buffer
1495 if (bl.get_num_buffers() > 1) {
1496 bl.rebuild();
1497 }
1498 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1499 map_bl_cache.add(e, bl);
1500}
1501
1502void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1503{
1504 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
31f18b77
FG
1505 // cache a contiguous buffer
1506 if (bl.get_num_buffers() > 1) {
1507 bl.rebuild();
1508 }
1509 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
7c673cae
FG
1510 map_bl_inc_cache.add(e, bl);
1511}
1512
1513void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
1514{
1515 Mutex::Locker l(map_cache_lock);
31f18b77
FG
1516 // cache a contiguous buffer
1517 if (bl.get_num_buffers() > 1) {
1518 bl.rebuild();
1519 }
7c673cae
FG
1520 map_bl_inc_cache.pin(e, bl);
1521}
1522
1523void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
1524{
1525 Mutex::Locker l(map_cache_lock);
31f18b77
FG
1526 // cache a contiguous buffer
1527 if (bl.get_num_buffers() > 1) {
1528 bl.rebuild();
1529 }
7c673cae
FG
1530 map_bl_cache.pin(e, bl);
1531}
1532
1533void OSDService::clear_map_bl_cache_pins(epoch_t e)
1534{
1535 Mutex::Locker l(map_cache_lock);
1536 map_bl_inc_cache.clear_pinned(e);
1537 map_bl_cache.clear_pinned(e);
1538}
1539
1540OSDMapRef OSDService::_add_map(OSDMap *o)
1541{
1542 epoch_t e = o->get_epoch();
1543
1544 if (cct->_conf->osd_map_dedup) {
1545 // Dedup against an existing map at a nearby epoch
1546 OSDMapRef for_dedup = map_cache.lower_bound(e);
1547 if (for_dedup) {
1548 OSDMap::dedup(for_dedup.get(), o);
1549 }
1550 }
1551 bool existed;
1552 OSDMapRef l = map_cache.add(e, o, &existed);
1553 if (existed) {
1554 delete o;
1555 }
1556 return l;
1557}
1558
1559OSDMapRef OSDService::try_get_map(epoch_t epoch)
1560{
1561 Mutex::Locker l(map_cache_lock);
1562 OSDMapRef retval = map_cache.lookup(epoch);
1563 if (retval) {
1564 dout(30) << "get_map " << epoch << " -cached" << dendl;
1565 if (logger) {
1566 logger->inc(l_osd_map_cache_hit);
1567 }
1568 return retval;
1569 }
1570 if (logger) {
1571 logger->inc(l_osd_map_cache_miss);
1572 epoch_t lb = map_cache.cached_key_lower_bound();
1573 if (epoch < lb) {
1574 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1575 logger->inc(l_osd_map_cache_miss_low);
1576 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
1577 }
1578 }
1579
1580 OSDMap *map = new OSDMap;
1581 if (epoch > 0) {
1582 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1583 bufferlist bl;
1584 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1585 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1586 delete map;
1587 return OSDMapRef();
1588 }
1589 map->decode(bl);
1590 } else {
1591 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1592 }
1593 return _add_map(map);
1594}
1595
1596// ops
1597
1598
1599void OSDService::reply_op_error(OpRequestRef op, int err)
1600{
1601 reply_op_error(op, err, eversion_t(), 0);
1602}
1603
1604void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1605 version_t uv)
1606{
1607 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1608 assert(m->get_type() == CEPH_MSG_OSD_OP);
1609 int flags;
1610 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1611
1612 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1613 true);
1614 reply->set_reply_versions(v, uv);
1615 m->get_connection()->send_message(reply);
1616}
1617
1618void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1619{
31f18b77
FG
1620 if (!cct->_conf->osd_debug_misdirected_ops) {
1621 return;
1622 }
1623
7c673cae
FG
1624 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1625 assert(m->get_type() == CEPH_MSG_OSD_OP);
1626
1627 assert(m->get_map_epoch() >= pg->info.history.same_primary_since);
1628
1629 if (pg->is_ec_pg()) {
1630 /**
1631 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1632 * can get this result:
1633 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1634 * [CRUSH_ITEM_NONE, 2, 3]/3
1635 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1636 * [3, 2, 3]/3
1637 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1638 * -- misdirected op
1639 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1640 * it and fulfils it
1641 *
1642 * We can't compute the op target based on the sending map epoch due to
1643 * splitting. The simplest thing is to detect such cases here and drop
1644 * them without an error (the client will resend anyway).
1645 */
1646 assert(m->get_map_epoch() <= superblock.newest_map);
1647 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1648 if (!opmap) {
1649 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1650 << m->get_map_epoch() << ", dropping" << dendl;
1651 return;
1652 }
1653 pg_t _pgid = m->get_raw_pg();
1654 spg_t pgid;
1655 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1656 _pgid = opmap->raw_pg_to_pg(_pgid);
1657 if (opmap->get_primary_shard(_pgid, &pgid) &&
1658 pgid.shard != pg->info.pgid.shard) {
1659 dout(7) << __func__ << ": " << *pg << " primary changed since "
1660 << m->get_map_epoch() << ", dropping" << dendl;
1661 return;
1662 }
1663 }
1664
1665 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1666 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1667 << " pg " << m->get_raw_pg()
1668 << " to osd." << whoami
1669 << " not " << pg->acting
1670 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
7c673cae
FG
1671}
1672
1673void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
1674{
1675 osd->op_shardedwq.queue(make_pair(pgid, qi));
1676}
1677
1678void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
1679{
1680 osd->op_shardedwq.queue_front(make_pair(pgid, qi));
1681}
1682
1683void OSDService::queue_for_peering(PG *pg)
1684{
1685 peering_wq.queue(pg);
1686}
1687
1688void OSDService::queue_for_snap_trim(PG *pg)
1689{
1690 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1691 osd->op_shardedwq.queue(
1692 make_pair(
1693 pg->info.pgid,
1694 PGQueueable(
1695 PGSnapTrim(pg->get_osdmap()->get_epoch()),
1696 cct->_conf->osd_snap_trim_cost,
1697 cct->_conf->osd_snap_trim_priority,
1698 ceph_clock_now(),
1699 entity_inst_t(),
1700 pg->get_osdmap()->get_epoch())));
1701}
1702
1703
1704// ====================================================================
1705// OSD
1706
1707#undef dout_prefix
1708#define dout_prefix *_dout
1709
1710// Commands shared between OSD's console and admin console:
// Only a forward declaration lives here; no definition of heap() appears
// in this part of the file.
1711namespace ceph {
1712namespace osd_cmds {
1713
// heap command handler; takes the parsed command map, a Formatter and an
// output stream, and returns an int status.
1714int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
1715
1716}} // namespace ceph::osd_cmds
1717
/**
 * Initialise an object store for use by OSD `whoami`.
 *
 * Steps: set the store fsid from the osd_uuid config value, run the
 * store's mkfs(), mount it, then either validate an existing superblock
 * (its whoami and cluster_fsid must match the arguments, otherwise
 * -EINVAL) or write a fresh one in a transaction.  Finally the meta keys
 * are written via write_meta().
 *
 * Cleanup is done through the two labels at the end: `umount_store`
 * unmounts and falls through to `free_store`, which deletes `store`.  So
 * `store` is always deleted before returning, on success and on failure.
 *
 * @param cct    context supplying configuration
 * @param store  object store to initialise; deleted by this function
 * @param dev    not referenced in this function
 * @param fsid   cluster fsid to record / validate
 * @param whoami OSD id to record / validate
 * @return 0 on success, otherwise the error from the failing step
 */
1718int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
1719 uuid_d fsid, int whoami)
1720{
1721 int ret;
1722
1723 ceph::shared_ptr<ObjectStore::Sequencer> osr(
1724 new ObjectStore::Sequencer("mkfs"));
1725 OSDSuperblock sb;
1726 bufferlist sbbl;
1727 C_SaferCond waiter;
1728
1729 // if we are fed a uuid for this osd, use it.
1730 store->set_fsid(cct->_conf->osd_uuid);
1731
1732 ret = store->mkfs();
1733 if (ret) {
224ce89b
WB
1734 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1735 << cpp_strerror(ret) << dendl;
7c673cae
FG
// not mounted yet, so skip the umount step
1736 goto free_store;
1737 }
1738
31f18b77 1739 store->set_cache_shards(1); // doesn't matter for mkfs!
7c673cae
FG
1740
1741 ret = store->mount();
1742 if (ret) {
224ce89b
WB
1743 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1744 << cpp_strerror(ret) << dendl;
7c673cae
FG
1745 goto free_store;
1746 }
1747
// a successful read means a superblock object already exists
1748 ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1749 if (ret >= 0) {
1750 /* if we already have superblock, check content of superblock */
1751 dout(0) << " have superblock" << dendl;
1752 bufferlist::iterator p;
1753 p = sbbl.begin();
1754 ::decode(sb, p);
1755 if (whoami != sb.whoami) {
1756 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1757 << dendl;
1758 ret = -EINVAL;
1759 goto umount_store;
1760 }
1761 if (fsid != sb.cluster_fsid) {
1762 derr << "provided cluster fsid " << fsid
1763 << " != superblock's " << sb.cluster_fsid << dendl;
1764 ret = -EINVAL;
1765 goto umount_store;
1766 }
1767 } else {
1768 // create superblock
1769 sb.cluster_fsid = fsid;
1770 sb.osd_fsid = store->get_fsid();
1771 sb.whoami = whoami;
1772 sb.compat_features = get_osd_initial_compat_set();
1773
1774 bufferlist bl;
1775 ::encode(sb, bl);
1776
// create the meta collection and write the encoded superblock into it
1777 ObjectStore::Transaction t;
1778 t.create_collection(coll_t::meta(), 0);
1779 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
1780 ret = store->apply_transaction(osr.get(), std::move(t));
1781 if (ret) {
1782 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
224ce89b 1783 << "apply_transaction returned " << cpp_strerror(ret) << dendl;
7c673cae
FG
1784 goto umount_store;
1785 }
1786 }
1787
// block on `waiter` only when flush_commit() returns false
1788 if (!osr->flush_commit(&waiter)) {
1789 waiter.wait();
1790 }
1791
1792 ret = write_meta(store, sb.cluster_fsid, sb.osd_fsid, whoami);
1793 if (ret) {
224ce89b
WB
1794 derr << "OSD::mkfs: failed to write fsid file: error "
1795 << cpp_strerror(ret) << dendl;
7c673cae
FG
1796 goto umount_store;
1797 }
1798
// success also falls through both labels: unmount, delete, return ret
1799umount_store:
1800 store->umount();
1801free_store:
1802 delete store;
1803 return ret;
1804}
1805
1806int OSD::write_meta(ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
1807{
1808 char val[80];
1809 int r;
1810
1811 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
1812 r = store->write_meta("magic", val);
1813 if (r < 0)
1814 return r;
1815
1816 snprintf(val, sizeof(val), "%d", whoami);
1817 r = store->write_meta("whoami", val);
1818 if (r < 0)
1819 return r;
1820
1821 cluster_fsid.print(val);
1822 r = store->write_meta("ceph_fsid", val);
1823 if (r < 0)
1824 return r;
1825
1826 r = store->write_meta("ready", "ready");
1827 if (r < 0)
1828 return r;
1829
1830 return 0;
1831}
1832
1833int OSD::peek_meta(ObjectStore *store, std::string& magic,
1834 uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
1835{
1836 string val;
1837
1838 int r = store->read_meta("magic", &val);
1839 if (r < 0)
1840 return r;
1841 magic = val;
1842
1843 r = store->read_meta("whoami", &val);
1844 if (r < 0)
1845 return r;
1846 whoami = atoi(val.c_str());
1847
1848 r = store->read_meta("ceph_fsid", &val);
1849 if (r < 0)
1850 return r;
1851 r = cluster_fsid.parse(val.c_str());
1852 if (!r)
1853 return -EINVAL;
1854
1855 r = store->read_meta("fsid", &val);
1856 if (r < 0) {
1857 osd_fsid = uuid_d();
1858 } else {
1859 r = osd_fsid.parse(val.c_str());
1860 if (!r)
1861 return -EINVAL;
1862 }
1863
1864 return 0;
1865}
1866
1867
1868#undef dout_prefix
1869#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1870
1871// cons/des
1872
/**
 * Construct an OSD around an object store and a set of messengers.
 *
 * The initializer list stores the supplied store, messengers and
 * MonClient, names the locks, and configures the thread pools and work
 * queues from config values.  The body connects the MonClient to the
 * client messenger and applies the op tracker settings; under WITH_BLKIN
 * it also names the trace endpoint "osd.<whoami>".
 *
 * @param cct_               context supplying configuration
 * @param store_             object store backing this OSD
 * @param id                 OSD id (stored as whoami)
 * @param internal_messenger stored as cluster_messenger
 * @param external_messenger stored as client_messenger
 * @param hb_client_front    stored as hb_front_client_messenger
 * @param hb_client_back     stored as hb_back_client_messenger
 * @param hb_front_serverm   stored as hb_front_server_messenger
 * @param hb_back_serverm    stored as hb_back_server_messenger
 * @param osdc_messenger     stored as objecter_messenger
 * @param mc                 monitor client (stored as monc)
 * @param dev                stored as dev_path
 * @param jdev               stored as journal_path
 */
1873OSD::OSD(CephContext *cct_, ObjectStore *store_,
1874 int id,
1875 Messenger *internal_messenger,
1876 Messenger *external_messenger,
1877 Messenger *hb_client_front,
1878 Messenger *hb_client_back,
1879 Messenger *hb_front_serverm,
1880 Messenger *hb_back_serverm,
1881 Messenger *osdc_messenger,
1882 MonClient *mc,
1883 const std::string &dev, const std::string &jdev) :
1884 Dispatcher(cct_),
1885 osd_lock("OSD::osd_lock"),
1886 tick_timer(cct, osd_lock),
1887 tick_timer_lock("OSD::tick_timer_lock"),
1888 tick_timer_without_osd_lock(cct, tick_timer_lock),
// both auth registries use auth_supported when it is non-empty, and fall
// back to auth_cluster_required / auth_service_required otherwise
1889 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
1890 cct->_conf->auth_supported.empty() ?
1891 cct->_conf->auth_cluster_required :
1892 cct->_conf->auth_supported)),
1893 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
1894 cct->_conf->auth_supported.empty() ?
1895 cct->_conf->auth_service_required :
1896 cct->_conf->auth_supported)),
1897 cluster_messenger(internal_messenger),
1898 client_messenger(external_messenger),
1899 objecter_messenger(osdc_messenger),
1900 monc(mc),
1901 mgrc(cct_, client_messenger),
1902 logger(NULL),
1903 recoverystate_perf(NULL),
1904 store(store_),
1905 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
1906 clog(log_client.create_channel()),
1907 whoami(id),
1908 dev_path(dev), journal_path(jdev),
31f18b77 1909 store_is_rotational(store->is_rotational()),
7c673cae
FG
1910 trace_endpoint("0.0.0.0", 0, "osd"),
1911 asok_hook(NULL),
1912 osd_compat(get_osd_compat_set()),
31f18b77
FG
1913 peering_tp(cct, "OSD::peering_tp", "tp_peering",
1914 cct->_conf->osd_peering_wq_threads,
1915 "osd_peering_tp_threads"),
7c673cae 1916 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
31f18b77 1917 get_num_op_threads()),
7c673cae
FG
1918 disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
1919 command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
1920 session_waiting_lock("OSD::session_waiting_lock"),
181888fb 1921 osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
7c673cae
FG
1922 heartbeat_lock("OSD::heartbeat_lock"),
1923 heartbeat_stop(false),
1924 heartbeat_need_update(true),
1925 hb_front_client_messenger(hb_client_front),
1926 hb_back_client_messenger(hb_client_back),
1927 hb_front_server_messenger(hb_front_serverm),
1928 hb_back_server_messenger(hb_back_serverm),
1929 daily_loadavg(0.0),
1930 heartbeat_thread(this),
1931 heartbeat_dispatcher(this),
1932 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
1933 cct->_conf->osd_num_op_tracker_shard),
1934 test_ops_hook(NULL),
1935 op_queue(get_io_queue()),
1936 op_prio_cutoff(get_io_prio_cut()),
// sharded op queue, served by osd_op_tp
1937 op_shardedwq(
31f18b77 1938 get_num_op_shards(),
7c673cae
FG
1939 this,
1940 cct->_conf->osd_op_thread_timeout,
1941 cct->_conf->osd_op_thread_suicide_timeout,
1942 &osd_op_tp),
// the peering queue reuses the op thread timeouts but runs on peering_tp
1943 peering_wq(
1944 this,
1945 cct->_conf->osd_op_thread_timeout,
1946 cct->_conf->osd_op_thread_suicide_timeout,
31f18b77 1947 &peering_tp),
7c673cae
FG
1948 map_lock("OSD::map_lock"),
1949 pg_map_lock("OSD::pg_map_lock"),
1950 last_pg_create_epoch(0),
1951 mon_report_lock("OSD::mon_report_lock"),
1952 stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
1953 up_thru_wanted(0),
1954 requested_full_first(0),
1955 requested_full_last(0),
1956 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
1957 osd_stat_updated(false),
1958 pg_stat_tid(0), pg_stat_tid_flushed(0),
1959 command_wq(
1960 this,
1961 cct->_conf->osd_command_thread_timeout,
1962 cct->_conf->osd_command_thread_suicide_timeout,
1963 &command_tp),
1964 remove_wq(
1965 cct,
1966 store,
1967 cct->_conf->osd_remove_thread_timeout,
1968 cct->_conf->osd_remove_thread_suicide_timeout,
1969 &disk_tp),
1970 service(this)
1971{
1972 monc->set_messenger(client_messenger);
// op tracker thresholds and history sizes all come from config
1973 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
1974 cct->_conf->osd_op_log_threshold);
1975 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
1976 cct->_conf->osd_op_history_duration);
1977 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
1978 cct->_conf->osd_op_history_slow_op_threshold);
1979#ifdef WITH_BLKIN
// replace the placeholder "osd" endpoint name with "osd.<id>"
1980 std::stringstream ss;
1981 ss << "osd." << whoami;
1982 trace_endpoint.copy_name(ss.str());
1983#endif
1984}
1985
1986OSD::~OSD()
1987{
1988 delete authorize_handler_cluster_registry;
1989 delete authorize_handler_service_registry;
1990 delete class_handler;
1991 cct->get_perfcounters_collection()->remove(recoverystate_perf);
1992 cct->get_perfcounters_collection()->remove(logger);
1993 delete recoverystate_perf;
1994 delete logger;
1995 delete store;
1996}
1997
1998void cls_initialize(ClassHandler *ch);
1999
2000void OSD::handle_signal(int signum)
2001{
2002 assert(signum == SIGINT || signum == SIGTERM);
2003 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
2004 shutdown();
2005}
2006
2007int OSD::pre_init()
2008{
2009 Mutex::Locker lock(osd_lock);
2010 if (is_stopping())
2011 return 0;
2012
2013 if (store->test_mount_in_use()) {
2014 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2015 << "currently in use. (Is ceph-osd already running?)" << dendl;
2016 return -EBUSY;
2017 }
2018
2019 cct->_conf->add_observer(this);
2020 return 0;
2021}
2022
2023// asok
2024
2025class OSDSocketHook : public AdminSocketHook {
2026 OSD *osd;
2027public:
2028 explicit OSDSocketHook(OSD *o) : osd(o) {}
2029 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2030 bufferlist& out) override {
2031 stringstream ss;
2032 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
2033 out.append(ss);
2034 return r;
2035 }
2036};
2037
2038bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
2039 ostream& ss)
2040{
2041 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2042 if (admin_command == "status") {
2043 f->open_object_section("status");
2044 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2045 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2046 f->dump_unsigned("whoami", superblock.whoami);
2047 f->dump_string("state", get_state_name(get_state()));
2048 f->dump_unsigned("oldest_map", superblock.oldest_map);
2049 f->dump_unsigned("newest_map", superblock.newest_map);
2050 {
2051 RWLock::RLocker l(pg_map_lock);
2052 f->dump_unsigned("num_pgs", pg_map.size());
2053 }
2054 f->close_section();
2055 } else if (admin_command == "flush_journal") {
2056 store->flush_journal();
2057 } else if (admin_command == "dump_ops_in_flight" ||
c07f9fc5
FG
2058 admin_command == "ops" ||
2059 admin_command == "dump_blocked_ops" ||
2060 admin_command == "dump_historic_ops" ||
2061 admin_command == "dump_historic_ops_by_duration" ||
2062 admin_command == "dump_historic_slow_ops") {
2063
2064 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2065even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2066will start to track new ops received afterwards.";
2067
2068 set<string> filters;
2069 vector<string> filter_str;
2070 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2071 copy(filter_str.begin(), filter_str.end(),
2072 inserter(filters, filters.end()));
2073 }
2074
2075 if (admin_command == "dump_ops_in_flight" ||
2076 admin_command == "ops") {
2077 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2078 ss << error_str;
2079 }
2080 }
2081 if (admin_command == "dump_blocked_ops") {
2082 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2083 ss << error_str;
2084 }
2085 }
2086 if (admin_command == "dump_historic_ops") {
2087 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2088 ss << error_str;
2089 }
2090 }
2091 if (admin_command == "dump_historic_ops_by_duration") {
2092 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2093 ss << error_str;
2094 }
2095 }
2096 if (admin_command == "dump_historic_slow_ops") {
2097 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2098 ss << error_str;
2099 }
7c673cae
FG
2100 }
2101 } else if (admin_command == "dump_op_pq_state") {
2102 f->open_object_section("pq");
2103 op_shardedwq.dump(f);
2104 f->close_section();
2105 } else if (admin_command == "dump_blacklist") {
2106 list<pair<entity_addr_t,utime_t> > bl;
2107 OSDMapRef curmap = service.get_osdmap();
2108
2109 f->open_array_section("blacklist");
2110 curmap->get_blacklist(&bl);
2111 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2112 it != bl.end(); ++it) {
224ce89b 2113 f->open_object_section("entry");
7c673cae
FG
2114 f->open_object_section("entity_addr_t");
2115 it->first.dump(f);
2116 f->close_section(); //entity_addr_t
2117 it->second.localtime(f->dump_stream("expire_time"));
2118 f->close_section(); //entry
2119 }
2120 f->close_section(); //blacklist
2121 } else if (admin_command == "dump_watchers") {
2122 list<obj_watch_item_t> watchers;
2123 // scan pg's
2124 {
2125 Mutex::Locker l(osd_lock);
2126 RWLock::RLocker l2(pg_map_lock);
2127 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2128 it != pg_map.end();
2129 ++it) {
2130
2131 list<obj_watch_item_t> pg_watchers;
2132 PG *pg = it->second;
2133 pg->lock();
2134 pg->get_watchers(pg_watchers);
2135 pg->unlock();
2136 watchers.splice(watchers.end(), pg_watchers);
2137 }
2138 }
2139
2140 f->open_array_section("watchers");
2141 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2142 it != watchers.end(); ++it) {
2143
224ce89b 2144 f->open_object_section("watch");
7c673cae
FG
2145
2146 f->dump_string("namespace", it->obj.nspace);
2147 f->dump_string("object", it->obj.oid.name);
2148
2149 f->open_object_section("entity_name");
2150 it->wi.name.dump(f);
2151 f->close_section(); //entity_name_t
2152
224ce89b
WB
2153 f->dump_unsigned("cookie", it->wi.cookie);
2154 f->dump_unsigned("timeout", it->wi.timeout_seconds);
7c673cae
FG
2155
2156 f->open_object_section("entity_addr_t");
2157 it->wi.addr.dump(f);
2158 f->close_section(); //entity_addr_t
2159
2160 f->close_section(); //watch
2161 }
2162
2163 f->close_section(); //watchers
2164 } else if (admin_command == "dump_reservations") {
2165 f->open_object_section("reservations");
2166 f->open_object_section("local_reservations");
2167 service.local_reserver.dump(f);
2168 f->close_section();
2169 f->open_object_section("remote_reservations");
2170 service.remote_reserver.dump(f);
2171 f->close_section();
2172 f->close_section();
2173 } else if (admin_command == "get_latest_osdmap") {
2174 get_latest_osdmap();
2175 } else if (admin_command == "heap") {
2176 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2177
2178 // Note: Failed heap profile commands won't necessarily trigger an error:
2179 f->open_object_section("result");
2180 f->dump_string("error", cpp_strerror(result));
2181 f->dump_bool("success", result >= 0);
2182 f->close_section();
2183 } else if (admin_command == "set_heap_property") {
2184 string property;
2185 int64_t value = 0;
2186 string error;
2187 bool success = false;
2188 if (!cmd_getval(cct, cmdmap, "property", property)) {
2189 error = "unable to get property";
2190 success = false;
2191 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2192 error = "unable to get value";
2193 success = false;
2194 } else if (value < 0) {
2195 error = "negative value not allowed";
2196 success = false;
2197 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2198 error = "invalid property";
2199 success = false;
2200 } else {
2201 success = true;
2202 }
2203 f->open_object_section("result");
2204 f->dump_string("error", error);
2205 f->dump_bool("success", success);
2206 f->close_section();
2207 } else if (admin_command == "get_heap_property") {
2208 string property;
2209 size_t value = 0;
2210 string error;
2211 bool success = false;
2212 if (!cmd_getval(cct, cmdmap, "property", property)) {
2213 error = "unable to get property";
2214 success = false;
2215 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2216 error = "invalid property";
2217 success = false;
2218 } else {
2219 success = true;
2220 }
2221 f->open_object_section("result");
2222 f->dump_string("error", error);
2223 f->dump_bool("success", success);
2224 f->dump_int("value", value);
2225 f->close_section();
2226 } else if (admin_command == "dump_objectstore_kv_stats") {
2227 store->get_db_statistics(f);
2228 } else if (admin_command == "dump_scrubs") {
2229 service.dumps_scrub(f);
2230 } else if (admin_command == "calc_objectstore_db_histogram") {
2231 store->generate_db_histogram(f);
2232 } else if (admin_command == "flush_store_cache") {
2233 store->flush_cache();
2234 } else if (admin_command == "dump_pgstate_history") {
2235 f->open_object_section("pgstate_history");
2236 RWLock::RLocker l2(pg_map_lock);
2237 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2238 it != pg_map.end();
2239 ++it) {
2240
2241 PG *pg = it->second;
2242 f->dump_stream("pg") << pg->get_pgid();
2243 pg->lock();
2244 pg->pgstate_history.dump(f);
2245 pg->unlock();
2246 }
2247 f->close_section();
224ce89b
WB
2248 } else if (admin_command == "compact") {
2249 dout(1) << "triggering manual compaction" << dendl;
2250 auto start = ceph::coarse_mono_clock::now();
2251 store->compact();
2252 auto end = ceph::coarse_mono_clock::now();
2253 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
2254 dout(1) << "finished manual compaction in "
2255 << time_span.count()
2256 << " seconds" << dendl;
2257 f->open_object_section("compact_result");
2258 f->dump_float("elapsed_time", time_span.count());
2259 f->close_section();
7c673cae
FG
2260 } else {
2261 assert(0 == "broken asok registration");
2262 }
2263 f->flush(ss);
2264 delete f;
2265 return true;
2266}
2267
2268class TestOpsSocketHook : public AdminSocketHook {
2269 OSDService *service;
2270 ObjectStore *store;
2271public:
2272 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2273 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2274 bufferlist& out) override {
2275 stringstream ss;
2276 test_ops(service, store, command, cmdmap, ss);
2277 out.append(ss);
2278 return true;
2279 }
2280 void test_ops(OSDService *service, ObjectStore *store,
2281 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2282
2283};
2284
2285class OSD::C_Tick : public Context {
2286 OSD *osd;
2287 public:
2288 explicit C_Tick(OSD *o) : osd(o) {}
2289 void finish(int r) override {
2290 osd->tick();
2291 }
2292};
2293
2294class OSD::C_Tick_WithoutOSDLock : public Context {
2295 OSD *osd;
2296 public:
2297 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2298 void finish(int r) override {
2299 osd->tick_without_osd_lock();
2300 }
2301};
2302
2303int OSD::enable_disable_fuse(bool stop)
2304{
2305#ifdef HAVE_LIBFUSE
2306 int r;
2307 string mntpath = cct->_conf->osd_data + "/fuse";
2308 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2309 dout(1) << __func__ << " disabling" << dendl;
2310 fuse_store->stop();
2311 delete fuse_store;
2312 fuse_store = NULL;
2313 r = ::rmdir(mntpath.c_str());
7c673cae 2314 if (r < 0) {
c07f9fc5
FG
2315 r = -errno;
2316 derr << __func__ << " failed to rmdir " << mntpath << ": "
2317 << cpp_strerror(r) << dendl;
7c673cae
FG
2318 return r;
2319 }
2320 return 0;
2321 }
2322 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2323 dout(1) << __func__ << " enabling" << dendl;
2324 r = ::mkdir(mntpath.c_str(), 0700);
2325 if (r < 0)
2326 r = -errno;
2327 if (r < 0 && r != -EEXIST) {
2328 derr << __func__ << " unable to create " << mntpath << ": "
2329 << cpp_strerror(r) << dendl;
2330 return r;
2331 }
2332 fuse_store = new FuseStore(store, mntpath);
2333 r = fuse_store->start();
2334 if (r < 0) {
2335 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2336 delete fuse_store;
2337 fuse_store = NULL;
2338 return r;
2339 }
2340 }
2341#endif // HAVE_LIBFUSE
2342 return 0;
2343}
2344
31f18b77
FG
2345int OSD::get_num_op_shards()
2346{
2347 if (cct->_conf->osd_op_num_shards)
2348 return cct->_conf->osd_op_num_shards;
2349 if (store_is_rotational)
2350 return cct->_conf->osd_op_num_shards_hdd;
2351 else
2352 return cct->_conf->osd_op_num_shards_ssd;
2353}
2354
2355int OSD::get_num_op_threads()
2356{
2357 if (cct->_conf->osd_op_num_threads_per_shard)
2358 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2359 if (store_is_rotational)
2360 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2361 else
2362 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2363}
2364
c07f9fc5
FG
2365float OSD::get_osd_recovery_sleep()
2366{
2367 if (cct->_conf->osd_recovery_sleep)
2368 return cct->_conf->osd_recovery_sleep;
d2e6a577 2369 if (!store_is_rotational && !journal_is_rotational)
c07f9fc5 2370 return cct->_conf->osd_recovery_sleep_ssd;
d2e6a577
FG
2371 else if (store_is_rotational && !journal_is_rotational)
2372 return cct->_conf->get_val<double>("osd_recovery_sleep_hybrid");
2373 else
2374 return cct->_conf->osd_recovery_sleep_hdd;
c07f9fc5
FG
2375}
2376
7c673cae
FG
2377int OSD::init()
2378{
2379 CompatSet initial, diff;
2380 Mutex::Locker lock(osd_lock);
2381 if (is_stopping())
2382 return 0;
2383
2384 tick_timer.init();
2385 tick_timer_without_osd_lock.init();
2386 service.recovery_request_timer.init();
31f18b77 2387 service.recovery_sleep_timer.init();
7c673cae
FG
2388
2389 // mount.
31f18b77
FG
2390 dout(2) << "init " << dev_path
2391 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2392 << dendl;
d2e6a577 2393 dout(2) << "journal " << journal_path << dendl;
7c673cae
FG
2394 assert(store); // call pre_init() first!
2395
31f18b77 2396 store->set_cache_shards(get_num_op_shards());
7c673cae
FG
2397
2398 int r = store->mount();
2399 if (r < 0) {
2400 derr << "OSD:init: unable to mount object store" << dendl;
2401 return r;
2402 }
d2e6a577
FG
2403 journal_is_rotational = store->is_journal_rotational();
2404 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
2405 << dendl;
7c673cae
FG
2406
2407 enable_disable_fuse(false);
2408
2409 dout(2) << "boot" << dendl;
2410
2411 // initialize the daily loadavg with current 15min loadavg
2412 double loadavgs[3];
2413 if (getloadavg(loadavgs, 3) == 3) {
2414 daily_loadavg = loadavgs[2];
2415 } else {
2416 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
2417 daily_loadavg = 1.0;
2418 }
2419
2420 int rotating_auth_attempts = 0;
2421
2422 // sanity check long object name handling
2423 {
2424 hobject_t l;
2425 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2426 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2427 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2428 r = store->validate_hobject_key(l);
2429 if (r < 0) {
2430 derr << "backend (" << store->get_type() << ") is unable to support max "
2431 << "object name[space] len" << dendl;
2432 derr << " osd max object name len = "
2433 << cct->_conf->osd_max_object_name_len << dendl;
2434 derr << " osd max object namespace len = "
2435 << cct->_conf->osd_max_object_namespace_len << dendl;
2436 derr << cpp_strerror(r) << dendl;
2437 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2438 goto out;
2439 }
2440 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2441 << dendl;
2442 } else {
2443 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
2444 }
2445 }
2446
2447 // read superblock
2448 r = read_superblock();
2449 if (r < 0) {
2450 derr << "OSD::init() : unable to read osd superblock" << dendl;
2451 r = -EINVAL;
2452 goto out;
2453 }
2454
2455 if (osd_compat.compare(superblock.compat_features) < 0) {
2456 derr << "The disk uses features unsupported by the executable." << dendl;
2457 derr << " ondisk features " << superblock.compat_features << dendl;
2458 derr << " daemon features " << osd_compat << dendl;
2459
2460 if (osd_compat.writeable(superblock.compat_features)) {
2461 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2462 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2463 r = -EOPNOTSUPP;
2464 goto out;
2465 }
2466 else {
2467 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2468 derr << "Cannot write to disk! Missing features: " << diff << dendl;
2469 r = -EOPNOTSUPP;
2470 goto out;
2471 }
2472 }
2473
2474 assert_warn(whoami == superblock.whoami);
2475 if (whoami != superblock.whoami) {
2476 derr << "OSD::init: superblock says osd"
2477 << superblock.whoami << " but I am osd." << whoami << dendl;
2478 r = -EINVAL;
2479 goto out;
2480 }
2481
2482 initial = get_osd_initial_compat_set();
2483 diff = superblock.compat_features.unsupported(initial);
2484 if (superblock.compat_features.merge(initial)) {
2485 // We need to persist the new compat_set before we
2486 // do anything else
2487 dout(5) << "Upgrading superblock adding: " << diff << dendl;
2488 ObjectStore::Transaction t;
2489 write_superblock(t);
2490 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2491 if (r < 0)
2492 goto out;
2493 }
2494
2495 // make sure snap mapper object exists
2496 if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2497 dout(10) << "init creating/touching snapmapper object" << dendl;
2498 ObjectStore::Transaction t;
2499 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2500 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2501 if (r < 0)
2502 goto out;
2503 }
2504
2505 class_handler = new ClassHandler(cct);
2506 cls_initialize(class_handler);
2507
2508 if (cct->_conf->osd_open_classes_on_start) {
2509 int r = class_handler->open_all_classes();
2510 if (r)
2511 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
2512 }
2513
2514 // load up "current" osdmap
2515 assert_warn(!osdmap);
2516 if (osdmap) {
2517 derr << "OSD::init: unable to read current osdmap" << dendl;
2518 r = -EINVAL;
2519 goto out;
2520 }
2521 osdmap = get_map(superblock.current_epoch);
2522 check_osdmap_features(store);
2523
2524 create_recoverystate_perf();
2525
2526 {
2527 epoch_t bind_epoch = osdmap->get_epoch();
2528 service.set_epochs(NULL, NULL, &bind_epoch);
2529 }
2530
2531 clear_temp_objects();
2532
d2e6a577
FG
2533 // initialize osdmap references in sharded wq
2534 op_shardedwq.prune_pg_waiters(osdmap, whoami);
2535
7c673cae
FG
2536 // load up pgs (as they previously existed)
2537 load_pgs();
2538
2539 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
2540 dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
2541 op_prio_cutoff << "." << dendl;
2542
2543 create_logger();
2544
2545 // i'm ready!
2546 client_messenger->add_dispatcher_head(this);
2547 cluster_messenger->add_dispatcher_head(this);
2548
2549 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2550 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2551 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2552 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2553
2554 objecter_messenger->add_dispatcher_head(service.objecter);
2555
2556 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
2557 | CEPH_ENTITY_TYPE_MGR);
2558 r = monc->init();
2559 if (r < 0)
2560 goto out;
2561
2562 /**
2563 * FIXME: this is a placeholder implementation that unconditionally
2564 * sends every is_primary PG's stats every time we're called, unlike
2565 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2566 * This has equivalent cost to the existing worst case where all
2567 * PGs are busy and their stats are always enqueued for sending.
2568 */
2569 mgrc.set_pgstats_cb([this](){
2570 RWLock::RLocker l(map_lock);
2571
2572 utime_t had_for = ceph_clock_now() - had_map_since;
2573 osd_stat_t cur_stat = service.get_osd_stat();
2574 cur_stat.os_perf_stat = store->get_cur_stats();
2575
2576 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
2577 m->osd_stat = cur_stat;
2578
2579 Mutex::Locker lec{min_last_epoch_clean_lock};
2580 min_last_epoch_clean = osdmap->get_epoch();
2581 min_last_epoch_clean_pgs.clear();
2582 RWLock::RLocker lpg(pg_map_lock);
2583 for (const auto &i : pg_map) {
2584 PG *pg = i.second;
2585 if (!pg->is_primary()) {
2586 continue;
2587 }
2588
2589 pg->pg_stats_publish_lock.Lock();
2590 if (pg->pg_stats_publish_valid) {
2591 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
2592 const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
2593 min_last_epoch_clean = min(min_last_epoch_clean, lec);
2594 min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
2595 }
2596 pg->pg_stats_publish_lock.Unlock();
2597 }
2598
2599 return m;
2600 });
2601
2602 mgrc.init();
2603 client_messenger->add_dispatcher_head(&mgrc);
2604
2605 // tell monc about log_client so it will know about mon session resets
2606 monc->set_log_client(&log_client);
2607 update_log_config();
2608
31f18b77 2609 peering_tp.start();
7c673cae
FG
2610 osd_op_tp.start();
2611 disk_tp.start();
2612 command_tp.start();
2613
2614 set_disk_tp_priority();
2615
2616 // start the heartbeat
2617 heartbeat_thread.create("osd_srv_heartbt");
2618
2619 // tick
2620 tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
2621 {
2622 Mutex::Locker l(tick_timer_lock);
2623 tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
2624 }
2625
2626 service.init();
2627 service.publish_map(osdmap);
2628 service.publish_superblock(superblock);
2629 service.max_oldest_map = superblock.oldest_map;
2630
2631 osd_lock.Unlock();
2632
2633 r = monc->authenticate();
2634 if (r < 0) {
c07f9fc5
FG
2635 derr << __func__ << " authentication failed: " << cpp_strerror(r)
2636 << dendl;
7c673cae
FG
2637 osd_lock.Lock(); // locker is going to unlock this on function exit
2638 if (is_stopping())
c07f9fc5 2639 r = 0;
7c673cae
FG
2640 goto monout;
2641 }
2642
2643 while (monc->wait_auth_rotating(30.0) < 0) {
2644 derr << "unable to obtain rotating service keys; retrying" << dendl;
2645 ++rotating_auth_attempts;
2646 if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
c07f9fc5 2647 derr << __func__ << " wait_auth_rotating timed out" << dendl;
7c673cae
FG
2648 osd_lock.Lock(); // make locker happy
2649 if (!is_stopping()) {
c07f9fc5 2650 r = -ETIMEDOUT;
7c673cae
FG
2651 }
2652 goto monout;
2653 }
2654 }
2655
2656 r = update_crush_device_class();
2657 if (r < 0) {
d2e6a577
FG
2658 derr << __func__ << " unable to update_crush_device_class: "
2659 << cpp_strerror(r) << dendl;
7c673cae
FG
2660 osd_lock.Lock();
2661 goto monout;
2662 }
2663
2664 r = update_crush_location();
2665 if (r < 0) {
d2e6a577 2666 derr << __func__ << " unable to update_crush_location: "
c07f9fc5 2667 << cpp_strerror(r) << dendl;
7c673cae
FG
2668 osd_lock.Lock();
2669 goto monout;
2670 }
2671
2672 osd_lock.Lock();
2673 if (is_stopping())
2674 return 0;
2675
2676 // start objecter *after* we have authenticated, so that we don't ignore
2677 // the OSDMaps it requests.
2678 service.final_init();
2679
2680 check_config();
2681
2682 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
2683 consume_map();
2684 peering_wq.drain();
2685
2686 dout(0) << "done with init, starting boot process" << dendl;
2687
2688 // subscribe to any pg creations
2689 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
2690
2691 // MgrClient needs this (it doesn't have MonClient reference itself)
2692 monc->sub_want("mgrmap", 0, 0);
2693
2694 // we don't need to ask for an osdmap here; objecter will
2695 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2696
2697 monc->renew_subs();
2698
2699 start_boot();
2700
2701 return 0;
2702monout:
c07f9fc5 2703 exit(1);
7c673cae
FG
2704
2705out:
2706 enable_disable_fuse(true);
2707 store->umount();
2708 delete store;
2709 store = NULL;
2710 return r;
2711}
2712
2713void OSD::final_init()
2714{
2715 AdminSocket *admin_socket = cct->get_admin_socket();
2716 asok_hook = new OSDSocketHook(this);
2717 int r = admin_socket->register_command("status", "status", asok_hook,
2718 "high-level status of OSD");
2719 assert(r == 0);
2720 r = admin_socket->register_command("flush_journal", "flush_journal",
2721 asok_hook,
2722 "flush the journal to permanent store");
2723 assert(r == 0);
2724 r = admin_socket->register_command("dump_ops_in_flight",
c07f9fc5
FG
2725 "dump_ops_in_flight " \
2726 "name=filterstr,type=CephString,n=N,req=false",
2727 asok_hook,
7c673cae
FG
2728 "show the ops currently in flight");
2729 assert(r == 0);
2730 r = admin_socket->register_command("ops",
c07f9fc5
FG
2731 "ops " \
2732 "name=filterstr,type=CephString,n=N,req=false",
2733 asok_hook,
7c673cae
FG
2734 "show the ops currently in flight");
2735 assert(r == 0);
2736 r = admin_socket->register_command("dump_blocked_ops",
c07f9fc5
FG
2737 "dump_blocked_ops " \
2738 "name=filterstr,type=CephString,n=N,req=false",
2739 asok_hook,
7c673cae
FG
2740 "show the blocked ops currently in flight");
2741 assert(r == 0);
c07f9fc5
FG
2742 r = admin_socket->register_command("dump_historic_ops",
2743 "dump_historic_ops " \
2744 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2745 asok_hook,
2746 "show recent ops");
2747 assert(r == 0);
c07f9fc5
FG
2748 r = admin_socket->register_command("dump_historic_slow_ops",
2749 "dump_historic_slow_ops " \
2750 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2751 asok_hook,
2752 "show slowest recent ops");
2753 assert(r == 0);
c07f9fc5
FG
2754 r = admin_socket->register_command("dump_historic_ops_by_duration",
2755 "dump_historic_ops_by_duration " \
2756 "name=filterstr,type=CephString,n=N,req=false",
7c673cae
FG
2757 asok_hook,
2758 "show slowest recent ops, sorted by duration");
2759 assert(r == 0);
2760 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2761 asok_hook,
2762 "dump op priority queue state");
2763 assert(r == 0);
2764 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2765 asok_hook,
2766 "dump blacklisted clients and times");
2767 assert(r == 0);
2768 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2769 asok_hook,
2770 "show clients which have active watches,"
2771 " and on which objects");
2772 assert(r == 0);
2773 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2774 asok_hook,
2775 "show recovery reservations");
2776 assert(r == 0);
2777 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2778 asok_hook,
2779 "force osd to update the latest map from "
2780 "the mon");
2781 assert(r == 0);
2782
2783 r = admin_socket->register_command( "heap",
2784 "heap " \
2785 "name=heapcmd,type=CephString",
2786 asok_hook,
2787 "show heap usage info (available only if "
2788 "compiled with tcmalloc)");
2789 assert(r == 0);
2790
2791 r = admin_socket->register_command("set_heap_property",
2792 "set_heap_property " \
2793 "name=property,type=CephString " \
2794 "name=value,type=CephInt",
2795 asok_hook,
2796 "update malloc extension heap property");
2797 assert(r == 0);
2798
2799 r = admin_socket->register_command("get_heap_property",
2800 "get_heap_property " \
2801 "name=property,type=CephString",
2802 asok_hook,
2803 "get malloc extension heap property");
2804 assert(r == 0);
2805
2806 r = admin_socket->register_command("dump_objectstore_kv_stats",
2807 "dump_objectstore_kv_stats",
2808 asok_hook,
2809 "print statistics of kvdb which used by bluestore");
2810 assert(r == 0);
2811
2812 r = admin_socket->register_command("dump_scrubs",
2813 "dump_scrubs",
2814 asok_hook,
2815 "print scheduled scrubs");
2816 assert(r == 0);
2817
2818 r = admin_socket->register_command("calc_objectstore_db_histogram",
2819 "calc_objectstore_db_histogram",
2820 asok_hook,
2821 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2822 assert(r == 0);
2823
2824 r = admin_socket->register_command("flush_store_cache",
2825 "flush_store_cache",
2826 asok_hook,
2827 "Flush bluestore internal cache");
2828 assert(r == 0);
2829 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2830 asok_hook,
2831 "show recent state history");
2832 assert(r == 0);
2833
224ce89b
WB
2834 r = admin_socket->register_command("compact", "compact",
2835 asok_hook,
2836 "Commpact object store's omap."
2837 " WARNING: Compaction probably slows your requests");
2838 assert(r == 0);
2839
7c673cae
FG
2840 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2841 // Note: pools are CephString instead of CephPoolname because
2842 // these commands traditionally support both pool names and numbers
2843 r = admin_socket->register_command(
2844 "setomapval",
2845 "setomapval " \
2846 "name=pool,type=CephString " \
2847 "name=objname,type=CephObjectname " \
2848 "name=key,type=CephString "\
2849 "name=val,type=CephString",
2850 test_ops_hook,
2851 "set omap key");
2852 assert(r == 0);
2853 r = admin_socket->register_command(
2854 "rmomapkey",
2855 "rmomapkey " \
2856 "name=pool,type=CephString " \
2857 "name=objname,type=CephObjectname " \
2858 "name=key,type=CephString",
2859 test_ops_hook,
2860 "remove omap key");
2861 assert(r == 0);
2862 r = admin_socket->register_command(
2863 "setomapheader",
2864 "setomapheader " \
2865 "name=pool,type=CephString " \
2866 "name=objname,type=CephObjectname " \
2867 "name=header,type=CephString",
2868 test_ops_hook,
2869 "set omap header");
2870 assert(r == 0);
2871
2872 r = admin_socket->register_command(
2873 "getomap",
2874 "getomap " \
2875 "name=pool,type=CephString " \
2876 "name=objname,type=CephObjectname",
2877 test_ops_hook,
2878 "output entire object map");
2879 assert(r == 0);
2880
2881 r = admin_socket->register_command(
2882 "truncobj",
2883 "truncobj " \
2884 "name=pool,type=CephString " \
2885 "name=objname,type=CephObjectname " \
2886 "name=len,type=CephInt",
2887 test_ops_hook,
2888 "truncate object to length");
2889 assert(r == 0);
2890
2891 r = admin_socket->register_command(
2892 "injectdataerr",
2893 "injectdataerr " \
2894 "name=pool,type=CephString " \
2895 "name=objname,type=CephObjectname " \
2896 "name=shardid,type=CephInt,req=false,range=0|255",
2897 test_ops_hook,
2898 "inject data error to an object");
2899 assert(r == 0);
2900
2901 r = admin_socket->register_command(
2902 "injectmdataerr",
2903 "injectmdataerr " \
2904 "name=pool,type=CephString " \
2905 "name=objname,type=CephObjectname " \
2906 "name=shardid,type=CephInt,req=false,range=0|255",
2907 test_ops_hook,
2908 "inject metadata error to an object");
2909 assert(r == 0);
2910 r = admin_socket->register_command(
2911 "set_recovery_delay",
2912 "set_recovery_delay " \
2913 "name=utime,type=CephInt,req=false",
2914 test_ops_hook,
2915 "Delay osd recovery by specified seconds");
2916 assert(r == 0);
2917 r = admin_socket->register_command(
2918 "trigger_scrub",
2919 "trigger_scrub " \
2920 "name=pgid,type=CephString ",
2921 test_ops_hook,
2922 "Trigger a scheduled scrub ");
2923 assert(r == 0);
2924 r = admin_socket->register_command(
2925 "injectfull",
2926 "injectfull " \
2927 "name=type,type=CephString,req=false " \
2928 "name=count,type=CephInt,req=false ",
2929 test_ops_hook,
2930 "Inject a full disk (optional count times)");
2931 assert(r == 0);
2932}
2933
2934void OSD::create_logger()
2935{
2936 dout(10) << "create_logger" << dendl;
2937
2938 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
2939
2940 // Latency axis configuration for op histograms, values are in nanoseconds
2941 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
2942 "Latency (usec)",
2943 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
2944 0, ///< Start at 0
2945 100000, ///< Quantization unit is 100usec
2946 32, ///< Enough to cover much longer than slow requests
2947 };
2948
2949 // Op size axis configuration for op histograms, values are in bytes
2950 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
2951 "Request size (bytes)",
2952 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
2953 0, ///< Start at 0
2954 512, ///< Quantization unit is 512 bytes
2955 32, ///< Enough to cover requests larger than GB
2956 };
2957
2958
2959 osd_plb.add_u64(
2960 l_osd_op_wip, "op_wip",
2961 "Replication operations currently being processed (primary)");
2962 osd_plb.add_u64_counter(
2963 l_osd_op, "op",
2964 "Client operations",
2965 "ops", PerfCountersBuilder::PRIO_CRITICAL);
2966 osd_plb.add_u64_counter(
2967 l_osd_op_inb, "op_in_bytes",
2968 "Client operations total write size",
2969 "wr", PerfCountersBuilder::PRIO_INTERESTING);
2970 osd_plb.add_u64_counter(
2971 l_osd_op_outb, "op_out_bytes",
2972 "Client operations total read size",
2973 "rd", PerfCountersBuilder::PRIO_INTERESTING);
2974 osd_plb.add_time_avg(
2975 l_osd_op_lat, "op_latency",
2976 "Latency of client operations (including queue time)",
2977 "l", 9);
2978 osd_plb.add_time_avg(
2979 l_osd_op_process_lat, "op_process_latency",
2980 "Latency of client operations (excluding queue time)");
2981 osd_plb.add_time_avg(
2982 l_osd_op_prepare_lat, "op_prepare_latency",
2983 "Latency of client operations (excluding queue time and wait for finished)");
2984
2985 osd_plb.add_u64_counter(
2986 l_osd_op_r, "op_r", "Client read operations");
2987 osd_plb.add_u64_counter(
2988 l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
2989 osd_plb.add_time_avg(
2990 l_osd_op_r_lat, "op_r_latency",
2991 "Latency of read operation (including queue time)");
31f18b77 2992 osd_plb.add_u64_counter_histogram(
7c673cae
FG
2993 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
2994 op_hist_x_axis_config, op_hist_y_axis_config,
2995 "Histogram of operation latency (including queue time) + data read");
2996 osd_plb.add_time_avg(
2997 l_osd_op_r_process_lat, "op_r_process_latency",
2998 "Latency of read operation (excluding queue time)");
2999 osd_plb.add_time_avg(
3000 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3001 "Latency of read operations (excluding queue time and wait for finished)");
3002 osd_plb.add_u64_counter(
3003 l_osd_op_w, "op_w", "Client write operations");
3004 osd_plb.add_u64_counter(
3005 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3006 osd_plb.add_time_avg(
3007 l_osd_op_w_lat, "op_w_latency",
3008 "Latency of write operation (including queue time)");
31f18b77 3009 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3010 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3011 op_hist_x_axis_config, op_hist_y_axis_config,
3012 "Histogram of operation latency (including queue time) + data written");
3013 osd_plb.add_time_avg(
3014 l_osd_op_w_process_lat, "op_w_process_latency",
3015 "Latency of write operation (excluding queue time)");
3016 osd_plb.add_time_avg(
3017 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3018 "Latency of write operations (excluding queue time and wait for finished)");
3019 osd_plb.add_u64_counter(
3020 l_osd_op_rw, "op_rw",
3021 "Client read-modify-write operations");
3022 osd_plb.add_u64_counter(
3023 l_osd_op_rw_inb, "op_rw_in_bytes",
3024 "Client read-modify-write operations write in");
3025 osd_plb.add_u64_counter(
3026 l_osd_op_rw_outb,"op_rw_out_bytes",
3027 "Client read-modify-write operations read out ");
3028 osd_plb.add_time_avg(
3029 l_osd_op_rw_lat, "op_rw_latency",
3030 "Latency of read-modify-write operation (including queue time)");
31f18b77 3031 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3032 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3033 op_hist_x_axis_config, op_hist_y_axis_config,
3034 "Histogram of rw operation latency (including queue time) + data written");
31f18b77 3035 osd_plb.add_u64_counter_histogram(
7c673cae
FG
3036 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3037 op_hist_x_axis_config, op_hist_y_axis_config,
3038 "Histogram of rw operation latency (including queue time) + data read");
3039 osd_plb.add_time_avg(
3040 l_osd_op_rw_process_lat, "op_rw_process_latency",
3041 "Latency of read-modify-write operation (excluding queue time)");
3042 osd_plb.add_time_avg(
3043 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3044 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3045
224ce89b
WB
3046 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3047 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3048 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3049 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
3050
7c673cae
FG
3051 osd_plb.add_u64_counter(
3052 l_osd_sop, "subop", "Suboperations");
3053 osd_plb.add_u64_counter(
3054 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
3055 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3056
3057 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3058 osd_plb.add_u64_counter(
3059 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
3060 osd_plb.add_time_avg(
3061 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3062 osd_plb.add_u64_counter(
3063 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3064 osd_plb.add_time_avg(
3065 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3066 osd_plb.add_u64_counter(
3067 l_osd_sop_push, "subop_push", "Suboperations push messages");
3068 osd_plb.add_u64_counter(
3069 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
3070 osd_plb.add_time_avg(
3071 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
3072
3073 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3074 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3075 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
3076
3077 osd_plb.add_u64_counter(
3078 l_osd_rop, "recovery_ops",
3079 "Started recovery operations",
3080 "rop", PerfCountersBuilder::PRIO_INTERESTING);
3081
3082 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3083 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
3084 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
3085 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3086 osd_plb.add_u64(
3087 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3088 osd_plb.add_u64(
3089 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3090 "Total number getting crc from crc_cache with adjusting");
3091 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3092 "Total number of crc cache misses");
3093
3094 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3095 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3096 osd_plb.add_u64(
3097 l_osd_pg_primary, "numpg_primary",
3098 "Placement groups for which this osd is primary");
3099 osd_plb.add_u64(
3100 l_osd_pg_replica, "numpg_replica",
3101 "Placement groups for which this osd is replica");
3102 osd_plb.add_u64(
3103 l_osd_pg_stray, "numpg_stray",
3104 "Placement groups ready to be deleted from this osd");
3105 osd_plb.add_u64(
3106 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
3107 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3108 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3109 osd_plb.add_u64_counter(
3110 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3111 osd_plb.add_u64_counter(
3112 l_osd_waiting_for_map, "messages_delayed_for_map",
3113 "Operations waiting for OSD map");
31f18b77 3114
7c673cae
FG
3115 osd_plb.add_u64_counter(
3116 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3117 osd_plb.add_u64_counter(
3118 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3119 osd_plb.add_u64_counter(
3120 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3121 "osdmap cache miss below cache lower bound");
3122 osd_plb.add_u64_avg(
3123 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3124 "osdmap cache miss, avg distance below cache lower bound");
31f18b77
FG
3125 osd_plb.add_u64_counter(
3126 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3127 "OSDMap buffer cache hits");
3128 osd_plb.add_u64_counter(
3129 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3130 "OSDMap buffer cache misses");
7c673cae
FG
3131
3132 osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
3133 osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
3134 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
3135
3136 osd_plb.add_u64_counter(
3137 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
3138
3139 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3140 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3141 osd_plb.add_u64_counter(
3142 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3143 osd_plb.add_u64_counter(
3144 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3145 osd_plb.add_u64_counter(
3146 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3147 "Failed tier flush attempts");
3148 osd_plb.add_u64_counter(
3149 l_osd_tier_evict, "tier_evict", "Tier evictions");
3150 osd_plb.add_u64_counter(
3151 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3152 osd_plb.add_u64_counter(
3153 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3154 osd_plb.add_u64_counter(
3155 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3156 osd_plb.add_u64_counter(
3157 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3158 osd_plb.add_u64_counter(
3159 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3160 osd_plb.add_u64_counter(
3161 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3162
3163 osd_plb.add_u64_counter(
3164 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3165 osd_plb.add_u64_counter(
3166 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3167 osd_plb.add_u64_counter(
3168 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3169 osd_plb.add_u64_counter(
3170 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
3171
3172 osd_plb.add_u64_counter(
3173 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3174 osd_plb.add_u64_counter(
3175 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3176
3177 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3178 osd_plb.add_time_avg(
3179 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3180 osd_plb.add_time_avg(
3181 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3182 osd_plb.add_time_avg(
3183 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
3184
3185 osd_plb.add_u64_counter(
3186 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3187 osd_plb.add_u64_counter(
3188 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3189 "PG updated its info using fastinfo attr");
3190 osd_plb.add_u64_counter(
3191 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
3192
3193 logger = osd_plb.create_perf_counters();
3194 cct->get_perfcounters_collection()->add(logger);
3195}
3196
3197void OSD::create_recoverystate_perf()
3198{
3199 dout(10) << "create_recoverystate_perf" << dendl;
3200
3201 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3202
3203 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3204 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3205 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3206 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3207 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3208 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3209 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3210 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3211 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3212 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3213 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3214 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3215 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3216 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3217 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3218 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3219 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3220 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3221 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3222 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3223 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3224 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3225 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3226 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3227 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3228 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3229 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3230 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3231 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3232 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3233 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3234
3235 recoverystate_perf = rs_perf.create_perf_counters();
3236 cct->get_perfcounters_collection()->add(recoverystate_perf);
3237}
3238
3239int OSD::shutdown()
3240{
3241 if (!service.prepare_to_stop())
3242 return 0; // already shutting down
3243 osd_lock.Lock();
3244 if (is_stopping()) {
3245 osd_lock.Unlock();
3246 return 0;
3247 }
3248 derr << "shutdown" << dendl;
3249
3250 set_state(STATE_STOPPING);
3251
3252 // Debugging
3253 cct->_conf->set_val("debug_osd", "100");
3254 cct->_conf->set_val("debug_journal", "100");
3255 cct->_conf->set_val("debug_filestore", "100");
3256 cct->_conf->set_val("debug_ms", "100");
3257 cct->_conf->apply_changes(NULL);
3258
3259 // stop MgrClient earlier as it's more like an internal consumer of OSD
3260 mgrc.shutdown();
3261
3262 service.start_shutdown();
3263
3264 // stop sending work to pgs. this just prevents any new work in _process
3265 // from racing with on_shutdown and potentially entering the pg after.
3266 op_shardedwq.drain();
3267
3268 // Shutdown PGs
3269 {
3270 RWLock::RLocker l(pg_map_lock);
3271 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3272 p != pg_map.end();
3273 ++p) {
3274 dout(20) << " kicking pg " << p->first << dendl;
3275 p->second->lock();
3276 p->second->on_shutdown();
3277 p->second->unlock();
3278 p->second->osr->flush();
3279 }
3280 }
3281 clear_pg_stat_queue();
3282
3283 // drain op queue again (in case PGs requeued something)
3284 op_shardedwq.drain();
3285 {
3286 finished.clear(); // zap waiters (bleh, this is messy)
3287 }
3288
3289 op_shardedwq.clear_pg_slots();
3290
3291 // unregister commands
3292 cct->get_admin_socket()->unregister_command("status");
3293 cct->get_admin_socket()->unregister_command("flush_journal");
3294 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3295 cct->get_admin_socket()->unregister_command("ops");
3296 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3297 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3298 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3299 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3300 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3301 cct->get_admin_socket()->unregister_command("dump_blacklist");
3302 cct->get_admin_socket()->unregister_command("dump_watchers");
3303 cct->get_admin_socket()->unregister_command("dump_reservations");
3304 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
224ce89b 3305 cct->get_admin_socket()->unregister_command("heap");
7c673cae
FG
3306 cct->get_admin_socket()->unregister_command("set_heap_property");
3307 cct->get_admin_socket()->unregister_command("get_heap_property");
3308 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
224ce89b 3309 cct->get_admin_socket()->unregister_command("dump_scrubs");
7c673cae
FG
3310 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3311 cct->get_admin_socket()->unregister_command("flush_store_cache");
3312 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
224ce89b 3313 cct->get_admin_socket()->unregister_command("compact");
7c673cae
FG
3314 delete asok_hook;
3315 asok_hook = NULL;
3316
3317 cct->get_admin_socket()->unregister_command("setomapval");
3318 cct->get_admin_socket()->unregister_command("rmomapkey");
3319 cct->get_admin_socket()->unregister_command("setomapheader");
3320 cct->get_admin_socket()->unregister_command("getomap");
3321 cct->get_admin_socket()->unregister_command("truncobj");
3322 cct->get_admin_socket()->unregister_command("injectdataerr");
3323 cct->get_admin_socket()->unregister_command("injectmdataerr");
3324 cct->get_admin_socket()->unregister_command("set_recovery_delay");
224ce89b
WB
3325 cct->get_admin_socket()->unregister_command("trigger_scrub");
3326 cct->get_admin_socket()->unregister_command("injectfull");
7c673cae
FG
3327 delete test_ops_hook;
3328 test_ops_hook = NULL;
3329
3330 osd_lock.Unlock();
3331
3332 heartbeat_lock.Lock();
3333 heartbeat_stop = true;
3334 heartbeat_cond.Signal();
3335 heartbeat_lock.Unlock();
3336 heartbeat_thread.join();
3337
31f18b77 3338 peering_tp.drain();
7c673cae 3339 peering_wq.clear();
31f18b77 3340 peering_tp.stop();
7c673cae
FG
3341 dout(10) << "osd tp stopped" << dendl;
3342
3343 osd_op_tp.drain();
3344 osd_op_tp.stop();
3345 dout(10) << "op sharded tp stopped" << dendl;
3346
3347 command_tp.drain();
3348 command_tp.stop();
3349 dout(10) << "command tp stopped" << dendl;
3350
3351 disk_tp.drain();
3352 disk_tp.stop();
3353 dout(10) << "disk tp paused (new)" << dendl;
3354
3355 dout(10) << "stopping agent" << dendl;
3356 service.agent_stop();
3357
3358 osd_lock.Lock();
3359
3360 reset_heartbeat_peers();
3361
3362 tick_timer.shutdown();
3363
3364 {
3365 Mutex::Locker l(tick_timer_lock);
3366 tick_timer_without_osd_lock.shutdown();
3367 }
3368
3369 // note unmount epoch
3370 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3371 superblock.mounted = service.get_boot_epoch();
3372 superblock.clean_thru = osdmap->get_epoch();
3373 ObjectStore::Transaction t;
3374 write_superblock(t);
3375 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3376 if (r) {
3377 derr << "OSD::shutdown: error writing superblock: "
3378 << cpp_strerror(r) << dendl;
3379 }
3380
3381
3382 {
3383 Mutex::Locker l(pg_stat_queue_lock);
3384 assert(pg_stat_queue.empty());
3385 }
3386
31f18b77
FG
3387 service.shutdown_reserver();
3388
7c673cae
FG
3389 // Remove PGs
3390#ifdef PG_DEBUG_REFS
3391 service.dump_live_pgids();
3392#endif
3393 {
3394 RWLock::RLocker l(pg_map_lock);
3395 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3396 p != pg_map.end();
3397 ++p) {
3398 dout(20) << " kicking pg " << p->first << dendl;
3399 p->second->lock();
3400 if (p->second->ref != 1) {
3401 derr << "pgid " << p->first << " has ref count of "
3402 << p->second->ref << dendl;
3403#ifdef PG_DEBUG_REFS
3404 p->second->dump_live_ids();
3405#endif
31f18b77
FG
3406 if (cct->_conf->osd_shutdown_pgref_assert) {
3407 ceph_abort();
3408 }
7c673cae
FG
3409 }
3410 p->second->unlock();
3411 p->second->put("PGMap");
3412 }
3413 pg_map.clear();
3414 }
3415#ifdef PG_DEBUG_REFS
3416 service.dump_live_pgids();
3417#endif
3418 cct->_conf->remove_observer(this);
3419
3420 dout(10) << "syncing store" << dendl;
3421 enable_disable_fuse(true);
3422
3423 if (cct->_conf->osd_journal_flush_on_shutdown) {
3424 dout(10) << "flushing journal" << dendl;
3425 store->flush_journal();
3426 }
3427
3428 store->umount();
3429 delete store;
3430 store = 0;
3431 dout(10) << "Store synced" << dendl;
3432
3433 monc->shutdown();
3434 osd_lock.Unlock();
3435
3436 osdmap = OSDMapRef();
3437 service.shutdown();
3438 op_tracker.on_shutdown();
3439
3440 class_handler->shutdown();
3441 client_messenger->shutdown();
3442 cluster_messenger->shutdown();
3443 hb_front_client_messenger->shutdown();
3444 hb_back_client_messenger->shutdown();
3445 objecter_messenger->shutdown();
3446 hb_front_server_messenger->shutdown();
3447 hb_back_server_messenger->shutdown();
3448
3449 peering_wq.clear();
3450
3451 return r;
3452}
3453
3454int OSD::mon_cmd_maybe_osd_create(string &cmd)
3455{
3456 bool created = false;
3457 while (true) {
3458 dout(10) << __func__ << " cmd: " << cmd << dendl;
3459 vector<string> vcmd{cmd};
3460 bufferlist inbl;
3461 C_SaferCond w;
3462 string outs;
3463 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3464 int r = w.wait();
3465 if (r < 0) {
3466 if (r == -ENOENT && !created) {
3467 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3468 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3469 vector<string> vnewcmd{newcmd};
3470 bufferlist inbl;
3471 C_SaferCond w;
3472 string outs;
3473 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3474 int r = w.wait();
3475 if (r < 0) {
3476 derr << __func__ << " fail: osd does not exist and created failed: "
3477 << cpp_strerror(r) << dendl;
3478 return r;
3479 }
3480 created = true;
3481 continue;
3482 }
3483 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3484 return r;
3485 }
3486 break;
3487 }
3488
3489 return 0;
3490}
3491
3492int OSD::update_crush_location()
3493{
3494 if (!cct->_conf->osd_crush_update_on_start) {
3495 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3496 return 0;
3497 }
3498
3499 char weight[32];
3500 if (cct->_conf->osd_crush_initial_weight >= 0) {
3501 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3502 } else {
3503 struct store_statfs_t st;
3504 int r = store->statfs(&st);
3505 if (r < 0) {
3506 derr << "statfs: " << cpp_strerror(r) << dendl;
3507 return r;
3508 }
3509 snprintf(weight, sizeof(weight), "%.4lf",
3510 MAX((double).00001,
3511 (double)(st.total) /
3512 (double)(1ull << 40 /* TB */)));
3513 }
3514
3515 std::multimap<string,string> loc = cct->crush_location.get_location();
3516 dout(10) << __func__ << " crush location is " << loc << dendl;
3517
3518 string cmd =
3519 string("{\"prefix\": \"osd crush create-or-move\", ") +
3520 string("\"id\": ") + stringify(whoami) + string(", ") +
3521 string("\"weight\":") + weight + string(", ") +
3522 string("\"args\": [");
3523 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3524 if (p != loc.begin())
3525 cmd += ", ";
3526 cmd += "\"" + p->first + "=" + p->second + "\"";
3527 }
3528 cmd += "]}";
3529
3530 return mon_cmd_maybe_osd_create(cmd);
3531}
3532
3533int OSD::update_crush_device_class()
3534{
224ce89b
WB
3535 if (!cct->_conf->osd_class_update_on_start) {
3536 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
3537 return 0;
3538 }
3539
7c673cae
FG
3540 string device_class;
3541 int r = store->read_meta("crush_device_class", &device_class);
224ce89b
WB
3542 if (r < 0 || device_class.empty()) {
3543 device_class = store->get_default_device_class();
3544 }
3545
3546 if (device_class.empty()) {
d2e6a577 3547 dout(20) << __func__ << " no device class stored locally" << dendl;
7c673cae 3548 return 0;
224ce89b 3549 }
7c673cae
FG
3550
3551 string cmd =
3552 string("{\"prefix\": \"osd crush set-device-class\", ") +
224ce89b
WB
3553 string("\"class\": \"") + device_class + string("\", ") +
3554 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
7c673cae 3555
224ce89b 3556 r = mon_cmd_maybe_osd_create(cmd);
d2e6a577
FG
3557 // the above cmd can fail for various reasons, e.g.:
3558 // (1) we are connecting to a pre-luminous monitor
3559 // (2) user manually specify a class other than
3560 // 'ceph-disk prepare --crush-device-class'
3561 // simply skip result-checking for now
3562 return 0;
7c673cae
FG
3563}
3564
3565void OSD::write_superblock(ObjectStore::Transaction& t)
3566{
3567 dout(10) << "write_superblock " << superblock << dendl;
3568
3569 //hack: at minimum it's using the baseline feature set
3570 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3571 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3572
3573 bufferlist bl;
3574 ::encode(superblock, bl);
3575 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3576}
3577
3578int OSD::read_superblock()
3579{
3580 bufferlist bl;
3581 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3582 if (r < 0)
3583 return r;
3584
3585 bufferlist::iterator p = bl.begin();
3586 ::decode(superblock, p);
3587
3588 dout(10) << "read_superblock " << superblock << dendl;
3589
3590 return 0;
3591}
3592
3593void OSD::clear_temp_objects()
3594{
3595 dout(10) << __func__ << dendl;
3596 vector<coll_t> ls;
3597 store->list_collections(ls);
3598 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
3599 spg_t pgid;
3600 if (!p->is_pg(&pgid))
3601 continue;
3602
3603 // list temp objects
3604 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
3605
3606 vector<ghobject_t> temps;
3607 ghobject_t next;
3608 while (1) {
3609 vector<ghobject_t> objects;
3610 store->collection_list(*p, next, ghobject_t::get_max(),
3611 store->get_ideal_list_max(),
3612 &objects, &next);
3613 if (objects.empty())
3614 break;
3615 vector<ghobject_t>::iterator q;
3616 for (q = objects.begin(); q != objects.end(); ++q) {
3617 // Hammer set pool for temps to -1, so check for clean-up
3618 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
3619 temps.push_back(*q);
3620 } else {
3621 break;
3622 }
3623 }
3624 // If we saw a non-temp object and hit the break above we can
3625 // break out of the while loop too.
3626 if (q != objects.end())
3627 break;
3628 }
3629 if (!temps.empty()) {
3630 ObjectStore::Transaction t;
3631 int removed = 0;
3632 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
3633 dout(20) << " removing " << *p << " object " << *q << dendl;
3634 t.remove(*p, *q);
3635 if (++removed > cct->_conf->osd_target_transaction_size) {
3636 store->apply_transaction(service.meta_osr.get(), std::move(t));
3637 t = ObjectStore::Transaction();
3638 removed = 0;
3639 }
3640 }
3641 if (removed) {
3642 store->apply_transaction(service.meta_osr.get(), std::move(t));
3643 }
3644 }
3645 }
3646}
3647
3648void OSD::recursive_remove_collection(CephContext* cct,
3649 ObjectStore *store, spg_t pgid,
3650 coll_t tmp)
3651{
3652 OSDriver driver(
3653 store,
3654 coll_t(),
3655 make_snapmapper_oid());
3656
3657 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3658 ObjectStore::Sequencer>("rm"));
3659 ObjectStore::Transaction t;
3660 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
3661
3662 vector<ghobject_t> objects;
3663 store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
3664 INT_MAX, &objects, 0);
3665 generic_dout(10) << __func__ << " " << objects << dendl;
3666 // delete them.
3667 int removed = 0;
3668 for (vector<ghobject_t>::iterator p = objects.begin();
3669 p != objects.end();
3670 ++p, removed++) {
3671 OSDriver::OSTransaction _t(driver.get_transaction(&t));
3672 int r = mapper.remove_oid(p->hobj, &_t);
3673 if (r != 0 && r != -ENOENT)
3674 ceph_abort();
3675 t.remove(tmp, *p);
3676 if (removed > cct->_conf->osd_target_transaction_size) {
3677 int r = store->apply_transaction(osr.get(), std::move(t));
3678 assert(r == 0);
3679 t = ObjectStore::Transaction();
3680 removed = 0;
3681 }
3682 }
3683 t.remove_collection(tmp);
3684 int r = store->apply_transaction(osr.get(), std::move(t));
3685 assert(r == 0);
3686
3687 C_SaferCond waiter;
3688 if (!osr->flush_commit(&waiter)) {
3689 waiter.wait();
3690 }
3691}
3692
3693
3694// ======================================================
3695// PG's
3696
3697PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3698{
3699 if (!createmap->have_pg_pool(id)) {
3700 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3701 << id << dendl;
3702 ceph_abort();
3703 }
3704
3705 PGPool p = PGPool(cct, createmap, id);
3706
3707 dout(10) << "_get_pool " << p.id << dendl;
3708 return p;
3709}
3710
3711PG *OSD::_open_lock_pg(
3712 OSDMapRef createmap,
3713 spg_t pgid, bool no_lockdep_check)
3714{
3715 assert(osd_lock.is_locked());
3716
3717 PG* pg = _make_pg(createmap, pgid);
3718 {
3719 RWLock::WLocker l(pg_map_lock);
3720 pg->lock(no_lockdep_check);
3721 pg_map[pgid] = pg;
3722 pg->get("PGMap"); // because it's in pg_map
3723 service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
3724 }
3725 return pg;
3726}
3727
3728PG* OSD::_make_pg(
3729 OSDMapRef createmap,
3730 spg_t pgid)
3731{
3732 dout(10) << "_open_lock_pg " << pgid << dendl;
3733 PGPool pool = _get_pool(pgid.pool(), createmap);
3734
3735 // create
3736 PG *pg;
3737 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3738 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3739 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3740 else
3741 ceph_abort();
3742
3743 return pg;
3744}
3745
3746
3747void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
3748{
3749 epoch_t e(service.get_osdmap()->get_epoch());
3750 pg->get("PGMap"); // For pg_map
3751 pg_map[pg->info.pgid] = pg;
3752 service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
3753
3754 dout(10) << "Adding newly split pg " << *pg << dendl;
3755 pg->handle_loaded(rctx);
3756 pg->write_if_dirty(*(rctx->transaction));
3757 pg->queue_null(e, e);
3758 map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
3759 peering_wait_for_split.find(pg->info.pgid);
3760 if (to_wake != peering_wait_for_split.end()) {
3761 for (list<PG::CephPeeringEvtRef>::iterator i =
3762 to_wake->second.begin();
3763 i != to_wake->second.end();
3764 ++i) {
3765 pg->queue_peering_event(*i);
3766 }
3767 peering_wait_for_split.erase(to_wake);
3768 }
3769 if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
3770 _remove_pg(pg);
3771}
3772
3773OSD::res_result OSD::_try_resurrect_pg(
3774 OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
3775{
3776 assert(resurrected);
3777 assert(old_pg_state);
3778 // find nearest ancestor
3779 DeletingStateRef df;
3780 spg_t cur(pgid);
3781 while (true) {
3782 df = service.deleting_pgs.lookup(cur);
3783 if (df)
3784 break;
3785 if (!cur.ps())
3786 break;
3787 cur = cur.get_parent();
3788 }
3789 if (!df)
3790 return RES_NONE; // good to go
3791
3792 df->old_pg_state->lock();
3793 OSDMapRef create_map = df->old_pg_state->get_osdmap();
3794 df->old_pg_state->unlock();
3795
3796 set<spg_t> children;
3797 if (cur == pgid) {
3798 if (df->try_stop_deletion()) {
3799 dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
3800 *resurrected = cur;
3801 *old_pg_state = df->old_pg_state;
3802 service.deleting_pgs.remove(pgid); // PG is no longer being removed!
3803 return RES_SELF;
3804 } else {
3805 // raced, ensure we don't see DeletingStateRef when we try to
3806 // delete this pg
3807 service.deleting_pgs.remove(pgid);
3808 return RES_NONE;
3809 }
3810 } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
3811 curmap->get_pg_num(cur.pool()),
3812 &children) &&
3813 children.count(pgid)) {
3814 if (df->try_stop_deletion()) {
3815 dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
3816 << dendl;
3817 *resurrected = cur;
3818 *old_pg_state = df->old_pg_state;
3819 service.deleting_pgs.remove(cur); // PG is no longer being removed!
3820 return RES_PARENT;
3821 } else {
3822 /* this is not a problem, failing to cancel proves that all objects
3823 * have been removed, so no hobject_t overlap is possible
3824 */
3825 return RES_NONE;
3826 }
3827 }
3828 return RES_NONE;
3829}
3830
3831PG *OSD::_create_lock_pg(
3832 OSDMapRef createmap,
3833 spg_t pgid,
3834 bool hold_map_lock,
3835 bool backfill,
3836 int role,
3837 vector<int>& up, int up_primary,
3838 vector<int>& acting, int acting_primary,
3839 pg_history_t history,
3840 const PastIntervals& pi,
3841 ObjectStore::Transaction& t)
3842{
3843 assert(osd_lock.is_locked());
3844 dout(20) << "_create_lock_pg pgid " << pgid << dendl;
3845
3846 PG *pg = _open_lock_pg(createmap, pgid, true);
3847
3848 service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
3849
3850 pg->init(
3851 role,
3852 up,
3853 up_primary,
3854 acting,
3855 acting_primary,
3856 history,
3857 pi,
3858 backfill,
3859 &t);
3860
3861 dout(7) << "_create_lock_pg " << *pg << dendl;
3862 return pg;
3863}
3864
3865PG *OSD::_lookup_lock_pg(spg_t pgid)
3866{
3867 RWLock::RLocker l(pg_map_lock);
3868
3869 auto pg_map_entry = pg_map.find(pgid);
3870 if (pg_map_entry == pg_map.end())
3871 return nullptr;
3872 PG *pg = pg_map_entry->second;
3873 pg->lock();
3874 return pg;
3875}
3876
31f18b77
FG
3877PG *OSD::lookup_lock_pg(spg_t pgid)
3878{
3879 return _lookup_lock_pg(pgid);
3880}
3881
7c673cae
FG
3882PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3883{
3884 assert(pg_map.count(pgid));
3885 PG *pg = pg_map[pgid];
3886 pg->lock();
3887 return pg;
3888}
3889
3890void OSD::load_pgs()
3891{
3892 assert(osd_lock.is_locked());
3893 dout(0) << "load_pgs" << dendl;
3894 {
3895 RWLock::RLocker l(pg_map_lock);
3896 assert(pg_map.empty());
3897 }
3898
3899 vector<coll_t> ls;
3900 int r = store->list_collections(ls);
3901 if (r < 0) {
3902 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
3903 }
3904
3905 bool has_upgraded = false;
3906
3907 for (vector<coll_t>::iterator it = ls.begin();
3908 it != ls.end();
3909 ++it) {
3910 spg_t pgid;
3911 if (it->is_temp(&pgid) ||
3912 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
3913 dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
3914 recursive_remove_collection(cct, store, pgid, *it);
3915 continue;
3916 }
3917
3918 if (!it->is_pg(&pgid)) {
3919 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
3920 continue;
3921 }
3922
3923 if (pgid.preferred() >= 0) {
3924 dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
3925 // FIXME: delete it too, eventually
3926 continue;
3927 }
3928
3929 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
3930 bufferlist bl;
3931 epoch_t map_epoch = 0;
3932 int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
3933 if (r < 0) {
3934 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
3935 << dendl;
3936 continue;
3937 }
3938
3939 PG *pg = NULL;
3940 if (map_epoch > 0) {
3941 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
3942 if (!pgosdmap) {
3943 if (!osdmap->have_pg_pool(pgid.pool())) {
3944 derr << __func__ << ": could not find map for epoch " << map_epoch
3945 << " on pg " << pgid << ", but the pool is not present in the "
3946 << "current map, so this is probably a result of bug 10617. "
3947 << "Skipping the pg for now, you can use ceph-objectstore-tool "
3948 << "to clean it up later." << dendl;
3949 continue;
3950 } else {
3951 derr << __func__ << ": have pgid " << pgid << " at epoch "
3952 << map_epoch << ", but missing map. Crashing."
3953 << dendl;
3954 assert(0 == "Missing map in load_pgs");
3955 }
3956 }
3957 pg = _open_lock_pg(pgosdmap, pgid);
3958 } else {
3959 pg = _open_lock_pg(osdmap, pgid);
3960 }
3961 // there can be no waiters here, so we don't call wake_pg_waiters
3962
3963 pg->ch = store->open_collection(pg->coll);
3964
3965 // read pg state, log
3966 pg->read_state(store, bl);
3967
3968 if (pg->must_upgrade()) {
3969 if (!pg->can_upgrade()) {
3970 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
3971 << " an older version first." << dendl;
3972 assert(0 == "PG too old to upgrade");
3973 }
3974 if (!has_upgraded) {
3975 derr << "PGs are upgrading" << dendl;
3976 has_upgraded = true;
3977 }
3978 dout(10) << "PG " << pg->info.pgid
3979 << " must upgrade..." << dendl;
3980 pg->upgrade(store);
3981 }
3982
3983 service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
3984
3985 // generate state for PG's current mapping
3986 int primary, up_primary;
3987 vector<int> acting, up;
3988 pg->get_osdmap()->pg_to_up_acting_osds(
3989 pgid.pgid, &up, &up_primary, &acting, &primary);
3990 pg->init_primary_up_acting(
3991 up,
3992 acting,
3993 up_primary,
3994 primary);
3995 int role = OSDMap::calc_pg_role(whoami, pg->acting);
3996 if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
3997 pg->set_role(role);
3998 else
3999 pg->set_role(-1);
4000
4001 pg->reg_next_scrub();
4002
4003 PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
4004 pg->handle_loaded(&rctx);
4005
4006 dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
4007 if (pg->pg_log.is_dirty()) {
4008 ObjectStore::Transaction t;
4009 pg->write_if_dirty(t);
4010 store->apply_transaction(pg->osr.get(), std::move(t));
4011 }
4012 pg->unlock();
4013 }
4014 {
4015 RWLock::RLocker l(pg_map_lock);
4016 dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
4017 }
4018
4019 // clean up old infos object?
4020 if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
4021 dout(1) << __func__ << " removing legacy infos object" << dendl;
4022 ObjectStore::Transaction t;
4023 t.remove(coll_t::meta(), OSD::make_infos_oid());
4024 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
4025 if (r != 0) {
4026 derr << __func__ << ": apply_transaction returned "
4027 << cpp_strerror(r) << dendl;
4028 ceph_abort();
4029 }
4030 }
4031
4032 build_past_intervals_parallel();
4033}
4034
4035
4036/*
4037 * build past_intervals efficiently on old, degraded, and buried
4038 * clusters. this is important for efficiently catching up osds that
4039 * are way behind on maps to the current cluster state.
4040 *
4041 * this is a parallel version of PG::generate_past_intervals().
4042 * follow the same logic, but do all pgs at the same time so that we
4043 * can make a single pass across the osdmap history.
4044 */
// Rebuild missing past_intervals for all loaded PGs in one sweep over the
// stored osdmap epochs, then persist the updated PG info.
4045void OSD::build_past_intervals_parallel()
4046{
// Per-PG working state: [start, end] is the epoch range this PG needs
// covered; the remaining fields hold the up/acting sets and primaries as of
// the epoch at which the interval currently being tracked began.
4047 struct pistate {
4048 epoch_t start, end;
4049 vector<int> old_acting, old_up;
4050 epoch_t same_interval_since;
4051 int primary;
4052 int up_primary;
4053 };
4054 map<PG*,pistate> pis;
4055
4056 // calculate junction of map range
// end_epoch starts at the oldest stored map and is only raised; cur_epoch
// starts at the newest and is only lowered, so after the loop
// [cur_epoch, end_epoch] spans every PG's required range.
4057 epoch_t end_epoch = superblock.oldest_map;
4058 epoch_t cur_epoch = superblock.newest_map;
4059 {
4060 RWLock::RLocker l(pg_map_lock);
4061 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4062 i != pg_map.end();
4063 ++i) {
4064 PG *pg = i->second;
4065
4066 auto rpib = pg->get_required_past_interval_bounds(
4067 pg->info,
4068 superblock.oldest_map);
// Nothing to build for this PG: the required range is empty and no
// intervals are stored.
4069 if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
4070 if (pg->info.history.same_interval_since == 0) {
4071 pg->info.history.same_interval_since = rpib.second;
4072 }
4073 continue;
4074 } else {
// Also nothing to build if the stored bounds already enclose the required
// ones.
4075 auto apib = pg->past_intervals.get_bounds();
4076 if (apib.second >= rpib.second &&
4077 apib.first <= rpib.first) {
4078 if (pg->info.history.same_interval_since == 0) {
4079 pg->info.history.same_interval_since = rpib.second;
4080 }
4081 continue;
4082 }
4083 }
4084
4085 dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
4086 << rpib.second << dendl;
// Only start/end/same_interval_since are set here; the other pistate
// fields are filled in at the first epoch processed for this PG below.
4087 pistate& p = pis[pg];
4088 p.start = rpib.first;
4089 p.end = rpib.second;
4090 p.same_interval_since = 0;
4091
4092 if (rpib.first < cur_epoch)
4093 cur_epoch = rpib.first;
4094 if (rpib.second > end_epoch)
4095 end_epoch = rpib.second;
4096 }
4097 }
4098 if (pis.empty()) {
4099 dout(10) << __func__ << " nothing to build" << dendl;
4100 return;
4101 }
4102
4103 dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
4104 assert(cur_epoch <= end_epoch);
4105
// Single pass over the epoch range; each map is fetched once and applied
// to every PG whose [start, end] includes that epoch.
4106 OSDMapRef cur_map, last_map;
4107 for ( ; cur_epoch <= end_epoch; cur_epoch++) {
4108 dout(10) << __func__ << " epoch " << cur_epoch << dendl;
4109 last_map = cur_map;
4110 cur_map = get_map(cur_epoch);
4111
4112 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4113 PG *pg = i->first;
4114 pistate& p = i->second;
4115
4116 if (cur_epoch < p.start || cur_epoch > p.end)
4117 continue;
4118
4119 vector<int> acting, up;
4120 int up_primary;
4121 int primary;
4122 pg_t pgid = pg->info.pgid.pgid;
// Once this PG has been seeded (same_interval_since != 0) and the previous
// map contains the pool, compute the mapping for the PG's ancestor under
// the previous map's pg_num rather than for the PG itself.
4123 if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
4124 pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
4125 cur_map->pg_to_up_acting_osds(
4126 pgid, &up, &up_primary, &acting, &primary);
4127
// First epoch seen for this PG: just record the mapping as the baseline.
4128 if (p.same_interval_since == 0) {
4129 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4130 << " first map, acting " << acting
4131 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
4132 p.same_interval_since = cur_epoch;
4133 p.old_up = up;
4134 p.old_acting = acting;
4135 p.primary = primary;
4136 p.up_primary = up_primary;
4137 continue;
4138 }
4139 assert(last_map);
4140
// NOTE(review): check_new_interval is handed &pg->past_intervals, so it
// presumably records the closed interval there when it returns true —
// confirm against PastIntervals::check_new_interval.
4141 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
4142 pg->get_is_recoverable_predicate());
4143 std::stringstream debug;
4144 bool new_interval = PastIntervals::check_new_interval(
4145 p.primary,
4146 primary,
4147 p.old_acting, acting,
4148 p.up_primary,
4149 up_primary,
4150 p.old_up, up,
4151 p.same_interval_since,
4152 pg->info.history.last_epoch_clean,
4153 cur_map, last_map,
4154 pgid,
4155 recoverable.get(),
4156 &pg->past_intervals,
4157 &debug);
// A new interval started at cur_epoch: make the current mapping the new
// baseline.
4158 if (new_interval) {
4159 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4160 << " " << debug.str() << dendl;
4161 p.old_up = up;
4162 p.old_acting = acting;
4163 p.primary = primary;
4164 p.up_primary = up_primary;
4165 p.same_interval_since = cur_epoch;
4166 }
4167 }
4168 }
4169
4170 // Now that past_intervals have been recomputed let's fix the same_interval_since
4171 // if it was cleared by import.
4172 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4173 PG *pg = i->first;
4174 pistate& p = i->second;
4175
4176 if (pg->info.history.same_interval_since == 0) {
4177 assert(p.same_interval_since);
4178 dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
4179 dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
4180 // Fix it
4181 pg->info.history.same_interval_since = p.same_interval_since;
4182 }
4183 }
4184
4185 // write info only at the end. this is necessary because we check
4186 // whether the past_intervals go far enough back or forward in time,
4187 // but we don't check for holes. we could avoid it by discarding
4188 // the previous past_intervals and rebuilding from scratch, or we
4189 // can just do this and commit all our work at the end.
// Each rebuilt PG is marked dirty and written under its own lock; the
// transaction is applied every osd_target_transaction_size PGs.
4190 ObjectStore::Transaction t;
4191 int num = 0;
4192 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4193 PG *pg = i->first;
4194 pg->lock();
4195 pg->dirty_big_info = true;
4196 pg->dirty_info = true;
4197 pg->write_if_dirty(t);
4198 pg->unlock();
4199
4200 // don't let the transaction get too big
4201 if (++num >= cct->_conf->osd_target_transaction_size) {
4202 store->apply_transaction(service.meta_osr.get(), std::move(t));
4203 t = ObjectStore::Transaction();
4204 num = 0;
4205 }
4206 }
// Apply whatever is left over from the last partial batch.
4207 if (!t.empty())
4208 store->apply_transaction(service.meta_osr.get(), std::move(t));
4209}
4210
4211/*
4212 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4213 * hasn't changed since the given epoch and we are the primary.
4214 */
4215int OSD::handle_pg_peering_evt(
4216 spg_t pgid,
4217 const pg_history_t& orig_history,
4218 const PastIntervals& pi,
4219 epoch_t epoch,
4220 PG::CephPeeringEvtRef evt)
4221{
4222 if (service.splitting(pgid)) {
4223 peering_wait_for_split[pgid].push_back(evt);
4224 return -EEXIST;
4225 }
4226
4227 PG *pg = _lookup_lock_pg(pgid);
4228 if (!pg) {
4229 // same primary?
4230 if (!osdmap->have_pg_pool(pgid.pool()))
4231 return -EINVAL;
4232 int up_primary, acting_primary;
4233 vector<int> up, acting;
4234 osdmap->pg_to_up_acting_osds(
4235 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4236
4237 pg_history_t history = orig_history;
4238 bool valid_history = project_pg_history(
4239 pgid, history, epoch, up, up_primary, acting, acting_primary);
4240
4241 if (!valid_history || epoch < history.same_interval_since) {
4242 dout(10) << __func__ << pgid << " acting changed in "
4243 << history.same_interval_since << " (msg from " << epoch << ")"
4244 << dendl;
4245 return -EINVAL;
4246 }
4247
4248 if (service.splitting(pgid)) {
4249 ceph_abort();
4250 }
4251
4252 // do we need to resurrect a deleting pg?
4253 spg_t resurrected;
4254 PGRef old_pg_state;
4255 res_result result = _try_resurrect_pg(
4256 service.get_osdmap(),
4257 pgid,
4258 &resurrected,
4259 &old_pg_state);
4260
4261 PG::RecoveryCtx rctx = create_context();
4262 switch (result) {
4263 case RES_NONE: {
4264 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4265 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4266 store->get_type() != "bluestore") {
4267 clog->warn() << "pg " << pgid
4268 << " is at risk of silent data corruption: "
4269 << "the pool allows ec overwrites but is not stored in "
4270 << "bluestore, so deep scrubbing will not detect bitrot";
4271 }
4272 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4273 PG::_init(*rctx.transaction, pgid, pp);
4274
4275 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4276 if (!pp->is_replicated() && role != pgid.shard)
4277 role = -1;
4278
4279 pg = _create_lock_pg(
4280 get_map(epoch),
4281 pgid, false, false,
4282 role,
4283 up, up_primary,
4284 acting, acting_primary,
4285 history, pi,
4286 *rctx.transaction);
4287 pg->handle_create(&rctx);
4288 pg->write_if_dirty(*rctx.transaction);
4289 dispatch_context(rctx, pg, osdmap);
4290
4291 dout(10) << *pg << " is new" << dendl;
4292
4293 pg->queue_peering_event(evt);
4294 wake_pg_waiters(pg);
4295 pg->unlock();
4296 return 0;
4297 }
4298 case RES_SELF: {
4299 old_pg_state->lock();
4300 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4301 int old_role = old_pg_state->role;
4302 vector<int> old_up = old_pg_state->up;
4303 int old_up_primary = old_pg_state->up_primary.osd;
4304 vector<int> old_acting = old_pg_state->acting;
4305 int old_primary = old_pg_state->primary.osd;
4306 pg_history_t old_history = old_pg_state->info.history;
4307 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4308 old_pg_state->unlock();
4309 pg = _create_lock_pg(
4310 old_osd_map,
4311 resurrected,
4312 false,
4313 true,
4314 old_role,
4315 old_up,
4316 old_up_primary,
4317 old_acting,
4318 old_primary,
4319 old_history,
4320 old_past_intervals,
4321 *rctx.transaction);
4322 pg->handle_create(&rctx);
4323 pg->write_if_dirty(*rctx.transaction);
4324 dispatch_context(rctx, pg, osdmap);
4325
4326 dout(10) << *pg << " is new (resurrected)" << dendl;
4327
4328 pg->queue_peering_event(evt);
4329 wake_pg_waiters(pg);
4330 pg->unlock();
4331 return 0;
4332 }
4333 case RES_PARENT: {
4334 assert(old_pg_state);
4335 old_pg_state->lock();
4336 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4337 int old_role = old_pg_state->role;
4338 vector<int> old_up = old_pg_state->up;
4339 int old_up_primary = old_pg_state->up_primary.osd;
4340 vector<int> old_acting = old_pg_state->acting;
4341 int old_primary = old_pg_state->primary.osd;
4342 pg_history_t old_history = old_pg_state->info.history;
4343 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4344 old_pg_state->unlock();
4345 PG *parent = _create_lock_pg(
4346 old_osd_map,
4347 resurrected,
4348 false,
4349 true,
4350 old_role,
4351 old_up,
4352 old_up_primary,
4353 old_acting,
4354 old_primary,
4355 old_history,
4356 old_past_intervals,
4357 *rctx.transaction
4358 );
4359 parent->handle_create(&rctx);
4360 parent->write_if_dirty(*rctx.transaction);
4361 dispatch_context(rctx, parent, osdmap);
4362
4363 dout(10) << *parent << " is new" << dendl;
4364
4365 assert(service.splitting(pgid));
4366 peering_wait_for_split[pgid].push_back(evt);
4367
4368 //parent->queue_peering_event(evt);
4369 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4370 wake_pg_waiters(parent);
4371 parent->unlock();
4372 return 0;
4373 }
4374 default:
4375 assert(0);
4376 return 0;
4377 }
4378 } else {
4379 // already had it. did the mapping change?
4380 if (epoch < pg->info.history.same_interval_since) {
4381 dout(10) << *pg << __func__ << " acting changed in "
4382 << pg->info.history.same_interval_since
4383 << " (msg from " << epoch << ")" << dendl;
4384 } else {
4385 pg->queue_peering_event(evt);
4386 }
4387 pg->unlock();
4388 return -EEXIST;
4389 }
4390}
4391
4392
4393void OSD::build_initial_pg_history(
4394 spg_t pgid,
4395 epoch_t created,
4396 utime_t created_stamp,
4397 pg_history_t *h,
4398 PastIntervals *pi)
4399{
4400 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4401 h->epoch_created = created;
31f18b77 4402 h->epoch_pool_created = created;
7c673cae
FG
4403 h->same_interval_since = created;
4404 h->same_up_since = created;
4405 h->same_primary_since = created;
4406 h->last_scrub_stamp = created_stamp;
4407 h->last_deep_scrub_stamp = created_stamp;
4408 h->last_clean_scrub_stamp = created_stamp;
4409
4410 OSDMapRef lastmap = service.get_map(created);
4411 int up_primary, acting_primary;
4412 vector<int> up, acting;
4413 lastmap->pg_to_up_acting_osds(
4414 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4415
4416 ostringstream debug;
4417 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
4418 OSDMapRef osdmap = service.get_map(e);
4419 int new_up_primary, new_acting_primary;
4420 vector<int> new_up, new_acting;
4421 osdmap->pg_to_up_acting_osds(
4422 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4423
4424 // this is a bit imprecise, but sufficient?
4425 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4426 const pg_pool_t *pi;
4427 bool operator()(const set<pg_shard_t> &have) const {
4428 return have.size() >= pi->min_size;
4429 }
4430 min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4431 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4432
4433 bool new_interval = PastIntervals::check_new_interval(
4434 acting_primary,
4435 new_acting_primary,
4436 acting, new_acting,
4437 up_primary,
4438 new_up_primary,
4439 up, new_up,
4440 h->same_interval_since,
4441 h->last_epoch_clean,
4442 osdmap,
4443 lastmap,
4444 pgid.pgid,
4445 &min_size_predicate,
4446 pi,
4447 &debug);
4448 if (new_interval) {
4449 h->same_interval_since = e;
181888fb
FG
4450 if (up != new_up) {
4451 h->same_up_since = e;
4452 }
4453 if (acting_primary != new_acting_primary) {
4454 h->same_primary_since = e;
4455 }
4456 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4457 osdmap->get_pg_num(pgid.pgid.pool()),
4458 nullptr)) {
4459 h->last_epoch_split = e;
4460 }
4461 up = new_up;
4462 acting = new_acting;
4463 up_primary = new_up_primary;
4464 acting_primary = new_acting_primary;
c07f9fc5 4465 }
7c673cae
FG
4466 lastmap = osdmap;
4467 }
4468 dout(20) << __func__ << " " << debug.str() << dendl;
4469 dout(10) << __func__ << " " << *h << " " << *pi
4470 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4471 pi->get_bounds()) << ")"
4472 << dendl;
4473}
4474
4475/**
4476 * Fill in the passed history so you know same_interval_since, same_up_since,
4477 * and same_primary_since.
4478 */
4479bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
4480 const vector<int>& currentup,
4481 int currentupprimary,
4482 const vector<int>& currentacting,
4483 int currentactingprimary)
4484{
4485 dout(15) << "project_pg_history " << pgid
4486 << " from " << from << " to " << osdmap->get_epoch()
4487 << ", start " << h
4488 << dendl;
4489
4490 epoch_t e;
4491 for (e = osdmap->get_epoch();
4492 e > from;
4493 e--) {
4494 // verify during intermediate epoch (e-1)
4495 OSDMapRef oldmap = service.try_get_map(e-1);
4496 if (!oldmap) {
4497 dout(15) << __func__ << ": found map gap, returning false" << dendl;
4498 return false;
4499 }
4500 assert(oldmap->have_pg_pool(pgid.pool()));
4501
4502 int upprimary, actingprimary;
4503 vector<int> up, acting;
4504 oldmap->pg_to_up_acting_osds(
4505 pgid.pgid,
4506 &up,
4507 &upprimary,
4508 &acting,
4509 &actingprimary);
4510
4511 // acting set change?
4512 if ((actingprimary != currentactingprimary ||
4513 upprimary != currentupprimary ||
4514 acting != currentacting ||
4515 up != currentup) && e > h.same_interval_since) {
4516 dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
4517 << " from " << acting << "/" << up
4518 << " " << actingprimary << "/" << upprimary
4519 << " -> " << currentacting << "/" << currentup
4520 << " " << currentactingprimary << "/" << currentupprimary
4521 << dendl;
4522 h.same_interval_since = e;
4523 }
4524 // split?
4525 if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
4526 osdmap->get_pg_num(pgid.pool()),
4527 0) && e > h.same_interval_since) {
4528 h.same_interval_since = e;
4529 }
4530 // up set change?
4531 if ((up != currentup || upprimary != currentupprimary)
4532 && e > h.same_up_since) {
4533 dout(15) << "project_pg_history " << pgid << " up changed in " << e
4534 << " from " << up << " " << upprimary
4535 << " -> " << currentup << " " << currentupprimary << dendl;
4536 h.same_up_since = e;
4537 }
4538
4539 // primary change?
4540 if (OSDMap::primary_changed(
4541 actingprimary,
4542 acting,
4543 currentactingprimary,
4544 currentacting) &&
4545 e > h.same_primary_since) {
4546 dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
4547 h.same_primary_since = e;
4548 }
4549
4550 if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
4551 break;
4552 }
4553
31f18b77 4554 // base case: these floors should be the pg creation epoch if we didn't
7c673cae
FG
4555 // find any changes.
4556 if (e == h.epoch_created) {
4557 if (!h.same_interval_since)
4558 h.same_interval_since = e;
4559 if (!h.same_up_since)
4560 h.same_up_since = e;
4561 if (!h.same_primary_since)
4562 h.same_primary_since = e;
4563 }
4564
4565 dout(15) << "project_pg_history end " << h << dendl;
4566 return true;
4567}
4568
4569
4570
4571void OSD::_add_heartbeat_peer(int p)
4572{
4573 if (p == whoami)
4574 return;
4575 HeartbeatInfo *hi;
4576
4577 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4578 if (i == heartbeat_peers.end()) {
4579 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4580 if (!cons.first)
4581 return;
4582 hi = &heartbeat_peers[p];
4583 hi->peer = p;
4584 HeartbeatSession *s = new HeartbeatSession(p);
4585 hi->con_back = cons.first.get();
4586 hi->con_back->set_priv(s->get());
4587 if (cons.second) {
4588 hi->con_front = cons.second.get();
4589 hi->con_front->set_priv(s->get());
4590 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4591 << " " << hi->con_back->get_peer_addr()
4592 << " " << hi->con_front->get_peer_addr()
4593 << dendl;
4594 } else {
4595 hi->con_front.reset(NULL);
4596 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4597 << " " << hi->con_back->get_peer_addr()
4598 << dendl;
4599 }
4600 s->put();
4601 } else {
4602 hi = &i->second;
4603 }
4604 hi->epoch = osdmap->get_epoch();
4605}
4606
4607void OSD::_remove_heartbeat_peer(int n)
4608{
4609 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4610 assert(q != heartbeat_peers.end());
4611 dout(20) << " removing heartbeat peer osd." << n
4612 << " " << q->second.con_back->get_peer_addr()
4613 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4614 << dendl;
4615 q->second.con_back->mark_down();
4616 if (q->second.con_front) {
4617 q->second.con_front->mark_down();
4618 }
4619 heartbeat_peers.erase(q);
4620}
4621
4622void OSD::need_heartbeat_peer_update()
4623{
4624 if (is_stopping())
4625 return;
4626 dout(20) << "need_heartbeat_peer_update" << dendl;
4627 heartbeat_set_peers_need_update();
4628}
4629
4630void OSD::maybe_update_heartbeat_peers()
4631{
4632 assert(osd_lock.is_locked());
4633
4634 if (is_waiting_for_healthy()) {
4635 utime_t now = ceph_clock_now();
4636 if (last_heartbeat_resample == utime_t()) {
4637 last_heartbeat_resample = now;
4638 heartbeat_set_peers_need_update();
4639 } else if (!heartbeat_peers_need_update()) {
4640 utime_t dur = now - last_heartbeat_resample;
4641 if (dur > cct->_conf->osd_heartbeat_grace) {
4642 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4643 heartbeat_set_peers_need_update();
4644 last_heartbeat_resample = now;
4645 reset_heartbeat_peers(); // we want *new* peers!
4646 }
4647 }
4648 }
4649
4650 if (!heartbeat_peers_need_update())
4651 return;
4652 heartbeat_clear_peers_need_update();
4653
4654 Mutex::Locker l(heartbeat_lock);
4655
4656 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
4657
4658
4659 // build heartbeat from set
4660 if (is_active()) {
4661 RWLock::RLocker l(pg_map_lock);
4662 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4663 i != pg_map.end();
4664 ++i) {
4665 PG *pg = i->second;
4666 pg->heartbeat_peer_lock.Lock();
4667 dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
4668 for (set<int>::iterator p = pg->heartbeat_peers.begin();
4669 p != pg->heartbeat_peers.end();
4670 ++p)
4671 if (osdmap->is_up(*p))
4672 _add_heartbeat_peer(*p);
4673 for (set<int>::iterator p = pg->probe_targets.begin();
4674 p != pg->probe_targets.end();
4675 ++p)
4676 if (osdmap->is_up(*p))
4677 _add_heartbeat_peer(*p);
4678 pg->heartbeat_peer_lock.Unlock();
4679 }
4680 }
4681
4682 // include next and previous up osds to ensure we have a fully-connected set
4683 set<int> want, extras;
4684 int next = osdmap->get_next_up_osd_after(whoami);
4685 if (next >= 0)
4686 want.insert(next);
4687 int prev = osdmap->get_previous_up_osd_before(whoami);
4688 if (prev >= 0 && prev != next)
4689 want.insert(prev);
4690
4691 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4692 dout(10) << " adding neighbor peer osd." << *p << dendl;
4693 extras.insert(*p);
4694 _add_heartbeat_peer(*p);
4695 }
4696
4697 // remove down peers; enumerate extras
4698 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4699 while (p != heartbeat_peers.end()) {
4700 if (!osdmap->is_up(p->first)) {
4701 int o = p->first;
4702 ++p;
4703 _remove_heartbeat_peer(o);
4704 continue;
4705 }
4706 if (p->second.epoch < osdmap->get_epoch()) {
4707 extras.insert(p->first);
4708 }
4709 ++p;
4710 }
4711
4712 // too few?
4713 int start = osdmap->get_next_up_osd_after(whoami);
4714 for (int n = start; n >= 0; ) {
4715 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4716 break;
4717 if (!extras.count(n) && !want.count(n) && n != whoami) {
4718 dout(10) << " adding random peer osd." << n << dendl;
4719 extras.insert(n);
4720 _add_heartbeat_peer(n);
4721 }
4722 n = osdmap->get_next_up_osd_after(n);
4723 if (n == start)
4724 break; // came full circle; stop
4725 }
4726
4727 // too many?
4728 for (set<int>::iterator p = extras.begin();
4729 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4730 ++p) {
4731 if (want.count(*p))
4732 continue;
4733 _remove_heartbeat_peer(*p);
4734 }
4735
4736 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
4737}
4738
4739void OSD::reset_heartbeat_peers()
4740{
4741 assert(osd_lock.is_locked());
4742 dout(10) << "reset_heartbeat_peers" << dendl;
4743 Mutex::Locker l(heartbeat_lock);
4744 while (!heartbeat_peers.empty()) {
4745 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4746 hi.con_back->mark_down();
4747 if (hi.con_front) {
4748 hi.con_front->mark_down();
4749 }
4750 heartbeat_peers.erase(heartbeat_peers.begin());
4751 }
4752 failure_queue.clear();
4753}
4754
// Handle a heartbeat message (PING, PING_REPLY or YOU_DIED) from another OSD.
// The reference on m is released (m->put()) on every exit path, and
// heartbeat_lock is held from the stopping check until just before return.
4755void OSD::handle_osd_ping(MOSDPing *m)
4756{
// Ignore pings whose fsid does not match our superblock's cluster fsid.
4757 if (superblock.cluster_fsid != m->fsid) {
4758 dout(20) << "handle_osd_ping from " << m->get_source_inst()
4759 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
4760 m->put();
4761 return;
4762 }
4763
4764 int from = m->get_source().num();
4765
// Locked manually rather than with a scoped locker: each early return below
// unlocks before releasing m.
4766 heartbeat_lock.Lock();
4767 if (is_stopping()) {
4768 heartbeat_lock.Unlock();
4769 m->put();
4770 return;
4771 }
4772
4773 OSDMapRef curmap = service.get_osdmap();
c07f9fc5
FG
// Without a current map, nothing below can be evaluated; drop the message.
4774 if (!curmap) {
4775 heartbeat_lock.Unlock();
4776 m->put();
4777 return;
4778 }
7c673cae
FG
4779
4780 switch (m->op) {
4781
4782 case MOSDPing::PING:
4783 {
// Debug-only fault injection: with osd_debug_drop_ping_probability > 0,
// either continue an existing drop streak for this sender or randomly start
// one lasting osd_debug_drop_ping_duration pings. A dropped ping gets no
// reply ("break" leaves the switch).
4784 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
4785 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
4786 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
4787 if (heartbeat_drop->second == 0) {
4788 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
4789 } else {
4790 --heartbeat_drop->second;
4791 dout(5) << "Dropping heartbeat from " << from
4792 << ", " << heartbeat_drop->second
4793 << " remaining to drop" << dendl;
4794 break;
4795 }
4796 } else if (cct->_conf->osd_debug_drop_ping_probability >
4797 ((((double)(rand()%100))/100.0))) {
4798 heartbeat_drop =
4799 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
4800 cct->_conf->osd_debug_drop_ping_duration)).first;
4801 dout(5) << "Dropping heartbeat from " << from
4802 << ", " << heartbeat_drop->second
4803 << " remaining to drop" << dendl;
4804 break;
4805 }
4806 }
4807
// Do not answer while our own internal heartbeat map reports unhealthy.
4808 if (!cct->get_heartbeat_map()->is_healthy()) {
4809 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
4810 break;
4811 }
4812
// Reply with PING_REPLY, passing back the sender's stamp (m->stamp).
4813 Message *r = new MOSDPing(monc->get_fsid(),
4814 curmap->get_epoch(),
31f18b77
FG
4815 MOSDPing::PING_REPLY, m->stamp,
4816 cct->_conf->osd_heartbeat_min_size);
7c673cae
FG
4817 m->get_connection()->send_message(r);
4818
// If the sender is up in our map, record its epoch and (when active) share
// our map with it over the cluster connection.
4819 if (curmap->is_up(from)) {
4820 service.note_peer_epoch(from, m->map_epoch);
4821 if (is_active()) {
4822 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4823 if (con) {
4824 service.share_map_peer(from, con.get());
4825 }
4826 }
// Otherwise, if the sender does not exist in our map or its down_at epoch is
// newer than the epoch it reported, additionally send YOU_DIED.
4827 } else if (!curmap->exists(from) ||
4828 curmap->get_down_at(from) > m->map_epoch) {
4829 // tell them they have died
4830 Message *r = new MOSDPing(monc->get_fsid(),
4831 curmap->get_epoch(),
4832 MOSDPing::YOU_DIED,
31f18b77
FG
4833 m->stamp,
4834 cct->_conf->osd_heartbeat_min_size);
7c673cae
FG
4835 m->get_connection()->send_message(r);
4836 }
4837 }
4838 break;
4839
4840 case MOSDPing::PING_REPLY:
4841 {
// Replies are only recorded for osds currently in heartbeat_peers; the
// connection the reply arrived on selects which rx stamp is updated.
4842 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
4843 if (i != heartbeat_peers.end()) {
4844 if (m->get_connection() == i->second.con_back) {
4845 dout(25) << "handle_osd_ping got reply from osd." << from
4846 << " first_tx " << i->second.first_tx
4847 << " last_tx " << i->second.last_tx
4848 << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
4849 << " last_rx_front " << i->second.last_rx_front
4850 << dendl;
4851 i->second.last_rx_back = m->stamp;
4852 // if there is no front con, set both stamps.
4853 if (i->second.con_front == NULL)
4854 i->second.last_rx_front = m->stamp;
4855 } else if (m->get_connection() == i->second.con_front) {
4856 dout(25) << "handle_osd_ping got reply from osd." << from
4857 << " first_tx " << i->second.first_tx
4858 << " last_tx " << i->second.last_tx
4859 << " last_rx_back " << i->second.last_rx_back
4860 << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
4861 << dendl;
4862 i->second.last_rx_front = m->stamp;
4863 }
4864
// cutoff is now minus osd_heartbeat_grace; a peer that is_healthy(cutoff)
// has any queued or in-flight failure report about it cancelled.
4865 utime_t cutoff = ceph_clock_now();
4866 cutoff -= cct->_conf->osd_heartbeat_grace;
4867 if (i->second.is_healthy(cutoff)) {
4868 // Cancel false reports
4869 auto failure_queue_entry = failure_queue.find(from);
4870 if (failure_queue_entry != failure_queue.end()) {
4871 dout(10) << "handle_osd_ping canceling queued "
4872 << "failure report for osd." << from << dendl;
4873 failure_queue.erase(failure_queue_entry);
4874 }
4875
4876 auto failure_pending_entry = failure_pending.find(from);
4877 if (failure_pending_entry != failure_pending.end()) {
4878 dout(10) << "handle_osd_ping canceling in-flight "
4879 << "failure report for osd." << from << dendl;
4880 send_still_alive(curmap->get_epoch(),
4881 failure_pending_entry->second.second);
4882 failure_pending.erase(failure_pending_entry);
4883 }
4884 }
4885 }
4886
// Independently of peer membership: if the reply carries a map epoch and
// the sender is up, record its epoch and (when active) share our map.
4887 if (m->map_epoch &&
4888 curmap->is_up(from)) {
4889 service.note_peer_epoch(from, m->map_epoch);
4890 if (is_active()) {
4891 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4892 if (con) {
4893 service.share_map_peer(from, con.get());
4894 }
4895 }
4896 }
4897 }
4898 break;
4899
// A peer says we are marked down: subscribe to osdmaps starting at the epoch
// after our current one.
4900 case MOSDPing::YOU_DIED:
4901 dout(10) << "handle_osd_ping " << m->get_source_inst()
4902 << " says i am down in " << m->map_epoch << dendl;
4903 osdmap_subscribe(curmap->get_epoch()+1, false);
4904 break;
4905 }
4906
4907 heartbeat_lock.Unlock();
4908 m->put();
4909}
4910
4911void OSD::heartbeat_entry()
4912{
4913 Mutex::Locker l(heartbeat_lock);
4914 if (is_stopping())
4915 return;
4916 while (!heartbeat_stop) {
4917 heartbeat();
4918
4919 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
4920 utime_t w;
4921 w.set_from_double(wait);
4922 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
4923 heartbeat_cond.WaitInterval(heartbeat_lock, w);
4924 if (is_stopping())
4925 return;
4926 dout(30) << "heartbeat_entry woke up" << dendl;
4927 }
4928}
4929
4930void OSD::heartbeat_check()
4931{
4932 assert(heartbeat_lock.is_locked());
4933 utime_t now = ceph_clock_now();
4934
4935 // check for heartbeat replies (move me elsewhere?)
4936 utime_t cutoff = now;
4937 cutoff -= cct->_conf->osd_heartbeat_grace;
4938 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4939 p != heartbeat_peers.end();
4940 ++p) {
4941
4942 if (p->second.first_tx == utime_t()) {
4943 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
4944 << "yet, skipping" << dendl;
4945 continue;
4946 }
4947
4948 dout(25) << "heartbeat_check osd." << p->first
4949 << " first_tx " << p->second.first_tx
4950 << " last_tx " << p->second.last_tx
4951 << " last_rx_back " << p->second.last_rx_back
4952 << " last_rx_front " << p->second.last_rx_front
4953 << dendl;
4954 if (p->second.is_unhealthy(cutoff)) {
4955 if (p->second.last_rx_back == utime_t() ||
4956 p->second.last_rx_front == utime_t()) {
4957 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
4958 << " osd." << p->first << " ever on either front or back, first ping sent "
4959 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
4960 // fail
4961 failure_queue[p->first] = p->second.last_tx;
4962 } else {
4963 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
4964 << " osd." << p->first << " since back " << p->second.last_rx_back
4965 << " front " << p->second.last_rx_front
4966 << " (cutoff " << cutoff << ")" << dendl;
4967 // fail
4968 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
4969 }
4970 }
4971 }
4972}
4973
4974void OSD::heartbeat()
4975{
4976 dout(30) << "heartbeat" << dendl;
4977
4978 // get CPU load avg
4979 double loadavgs[1];
4980 int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
4981 if (getloadavg(loadavgs, 1) == 1) {
4982 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
4983 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
4984 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
4985 }
4986
4987 dout(30) << "heartbeat checking stats" << dendl;
4988
4989 // refresh stats?
4990 vector<int> hb_peers;
4991 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4992 p != heartbeat_peers.end();
4993 ++p)
4994 hb_peers.push_back(p->first);
4995 service.update_osd_stat(hb_peers);
4996
4997 dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;
4998
4999 utime_t now = ceph_clock_now();
5000
5001 // send heartbeats
5002 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5003 i != heartbeat_peers.end();
5004 ++i) {
5005 int peer = i->first;
5006 i->second.last_tx = now;
5007 if (i->second.first_tx == utime_t())
5008 i->second.first_tx = now;
5009 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5010 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5011 service.get_osdmap()->get_epoch(),
31f18b77
FG
5012 MOSDPing::PING, now,
5013 cct->_conf->osd_heartbeat_min_size));
7c673cae
FG
5014
5015 if (i->second.con_front)
5016 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5017 service.get_osdmap()->get_epoch(),
31f18b77
FG
5018 MOSDPing::PING, now,
5019 cct->_conf->osd_heartbeat_min_size));
7c673cae
FG
5020 }
5021
5022 logger->set(l_osd_hb_to, heartbeat_peers.size());
5023
5024 // hmm.. am i all alone?
5025 dout(30) << "heartbeat lonely?" << dendl;
5026 if (heartbeat_peers.empty()) {
5027 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5028 last_mon_heartbeat = now;
5029 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5030 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5031 }
5032 }
5033
5034 dout(30) << "heartbeat done" << dendl;
5035}
5036
5037bool OSD::heartbeat_reset(Connection *con)
5038{
5039 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
5040 if (s) {
5041 heartbeat_lock.Lock();
5042 if (is_stopping()) {
5043 heartbeat_lock.Unlock();
5044 s->put();
5045 return true;
5046 }
5047 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
5048 if (p != heartbeat_peers.end() &&
5049 (p->second.con_back == con ||
5050 p->second.con_front == con)) {
5051 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5052 << ", reopening" << dendl;
5053 if (con != p->second.con_back) {
5054 p->second.con_back->mark_down();
5055 }
5056 p->second.con_back.reset(NULL);
5057 if (p->second.con_front && con != p->second.con_front) {
5058 p->second.con_front->mark_down();
5059 }
5060 p->second.con_front.reset(NULL);
5061 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5062 if (newcon.first) {
5063 p->second.con_back = newcon.first.get();
5064 p->second.con_back->set_priv(s->get());
5065 if (newcon.second) {
5066 p->second.con_front = newcon.second.get();
5067 p->second.con_front->set_priv(s->get());
5068 }
5069 } else {
5070 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5071 << ", raced with osdmap update, closing out peer" << dendl;
5072 heartbeat_peers.erase(p);
5073 }
5074 } else {
5075 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5076 }
5077 heartbeat_lock.Unlock();
5078 s->put();
5079 }
5080 return true;
5081}
5082
5083
5084
5085// =========================================
5086
5087void OSD::tick()
5088{
5089 assert(osd_lock.is_locked());
5090 dout(10) << "tick" << dendl;
5091
5092 if (is_active() || is_waiting_for_healthy()) {
5093 maybe_update_heartbeat_peers();
5094 }
5095
5096 if (is_waiting_for_healthy()) {
5097 start_boot();
224ce89b
WB
5098 } else if (is_preboot() &&
5099 waiting_for_luminous_mons &&
5100 monc->monmap.get_required_features().contains_all(
5101 ceph::features::mon::FEATURE_LUMINOUS)) {
5102 // mon upgrade finished!
5103 start_boot();
7c673cae
FG
5104 }
5105
5106 do_waiters();
5107
5108 tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
7c673cae
FG
5109}
5110
5111void OSD::tick_without_osd_lock()
5112{
5113 assert(tick_timer_lock.is_locked());
5114 dout(10) << "tick_without_osd_lock" << dendl;
5115
5116 logger->set(l_osd_buf, buffer::get_total_alloc());
5117 logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5118 logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
5119 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5120 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5121 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5122
5123 // osd_lock is not being held, which means the OSD state
5124 // might change when doing the monitor report
5125 if (is_active() || is_waiting_for_healthy()) {
5126 heartbeat_lock.Lock();
5127 heartbeat_check();
5128 heartbeat_lock.Unlock();
5129
5130 map_lock.get_read();
5131 Mutex::Locker l(mon_report_lock);
5132
5133 // mon report?
5134 bool reset = false;
5135 bool report = false;
5136 utime_t now = ceph_clock_now();
5137 pg_stat_queue_lock.Lock();
5138 double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
5139 double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
5140 // note: we shouldn't adjust max because it must remain < the
5141 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5142 // value).
5143 double max = cct->_conf->osd_mon_report_interval_max;
5144 if (!outstanding_pg_stats.empty() &&
5145 (now - stats_ack_timeout) > last_pg_stats_ack) {
5146 dout(1) << __func__ << " mon hasn't acked PGStats in "
5147 << now - last_pg_stats_ack
5148 << " seconds, reconnecting elsewhere" << dendl;
5149 reset = true;
5150 last_pg_stats_ack = now; // reset clock
5151 last_pg_stats_sent = utime_t();
5152 stats_ack_timeout =
5153 MAX(cct->_conf->osd_mon_ack_timeout,
5154 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
5155 outstanding_pg_stats.clear();
5156 }
5157 if (now - last_pg_stats_sent > max) {
5158 osd_stat_updated = true;
5159 report = true;
5160 } else if (service.need_fullness_update()) {
5161 report = true;
5162 } else if ((int)outstanding_pg_stats.size() >=
5163 cct->_conf->osd_mon_report_max_in_flight) {
5164 dout(20) << __func__ << " have max " << outstanding_pg_stats
5165 << " stats updates in flight" << dendl;
5166 } else {
5167 if (now - last_mon_report > adjusted_min) {
5168 dout(20) << __func__ << " stats backoff " << backoff
5169 << " adjusted_min " << adjusted_min << " - sending report"
5170 << dendl;
5171 osd_stat_updated = true;
5172 report = true;
5173 }
5174 }
5175 pg_stat_queue_lock.Unlock();
5176
5177 if (reset) {
5178 monc->reopen_session();
5179 } else if (report) {
5180 last_mon_report = now;
5181
5182 // do any pending reports
5183 send_full_update();
5184 send_failures();
31f18b77
FG
5185 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5186 send_pg_stats(now);
5187 }
7c673cae
FG
5188 }
5189 map_lock.put_read();
5190 }
5191
5192 if (is_active()) {
5193 if (!scrub_random_backoff()) {
5194 sched_scrub();
5195 }
5196 service.promote_throttle_recalibrate();
224ce89b
WB
5197 bool need_send_beacon = false;
5198 const auto now = ceph::coarse_mono_clock::now();
5199 {
5200 // borrow lec lock to pretect last_sent_beacon from changing
5201 Mutex::Locker l{min_last_epoch_clean_lock};
5202 const auto elapsed = now - last_sent_beacon;
5203 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5204 cct->_conf->osd_beacon_report_interval) {
5205 need_send_beacon = true;
5206 }
5207 }
5208 if (need_send_beacon) {
5209 send_beacon(now);
5210 }
7c673cae
FG
5211 }
5212
5213 check_ops_in_flight();
5214 service.kick_recovery_queue();
5215 tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
5216}
5217
5218void OSD::check_ops_in_flight()
5219{
5220 vector<string> warnings;
5221 if (op_tracker.check_ops_in_flight(warnings)) {
5222 for (vector<string>::iterator i = warnings.begin();
5223 i != warnings.end();
5224 ++i) {
5225 clog->warn() << *i;
5226 }
5227 }
5228}
5229
5230// Usage:
5231// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5232// rmomapkey <pool-id> [namespace/]<obj-name> <key>
5233// setomapheader <pool-id> [namespace/]<obj-name> <header>
5234// getomap <pool> [namespace/]<obj-name>
5235// truncobj <pool-id> [namespace/]<obj-name> <newlen>
5236// injectmdataerr [namespace/]<obj-name> [shardid]
5237// injectdataerr [namespace/]<obj-name> [shardid]
5238//
5239// set_recovery_delay [utime]
5240void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5241 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5242{
5243 //Test support
5244 //Support changing the omap on a single osd by using the Admin Socket to
5245 //directly request the osd make a change.
5246 if (command == "setomapval" || command == "rmomapkey" ||
5247 command == "setomapheader" || command == "getomap" ||
5248 command == "truncobj" || command == "injectmdataerr" ||
5249 command == "injectdataerr"
5250 ) {
5251 pg_t rawpg;
5252 int64_t pool;
5253 OSDMapRef curmap = service->get_osdmap();
5254 int r = -1;
5255
5256 string poolstr;
5257
5258 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5259 pool = curmap->lookup_pg_pool_name(poolstr);
5260 //If we can't find it by name then maybe id specified
5261 if (pool < 0 && isdigit(poolstr[0]))
5262 pool = atoll(poolstr.c_str());
5263 if (pool < 0) {
b5b8bbf5 5264 ss << "Invalid pool '" << poolstr << "''";
7c673cae
FG
5265 return;
5266 }
5267
5268 string objname, nspace;
5269 cmd_getval(service->cct, cmdmap, "objname", objname);
5270 std::size_t found = objname.find_first_of('/');
5271 if (found != string::npos) {
5272 nspace = objname.substr(0, found);
5273 objname = objname.substr(found+1);
5274 }
5275 object_locator_t oloc(pool, nspace);
5276 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5277
5278 if (r < 0) {
5279 ss << "Invalid namespace/objname";
5280 return;
5281 }
5282
5283 int64_t shardid;
5284 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5285 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5286 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5287 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
5288 if (curmap->pg_is_ec(rawpg)) {
5289 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5290 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5291 return;
5292 }
5293 }
5294
5295 ObjectStore::Transaction t;
5296
5297 if (command == "setomapval") {
5298 map<string, bufferlist> newattrs;
5299 bufferlist val;
5300 string key, valstr;
5301 cmd_getval(service->cct, cmdmap, "key", key);
5302 cmd_getval(service->cct, cmdmap, "val", valstr);
5303
5304 val.append(valstr);
5305 newattrs[key] = val;
5306 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
5307 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5308 if (r < 0)
5309 ss << "error=" << r;
5310 else
5311 ss << "ok";
5312 } else if (command == "rmomapkey") {
5313 string key;
5314 set<string> keys;
5315 cmd_getval(service->cct, cmdmap, "key", key);
5316
5317 keys.insert(key);
5318 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5319 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5320 if (r < 0)
5321 ss << "error=" << r;
5322 else
5323 ss << "ok";
5324 } else if (command == "setomapheader") {
5325 bufferlist newheader;
5326 string headerstr;
5327
5328 cmd_getval(service->cct, cmdmap, "header", headerstr);
5329 newheader.append(headerstr);
5330 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5331 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5332 if (r < 0)
5333 ss << "error=" << r;
5334 else
5335 ss << "ok";
5336 } else if (command == "getomap") {
5337 //Debug: Output entire omap
5338 bufferlist hdrbl;
5339 map<string, bufferlist> keyvals;
5340 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5341 if (r >= 0) {
5342 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5343 for (map<string, bufferlist>::iterator it = keyvals.begin();
5344 it != keyvals.end(); ++it)
5345 ss << " key=" << (*it).first << " val="
5346 << string((*it).second.c_str(), (*it).second.length());
5347 } else {
5348 ss << "error=" << r;
5349 }
5350 } else if (command == "truncobj") {
5351 int64_t trunclen;
5352 cmd_getval(service->cct, cmdmap, "len", trunclen);
5353 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5354 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5355 if (r < 0)
5356 ss << "error=" << r;
5357 else
5358 ss << "ok";
5359 } else if (command == "injectdataerr") {
5360 store->inject_data_error(gobj);
5361 ss << "ok";
5362 } else if (command == "injectmdataerr") {
5363 store->inject_mdata_error(gobj);
5364 ss << "ok";
5365 }
5366 return;
5367 }
5368 if (command == "set_recovery_delay") {
5369 int64_t delay;
5370 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5371 ostringstream oss;
5372 oss << delay;
5373 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5374 oss.str().c_str());
5375 if (r != 0) {
5376 ss << "set_recovery_delay: error setting "
5377 << "osd_recovery_delay_start to '" << delay << "': error "
5378 << r;
5379 return;
5380 }
5381 service->cct->_conf->apply_changes(NULL);
5382 ss << "set_recovery_delay: set osd_recovery_delay_start "
5383 << "to " << service->cct->_conf->osd_recovery_delay_start;
5384 return;
5385 }
5386 if (command == "trigger_scrub") {
5387 spg_t pgid;
5388 OSDMapRef curmap = service->get_osdmap();
5389
5390 string pgidstr;
5391
5392 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5393 if (!pgid.parse(pgidstr.c_str())) {
5394 ss << "Invalid pgid specified";
5395 return;
5396 }
5397
5398 PG *pg = service->osd->_lookup_lock_pg(pgid);
5399 if (pg == nullptr) {
5400 ss << "Can't find pg " << pgid;
5401 return;
5402 }
5403
5404 if (pg->is_primary()) {
5405 pg->unreg_next_scrub();
5406 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5407 double pool_scrub_max_interval = 0;
5408 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5409 double scrub_max_interval = pool_scrub_max_interval > 0 ?
5410 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5411 // Instead of marking must_scrub force a schedule scrub
5412 utime_t stamp = ceph_clock_now();
5413 stamp -= scrub_max_interval;
5414 stamp -= 100.0; // push back last scrub more for good measure
5415 pg->info.history.last_scrub_stamp = stamp;
5416 pg->reg_next_scrub();
5417 ss << "ok";
5418 } else {
5419 ss << "Not primary";
5420 }
5421 pg->unlock();
5422 return;
5423 }
5424 if (command == "injectfull") {
5425 int64_t count;
5426 string type;
5427 OSDService::s_names state;
5428 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5429 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5430 if (type == "none" || count == 0) {
5431 type = "none";
5432 count = 0;
5433 }
5434 state = service->get_full_state(type);
5435 if (state == OSDService::s_names::INVALID) {
5436 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5437 return;
5438 }
5439 service->set_injectfull(state, count);
5440 return;
5441 }
5442 ss << "Internal error - command=" << command;
5443}
5444
5445// =========================================
5446bool remove_dir(
5447 CephContext *cct,
5448 ObjectStore *store, SnapMapper *mapper,
5449 OSDriver *osdriver,
5450 ObjectStore::Sequencer *osr,
5451 coll_t coll, DeletingStateRef dstate,
5452 bool *finished,
5453 ThreadPool::TPHandle &handle)
5454{
5455 vector<ghobject_t> olist;
5456 int64_t num = 0;
5457 ObjectStore::Transaction t;
5458 ghobject_t next;
5459 handle.reset_tp_timeout();
5460 store->collection_list(
5461 coll,
5462 next,
5463 ghobject_t::get_max(),
5464 store->get_ideal_list_max(),
5465 &olist,
5466 &next);
5467 generic_dout(10) << __func__ << " " << olist << dendl;
5468 // default cont to true, this is safe because caller(OSD::RemoveWQ::_process())
5469 // will recheck the answer before it really goes on.
5470 bool cont = true;
5471 for (vector<ghobject_t>::iterator i = olist.begin();
5472 i != olist.end();
5473 ++i) {
5474 if (i->is_pgmeta())
5475 continue;
5476 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5477 int r = mapper->remove_oid(i->hobj, &_t);
5478 if (r != 0 && r != -ENOENT) {
5479 ceph_abort();
5480 }
5481 t.remove(coll, *i);
5482 if (++num >= cct->_conf->osd_target_transaction_size) {
5483 C_SaferCond waiter;
5484 store->queue_transaction(osr, std::move(t), &waiter);
5485 cont = dstate->pause_clearing();
5486 handle.suspend_tp_timeout();
5487 waiter.wait();
5488 handle.reset_tp_timeout();
5489 if (cont)
5490 cont = dstate->resume_clearing();
5491 if (!cont)
5492 return false;
5493 t = ObjectStore::Transaction();
5494 num = 0;
5495 }
5496 }
5497 if (num) {
5498 C_SaferCond waiter;
5499 store->queue_transaction(osr, std::move(t), &waiter);
5500 cont = dstate->pause_clearing();
5501 handle.suspend_tp_timeout();
5502 waiter.wait();
5503 handle.reset_tp_timeout();
5504 if (cont)
5505 cont = dstate->resume_clearing();
5506 }
5507 // whether there are more objects to remove in the collection
5508 *finished = next.is_max();
5509 return cont;
5510}
5511
5512void OSD::RemoveWQ::_process(
5513 pair<PGRef, DeletingStateRef> item,
5514 ThreadPool::TPHandle &handle)
5515{
5516 FUNCTRACE();
5517 PGRef pg(item.first);
5518 SnapMapper &mapper = pg->snap_mapper;
5519 OSDriver &driver = pg->osdriver;
5520 coll_t coll = coll_t(pg->info.pgid);
5521 pg->osr->flush();
5522 bool finished = false;
5523
5524 if (!item.second->start_or_resume_clearing())
5525 return;
5526
5527 bool cont = remove_dir(
5528 pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
5529 &finished, handle);
5530 if (!cont)
5531 return;
5532 if (!finished) {
5533 if (item.second->pause_clearing())
5534 queue_front(item);
5535 return;
5536 }
5537
5538 if (!item.second->start_deleting())
5539 return;
5540
5541 ObjectStore::Transaction t;
5542 PGLog::clear_info_log(pg->info.pgid, &t);
5543
5544 if (cct->_conf->osd_inject_failure_on_pg_removal) {
5545 generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
5546 _exit(1);
5547 }
5548 t.remove_collection(coll);
5549
5550 // We need the sequencer to stick around until the op is complete
5551 store->queue_transaction(
5552 pg->osr.get(),
5553 std::move(t),
5554 0, // onapplied
5555 0, // oncommit
5556 0, // onreadable sync
5557 new ContainerContext<PGRef>(pg),
5558 TrackedOpRef());
5559
5560 item.second->finish_deleting();
5561}
5562// =========================================
5563
5564void OSD::ms_handle_connect(Connection *con)
5565{
5566 dout(10) << __func__ << " con " << con << dendl;
5567 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
5568 Mutex::Locker l(osd_lock);
5569 if (is_stopping())
5570 return;
5571 dout(10) << __func__ << " on mon" << dendl;
5572
5573 if (is_preboot()) {
5574 start_boot();
5575 } else if (is_booting()) {
5576 _send_boot(); // resend boot message
5577 } else {
5578 map_lock.get_read();
5579 Mutex::Locker l2(mon_report_lock);
5580
5581 utime_t now = ceph_clock_now();
5582 last_mon_report = now;
5583
5584 // resend everything, it's a new session
5585 send_full_update();
5586 send_alive();
5587 service.requeue_pg_temp();
5588 service.send_pg_temp();
5589 requeue_failures();
5590 send_failures();
31f18b77
FG
5591 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5592 send_pg_stats(now);
5593 }
7c673cae
FG
5594
5595 map_lock.put_read();
5596 if (is_active()) {
5597 send_beacon(ceph::coarse_mono_clock::now());
5598 }
5599 }
5600
5601 // full map requests may happen while active or pre-boot
5602 if (requested_full_first) {
5603 rerequest_full_maps();
5604 }
5605 }
5606}
5607
5608void OSD::ms_handle_fast_connect(Connection *con)
5609{
5610 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5611 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5612 Session *s = static_cast<Session*>(con->get_priv());
5613 if (!s) {
5614 s = new Session(cct);
5615 con->set_priv(s->get());
5616 s->con = con;
5617 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5618 << " addr=" << s->con->get_peer_addr() << dendl;
5619 // we don't connect to clients
5620 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5621 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5622 }
5623 s->put();
5624 }
5625}
5626
5627void OSD::ms_handle_fast_accept(Connection *con)
5628{
5629 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5630 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5631 Session *s = static_cast<Session*>(con->get_priv());
5632 if (!s) {
5633 s = new Session(cct);
5634 con->set_priv(s->get());
5635 s->con = con;
5636 dout(10) << "new session (incoming)" << s << " con=" << con
5637 << " addr=" << con->get_peer_addr()
5638 << " must have raced with connect" << dendl;
5639 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5640 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5641 }
5642 s->put();
5643 }
5644}
5645
5646bool OSD::ms_handle_reset(Connection *con)
5647{
5648 Session *session = static_cast<Session*>(con->get_priv());
5649 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5650 if (!session)
5651 return false;
5652 session->wstate.reset(con);
5653 session->con.reset(NULL); // break con <-> session ref cycle
5654 // note that we break session->con *before* the session_handle_reset
5655 // cleanup below. this avoids a race between us and
5656 // PG::add_backoff, Session::check_backoff, etc.
5657 session_handle_reset(session);
5658 session->put();
5659 return true;
5660}
5661
5662bool OSD::ms_handle_refused(Connection *con)
5663{
5664 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5665 return false;
5666
5667 Session *session = static_cast<Session*>(con->get_priv());
5668 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5669 if (!session)
5670 return false;
5671 int type = con->get_peer_type();
5672 // handle only OSD failures here
5673 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5674 OSDMapRef osdmap = get_osdmap();
5675 if (osdmap) {
5676 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5677 if (id >= 0 && osdmap->is_up(id)) {
5678 // I'm cheating mon heartbeat grace logic, because we know it's not going
5679 // to respawn alone. +1 so we won't hit any boundary case.
5680 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5681 osdmap->get_inst(id),
5682 cct->_conf->osd_heartbeat_grace + 1,
5683 osdmap->get_epoch(),
5684 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5685 ));
5686 }
5687 }
5688 }
5689 session->put();
5690 return true;
5691}
5692
5693struct C_OSD_GetVersion : public Context {
5694 OSD *osd;
5695 uint64_t oldest, newest;
5696 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5697 void finish(int r) override {
5698 if (r >= 0)
5699 osd->_got_mon_epochs(oldest, newest);
5700 }
5701};
5702
5703void OSD::start_boot()
5704{
5705 if (!_is_healthy()) {
5706 // if we are not healthy, do not mark ourselves up (yet)
5707 dout(1) << "not healthy; waiting to boot" << dendl;
5708 if (!is_waiting_for_healthy())
5709 start_waiting_for_healthy();
5710 // send pings sooner rather than later
5711 heartbeat_kick();
5712 return;
5713 }
5714 dout(1) << __func__ << dendl;
5715 set_state(STATE_PREBOOT);
224ce89b 5716 waiting_for_luminous_mons = false;
7c673cae
FG
5717 dout(10) << "start_boot - have maps " << superblock.oldest_map
5718 << ".." << superblock.newest_map << dendl;
5719 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5720 monc->get_version("osdmap", &c->newest, &c->oldest, c);
5721}
5722
5723void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5724{
5725 Mutex::Locker l(osd_lock);
5726 if (is_preboot()) {
5727 _preboot(oldest, newest);
5728 }
5729}
5730
5731void OSD::_preboot(epoch_t oldest, epoch_t newest)
5732{
5733 assert(is_preboot());
5734 dout(10) << __func__ << " _preboot mon has osdmaps "
5735 << oldest << ".." << newest << dendl;
5736
5737 // ensure our local fullness awareness is accurate
5738 heartbeat();
5739
5740 // if our map within recent history, try to add ourselves to the osdmap.
31f18b77
FG
5741 if (osdmap->get_epoch() == 0) {
5742 derr << "waiting for initial osdmap" << dendl;
c07f9fc5
FG
5743 } else if (osdmap->is_destroyed(whoami)) {
5744 derr << "osdmap says I am destroyed, exiting" << dendl;
5745 exit(0);
31f18b77 5746 } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
7c673cae
FG
5747 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5748 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5749 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5750 << dendl;
31f18b77 5751 } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
7c673cae
FG
5752 derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
5753 << dendl;
5754 } else if (!monc->monmap.get_required_features().contains_all(
5755 ceph::features::mon::FEATURE_LUMINOUS)) {
5756 derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5757 << "Luminous or later before Luminous OSDs will boot" << dendl;
224ce89b 5758 waiting_for_luminous_mons = true;
7c673cae
FG
5759 } else if (service.need_fullness_update()) {
5760 derr << "osdmap fullness state needs update" << dendl;
5761 send_full_update();
5762 } else if (osdmap->get_epoch() >= oldest - 1 &&
5763 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
5764 _send_boot();
5765 return;
5766 }
5767
5768 // get all the latest maps
5769 if (osdmap->get_epoch() + 1 >= oldest)
5770 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5771 else
5772 osdmap_subscribe(oldest - 1, true);
5773}
5774
5775void OSD::send_full_update()
5776{
5777 if (!service.need_fullness_update())
5778 return;
5779 unsigned state = 0;
5780 if (service.is_full()) {
5781 state = CEPH_OSD_FULL;
5782 } else if (service.is_backfillfull()) {
5783 state = CEPH_OSD_BACKFILLFULL;
5784 } else if (service.is_nearfull()) {
5785 state = CEPH_OSD_NEARFULL;
5786 }
5787 set<string> s;
5788 OSDMap::calc_state_set(state, s);
5789 dout(10) << __func__ << " want state " << s << dendl;
5790 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
5791}
5792
5793void OSD::start_waiting_for_healthy()
5794{
5795 dout(1) << "start_waiting_for_healthy" << dendl;
5796 set_state(STATE_WAITING_FOR_HEALTHY);
5797 last_heartbeat_resample = utime_t();
181888fb
FG
5798
5799 // subscribe to osdmap updates, in case our peers really are known to be dead
5800 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7c673cae
FG
5801}
5802
5803bool OSD::_is_healthy()
5804{
5805 if (!cct->get_heartbeat_map()->is_healthy()) {
5806 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
5807 return false;
5808 }
5809
5810 if (is_waiting_for_healthy()) {
5811 Mutex::Locker l(heartbeat_lock);
5812 utime_t cutoff = ceph_clock_now();
5813 cutoff -= cct->_conf->osd_heartbeat_grace;
5814 int num = 0, up = 0;
5815 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5816 p != heartbeat_peers.end();
5817 ++p) {
5818 if (p->second.is_healthy(cutoff))
5819 ++up;
5820 ++num;
5821 }
5822 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
5823 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
5824 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
5825 return false;
5826 }
5827 }
5828
5829 return true;
5830}
5831
5832void OSD::_send_boot()
5833{
5834 dout(10) << "_send_boot" << dendl;
5835 entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
5836 Connection *local_connection = cluster_messenger->get_loopback_connection().get();
5837 if (cluster_addr.is_blank_ip()) {
5838 int port = cluster_addr.get_port();
5839 cluster_addr = client_messenger->get_myaddr();
5840 cluster_addr.set_port(port);
5841 cluster_messenger->set_addr_unknowns(cluster_addr);
5842 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
5843 } else {
5844 Session *s = static_cast<Session*>(local_connection->get_priv());
5845 if (s)
5846 s->put();
5847 else
5848 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
5849 }
5850
5851 entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
5852 local_connection = hb_back_server_messenger->get_loopback_connection().get();
5853 if (hb_back_addr.is_blank_ip()) {
5854 int port = hb_back_addr.get_port();
5855 hb_back_addr = cluster_addr;
5856 hb_back_addr.set_port(port);
5857 hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
5858 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
5859 } else {
5860 Session *s = static_cast<Session*>(local_connection->get_priv());
5861 if (s)
5862 s->put();
5863 else
5864 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5865 }
5866
5867 entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
5868 local_connection = hb_front_server_messenger->get_loopback_connection().get();
5869 if (hb_front_addr.is_blank_ip()) {
5870 int port = hb_front_addr.get_port();
5871 hb_front_addr = client_messenger->get_myaddr();
5872 hb_front_addr.set_port(port);
5873 hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
5874 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
5875 } else {
5876 Session *s = static_cast<Session*>(local_connection->get_priv());
5877 if (s)
5878 s->put();
5879 else
5880 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
5881 }
5882
5883 MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
5884 hb_back_addr, hb_front_addr, cluster_addr,
5885 CEPH_FEATURES_ALL);
5886 dout(10) << " client_addr " << client_messenger->get_myaddr()
5887 << ", cluster_addr " << cluster_addr
5888 << ", hb_back_addr " << hb_back_addr
5889 << ", hb_front_addr " << hb_front_addr
5890 << dendl;
5891 _collect_metadata(&mboot->metadata);
5892 monc->send_mon_message(mboot);
5893 set_state(STATE_BOOTING);
5894}
5895
5896void OSD::_collect_metadata(map<string,string> *pm)
5897{
5898 // config info
5899 (*pm)["osd_data"] = dev_path;
c07f9fc5
FG
5900 if (store->get_type() == "filestore") {
5901 // not applicable for bluestore
5902 (*pm)["osd_journal"] = journal_path;
5903 }
7c673cae
FG
5904 (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
5905 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
5906 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
5907 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());
5908
5909 // backend
5910 (*pm)["osd_objectstore"] = store->get_type();
31f18b77 5911 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
d2e6a577 5912 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
c07f9fc5 5913 (*pm)["default_device_class"] = store->get_default_device_class();
7c673cae
FG
5914 store->collect_metadata(pm);
5915
5916 collect_sys_info(pm, cct);
5917
b5b8bbf5
FG
5918 std::string front_iface, back_iface;
5919 /*
5920 pick_iface(cct,
5921 CEPH_PICK_ADDRESS_PUBLIC | CEPH_PICK_ADDRESS_CLUSTER,
5922 &front_iface, &back_iface);
5923 */
5924 (*pm)["front_iface"] = pick_iface(cct,
5925 client_messenger->get_myaddr().get_sockaddr_storage());
5926 (*pm)["back_iface"] = pick_iface(cct,
5927 cluster_messenger->get_myaddr().get_sockaddr_storage());
5928
7c673cae
FG
5929 dout(10) << __func__ << " " << *pm << dendl;
5930}
5931
5932void OSD::queue_want_up_thru(epoch_t want)
5933{
5934 map_lock.get_read();
5935 epoch_t cur = osdmap->get_up_thru(whoami);
5936 Mutex::Locker l(mon_report_lock);
5937 if (want > up_thru_wanted) {
5938 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
5939 << ", currently " << cur
5940 << dendl;
5941 up_thru_wanted = want;
5942 send_alive();
5943 } else {
5944 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
5945 << ", currently " << cur
5946 << dendl;
5947 }
5948 map_lock.put_read();
5949}
5950
5951void OSD::send_alive()
5952{
5953 assert(mon_report_lock.is_locked());
5954 if (!osdmap->exists(whoami))
5955 return;
5956 epoch_t up_thru = osdmap->get_up_thru(whoami);
5957 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
5958 if (up_thru_wanted > up_thru) {
5959 dout(10) << "send_alive want " << up_thru_wanted << dendl;
5960 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
5961 }
5962}
5963
5964void OSD::request_full_map(epoch_t first, epoch_t last)
5965{
5966 dout(10) << __func__ << " " << first << ".." << last
5967 << ", previously requested "
5968 << requested_full_first << ".." << requested_full_last << dendl;
5969 assert(osd_lock.is_locked());
5970 assert(first > 0 && last > 0);
5971 assert(first <= last);
5972 assert(first >= requested_full_first); // we shouldn't ever ask for older maps
5973 if (requested_full_first == 0) {
5974 // first request
5975 requested_full_first = first;
5976 requested_full_last = last;
5977 } else if (last <= requested_full_last) {
5978 // dup
5979 return;
5980 } else {
5981 // additional request
5982 first = requested_full_last + 1;
5983 requested_full_last = last;
5984 }
5985 MMonGetOSDMap *req = new MMonGetOSDMap;
5986 req->request_full(first, last);
5987 monc->send_mon_message(req);
5988}
5989
5990void OSD::got_full_map(epoch_t e)
5991{
5992 assert(requested_full_first <= requested_full_last);
5993 assert(osd_lock.is_locked());
5994 if (requested_full_first == 0) {
5995 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
5996 return;
5997 }
5998 if (e < requested_full_first) {
5999 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6000 << ".." << requested_full_last
6001 << ", ignoring" << dendl;
6002 return;
6003 }
6004 if (e >= requested_full_last) {
6005 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6006 << ".." << requested_full_last << ", resetting" << dendl;
6007 requested_full_first = requested_full_last = 0;
6008 return;
6009 }
6010
6011 requested_full_first = e + 1;
6012
6013 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6014 << ".." << requested_full_last
6015 << ", still need more" << dendl;
6016}
6017
6018void OSD::requeue_failures()
6019{
6020 Mutex::Locker l(heartbeat_lock);
6021 unsigned old_queue = failure_queue.size();
6022 unsigned old_pending = failure_pending.size();
6023 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
6024 failure_pending.begin();
6025 p != failure_pending.end(); ) {
6026 failure_queue[p->first] = p->second.first;
6027 failure_pending.erase(p++);
6028 }
6029 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6030 << failure_queue.size() << dendl;
6031}
6032
6033void OSD::send_failures()
6034{
6035 assert(map_lock.is_locked());
6036 assert(mon_report_lock.is_locked());
6037 Mutex::Locker l(heartbeat_lock);
6038 utime_t now = ceph_clock_now();
6039 while (!failure_queue.empty()) {
6040 int osd = failure_queue.begin()->first;
7c673cae 6041 if (!failure_pending.count(osd)) {
31f18b77 6042 entity_inst_t i = osdmap->get_inst(osd);
7c673cae
FG
6043 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6044 monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
6045 osdmap->get_epoch()));
6046 failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
6047 }
6048 failure_queue.erase(osd);
6049 }
6050}
6051
6052void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
6053{
6054 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
6055 monc->send_mon_message(m);
6056}
6057
6058void OSD::send_pg_stats(const utime_t &now)
6059{
6060 assert(map_lock.is_locked());
31f18b77 6061 assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
7c673cae
FG
6062 dout(20) << "send_pg_stats" << dendl;
6063
6064 osd_stat_t cur_stat = service.get_osd_stat();
6065
6066 cur_stat.os_perf_stat = store->get_cur_stats();
6067
6068 pg_stat_queue_lock.Lock();
6069
6070 if (osd_stat_updated || !pg_stat_queue.empty()) {
6071 last_pg_stats_sent = now;
6072 osd_stat_updated = false;
6073
6074 dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;
6075
6076 utime_t had_for(now);
6077 had_for -= had_map_since;
6078
6079 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
6080
6081 uint64_t tid = ++pg_stat_tid;
6082 m->set_tid(tid);
6083 m->osd_stat = cur_stat;
6084
6085 xlist<PG*>::iterator p = pg_stat_queue.begin();
6086 while (!p.end()) {
6087 PG *pg = *p;
6088 ++p;
6089 if (!pg->is_primary()) { // we hold map_lock; role is stable.
6090 pg->stat_queue_item.remove_myself();
6091 pg->put("pg_stat_queue");
6092 continue;
6093 }
6094 pg->pg_stats_publish_lock.Lock();
6095 if (pg->pg_stats_publish_valid) {
6096 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
6097 dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6098 << pg->pg_stats_publish.reported_seq << dendl;
6099 } else {
6100 dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6101 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
6102 }
6103 pg->pg_stats_publish_lock.Unlock();
6104 }
6105
6106 if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
6107 last_pg_stats_ack = ceph_clock_now();
6108 }
6109 outstanding_pg_stats.insert(tid);
6110 dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;
6111
6112 monc->send_mon_message(m);
6113 }
6114
6115 pg_stat_queue_lock.Unlock();
6116}
6117
6118void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
6119{
6120 dout(10) << "handle_pg_stats_ack " << dendl;
6121
6122 if (!require_mon_peer(ack)) {
6123 ack->put();
6124 return;
6125 }
6126
6127 // NOTE: we may get replies from a previous mon even while
6128 // outstanding_pg_stats is empty if reconnecting races with replies
6129 // in flight.
6130
6131 pg_stat_queue_lock.Lock();
6132
6133 last_pg_stats_ack = ceph_clock_now();
6134
6135 // decay timeout slowly (analogous to TCP)
6136 stats_ack_timeout =
6137 MAX(cct->_conf->osd_mon_ack_timeout,
6138 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
6139 dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
6140
6141 if (ack->get_tid() > pg_stat_tid_flushed) {
6142 pg_stat_tid_flushed = ack->get_tid();
6143 pg_stat_queue_cond.Signal();
6144 }
6145
6146 xlist<PG*>::iterator p = pg_stat_queue.begin();
6147 while (!p.end()) {
6148 PG *pg = *p;
6149 PGRef _pg(pg);
6150 ++p;
6151
6152 auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
6153 if (acked != ack->pg_stat.end()) {
6154 pg->pg_stats_publish_lock.Lock();
6155 if (acked->second.first == pg->pg_stats_publish.reported_seq &&
6156 acked->second.second == pg->pg_stats_publish.reported_epoch) {
6157 dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6158 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6159 pg->stat_queue_item.remove_myself();
6160 pg->put("pg_stat_queue");
6161 } else {
6162 dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6163 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
6164 << acked->second << dendl;
6165 }
6166 pg->pg_stats_publish_lock.Unlock();
6167 } else {
6168 dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6169 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6170 }
6171 }
6172
6173 outstanding_pg_stats.erase(ack->get_tid());
6174 dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;
6175
6176 pg_stat_queue_lock.Unlock();
6177
6178 ack->put();
6179}
6180
6181void OSD::flush_pg_stats()
6182{
6183 dout(10) << "flush_pg_stats" << dendl;
6184 osd_lock.Unlock();
6185 utime_t now = ceph_clock_now();
6186 map_lock.get_read();
6187 mon_report_lock.Lock();
6188 send_pg_stats(now);
6189 mon_report_lock.Unlock();
6190 map_lock.put_read();
6191
6192
6193 pg_stat_queue_lock.Lock();
6194 uint64_t tid = pg_stat_tid;
6195 dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
6196 while (tid > pg_stat_tid_flushed)
6197 pg_stat_queue_cond.Wait(pg_stat_queue_lock);
6198 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
6199 pg_stat_queue_lock.Unlock();
6200
6201 osd_lock.Lock();
6202}
6203
6204void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6205{
6206 const auto& monmap = monc->monmap;
6207 // send beacon to mon even if we are just connected, and the monmap is not
6208 // initialized yet by then.
6209 if (monmap.epoch > 0 &&
6210 monmap.get_required_features().contains_all(
6211 ceph::features::mon::FEATURE_LUMINOUS)) {
6212 dout(20) << __func__ << " sending" << dendl;
7c673cae
FG
6213 MOSDBeacon* beacon = nullptr;
6214 {
6215 Mutex::Locker l{min_last_epoch_clean_lock};
6216 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6217 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
224ce89b 6218 last_sent_beacon = now;
7c673cae
FG
6219 }
6220 monc->send_mon_message(beacon);
6221 } else {
6222 dout(20) << __func__ << " not sending" << dendl;
6223 }
6224}
6225
6226void OSD::handle_command(MMonCommand *m)
6227{
6228 if (!require_mon_peer(m)) {
6229 m->put();
6230 return;
6231 }
6232
6233 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6234 command_wq.queue(c);
6235 m->put();
6236}
6237
6238void OSD::handle_command(MCommand *m)
6239{
6240 ConnectionRef con = m->get_connection();
6241 Session *session = static_cast<Session *>(con->get_priv());
6242 if (!session) {
6243 con->send_message(new MCommandReply(m, -EPERM));
6244 m->put();
6245 return;
6246 }
6247
6248 OSDCap& caps = session->caps;
6249 session->put();
6250
6251 if (!caps.allow_all() || m->get_source().is_mon()) {
6252 con->send_message(new MCommandReply(m, -EPERM));
6253 m->put();
6254 return;
6255 }
6256
6257 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6258 command_wq.queue(c);
6259
6260 m->put();
6261}
6262
6263struct OSDCommand {
6264 string cmdstring;
6265 string helpstring;
6266 string module;
6267 string perm;
6268 string availability;
6269} osd_commands[] = {
6270
6271#define COMMAND(parsesig, helptext, module, perm, availability) \
6272 {parsesig, helptext, module, perm, availability},
6273
6274// yes, these are really pg commands, but there's a limit to how
6275// much work it's worth. The OSD returns all of them. Make this
6276// form (pg <pgid> <cmd>) valid only for the cli.
6277// Rest uses "tell <pgid> <cmd>"
6278
6279COMMAND("pg " \
6280 "name=pgid,type=CephPgid " \
6281 "name=cmd,type=CephChoices,strings=query", \
6282 "show details of a specific pg", "osd", "r", "cli")
6283COMMAND("pg " \
6284 "name=pgid,type=CephPgid " \
6285 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6286 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6287 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6288 "osd", "rw", "cli")
6289COMMAND("pg " \
6290 "name=pgid,type=CephPgid " \
6291 "name=cmd,type=CephChoices,strings=list_missing " \
6292 "name=offset,type=CephString,req=false",
6293 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6294 "osd", "r", "cli")
6295
6296// new form: tell <pgid> <cmd> for both cli and rest
6297
6298COMMAND("query",
6299 "show details of a specific pg", "osd", "r", "cli,rest")
6300COMMAND("mark_unfound_lost " \
6301 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6302 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6303 "osd", "rw", "cli,rest")
6304COMMAND("list_missing " \
6305 "name=offset,type=CephString,req=false",
6306 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6307 "osd", "r", "cli,rest")
31f18b77
FG
6308COMMAND("perf histogram dump "
6309 "name=logger,type=CephString,req=false "
6310 "name=counter,type=CephString,req=false",
6311 "Get histogram data",
6312 "osd", "r", "cli,rest")
7c673cae
FG
6313
6314// tell <osd.n> commands. Validation of osd.n must be special-cased in client
6315COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
6316COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
6317COMMAND("injectargs " \
6318 "name=injected_args,type=CephString,n=N",
6319 "inject configuration arguments into running OSD",
6320 "osd", "rw", "cli,rest")
c07f9fc5
FG
6321COMMAND("config set " \
6322 "name=key,type=CephString name=value,type=CephString",
6323 "Set a configuration option at runtime (not persistent)",
6324 "osd", "rw", "cli,rest")
7c673cae
FG
6325COMMAND("cluster_log " \
6326 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6327 "name=message,type=CephString,n=N",
6328 "log a message to the cluster log",
6329 "osd", "rw", "cli,rest")
6330COMMAND("bench " \
6331 "name=count,type=CephInt,req=false " \
6332 "name=size,type=CephInt,req=false " \
6333 "name=object_size,type=CephInt,req=false " \
6334 "name=object_num,type=CephInt,req=false ", \
6335 "OSD benchmark: write <count> <size>-byte objects, " \
6336 "(default 1G size 4MB). Results in log.",
6337 "osd", "rw", "cli,rest")
6338COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
6339COMMAND("heap " \
6340 "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
6341 "show heap usage info (available only if compiled with tcmalloc)", \
6342 "osd", "rw", "cli,rest")
6343COMMAND("debug dump_missing " \
6344 "name=filename,type=CephFilepath",
6345 "dump missing objects to a named file", "osd", "r", "cli,rest")
6346COMMAND("debug kick_recovery_wq " \
6347 "name=delay,type=CephInt,range=0",
6348 "set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
6349COMMAND("cpu_profiler " \
6350 "name=arg,type=CephChoices,strings=status|flush",
6351 "run cpu profiling on daemon", "osd", "rw", "cli,rest")
6352COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6353 "osd", "r", "cli,rest")
6354COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6355 "osd", "rw", "cli,rest")
224ce89b
WB
6356COMMAND("compact",
6357 "compact object store's omap. "
6358 "WARNING: Compaction probably slows your requests",
6359 "osd", "rw", "cli,rest")
7c673cae
FG
6360};
6361
// Execute one administrative command and, when a reply connection is given,
// send back an MCommandReply.
//
//   con   connection to reply on; no reply is sent when it is null
//   tid   transaction id copied into the reply
//   cmd   command as received; parsed into a map by cmdmap_from_json()
//   data  input payload, forwarded to PG commands
//
// The "prefix" value of the parsed command selects the branch below.  Every
// branch that does not return early falls through to the "out" label, which
// turns `ss` into the status string, appends `ds` to the output data, logs
// the result and sends the reply.  Two paths in the PG branch return without
// replying (see comments there).
//
// Locals used after a `goto out` are all declared before the first goto.
6362void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6363{
6364 int r = 0;
6365 stringstream ss, ds;
6366 string rs;
6367 bufferlist odata;
6368
6369 dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6370
6371 map<string, cmd_vartype> cmdmap;
6372 string prefix;
6373 string format;
6374 string pgidstr;
6375 boost::scoped_ptr<Formatter> f;
6376
// NOTE: r is still 0 on this path, so an empty command is reported with a
// zero return code and only the message text.
6377 if (cmd.empty()) {
6378 ss << "no command given";
6379 goto out;
6380 }
6381
6382 if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6383 r = -EINVAL;
6384 goto out;
6385 }
6386
6387 cmd_getval(cct, cmdmap, "prefix", prefix);
6388
// Dump every entry of osd_commands as JSON, in sections named cmd000,
// cmd001, ...  Uses its own JSONFormatter, which shadows the outer `f`.
6389 if (prefix == "get_command_descriptions") {
6390 int cmdnum = 0;
6391 JSONFormatter *f = new JSONFormatter();
6392 f->open_object_section("command_descriptions");
6393 for (OSDCommand *cp = osd_commands;
6394 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {
6395
6396 ostringstream secname;
6397 secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
6398 dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
6399 cp->module, cp->perm, cp->availability, 0);
6400 cmdnum++;
6401 }
6402 f->close_section(); // command_descriptions
6403
6404 f->flush(ds);
6405 delete f;
6406 goto out;
6407 }
6408
// From here on `f` is whatever Formatter::create() returns for the requested
// format; several branches below test it for null before using it.
6409 cmd_getval(cct, cmdmap, "format", format);
6410 f.reset(Formatter::create(format));
6411
6412 if (prefix == "version") {
6413 if (f) {
6414 f->open_object_section("version");
6415 f->dump_string("version", pretty_version_to_str());
6416 f->close_section();
6417 f->flush(ds);
6418 } else {
6419 ds << pretty_version_to_str();
6420 }
6421 goto out;
6422 }
6423 else if (prefix == "injectargs") {
6424 vector<string> argsvec;
6425 cmd_getval(cct, cmdmap, "injected_args", argsvec);
6426
6427 if (argsvec.empty()) {
6428 r = -EINVAL;
6429 ss << "ignoring empty injectargs";
6430 goto out;
6431 }
// Join the arguments with single spaces.
6432 string args = argsvec.front();
6433 for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
6434 args += " " + *a;
// osd_lock is dropped while the config is changed and retaken afterwards.
6435 osd_lock.Unlock();
6436 r = cct->_conf->injectargs(args, &ss);
6437 osd_lock.Lock();
6438 }
c07f9fc5
FG
6439 else if (prefix == "config set") {
6440 std::string key;
6441 std::string val;
6442 cmd_getval(cct, cmdmap, "key", key);
6443 cmd_getval(cct, cmdmap, "value", val);
// As for injectargs: osd_lock is released around the config update.
6444 osd_lock.Unlock();
6445 r = cct->_conf->set_val(key, val, true, &ss);
d2e6a577
FG
6446 if (r == 0) {
6447 cct->_conf->apply_changes(nullptr);
6448 }
c07f9fc5
FG
6449 osd_lock.Lock();
6450 }
7c673cae
FG
6451 else if (prefix == "cluster_log") {
6452 vector<string> msg;
6453 cmd_getval(cct, cmdmap, "message", msg);
6454 if (msg.empty()) {
6455 r = -EINVAL;
6456 ss << "ignoring empty log message";
6457 goto out;
6458 }
6459 string message = msg.front();
6460 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
6461 message += " " + *a;
6462 string lvl;
6463 cmd_getval(cct, cmdmap, "level", lvl);
6464 clog_type level = string_to_clog_type(lvl);
6465 if (level < 0) {
6466 r = -EINVAL;
6467 ss << "unknown level '" << lvl << "'";
6468 goto out;
6469 }
6470 clog->do_log(level, message);
6471 }
6472
6473 // either 'pg <pgid> <command>' or
6474 // 'tell <pgid>' (which comes in without any of that prefix)?
6475
6476 else if (prefix == "pg" ||
6477 prefix == "query" ||
6478 prefix == "mark_unfound_lost" ||
6479 prefix == "list_missing"
6480 ) {
6481 pg_t pgid;
6482
6483 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
6484 ss << "no pgid specified";
6485 r = -EINVAL;
6486 } else if (!pgid.parse(pgidstr.c_str())) {
6487 ss << "couldn't parse pgid '" << pgidstr << "'";
6488 r = -EINVAL;
6489 } else {
6490 spg_t pcand;
6491 PG *pg = nullptr;
// _lookup_lock_pg() hands back the PG locked; every path below unlocks it.
6492 if (osdmap->get_primary_shard(pgid, &pcand) &&
6493 (pg = _lookup_lock_pg(pcand))) {
6494 if (pg->is_primary()) {
6495 // simulate pg <pgid> cmd= for pg->do-command
6496 if (prefix != "pg")
6497 cmd_putval(cct, cmdmap, "cmd", prefix);
6498 r = pg->do_command(cmdmap, ss, data, odata, con, tid);
6499 if (r == -EAGAIN) {
6500 pg->unlock();
6501 // don't reply, pg will do so async
6502 return;
6503 }
6504 } else {
6505 ss << "not primary for pgid " << pgid;
6506
6507 // send them the latest diff to ensure they realize the mapping
6508 // has changed.
6509 service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);
6510
6511 // do not reply; they will get newer maps and realize they
6512 // need to resend.
6513 pg->unlock();
6514 return;
6515 }
6516 pg->unlock();
6517 } else {
6518 ss << "i don't have pgid " << pgid;
6519 r = -ENOENT;
6520 }
6521 }
6522 }
6523
// bench: write `count` bytes in `bsize`-byte blocks to the object store,
// time it, then remove the objects that were created.
6524 else if (prefix == "bench") {
6525 int64_t count;
6526 int64_t bsize;
6527 int64_t osize, onum;
6528 // default count 1G, size 4MB
6529 cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
6530 cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
6531 cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
6532 cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
6533
6534 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
6535 ObjectStore::Sequencer>("bench"));
6536
6537 uint32_t duration = cct->_conf->osd_bench_duration;
6538
6539 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
6540 // let us limit the block size because the next checks rely on it
6541 // having a sane value. If we allow any block size to be set things
6542 // can still go sideways.
6543 ss << "block 'size' values are capped at "
6544 << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
6545 << " a higher value, please adjust 'osd_bench_max_block_size'";
6546 r = -EINVAL;
6547 goto out;
6548 } else if (bsize < (int64_t) (1 << 20)) {
6549 // entering the realm of small block sizes.
6550 // limit the count to a sane value, assuming a configurable amount of
6551 // IOPS and duration, so that the OSD doesn't get hung up on this,
6552 // preventing timeouts from going off
6553 int64_t max_count =
6554 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
6555 if (count > max_count) {
6556 ss << "'count' values greater than " << max_count
6557 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6558 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
6559 << " for " << duration << " seconds,"
6560 << " can cause ill effects on osd. "
6561 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6562 << " value if you wish to use a higher 'count'.";
6563 r = -EINVAL;
6564 goto out;
6565 }
6566 } else {
6567 // 1MB block sizes are big enough so that we get more stuff done.
6568 // However, to avoid the osd from getting hung on this and having
6569 // timers being triggered, we are going to limit the count assuming
6570 // a configurable throughput and duration.
6571 // NOTE: max_count is the total amount of bytes that we believe we
6572 // will be able to write during 'duration' for the given
6573 // throughput. The block size hardly impacts this unless it's
6574 // way too big. Given we already check how big the block size
6575 // is, it's safe to assume everything will check out.
6576 int64_t max_count =
6577 cct->_conf->osd_bench_large_size_max_throughput * duration;
6578 if (count > max_count) {
6579 ss << "'count' values greater than " << max_count
6580 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6581 << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
6582 << " for " << duration << " seconds,"
6583 << " can cause ill effects on osd. "
6584 << " Please adjust 'osd_bench_large_size_max_throughput'"
6585 << " with a higher value if you wish to use a higher 'count'.";
6586 r = -EINVAL;
6587 goto out;
6588 }
6589 }
6590
// A block never exceeds the object size when one was given.
6591 if (osize && bsize > osize)
6592 bsize = osize;
6593
6594 dout(1) << " bench count " << count
6595 << " bsize " << prettybyte_t(bsize) << dendl;
6596
// Removals are collected here and queued once, after the timed phase.
6597 ObjectStore::Transaction cleanupt;
6598
// With both object_size and object_num given: pre-create `onum` zero-filled
// objects of `osize` bytes that the timed loop then overwrites at random.
6599 if (osize && onum) {
6600 bufferlist bl;
6601 bufferptr bp(osize);
6602 bp.zero();
6603 bl.push_back(std::move(bp));
6604 bl.rebuild_page_aligned();
6605 for (int i=0; i<onum; ++i) {
6606 char nm[30];
6607 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
6608 object_t oid(nm);
6609 hobject_t soid(sobject_t(oid, 0));
6610 ObjectStore::Transaction t;
6611 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
6612 store->queue_transaction(osr.get(), std::move(t), NULL);
6613 cleanupt.remove(coll_t(), ghobject_t(soid));
6614 }
6615 }
6616
6617 bufferlist bl;
6618 bufferptr bp(bsize);
6619 bp.zero();
6620 bl.push_back(std::move(bp));
6621 bl.rebuild_page_aligned();
6622
// Wait for the setup writes to commit so they are not part of the timing.
6623 {
6624 C_SaferCond waiter;
6625 if (!osr->flush_commit(&waiter)) {
6626 waiter.wait();
6627 }
6628 }
6629
6630 utime_t start = ceph_clock_now();
6631 for (int64_t pos = 0; pos < count; pos += bsize) {
6632 char nm[30];
6633 unsigned offset = 0;
6634 if (onum && osize) {
6635 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
6636 offset = rand() % (osize / bsize) * bsize;
6637 } else {
6638 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
6639 }
6640 object_t oid(nm);
6641 hobject_t soid(sobject_t(oid, 0));
6642 ObjectStore::Transaction t;
6643 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
6644 store->queue_transaction(osr.get(), std::move(t), NULL);
6645 if (!onum || !osize)
6646 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
6647 }
6648
6649 {
6650 C_SaferCond waiter;
6651 if (!osr->flush_commit(&waiter)) {
6652 waiter.wait();
6653 }
6654 }
6655 utime_t end = ceph_clock_now();
6656
6657 // clean up
6658 store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
6659 {
6660 C_SaferCond waiter;
6661 if (!osr->flush_commit(&waiter)) {
6662 waiter.wait();
6663 }
6664 }
6665
// Note: the formatted result is flushed into `ss` (the status string), not
// into `ds`.
6666 uint64_t rate = (double)count / (end - start);
6667 if (f) {
6668 f->open_object_section("osd_bench_results");
6669 f->dump_int("bytes_written", count);
6670 f->dump_int("blocksize", bsize);
6671 f->dump_unsigned("bytes_per_sec", rate);
6672 f->close_section();
6673 f->flush(ss);
6674 } else {
6675 ss << "bench: wrote " << prettybyte_t(count)
6676 << " in blocks of " << prettybyte_t(bsize) << " in "
6677 << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
6678 }
6679 }
6680
6681 else if (prefix == "flush_pg_stats") {
31f18b77
FG
6682 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6683 mgrc.send_pgstats();
6684 ds << service.get_osd_stat_seq() << "\n";
6685 } else {
6686 flush_pg_stats();
6687 }
7c673cae
FG
6688 }
6689
// NOTE(review): `f` is dereferenced here without the null check the other
// branches use -- confirm Formatter::create() cannot return null for the
// formats that reach this branch, or that heap() never touches it.
6690 else if (prefix == "heap") {
6691 r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
6692 }
6693
// Write, for every PG in pg_map, the PG itself and each item of its missing
// set (plus known locations) to the named file.
6694 else if (prefix == "debug dump_missing") {
6695 string file_name;
6696 cmd_getval(cct, cmdmap, "filename", file_name);
6697 std::ofstream fout(file_name.c_str());
6698 if (!fout.is_open()) {
6699 ss << "failed to open file '" << file_name << "'";
6700 r = -EINVAL;
6701 goto out;
6702 }
6703
6704 fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
6705 RWLock::RLocker l(pg_map_lock);
6706 for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
6707 pg_map_e != pg_map.end(); ++pg_map_e) {
6708 PG *pg = pg_map_e->second;
6709 pg->lock();
6710
6711 fout << *pg << std::endl;
6712 std::map<hobject_t, pg_missing_item>::const_iterator mend =
6713 pg->pg_log.get_missing().get_items().end();
6714 std::map<hobject_t, pg_missing_item>::const_iterator mi =
6715 pg->pg_log.get_missing().get_items().begin();
6716 for (; mi != mend; ++mi) {
6717 fout << mi->first << " -> " << mi->second << std::endl;
6718 if (!pg->missing_loc.needs_recovery(mi->first))
6719 continue;
6720 if (pg->missing_loc.is_unfound(mi->first))
6721 fout << " unfound ";
6722 const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
6723 if (mls.empty())
6724 continue;
6725 fout << "missing_loc: " << mls << std::endl;
6726 }
6727 pg->unlock();
6728 fout << std::endl;
6729 }
6730
6731 fout.close();
6732 }
6733 else if (prefix == "debug kick_recovery_wq") {
6734 int64_t delay;
6735 cmd_getval(cct, cmdmap, "delay", delay);
6736 ostringstream oss;
6737 oss << delay;
6738 r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
6739 if (r != 0) {
6740 ss << "kick_recovery_wq: error setting "
6741 << "osd_recovery_delay_start to '" << delay << "': error "
6742 << r;
6743 goto out;
6744 }
6745 cct->_conf->apply_changes(NULL);
6746 ss << "kicking recovery queue. set osd_recovery_delay_start "
6747 << "to " << cct->_conf->osd_recovery_delay_start;
6748 }
6749
6750 else if (prefix == "cpu_profiler") {
6751 string arg;
6752 cmd_getval(cct, cmdmap, "arg", arg);
6753 vector<string> argvec;
6754 get_str_vec(arg, argvec);
6755 cpu_profiler_handle_command(argvec, ds);
6756 }
6757
6758 else if (prefix == "dump_pg_recovery_stats") {
6759 stringstream s;
6760 if (f) {
6761 pg_recovery_stats.dump_formatted(f.get());
6762 f->flush(ds);
6763 } else {
6764 pg_recovery_stats.dump(s);
6765 ds << "dump pg recovery stats: " << s.str();
6766 }
6767 }
6768
6769 else if (prefix == "reset_pg_recovery_stats") {
6770 ss << "reset pg recovery stats";
6771 pg_recovery_stats.reset();
6772 }
6773
31f18b77
FG
// Produces output only when a formatter is available.
6774 else if (prefix == "perf histogram dump") {
6775 std::string logger;
6776 std::string counter;
6777 cmd_getval(cct, cmdmap, "logger", logger);
6778 cmd_getval(cct, cmdmap, "counter", counter);
6779 if (f) {
6780 cct->get_perfcounters_collection()->dump_formatted_histograms(
6781 f.get(), false, logger, counter);
6782 f->flush(ds);
6783 }
6784 }
6785
224ce89b
WB
6786 else if (prefix == "compact") {
6787 dout(1) << "triggering manual compaction" << dendl;
6788 auto start = ceph::coarse_mono_clock::now();
6789 store->compact();
6790 auto end = ceph::coarse_mono_clock::now();
6791 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
6792 dout(1) << "finished manual compaction in "
6793 << time_span.count()
6794 << " seconds" << dendl;
6795 ss << "compacted omap in " << time_span.count() << " seconds";
6796 }
6797
7c673cae
FG
6798 else {
6799 ss << "unrecognized command! " << cmd;
6800 r = -EINVAL;
6801 }
6802
// Common exit: status text comes from `ss`, payload from `ds` (appended to
// whatever a PG command already put into odata).
6803 out:
6804 rs = ss.str();
6805 odata.append(ds);
6806 dout(0) << "do_command r=" << r << " " << rs << dendl;
6807 clog->info() << rs;
6808 if (con) {
6809 MCommandReply *reply = new MCommandReply(r, rs);
6810 reply->set_tid(tid);
6811 reply->set_data(odata);
6812 con->send_message(reply);
6813 }
6814}
6815
6816bool OSD::heartbeat_dispatch(Message *m)
6817{
6818 dout(30) << "heartbeat_dispatch " << m << dendl;
6819 switch (m->get_type()) {
6820
6821 case CEPH_MSG_PING:
6822 dout(10) << "ping from " << m->get_source_inst() << dendl;
6823 m->put();
6824 break;
6825
6826 case MSG_OSD_PING:
6827 handle_osd_ping(static_cast<MOSDPing*>(m));
6828 break;
6829
6830 default:
6831 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6832 m->put();
6833 }
6834
6835 return true;
6836}
6837
6838bool OSD::ms_dispatch(Message *m)
6839{
6840 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6841 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6842 service.got_stop_ack();
6843 m->put();
6844 return true;
6845 }
6846
6847 // lock!
6848
6849 osd_lock.Lock();
6850 if (is_stopping()) {
6851 osd_lock.Unlock();
6852 m->put();
6853 return true;
6854 }
6855
6856 do_waiters();
6857 _dispatch(m);
6858
6859 osd_lock.Unlock();
6860
6861 return true;
6862}
6863
6864void OSD::maybe_share_map(
6865 Session *session,
6866 OpRequestRef op,
6867 OSDMapRef osdmap)
6868{
6869 if (!op->check_send_map) {
6870 return;
6871 }
6872 epoch_t last_sent_epoch = 0;
6873
6874 session->sent_epoch_lock.lock();
6875 last_sent_epoch = session->last_sent_epoch;
6876 session->sent_epoch_lock.unlock();
6877
6878 const Message *m = op->get_req();
6879 service.share_map(
6880 m->get_source(),
6881 m->get_connection().get(),
6882 op->sent_epoch,
6883 osdmap,
6884 session ? &last_sent_epoch : NULL);
6885
6886 session->sent_epoch_lock.lock();
6887 if (session->last_sent_epoch < last_sent_epoch) {
6888 session->last_sent_epoch = last_sent_epoch;
6889 }
6890 session->sent_epoch_lock.unlock();
6891
6892 op->check_send_map = false;
6893}
6894
6895void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
6896{
6897 assert(session->session_dispatch_lock.is_locked());
6898
6899 auto i = session->waiting_on_map.begin();
6900 while (i != session->waiting_on_map.end()) {
6901 OpRequestRef op = &(*i);
6902 assert(ms_can_fast_dispatch(op->get_req()));
6903 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
6904 op->get_req());
6905 if (m->get_min_epoch() > osdmap->get_epoch()) {
6906 break;
6907 }
6908 session->waiting_on_map.erase(i++);
6909 op->put();
6910
6911 spg_t pgid;
6912 if (m->get_type() == CEPH_MSG_OSD_OP) {
6913 pg_t actual_pgid = osdmap->raw_pg_to_pg(
6914 static_cast<const MOSDOp*>(m)->get_pg());
6915 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
6916 continue;
6917 }
6918 } else {
6919 pgid = m->get_spg();
6920 }
6921 enqueue_op(pgid, op, m->get_map_epoch());
6922 }
6923
6924 if (session->waiting_on_map.empty()) {
6925 clear_session_waiting_on_map(session);
6926 } else {
6927 register_session_waiting_on_map(session);
6928 }
6929}
6930
6931void OSD::ms_fast_dispatch(Message *m)
6932{
6933 FUNCTRACE();
6934 if (service.is_stopping()) {
6935 m->put();
6936 return;
6937 }
6938 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
6939 {
6940#ifdef WITH_LTTNG
6941 osd_reqid_t reqid = op->get_reqid();
6942#endif
6943 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
6944 reqid.name._num, reqid.tid, reqid.inc);
6945 }
6946
6947 if (m->trace)
6948 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
6949
6950 // note sender epoch, min req'd epoch
6951 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
6952 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
6953 assert(op->min_epoch <= op->sent_epoch); // sanity check!
6954
6955 service.maybe_inject_dispatch_delay();
6956
6957 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
6958 m->get_type() != CEPH_MSG_OSD_OP) {
6959 // queue it directly
6960 enqueue_op(
6961 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
6962 op,
6963 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
6964 } else {
6965 // legacy client, and this is an MOSDOp (the *only* fast dispatch
6966 // message that didn't have an explicit spg_t); we need to map
6967 // them to an spg_t while preserving delivery order.
6968 Session *session = static_cast<Session*>(m->get_connection()->get_priv());
6969 if (session) {
6970 {
6971 Mutex::Locker l(session->session_dispatch_lock);
6972 op->get();
6973 session->waiting_on_map.push_back(*op);
6974 OSDMapRef nextmap = service.get_nextmap_reserved();
6975 dispatch_session_waiting(session, nextmap);
6976 service.release_map(nextmap);
6977 }
6978 session->put();
6979 }
6980 }
6981 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
6982}
6983
6984void OSD::ms_fast_preprocess(Message *m)
6985{
6986 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
6987 if (m->get_type() == CEPH_MSG_OSD_MAP) {
6988 MOSDMap *mm = static_cast<MOSDMap*>(m);
6989 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
6990 if (s) {
6991 s->received_map_lock.lock();
6992 s->received_map_epoch = mm->get_last();
6993 s->received_map_lock.unlock();
6994 s->put();
6995 }
6996 }
6997 }
6998}
6999
7000bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
7001{
7002 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7003
31f18b77
FG
7004 if (is_stopping()) {
7005 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7006 return false;
7007 }
7008
7c673cae
FG
7009 if (dest_type == CEPH_ENTITY_TYPE_MON)
7010 return true;
7011
7012 if (force_new) {
7013 /* the MonClient checks keys every tick(), so we should just wait for that cycle
7014 to get through */
7015 if (monc->wait_auth_rotating(10) < 0) {
7016 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
7017 return false;
7018 }
7019 }
7020
7021 *authorizer = monc->build_authorizer(dest_type);
7022 return *authorizer != NULL;
7023}
7024
7025
7026bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
7027 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
7028 bool& isvalid, CryptoKey& session_key)
7029{
7030 AuthAuthorizeHandler *authorize_handler = 0;
7031 switch (peer_type) {
7032 case CEPH_ENTITY_TYPE_MDS:
7033 /*
7034 * note: mds is technically a client from our perspective, but
7035 * this makes the 'cluster' consistent w/ monitor's usage.
7036 */
7037 case CEPH_ENTITY_TYPE_OSD:
7038 case CEPH_ENTITY_TYPE_MGR:
7039 authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
7040 break;
7041 default:
7042 authorize_handler = authorize_handler_service_registry->get_handler(protocol);
7043 }
7044 if (!authorize_handler) {
7045 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
7046 isvalid = false;
7047 return true;
7048 }
7049
7050 AuthCapsInfo caps_info;
7051 EntityName name;
7052 uint64_t global_id;
7053 uint64_t auid = CEPH_AUTH_UID_DEFAULT;
7054
c07f9fc5
FG
7055 RotatingKeyRing *keys = monc->rotating_secrets.get();
7056 if (keys) {
7057 isvalid = authorize_handler->verify_authorizer(
7058 cct, keys,
7059 authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
7060 &auid);
7061 } else {
7062 dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
7063 isvalid = false;
7064 }
7c673cae
FG
7065
7066 if (isvalid) {
7067 Session *s = static_cast<Session *>(con->get_priv());
7068 if (!s) {
7069 s = new Session(cct);
7070 con->set_priv(s->get());
7071 s->con = con;
7072 dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
7073 }
7074
7075 s->entity_name = name;
7076 if (caps_info.allow_all)
7077 s->caps.set_allow_all();
7078 s->auid = auid;
7079
7080 if (caps_info.caps.length() > 0) {
7081 bufferlist::iterator p = caps_info.caps.begin();
7082 string str;
7083 try {
7084 ::decode(str, p);
7085 }
7086 catch (buffer::error& e) {
7087 }
7088 bool success = s->caps.parse(str);
7089 if (success)
7090 dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
7091 else
7092 dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
7093 }
7094
7095 s->put();
7096 }
7097 return true;
7098}
7099
7100void OSD::do_waiters()
7101{
7102 assert(osd_lock.is_locked());
7103
7104 dout(10) << "do_waiters -- start" << dendl;
7105 while (!finished.empty()) {
7106 OpRequestRef next = finished.front();
7107 finished.pop_front();
7108 dispatch_op(next);
7109 }
7110 dout(10) << "do_waiters -- finish" << dendl;
7111}
7112
7113void OSD::dispatch_op(OpRequestRef op)
7114{
7115 switch (op->get_req()->get_type()) {
7116
7117 case MSG_OSD_PG_CREATE:
7118 handle_pg_create(op);
7119 break;
7120 case MSG_OSD_PG_NOTIFY:
7121 handle_pg_notify(op);
7122 break;
7123 case MSG_OSD_PG_QUERY:
7124 handle_pg_query(op);
7125 break;
7126 case MSG_OSD_PG_LOG:
7127 handle_pg_log(op);
7128 break;
7129 case MSG_OSD_PG_REMOVE:
7130 handle_pg_remove(op);
7131 break;
7132 case MSG_OSD_PG_INFO:
7133 handle_pg_info(op);
7134 break;
7135 case MSG_OSD_PG_TRIM:
7136 handle_pg_trim(op);
7137 break;
7138 case MSG_OSD_BACKFILL_RESERVE:
7139 handle_pg_backfill_reserve(op);
7140 break;
7141 case MSG_OSD_RECOVERY_RESERVE:
7142 handle_pg_recovery_reserve(op);
7143 break;
7144 }
7145}
7146
7147void OSD::_dispatch(Message *m)
7148{
7149 assert(osd_lock.is_locked());
7150 dout(20) << "_dispatch " << m << " " << *m << dendl;
7151
7152 switch (m->get_type()) {
7153
7154 // -- don't need lock --
7155 case CEPH_MSG_PING:
7156 dout(10) << "ping from " << m->get_source() << dendl;
7157 m->put();
7158 break;
7159
7160 // -- don't need OSDMap --
7161
7162 // map and replication
7163 case CEPH_MSG_OSD_MAP:
7164 handle_osd_map(static_cast<MOSDMap*>(m));
7165 break;
7166
7167 // osd
7168 case MSG_PGSTATSACK:
7169 handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
7170 break;
7171
7172 case MSG_MON_COMMAND:
7173 handle_command(static_cast<MMonCommand*>(m));
7174 break;
7175 case MSG_COMMAND:
7176 handle_command(static_cast<MCommand*>(m));
7177 break;
7178
7179 case MSG_OSD_SCRUB:
7180 handle_scrub(static_cast<MOSDScrub*>(m));
7181 break;
7182
c07f9fc5
FG
7183 case MSG_OSD_FORCE_RECOVERY:
7184 handle_force_recovery(m);
7185 break;
7186
7c673cae
FG
7187 // -- need OSDMap --
7188
7189 case MSG_OSD_PG_CREATE:
7190 case MSG_OSD_PG_NOTIFY:
7191 case MSG_OSD_PG_QUERY:
7192 case MSG_OSD_PG_LOG:
7193 case MSG_OSD_PG_REMOVE:
7194 case MSG_OSD_PG_INFO:
7195 case MSG_OSD_PG_TRIM:
7196 case MSG_OSD_BACKFILL_RESERVE:
7197 case MSG_OSD_RECOVERY_RESERVE:
7198 {
7199 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7200 if (m->trace)
7201 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7202 // no map? starting up?
7203 if (!osdmap) {
7204 dout(7) << "no OSDMap, not booted" << dendl;
7205 logger->inc(l_osd_waiting_for_map);
7206 waiting_for_osdmap.push_back(op);
7207 op->mark_delayed("no osdmap");
7208 break;
7209 }
7210
7211 // need OSDMap
7212 dispatch_op(op);
7213 }
7214 }
7215}
7216
7217void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7218{
7219 pg->lock();
7220 if (pg->is_primary()) {
7221 pg->unreg_next_scrub();
7222 pg->scrubber.must_scrub = true;
7223 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7224 pg->scrubber.must_repair = m->repair;
7225 pg->reg_next_scrub();
7226 dout(10) << "marking " << *pg << " for scrub" << dendl;
7227 }
7228 pg->unlock();
7229}
7230
7231void OSD::handle_scrub(MOSDScrub *m)
7232{
7233 dout(10) << "handle_scrub " << *m << dendl;
7234 if (!require_mon_or_mgr_peer(m)) {
7235 m->put();
7236 return;
7237 }
7238 if (m->fsid != monc->get_fsid()) {
7239 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7240 m->put();
7241 return;
7242 }
7243
7244 RWLock::RLocker l(pg_map_lock);
7245 if (m->scrub_pgs.empty()) {
7246 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7247 p != pg_map.end();
7248 ++p)
7249 handle_pg_scrub(m, p->second);
7250 } else {
7251 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7252 p != m->scrub_pgs.end();
7253 ++p) {
7254 spg_t pcand;
7255 if (osdmap->get_primary_shard(*p, &pcand)) {
7256 auto pg_map_entry = pg_map.find(pcand);
7257 if (pg_map_entry != pg_map.end()) {
7258 handle_pg_scrub(m, pg_map_entry->second);
7259 }
7260 }
7261 }
7262 }
7263
7264 m->put();
7265}
7266
7267bool OSD::scrub_random_backoff()
7268{
7269 bool coin_flip = (rand() / (double)RAND_MAX >=
7270 cct->_conf->osd_scrub_backoff_ratio);
7271 if (!coin_flip) {
7272 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
7273 return true;
7274 }
7275 return false;
7276}
7277
7278OSDService::ScrubJob::ScrubJob(CephContext* cct,
7279 const spg_t& pg, const utime_t& timestamp,
7280 double pool_scrub_min_interval,
7281 double pool_scrub_max_interval, bool must)
7282 : cct(cct),
7283 pgid(pg),
7284 sched_time(timestamp),
7285 deadline(timestamp)
7286{
7287 // if not explicitly requested, postpone the scrub with a random delay
7288 if (!must) {
7289 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7290 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7291 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7292 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7293
7294 sched_time += scrub_min_interval;
7295 double r = rand() / (double)RAND_MAX;
7296 sched_time +=
7297 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7298 deadline += scrub_max_interval;
7299 }
7300}
7301
7302bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7303 if (sched_time < rhs.sched_time)
7304 return true;
7305 if (sched_time > rhs.sched_time)
7306 return false;
7307 return pgid < rhs.pgid;
7308}
7309
7310bool OSD::scrub_time_permit(utime_t now)
7311{
7312 struct tm bdt;
7313 time_t tt = now.sec();
7314 localtime_r(&tt, &bdt);
7315 bool time_permit = false;
7316 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7317 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7318 time_permit = true;
7319 }
7320 } else {
7321 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7322 time_permit = true;
7323 }
7324 }
7325 if (!time_permit) {
7326 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7327 << " - " << cct->_conf->osd_scrub_end_hour
7328 << " now " << bdt.tm_hour << " = no" << dendl;
7329 } else {
7330 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7331 << " - " << cct->_conf->osd_scrub_end_hour
7332 << " now " << bdt.tm_hour << " = yes" << dendl;
7333 }
7334 return time_permit;
7335}
7336
7337bool OSD::scrub_load_below_threshold()
7338{
7339 double loadavgs[3];
7340 if (getloadavg(loadavgs, 3) != 3) {
7341 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7342 return false;
7343 }
7344
7345 // allow scrub if below configured threshold
7346 if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
7347 dout(20) << __func__ << " loadavg " << loadavgs[0]
7348 << " < max " << cct->_conf->osd_scrub_load_threshold
7349 << " = yes" << dendl;
7350 return true;
7351 }
7352
7353 // allow scrub if below daily avg and currently decreasing
7354 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7355 dout(20) << __func__ << " loadavg " << loadavgs[0]
7356 << " < daily_loadavg " << daily_loadavg
7357 << " and < 15m avg " << loadavgs[2]
7358 << " = yes" << dendl;
7359 return true;
7360 }
7361
7362 dout(20) << __func__ << " loadavg " << loadavgs[0]
7363 << " >= max " << cct->_conf->osd_scrub_load_threshold
7364 << " and ( >= daily_loadavg " << daily_loadavg
7365 << " or >= 15m avg " << loadavgs[2]
7366 << ") = no" << dendl;
7367 return false;
7368}
7369
7370void OSD::sched_scrub()
7371{
7372 // if not permitted, fail fast
7373 if (!service.can_inc_scrubs_pending()) {
7374 return;
7375 }
b5b8bbf5
FG
7376 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7377 dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
7378 return;
7379 }
7380
7c673cae
FG
7381
7382 utime_t now = ceph_clock_now();
7383 bool time_permit = scrub_time_permit(now);
7384 bool load_is_low = scrub_load_below_threshold();
7385 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7386
7387 OSDService::ScrubJob scrub;
7388 if (service.first_scrub_stamp(&scrub)) {
7389 do {
7390 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7391
7392 if (scrub.sched_time > now) {
7393 // save ourselves some effort
7394 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7395 << " > " << now << dendl;
7396 break;
7397 }
7398
7c673cae
FG
7399 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7400 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7401 << (!time_permit ? "time not permit" : "high load") << dendl;
7402 continue;
7403 }
7404
7405 PG *pg = _lookup_lock_pg(scrub.pgid);
7406 if (!pg)
7407 continue;
7408 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7409 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7410 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7411 (load_is_low ? ", load_is_low" : " deadline < now"))
7412 << dendl;
7413 if (pg->sched_scrub()) {
7414 pg->unlock();
7415 break;
7416 }
7417 }
7418 pg->unlock();
7419 } while (service.next_scrub_stamp(scrub, &scrub));
7420 }
7421 dout(20) << "sched_scrub done" << dendl;
7422}
7423
7424
7425
7426// =====================================================
7427// MAP
7428
7429void OSD::wait_for_new_map(OpRequestRef op)
7430{
7431 // ask?
7432 if (waiting_for_osdmap.empty()) {
7433 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7434 }
7435
7436 logger->inc(l_osd_waiting_for_map);
7437 waiting_for_osdmap.push_back(op);
7438 op->mark_delayed("wait for new map");
7439}
7440
7441
7442/** update_map
7443 * assimilate new OSDMap(s). scan pgs, etc.
7444 */
7445
7446void OSD::note_down_osd(int peer)
7447{
7448 assert(osd_lock.is_locked());
7449 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7450
7451 heartbeat_lock.Lock();
7452 failure_queue.erase(peer);
7453 failure_pending.erase(peer);
7454 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7455 if (p != heartbeat_peers.end()) {
7456 p->second.con_back->mark_down();
7457 if (p->second.con_front) {
7458 p->second.con_front->mark_down();
7459 }
7460 heartbeat_peers.erase(p);
7461 }
7462 heartbeat_lock.Unlock();
7463}
7464
7465void OSD::note_up_osd(int peer)
7466{
7467 service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7468 heartbeat_set_peers_need_update();
7469}
7470
7471struct C_OnMapCommit : public Context {
7472 OSD *osd;
7473 epoch_t first, last;
7474 MOSDMap *msg;
7475 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7476 : osd(o), first(f), last(l), msg(m) {}
7477 void finish(int r) override {
7478 osd->_committed_osd_maps(first, last, msg);
7479 msg->put();
7480 }
7481};
7482
7483struct C_OnMapApply : public Context {
7484 OSDService *service;
7485 list<OSDMapRef> pinned_maps;
7486 epoch_t e;
7487 C_OnMapApply(OSDService *service,
7488 const list<OSDMapRef> &pinned_maps,
7489 epoch_t e)
7490 : service(service), pinned_maps(pinned_maps), e(e) {}
7491 void finish(int r) override {
7492 service->clear_map_bl_cache_pins(e);
7493 }
7494};
7495
7496void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7497{
181888fb
FG
7498 Mutex::Locker l(osdmap_subscribe_lock);
7499 if (latest_subscribed_epoch >= epoch && !force_request)
7c673cae
FG
7500 return;
7501
181888fb
FG
7502 latest_subscribed_epoch = MAX(epoch, latest_subscribed_epoch);
7503
7c673cae
FG
7504 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7505 force_request) {
7506 monc->renew_subs();
7507 }
7508}
7509
7510void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7511{
7512 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7513 if (min <= superblock.oldest_map)
7514 return;
7515
7516 int num = 0;
7517 ObjectStore::Transaction t;
7518 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7519 dout(20) << " removing old osdmap epoch " << e << dendl;
7520 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7521 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7522 superblock.oldest_map = e + 1;
7523 num++;
7524 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7525 service.publish_superblock(superblock);
7526 write_superblock(t);
7527 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7528 assert(tr == 0);
7529 num = 0;
7530 if (!skip_maps) {
7531 // skip_maps leaves us with a range of old maps if we fail to remove all
7532 // of them before moving superblock.oldest_map forward to the first map
7533 // in the incoming MOSDMap msg. so we should continue removing them in
7534 // this case, even we could do huge series of delete transactions all at
7535 // once.
7536 break;
7537 }
7538 }
7539 }
7540 if (num > 0) {
7541 service.publish_superblock(superblock);
7542 write_superblock(t);
224ce89b
WB
7543 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7544 assert(tr == 0);
7c673cae
FG
7545 }
7546 // we should not remove the cached maps
7547 assert(min <= service.map_cache.cached_key_lower_bound());
7548}
7549
7550void OSD::handle_osd_map(MOSDMap *m)
7551{
7552 assert(osd_lock.is_locked());
7553 // Keep a ref in the list until we get the newly received map written
7554 // onto disk. This is important because as long as the refs are alive,
7555 // the OSDMaps will be pinned in the cache and we won't try to read it
7556 // off of disk. Otherwise these maps will probably not stay in the cache,
7557 // and reading those OSDMaps before they are actually written can result
7558 // in a crash.
7559 list<OSDMapRef> pinned_maps;
7560 if (m->fsid != monc->get_fsid()) {
7561 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7562 << monc->get_fsid() << dendl;
7563 m->put();
7564 return;
7565 }
7566 if (is_initializing()) {
7567 dout(0) << "ignoring osdmap until we have initialized" << dendl;
7568 m->put();
7569 return;
7570 }
7571
7572 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7573 if (session && !(session->entity_name.is_mon() ||
7574 session->entity_name.is_osd())) {
7575 //not enough perms!
7576 dout(10) << "got osd map from Session " << session
7577 << " which we can't take maps from (not a mon or osd)" << dendl;
7578 m->put();
7579 session->put();
7580 return;
7581 }
7582 if (session)
7583 session->put();
7584
7585 // share with the objecter
7586 if (!is_preboot())
7587 service.objecter->handle_osd_map(m);
7588
7589 epoch_t first = m->get_first();
7590 epoch_t last = m->get_last();
7591 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7592 << superblock.newest_map
7593 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7594 << dendl;
7595
7596 logger->inc(l_osd_map);
7597 logger->inc(l_osd_mape, last - first + 1);
7598 if (first <= superblock.newest_map)
7599 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7600 if (service.max_oldest_map < m->oldest_map) {
7601 service.max_oldest_map = m->oldest_map;
7602 assert(service.max_oldest_map >= superblock.oldest_map);
7603 }
7604
7605 // make sure there is something new, here, before we bother flushing
7606 // the queues and such
7607 if (last <= superblock.newest_map) {
7608 dout(10) << " no new maps here, dropping" << dendl;
7609 m->put();
7610 return;
7611 }
7612
7613 // missing some?
7614 bool skip_maps = false;
7615 if (first > superblock.newest_map + 1) {
7616 dout(10) << "handle_osd_map message skips epochs "
7617 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7618 if (m->oldest_map <= superblock.newest_map + 1) {
7619 osdmap_subscribe(superblock.newest_map + 1, false);
7620 m->put();
7621 return;
7622 }
7623 // always try to get the full range of maps--as many as we can. this
7624 // 1- is good to have
7625 // 2- is at present the only way to ensure that we get a *full* map as
7626 // the first map!
7627 if (m->oldest_map < first) {
7628 osdmap_subscribe(m->oldest_map - 1, true);
7629 m->put();
7630 return;
7631 }
7632 skip_maps = true;
7633 }
7634
7635 ObjectStore::Transaction t;
7636 uint64_t txn_size = 0;
7637
7638 // store new maps: queue for disk and put in the osdmap cache
7639 epoch_t start = MAX(superblock.newest_map + 1, first);
7640 for (epoch_t e = start; e <= last; e++) {
7641 if (txn_size >= t.get_num_bytes()) {
7642 derr << __func__ << " transaction size overflowed" << dendl;
7643 assert(txn_size < t.get_num_bytes());
7644 }
7645 txn_size = t.get_num_bytes();
7646 map<epoch_t,bufferlist>::iterator p;
7647 p = m->maps.find(e);
7648 if (p != m->maps.end()) {
7649 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7650 OSDMap *o = new OSDMap;
7651 bufferlist& bl = p->second;
7652
7653 o->decode(bl);
7654
7655 ghobject_t fulloid = get_osdmap_pobject_name(e);
7656 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7657 pin_map_bl(e, bl);
7658 pinned_maps.push_back(add_map(o));
7659
7660 got_full_map(e);
7661 continue;
7662 }
7663
7664 p = m->incremental_maps.find(e);
7665 if (p != m->incremental_maps.end()) {
7666 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7667 bufferlist& bl = p->second;
7668 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7669 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7670 pin_map_inc_bl(e, bl);
7671
7672 OSDMap *o = new OSDMap;
7673 if (e > 1) {
7674 bufferlist obl;
7675 bool got = get_map_bl(e - 1, obl);
7676 assert(got);
7677 o->decode(obl);
7678 }
7679
7680 OSDMap::Incremental inc;
7681 bufferlist::iterator p = bl.begin();
7682 inc.decode(p);
7683 if (o->apply_incremental(inc) < 0) {
7684 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
7685 assert(0 == "bad fsid");
7686 }
7687
7688 bufferlist fbl;
7689 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
7690
7691 bool injected_failure = false;
7692 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7693 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7694 derr << __func__ << " injecting map crc failure" << dendl;
7695 injected_failure = true;
7696 }
7697
7698 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7699 dout(2) << "got incremental " << e
7700 << " but failed to encode full with correct crc; requesting"
7701 << dendl;
7702 clog->warn() << "failed to encode map e" << e << " with expected crc";
7703 dout(20) << "my encoded map was:\n";
7704 fbl.hexdump(*_dout);
7705 *_dout << dendl;
7706 delete o;
7707 request_full_map(e, last);
7708 last = e - 1;
7709 break;
7710 }
7711 got_full_map(e);
7712
7713 ghobject_t fulloid = get_osdmap_pobject_name(e);
7714 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7715 pin_map_bl(e, fbl);
7716 pinned_maps.push_back(add_map(o));
7717 continue;
7718 }
7719
7720 assert(0 == "MOSDMap lied about what maps it had?");
7721 }
7722
7723 // even if this map isn't from a mon, we may have satisfied our subscription
7724 monc->sub_got("osdmap", last);
7725
7726 if (!m->maps.empty() && requested_full_first) {
7727 dout(10) << __func__ << " still missing full maps " << requested_full_first
7728 << ".." << requested_full_last << dendl;
7729 rerequest_full_maps();
7730 }
7731
7c673cae
FG
7732 if (superblock.oldest_map) {
7733 // make sure we at least keep pace with incoming maps
7734 trim_maps(m->oldest_map, last - first + 1, skip_maps);
7735 }
7736
7737 if (!superblock.oldest_map || skip_maps)
7738 superblock.oldest_map = first;
7739 superblock.newest_map = last;
7740 superblock.current_epoch = last;
7741
7742 // note in the superblock that we were clean thru the prior epoch
7743 epoch_t boot_epoch = service.get_boot_epoch();
7744 if (boot_epoch && boot_epoch >= superblock.mounted) {
7745 superblock.mounted = boot_epoch;
7746 superblock.clean_thru = last;
7747 }
7748
7749 // superblock and commit
7750 write_superblock(t);
7751 store->queue_transaction(
7752 service.meta_osr.get(),
7753 std::move(t),
7754 new C_OnMapApply(&service, pinned_maps, last),
7755 new C_OnMapCommit(this, start, last, m), 0);
7756 service.publish_superblock(superblock);
7757}
7758
7759void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
7760{
7761 dout(10) << __func__ << " " << first << ".." << last << dendl;
7762 if (is_stopping()) {
7763 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7764 return;
7765 }
7766 Mutex::Locker l(osd_lock);
31f18b77
FG
7767 if (is_stopping()) {
7768 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7769 return;
7770 }
7c673cae
FG
7771 map_lock.get_write();
7772
7773 bool do_shutdown = false;
7774 bool do_restart = false;
7775 bool network_error = false;
7776
7777 // advance through the new maps
7778 for (epoch_t cur = first; cur <= last; cur++) {
7779 dout(10) << " advance to epoch " << cur
7780 << " (<= last " << last
7781 << " <= newest_map " << superblock.newest_map
7782 << ")" << dendl;
7783
7784 OSDMapRef newmap = get_map(cur);
7785 assert(newmap); // we just cached it above!
7786
7787 // start blacklisting messages sent to peers that go down.
7788 service.pre_publish_map(newmap);
7789
7790 // kill connections to newly down osds
7791 bool waited_for_reservations = false;
7792 set<int> old;
7793 osdmap->get_all_osds(old);
7794 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
7795 if (*p != whoami &&
7796 osdmap->is_up(*p) && // in old map
7797 newmap->is_down(*p)) { // but not the new one
7798 if (!waited_for_reservations) {
7799 service.await_reserved_maps();
7800 waited_for_reservations = true;
7801 }
7802 note_down_osd(*p);
7803 } else if (*p != whoami &&
7804 osdmap->is_down(*p) &&
7805 newmap->is_up(*p)) {
7806 note_up_osd(*p);
7807 }
7808 }
7809
31f18b77
FG
7810 if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
7811 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
7812 (osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
7c673cae
FG
7813 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
7814 << dendl;
7815 if (is_booting()) {
7816 // this captures the case where we sent the boot message while
7817 // NOUP was being set on the mon and our boot request was
7818 // dropped, and then later it is cleared. it imperfectly
7819 // handles the case where our original boot message was not
7820 // dropped and we restart even though we might have booted, but
7821 // that is harmless (boot will just take slightly longer).
7822 do_restart = true;
7823 }
7824 }
31f18b77
FG
7825 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
7826 newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
7827 dout(10) << __func__ << " require_osd_release reached luminous in "
7828 << newmap->get_epoch() << dendl;
7829 clear_pg_stat_queue();
224ce89b 7830 clear_outstanding_pg_stats();
31f18b77 7831 }
7c673cae
FG
7832
7833 osdmap = newmap;
7834 epoch_t up_epoch;
7835 epoch_t boot_epoch;
7836 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
7837 if (!up_epoch &&
7838 osdmap->is_up(whoami) &&
7839 osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
7840 up_epoch = osdmap->get_epoch();
7841 dout(10) << "up_epoch is " << up_epoch << dendl;
7842 if (!boot_epoch) {
7843 boot_epoch = osdmap->get_epoch();
7844 dout(10) << "boot_epoch is " << boot_epoch << dendl;
7845 }
7846 service.set_epochs(&boot_epoch, &up_epoch, NULL);
7847 }
7848 }
7849
7850 had_map_since = ceph_clock_now();
7851
7852 epoch_t _bind_epoch = service.get_bind_epoch();
7853 if (osdmap->is_up(whoami) &&
7854 osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
7855 _bind_epoch < osdmap->get_up_from(whoami)) {
7856
7857 if (is_booting()) {
7858 dout(1) << "state: booting -> active" << dendl;
7859 set_state(STATE_ACTIVE);
7860
7861 // set incarnation so that osd_reqid_t's we generate for our
7862 // objecter requests are unique across restarts.
7863 service.objecter->set_client_incarnation(osdmap->get_epoch());
7864 }
7865 }
7866
7867 if (osdmap->get_epoch() > 0 &&
7868 is_active()) {
7869 if (!osdmap->exists(whoami)) {
7870 dout(0) << "map says i do not exist. shutting down." << dendl;
7871 do_shutdown = true; // don't call shutdown() while we have
7872 // everything paused
7873 } else if (!osdmap->is_up(whoami) ||
7874 !osdmap->get_addr(whoami).probably_equals(
7875 client_messenger->get_myaddr()) ||
7876 !osdmap->get_cluster_addr(whoami).probably_equals(
7877 cluster_messenger->get_myaddr()) ||
7878 !osdmap->get_hb_back_addr(whoami).probably_equals(
7879 hb_back_server_messenger->get_myaddr()) ||
7880 (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7881 !osdmap->get_hb_front_addr(whoami).probably_equals(
7882 hb_front_server_messenger->get_myaddr()))) {
7883 if (!osdmap->is_up(whoami)) {
7884 if (service.is_preparing_to_stop() || service.is_stopping()) {
7885 service.got_stop_ack();
7886 } else {
c07f9fc5
FG
7887 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
7888 "but it is still running";
7889 clog->debug() << "map e" << osdmap->get_epoch()
7890 << " wrongly marked me down at e"
7891 << osdmap->get_down_at(whoami);
7c673cae
FG
7892 }
7893 } else if (!osdmap->get_addr(whoami).probably_equals(
7894 client_messenger->get_myaddr())) {
7895 clog->error() << "map e" << osdmap->get_epoch()
7896 << " had wrong client addr (" << osdmap->get_addr(whoami)
7897 << " != my " << client_messenger->get_myaddr() << ")";
7898 } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
7899 cluster_messenger->get_myaddr())) {
7900 clog->error() << "map e" << osdmap->get_epoch()
7901 << " had wrong cluster addr ("
7902 << osdmap->get_cluster_addr(whoami)
7903 << " != my " << cluster_messenger->get_myaddr() << ")";
7904 } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
7905 hb_back_server_messenger->get_myaddr())) {
7906 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 7907 << " had wrong heartbeat back addr ("
7c673cae
FG
7908 << osdmap->get_hb_back_addr(whoami)
7909 << " != my " << hb_back_server_messenger->get_myaddr()
7910 << ")";
7911 } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7912 !osdmap->get_hb_front_addr(whoami).probably_equals(
7913 hb_front_server_messenger->get_myaddr())) {
7914 clog->error() << "map e" << osdmap->get_epoch()
c07f9fc5 7915 << " had wrong heartbeat front addr ("
7c673cae
FG
7916 << osdmap->get_hb_front_addr(whoami)
7917 << " != my " << hb_front_server_messenger->get_myaddr()
7918 << ")";
7919 }
7920
7921 if (!service.is_stopping()) {
7922 epoch_t up_epoch = 0;
7923 epoch_t bind_epoch = osdmap->get_epoch();
7924 service.set_epochs(NULL,&up_epoch, &bind_epoch);
7925 do_restart = true;
7926
7927 //add markdown log
7928 utime_t now = ceph_clock_now();
7929 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
7930 osd_markdown_log.push_back(now);
7931 //clear all out-of-date log
7932 while (!osd_markdown_log.empty() &&
7933 osd_markdown_log.front() + grace < now)
7934 osd_markdown_log.pop_front();
7935 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
7936 dout(0) << __func__ << " marked down "
7937 << osd_markdown_log.size()
7938 << " > osd_max_markdown_count "
7939 << cct->_conf->osd_max_markdown_count
7940 << " in last " << grace << " seconds, shutting down"
7941 << dendl;
7942 do_restart = false;
7943 do_shutdown = true;
7944 }
7945
7946 start_waiting_for_healthy();
7947
7948 set<int> avoid_ports;
7949#if defined(__FreeBSD__)
7950 // prevent FreeBSD from grabbing the client_messenger port during
7951 // rebinding. In which case a cluster_meesneger will connect also
7952 // to the same port
7953 avoid_ports.insert(client_messenger->get_myaddr().get_port());
7954#endif
7955 avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
7956 avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
7957 avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
7958
7959 int r = cluster_messenger->rebind(avoid_ports);
7960 if (r != 0) {
7961 do_shutdown = true; // FIXME: do_restart?
7962 network_error = true;
7963 dout(0) << __func__ << " marked down:"
7964 << " rebind cluster_messenger failed" << dendl;
7965 }
7966
7967 r = hb_back_server_messenger->rebind(avoid_ports);
7968 if (r != 0) {
7969 do_shutdown = true; // FIXME: do_restart?
7970 network_error = true;
7971 dout(0) << __func__ << " marked down:"
7972 << " rebind hb_back_server_messenger failed" << dendl;
7973 }
7974
7975 r = hb_front_server_messenger->rebind(avoid_ports);
7976 if (r != 0) {
7977 do_shutdown = true; // FIXME: do_restart?
7978 network_error = true;
7979 dout(0) << __func__ << " marked down:"
7980 << " rebind hb_front_server_messenger failed" << dendl;
7981 }
7982
7983 hb_front_client_messenger->mark_down_all();
7984 hb_back_client_messenger->mark_down_all();
7985
7986 reset_heartbeat_peers();
7987 }
7988 }
7989 }
7990
7991 map_lock.put_write();
7992
7993 check_osdmap_features(store);
7994
7995 // yay!
7996 consume_map();
7997
7998 if (is_active() || is_waiting_for_healthy())
7999 maybe_update_heartbeat_peers();
8000
8001 if (!is_active()) {
8002 dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
8003 peering_wq.drain();
8004 } else {
8005 activate_map();
8006 }
8007
31f18b77 8008 if (do_shutdown) {
7c673cae
FG
8009 if (network_error) {
8010 Mutex::Locker l(heartbeat_lock);
8011 map<int,pair<utime_t,entity_inst_t>>::iterator it =
8012 failure_pending.begin();
8013 while (it != failure_pending.end()) {
8014 dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
8015 << it->first << dendl;
8016 send_still_alive(osdmap->get_epoch(), it->second.second);
8017 failure_pending.erase(it++);
8018 }
8019 }
8020 // trigger shutdown in a different thread
8021 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8022 queue_async_signal(SIGINT);
8023 }
31f18b77
FG
8024 else if (m->newest_map && m->newest_map > last) {
8025 dout(10) << " msg say newest map is " << m->newest_map
8026 << ", requesting more" << dendl;
8027 osdmap_subscribe(osdmap->get_epoch()+1, false);
8028 }
7c673cae
FG
8029 else if (is_preboot()) {
8030 if (m->get_source().is_mon())
8031 _preboot(m->oldest_map, m->newest_map);
8032 else
8033 start_boot();
8034 }
8035 else if (do_restart)
8036 start_boot();
8037
8038}
8039
8040void OSD::check_osdmap_features(ObjectStore *fs)
8041{
8042 // adjust required feature bits?
8043
8044 // we have to be a bit careful here, because we are accessing the
8045 // Policy structures without taking any lock. in particular, only
8046 // modify integer values that can safely be read by a racing CPU.
8047 // since we are only accessing existing Policy structures a their
8048 // current memory location, and setting or clearing bits in integer
8049 // fields, and we are the only writer, this is not a problem.
8050
8051 {
8052 Messenger::Policy p = client_messenger->get_default_policy();
8053 uint64_t mask;
8054 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8055 if ((p.features_required & mask) != features) {
8056 dout(0) << "crush map has features " << features
8057 << ", adjusting msgr requires for clients" << dendl;
8058 p.features_required = (p.features_required & ~mask) | features;
8059 client_messenger->set_default_policy(p);
8060 }
8061 }
8062 {
8063 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8064 uint64_t mask;
8065 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8066 if ((p.features_required & mask) != features) {
8067 dout(0) << "crush map has features " << features
8068 << " was " << p.features_required
8069 << ", adjusting msgr requires for mons" << dendl;
8070 p.features_required = (p.features_required & ~mask) | features;
8071 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8072 }
8073 }
8074 {
8075 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8076 uint64_t mask;
8077 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8078
8079 if ((p.features_required & mask) != features) {
8080 dout(0) << "crush map has features " << features
8081 << ", adjusting msgr requires for osds" << dendl;
8082 p.features_required = (p.features_required & ~mask) | features;
8083 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
8084 }
8085
8086 if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
8087 !superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8088 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8089 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8090 ObjectStore::Transaction t;
8091 write_superblock(t);
8092 int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
8093 assert(err == 0);
8094 }
8095 }
8096}
8097
8098bool OSD::advance_pg(
8099 epoch_t osd_epoch, PG *pg,
8100 ThreadPool::TPHandle &handle,
8101 PG::RecoveryCtx *rctx,
31f18b77 8102 set<PGRef> *new_pgs)
7c673cae
FG
8103{
8104 assert(pg->is_locked());
8105 epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
8106 OSDMapRef lastmap = pg->get_osdmap();
8107
8108 if (lastmap->get_epoch() == osd_epoch)
8109 return true;
8110 assert(lastmap->get_epoch() < osd_epoch);
8111
8112 epoch_t min_epoch = service.get_min_pg_epoch();
8113 epoch_t max;
8114 if (min_epoch) {
8115 max = min_epoch + cct->_conf->osd_map_max_advance;
8116 } else {
8117 max = next_epoch + cct->_conf->osd_map_max_advance;
8118 }
8119
8120 for (;
8121 next_epoch <= osd_epoch && next_epoch <= max;
8122 ++next_epoch) {
8123 OSDMapRef nextmap = service.try_get_map(next_epoch);
8124 if (!nextmap) {
8125 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8126 // make sure max is bumped up so that we can get past any
8127 // gap in maps
8128 max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
8129 continue;
8130 }
8131
8132 vector<int> newup, newacting;
8133 int up_primary, acting_primary;
8134 nextmap->pg_to_up_acting_osds(
8135 pg->info.pgid.pgid,
8136 &newup, &up_primary,
8137 &newacting, &acting_primary);
8138 pg->handle_advance_map(
8139 nextmap, lastmap, newup, up_primary,
8140 newacting, acting_primary, rctx);
8141
8142 // Check for split!
8143 set<spg_t> children;
8144 spg_t parent(pg->info.pgid);
8145 if (parent.is_split(
8146 lastmap->get_pg_num(pg->pool.id),
8147 nextmap->get_pg_num(pg->pool.id),
8148 &children)) {
8149 service.mark_split_in_progress(pg->info.pgid, children);
8150 split_pgs(
8151 pg, children, new_pgs, lastmap, nextmap,
8152 rctx);
8153 }
8154
8155 lastmap = nextmap;
8156 handle.reset_tp_timeout();
8157 }
8158 service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
8159 pg->handle_activate_map(rctx);
8160 if (next_epoch <= osd_epoch) {
8161 dout(10) << __func__ << " advanced to max " << max
8162 << " past min epoch " << min_epoch
8163 << " ... will requeue " << *pg << dendl;
8164 return false;
8165 }
8166 return true;
8167}
8168
8169void OSD::consume_map()
8170{
8171 assert(osd_lock.is_locked());
8172 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8173
8174 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8175 list<PGRef> to_remove;
8176
8177 // scan pg's
8178 {
8179 RWLock::RLocker l(pg_map_lock);
8180 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8181 it != pg_map.end();
8182 ++it) {
8183 PG *pg = it->second;
8184 pg->lock();
8185 if (pg->is_primary())
8186 num_pg_primary++;
8187 else if (pg->is_replica())
8188 num_pg_replica++;
8189 else
8190 num_pg_stray++;
8191
8192 if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
8193 //pool is deleted!
8194 to_remove.push_back(PGRef(pg));
8195 } else {
8196 service.init_splits_between(it->first, service.get_osdmap(), osdmap);
8197 }
8198
8199 pg->unlock();
8200 }
8201 }
8202
8203 for (list<PGRef>::iterator i = to_remove.begin();
8204 i != to_remove.end();
8205 to_remove.erase(i++)) {
8206 RWLock::WLocker locker(pg_map_lock);
8207 (*i)->lock();
8208 _remove_pg(&**i);
8209 (*i)->unlock();
8210 }
8211
8212 service.expand_pg_num(service.get_osdmap(), osdmap);
8213
8214 service.pre_publish_map(osdmap);
8215 service.await_reserved_maps();
8216 service.publish_map(osdmap);
8217
8218 service.maybe_inject_dispatch_delay();
8219
8220 dispatch_sessions_waiting_on_map();
8221
8222 service.maybe_inject_dispatch_delay();
8223
8224 // remove any PGs which we no longer host from the session waiting_for_pg lists
8225 dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8226 op_shardedwq.prune_pg_waiters(osdmap, whoami);
8227
8228 service.maybe_inject_dispatch_delay();
8229
8230 // scan pg's
8231 {
8232 RWLock::RLocker l(pg_map_lock);
8233 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8234 it != pg_map.end();
8235 ++it) {
8236 PG *pg = it->second;
8237 pg->lock();
8238 pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8239 pg->unlock();
8240 }
8241
8242 logger->set(l_osd_pg, pg_map.size());
8243 }
8244 logger->set(l_osd_pg_primary, num_pg_primary);
8245 logger->set(l_osd_pg_replica, num_pg_replica);
8246 logger->set(l_osd_pg_stray, num_pg_stray);
8247}
8248
8249void OSD::activate_map()
8250{
8251 assert(osd_lock.is_locked());
8252
8253 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8254
8255 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
8256 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8257 ceph_abort();
8258 }
8259
8260 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8261 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8262 osdmap_subscribe(osdmap->get_epoch() + 1, false);
8263 }
8264
8265 // norecover?
8266 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8267 if (!service.recovery_is_paused()) {
8268 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8269 service.pause_recovery();
8270 }
8271 } else {
8272 if (service.recovery_is_paused()) {
8273 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8274 service.unpause_recovery();
8275 }
8276 }
8277
8278 service.activate_map();
8279
8280 // process waiters
8281 take_waiters(waiting_for_osdmap);
8282}
8283
8284bool OSD::require_mon_peer(const Message *m)
8285{
8286 if (!m->get_connection()->peer_is_mon()) {
8287 dout(0) << "require_mon_peer received from non-mon "
8288 << m->get_connection()->get_peer_addr()
8289 << " " << *m << dendl;
8290 return false;
8291 }
8292 return true;
8293}
8294
8295bool OSD::require_mon_or_mgr_peer(const Message *m)
8296{
8297 if (!m->get_connection()->peer_is_mon() &&
8298 !m->get_connection()->peer_is_mgr()) {
8299 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8300 << m->get_connection()->get_peer_addr()
8301 << " " << *m << dendl;
8302 return false;
8303 }
8304 return true;
8305}
8306
8307bool OSD::require_osd_peer(const Message *m)
8308{
8309 if (!m->get_connection()->peer_is_osd()) {
8310 dout(0) << "require_osd_peer received from non-osd "
8311 << m->get_connection()->get_peer_addr()
8312 << " " << *m << dendl;
8313 return false;
8314 }
8315 return true;
8316}
8317
8318bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8319{
8320 epoch_t up_epoch = service.get_up_epoch();
8321 if (epoch < up_epoch) {
8322 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8323 return false;
8324 }
8325
8326 if (!is_active()) {
8327 dout(7) << "still in boot state, dropping message " << *m << dendl;
8328 return false;
8329 }
8330
8331 return true;
8332}
8333
8334bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8335 bool is_fast_dispatch)
8336{
8337 int from = m->get_source().num();
8338
8339 if (map->is_down(from) ||
8340 (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8341 dout(5) << "from dead osd." << from << ", marking down, "
8342 << " msg was " << m->get_source_inst().addr
8343 << " expected " << (map->is_up(from) ?
8344 map->get_cluster_addr(from) : entity_addr_t())
8345 << dendl;
8346 ConnectionRef con = m->get_connection();
8347 con->mark_down();
8348 Session *s = static_cast<Session*>(con->get_priv());
8349 if (s) {
8350 if (!is_fast_dispatch)
8351 s->session_dispatch_lock.Lock();
8352 clear_session_waiting_on_map(s);
8353 con->set_priv(NULL); // break ref <-> session cycle, if any
8354 if (!is_fast_dispatch)
8355 s->session_dispatch_lock.Unlock();
8356 s->put();
8357 }
8358 return false;
8359 }
8360 return true;
8361}
8362
8363
8364/*
8365 * require that we have same (or newer) map, and that
8366 * the source is the pg primary.
8367 */
8368bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8369 bool is_fast_dispatch)
8370{
8371 const Message *m = op->get_req();
8372 dout(15) << "require_same_or_newer_map " << epoch
8373 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8374
8375 assert(osd_lock.is_locked());
8376
8377 // do they have a newer map?
8378 if (epoch > osdmap->get_epoch()) {
8379 dout(7) << "waiting for newer map epoch " << epoch
8380 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8381 wait_for_new_map(op);
8382 return false;
8383 }
8384
8385 if (!require_self_aliveness(op->get_req(), epoch)) {
8386 return false;
8387 }
8388
8389 // ok, our map is same or newer.. do they still exist?
8390 if (m->get_connection()->get_messenger() == cluster_messenger &&
8391 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8392 return false;
8393 }
8394
8395 return true;
8396}
8397
8398
8399
8400
8401
8402// ----------------------------------------
8403// pg creation
8404
8405void OSD::split_pgs(
8406 PG *parent,
31f18b77 8407 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
7c673cae
FG
8408 OSDMapRef curmap,
8409 OSDMapRef nextmap,
8410 PG::RecoveryCtx *rctx)
8411{
8412 unsigned pg_num = nextmap->get_pg_num(
8413 parent->pool.id);
8414 parent->update_snap_mapper_bits(
8415 parent->info.pgid.get_split_bits(pg_num)
8416 );
8417
8418 vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
8419 parent->info.stats.stats.sum.split(updated_stats);
8420
8421 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8422 for (set<spg_t>::const_iterator i = childpgids.begin();
8423 i != childpgids.end();
8424 ++i, ++stat_iter) {
8425 assert(stat_iter != updated_stats.end());
8426 dout(10) << "Splitting " << *parent << " into " << *i << dendl;
8427 assert(service.splitting(*i));
8428 PG* child = _make_pg(nextmap, *i);
8429 child->lock(true);
8430 out_pgs->insert(child);
8431 rctx->created_pgs.insert(child);
8432
8433 unsigned split_bits = i->get_split_bits(pg_num);
8434 dout(10) << "pg_num is " << pg_num << dendl;
8435 dout(10) << "m_seed " << i->ps() << dendl;
8436 dout(10) << "split_bits is " << split_bits << dendl;
8437
8438 parent->split_colls(
8439 *i,
8440 split_bits,
8441 i->ps(),
8442 &child->pool.info,
8443 rctx->transaction);
8444 parent->split_into(
8445 i->pgid,
8446 child,
8447 split_bits);
8448 child->info.stats.stats.sum = *stat_iter;
8449
8450 child->write_if_dirty(*(rctx->transaction));
8451 child->unlock();
8452 }
8453 assert(stat_iter != updated_stats.end());
8454 parent->info.stats.stats.sum = *stat_iter;
8455 parent->write_if_dirty(*(rctx->transaction));
8456}
8457
8458/*
8459 * holding osd_lock
8460 */
8461void OSD::handle_pg_create(OpRequestRef op)
8462{
8463 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
8464 assert(m->get_type() == MSG_OSD_PG_CREATE);
8465
8466 dout(10) << "handle_pg_create " << *m << dendl;
8467
8468 if (!require_mon_peer(op->get_req())) {
8469 return;
8470 }
8471
8472 if (!require_same_or_newer_map(op, m->epoch, false))
8473 return;
8474
8475 op->mark_started();
8476
8477 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8478 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8479 p != m->mkpg.end();
8480 ++p, ++ci) {
8481 assert(ci != m->ctimes.end() && ci->first == p->first);
8482 epoch_t created = p->second.created;
8483 if (p->second.split_bits) // Skip split pgs
8484 continue;
8485 pg_t on = p->first;
8486
8487 if (on.preferred() >= 0) {
8488 dout(20) << "ignoring localized pg " << on << dendl;
8489 continue;
8490 }
8491
8492 if (!osdmap->have_pg_pool(on.pool())) {
8493 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8494 continue;
8495 }
8496
8497 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8498
8499 // is it still ours?
8500 vector<int> up, acting;
8501 int up_primary = -1;
8502 int acting_primary = -1;
8503 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
8504 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
8505
8506 if (acting_primary != whoami) {
8507 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8508 << "), my role=" << role << ", skipping" << dendl;
8509 continue;
8510 }
8511
8512 spg_t pgid;
8513 bool mapped = osdmap->get_primary_shard(on, &pgid);
8514 assert(mapped);
8515
8516 PastIntervals pi(
8517 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8518 *osdmap);
8519 pg_history_t history;
8520 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
8521
8522 // The mon won't resend unless the primary changed, so
8523 // we ignore same_interval_since. We'll pass this history
8524 // to handle_pg_peering_evt with the current epoch as the
8525 // event -- the project_pg_history check in
8526 // handle_pg_peering_evt will be a noop.
8527 if (history.same_primary_since > m->epoch) {
8528 dout(10) << __func__ << ": got obsolete pg create on pgid "
8529 << pgid << " from epoch " << m->epoch
8530 << ", primary changed in " << history.same_primary_since
8531 << dendl;
8532 continue;
8533 }
8534
8535 if (handle_pg_peering_evt(
8536 pgid,
8537 history,
8538 pi,
8539 osdmap->get_epoch(),
8540 PG::CephPeeringEvtRef(
8541 new PG::CephPeeringEvt(
8542 osdmap->get_epoch(),
8543 osdmap->get_epoch(),
8544 PG::NullEvt()))
8545 ) == -EEXIST) {
8546 service.send_pg_created(pgid.pgid);
8547 }
8548 }
8549 last_pg_create_epoch = m->epoch;
8550
8551 maybe_update_heartbeat_peers();
8552}
8553
8554
8555// ----------------------------------------
8556// peering and recovery
8557
8558PG::RecoveryCtx OSD::create_context()
8559{
8560 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8561 C_Contexts *on_applied = new C_Contexts(cct);
8562 C_Contexts *on_safe = new C_Contexts(cct);
8563 map<int, map<spg_t,pg_query_t> > *query_map =
8564 new map<int, map<spg_t, pg_query_t> >;
8565 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8566 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8567 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8568 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8569 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8570 on_applied, on_safe, t);
8571 return rctx;
8572}
8573
8574struct C_OpenPGs : public Context {
8575 set<PGRef> pgs;
8576 ObjectStore *store;
8577 OSD *osd;
8578 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8579 pgs.swap(p);
8580 }
8581 void finish(int r) override {
8582 RWLock::RLocker l(osd->pg_map_lock);
8583 for (auto p : pgs) {
8584 if (osd->pg_map.count(p->info.pgid)) {
8585 p->ch = store->open_collection(p->coll);
8586 assert(p->ch);
8587 }
8588 }
8589 }
8590};
8591
8592void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
8593 ThreadPool::TPHandle *handle)
8594{
8595 if (!ctx.transaction->empty()) {
8596 if (!ctx.created_pgs.empty()) {
8597 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8598 }
8599 int tr = store->queue_transaction(
8600 pg->osr.get(),
8601 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
8602 TrackedOpRef(), handle);
8603 delete (ctx.transaction);
8604 assert(tr == 0);
8605 ctx.transaction = new ObjectStore::Transaction;
8606 ctx.on_applied = new C_Contexts(cct);
8607 ctx.on_safe = new C_Contexts(cct);
8608 }
8609}
8610
8611void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
8612 ThreadPool::TPHandle *handle)
8613{
8614 if (service.get_osdmap()->is_up(whoami) &&
8615 is_active()) {
8616 do_notifies(*ctx.notify_list, curmap);
8617 do_queries(*ctx.query_map, curmap);
8618 do_infos(*ctx.info_map, curmap);
8619 }
8620 delete ctx.notify_list;
8621 delete ctx.query_map;
8622 delete ctx.info_map;
8623 if ((ctx.on_applied->empty() &&
8624 ctx.on_safe->empty() &&
8625 ctx.transaction->empty() &&
8626 ctx.created_pgs.empty()) || !pg) {
8627 delete ctx.transaction;
8628 delete ctx.on_applied;
8629 delete ctx.on_safe;
8630 assert(ctx.created_pgs.empty());
8631 } else {
8632 if (!ctx.created_pgs.empty()) {
8633 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8634 }
8635 int tr = store->queue_transaction(
8636 pg->osr.get(),
8637 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
8638 handle);
8639 delete (ctx.transaction);
8640 assert(tr == 0);
8641 }
8642}
8643
8644/** do_notifies
8645 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8646 * content for, and they are primary for.
8647 */
8648
8649void OSD::do_notifies(
8650 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8651 OSDMapRef curmap)
8652{
8653 for (map<int,
8654 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8655 notify_list.begin();
8656 it != notify_list.end();
8657 ++it) {
8658 if (!curmap->is_up(it->first)) {
8659 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8660 continue;
8661 }
8662 ConnectionRef con = service.get_con_osd_cluster(
8663 it->first, curmap->get_epoch());
8664 if (!con) {
8665 dout(20) << __func__ << " skipping osd." << it->first
8666 << " (NULL con)" << dendl;
8667 continue;
8668 }
8669 service.share_map_peer(it->first, con.get(), curmap);
8670 dout(7) << __func__ << " osd " << it->first
8671 << " on " << it->second.size() << " PGs" << dendl;
8672 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8673 it->second);
8674 con->send_message(m);
8675 }
8676}
8677
8678
8679/** do_queries
8680 * send out pending queries for info | summaries
8681 */
8682void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8683 OSDMapRef curmap)
8684{
8685 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8686 pit != query_map.end();
8687 ++pit) {
8688 if (!curmap->is_up(pit->first)) {
8689 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8690 continue;
8691 }
8692 int who = pit->first;
8693 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8694 if (!con) {
8695 dout(20) << __func__ << " skipping osd." << who
8696 << " (NULL con)" << dendl;
8697 continue;
8698 }
8699 service.share_map_peer(who, con.get(), curmap);
8700 dout(7) << __func__ << " querying osd." << who
8701 << " on " << pit->second.size() << " PGs" << dendl;
8702 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8703 con->send_message(m);
8704 }
8705}
8706
8707
8708void OSD::do_infos(map<int,
8709 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8710 OSDMapRef curmap)
8711{
8712 for (map<int,
8713 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
8714 info_map.begin();
8715 p != info_map.end();
8716 ++p) {
8717 if (!curmap->is_up(p->first)) {
8718 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
8719 continue;
8720 }
8721 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
8722 i != p->second.end();
8723 ++i) {
8724 dout(20) << __func__ << " sending info " << i->first.info
8725 << " to shard " << p->first << dendl;
8726 }
8727 ConnectionRef con = service.get_con_osd_cluster(
8728 p->first, curmap->get_epoch());
8729 if (!con) {
8730 dout(20) << __func__ << " skipping osd." << p->first
8731 << " (NULL con)" << dendl;
8732 continue;
8733 }
8734 service.share_map_peer(p->first, con.get(), curmap);
8735 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
8736 m->pg_list = p->second;
8737 con->send_message(m);
8738 }
8739 info_map.clear();
8740}
8741
8742
8743/** PGNotify
8744 * from non-primary to primary
8745 * includes pg_info_t.
8746 * NOTE: called with opqueue active.
8747 */
8748void OSD::handle_pg_notify(OpRequestRef op)
8749{
8750 const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
8751 assert(m->get_type() == MSG_OSD_PG_NOTIFY);
8752
8753 dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
8754 int from = m->get_source().num();
8755
8756 if (!require_osd_peer(op->get_req()))
8757 return;
8758
8759 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8760 return;
8761
8762 op->mark_started();
8763
8764 for (auto it = m->get_pg_list().begin();
8765 it != m->get_pg_list().end();
8766 ++it) {
8767 if (it->first.info.pgid.preferred() >= 0) {
8768 dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
8769 continue;
8770 }
8771
8772 handle_pg_peering_evt(
8773 spg_t(it->first.info.pgid.pgid, it->first.to),
8774 it->first.info.history, it->second,
8775 it->first.query_epoch,
8776 PG::CephPeeringEvtRef(
8777 new PG::CephPeeringEvt(
8778 it->first.epoch_sent, it->first.query_epoch,
8779 PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
8780 op->get_req()->get_connection()->get_features())))
8781 );
8782 }
8783}
8784
8785void OSD::handle_pg_log(OpRequestRef op)
8786{
8787 MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
8788 assert(m->get_type() == MSG_OSD_PG_LOG);
8789 dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
8790
8791 if (!require_osd_peer(op->get_req()))
8792 return;
8793
8794 int from = m->get_source().num();
8795 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8796 return;
8797
8798 if (m->info.pgid.preferred() >= 0) {
8799 dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
8800 return;
8801 }
8802
8803 op->mark_started();
8804 handle_pg_peering_evt(
8805 spg_t(m->info.pgid.pgid, m->to),
8806 m->info.history, m->past_intervals, m->get_epoch(),
8807 PG::CephPeeringEvtRef(
8808 new PG::CephPeeringEvt(
8809 m->get_epoch(), m->get_query_epoch(),
8810 PG::MLogRec(pg_shard_t(from, m->from), m)))
8811 );
8812}
8813
8814void OSD::handle_pg_info(OpRequestRef op)
8815{
8816 const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
8817 assert(m->get_type() == MSG_OSD_PG_INFO);
8818 dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
8819
8820 if (!require_osd_peer(op->get_req()))
8821 return;
8822
8823 int from = m->get_source().num();
8824 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8825 return;
8826
8827 op->mark_started();
8828
8829 for (auto p = m->pg_list.begin();
8830 p != m->pg_list.end();
8831 ++p) {
8832 if (p->first.info.pgid.preferred() >= 0) {
8833 dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
8834 continue;
8835 }
8836
8837 handle_pg_peering_evt(
8838 spg_t(p->first.info.pgid.pgid, p->first.to),
8839 p->first.info.history, p->second, p->first.epoch_sent,
8840 PG::CephPeeringEvtRef(
8841 new PG::CephPeeringEvt(
8842 p->first.epoch_sent, p->first.query_epoch,
8843 PG::MInfoRec(
8844 pg_shard_t(
8845 from, p->first.from), p->first.info, p->first.epoch_sent)))
8846 );
8847 }
8848}
8849
8850void OSD::handle_pg_trim(OpRequestRef op)
8851{
8852 const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
8853 assert(m->get_type() == MSG_OSD_PG_TRIM);
8854
8855 dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
8856
8857 if (!require_osd_peer(op->get_req()))
8858 return;
8859
8860 int from = m->get_source().num();
8861 if (!require_same_or_newer_map(op, m->epoch, false))
8862 return;
8863
8864 if (m->pgid.preferred() >= 0) {
8865 dout(10) << "ignoring localized pg " << m->pgid << dendl;
8866 return;
8867 }
8868
8869 op->mark_started();
8870
8871 PG *pg = _lookup_lock_pg(m->pgid);
8872 if(!pg) {
8873 dout(10) << " don't have pg " << m->pgid << dendl;
8874 return;
8875 }
8876
8877 if (m->epoch < pg->info.history.same_interval_since) {
8878 dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
8879 pg->unlock();
8880 return;
8881 }
8882
8883 if (pg->is_primary()) {
8884 // peer is informing us of their last_complete_ondisk
8885 dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
8886 pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
8887 m->trim_to;
8888 // trim log when the pg is recovered
8889 pg->calc_min_last_complete_ondisk();
8890 } else {
8891 // primary is instructing us to trim
8892 ObjectStore::Transaction t;
8893 pg->pg_log.trim(m->trim_to, pg->info);
8894 pg->dirty_info = true;
8895 pg->write_if_dirty(t);
8896 int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
8897 assert(tr == 0);
8898 }
8899 pg->unlock();
8900}
8901
8902void OSD::handle_pg_backfill_reserve(OpRequestRef op)
8903{
8904 const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
8905 assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);
8906
8907 if (!require_osd_peer(op->get_req()))
8908 return;
8909 if (!require_same_or_newer_map(op, m->query_epoch, false))
8910 return;
8911
8912 PG::CephPeeringEvtRef evt;
8913 if (m->type == MBackfillReserve::REQUEST) {
8914 evt = PG::CephPeeringEvtRef(
8915 new PG::CephPeeringEvt(
8916 m->query_epoch,
8917 m->query_epoch,
8918 PG::RequestBackfillPrio(m->priority)));
8919 } else if (m->type == MBackfillReserve::GRANT) {
8920 evt = PG::CephPeeringEvtRef(
8921 new PG::CephPeeringEvt(
8922 m->query_epoch,
8923 m->query_epoch,
8924 PG::RemoteBackfillReserved()));
8925 } else if (m->type == MBackfillReserve::REJECT) {
8926 evt = PG::CephPeeringEvtRef(
8927 new PG::CephPeeringEvt(
8928 m->query_epoch,
8929 m->query_epoch,
8930 PG::RemoteReservationRejected()));
8931 } else {
8932 ceph_abort();
8933 }
8934
8935 if (service.splitting(m->pgid)) {
8936 peering_wait_for_split[m->pgid].push_back(evt);
8937 return;
8938 }
8939
8940 PG *pg = _lookup_lock_pg(m->pgid);
8941 if (!pg) {
8942 dout(10) << " don't have pg " << m->pgid << dendl;
8943 return;
8944 }
8945
8946 pg->queue_peering_event(evt);
8947 pg->unlock();
8948}
8949
8950void OSD::handle_pg_recovery_reserve(OpRequestRef op)
8951{
8952 const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
8953 assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);
8954
8955 if (!require_osd_peer(op->get_req()))
8956 return;
8957 if (!require_same_or_newer_map(op, m->query_epoch, false))
8958 return;
8959
8960 PG::CephPeeringEvtRef evt;
8961 if (m->type == MRecoveryReserve::REQUEST) {
8962 evt = PG::CephPeeringEvtRef(
8963 new PG::CephPeeringEvt(
8964 m->query_epoch,
8965 m->query_epoch,
8966 PG::RequestRecovery()));
8967 } else if (m->type == MRecoveryReserve::GRANT) {
8968 evt = PG::CephPeeringEvtRef(
8969 new PG::CephPeeringEvt(
8970 m->query_epoch,
8971 m->query_epoch,
8972 PG::RemoteRecoveryReserved()));
8973 } else if (m->type == MRecoveryReserve::RELEASE) {
8974 evt = PG::CephPeeringEvtRef(
8975 new PG::CephPeeringEvt(
8976 m->query_epoch,
8977 m->query_epoch,
8978 PG::RecoveryDone()));
8979 } else {
8980 ceph_abort();
8981 }
8982
8983 if (service.splitting(m->pgid)) {
8984 peering_wait_for_split[m->pgid].push_back(evt);
8985 return;
8986 }
8987
8988 PG *pg = _lookup_lock_pg(m->pgid);
8989 if (!pg) {
8990 dout(10) << " don't have pg " << m->pgid << dendl;
8991 return;
8992 }
8993
8994 pg->queue_peering_event(evt);
8995 pg->unlock();
8996}
8997
c07f9fc5
FG
8998void OSD::handle_force_recovery(Message *m)
8999{
9000 MOSDForceRecovery *msg = static_cast<MOSDForceRecovery*>(m);
9001 assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY);
c07f9fc5 9002
d2e6a577 9003 vector<PGRef> local_pgs;
c07f9fc5
FG
9004 local_pgs.reserve(msg->forced_pgs.size());
9005
d2e6a577
FG
9006 {
9007 RWLock::RLocker l(pg_map_lock);
9008 for (auto& i : msg->forced_pgs) {
9009 spg_t locpg;
9010 if (osdmap->get_primary_shard(i, &locpg)) {
9011 auto pg_map_entry = pg_map.find(locpg);
9012 if (pg_map_entry != pg_map.end()) {
9013 local_pgs.push_back(pg_map_entry->second);
9014 }
c07f9fc5
FG
9015 }
9016 }
9017 }
9018
9019 if (local_pgs.size()) {
9020 service.adjust_pg_priorities(local_pgs, msg->options);
9021 }
9022
9023 msg->put();
9024}
7c673cae
FG
9025
9026/** PGQuery
9027 * from primary to replica | stray
9028 * NOTE: called with opqueue active.
9029 */
9030void OSD::handle_pg_query(OpRequestRef op)
9031{
9032 assert(osd_lock.is_locked());
9033
9034 const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
9035 assert(m->get_type() == MSG_OSD_PG_QUERY);
9036
9037 if (!require_osd_peer(op->get_req()))
9038 return;
9039
9040 dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
9041 int from = m->get_source().num();
9042
9043 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9044 return;
9045
9046 op->mark_started();
9047
9048 map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
9049
9050 for (auto it = m->pg_list.begin();
9051 it != m->pg_list.end();
9052 ++it) {
9053 spg_t pgid = it->first;
9054
9055 if (pgid.preferred() >= 0) {
9056 dout(10) << "ignoring localized pg " << pgid << dendl;
9057 continue;
9058 }
9059
9060 if (service.splitting(pgid)) {
9061 peering_wait_for_split[pgid].push_back(
9062 PG::CephPeeringEvtRef(
9063 new PG::CephPeeringEvt(
9064 it->second.epoch_sent, it->second.epoch_sent,
9065 PG::MQuery(pg_shard_t(from, it->second.from),
9066 it->second, it->second.epoch_sent))));
9067 continue;
9068 }
9069
9070 {
9071 RWLock::RLocker l(pg_map_lock);
9072 if (pg_map.count(pgid)) {
9073 PG *pg = 0;
9074 pg = _lookup_lock_pg_with_map_lock_held(pgid);
9075 pg->queue_query(
9076 it->second.epoch_sent, it->second.epoch_sent,
9077 pg_shard_t(from, it->second.from), it->second);
9078 pg->unlock();
9079 continue;
9080 }
9081 }
9082
9083 if (!osdmap->have_pg_pool(pgid.pool()))
9084 continue;
9085
9086 // get active crush mapping
9087 int up_primary, acting_primary;
9088 vector<int> up, acting;
9089 osdmap->pg_to_up_acting_osds(
9090 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9091
9092 // same primary?
9093 pg_history_t history = it->second.history;
9094 bool valid_history = project_pg_history(
9095 pgid, history, it->second.epoch_sent,
9096 up, up_primary, acting, acting_primary);
9097
9098 if (!valid_history ||
9099 it->second.epoch_sent < history.same_interval_since) {
9100 dout(10) << " pg " << pgid << " dne, and pg has changed in "
9101 << history.same_interval_since
9102 << " (msg from " << it->second.epoch_sent << ")" << dendl;
9103 continue;
9104 }
9105
9106 dout(10) << " pg " << pgid << " dne" << dendl;
9107 pg_info_t empty(spg_t(pgid.pgid, it->second.to));
9108 /* This is racy, but that should be ok: if we complete the deletion
9109 * before the pg is recreated, we'll just start it off backfilling
9110 * instead of just empty */
9111 if (service.deleting_pgs.lookup(pgid))
9112 empty.set_last_backfill(hobject_t());
9113 if (it->second.type == pg_query_t::LOG ||
9114 it->second.type == pg_query_t::FULLLOG) {
9115 ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
9116 if (con) {
9117 MOSDPGLog *mlog = new MOSDPGLog(
9118 it->second.from, it->second.to,
9119 osdmap->get_epoch(), empty,
9120 it->second.epoch_sent);
9121 service.share_map_peer(from, con.get(), osdmap);
9122 con->send_message(mlog);
9123 }
9124 } else {
9125 notify_list[from].push_back(
9126 make_pair(
9127 pg_notify_t(
9128 it->second.from, it->second.to,
9129 it->second.epoch_sent,
9130 osdmap->get_epoch(),
9131 empty),
9132 PastIntervals(
9133 osdmap->get_pools().at(pgid.pool()).ec_pool(),
9134 *osdmap)));
9135 }
9136 }
9137 do_notifies(notify_list, osdmap);
9138}
9139
9140
9141void OSD::handle_pg_remove(OpRequestRef op)
9142{
9143 const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
9144 assert(m->get_type() == MSG_OSD_PG_REMOVE);
9145 assert(osd_lock.is_locked());
9146
9147 if (!require_osd_peer(op->get_req()))
9148 return;
9149
9150 dout(7) << "handle_pg_remove from " << m->get_source() << " on "
9151 << m->pg_list.size() << " pgs" << dendl;
9152
9153 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9154 return;
9155
9156 op->mark_started();
9157
9158 for (auto it = m->pg_list.begin();
9159 it != m->pg_list.end();
9160 ++it) {
9161 spg_t pgid = *it;
9162 if (pgid.preferred() >= 0) {
9163 dout(10) << "ignoring localized pg " << pgid << dendl;
9164 continue;
9165 }
9166
9167 RWLock::WLocker l(pg_map_lock);
9168 if (pg_map.count(pgid) == 0) {
9169 dout(10) << " don't have pg " << pgid << dendl;
9170 continue;
9171 }
9172 dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
9173 PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
9174 pg_history_t history = pg->info.history;
9175 int up_primary, acting_primary;
9176 vector<int> up, acting;
9177 osdmap->pg_to_up_acting_osds(
9178 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9179 bool valid_history = project_pg_history(
9180 pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
9181 up, up_primary, acting, acting_primary);
9182 if (valid_history &&
9183 history.same_interval_since <= m->get_epoch()) {
9184 assert(pg->get_primary().osd == m->get_source().num());
9185 PGRef _pg(pg);
9186 _remove_pg(pg);
9187 pg->unlock();
9188 } else {
9189 dout(10) << *pg << " ignoring remove request, pg changed in epoch "
9190 << history.same_interval_since
9191 << " > " << m->get_epoch() << dendl;
9192 pg->unlock();
9193 }
9194 }
9195}
9196
9197void OSD::_remove_pg(PG *pg)
9198{
9199 ObjectStore::Transaction rmt ;
9200
9201 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
9202 // the pg_map must be done together without unlocking the pg lock,
9203 // to avoid racing with watcher cleanup in ms_handle_reset
9204 // and handle_notify_timeout
9205 pg->on_removal(&rmt);
9206
9207 service.cancel_pending_splits_for_parent(pg->info.pgid);
9208 int tr = store->queue_transaction(
9209 pg->osr.get(), std::move(rmt), NULL,
9210 new ContainerContext<
9211 SequencerRef>(pg->osr));
9212 assert(tr == 0);
9213
9214 DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
9215 pg->info.pgid,
9216 make_pair(
9217 pg->info.pgid,
9218 PGRef(pg))
9219 );
9220 remove_wq.queue(make_pair(PGRef(pg), deleting));
9221
9222 service.pg_remove_epoch(pg->info.pgid);
9223
9224 // dereference from op_wq
9225 op_shardedwq.clear_pg_pointer(pg->info.pgid);
9226
9227 // remove from map
9228 pg_map.erase(pg->info.pgid);
9229 pg->put("PGMap"); // since we've taken it out of map
9230}
9231
9232
9233// =========================================================
9234// RECOVERY
9235
9236void OSDService::_maybe_queue_recovery() {
9237 assert(recovery_lock.is_locked_by_me());
9238 uint64_t available_pushes;
9239 while (!awaiting_throttle.empty() &&
9240 _recover_now(&available_pushes)) {
9241 uint64_t to_start = MIN(
9242 available_pushes,
9243 cct->_conf->osd_recovery_max_single_start);
9244 _queue_for_recovery(awaiting_throttle.front(), to_start);
9245 awaiting_throttle.pop_front();
9246 recovery_ops_reserved += to_start;
9247 }
9248}
9249
9250bool OSDService::_recover_now(uint64_t *available_pushes)
9251{
9252 if (available_pushes)
9253 *available_pushes = 0;
9254
9255 if (ceph_clock_now() < defer_recovery_until) {
9256 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9257 return false;
9258 }
9259
9260 if (recovery_paused) {
9261 dout(15) << __func__ << " paused" << dendl;
9262 return false;
9263 }
9264
9265 uint64_t max = cct->_conf->osd_recovery_max_active;
9266 if (max <= recovery_ops_active + recovery_ops_reserved) {
9267 dout(15) << __func__ << " active " << recovery_ops_active
9268 << " + reserved " << recovery_ops_reserved
9269 << " >= max " << max << dendl;
9270 return false;
9271 }
9272
9273 if (available_pushes)
9274 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
9275
9276 return true;
9277}
9278
c07f9fc5 9279
d2e6a577 9280void OSDService::adjust_pg_priorities(const vector<PGRef>& pgs, int newflags)
c07f9fc5
FG
9281{
9282 if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY)))
9283 return;
9284 int newstate = 0;
9285
c07f9fc5
FG
9286 if (newflags & OFR_BACKFILL) {
9287 newstate = PG_STATE_FORCED_BACKFILL;
9288 } else if (newflags & OFR_RECOVERY) {
9289 newstate = PG_STATE_FORCED_RECOVERY;
9290 }
9291
9292 // debug output here may get large, don't generate it if debug level is below
9293 // 10 and use abbreviated pg ids otherwise
9294 if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) {
9295 stringstream ss;
9296
9297 for (auto& i : pgs) {
9298 ss << i->get_pgid() << " ";
9299 }
9300
9301 dout(10) << __func__ << " working on " << ss.str() << dendl;
9302 }
9303
9304 if (newflags & OFR_CANCEL) {
9305 for (auto& i : pgs) {
d2e6a577
FG
9306 i->lock();
9307 i->_change_recovery_force_mode(newstate, true);
9308 i->unlock();
c07f9fc5
FG
9309 }
9310 } else {
9311 for (auto& i : pgs) {
9312 // make sure the PG is in correct state before forcing backfill or recovery, or
9313 // else we'll make PG keeping FORCE_* flag forever, requiring osds restart
9314 // or forcing somehow recovery/backfill.
d2e6a577 9315 i->lock();
c07f9fc5
FG
9316 int pgstate = i->get_state();
9317 if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
9318 ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL))) )
d2e6a577
FG
9319 i->_change_recovery_force_mode(newstate, false);
9320 i->unlock();
c07f9fc5
FG
9321 }
9322 }
9323}
9324
7c673cae
FG
9325void OSD::do_recovery(
9326 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9327 ThreadPool::TPHandle &handle)
9328{
9329 uint64_t started = 0;
31f18b77
FG
9330
9331 /*
9332 * When the value of osd_recovery_sleep is set greater than zero, recovery
9333 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9334 * recovery event's schedule time. This is done by adding a
9335 * recovery_requeue_callback event, which re-queues the recovery op using
9336 * queue_recovery_after_sleep.
9337 */
c07f9fc5
FG
9338 float recovery_sleep = get_osd_recovery_sleep();
9339 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
31f18b77
FG
9340 PGRef pgref(pg);
9341 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9342 dout(20) << "do_recovery wake up at "
9343 << ceph_clock_now()
9344 << ", re-queuing recovery" << dendl;
9345 service.recovery_needs_sleep = false;
9346 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
9347 });
9348 Mutex::Locker l(service.recovery_sleep_lock);
9349
9350 // This is true for the first recovery op and when the previous recovery op
9351 // has been scheduled in the past. The next recovery op is scheduled after
9352 // completing the sleep from now.
9353 if (service.recovery_schedule_time < ceph_clock_now()) {
9354 service.recovery_schedule_time = ceph_clock_now();
9355 }
c07f9fc5 9356 service.recovery_schedule_time += recovery_sleep;
31f18b77
FG
9357 service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
9358 recovery_requeue_callback);
9359 dout(20) << "Recovery event scheduled at "
9360 << service.recovery_schedule_time << dendl;
9361 return;
7c673cae
FG
9362 }
9363
9364 {
31f18b77 9365 service.recovery_needs_sleep = true;
7c673cae
FG
9366 if (pg->pg_has_reset_since(queued)) {
9367 goto out;
9368 }
9369
9370 assert(!pg->deleting);
9371 assert(pg->is_peered() && pg->is_primary());
9372
9373 assert(pg->recovery_queued);
9374 pg->recovery_queued = false;
9375
9376 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9377#ifdef DEBUG_RECOVERY_OIDS
9378 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
9379#endif
9380
9381 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
9382 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9383 << " on " << *pg << dendl;
9384
9385 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
9386 if (!started && (more || !pg->have_unfound())) {
9387 goto out;
9388 }
9389
9390 PG::RecoveryCtx rctx = create_context();
9391 rctx.handle = &handle;
9392
9393 /*
9394 * if we couldn't start any recovery ops and things are still
9395 * unfound, see if we can discover more missing object locations.
9396 * It may be that our initial locations were bad and we errored
9397 * out while trying to pull.
9398 */
9399 if (!more && pg->have_unfound()) {
9400 pg->discover_all_missing(*rctx.query_map);
9401 if (rctx.query_map->empty()) {
224ce89b
WB
9402 string action;
9403 if (pg->state_test(PG_STATE_BACKFILL)) {
9404 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9405 queued,
9406 queued,
9407 PG::CancelBackfill()));
9408 pg->queue_peering_event(evt);
9409 action = "in backfill";
9410 } else if (pg->state_test(PG_STATE_RECOVERING)) {
9411 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9412 queued,
9413 queued,
9414 PG::CancelRecovery()));
9415 pg->queue_peering_event(evt);
9416 action = "in recovery";
9417 } else {
9418 action = "already out of recovery/backfill";
9419 }
9420 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
7c673cae 9421 } else {
224ce89b 9422 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
7c673cae
FG
9423 pg->queue_recovery();
9424 }
9425 }
9426
9427 pg->write_if_dirty(*rctx.transaction);
9428 OSDMapRef curmap = pg->get_osdmap();
9429 dispatch_context(rctx, pg, curmap);
9430 }
9431
9432 out:
9433 assert(started <= reserved_pushes);
9434 service.release_reserved_pushes(reserved_pushes);
9435}
9436
9437void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
9438{
9439 Mutex::Locker l(recovery_lock);
9440 dout(10) << "start_recovery_op " << *pg << " " << soid
9441 << " (" << recovery_ops_active << "/"
9442 << cct->_conf->osd_recovery_max_active << " rops)"
9443 << dendl;
9444 recovery_ops_active++;
9445
9446#ifdef DEBUG_RECOVERY_OIDS
9447 dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
9448 assert(recovery_oids[pg->info.pgid].count(soid) == 0);
9449 recovery_oids[pg->info.pgid].insert(soid);
9450#endif
9451}
9452
9453void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9454{
9455 Mutex::Locker l(recovery_lock);
9456 dout(10) << "finish_recovery_op " << *pg << " " << soid
9457 << " dequeue=" << dequeue
9458 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
9459 << dendl;
9460
9461 // adjust count
9462 assert(recovery_ops_active > 0);
9463 recovery_ops_active--;
9464
9465#ifdef DEBUG_RECOVERY_OIDS
9466 dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
9467 assert(recovery_oids[pg->info.pgid].count(soid));
9468 recovery_oids[pg->info.pgid].erase(soid);
9469#endif
9470
9471 _maybe_queue_recovery();
9472}
9473
9474bool OSDService::is_recovery_active()
9475{
b5b8bbf5 9476 return local_reserver.has_reservation() || remote_reserver.has_reservation();
7c673cae
FG
9477}
9478
9479// =========================================================
9480// OPS
9481
9482bool OSD::op_is_discardable(const MOSDOp *op)
9483{
9484 // drop client request if they are not connected and can't get the
9485 // reply anyway.
9486 if (!op->get_connection()->is_connected()) {
9487 return true;
9488 }
9489 return false;
9490}
9491
9492void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
9493{
9494 utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
9495 dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
9496 << " cost " << op->get_req()->get_cost()
9497 << " latency " << latency
9498 << " epoch " << epoch
9499 << " " << *(op->get_req()) << dendl;
9500 op->osd_trace.event("enqueue op");
9501 op->osd_trace.keyval("priority", op->get_req()->get_priority());
9502 op->osd_trace.keyval("cost", op->get_req()->get_cost());
9503 op->mark_queued_for_pg();
224ce89b 9504 logger->tinc(l_osd_op_before_queue_op_lat, latency);
7c673cae
FG
9505 op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
9506}
9507
9508
9509
9510/*
9511 * NOTE: dequeue called in worker thread, with pg lock
9512 */
9513void OSD::dequeue_op(
9514 PGRef pg, OpRequestRef op,
9515 ThreadPool::TPHandle &handle)
9516{
9517 FUNCTRACE();
9518 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9519
9520 utime_t now = ceph_clock_now();
9521 op->set_dequeued_time(now);
9522 utime_t latency = now - op->get_req()->get_recv_stamp();
9523 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9524 << " cost " << op->get_req()->get_cost()
9525 << " latency " << latency
9526 << " " << *(op->get_req())
9527 << " pg " << *pg << dendl;
9528
224ce89b
WB
9529 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
9530
7c673cae
FG
9531 Session *session = static_cast<Session *>(
9532 op->get_req()->get_connection()->get_priv());
9533 if (session) {
9534 maybe_share_map(session, op, pg->get_osdmap());
9535 session->put();
9536 }
9537
9538 if (pg->deleting)
9539 return;
9540
9541 op->mark_reached_pg();
9542 op->osd_trace.event("dequeue_op");
9543
9544 pg->do_request(op, handle);
9545
9546 // finish
9547 dout(10) << "dequeue_op " << op << " finish" << dendl;
9548 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
9549}
9550
9551
9552struct C_CompleteSplits : public Context {
9553 OSD *osd;
31f18b77
FG
9554 set<PGRef> pgs;
9555 C_CompleteSplits(OSD *osd, const set<PGRef> &in)
7c673cae
FG
9556 : osd(osd), pgs(in) {}
9557 void finish(int r) override {
9558 Mutex::Locker l(osd->osd_lock);
9559 if (osd->is_stopping())
9560 return;
9561 PG::RecoveryCtx rctx = osd->create_context();
31f18b77 9562 for (set<PGRef>::iterator i = pgs.begin();
7c673cae
FG
9563 i != pgs.end();
9564 ++i) {
9565 osd->pg_map_lock.get_write();
9566 (*i)->lock();
31f18b77
FG
9567 PG *pg = i->get();
9568 osd->add_newly_split_pg(pg, &rctx);
7c673cae
FG
9569 if (!((*i)->deleting)) {
9570 set<spg_t> to_complete;
9571 to_complete.insert((*i)->info.pgid);
9572 osd->service.complete_split(to_complete);
9573 }
9574 osd->pg_map_lock.put_write();
31f18b77 9575 osd->dispatch_context_transaction(rctx, pg);
7c673cae
FG
9576 osd->wake_pg_waiters(*i);
9577 (*i)->unlock();
9578 }
9579
9580 osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
9581 }
9582};
9583
9584void OSD::process_peering_events(
9585 const list<PG*> &pgs,
9586 ThreadPool::TPHandle &handle
9587 )
9588{
9589 bool need_up_thru = false;
9590 epoch_t same_interval_since = 0;
9591 OSDMapRef curmap;
9592 PG::RecoveryCtx rctx = create_context();
9593 rctx.handle = &handle;
9594 for (list<PG*>::const_iterator i = pgs.begin();
9595 i != pgs.end();
9596 ++i) {
31f18b77 9597 set<PGRef> split_pgs;
7c673cae
FG
9598 PG *pg = *i;
9599 pg->lock_suspend_timeout(handle);
9600 curmap = service.get_osdmap();
9601 if (pg->deleting) {
9602 pg->unlock();
9603 continue;
9604 }
9605 if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
9606 // we need to requeue the PG explicitly since we didn't actually
9607 // handle an event
9608 peering_wq.queue(pg);
9609 } else {
9610 assert(!pg->peering_queue.empty());
9611 PG::CephPeeringEvtRef evt = pg->peering_queue.front();
9612 pg->peering_queue.pop_front();
9613 pg->handle_peering_event(evt, &rctx);
9614 }
9615 need_up_thru = pg->need_up_thru || need_up_thru;
9616 same_interval_since = MAX(pg->info.history.same_interval_since,
9617 same_interval_since);
9618 pg->write_if_dirty(*rctx.transaction);
9619 if (!split_pgs.empty()) {
9620 rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
9621 split_pgs.clear();
9622 }
9623 dispatch_context_transaction(rctx, pg, &handle);
9624 pg->unlock();
9625 }
9626 if (need_up_thru)
9627 queue_want_up_thru(same_interval_since);
9628 dispatch_context(rctx, 0, curmap, &handle);
9629
9630 service.send_pg_temp();
9631}
9632
9633// --------------------------------
9634
9635const char** OSD::get_tracked_conf_keys() const
9636{
9637 static const char* KEYS[] = {
9638 "osd_max_backfills",
9639 "osd_min_recovery_priority",
224ce89b
WB
9640 "osd_max_trimming_pgs",
9641 "osd_op_complaint_time",
9642 "osd_op_log_threshold",
9643 "osd_op_history_size",
9644 "osd_op_history_duration",
9645 "osd_op_history_slow_op_size",
9646 "osd_op_history_slow_op_threshold",
7c673cae
FG
9647 "osd_enable_op_tracker",
9648 "osd_map_cache_size",
9649 "osd_map_max_advance",
9650 "osd_pg_epoch_persisted_max_stale",
9651 "osd_disk_thread_ioprio_class",
9652 "osd_disk_thread_ioprio_priority",
9653 // clog & admin clog
9654 "clog_to_monitors",
9655 "clog_to_syslog",
9656 "clog_to_syslog_facility",
9657 "clog_to_syslog_level",
9658 "osd_objectstore_fuse",
9659 "clog_to_graylog",
9660 "clog_to_graylog_host",
9661 "clog_to_graylog_port",
9662 "host",
9663 "fsid",
9664 "osd_recovery_delay_start",
9665 "osd_client_message_size_cap",
9666 "osd_client_message_cap",
31f18b77
FG
9667 "osd_heartbeat_min_size",
9668 "osd_heartbeat_interval",
7c673cae
FG
9669 NULL
9670 };
9671 return KEYS;
9672}
9673
9674void OSD::handle_conf_change(const struct md_config_t *conf,
9675 const std::set <std::string> &changed)
9676{
9677 if (changed.count("osd_max_backfills")) {
9678 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9679 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9680 }
9681 if (changed.count("osd_min_recovery_priority")) {
9682 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9683 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9684 }
9685 if (changed.count("osd_max_trimming_pgs")) {
9686 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
9687 }
9688 if (changed.count("osd_op_complaint_time") ||
9689 changed.count("osd_op_log_threshold")) {
9690 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9691 cct->_conf->osd_op_log_threshold);
9692 }
9693 if (changed.count("osd_op_history_size") ||
9694 changed.count("osd_op_history_duration")) {
9695 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9696 cct->_conf->osd_op_history_duration);
9697 }
9698 if (changed.count("osd_op_history_slow_op_size") ||
9699 changed.count("osd_op_history_slow_op_threshold")) {
9700 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9701 cct->_conf->osd_op_history_slow_op_threshold);
9702 }
9703 if (changed.count("osd_enable_op_tracker")) {
9704 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9705 }
9706 if (changed.count("osd_disk_thread_ioprio_class") ||
9707 changed.count("osd_disk_thread_ioprio_priority")) {
9708 set_disk_tp_priority();
9709 }
9710 if (changed.count("osd_map_cache_size")) {
9711 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9712 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9713 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
9714 }
9715 if (changed.count("clog_to_monitors") ||
9716 changed.count("clog_to_syslog") ||
9717 changed.count("clog_to_syslog_level") ||
9718 changed.count("clog_to_syslog_facility") ||
9719 changed.count("clog_to_graylog") ||
9720 changed.count("clog_to_graylog_host") ||
9721 changed.count("clog_to_graylog_port") ||
9722 changed.count("host") ||
9723 changed.count("fsid")) {
9724 update_log_config();
9725 }
9726
9727#ifdef HAVE_LIBFUSE
9728 if (changed.count("osd_objectstore_fuse")) {
9729 if (store) {
9730 enable_disable_fuse(false);
9731 }
9732 }
9733#endif
9734
9735 if (changed.count("osd_recovery_delay_start")) {
9736 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9737 service.kick_recovery_queue();
9738 }
9739
9740 if (changed.count("osd_client_message_cap")) {
9741 uint64_t newval = cct->_conf->osd_client_message_cap;
9742 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9743 if (pol.throttler_messages && newval > 0) {
9744 pol.throttler_messages->reset_max(newval);
9745 }
9746 }
9747 if (changed.count("osd_client_message_size_cap")) {
9748 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9749 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9750 if (pol.throttler_bytes && newval > 0) {
9751 pol.throttler_bytes->reset_max(newval);
9752 }
9753 }
9754
9755 check_config();
9756}
9757
9758void OSD::update_log_config()
9759{
9760 map<string,string> log_to_monitors;
9761 map<string,string> log_to_syslog;
9762 map<string,string> log_channel;
9763 map<string,string> log_prio;
9764 map<string,string> log_to_graylog;
9765 map<string,string> log_to_graylog_host;
9766 map<string,string> log_to_graylog_port;
9767 uuid_d fsid;
9768 string host;
9769
9770 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9771 log_channel, log_prio, log_to_graylog,
9772 log_to_graylog_host, log_to_graylog_port,
9773 fsid, host) == 0)
9774 clog->update_config(log_to_monitors, log_to_syslog,
9775 log_channel, log_prio, log_to_graylog,
9776 log_to_graylog_host, log_to_graylog_port,
9777 fsid, host);
9778 derr << "log_to_monitors " << log_to_monitors << dendl;
9779}
9780
9781void OSD::check_config()
9782{
9783 // some sanity checks
9784 if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
9785 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9786 << " is not > osd_map_max_advance ("
9787 << cct->_conf->osd_map_max_advance << ")";
9788 }
9789 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9790 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9791 << " is not > osd_pg_epoch_persisted_max_stale ("
9792 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
9793 }
9794}
9795
9796void OSD::set_disk_tp_priority()
9797{
9798 dout(10) << __func__
9799 << " class " << cct->_conf->osd_disk_thread_ioprio_class
9800 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
9801 << dendl;
9802 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
9803 cct->_conf->osd_disk_thread_ioprio_priority < 0)
9804 return;
9805 int cls =
9806 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
9807 if (cls < 0)
9808 derr << __func__ << cpp_strerror(cls) << ": "
9809 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
9810 << " but only the following values are allowed: idle, be or rt" << dendl;
9811 else
9812 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
9813}
9814
9815// --------------------------------
9816
9817void OSD::get_latest_osdmap()
9818{
9819 dout(10) << __func__ << " -- start" << dendl;
9820
9821 C_SaferCond cond;
9822 service.objecter->wait_for_latest_osdmap(&cond);
9823 cond.wait();
9824
9825 dout(10) << __func__ << " -- finish" << dendl;
9826}
9827
9828// --------------------------------
9829
9830int OSD::init_op_flags(OpRequestRef& op)
9831{
9832 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
9833 vector<OSDOp>::const_iterator iter;
9834
9835 // client flags have no bearing on whether an op is a read, write, etc.
9836 op->rmw_flags = 0;
9837
9838 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
9839 op->set_force_rwordered();
9840 }
9841
9842 // set bits based on op codes, called methods.
9843 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
9844 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
9845 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
9846 /* This a bit odd. PING isn't actually a write. It can't
9847 * result in an update to the object_info. PINGs also aren'ty
9848 * resent, so there's no reason to write out a log entry
9849 *
9850 * However, we pipeline them behind writes, so let's force
9851 * the write_ordered flag.
9852 */
9853 op->set_force_rwordered();
9854 } else {
9855 if (ceph_osd_op_mode_modify(iter->op.op))
9856 op->set_write();
9857 }
9858 if (ceph_osd_op_mode_read(iter->op.op))
9859 op->set_read();
9860
9861 // set READ flag if there are src_oids
9862 if (iter->soid.oid.name.length())
9863 op->set_read();
9864
9865 // set PGOP flag if there are PG ops
9866 if (ceph_osd_op_type_pg(iter->op.op))
9867 op->set_pg_op();
9868
9869 if (ceph_osd_op_mode_cache(iter->op.op))
9870 op->set_cache();
9871
9872 // check for ec base pool
9873 int64_t poolid = m->get_pg().pool();
9874 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
9875 if (pool && pool->is_tier()) {
9876 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
9877 if (base_pool && base_pool->require_rollback()) {
9878 if ((iter->op.op != CEPH_OSD_OP_READ) &&
9879 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
c07f9fc5 9880 (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
7c673cae
FG
9881 (iter->op.op != CEPH_OSD_OP_STAT) &&
9882 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
9883 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
9884 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
9885 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
9886 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
9887 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
9888 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
9889 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
9890 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
9891 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
9892 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
9893 (iter->op.op != CEPH_OSD_OP_CREATE) &&
9894 (iter->op.op != CEPH_OSD_OP_DELETE) &&
9895 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
9896 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
9897 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
9898 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
9899 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
9900 op->set_promote();
9901 }
9902 }
9903 }
9904
9905 switch (iter->op.op) {
9906 case CEPH_OSD_OP_CALL:
9907 {
9908 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
9909 int is_write, is_read;
9910 string cname, mname;
9911 bp.copy(iter->op.cls.class_len, cname);
9912 bp.copy(iter->op.cls.method_len, mname);
9913
9914 ClassHandler::ClassData *cls;
9915 int r = class_handler->open_class(cname, &cls);
9916 if (r) {
9917 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
9918 if (r == -ENOENT)
9919 r = -EOPNOTSUPP;
9920 else if (r != -EPERM) // propagate permission errors
9921 r = -EIO;
9922 return r;
9923 }
9924 int flags = cls->get_method_flags(mname.c_str());
9925 if (flags < 0) {
9926 if (flags == -ENOENT)
9927 r = -EOPNOTSUPP;
9928 else
9929 r = flags;
9930 return r;
9931 }
9932 is_read = flags & CLS_METHOD_RD;
9933 is_write = flags & CLS_METHOD_WR;
9934 bool is_promote = flags & CLS_METHOD_PROMOTE;
9935
9936 dout(10) << "class " << cname << " method " << mname << " "
9937 << "flags=" << (is_read ? "r" : "")
9938 << (is_write ? "w" : "")
9939 << (is_promote ? "p" : "")
9940 << dendl;
9941 if (is_read)
9942 op->set_class_read();
9943 if (is_write)
9944 op->set_class_write();
9945 if (is_promote)
9946 op->set_promote();
9947 op->add_class(cname, is_read, is_write, cls->whitelisted);
9948 break;
9949 }
9950
9951 case CEPH_OSD_OP_WATCH:
9952 // force the read bit for watch since it is depends on previous
9953 // watch state (and may return early if the watch exists) or, in
9954 // the case of ping, is simply a read op.
9955 op->set_read();
9956 // fall through
9957 case CEPH_OSD_OP_NOTIFY:
9958 case CEPH_OSD_OP_NOTIFY_ACK:
9959 {
9960 op->set_promote();
9961 break;
9962 }
9963
9964 case CEPH_OSD_OP_DELETE:
9965 // if we get a delete with FAILOK we can skip handle cache. without
9966 // FAILOK we still need to promote (or do something smarter) to
9967 // determine whether to return ENOENT or 0.
9968 if (iter == m->ops.begin() &&
9969 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
9970 op->set_skip_handle_cache();
9971 }
9972 // skip promotion when proxying a delete op
9973 if (m->ops.size() == 1) {
9974 op->set_skip_promote();
9975 }
9976 break;
9977
9978 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
9979 case CEPH_OSD_OP_CACHE_FLUSH:
9980 case CEPH_OSD_OP_CACHE_EVICT:
9981 // If try_flush/flush/evict is the only op, can skip handle cache.
9982 if (m->ops.size() == 1) {
9983 op->set_skip_handle_cache();
9984 }
9985 break;
9986
9987 case CEPH_OSD_OP_READ:
9988 case CEPH_OSD_OP_SYNC_READ:
9989 case CEPH_OSD_OP_SPARSE_READ:
9990 case CEPH_OSD_OP_CHECKSUM:
9991 case CEPH_OSD_OP_WRITEFULL:
9992 if (m->ops.size() == 1 &&
9993 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
9994 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
9995 op->set_skip_promote();
9996 }
9997 break;
9998
9999 // force promotion when pin an object in cache tier
10000 case CEPH_OSD_OP_CACHE_PIN:
10001 op->set_promote();
10002 break;
10003
10004 default:
10005 break;
10006 }
10007 }
10008
10009 if (op->rmw_flags == 0)
10010 return -EINVAL;
10011
10012 return 0;
10013}
10014
10015void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
10016 for (list<PG*>::iterator i = peering_queue.begin();
10017 i != peering_queue.end() &&
10018 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
10019 ) {
10020 if (in_use.count(*i)) {
10021 ++i;
10022 } else {
10023 out->push_back(*i);
10024 peering_queue.erase(i++);
10025 }
10026 }
10027 in_use.insert(out->begin(), out->end());
10028}
10029
224ce89b 10030
7c673cae
FG
10031// =============================================================
10032
10033#undef dout_context
10034#define dout_context osd->cct
10035#undef dout_prefix
10036#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
10037
10038void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
10039{
10040 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10041 auto sdata = shard_list[shard_index];
10042 bool queued = false;
10043 unsigned pushes_to_free = 0;
10044 {
10045 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10046 auto p = sdata->pg_slots.find(pgid);
10047 if (p != sdata->pg_slots.end()) {
10048 dout(20) << __func__ << " " << pgid
10049 << " to_process " << p->second.to_process
10050 << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
10051 for (auto i = p->second.to_process.rbegin();
10052 i != p->second.to_process.rend();
10053 ++i) {
10054 sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
10055 }
10056 for (auto& q : p->second.to_process) {
10057 pushes_to_free += q.get_reserved_pushes();
10058 }
10059 p->second.to_process.clear();
10060 p->second.waiting_for_pg = false;
10061 ++p->second.requeue_seq;
10062 queued = true;
10063 }
10064 }
10065 if (pushes_to_free > 0) {
10066 osd->service.release_reserved_pushes(pushes_to_free);
10067 }
10068 if (queued) {
10069 sdata->sdata_lock.Lock();
10070 sdata->sdata_cond.SignalOne();
10071 sdata->sdata_lock.Unlock();
10072 }
10073}
10074
10075void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
10076{
10077 unsigned pushes_to_free = 0;
10078 for (auto sdata : shard_list) {
10079 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10080 sdata->waiting_for_pg_osdmap = osdmap;
10081 auto p = sdata->pg_slots.begin();
10082 while (p != sdata->pg_slots.end()) {
10083 ShardData::pg_slot& slot = p->second;
10084 if (!slot.to_process.empty() && slot.num_running == 0) {
10085 if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
10086 dout(20) << __func__ << " " << p->first << " maps to us, keeping"
10087 << dendl;
10088 ++p;
10089 continue;
10090 }
10091 while (!slot.to_process.empty() &&
10092 slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
10093 auto& qi = slot.to_process.front();
10094 dout(20) << __func__ << " " << p->first
10095 << " item " << qi
10096 << " epoch " << qi.get_map_epoch()
10097 << " <= " << osdmap->get_epoch()
10098 << ", stale, dropping" << dendl;
10099 pushes_to_free += qi.get_reserved_pushes();
10100 slot.to_process.pop_front();
10101 }
10102 }
10103 if (slot.to_process.empty() &&
10104 slot.num_running == 0 &&
10105 !slot.pg) {
10106 dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
10107 p = sdata->pg_slots.erase(p);
10108 } else {
10109 ++p;
10110 }
10111 }
10112 }
10113 if (pushes_to_free > 0) {
10114 osd->service.release_reserved_pushes(pushes_to_free);
10115 }
10116}
10117
10118void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
10119{
10120 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10121 auto sdata = shard_list[shard_index];
10122 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10123 auto p = sdata->pg_slots.find(pgid);
10124 if (p != sdata->pg_slots.end()) {
10125 auto& slot = p->second;
10126 dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
10127 assert(!slot.pg || slot.pg->deleting);
10128 slot.pg = nullptr;
10129 }
10130}
10131
10132void OSD::ShardedOpWQ::clear_pg_slots()
10133{
10134 for (auto sdata : shard_list) {
10135 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10136 sdata->pg_slots.clear();
10137 sdata->waiting_for_pg_osdmap.reset();
10138 // don't bother with reserved pushes; we are shutting down
10139 }
10140}
10141
10142#undef dout_prefix
10143#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
10144
// Worker-thread body: take one item off this thread's shard queue, park
// it on the owning pg's slot, lock the pg, and run the item at the head
// of that slot.
//
// thread_index selects the shard (thread_index % num_shards); hb is the
// heartbeat handle passed to the heartbeat map and to the TPHandle.
//
// Every early-return path below releases sdata_op_ordering_lock and, if a
// pg was locked, the pg lock before returning.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % num_shards;
  ShardData *sdata = shard_list[shard_index];
  assert(NULL != sdata);

  // peek at spg_t
  sdata->sdata_op_ordering_lock.Lock();
  if (sdata->pqueue->empty()) {
    dout(20) << __func__ << " empty q, waiting" << dendl;
    // optimistically sleep a moment; maybe another work item will come along.
    osd->cct->get_heartbeat_map()->reset_timeout(hb,
      osd->cct->_conf->threadpool_default_timeout, 0);
    // sdata_lock is taken before the ordering lock is dropped, then the
    // thread waits on sdata_cond for at most
    // threadpool_empty_queue_max_wait
    sdata->sdata_lock.Lock();
    sdata->sdata_op_ordering_lock.Unlock();
    sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
      utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
    sdata->sdata_lock.Unlock();
    sdata->sdata_op_ordering_lock.Lock();
    // still nothing after the wait: give up for this call
    if (sdata->pqueue->empty()) {
      sdata->sdata_op_ordering_lock.Unlock();
      return;
    }
  }
  pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
  if (osd->is_stopping()) {
    sdata->sdata_op_ordering_lock.Unlock();
    return;    // OSD shutdown, discard.
  }
  PGRef pg;
  uint64_t requeue_seq;
  {
    // operator[] creates the slot if this pg has none yet
    auto& slot = sdata->pg_slots[item.first];
    dout(30) << __func__ << " " << item.first
             << " to_process " << slot.to_process
             << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
    slot.to_process.push_back(item.second);
    // note the requeue seq now...
    requeue_seq = slot.requeue_seq;
    if (slot.waiting_for_pg) {
      // save ourselves a bit of effort
      dout(20) << __func__ << " " << item.first << " item " << item.second
               << " queued, waiting_for_pg" << dendl;
      sdata->sdata_op_ordering_lock.Unlock();
      return;
    }
    pg = slot.pg;
    dout(20) << __func__ << " " << item.first << " item " << item.second
             << " queued" << dendl;
    // counted while the ordering lock is released below; decremented
    // once it is re-taken
    ++slot.num_running;
  }
  sdata->sdata_op_ordering_lock.Unlock();

  osd->service.maybe_inject_dispatch_delay();

  // [lookup +] lock pg (if we have it)
  if (!pg) {
    pg = osd->_lookup_lock_pg(item.first);
  } else {
    pg->lock();
  }

  osd->service.maybe_inject_dispatch_delay();

  boost::optional<PGQueueable> qi;

  // we don't use a Mutex::Locker here because of the
  // osd->service.release_reserved_pushes() call below
  sdata->sdata_op_ordering_lock.Lock();

  auto q = sdata->pg_slots.find(item.first);
  assert(q != sdata->pg_slots.end());
  auto& slot = q->second;
  --slot.num_running;

  if (slot.to_process.empty()) {
    // raced with wake_pg_waiters or prune_pg_waiters
    dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  // the slot's requeue_seq moved since we sampled it above
  if (requeue_seq != slot.requeue_seq) {
    dout(20) << __func__ << " " << item.first
             << " requeue_seq " << slot.requeue_seq << " > our "
             << requeue_seq << ", we raced with wake_pg_waiters"
             << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  // remember the pg on the slot, unless the pg is being deleted
  if (pg && !slot.pg && !pg->deleting) {
    dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
    slot.pg = pg;
  }
  dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
           << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;

  // make sure we're not already waiting for this pg
  if (slot.waiting_for_pg) {
    dout(20) << __func__ << " " << item.first << " item " << item.second
             << " slot is waiting_for_pg" << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }

  // take next item
  qi = slot.to_process.front();
  slot.to_process.pop_front();
  dout(20) << __func__ << " " << item.first << " item " << *qi
           << " pg " << pg << dendl;

  if (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
    if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
      dout(20) << __func__ << " " << item.first
               << " no pg, should exist, will wait" << " on " << *qi << dendl;
      // put the item back at the head and mark the slot as waiting
      slot.to_process.push_front(*qi);
      slot.waiting_for_pg = true;
    } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
      dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
               << qi->get_map_epoch() << " > " << osdmap->get_epoch()
               << ", will wait on " << *qi << dendl;
      slot.to_process.push_front(*qi);
      slot.waiting_for_pg = true;
    } else {
      dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
               << " dropping " << *qi << dendl;
      // share map with client?
      if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
        Session *session = static_cast<Session *>(
          (*_op)->get_req()->get_connection()->get_priv());
        if (session) {
          osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
          // balance the reference obtained via get_priv()
          session->put();
        }
      }
      unsigned pushes_to_free = qi->get_reserved_pushes();
      if (pushes_to_free > 0) {
        // release_reserved_pushes() is called with the ordering lock dropped
        sdata->sdata_op_ordering_lock.Unlock();
        osd->service.release_reserved_pushes(pushes_to_free);
        return;
      }
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  sdata->sdata_op_ordering_lock.Unlock();


  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_start, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  // dump the queue state as JSON into the level-30 debug log entry
  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
                                 suicide_interval);
  // run the item; the pg lock taken above is held across this call
  qi->run(osd, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  pg->unlock();
}
10342
10343void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
10344 uint32_t shard_index =
10345 item.first.hash_to_shard(shard_list.size());
10346
10347 ShardData* sdata = shard_list[shard_index];
10348 assert (NULL != sdata);
10349 unsigned priority = item.second.get_priority();
10350 unsigned cost = item.second.get_cost();
10351 sdata->sdata_op_ordering_lock.Lock();
10352
10353 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10354 if (priority >= osd->op_prio_cutoff)
10355 sdata->pqueue->enqueue_strict(
10356 item.second.get_owner(), priority, item);
10357 else
10358 sdata->pqueue->enqueue(
10359 item.second.get_owner(),
10360 priority, cost, item);
10361 sdata->sdata_op_ordering_lock.Unlock();
10362
10363 sdata->sdata_lock.Lock();
10364 sdata->sdata_cond.SignalOne();
10365 sdata->sdata_lock.Unlock();
10366
10367}
10368
10369void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
10370{
10371 uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
10372 ShardData* sdata = shard_list[shard_index];
10373 assert (NULL != sdata);
10374 sdata->sdata_op_ordering_lock.Lock();
10375 auto p = sdata->pg_slots.find(item.first);
10376 if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
10377 // we may be racing with _process, which has dequeued a new item
10378 // from pqueue, put it on to_process, and is now busy taking the
10379 // pg lock. ensure this old requeued item is ordered before any
10380 // such newer item in to_process.
10381 p->second.to_process.push_front(item.second);
10382 item.second = p->second.to_process.back();
10383 p->second.to_process.pop_back();
10384 dout(20) << __func__ << " " << item.first
10385 << " " << p->second.to_process.front()
10386 << " shuffled w/ " << item.second << dendl;
10387 } else {
10388 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10389 }
10390 sdata->_enqueue_front(item, osd->op_prio_cutoff);
10391 sdata->sdata_op_ordering_lock.Unlock();
10392 sdata->sdata_lock.Lock();
10393 sdata->sdata_cond.SignalOne();
10394 sdata->sdata_lock.Unlock();
10395}
10396
10397namespace ceph {
10398namespace osd_cmds {
10399
10400int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
10401{
10402 if (!ceph_using_tcmalloc()) {
10403 os << "could not issue heap profiler command -- not using tcmalloc!";
10404 return -EOPNOTSUPP;
10405 }
10406
10407 string cmd;
10408 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
10409 os << "unable to get value for command \"" << cmd << "\"";
10410 return -EINVAL;
10411 }
10412
10413 std::vector<std::string> cmd_vec;
10414 get_str_vec(cmd, cmd_vec);
10415
10416 ceph_heap_profiler_handle_command(cmd_vec, os);
10417
10418 return 0;
10419}
10420
10421}} // namespace ceph::osd_cmds
10422
224ce89b
WB
10423
10424std::ostream& operator<<(std::ostream& out, const OSD::io_queue& q) {
10425 switch(q) {
10426 case OSD::io_queue::prioritized:
10427 out << "prioritized";
10428 break;
10429 case OSD::io_queue::weightedpriority:
10430 out << "weightedpriority";
10431 break;
10432 case OSD::io_queue::mclock_opclass:
10433 out << "mclock_opclass";
10434 break;
10435 case OSD::io_queue::mclock_client:
10436 out << "mclock_client";
10437 break;
10438 }
10439 return out;
10440}