]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/OSD.cc
bump version to 12.0.3-pve3
[ceph.git] / ceph / src / osd / OSD.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
8 *
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
13 *
14 */
15#include "acconfig.h"
16
17#include <fstream>
18#include <iostream>
19#include <errno.h>
20#include <sys/stat.h>
21#include <signal.h>
22#include <ctype.h>
23#include <boost/scoped_ptr.hpp>
24
25#ifdef HAVE_SYS_PARAM_H
26#include <sys/param.h>
27#endif
28
29#ifdef HAVE_SYS_MOUNT_H
30#include <sys/mount.h>
31#endif
32
33#include "osd/PG.h"
34
35#include "include/types.h"
36#include "include/compat.h"
37
38#include "OSD.h"
39#include "OSDMap.h"
40#include "Watch.h"
41#include "osdc/Objecter.h"
42
43#include "common/errno.h"
44#include "common/ceph_argparse.h"
45#include "common/version.h"
46#include "common/io_priority.h"
47
48#include "os/ObjectStore.h"
49#ifdef HAVE_LIBFUSE
50#include "os/FuseStore.h"
51#endif
52
53#include "PrimaryLogPG.h"
54
55
56#include "msg/Messenger.h"
57#include "msg/Message.h"
58
59#include "mon/MonClient.h"
60
61#include "messages/MLog.h"
62
63#include "messages/MGenericMessage.h"
64#include "messages/MPing.h"
65#include "messages/MOSDPing.h"
66#include "messages/MOSDFailure.h"
67#include "messages/MOSDMarkMeDown.h"
68#include "messages/MOSDFull.h"
69#include "messages/MOSDOp.h"
70#include "messages/MOSDOpReply.h"
71#include "messages/MOSDBackoff.h"
72#include "messages/MOSDBeacon.h"
73#include "messages/MOSDRepOp.h"
74#include "messages/MOSDRepOpReply.h"
75#include "messages/MOSDBoot.h"
76#include "messages/MOSDPGTemp.h"
77
78#include "messages/MOSDMap.h"
79#include "messages/MMonGetOSDMap.h"
80#include "messages/MOSDPGNotify.h"
81#include "messages/MOSDPGQuery.h"
82#include "messages/MOSDPGLog.h"
83#include "messages/MOSDPGRemove.h"
84#include "messages/MOSDPGInfo.h"
85#include "messages/MOSDPGCreate.h"
86#include "messages/MOSDPGTrim.h"
87#include "messages/MOSDPGScan.h"
88#include "messages/MOSDPGBackfill.h"
89#include "messages/MBackfillReserve.h"
90#include "messages/MRecoveryReserve.h"
91#include "messages/MOSDECSubOpWrite.h"
92#include "messages/MOSDECSubOpWriteReply.h"
93#include "messages/MOSDECSubOpRead.h"
94#include "messages/MOSDECSubOpReadReply.h"
95#include "messages/MOSDPGCreated.h"
96#include "messages/MOSDPGUpdateLogMissing.h"
97#include "messages/MOSDPGUpdateLogMissingReply.h"
98
99#include "messages/MOSDAlive.h"
100
101#include "messages/MOSDScrub.h"
102#include "messages/MOSDScrubReserve.h"
103#include "messages/MOSDRepScrub.h"
104
105#include "messages/MMonCommand.h"
106#include "messages/MCommand.h"
107#include "messages/MCommandReply.h"
108
109#include "messages/MPGStats.h"
110#include "messages/MPGStatsAck.h"
111
112#include "messages/MWatchNotify.h"
113#include "messages/MOSDPGPush.h"
114#include "messages/MOSDPGPushReply.h"
115#include "messages/MOSDPGPull.h"
116
117#include "common/perf_counters.h"
118#include "common/Timer.h"
119#include "common/LogClient.h"
120#include "common/AsyncReserver.h"
121#include "common/HeartbeatMap.h"
122#include "common/admin_socket.h"
123#include "common/ceph_context.h"
124
125#include "global/signal_handler.h"
126#include "global/pidfile.h"
127
128#include "include/color.h"
129#include "perfglue/cpu_profiler.h"
130#include "perfglue/heap_profiler.h"
131
132#include "osd/OpRequest.h"
133
134#include "auth/AuthAuthorizeHandler.h"
135#include "auth/RotatingKeyRing.h"
136#include "common/errno.h"
137
138#include "objclass/objclass.h"
139
140#include "common/cmdparse.h"
141#include "include/str_list.h"
142#include "include/util.h"
143
144#include "include/assert.h"
145#include "common/config.h"
146#include "common/EventTrace.h"
147
148#ifdef WITH_LTTNG
149#define TRACEPOINT_DEFINE
150#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
151#include "tracing/osd.h"
152#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
153#undef TRACEPOINT_DEFINE
154#else
155#define tracepoint(...)
156#endif
157
158#define dout_context cct
159#define dout_subsys ceph_subsys_osd
160#undef dout_prefix
161#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
162
163const double OSD::OSD_TICK_INTERVAL = 1.0;
164
165static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
166 return *_dout << "osd." << whoami << " " << epoch << " ";
167}
168
// PGQueueable::RunVis is a variant visitor: each overload executes one kind
// of queued PG work item (client op, snap trim, scrub, recovery) when the
// item is dequeued by an op worker thread.

// Execute a queued client/replica op against its PG.
void PGQueueable::RunVis::operator()(const OpRequestRef &op) {
  return osd->dequeue_op(pg, op, handle);
}

// Run one round of snap trimming at the epoch the work was queued.
void PGQueueable::RunVis::operator()(const PGSnapTrim &op) {
  return pg->snap_trimmer(op.epoch_queued);
}

// Run a scrub chunk at the epoch the work was queued.
void PGQueueable::RunVis::operator()(const PGScrub &op) {
  return pg->scrub(op.epoch_queued, handle);
}

// Run recovery, honoring the push reservations taken when it was queued.
void PGQueueable::RunVis::operator()(const PGRecovery &op) {
  return osd->do_recovery(pg.get(), op.epoch_queued, op.reserved_pushes, handle);
}
184
185//Initial features in new superblock.
186//Features here are also automatically upgraded
187CompatSet OSD::get_osd_initial_compat_set() {
188 CompatSet::FeatureSet ceph_osd_feature_compat;
189 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
190 CompatSet::FeatureSet ceph_osd_feature_incompat;
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
192 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
193 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
194 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
195 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
196 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
197 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
198 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
199 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
200 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
201 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
202 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
203 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
204 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
205 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
206 ceph_osd_feature_incompat);
207}
208
209//Features are added here that this OSD supports.
210CompatSet OSD::get_osd_compat_set() {
211 CompatSet compat = get_osd_initial_compat_set();
212 //Any features here can be set in code, but not in initial superblock
213 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
214 return compat;
215}
216
// OSDService constructor: wires the service facade to its owning OSD and
// initializes every lock, timer, reserver, cache, and worker it manages.
// NOTE: initializer order mirrors member declaration order; several members
// (timers, reservers) reference members initialized earlier in the list.
OSDService::OSDService(OSD *osd) :
  osd(osd),
  cct(osd->cct),
  meta_osr(new ObjectStore::Sequencer("meta")),
  whoami(osd->whoami), store(osd->store),
  log_client(osd->log_client), clog(osd->clog),
  pg_recovery_stats(osd->pg_recovery_stats),
  cluster_messenger(osd->cluster_messenger),
  client_messenger(osd->client_messenger),
  logger(osd->logger),
  recoverystate_perf(osd->recoverystate_perf),
  monc(osd->monc),
  peering_wq(osd->peering_wq),
  recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
		  &osd->disk_tp),
  class_handler(osd->class_handler),
  pg_epoch_lock("OSDService::pg_epoch_lock"),
  publish_lock("OSDService::publish_lock"),
  pre_publish_lock("OSDService::pre_publish_lock"),
  max_oldest_map(0),
  peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
  sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
  scrubs_active(0),
  // tiering agent state: starts active with an invalid queue iterator
  agent_lock("OSDService::agent_lock"),
  agent_valid_iterator(false),
  agent_ops(0),
  flush_mode_high_count(0),
  agent_active(true),
  agent_thread(this),
  agent_stop_flag(false),
  agent_timer_lock("OSDService::agent_timer_lock"),
  agent_timer(osd->client_messenger->cct, agent_timer_lock),
  last_recalibrate(ceph_clock_now()),
  promote_max_objects(0),
  promote_max_bytes(0),
  objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
  objecter_finisher(osd->client_messenger->cct),
  watch_lock("OSDService::watch_lock"),
  watch_timer(osd->client_messenger->cct, watch_lock),
  next_notif_id(0),
  recovery_request_lock("OSDService::recovery_request_lock"),
  recovery_request_timer(cct, recovery_request_lock, false),
  reserver_finisher(cct),
  // local/remote backfill reservers share the same finisher and limits
  local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
		 cct->_conf->osd_min_recovery_priority),
  remote_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
		  cct->_conf->osd_min_recovery_priority),
  pg_temp_lock("OSDService::pg_temp_lock"),
  snap_sleep_lock("OSDService::snap_sleep_lock"),
  snap_sleep_timer(
    osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
  snap_reserver(&reserver_finisher,
		cct->_conf->osd_max_trimming_pgs),
  recovery_lock("OSDService::recovery_lock"),
  recovery_ops_active(0),
  recovery_ops_reserved(0),
  recovery_paused(false),
  map_cache_lock("OSDService::map_cache_lock"),
  map_cache(cct, cct->_conf->osd_map_cache_size),
  map_bl_cache(cct->_conf->osd_map_cache_size),
  map_bl_inc_cache(cct->_conf->osd_map_cache_size),
  in_progress_split_lock("OSDService::in_progress_split_lock"),
  stat_lock("OSDService::stat_lock"),
  full_status_lock("OSDService::full_status_lock"),
  cur_state(NONE),
  cur_ratio(0),
  epoch_lock("OSDService::epoch_lock"),
  boot_epoch(0), up_epoch(0), bind_epoch(0),
  is_stopping_lock("OSDService::is_stopping_lock")
#ifdef PG_DEBUG_REFS
  , pgid_lock("OSDService::pgid_lock")
#endif
{
  // objecter is owned by this service (deleted in ~OSDService)
  objecter->init();
}
292
// Destructor: releases the objecter allocated in the constructor.
// All other teardown happens earlier, in start_shutdown()/shutdown().
OSDService::~OSDService()
{
  delete objecter;
}
297
298void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
299{
300 for (set<spg_t>::const_iterator i = children.begin();
301 i != children.end();
302 ++i) {
303 dout(10) << __func__ << ": Starting split on pg " << *i
304 << ", parent=" << parent << dendl;
305 assert(!pending_splits.count(*i));
306 assert(!in_progress_splits.count(*i));
307 pending_splits.insert(make_pair(*i, parent));
308
309 assert(!rev_pending_splits[parent].count(*i));
310 rev_pending_splits[parent].insert(*i);
311 }
312}
313
314void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
315{
316 Mutex::Locker l(in_progress_split_lock);
317 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
318 assert(piter != rev_pending_splits.end());
319 for (set<spg_t>::const_iterator i = children.begin();
320 i != children.end();
321 ++i) {
322 assert(piter->second.count(*i));
323 assert(pending_splits.count(*i));
324 assert(!in_progress_splits.count(*i));
325 assert(pending_splits[*i] == parent);
326
327 pending_splits.erase(*i);
328 piter->second.erase(*i);
329 in_progress_splits.insert(*i);
330 }
331 if (piter->second.empty())
332 rev_pending_splits.erase(piter);
333}
334
// Public entry point: take the split lock, then delegate to the recursive
// underscore-prefixed helper which assumes the lock is held.
void OSDService::cancel_pending_splits_for_parent(spg_t parent)
{
  Mutex::Locker l(in_progress_split_lock);
  _cancel_pending_splits_for_parent(parent);
}
340
341void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
342{
343 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
344 if (piter == rev_pending_splits.end())
345 return;
346
347 for (set<spg_t>::iterator i = piter->second.begin();
348 i != piter->second.end();
349 ++i) {
350 assert(pending_splits.count(*i));
351 assert(!in_progress_splits.count(*i));
352 pending_splits.erase(*i);
353 dout(10) << __func__ << ": Completing split on pg " << *i
354 << " for parent: " << parent << dendl;
355 _cancel_pending_splits_for_parent(*i);
356 }
357 rev_pending_splits.erase(piter);
358}
359
360void OSDService::_maybe_split_pgid(OSDMapRef old_map,
361 OSDMapRef new_map,
362 spg_t pgid)
363{
364 assert(old_map->have_pg_pool(pgid.pool()));
365 int old_pgnum = old_map->get_pg_num(pgid.pool());
366 if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
367 set<spg_t> children;
368 if (pgid.is_split(old_pgnum,
369 new_map->get_pg_num(pgid.pool()), &children)) {
370 _start_split(pgid, children); }
371 } else {
372 assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
373 }
374}
375
// Walk every osdmap epoch between frommap and tomap and register pending
// splits for `pgid` and all descendants produced along the way. Needed
// because a pg may split multiple times across a span of maps, and each
// intermediate split must be tracked individually.
void OSDService::init_splits_between(spg_t pgid,
				     OSDMapRef frommap,
				     OSDMapRef tomap)
{
  // First, check whether we can avoid this potentially expensive check
  if (tomap->have_pg_pool(pgid.pool()) &&
      pgid.is_split(
	frommap->get_pg_num(pgid.pool()),
	tomap->get_pg_num(pgid.pool()),
	NULL)) {
    // Ok, a split happened, so we need to walk the osdmaps
    set<spg_t> new_pgs; // pgs to scan on each map
    new_pgs.insert(pgid);
    OSDMapRef curmap(get_map(frommap->get_epoch()));
    for (epoch_t e = frommap->get_epoch() + 1;
	 e <= tomap->get_epoch();
	 ++e) {
      // a missing intermediate map is skipped; curmap stays at the last
      // epoch we actually had, so is_split still spans the gap correctly
      OSDMapRef nextmap(try_get_map(e));
      if (!nextmap)
	continue;
      set<spg_t> even_newer_pgs; // pgs added in this loop
      for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
	set<spg_t> split_pgs;
	if (i->is_split(curmap->get_pg_num(i->pool()),
			nextmap->get_pg_num(i->pool()),
			&split_pgs)) {
	  start_split(*i, split_pgs);
	  even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
	}
      }
      // children found at this epoch may split again at later epochs
      new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
      curmap = nextmap;
    }
    assert(curmap == tomap); // we must have had both frommap and tomap
  }
}
412
// On each new map, re-examine every tracked split: drop entries whose pool
// was deleted, and detect further splits of already-splitting pgs.
// Uses the erase(it++) idiom because we mutate the sets while iterating.
void OSDService::expand_pg_num(OSDMapRef old_map,
			       OSDMapRef new_map)
{
  Mutex::Locker l(in_progress_split_lock);
  for (set<spg_t>::iterator i = in_progress_splits.begin();
       i != in_progress_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->pool())) {
      // pool went away; forget the split
      in_progress_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, *i);
      ++i;
    }
  }
  for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
       i != pending_splits.end();
       ) {
    if (!new_map->have_pg_pool(i->first.pool())) {
      // pool went away; drop both forward and reverse entries
      rev_pending_splits.erase(i->second);
      pending_splits.erase(i++);
    } else {
      _maybe_split_pgid(old_map, new_map, i->first);
      ++i;
    }
  }
}
439
440bool OSDService::splitting(spg_t pgid)
441{
442 Mutex::Locker l(in_progress_split_lock);
443 return in_progress_splits.count(pgid) ||
444 pending_splits.count(pgid);
445}
446
447void OSDService::complete_split(const set<spg_t> &pgs)
448{
449 Mutex::Locker l(in_progress_split_lock);
450 for (set<spg_t>::const_iterator i = pgs.begin();
451 i != pgs.end();
452 ++i) {
453 dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
454 assert(!pending_splits.count(*i));
455 assert(in_progress_splits.count(*i));
456 in_progress_splits.erase(*i);
457 }
458}
459
// Forward heartbeat-peer recalculation to the owning OSD.
void OSDService::need_heartbeat_peer_update()
{
  osd->need_heartbeat_peer_update();
}

// Queue this PG for inclusion in the next stats report to the monitor.
void OSDService::pg_stat_queue_enqueue(PG *pg)
{
  osd->pg_stat_queue_enqueue(pg);
}

// Remove this PG from the pending stats-report queue.
void OSDService::pg_stat_queue_dequeue(PG *pg)
{
  osd->pg_stat_queue_dequeue(pg);
}
474
// First phase of shutdown: stop the tiering-agent timer so no further
// delayed agent callbacks can fire while the rest of shutdown proceeds.
void OSDService::start_shutdown()
{
  {
    Mutex::Locker l(agent_timer_lock);
    agent_timer.shutdown();
  }
}
482
// Main shutdown: drain and stop finishers, timers, and the objecter, then
// drop our osdmap references. Order matters: each finisher is drained
// (wait_for_empty) before being stopped, and each timer is shut down under
// its own lock as SafeTimer requires.
void OSDService::shutdown()
{
  reserver_finisher.wait_for_empty();
  reserver_finisher.stop();
  {
    Mutex::Locker l(watch_lock);
    watch_timer.shutdown();
  }

  // stop the objecter before draining its finisher so no new completions
  // are queued after the drain
  objecter->shutdown();
  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  {
    Mutex::Locker l(recovery_request_lock);
    recovery_request_timer.shutdown();
  }

  {
    Mutex::Locker l(snap_sleep_lock);
    snap_sleep_timer.shutdown();
  }

  // release our map refs so the map cache can be torn down
  osdmap = OSDMapRef();
  next_osdmap = OSDMapRef();
}
509
// Start the service's helper machinery: finishers, timers, and the tiering
// agent thread. Counterpart of shutdown()/agent_stop().
void OSDService::init()
{
  reserver_finisher.start();
  objecter_finisher.start();
  objecter->set_client_incarnation(0);

  // deprioritize objecter in daemonperf output
  objecter->get_logger()->set_prio_adjust(-3);

  watch_timer.init();
  agent_timer.init();
  snap_sleep_timer.init();

  agent_thread.create("osd_srv_agent");

  // optionally hold off recovery for a configured number of seconds
  if (cct->_conf->osd_recovery_delay_start)
    defer_recovery(cct->_conf->osd_recovery_delay_start);
}
528
// Late init, run once an osdmap is available: start the objecter with it.
void OSDService::final_init()
{
  objecter->start(osdmap.get());
}
533
534void OSDService::activate_map()
535{
536 // wake/unwake the tiering agent
537 agent_lock.Lock();
538 agent_active =
539 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
540 osd->is_active();
541 agent_cond.Signal();
542 agent_lock.Unlock();
543}
544
// Timer callback used by agent_entry(): after the configured delay, ask the
// pg to re-evaluate its agent mode (it reported no work earlier). Holds a
// PGRef so the pg stays alive until the callback fires.
class AgentTimeoutCB : public Context {
  PGRef pg;
public:
  explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
  void finish(int) override {
    pg->agent_choose_mode_restart();
  }
};
553
// Body of the tiering agent thread. Repeatedly picks a pg from the highest
// priority tier of agent_queue and asks it to do flush/evict work, throttled
// by osd_agent_max_ops. Sleeps on agent_cond when there is no work or the
// agent is disabled. agent_lock is held except around the (slow) call into
// pg->agent_work().
void OSDService::agent_entry()
{
  dout(10) << __func__ << " start" << dendl;
  agent_lock.Lock();

  while (!agent_stop_flag) {
    if (agent_queue.empty()) {
      dout(20) << __func__ << " empty queue" << dendl;
      agent_cond.Wait(agent_lock);
      continue;
    }
    // work on the highest-priority (largest key) tier only
    uint64_t level = agent_queue.rbegin()->first;
    set<PGRef>& top = agent_queue.rbegin()->second;
    dout(10) << __func__
	     << " tiers " << agent_queue.size()
	     << ", top is " << level
	     << " with pgs " << top.size()
	     << ", ops " << agent_ops << "/"
	     << cct->_conf->osd_agent_max_ops
	     << (agent_active ? " active" : " NOT ACTIVE")
	     << dendl;
    dout(20) << __func__ << " oids " << agent_oids << dendl;
    // remaining op budget; flushes get a lower quota unless some pg is in
    // high-speed flush mode
    int max = cct->_conf->osd_agent_max_ops - agent_ops;
    int agent_flush_quota = max;
    if (!flush_mode_high_count)
      agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
    if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
      agent_cond.Wait(agent_lock);
      continue;
    }

    // round-robin across pgs in the top tier; the iterator is invalidated
    // whenever the queue changes (agent_valid_iterator is cleared elsewhere)
    if (!agent_valid_iterator || agent_queue_pos == top.end()) {
      agent_queue_pos = top.begin();
      agent_valid_iterator = true;
    }
    PGRef pg = *agent_queue_pos;
    dout(10) << "high_count " << flush_mode_high_count
	     << " agent_ops " << agent_ops
	     << " flush_quota " << agent_flush_quota << dendl;
    // drop the lock while the pg does real work
    agent_lock.Unlock();
    if (!pg->agent_work(max, agent_flush_quota)) {
      dout(10) << __func__ << " " << pg->get_pgid()
	<< " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
	<< " seconds" << dendl;

      osd->logger->inc(l_osd_tier_delay);
      // Queue a timer to call agent_choose_mode for this pg in 5 seconds
      agent_timer_lock.Lock();
      Context *cb = new AgentTimeoutCB(pg);
      agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
      agent_timer_lock.Unlock();
    }
    agent_lock.Lock();
  }
  agent_lock.Unlock();
  dout(10) << __func__ << " finish" << dendl;
}
611
// Stop the tiering agent thread. By the time this runs, all agent ops must
// have completed and every pg must have been dequeued; otherwise we assert.
void OSDService::agent_stop()
{
  {
    Mutex::Locker l(agent_lock);

    // By this time all ops should be cancelled
    assert(agent_ops == 0);
    // By this time all PGs are shutdown and dequeued
    if (!agent_queue.empty()) {
      set<PGRef>& top = agent_queue.rbegin()->second;
      derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
      assert(0 == "agent queue not empty");
    }

    // flag the thread to exit and wake it from its cond wait
    agent_stop_flag = true;
    agent_cond.Signal();
  }
  agent_thread.join();
}
631
632// -------------------------------------
633
// Periodically (each OSD tick) recompute promote_probability_millis — the
// per-mille probability that a cache-tier read triggers a promotion — so the
// actual promotion rate tracks the configured object/sec and bytes/sec
// targets. Also sets hard per-interval caps to mitigate stampedes.
void OSDService::promote_throttle_recalibrate()
{
  utime_t now = ceph_clock_now();
  double dur = now - last_recalibrate;
  last_recalibrate = now;
  unsigned prob = promote_probability_millis;

  uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
  uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;

  unsigned min_prob = 1;

  // sample the counters since the last recalibration (and decay them)
  uint64_t attempts, obj, bytes;
  promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
  dout(10) << __func__ << " " << attempts << " attempts, promoted "
	   << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
	   << target_obj_sec << " obj/sec or "
	   << pretty_si_t(target_bytes_sec) << " bytes/sec"
	   << dendl;

  // calculate what the probability *should* be, given the targets
  unsigned new_prob;
  if (attempts && dur > 0) {
    uint64_t avg_size = 1;
    if (obj)
      avg_size = MAX(bytes / obj, 1);
    // po/pb: per-mille probabilities that would hit the object/byte targets
    unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
    unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
      / (double)attempts;
    dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
	     << avg_size << dendl;
    if (target_obj_sec && target_bytes_sec)
      new_prob = MIN(po, pb);   // respect the tighter of the two limits
    else if (target_obj_sec)
      new_prob = po;
    else if (target_bytes_sec)
      new_prob = pb;
    else
      new_prob = 1000;          // no limits configured: always promote
  } else {
    new_prob = 1000;
  }
  dout(20) << __func__ << " new_prob " << new_prob << dendl;

  // correct for persistent skew between target rate and actual rate, adjust
  double ratio = 1.0;
  unsigned actual = 0;
  if (attempts && obj) {
    actual = obj * 1000 / attempts;
    ratio = (double)actual / (double)prob;
    new_prob = (double)new_prob / ratio;
  }
  new_prob = MAX(new_prob, min_prob);
  new_prob = MIN(new_prob, 1000);

  // adjust: move halfway toward the computed value to damp oscillation
  prob = (prob + new_prob) / 2;
  prob = MAX(prob, min_prob);
  prob = MIN(prob, 1000);
  dout(10) << __func__ << " actual " << actual
	   << ", actual/prob ratio " << ratio
	   << ", adjusted new_prob " << new_prob
	   << ", prob " << promote_probability_millis << " -> " << prob
	   << dendl;
  promote_probability_millis = prob;

  // set hard limits for this interval to mitigate stampedes
  promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
  promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
}
704
705// -------------------------------------
706
707float OSDService::get_failsafe_full_ratio()
708{
709 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
710 if (full_ratio > 1.0) full_ratio /= 100.0;
711 return full_ratio;
712}
713
714void OSDService::check_full_status(const osd_stat_t &osd_stat)
715{
716 Mutex::Locker l(full_status_lock);
717
718 float ratio = ((float)osd_stat.kb_used) / ((float)osd_stat.kb);
719 cur_ratio = ratio;
720
721 // The OSDMap ratios take precendence. So if the failsafe is .95 and
722 // the admin sets the cluster full to .96, the failsafe moves up to .96
723 // too. (Not that having failsafe == full is ideal, but it's better than
724 // dropping writes before the clusters appears full.)
725 OSDMapRef osdmap = get_osdmap();
726 if (!osdmap || osdmap->get_epoch() == 0) {
727 cur_state = NONE;
728 return;
729 }
730 float nearfull_ratio = osdmap->get_nearfull_ratio();
731 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
732 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
733 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
734
735 if (!osdmap->test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) {
736 // use the failsafe for nearfull and full; the mon isn't using the
737 // flags anyway because we're mid-upgrade.
738 full_ratio = failsafe_ratio;
739 backfillfull_ratio = failsafe_ratio;
740 nearfull_ratio = failsafe_ratio;
741 } else if (full_ratio <= 0 ||
742 backfillfull_ratio <= 0 ||
743 nearfull_ratio <= 0) {
744 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
745 // use failsafe flag. ick. the monitor did something wrong or the user
746 // did something stupid.
747 full_ratio = failsafe_ratio;
748 backfillfull_ratio = failsafe_ratio;
749 nearfull_ratio = failsafe_ratio;
750 }
751
752 string inject;
753 s_names new_state;
754 if (injectfull_state > NONE && injectfull) {
755 new_state = injectfull_state;
756 inject = "(Injected)";
757 } else if (ratio > failsafe_ratio) {
758 new_state = FAILSAFE;
759 } else if (ratio > full_ratio) {
760 new_state = FULL;
761 } else if (ratio > backfillfull_ratio) {
762 new_state = BACKFILLFULL;
763 } else if (ratio > nearfull_ratio) {
764 new_state = NEARFULL;
765 } else {
766 new_state = NONE;
767 }
768 dout(20) << __func__ << " cur ratio " << ratio
769 << ". nearfull_ratio " << nearfull_ratio
770 << ". backfillfull_ratio " << backfillfull_ratio
771 << ", full_ratio " << full_ratio
772 << ", failsafe_ratio " << failsafe_ratio
773 << ", new state " << get_full_state_name(new_state)
774 << " " << inject
775 << dendl;
776
777 // warn
778 if (cur_state != new_state) {
779 dout(10) << __func__ << " " << get_full_state_name(cur_state)
780 << " -> " << get_full_state_name(new_state) << dendl;
781 if (new_state == FAILSAFE) {
782 clog->error() << "failsafe engaged, dropping updates, now "
783 << (int)roundf(ratio * 100) << "% full";
784 } else if (cur_state == FAILSAFE) {
785 clog->error() << "failsafe disengaged, no longer dropping updates, now "
786 << (int)roundf(ratio * 100) << "% full";
787 }
788 cur_state = new_state;
789 }
790}
791
792bool OSDService::need_fullness_update()
793{
794 OSDMapRef osdmap = get_osdmap();
795 s_names cur = NONE;
796 if (osdmap->exists(whoami)) {
797 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
798 cur = FULL;
799 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
800 cur = BACKFILLFULL;
801 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
802 cur = NEARFULL;
803 }
804 }
805 s_names want = NONE;
806 if (is_full())
807 want = FULL;
808 else if (is_backfillfull())
809 want = BACKFILLFULL;
810 else if (is_nearfull())
811 want = NEARFULL;
812 return want != cur;
813}
814
// Shared implementation for the check_*_full() predicates: true if we are
// at least as full as `type`, with an explanation appended to `ss`.
// Note the injectfull test hook: a positive count is consumed one call at a
// time; -1 means "always full". (injectfull is mutable, hence decrement in
// a const method.)
bool OSDService::_check_full(s_names type, ostream &ss) const
{
  Mutex::Locker l(full_status_lock);

  if (injectfull && injectfull_state >= type) {
    // injectfull is either a count of the number of times to return failsafe full
    // or if -1 then always return full
    if (injectfull > 0)
      --injectfull;
    ss << "Injected " << get_full_state_name(type) << " OSD ("
       << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
    return true;
  }

  ss << "current usage is " << cur_ratio;
  return cur_state >= type;
}
832
// Convenience wrappers over _check_full() for each fullness threshold;
// each appends a human-readable reason to `ss`.

bool OSDService::check_failsafe_full(ostream &ss) const
{
  return _check_full(FAILSAFE, ss);
}

bool OSDService::check_full(ostream &ss) const
{
  return _check_full(FULL, ss);
}

bool OSDService::check_backfill_full(ostream &ss) const
{
  return _check_full(BACKFILLFULL, ss);
}

bool OSDService::check_nearfull(ostream &ss) const
{
  return _check_full(NEARFULL, ss);
}
852
// Fullness predicates over the cached state (no injectfull hook, no output
// stream). Note is_failsafe_full() tests equality while the others test
// ">=", relying on s_names ordering NONE < NEARFULL < ... < FAILSAFE.

bool OSDService::is_failsafe_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state == FAILSAFE;
}

bool OSDService::is_full() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= FULL;
}

bool OSDService::is_backfillfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= BACKFILLFULL;
}

bool OSDService::is_nearfull() const
{
  Mutex::Locker l(full_status_lock);
  return cur_state >= NEARFULL;
}
876
// Test hook: force the given fullness state for `count` subsequent checks
// (-1 = until cleared). Consumed by _check_full().
void OSDService::set_injectfull(s_names type, int64_t count)
{
  Mutex::Locker l(full_status_lock);
  injectfull_state = type;
  injectfull = count;
}
883
// Refresh osd_stat from the object store's statfs, record heartbeat peers
// and op-age histogram, export the figures to perf counters, and re-evaluate
// fullness. On statfs failure the previous stats are left untouched.
void OSDService::update_osd_stat(vector<int>& hb_peers)
{
  Mutex::Locker lock(stat_lock);

  // take ownership of the caller's peer list (caller's vector is clobbered)
  osd_stat.hb_peers.swap(hb_peers);

  osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);

  // fill in osd stats too
  struct store_statfs_t stbuf;
  int r = osd->store->statfs(&stbuf);
  if (r < 0) {
    derr << "statfs() failed: " << cpp_strerror(r) << dendl;
    return;
  }

  uint64_t bytes = stbuf.total;
  uint64_t used = bytes - stbuf.available;
  uint64_t avail = stbuf.available;

  osd_stat.kb = bytes >> 10;
  osd_stat.kb_used = used >> 10;
  osd_stat.kb_avail = avail >> 10;

  osd->logger->set(l_osd_stat_bytes, bytes);
  osd->logger->set(l_osd_stat_bytes_used, used);
  osd->logger->set(l_osd_stat_bytes_avail, avail);

  dout(20) << "update_osd_stat " << osd_stat << dendl;

  // fullness state depends on the numbers we just computed
  check_full_status(osd_stat);
}
916
917bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
918{
919 OSDMapRef osdmap = get_osdmap();
920 for (auto shard : missing_on) {
921 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
922 return true;
923 }
924 return false;
925}
926
// Send `m` to `peer` over the cluster messenger, but only if the peer is
// still up as of the reserved "next" map and has been up since from_epoch;
// otherwise the message is dropped (m->put()). Always releases the map
// reservation before returning.
void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    // peer restarted or went down since the sender's epoch; drop the message
    m->put();
    release_map(next_map);
    return;
  }
  const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
  ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
  // piggyback a map update if the peer may be behind
  share_map_peer(peer, peer_con.get(), next_map);
  peer_con->send_message(m);
  release_map(next_map);
}
945
// Return a cluster-messenger connection to `peer`, or NULL if the peer is
// down (or restarted since from_epoch) according to the reserved next map.
ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return NULL;
  }
  ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
  release_map(next_map);
  return con;
}
961
// Return the heartbeat connections (back, front) to `peer`, or an empty
// pair if the peer is down/restarted per the reserved next map. The front
// connection is only set when the peer advertises a front heartbeat address.
pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
{
  OSDMapRef next_map = get_nextmap_reserved();
  // service map is always newer/newest
  assert(from_epoch <= next_map->get_epoch());

  pair<ConnectionRef,ConnectionRef> ret;
  if (next_map->is_down(peer) ||
      next_map->get_info(peer).up_from > from_epoch) {
    release_map(next_map);
    return ret;
  }
  ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
  if (next_map->get_hb_front_addr(peer) != entity_addr_t())
    ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
  release_map(next_map);
  return ret;
}
980
981
982void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
983{
984 Mutex::Locker l(pg_temp_lock);
985 map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
986 if (p == pg_temp_pending.end() ||
987 p->second != want) {
988 pg_temp_wanted[pgid] = want;
989 }
990}
991
// Forget any pg_temp request for `pgid`, whether still wanted or already
// sent (pending acknowledgement from the monitor).
void OSDService::remove_want_pg_temp(pg_t pgid)
{
  Mutex::Locker l(pg_temp_lock);
  pg_temp_wanted.erase(pgid);
  pg_temp_pending.erase(pgid);
}
998
999void OSDService::_sent_pg_temp()
1000{
1001 for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
1002 p != pg_temp_wanted.end();
1003 ++p)
1004 pg_temp_pending[p->first] = p->second;
1005 pg_temp_wanted.clear();
1006}
1007
// Re-mark every pending pg_temp request as wanted so it is resent on the
// next send_pg_temp() (e.g. after a mon session reset).
void OSDService::requeue_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  // wanted overrides pending.  note that remove_want_pg_temp
  // clears the item out of both.
  unsigned old_wanted = pg_temp_wanted.size();
  unsigned old_pending = pg_temp_pending.size();
  // merge wanted into pending, then swap so everything is wanted again
  _sent_pg_temp();
  pg_temp_wanted.swap(pg_temp_pending);
  dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
	   << pg_temp_wanted.size() << dendl;
}
1020
// Send all wanted pg_temp mappings to the monitor in one MOSDPGTemp message,
// then move them to the pending set. No-op when nothing is wanted.
void OSDService::send_pg_temp()
{
  Mutex::Locker l(pg_temp_lock);
  if (pg_temp_wanted.empty())
    return;
  dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
  MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
  m->pg_temp = pg_temp_wanted;
  monc->send_mon_message(m);
  _sent_pg_temp();
}
1032
1033void OSDService::send_pg_created(pg_t pgid)
1034{
1035 dout(20) << __func__ << dendl;
1036 monc->send_mon_message(new MOSDPGCreated(pgid));
1037}
1038
1039// --------------------------------------
1040// dispatch
1041
1042epoch_t OSDService::get_peer_epoch(int peer)
1043{
1044 Mutex::Locker l(peer_map_epoch_lock);
1045 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1046 if (p == peer_map_epoch.end())
1047 return 0;
1048 return p->second;
1049}
1050
1051epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1052{
1053 Mutex::Locker l(peer_map_epoch_lock);
1054 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1055 if (p != peer_map_epoch.end()) {
1056 if (p->second < e) {
1057 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1058 p->second = e;
1059 } else {
1060 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1061 }
1062 return p->second;
1063 } else {
1064 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1065 peer_map_epoch[peer] = e;
1066 return e;
1067 }
1068}
1069
1070void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1071{
1072 Mutex::Locker l(peer_map_epoch_lock);
1073 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1074 if (p != peer_map_epoch.end()) {
1075 if (p->second <= as_of) {
1076 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1077 << " had " << p->second << dendl;
1078 peer_map_epoch.erase(p);
1079 } else {
1080 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1081 << " has " << p->second << " - not forgetting" << dendl;
1082 }
1083 }
1084}
1085
// Decide whether the peer on this connection should be sent osdmap
// updates.  Returns true when it appears to be behind our map:
//  - for clients: based on the session's last_sent_epoch, if supplied;
//  - for fellow OSDs on the cluster messenger: based on our cached
//    record of the peer's epoch (get_peer_epoch).
// This only decides; the actual sending is done by share_map() /
// share_map_peer().
bool OSDService::should_share_map(entity_name_t name, Connection *con,
                                  epoch_t epoch, const OSDMapRef& osdmap,
                                  const epoch_t *sent_epoch_p)
{
  dout(20) << "should_share_map "
           << name << " " << con->get_peer_addr()
           << " " << epoch << dendl;

  // does client have old map?
  if (name.is_client()) {
    bool message_sendmap = epoch < osdmap->get_epoch();
    if (message_sendmap && sent_epoch_p) {
      dout(20) << "client session last_sent_epoch: "
               << *sent_epoch_p
               << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
      if (*sent_epoch_p < osdmap->get_epoch()) {
        return true;
      } // else we don't need to send it out again
    }
  }

  // only consider OSD peers that are up in the map and talking to us on
  // one of their published cluster/heartbeat addresses -- and not our
  // own loopback connection
  if (con->get_messenger() == osd->cluster_messenger &&
      con != osd->cluster_messenger->get_loopback_connection() &&
      osdmap->is_up(name.num()) &&
      (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
       osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
    // remember: best known epoch is the larger of our cached record and
    // the epoch this message advertised
    epoch_t has = MAX(get_peer_epoch(name.num()), epoch);

    // share?
    if (has < osdmap->get_epoch()) {
      dout(10) << name << " " << con->get_peer_addr()
               << " has old map " << epoch << " < "
               << osdmap->get_epoch() << dendl;
      return true;
    }
  }

  return false;
}
1126
// Send incremental osdmap updates to a client or peer OSD if
// should_share_map() says it is behind.  For clients, *sent_epoch_p is
// advanced to the epoch being sent so the next op does not trigger a
// resend; for peer OSDs, note_peer_epoch() records the new epoch.
// No-op unless this OSD is active.
void OSDService::share_map(
  entity_name_t name,
  Connection *con,
  epoch_t epoch,
  OSDMapRef& osdmap,
  epoch_t *sent_epoch_p)
{
  dout(20) << "share_map "
	   << name << " " << con->get_peer_addr()
	   << " " << epoch << dendl;

  if (!osd->is_active()) {
    /*It is safe not to proceed as OSD is not in healthy state*/
    return;
  }

  bool want_shared = should_share_map(name, con, epoch,
                                      osdmap, sent_epoch_p);

  if (want_shared){
    if (name.is_client()) {
      dout(10) << name << " has old map " << epoch
               << " < " << osdmap->get_epoch() << dendl;
      // we know the Session is valid or we wouldn't be sending
      if (sent_epoch_p) {
	*sent_epoch_p = osdmap->get_epoch();
      }
      send_incremental_map(epoch, con, osdmap);
    } else if (con->get_messenger() == osd->cluster_messenger &&
	       osdmap->is_up(name.num()) &&
	       (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
		osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
      // peer OSD case: remember what we sent, then send it
      dout(10) << name << " " << con->get_peer_addr()
	       << " has old map " << epoch << " < "
	       << osdmap->get_epoch() << dendl;
      note_peer_epoch(name.num(), osdmap->get_epoch());
      send_incremental_map(epoch, con, osdmap);
    }
  }
}
1167
1168void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1169{
1170 if (!map)
1171 map = get_osdmap();
1172
1173 // send map?
1174 epoch_t pe = get_peer_epoch(peer);
1175 if (pe) {
1176 if (pe < map->get_epoch()) {
1177 send_incremental_map(pe, con, map);
1178 note_peer_epoch(peer, map->get_epoch());
1179 } else
1180 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1181 } else {
1182 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1183 // no idea about peer's epoch.
1184 // ??? send recent ???
1185 // do nothing.
1186 }
1187}
1188
1189bool OSDService::can_inc_scrubs_pending()
1190{
1191 bool can_inc = false;
1192 Mutex::Locker l(sched_scrub_lock);
1193
1194 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1195 dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
1196 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1197 can_inc = true;
1198 } else {
1199 dout(20) << __func__ << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1200 }
1201
1202 return can_inc;
1203}
1204
1205bool OSDService::inc_scrubs_pending()
1206{
1207 bool result = false;
1208
1209 sched_scrub_lock.Lock();
1210 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1211 dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
1212 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1213 result = true;
1214 ++scrubs_pending;
1215 } else {
1216 dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1217 }
1218 sched_scrub_lock.Unlock();
1219
1220 return result;
1221}
1222
1223void OSDService::dec_scrubs_pending()
1224{
1225 sched_scrub_lock.Lock();
1226 dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
1227 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1228 --scrubs_pending;
1229 assert(scrubs_pending >= 0);
1230 sched_scrub_lock.Unlock();
1231}
1232
1233void OSDService::inc_scrubs_active(bool reserved)
1234{
1235 sched_scrub_lock.Lock();
1236 ++(scrubs_active);
1237 if (reserved) {
1238 --(scrubs_pending);
1239 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1240 << " (max " << cct->_conf->osd_max_scrubs
1241 << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
1242 assert(scrubs_pending >= 0);
1243 } else {
1244 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1245 << " (max " << cct->_conf->osd_max_scrubs
1246 << ", pending " << scrubs_pending << ")" << dendl;
1247 }
1248 sched_scrub_lock.Unlock();
1249}
1250
1251void OSDService::dec_scrubs_active()
1252{
1253 sched_scrub_lock.Lock();
1254 dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
1255 << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
1256 --scrubs_active;
1257 assert(scrubs_active >= 0);
1258 sched_scrub_lock.Unlock();
1259}
1260
1261void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1262 epoch_t *_bind_epoch) const
1263{
1264 Mutex::Locker l(epoch_lock);
1265 if (_boot_epoch)
1266 *_boot_epoch = boot_epoch;
1267 if (_up_epoch)
1268 *_up_epoch = up_epoch;
1269 if (_bind_epoch)
1270 *_bind_epoch = bind_epoch;
1271}
1272
1273void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1274 const epoch_t *_bind_epoch)
1275{
1276 Mutex::Locker l(epoch_lock);
1277 if (_boot_epoch) {
1278 assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1279 boot_epoch = *_boot_epoch;
1280 }
1281 if (_up_epoch) {
1282 assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1283 up_epoch = *_up_epoch;
1284 }
1285 if (_bind_epoch) {
1286 assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1287 bind_epoch = *_bind_epoch;
1288 }
1289}
1290
// Begin an orderly shutdown.  If we are up in the osdmap, tell the mon we
// are going down (MOSDMarkMeDown with ack requested) and wait up to
// osd_mon_shutdown_timeout seconds for got_stop_ack() to signal us, so
// peers learn of the shutdown from the map rather than via heartbeat
// timeouts.  Always leaves the service in state STOPPING.  Returns false
// if a stop was already in progress.
bool OSDService::prepare_to_stop()
{
  Mutex::Locker l(is_stopping_lock);
  if (get_state() != NOT_STOPPING)
    return false;

  OSDMapRef osdmap = get_osdmap();
  if (osdmap && osdmap->is_up(whoami)) {
    dout(0) << __func__ << " telling mon we are shutting down" << dendl;
    set_state(PREPARING_TO_STOP);
    monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
					      osdmap->get_inst(whoami),
					      osdmap->get_epoch(),
					      true // request ack
					      ));
    utime_t now = ceph_clock_now();
    utime_t timeout;
    timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
    // wait until the mon acks (got_stop_ack flips state to STOPPING) or
    // the timeout expires; WaitUntil can wake spuriously, hence the loop
    while ((ceph_clock_now() < timeout) &&
	   (get_state() != STOPPING)) {
      is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
    }
  }
  dout(0) << __func__ << " starting shutdown" << dendl;
  set_state(STOPPING);
  return true;
}
1318
1319void OSDService::got_stop_ack()
1320{
1321 Mutex::Locker l(is_stopping_lock);
1322 if (get_state() == PREPARING_TO_STOP) {
1323 dout(0) << __func__ << " starting shutdown" << dendl;
1324 set_state(STOPPING);
1325 is_stopping_cond.Signal();
1326 } else {
1327 dout(10) << __func__ << " ignoring msg" << dendl;
1328 }
1329}
1330
// Build an MOSDMap carrying incrementals for (since, to], walking
// backwards from 'to'.  If an incremental is unavailable for some epoch
// but the full map is, the full map is included instead and the walk
// stops (the peer can rebuild from it).  Returns NULL when neither could
// be loaded for an epoch; the caller retries (see send_incremental_map).
MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
                                               OSDSuperblock& sblock)
{
  MOSDMap *m = new MOSDMap(monc->get_fsid());
  m->oldest_map = max_oldest_map;
  m->newest_map = sblock.newest_map;

  for (epoch_t e = to; e > since; e--) {
    bufferlist bl;
    if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
      m->incremental_maps[e].claim(bl);
    } else if (get_map_bl(e, bl)) {
      // fall back to the full map for this epoch; earlier epochs are not
      // needed once the peer has a full map to start from
      m->maps[e].claim(bl);
      break;
    } else {
      // neither an incremental nor a full map could be read for epoch e
      derr << "since " << since << " to " << to
	   << " oldest " << m->oldest_map << " newest " << m->newest_map
	   << dendl;
      m->put();
      m = NULL;
      break;
    }
  }
  return m;
}
1356
1357void OSDService::send_map(MOSDMap *m, Connection *con)
1358{
1359 con->send_message(m);
1360}
1361
1362void OSDService::send_incremental_map(epoch_t since, Connection *con,
1363 OSDMapRef& osdmap)
1364{
1365 epoch_t to = osdmap->get_epoch();
1366 dout(10) << "send_incremental_map " << since << " -> " << to
1367 << " to " << con << " " << con->get_peer_addr() << dendl;
1368
1369 MOSDMap *m = NULL;
1370 while (!m) {
1371 OSDSuperblock sblock(get_superblock());
1372 if (since < sblock.oldest_map) {
1373 // just send latest full map
1374 MOSDMap *m = new MOSDMap(monc->get_fsid());
1375 m->oldest_map = max_oldest_map;
1376 m->newest_map = sblock.newest_map;
1377 get_map_bl(to, m->maps[to]);
1378 send_map(m, con);
1379 return;
1380 }
1381
1382 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1383 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1384 << ", only sending most recent" << dendl;
1385 since = to - cct->_conf->osd_map_share_max_epochs;
1386 }
1387
1388 if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
1389 to = since + cct->_conf->osd_map_message_max;
1390 m = build_incremental_map_msg(since, to, sblock);
1391 }
1392 send_map(m, con);
1393}
1394
1395bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1396{
1397 bool found = map_bl_cache.lookup(e, &bl);
1398 if (found)
1399 return true;
1400 found = store->read(coll_t::meta(),
1401 OSD::get_osdmap_pobject_name(e), 0, 0, bl) >= 0;
1402 if (found)
1403 _add_map_bl(e, bl);
1404 return found;
1405}
1406
1407bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1408{
1409 Mutex::Locker l(map_cache_lock);
1410 bool found = map_bl_inc_cache.lookup(e, &bl);
1411 if (found)
1412 return true;
1413 found = store->read(coll_t::meta(),
1414 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl) >= 0;
1415 if (found)
1416 _add_map_inc_bl(e, bl);
1417 return found;
1418}
1419
1420void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1421{
1422 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
1423 map_bl_cache.add(e, bl);
1424}
1425
1426void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1427{
1428 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
1429 map_bl_inc_cache.add(e, bl);
1430}
1431
1432void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
1433{
1434 Mutex::Locker l(map_cache_lock);
1435 map_bl_inc_cache.pin(e, bl);
1436}
1437
1438void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
1439{
1440 Mutex::Locker l(map_cache_lock);
1441 map_bl_cache.pin(e, bl);
1442}
1443
1444void OSDService::clear_map_bl_cache_pins(epoch_t e)
1445{
1446 Mutex::Locker l(map_cache_lock);
1447 map_bl_inc_cache.clear_pinned(e);
1448 map_bl_cache.clear_pinned(e);
1449}
1450
1451OSDMapRef OSDService::_add_map(OSDMap *o)
1452{
1453 epoch_t e = o->get_epoch();
1454
1455 if (cct->_conf->osd_map_dedup) {
1456 // Dedup against an existing map at a nearby epoch
1457 OSDMapRef for_dedup = map_cache.lower_bound(e);
1458 if (for_dedup) {
1459 OSDMap::dedup(for_dedup.get(), o);
1460 }
1461 }
1462 bool existed;
1463 OSDMapRef l = map_cache.add(e, o, &existed);
1464 if (existed) {
1465 delete o;
1466 }
1467 return l;
1468}
1469
// Return the OSDMap for the given epoch: from the map cache when
// possible, otherwise loaded from disk, decoded, and inserted into the
// cache.  Returns a null ref if the map cannot be loaded.  Epoch 0
// yields a fresh, empty OSDMap.
OSDMapRef OSDService::try_get_map(epoch_t epoch)
{
  Mutex::Locker l(map_cache_lock);
  OSDMapRef retval = map_cache.lookup(epoch);
  if (retval) {
    dout(30) << "get_map " << epoch << " -cached" << dendl;
    if (logger) {
      logger->inc(l_osd_map_cache_hit);
    }
    return retval;
  }
  if (logger) {
    logger->inc(l_osd_map_cache_miss);
    epoch_t lb = map_cache.cached_key_lower_bound();
    if (epoch < lb) {
      // miss below the cache's lower bound: the requested epoch is older
      // than anything still cached
      dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
      logger->inc(l_osd_map_cache_miss_low);
      logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
    }
  }

  OSDMap *map = new OSDMap;
  if (epoch > 0) {
    dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
    bufferlist bl;
    if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
      derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
      delete map;
      return OSDMapRef();
    }
    map->decode(bl);
  } else {
    dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
  }
  // _add_map takes ownership of 'map'
  return _add_map(map);
}
1506
1507// ops
1508
1509
1510void OSDService::reply_op_error(OpRequestRef op, int err)
1511{
1512 reply_op_error(op, err, eversion_t(), 0);
1513}
1514
1515void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1516 version_t uv)
1517{
1518 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1519 assert(m->get_type() == CEPH_MSG_OSD_OP);
1520 int flags;
1521 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1522
1523 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1524 true);
1525 reply->set_reply_versions(v, uv);
1526 m->get_connection()->send_message(reply);
1527}
1528
// Handle a client op that arrived at a pg we are not primary for.  For
// EC pgs, ops whose target shard legitimately changed between the
// client's map epoch and ours are dropped silently (the client resends);
// genuine misdirections are reported to the cluster log, and optionally
// answered with ENXIO when osd_enxio_on_misdirected_op is set.
void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);

  assert(m->get_map_epoch() >= pg->info.history.same_primary_since);

  if (pg->is_ec_pg()) {
    /**
     * OSD recomputes op target based on current OSDMap. With an EC pg, we
     * can get this result:
     * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
     *    [CRUSH_ITEM_NONE, 2, 3]/3
     * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
     *    [3, 2, 3]/3
     * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
     *    -- misdirected op
     * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
     *    it and fulfils it
     *
     * We can't compute the op target based on the sending map epoch due to
     * splitting.  The simplest thing is to detect such cases here and drop
     * them without an error (the client will resend anyway).
     */
    assert(m->get_map_epoch() <= superblock.newest_map);
    OSDMapRef opmap = try_get_map(m->get_map_epoch());
    if (!opmap) {
      dout(7) << __func__ << ": " << *pg << " no longer have map for "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
    pg_t _pgid = m->get_raw_pg();
    spg_t pgid;
    // PGOP ops address the raw pg directly; everything else is mapped
    if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
      _pgid = opmap->raw_pg_to_pg(_pgid);
    if (opmap->get_primary_shard(_pgid, &pgid) &&
	pgid.shard != pg->info.pgid.shard) {
      // the shard this op targeted at the client's epoch is not ours;
      // not a true misdirection -- drop silently
      dout(7) << __func__ << ": " << *pg << " primary changed since "
	      << m->get_map_epoch() << ", dropping" << dendl;
      return;
    }
  }

  dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
  clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
	       << " pg " << m->get_raw_pg()
	       << " to osd." << whoami
	       << " not " << pg->acting
	       << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
  if (g_conf->osd_enxio_on_misdirected_op) {
    reply_op_error(op, -ENXIO);
  }
}
1582
1583void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
1584{
1585 osd->op_shardedwq.queue(make_pair(pgid, qi));
1586}
1587
1588void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
1589{
1590 osd->op_shardedwq.queue_front(make_pair(pgid, qi));
1591}
1592
1593void OSDService::queue_for_peering(PG *pg)
1594{
1595 peering_wq.queue(pg);
1596}
1597
1598void OSDService::queue_for_snap_trim(PG *pg)
1599{
1600 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1601 osd->op_shardedwq.queue(
1602 make_pair(
1603 pg->info.pgid,
1604 PGQueueable(
1605 PGSnapTrim(pg->get_osdmap()->get_epoch()),
1606 cct->_conf->osd_snap_trim_cost,
1607 cct->_conf->osd_snap_trim_priority,
1608 ceph_clock_now(),
1609 entity_inst_t(),
1610 pg->get_osdmap()->get_epoch())));
1611}
1612
1613
1614// ====================================================================
1615// OSD
1616
1617#undef dout_prefix
1618#define dout_prefix *_dout
1619
1620// Commands shared between OSD's console and admin console:
1621namespace ceph {
1622namespace osd_cmds {
1623
1624int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
1625
1626}} // namespace ceph::osd_cmds
1627
// One-time store initialization for a new OSD: mkfs + mount the
// ObjectStore, create (or validate) the OSD superblock, and persist the
// identity meta entries.  Takes ownership of 'store' and deletes it on
// every path (note the goto-based cleanup).  Returns 0 on success or a
// negative errno.
int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
	      uuid_d fsid, int whoami)
{
  int ret;

  ceph::shared_ptr<ObjectStore::Sequencer> osr(
    new ObjectStore::Sequencer("mkfs"));
  OSDSuperblock sb;
  bufferlist sbbl;
  C_SaferCond waiter;

  // if we are fed a uuid for this osd, use it.
  store->set_fsid(cct->_conf->osd_uuid);

  ret = store->mkfs();
  if (ret) {
    derr << "OSD::mkfs: ObjectStore::mkfs failed with error " << ret << dendl;
    goto free_store;
  }

  store->set_cache_shards(cct->_conf->osd_op_num_shards);

  ret = store->mount();
  if (ret) {
    derr << "OSD::mkfs: couldn't mount ObjectStore: error " << ret << dendl;
    goto free_store;
  }

  ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
  if (ret >= 0) {
    /* if we already have superblock, check content of superblock */
    // a re-run of mkfs must match the existing identity exactly
    dout(0) << " have superblock" << dendl;
    bufferlist::iterator p;
    p = sbbl.begin();
    ::decode(sb, p);
    if (whoami != sb.whoami) {
      derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
	   << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
    if (fsid != sb.cluster_fsid) {
      derr << "provided cluster fsid " << fsid
	   << " != superblock's " << sb.cluster_fsid << dendl;
      ret = -EINVAL;
      goto umount_store;
    }
  } else {
    // create superblock
    sb.cluster_fsid = fsid;
    sb.osd_fsid = store->get_fsid();
    sb.whoami = whoami;
    sb.compat_features = get_osd_initial_compat_set();

    bufferlist bl;
    ::encode(sb, bl);

    ObjectStore::Transaction t;
    t.create_collection(coll_t::meta(), 0);
    t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
    ret = store->apply_transaction(osr.get(), std::move(t));
    if (ret) {
      derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
	   << "apply_transaction returned " << ret << dendl;
      goto umount_store;
    }
  }

  // make sure the superblock write is committed before declaring success
  if (!osr->flush_commit(&waiter)) {
    waiter.wait();
  }

  ret = write_meta(store, sb.cluster_fsid, sb.osd_fsid, whoami);
  if (ret) {
    derr << "OSD::mkfs: failed to write fsid file: error " << ret << dendl;
    goto umount_store;
  }

umount_store:
  store->umount();
free_store:
  delete store;
  return ret;
}
1712
1713int OSD::write_meta(ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
1714{
1715 char val[80];
1716 int r;
1717
1718 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
1719 r = store->write_meta("magic", val);
1720 if (r < 0)
1721 return r;
1722
1723 snprintf(val, sizeof(val), "%d", whoami);
1724 r = store->write_meta("whoami", val);
1725 if (r < 0)
1726 return r;
1727
1728 cluster_fsid.print(val);
1729 r = store->write_meta("ceph_fsid", val);
1730 if (r < 0)
1731 return r;
1732
1733 r = store->write_meta("ready", "ready");
1734 if (r < 0)
1735 return r;
1736
1737 return 0;
1738}
1739
1740int OSD::peek_meta(ObjectStore *store, std::string& magic,
1741 uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
1742{
1743 string val;
1744
1745 int r = store->read_meta("magic", &val);
1746 if (r < 0)
1747 return r;
1748 magic = val;
1749
1750 r = store->read_meta("whoami", &val);
1751 if (r < 0)
1752 return r;
1753 whoami = atoi(val.c_str());
1754
1755 r = store->read_meta("ceph_fsid", &val);
1756 if (r < 0)
1757 return r;
1758 r = cluster_fsid.parse(val.c_str());
1759 if (!r)
1760 return -EINVAL;
1761
1762 r = store->read_meta("fsid", &val);
1763 if (r < 0) {
1764 osd_fsid = uuid_d();
1765 } else {
1766 r = osd_fsid.parse(val.c_str());
1767 if (!r)
1768 return -EINVAL;
1769 }
1770
1771 return 0;
1772}
1773
1774
1775#undef dout_prefix
1776#define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
1777
1778// cons/des
1779
// OSD constructor: wires together the messengers, mon/mgr clients, work
// queues, thread pools and the OSDService.  No store or network I/O
// happens here; actual startup is performed later by pre_init()/init().
OSD::OSD(CephContext *cct_, ObjectStore *store_,
	 int id,
	 Messenger *internal_messenger,
	 Messenger *external_messenger,
	 Messenger *hb_client_front,
	 Messenger *hb_client_back,
	 Messenger *hb_front_serverm,
	 Messenger *hb_back_serverm,
	 Messenger *osdc_messenger,
	 MonClient *mc,
	 const std::string &dev, const std::string &jdev) :
  Dispatcher(cct_),
  osd_lock("OSD::osd_lock"),
  tick_timer(cct, osd_lock),
  tick_timer_lock("OSD::tick_timer_lock"),
  tick_timer_without_osd_lock(cct, tick_timer_lock),
  // auth registries fall back to the cluster/service-specific option when
  // the generic auth_supported list is unset
  authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
								      cct->_conf->auth_supported.empty() ?
								      cct->_conf->auth_cluster_required :
								      cct->_conf->auth_supported)),
  authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
								      cct->_conf->auth_supported.empty() ?
								      cct->_conf->auth_service_required :
								      cct->_conf->auth_supported)),
  cluster_messenger(internal_messenger),
  client_messenger(external_messenger),
  objecter_messenger(osdc_messenger),
  monc(mc),
  mgrc(cct_, client_messenger),
  logger(NULL),
  recoverystate_perf(NULL),
  store(store_),
  log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
  clog(log_client.create_channel()),
  whoami(id),
  dev_path(dev), journal_path(jdev),
  trace_endpoint("0.0.0.0", 0, "osd"),
  asok_hook(NULL),
  osd_compat(get_osd_compat_set()),
  // thread pools: generic osd work, sharded op work, disk work, commands
  osd_tp(cct, "OSD::osd_tp", "tp_osd", cct->_conf->osd_op_threads, "osd_op_threads"),
  osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
	    cct->_conf->osd_op_num_threads_per_shard * cct->_conf->osd_op_num_shards),
  disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
  command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
  session_waiting_lock("OSD::session_waiting_lock"),
  heartbeat_lock("OSD::heartbeat_lock"),
  heartbeat_stop(false),
  heartbeat_need_update(true),
  hb_front_client_messenger(hb_client_front),
  hb_back_client_messenger(hb_client_back),
  hb_front_server_messenger(hb_front_serverm),
  hb_back_server_messenger(hb_back_serverm),
  daily_loadavg(0.0),
  heartbeat_thread(this),
  heartbeat_dispatcher(this),
  op_tracker(cct, cct->_conf->osd_enable_op_tracker,
	     cct->_conf->osd_num_op_tracker_shard),
  test_ops_hook(NULL),
  op_queue(get_io_queue()),
  op_prio_cutoff(get_io_prio_cut()),
  op_shardedwq(
    cct->_conf->osd_op_num_shards,
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_op_tp),
  peering_wq(
    this,
    cct->_conf->osd_op_thread_timeout,
    cct->_conf->osd_op_thread_suicide_timeout,
    &osd_tp),
  map_lock("OSD::map_lock"),
  pg_map_lock("OSD::pg_map_lock"),
  last_pg_create_epoch(0),
  mon_report_lock("OSD::mon_report_lock"),
  stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
  up_thru_wanted(0),
  requested_full_first(0),
  requested_full_last(0),
  pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
  osd_stat_updated(false),
  pg_stat_tid(0), pg_stat_tid_flushed(0),
  command_wq(
    this,
    cct->_conf->osd_command_thread_timeout,
    cct->_conf->osd_command_thread_suicide_timeout,
    &command_tp),
  remove_wq(
    cct,
    store,
    cct->_conf->osd_remove_thread_timeout,
    cct->_conf->osd_remove_thread_suicide_timeout,
    &disk_tp),
  service(this)
{
  monc->set_messenger(client_messenger);
  // op tracker thresholds come from config at construction time
  op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                         cct->_conf->osd_op_log_threshold);
  op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                           cct->_conf->osd_op_history_duration);
  op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                    cct->_conf->osd_op_history_slow_op_threshold);
#ifdef WITH_BLKIN
  std::stringstream ss;
  ss << "osd." << whoami;
  trace_endpoint.copy_name(ss.str());
#endif
}
1888
OSD::~OSD()
{
  // Release objects this OSD owns.  Perf counters are unregistered from
  // the collection before being deleted; the ObjectStore is deleted last.
  delete authorize_handler_cluster_registry;
  delete authorize_handler_service_registry;
  delete class_handler;
  cct->get_perfcounters_collection()->remove(recoverystate_perf);
  cct->get_perfcounters_collection()->remove(logger);
  delete recoverystate_perf;
  delete logger;
  delete store;
}
1900
1901void cls_initialize(ClassHandler *ch);
1902
1903void OSD::handle_signal(int signum)
1904{
1905 assert(signum == SIGINT || signum == SIGTERM);
1906 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
1907 shutdown();
1908}
1909
1910int OSD::pre_init()
1911{
1912 Mutex::Locker lock(osd_lock);
1913 if (is_stopping())
1914 return 0;
1915
1916 if (store->test_mount_in_use()) {
1917 derr << "OSD::pre_init: object store '" << dev_path << "' is "
1918 << "currently in use. (Is ceph-osd already running?)" << dendl;
1919 return -EBUSY;
1920 }
1921
1922 cct->_conf->add_observer(this);
1923 return 0;
1924}
1925
1926// asok
1927
1928class OSDSocketHook : public AdminSocketHook {
1929 OSD *osd;
1930public:
1931 explicit OSDSocketHook(OSD *o) : osd(o) {}
1932 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
1933 bufferlist& out) override {
1934 stringstream ss;
1935 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
1936 out.append(ss);
1937 return r;
1938 }
1939};
1940
1941bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
1942 ostream& ss)
1943{
1944 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
1945 if (admin_command == "status") {
1946 f->open_object_section("status");
1947 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
1948 f->dump_stream("osd_fsid") << superblock.osd_fsid;
1949 f->dump_unsigned("whoami", superblock.whoami);
1950 f->dump_string("state", get_state_name(get_state()));
1951 f->dump_unsigned("oldest_map", superblock.oldest_map);
1952 f->dump_unsigned("newest_map", superblock.newest_map);
1953 {
1954 RWLock::RLocker l(pg_map_lock);
1955 f->dump_unsigned("num_pgs", pg_map.size());
1956 }
1957 f->close_section();
1958 } else if (admin_command == "flush_journal") {
1959 store->flush_journal();
1960 } else if (admin_command == "dump_ops_in_flight" ||
1961 admin_command == "ops") {
1962 if (!op_tracker.dump_ops_in_flight(f)) {
1963 ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
1964 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
1965 }
1966 } else if (admin_command == "dump_blocked_ops") {
1967 if (!op_tracker.dump_ops_in_flight(f, true)) {
1968 ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
1969 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
1970 }
1971 } else if (admin_command == "dump_historic_ops") {
1972 if (!op_tracker.dump_historic_ops(f, false)) {
1973 ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
1974 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
1975 }
1976 } else if (admin_command == "dump_historic_ops_by_duration") {
1977 if (!op_tracker.dump_historic_ops(f, true)) {
1978 ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
1979 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
1980 }
1981 } else if (admin_command == "dump_historic_slow_ops") {
1982 if (!op_tracker.dump_historic_slow_ops(f)) {
1983 ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
1984 Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
1985 }
1986 } else if (admin_command == "dump_op_pq_state") {
1987 f->open_object_section("pq");
1988 op_shardedwq.dump(f);
1989 f->close_section();
1990 } else if (admin_command == "dump_blacklist") {
1991 list<pair<entity_addr_t,utime_t> > bl;
1992 OSDMapRef curmap = service.get_osdmap();
1993
1994 f->open_array_section("blacklist");
1995 curmap->get_blacklist(&bl);
1996 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
1997 it != bl.end(); ++it) {
1998 f->open_array_section("entry");
1999 f->open_object_section("entity_addr_t");
2000 it->first.dump(f);
2001 f->close_section(); //entity_addr_t
2002 it->second.localtime(f->dump_stream("expire_time"));
2003 f->close_section(); //entry
2004 }
2005 f->close_section(); //blacklist
2006 } else if (admin_command == "dump_watchers") {
2007 list<obj_watch_item_t> watchers;
2008 // scan pg's
2009 {
2010 Mutex::Locker l(osd_lock);
2011 RWLock::RLocker l2(pg_map_lock);
2012 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2013 it != pg_map.end();
2014 ++it) {
2015
2016 list<obj_watch_item_t> pg_watchers;
2017 PG *pg = it->second;
2018 pg->lock();
2019 pg->get_watchers(pg_watchers);
2020 pg->unlock();
2021 watchers.splice(watchers.end(), pg_watchers);
2022 }
2023 }
2024
2025 f->open_array_section("watchers");
2026 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2027 it != watchers.end(); ++it) {
2028
2029 f->open_array_section("watch");
2030
2031 f->dump_string("namespace", it->obj.nspace);
2032 f->dump_string("object", it->obj.oid.name);
2033
2034 f->open_object_section("entity_name");
2035 it->wi.name.dump(f);
2036 f->close_section(); //entity_name_t
2037
2038 f->dump_int("cookie", it->wi.cookie);
2039 f->dump_int("timeout", it->wi.timeout_seconds);
2040
2041 f->open_object_section("entity_addr_t");
2042 it->wi.addr.dump(f);
2043 f->close_section(); //entity_addr_t
2044
2045 f->close_section(); //watch
2046 }
2047
2048 f->close_section(); //watchers
2049 } else if (admin_command == "dump_reservations") {
2050 f->open_object_section("reservations");
2051 f->open_object_section("local_reservations");
2052 service.local_reserver.dump(f);
2053 f->close_section();
2054 f->open_object_section("remote_reservations");
2055 service.remote_reserver.dump(f);
2056 f->close_section();
2057 f->close_section();
2058 } else if (admin_command == "get_latest_osdmap") {
2059 get_latest_osdmap();
2060 } else if (admin_command == "heap") {
2061 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2062
2063 // Note: Failed heap profile commands won't necessarily trigger an error:
2064 f->open_object_section("result");
2065 f->dump_string("error", cpp_strerror(result));
2066 f->dump_bool("success", result >= 0);
2067 f->close_section();
2068 } else if (admin_command == "set_heap_property") {
2069 string property;
2070 int64_t value = 0;
2071 string error;
2072 bool success = false;
2073 if (!cmd_getval(cct, cmdmap, "property", property)) {
2074 error = "unable to get property";
2075 success = false;
2076 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2077 error = "unable to get value";
2078 success = false;
2079 } else if (value < 0) {
2080 error = "negative value not allowed";
2081 success = false;
2082 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2083 error = "invalid property";
2084 success = false;
2085 } else {
2086 success = true;
2087 }
2088 f->open_object_section("result");
2089 f->dump_string("error", error);
2090 f->dump_bool("success", success);
2091 f->close_section();
2092 } else if (admin_command == "get_heap_property") {
2093 string property;
2094 size_t value = 0;
2095 string error;
2096 bool success = false;
2097 if (!cmd_getval(cct, cmdmap, "property", property)) {
2098 error = "unable to get property";
2099 success = false;
2100 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2101 error = "invalid property";
2102 success = false;
2103 } else {
2104 success = true;
2105 }
2106 f->open_object_section("result");
2107 f->dump_string("error", error);
2108 f->dump_bool("success", success);
2109 f->dump_int("value", value);
2110 f->close_section();
2111 } else if (admin_command == "dump_objectstore_kv_stats") {
2112 store->get_db_statistics(f);
2113 } else if (admin_command == "dump_scrubs") {
2114 service.dumps_scrub(f);
2115 } else if (admin_command == "calc_objectstore_db_histogram") {
2116 store->generate_db_histogram(f);
2117 } else if (admin_command == "flush_store_cache") {
2118 store->flush_cache();
2119 } else if (admin_command == "dump_pgstate_history") {
2120 f->open_object_section("pgstate_history");
2121 RWLock::RLocker l2(pg_map_lock);
2122 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2123 it != pg_map.end();
2124 ++it) {
2125
2126 PG *pg = it->second;
2127 f->dump_stream("pg") << pg->get_pgid();
2128 pg->lock();
2129 pg->pgstate_history.dump(f);
2130 pg->unlock();
2131 }
2132 f->close_section();
2133 } else {
2134 assert(0 == "broken asok registration");
2135 }
2136 f->flush(ss);
2137 delete f;
2138 return true;
2139}
2140
2141class TestOpsSocketHook : public AdminSocketHook {
2142 OSDService *service;
2143 ObjectStore *store;
2144public:
2145 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2146 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2147 bufferlist& out) override {
2148 stringstream ss;
2149 test_ops(service, store, command, cmdmap, ss);
2150 out.append(ss);
2151 return true;
2152 }
2153 void test_ops(OSDService *service, ObjectStore *store,
2154 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
2155
2156};
2157
2158class OSD::C_Tick : public Context {
2159 OSD *osd;
2160 public:
2161 explicit C_Tick(OSD *o) : osd(o) {}
2162 void finish(int r) override {
2163 osd->tick();
2164 }
2165};
2166
2167class OSD::C_Tick_WithoutOSDLock : public Context {
2168 OSD *osd;
2169 public:
2170 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2171 void finish(int r) override {
2172 osd->tick_without_osd_lock();
2173 }
2174};
2175
2176int OSD::enable_disable_fuse(bool stop)
2177{
2178#ifdef HAVE_LIBFUSE
2179 int r;
2180 string mntpath = cct->_conf->osd_data + "/fuse";
2181 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2182 dout(1) << __func__ << " disabling" << dendl;
2183 fuse_store->stop();
2184 delete fuse_store;
2185 fuse_store = NULL;
2186 r = ::rmdir(mntpath.c_str());
2187 if (r < 0)
2188 r = -errno;
2189 if (r < 0) {
2190 derr << __func__ << " failed to rmdir " << mntpath << dendl;
2191 return r;
2192 }
2193 return 0;
2194 }
2195 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2196 dout(1) << __func__ << " enabling" << dendl;
2197 r = ::mkdir(mntpath.c_str(), 0700);
2198 if (r < 0)
2199 r = -errno;
2200 if (r < 0 && r != -EEXIST) {
2201 derr << __func__ << " unable to create " << mntpath << ": "
2202 << cpp_strerror(r) << dendl;
2203 return r;
2204 }
2205 fuse_store = new FuseStore(store, mntpath);
2206 r = fuse_store->start();
2207 if (r < 0) {
2208 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2209 delete fuse_store;
2210 fuse_store = NULL;
2211 return r;
2212 }
2213 }
2214#endif // HAVE_LIBFUSE
2215 return 0;
2216}
2217
// Bring the OSD up: mount the object store, sanity-check and (if needed)
// upgrade the superblock, load the current OSDMap and the PGs, start
// worker threads and timers, authenticate with the monitors, and finally
// kick off the boot process via start_boot().
//
// Holds osd_lock for most of its run, but deliberately drops it around
// the blocking monitor calls (authenticate / rotating keys / crush
// updates) and re-takes it afterwards.  Two error labels:
//   monout: monitor/mgr clients were initialized and must be shut down
//   out:    unmount and delete the store before returning
// Returns 0 on success (also when a clean shutdown is detected mid-init)
// or a negative error code.
int OSD::init()
{
  CompatSet initial, diff;
  Mutex::Locker lock(osd_lock);
  if (is_stopping())
    return 0;

  tick_timer.init();
  tick_timer_without_osd_lock.init();
  service.recovery_request_timer.init();

  // mount.
  dout(2) << "mounting " << dev_path << " "
	  << (journal_path.empty() ? "(no journal)" : journal_path) << dendl;
  assert(store);  // call pre_init() first!

  store->set_cache_shards(cct->_conf->osd_op_num_shards);

  int r = store->mount();
  if (r < 0) {
    derr << "OSD:init: unable to mount object store" << dendl;
    return r;
  }

  enable_disable_fuse(false);

  dout(2) << "boot" << dendl;

  // initialize the daily loadavg with current 15min loadavg
  double loadavgs[3];
  if (getloadavg(loadavgs, 3) == 3) {
    daily_loadavg = loadavgs[2];
  } else {
    derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
    daily_loadavg = 1.0;
  }

  int rotating_auth_attempts = 0;

  // sanity check long object name handling: build a maximal hobject_t and
  // ask the backend whether it can store names/keys/namespaces that long.
  {
    hobject_t l;
    l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
    l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
    l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
    r = store->validate_hobject_key(l);
    if (r < 0) {
      derr << "backend (" << store->get_type() << ") is unable to support max "
	   << "object name[space] len" << dendl;
      derr << "   osd max object name len = "
	   << cct->_conf->osd_max_object_name_len << dendl;
      derr << "   osd max object namespace len = "
	   << cct->_conf->osd_max_object_namespace_len << dendl;
      derr << cpp_strerror(r) << dendl;
      if (cct->_conf->osd_check_max_object_name_len_on_startup) {
	goto out;
      }
      derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
	   << dendl;
    } else {
      dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
    }
  }

  // read superblock
  r = read_superblock();
  if (r < 0) {
    derr << "OSD::init() : unable to read osd superblock" << dendl;
    r = -EINVAL;
    goto out;
  }

  // Refuse to run against an on-disk format newer than this binary
  // understands.  Both branches bail out; they differ only in whether the
  // disk would at least still be writeable by this daemon.
  if (osd_compat.compare(superblock.compat_features) < 0) {
    derr << "The disk uses features unsupported by the executable." << dendl;
    derr << " ondisk features " << superblock.compat_features << dendl;
    derr << " daemon features " << osd_compat << dendl;

    if (osd_compat.writeable(superblock.compat_features)) {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "it is still writeable, though. Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
    else {
      CompatSet diff = osd_compat.unsupported(superblock.compat_features);
      derr << "Cannot write to disk! Missing features: " << diff << dendl;
      r = -EOPNOTSUPP;
      goto out;
    }
  }

  // The superblock must belong to this OSD id.
  assert_warn(whoami == superblock.whoami);
  if (whoami != superblock.whoami) {
    derr << "OSD::init: superblock says osd"
	 << superblock.whoami << " but I am osd." << whoami << dendl;
    r = -EINVAL;
    goto out;
  }

  initial = get_osd_initial_compat_set();
  diff = superblock.compat_features.unsupported(initial);
  if (superblock.compat_features.merge(initial)) {
    // We need to persist the new compat_set before we
    // do anything else
    dout(5) << "Upgrading superblock adding: " << diff << dendl;
    ObjectStore::Transaction t;
    write_superblock(t);
    r = store->apply_transaction(service.meta_osr.get(), std::move(t));
    if (r < 0)
      goto out;
  }

  // make sure snap mapper object exists
  if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
    dout(10) << "init creating/touching snapmapper object" << dendl;
    ObjectStore::Transaction t;
    t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
    r = store->apply_transaction(service.meta_osr.get(), std::move(t));
    if (r < 0)
      goto out;
  }

  class_handler = new ClassHandler(cct);
  cls_initialize(class_handler);

  if (cct->_conf->osd_open_classes_on_start) {
    // Class load failures are non-fatal here; note the inner `r` shadows
    // the outer one on purpose so a warning doesn't poison the exit code.
    int r = class_handler->open_all_classes();
    if (r)
      dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
  }

  // load up "current" osdmap
  assert_warn(!osdmap);
  if (osdmap) {
    derr << "OSD::init: unable to read current osdmap" << dendl;
    r = -EINVAL;
    goto out;
  }
  osdmap = get_map(superblock.current_epoch);
  check_osdmap_features(store);

  create_recoverystate_perf();

  {
    epoch_t bind_epoch = osdmap->get_epoch();
    service.set_epochs(NULL, NULL, &bind_epoch);
  }

  clear_temp_objects();

  // load up pgs (as they previously existed)
  load_pgs();

  dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
  dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
    op_prio_cutoff << "." << dendl;

  create_logger();

  // i'm ready!
  client_messenger->add_dispatcher_head(this);
  cluster_messenger->add_dispatcher_head(this);

  hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
  hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);

  objecter_messenger->add_dispatcher_head(service.objecter);

  monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
                      | CEPH_ENTITY_TYPE_MGR);
  r = monc->init();
  if (r < 0)
    goto out;

  /**
   * FIXME: this is a placeholder implementation that unconditionally
   * sends every is_primary PG's stats every time we're called, unlike
   * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
   * This has equivalent cost to the existing worst case where all
   * PGs are busy and their stats are always enqueued for sending.
   */
  mgrc.set_pgstats_cb([this](){
      RWLock::RLocker l(map_lock);

      utime_t had_for = ceph_clock_now() - had_map_since;
      osd_stat_t cur_stat = service.get_osd_stat();
      cur_stat.os_perf_stat = store->get_cur_stats();

      MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
      m->osd_stat = cur_stat;

      // While filling in per-PG stats, also recompute the lower bound of
      // last_epoch_clean across the primary PGs we report.
      Mutex::Locker lec{min_last_epoch_clean_lock};
      min_last_epoch_clean = osdmap->get_epoch();
      min_last_epoch_clean_pgs.clear();
      RWLock::RLocker lpg(pg_map_lock);
      for (const auto &i : pg_map) {
	PG *pg = i.second;
	if (!pg->is_primary()) {
	  continue;
	}

	pg->pg_stats_publish_lock.Lock();
	if (pg->pg_stats_publish_valid) {
	  m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
	  const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
	  min_last_epoch_clean = min(min_last_epoch_clean, lec);
	  min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
	}
	pg->pg_stats_publish_lock.Unlock();
      }

      return m;
    });

  mgrc.init();
  client_messenger->add_dispatcher_head(&mgrc);

  // tell monc about log_client so it will know about mon session resets
  monc->set_log_client(&log_client);
  update_log_config();

  osd_tp.start();
  osd_op_tp.start();
  disk_tp.start();
  command_tp.start();

  set_disk_tp_priority();

  // start the heartbeat
  heartbeat_thread.create("osd_srv_heartbt");

  // tick
  tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
  {
    Mutex::Locker l(tick_timer_lock);
    tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
  }

  service.init();
  service.publish_map(osdmap);
  service.publish_superblock(superblock);
  service.max_oldest_map = superblock.oldest_map;

  // Drop osd_lock while we block on the monitors; every failure path from
  // here to the matching Lock() must re-take it before goto monout, since
  // the Mutex::Locker at the top will unlock on function exit.
  osd_lock.Unlock();

  r = monc->authenticate();
  if (r < 0) {
    osd_lock.Lock(); // locker is going to unlock this on function exit
    if (is_stopping())
      r = 0;
    goto monout;
  }

  while (monc->wait_auth_rotating(30.0) < 0) {
    derr << "unable to obtain rotating service keys; retrying" << dendl;
    ++rotating_auth_attempts;
    if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
        osd_lock.Lock(); // make locker happy
        if (!is_stopping()) {
            r = - ETIMEDOUT;
        }
        goto monout;
    }
  }

  r = update_crush_device_class();
  if (r < 0) {
    osd_lock.Lock();
    goto monout;
  }

  r = update_crush_location();
  if (r < 0) {
    osd_lock.Lock();
    goto monout;
  }

  osd_lock.Lock();
  if (is_stopping())
    return 0;

  // start objecter *after* we have authenticated, so that we don't ignore
  // the OSDMaps it requests.
  service.final_init();

  check_config();

  dout(10) << "ensuring pgs have consumed prior maps" << dendl;
  consume_map();
  peering_wq.drain();

  dout(0) << "done with init, starting boot process" << dendl;

  // subscribe to any pg creations
  monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);

  // MgrClient needs this (it doesn't have MonClient reference itself)
  monc->sub_want("mgrmap", 0, 0);

  // we don't need to ask for an osdmap here; objecter will
  //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);

  monc->renew_subs();

  start_boot();

  return 0;
monout:
  mgrc.shutdown();
  monc->shutdown();

out:
  enable_disable_fuse(true);
  store->umount();
  delete store;
  store = NULL;
  return r;
}
2538
// Register all of this OSD's admin-socket ("asok") commands.  The first
// group is dispatched to OSDSocketHook (asok_command()); the second group
// — the debug/test ops like setomapval and injectdataerr — goes to
// TestOpsSocketHook.  Each registration is expected to succeed
// (assert(r == 0)); a failure indicates a broken registration, e.g. a
// duplicate command name.  The command-descriptor strings are parsed by
// AdminSocket/cmd_getval and must not be reworded casually.
void OSD::final_init()
{
  AdminSocket *admin_socket = cct->get_admin_socket();
  asok_hook = new OSDSocketHook(this);
  int r = admin_socket->register_command("status", "status", asok_hook,
					 "high-level status of OSD");
  assert(r == 0);
  r = admin_socket->register_command("flush_journal", "flush_journal",
                                     asok_hook,
                                     "flush the journal to permanent store");
  assert(r == 0);
  r = admin_socket->register_command("dump_ops_in_flight",
				     "dump_ops_in_flight", asok_hook,
				     "show the ops currently in flight");
  assert(r == 0);
  r = admin_socket->register_command("ops",
				     "ops", asok_hook,
				     "show the ops currently in flight");
  assert(r == 0);
  r = admin_socket->register_command("dump_blocked_ops",
				     "dump_blocked_ops", asok_hook,
				     "show the blocked ops currently in flight");
  assert(r == 0);
  r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops",
				     asok_hook,
				     "show recent ops");
  assert(r == 0);
  r = admin_socket->register_command("dump_historic_slow_ops", "dump_historic_slow_ops",
				     asok_hook,
				     "show slowest recent ops");
  assert(r == 0);
  r = admin_socket->register_command("dump_historic_ops_by_duration", "dump_historic_ops_by_duration",
				     asok_hook,
				     "show slowest recent ops, sorted by duration");
  assert(r == 0);
  r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
				     asok_hook,
				     "dump op priority queue state");
  assert(r == 0);
  r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
				     asok_hook,
				     "dump blacklisted clients and times");
  assert(r == 0);
  r = admin_socket->register_command("dump_watchers", "dump_watchers",
				     asok_hook,
				     "show clients which have active watches,"
				     " and on which objects");
  assert(r == 0);
  r = admin_socket->register_command("dump_reservations", "dump_reservations",
				     asok_hook,
				     "show recovery reservations");
  assert(r == 0);
  r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
				     asok_hook,
				     "force osd to update the latest map from "
				     "the mon");
  assert(r == 0);

  r = admin_socket->register_command( "heap",
				      "heap " \
				      "name=heapcmd,type=CephString",
				      asok_hook,
				      "show heap usage info (available only if "
				      "compiled with tcmalloc)");
  assert(r == 0);

  r = admin_socket->register_command("set_heap_property",
				     "set_heap_property " \
				     "name=property,type=CephString " \
				     "name=value,type=CephInt",
				     asok_hook,
				     "update malloc extension heap property");
  assert(r == 0);

  r = admin_socket->register_command("get_heap_property",
				     "get_heap_property " \
				     "name=property,type=CephString",
				     asok_hook,
				     "get malloc extension heap property");
  assert(r == 0);

  r = admin_socket->register_command("dump_objectstore_kv_stats",
				     "dump_objectstore_kv_stats",
				     asok_hook,
				     "print statistics of kvdb which used by bluestore");
  assert(r == 0);

  r = admin_socket->register_command("dump_scrubs",
				     "dump_scrubs",
				     asok_hook,
				     "print scheduled scrubs");
  assert(r == 0);

  r = admin_socket->register_command("calc_objectstore_db_histogram",
                                     "calc_objectstore_db_histogram",
                                     asok_hook,
                                     "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
  assert(r == 0);

  r = admin_socket->register_command("flush_store_cache",
                                     "flush_store_cache",
                                     asok_hook,
                                     "Flush bluestore internal cache");
  assert(r == 0);
  r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
				     asok_hook,
				     "show recent state history");
  assert(r == 0);

  test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
  // Note: pools are CephString instead of CephPoolname because
  // these commands traditionally support both pool names and numbers
  r = admin_socket->register_command(
   "setomapval",
   "setomapval " \
   "name=pool,type=CephString " \
   "name=objname,type=CephObjectname " \
   "name=key,type=CephString "\
   "name=val,type=CephString",
   test_ops_hook,
   "set omap key");
  assert(r == 0);
  r = admin_socket->register_command(
    "rmomapkey",
    "rmomapkey " \
    "name=pool,type=CephString " \
    "name=objname,type=CephObjectname " \
    "name=key,type=CephString",
    test_ops_hook,
    "remove omap key");
  assert(r == 0);
  r = admin_socket->register_command(
    "setomapheader",
    "setomapheader " \
    "name=pool,type=CephString " \
    "name=objname,type=CephObjectname " \
    "name=header,type=CephString",
    test_ops_hook,
    "set omap header");
  assert(r == 0);

  r = admin_socket->register_command(
    "getomap",
    "getomap " \
    "name=pool,type=CephString " \
    "name=objname,type=CephObjectname",
    test_ops_hook,
    "output entire object map");
  assert(r == 0);

  r = admin_socket->register_command(
    "truncobj",
    "truncobj " \
    "name=pool,type=CephString " \
    "name=objname,type=CephObjectname " \
    "name=len,type=CephInt",
    test_ops_hook,
    "truncate object to length");
  assert(r == 0);

  r = admin_socket->register_command(
    "injectdataerr",
    "injectdataerr " \
    "name=pool,type=CephString " \
    "name=objname,type=CephObjectname " \
    "name=shardid,type=CephInt,req=false,range=0|255",
    test_ops_hook,
    "inject data error to an object");
  assert(r == 0);

  r = admin_socket->register_command(
    "injectmdataerr",
    "injectmdataerr " \
    "name=pool,type=CephString " \
    "name=objname,type=CephObjectname " \
    "name=shardid,type=CephInt,req=false,range=0|255",
    test_ops_hook,
    "inject metadata error to an object");
  assert(r == 0);
  r = admin_socket->register_command(
    "set_recovery_delay",
    "set_recovery_delay " \
    "name=utime,type=CephInt,req=false",
    test_ops_hook,
     "Delay osd recovery by specified seconds");
  assert(r == 0);
  r = admin_socket->register_command(
   "trigger_scrub",
   "trigger_scrub " \
   "name=pgid,type=CephString ",
   test_ops_hook,
   "Trigger a scheduled scrub ");
  assert(r == 0);
  r = admin_socket->register_command(
   "injectfull",
   "injectfull " \
   "name=type,type=CephString,req=false " \
   "name=count,type=CephInt,req=false ",
   test_ops_hook,
   "Inject a full disk (optional count times)");
  assert(r == 0);
}
2741
// Build and register the "osd" PerfCounters instance that backs
// `perf dump` on the admin socket.  Counters are added against the
// l_osd_* enum ids (range l_osd_first..l_osd_last), so the additions
// here must stay in sync with that enum.  The instance is stored in
// `logger` and registered with the CephContext collection at the end.
void OSD::create_logger()
{
  dout(10) << "create_logger" << dendl;

  PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);

  // Latency axis configuration for op histograms, values are in nanoseconds
  PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
    "Latency (usec)",
    PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
    0,                               ///< Start at 0
    100000,                          ///< Quantization unit is 100usec
    32,                              ///< Enough to cover much longer than slow requests
  };

  // Op size axis configuration for op histograms, values are in bytes
  PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
    "Request size (bytes)",
    PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
    0,                               ///< Start at 0
    512,                             ///< Quantization unit is 512 bytes
    32,                              ///< Enough to cover requests larger than GB
  };


  // --- client op counters: totals, then per-category (r/w/rw) breakdowns ---
  osd_plb.add_u64(
    l_osd_op_wip, "op_wip",
    "Replication operations currently being processed (primary)");
  osd_plb.add_u64_counter(
    l_osd_op, "op",
    "Client operations",
    "ops", PerfCountersBuilder::PRIO_CRITICAL);
  osd_plb.add_u64_counter(
    l_osd_op_inb, "op_in_bytes",
    "Client operations total write size",
    "wr", PerfCountersBuilder::PRIO_INTERESTING);
  osd_plb.add_u64_counter(
    l_osd_op_outb, "op_out_bytes",
    "Client operations total read size",
    "rd", PerfCountersBuilder::PRIO_INTERESTING);
  osd_plb.add_time_avg(
    l_osd_op_lat, "op_latency",
    "Latency of client operations (including queue time)",
    "l", 9);
  osd_plb.add_time_avg(
    l_osd_op_process_lat, "op_process_latency",
    "Latency of client operations (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_prepare_lat, "op_prepare_latency",
    "Latency of client operations (excluding queue time and wait for finished)");

  osd_plb.add_u64_counter(
    l_osd_op_r, "op_r", "Client read operations");
  osd_plb.add_u64_counter(
    l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
  osd_plb.add_time_avg(
    l_osd_op_r_lat, "op_r_latency",
    "Latency of read operation (including queue time)");
  osd_plb.add_histogram(
    l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of operation latency (including queue time) + data read");
  osd_plb.add_time_avg(
    l_osd_op_r_process_lat, "op_r_process_latency",
    "Latency of read operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_r_prepare_lat, "op_r_prepare_latency",
    "Latency of read operations (excluding queue time and wait for finished)");
  osd_plb.add_u64_counter(
    l_osd_op_w, "op_w", "Client write operations");
  osd_plb.add_u64_counter(
    l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
  osd_plb.add_time_avg(
    l_osd_op_w_lat, "op_w_latency",
    "Latency of write operation (including queue time)");
  osd_plb.add_histogram(
    l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of operation latency (including queue time) + data written");
  osd_plb.add_time_avg(
    l_osd_op_w_process_lat, "op_w_process_latency",
    "Latency of write operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_w_prepare_lat, "op_w_prepare_latency",
    "Latency of write operations (excluding queue time and wait for finished)");
  osd_plb.add_u64_counter(
    l_osd_op_rw, "op_rw",
    "Client read-modify-write operations");
  osd_plb.add_u64_counter(
    l_osd_op_rw_inb, "op_rw_in_bytes",
    "Client read-modify-write operations write in");
  osd_plb.add_u64_counter(
    l_osd_op_rw_outb,"op_rw_out_bytes",
    "Client read-modify-write operations read out ");
  osd_plb.add_time_avg(
    l_osd_op_rw_lat, "op_rw_latency",
    "Latency of read-modify-write operation (including queue time)");
  osd_plb.add_histogram(
    l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of rw operation latency (including queue time) + data written");
  osd_plb.add_histogram(
    l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
    op_hist_x_axis_config, op_hist_y_axis_config,
    "Histogram of rw operation latency (including queue time) + data read");
  osd_plb.add_time_avg(
    l_osd_op_rw_process_lat, "op_rw_process_latency",
    "Latency of read-modify-write operation (excluding queue time)");
  osd_plb.add_time_avg(
    l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
    "Latency of read-modify-write operations (excluding queue time and wait for finished)");

  // --- replication suboperations and recovery push/pull traffic ---
  osd_plb.add_u64_counter(
    l_osd_sop, "subop", "Suboperations");
  osd_plb.add_u64_counter(
    l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
  osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");

  osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
  osd_plb.add_u64_counter(
    l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
  osd_plb.add_time_avg(
    l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
  osd_plb.add_u64_counter(
    l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
  osd_plb.add_time_avg(
    l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
  osd_plb.add_u64_counter(
    l_osd_sop_push, "subop_push", "Suboperations push messages");
  osd_plb.add_u64_counter(
    l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
  osd_plb.add_time_avg(
    l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");

  osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
  osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
  osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");

  osd_plb.add_u64_counter(
    l_osd_rop, "recovery_ops",
    "Started recovery operations",
    "rop", PerfCountersBuilder::PRIO_INTERESTING);

  // --- host/process-level gauges and crc cache stats ---
  osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
  osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
  osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
  osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
  osd_plb.add_u64(
    l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
  osd_plb.add_u64(
    l_osd_cached_crc_adjusted, "cached_crc_adjusted",
    "Total number getting crc from crc_cache with adjusting");
  osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
    "Total number of crc cache misses");

  // --- PG population, heartbeat and osdmap handling ---
  osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
      "pgs", PerfCountersBuilder::PRIO_USEFUL);
  osd_plb.add_u64(
    l_osd_pg_primary, "numpg_primary",
    "Placement groups for which this osd is primary");
  osd_plb.add_u64(
    l_osd_pg_replica, "numpg_replica",
    "Placement groups for which this osd is replica");
  osd_plb.add_u64(
    l_osd_pg_stray, "numpg_stray",
    "Placement groups ready to be deleted from this osd");
  osd_plb.add_u64(
    l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
  osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
  osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
  osd_plb.add_u64_counter(
    l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
  osd_plb.add_u64_counter(
    l_osd_waiting_for_map, "messages_delayed_for_map",
    "Operations waiting for OSD map");
  osd_plb.add_u64_counter(
    l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
  osd_plb.add_u64_counter(
    l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
  osd_plb.add_u64_counter(
    l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
    "osdmap cache miss below cache lower bound");
  osd_plb.add_u64_avg(
    l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
    "osdmap cache miss, avg distance below cache lower bound");

  osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
  osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
  osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");

  // --- cache tiering and tiering-agent activity ---
  osd_plb.add_u64_counter(
    l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");

  osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
  osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
  osd_plb.add_u64_counter(
    l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
  osd_plb.add_u64_counter(
    l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
  osd_plb.add_u64_counter(
    l_osd_tier_try_flush_fail, "tier_try_flush_fail",
    "Failed tier flush attempts");
  osd_plb.add_u64_counter(
    l_osd_tier_evict, "tier_evict", "Tier evictions");
  osd_plb.add_u64_counter(
    l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
  osd_plb.add_u64_counter(
    l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
  osd_plb.add_u64_counter(
    l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
  osd_plb.add_u64_counter(
    l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
  osd_plb.add_u64_counter(
    l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
  osd_plb.add_u64_counter(
    l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");

  osd_plb.add_u64_counter(
    l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
  osd_plb.add_u64_counter(
    l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
  osd_plb.add_u64_counter(
    l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
  osd_plb.add_u64_counter(
    l_osd_agent_evict, "agent_evict", "Tiering agent evictions");

  osd_plb.add_u64_counter(
    l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
  osd_plb.add_u64_counter(
    l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");

  osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
  osd_plb.add_time_avg(
    l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
  osd_plb.add_time_avg(
    l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
  osd_plb.add_time_avg(
    l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");

  // --- PG info persistence counters ---
  osd_plb.add_u64_counter(
    l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
  osd_plb.add_u64_counter(
    l_osd_pg_fastinfo, "osd_pg_fastinfo",
    "PG updated its info using fastinfo attr");
  osd_plb.add_u64_counter(
    l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");

  // Materialize the counters and register them so they show up in
  // `perf dump` / `perf schema`.
  logger = osd_plb.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}
2992
2993void OSD::create_recoverystate_perf()
2994{
2995 dout(10) << "create_recoverystate_perf" << dendl;
2996
2997 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
2998
2999 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3000 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3001 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3002 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3003 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3004 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3005 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3006 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3007 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3008 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3009 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3010 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3011 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3012 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3013 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3014 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3015 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3016 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3017 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3018 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3019 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3020 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3021 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3022 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3023 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3024 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3025 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3026 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3027 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3028 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3029 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3030
3031 recoverystate_perf = rs_perf.create_perf_counters();
3032 cct->get_perfcounters_collection()->add(recoverystate_perf);
3033}
3034
// Orderly shutdown of the whole OSD.  The sequence matters: stop taking
// new work, shut down every PG, tear down the worker threads and timers,
// persist a "clean unmount" marker in the superblock, then dismantle the
// store, monitor client, and messengers.  Returns 0 on success, or the
// error from writing the superblock.
int OSD::shutdown()
{
  if (!service.prepare_to_stop())
    return 0; // already shutting down
  osd_lock.Lock();
  if (is_stopping()) {
    // raced with a concurrent shutdown() caller; nothing left to do
    osd_lock.Unlock();
    return 0;
  }
  derr << "shutdown" << dendl;

  set_state(STATE_STOPPING);

  // Debugging
  // crank debug levels all the way up so a hung or misbehaving shutdown
  // leaves detailed logs behind
  cct->_conf->set_val("debug_osd", "100");
  cct->_conf->set_val("debug_journal", "100");
  cct->_conf->set_val("debug_filestore", "100");
  cct->_conf->set_val("debug_ms", "100");
  cct->_conf->apply_changes(NULL);

  // stop MgrClient earlier as it's more like an internal consumer of OSD
  mgrc.shutdown();

  service.start_shutdown();

  // stop sending work to pgs. this just prevents any new work in _process
  // from racing with on_shutdown and potentially entering the pg after.
  op_shardedwq.drain();

  // Shutdown PGs
  {
    RWLock::RLocker l(pg_map_lock);
    for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
        p != pg_map.end();
        ++p) {
      dout(20) << " kicking pg " << p->first << dendl;
      p->second->lock();
      p->second->on_shutdown();
      p->second->unlock();
      // flush any transactions the pg queued during on_shutdown()
      p->second->osr->flush();
    }
  }
  clear_pg_stat_queue();

  // drain op queue again (in case PGs requeued something)
  op_shardedwq.drain();
  {
    finished.clear(); // zap waiters (bleh, this is messy)
  }

  op_shardedwq.clear_pg_slots();

  // unregister commands
  cct->get_admin_socket()->unregister_command("status");
  cct->get_admin_socket()->unregister_command("flush_journal");
  cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
  cct->get_admin_socket()->unregister_command("ops");
  cct->get_admin_socket()->unregister_command("dump_blocked_ops");
  cct->get_admin_socket()->unregister_command("dump_historic_ops");
  cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
  cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
  cct->get_admin_socket()->unregister_command("dump_op_pq_state");
  cct->get_admin_socket()->unregister_command("dump_blacklist");
  cct->get_admin_socket()->unregister_command("dump_watchers");
  cct->get_admin_socket()->unregister_command("dump_reservations");
  cct->get_admin_socket()->unregister_command("get_latest_osdmap");
  cct->get_admin_socket()->unregister_command("set_heap_property");
  cct->get_admin_socket()->unregister_command("get_heap_property");
  cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
  cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
  cct->get_admin_socket()->unregister_command("flush_store_cache");
  cct->get_admin_socket()->unregister_command("dump_pgstate_history");
  delete asok_hook;
  asok_hook = NULL;

  // test/debug-only commands registered alongside test_ops_hook
  cct->get_admin_socket()->unregister_command("setomapval");
  cct->get_admin_socket()->unregister_command("rmomapkey");
  cct->get_admin_socket()->unregister_command("setomapheader");
  cct->get_admin_socket()->unregister_command("getomap");
  cct->get_admin_socket()->unregister_command("truncobj");
  cct->get_admin_socket()->unregister_command("injectdataerr");
  cct->get_admin_socket()->unregister_command("injectmdataerr");
  cct->get_admin_socket()->unregister_command("set_recovery_delay");
  delete test_ops_hook;
  test_ops_hook = NULL;

  // drop osd_lock while stopping threads so they can make progress
  osd_lock.Unlock();

  heartbeat_lock.Lock();
  heartbeat_stop = true;
  heartbeat_cond.Signal();
  heartbeat_lock.Unlock();
  heartbeat_thread.join();

  // drain each thread pool before stopping it so queued work completes
  osd_tp.drain();
  peering_wq.clear();
  osd_tp.stop();
  dout(10) << "osd tp stopped" << dendl;

  osd_op_tp.drain();
  osd_op_tp.stop();
  dout(10) << "op sharded tp stopped" << dendl;

  command_tp.drain();
  command_tp.stop();
  dout(10) << "command tp stopped" << dendl;

  disk_tp.drain();
  disk_tp.stop();
  dout(10) << "disk tp paused (new)" << dendl;

  dout(10) << "stopping agent" << dendl;
  service.agent_stop();

  osd_lock.Lock();

  reset_heartbeat_peers();

  tick_timer.shutdown();

  {
    Mutex::Locker l(tick_timer_lock);
    tick_timer_without_osd_lock.shutdown();
  }

  // note unmount epoch
  // mark the superblock so the next boot knows we shut down cleanly
  dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
  superblock.mounted = service.get_boot_epoch();
  superblock.clean_thru = osdmap->get_epoch();
  ObjectStore::Transaction t;
  write_superblock(t);
  int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
  if (r) {
    derr << "OSD::shutdown: error writing superblock: "
	 << cpp_strerror(r) << dendl;
  }


  {
    Mutex::Locker l(pg_stat_queue_lock);
    assert(pg_stat_queue.empty());
  }

  // Remove PGs
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  {
    RWLock::RLocker l(pg_map_lock);
    for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
        p != pg_map.end();
        ++p) {
      dout(20) << " kicking pg " << p->first << dendl;
      p->second->lock();
      // at this point pg_map must hold the only remaining reference;
      // anything else indicates a leaked PG ref, which we treat as fatal
      if (p->second->ref != 1) {
        derr << "pgid " << p->first << " has ref count of "
            << p->second->ref << dendl;
#ifdef PG_DEBUG_REFS
	p->second->dump_live_ids();
#endif
	ceph_abort();
      }
      p->second->unlock();
      p->second->put("PGMap");
    }
    pg_map.clear();
  }
#ifdef PG_DEBUG_REFS
  service.dump_live_pgids();
#endif
  cct->_conf->remove_observer(this);

  dout(10) << "syncing store" << dendl;
  enable_disable_fuse(true);

  if (cct->_conf->osd_journal_flush_on_shutdown) {
    dout(10) << "flushing journal" << dendl;
    store->flush_journal();
  }

  store->umount();
  delete store;
  store = 0;
  dout(10) << "Store synced" << dendl;

  monc->shutdown();
  osd_lock.Unlock();

  // release our osdmap reference and tear down remaining services
  osdmap = OSDMapRef();
  service.shutdown();
  op_tracker.on_shutdown();

  class_handler->shutdown();
  client_messenger->shutdown();
  cluster_messenger->shutdown();
  hb_front_client_messenger->shutdown();
  hb_back_client_messenger->shutdown();
  objecter_messenger->shutdown();
  hb_front_server_messenger->shutdown();
  hb_back_server_messenger->shutdown();

  peering_wq.clear();

  // r carries the result of the superblock write above
  return r;
}
3240
3241int OSD::mon_cmd_maybe_osd_create(string &cmd)
3242{
3243 bool created = false;
3244 while (true) {
3245 dout(10) << __func__ << " cmd: " << cmd << dendl;
3246 vector<string> vcmd{cmd};
3247 bufferlist inbl;
3248 C_SaferCond w;
3249 string outs;
3250 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
3251 int r = w.wait();
3252 if (r < 0) {
3253 if (r == -ENOENT && !created) {
3254 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3255 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3256 vector<string> vnewcmd{newcmd};
3257 bufferlist inbl;
3258 C_SaferCond w;
3259 string outs;
3260 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3261 int r = w.wait();
3262 if (r < 0) {
3263 derr << __func__ << " fail: osd does not exist and created failed: "
3264 << cpp_strerror(r) << dendl;
3265 return r;
3266 }
3267 created = true;
3268 continue;
3269 }
3270 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
3271 return r;
3272 }
3273 break;
3274 }
3275
3276 return 0;
3277}
3278
3279int OSD::update_crush_location()
3280{
3281 if (!cct->_conf->osd_crush_update_on_start) {
3282 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3283 return 0;
3284 }
3285
3286 char weight[32];
3287 if (cct->_conf->osd_crush_initial_weight >= 0) {
3288 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3289 } else {
3290 struct store_statfs_t st;
3291 int r = store->statfs(&st);
3292 if (r < 0) {
3293 derr << "statfs: " << cpp_strerror(r) << dendl;
3294 return r;
3295 }
3296 snprintf(weight, sizeof(weight), "%.4lf",
3297 MAX((double).00001,
3298 (double)(st.total) /
3299 (double)(1ull << 40 /* TB */)));
3300 }
3301
3302 std::multimap<string,string> loc = cct->crush_location.get_location();
3303 dout(10) << __func__ << " crush location is " << loc << dendl;
3304
3305 string cmd =
3306 string("{\"prefix\": \"osd crush create-or-move\", ") +
3307 string("\"id\": ") + stringify(whoami) + string(", ") +
3308 string("\"weight\":") + weight + string(", ") +
3309 string("\"args\": [");
3310 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3311 if (p != loc.begin())
3312 cmd += ", ";
3313 cmd += "\"" + p->first + "=" + p->second + "\"";
3314 }
3315 cmd += "]}";
3316
3317 return mon_cmd_maybe_osd_create(cmd);
3318}
3319
3320int OSD::update_crush_device_class()
3321{
3322 string device_class;
3323 int r = store->read_meta("crush_device_class", &device_class);
3324 if (r < 0)
3325 return 0;
3326
3327 string cmd =
3328 string("{\"prefix\": \"osd crush set-device-class\", ") +
3329 string("\"id\": ") + stringify(whoami) + string(", ") +
3330 string("\"class\": \"") + device_class + string("\"}");
3331
3332 return mon_cmd_maybe_osd_create(cmd);
3333}
3334
3335void OSD::write_superblock(ObjectStore::Transaction& t)
3336{
3337 dout(10) << "write_superblock " << superblock << dendl;
3338
3339 //hack: at minimum it's using the baseline feature set
3340 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3341 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
3342
3343 bufferlist bl;
3344 ::encode(superblock, bl);
3345 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
3346}
3347
3348int OSD::read_superblock()
3349{
3350 bufferlist bl;
3351 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3352 if (r < 0)
3353 return r;
3354
3355 bufferlist::iterator p = bl.begin();
3356 ::decode(superblock, p);
3357
3358 dout(10) << "read_superblock " << superblock << dendl;
3359
3360 return 0;
3361}
3362
// Scan every PG collection and remove any temp objects left behind by a
// previous (unclean) run.  Temp objects sort before regular objects in a
// collection listing, so we can stop scanning a collection as soon as we
// see a non-temp object.  Removals are batched into transactions of at
// most osd_target_transaction_size operations.
void OSD::clear_temp_objects()
{
  dout(10) << __func__ << dendl;
  vector<coll_t> ls;
  store->list_collections(ls);
  for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
    spg_t pgid;
    if (!p->is_pg(&pgid))
      continue;

    // list temp objects
    dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;

    vector<ghobject_t> temps;
    ghobject_t next;
    while (1) {
      vector<ghobject_t> objects;
      // page through the collection starting at `next`
      store->collection_list(*p, next, ghobject_t::get_max(),
			     store->get_ideal_list_max(),
			     &objects, &next);
      if (objects.empty())
	break;
      vector<ghobject_t>::iterator q;
      for (q = objects.begin(); q != objects.end(); ++q) {
	// Hammer set pool for temps to -1, so check for clean-up
	if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
	  temps.push_back(*q);
	} else {
	  break;
	}
      }
      // If we saw a non-temp object and hit the break above we can
      // break out of the while loop too.
      if (q != objects.end())
	break;
    }
    if (!temps.empty()) {
      ObjectStore::Transaction t;
      int removed = 0;
      for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
	dout(20) << "  removing " << *p << " object " << *q << dendl;
	t.remove(*p, *q);
	// flush the transaction once it reaches the target size to keep
	// individual transactions bounded
	if (++removed > cct->_conf->osd_target_transaction_size) {
	  store->apply_transaction(service.meta_osr.get(), std::move(t));
	  t = ObjectStore::Transaction();
	  removed = 0;
	}
      }
      if (removed) {
	store->apply_transaction(service.meta_osr.get(), std::move(t));
      }
    }
  }
}
3417
// Remove every object in collection @tmp (plus its snap-mapper records)
// and finally the collection itself.  Work is batched into transactions
// of at most osd_target_transaction_size removals, applied through a
// private sequencer, and the function blocks until the last transaction
// has committed.
void OSD::recursive_remove_collection(CephContext* cct,
				      ObjectStore *store, spg_t pgid,
				      coll_t tmp)
{
  // driver/mapper give us access to the snap-mapper metadata stored in
  // the meta collection so each object's snap mapping can be erased too
  OSDriver driver(
    store,
    coll_t(),
    make_snapmapper_oid());

  // dedicated sequencer for this removal so we can flush it independently
  ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
                                      ObjectStore::Sequencer>("rm"));
  ObjectStore::Transaction t;
  SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);

  vector<ghobject_t> objects;
  store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
			 INT_MAX, &objects, 0);
  generic_dout(10) << __func__ << " " << objects << dendl;
  // delete them.
  int removed = 0;
  for (vector<ghobject_t>::iterator p = objects.begin();
       p != objects.end();
       ++p, removed++) {
    OSDriver::OSTransaction _t(driver.get_transaction(&t));
    // ENOENT is fine: the object may simply have no snap mapping
    int r = mapper.remove_oid(p->hobj, &_t);
    if (r != 0 && r != -ENOENT)
      ceph_abort();
    t.remove(tmp, *p);
    // apply in bounded batches to keep transaction size under control
    if (removed > cct->_conf->osd_target_transaction_size) {
      int r = store->apply_transaction(osr.get(), std::move(t));
      assert(r == 0);
      t = ObjectStore::Transaction();
      removed = 0;
    }
  }
  t.remove_collection(tmp);
  int r = store->apply_transaction(osr.get(), std::move(t));
  assert(r == 0);

  // wait for everything queued on our sequencer to commit before returning
  C_SaferCond waiter;
  if (!osr->flush_commit(&waiter)) {
    waiter.wait();
  }
}
3462
3463
3464// ======================================================
3465// PG's
3466
3467PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3468{
3469 if (!createmap->have_pg_pool(id)) {
3470 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3471 << id << dendl;
3472 ceph_abort();
3473 }
3474
3475 PGPool p = PGPool(cct, createmap, id);
3476
3477 dout(10) << "_get_pool " << p.id << dendl;
3478 return p;
3479}
3480
// Construct a new PG for @pgid and register it in pg_map (taking a ref
// on its behalf).  The PG is returned locked; the caller must unlock it.
// The pg lock is taken while holding the pg_map write lock so nobody can
// find the PG in the map before it is locked.  Requires osd_lock.
PG *OSD::_open_lock_pg(
  OSDMapRef createmap,
  spg_t pgid, bool no_lockdep_check)
{
  assert(osd_lock.is_locked());

  PG* pg = _make_pg(createmap, pgid);
  {
    RWLock::WLocker l(pg_map_lock);
    pg->lock(no_lockdep_check);
    pg_map[pgid] = pg;
    pg->get("PGMap"); // because it's in pg_map
    service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
  }
  return pg;
}
3497
3498PG* OSD::_make_pg(
3499 OSDMapRef createmap,
3500 spg_t pgid)
3501{
3502 dout(10) << "_open_lock_pg " << pgid << dendl;
3503 PGPool pool = _get_pool(pgid.pool(), createmap);
3504
3505 // create
3506 PG *pg;
3507 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3508 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3509 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
3510 else
3511 ceph_abort();
3512
3513 return pg;
3514}
3515
3516
// Register a PG that was just created by splitting a parent: put it in
// pg_map, replay its load path, requeue any peering events that were
// parked waiting for the split, and schedule a null event so it advances
// to the current epoch.  If the pool has since been deleted, the PG is
// removed again immediately.
// NOTE(review): pg_map is modified without taking pg_map_lock here, so
// the caller presumably holds the required locks -- confirm at call sites.
void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
{
  epoch_t e(service.get_osdmap()->get_epoch());
  pg->get("PGMap"); // For pg_map
  pg_map[pg->info.pgid] = pg;
  service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());

  dout(10) << "Adding newly split pg " << *pg << dendl;
  pg->handle_loaded(rctx);
  pg->write_if_dirty(*(rctx->transaction));
  // null event makes the pg catch up to the current service epoch
  pg->queue_null(e, e);
  // wake any peering events that were queued waiting for this split child
  map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
    peering_wait_for_split.find(pg->info.pgid);
  if (to_wake != peering_wait_for_split.end()) {
    for (list<PG::CephPeeringEvtRef>::iterator i =
	   to_wake->second.begin();
	 i != to_wake->second.end();
	 ++i) {
      pg->queue_peering_event(*i);
    }
    peering_wait_for_split.erase(to_wake);
  }
  // the pool may have been deleted while the split was in flight
  if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
    _remove_pg(pg);
}
3542
// Check whether @pgid (or one of its ancestors) is currently being
// deleted, and if so try to cancel that deletion so the old on-disk
// state can be reused instead of recreated from scratch.  Returns:
//   RES_NONE   - nothing to resurrect (or the deletion already finished)
//   RES_SELF   - @pgid itself was being deleted; deletion halted
//   RES_PARENT - an ancestor that would split into @pgid was being
//                deleted; deletion halted
// On RES_SELF/RES_PARENT, *resurrected and *old_pg_state are filled in.
OSD::res_result OSD::_try_resurrect_pg(
  OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
{
  assert(resurrected);
  assert(old_pg_state);
  // find nearest ancestor
  DeletingStateRef df;
  spg_t cur(pgid);
  while (true) {
    df = service.deleting_pgs.lookup(cur);
    if (df)
      break;
    if (!cur.ps())
      break;
    cur = cur.get_parent();
  }
  if (!df)
    return RES_NONE; // good to go

  // grab the map the deleting pg was last operating under
  df->old_pg_state->lock();
  OSDMapRef create_map = df->old_pg_state->get_osdmap();
  df->old_pg_state->unlock();

  set<spg_t> children;
  if (cur == pgid) {
    // the pg itself is being deleted; try to stop that
    if (df->try_stop_deletion()) {
      dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
      *resurrected = cur;
      *old_pg_state = df->old_pg_state;
      service.deleting_pgs.remove(pgid); // PG is no longer being removed!
      return RES_SELF;
    } else {
      // raced, ensure we don't see DeletingStateRef when we try to
      // delete this pg
      service.deleting_pgs.remove(pgid);
      return RES_NONE;
    }
  } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
			  curmap->get_pg_num(cur.pool()),
			  &children) &&
	     children.count(pgid)) {
    // an ancestor being deleted would split into us; halt its deletion
    if (df->try_stop_deletion()) {
      dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
	       << dendl;
      *resurrected = cur;
      *old_pg_state = df->old_pg_state;
      service.deleting_pgs.remove(cur); // PG is no longer being removed!
      return RES_PARENT;
    } else {
      /* this is not a problem, failing to cancel proves that all objects
       * have been removed, so no hobject_t overlap is possible
       */
      return RES_NONE;
    }
  }
  return RES_NONE;
}
3600
// Create a brand-new PG for @pgid, register it in pg_map, initialize it
// with the given role/up/acting/history/past-intervals state, and queue
// the resulting on-disk state into @t.  Returns the PG locked; the
// caller must unlock it.  Requires osd_lock.
// NOTE(review): hold_map_lock is not referenced in this body -- it looks
// vestigial; confirm against callers before removing.
PG *OSD::_create_lock_pg(
  OSDMapRef createmap,
  spg_t pgid,
  bool hold_map_lock,
  bool backfill,
  int role,
  vector<int>& up, int up_primary,
  vector<int>& acting, int acting_primary,
  pg_history_t history,
  const PastIntervals& pi,
  ObjectStore::Transaction& t)
{
  assert(osd_lock.is_locked());
  dout(20) << "_create_lock_pg pgid " << pgid << dendl;

  PG *pg = _open_lock_pg(createmap, pgid, true);

  // record any splits between the pg's creation map and the current map
  service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());

  pg->init(
    role,
    up,
    up_primary,
    acting,
    acting_primary,
    history,
    pi,
    backfill,
    &t);

  dout(7) << "_create_lock_pg " << *pg << dendl;
  return pg;
}
3634
3635PG *OSD::_lookup_lock_pg(spg_t pgid)
3636{
3637 RWLock::RLocker l(pg_map_lock);
3638
3639 auto pg_map_entry = pg_map.find(pgid);
3640 if (pg_map_entry == pg_map.end())
3641 return nullptr;
3642 PG *pg = pg_map_entry->second;
3643 pg->lock();
3644 return pg;
3645}
3646
3647PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3648{
3649 assert(pg_map.count(pgid));
3650 PG *pg = pg_map[pgid];
3651 pg->lock();
3652 return pg;
3653}
3654
// Populate pg_map from the on-disk collections at startup: clean up
// temp/partially-removed collections, instantiate each PG under the map
// it was last written with, replay any on-disk format upgrades, compute
// each PG's current role from the latest osdmap, and finally rebuild any
// missing past_intervals.  Requires osd_lock.
void OSD::load_pgs()
{
  assert(osd_lock.is_locked());
  dout(0) << "load_pgs" << dendl;
  {
    RWLock::RLocker l(pg_map_lock);
    assert(pg_map.empty());
  }

  vector<coll_t> ls;
  int r = store->list_collections(ls);
  if (r < 0) {
    derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
  }

  bool has_upgraded = false;

  for (vector<coll_t>::iterator it = ls.begin();
       it != ls.end();
       ++it) {
    spg_t pgid;
    // temp collections and pg collections flagged for removal are wiped
    if (it->is_temp(&pgid) ||
	(it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
      dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
      recursive_remove_collection(cct, store, pgid, *it);
      continue;
    }

    if (!it->is_pg(&pgid)) {
      dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
      continue;
    }

    if (pgid.preferred() >= 0) {
      dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
      // FIXME: delete it too, eventually
      continue;
    }

    dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
    bufferlist bl;
    epoch_t map_epoch = 0;
    int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
    if (r < 0) {
      derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
	   << dendl;
      continue;
    }

    // open the pg under the map it was last written with, falling back to
    // the current map when no epoch was recorded
    PG *pg = NULL;
    if (map_epoch > 0) {
      OSDMapRef pgosdmap = service.try_get_map(map_epoch);
      if (!pgosdmap) {
	if (!osdmap->have_pg_pool(pgid.pool())) {
	  derr << __func__ << ": could not find map for epoch " << map_epoch
	       << " on pg " << pgid << ", but the pool is not present in the "
	       << "current map, so this is probably a result of bug 10617. "
	       << "Skipping the pg for now, you can use ceph-objectstore-tool "
	       << "to clean it up later." << dendl;
	  continue;
	} else {
	  derr << __func__ << ": have pgid " << pgid << " at epoch "
	       << map_epoch << ", but missing map. Crashing."
	       << dendl;
	  assert(0 == "Missing map in load_pgs");
	}
      }
      pg = _open_lock_pg(pgosdmap, pgid);
    } else {
      pg = _open_lock_pg(osdmap, pgid);
    }
    // there can be no waiters here, so we don't call wake_pg_waiters

    pg->ch = store->open_collection(pg->coll);

    // read pg state, log
    pg->read_state(store, bl);

    // replay any on-disk format upgrade this pg still needs
    if (pg->must_upgrade()) {
      if (!pg->can_upgrade()) {
	derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
	     << " an older version first." << dendl;
	assert(0 == "PG too old to upgrade");
      }
      if (!has_upgraded) {
	derr << "PGs are upgrading" << dendl;
	has_upgraded = true;
      }
      dout(10) << "PG " << pg->info.pgid
	       << " must upgrade..." << dendl;
      pg->upgrade(store);
    }

    service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);

    // generate state for PG's current mapping
    int primary, up_primary;
    vector<int> acting, up;
    pg->get_osdmap()->pg_to_up_acting_osds(
      pgid.pgid, &up, &up_primary, &acting, &primary);
    pg->init_primary_up_acting(
      up,
      acting,
      up_primary,
      primary);
    int role = OSDMap::calc_pg_role(whoami, pg->acting);
    if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
      pg->set_role(role);
    else
      pg->set_role(-1);

    pg->reg_next_scrub();

    PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
    pg->handle_loaded(&rctx);

    dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
    // persist anything the load/upgrade path dirtied in the pg log
    if (pg->pg_log.is_dirty()) {
      ObjectStore::Transaction t;
      pg->write_if_dirty(t);
      store->apply_transaction(pg->osr.get(), std::move(t));
    }
    pg->unlock();
  }
  {
    RWLock::RLocker l(pg_map_lock);
    dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
  }

  // clean up old infos object?
  if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
    dout(1) << __func__ << " removing legacy infos object" << dendl;
    ObjectStore::Transaction t;
    t.remove(coll_t::meta(), OSD::make_infos_oid());
    int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
    if (r != 0) {
      derr << __func__ << ": apply_transaction returned "
	   << cpp_strerror(r) << dendl;
      ceph_abort();
    }
  }

  build_past_intervals_parallel();
}
3799
3800
3801/*
3802 * build past_intervals efficiently on old, degraded, and buried
3803 * clusters. this is important for efficiently catching up osds that
3804 * are way behind on maps to the current cluster state.
3805 *
3806 * this is a parallel version of PG::generate_past_intervals().
3807 * follow the same logic, but do all pgs at the same time so that we
3808 * can make a single pass across the osdmap history.
3809 */
3810void OSD::build_past_intervals_parallel()
3811{
3812 struct pistate {
3813 epoch_t start, end;
3814 vector<int> old_acting, old_up;
3815 epoch_t same_interval_since;
3816 int primary;
3817 int up_primary;
3818 };
3819 map<PG*,pistate> pis;
3820
3821 // calculate junction of map range
3822 epoch_t end_epoch = superblock.oldest_map;
3823 epoch_t cur_epoch = superblock.newest_map;
3824 {
3825 RWLock::RLocker l(pg_map_lock);
3826 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
3827 i != pg_map.end();
3828 ++i) {
3829 PG *pg = i->second;
3830
3831 auto rpib = pg->get_required_past_interval_bounds(
3832 pg->info,
3833 superblock.oldest_map);
3834 if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
3835 if (pg->info.history.same_interval_since == 0) {
3836 pg->info.history.same_interval_since = rpib.second;
3837 }
3838 continue;
3839 } else {
3840 auto apib = pg->past_intervals.get_bounds();
3841 if (apib.second >= rpib.second &&
3842 apib.first <= rpib.first) {
3843 if (pg->info.history.same_interval_since == 0) {
3844 pg->info.history.same_interval_since = rpib.second;
3845 }
3846 continue;
3847 }
3848 }
3849
3850 dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
3851 << rpib.second << dendl;
3852 pistate& p = pis[pg];
3853 p.start = rpib.first;
3854 p.end = rpib.second;
3855 p.same_interval_since = 0;
3856
3857 if (rpib.first < cur_epoch)
3858 cur_epoch = rpib.first;
3859 if (rpib.second > end_epoch)
3860 end_epoch = rpib.second;
3861 }
3862 }
3863 if (pis.empty()) {
3864 dout(10) << __func__ << " nothing to build" << dendl;
3865 return;
3866 }
3867
3868 dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
3869 assert(cur_epoch <= end_epoch);
3870
3871 OSDMapRef cur_map, last_map;
3872 for ( ; cur_epoch <= end_epoch; cur_epoch++) {
3873 dout(10) << __func__ << " epoch " << cur_epoch << dendl;
3874 last_map = cur_map;
3875 cur_map = get_map(cur_epoch);
3876
3877 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
3878 PG *pg = i->first;
3879 pistate& p = i->second;
3880
3881 if (cur_epoch < p.start || cur_epoch > p.end)
3882 continue;
3883
3884 vector<int> acting, up;
3885 int up_primary;
3886 int primary;
3887 pg_t pgid = pg->info.pgid.pgid;
3888 if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
3889 pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
3890 cur_map->pg_to_up_acting_osds(
3891 pgid, &up, &up_primary, &acting, &primary);
3892
3893 if (p.same_interval_since == 0) {
3894 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
3895 << " first map, acting " << acting
3896 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
3897 p.same_interval_since = cur_epoch;
3898 p.old_up = up;
3899 p.old_acting = acting;
3900 p.primary = primary;
3901 p.up_primary = up_primary;
3902 continue;
3903 }
3904 assert(last_map);
3905
3906 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
3907 pg->get_is_recoverable_predicate());
3908 std::stringstream debug;
3909 bool new_interval = PastIntervals::check_new_interval(
3910 p.primary,
3911 primary,
3912 p.old_acting, acting,
3913 p.up_primary,
3914 up_primary,
3915 p.old_up, up,
3916 p.same_interval_since,
3917 pg->info.history.last_epoch_clean,
3918 cur_map, last_map,
3919 pgid,
3920 recoverable.get(),
3921 &pg->past_intervals,
3922 &debug);
3923 if (new_interval) {
3924 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
3925 << " " << debug.str() << dendl;
3926 p.old_up = up;
3927 p.old_acting = acting;
3928 p.primary = primary;
3929 p.up_primary = up_primary;
3930 p.same_interval_since = cur_epoch;
3931 }
3932 }
3933 }
3934
3935 // Now that past_intervals have been recomputed let's fix the same_interval_since
3936 // if it was cleared by import.
3937 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
3938 PG *pg = i->first;
3939 pistate& p = i->second;
3940
3941 if (pg->info.history.same_interval_since == 0) {
3942 assert(p.same_interval_since);
3943 dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
3944 dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
3945 // Fix it
3946 pg->info.history.same_interval_since = p.same_interval_since;
3947 }
3948 }
3949
3950 // write info only at the end. this is necessary because we check
3951 // whether the past_intervals go far enough back or forward in time,
3952 // but we don't check for holes. we could avoid it by discarding
3953 // the previous past_intervals and rebuilding from scratch, or we
3954 // can just do this and commit all our work at the end.
3955 ObjectStore::Transaction t;
3956 int num = 0;
3957 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
3958 PG *pg = i->first;
3959 pg->lock();
3960 pg->dirty_big_info = true;
3961 pg->dirty_info = true;
3962 pg->write_if_dirty(t);
3963 pg->unlock();
3964
3965 // don't let the transaction get too big
3966 if (++num >= cct->_conf->osd_target_transaction_size) {
3967 store->apply_transaction(service.meta_osr.get(), std::move(t));
3968 t = ObjectStore::Transaction();
3969 num = 0;
3970 }
3971 }
3972 if (!t.empty())
3973 store->apply_transaction(service.meta_osr.get(), std::move(t));
3974}
3975
3976/*
3977 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
3978 * hasn't changed since the given epoch and we are the primary.
3979 */
3980int OSD::handle_pg_peering_evt(
3981 spg_t pgid,
3982 const pg_history_t& orig_history,
3983 const PastIntervals& pi,
3984 epoch_t epoch,
3985 PG::CephPeeringEvtRef evt)
3986{
3987 if (service.splitting(pgid)) {
3988 peering_wait_for_split[pgid].push_back(evt);
3989 return -EEXIST;
3990 }
3991
3992 PG *pg = _lookup_lock_pg(pgid);
3993 if (!pg) {
3994 // same primary?
3995 if (!osdmap->have_pg_pool(pgid.pool()))
3996 return -EINVAL;
3997 int up_primary, acting_primary;
3998 vector<int> up, acting;
3999 osdmap->pg_to_up_acting_osds(
4000 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4001
4002 pg_history_t history = orig_history;
4003 bool valid_history = project_pg_history(
4004 pgid, history, epoch, up, up_primary, acting, acting_primary);
4005
4006 if (!valid_history || epoch < history.same_interval_since) {
4007 dout(10) << __func__ << pgid << " acting changed in "
4008 << history.same_interval_since << " (msg from " << epoch << ")"
4009 << dendl;
4010 return -EINVAL;
4011 }
4012
4013 if (service.splitting(pgid)) {
4014 ceph_abort();
4015 }
4016
4017 // do we need to resurrect a deleting pg?
4018 spg_t resurrected;
4019 PGRef old_pg_state;
4020 res_result result = _try_resurrect_pg(
4021 service.get_osdmap(),
4022 pgid,
4023 &resurrected,
4024 &old_pg_state);
4025
4026 PG::RecoveryCtx rctx = create_context();
4027 switch (result) {
4028 case RES_NONE: {
4029 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4030 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4031 store->get_type() != "bluestore") {
4032 clog->warn() << "pg " << pgid
4033 << " is at risk of silent data corruption: "
4034 << "the pool allows ec overwrites but is not stored in "
4035 << "bluestore, so deep scrubbing will not detect bitrot";
4036 }
4037 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4038 PG::_init(*rctx.transaction, pgid, pp);
4039
4040 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4041 if (!pp->is_replicated() && role != pgid.shard)
4042 role = -1;
4043
4044 pg = _create_lock_pg(
4045 get_map(epoch),
4046 pgid, false, false,
4047 role,
4048 up, up_primary,
4049 acting, acting_primary,
4050 history, pi,
4051 *rctx.transaction);
4052 pg->handle_create(&rctx);
4053 pg->write_if_dirty(*rctx.transaction);
4054 dispatch_context(rctx, pg, osdmap);
4055
4056 dout(10) << *pg << " is new" << dendl;
4057
4058 pg->queue_peering_event(evt);
4059 wake_pg_waiters(pg);
4060 pg->unlock();
4061 return 0;
4062 }
4063 case RES_SELF: {
4064 old_pg_state->lock();
4065 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4066 int old_role = old_pg_state->role;
4067 vector<int> old_up = old_pg_state->up;
4068 int old_up_primary = old_pg_state->up_primary.osd;
4069 vector<int> old_acting = old_pg_state->acting;
4070 int old_primary = old_pg_state->primary.osd;
4071 pg_history_t old_history = old_pg_state->info.history;
4072 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4073 old_pg_state->unlock();
4074 pg = _create_lock_pg(
4075 old_osd_map,
4076 resurrected,
4077 false,
4078 true,
4079 old_role,
4080 old_up,
4081 old_up_primary,
4082 old_acting,
4083 old_primary,
4084 old_history,
4085 old_past_intervals,
4086 *rctx.transaction);
4087 pg->handle_create(&rctx);
4088 pg->write_if_dirty(*rctx.transaction);
4089 dispatch_context(rctx, pg, osdmap);
4090
4091 dout(10) << *pg << " is new (resurrected)" << dendl;
4092
4093 pg->queue_peering_event(evt);
4094 wake_pg_waiters(pg);
4095 pg->unlock();
4096 return 0;
4097 }
4098 case RES_PARENT: {
4099 assert(old_pg_state);
4100 old_pg_state->lock();
4101 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4102 int old_role = old_pg_state->role;
4103 vector<int> old_up = old_pg_state->up;
4104 int old_up_primary = old_pg_state->up_primary.osd;
4105 vector<int> old_acting = old_pg_state->acting;
4106 int old_primary = old_pg_state->primary.osd;
4107 pg_history_t old_history = old_pg_state->info.history;
4108 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4109 old_pg_state->unlock();
4110 PG *parent = _create_lock_pg(
4111 old_osd_map,
4112 resurrected,
4113 false,
4114 true,
4115 old_role,
4116 old_up,
4117 old_up_primary,
4118 old_acting,
4119 old_primary,
4120 old_history,
4121 old_past_intervals,
4122 *rctx.transaction
4123 );
4124 parent->handle_create(&rctx);
4125 parent->write_if_dirty(*rctx.transaction);
4126 dispatch_context(rctx, parent, osdmap);
4127
4128 dout(10) << *parent << " is new" << dendl;
4129
4130 assert(service.splitting(pgid));
4131 peering_wait_for_split[pgid].push_back(evt);
4132
4133 //parent->queue_peering_event(evt);
4134 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4135 wake_pg_waiters(parent);
4136 parent->unlock();
4137 return 0;
4138 }
4139 default:
4140 assert(0);
4141 return 0;
4142 }
4143 } else {
4144 // already had it. did the mapping change?
4145 if (epoch < pg->info.history.same_interval_since) {
4146 dout(10) << *pg << __func__ << " acting changed in "
4147 << pg->info.history.same_interval_since
4148 << " (msg from " << epoch << ")" << dendl;
4149 } else {
4150 pg->queue_peering_event(evt);
4151 }
4152 pg->unlock();
4153 return -EEXIST;
4154 }
4155}
4156
4157
// Build the initial pg_history_t and PastIntervals for a pg created at
// epoch `created`: seed every history floor to the creation epoch/stamp,
// then replay every osdmap from created+1 up to the current epoch, recording
// each interval boundary via PastIntervals::check_new_interval.
//
// @param pgid           pg being created
// @param created        epoch the pg was created in
// @param created_stamp  creation time; seeds the scrub stamps
// @param h              [out] history to fill
// @param pi             [out] past intervals to fill
void OSD::build_initial_pg_history(
  spg_t pgid,
  epoch_t created,
  utime_t created_stamp,
  pg_history_t *h,
  PastIntervals *pi)
{
  dout(10) << __func__ << " " << pgid << " created " << created << dendl;
  h->epoch_created = created;
  h->same_interval_since = created;
  h->same_up_since = created;
  h->same_primary_since = created;
  h->last_scrub_stamp = created_stamp;
  h->last_deep_scrub_stamp = created_stamp;
  h->last_clean_scrub_stamp = created_stamp;

  // Mapping as of the creation epoch; updated as we walk forward.
  OSDMapRef lastmap = service.get_map(created);
  int up_primary, acting_primary;
  vector<int> up, acting;
  lastmap->pg_to_up_acting_osds(
    pgid.pgid, &up, &up_primary, &acting, &acting_primary);

  ostringstream debug;
  for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
    // NOTE: shadows the OSD::osdmap member with the map for epoch e.
    OSDMapRef osdmap = service.get_map(e);
    int new_up_primary, new_acting_primary;
    vector<int> new_up, new_acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);

    // this is a bit imprecise, but sufficient?
    // (approximates recoverability as "at least min_size shards present")
    struct min_size_predicate_t : public IsPGRecoverablePredicate {
      const pg_pool_t *pi;
      bool operator()(const set<pg_shard_t> &have) const {
	return have.size() >= pi->min_size;
      }
      min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
    } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));

    // Records the interval into *pi when the mapping change is significant.
    bool new_interval = PastIntervals::check_new_interval(
      acting_primary,
      new_acting_primary,
      acting, new_acting,
      up_primary,
      new_up_primary,
      up, new_up,
      h->same_interval_since,
      h->last_epoch_clean,
      osdmap,
      lastmap,
      pgid.pgid,
      &min_size_predicate,
      pi,
      &debug);
    if (new_interval) {
      h->same_interval_since = e;
    }
    if (up != new_up) {
      h->same_up_since = e;
    }
    if (acting_primary != new_acting_primary) {
      h->same_primary_since = e;
    }
    lastmap = osdmap;
  }
  dout(20) << __func__ << " " << debug.str() << dendl;
  dout(10) << __func__ << " " << *h << " " << *pi
	   << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
		       pi->get_bounds()) << ")"
	   << dendl;
}
4229
4230/**
4231 * Fill in the passed history so you know same_interval_since, same_up_since,
4232 * and same_primary_since.
4233 */
// Walk osdmaps BACKWARD from the current epoch down to `from`, raising the
// same_interval_since / same_up_since / same_primary_since floors in `h` to
// the first epoch at which the mapping differs from the current one
// (currentup/currentacting, passed by the caller).
//
// Returns false if an intermediate map is no longer available (map gap),
// in which case `h` must not be trusted.
bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
			     const vector<int>& currentup,
			     int currentupprimary,
			     const vector<int>& currentacting,
			     int currentactingprimary)
{
  dout(15) << "project_pg_history " << pgid
           << " from " << from << " to " << osdmap->get_epoch()
           << ", start " << h
	   << dendl;

  epoch_t e;
  for (e = osdmap->get_epoch();
       e > from;
       e--) {
    // verify during intermediate epoch (e-1)
    OSDMapRef oldmap = service.try_get_map(e-1);
    if (!oldmap) {
      dout(15) << __func__ << ": found map gap, returning false" << dendl;
      return false;
    }
    assert(oldmap->have_pg_pool(pgid.pool()));

    int upprimary, actingprimary;
    vector<int> up, acting;
    oldmap->pg_to_up_acting_osds(
      pgid.pgid,
      &up,
      &upprimary,
      &acting,
      &actingprimary);

    // acting set change?
    if ((actingprimary != currentactingprimary ||
	 upprimary != currentupprimary ||
	 acting != currentacting ||
	 up != currentup) && e > h.same_interval_since) {
      dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
	       << " from " << acting << "/" << up
	       << " " << actingprimary << "/" << upprimary
	       << " -> " << currentacting << "/" << currentup
	       << " " << currentactingprimary << "/" << currentupprimary
	       << dendl;
      h.same_interval_since = e;
    }
    // split?
    if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
		      osdmap->get_pg_num(pgid.pool()),
		      0) && e > h.same_interval_since) {
      h.same_interval_since = e;
    }
    // up set change?
    if ((up != currentup || upprimary != currentupprimary)
	&& e > h.same_up_since) {
      dout(15) << "project_pg_history " << pgid << " up changed in " << e
	       << " from " << up << " " << upprimary
	       << " -> " << currentup << " " << currentupprimary << dendl;
      h.same_up_since = e;
    }

    // primary change?
    if (OSDMap::primary_changed(
	  actingprimary,
	  acting,
	  currentactingprimary,
	  currentacting) &&
        e > h.same_primary_since) {
      dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
      h.same_primary_since = e;
    }

    // All floors settled at or above e: no older map can move them.
    if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
      break;
  }

  // base case: these floors should be the creation epoch if we didn't
  // find any changes.
  if (e == h.epoch_created) {
    if (!h.same_interval_since)
      h.same_interval_since = e;
    if (!h.same_up_since)
      h.same_up_since = e;
    if (!h.same_primary_since)
      h.same_primary_since = e;
  }

  dout(15) << "project_pg_history end " << h << dendl;
  return true;
}
4323
4324
4325
4326void OSD::_add_heartbeat_peer(int p)
4327{
4328 if (p == whoami)
4329 return;
4330 HeartbeatInfo *hi;
4331
4332 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4333 if (i == heartbeat_peers.end()) {
4334 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4335 if (!cons.first)
4336 return;
4337 hi = &heartbeat_peers[p];
4338 hi->peer = p;
4339 HeartbeatSession *s = new HeartbeatSession(p);
4340 hi->con_back = cons.first.get();
4341 hi->con_back->set_priv(s->get());
4342 if (cons.second) {
4343 hi->con_front = cons.second.get();
4344 hi->con_front->set_priv(s->get());
4345 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4346 << " " << hi->con_back->get_peer_addr()
4347 << " " << hi->con_front->get_peer_addr()
4348 << dendl;
4349 } else {
4350 hi->con_front.reset(NULL);
4351 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4352 << " " << hi->con_back->get_peer_addr()
4353 << dendl;
4354 }
4355 s->put();
4356 } else {
4357 hi = &i->second;
4358 }
4359 hi->epoch = osdmap->get_epoch();
4360}
4361
4362void OSD::_remove_heartbeat_peer(int n)
4363{
4364 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4365 assert(q != heartbeat_peers.end());
4366 dout(20) << " removing heartbeat peer osd." << n
4367 << " " << q->second.con_back->get_peer_addr()
4368 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4369 << dendl;
4370 q->second.con_back->mark_down();
4371 if (q->second.con_front) {
4372 q->second.con_front->mark_down();
4373 }
4374 heartbeat_peers.erase(q);
4375}
4376
4377void OSD::need_heartbeat_peer_update()
4378{
4379 if (is_stopping())
4380 return;
4381 dout(20) << "need_heartbeat_peer_update" << dendl;
4382 heartbeat_set_peers_need_update();
4383}
4384
// Rebuild the heartbeat peer set if it has been flagged stale: collect pg
// peers and probe targets, always include our ring neighbors, drop down
// peers, top up to osd_heartbeat_min_peers with random up OSDs, and trim
// stale extras back down.  Requires osd_lock; takes heartbeat_lock itself.
void OSD::maybe_update_heartbeat_peers()
{
  assert(osd_lock.is_locked());

  // While waiting to become healthy, force a fresh peer sample every
  // osd_heartbeat_grace seconds so we don't get stuck on dead peers.
  if (is_waiting_for_healthy()) {
    utime_t now = ceph_clock_now();
    if (last_heartbeat_resample == utime_t()) {
      last_heartbeat_resample = now;
      heartbeat_set_peers_need_update();
    } else if (!heartbeat_peers_need_update()) {
      utime_t dur = now - last_heartbeat_resample;
      if (dur > cct->_conf->osd_heartbeat_grace) {
	dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
	heartbeat_set_peers_need_update();
	last_heartbeat_resample = now;
	reset_heartbeat_peers();   // we want *new* peers!
      }
    }
  }

  if (!heartbeat_peers_need_update())
    return;
  // NOTE(review): the need-update flag is cleared before heartbeat_lock is
  // taken; presumably osd_lock serializes the flag — confirm.
  heartbeat_clear_peers_need_update();

  Mutex::Locker l(heartbeat_lock);

  dout(10) << "maybe_update_heartbeat_peers updating" << dendl;


  // build heartbeat from set
  if (is_active()) {
    RWLock::RLocker l(pg_map_lock);
    for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
	 i != pg_map.end();
	 ++i) {
      PG *pg = i->second;
      pg->heartbeat_peer_lock.Lock();
      dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
      for (set<int>::iterator p = pg->heartbeat_peers.begin();
	   p != pg->heartbeat_peers.end();
	   ++p)
	if (osdmap->is_up(*p))
	  _add_heartbeat_peer(*p);
      for (set<int>::iterator p = pg->probe_targets.begin();
	   p != pg->probe_targets.end();
	   ++p)
	if (osdmap->is_up(*p))
	  _add_heartbeat_peer(*p);
      pg->heartbeat_peer_lock.Unlock();
    }
  }

  // include next and previous up osds to ensure we have a fully-connected set
  set<int> want, extras;
  int next = osdmap->get_next_up_osd_after(whoami);
  if (next >= 0)
    want.insert(next);
  int prev = osdmap->get_previous_up_osd_before(whoami);
  if (prev >= 0 && prev != next)
    want.insert(prev);

  for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
    dout(10) << " adding neighbor peer osd." << *p << dendl;
    extras.insert(*p);
    _add_heartbeat_peer(*p);
  }

  // remove down peers; enumerate extras
  map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
  while (p != heartbeat_peers.end()) {
    if (!osdmap->is_up(p->first)) {
      int o = p->first;
      ++p;
      _remove_heartbeat_peer(o);
      continue;
    }
    // peers not re-added during this pass keep an old epoch: candidates
    // for trimming below
    if (p->second.epoch < osdmap->get_epoch()) {
      extras.insert(p->first);
    }
    ++p;
  }

  // too few?
  int start = osdmap->get_next_up_osd_after(whoami);
  for (int n = start; n >= 0; ) {
    if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
      break;
    if (!extras.count(n) && !want.count(n) && n != whoami) {
      dout(10) << " adding random peer osd." << n << dendl;
      extras.insert(n);
      _add_heartbeat_peer(n);
    }
    n = osdmap->get_next_up_osd_after(n);
    if (n == start)
      break;  // came full circle; stop
  }

  // too many?
  for (set<int>::iterator p = extras.begin();
       (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
       ++p) {
    if (want.count(*p))
      continue;
    _remove_heartbeat_peer(*p);
  }

  dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
}
4493
4494void OSD::reset_heartbeat_peers()
4495{
4496 assert(osd_lock.is_locked());
4497 dout(10) << "reset_heartbeat_peers" << dendl;
4498 Mutex::Locker l(heartbeat_lock);
4499 while (!heartbeat_peers.empty()) {
4500 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4501 hi.con_back->mark_down();
4502 if (hi.con_front) {
4503 hi.con_front->mark_down();
4504 }
4505 heartbeat_peers.erase(heartbeat_peers.begin());
4506 }
4507 failure_queue.clear();
4508}
4509
// Handle a heartbeat message (PING / PING_REPLY / YOU_DIED) arriving on one
// of the hb messengers.  Consumes (puts) the message on every path.
void OSD::handle_osd_ping(MOSDPing *m)
{
  // Ignore heartbeats from another cluster.
  if (superblock.cluster_fsid != m->fsid) {
    dout(20) << "handle_osd_ping from " << m->get_source_inst()
	     << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
    m->put();
    return;
  }

  int from = m->get_source().num();

  heartbeat_lock.Lock();
  if (is_stopping()) {
    heartbeat_lock.Unlock();
    m->put();
    return;
  }

  OSDMapRef curmap = service.get_osdmap();
  assert(curmap);

  switch (m->op) {

  case MOSDPing::PING:
    {
      // Debug hook: probabilistically drop incoming pings (for a configured
      // number of consecutive messages) to exercise failure detection.
      if (cct->_conf->osd_debug_drop_ping_probability > 0) {
	auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
	if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
	  if (heartbeat_drop->second == 0) {
	    debug_heartbeat_drops_remaining.erase(heartbeat_drop);
	  } else {
	    --heartbeat_drop->second;
	    dout(5) << "Dropping heartbeat from " << from
		    << ", " << heartbeat_drop->second
		    << " remaining to drop" << dendl;
	    break;
	  }
	} else if (cct->_conf->osd_debug_drop_ping_probability >
	           ((((double)(rand()%100))/100.0))) {
	  heartbeat_drop =
	    debug_heartbeat_drops_remaining.insert(std::make_pair(from,
	                     cct->_conf->osd_debug_drop_ping_duration)).first;
	  dout(5) << "Dropping heartbeat from " << from
		  << ", " << heartbeat_drop->second
		  << " remaining to drop" << dendl;
	  break;
	}
      }

      // Don't answer if our own worker threads are wedged; staying silent
      // lets peers report us down instead of trusting a sick OSD.
      if (!cct->get_heartbeat_map()->is_healthy()) {
	dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
	break;
      }

      // Echo the sender's stamp back so it can track reply latency.
      Message *r = new MOSDPing(monc->get_fsid(),
				curmap->get_epoch(),
				MOSDPing::PING_REPLY,
				m->stamp);
      m->get_connection()->send_message(r);

      if (curmap->is_up(from)) {
	// Opportunistically share newer maps with the peer.
	service.note_peer_epoch(from, m->map_epoch);
	if (is_active()) {
	  ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
	  if (con) {
	    service.share_map_peer(from, con.get());
	  }
	}
      } else if (!curmap->exists(from) ||
		 curmap->get_down_at(from) > m->map_epoch) {
	// tell them they have died
	Message *r = new MOSDPing(monc->get_fsid(),
				  curmap->get_epoch(),
				  MOSDPing::YOU_DIED,
				  m->stamp);
	m->get_connection()->send_message(r);
      }
    }
    break;

  case MOSDPing::PING_REPLY:
    {
      map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
      if (i != heartbeat_peers.end()) {
	// Record the reply stamp against whichever connection it came in on.
	if (m->get_connection() == i->second.con_back) {
	  dout(25) << "handle_osd_ping got reply from osd." << from
		   << " first_tx " << i->second.first_tx
		   << " last_tx " << i->second.last_tx
		   << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
		   << " last_rx_front " << i->second.last_rx_front
		   << dendl;
	  i->second.last_rx_back = m->stamp;
	  // if there is no front con, set both stamps.
	  if (i->second.con_front == NULL)
	    i->second.last_rx_front = m->stamp;
	} else if (m->get_connection() == i->second.con_front) {
	  dout(25) << "handle_osd_ping got reply from osd." << from
		   << " first_tx " << i->second.first_tx
		   << " last_tx " << i->second.last_tx
		   << " last_rx_back " << i->second.last_rx_back
		   << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
		   << dendl;
	  i->second.last_rx_front = m->stamp;
	}

	// Peer replied within the grace window: withdraw any failure report
	// we queued or already sent for it.
	utime_t cutoff = ceph_clock_now();
	cutoff -= cct->_conf->osd_heartbeat_grace;
	if (i->second.is_healthy(cutoff)) {
	  // Cancel false reports
	  auto failure_queue_entry = failure_queue.find(from);
	  if (failure_queue_entry != failure_queue.end()) {
	    dout(10) << "handle_osd_ping canceling queued "
		     << "failure report for osd." << from << dendl;
	    failure_queue.erase(failure_queue_entry);
	  }

	  auto failure_pending_entry = failure_pending.find(from);
	  if (failure_pending_entry != failure_pending.end()) {
	    dout(10) << "handle_osd_ping canceling in-flight "
		     << "failure report for osd." << from << dendl;
	    send_still_alive(curmap->get_epoch(),
			     failure_pending_entry->second.second);
	    failure_pending.erase(failure_pending_entry);
	  }
	}
      }

      if (m->map_epoch &&
	  curmap->is_up(from)) {
	service.note_peer_epoch(from, m->map_epoch);
	if (is_active()) {
	  ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
	  if (con) {
	    service.share_map_peer(from, con.get());
	  }
	}
      }
    }
    break;

  case MOSDPing::YOU_DIED:
    // A peer believes we are marked down; fetch the next map to find out.
    dout(10) << "handle_osd_ping " << m->get_source_inst()
	     << " says i am down in " << m->map_epoch << dendl;
    osdmap_subscribe(curmap->get_epoch()+1, false);
    break;
  }

  heartbeat_lock.Unlock();
  m->put();
}
4660
4661void OSD::heartbeat_entry()
4662{
4663 Mutex::Locker l(heartbeat_lock);
4664 if (is_stopping())
4665 return;
4666 while (!heartbeat_stop) {
4667 heartbeat();
4668
4669 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
4670 utime_t w;
4671 w.set_from_double(wait);
4672 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
4673 heartbeat_cond.WaitInterval(heartbeat_lock, w);
4674 if (is_stopping())
4675 return;
4676 dout(30) << "heartbeat_entry woke up" << dendl;
4677 }
4678}
4679
4680void OSD::heartbeat_check()
4681{
4682 assert(heartbeat_lock.is_locked());
4683 utime_t now = ceph_clock_now();
4684
4685 // check for heartbeat replies (move me elsewhere?)
4686 utime_t cutoff = now;
4687 cutoff -= cct->_conf->osd_heartbeat_grace;
4688 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4689 p != heartbeat_peers.end();
4690 ++p) {
4691
4692 if (p->second.first_tx == utime_t()) {
4693 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
4694 << "yet, skipping" << dendl;
4695 continue;
4696 }
4697
4698 dout(25) << "heartbeat_check osd." << p->first
4699 << " first_tx " << p->second.first_tx
4700 << " last_tx " << p->second.last_tx
4701 << " last_rx_back " << p->second.last_rx_back
4702 << " last_rx_front " << p->second.last_rx_front
4703 << dendl;
4704 if (p->second.is_unhealthy(cutoff)) {
4705 if (p->second.last_rx_back == utime_t() ||
4706 p->second.last_rx_front == utime_t()) {
4707 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
4708 << " osd." << p->first << " ever on either front or back, first ping sent "
4709 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
4710 // fail
4711 failure_queue[p->first] = p->second.last_tx;
4712 } else {
4713 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
4714 << " osd." << p->first << " since back " << p->second.last_rx_back
4715 << " front " << p->second.last_rx_front
4716 << " (cutoff " << cutoff << ")" << dendl;
4717 // fail
4718 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
4719 }
4720 }
4721 }
4722}
4723
// One heartbeat round: sample CPU load, refresh our osd_stat against the
// current peer list, send a PING (with the current time stamp) on every
// peer's back and front connections, and — if we have no peers at all —
// periodically poke the mon for a newer map.
void OSD::heartbeat()
{
  dout(30) << "heartbeat" << dendl;

  // get CPU load avg
  // n_samples sizes a one-day exponential-ish average of the 1-min loadavg.
  // NOTE(review): assumes osd_heartbeat_interval > 0 — confirm config bounds.
  double loadavgs[1];
  int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
  if (getloadavg(loadavgs, 1) == 1) {
    logger->set(l_osd_loadavg, 100 * loadavgs[0]);
    daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
    dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
  }

  dout(30) << "heartbeat checking stats" << dendl;

  // refresh stats?
  vector<int> hb_peers;
  for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
       p != heartbeat_peers.end();
       ++p)
    hb_peers.push_back(p->first);
  service.update_osd_stat(hb_peers);

  dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;

  utime_t now = ceph_clock_now();

  // send heartbeats
  for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
       i != heartbeat_peers.end();
       ++i) {
    int peer = i->first;
    // Record tx times before sending; heartbeat_check compares replies
    // against these.
    i->second.last_tx = now;
    if (i->second.first_tx == utime_t())
      i->second.first_tx = now;
    dout(30) << "heartbeat sending ping to osd." << peer << dendl;
    i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
					  service.get_osdmap()->get_epoch(),
					  MOSDPing::PING,
					  now));

    if (i->second.con_front)
      i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
					     service.get_osdmap()->get_epoch(),
						     MOSDPing::PING,
						     now));
  }

  logger->set(l_osd_hb_to, heartbeat_peers.size());

  // hmm.. am i all alone?
  dout(30) << "heartbeat lonely?" << dendl;
  if (heartbeat_peers.empty()) {
    if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
      last_mon_heartbeat = now;
      dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
      osdmap_subscribe(osdmap->get_epoch() + 1, false);
    }
  }

  dout(30) << "heartbeat done" << dendl;
}
4786
// Messenger callback: a heartbeat connection failed.  If it belongs to a
// tracked peer, try to reopen fresh back/front connections (re-attaching the
// existing HeartbeatSession); if the peer is gone from the map, drop it.
// Always returns true (connection reset is fully handled here).
bool OSD::heartbeat_reset(Connection *con)
{
  // The session ref obtained from get_priv() is released via s->put() on
  // every exit path below.
  HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
  if (s) {
    heartbeat_lock.Lock();
    if (is_stopping()) {
      heartbeat_lock.Unlock();
      s->put();
      return true;
    }
    map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
    if (p != heartbeat_peers.end() &&
	(p->second.con_back == con ||
	 p->second.con_front == con)) {
      dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
	       << ", reopening" << dendl;
      // Mark down whichever of the pair did NOT trigger this reset, then
      // drop both: they are replaced together.
      if (con != p->second.con_back) {
	p->second.con_back->mark_down();
      }
      p->second.con_back.reset(NULL);
      if (p->second.con_front && con != p->second.con_front) {
	p->second.con_front->mark_down();
      }
      p->second.con_front.reset(NULL);
      pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
      if (newcon.first) {
	// Attach the existing session to the new connections (each
	// set_priv takes its own ref).
	p->second.con_back = newcon.first.get();
	p->second.con_back->set_priv(s->get());
	if (newcon.second) {
	  p->second.con_front = newcon.second.get();
	  p->second.con_front->set_priv(s->get());
	}
      } else {
	dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
		 << ", raced with osdmap update, closing out peer" << dendl;
	heartbeat_peers.erase(p);
      }
    } else {
      dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
    }
    heartbeat_lock.Unlock();
    s->put();
  }
  return true;
}
4832
4833
4834
4835// =========================================
4836
4837void OSD::tick()
4838{
4839 assert(osd_lock.is_locked());
4840 dout(10) << "tick" << dendl;
4841
4842 if (is_active() || is_waiting_for_healthy()) {
4843 maybe_update_heartbeat_peers();
4844 }
4845
4846 if (is_waiting_for_healthy()) {
4847 start_boot();
4848 }
4849
4850 do_waiters();
4851
4852 tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
4853
4854 if (is_active()) {
4855 const auto now = ceph::coarse_mono_clock::now();
4856 const auto elapsed = now - last_sent_beacon;
4857 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
4858 cct->_conf->osd_beacon_report_interval) {
4859 send_beacon(now);
4860 }
4861 }
4862}
4863
// Periodic housekeeping that deliberately runs WITHOUT osd_lock: buffer/crc
// perfcounters, heartbeat checking, mon-report pacing (with backoff when the
// mon stops acking pg stats), scrub scheduling, op-tracker warnings, and
// rescheduling of the next tick.  Lock order used here: heartbeat_lock,
// then map_lock(read) -> mon_report_lock -> pg_stat_queue_lock.
void OSD::tick_without_osd_lock()
{
  assert(tick_timer_lock.is_locked());
  dout(10) << "tick_without_osd_lock" << dendl;

  logger->set(l_osd_buf, buffer::get_total_alloc());
  logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
  logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
  logger->set(l_osd_cached_crc, buffer::get_cached_crc());
  logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
  logger->set(l_osd_missed_crc, buffer::get_missed_crc());

  // osd_lock is not being held, which means the OSD state
  // might change when doing the monitor report
  if (is_active() || is_waiting_for_healthy()) {
    heartbeat_lock.Lock();
    heartbeat_check();
    heartbeat_lock.Unlock();

    map_lock.get_read();
    Mutex::Locker l(mon_report_lock);

    // mon report?
    bool reset = false;
    bool report = false;
    utime_t now = ceph_clock_now();
    pg_stat_queue_lock.Lock();
    // stats_ack_timeout grows multiplicatively while the mon is silent;
    // backoff scales the minimum report interval accordingly.
    double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
    double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
    // note: we shouldn't adjust max because it must remain < the
    // mon's mon_osd_report_timeout (which defaults to 1.5x our
    // value).
    double max = cct->_conf->osd_mon_report_interval_max;
    if (!outstanding_pg_stats.empty() &&
	(now - stats_ack_timeout) > last_pg_stats_ack) {
      // Mon went quiet past the timeout: reconnect to a (possibly
      // different) mon and restart the stats state machine.
      dout(1) << __func__ << " mon hasn't acked PGStats in "
	      << now - last_pg_stats_ack
	      << " seconds, reconnecting elsewhere" << dendl;
      reset = true;
      last_pg_stats_ack = now;  // reset clock
      last_pg_stats_sent = utime_t();
      stats_ack_timeout =
	MAX(cct->_conf->osd_mon_ack_timeout,
	    stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
      outstanding_pg_stats.clear();
    }
    if (now - last_pg_stats_sent > max) {
      osd_stat_updated = true;
      report = true;
    } else if (service.need_fullness_update()) {
      report = true;
    } else if ((int)outstanding_pg_stats.size() >=
	       cct->_conf->osd_mon_report_max_in_flight) {
      // Too many unacked stat messages in flight: hold off.
      dout(20) << __func__ << " have max " << outstanding_pg_stats
	       << " stats updates in flight" << dendl;
    } else {
      if (now - last_mon_report > adjusted_min) {
	dout(20) << __func__ << " stats backoff " << backoff
		 << " adjusted_min " << adjusted_min << " - sending report"
		 << dendl;
	osd_stat_updated = true;
	report = true;
      }
    }
    pg_stat_queue_lock.Unlock();

    if (reset) {
      monc->reopen_session();
    } else if (report) {
      last_mon_report = now;

      // do any pending reports
      send_full_update();
      send_failures();
      send_pg_stats(now);
    }
    map_lock.put_read();
  }

  if (is_active()) {
    if (!scrub_random_backoff()) {
      sched_scrub();
    }
    service.promote_throttle_recalibrate();
  }

  check_ops_in_flight();
  service.kick_recovery_queue();
  tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
}
4954
4955void OSD::check_ops_in_flight()
4956{
4957 vector<string> warnings;
4958 if (op_tracker.check_ops_in_flight(warnings)) {
4959 for (vector<string>::iterator i = warnings.begin();
4960 i != warnings.end();
4961 ++i) {
4962 clog->warn() << *i;
4963 }
4964 }
4965}
4966
4967// Usage:
4968// setomapval <pool-id> [namespace/]<obj-name> <key> <val>
4969// rmomapkey <pool-id> [namespace/]<obj-name> <key>
4970// setomapheader <pool-id> [namespace/]<obj-name> <header>
4971// getomap <pool> [namespace/]<obj-name>
4972// truncobj <pool-id> [namespace/]<obj-name> <newlen>
4973// injectmdataerr [namespace/]<obj-name> [shardid]
4974// injectdataerr [namespace/]<obj-name> [shardid]
4975//
4976// set_recovery_delay [utime]
// Admin-socket test hook: dispatches debug/test commands against a single
// OSD.  Supports directly mutating an object's omap/size, injecting
// data/metadata errors, forcing a scrub, injecting fullness states, and
// tweaking recovery delay.  Results and errors are written to `ss`.
void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
     const std::string &command, cmdmap_t& cmdmap, ostream &ss)
{
  //Test support
  //Support changing the omap on a single osd by using the Admin Socket to
  //directly request the osd make a change.
  if (command == "setomapval" || command == "rmomapkey" ||
      command == "setomapheader" || command == "getomap" ||
      command == "truncobj" || command == "injectmdataerr" ||
      command == "injectdataerr"
    ) {
    pg_t rawpg;
    int64_t pool;
    OSDMapRef curmap = service->get_osdmap();
    int r = -1;

    string poolstr;

    cmd_getval(service->cct, cmdmap, "pool", poolstr);
    pool = curmap->lookup_pg_pool_name(poolstr);
    //If we can't find it by name then maybe id specified
    if (pool < 0 && isdigit(poolstr[0]))
      pool = atoll(poolstr.c_str());
    if (pool < 0) {
      ss << "Invalid pool" << poolstr;
      return;
    }

    // objname may be prefixed with a namespace: "[namespace/]objname"
    string objname, nspace;
    cmd_getval(service->cct, cmdmap, "objname", objname);
    std::size_t found = objname.find_first_of('/');
    if (found != string::npos) {
      nspace = objname.substr(0, found);
      objname = objname.substr(found+1);
    }
    object_locator_t oloc(pool, nspace);
    r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);

    if (r < 0) {
      ss << "Invalid namespace/objname";
      return;
    }

    // optional shard id for EC pools; defaults to NO_SHARD
    int64_t shardid;
    cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
    hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
    ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
    spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
    if (curmap->pg_is_ec(rawpg)) {
      // only the error-injection commands make sense on EC pools
      if ((command != "injectdataerr") && (command != "injectmdataerr")) {
        ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
        return;
      }
    }

    ObjectStore::Transaction t;

    if (command == "setomapval") {
      // set a single omap key/value pair directly in the object store
      map<string, bufferlist> newattrs;
      bufferlist val;
      string key, valstr;
      cmd_getval(service->cct, cmdmap, "key", key);
      cmd_getval(service->cct, cmdmap, "val", valstr);

      val.append(valstr);
      newattrs[key] = val;
      t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
      r = store->apply_transaction(service->meta_osr.get(), std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "rmomapkey") {
      // remove a single omap key
      string key;
      set<string> keys;
      cmd_getval(service->cct, cmdmap, "key", key);

      keys.insert(key);
      t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
      r = store->apply_transaction(service->meta_osr.get(), std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "setomapheader") {
      // overwrite the omap header blob
      bufferlist newheader;
      string headerstr;

      cmd_getval(service->cct, cmdmap, "header", headerstr);
      newheader.append(headerstr);
      t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
      r = store->apply_transaction(service->meta_osr.get(), std::move(t));
      if (r < 0)
        ss << "error=" << r;
      else
        ss << "ok";
    } else if (command == "getomap") {
      //Debug: Output entire omap
      bufferlist hdrbl;
      map<string, bufferlist> keyvals;
      r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
      if (r >= 0) {
        ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
        for (map<string, bufferlist>::iterator it = keyvals.begin();
            it != keyvals.end(); ++it)
          ss << " key=" << (*it).first << " val="
             << string((*it).second.c_str(), (*it).second.length());
      } else {
        ss << "error=" << r;
      }
    } else if (command == "truncobj") {
      // truncate the object to the requested length
      int64_t trunclen;
      cmd_getval(service->cct, cmdmap, "len", trunclen);
      t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
      r = store->apply_transaction(service->meta_osr.get(), std::move(t));
      if (r < 0)
	ss << "error=" << r;
      else
	ss << "ok";
    } else if (command == "injectdataerr") {
      store->inject_data_error(gobj);
      ss << "ok";
    } else if (command == "injectmdataerr") {
      store->inject_mdata_error(gobj);
      ss << "ok";
    }
    return;
  }
  if (command == "set_recovery_delay") {
    // set osd_recovery_delay_start via the config subsystem
    int64_t delay;
    cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
    ostringstream oss;
    oss << delay;
    int r = service->cct->_conf->set_val("osd_recovery_delay_start",
					 oss.str().c_str());
    if (r != 0) {
      ss << "set_recovery_delay: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      return;
    }
    service->cct->_conf->apply_changes(NULL);
    ss << "set_recovery_delay: set osd_recovery_delay_start "
       << "to " << service->cct->_conf->osd_recovery_delay_start;
    return;
  }
  if (command ==  "trigger_scrub") {
    spg_t pgid;
    OSDMapRef curmap = service->get_osdmap();

    string pgidstr;

    cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "Invalid pgid specified";
      return;
    }

    // _lookup_lock_pg returns the PG locked; unlocked below
    PG *pg = service->osd->_lookup_lock_pg(pgid);
    if (pg == nullptr) {
      ss << "Can't find pg " << pgid;
      return;
    }

    if (pg->is_primary()) {
      pg->unreg_next_scrub();
      const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
      double pool_scrub_max_interval = 0;
      p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
      double scrub_max_interval = pool_scrub_max_interval > 0 ?
        pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
      // Instead of marking must_scrub force a schedule scrub
      utime_t stamp = ceph_clock_now();
      stamp -= scrub_max_interval;
      stamp -=  100.0;  // push back last scrub more for good measure
      pg->info.history.last_scrub_stamp = stamp;
      pg->reg_next_scrub();
      ss << "ok";
    } else {
      ss << "Not primary";
    }
    pg->unlock();
    return;
  }
  if (command == "injectfull") {
    // simulate a fullness state (nearfull/backfillfull/full/failsafe) for
    // `count` operations; "none" or count==0 clears the injection
    int64_t count;
    string type;
    OSDService::s_names state;
    cmd_getval(service->cct, cmdmap, "type", type, string("full"));
    cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
    if (type == "none" || count == 0) {
      type = "none";
      count = 0;
    }
    state = service->get_full_state(type);
    if (state == OSDService::s_names::INVALID) {
      ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
      return;
    }
    service->set_injectfull(state, count);
    return;
  }
  ss << "Internal error - command=" << command;
}
5181
5182// =========================================
// Remove (a batch of) objects from a PG collection as part of PG deletion.
// Objects are deleted in transactions of at most osd_target_transaction_size
// entries; between batches the DeletingState is "paused" so deletion can be
// cancelled.  Sets *finished=true once the collection listing is exhausted.
// Returns false if clearing was cancelled (dstate resume/pause failed).
bool remove_dir(
  CephContext *cct,
  ObjectStore *store, SnapMapper *mapper,
  OSDriver *osdriver,
  ObjectStore::Sequencer *osr,
  coll_t coll, DeletingStateRef dstate,
  bool *finished,
  ThreadPool::TPHandle &handle)
{
  vector<ghobject_t> olist;
  int64_t num = 0;
  ObjectStore::Transaction t;
  ghobject_t next;
  handle.reset_tp_timeout();
  // list one "ideal" batch of objects starting from the beginning
  store->collection_list(
    coll,
    next,
    ghobject_t::get_max(),
    store->get_ideal_list_max(),
    &olist,
    &next);
  generic_dout(10) << __func__ << " " << olist << dendl;
  // default cont to true, this is safe because caller(OSD::RemoveWQ::_process())
  // will recheck the answer before it really goes on.
  bool cont = true;
  for (vector<ghobject_t>::iterator i = olist.begin();
       i != olist.end();
       ++i) {
    // pgmeta object is removed with the collection itself, not here
    if (i->is_pgmeta())
      continue;
    // purge the object's snap mapping alongside the object itself
    OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
    int r = mapper->remove_oid(i->hobj, &_t);
    if (r != 0 && r != -ENOENT) {
      ceph_abort();
    }
    t.remove(coll, *i);
    if (++num >= cct->_conf->osd_target_transaction_size) {
      // flush this batch synchronously; pause deletion state while waiting
      // so a concurrent cancel can take effect
      C_SaferCond waiter;
      store->queue_transaction(osr, std::move(t), &waiter);
      cont = dstate->pause_clearing();
      handle.suspend_tp_timeout();
      waiter.wait();
      handle.reset_tp_timeout();
      if (cont)
        cont = dstate->resume_clearing();
      if (!cont)
	return false;
      t = ObjectStore::Transaction();
      num = 0;
    }
  }
  if (num) {
    // flush the final partial batch
    C_SaferCond waiter;
    store->queue_transaction(osr, std::move(t), &waiter);
    cont = dstate->pause_clearing();
    handle.suspend_tp_timeout();
    waiter.wait();
    handle.reset_tp_timeout();
    if (cont)
      cont = dstate->resume_clearing();
  }
  // whether there are more objects to remove in the collection
  *finished = next.is_max();
  return cont;
}
5248
// Work-queue entry point for deleting a PG.  Repeatedly clears the PG's
// objects via remove_dir(); when the collection is empty, removes the PG's
// info/log metadata and the collection itself.  The DeletingStateRef acts
// as a cancellable state machine: any start/pause/resume step may report
// that deletion was aborted, in which case we simply stop.
void OSD::RemoveWQ::_process(
  pair<PGRef, DeletingStateRef> item,
  ThreadPool::TPHandle &handle)
{
  FUNCTRACE();
  PGRef pg(item.first);
  SnapMapper &mapper = pg->snap_mapper;
  OSDriver &driver = pg->osdriver;
  coll_t coll = coll_t(pg->info.pgid);
  // make sure previously queued transactions on this PG are done
  pg->osr->flush();
  bool finished = false;

  if (!item.second->start_or_resume_clearing())
    return;   // deletion was cancelled

  bool cont = remove_dir(
    pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
    &finished, handle);
  if (!cont)
    return;
  if (!finished) {
    // more objects remain: requeue ourselves to continue in another pass
    if (item.second->pause_clearing())
      queue_front(item);
    return;
  }

  if (!item.second->start_deleting())
    return;

  ObjectStore::Transaction t;
  PGLog::clear_info_log(pg->info.pgid, &t);

  // test hook: crash the OSD mid-removal
  if (cct->_conf->osd_inject_failure_on_pg_removal) {
    generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
    _exit(1);
  }
  t.remove_collection(coll);

  // We need the sequencer to stick around until the op is complete
  store->queue_transaction(
    pg->osr.get(),
    std::move(t),
    0, // onapplied
    0, // oncommit
    0, // onreadable sync
    new ContainerContext<PGRef>(pg),
    TrackedOpRef());

  item.second->finish_deleting();
}
5299// =========================================
5300
// Messenger callback: a connection was (re)established.  For monitor
// connections this means a new mon session, so depending on boot state we
// either kick off/resume booting or resend all state the mon may have
// lost (fullness, up_thru, pg_temp, failures, pg stats, full-map requests).
void OSD::ms_handle_connect(Connection *con)
{
  dout(10) << __func__ << " con " << con << dendl;
  if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
    Mutex::Locker l(osd_lock);
    if (is_stopping())
      return;
    dout(10) << __func__ << " on mon" << dendl;

    if (is_preboot()) {
      start_boot();
    } else if (is_booting()) {
      _send_boot(); // resend boot message
    } else {
      // lock order: map_lock before mon_report_lock
      map_lock.get_read();
      Mutex::Locker l2(mon_report_lock);

      utime_t now = ceph_clock_now();
      last_mon_report = now;

      // resend everything, it's a new session
      send_full_update();
      send_alive();
      service.requeue_pg_temp();
      service.send_pg_temp();
      requeue_failures();
      send_failures();
      send_pg_stats(now);

      map_lock.put_read();
      if (is_active()) {
	send_beacon(ceph::coarse_mono_clock::now());
      }
    }

    // full map requests may happen while active or pre-boot
    if (requested_full_first) {
      rerequest_full_maps();
    }
  }
}
5342
5343void OSD::ms_handle_fast_connect(Connection *con)
5344{
5345 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5346 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5347 Session *s = static_cast<Session*>(con->get_priv());
5348 if (!s) {
5349 s = new Session(cct);
5350 con->set_priv(s->get());
5351 s->con = con;
5352 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5353 << " addr=" << s->con->get_peer_addr() << dendl;
5354 // we don't connect to clients
5355 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5356 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5357 }
5358 s->put();
5359 }
5360}
5361
5362void OSD::ms_handle_fast_accept(Connection *con)
5363{
5364 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5365 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5366 Session *s = static_cast<Session*>(con->get_priv());
5367 if (!s) {
5368 s = new Session(cct);
5369 con->set_priv(s->get());
5370 s->con = con;
5371 dout(10) << "new session (incoming)" << s << " con=" << con
5372 << " addr=" << con->get_peer_addr()
5373 << " must have raced with connect" << dendl;
5374 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5375 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
5376 }
5377 s->put();
5378 }
5379}
5380
5381bool OSD::ms_handle_reset(Connection *con)
5382{
5383 Session *session = static_cast<Session*>(con->get_priv());
5384 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5385 if (!session)
5386 return false;
5387 session->wstate.reset(con);
5388 session->con.reset(NULL); // break con <-> session ref cycle
5389 // note that we break session->con *before* the session_handle_reset
5390 // cleanup below. this avoids a race between us and
5391 // PG::add_backoff, Session::check_backoff, etc.
5392 session_handle_reset(session);
5393 session->put();
5394 return true;
5395}
5396
5397bool OSD::ms_handle_refused(Connection *con)
5398{
5399 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5400 return false;
5401
5402 Session *session = static_cast<Session*>(con->get_priv());
5403 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5404 if (!session)
5405 return false;
5406 int type = con->get_peer_type();
5407 // handle only OSD failures here
5408 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5409 OSDMapRef osdmap = get_osdmap();
5410 if (osdmap) {
5411 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5412 if (id >= 0 && osdmap->is_up(id)) {
5413 // I'm cheating mon heartbeat grace logic, because we know it's not going
5414 // to respawn alone. +1 so we won't hit any boundary case.
5415 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5416 osdmap->get_inst(id),
5417 cct->_conf->osd_heartbeat_grace + 1,
5418 osdmap->get_epoch(),
5419 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
5420 ));
5421 }
5422 }
5423 }
5424 session->put();
5425 return true;
5426}
5427
// Completion context for MonClient::get_version("osdmap", ...): the mon
// client fills in `oldest`/`newest` before calling finish(), which then
// forwards the epoch range to OSD::_got_mon_epochs() on success.
struct C_OSD_GetVersion : public Context {
  OSD *osd;
  uint64_t oldest, newest;   // filled in by the get_version reply
  explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
  void finish(int r) override {
    if (r >= 0)
      osd->_got_mon_epochs(oldest, newest);
  }
};
5437
// Begin the boot sequence: if we are healthy, enter PREBOOT and ask the
// mon for its osdmap epoch range (the reply continues in _got_mon_epochs).
// If not healthy, defer booting and keep heartbeating until we are.
void OSD::start_boot()
{
  if (!_is_healthy()) {
    // if we are not healthy, do not mark ourselves up (yet)
    dout(1) << "not healthy; waiting to boot" << dendl;
    if (!is_waiting_for_healthy())
      start_waiting_for_healthy();
    // send pings sooner rather than later
    heartbeat_kick();
    return;
  }
  dout(1) << __func__ << dendl;
  set_state(STATE_PREBOOT);
  dout(10) << "start_boot - have maps " << superblock.oldest_map
	   << ".." << superblock.newest_map << dendl;
  // async: C_OSD_GetVersion::finish() -> _got_mon_epochs() -> _preboot()
  C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
  monc->get_version("osdmap", &c->newest, &c->oldest, c);
}
5456
5457void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5458{
5459 Mutex::Locker l(osd_lock);
5460 if (is_preboot()) {
5461 _preboot(oldest, newest);
5462 }
5463}
5464
// Decide whether we can send our boot message given the mon's osdmap epoch
// range [oldest..newest].  A series of gates (NOUP flag, required osdmap
// and monmap feature flags, fullness state) may block booting; if our map
// is recent enough we boot, otherwise we subscribe for newer maps first.
void OSD::_preboot(epoch_t oldest, epoch_t newest)
{
  assert(is_preboot());
  dout(10) << __func__ << " _preboot mon has osdmaps "
	   << oldest << ".." << newest << dendl;

  // ensure our local fullness awareness is accurate
  heartbeat();

  // if our map within recent history, try to add ourselves to the osdmap.
  if (osdmap->test_flag(CEPH_OSDMAP_NOUP)) {
    derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (!osdmap->test_flag(CEPH_OSDMAP_REQUIRE_JEWEL)) {
    derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
	 << dendl;
  } else if (!monc->monmap.get_required_features().contains_all(
	       ceph::features::mon::FEATURE_LUMINOUS)) {
    derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
	 << "Luminous or later before Luminous OSDs will boot" << dendl;
  } else if (service.need_fullness_update()) {
    derr << "osdmap fullness state needs update" << dendl;
    send_full_update();
  } else if (osdmap->get_epoch() >= oldest - 1 &&
	     osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
    // our map is close enough to the mon's newest: boot now
    _send_boot();
    return;
  }

  // get all the latest maps
  if (osdmap->get_epoch() + 1 >= oldest)
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  else
    osdmap_subscribe(oldest - 1, true);
}
5502
5503void OSD::send_full_update()
5504{
5505 if (!service.need_fullness_update())
5506 return;
5507 unsigned state = 0;
5508 if (service.is_full()) {
5509 state = CEPH_OSD_FULL;
5510 } else if (service.is_backfillfull()) {
5511 state = CEPH_OSD_BACKFILLFULL;
5512 } else if (service.is_nearfull()) {
5513 state = CEPH_OSD_NEARFULL;
5514 }
5515 set<string> s;
5516 OSDMap::calc_state_set(state, s);
5517 dout(10) << __func__ << " want state " << s << dendl;
5518 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
5519}
5520
// Enter the WAITING_FOR_HEALTHY state (we refuse to boot until internal
// heartbeats and enough peer heartbeats look good again).  Resetting
// last_heartbeat_resample forces the heartbeat peer set to be re-chosen.
void OSD::start_waiting_for_healthy()
{
  dout(1) << "start_waiting_for_healthy" << dendl;
  set_state(STATE_WAITING_FOR_HEALTHY);
  last_heartbeat_resample = utime_t();
}
5527
5528bool OSD::_is_healthy()
5529{
5530 if (!cct->get_heartbeat_map()->is_healthy()) {
5531 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
5532 return false;
5533 }
5534
5535 if (is_waiting_for_healthy()) {
5536 Mutex::Locker l(heartbeat_lock);
5537 utime_t cutoff = ceph_clock_now();
5538 cutoff -= cct->_conf->osd_heartbeat_grace;
5539 int num = 0, up = 0;
5540 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5541 p != heartbeat_peers.end();
5542 ++p) {
5543 if (p->second.is_healthy(cutoff))
5544 ++up;
5545 ++num;
5546 }
5547 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
5548 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
5549 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
5550 return false;
5551 }
5552 }
5553
5554 return true;
5555}
5556
// Build and send the MOSDBoot message to the monitor and enter BOOTING.
// Each of the three auxiliary messengers (cluster, hb back, hb front) may
// have been bound to a blank IP; in that case its address is derived from
// another messenger's address (keeping its own port).  Messengers with a
// real address instead get their loopback connection's Session primed.
void OSD::_send_boot()
{
  dout(10) << "_send_boot" << dendl;
  entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
  Connection *local_connection = cluster_messenger->get_loopback_connection().get();
  if (cluster_addr.is_blank_ip()) {
    // derive cluster IP from the client (public) messenger, keep our port
    int port = cluster_addr.get_port();
    cluster_addr = client_messenger->get_myaddr();
    cluster_addr.set_port(port);
    cluster_messenger->set_addr_unknowns(cluster_addr);
    dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
  } else {
    // make sure the loopback connection has a Session attached
    Session *s = static_cast<Session*>(local_connection->get_priv());
    if (s)
      s->put();
    else
      cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
  local_connection = hb_back_server_messenger->get_loopback_connection().get();
  if (hb_back_addr.is_blank_ip()) {
    // heartbeat-back follows the cluster network
    int port = hb_back_addr.get_port();
    hb_back_addr = cluster_addr;
    hb_back_addr.set_port(port);
    hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
    dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
  } else {
    Session *s = static_cast<Session*>(local_connection->get_priv());
    if (s)
      s->put();
    else
      hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
  local_connection = hb_front_server_messenger->get_loopback_connection().get();
  if (hb_front_addr.is_blank_ip()) {
    // heartbeat-front follows the public (client) network
    int port = hb_front_addr.get_port();
    hb_front_addr = client_messenger->get_myaddr();
    hb_front_addr.set_port(port);
    hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
    dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
  } else {
    Session *s = static_cast<Session*>(local_connection->get_priv());
    if (s)
      s->put();
    else
      hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
  }

  MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
                                 hb_back_addr, hb_front_addr, cluster_addr,
				 CEPH_FEATURES_ALL);
  dout(10) << " client_addr " << client_messenger->get_myaddr()
	   << ", cluster_addr " << cluster_addr
	   << ", hb_back_addr " << hb_back_addr
	   << ", hb_front_addr " << hb_front_addr
	   << dendl;
  _collect_metadata(&mboot->metadata);
  monc->send_mon_message(mboot);
  set_state(STATE_BOOTING);
}
5620
// Populate *pm with this OSD's metadata key/value pairs: data/journal
// paths, all four messenger addresses, the object store type plus its
// backend-specific metadata, and general system info.  Sent to the mon
// as part of MOSDBoot.
void OSD::_collect_metadata(map<string,string> *pm)
{
  // config info
  (*pm)["osd_data"] = dev_path;
  (*pm)["osd_journal"] = journal_path;
  (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
  (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
  (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
  (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());

  // backend
  (*pm)["osd_objectstore"] = store->get_type();
  store->collect_metadata(pm);

  // kernel/arch/distro etc.
  collect_sys_info(pm, cct);

  dout(10) << __func__ << " " << *pm << dendl;
}
5639
5640void OSD::queue_want_up_thru(epoch_t want)
5641{
5642 map_lock.get_read();
5643 epoch_t cur = osdmap->get_up_thru(whoami);
5644 Mutex::Locker l(mon_report_lock);
5645 if (want > up_thru_wanted) {
5646 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
5647 << ", currently " << cur
5648 << dendl;
5649 up_thru_wanted = want;
5650 send_alive();
5651 } else {
5652 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
5653 << ", currently " << cur
5654 << dendl;
5655 }
5656 map_lock.put_read();
5657}
5658
5659void OSD::send_alive()
5660{
5661 assert(mon_report_lock.is_locked());
5662 if (!osdmap->exists(whoami))
5663 return;
5664 epoch_t up_thru = osdmap->get_up_thru(whoami);
5665 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
5666 if (up_thru_wanted > up_thru) {
5667 dout(10) << "send_alive want " << up_thru_wanted << dendl;
5668 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
5669 }
5670}
5671
// Ask the mon for full (not incremental) osdmaps covering [first..last].
// Merges the request with any outstanding one tracked in
// requested_full_first/last: duplicates are dropped and extensions only
// request the epochs not already asked for.
void OSD::request_full_map(epoch_t first, epoch_t last)
{
  dout(10) << __func__ << " " << first << ".." << last
	   << ", previously requested "
	   << requested_full_first << ".." << requested_full_last << dendl;
  assert(osd_lock.is_locked());
  assert(first > 0 && last > 0);
  assert(first <= last);
  assert(first >= requested_full_first);  // we shouldn't ever ask for older maps
  if (requested_full_first == 0) {
    // first request
    requested_full_first = first;
    requested_full_last = last;
  } else if (last <= requested_full_last) {
    // dup
    return;
  } else {
    // additional request
    first = requested_full_last + 1;
    requested_full_last = last;
  }
  MMonGetOSDMap *req = new MMonGetOSDMap;
  req->request_full(first, last);
  monc->send_mon_message(req);
}
5697
// Note that we received full map epoch `e` and advance the outstanding
// request window accordingly.  Maps below the window are stale and
// ignored; once the last requested epoch arrives the window is cleared.
void OSD::got_full_map(epoch_t e)
{
  assert(requested_full_first <= requested_full_last);
  assert(osd_lock.is_locked());
  if (requested_full_first == 0) {
    // no full maps outstanding
    dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
    return;
  }
  if (e < requested_full_first) {
    // stale map from before our request window
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last
	     << ", ignoring" << dendl;
    return;
  }
  if (e >= requested_full_last) {
    // window satisfied; clear it
    dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	     << ".." << requested_full_last << ", resetting" << dendl;
    requested_full_first = requested_full_last = 0;
    return;
  }

  // maps arrive in order; everything up to `e` is now covered
  requested_full_first = e + 1;

  dout(10) << __func__ << " " << e << ", requested " << requested_full_first
	   << ".." << requested_full_last
	   << ", still need more" << dendl;
}
5725
5726void OSD::requeue_failures()
5727{
5728 Mutex::Locker l(heartbeat_lock);
5729 unsigned old_queue = failure_queue.size();
5730 unsigned old_pending = failure_pending.size();
5731 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
5732 failure_pending.begin();
5733 p != failure_pending.end(); ) {
5734 failure_queue[p->first] = p->second.first;
5735 failure_pending.erase(p++);
5736 }
5737 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
5738 << failure_queue.size() << dendl;
5739}
5740
// Drain failure_queue, sending an MOSDFailure to the mon for each OSD not
// already in failure_pending, and moving each entry to failure_pending so
// it can be requeued if the mon session resets.  Caller must hold
// map_lock and mon_report_lock; heartbeat_lock is taken here.
void OSD::send_failures()
{
  assert(map_lock.is_locked());
  assert(mon_report_lock.is_locked());
  Mutex::Locker l(heartbeat_lock);
  utime_t now = ceph_clock_now();
  while (!failure_queue.empty()) {
    int osd = failure_queue.begin()->first;
    entity_inst_t i = osdmap->get_inst(osd);
    if (!failure_pending.count(osd)) {
      // failed_for = seconds since we first noticed the failure
      int failed_for = (int)(double)(now - failure_queue.begin()->second);
      monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
					     osdmap->get_epoch()));
      failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
    }
    failure_queue.erase(osd);
  }
}
5759
5760void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
5761{
5762 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
5763 monc->send_mon_message(m);
5764}
5765
// Build and send an MPGStats message to the mon containing our osd_stat
// and the publishable stats of every queued primary PG.  Non-primary PGs
// are dropped from the queue.  The message tid is remembered in
// outstanding_pg_stats until the mon acks it (handle_pg_stats_ack).
// Caller must hold map_lock (role stability depends on it).
void OSD::send_pg_stats(const utime_t &now)
{
  assert(map_lock.is_locked());
  dout(20) << "send_pg_stats" << dendl;

  osd_stat_t cur_stat = service.get_osd_stat();

  cur_stat.os_perf_stat = store->get_cur_stats();

  pg_stat_queue_lock.Lock();

  if (osd_stat_updated || !pg_stat_queue.empty()) {
    last_pg_stats_sent = now;
    osd_stat_updated = false;

    dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;

    // how long we've had the current osdmap
    utime_t had_for(now);
    had_for -= had_map_since;

    MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);

    uint64_t tid = ++pg_stat_tid;
    m->set_tid(tid);
    m->osd_stat = cur_stat;

    xlist<PG*>::iterator p = pg_stat_queue.begin();
    while (!p.end()) {
      PG *pg = *p;
      ++p;   // advance before possibly removing pg from the list
      if (!pg->is_primary()) {  // we hold map_lock; role is stable.
	pg->stat_queue_item.remove_myself();
	pg->put("pg_stat_queue");
	continue;
      }
      pg->pg_stats_publish_lock.Lock();
      if (pg->pg_stats_publish_valid) {
	m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
	dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
		 << pg->pg_stats_publish.reported_seq << dendl;
      } else {
	dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
		 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
      }
      pg->pg_stats_publish_lock.Unlock();
    }

    // (re)start the ack clock when starting from an idle state
    if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
      last_pg_stats_ack = ceph_clock_now();
    }
    outstanding_pg_stats.insert(tid);
    dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;

    monc->send_mon_message(m);
  }

  pg_stat_queue_lock.Unlock();
}
5824
// Handle the mon's ack of a previous MPGStats: decay the ack-timeout
// backoff, wake any flush_pg_stats() waiter, and drop PGs from the stat
// queue whose acked (seq, epoch) matches what we last published.  PGs
// whose published stats have advanced since the ack stay queued.
void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
{
  dout(10) << "handle_pg_stats_ack " << dendl;

  if (!require_mon_peer(ack)) {
    ack->put();
    return;
  }

  // NOTE: we may get replies from a previous mon even while
  // outstanding_pg_stats is empty if reconnecting races with replies
  // in flight.

  pg_stat_queue_lock.Lock();

  last_pg_stats_ack = ceph_clock_now();

  // decay timeout slowly (analogous to TCP)
  stats_ack_timeout =
    MAX(cct->_conf->osd_mon_ack_timeout,
	stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
  dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;

  // unblock flush_pg_stats() waiters up to this tid
  if (ack->get_tid() > pg_stat_tid_flushed) {
    pg_stat_tid_flushed = ack->get_tid();
    pg_stat_queue_cond.Signal();
  }

  xlist<PG*>::iterator p = pg_stat_queue.begin();
  while (!p.end()) {
    PG *pg = *p;
    PGRef _pg(pg);   // keep pg alive while we may drop its queue ref
    ++p;

    auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
    if (acked != ack->pg_stat.end()) {
      pg->pg_stats_publish_lock.Lock();
      if (acked->second.first == pg->pg_stats_publish.reported_seq &&
	  acked->second.second == pg->pg_stats_publish.reported_epoch) {
	dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
		 << ":" << pg->pg_stats_publish.reported_seq << dendl;
	pg->stat_queue_item.remove_myself();
	pg->put("pg_stat_queue");
      } else {
	dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
		 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
		 << acked->second << dendl;
      }
      pg->pg_stats_publish_lock.Unlock();
    } else {
      dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
	       << ":" << pg->pg_stats_publish.reported_seq << dendl;
    }
  }

  outstanding_pg_stats.erase(ack->get_tid());
  dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;

  pg_stat_queue_lock.Unlock();

  ack->put();
}
5887
// Synchronously push pg stats to the mon and block until the mon has acked
// them.  Caller holds osd_lock; it is dropped for the duration (to respect
// lock ordering with map_lock/mon_report_lock) and reacquired before
// returning.
void OSD::flush_pg_stats()
{
  dout(10) << "flush_pg_stats" << dendl;
  osd_lock.Unlock();
  utime_t now = ceph_clock_now();
  map_lock.get_read();
  mon_report_lock.Lock();
  send_pg_stats(now);
  mon_report_lock.Unlock();
  map_lock.put_read();


  // wait until the tid we just sent has been acked (see
  // handle_pg_stats_ack, which advances pg_stat_tid_flushed and signals)
  pg_stat_queue_lock.Lock();
  uint64_t tid = pg_stat_tid;
  dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
  while (tid > pg_stat_tid_flushed)
    pg_stat_queue_cond.Wait(pg_stat_queue_lock);
  dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
  pg_stat_queue_lock.Unlock();

  osd_lock.Lock();
}
5910
5911void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
5912{
5913 const auto& monmap = monc->monmap;
5914 // send beacon to mon even if we are just connected, and the monmap is not
5915 // initialized yet by then.
5916 if (monmap.epoch > 0 &&
5917 monmap.get_required_features().contains_all(
5918 ceph::features::mon::FEATURE_LUMINOUS)) {
5919 dout(20) << __func__ << " sending" << dendl;
5920 last_sent_beacon = now;
5921 MOSDBeacon* beacon = nullptr;
5922 {
5923 Mutex::Locker l{min_last_epoch_clean_lock};
5924 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
5925 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
5926 }
5927 monc->send_mon_message(beacon);
5928 } else {
5929 dout(20) << __func__ << " not sending" << dendl;
5930 }
5931}
5932
5933void OSD::handle_command(MMonCommand *m)
5934{
5935 if (!require_mon_peer(m)) {
5936 m->put();
5937 return;
5938 }
5939
5940 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
5941 command_wq.queue(c);
5942 m->put();
5943}
5944
5945void OSD::handle_command(MCommand *m)
5946{
5947 ConnectionRef con = m->get_connection();
5948 Session *session = static_cast<Session *>(con->get_priv());
5949 if (!session) {
5950 con->send_message(new MCommandReply(m, -EPERM));
5951 m->put();
5952 return;
5953 }
5954
5955 OSDCap& caps = session->caps;
5956 session->put();
5957
5958 if (!caps.allow_all() || m->get_source().is_mon()) {
5959 con->send_message(new MCommandReply(m, -EPERM));
5960 m->put();
5961 return;
5962 }
5963
5964 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
5965 command_wq.queue(c);
5966
5967 m->put();
5968}
5969
// Table of commands this OSD advertises via "get_command_descriptions".
// Entries are serialized to JSON by do_command() so cli/rest frontends
// can validate and route commands.
struct OSDCommand {
  string cmdstring;    // command signature in ceph argparse format
  string helpstring;   // human-readable help text
  string module;       // owning module (always "osd" here)
  string perm;         // required capability bits, e.g. "r" or "rw"
  string availability; // which frontends may issue it: "cli", "rest", or both
} osd_commands[] = {

// expands each COMMAND(...) below into one OSDCommand initializer
#define COMMAND(parsesig, helptext, module, perm, availability) \
  {parsesig, helptext, module, perm, availability},

// yes, these are really pg commands, but there's a limit to how
// much work it's worth. The OSD returns all of them. Make this
// form (pg <pgid> <cmd>) valid only for the cli.
// Rest uses "tell <pgid> <cmd>"

COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=query", \
	"show details of a specific pg", "osd", "r", "cli")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw", "cli")
COMMAND("pg " \
	"name=pgid,type=CephPgid " \
	"name=cmd,type=CephChoices,strings=list_missing " \
	"name=offset,type=CephString,req=false",
	"list missing objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r", "cli")

// new form: tell <pgid> <cmd> for both cli and rest

COMMAND("query",
	"show details of a specific pg", "osd", "r", "cli,rest")
COMMAND("mark_unfound_lost " \
	"name=mulcmd,type=CephChoices,strings=revert|delete", \
	"mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
	"osd", "rw", "cli,rest")
COMMAND("list_missing " \
	"name=offset,type=CephString,req=false",
	"list missing objects on this pg, perhaps starting at an offset given in JSON",
	"osd", "r", "cli,rest")

// tell <osd.n> commands. Validation of osd.n must be special-cased in client
COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
COMMAND("injectargs " \
	"name=injected_args,type=CephString,n=N",
	"inject configuration arguments into running OSD",
	"osd", "rw", "cli,rest")
COMMAND("cluster_log " \
	"name=level,type=CephChoices,strings=error,warning,info,debug " \
	"name=message,type=CephString,n=N",
	"log a message to the cluster log",
	"osd", "rw", "cli,rest")
COMMAND("bench " \
	"name=count,type=CephInt,req=false " \
	"name=size,type=CephInt,req=false " \
	"name=object_size,type=CephInt,req=false " \
	"name=object_num,type=CephInt,req=false ", \
	"OSD benchmark: write <count> <size>-byte objects, " \
	"(default 1G size 4MB). Results in log.",
	"osd", "rw", "cli,rest")
COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
COMMAND("heap " \
	"name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
	"show heap usage info (available only if compiled with tcmalloc)", \
	"osd", "rw", "cli,rest")
COMMAND("debug dump_missing " \
	"name=filename,type=CephFilepath",
	"dump missing objects to a named file", "osd", "r", "cli,rest")
COMMAND("debug kick_recovery_wq " \
	"name=delay,type=CephInt,range=0",
	"set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
COMMAND("cpu_profiler " \
	"name=arg,type=CephChoices,strings=status|flush",
	"run cpu profiling on daemon", "osd", "rw", "cli,rest")
COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
	"osd", "r", "cli,rest")
COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
	"osd", "rw", "cli,rest")
};
6055
// Execute a single admin/tell command that arrived via MCommand or
// MMonCommand (run on the command workqueue with osd_lock held).
// On completion an MCommandReply carrying r/rs/odata is sent back on
// `con` (if any); some pg commands reply asynchronously and return early.
// Uses the classic goto-out pattern so every branch funnels through the
// single reply point at `out:`.
void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
{
  int r = 0;
  stringstream ss, ds;  // ss: human-readable status; ds: command output data
  string rs;
  bufferlist odata;

  dout(20) << "do_command tid " << tid << " " << cmd << dendl;

  map<string, cmd_vartype> cmdmap;
  string prefix;
  string format;
  string pgidstr;
  boost::scoped_ptr<Formatter> f;

  if (cmd.empty()) {
    ss << "no command given";
    goto out;
  }

  if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
    r = -EINVAL;
    goto out;
  }

  cmd_getval(cct, cmdmap, "prefix", prefix);

  // dump the osd_commands[] table as JSON for the client-side parser
  if (prefix == "get_command_descriptions") {
    int cmdnum = 0;
    JSONFormatter *f = new JSONFormatter();
    f->open_object_section("command_descriptions");
    for (OSDCommand *cp = osd_commands;
	 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {

      ostringstream secname;
      secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
      dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
			   cp->module, cp->perm, cp->availability, 0);
      cmdnum++;
    }
    f->close_section();	// command_descriptions

    f->flush(ds);
    delete f;
    goto out;
  }

  cmd_getval(cct, cmdmap, "format", format);
  f.reset(Formatter::create(format));

  if (prefix == "version") {
    if (f) {
      f->open_object_section("version");
      f->dump_string("version", pretty_version_to_str());
      f->close_section();
      f->flush(ds);
    } else {
      ds << pretty_version_to_str();
    }
    goto out;
  }
  else if (prefix == "injectargs") {
    vector<string> argsvec;
    cmd_getval(cct, cmdmap, "injected_args", argsvec);

    if (argsvec.empty()) {
      r = -EINVAL;
      ss << "ignoring empty injectargs";
      goto out;
    }
    // re-join the argument words into a single space-separated string
    string args = argsvec.front();
    for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
      args += " " + *a;
    // drop osd_lock: applying config changes may invoke observers that
    // take other locks
    osd_lock.Unlock();
    r = cct->_conf->injectargs(args, &ss);
    osd_lock.Lock();
  }
  else if (prefix == "cluster_log") {
    vector<string> msg;
    cmd_getval(cct, cmdmap, "message", msg);
    if (msg.empty()) {
      r = -EINVAL;
      ss << "ignoring empty log message";
      goto out;
    }
    string message = msg.front();
    for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
      message += " " + *a;
    string lvl;
    cmd_getval(cct, cmdmap, "level", lvl);
    clog_type level = string_to_clog_type(lvl);
    if (level < 0) {
      r = -EINVAL;
      ss << "unknown level '" << lvl << "'";
      goto out;
    }
    clog->do_log(level, message);
  }

  // either 'pg <pgid> <command>' or
  // 'tell <pgid>' (which comes in without any of that prefix)?

  else if (prefix == "pg" ||
	   prefix == "query" ||
	   prefix == "mark_unfound_lost" ||
	   prefix == "list_missing"
    ) {
    pg_t pgid;

    if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
      ss << "no pgid specified";
      r = -EINVAL;
    } else if (!pgid.parse(pgidstr.c_str())) {
      ss << "couldn't parse pgid '" << pgidstr << "'";
      r = -EINVAL;
    } else {
      spg_t pcand;
      PG *pg = nullptr;
      if (osdmap->get_primary_shard(pgid, &pcand) &&
	  (pg = _lookup_lock_pg(pcand))) {
	if (pg->is_primary()) {
	  // simulate pg <pgid> cmd= for pg->do-command
	  if (prefix != "pg")
	    cmd_putval(cct, cmdmap, "cmd", prefix);
	  r = pg->do_command(cmdmap, ss, data, odata, con, tid);
	  if (r == -EAGAIN) {
	    pg->unlock();
	    // don't reply, pg will do so async
	    return;
	  }
	} else {
	  ss << "not primary for pgid " << pgid;

	  // send them the latest diff to ensure they realize the mapping
	  // has changed.
	  service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);

	  // do not reply; they will get newer maps and realize they
	  // need to resend.
	  pg->unlock();
	  return;
	}
	pg->unlock();
      } else {
	ss << "i don't have pgid " << pgid;
	r = -ENOENT;
      }
    }
  }

  // synchronous write benchmark against the local object store
  else if (prefix == "bench") {
    int64_t count;
    int64_t bsize;
    int64_t osize, onum;
    // default count 1G, size 4MB
    cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
    cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
    cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
    cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);

    // private sequencer so bench I/O is ordered independently of pg work
    ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
					ObjectStore::Sequencer>("bench"));

    uint32_t duration = cct->_conf->osd_bench_duration;

    if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
      // let us limit the block size because the next checks rely on it
      // having a sane value. If we allow any block size to be set things
      // can still go sideways.
      ss << "block 'size' values are capped at "
	 << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
	 << " a higher value, please adjust 'osd_bench_max_block_size'";
      r = -EINVAL;
      goto out;
    } else if (bsize < (int64_t) (1 << 20)) {
      // entering the realm of small block sizes.
      // limit the count to a sane value, assuming a configurable amount of
      // IOPS and duration, so that the OSD doesn't get hung up on this,
      // preventing timeouts from going off
      int64_t max_count =
	bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << prettybyte_t(bsize) << ", assuming "
	   << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
	   << " value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    } else {
      // 1MB block sizes are big enough so that we get more stuff done.
      // However, to avoid the osd from getting hung on this and having
      // timers being triggered, we are going to limit the count assuming
      // a configurable throughput and duration.
      // NOTE: max_count is the total amount of bytes that we believe we
      // will be able to write during 'duration' for the given
      // throughput. The block size hardly impacts this unless it's
      // way too big. Given we already check how big the block size
      // is, it's safe to assume everything will check out.
      int64_t max_count =
	cct->_conf->osd_bench_large_size_max_throughput * duration;
      if (count > max_count) {
	ss << "'count' values greater than " << max_count
	   << " for a block size of " << prettybyte_t(bsize) << ", assuming "
	   << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
	   << " for " << duration << " seconds,"
	   << " can cause ill effects on osd. "
	   << " Please adjust 'osd_bench_large_size_max_throughput'"
	   << " with a higher value if you wish to use a higher 'count'.";
	r = -EINVAL;
	goto out;
      }
    }

    if (osize && bsize > osize)
      bsize = osize;

    dout(1) << " bench count " << count
	    << " bsize " << prettybyte_t(bsize) << dendl;

    ObjectStore::Transaction cleanupt;

    // pre-create the fixed object set when object_size/object_num given
    if (osize && onum) {
      bufferlist bl;
      bufferptr bp(osize);
      bp.zero();
      bl.push_back(std::move(bp));
      bl.rebuild_page_aligned();
      for (int i=0; i<onum; ++i) {
	char nm[30];
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
	object_t oid(nm);
	hobject_t soid(sobject_t(oid, 0));
	ObjectStore::Transaction t;
	t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
	store->queue_transaction(osr.get(), std::move(t), NULL);
	cleanupt.remove(coll_t(), ghobject_t(soid));
      }
    }

    bufferlist bl;
    bufferptr bp(bsize);
    bp.zero();
    bl.push_back(std::move(bp));
    bl.rebuild_page_aligned();

    // wait for the pre-created objects to commit before timing starts
    {
      C_SaferCond waiter;
      if (!osr->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    utime_t start = ceph_clock_now();
    for (int64_t pos = 0; pos < count; pos += bsize) {
      char nm[30];
      unsigned offset = 0;
      if (onum && osize) {
	// random object + random block-aligned offset within it
	snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
	offset = rand() % (osize / bsize) * bsize;
      } else {
	snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
      }
      object_t oid(nm);
      hobject_t soid(sobject_t(oid, 0));
      ObjectStore::Transaction t;
      t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
      store->queue_transaction(osr.get(), std::move(t), NULL);
      if (!onum || !osize)
	cleanupt.remove(coll_t::meta(), ghobject_t(soid));
    }

    // wait for all benchmark writes to commit before stopping the clock
    {
      C_SaferCond waiter;
      if (!osr->flush_commit(&waiter)) {
	waiter.wait();
      }
    }
    utime_t end = ceph_clock_now();

    // clean up
    store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
    {
      C_SaferCond waiter;
      if (!osr->flush_commit(&waiter)) {
	waiter.wait();
      }
    }

    uint64_t rate = (double)count / (end - start);
    if (f) {
      f->open_object_section("osd_bench_results");
      f->dump_int("bytes_written", count);
      f->dump_int("blocksize", bsize);
      f->dump_unsigned("bytes_per_sec", rate);
      f->close_section();
      f->flush(ss);
    } else {
      ss << "bench: wrote " << prettybyte_t(count)
	 << " in blocks of " << prettybyte_t(bsize) << " in "
	 << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
    }
  }

  else if (prefix == "flush_pg_stats") {
    flush_pg_stats();
  }

  else if (prefix == "heap") {
    r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
  }

  else if (prefix == "debug dump_missing") {
    string file_name;
    cmd_getval(cct, cmdmap, "filename", file_name);
    std::ofstream fout(file_name.c_str());
    if (!fout.is_open()) {
      ss << "failed to open file '" << file_name << "'";
      r = -EINVAL;
      goto out;
    }

    fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
    // walk every pg we host and dump its missing set
    RWLock::RLocker l(pg_map_lock);
    for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
	 pg_map_e != pg_map.end(); ++pg_map_e) {
      PG *pg = pg_map_e->second;
      pg->lock();

      fout << *pg << std::endl;
      std::map<hobject_t, pg_missing_item>::const_iterator mend =
	pg->pg_log.get_missing().get_items().end();
      std::map<hobject_t, pg_missing_item>::const_iterator mi =
	pg->pg_log.get_missing().get_items().begin();
      for (; mi != mend; ++mi) {
	fout << mi->first << " -> " << mi->second << std::endl;
	if (!pg->missing_loc.needs_recovery(mi->first))
	  continue;
	if (pg->missing_loc.is_unfound(mi->first))
	  fout << " unfound ";
	const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
	if (mls.empty())
	  continue;
	fout << "missing_loc: " << mls << std::endl;
      }
      pg->unlock();
      fout << std::endl;
    }

    fout.close();
  }
  else if (prefix == "debug kick_recovery_wq") {
    int64_t delay;
    cmd_getval(cct, cmdmap, "delay", delay);
    ostringstream oss;
    oss << delay;
    r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
    if (r != 0) {
      ss << "kick_recovery_wq: error setting "
	 << "osd_recovery_delay_start to '" << delay << "': error "
	 << r;
      goto out;
    }
    cct->_conf->apply_changes(NULL);
    ss << "kicking recovery queue. set osd_recovery_delay_start "
       << "to " << cct->_conf->osd_recovery_delay_start;
  }

  else if (prefix == "cpu_profiler") {
    string arg;
    cmd_getval(cct, cmdmap, "arg", arg);
    vector<string> argvec;
    get_str_vec(arg, argvec);
    cpu_profiler_handle_command(argvec, ds);
  }

  else if (prefix == "dump_pg_recovery_stats") {
    stringstream s;
    if (f) {
      pg_recovery_stats.dump_formatted(f.get());
      f->flush(ds);
    } else {
      pg_recovery_stats.dump(s);
      ds << "dump pg recovery stats: " << s.str();
    }
  }

  else if (prefix == "reset_pg_recovery_stats") {
    ss << "reset pg recovery stats";
    pg_recovery_stats.reset();
  }

  else {
    ss << "unrecognized command! " << cmd;
    r = -EINVAL;
  }

 out:
  // single exit: stamp the reply with status, message, and output data
  rs = ss.str();
  odata.append(ds);
  dout(0) << "do_command r=" << r << " " << rs << dendl;
  clog->info() << rs;
  if (con) {
    MCommandReply *reply = new MCommandReply(r, rs);
    reply->set_tid(tid);
    reply->set_data(odata);
    con->send_message(reply);
  }
}
6468
6469bool OSD::heartbeat_dispatch(Message *m)
6470{
6471 dout(30) << "heartbeat_dispatch " << m << dendl;
6472 switch (m->get_type()) {
6473
6474 case CEPH_MSG_PING:
6475 dout(10) << "ping from " << m->get_source_inst() << dendl;
6476 m->put();
6477 break;
6478
6479 case MSG_OSD_PING:
6480 handle_osd_ping(static_cast<MOSDPing*>(m));
6481 break;
6482
6483 default:
6484 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
6485 m->put();
6486 }
6487
6488 return true;
6489}
6490
6491bool OSD::ms_dispatch(Message *m)
6492{
6493 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6494 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6495 service.got_stop_ack();
6496 m->put();
6497 return true;
6498 }
6499
6500 // lock!
6501
6502 osd_lock.Lock();
6503 if (is_stopping()) {
6504 osd_lock.Unlock();
6505 m->put();
6506 return true;
6507 }
6508
6509 do_waiters();
6510 _dispatch(m);
6511
6512 osd_lock.Unlock();
6513
6514 return true;
6515}
6516
6517void OSD::maybe_share_map(
6518 Session *session,
6519 OpRequestRef op,
6520 OSDMapRef osdmap)
6521{
6522 if (!op->check_send_map) {
6523 return;
6524 }
6525 epoch_t last_sent_epoch = 0;
6526
6527 session->sent_epoch_lock.lock();
6528 last_sent_epoch = session->last_sent_epoch;
6529 session->sent_epoch_lock.unlock();
6530
6531 const Message *m = op->get_req();
6532 service.share_map(
6533 m->get_source(),
6534 m->get_connection().get(),
6535 op->sent_epoch,
6536 osdmap,
6537 session ? &last_sent_epoch : NULL);
6538
6539 session->sent_epoch_lock.lock();
6540 if (session->last_sent_epoch < last_sent_epoch) {
6541 session->last_sent_epoch = last_sent_epoch;
6542 }
6543 session->sent_epoch_lock.unlock();
6544
6545 op->check_send_map = false;
6546}
6547
// Drain ops parked on this session (legacy clients whose MOSDOps still
// need a pg_t -> spg_t mapping) now that we have an osdmap to map them
// with. Caller must hold session->session_dispatch_lock.
void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
{
  assert(session->session_dispatch_lock.is_locked());

  auto i = session->waiting_on_map.begin();
  while (i != session->waiting_on_map.end()) {
    OpRequestRef op = &(*i);
    assert(ms_can_fast_dispatch(op->get_req()));
    const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
      op->get_req());
    if (m->get_min_epoch() > osdmap->get_epoch()) {
      // this op needs a newer map than we have; the queue is ordered,
      // so everything behind it must wait too
      break;
    }
    // unlink from the intrusive list before advancing the iterator
    session->waiting_on_map.erase(i++);
    op->put();  // drop the ref taken when the op was queued on the session

    spg_t pgid;
    if (m->get_type() == CEPH_MSG_OSD_OP) {
      pg_t actual_pgid = osdmap->raw_pg_to_pg(
	static_cast<const MOSDOp*>(m)->get_pg());
      if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
	// couldn't resolve a primary shard for this pg in this map; skip
	continue;
      }
    } else {
      // all other fast-dispatch messages carry an explicit spg_t
      pgid = m->get_spg();
    }
    enqueue_op(pgid, op, m->get_map_epoch());
  }

  // keep the session registered only while it still has blocked ops
  if (session->waiting_on_map.empty()) {
    clear_session_waiting_on_map(session);
  } else {
    register_session_waiting_on_map(session);
  }
}
6583
// Fast-dispatch entry point: wrap the message in an OpRequest and queue
// it toward the right PG shard without taking osd_lock.
void OSD::ms_fast_dispatch(Message *m)
{
  FUNCTRACE();
  if (service.is_stopping()) {
    m->put();
    return;
  }
  OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = op->get_reqid();
#endif
    // tracepoint() compiles to a no-op unless WITH_LTTNG is defined, so
    // the reference to reqid outside the #ifdef is safe
    tracepoint(osd, ms_fast_dispatch, reqid.name._type,
	reqid.name._num, reqid.tid, reqid.inc);
  }

  if (m->trace)
    op->osd_trace.init("osd op", &trace_endpoint, &m->trace);

  // note sender epoch, min req'd epoch
  op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
  op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
  assert(op->min_epoch <= op->sent_epoch); // sanity check!

  service.maybe_inject_dispatch_delay();

  if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
      m->get_type() != CEPH_MSG_OSD_OP) {
    // queue it directly
    enqueue_op(
      static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
      op,
      static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
  } else {
    // legacy client, and this is an MOSDOp (the *only* fast dispatch
    // message that didn't have an explicit spg_t); we need to map
    // them to an spg_t while preserving delivery order.
    Session *session = static_cast<Session*>(m->get_connection()->get_priv());
    if (session) {
      {
	Mutex::Locker l(session->session_dispatch_lock);
	op->get();  // extra ref held by the session's waiting list
	session->waiting_on_map.push_back(*op);
	OSDMapRef nextmap = service.get_nextmap_reserved();
	dispatch_session_waiting(session, nextmap);
	service.release_map(nextmap);
      }
      session->put();  // drop the ref returned by get_priv()
    }
  }
  OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
}
6636
6637void OSD::ms_fast_preprocess(Message *m)
6638{
6639 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
6640 if (m->get_type() == CEPH_MSG_OSD_MAP) {
6641 MOSDMap *mm = static_cast<MOSDMap*>(m);
6642 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
6643 if (s) {
6644 s->received_map_lock.lock();
6645 s->received_map_epoch = mm->get_last();
6646 s->received_map_lock.unlock();
6647 s->put();
6648 }
6649 }
6650 }
6651}
6652
6653bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
6654{
6655 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
6656
6657 if (dest_type == CEPH_ENTITY_TYPE_MON)
6658 return true;
6659
6660 if (force_new) {
6661 /* the MonClient checks keys every tick(), so we should just wait for that cycle
6662 to get through */
6663 if (monc->wait_auth_rotating(10) < 0) {
6664 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
6665 return false;
6666 }
6667 }
6668
6669 *authorizer = monc->build_authorizer(dest_type);
6670 return *authorizer != NULL;
6671}
6672
6673
// Verify an incoming connection's authorizer, and on success attach (or
// create) a Session on the connection populated with the peer's entity
// name, auid, and parsed caps. Returns true to indicate the result in
// `isvalid` is authoritative.
bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
			       int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
			       bool& isvalid, CryptoKey& session_key)
{
  // pick the handler registry: cluster-internal peers vs. clients
  AuthAuthorizeHandler *authorize_handler = 0;
  switch (peer_type) {
  case CEPH_ENTITY_TYPE_MDS:
    /*
     * note: mds is technically a client from our perspective, but
     * this makes the 'cluster' consistent w/ monitor's usage.
     */
  case CEPH_ENTITY_TYPE_OSD:
  case CEPH_ENTITY_TYPE_MGR:
    authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
    break;
  default:
    authorize_handler = authorize_handler_service_registry->get_handler(protocol);
  }
  if (!authorize_handler) {
    dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
    isvalid = false;
    return true;
  }

  AuthCapsInfo caps_info;
  EntityName name;
  uint64_t global_id;
  uint64_t auid = CEPH_AUTH_UID_DEFAULT;

  isvalid = authorize_handler->verify_authorizer(
    cct, monc->rotating_secrets.get(),
    authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
    &auid);

  if (isvalid) {
    // attach a Session to the connection, creating one on first contact
    Session *s = static_cast<Session *>(con->get_priv());
    if (!s) {
      s = new Session(cct);
      con->set_priv(s->get());
      s->con = con;
      dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
    }

    s->entity_name = name;
    if (caps_info.allow_all)
      s->caps.set_allow_all();
    s->auid = auid;

    if (caps_info.caps.length() > 0) {
      bufferlist::iterator p = caps_info.caps.begin();
      string str;
      try {
	::decode(str, p);
      }
      catch (buffer::error& e) {
	// deliberate swallow: on decode failure str stays empty and the
	// parse below fails, which is logged
      }
      bool success = s->caps.parse(str);
      if (success)
	dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
      else
	dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
    }

    s->put();  // drop the ref from get_priv()/creation; con keeps its own
  }
  return true;
}
6741
6742void OSD::do_waiters()
6743{
6744 assert(osd_lock.is_locked());
6745
6746 dout(10) << "do_waiters -- start" << dendl;
6747 while (!finished.empty()) {
6748 OpRequestRef next = finished.front();
6749 finished.pop_front();
6750 dispatch_op(next);
6751 }
6752 dout(10) << "do_waiters -- finish" << dendl;
6753}
6754
6755void OSD::dispatch_op(OpRequestRef op)
6756{
6757 switch (op->get_req()->get_type()) {
6758
6759 case MSG_OSD_PG_CREATE:
6760 handle_pg_create(op);
6761 break;
6762 case MSG_OSD_PG_NOTIFY:
6763 handle_pg_notify(op);
6764 break;
6765 case MSG_OSD_PG_QUERY:
6766 handle_pg_query(op);
6767 break;
6768 case MSG_OSD_PG_LOG:
6769 handle_pg_log(op);
6770 break;
6771 case MSG_OSD_PG_REMOVE:
6772 handle_pg_remove(op);
6773 break;
6774 case MSG_OSD_PG_INFO:
6775 handle_pg_info(op);
6776 break;
6777 case MSG_OSD_PG_TRIM:
6778 handle_pg_trim(op);
6779 break;
6780 case MSG_OSD_BACKFILL_RESERVE:
6781 handle_pg_backfill_reserve(op);
6782 break;
6783 case MSG_OSD_RECOVERY_RESERVE:
6784 handle_pg_recovery_reserve(op);
6785 break;
6786 }
6787}
6788
// Slow-path message dispatch for types that are not fast-dispatchable.
// Called from ms_dispatch() with osd_lock held; consumes the message
// (directly, via a handler, or by wrapping it in an OpRequest).
void OSD::_dispatch(Message *m)
{
  assert(osd_lock.is_locked());
  dout(20) << "_dispatch " << m << " " << *m << dendl;

  switch (m->get_type()) {

  // -- don't need lock --
  case CEPH_MSG_PING:
    dout(10) << "ping from " << m->get_source() << dendl;
    m->put();
    break;

  // -- don't need OSDMap --

  // map and replication
  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

  // osd
  case MSG_PGSTATSACK:
    handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
    break;

  case MSG_MON_COMMAND:
    handle_command(static_cast<MMonCommand*>(m));
    break;
  case MSG_COMMAND:
    handle_command(static_cast<MCommand*>(m));
    break;

  case MSG_OSD_SCRUB:
    handle_scrub(static_cast<MOSDScrub*>(m));
    break;

  // -- need OSDMap --

  // all pg peering/management messages share this path: wrap in an
  // OpRequest, defer if we have no map yet, otherwise dispatch
  case MSG_OSD_PG_CREATE:
  case MSG_OSD_PG_NOTIFY:
  case MSG_OSD_PG_QUERY:
  case MSG_OSD_PG_LOG:
  case MSG_OSD_PG_REMOVE:
  case MSG_OSD_PG_INFO:
  case MSG_OSD_PG_TRIM:
  case MSG_OSD_BACKFILL_RESERVE:
  case MSG_OSD_RECOVERY_RESERVE:
    {
      OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
      if (m->trace)
	op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
      // no map? starting up?
      if (!osdmap) {
	// park the op; it is replayed via do_waiters() once a map arrives
	dout(7) << "no OSDMap, not booted" << dendl;
	logger->inc(l_osd_waiting_for_map);
	waiting_for_osdmap.push_back(op);
	op->mark_delayed("no osdmap");
	break;
      }

      // need OSDMap
      dispatch_op(op);
    }
  }
}
6854
6855void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
6856{
6857 pg->lock();
6858 if (pg->is_primary()) {
6859 pg->unreg_next_scrub();
6860 pg->scrubber.must_scrub = true;
6861 pg->scrubber.must_deep_scrub = m->deep || m->repair;
6862 pg->scrubber.must_repair = m->repair;
6863 pg->reg_next_scrub();
6864 dout(10) << "marking " << *pg << " for scrub" << dendl;
6865 }
6866 pg->unlock();
6867}
6868
6869void OSD::handle_scrub(MOSDScrub *m)
6870{
6871 dout(10) << "handle_scrub " << *m << dendl;
6872 if (!require_mon_or_mgr_peer(m)) {
6873 m->put();
6874 return;
6875 }
6876 if (m->fsid != monc->get_fsid()) {
6877 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
6878 m->put();
6879 return;
6880 }
6881
6882 RWLock::RLocker l(pg_map_lock);
6883 if (m->scrub_pgs.empty()) {
6884 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
6885 p != pg_map.end();
6886 ++p)
6887 handle_pg_scrub(m, p->second);
6888 } else {
6889 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
6890 p != m->scrub_pgs.end();
6891 ++p) {
6892 spg_t pcand;
6893 if (osdmap->get_primary_shard(*p, &pcand)) {
6894 auto pg_map_entry = pg_map.find(pcand);
6895 if (pg_map_entry != pg_map.end()) {
6896 handle_pg_scrub(m, pg_map_entry->second);
6897 }
6898 }
6899 }
6900 }
6901
6902 m->put();
6903}
6904
6905bool OSD::scrub_random_backoff()
6906{
6907 bool coin_flip = (rand() / (double)RAND_MAX >=
6908 cct->_conf->osd_scrub_backoff_ratio);
6909 if (!coin_flip) {
6910 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
6911 return true;
6912 }
6913 return false;
6914}
6915
6916OSDService::ScrubJob::ScrubJob(CephContext* cct,
6917 const spg_t& pg, const utime_t& timestamp,
6918 double pool_scrub_min_interval,
6919 double pool_scrub_max_interval, bool must)
6920 : cct(cct),
6921 pgid(pg),
6922 sched_time(timestamp),
6923 deadline(timestamp)
6924{
6925 // if not explicitly requested, postpone the scrub with a random delay
6926 if (!must) {
6927 double scrub_min_interval = pool_scrub_min_interval > 0 ?
6928 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
6929 double scrub_max_interval = pool_scrub_max_interval > 0 ?
6930 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
6931
6932 sched_time += scrub_min_interval;
6933 double r = rand() / (double)RAND_MAX;
6934 sched_time +=
6935 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
6936 deadline += scrub_max_interval;
6937 }
6938}
6939
6940bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
6941 if (sched_time < rhs.sched_time)
6942 return true;
6943 if (sched_time > rhs.sched_time)
6944 return false;
6945 return pgid < rhs.pgid;
6946}
6947
6948bool OSD::scrub_time_permit(utime_t now)
6949{
6950 struct tm bdt;
6951 time_t tt = now.sec();
6952 localtime_r(&tt, &bdt);
6953 bool time_permit = false;
6954 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
6955 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
6956 time_permit = true;
6957 }
6958 } else {
6959 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
6960 time_permit = true;
6961 }
6962 }
6963 if (!time_permit) {
6964 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
6965 << " - " << cct->_conf->osd_scrub_end_hour
6966 << " now " << bdt.tm_hour << " = no" << dendl;
6967 } else {
6968 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
6969 << " - " << cct->_conf->osd_scrub_end_hour
6970 << " now " << bdt.tm_hour << " = yes" << dendl;
6971 }
6972 return time_permit;
6973}
6974
6975bool OSD::scrub_load_below_threshold()
6976{
6977 double loadavgs[3];
6978 if (getloadavg(loadavgs, 3) != 3) {
6979 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
6980 return false;
6981 }
6982
6983 // allow scrub if below configured threshold
6984 if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
6985 dout(20) << __func__ << " loadavg " << loadavgs[0]
6986 << " < max " << cct->_conf->osd_scrub_load_threshold
6987 << " = yes" << dendl;
6988 return true;
6989 }
6990
6991 // allow scrub if below daily avg and currently decreasing
6992 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
6993 dout(20) << __func__ << " loadavg " << loadavgs[0]
6994 << " < daily_loadavg " << daily_loadavg
6995 << " and < 15m avg " << loadavgs[2]
6996 << " = yes" << dendl;
6997 return true;
6998 }
6999
7000 dout(20) << __func__ << " loadavg " << loadavgs[0]
7001 << " >= max " << cct->_conf->osd_scrub_load_threshold
7002 << " and ( >= daily_loadavg " << daily_loadavg
7003 << " or >= 15m avg " << loadavgs[2]
7004 << ") = no" << dendl;
7005 return false;
7006}
7007
7008void OSD::sched_scrub()
7009{
7010 // if not permitted, fail fast
7011 if (!service.can_inc_scrubs_pending()) {
7012 return;
7013 }
7014
7015 utime_t now = ceph_clock_now();
7016 bool time_permit = scrub_time_permit(now);
7017 bool load_is_low = scrub_load_below_threshold();
7018 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7019
7020 OSDService::ScrubJob scrub;
7021 if (service.first_scrub_stamp(&scrub)) {
7022 do {
7023 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7024
7025 if (scrub.sched_time > now) {
7026 // save ourselves some effort
7027 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7028 << " > " << now << dendl;
7029 break;
7030 }
7031
7032 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7033 dout(10) << __func__ << "not scheduling scrub of " << scrub.pgid << " due to active recovery ops" << dendl;
7034 break;
7035 }
7036
7037 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7038 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7039 << (!time_permit ? "time not permit" : "high load") << dendl;
7040 continue;
7041 }
7042
7043 PG *pg = _lookup_lock_pg(scrub.pgid);
7044 if (!pg)
7045 continue;
7046 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7047 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7048 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7049 (load_is_low ? ", load_is_low" : " deadline < now"))
7050 << dendl;
7051 if (pg->sched_scrub()) {
7052 pg->unlock();
7053 break;
7054 }
7055 }
7056 pg->unlock();
7057 } while (service.next_scrub_stamp(scrub, &scrub));
7058 }
7059 dout(20) << "sched_scrub done" << dendl;
7060}
7061
7062
7063
7064// =====================================================
7065// MAP
7066
7067void OSD::wait_for_new_map(OpRequestRef op)
7068{
7069 // ask?
7070 if (waiting_for_osdmap.empty()) {
7071 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7072 }
7073
7074 logger->inc(l_osd_waiting_for_map);
7075 waiting_for_osdmap.push_back(op);
7076 op->mark_delayed("wait for new map");
7077}
7078
7079
7080/** update_map
7081 * assimilate new OSDMap(s). scan pgs, etc.
7082 */
7083
7084void OSD::note_down_osd(int peer)
7085{
7086 assert(osd_lock.is_locked());
7087 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7088
7089 heartbeat_lock.Lock();
7090 failure_queue.erase(peer);
7091 failure_pending.erase(peer);
7092 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7093 if (p != heartbeat_peers.end()) {
7094 p->second.con_back->mark_down();
7095 if (p->second.con_front) {
7096 p->second.con_front->mark_down();
7097 }
7098 heartbeat_peers.erase(p);
7099 }
7100 heartbeat_lock.Unlock();
7101}
7102
// React to osd `peer` coming up in a newly published map: drop any
// cached peer-epoch state older than the current map and flag the
// heartbeat peer set for a refresh.
void OSD::note_up_osd(int peer)
{
  service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
  heartbeat_set_peers_need_update();
}
7108
// Completion fired once newly received osdmap epochs [first,last] have
// been committed to disk; hands control back to
// OSD::_committed_osd_maps and then drops the message ref held for the
// duration of the commit.
struct C_OnMapCommit : public Context {
  OSD *osd;
  epoch_t first, last;   // epoch range covered by this commit
  MOSDMap *msg;          // originating message; ref released in finish()
  C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
    : osd(o), first(f), last(l), msg(m) {}
  void finish(int r) override {
    osd->_committed_osd_maps(first, last, msg);
    msg->put();
  }
};
7120
// Completion fired once newly received osdmaps have been applied by the
// objectstore. Holding `pinned_maps` keeps the OSDMap refs (and thus
// the cached map bufferlists) alive until the data is safely on disk;
// finish() then clears the bl cache pins up through epoch `e`.
struct C_OnMapApply : public Context {
  OSDService *service;
  list<OSDMapRef> pinned_maps;  // keeps maps pinned until applied
  epoch_t e;                    // last epoch covered by this apply
  C_OnMapApply(OSDService *service,
	       const list<OSDMapRef> &pinned_maps,
	       epoch_t e)
    : service(service), pinned_maps(pinned_maps), e(e) {}
  void finish(int r) override {
    service->clear_map_bl_cache_pins(e);
  }
};
7133
7134void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7135{
7136 OSDMapRef osdmap = service.get_osdmap();
7137 if (osdmap->get_epoch() >= epoch)
7138 return;
7139
7140 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
7141 force_request) {
7142 monc->renew_subs();
7143 }
7144}
7145
/**
 * Trim old osdmaps from disk.
 *
 * Removes full and incremental map objects for epochs older than both
 * @p oldest and the lower bound of epochs still pinned in the map
 * cache, advancing superblock.oldest_map as it goes.
 *
 * @param oldest oldest map epoch reported by the sender of the maps
 * @param nreceived number of map epochs in the triggering message
 * @param skip_maps true if we skipped over a gap in map epochs; in that
 *        case we keep deleting past the per-transaction budget so the
 *        stale range doesn't linger (see comment below)
 */
void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
{
  // never trim past what the map cache still references
  epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
  if (min <= superblock.oldest_map)
    return;

  int num = 0;
  ObjectStore::Transaction t;
  for (epoch_t e = superblock.oldest_map; e < min; ++e) {
    dout(20) << " removing old osdmap epoch " << e << dendl;
    t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
    t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
    superblock.oldest_map = e + 1;
    num++;
    // flush a batch once we've both hit the transaction-size target and
    // kept pace with the number of maps just received
    if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
      service.publish_superblock(superblock);
      write_superblock(t);
      int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
      assert(tr == 0);
      num = 0;
      if (!skip_maps) {
	// skip_maps leaves us with a range of old maps if we fail to remove all
	// of them before moving superblock.oldest_map forward to the first map
	// in the incoming MOSDMap msg. so we should continue removing them in
	// this case, even we could do huge series of delete transactions all at
	// once.
	break;
      }
    }
  }
  // flush any final partial batch
  if (num > 0) {
    service.publish_superblock(superblock);
    write_superblock(t);
    store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
  }
  // we should not remove the cached maps
  assert(min <= service.map_cache.cached_key_lower_bound());
}
7184
/**
 * Ingest an MOSDMap message: validate the sender, decode and persist
 * any new full/incremental maps, update the superblock, and queue the
 * whole batch as one objectstore transaction. _committed_osd_maps()
 * (via C_OnMapCommit) finishes the job once the maps are on disk.
 */
void OSD::handle_osd_map(MOSDMap *m)
{
  assert(osd_lock.is_locked());
  // Keep a ref in the list until we get the newly received map written
  // onto disk. This is important because as long as the refs are alive,
  // the OSDMaps will be pinned in the cache and we won't try to read it
  // off of disk. Otherwise these maps will probably not stay in the cache,
  // and reading those OSDMaps before they are actually written can result
  // in a crash.
  list<OSDMapRef> pinned_maps;
  // reject maps from a different cluster
  if (m->fsid != monc->get_fsid()) {
    dout(0) << "handle_osd_map fsid " << m->fsid << " != "
	    << monc->get_fsid() << dendl;
    m->put();
    return;
  }
  if (is_initializing()) {
    dout(0) << "ignoring osdmap until we have initialized" << dendl;
    m->put();
    return;
  }

  // only accept maps from peers entitled to distribute them (mon or osd)
  Session *session = static_cast<Session *>(m->get_connection()->get_priv());
  if (session && !(session->entity_name.is_mon() ||
		   session->entity_name.is_osd())) {
    //not enough perms!
    dout(10) << "got osd map from Session " << session
             << " which we can't take maps from (not a mon or osd)" << dendl;
    m->put();
    session->put();
    return;
  }
  if (session)
    session->put();

  // share with the objecter
  if (!is_preboot())
    service.objecter->handle_osd_map(m);

  epoch_t first = m->get_first();
  epoch_t last = m->get_last();
  dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
	  << superblock.newest_map
	  << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
	  << dendl;

  logger->inc(l_osd_map);
  logger->inc(l_osd_mape, last - first + 1);
  if (first <= superblock.newest_map)
    logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
  if (service.max_oldest_map < m->oldest_map) {
    service.max_oldest_map = m->oldest_map;
    assert(service.max_oldest_map >= superblock.oldest_map);
  }

  // make sure there is something new, here, before we bother flushing
  // the queues and such
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  // missing some?
  bool skip_maps = false;
  if (first > superblock.newest_map + 1) {
    dout(10) << "handle_osd_map message skips epochs "
	     << superblock.newest_map + 1 << ".." << (first-1) << dendl;
    // if the sender still has the epochs we're missing, re-subscribe
    // from where we left off rather than skipping ahead
    if (m->oldest_map <= superblock.newest_map + 1) {
      osdmap_subscribe(superblock.newest_map + 1, false);
      m->put();
      return;
    }
    // always try to get the full range of maps--as many as we can. this
    //  1- is good to have
    //  2- is at present the only way to ensure that we get a *full* map as
    //     the first map!
    if (m->oldest_map < first) {
      osdmap_subscribe(m->oldest_map - 1, true);
      m->put();
      return;
    }
    skip_maps = true;
  }

  ObjectStore::Transaction t;
  uint64_t txn_size = 0;

  // store new maps: queue for disk and put in the osdmap cache
  epoch_t start = MAX(superblock.newest_map + 1, first);
  for (epoch_t e = start; e <= last; e++) {
    // guard against the transaction byte counter wrapping
    if (txn_size >= t.get_num_bytes()) {
      derr << __func__ << " transaction size overflowed" << dendl;
      assert(txn_size < t.get_num_bytes());
    }
    txn_size = t.get_num_bytes();
    map<epoch_t,bufferlist>::iterator p;
    p = m->maps.find(e);
    if (p != m->maps.end()) {
      // full map supplied directly: decode, persist, pin, cache
      dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
      OSDMap *o = new OSDMap;
      bufferlist& bl = p->second;

      o->decode(bl);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
      pin_map_bl(e, bl);
      pinned_maps.push_back(add_map(o));

      got_full_map(e);
      continue;
    }

    p = m->incremental_maps.find(e);
    if (p != m->incremental_maps.end()) {
      // incremental: apply on top of the previous epoch's full map
      dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
      bufferlist& bl = p->second;
      ghobject_t oid = get_inc_osdmap_pobject_name(e);
      t.write(coll_t::meta(), oid, 0, bl.length(), bl);
      pin_map_inc_bl(e, bl);

      OSDMap *o = new OSDMap;
      if (e > 1) {
	bufferlist obl;
	bool got = get_map_bl(e - 1, obl);
	assert(got);
	o->decode(obl);
      }

      OSDMap::Incremental inc;
      bufferlist::iterator p = bl.begin();
      inc.decode(p);
      if (o->apply_incremental(inc) < 0) {
	derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
	assert(0 == "bad fsid");
      }

      bufferlist fbl;
      o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);

      // optionally simulate a crc mismatch for testing
      bool injected_failure = false;
      if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
	  (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
	derr << __func__ << " injecting map crc failure" << dendl;
	injected_failure = true;
      }

      // if our re-encoded full map doesn't match the expected crc,
      // fall back to requesting full maps from this epoch onward
      if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
	dout(2) << "got incremental " << e
		<< " but failed to encode full with correct crc; requesting"
		<< dendl;
	clog->warn() << "failed to encode map e" << e << " with expected crc";
	dout(20) << "my encoded map was:\n";
	fbl.hexdump(*_dout);
	*_dout << dendl;
	delete o;
	request_full_map(e, last);
	last = e - 1;
	break;
      }
      got_full_map(e);

      ghobject_t fulloid = get_osdmap_pobject_name(e);
      t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
      pin_map_bl(e, fbl);
      pinned_maps.push_back(add_map(o));
      continue;
    }

    assert(0 == "MOSDMap lied about what maps it had?");
  }

  // even if this map isn't from a mon, we may have satisfied our subscription
  monc->sub_got("osdmap", last);

  // if we still owe ourselves full maps from an earlier crc failure,
  // re-issue the request
  if (!m->maps.empty() && requested_full_first) {
    dout(10) << __func__ << " still missing full maps " << requested_full_first
	     << ".." << requested_full_last << dendl;
    rerequest_full_maps();
  }

  // `last` may have been rolled back above; re-check for new maps
  if (last <= superblock.newest_map) {
    dout(10) << " no new maps here, dropping" << dendl;
    m->put();
    return;
  }

  if (superblock.oldest_map) {
    // make sure we at least keep pace with incoming maps
    trim_maps(m->oldest_map, last - first + 1, skip_maps);
  }

  if (!superblock.oldest_map || skip_maps)
    superblock.oldest_map = first;
  superblock.newest_map = last;
  superblock.current_epoch = last;

  // note in the superblock that we were clean thru the prior epoch
  epoch_t boot_epoch = service.get_boot_epoch();
  if (boot_epoch && boot_epoch >= superblock.mounted) {
    superblock.mounted = boot_epoch;
    superblock.clean_thru = last;
  }

  // superblock and commit
  write_superblock(t);
  store->queue_transaction(
    service.meta_osr.get(),
    std::move(t),
    new C_OnMapApply(&service, pinned_maps, last),
    new C_OnMapCommit(this, start, last, m), 0);
  service.publish_superblock(superblock);
}
7399
/**
 * Called (via C_OnMapCommit) once osdmap epochs [first,last] are safely
 * on disk: advance our published osdmap epoch by epoch, react to peers
 * going up/down and to our own status in the new maps, and decide
 * whether to restart (rebind/reboot) or shut down.
 */
void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
{
  dout(10) << __func__ << " " << first << ".." << last << dendl;
  if (is_stopping()) {
    dout(10) << __func__ << " bailing, we are shutting down" << dendl;
    return;
  }
  Mutex::Locker l(osd_lock);
  map_lock.get_write();

  bool do_shutdown = false;
  bool do_restart = false;
  bool network_error = false;

  // advance through the new maps
  for (epoch_t cur = first; cur <= last; cur++) {
    dout(10) << " advance to epoch " << cur
	     << " (<= last " << last
	     << " <= newest_map " << superblock.newest_map
	     << ")" << dendl;

    OSDMapRef newmap = get_map(cur);
    assert(newmap);  // we just cached it above!

    // start blacklisting messages sent to peers that go down.
    service.pre_publish_map(newmap);

    // kill connections to newly down osds
    bool waited_for_reservations = false;
    set<int> old;
    osdmap->get_all_osds(old);
    for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
      if (*p != whoami &&
	  osdmap->is_up(*p) && // in old map
	  newmap->is_down(*p)) {    // but not the new one
	if (!waited_for_reservations) {
	  // only wait once per epoch, before the first note_down_osd
	  service.await_reserved_maps();
	  waited_for_reservations = true;
	}
	note_down_osd(*p);
      } else if (*p != whoami &&
		 osdmap->is_down(*p) &&
		 newmap->is_up(*p)) {
	note_up_osd(*p);
      }
    }

    if (osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
	newmap->test_flag(CEPH_OSDMAP_NOUP)) {
      dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
	       << dendl;
      if (is_booting()) {
	// this captures the case where we sent the boot message while
	// NOUP was being set on the mon and our boot request was
	// dropped, and then later it is cleared.  it imperfectly
	// handles the case where our original boot message was not
	// dropped and we restart even though we might have booted, but
	// that is harmless (boot will just take slightly longer).
	do_restart = true;
      }
    }

    osdmap = newmap;
    // record our up/boot epochs the first time we appear up in a map
    epoch_t up_epoch;
    epoch_t boot_epoch;
    service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
    if (!up_epoch &&
	osdmap->is_up(whoami) &&
	osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
      up_epoch = osdmap->get_epoch();
      dout(10) << "up_epoch is " << up_epoch << dendl;
      if (!boot_epoch) {
	boot_epoch = osdmap->get_epoch();
	dout(10) << "boot_epoch is " << boot_epoch << dendl;
      }
      service.set_epochs(&boot_epoch, &up_epoch, NULL);
    }
  }

  had_map_since = ceph_clock_now();

  // transition booting -> active once a map shows us up at our address
  epoch_t _bind_epoch = service.get_bind_epoch();
  if (osdmap->is_up(whoami) &&
      osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
      _bind_epoch < osdmap->get_up_from(whoami)) {

    if (is_booting()) {
      dout(1) << "state: booting -> active" << dendl;
      set_state(STATE_ACTIVE);

      // set incarnation so that osd_reqid_t's we generate for our
      // objecter requests are unique across restarts.
      service.objecter->set_client_incarnation(osdmap->get_epoch());
    }
  }

  if (osdmap->get_epoch() > 0 &&
      is_active()) {
    if (!osdmap->exists(whoami)) {
      dout(0) << "map says i do not exist.  shutting down." << dendl;
      do_shutdown = true;   // don't call shutdown() while we have
			    // everything paused
    } else if (!osdmap->is_up(whoami) ||
	       !osdmap->get_addr(whoami).probably_equals(
		 client_messenger->get_myaddr()) ||
	       !osdmap->get_cluster_addr(whoami).probably_equals(
		 cluster_messenger->get_myaddr()) ||
	       !osdmap->get_hb_back_addr(whoami).probably_equals(
		 hb_back_server_messenger->get_myaddr()) ||
	       (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
		!osdmap->get_hb_front_addr(whoami).probably_equals(
		  hb_front_server_messenger->get_myaddr()))) {
      // the map says we're down or lists stale addresses for us; log
      // the specific mismatch, then rebind and reboot below
      if (!osdmap->is_up(whoami)) {
	if (service.is_preparing_to_stop() || service.is_stopping()) {
	  service.got_stop_ack();
	} else {
	  clog->warn() << "map e" << osdmap->get_epoch()
		       << " wrongly marked me down at e"
		       << osdmap->get_down_at(whoami);
	}
      } else if (!osdmap->get_addr(whoami).probably_equals(
		   client_messenger->get_myaddr())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong client addr (" << osdmap->get_addr(whoami)
		      << " != my " << client_messenger->get_myaddr() << ")";
      } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
		   cluster_messenger->get_myaddr())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong cluster addr ("
		      << osdmap->get_cluster_addr(whoami)
		      << " != my " << cluster_messenger->get_myaddr() << ")";
      } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
		   hb_back_server_messenger->get_myaddr())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong hb back addr ("
		      << osdmap->get_hb_back_addr(whoami)
		      << " != my " << hb_back_server_messenger->get_myaddr()
		      << ")";
      } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
		 !osdmap->get_hb_front_addr(whoami).probably_equals(
		   hb_front_server_messenger->get_myaddr())) {
	clog->error() << "map e" << osdmap->get_epoch()
		      << " had wrong hb front addr ("
		      << osdmap->get_hb_front_addr(whoami)
		      << " != my " << hb_front_server_messenger->get_myaddr()
		      << ")";
      }

      if (!service.is_stopping()) {
	epoch_t up_epoch = 0;
	epoch_t bind_epoch = osdmap->get_epoch();
	service.set_epochs(NULL,&up_epoch, &bind_epoch);
	do_restart = true;

	//add markdown log
	utime_t now = ceph_clock_now();
	utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
	osd_markdown_log.push_back(now);
	//clear all out-of-date log
	while (!osd_markdown_log.empty() &&
	       osd_markdown_log.front() + grace < now)
	  osd_markdown_log.pop_front();
	// too many markdowns within the grace period: give up and
	// shut down instead of restarting
	if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
	  dout(0) << __func__ << " marked down "
		  << osd_markdown_log.size()
		  << " > osd_max_markdown_count "
		  << cct->_conf->osd_max_markdown_count
		  << " in last " << grace << " seconds, shutting down"
		  << dendl;
	  do_restart = false;
	  do_shutdown = true;
	}

	start_waiting_for_healthy();

	set<int> avoid_ports;
#if defined(__FreeBSD__)
	// prevent FreeBSD from grabbing the client_messenger port during
	// rebinding. In which case a cluster_meesneger will connect also
	// to the same port
	avoid_ports.insert(client_messenger->get_myaddr().get_port());
#endif
	avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
	avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
	avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());

	// rebind all server-side messengers on fresh ports
	int r = cluster_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind cluster_messenger failed" << dendl;
	}

	r = hb_back_server_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind hb_back_server_messenger failed" << dendl;
	}

	r = hb_front_server_messenger->rebind(avoid_ports);
	if (r != 0) {
	  do_shutdown = true;  // FIXME: do_restart?
	  network_error = true;
	  dout(0) << __func__ << " marked down:"
		  << " rebind hb_front_server_messenger failed" << dendl;
	}

	hb_front_client_messenger->mark_down_all();
	hb_back_client_messenger->mark_down_all();

	reset_heartbeat_peers();
      }
    }
  }

  map_lock.put_write();

  check_osdmap_features(store);

  // yay!
  consume_map();

  if (is_active() || is_waiting_for_healthy())
    maybe_update_heartbeat_peers();

  if (!is_active()) {
    dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
    peering_wq.drain();
  } else {
    activate_map();
  }

  if (m->newest_map && m->newest_map > last) {
    dout(10) << " msg say newest map is " << m->newest_map
	     << ", requesting more" << dendl;
    osdmap_subscribe(osdmap->get_epoch()+1, false);
  }
  else if (do_shutdown) {
    if (network_error) {
      // cancel in-flight failure reports; we can't reach peers anyway
      Mutex::Locker l(heartbeat_lock);
      map<int,pair<utime_t,entity_inst_t>>::iterator it =
	failure_pending.begin();
      while (it != failure_pending.end()) {
	dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
		 << it->first << dendl;
	send_still_alive(osdmap->get_epoch(), it->second.second);
	failure_pending.erase(it++);
      }
    }
    // trigger shutdown in a different thread
    dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
    queue_async_signal(SIGINT);
  }
  else if (is_preboot()) {
    if (m->get_source().is_mon())
      _preboot(m->oldest_map, m->newest_map);
    else
      start_boot();
  }
  else if (do_restart)
    start_boot();

}
7666
/**
 * Reconcile messenger feature requirements (and the on-disk SHARDS
 * compat bit) with what the current osdmap's crush map demands for
 * clients, mons, and osds.
 */
void OSD::check_osdmap_features(ObjectStore *fs)
{
  // adjust required feature bits?

  // we have to be a bit careful here, because we are accessing the
  // Policy structures without taking any lock.  in particular, only
  // modify integer values that can safely be read by a racing CPU.
  // since we are only accessing existing Policy structures a their
  // current memory location, and setting or clearing bits in integer
  // fields, and we are the only writer, this is not a problem.

  {
    // default (client) policy
    Messenger::Policy p = client_messenger->get_default_policy();
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for clients" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_default_policy(p);
    }
  }
  {
    // mon policy
    Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << " was " << p.features_required
	      << ", adjusting msgr requires for mons" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      client_messenger->set_policy(entity_name_t::TYPE_MON, p);
    }
  }
  {
    // osd (cluster) policy
    Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
    uint64_t mask;
    uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);

    if ((p.features_required & mask) != features) {
      dout(0) << "crush map has features " << features
	      << ", adjusting msgr requires for osds" << dendl;
      p.features_required = (p.features_required & ~mask) | features;
      cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
    }

    // persist the SHARDS incompat bit once erasure codes appear in the map
    if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
	!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
      dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
      superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
      ObjectStore::Transaction t;
      write_superblock(t);
      int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
      assert(err == 0);
    }
  }
}
7724
/**
 * Advance a PG's view of the osdmap, epoch by epoch, up to @p osd_epoch
 * (bounded by osd_map_max_advance so one PG can't monopolize the
 * thread), handling up/acting changes and PG splits along the way.
 *
 * @param osd_epoch epoch to advance toward (the OSD's published epoch)
 * @param pg the PG to advance; must be locked by the caller
 * @param handle work-queue handle, used to reset the heartbeat timeout
 * @param rctx recovery context accumulating side effects
 * @param new_pgs out: child PGs created by splits
 * @return true if the PG reached osd_epoch, false if it hit the advance
 *         cap and must be requeued
 */
bool OSD::advance_pg(
  epoch_t osd_epoch, PG *pg,
  ThreadPool::TPHandle &handle,
  PG::RecoveryCtx *rctx,
  set<boost::intrusive_ptr<PG> > *new_pgs)
{
  assert(pg->is_locked());
  epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
  OSDMapRef lastmap = pg->get_osdmap();

  if (lastmap->get_epoch() == osd_epoch)
    return true;
  assert(lastmap->get_epoch() < osd_epoch);

  // cap how far we advance in one pass, relative to the laggiest PG
  epoch_t min_epoch = service.get_min_pg_epoch();
  epoch_t max;
  if (min_epoch) {
    max = min_epoch + cct->_conf->osd_map_max_advance;
  } else {
    max = next_epoch + cct->_conf->osd_map_max_advance;
  }

  for (;
       next_epoch <= osd_epoch && next_epoch <= max;
       ++next_epoch) {
    OSDMapRef nextmap = service.try_get_map(next_epoch);
    if (!nextmap) {
      dout(20) << __func__ << " missing map " << next_epoch << dendl;
      // make sure max is bumped up so that we can get past any
      // gap in maps
      max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
      continue;
    }

    // feed the PG the new mapping for this epoch
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    nextmap->pg_to_up_acting_osds(
      pg->info.pgid.pgid,
      &newup, &up_primary,
      &newacting, &acting_primary);
    pg->handle_advance_map(
      nextmap, lastmap, newup, up_primary,
      newacting, acting_primary, rctx);

    // Check for split!
    set<spg_t> children;
    spg_t parent(pg->info.pgid);
    if (parent.is_split(
	  lastmap->get_pg_num(pg->pool.id),
	  nextmap->get_pg_num(pg->pool.id),
	  &children)) {
      service.mark_split_in_progress(pg->info.pgid, children);
      split_pgs(
	pg, children, new_pgs, lastmap, nextmap,
	rctx);
    }

    lastmap = nextmap;
    handle.reset_tp_timeout();
  }
  service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
  pg->handle_activate_map(rctx);
  if (next_epoch <= osd_epoch) {
    // hit the advance cap before reaching osd_epoch; caller must requeue
    dout(10) << __func__ << " advanced to max " << max
	     << " past min epoch " << min_epoch
	     << " ... will requeue " << *pg << dendl;
    return false;
  }
  return true;
}
7795
/**
 * Make the current osdmap take effect OSD-wide: tally PG roles, remove
 * PGs whose pool has been deleted, publish the map through OSDService,
 * prune stale map waiters, and queue a null event on every PG so each
 * one notices the new epoch.
 */
void OSD::consume_map()
{
  assert(osd_lock.is_locked());
  dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;

  int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
  list<PGRef> to_remove;

  // scan pg's
  {
    RWLock::RLocker l(pg_map_lock);
    for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
        it != pg_map.end();
        ++it) {
      PG *pg = it->second;
      pg->lock();
      if (pg->is_primary())
        num_pg_primary++;
      else if (pg->is_replica())
        num_pg_replica++;
      else
        num_pg_stray++;

      if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
        //pool is deleted!
        to_remove.push_back(PGRef(pg));
      } else {
        service.init_splits_between(it->first, service.get_osdmap(), osdmap);
      }

      pg->unlock();
    }
  }

  // remove PGs of deleted pools; erase-as-we-go keeps the PGRef alive
  // exactly until after _remove_pg has run on it
  for (list<PGRef>::iterator i = to_remove.begin();
       i != to_remove.end();
       to_remove.erase(i++)) {
    RWLock::WLocker locker(pg_map_lock);
    (*i)->lock();
    _remove_pg(&**i);
    (*i)->unlock();
  }

  service.expand_pg_num(service.get_osdmap(), osdmap);

  service.pre_publish_map(osdmap);
  service.await_reserved_maps();
  service.publish_map(osdmap);

  service.maybe_inject_dispatch_delay();

  dispatch_sessions_waiting_on_map();

  service.maybe_inject_dispatch_delay();

  // remove any PGs which we no longer host from the session waiting_for_pg lists
  dout(20) << __func__ << " checking waiting_for_pg" << dendl;
  op_shardedwq.prune_pg_waiters(osdmap, whoami);

  service.maybe_inject_dispatch_delay();

  // scan pg's
  {
    RWLock::RLocker l(pg_map_lock);
    for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
        it != pg_map.end();
        ++it) {
      PG *pg = it->second;
      pg->lock();
      // queue a null event so the PG processes the new map epoch
      pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
      pg->unlock();
    }

    logger->set(l_osd_pg, pg_map.size());
  }
  logger->set(l_osd_pg_primary, num_pg_primary);
  logger->set(l_osd_pg_replica, num_pg_replica);
  logger->set(l_osd_pg_stray, num_pg_stray);
}
7875
// Called with osd_lock held after the in-memory osdmap has been advanced:
// enforce map-level requirements and kick work that the new map enables.
void OSD::activate_map()
{
  assert(osd_lock.is_locked());

  dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;

  // This OSD refuses to operate without SORTBITWISE; abort rather than
  // run with an incompatible object sort order.
  if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
    derr << __func__ << " SORTBITWISE flag is not set" << dendl;
    ceph_abort();
  }

  if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
    dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
    // one-shot subscription for the next epoch so we notice when FULL clears
    osdmap_subscribe(osdmap->get_epoch() + 1, false);
  }

  // norecover?  Track the NORECOVER flag, pausing/unpausing recovery only
  // on an actual state transition so we log each change exactly once.
  if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
    if (!service.recovery_is_paused()) {
      dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
      service.pause_recovery();
    }
  } else {
    if (service.recovery_is_paused()) {
      dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
      service.unpause_recovery();
    }
  }

  service.activate_map();

  // process waiters that were parked until a new map arrived
  take_waiters(waiting_for_osdmap);
}
7910
7911bool OSD::require_mon_peer(const Message *m)
7912{
7913 if (!m->get_connection()->peer_is_mon()) {
7914 dout(0) << "require_mon_peer received from non-mon "
7915 << m->get_connection()->get_peer_addr()
7916 << " " << *m << dendl;
7917 return false;
7918 }
7919 return true;
7920}
7921
7922bool OSD::require_mon_or_mgr_peer(const Message *m)
7923{
7924 if (!m->get_connection()->peer_is_mon() &&
7925 !m->get_connection()->peer_is_mgr()) {
7926 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
7927 << m->get_connection()->get_peer_addr()
7928 << " " << *m << dendl;
7929 return false;
7930 }
7931 return true;
7932}
7933
7934bool OSD::require_osd_peer(const Message *m)
7935{
7936 if (!m->get_connection()->peer_is_osd()) {
7937 dout(0) << "require_osd_peer received from non-osd "
7938 << m->get_connection()->get_peer_addr()
7939 << " " << *m << dendl;
7940 return false;
7941 }
7942 return true;
7943}
7944
7945bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
7946{
7947 epoch_t up_epoch = service.get_up_epoch();
7948 if (epoch < up_epoch) {
7949 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
7950 return false;
7951 }
7952
7953 if (!is_active()) {
7954 dout(7) << "still in boot state, dropping message " << *m << dendl;
7955 return false;
7956 }
7957
7958 return true;
7959}
7960
/**
 * Verify the sending OSD instance still matches @p map.
 *
 * If the peer is marked down, or its cluster address differs from the one
 * the message came from (i.e. the peer restarted with a new instance), the
 * connection is marked down and its session is cleaned up; returns false
 * so the caller drops the message.
 *
 * @param m               incoming message
 * @param map             osdmap to validate the peer against
 * @param is_fast_dispatch true when invoked from the fast-dispatch path
 * @return true when the peer instance is current, false when stale
 */
bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
				     bool is_fast_dispatch)
{
  int from = m->get_source().num();

  if (map->is_down(from) ||
      (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
    dout(5) << "from dead osd." << from << ", marking down, "
	    << " msg was " << m->get_source_inst().addr
	    << " expected " << (map->is_up(from) ?
				map->get_cluster_addr(from) : entity_addr_t())
	    << dendl;
    ConnectionRef con = m->get_connection();
    con->mark_down();
    Session *s = static_cast<Session*>(con->get_priv());
    if (s) {
      // NOTE(review): the session_dispatch_lock is only taken on the
      // slow path -- fast-dispatch callers appear to manage it
      // themselves; confirm against the dispatch code.
      if (!is_fast_dispatch)
	s->session_dispatch_lock.Lock();
      clear_session_waiting_on_map(s);
      con->set_priv(NULL);   // break ref <-> session cycle, if any
      if (!is_fast_dispatch)
	s->session_dispatch_lock.Unlock();
      s->put();  // NOTE(review): balances the ref from get_priv() -- confirm
    }
    return false;
  }
  return true;
}
7989
7990
7991/*
7992 * require that we have same (or newer) map, and that
7993 * the source is the pg primary.
7994 */
/**
 * Gatekeeper for epoch-stamped peer messages.
 *
 * Ensures our map is at least @p epoch (otherwise the op is parked until a
 * newer map arrives), that this OSD is up/active for that epoch, and --
 * for cluster-messenger traffic -- that the sending OSD instance is still
 * current.  Returns false when the op must not be processed now.
 */
bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
				    bool is_fast_dispatch)
{
  const Message *m = op->get_req();
  dout(15) << "require_same_or_newer_map " << epoch
	   << " (i am " << osdmap->get_epoch() << ") " << m << dendl;

  assert(osd_lock.is_locked());

  // do they have a newer map?  park the op until we catch up.
  if (epoch > osdmap->get_epoch()) {
    dout(7) << "waiting for newer map epoch " << epoch
	    << " > my " << osdmap->get_epoch() << " with " << m << dendl;
    wait_for_new_map(op);
    return false;
  }

  if (!require_self_aliveness(op->get_req(), epoch)) {
    return false;
  }

  // ok, our map is same or newer.. do they still exist?
  // (only cluster-internal peers carry an instance to validate)
  if (m->get_connection()->get_messenger() == cluster_messenger &&
      !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
    return false;
  }

  return true;
}
8024
8025
8026
8027
8028
8029// ----------------------------------------
8030// pg creation
8031
/**
 * Materialize the child PGs produced by splitting @p parent.
 *
 * For every id in @p childpgids this creates the in-memory PG, splits the
 * parent's collections, state and stats into it, and records the child in
 * @p out_pgs and @p rctx->created_pgs.  The split store operations are
 * staged on @p rctx->transaction.
 * NOTE(review): assumes each child id was registered as splitting
 * (asserted below) and that the caller holds the parent's lock -- confirm.
 */
void OSD::split_pgs(
  PG *parent,
  const set<spg_t> &childpgids, set<boost::intrusive_ptr<PG> > *out_pgs,
  OSDMapRef curmap,
  OSDMapRef nextmap,
  PG::RecoveryCtx *rctx)
{
  unsigned pg_num = nextmap->get_pg_num(
    parent->pool.id);
  parent->update_snap_mapper_bits(
    parent->info.pgid.get_split_bits(pg_num)
    );

  // one stats bucket per child plus a final one for the parent itself
  vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
  parent->info.stats.stats.sum.split(updated_stats);

  vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
  for (set<spg_t>::const_iterator i = childpgids.begin();
       i != childpgids.end();
       ++i, ++stat_iter) {
    assert(stat_iter != updated_stats.end());
    dout(10) << "Splitting " << *parent << " into " << *i << dendl;
    assert(service.splitting(*i));
    PG* child = _make_pg(nextmap, *i);
    child->lock(true);
    out_pgs->insert(child);
    rctx->created_pgs.insert(child);

    unsigned split_bits = i->get_split_bits(pg_num);
    dout(10) << "pg_num is " << pg_num << dendl;
    dout(10) << "m_seed " << i->ps() << dendl;
    dout(10) << "split_bits is " << split_bits << dendl;

    // move the child's share of objects into its own collection, then
    // carve the in-memory pg state out of the parent
    parent->split_colls(
      *i,
      split_bits,
      i->ps(),
      &child->pool.info,
      rctx->transaction);
    parent->split_into(
      i->pgid,
      child,
      split_bits);
    child->info.stats.stats.sum = *stat_iter;

    child->write_if_dirty(*(rctx->transaction));
    child->unlock();
  }
  // the final bucket holds whatever stats remain with the parent
  assert(stat_iter != updated_stats.end());
  parent->info.stats.stats.sum = *stat_iter;
  parent->write_if_dirty(*(rctx->transaction));
}
8084
8085/*
8086 * holding osd_lock
8087 */
/**
 * Handle an MOSDPGCreate from the monitor (called holding osd_lock).
 *
 * For each requested pg that maps to this OSD as acting primary, build an
 * initial history and feed a null peering event to handle_pg_peering_evt,
 * which instantiates the pg.  If the pg already exists (-EEXIST) the mon
 * is told it was created so it stops resending.
 */
void OSD::handle_pg_create(OpRequestRef op)
{
  const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_CREATE);

  dout(10) << "handle_pg_create " << *m << dendl;

  // pg creation requests are only accepted from the monitor
  if (!require_mon_peer(op->get_req())) {
    return;
  }

  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  op->mark_started();

  // m->mkpg and m->ctimes are parallel maps; walk them in lockstep
  map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
  for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
       p != m->mkpg.end();
       ++p, ++ci) {
    assert(ci != m->ctimes.end() && ci->first == p->first);
    epoch_t created = p->second.created;
    if (p->second.split_bits) // Skip split pgs
      continue;
    pg_t on = p->first;

    if (on.preferred() >= 0) {
      dout(20) << "ignoring localized pg " << on << dendl;
      continue;
    }

    if (!osdmap->have_pg_pool(on.pool())) {
      dout(20) << "ignoring pg on deleted pool " << on << dendl;
      continue;
    }

    dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;

    // is it still ours?
    vector<int> up, acting;
    int up_primary = -1;
    int acting_primary = -1;
    osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
    int role = osdmap->calc_pg_role(whoami, acting, acting.size());

    if (acting_primary != whoami) {
      dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
	       << "), my role=" << role << ", skipping" << dendl;
      continue;
    }

    spg_t pgid;
    bool mapped = osdmap->get_primary_shard(on, &pgid);
    assert(mapped);

    PastIntervals pi(
      osdmap->get_pools().at(pgid.pool()).ec_pool(),
      *osdmap);
    pg_history_t history;
    build_initial_pg_history(pgid, created, ci->second, &history, &pi);

    // The mon won't resend unless the primary changed, so
    // we ignore same_interval_since.  We'll pass this history
    // to handle_pg_peering_evt with the current epoch as the
    // event -- the project_pg_history check in
    // handle_pg_peering_evt will be a noop.
    if (history.same_primary_since > m->epoch) {
      dout(10) << __func__ << ": got obsolete pg create on pgid "
	       << pgid << " from epoch " << m->epoch
	       << ", primary changed in " << history.same_primary_since
	       << dendl;
      continue;
    }

    // -EEXIST: the pg already exists; ack creation so the mon stops
    // resending this request
    if (handle_pg_peering_evt(
	  pgid,
	  history,
	  pi,
	  osdmap->get_epoch(),
	  PG::CephPeeringEvtRef(
	    new PG::CephPeeringEvt(
	      osdmap->get_epoch(),
	      osdmap->get_epoch(),
	      PG::NullEvt()))
	  ) == -EEXIST) {
      service.send_pg_created(pgid.pgid);
    }
  }
  last_pg_create_epoch = m->epoch;

  maybe_update_heartbeat_peers();
}
8180
8181
8182// ----------------------------------------
8183// peering and recovery
8184
/**
 * Build a fresh RecoveryCtx for a peering/recovery round.
 *
 * The raw pointers (transaction, on_applied/on_safe contexts, and the
 * three peering-message maps) are owned by the returned ctx; they are
 * consumed or freed by dispatch_context()/dispatch_context_transaction().
 */
PG::RecoveryCtx OSD::create_context()
{
  ObjectStore::Transaction *t = new ObjectStore::Transaction;
  C_Contexts *on_applied = new C_Contexts(cct);
  C_Contexts *on_safe = new C_Contexts(cct);
  map<int, map<spg_t,pg_query_t> > *query_map =
    new map<int, map<spg_t, pg_query_t> >;
  map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
    new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
  map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
    new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
  PG::RecoveryCtx rctx(query_map, info_map, notify_list,
		       on_applied, on_safe, t);
  return rctx;
}
8200
// Completion that opens the store collection handle for newly created PGs
// once the creating transaction has been applied.  PGs that have since
// been removed from pg_map are skipped.
struct C_OpenPGs : public Context {
  set<PGRef> pgs;      // pgs whose collections need opening
  ObjectStore *store;  // backing object store
  OSD *osd;            // owner; provides pg_map and its lock
  // NOTE: swaps the caller's set into pgs, leaving the argument empty.
  C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
    pgs.swap(p);
  }
  void finish(int r) override {
    RWLock::RLocker l(osd->pg_map_lock);
    for (auto p : pgs) {
      // only open collections for pgs that still exist
      if (osd->pg_map.count(p->info.pgid)) {
	p->ch = store->open_collection(p->coll);
	assert(p->ch);
      }
    }
  }
};
8218
/**
 * Flush ctx's pending transaction to the store (if non-empty) and re-arm
 * the ctx with a fresh transaction and completion contexts so the caller
 * can keep accumulating work.  Newly created pgs get their collections
 * opened via C_OpenPGs once the transaction applies.
 */
void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
                                       ThreadPool::TPHandle *handle)
{
  if (!ctx.transaction->empty()) {
    if (!ctx.created_pgs.empty()) {
      ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
    }
    // queue_transaction takes ownership of on_applied/on_safe; the
    // transaction object itself is moved from and freed here
    int tr = store->queue_transaction(
      pg->osr.get(),
      std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
      TrackedOpRef(), handle);
    delete (ctx.transaction);
    assert(tr == 0);
    ctx.transaction = new ObjectStore::Transaction;
    ctx.on_applied = new C_Contexts(cct);
    ctx.on_safe = new C_Contexts(cct);
  }
}
8237
/**
 * Finalize a RecoveryCtx: send the accumulated peering messages (only if
 * we are up and active), then either free the unused transaction/contexts
 * or queue them on @p pg's sequencer.  Consumes everything the ctx owns.
 */
void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
                           ThreadPool::TPHandle *handle)
{
  if (service.get_osdmap()->is_up(whoami) &&
      is_active()) {
    do_notifies(*ctx.notify_list, curmap);
    do_queries(*ctx.query_map, curmap);
    do_infos(*ctx.info_map, curmap);
  }
  delete ctx.notify_list;
  delete ctx.query_map;
  delete ctx.info_map;
  // nothing queued (or no pg to queue it on): just free everything
  if ((ctx.on_applied->empty() &&
       ctx.on_safe->empty() &&
       ctx.transaction->empty() &&
       ctx.created_pgs.empty()) || !pg) {
    delete ctx.transaction;
    delete ctx.on_applied;
    delete ctx.on_safe;
    assert(ctx.created_pgs.empty());
  } else {
    if (!ctx.created_pgs.empty()) {
      ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
    }
    // queue_transaction takes ownership of the completion contexts
    int tr = store->queue_transaction(
      pg->osr.get(),
      std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
      handle);
    delete (ctx.transaction);
    assert(tr == 0);
  }
}
8270
8271/** do_notifies
8272 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8273 * content for, and they are primary for.
8274 */
8275
8276void OSD::do_notifies(
8277 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8278 OSDMapRef curmap)
8279{
8280 for (map<int,
8281 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8282 notify_list.begin();
8283 it != notify_list.end();
8284 ++it) {
8285 if (!curmap->is_up(it->first)) {
8286 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8287 continue;
8288 }
8289 ConnectionRef con = service.get_con_osd_cluster(
8290 it->first, curmap->get_epoch());
8291 if (!con) {
8292 dout(20) << __func__ << " skipping osd." << it->first
8293 << " (NULL con)" << dendl;
8294 continue;
8295 }
8296 service.share_map_peer(it->first, con.get(), curmap);
8297 dout(7) << __func__ << " osd " << it->first
8298 << " on " << it->second.size() << " PGs" << dendl;
8299 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8300 it->second);
8301 con->send_message(m);
8302 }
8303}
8304
8305
8306/** do_queries
8307 * send out pending queries for info | summaries
8308 */
8309void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8310 OSDMapRef curmap)
8311{
8312 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8313 pit != query_map.end();
8314 ++pit) {
8315 if (!curmap->is_up(pit->first)) {
8316 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8317 continue;
8318 }
8319 int who = pit->first;
8320 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8321 if (!con) {
8322 dout(20) << __func__ << " skipping osd." << who
8323 << " (NULL con)" << dendl;
8324 continue;
8325 }
8326 service.share_map_peer(who, con.get(), curmap);
8327 dout(7) << __func__ << " querying osd." << who
8328 << " on " << pit->second.size() << " PGs" << dendl;
8329 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8330 con->send_message(m);
8331 }
8332}
8333
8334
8335void OSD::do_infos(map<int,
8336 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8337 OSDMapRef curmap)
8338{
8339 for (map<int,
8340 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
8341 info_map.begin();
8342 p != info_map.end();
8343 ++p) {
8344 if (!curmap->is_up(p->first)) {
8345 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
8346 continue;
8347 }
8348 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
8349 i != p->second.end();
8350 ++i) {
8351 dout(20) << __func__ << " sending info " << i->first.info
8352 << " to shard " << p->first << dendl;
8353 }
8354 ConnectionRef con = service.get_con_osd_cluster(
8355 p->first, curmap->get_epoch());
8356 if (!con) {
8357 dout(20) << __func__ << " skipping osd." << p->first
8358 << " (NULL con)" << dendl;
8359 continue;
8360 }
8361 service.share_map_peer(p->first, con.get(), curmap);
8362 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
8363 m->pg_list = p->second;
8364 con->send_message(m);
8365 }
8366 info_map.clear();
8367}
8368
8369
8370/** PGNotify
8371 * from non-primary to primary
8372 * includes pg_info_t.
8373 * NOTE: called with opqueue active.
8374 */
/**
 * Handle MOSDPGNotify (replica -> primary pg_info_t).  Each notify is
 * converted into an MNotifyRec peering event for the target pg.
 */
void OSD::handle_pg_notify(OpRequestRef op)
{
  const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_NOTIFY);

  dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
  int from = m->get_source().num();

  if (!require_osd_peer(op->get_req()))
    return;

  if (!require_same_or_newer_map(op, m->get_epoch(), false))
    return;

  op->mark_started();

  for (auto it = m->get_pg_list().begin();
       it != m->get_pg_list().end();
       ++it) {
    if (it->first.info.pgid.preferred() >= 0) {
      dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
      continue;
    }

    handle_pg_peering_evt(
      spg_t(it->first.info.pgid.pgid, it->first.to),
      it->first.info.history, it->second,
      it->first.query_epoch,
      PG::CephPeeringEvtRef(
	new PG::CephPeeringEvt(
	  it->first.epoch_sent, it->first.query_epoch,
	  PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
	  op->get_req()->get_connection()->get_features())))
      );
  }
}
8411
/**
 * Handle MOSDPGLog: deliver the peer's log/info as an MLogRec peering
 * event to the target pg.
 */
void OSD::handle_pg_log(OpRequestRef op)
{
  MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
  assert(m->get_type() == MSG_OSD_PG_LOG);
  dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;

  if (!require_osd_peer(op->get_req()))
    return;

  int from = m->get_source().num();
  if (!require_same_or_newer_map(op, m->get_epoch(), false))
    return;

  if (m->info.pgid.preferred() >= 0) {
    dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
    return;
  }

  op->mark_started();
  handle_pg_peering_evt(
    spg_t(m->info.pgid.pgid, m->to),
    m->info.history, m->past_intervals, m->get_epoch(),
    PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->get_epoch(), m->get_query_epoch(),
	PG::MLogRec(pg_shard_t(from, m->from), m)))
    );
}
8440
/**
 * Handle MOSDPGInfo: deliver each contained pg_info as an MInfoRec
 * peering event to its target pg.
 */
void OSD::handle_pg_info(OpRequestRef op)
{
  const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_INFO);
  dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;

  if (!require_osd_peer(op->get_req()))
    return;

  int from = m->get_source().num();
  if (!require_same_or_newer_map(op, m->get_epoch(), false))
    return;

  op->mark_started();

  for (auto p = m->pg_list.begin();
       p != m->pg_list.end();
       ++p) {
    if (p->first.info.pgid.preferred() >= 0) {
      dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
      continue;
    }

    handle_pg_peering_evt(
      spg_t(p->first.info.pgid.pgid, p->first.to),
      p->first.info.history, p->second, p->first.epoch_sent,
      PG::CephPeeringEvtRef(
	new PG::CephPeeringEvt(
	  p->first.epoch_sent, p->first.query_epoch,
	  PG::MInfoRec(
	    pg_shard_t(
	      from, p->first.from), p->first.info, p->first.epoch_sent)))
      );
  }
}
8476
/**
 * Handle MOSDPGTrim.  The message means different things depending on our
 * role for the pg:
 *  - primary: a replica is reporting its last_complete_ondisk
 *  - replica: the primary is instructing us to trim our pg log
 */
void OSD::handle_pg_trim(OpRequestRef op)
{
  const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_TRIM);

  dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;

  if (!require_osd_peer(op->get_req()))
    return;

  int from = m->get_source().num();
  if (!require_same_or_newer_map(op, m->epoch, false))
    return;

  if (m->pgid.preferred() >= 0) {
    dout(10) << "ignoring localized pg " << m->pgid << dendl;
    return;
  }

  op->mark_started();

  PG *pg = _lookup_lock_pg(m->pgid);
  if(!pg) {
    dout(10) << " don't have pg " << m->pgid << dendl;
    return;
  }

  // stale message from a previous interval; ignore
  if (m->epoch < pg->info.history.same_interval_since) {
    dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
    pg->unlock();
    return;
  }

  if (pg->is_primary()) {
    // peer is informing us of their last_complete_ondisk
    dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
    pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
      m->trim_to;
    // trim log when the pg is recovered
    pg->calc_min_last_complete_ondisk();
  } else {
    // primary is instructing us to trim
    ObjectStore::Transaction t;
    pg->pg_log.trim(m->trim_to, pg->info);
    pg->dirty_info = true;
    pg->write_if_dirty(t);
    int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
    assert(tr == 0);
  }
  pg->unlock();
}
8528
/**
 * Handle MBackfillReserve: translate the message type
 * (REQUEST/GRANT/REJECT) into the corresponding peering event and queue
 * it on the target pg.  Events for pgs mid-split are parked in
 * peering_wait_for_split.
 */
void OSD::handle_pg_backfill_reserve(OpRequestRef op)
{
  const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
  assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);

  if (!require_osd_peer(op->get_req()))
    return;
  if (!require_same_or_newer_map(op, m->query_epoch, false))
    return;

  PG::CephPeeringEvtRef evt;
  if (m->type == MBackfillReserve::REQUEST) {
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RequestBackfillPrio(m->priority)));
  } else if (m->type == MBackfillReserve::GRANT) {
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RemoteBackfillReserved()));
  } else if (m->type == MBackfillReserve::REJECT) {
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RemoteReservationRejected()));
  } else {
    // unknown reservation type: protocol error
    ceph_abort();
  }

  // pg is splitting; queue the event until the split completes
  if (service.splitting(m->pgid)) {
    peering_wait_for_split[m->pgid].push_back(evt);
    return;
  }

  PG *pg = _lookup_lock_pg(m->pgid);
  if (!pg) {
    dout(10) << " don't have pg " << m->pgid << dendl;
    return;
  }

  pg->queue_peering_event(evt);
  pg->unlock();
}
8576
/**
 * Handle MRecoveryReserve: translate the message type
 * (REQUEST/GRANT/RELEASE) into the corresponding peering event and queue
 * it on the target pg.  Mirrors handle_pg_backfill_reserve; events for
 * pgs mid-split are parked in peering_wait_for_split.
 */
void OSD::handle_pg_recovery_reserve(OpRequestRef op)
{
  const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
  assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);

  if (!require_osd_peer(op->get_req()))
    return;
  if (!require_same_or_newer_map(op, m->query_epoch, false))
    return;

  PG::CephPeeringEvtRef evt;
  if (m->type == MRecoveryReserve::REQUEST) {
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RequestRecovery()));
  } else if (m->type == MRecoveryReserve::GRANT) {
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RemoteRecoveryReserved()));
  } else if (m->type == MRecoveryReserve::RELEASE) {
    evt = PG::CephPeeringEvtRef(
      new PG::CephPeeringEvt(
	m->query_epoch,
	m->query_epoch,
	PG::RecoveryDone()));
  } else {
    // unknown reservation type: protocol error
    ceph_abort();
  }

  // pg is splitting; queue the event until the split completes
  if (service.splitting(m->pgid)) {
    peering_wait_for_split[m->pgid].push_back(evt);
    return;
  }

  PG *pg = _lookup_lock_pg(m->pgid);
  if (!pg) {
    dout(10) << " don't have pg " << m->pgid << dendl;
    return;
  }

  pg->queue_peering_event(evt);
  pg->unlock();
}
8624
8625
8626/** PGQuery
8627 * from primary to replica | stray
8628 * NOTE: called with opqueue active.
8629 */
/**
 * Handle MOSDPGQuery (primary -> replica|stray), holding osd_lock.
 *
 * For each queried pg:
 *  - splitting pg: park the query until the split completes
 *  - existing pg: queue the query on the pg
 *  - missing pg: if our projected history says the interval is unchanged,
 *    answer with an empty info -- either an empty MOSDPGLog (for LOG /
 *    FULLLOG queries) or an entry in the notify list sent at the end.
 */
void OSD::handle_pg_query(OpRequestRef op)
{
  assert(osd_lock.is_locked());

  const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_QUERY);

  if (!require_osd_peer(op->get_req()))
    return;

  dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
  int from = m->get_source().num();

  if (!require_same_or_newer_map(op, m->get_epoch(), false))
    return;

  op->mark_started();

  map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;

  for (auto it = m->pg_list.begin();
       it != m->pg_list.end();
       ++it) {
    spg_t pgid = it->first;

    if (pgid.preferred() >= 0) {
      dout(10) << "ignoring localized pg " << pgid << dendl;
      continue;
    }

    // pg is splitting: defer until the split completes
    if (service.splitting(pgid)) {
      peering_wait_for_split[pgid].push_back(
	PG::CephPeeringEvtRef(
	  new PG::CephPeeringEvt(
	    it->second.epoch_sent, it->second.epoch_sent,
	    PG::MQuery(pg_shard_t(from, it->second.from),
		       it->second, it->second.epoch_sent))));
      continue;
    }

    // pg exists: hand the query to the pg itself
    {
      RWLock::RLocker l(pg_map_lock);
      if (pg_map.count(pgid)) {
	PG *pg = 0;
	pg = _lookup_lock_pg_with_map_lock_held(pgid);
	pg->queue_query(
	  it->second.epoch_sent, it->second.epoch_sent,
	  pg_shard_t(from, it->second.from), it->second);
	pg->unlock();
	continue;
      }
    }

    if (!osdmap->have_pg_pool(pgid.pool()))
      continue;

    // get active crush mapping
    int up_primary, acting_primary;
    vector<int> up, acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &up, &up_primary, &acting, &acting_primary);

    // same primary?
    pg_history_t history = it->second.history;
    bool valid_history = project_pg_history(
      pgid, history, it->second.epoch_sent,
      up, up_primary, acting, acting_primary);

    if (!valid_history ||
        it->second.epoch_sent < history.same_interval_since) {
      dout(10) << " pg " << pgid << " dne, and pg has changed in "
	       << history.same_interval_since
	       << " (msg from " << it->second.epoch_sent << ")" << dendl;
      continue;
    }

    // we don't have the pg: reply with an empty info
    dout(10) << " pg " << pgid << " dne" << dendl;
    pg_info_t empty(spg_t(pgid.pgid, it->second.to));
    /* This is racy, but that should be ok: if we complete the deletion
     * before the pg is recreated, we'll just start it off backfilling
     * instead of just empty */
    if (service.deleting_pgs.lookup(pgid))
      empty.set_last_backfill(hobject_t());
    if (it->second.type == pg_query_t::LOG ||
	it->second.type == pg_query_t::FULLLOG) {
      ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
      if (con) {
	MOSDPGLog *mlog = new MOSDPGLog(
	  it->second.from, it->second.to,
	  osdmap->get_epoch(), empty,
	  it->second.epoch_sent);
	service.share_map_peer(from, con.get(), osdmap);
	con->send_message(mlog);
      }
    } else {
      notify_list[from].push_back(
	make_pair(
	  pg_notify_t(
	    it->second.from, it->second.to,
	    it->second.epoch_sent,
	    osdmap->get_epoch(),
	    empty),
	  PastIntervals(
	    osdmap->get_pools().at(pgid.pool()).ec_pool(),
	    *osdmap)));
    }
  }
  do_notifies(notify_list, osdmap);
}
8739
8740
/**
 * Handle MOSDPGRemove (holding osd_lock): remove each listed pg, but only
 * if our projected history confirms the interval has not changed past the
 * sender's epoch -- otherwise the remove request is stale and ignored.
 */
void OSD::handle_pg_remove(OpRequestRef op)
{
  const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_REMOVE);
  assert(osd_lock.is_locked());

  if (!require_osd_peer(op->get_req()))
    return;

  dout(7) << "handle_pg_remove from " << m->get_source() << " on "
	  << m->pg_list.size() << " pgs" << dendl;

  if (!require_same_or_newer_map(op, m->get_epoch(), false))
    return;

  op->mark_started();

  for (auto it = m->pg_list.begin();
       it != m->pg_list.end();
       ++it) {
    spg_t pgid = *it;
    if (pgid.preferred() >= 0) {
      dout(10) << "ignoring localized pg " << pgid << dendl;
      continue;
    }

    // write lock: _remove_pg erases the pg from pg_map
    RWLock::WLocker l(pg_map_lock);
    if (pg_map.count(pgid) == 0) {
      dout(10) << " don't have pg " << pgid << dendl;
      continue;
    }
    dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
    PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
    pg_history_t history = pg->info.history;
    int up_primary, acting_primary;
    vector<int> up, acting;
    osdmap->pg_to_up_acting_osds(
      pgid.pgid, &up, &up_primary, &acting, &acting_primary);
    bool valid_history = project_pg_history(
      pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
      up, up_primary, acting, acting_primary);
    if (valid_history &&
        history.same_interval_since <= m->get_epoch()) {
      // only the current primary may request removal
      assert(pg->get_primary().osd == m->get_source().num());
      // hold a ref so the pg outlives its removal from pg_map below
      PGRef _pg(pg);
      _remove_pg(pg);
      pg->unlock();
    } else {
      dout(10) << *pg << " ignoring remove request, pg changed in epoch "
	       << history.same_interval_since
	       << " > " << m->get_epoch() << dendl;
      pg->unlock();
    }
  }
}
8796
/**
 * Tear down @p pg: run its on_removal transaction, queue the actual data
 * deletion on remove_wq, and erase it from pg_map.
 * NOTE(review): callers in this file hold the pg lock and the pg_map_lock
 * write lock (see handle_pg_remove) -- confirm for any new call sites.
 */
void OSD::_remove_pg(PG *pg)
{
  ObjectStore::Transaction rmt ;

  // on_removal, which calls remove_watchers_and_notifies, and the erasure from
  // the pg_map must be done together without unlocking the pg lock,
  // to avoid racing with watcher cleanup in ms_handle_reset
  // and handle_notify_timeout
  pg->on_removal(&rmt);

  service.cancel_pending_splits_for_parent(pg->info.pgid);
  // keep the sequencer alive until the transaction completes
  int tr = store->queue_transaction(
    pg->osr.get(), std::move(rmt), NULL,
    new ContainerContext<
      SequencerRef>(pg->osr));
  assert(tr == 0);

  // hand the pg to the background deletion workqueue
  DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
    pg->info.pgid,
    make_pair(
      pg->info.pgid,
      PGRef(pg))
    );
  remove_wq.queue(make_pair(PGRef(pg), deleting));

  service.pg_remove_epoch(pg->info.pgid);

  // dereference from op_wq
  op_shardedwq.clear_pg_pointer(pg->info.pgid);

  // remove from map
  pg_map.erase(pg->info.pgid);
  pg->put("PGMap"); // since we've taken it out of map
}
8831
8832
8833// =========================================================
8834// RECOVERY
8835
// Drain awaiting_throttle into the recovery queue while _recover_now()
// says there is headroom, reserving up to osd_recovery_max_single_start
// pushes per pg.  Caller must hold recovery_lock (asserted).
void OSDService::_maybe_queue_recovery() {
  assert(recovery_lock.is_locked_by_me());
  uint64_t available_pushes;
  while (!awaiting_throttle.empty() &&
	 _recover_now(&available_pushes)) {
    uint64_t to_start = MIN(
      available_pushes,
      cct->_conf->osd_recovery_max_single_start);
    _queue_for_recovery(awaiting_throttle.front(), to_start);
    awaiting_throttle.pop_front();
    // account for the reservation immediately so the next _recover_now()
    // sees the reduced headroom
    recovery_ops_reserved += to_start;
  }
}
8849
8850bool OSDService::_recover_now(uint64_t *available_pushes)
8851{
8852 if (available_pushes)
8853 *available_pushes = 0;
8854
8855 if (ceph_clock_now() < defer_recovery_until) {
8856 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
8857 return false;
8858 }
8859
8860 if (recovery_paused) {
8861 dout(15) << __func__ << " paused" << dendl;
8862 return false;
8863 }
8864
8865 uint64_t max = cct->_conf->osd_recovery_max_active;
8866 if (max <= recovery_ops_active + recovery_ops_reserved) {
8867 dout(15) << __func__ << " active " << recovery_ops_active
8868 << " + reserved " << recovery_ops_reserved
8869 << " >= max " << max << dendl;
8870 return false;
8871 }
8872
8873 if (available_pushes)
8874 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
8875
8876 return true;
8877}
8878
8879void OSD::do_recovery(
8880 PG *pg, epoch_t queued, uint64_t reserved_pushes,
8881 ThreadPool::TPHandle &handle)
8882{
8883 uint64_t started = 0;
8884 if (cct->_conf->osd_recovery_sleep > 0) {
8885 handle.suspend_tp_timeout();
8886 pg->unlock();
8887 utime_t t;
8888 t.set_from_double(cct->_conf->osd_recovery_sleep);
8889 t.sleep();
8890 dout(20) << __func__ << " slept for " << t << dendl;
8891 pg->lock();
8892 handle.reset_tp_timeout();
8893 }
8894
8895 {
8896 if (pg->pg_has_reset_since(queued)) {
8897 goto out;
8898 }
8899
8900 assert(!pg->deleting);
8901 assert(pg->is_peered() && pg->is_primary());
8902
8903 assert(pg->recovery_queued);
8904 pg->recovery_queued = false;
8905
8906 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
8907#ifdef DEBUG_RECOVERY_OIDS
8908 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
8909#endif
8910
8911 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
8912 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
8913 << " on " << *pg << dendl;
8914
8915 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
8916 if (!started && (more || !pg->have_unfound())) {
8917 goto out;
8918 }
8919
8920 PG::RecoveryCtx rctx = create_context();
8921 rctx.handle = &handle;
8922
8923 /*
8924 * if we couldn't start any recovery ops and things are still
8925 * unfound, see if we can discover more missing object locations.
8926 * It may be that our initial locations were bad and we errored
8927 * out while trying to pull.
8928 */
8929 if (!more && pg->have_unfound()) {
8930 pg->discover_all_missing(*rctx.query_map);
8931 if (rctx.query_map->empty()) {
8932 dout(10) << "do_recovery no luck, giving up on this pg for now" << dendl;
8933 } else {
8934 dout(10) << "do_recovery no luck, giving up on this pg for now" << dendl;
8935 pg->queue_recovery();
8936 }
8937 }
8938
8939 pg->write_if_dirty(*rctx.transaction);
8940 OSDMapRef curmap = pg->get_osdmap();
8941 dispatch_context(rctx, pg, curmap);
8942 }
8943
8944 out:
8945 assert(started <= reserved_pushes);
8946 service.release_reserved_pushes(reserved_pushes);
8947}
8948
// Account for a newly started recovery op on @p soid; bumps
// recovery_ops_active under recovery_lock.
void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
{
  Mutex::Locker l(recovery_lock);
  dout(10) << "start_recovery_op " << *pg << " " << soid
	   << " (" << recovery_ops_active << "/"
	   << cct->_conf->osd_recovery_max_active << " rops)"
	   << dendl;
  recovery_ops_active++;

#ifdef DEBUG_RECOVERY_OIDS
  // debug builds track the exact set of in-flight oids per pg; a
  // duplicate start for the same oid is a bug
  dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
  assert(recovery_oids[pg->info.pgid].count(soid) == 0);
  recovery_oids[pg->info.pgid].insert(soid);
#endif
}
8964
// Account for a completed recovery op on @p soid; decrements
// recovery_ops_active and lets throttled recovery proceed if headroom
// was freed.
void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
{
  Mutex::Locker l(recovery_lock);
  dout(10) << "finish_recovery_op " << *pg << " " << soid
	   << " dequeue=" << dequeue
	   << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
	   << dendl;

  // adjust count
  assert(recovery_ops_active > 0);
  recovery_ops_active--;

#ifdef DEBUG_RECOVERY_OIDS
  // debug builds verify the oid was actually tracked as in flight
  dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
  assert(recovery_oids[pg->info.pgid].count(soid));
  recovery_oids[pg->info.pgid].erase(soid);
#endif

  // a slot just freed up; maybe start throttled recovery
  _maybe_queue_recovery();
}
8985
8986bool OSDService::is_recovery_active()
8987{
8988 if (recovery_ops_active > 0)
8989 return true;
8990
8991 return false;
8992}
8993
8994// =========================================================
8995// OPS
8996
8997bool OSD::op_is_discardable(const MOSDOp *op)
8998{
8999 // drop client request if they are not connected and can't get the
9000 // reply anyway.
9001 if (!op->get_connection()->is_connected()) {
9002 return true;
9003 }
9004 return false;
9005}
9006
// Hand an op off to the sharded op work queue for the given pg shard.
// Records queueing latency (now - message receive stamp) and trace
// events, marks the op queued-for-pg, then enqueues it tagged with the
// map epoch it was queued under.
void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
{
  utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
  dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
	   << " cost " << op->get_req()->get_cost()
	   << " latency " << latency
	   << " epoch " << epoch
	   << " " << *(op->get_req()) << dendl;
  op->osd_trace.event("enqueue op");
  op->osd_trace.keyval("priority", op->get_req()->get_priority());
  op->osd_trace.keyval("cost", op->get_req()->get_cost());
  op->mark_queued_for_pg();
  op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
}
9021
9022
9023
9024/*
9025 * NOTE: dequeue called in worker thread, with pg lock
9026 */
/*
 * NOTE: dequeue called in worker thread, with pg lock
 *
 * Deliver an op to its PG: record dequeue latency, opportunistically
 * share our osdmap with the client's session, then run the request
 * unless the PG is being deleted.
 */
void OSD::dequeue_op(
  PGRef pg, OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  FUNCTRACE();
  OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);

  utime_t now = ceph_clock_now();
  op->set_dequeued_time(now);
  utime_t latency = now - op->get_req()->get_recv_stamp();
  dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
	   << " cost " << op->get_req()->get_cost()
	   << " latency " << latency
	   << " " << *(op->get_req())
	   << " pg " << *pg << dendl;

  // get_priv() takes a ref on the session; must be balanced by put() below.
  Session *session = static_cast<Session *>(
    op->get_req()->get_connection()->get_priv());
  if (session) {
    // share map even if we end up dropping the op (pg deleting) so the
    // client can retarget.
    maybe_share_map(session, op, pg->get_osdmap());
    session->put();
  }

  if (pg->deleting)
    return;

  op->mark_reached_pg();
  op->osd_trace.event("dequeue_op");

  pg->do_request(op, handle);

  // finish
  dout(10) << "dequeue_op " << op << " finish" << dendl;
  OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
}
9062
9063
// Completion that registers newly split child PGs with the OSD once the
// split transaction has been applied.  Lock order here is
// osd_lock -> pg_map_lock (write) -> per-PG lock, matching the rest of
// the OSD.
struct C_CompleteSplits : public Context {
  OSD *osd;
  set<boost::intrusive_ptr<PG> > pgs;  // child PGs produced by the split
  C_CompleteSplits(OSD *osd, const set<boost::intrusive_ptr<PG> > &in)
    : osd(osd), pgs(in) {}
  void finish(int r) override {
    Mutex::Locker l(osd->osd_lock);
    if (osd->is_stopping())
      return;
    PG::RecoveryCtx rctx = osd->create_context();
    for (set<boost::intrusive_ptr<PG> >::iterator i = pgs.begin();
	 i != pgs.end();
	 ++i) {
      osd->pg_map_lock.get_write();
      (*i)->lock();
      // insert the child into pg_map and let it start peering
      osd->add_newly_split_pg(&**i, &rctx);
      if (!((*i)->deleting)) {
	set<spg_t> to_complete;
	to_complete.insert((*i)->info.pgid);
	osd->service.complete_split(to_complete);
      }
      osd->pg_map_lock.put_write();
      osd->dispatch_context_transaction(rctx, &**i);
      // requeue any ops that were parked waiting for this pg to exist
      osd->wake_pg_waiters(*i);
      (*i)->unlock();
    }

    osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
  }
};
9094
// Drain one batch of PGs from the peering work queue.  For each PG:
// advance it to the current osdmap epoch, then handle one queued peering
// event (or requeue the PG if advance_pg made no progress).  Accumulated
// side effects (up_thru requests, pg_temp, the shared RecoveryCtx) are
// dispatched once at the end of the batch.
void OSD::process_peering_events(
  const list<PG*> &pgs,
  ThreadPool::TPHandle &handle
  )
{
  bool need_up_thru = false;
  epoch_t same_interval_since = 0;
  OSDMapRef curmap;
  PG::RecoveryCtx rctx = create_context();
  rctx.handle = &handle;
  for (list<PG*>::const_iterator i = pgs.begin();
       i != pgs.end();
       ++i) {
    set<boost::intrusive_ptr<PG> > split_pgs;
    PG *pg = *i;
    // suspend the heartbeat timeout while blocking on the pg lock
    pg->lock_suspend_timeout(handle);
    curmap = service.get_osdmap();
    if (pg->deleting) {
      pg->unlock();
      continue;
    }
    if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
      // we need to requeue the PG explicitly since we didn't actually
      // handle an event
      peering_wq.queue(pg);
    } else {
      assert(!pg->peering_queue.empty());
      PG::CephPeeringEvtRef evt = pg->peering_queue.front();
      pg->peering_queue.pop_front();
      pg->handle_peering_event(evt, &rctx);
    }
    need_up_thru = pg->need_up_thru || need_up_thru;
    // remember the newest interval start seen, for queue_want_up_thru below
    same_interval_since = MAX(pg->info.history.same_interval_since,
			      same_interval_since);
    pg->write_if_dirty(*rctx.transaction);
    if (!split_pgs.empty()) {
      // finish registering split children once this transaction applies
      rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
      split_pgs.clear();
    }
    dispatch_context_transaction(rctx, pg, &handle);
    pg->unlock();
  }
  if (need_up_thru)
    queue_want_up_thru(same_interval_since);
  dispatch_context(rctx, 0, curmap, &handle);

  service.send_pg_temp();
}
9143
9144// --------------------------------
9145
9146const char** OSD::get_tracked_conf_keys() const
9147{
9148 static const char* KEYS[] = {
9149 "osd_max_backfills",
9150 "osd_min_recovery_priority",
9151 "osd_op_complaint_time", "osd_op_log_threshold",
9152 "osd_op_history_size", "osd_op_history_duration",
9153 "osd_enable_op_tracker",
9154 "osd_map_cache_size",
9155 "osd_map_max_advance",
9156 "osd_pg_epoch_persisted_max_stale",
9157 "osd_disk_thread_ioprio_class",
9158 "osd_disk_thread_ioprio_priority",
9159 // clog & admin clog
9160 "clog_to_monitors",
9161 "clog_to_syslog",
9162 "clog_to_syslog_facility",
9163 "clog_to_syslog_level",
9164 "osd_objectstore_fuse",
9165 "clog_to_graylog",
9166 "clog_to_graylog_host",
9167 "clog_to_graylog_port",
9168 "host",
9169 "fsid",
9170 "osd_recovery_delay_start",
9171 "osd_client_message_size_cap",
9172 "osd_client_message_cap",
9173 NULL
9174 };
9175 return KEYS;
9176}
9177
// Config-observer callback: apply runtime changes for the keys listed in
// get_tracked_conf_keys().  Each branch pushes the new value into the
// relevant subsystem (reservers, op tracker, map caches, clog, messenger
// throttles, ...); check_config() re-validates cross-option invariants.
void OSD::handle_conf_change(const struct md_config_t *conf,
			     const std::set <std::string> &changed)
{
  if (changed.count("osd_max_backfills")) {
    service.local_reserver.set_max(cct->_conf->osd_max_backfills);
    service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
  }
  if (changed.count("osd_min_recovery_priority")) {
    service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
    service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
  }
  if (changed.count("osd_max_trimming_pgs")) {
    service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
  }
  if (changed.count("osd_op_complaint_time") ||
      changed.count("osd_op_log_threshold")) {
    op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
                                           cct->_conf->osd_op_log_threshold);
  }
  if (changed.count("osd_op_history_size") ||
      changed.count("osd_op_history_duration")) {
    op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
                                             cct->_conf->osd_op_history_duration);
  }
  if (changed.count("osd_op_history_slow_op_size") ||
      changed.count("osd_op_history_slow_op_threshold")) {
    op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
                                                      cct->_conf->osd_op_history_slow_op_threshold);
  }
  if (changed.count("osd_enable_op_tracker")) {
      op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
  }
  if (changed.count("osd_disk_thread_ioprio_class") ||
      changed.count("osd_disk_thread_ioprio_priority")) {
    set_disk_tp_priority();
  }
  if (changed.count("osd_map_cache_size")) {
    // one knob drives all three map-related caches
    service.map_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
    service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
  }
  if (changed.count("clog_to_monitors") ||
      changed.count("clog_to_syslog") ||
      changed.count("clog_to_syslog_level") ||
      changed.count("clog_to_syslog_facility") ||
      changed.count("clog_to_graylog") ||
      changed.count("clog_to_graylog_host") ||
      changed.count("clog_to_graylog_port") ||
      changed.count("host") ||
      changed.count("fsid")) {
    update_log_config();
  }

#ifdef HAVE_LIBFUSE
  if (changed.count("osd_objectstore_fuse")) {
    if (store) {
      enable_disable_fuse(false);
    }
  }
#endif

  if (changed.count("osd_recovery_delay_start")) {
    service.defer_recovery(cct->_conf->osd_recovery_delay_start);
    service.kick_recovery_queue();
  }

  if (changed.count("osd_client_message_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    // newval == 0 (unlimited) is deliberately ignored here
    if (pol.throttler_messages && newval > 0) {
      pol.throttler_messages->reset_max(newval);
    }
  }
  if (changed.count("osd_client_message_size_cap")) {
    uint64_t newval = cct->_conf->osd_client_message_size_cap;
    Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
    if (pol.throttler_bytes && newval > 0) {
      pol.throttler_bytes->reset_max(newval);
    }
  }

  check_config();
}
9261
// Re-parse the clog-related config options and push them into the log
// client.  Called from handle_conf_change() whenever any clog_* / host /
// fsid option changes.
void OSD::update_log_config()
{
  map<string,string> log_to_monitors;
  map<string,string> log_to_syslog;
  map<string,string> log_channel;
  map<string,string> log_prio;
  map<string,string> log_to_graylog;
  map<string,string> log_to_graylog_host;
  map<string,string> log_to_graylog_port;
  uuid_d fsid;
  string host;

  // only apply if parsing succeeded; otherwise keep the old settings
  if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
			       log_channel, log_prio, log_to_graylog,
			       log_to_graylog_host, log_to_graylog_port,
			       fsid, host) == 0)
    clog->update_config(log_to_monitors, log_to_syslog,
			log_channel, log_prio, log_to_graylog,
			log_to_graylog_host, log_to_graylog_port,
			fsid, host);
  // NOTE(review): unconditional derr here logs even on success/no-change;
  // presumably intentional for visibility of the active routing table.
  derr << "log_to_monitors " << log_to_monitors << dendl;
}
9284
// Warn (via the cluster log) about config combinations that are known to
// work poorly: the osdmap cache must comfortably exceed both how far a PG
// may advance per tick and how stale a persisted PG epoch may get.
void OSD::check_config()
{
  // some sanity checks
  if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
		 << " is not > osd_map_max_advance ("
		 << cct->_conf->osd_map_max_advance << ")";
  }
  if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
    clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
		 << " is not > osd_pg_epoch_persisted_max_stale ("
		 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
  }
}
9299
9300void OSD::set_disk_tp_priority()
9301{
9302 dout(10) << __func__
9303 << " class " << cct->_conf->osd_disk_thread_ioprio_class
9304 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
9305 << dendl;
9306 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
9307 cct->_conf->osd_disk_thread_ioprio_priority < 0)
9308 return;
9309 int cls =
9310 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
9311 if (cls < 0)
9312 derr << __func__ << cpp_strerror(cls) << ": "
9313 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
9314 << " but only the following values are allowed: idle, be or rt" << dendl;
9315 else
9316 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
9317}
9318
9319// --------------------------------
9320
// Synchronously block until the objecter has fetched the latest osdmap
// from the monitors.
void OSD::get_latest_osdmap()
{
  dout(10) << __func__ << " -- start" << dendl;

  C_SaferCond cond;
  service.objecter->wait_for_latest_osdmap(&cond);
  cond.wait();  // blocks the calling thread until the map arrives

  dout(10) << __func__ << " -- finish" << dendl;
}
9331
9332// --------------------------------
9333
// Classify an incoming MOSDOp: walk its sub-ops and set rmw_flags
// (read/write/cache/pg-op/promote/...) that drive queueing, ordering and
// cache-tier decisions.  Returns 0 on success, or a negative errno if a
// CEPH_OSD_OP_CALL target class/method cannot be resolved, or -EINVAL if
// no flags at all were derived.
int OSD::init_op_flags(OpRequestRef& op)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  vector<OSDOp>::const_iterator iter;

  // client flags have no bearing on whether an op is a read, write, etc.
  op->rmw_flags = 0;

  if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
    op->set_force_rwordered();
  }

  // set bits based on op codes, called methods.
  for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
    if ((iter->op.op == CEPH_OSD_OP_WATCH &&
	 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
      /* This a bit odd.  PING isn't actually a write.  It can't
       * result in an update to the object_info.  PINGs also aren'ty
       * resent, so there's no reason to write out a log entry
       *
       * However, we pipeline them behind writes, so let's force
       * the write_ordered flag.
       */
      op->set_force_rwordered();
    } else {
      if (ceph_osd_op_mode_modify(iter->op.op))
	op->set_write();
    }
    if (ceph_osd_op_mode_read(iter->op.op))
      op->set_read();

    // set READ flag if there are src_oids
    if (iter->soid.oid.name.length())
      op->set_read();

    // set PGOP flag if there are PG ops
    if (ceph_osd_op_type_pg(iter->op.op))
      op->set_pg_op();

    if (ceph_osd_op_mode_cache(iter->op.op))
      op->set_cache();

    // check for ec base pool
    int64_t poolid = m->get_pg().pool();
    const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
    if (pool && pool->is_tier()) {
      const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
      if (base_pool && base_pool->require_rollback()) {
	// EC base pool: any op outside this whitelist must be promoted
	// into the cache tier before it can be executed.
	if ((iter->op.op != CEPH_OSD_OP_READ) &&
	    (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
	    (iter->op.op != CEPH_OSD_OP_STAT) &&
	    (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
	    (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
	    (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
	    (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
	    (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
	    (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
	    (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
	    (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
	    (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
	    (iter->op.op != CEPH_OSD_OP_CREATE) &&
	    (iter->op.op != CEPH_OSD_OP_DELETE) &&
	    (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
	    (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
	    (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
	    (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
	  op->set_promote();
	}
      }
    }

    switch (iter->op.op) {
    case CEPH_OSD_OP_CALL:
      {
	// derive read/write/promote from the registered class method flags
	bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
	int is_write, is_read;
	string cname, mname;
	bp.copy(iter->op.cls.class_len, cname);
	bp.copy(iter->op.cls.method_len, mname);

	ClassHandler::ClassData *cls;
	int r = class_handler->open_class(cname, &cls);
	if (r) {
	  derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
	  if (r == -ENOENT)
	    r = -EOPNOTSUPP;
	  else if (r != -EPERM) // propagate permission errors
	    r = -EIO;
	  return r;
	}
	int flags = cls->get_method_flags(mname.c_str());
	if (flags < 0) {
	  if (flags == -ENOENT)
	    r = -EOPNOTSUPP;
	  else
	    r = flags;
	  return r;
	}
	is_read = flags & CLS_METHOD_RD;
	is_write = flags & CLS_METHOD_WR;
	bool is_promote = flags & CLS_METHOD_PROMOTE;

	dout(10) << "class " << cname << " method " << mname << " "
		 << "flags=" << (is_read ? "r" : "")
                 << (is_write ? "w" : "")
                 << (is_promote ? "p" : "")
                 << dendl;
	if (is_read)
	  op->set_class_read();
	if (is_write)
	  op->set_class_write();
	if (is_promote)
	  op->set_promote();
	op->add_class(cname, is_read, is_write, cls->whitelisted);
	break;
      }

    case CEPH_OSD_OP_WATCH:
      // force the read bit for watch since it is depends on previous
      // watch state (and may return early if the watch exists) or, in
      // the case of ping, is simply a read op.
      op->set_read();
      // fall through
    case CEPH_OSD_OP_NOTIFY:
    case CEPH_OSD_OP_NOTIFY_ACK:
      {
        op->set_promote();
        break;
      }

    case CEPH_OSD_OP_DELETE:
      // if we get a delete with FAILOK we can skip handle cache. without
      // FAILOK we still need to promote (or do something smarter) to
      // determine whether to return ENOENT or 0.
      if (iter == m->ops.begin() &&
	  iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
	op->set_skip_handle_cache();
      }
      // skip promotion when proxying a delete op
      if (m->ops.size() == 1) {
	op->set_skip_promote();
      }
      break;

    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
    case CEPH_OSD_OP_CACHE_FLUSH:
    case CEPH_OSD_OP_CACHE_EVICT:
      // If try_flush/flush/evict is the only op, can skip handle cache.
      if (m->ops.size() == 1) {
	op->set_skip_handle_cache();
      }
      break;

    case CEPH_OSD_OP_READ:
    case CEPH_OSD_OP_SYNC_READ:
    case CEPH_OSD_OP_SPARSE_READ:
    case CEPH_OSD_OP_CHECKSUM:
    case CEPH_OSD_OP_WRITEFULL:
      // client hinted it won't reuse the data; don't bother promoting
      if (m->ops.size() == 1 &&
          (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
           iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
        op->set_skip_promote();
      }
      break;

    // force promotion when pin an object in cache tier
    case CEPH_OSD_OP_CACHE_PIN:
      op->set_promote();
      break;

    default:
      break;
    }
  }

  if (op->rmw_flags == 0)
    return -EINVAL;

  return 0;
}
9517
9518void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
9519 for (list<PG*>::iterator i = peering_queue.begin();
9520 i != peering_queue.end() &&
9521 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
9522 ) {
9523 if (in_use.count(*i)) {
9524 ++i;
9525 } else {
9526 out->push_back(*i);
9527 peering_queue.erase(i++);
9528 }
9529 }
9530 in_use.insert(out->begin(), out->end());
9531}
9532
9533// =============================================================
9534
9535#undef dout_context
9536#define dout_context osd->cct
9537#undef dout_prefix
9538#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
9539
// Requeue all ops that were parked in a pg slot (waiting for the pg to
// exist) back onto the shard's pqueue, and wake a worker thread.
void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
{
  uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
  auto sdata = shard_list[shard_index];
  bool queued = false;
  unsigned pushes_to_free = 0;
  {
    Mutex::Locker l(sdata->sdata_op_ordering_lock);
    auto p = sdata->pg_slots.find(pgid);
    if (p != sdata->pg_slots.end()) {
      dout(20) << __func__ << " " << pgid
	       << " to_process " << p->second.to_process
	       << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
      // iterate in reverse: each _enqueue_front pushes to the head, so
      // reversing here preserves the original op order in the pqueue.
      for (auto i = p->second.to_process.rbegin();
	   i != p->second.to_process.rend();
	   ++i) {
	sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
      }
      for (auto& q : p->second.to_process) {
	pushes_to_free += q.get_reserved_pushes();
      }
      p->second.to_process.clear();
      p->second.waiting_for_pg = false;
      // bump requeue_seq so racing _process calls notice the shuffle
      ++p->second.requeue_seq;
      queued = true;
    }
  }
  // release recovery push reservations outside the ordering lock
  if (pushes_to_free > 0) {
    osd->service.release_reserved_pushes(pushes_to_free);
  }
  if (queued) {
    sdata->sdata_lock.Lock();
    sdata->sdata_cond.SignalOne();
    sdata->sdata_lock.Unlock();
  }
}
9576
// On each new osdmap, walk every shard's pg slots and drop queued items
// for pg shards that (per the new map) no longer map to this OSD, then
// prune slots that are completely empty.
void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
{
  unsigned pushes_to_free = 0;
  for (auto sdata : shard_list) {
    Mutex::Locker l(sdata->sdata_op_ordering_lock);
    sdata->waiting_for_pg_osdmap = osdmap;
    auto p = sdata->pg_slots.begin();
    while (p != sdata->pg_slots.end()) {
      ShardData::pg_slot& slot = p->second;
      // only touch idle slots (no worker currently in _process for it)
      if (!slot.to_process.empty() && slot.num_running == 0) {
	if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
	  dout(20) << __func__ << " " << p->first << " maps to us, keeping"
		   << dendl;
	  ++p;
	  continue;
	}
	// drop items queued at or before this epoch; newer items may
	// still become valid under a later map.
	while (!slot.to_process.empty() &&
	       slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
	  auto& qi = slot.to_process.front();
	  dout(20) << __func__ << " " << p->first
		   << " item " << qi
		   << " epoch " << qi.get_map_epoch()
		   << " <= " << osdmap->get_epoch()
		   << ", stale, dropping" << dendl;
	  pushes_to_free += qi.get_reserved_pushes();
	  slot.to_process.pop_front();
	}
      }
      if (slot.to_process.empty() &&
	  slot.num_running == 0 &&
	  !slot.pg) {
	dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
	p = sdata->pg_slots.erase(p);
      } else {
	++p;
      }
    }
  }
  // release recovery push reservations outside the per-shard locks
  if (pushes_to_free > 0) {
    osd->service.release_reserved_pushes(pushes_to_free);
  }
}
9619
// Drop the cached PG pointer from a pg slot.  Only legal while the PG is
// being deleted (asserted below); queued items, if any, remain.
void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
{
  uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
  auto sdata = shard_list[shard_index];
  Mutex::Locker l(sdata->sdata_op_ordering_lock);
  auto p = sdata->pg_slots.find(pgid);
  if (p != sdata->pg_slots.end()) {
    auto& slot = p->second;
    dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
    assert(!slot.pg || slot.pg->deleting);
    slot.pg = nullptr;
  }
}
9633
9634void OSD::ShardedOpWQ::clear_pg_slots()
9635{
9636 for (auto sdata : shard_list) {
9637 Mutex::Locker l(sdata->sdata_op_ordering_lock);
9638 sdata->pg_slots.clear();
9639 sdata->waiting_for_pg_osdmap.reset();
9640 // don't bother with reserved pushes; we are shutting down
9641 }
9642}
9643
9644#undef dout_prefix
9645#define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
9646
// Worker-thread main step: dequeue one item from this thread's shard,
// stage it in the pg slot's to_process list, acquire the PG lock, then
// (after re-checking for races with wake_pg_waiters/prune_pg_waiters via
// requeue_seq) run the item against the PG.  Items for PGs that don't
// exist yet are parked in the slot (waiting_for_pg) or dropped if the
// current map says the pg shard shouldn't be here.
void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
{
  uint32_t shard_index = thread_index % num_shards;
  ShardData *sdata = shard_list[shard_index];
  assert(NULL != sdata);

  // peek at spg_t
  sdata->sdata_op_ordering_lock.Lock();
  if (sdata->pqueue->empty()) {
    dout(20) << __func__ << " empty q, waiting" << dendl;
    // optimistically sleep a moment; maybe another work item will come along.
    sdata->sdata_op_ordering_lock.Unlock();
    osd->cct->get_heartbeat_map()->reset_timeout(hb,
      osd->cct->_conf->threadpool_default_timeout, 0);
    sdata->sdata_lock.Lock();
    sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
      utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
    sdata->sdata_lock.Unlock();
    sdata->sdata_op_ordering_lock.Lock();
    if (sdata->pqueue->empty()) {
      sdata->sdata_op_ordering_lock.Unlock();
      return;
    }
  }
  pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
  if (osd->is_stopping()) {
    sdata->sdata_op_ordering_lock.Unlock();
    return;    // OSD shutdown, discard.
  }
  PGRef pg;
  uint64_t requeue_seq;
  {
    // stage the item on the slot's FIFO; ops for the same pg must run in
    // the order they were staged here.
    auto& slot = sdata->pg_slots[item.first];
    dout(30) << __func__ << " " << item.first
	     << " to_process " << slot.to_process
	     << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
    slot.to_process.push_back(item.second);
    // note the requeue seq now...
    requeue_seq = slot.requeue_seq;
    if (slot.waiting_for_pg) {
      // save ourselves a bit of effort
      dout(20) << __func__ << " " << item.first << " item " << item.second
	       << " queued, waiting_for_pg" << dendl;
      sdata->sdata_op_ordering_lock.Unlock();
      return;
    }
    pg = slot.pg;
    dout(20) << __func__ << " " << item.first << " item " << item.second
	     << " queued" << dendl;
    ++slot.num_running;
  }
  sdata->sdata_op_ordering_lock.Unlock();

  osd->service.maybe_inject_dispatch_delay();

  // [lookup +] lock pg (if we have it)
  if (!pg) {
    pg = osd->_lookup_lock_pg(item.first);
  } else {
    pg->lock();
  }

  osd->service.maybe_inject_dispatch_delay();

  boost::optional<PGQueueable> qi;

  // we don't use a Mutex::Locker here because of the
  // osd->service.release_reserved_pushes() call below
  sdata->sdata_op_ordering_lock.Lock();

  auto q = sdata->pg_slots.find(item.first);
  assert(q != sdata->pg_slots.end());
  auto& slot = q->second;
  --slot.num_running;

  if (slot.to_process.empty()) {
    // raced with wake_pg_waiters or prune_pg_waiters
    dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  if (requeue_seq != slot.requeue_seq) {
    // slot contents were shuffled back to the pqueue while we were
    // acquiring the pg lock; bail and let the requeued items be re-dequeued.
    dout(20) << __func__ << " " << item.first
	     << " requeue_seq " << slot.requeue_seq << " > our "
	     << requeue_seq << ", we raced with wake_pg_waiters"
	     << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  if (pg && !slot.pg && !pg->deleting) {
    // cache the pg pointer so later items skip the lookup
    dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
    slot.pg = pg;
  }
  dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
	   << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;

  // make sure we're not already waiting for this pg
  if (slot.waiting_for_pg) {
    dout(20) << __func__ << " " << item.first << " item " << item.second
	     << " slot is waiting_for_pg" << dendl;
    if (pg) {
      pg->unlock();
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }

  // take next item
  qi = slot.to_process.front();
  slot.to_process.pop_front();
  dout(20) << __func__ << " " << item.first << " item " << *qi
	   << " pg " << pg << dendl;

  if (!pg) {
    // should this pg shard exist on this osd in this (or a later) epoch?
    OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
    if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
      dout(20) << __func__ << " " << item.first
	       << " no pg, should exist, will wait" << " on " << *qi << dendl;
      slot.to_process.push_front(*qi);
      slot.waiting_for_pg = true;
    } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
      // item is from the future relative to our map; park until we catch up
      dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
	       << qi->get_map_epoch() << " > " << osdmap->get_epoch()
	       << ", will wait on " << *qi << dendl;
      slot.to_process.push_front(*qi);
      slot.waiting_for_pg = true;
    } else {
      dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
	       << " dropping " << *qi << dendl;
      // share map with client?
      if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
	Session *session = static_cast<Session *>(
	  (*_op)->get_req()->get_connection()->get_priv());
	if (session) {
	  osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
	  session->put();
	}
      }
      // dropped items must give back their recovery push reservations;
      // do so after unlocking the ordering lock.
      unsigned pushes_to_free = qi->get_reserved_pushes();
      if (pushes_to_free > 0) {
	sdata->sdata_op_ordering_lock.Unlock();
	osd->service.release_reserved_pushes(pushes_to_free);
	return;
      }
    }
    sdata->sdata_op_ordering_lock.Unlock();
    return;
  }
  sdata->sdata_op_ordering_lock.Unlock();


  // osd_opwq_process marks the point at which an operation has been dequeued
  // and will begin to be handled by a worker thread.
  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    // NOTE(review): presumably tracepoint() compiles to a no-op without
    // WITH_LTTNG, since reqid only exists under that guard.
    tracepoint(osd, opwq_process_start, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
  Formatter *f = Formatter::create("json");
  f->open_object_section("q");
  dump(f);
  f->close_section();
  f->flush(*_dout);
  delete f;
  *_dout << dendl;

  // run the item with the PG lock held (qi->run unlocks/relocks as needed)
  ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
				 suicide_interval);
  qi->run(osd, pg, tp_handle);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid;
    if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
      reqid = (*_op)->get_reqid();
    }
#endif
    tracepoint(osd, opwq_process_finish, reqid.name._type,
	       reqid.name._num, reqid.tid, reqid.inc);
  }

  pg->unlock();
}
9844
// Enqueue an item on the shard owning its pg.  Items at or above
// op_prio_cutoff go onto the strict (priority) queue; the rest are
// cost-weighted.  A worker on that shard is then signalled.
void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
  uint32_t shard_index =
    item.first.hash_to_shard(shard_list.size());

  ShardData* sdata = shard_list[shard_index];
  assert (NULL != sdata);
  unsigned priority = item.second.get_priority();
  unsigned cost = item.second.get_cost();
  sdata->sdata_op_ordering_lock.Lock();

  dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
  if (priority >= osd->op_prio_cutoff)
    sdata->pqueue->enqueue_strict(
      item.second.get_owner(), priority, item);
  else
    sdata->pqueue->enqueue(
      item.second.get_owner(),
      priority, cost, item);
  sdata->sdata_op_ordering_lock.Unlock();

  // wake one worker on this shard
  sdata->sdata_lock.Lock();
  sdata->sdata_cond.SignalOne();
  sdata->sdata_lock.Unlock();

}
9870
// Requeue an item at the FRONT of its shard's queue while keeping per-pg
// ordering intact even if a worker has already staged newer items in the
// slot's to_process list.
void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
{
  uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
  ShardData* sdata = shard_list[shard_index];
  assert (NULL != sdata);
  sdata->sdata_op_ordering_lock.Lock();
  auto p = sdata->pg_slots.find(item.first);
  if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
    // we may be racing with _process, which has dequeued a new item
    // from pqueue, put it on to_process, and is now busy taking the
    // pg lock.  ensure this old requeued item is ordered before any
    // such newer item in to_process.
    //
    // (put the old item at the head of to_process, and instead push the
    // newest staged item back onto the pqueue front; it will be
    // re-dequeued and re-appended to to_process, preserving order.)
    p->second.to_process.push_front(item.second);
    item.second = p->second.to_process.back();
    p->second.to_process.pop_back();
    dout(20) << __func__ << " " << item.first
	     << " " << p->second.to_process.front()
	     << " shuffled w/ " << item.second << dendl;
  } else {
    dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
  }
  sdata->_enqueue_front(item, osd->op_prio_cutoff);
  sdata->sdata_op_ordering_lock.Unlock();
  sdata->sdata_lock.Lock();
  sdata->sdata_cond.SignalOne();
  sdata->sdata_lock.Unlock();
}
9898
9899namespace ceph {
9900namespace osd_cmds {
9901
9902int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
9903{
9904 if (!ceph_using_tcmalloc()) {
9905 os << "could not issue heap profiler command -- not using tcmalloc!";
9906 return -EOPNOTSUPP;
9907 }
9908
9909 string cmd;
9910 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
9911 os << "unable to get value for command \"" << cmd << "\"";
9912 return -EINVAL;
9913 }
9914
9915 std::vector<std::string> cmd_vec;
9916 get_str_vec(cmd, cmd_vec);
9917
9918 ceph_heap_profiler_handle_command(cmd_vec, os);
9919
9920 return 0;
9921}
9922
9923}} // namespace ceph::osd_cmds
9924