recovery_sleep_lock("OSDService::recovery_sleep_lock"),
recovery_sleep_timer(cct, recovery_sleep_lock, false),
reserver_finisher(cct),
- local_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
+ local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
cct->_conf->osd_min_recovery_priority),
- remote_reserver(&reserver_finisher, cct->_conf->osd_max_backfills,
+ remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
cct->_conf->osd_min_recovery_priority),
pg_temp_lock("OSDService::pg_temp_lock"),
snap_sleep_lock("OSDService::snap_sleep_lock"),
scrub_sleep_lock("OSDService::scrub_sleep_lock"),
scrub_sleep_timer(
osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
- snap_reserver(&reserver_finisher,
+ snap_reserver(cct, &reserver_finisher,
cct->_conf->osd_max_trimming_pgs),
recovery_lock("OSDService::recovery_lock"),
recovery_ops_active(0),
waiter.wait();
}
- ret = write_meta(store, sb.cluster_fsid, sb.osd_fsid, whoami);
+ ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
if (ret) {
derr << "OSD::mkfs: failed to write fsid file: error "
<< cpp_strerror(ret) << dendl;
return ret;
}
-int OSD::write_meta(ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
+int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
{
char val[80];
int r;
if (r < 0)
return r;
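+ // if an auth key was supplied via the "key" config option, persist it
+ // in the store's metadata under "osd_key"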
+ string key = cct->_conf->get_val<string>("key");
+ lderr(cct) << "key " << key << dendl;
+ if (key.size()) {
+ r = store->write_meta("osd_key", key);
+ if (r < 0)
+ return r;
+ }
+
r = store->write_meta("ready", "ready");
if (r < 0)
return r;
};
+ // All the basic OSD operation stats are to be considered useful
+ osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
+
osd_plb.add_u64(
l_osd_op_wip, "op_wip",
"Replication operations currently being processed (primary)");
l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
"Latency of read-modify-write operations (excluding queue time and wait for finished)");
+ // Now we move on to some more obscure stats; revert to assuming things
+ // are low priority unless otherwise specified.
+ osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
+
osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
"Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
"OSDMap buffer cache misses");
- osd_plb.add_u64(l_osd_stat_bytes, "stat_bytes", "OSD size");
- osd_plb.add_u64(l_osd_stat_bytes_used, "stat_bytes_used", "Used space");
+ osd_plb.add_u64(
+ l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
+ PerfCountersBuilder::PRIO_USEFUL);
+ osd_plb.add_u64(
+ l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
+ PerfCountersBuilder::PRIO_USEFUL);
osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
osd_plb.add_u64_counter(
set_state(STATE_STOPPING);
// Debugging
- cct->_conf->set_val("debug_osd", "100");
- cct->_conf->set_val("debug_journal", "100");
- cct->_conf->set_val("debug_filestore", "100");
- cct->_conf->set_val("debug_ms", "100");
- cct->_conf->apply_changes(NULL);
+ if (cct->_conf->get_val<bool>("osd_debug_shutdown")) {
+ cct->_conf->set_val("debug_osd", "100");
+ cct->_conf->set_val("debug_journal", "100");
+ cct->_conf->set_val("debug_filestore", "100");
+ cct->_conf->set_val("debug_bluestore", "100");
+ cct->_conf->set_val("debug_ms", "100");
+ cct->_conf->apply_changes(NULL);
+ }
// stop MgrClient earlier as it's more like an internal consumer of OSD
mgrc.shutdown();
++i) {
PG *pg = i->second;
+ // Ignore PGs only partially created (DNE)
+ if (pg->info.dne()) {
+ continue;
+ }
+
auto rpib = pg->get_required_past_interval_bounds(
pg->info,
superblock.oldest_map);
ceph_abort();
}
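+ // a create initiated by the monitor arrives as a NullEvt; anything else
+ // comes from a peer OSD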
+ const bool is_mon_create =
+ evt->get_event().dynamic_type() == PG::NullEvt::static_type();
+ if (maybe_wait_for_max_pg(pgid, is_mon_create)) {
+ return -EAGAIN;
+ }
// do we need to resurrect a deleting pg?
spg_t resurrected;
PGRef old_pg_state;
}
}
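+// Withhold the creation of a new PG if this OSD already holds at least
+// mon_max_pg_per_osd * osd_max_pg_per_osd_hard_ratio PGs. The deferred
+// create is remembered (per source: monitor or peer OSD) so that
+// resume_creating_pg() can retry it once the PG count drops again.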
+bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
+{
+ const auto max_pgs_per_osd =
+ (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
+ cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+
+ RWLock::RLocker pg_map_locker{pg_map_lock};
+ if (pg_map.size() < max_pgs_per_osd) {
+ return false;
+ }
+ lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+ if (is_mon_create) {
+ pending_creates_from_mon++;
+ } else {
+ pending_creates_from_osd.emplace(pgid.pgid);
+ }
+ dout(5) << __func__ << " withhold creation of pg " << pgid
+ << ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
+ return true;
+}
+
+// To re-trigger peering we have to twiddle the pg mapping a little bit;
+// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() falls
+// back to the up set if pg_temp is empty, so an empty pg_temp won't work.
+static vector<int32_t> twiddle(const vector<int>& acting) {
+ if (acting.size() > 1) {
+ return {acting[0]};
+ } else {
+ vector<int32_t> twiddled(acting.begin(), acting.end());
+ twiddled.push_back(-1);
+ return twiddled;
+ }
+}
+
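+// Re-issue PG creations that were withheld by maybe_wait_for_max_pg() once
+// there is headroom below the limit again: monitor-initiated creates are
+// re-solicited by renewing the osd_pg_creates subscription, and peer-initiated
+// ones are re-triggered by sending a forced pg_temp to the monitor.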
+void OSD::resume_creating_pg()
+{
+ bool do_sub_pg_creates = false;
+ MOSDPGTemp *pgtemp = nullptr;
+ {
+ const auto max_pgs_per_osd =
+ (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
+ cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+ RWLock::RLocker l(pg_map_lock);
+ if (max_pgs_per_osd <= pg_map.size()) {
+ // this could happen if the admin decreases this setting before a PG is removed
+ return;
+ }
+ unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
+ lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+ if (pending_creates_from_mon > 0) {
+ do_sub_pg_creates = true;
+ if (pending_creates_from_mon >= spare_pgs) {
+ spare_pgs = pending_creates_from_mon = 0;
+ } else {
+ spare_pgs -= pending_creates_from_mon;
+ pending_creates_from_mon = 0;
+ }
+ }
+ auto pg = pending_creates_from_osd.cbegin();
+ while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
+ if (!pgtemp) {
+ pgtemp = new MOSDPGTemp{osdmap->get_epoch()};
+ }
+ vector<int> acting;
+ osdmap->pg_to_up_acting_osds(*pg, nullptr, nullptr, &acting, nullptr);
+ pgtemp->pg_temp[*pg] = twiddle(acting);
+ pg = pending_creates_from_osd.erase(pg);
+ spare_pgs--;
+ }
+ }
+ if (do_sub_pg_creates) {
+ if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
+ dout(4) << __func__ << ": resolicit pg creates from mon since "
+ << last_pg_create_epoch << dendl;
+ monc->renew_subs();
+ }
+ }
+ if (pgtemp) {
+ pgtemp->forced = true;
+ monc->send_mon_message(pgtemp);
+ }
+}
void OSD::build_initial_pg_history(
spg_t pgid,
sched_scrub();
}
service.promote_throttle_recalibrate();
+ resume_creating_pg();
bool need_send_beacon = false;
const auto now = ceph::coarse_mono_clock::now();
{
assert(osd_lock.is_locked());
dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
+ /** make sure the cluster is speaking in SORTBITWISE, because we don't
+ * speak the older sorting version any more. Be careful not to force
+ * a shutdown if we are merely processing old maps, though.
+ */
+ if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
+ derr << __func__ << " SORTBITWISE flag is not set" << dendl;
+ ceph_abort();
+ }
+
int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
list<PGRef> to_remove;
pg->unlock();
}
+
+ lock_guard<mutex> pending_creates_locker{pending_creates_lock};
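+ // drop withheld peer-initiated creates for PGs this OSD is no longer in the acting set of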
+ for (auto pg = pending_creates_from_osd.cbegin();
+ pg != pending_creates_from_osd.cend();) {
+ if (osdmap->get_pg_acting_rank(*pg, whoami) < 0) {
+ pg = pending_creates_from_osd.erase(pg);
+ } else {
+ ++pg;
+ }
+ }
}
for (list<PGRef>::iterator i = to_remove.begin();
dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
- if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
- derr << __func__ << " SORTBITWISE flag is not set" << dendl;
- ceph_abort();
- }
-
if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
osdmap_subscribe(osdmap->get_epoch() + 1, false);
<< dendl;
continue;
}
-
if (handle_pg_peering_evt(
pgid,
history,
service.send_pg_created(pgid.pgid);
}
}
- last_pg_create_epoch = m->epoch;
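+ // only advance last_pg_create_epoch when no mon-initiated creates were
+ // withheld, so a later re-subscription to osd_pg_creates picks them up again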
+ {
+ lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+ if (pending_creates_from_mon == 0) {
+ last_pg_create_epoch = m->epoch;
+ }
+ }
maybe_update_heartbeat_peers();
}
continue;
}
service.share_map_peer(it->first, con.get(), curmap);
- dout(7) << __func__ << " osd " << it->first
+ dout(7) << __func__ << " osd." << it->first
<< " on " << it->second.size() << " PGs" << dendl;
MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
it->second);
m->query_epoch,
PG::RemoteBackfillReserved()));
} else if (m->type == MBackfillReserve::REJECT) {
+ // NOTE: this is replica -> primary "I reject your request"
+ // and also primary -> replica "cancel my previously-granted request"
evt = PG::CephPeeringEvtRef(
new PG::CephPeeringEvt(
m->query_epoch,
pg->put("PGMap"); // since we've taken it out of map
}
-
// =========================================================
// RECOVERY
i->lock();
int pgstate = i->get_state();
if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
- ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL))) )
+ ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING))) )
i->_change_recovery_force_mode(newstate, false);
i->unlock();
}
pg->discover_all_missing(*rctx.query_map);
if (rctx.query_map->empty()) {
string action;
- if (pg->state_test(PG_STATE_BACKFILL)) {
+ if (pg->state_test(PG_STATE_BACKFILLING)) {
auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
queued,
queued,
- PG::CancelBackfill()));
+ PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
pg->queue_peering_event(evt);
action = "in backfill";
} else if (pg->state_test(PG_STATE_RECOVERING)) {
auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
queued,
queued,
- PG::CancelRecovery()));
+ PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
pg->queue_peering_event(evt);
action = "in recovery";
} else {