#include "messages/MOSDPGBackfill.h"
#include "messages/MBackfillReserve.h"
#include "messages/MRecoveryReserve.h"
+#include "messages/MOSDForceRecovery.h"
#include "messages/MOSDECSubOpWrite.h"
#include "messages/MOSDECSubOpWriteReply.h"
#include "messages/MOSDECSubOpRead.h"
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
+ ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
ceph_osd_feature_incompat);
}
dout(10) << __func__ << " " << get_full_state_name(cur_state)
<< " -> " << get_full_state_name(new_state) << dendl;
if (new_state == FAILSAFE) {
- clog->error() << "failsafe engaged, dropping updates, now "
+ clog->error() << "full status failsafe engaged, dropping updates, now "
<< (int)roundf(ratio * 100) << "% full";
} else if (cur_state == FAILSAFE) {
- clog->error() << "failsafe disengaged, no longer dropping updates, now "
- << (int)roundf(ratio * 100) << "% full";
+ clog->error() << "full status failsafe disengaged, no longer dropping "
+ << "updates, now " << (int)roundf(ratio * 100) << "% full";
}
cur_state = new_state;
}
void OSDService::send_pg_created(pg_t pgid)
{
dout(20) << __func__ << dendl;
- monc->send_mon_message(new MOSDPGCreated(pgid));
+ if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) { // only report when the osdmap requires luminous or later; NOTE(review): presumably older monitors do not handle MOSDPGCreated -- confirm
+ monc->send_mon_message(new MOSDPGCreated(pgid)); // hand a new MOSDPGCreated for pgid to monc for delivery
+ }
}
// --------------------------------------
} else if (admin_command == "flush_journal") {
store->flush_journal();
} else if (admin_command == "dump_ops_in_flight" ||
- admin_command == "ops") {
- if (!op_tracker.dump_ops_in_flight(f)) {
- ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
- Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
- }
- } else if (admin_command == "dump_blocked_ops") {
- if (!op_tracker.dump_ops_in_flight(f, true)) {
- ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
- Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
- }
- } else if (admin_command == "dump_historic_ops") {
- if (!op_tracker.dump_historic_ops(f, false)) {
- ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
- Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
- }
- } else if (admin_command == "dump_historic_ops_by_duration") {
- if (!op_tracker.dump_historic_ops(f, true)) {
- ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
- Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
- }
- } else if (admin_command == "dump_historic_slow_ops") {
- if (!op_tracker.dump_historic_slow_ops(f)) {
- ss << "op_tracker tracking is not enabled now, so no ops are tracked currently, even those get stuck. \
- Please enable \"osd_enable_op_tracker\", and the tracker will start to track new ops received afterwards.";
+ admin_command == "ops" ||
+ admin_command == "dump_blocked_ops" ||
+ admin_command == "dump_historic_ops" ||
+ admin_command == "dump_historic_ops_by_duration" ||
+ admin_command == "dump_historic_slow_ops") {
+
+ const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
+even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
+will start to track new ops received afterwards.";
+
+ set<string> filters;
+ vector<string> filter_str;
+ if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
+ copy(filter_str.begin(), filter_str.end(),
+ inserter(filters, filters.end()));
+ }
+
+ if (admin_command == "dump_ops_in_flight" ||
+ admin_command == "ops") {
+ if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
+ ss << error_str;
+ }
+ }
+ if (admin_command == "dump_blocked_ops") {
+ if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
+ ss << error_str;
+ }
+ }
+ if (admin_command == "dump_historic_ops") {
+ if (!op_tracker.dump_historic_ops(f, false, filters)) {
+ ss << error_str;
+ }
+ }
+ if (admin_command == "dump_historic_ops_by_duration") {
+ if (!op_tracker.dump_historic_ops(f, true, filters)) {
+ ss << error_str;
+ }
+ }
+ if (admin_command == "dump_historic_slow_ops") {
+ if (!op_tracker.dump_historic_slow_ops(f, filters)) {
+ ss << error_str;
+ }
}
} else if (admin_command == "dump_op_pq_state") {
f->open_object_section("pq");
delete fuse_store;
fuse_store = NULL;
r = ::rmdir(mntpath.c_str());
- if (r < 0)
- r = -errno;
if (r < 0) {
- derr << __func__ << " failed to rmdir " << mntpath << dendl;
+ r = -errno;
+ derr << __func__ << " failed to rmdir " << mntpath << ": "
+ << cpp_strerror(r) << dendl;
return r;
}
return 0;
return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
}
+/**
+ * Select the recovery sleep value from the configuration.
+ *
+ * A non-zero osd_recovery_sleep always wins. Otherwise the value comes from
+ * osd_recovery_sleep_hdd when store_is_rotational is set, and from
+ * osd_recovery_sleep_ssd when it is not.
+ *
+ * @return the chosen configuration value, converted to float
+ */
+float OSD::get_osd_recovery_sleep()
+{
+  // Any non-zero generic setting overrides the per-device-type values.
+  if (cct->_conf->osd_recovery_sleep != 0) {
+    return cct->_conf->osd_recovery_sleep;
+  }
+  return store_is_rotational ? cct->_conf->osd_recovery_sleep_hdd
+                             : cct->_conf->osd_recovery_sleep_ssd;
+}
+
int OSD::init()
{
CompatSet initial, diff;
r = monc->authenticate();
if (r < 0) {
+ derr << __func__ << " authentication failed: " << cpp_strerror(r)
+ << dendl;
osd_lock.Lock(); // locker is going to unlock this on function exit
if (is_stopping())
- r = 0;
+ r = 0;
goto monout;
}
derr << "unable to obtain rotating service keys; retrying" << dendl;
++rotating_auth_attempts;
if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
+ derr << __func__ << " wait_auth_rotating timed out" << dendl;
osd_lock.Lock(); // make locker happy
if (!is_stopping()) {
- r = - ETIMEDOUT;
+ r = -ETIMEDOUT;
}
goto monout;
}
r = update_crush_device_class();
if (r < 0) {
+ derr << __func__ <<" unable to update_crush_device_class: "
+ << cpp_strerror(r) << dendl;
osd_lock.Lock();
goto monout;
}
r = update_crush_location();
if (r < 0) {
+ derr << __func__ <<" unable to update_crush_location: "
+ << cpp_strerror(r) << dendl;
osd_lock.Lock();
goto monout;
}
return 0;
monout:
- mgrc.shutdown();
- monc->shutdown();
+ exit(1);
out:
enable_disable_fuse(true);
"flush the journal to permanent store");
assert(r == 0);
r = admin_socket->register_command("dump_ops_in_flight",
- "dump_ops_in_flight", asok_hook,
+ "dump_ops_in_flight " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
"show the ops currently in flight");
assert(r == 0);
r = admin_socket->register_command("ops",
- "ops", asok_hook,
+ "ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
"show the ops currently in flight");
assert(r == 0);
r = admin_socket->register_command("dump_blocked_ops",
- "dump_blocked_ops", asok_hook,
+ "dump_blocked_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
+ asok_hook,
"show the blocked ops currently in flight");
assert(r == 0);
- r = admin_socket->register_command("dump_historic_ops", "dump_historic_ops",
+ r = admin_socket->register_command("dump_historic_ops",
+ "dump_historic_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
asok_hook,
"show recent ops");
assert(r == 0);
- r = admin_socket->register_command("dump_historic_slow_ops", "dump_historic_slow_ops",
+ r = admin_socket->register_command("dump_historic_slow_ops",
+ "dump_historic_slow_ops " \
+ "name=filterstr,type=CephString,n=N,req=false",
asok_hook,
"show slowest recent ops");
assert(r == 0);
- r = admin_socket->register_command("dump_historic_ops_by_duration", "dump_historic_ops_by_duration",
+ r = admin_socket->register_command("dump_historic_ops_by_duration",
+ "dump_historic_ops_by_duration " \
+ "name=filterstr,type=CephString,n=N,req=false",
asok_hook,
"show slowest recent ops, sorted by duration");
assert(r == 0);
if (acting_primary != new_acting_primary) {
h->same_primary_since = e;
}
+ if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
+ osdmap->get_pg_num(pgid.pgid.pool()),
+ nullptr)) {
+ h->last_epoch_split = e;
+ }
lastmap = osdmap;
}
dout(20) << __func__ << " " << debug.str() << dendl;
}
OSDMapRef curmap = service.get_osdmap();
- assert(curmap);
+ if (!curmap) {
+ heartbeat_lock.Unlock();
+ m->put();
+ return;
+ }
switch (m->op) {
// if our map within recent history, try to add ourselves to the osdmap.
if (osdmap->get_epoch() == 0) {
derr << "waiting for initial osdmap" << dendl;
+ } else if (osdmap->is_destroyed(whoami)) {
+ derr << "osdmap says I am destroyed, exiting" << dendl;
+ exit(0);
} else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
} else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
{
// config info
(*pm)["osd_data"] = dev_path;
- (*pm)["osd_journal"] = journal_path;
+ if (store->get_type() == "filestore") {
+ // not applicable for bluestore
+ (*pm)["osd_journal"] = journal_path;
+ }
(*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
(*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
(*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
// backend
(*pm)["osd_objectstore"] = store->get_type();
(*pm)["rotational"] = store_is_rotational ? "1" : "0";
+ (*pm)["default_device_class"] = store->get_default_device_class();
store->collect_metadata(pm);
collect_sys_info(pm, cct);
"name=injected_args,type=CephString,n=N",
"inject configuration arguments into running OSD",
"osd", "rw", "cli,rest")
+COMMAND("config set " \
+ "name=key,type=CephString name=value,type=CephString",
+ "Set a configuration option at runtime (not persistent)",
+ "osd", "rw", "cli,rest")
COMMAND("cluster_log " \
"name=level,type=CephChoices,strings=error,warning,info,debug " \
"name=message,type=CephString,n=N",
r = cct->_conf->injectargs(args, &ss);
osd_lock.Lock();
}
+ else if (prefix == "config set") {
+ std::string key;
+ std::string val;
+ cmd_getval(cct, cmdmap, "key", key);
+ cmd_getval(cct, cmdmap, "value", val);
+ osd_lock.Unlock();
+ r = cct->_conf->set_val(key, val, true, &ss);
+ osd_lock.Lock();
+ }
else if (prefix == "cluster_log") {
vector<string> msg;
cmd_getval(cct, cmdmap, "message", msg);
uint64_t global_id;
uint64_t auid = CEPH_AUTH_UID_DEFAULT;
- isvalid = authorize_handler->verify_authorizer(
- cct, monc->rotating_secrets.get(),
- authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
- &auid);
+ RotatingKeyRing *keys = monc->rotating_secrets.get();
+ if (keys) {
+ isvalid = authorize_handler->verify_authorizer(
+ cct, keys,
+ authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
+ &auid);
+ } else {
+ dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
+ isvalid = false;
+ }
if (isvalid) {
Session *s = static_cast<Session *>(con->get_priv());
handle_scrub(static_cast<MOSDScrub*>(m));
break;
+ case MSG_OSD_FORCE_RECOVERY:
+ handle_force_recovery(m);
+ break;
+
// -- need OSDMap --
case MSG_OSD_PG_CREATE:
if (service.is_preparing_to_stop() || service.is_stopping()) {
service.got_stop_ack();
} else {
- clog->warn() << "map e" << osdmap->get_epoch()
- << " wrongly marked me down at e"
- << osdmap->get_down_at(whoami);
+ clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
+ "but it is still running";
+ clog->debug() << "map e" << osdmap->get_epoch()
+ << " wrongly marked me down at e"
+ << osdmap->get_down_at(whoami);
}
} else if (!osdmap->get_addr(whoami).probably_equals(
client_messenger->get_myaddr())) {
} else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
hb_back_server_messenger->get_myaddr())) {
clog->error() << "map e" << osdmap->get_epoch()
- << " had wrong hb back addr ("
+ << " had wrong heartbeat back addr ("
<< osdmap->get_hb_back_addr(whoami)
<< " != my " << hb_back_server_messenger->get_myaddr()
<< ")";
!osdmap->get_hb_front_addr(whoami).probably_equals(
hb_front_server_messenger->get_myaddr())) {
clog->error() << "map e" << osdmap->get_epoch()
- << " had wrong hb front addr ("
+ << " had wrong heartbeat front addr ("
<< osdmap->get_hb_front_addr(whoami)
<< " != my " << hb_front_server_messenger->get_myaddr()
<< ")";
pg->unlock();
}
+/**
+ * Handle an MSG_OSD_FORCE_RECOVERY message.
+ *
+ * Each pg listed in the message's forced_pgs is mapped to its primary shard
+ * via the current osdmap and looked up in pg_map. The PGs found there are
+ * passed to OSDService::adjust_pg_priorities() together with the message's
+ * options. pg_map_lock is read-locked for the whole function.
+ *
+ * @param m the incoming message; it must be an MOSDForceRecovery (asserted),
+ *          and one reference to it is dropped with put() before returning
+ */
+void OSD::handle_force_recovery(Message *m)
+{
+  MOSDForceRecovery *req = static_cast<MOSDForceRecovery*>(m);
+  assert(req->get_type() == MSG_OSD_FORCE_RECOVERY);
+  RWLock::RLocker l(pg_map_lock);
+
+  // At most one entry per requested pg, so size the vector up front.
+  vector<PG*> local_pgs;
+  local_pgs.reserve(req->forced_pgs.size());
+
+  for (auto& requested : req->forced_pgs) {
+    spg_t primary;
+    if (!osdmap->get_primary_shard(requested, &primary)) {
+      continue;  // the osdmap gave no primary shard for this pg
+    }
+    auto found = pg_map.find(primary);
+    if (found == pg_map.end()) {
+      continue;  // that shard is not in this OSD's pg_map
+    }
+    local_pgs.push_back(found->second);
+  }
+
+  if (!local_pgs.empty()) {
+    service.adjust_pg_priorities(local_pgs, req->options);
+  }
+
+  req->put();
+}
/** PGQuery
* from primary to replica | stray
return true;
}
+
+/**
+ * Apply a forced-recovery or forced-backfill request to a set of PGs.
+ *
+ * Nothing happens when pgs is empty or when newflags carries neither
+ * OFR_BACKFILL nor OFR_RECOVERY. OFR_BACKFILL takes precedence when both
+ * are set. recovery_lock is held while the PGs are updated.
+ *
+ * @param pgs      PGs to update
+ * @param newflags OFR_* bits; with OFR_CANCEL set every PG gets
+ *                 change_recovery_force_mode(state, true), otherwise only PGs
+ *                 whose current state matches the request get
+ *                 change_recovery_force_mode(state, false)
+ */
+void OSDService::adjust_pg_priorities(vector<PG*> pgs, int newflags)
+{
+  if (pgs.empty() || (newflags & (OFR_BACKFILL | OFR_RECOVERY)) == 0) {
+    return;
+  }
+
+  Mutex::Locker l(recovery_lock);
+
+  // The guard above guarantees that at least one of the two bits is set.
+  const int newstate = (newflags & OFR_BACKFILL) ? PG_STATE_FORCED_BACKFILL
+                                                 : PG_STATE_FORCED_RECOVERY;
+
+  // The pg list can get long: only build it when debug level 10 is
+  // actually being gathered.
+  if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) {
+    stringstream pg_list;
+    for (PG *pg : pgs) {
+      pg_list << pg->get_pgid() << " ";
+    }
+    dout(10) << __func__ << " working on " << pg_list.str() << dendl;
+  }
+
+  const bool cancel = (newflags & OFR_CANCEL) != 0;
+  const bool want_recovery = (newstate == PG_STATE_FORCED_RECOVERY);
+  const bool want_backfill = (newstate == PG_STATE_FORCED_BACKFILL);
+
+  for (PG *pg : pgs) {
+    if (cancel) {
+      pg->change_recovery_force_mode(newstate, true);
+      continue;
+    }
+
+    // Only force a PG that is in a state where recovery or backfill applies.
+    // Otherwise the PG would keep its FORCED_* flag forever, until the OSDs
+    // are restarted or recovery/backfill is triggered some other way.
+    const int pgstate = pg->get_state();
+    const bool recovery_applies = want_recovery &&
+      (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING));
+    const bool backfill_applies = want_backfill &&
+      (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILL));
+    if (recovery_applies || backfill_applies) {
+      pg->change_recovery_force_mode(newstate, false);
+    }
+  }
+}
+
void OSD::do_recovery(
PG *pg, epoch_t queued, uint64_t reserved_pushes,
ThreadPool::TPHandle &handle)
* recovery_requeue_callback event, which re-queues the recovery op using
* queue_recovery_after_sleep.
*/
- if (cct->_conf->osd_recovery_sleep > 0 && service.recovery_needs_sleep) {
+ float recovery_sleep = get_osd_recovery_sleep();
+ if (recovery_sleep > 0 && service.recovery_needs_sleep) {
PGRef pgref(pg);
auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
dout(20) << "do_recovery wake up at "
if (service.recovery_schedule_time < ceph_clock_now()) {
service.recovery_schedule_time = ceph_clock_now();
}
- service.recovery_schedule_time += cct->_conf->osd_recovery_sleep;
+ service.recovery_schedule_time += recovery_sleep;
service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
recovery_requeue_callback);
dout(20) << "Recovery event scheduled at "
if (base_pool && base_pool->require_rollback()) {
if ((iter->op.op != CEPH_OSD_OP_READ) &&
(iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
+ (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
(iter->op.op != CEPH_OSD_OP_STAT) &&
(iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
(iter->op.op != CEPH_OSD_OP_UNDIRTY) &&