<< "queue_transaction returned " << cpp_strerror(ret) << dendl;
goto umount_store;
}
+ ch->flush();
}
ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami, osdspec_affinity);
this);
shards.push_back(one_shard);
}
-
- // override some config options if mclock is enabled on all the shards
- maybe_override_options_for_qos();
}
OSD::~OSD()
cmd_getval(cmdmap, "size", bsize, (int64_t)4 << 20);
cmd_getval(cmdmap, "object_size", osize, (int64_t)0);
cmd_getval(cmdmap, "object_num", onum, (int64_t)0);
+ double elapsed = 0.0;
- uint32_t duration = cct->_conf->osd_bench_duration;
-
- if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
- // let us limit the block size because the next checks rely on it
- // having a sane value. If we allow any block size to be set things
- // can still go sideways.
- ss << "block 'size' values are capped at "
- << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
- << " a higher value, please adjust 'osd_bench_max_block_size'";
- ret = -EINVAL;
+ ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
+ if (ret != 0) {
goto out;
- } else if (bsize < (int64_t) (1 << 20)) {
- // entering the realm of small block sizes.
- // limit the count to a sane value, assuming a configurable amount of
- // IOPS and duration, so that the OSD doesn't get hung up on this,
- // preventing timeouts from going off
- int64_t max_count =
- bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
- if (count > max_count) {
- ss << "'count' values greater than " << max_count
- << " for a block size of " << byte_u_t(bsize) << ", assuming "
- << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
- << " for " << duration << " seconds,"
- << " can cause ill effects on osd. "
- << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
- << " value if you wish to use a higher 'count'.";
- ret = -EINVAL;
- goto out;
- }
- } else {
- // 1MB block sizes are big enough so that we get more stuff done.
- // However, to avoid the osd from getting hung on this and having
- // timers being triggered, we are going to limit the count assuming
- // a configurable throughput and duration.
- // NOTE: max_count is the total amount of bytes that we believe we
- // will be able to write during 'duration' for the given
- // throughput. The block size hardly impacts this unless it's
- // way too big. Given we already check how big the block size
- // is, it's safe to assume everything will check out.
- int64_t max_count =
- cct->_conf->osd_bench_large_size_max_throughput * duration;
- if (count > max_count) {
- ss << "'count' values greater than " << max_count
- << " for a block size of " << byte_u_t(bsize) << ", assuming "
- << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
- << " for " << duration << " seconds,"
- << " can cause ill effects on osd. "
- << " Please adjust 'osd_bench_large_size_max_throughput'"
- << " with a higher value if you wish to use a higher 'count'.";
- ret = -EINVAL;
- goto out;
- }
- }
-
- if (osize && bsize > osize)
- bsize = osize;
-
- dout(1) << " bench count " << count
- << " bsize " << byte_u_t(bsize) << dendl;
-
- ObjectStore::Transaction cleanupt;
-
- if (osize && onum) {
- bufferlist bl;
- bufferptr bp(osize);
- bp.zero();
- bl.push_back(std::move(bp));
- bl.rebuild_page_aligned();
- for (int i=0; i<onum; ++i) {
- char nm[30];
- snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
- object_t oid(nm);
- hobject_t soid(sobject_t(oid, 0));
- ObjectStore::Transaction t;
- t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
- store->queue_transaction(service.meta_ch, std::move(t), NULL);
- cleanupt.remove(coll_t(), ghobject_t(soid));
- }
- }
-
- bufferlist bl;
- bufferptr bp(bsize);
- bp.zero();
- bl.push_back(std::move(bp));
- bl.rebuild_page_aligned();
-
- {
- C_SaferCond waiter;
- if (!service.meta_ch->flush_commit(&waiter)) {
- waiter.wait();
- }
- }
-
- utime_t start = ceph_clock_now();
- for (int64_t pos = 0; pos < count; pos += bsize) {
- char nm[30];
- unsigned offset = 0;
- if (onum && osize) {
- snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
- offset = rand() % (osize / bsize) * bsize;
- } else {
- snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
- }
- object_t oid(nm);
- hobject_t soid(sobject_t(oid, 0));
- ObjectStore::Transaction t;
- t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
- store->queue_transaction(service.meta_ch, std::move(t), NULL);
- if (!onum || !osize)
- cleanupt.remove(coll_t::meta(), ghobject_t(soid));
- }
-
- {
- C_SaferCond waiter;
- if (!service.meta_ch->flush_commit(&waiter)) {
- waiter.wait();
- }
- }
- utime_t end = ceph_clock_now();
-
- // clean up
- store->queue_transaction(service.meta_ch, std::move(cleanupt), NULL);
- {
- C_SaferCond waiter;
- if (!service.meta_ch->flush_commit(&waiter)) {
- waiter.wait();
- }
}
- double elapsed = end - start;
double rate = count / elapsed;
double iops = rate / bsize;
f->open_object_section("osd_bench_results");
on_finish(ret, ss.str(), outbl);
}
+/**
+ * Run the OSD write benchmark on the meta collection and time it.
+ *
+ * Two workloads are supported:
+ *  - osize and onum both non-zero: onum zero-filled objects of osize bytes
+ *    are written first (untimed); the timed phase then writes bsize-byte
+ *    blocks to a randomly chosen object at a random block-aligned offset.
+ *  - otherwise: every block is written to its own object, named after the
+ *    running byte position.
+ * Every object created by the test is removed again before returning.
+ *
+ * @param count   total number of bytes to write during the timed phase
+ * @param bsize   size in bytes of each write; clamped to osize when osize
+ *                is non-zero
+ * @param osize   size in bytes of each pre-created object (0 to disable)
+ * @param onum    number of pre-created objects (0 to disable)
+ * @param elapsed out: seconds between the start of the timed writes and
+ *                their commit (cleanup excluded); only written on success
+ * @param ss      receives a human-readable message when an argument is
+ *                rejected
+ * @return 0 on success, -EINVAL if an argument is rejected
+ */
+int OSD::run_osd_bench_test(
+  int64_t count,
+  int64_t bsize,
+  int64_t osize,
+  int64_t onum,
+  double *elapsed,
+  ostream &ss)
+{
+  // Reject values that cannot describe a workload. Without this, a
+  // non-positive block size or a negative object size slips past the cap
+  // checks below (the computed max_count becomes non-positive as well) and
+  // is then used as a buffer length.
+  if (count < 0 || bsize <= 0 || osize < 0 || onum < 0) {
+    ss << "'count', 'object_size' and 'object_num' must not be negative and"
+       << " block 'size' must be greater than zero";
+    return -EINVAL;
+  }
+
+  uint32_t duration = cct->_conf->osd_bench_duration;
+
+  if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
+    // let us limit the block size because the next checks rely on it
+    // having a sane value. If we allow any block size to be set things
+    // can still go sideways.
+    ss << "block 'size' values are capped at "
+       << byte_u_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
+       << " a higher value, please adjust 'osd_bench_max_block_size'";
+    return -EINVAL;
+  } else if (bsize < (int64_t) (1 << 20)) {
+    // entering the realm of small block sizes.
+    // limit the count to a sane value, assuming a configurable amount of
+    // IOPS and duration, so that the OSD doesn't get hung up on this,
+    // preventing timeouts from going off
+    int64_t max_count =
+      bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
+    if (count > max_count) {
+      ss << "'count' values greater than " << max_count
+         << " for a block size of " << byte_u_t(bsize) << ", assuming "
+         << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
+         << " for " << duration << " seconds,"
+         << " can cause ill effects on osd. "
+         << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
+         << " value if you wish to use a higher 'count'.";
+      return -EINVAL;
+    }
+  } else {
+    // 1MB block sizes are big enough so that we get more stuff done.
+    // However, to avoid the osd from getting hung on this and having
+    // timers being triggered, we are going to limit the count assuming
+    // a configurable throughput and duration.
+    // NOTE: max_count is the total amount of bytes that we believe we
+    //       will be able to write during 'duration' for the given
+    //       throughput.  The block size hardly impacts this unless it's
+    //       way too big.  Given we already check how big the block size
+    //       is, it's safe to assume everything will check out.
+    int64_t max_count =
+      cct->_conf->osd_bench_large_size_max_throughput * duration;
+    if (count > max_count) {
+      ss << "'count' values greater than " << max_count
+         << " for a block size of " << byte_u_t(bsize) << ", assuming "
+         << byte_u_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
+         << " for " << duration << " seconds,"
+         << " can cause ill effects on osd. "
+         << " Please adjust 'osd_bench_large_size_max_throughput'"
+         << " with a higher value if you wish to use a higher 'count'.";
+      return -EINVAL;
+    }
+  }
+
+  // A single write can never be larger than the object it lands in.
+  if (osize && bsize > osize) {
+    bsize = osize;
+  }
+
+  dout(1) << " bench count " << count
+          << " bsize " << byte_u_t(bsize) << dendl;
+
+  // Block until everything queued so far on the meta collection handle has
+  // committed.
+  auto wait_for_commit = [this]() {
+    C_SaferCond waiter;
+    if (!service.meta_ch->flush_commit(&waiter)) {
+      waiter.wait();
+    }
+  };
+
+  // Collects the removal of every object this test creates.
+  ObjectStore::Transaction cleanupt;
+
+  if (osize && onum) {
+    // Pre-create the fixed set of objects the timed writes will target.
+    // NOTE(review): this phase addresses the collection as coll_t() while
+    // the timed writes use coll_t::meta(); presumably equivalent - confirm.
+    bufferlist bl;
+    bufferptr bp(osize);
+    bp.zero();
+    bl.push_back(std::move(bp));
+    bl.rebuild_page_aligned();
+    // 64-bit index: onum is an int64_t, an int counter could overflow.
+    for (int64_t i = 0; i < onum; ++i) {
+      char nm[30];
+      snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)i);
+      object_t oid(nm);
+      hobject_t soid(sobject_t(oid, 0));
+      ObjectStore::Transaction t;
+      t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
+      store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+      cleanupt.remove(coll_t(), ghobject_t(soid));
+    }
+  }
+
+  // One zero-filled block, reused as the payload of every timed write.
+  bufferlist bl;
+  bufferptr bp(bsize);
+  bp.zero();
+  bl.push_back(std::move(bp));
+  bl.rebuild_page_aligned();
+
+  // Make sure the pre-population is not counted in the measurement.
+  wait_for_commit();
+
+  utime_t start = ceph_clock_now();
+  for (int64_t pos = 0; pos < count; pos += bsize) {
+    char nm[30];
+    // 64-bit so the block-aligned offset is not truncated.
+    uint64_t offset = 0;
+    if (onum && osize) {
+      // Random pre-created object, random block-aligned offset within it.
+      snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
+      offset = rand() % (osize / bsize) * bsize;
+    } else {
+      // One fresh object per block, named after the byte position.
+      snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
+    }
+    object_t oid(nm);
+    hobject_t soid(sobject_t(oid, 0));
+    ObjectStore::Transaction t;
+    t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
+    store->queue_transaction(service.meta_ch, std::move(t), nullptr);
+    if (!onum || !osize) {
+      cleanupt.remove(coll_t::meta(), ghobject_t(soid));
+    }
+  }
+
+  // The clock stops only once every timed write has committed.
+  wait_for_commit();
+  utime_t end = ceph_clock_now();
+  if (elapsed != nullptr) {
+    *elapsed = end - start;
+  }
+
+  // clean up
+  store->queue_transaction(service.meta_ch, std::move(cleanupt), nullptr);
+  wait_for_commit();
+
+  return 0;
+}
+
class TestOpsSocketHook : public AdminSocketHook {
OSDService *service;
ObjectStore *store;
start_boot();
+ // Override a few options if mclock scheduler is enabled.
+ maybe_override_max_osd_capacity_for_qos();
+ maybe_override_options_for_qos();
+
return 0;
out:
}
}
+/**
+ * Measure this OSD's IOPS capacity and feed it to the mclock scheduler.
+ *
+ * Does nothing unless "osd_op_queue" is "mclock_scheduler" and
+ * "osd_mclock_skip_benchmark" is false. Unless
+ * "osd_mclock_force_run_benchmark_on_init" is set, the benchmark is also
+ * skipped when the current capacity option (the hdd or ssd variant,
+ * selected by store_is_rotational) differs from its default value.
+ *
+ * Otherwise the OSD bench test is run, the measured IOPS is stored through
+ * mon_cmd_set_config() (falling back to a local set_val() if that fails)
+ * and every shard is asked to update its scheduler configuration.
+ */
+void OSD::maybe_override_max_osd_capacity_for_qos()
+{
+  // If the scheduler enabled is mclock, override the default
+  // osd capacity with the value obtained from running the
+  // osd bench test. This is later used to setup mclock.
+  if (cct->_conf.get_val<std::string>("osd_op_queue") != "mclock_scheduler" ||
+      cct->_conf.get_val<bool>("osd_mclock_skip_benchmark")) {
+    return;
+  }
+
+  const std::string max_capacity_iops_config = store_is_rotational
+    ? "osd_mclock_max_capacity_iops_hdd"
+    : "osd_mclock_max_capacity_iops_ssd";
+
+  bool force_run_benchmark =
+    cct->_conf.get_val<bool>("osd_mclock_force_run_benchmark_on_init");
+
+  if (!force_run_benchmark) {
+    double default_iops = 0.0;
+
+    // Get the current osd iops capacity
+    double cur_iops = cct->_conf.get_val<double>(max_capacity_iops_config);
+
+    // Get the default max iops capacity
+    auto val = cct->_conf.get_val_default(max_capacity_iops_config);
+    if (!val.has_value()) {
+      derr << __func__ << " Unable to determine default value of "
+           << max_capacity_iops_config << dendl;
+      // Cannot determine default iops. Force a run of the OSD benchmark.
+      force_run_benchmark = true;
+    } else {
+      // Default iops
+      default_iops = std::stod(val.value());
+    }
+
+    // A value that differs from the default means the capacity has already
+    // been set; there is no need to measure it again.
+    if (!force_run_benchmark && (default_iops != cur_iops)) {
+      dout(1) << __func__ << std::fixed << std::setprecision(2)
+              << " default_iops: " << default_iops
+              << " cur_iops: " << cur_iops
+              << ". Skip OSD benchmark test." << dendl;
+      return;
+    }
+  }
+
+  // Run osd bench: pre-create 100 objects of 4 MiB each, then write
+  // 12288000 bytes (3000 writes) in 4 KiB blocks at random offsets.
+  const int64_t count = 12288000; // Count of bytes to write
+  const int64_t bsize = 4096;     // Block size
+  const int64_t osize = 4194304;  // Object size
+  const int64_t onum = 100;       // Count of objects to write
+  double elapsed = 0.0;           // Time taken to complete the test
+  stringstream ss;
+  int ret = run_osd_bench_test(count, bsize, osize, onum, &elapsed, ss);
+  if (ret != 0) {
+    derr << __func__
+         << " osd bench err: " << ret
+         << " osd bench errstr: " << ss.str()
+         << dendl;
+    return;
+  }
+
+  // A zero (or otherwise unusable) elapsed time would turn the divisions
+  // below into inf/nan, which would then be persisted as the capacity.
+  if (!(elapsed > 0.0)) {
+    derr << __func__
+         << " osd bench reported an unusable elapsed time: " << elapsed
+         << " sec. Not overriding " << max_capacity_iops_config
+         << dendl;
+    return;
+  }
+
+  const double rate = count / elapsed;
+  const double iops = rate / bsize;
+  dout(1) << __func__
+          << " osd bench result -"
+          << std::fixed << std::setprecision(3)
+          << " bandwidth (MiB/sec): " << rate / (1024 * 1024)
+          << " iops: " << iops
+          << " elapsed_sec: " << elapsed
+          << dendl;
+
+  // Persist iops to the MON store
+  ret = mon_cmd_set_config(max_capacity_iops_config, std::to_string(iops));
+  if (ret < 0) {
+    // Fallback to setting the config within the in-memory "values" map.
+    cct->_conf.set_val(max_capacity_iops_config, std::to_string(iops));
+  }
+
+  // Override the max osd capacity for all shards
+  for (auto& shard : shards) {
+    shard->update_scheduler_config();
+  }
+}
+
bool OSD::maybe_override_options_for_qos()
{
// If the scheduler enabled is mclock, override the recovery, backfill
return false;
}
+/**
+ * Send a "config set" command for this OSD ("osd.<whoami>") to the monitor
+ * and wait for the reply.
+ *
+ * @param key  name of the config option to set
+ * @param val  value to assign, already rendered as a string
+ * @return 0 on success, otherwise the negative error code the command
+ *         completed with
+ */
+int OSD::mon_cmd_set_config(const std::string &key, const std::string &val)
+{
+  // Assemble the JSON request piece by piece. key and val are inserted
+  // verbatim (no escaping is applied).
+  std::string json_cmd = "{\"prefix\": \"config set\", \"who\": \"osd.";
+  json_cmd += std::to_string(whoami);
+  json_cmd += "\", \"name\": \"";
+  json_cmd += key;
+  json_cmd += "\", \"value\": \"";
+  json_cmd += val;
+  json_cmd += "\"}";
+
+  vector<std::string> request{json_cmd};
+  bufferlist input;
+  std::string status_msg;
+  C_SaferCond done;
+  monc->start_mon_command(request, input, nullptr, &status_msg, &done);
+
+  // Wait for the command to complete.
+  const int rc = done.wait();
+  if (rc >= 0) {
+    return 0;
+  }
+
+  derr << __func__ << " Failed to set config key " << key
+       << " err: " << cpp_strerror(rc)
+       << " errstr: " << status_msg << dendl;
+  return rc;
+}
+
void OSD::update_log_config()
{
map<string,string> log_to_monitors;
}
}
+/// Ask this shard's scheduler to update its configuration.
+/// shard_lock is held for the duration of the call.
+void OSDShard::update_scheduler_config()
+{
+  std::lock_guard shard_guard{shard_lock};
+  scheduler->update_configuration();
+}
+
OSDShard::OSDShard(
int id,
CephContext *cct,
std::unique_lock wait_lock{sdata->sdata_wait_lock};
auto future_time = ceph::real_clock::from_double(*when_ready);
dout(10) << __func__ << " dequeue future request at " << future_time << dendl;
+ // Disable heartbeat timeout until we find a non-future work item to process.
+ osd->cct->get_heartbeat_map()->clear_timeout(hb);
sdata->shard_lock.unlock();
++sdata->waiting_threads;
sdata->sdata_cond.wait_until(wait_lock, future_time);
--sdata->waiting_threads;
wait_lock.unlock();
sdata->shard_lock.lock();
+ // Reapply default wq timeouts
+ osd->cct->get_heartbeat_map()->reset_timeout(hb,
+ timeout_interval, suicide_interval);
}
} // while