]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph ObjectStore engine | |
5 | * | |
6 | * IO engine using Ceph's ObjectStore class to test low-level performance of | |
7 | * Ceph OSDs. | |
8 | * | |
9 | */ | |
10 | ||
11 | #include <memory> | |
12 | #include <system_error> | |
13 | #include <vector> | |
14 | ||
15 | #include "os/ObjectStore.h" | |
16 | #include "global/global_init.h" | |
17 | #include "common/errno.h" | |
18 | #include "include/intarith.h" | |
19 | #include "include/stringify.h" | |
224ce89b | 20 | #include "common/perf_counters.h" |
7c673cae FG |
21 | |
22 | #include <fio.h> | |
23 | #include <optgroup.h> | |
24 | ||
25 | #include "include/assert.h" // fio.h clobbers our assert.h | |
26 | ||
27 | #define dout_context g_ceph_context | |
28 | #define dout_subsys ceph_subsys_ | |
29 | ||
30 | namespace { | |
31 | ||
32 | /// fio configuration options read from the job file | |
33 | struct Options { | |
34 | thread_data* td; | |
35 | char* conf; | |
36 | }; | |
37 | ||
38 | template <class Func> // void Func(fio_option&) | |
39 | fio_option make_option(Func&& func) | |
40 | { | |
41 | // zero-initialize and set common defaults | |
42 | auto o = fio_option{}; | |
43 | o.category = FIO_OPT_C_ENGINE; | |
44 | o.group = FIO_OPT_G_RBD; | |
45 | func(std::ref(o)); | |
46 | return o; | |
47 | } | |
48 | ||
/// fio option table for this engine; values are parsed by fio into the
/// per-job Options struct via the off1 offsets
static std::vector<fio_option> ceph_options{
  make_option([] (fio_option& o) {
    o.name = "conf";
    o.lname = "ceph configuration file";
    o.type = FIO_OPT_STR_STORE;
    o.help = "Path to a ceph configuration file";
    o.off1 = offsetof(Options, conf); // fio stores the string into Options::conf
  }),
  {} // fio expects a 'null'-terminated list
};
59 | ||
60 | ||
61 | /// global engine state shared between all jobs within the process. this | |
62 | /// includes g_ceph_context and the ObjectStore instance | |
63 | struct Engine { | |
64 | /// the initial g_ceph_context reference to be dropped on destruction | |
65 | boost::intrusive_ptr<CephContext> cct; | |
66 | std::unique_ptr<ObjectStore> os; | |
67 | ||
68 | std::mutex lock; | |
69 | int ref_count; | |
70 | ||
71 | Engine(const thread_data* td); | |
72 | ~Engine(); | |
73 | ||
74 | static Engine* get_instance(thread_data* td) { | |
75 | // note: creates an Engine with the options associated with the first job | |
76 | static Engine engine(td); | |
77 | return &engine; | |
78 | } | |
79 | ||
80 | void ref() { | |
81 | std::lock_guard<std::mutex> l(lock); | |
82 | ++ref_count; | |
83 | } | |
84 | void deref() { | |
85 | std::lock_guard<std::mutex> l(lock); | |
86 | --ref_count; | |
87 | if (!ref_count) { | |
88 | ostringstream ostr; | |
89 | Formatter* f = Formatter::create("json-pretty", "json-pretty", "json-pretty"); | |
224ce89b | 90 | cct->get_perfcounters_collection()->dump_formatted(f, false); |
31f18b77 | 91 | ostr << "FIO plugin "; |
7c673cae | 92 | f->flush(ostr); |
31f18b77 FG |
93 | if (g_conf->rocksdb_perf) { |
94 | os->get_db_statistics(f); | |
95 | ostr << "FIO get_db_statistics "; | |
96 | f->flush(ostr); | |
97 | } | |
7c673cae FG |
98 | delete f; |
99 | os->umount(); | |
31f18b77 | 100 | dout(0) << ostr.str() << dendl; |
7c673cae FG |
101 | } |
102 | } | |
103 | }; | |
104 | ||
// construct the shared engine: parse ceph config, create the ObjectStore,
// size its cache shards, then mkfs + mount. throws on any failure.
Engine::Engine(const thread_data* td) : ref_count(0)
{
  // add the ceph command line arguments
  auto o = static_cast<const Options*>(td->eo);
  if (!o->conf) {
    throw std::runtime_error("missing conf option for ceph configuration file");
  }
  std::vector<const char*> args{
    "-i", "0", // identify as osd.0 for osd_data and osd_journal
    "--conf", o->conf, // use the requested conf file
  };
  if (td->o.directory) { // allow conf files to use ${fio_dir} for data
    args.emplace_back("--fio_dir");
    args.emplace_back(td->o.directory);
  }

  // claim the g_ceph_context reference and release it on destruction
  cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_OSD,
                    CODE_ENVIRONMENT_UTILITY, 0);
  common_init_finish(g_ceph_context);

  // create the ObjectStore
  os.reset(ObjectStore::create(g_ceph_context,
                               g_conf->osd_objectstore,
                               g_conf->osd_data,
                               g_conf->osd_journal));
  if (!os)
    throw std::runtime_error("bad objectstore type " + g_conf->osd_objectstore);

  // pick the shard count: explicit config wins, else choose by media type
  unsigned num_shards;
  if(g_conf->osd_op_num_shards)
    num_shards = g_conf->osd_op_num_shards;
  else if(os->is_rotational())
    num_shards = g_conf->osd_op_num_shards_hdd;
  else
    num_shards = g_conf->osd_op_num_shards_ssd;
  os->set_cache_shards(num_shards);

  // NOTE(review): mkfs() runs on every engine startup — presumably intended
  // to (re)initialize the data dir for each fio run; confirm for reruns
  int r = os->mkfs();
  if (r < 0)
    throw std::system_error(-r, std::system_category(), "mkfs failed");

  r = os->mount();
  if (r < 0)
    throw std::system_error(-r, std::system_category(), "mount failed");
}
151 | ||
Engine::~Engine()
{
  // every Job must have called deref() (which umounts on the last release)
  // before the process-wide singleton is destroyed
  assert(!ref_count);
}
156 | ||
157 | ||
/// one placement group / collection with its own Sequencer, so transactions
/// against different collections can be queued independently
struct Collection {
  spg_t pg;
  coll_t cid;
  ObjectStore::Sequencer sequencer;

  // use big pool ids to avoid clashing with existing collections
  static constexpr int64_t MIN_POOL_ID = 0x0000ffffffffffff;

  Collection(const spg_t& pg)
    : pg(pg), cid(pg), sequencer(stringify(pg)) {
    // hint the ObjectStore which shard should service this pg's ops
    sequencer.shard_hint = pg;
  }
};
171 | ||
/// an object backing one fio_file, placed in one of the job's collections
struct Object {
  ghobject_t oid;
  Collection& coll; // non-owning; the Job's collections outlive its objects

  Object(const char* name, Collection& coll)
    : oid(hobject_t(name, "", CEPH_NOSNAP, coll.pg.ps(), coll.pg.pool(), "")),
      coll(coll) {}
};
180 | ||
/// treat each fio job like a separate pool with its own collections and objects
struct Job {
  Engine* engine; //< shared ptr to the global Engine
  std::vector<Collection> collections; //< spread objects over collections
  std::vector<Object> objects; //< associate an object with each fio_file
  std::vector<io_u*> events; //< completions for fio_ceph_os_event()
  const bool unlink; //< unlink objects on destruction

  Job(Engine* engine, const thread_data* td);
  ~Job();
};
192 | ||
// set up this job's collections and objects in a single synchronous
// transaction; takes a reference on the shared Engine for the job's lifetime
Job::Job(Engine* engine, const thread_data* td)
  : engine(engine),
    events(td->o.iodepth),
    unlink(td->o.unlink)
{
  engine->ref();
  // use the fio thread_number for our unique pool id
  const uint64_t pool = Collection::MIN_POOL_ID + td->thread_number;

  // create a collection for each object, up to osd_pool_default_pg_num
  uint32_t count = g_conf->osd_pool_default_pg_num;
  if (count > td->o.nr_files)
    count = td->o.nr_files;

  assert(count > 0);
  // reserve up front: Objects hold references into this vector, so it must
  // never reallocate once we start emplacing
  collections.reserve(count);

  // enough split bits to cover 'count' pgs
  const int split_bits = cbits(count - 1);

  ObjectStore::Transaction t;
  for (uint32_t i = 0; i < count; i++) {
    auto pg = spg_t{pg_t{i, pool}};
    collections.emplace_back(pg);

    auto& cid = collections.back().cid;
    if (!engine->os->collection_exists(cid))
      t.create_collection(cid, split_bits);
  }

  // divide the job's total size evenly across its files
  const uint64_t file_size = td->o.size / max(1u, td->o.nr_files);

  // create an object for each file in the job
  for (uint32_t i = 0; i < td->o.nr_files; i++) {
    auto f = td->files[i];
    f->real_file_size = file_size;
    f->engine_pos = i; // later used as the index into 'objects' in queue()

    // associate each object with a collection in a round-robin fashion
    auto& coll = collections[i % collections.size()];

    objects.emplace_back(f->file_name, coll);
    auto& oid = objects.back().oid;

    // touch + truncate so the object exists at full size before any io
    t.touch(coll.cid, oid);
    t.truncate(coll.cid, oid, file_size);
  }

  // apply the entire transaction synchronously
  ObjectStore::Sequencer sequencer("job init");
  int r = engine->os->apply_transaction(&sequencer, std::move(t));
  if (r) {
    engine->deref(); // undo ref() so the Engine can still shut down cleanly
    throw std::system_error(r, std::system_category(), "job init");
  }
}
248 | ||
249 | Job::~Job() | |
250 | { | |
251 | if (unlink) { | |
252 | ObjectStore::Transaction t; | |
253 | // remove our objects | |
254 | for (auto& obj : objects) { | |
255 | t.remove(obj.coll.cid, obj.oid); | |
256 | } | |
257 | // remove our collections | |
258 | for (auto& coll : collections) { | |
259 | t.remove_collection(coll.cid); | |
260 | } | |
261 | ObjectStore::Sequencer sequencer("job cleanup"); | |
262 | int r = engine->os->apply_transaction(&sequencer, std::move(t)); | |
263 | if (r) | |
264 | derr << "job cleanup failed with " << cpp_strerror(-r) << dendl; | |
265 | } | |
266 | engine->deref(); | |
267 | } | |
268 | ||
269 | ||
270 | int fio_ceph_os_setup(thread_data* td) | |
271 | { | |
272 | // if there are multiple jobs, they must run in the same process against a | |
273 | // single instance of the ObjectStore. explicitly disable fio's default | |
274 | // job-per-process configuration | |
275 | td->o.use_thread = 1; | |
276 | ||
277 | try { | |
278 | // get or create the global Engine instance | |
279 | auto engine = Engine::get_instance(td); | |
280 | // create a Job for this thread | |
281 | td->io_ops_data = new Job(engine, td); | |
282 | } catch (std::exception& e) { | |
283 | std::cerr << "setup failed with " << e.what() << std::endl; | |
284 | return -1; | |
285 | } | |
286 | return 0; | |
287 | } | |
288 | ||
289 | void fio_ceph_os_cleanup(thread_data* td) | |
290 | { | |
291 | auto job = static_cast<Job*>(td->io_ops_data); | |
292 | td->io_ops_data = nullptr; | |
293 | delete job; | |
294 | } | |
295 | ||
296 | ||
297 | io_u* fio_ceph_os_event(thread_data* td, int event) | |
298 | { | |
299 | // return the requested event from fio_ceph_os_getevents() | |
300 | auto job = static_cast<Job*>(td->io_ops_data); | |
301 | return job->events[event]; | |
302 | } | |
303 | ||
// fio callback: poll inflight ios until at least 'min' have completed,
// stashing completed io_u pointers in job->events for fio_ceph_os_event().
// NOTE(review): 'max' and the timeout 't' are not honored here — more than
// 'max' completions can be reported in one call; confirm fio tolerates this
int fio_ceph_os_getevents(thread_data* td, unsigned int min,
                          unsigned int max, const timespec* t)
{
  auto job = static_cast<Job*>(td->io_ops_data);
  unsigned int events = 0;
  io_u* u;
  unsigned int i;

  // loop through inflight ios until we find 'min' completions
  do {
    io_u_qiter(&td->io_u_all, u, i) {
      if (!(u->flags & IO_U_F_FLIGHT))
        continue;

      // UnitComplete::finish() set engine_data non-null as a completion flag;
      // clear it so the io_u is reported only once
      if (u->engine_data) {
        u->engine_data = nullptr;
        job->events[events] = u;
        events++;
      }
    }
    if (events >= min)
      break;
    usleep(100); // short sleep to avoid a hard busy-wait
  } while (1);

  return events;
}
331 | ||
332 | /// completion context for ObjectStore::queue_transaction() | |
333 | class UnitComplete : public Context { | |
334 | io_u* u; | |
335 | public: | |
336 | UnitComplete(io_u* u) : u(u) {} | |
337 | void finish(int r) { | |
338 | // mark the pointer to indicate completion for fio_ceph_os_getevents() | |
339 | u->engine_data = reinterpret_cast<void*>(1ull); | |
340 | } | |
341 | }; | |
342 | ||
// fio callback: submit a single io_u. writes are queued asynchronously on
// the owning collection's sequencer (completed via UnitComplete); reads are
// serviced synchronously and returned as already completed
int fio_ceph_os_queue(thread_data* td, io_u* u)
{
  fio_ro_check(td, u);

  auto job = static_cast<Job*>(td->io_ops_data);
  // engine_pos was set to the file index in Job::Job
  auto& object = job->objects[u->file->engine_pos];
  auto& coll = object.coll;
  auto& os = job->engine->os;

  if (u->ddir == DDIR_WRITE) {
    // provide a hint if we're likely to read this data back
    const int flags = td_rw(td) ? CEPH_OSD_OP_FLAG_FADVISE_WILLNEED : 0;

    // copy fio's transfer buffer into a bufferlist for the ObjectStore API
    bufferlist bl;
    bl.push_back(buffer::copy(reinterpret_cast<char*>(u->xfer_buf),
                              u->xfer_buflen ) );

    // enqueue a write transaction on the collection's sequencer
    ObjectStore::Transaction t;
    t.write(coll.cid, object.oid, u->offset, u->xfer_buflen, bl, flags);
    os->queue_transaction(&coll.sequencer,
                          std::move(t),
                          nullptr,
                          new UnitComplete(u)); // flags completion for getevents()
    return FIO_Q_QUEUED;
  }

  if (u->ddir == DDIR_READ) {
    // ObjectStore reads are synchronous, so make the call and return COMPLETED
    bufferlist bl;
    int r = os->read(coll.cid, object.oid, u->offset, u->xfer_buflen, bl);
    if (r < 0) {
      u->error = r;
      td_verror(td, u->error, "xfer");
    } else {
      bl.copy(0, bl.length(), static_cast<char*>(u->xfer_buf));
      u->resid = u->xfer_buflen - r; // residual = requested minus bytes read
    }
    return FIO_Q_COMPLETED;
  }

  // any other direction (e.g. trim) is unsupported
  derr << "WARNING: Only DDIR_READ and DDIR_WRITE are supported!" << dendl;
  u->error = -EINVAL;
  td_verror(td, u->error, "xfer");
  return FIO_Q_COMPLETED;
}
389 | ||
// fio callback: batch-submission hook; deliberately a no-op for this engine
int fio_ceph_os_commit(thread_data* td)
{
  // commit() allows the engine to batch up queued requests to be submitted all
  // at once. it would be natural for queue() to collect transactions in a list,
  // and use commit() to pass them all to ObjectStore::queue_transactions(). but
  // because we spread objects over multiple collections, we a) need to use a
  // different sequencer for each collection, and b) are less likely to see a
  // benefit from batching requests within a collection
  return 0;
}
400 | ||
// open/close are noops. we set the FIO_DISKLESSIO flag in ioengine_ops to
// prevent fio from creating the files; objects were created in Job::Job
int fio_ceph_os_open(thread_data* td, fio_file* f) { return 0; }
int fio_ceph_os_close(thread_data* td, fio_file* f) { return 0; }
405 | ||
// fio callback: per-io_u initialization
int fio_ceph_os_io_u_init(thread_data* td, io_u* u)
{
  // no data is allocated, we just use the pointer as a boolean 'completed' flag
  u->engine_data = nullptr;
  return 0;
}
412 | ||
// fio callback: per-io_u teardown; nothing owned, just clear the flag
void fio_ceph_os_io_u_free(thread_data* td, io_u* u)
{
  u->engine_data = nullptr;
}
417 | ||
418 | ||
// ioengine_ops for get_ioengine(): wires this engine's callbacks and the
// option table into the struct fio expects
struct ceph_ioengine : public ioengine_ops {
  ceph_ioengine() : ioengine_ops({}) { // zero-init the base, then fill in hooks
    name        = "ceph-os";
    version     = FIO_IOOPS_VERSION;
    flags       = FIO_DISKLESSIO; // we manage "files" ourselves (see open/close)
    setup       = fio_ceph_os_setup;
    queue       = fio_ceph_os_queue;
    commit      = fio_ceph_os_commit;
    getevents   = fio_ceph_os_getevents;
    event       = fio_ceph_os_event;
    cleanup     = fio_ceph_os_cleanup;
    open_file   = fio_ceph_os_open;
    close_file  = fio_ceph_os_close;
    io_u_init   = fio_ceph_os_io_u_init;
    io_u_free   = fio_ceph_os_io_u_free;
    options     = ceph_options.data();
    option_struct_size = sizeof(struct Options);
  }
};
439 | ||
440 | } // anonymous namespace | |
441 | ||
extern "C" {
// the exported fio engine interface: fio dlopen()s this plugin and calls
// get_ioengine() to obtain the (function-local static) ops table
void get_ioengine(struct ioengine_ops** ioengine_ptr) {
  static ceph_ioengine ioengine;
  *ioengine_ptr = &ioengine;
}
} // extern "C"