// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph ObjectStore engine
 *
 * IO engine using Ceph's ObjectStore class to test low-level performance of
 * different ObjectStore implementations.
 */
#include <memory>
#include <system_error>
#include <vector>

#include "os/ObjectStore.h"
#include "global/global_init.h"
#include "common/errno.h"
#include "include/intarith.h"
#include "include/stringify.h"
#include "common/perf_counters.h"

#include "fio.h"
#include "optgroup.h"

#include "include/assert.h" // fio.h clobbers our assert.h

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_
32 /// fio configuration options read from the job file
38 template <class Func
> // void Func(fio_option&)
39 fio_option
make_option(Func
&& func
)
41 // zero-initialize and set common defaults
42 auto o
= fio_option
{};
43 o
.category
= FIO_OPT_C_ENGINE
;
44 o
.group
= FIO_OPT_G_RBD
;
49 static std::vector
<fio_option
> ceph_options
{
50 make_option([] (fio_option
& o
) {
52 o
.lname
= "ceph configuration file";
53 o
.type
= FIO_OPT_STR_STORE
;
54 o
.help
= "Path to a ceph configuration file";
55 o
.off1
= offsetof(Options
, conf
);
57 {} // fio expects a 'null'-terminated list
61 /// global engine state shared between all jobs within the process. this
62 /// includes g_ceph_context and the ObjectStore instance
64 /// the initial g_ceph_context reference to be dropped on destruction
65 boost::intrusive_ptr
<CephContext
> cct
;
66 std::unique_ptr
<ObjectStore
> os
;
71 Engine(const thread_data
* td
);
74 static Engine
* get_instance(thread_data
* td
) {
75 // note: creates an Engine with the options associated with the first job
76 static Engine
engine(td
);
81 std::lock_guard
<std::mutex
> l(lock
);
85 std::lock_guard
<std::mutex
> l(lock
);
89 Formatter
* f
= Formatter::create("json-pretty", "json-pretty", "json-pretty");
90 cct
->get_perfcounters_collection()->dump_formatted(f
, false);
91 ostr
<< "FIO plugin ";
93 if (g_conf
->rocksdb_perf
) {
94 os
->get_db_statistics(f
);
95 ostr
<< "FIO get_db_statistics ";
100 dout(0) << ostr
.str() << dendl
;
105 Engine::Engine(const thread_data
* td
) : ref_count(0)
107 // add the ceph command line arguments
108 auto o
= static_cast<const Options
*>(td
->eo
);
110 throw std::runtime_error("missing conf option for ceph configuration file");
112 std::vector
<const char*> args
{
113 "-i", "0", // identify as osd.0 for osd_data and osd_journal
114 "--conf", o
->conf
, // use the requested conf file
116 if (td
->o
.directory
) { // allow conf files to use ${fio_dir} for data
117 args
.emplace_back("--fio_dir");
118 args
.emplace_back(td
->o
.directory
);
121 // claim the g_ceph_context reference and release it on destruction
122 cct
= global_init(nullptr, args
, CEPH_ENTITY_TYPE_OSD
,
123 CODE_ENVIRONMENT_UTILITY
, 0);
124 common_init_finish(g_ceph_context
);
126 // create the ObjectStore
127 os
.reset(ObjectStore::create(g_ceph_context
,
128 g_conf
->osd_objectstore
,
130 g_conf
->osd_journal
));
132 throw std::runtime_error("bad objectstore type " + g_conf
->osd_objectstore
);
135 if(g_conf
->osd_op_num_shards
)
136 num_shards
= g_conf
->osd_op_num_shards
;
137 else if(os
->is_rotational())
138 num_shards
= g_conf
->osd_op_num_shards_hdd
;
140 num_shards
= g_conf
->osd_op_num_shards_ssd
;
141 os
->set_cache_shards(num_shards
);
145 throw std::system_error(-r
, std::system_category(), "mkfs failed");
149 throw std::system_error(-r
, std::system_category(), "mount failed");
161 ObjectStore::Sequencer sequencer
;
163 // use big pool ids to avoid clashing with existing collections
164 static constexpr int64_t MIN_POOL_ID
= 0x0000ffffffffffff;
166 Collection(const spg_t
& pg
)
167 : pg(pg
), cid(pg
), sequencer(stringify(pg
)) {
168 sequencer
.shard_hint
= pg
;
176 Object(const char* name
, Collection
& coll
)
177 : oid(hobject_t(name
, "", CEPH_NOSNAP
, coll
.pg
.ps(), coll
.pg
.pool(), "")),
181 /// treat each fio job like a separate pool with its own collections and objects
183 Engine
* engine
; //< shared ptr to the global Engine
184 std::vector
<Collection
> collections
; //< spread objects over collections
185 std::vector
<Object
> objects
; //< associate an object with each fio_file
186 std::vector
<io_u
*> events
; //< completions for fio_ceph_os_event()
187 const bool unlink
; //< unlink objects on destruction
189 Job(Engine
* engine
, const thread_data
* td
);
193 Job::Job(Engine
* engine
, const thread_data
* td
)
195 events(td
->o
.iodepth
),
199 // use the fio thread_number for our unique pool id
200 const uint64_t pool
= Collection::MIN_POOL_ID
+ td
->thread_number
;
202 // create a collection for each object, up to osd_pool_default_pg_num
203 uint32_t count
= g_conf
->osd_pool_default_pg_num
;
204 if (count
> td
->o
.nr_files
)
205 count
= td
->o
.nr_files
;
208 collections
.reserve(count
);
210 const int split_bits
= cbits(count
- 1);
212 ObjectStore::Transaction t
;
213 for (uint32_t i
= 0; i
< count
; i
++) {
214 auto pg
= spg_t
{pg_t
{i
, pool
}};
215 collections
.emplace_back(pg
);
217 auto& cid
= collections
.back().cid
;
218 if (!engine
->os
->collection_exists(cid
))
219 t
.create_collection(cid
, split_bits
);
222 const uint64_t file_size
= td
->o
.size
/ max(1u, td
->o
.nr_files
);
224 // create an object for each file in the job
225 for (uint32_t i
= 0; i
< td
->o
.nr_files
; i
++) {
226 auto f
= td
->files
[i
];
227 f
->real_file_size
= file_size
;
230 // associate each object with a collection in a round-robin fashion
231 auto& coll
= collections
[i
% collections
.size()];
233 objects
.emplace_back(f
->file_name
, coll
);
234 auto& oid
= objects
.back().oid
;
236 t
.touch(coll
.cid
, oid
);
237 t
.truncate(coll
.cid
, oid
, file_size
);
240 // apply the entire transaction synchronously
241 ObjectStore::Sequencer
sequencer("job init");
242 int r
= engine
->os
->apply_transaction(&sequencer
, std::move(t
));
245 throw std::system_error(r
, std::system_category(), "job init");
252 ObjectStore::Transaction t
;
253 // remove our objects
254 for (auto& obj
: objects
) {
255 t
.remove(obj
.coll
.cid
, obj
.oid
);
257 // remove our collections
258 for (auto& coll
: collections
) {
259 t
.remove_collection(coll
.cid
);
261 ObjectStore::Sequencer
sequencer("job cleanup");
262 int r
= engine
->os
->apply_transaction(&sequencer
, std::move(t
));
264 derr
<< "job cleanup failed with " << cpp_strerror(-r
) << dendl
;
270 int fio_ceph_os_setup(thread_data
* td
)
272 // if there are multiple jobs, they must run in the same process against a
273 // single instance of the ObjectStore. explicitly disable fio's default
274 // job-per-process configuration
275 td
->o
.use_thread
= 1;
278 // get or create the global Engine instance
279 auto engine
= Engine::get_instance(td
);
280 // create a Job for this thread
281 td
->io_ops_data
= new Job(engine
, td
);
282 } catch (std::exception
& e
) {
283 std::cerr
<< "setup failed with " << e
.what() << std::endl
;
289 void fio_ceph_os_cleanup(thread_data
* td
)
291 auto job
= static_cast<Job
*>(td
->io_ops_data
);
292 td
->io_ops_data
= nullptr;
297 io_u
* fio_ceph_os_event(thread_data
* td
, int event
)
299 // return the requested event from fio_ceph_os_getevents()
300 auto job
= static_cast<Job
*>(td
->io_ops_data
);
301 return job
->events
[event
];
304 int fio_ceph_os_getevents(thread_data
* td
, unsigned int min
,
305 unsigned int max
, const timespec
* t
)
307 auto job
= static_cast<Job
*>(td
->io_ops_data
);
308 unsigned int events
= 0;
312 // loop through inflight ios until we find 'min' completions
314 io_u_qiter(&td
->io_u_all
, u
, i
) {
315 if (!(u
->flags
& IO_U_F_FLIGHT
))
318 if (u
->engine_data
) {
319 u
->engine_data
= nullptr;
320 job
->events
[events
] = u
;
332 /// completion context for ObjectStore::queue_transaction()
333 class UnitComplete
: public Context
{
336 UnitComplete(io_u
* u
) : u(u
) {}
338 // mark the pointer to indicate completion for fio_ceph_os_getevents()
339 u
->engine_data
= reinterpret_cast<void*>(1ull);
343 int fio_ceph_os_queue(thread_data
* td
, io_u
* u
)
347 auto job
= static_cast<Job
*>(td
->io_ops_data
);
348 auto& object
= job
->objects
[u
->file
->engine_pos
];
349 auto& coll
= object
.coll
;
350 auto& os
= job
->engine
->os
;
352 if (u
->ddir
== DDIR_WRITE
) {
353 // provide a hint if we're likely to read this data back
354 const int flags
= td_rw(td
) ? CEPH_OSD_OP_FLAG_FADVISE_WILLNEED
: 0;
357 bl
.push_back(buffer::copy(reinterpret_cast<char*>(u
->xfer_buf
),
360 // enqueue a write transaction on the collection's sequencer
361 ObjectStore::Transaction t
;
362 t
.write(coll
.cid
, object
.oid
, u
->offset
, u
->xfer_buflen
, bl
, flags
);
363 os
->queue_transaction(&coll
.sequencer
,
366 new UnitComplete(u
));
370 if (u
->ddir
== DDIR_READ
) {
371 // ObjectStore reads are synchronous, so make the call and return COMPLETED
373 int r
= os
->read(coll
.cid
, object
.oid
, u
->offset
, u
->xfer_buflen
, bl
);
376 td_verror(td
, u
->error
, "xfer");
378 bl
.copy(0, bl
.length(), static_cast<char*>(u
->xfer_buf
));
379 u
->resid
= u
->xfer_buflen
- r
;
381 return FIO_Q_COMPLETED
;
384 derr
<< "WARNING: Only DDIR_READ and DDIR_WRITE are supported!" << dendl
;
386 td_verror(td
, u
->error
, "xfer");
387 return FIO_Q_COMPLETED
;
390 int fio_ceph_os_commit(thread_data
* td
)
392 // commit() allows the engine to batch up queued requests to be submitted all
393 // at once. it would be natural for queue() to collect transactions in a list,
394 // and use commit() to pass them all to ObjectStore::queue_transactions(). but
395 // because we spread objects over multiple collections, we a) need to use a
396 // different sequencer for each collection, and b) are less likely to see a
397 // benefit from batching requests within a collection
401 // open/close are noops. we set the FIO_DISKLESSIO flag in ioengine_ops to
402 // prevent fio from creating the files
403 int fio_ceph_os_open(thread_data
* td
, fio_file
* f
) { return 0; }
404 int fio_ceph_os_close(thread_data
* td
, fio_file
* f
) { return 0; }
406 int fio_ceph_os_io_u_init(thread_data
* td
, io_u
* u
)
408 // no data is allocated, we just use the pointer as a boolean 'completed' flag
409 u
->engine_data
= nullptr;
413 void fio_ceph_os_io_u_free(thread_data
* td
, io_u
* u
)
415 u
->engine_data
= nullptr;
419 // ioengine_ops for get_ioengine()
420 struct ceph_ioengine
: public ioengine_ops
{
421 ceph_ioengine() : ioengine_ops({}) {
423 version
= FIO_IOOPS_VERSION
;
424 flags
= FIO_DISKLESSIO
;
425 setup
= fio_ceph_os_setup
;
426 queue
= fio_ceph_os_queue
;
427 commit
= fio_ceph_os_commit
;
428 getevents
= fio_ceph_os_getevents
;
429 event
= fio_ceph_os_event
;
430 cleanup
= fio_ceph_os_cleanup
;
431 open_file
= fio_ceph_os_open
;
432 close_file
= fio_ceph_os_close
;
433 io_u_init
= fio_ceph_os_io_u_init
;
434 io_u_free
= fio_ceph_os_io_u_free
;
435 options
= ceph_options
.data();
436 option_struct_size
= sizeof(struct Options
);
440 } // anonymous namespace
443 // the exported fio engine interface
444 void get_ioengine(struct ioengine_ops
** ioengine_ptr
) {
445 static ceph_ioengine ioengine
;
446 *ioengine_ptr
= &ioengine
;