]>
git.proxmox.com Git - ceph.git/blob - ceph/src/os/ObjectStore.h
44d67c26e88f9a48b58434eee2d82eabbd472aee
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
14 #ifndef CEPH_OBJECTSTORE_H
15 #define CEPH_OBJECTSTORE_H
17 #include "include/buffer.h"
18 #include "include/common_fwd.h"
19 #include "include/Context.h"
20 #include "include/interval_set.h"
21 #include "include/stringify.h"
22 #include "include/types.h"
24 #include "osd/osd_types.h"
25 #include "common/TrackedOp.h"
26 #include "common/WorkQueue.h"
27 #include "ObjectMap.h"
28 #include "os/Transaction.h"
36 #if defined(__APPLE__) || defined(__FreeBSD__) || defined(__sun) || defined(_WIN32)
37 #include <sys/statvfs.h>
39 #include <sys/vfs.h> /* or <sys/statfs.h> */
47 * low-level interface to the local OSD file system
53 static inline void encode(const std::map
<std::string
,ceph::buffer::ptr
> *attrset
, ceph::buffer::list
&bl
) {
/// Bit-flag type; used as the type of the `flags` argument of ObjectStore::create().
using osflagbits_t = uint32_t;
/// Flag bit 0 (value 1). NOTE(review): presumably requests that journal replay be skipped -- confirm against the backends.
inline constexpr int SKIP_JOURNAL_REPLAY = 1 << 0;
/// Flag bit 1 (value 2). NOTE(review): presumably requests that omap mounting be skipped -- confirm against the backends.
inline constexpr int SKIP_MOUNT_OMAP = 1 << 1;
68 using Transaction
= ceph::os::Transaction
;
72 * create - create an ObjectStore instance.
74 * This is invoked once at initialization time.
76 * @param type type of store. This is a std::string from the configuration file.
77 * @param data path (or other descriptor) for data
78 * @param journal path (or other descriptor) for journal (optional)
79 * @param flags which filestores should check if applicable
82 static std::unique_ptr
<ObjectStore
> create(
84 const std::string
& type
,
85 const std::string
& data
,
86 const std::string
& journal
,
87 osflagbits_t flags
= 0);
89 static std::unique_ptr
<ObjectStore
> create(
91 const std::string
& type
,
92 const std::string
& data
);
95 * probe a block device to learn the uuid of the owning OSD
98 * @param path path to device
99 * @param fsid [out] osd uuid
101 static int probe_block_device_fsid(
103 const std::string
& path
,
107 * Fetch Object Store statistics.
109 * Currently only latency of write and apply times are measured.
111 * This appears to be called with nothing locked.
113 virtual objectstore_perf_stat_t
get_cur_stats() = 0;
116 * Fetch Object Store performance counters.
119 * This appears to be called with nothing locked.
121 virtual const PerfCounters
* get_perf_counters() const = 0;
124 * a collection also orders transactions
126 * Any transactions queued under a given collection will be applied in
127 * sequence. Transactions queued under different collections may run
130 * ObjectStore users may get collection handles with open_collection() (or,
131 * for bootstrapping a new collection, create_new_collection()).
133 struct CollectionImpl
: public RefCountedObject
{
136 /// wait for any queued transactions to apply
137 // block until any previous transactions are visible. specifically,
138 // collection_list and collection_empty need to reflect prior operations.
139 virtual void flush() = 0;
144 * There are two cases:
145 * 1) collection is currently idle: the method returns true. c is
147 * 2) collection is not idle: the method returns false and c is
148 * called asynchronously with a value of 0 once all transactions
149 * queued on this collection prior to the call have been applied
152 virtual bool flush_commit(Context
*c
) = 0;
154 const coll_t
&get_cid() {
158 CollectionImpl() = delete;
159 CollectionImpl(CephContext
* cct
, const coll_t
& c
) : RefCountedObject(cct
), cid(c
) {}
160 ~CollectionImpl() = default;
162 using CollectionHandle
= ceph::ref_t
<CollectionImpl
>;
165 /*********************************
167 * Object Contents and semantics
169 * All ObjectStore objects are identified as a named object
170 * (ghobject_t and hobject_t) in a named collection (coll_t).
171 * ObjectStore operations support the creation, mutation, deletion
172 * and enumeration of objects within a collection. Enumeration is
173 * in sorted key order (where keys are sorted by hash). Object names
174 * are globally unique.
176 * Each object has four distinct parts: byte data, xattrs, omap_header
179 * The data portion of an object is conceptually equivalent to a
180 * file in a file system. Random and Partial access for both read
181 * and write operations is required. The ability to have a sparse
182 * implementation of the data portion of an object is beneficial for
183 * some workloads, but not required. There is a system-wide limit on
184 * the maximum size of an object, which is typically around 100 MB.
186 * Xattrs are equivalent to the extended attributes of file
187 * systems. Xattrs are a set of key/value pairs. Sub-value access
188 * is not required. It is possible to enumerate the set of xattrs in
189 * key order. At the implementation level, xattrs are used
190 * exclusively internal to Ceph and the implementer can expect the
191 * total size of all of the xattrs on an object to be relatively
192 * small, i.e., less than 64KB. Much of Ceph assumes that accessing
193 * xattrs on temporally adjacent object accesses (recent past or
194 * near future) is inexpensive.
196 * omap_header is a single blob of data. It can be read or written
199 * Omap entries are conceptually the same as xattrs
200 * but in a different address space. In other words, you can have
201 * the same key as an xattr and an omap entry and they have distinct
202 * values. Enumeration of xattrs doesn't include omap entries and
203 * vice versa. The size and access characteristics of omap entries
204 * are very different from xattrs. In particular, the value portion
205 * of an omap entry can be quite large (MBs). More importantly, the
206 * interface must support efficient range queries on omap entries even
207 * when there are a large number of entries.
209 *********************************/
211 /*******************************
215 * A collection is simply a grouping of objects. Collections have
216 * names (coll_t) and can be enumerated in order. Like an
217 * individual object, a collection also has a set of xattrs.
223 int queue_transaction(CollectionHandle
& ch
,
225 TrackedOpRef op
= TrackedOpRef(),
226 ThreadPool::TPHandle
*handle
= NULL
) {
227 std::vector
<Transaction
> tls
;
228 tls
.push_back(std::move(t
));
229 return queue_transactions(ch
, tls
, op
, handle
);
232 virtual int queue_transactions(
233 CollectionHandle
& ch
, std::vector
<Transaction
>& tls
,
234 TrackedOpRef op
= TrackedOpRef(),
235 ThreadPool::TPHandle
*handle
= NULL
) = 0;
239 ObjectStore(CephContext
* cct
,
240 const std::string
& path_
) : path(path_
), cct(cct
) {}
241 virtual ~ObjectStore() {}
244 explicit ObjectStore(const ObjectStore
& o
) = delete;
245 const ObjectStore
& operator=(const ObjectStore
& o
) = delete;
248 virtual int upgrade() {
252 virtual void get_db_statistics(ceph::Formatter
*f
) { }
253 virtual void generate_db_histogram(ceph::Formatter
*f
) { }
254 virtual int flush_cache(std::ostream
*os
= NULL
) { return -1; }
255 virtual void dump_perf_counters(ceph::Formatter
*f
) {}
256 virtual void dump_cache_stats(ceph::Formatter
*f
) {}
257 virtual void dump_cache_stats(std::ostream
& os
) {}
259 virtual std::string
get_type() = 0;
262 virtual bool test_mount_in_use() = 0;
263 virtual int mount() = 0;
264 virtual int umount() = 0;
265 virtual int fsck(bool deep
) {
268 virtual int repair(bool deep
) {
271 virtual int quick_fix() {
275 virtual void set_cache_shards(unsigned num
) { }
278 * Returns 0 if the hobject is valid, -error otherwise
281 * -ENAMETOOLONG: locator/namespace/name too large
283 virtual int validate_hobject_key(const hobject_t
&obj
) const = 0;
285 virtual unsigned get_max_attr_name_length() = 0;
286 virtual int mkfs() = 0; // wipe
287 virtual int mkjournal() = 0; // journal only
288 virtual bool needs_journal() = 0; //< requires a journal
289 virtual bool wants_journal() = 0; //< prefers a journal
290 virtual bool allows_journal() = 0; //< allows a journal
// Base-class default: empty body, so nothing is done here.
291 virtual void prepare_for_fast_shutdown() {}
// Base-class default: always reports false.
292 virtual bool has_null_manager() { return false; }
293 // return store min allocation size, if applicable
294 virtual uint64_t get_min_alloc_size() const {
298 /// enumerate hardware devices (by 'devname', e.g., 'sda' as in /sys/block/sda)
299 virtual int get_devices(std::set
<std::string
> *devls
) {
303 /// true if a txn is readable immediately after it is queued.
304 virtual bool is_sync_onreadable() const {
311 * Check whether store is backed by a rotational (HDD) or non-rotational
314 * This must be usable *before* the store is mounted.
316 * @return true for HDD, false for SSD
318 virtual bool is_rotational() {
323 * is_journal_rotational
325 * Check whether journal is backed by a rotational (HDD) or non-rotational
329 * @return true for HDD, false for SSD
331 virtual bool is_journal_rotational() {
335 virtual std::string
get_default_device_class() {
336 return is_rotational() ? "hdd" : "ssd";
339 virtual int get_numa_node(
341 std::set
<int> *nodes
,
342 std::set
<std::string
> *failed
) {
347 virtual bool can_sort_nibblewise() {
348 return false; // assume a backend cannot, unless it says otherwise
351 virtual int statfs(struct store_statfs_t
*buf
,
352 osd_alert_list_t
* alerts
= nullptr) = 0;
353 virtual int pool_statfs(uint64_t pool_id
, struct store_statfs_t
*buf
,
354 bool *per_pool_omap
) = 0;
356 virtual void collect_metadata(std::map
<std::string
,std::string
> *pm
) { }
359 * write_meta - write a simple configuration key out-of-band
361 * Write a simple key/value pair for basic store configuration
362 * (e.g., a uuid or magic number) to an unopened/unmounted store.
363 * The default implementation writes this to a plaintext file in the
366 * A newline is appended.
368 * @param key key name (e.g., "fsid")
369 * @param value value (e.g., a uuid rendered as a std::string)
370 * @returns 0 for success, or an error code
372 virtual int write_meta(const std::string
& key
,
373 const std::string
& value
);
376 * read_meta - read a simple configuration key out-of-band
378 * Read a simple key value from an unopened/mounted store.
380 * Trailing whitespace is stripped off.
382 * @param key key name
383 * @param value pointer to value std::string
384 * @returns 0 for success, or an error code
386 virtual int read_meta(const std::string
& key
,
390 * get ideal max value for collection_list()
392 * default to some arbitrary values; the implementation will override.
394 virtual int get_ideal_list_max() { return 64; }
398 * get a collection handle
400 * Provide a trivial handle as a default to avoid converting legacy
403 virtual CollectionHandle
open_collection(const coll_t
&cid
) = 0;
406 * get a collection handle for a soon-to-be-created collection
408 * This handle must be used by queue_transaction that includes a
409 * create_collection call in order to become valid. It will become the
410 * reference to the created collection.
412 virtual CollectionHandle
create_new_collection(const coll_t
&cid
) = 0;
415 * set ContextQueue for a collection
417 * After that, oncommits of Transaction will queue into commit_queue.
418 * And osd ShardThread will call oncommits.
420 virtual void set_collection_commit_queue(const coll_t
&cid
, ContextQueue
*commit_queue
) = 0;
423 * Synchronous read operations
427 * exists -- Test for existence of object
429 * @param cid collection for object
430 * @param oid oid of object
431 * @returns true if object exists, false otherwise
433 virtual bool exists(CollectionHandle
& c
, const ghobject_t
& oid
) = 0;
435 * set_collection_opts -- set pool options for a collection
437 * @param cid collection
438 * @param opts new collection options
439 * @returns 0 on success, negative error code on failure.
441 virtual int set_collection_opts(
443 const pool_opts_t
& opts
) = 0;
446 * stat -- get information for an object
448 * @param cid collection for object
449 * @param oid oid of object
450 * @param st output information for the object
451 * @param allow_eio if false, assert on -EIO operation failure
452 * @returns 0 on success, negative error code on failure.
456 const ghobject_t
& oid
,
458 bool allow_eio
= false) = 0;
460 * read -- read a byte range of data from an object
462 * Note: if reading from an offset past the end of the object, we
463 * return 0 (not, say, -EINVAL).
465 * @param cid collection for object
466 * @param oid oid of object
467 * @param offset location offset of first byte to be read
468 * @param len number of bytes to be read
469 * @param bl output ceph::buffer::list
470 * @param op_flags is CEPH_OSD_OP_FLAG_*
471 * @returns number of bytes read on success, or negative error code on failure.
475 const ghobject_t
& oid
,
478 ceph::buffer::list
& bl
,
479 uint32_t op_flags
= 0) = 0;
482 * fiemap -- get extent map of data of an object
484 * Returns an encoded std::map of the extents of an object's data portion
485 * (std::map<offset,size>).
487 * A non-enlightened implementation is free to return the extent (offset, len)
488 * as the sole extent.
490 * @param cid collection for object
491 * @param oid oid of object
492 * @param offset location offset of first byte to be read
493 * @param len number of bytes to be read
494 * @param bl output ceph::buffer::list for extent map information.
495 * @returns 0 on success, negative error code on failure.
497 virtual int fiemap(CollectionHandle
& c
, const ghobject_t
& oid
,
498 uint64_t offset
, size_t len
, ceph::buffer::list
& bl
) = 0;
499 virtual int fiemap(CollectionHandle
& c
, const ghobject_t
& oid
,
500 uint64_t offset
, size_t len
, std::map
<uint64_t, uint64_t>& destmap
) = 0;
503 * readv -- read specific intervals from an object;
504 * caller must call fiemap to fill in the extent-map first.
506 * Note: if reading from an offset past the end of the object, we
507 * return 0 (not, say, -EINVAL). Also the default version of readv
508 * reads each extent separately synchronously, which can become horribly
509 * inefficient if the physical layout of the pushing object gets massively
510 * fragmented and hence should be overridden by any real os that
511 * cares about the performance.
513 * @param cid collection for object
514 * @param oid oid of object
515 * @param m intervals to be read
516 * @param bl output ceph::buffer::list
517 * @param op_flags is CEPH_OSD_OP_FLAG_*
518 * @returns number of bytes read on success, or negative error code on failure.
522 const ghobject_t
& oid
,
523 interval_set
<uint64_t>& m
,
524 ceph::buffer::list
& bl
,
525 uint32_t op_flags
= 0) {
527 for (auto p
= m
.begin(); p
!= m
.end(); p
++) {
528 ceph::buffer::list t
;
529 int r
= read(c
, oid
, p
.get_start(), p
.get_len(), t
, op_flags
);
533 // prune fiemap, if necessary
534 if (p
.get_len() != t
.length()) {
536 if (t
.length() == 0) {
537 m
.erase(save
); // Remove this empty interval
539 save
.set_len(t
.length()); // fix interval length
542 // Remove any other follow-up intervals present too
543 while (p
!= m
.end()) {
555 * dump_onode -- dumps onode metadata in human readable form,
556 intended primarily for debugging
558 * @param cid collection for object
559 * @param oid oid of object
560 * @param section_name section name to create and print under
561 * @param f Formatter class instance to print to
562 * @returns 0 on success, negative error code on failure.
564 virtual int dump_onode(
566 const ghobject_t
& oid
,
567 const std::string
& section_name
,
568 ceph::Formatter
*f
) {
573 * getattr -- get an xattr of an object
575 * @param cid collection for object
576 * @param oid oid of object
577 * @param name name of attr to read
578 * @param value place to put output result.
579 * @returns 0 on success, negative error code on failure.
581 virtual int getattr(CollectionHandle
&c
, const ghobject_t
& oid
,
582 const char *name
, ceph::buffer::ptr
& value
) = 0;
585 * getattr -- get an xattr of an object
587 * @param cid collection for object
588 * @param oid oid of object
589 * @param name name of attr to read
590 * @param value place to put output result.
591 * @returns 0 on success, negative error code on failure.
594 CollectionHandle
&c
, const ghobject_t
& oid
,
595 const std::string
& name
, ceph::buffer::list
& value
) {
596 ceph::buffer::ptr bp
;
597 int r
= getattr(c
, oid
, name
.c_str(), bp
);
603 * getattrs -- get all of the xattrs of an object
605 * @param cid collection for object
606 * @param oid oid of object
607 * @param aset place to put output result.
608 * @returns 0 on success, negative error code on failure.
610 virtual int getattrs(CollectionHandle
&c
, const ghobject_t
& oid
,
611 std::map
<std::string
,ceph::buffer::ptr
, std::less
<>>& aset
) = 0;
614 * getattrs -- get all of the xattrs of an object
616 * @param cid collection for object
617 * @param oid oid of object
618 * @param aset place to put output result.
619 * @returns 0 on success, negative error code on failure.
621 int getattrs(CollectionHandle
&c
, const ghobject_t
& oid
,
622 std::map
<std::string
,ceph::buffer::list
,std::less
<>>& aset
) {
623 std::map
<std::string
,ceph::buffer::ptr
,std::less
<>> bmap
;
624 int r
= getattrs(c
, oid
, bmap
);
625 for (auto i
= bmap
.begin(); i
!= bmap
.end(); ++i
) {
626 aset
[i
->first
].append(i
->second
);
635 * list_collections -- get all of the collections known to this ObjectStore
637 * @param ls list of the collections in sorted order.
638 * @returns 0 on success, negative error code on failure.
640 virtual int list_collections(std::vector
<coll_t
>& ls
) = 0;
643 * does a collection exist?
645 * @param c collection
646 * @returns true if it exists, false otherwise
648 virtual bool collection_exists(const coll_t
& c
) = 0;
651 * is a collection empty?
653 * @param c collection
654 * @param empty true if the specified collection is empty, false otherwise
655 * @returns 0 on success, negative error code on failure.
657 virtual int collection_empty(CollectionHandle
& c
, bool *empty
) = 0;
660 * return the number of significant bits of the coll_t::pgid.
662 * This should return what the last create_collection or split_collection
663 * set. A legacy backend may return -EAGAIN if the value is unavailable
664 * (because we upgraded from an older version, e.g., FileStore).
666 virtual int collection_bits(CollectionHandle
& c
) = 0;
670 * list contents of a collection that fall in the range [start, end), returning no more than a specified number of results
672 * @param c collection
673 * @param start list object that sort >= this value
674 * @param end list objects that sort < this value
675 * @param max return no more than this many results
676 * @param seq return no objects with snap < seq
677 * @param ls [out] result
678 * @param next [out] next item sorts >= this value
679 * @return zero on success, or negative error
681 virtual int collection_list(CollectionHandle
&c
,
682 const ghobject_t
& start
, const ghobject_t
& end
,
684 std::vector
<ghobject_t
> *ls
, ghobject_t
*next
) = 0;
686 virtual int collection_list_legacy(CollectionHandle
&c
,
687 const ghobject_t
& start
,
688 const ghobject_t
& end
, int max
,
689 std::vector
<ghobject_t
> *ls
,
691 return collection_list(c
, start
, end
, max
, ls
, next
);
695 /// Get omap contents
696 virtual int omap_get(
697 CollectionHandle
&c
, ///< [in] Collection containing oid
698 const ghobject_t
&oid
, ///< [in] Object containing omap
699 ceph::buffer::list
*header
, ///< [out] omap header
700 std::map
<std::string
, ceph::buffer::list
> *out
/// < [out] Key to value std::map
704 virtual int omap_get_header(
705 CollectionHandle
&c
, ///< [in] Collection containing oid
706 const ghobject_t
&oid
, ///< [in] Object containing omap
707 ceph::buffer::list
*header
, ///< [out] omap header
708 bool allow_eio
= false ///< [in] don't assert on eio
711 /// Get keys defined on oid
712 virtual int omap_get_keys(
713 CollectionHandle
&c
, ///< [in] Collection containing oid
714 const ghobject_t
&oid
, ///< [in] Object containing omap
715 std::set
<std::string
> *keys
///< [out] Keys defined on oid
719 virtual int omap_get_values(
720 CollectionHandle
&c
, ///< [in] Collection containing oid
721 const ghobject_t
&oid
, ///< [in] Object containing omap
722 const std::set
<std::string
> &keys
, ///< [in] Keys to get
723 std::map
<std::string
, ceph::buffer::list
> *out
///< [out] Returned keys and values
727 virtual int omap_get_values(
728 CollectionHandle
&c
, ///< [in] Collection containing oid
729 const ghobject_t
&oid
, ///< [in] Object containing omap
730 const std::optional
<std::string
> &start_after
, ///< [in] Keys to get
731 std::map
<std::string
, ceph::buffer::list
> *out
///< [out] Returned keys and values
735 /// Filters keys into out which are defined on oid
736 virtual int omap_check_keys(
737 CollectionHandle
&c
, ///< [in] Collection containing oid
738 const ghobject_t
&oid
, ///< [in] Object containing omap
739 const std::set
<std::string
> &keys
, ///< [in] Keys to check
740 std::set
<std::string
> *out
///< [out] Subset of keys defined on oid
744 * Returns an object map iterator
746 * Warning! The returned iterator is an implicit lock on filestore
747 * operations in c. Do not use filestore methods on c while the returned
748 * iterator is live. (Filling in a transaction is no problem).
750 * @return iterator, null on error
752 virtual ObjectMap::ObjectMapIterator
get_omap_iterator(
753 CollectionHandle
&c
, ///< [in] collection
754 const ghobject_t
&oid
///< [in] object
// Base-class default: does no work and returns -EOPNOTSUPP.
757 virtual int flush_journal() { return -EOPNOTSUPP
; }
// Base-class default: writes nothing to `out` and returns -EOPNOTSUPP.
759 virtual int dump_journal(std::ostream
& out
) { return -EOPNOTSUPP
; }
// Base-class default: ignores `name` and returns -EOPNOTSUPP.
761 virtual int snapshot(const std::string
& name
) { return -EOPNOTSUPP
; }
764 * Set and get internal fsid for this instance. No external data is modified
766 virtual void set_fsid(uuid_d u
) = 0;
767 virtual uuid_d
get_fsid() = 0;
770 * Estimates additional disk space used by the specified amount of objects and caused by file allocation granularity and metadata store
771 * - num objects - total (including whiteouts) object count to measure used space for.
773 virtual uint64_t estimate_objects_overhead(uint64_t num_objects
) = 0;
// Base-class default: empty body; `oid` is ignored.
777 virtual void inject_data_error(const ghobject_t
&oid
) {}
// Base-class default: empty body; `oid` is ignored.
778 virtual void inject_mdata_error(const ghobject_t
&oid
) {}
// Base-class default: empty body, so nothing is done here.
780 virtual void compact() {}
781 virtual bool has_builtin_csum() const {