1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_MDSTYPES_H
4 #define CEPH_MDSTYPES_H
6 #include "include/int_types.h"
13 #include "common/config.h"
14 #include "common/Clock.h"
15 #include "common/DecayCounter.h"
16 #include "common/entity_name.h"
18 #include "include/Context.h"
19 #include "include/frag.h"
20 #include "include/xlist.h"
21 #include "include/interval_set.h"
22 #include "include/compact_map.h"
23 #include "include/compact_set.h"
24 #include "include/fs_types.h"
26 #include "inode_backtrace.h"
28 #include <boost/spirit/include/qi.hpp>
29 #include <boost/pool/pool.hpp>
30 #include "include/assert.h"
31 #include <boost/serialization/strong_typedef.hpp>
33 #define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"
35 #define MDS_PORT_CACHE 0x200
36 #define MDS_PORT_LOCKER 0x300
37 #define MDS_PORT_MIGRATOR 0x400
// ---------------------------------------------------------------------------
// Reserved inode numbers.
//
// The per-rank ranges below start at multiples of MAX_MDS.  The stray range
// holds NUM_STRAY entries per rank (see MDS_INO_STRAY), so it spans
// MAX_MDS * NUM_STRAY values.
// ---------------------------------------------------------------------------
#define MDS_INO_ROOT              1

// No longer created but recognised in existing filesystems
// so that we don't try to fragment it.
#define MDS_INO_CEPH              2

#define MDS_INO_MDSDIR_OFFSET     (1*MAX_MDS)
#define MDS_INO_STRAY_OFFSET      (6*MAX_MDS)

// Locations for journal data
#define MDS_INO_LOG_OFFSET        (2*MAX_MDS)
#define MDS_INO_LOG_BACKUP_OFFSET (3*MAX_MDS)
#define MDS_INO_LOG_POINTER_OFFSET    (4*MAX_MDS)
#define MDS_INO_PURGE_QUEUE       (5*MAX_MDS)

// First value past the end of the stray range.
#define MDS_INO_SYSTEM_BASE       ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))

// Inode number of stray entry `i` of rank `x`.
#define MDS_INO_STRAY(x,i)  (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
// Inode number of the mdsdir of rank `x`.  The argument is parenthesised so
// that an expression such as `a - b` is cast as a whole instead of having the
// cast bind to its first operand only.
#define MDS_INO_MDSDIR(x)   (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))

// Range checks and the inverse mappings of the two macros above.
#define MDS_INO_IS_STRAY(i)  ((i) >= MDS_INO_STRAY_OFFSET  && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
#define MDS_INO_IS_BASE(i)   (MDS_INO_ROOT == (i) || MDS_INO_IS_MDSDIR(i))
#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)
69 #define MDS_TRAVERSE_FORWARD 1
70 #define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc.
71 #define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries.
74 typedef int32_t mds_rank_t
;
75 typedef int32_t fs_cluster_id_t
;
77 BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t
)
78 extern const mds_gid_t MDS_GID_NONE
;
79 constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE
= {-1};
80 // The namespace ID of the anonymous default filesystem from legacy systems
81 constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS
= {0};
82 extern const mds_rank_t MDS_RANK_NONE
;
87 fs_cluster_id_t fscid
;
90 mds_role_t(fs_cluster_id_t fscid_
, mds_rank_t rank_
)
91 : fscid(fscid_
), rank(rank_
)
94 : fscid(FS_CLUSTER_ID_NONE
), rank(MDS_RANK_NONE
)
96 bool operator<(mds_role_t
const &rhs
) const
98 if (fscid
< rhs
.fscid
) {
100 } else if (fscid
== rhs
.fscid
) {
101 return rank
< rhs
.rank
;
109 return (rank
== MDS_RANK_NONE
);
112 std::ostream
& operator<<(std::ostream
&out
, const mds_role_t
&role
);
117 inline string
gcap_string(int cap
)
120 if (cap
& CEPH_CAP_GSHARED
) s
+= "s";
121 if (cap
& CEPH_CAP_GEXCL
) s
+= "x";
122 if (cap
& CEPH_CAP_GCACHE
) s
+= "c";
123 if (cap
& CEPH_CAP_GRD
) s
+= "r";
124 if (cap
& CEPH_CAP_GWR
) s
+= "w";
125 if (cap
& CEPH_CAP_GBUFFER
) s
+= "b";
126 if (cap
& CEPH_CAP_GWREXTEND
) s
+= "a";
127 if (cap
& CEPH_CAP_GLAZYIO
) s
+= "l";
130 inline string
ccap_string(int cap
)
133 if (cap
& CEPH_CAP_PIN
) s
+= "p";
135 int a
= (cap
>> CEPH_CAP_SAUTH
) & 3;
136 if (a
) s
+= 'A' + gcap_string(a
);
138 a
= (cap
>> CEPH_CAP_SLINK
) & 3;
139 if (a
) s
+= 'L' + gcap_string(a
);
141 a
= (cap
>> CEPH_CAP_SXATTR
) & 3;
142 if (a
) s
+= 'X' + gcap_string(a
);
144 a
= cap
>> CEPH_CAP_SFILE
;
145 if (a
) s
+= 'F' + gcap_string(a
);
153 struct scatter_info_t
{
156 scatter_info_t() : version(0) {}
159 struct frag_info_t
: public scatter_info_t
{
162 uint64_t change_attr
;
163 int64_t nfiles
; // files
164 int64_t nsubdirs
; // subdirs
166 frag_info_t() : change_attr(0), nfiles(0), nsubdirs(0) {}
168 int64_t size() const { return nfiles
+ nsubdirs
; }
171 *this = frag_info_t();
174 // *this += cur - acc;
175 void add_delta(const frag_info_t
&cur
, const frag_info_t
&acc
, bool *touched_mtime
=0, bool *touched_chattr
=0) {
176 if (cur
.mtime
> mtime
) {
179 *touched_mtime
= true;
181 if (cur
.change_attr
> change_attr
) {
182 change_attr
= cur
.change_attr
;
184 *touched_chattr
= true;
186 nfiles
+= cur
.nfiles
- acc
.nfiles
;
187 nsubdirs
+= cur
.nsubdirs
- acc
.nsubdirs
;
190 void add(const frag_info_t
& other
) {
191 if (other
.mtime
> mtime
)
193 if (other
.change_attr
> change_attr
)
194 change_attr
= other
.change_attr
;
195 nfiles
+= other
.nfiles
;
196 nsubdirs
+= other
.nsubdirs
;
199 bool same_sums(const frag_info_t
&o
) const {
200 return mtime
<= o
.mtime
&&
201 nfiles
== o
.nfiles
&&
202 nsubdirs
== o
.nsubdirs
;
205 void encode(bufferlist
&bl
) const;
206 void decode(bufferlist::iterator
& bl
);
207 void dump(Formatter
*f
) const;
208 static void generate_test_instances(list
<frag_info_t
*>& ls
);
210 WRITE_CLASS_ENCODER(frag_info_t
)
212 inline bool operator==(const frag_info_t
&l
, const frag_info_t
&r
) {
213 return memcmp(&l
, &r
, sizeof(l
)) == 0;
215 inline bool operator!=(const frag_info_t
&l
, const frag_info_t
&r
) {
219 std::ostream
& operator<<(std::ostream
&out
, const frag_info_t
&f
);
222 struct nest_info_t
: public scatter_info_t
{
223 // this frag + children
228 int64_t rsize() const { return rfiles
+ rsubdirs
; }
232 nest_info_t() : rbytes(0), rfiles(0), rsubdirs(0), rsnaprealms(0) {}
235 *this = nest_info_t();
238 void sub(const nest_info_t
&other
) {
241 void add(const nest_info_t
&other
, int fac
=1) {
242 if (other
.rctime
> rctime
)
243 rctime
= other
.rctime
;
244 rbytes
+= fac
*other
.rbytes
;
245 rfiles
+= fac
*other
.rfiles
;
246 rsubdirs
+= fac
*other
.rsubdirs
;
247 rsnaprealms
+= fac
*other
.rsnaprealms
;
250 // *this += cur - acc;
251 void add_delta(const nest_info_t
&cur
, const nest_info_t
&acc
) {
252 if (cur
.rctime
> rctime
)
254 rbytes
+= cur
.rbytes
- acc
.rbytes
;
255 rfiles
+= cur
.rfiles
- acc
.rfiles
;
256 rsubdirs
+= cur
.rsubdirs
- acc
.rsubdirs
;
257 rsnaprealms
+= cur
.rsnaprealms
- acc
.rsnaprealms
;
260 bool same_sums(const nest_info_t
&o
) const {
261 return rctime
<= o
.rctime
&&
262 rbytes
== o
.rbytes
&&
263 rfiles
== o
.rfiles
&&
264 rsubdirs
== o
.rsubdirs
&&
265 rsnaprealms
== o
.rsnaprealms
;
268 void encode(bufferlist
&bl
) const;
269 void decode(bufferlist::iterator
& bl
);
270 void dump(Formatter
*f
) const;
271 static void generate_test_instances(list
<nest_info_t
*>& ls
);
273 WRITE_CLASS_ENCODER(nest_info_t
)
275 inline bool operator==(const nest_info_t
&l
, const nest_info_t
&r
) {
276 return memcmp(&l
, &r
, sizeof(l
)) == 0;
278 inline bool operator!=(const nest_info_t
&l
, const nest_info_t
&r
) {
282 std::ostream
& operator<<(std::ostream
&out
, const nest_info_t
&n
);
289 vinodeno_t(inodeno_t i
, snapid_t s
) : ino(i
), snapid(s
) {}
291 void encode(bufferlist
& bl
) const {
293 ::encode(snapid
, bl
);
295 void decode(bufferlist::iterator
& p
) {
300 WRITE_CLASS_ENCODER(vinodeno_t
)
302 inline bool operator==(const vinodeno_t
&l
, const vinodeno_t
&r
) {
303 return l
.ino
== r
.ino
&& l
.snapid
== r
.snapid
;
305 inline bool operator!=(const vinodeno_t
&l
, const vinodeno_t
&r
) {
308 inline bool operator<(const vinodeno_t
&l
, const vinodeno_t
&r
) {
311 (l
.ino
== r
.ino
&& l
.snapid
< r
.snapid
);
319 quota_info_t() : max_bytes(0), max_files(0) {}
321 void encode(bufferlist
& bl
) const {
322 ENCODE_START(1, 1, bl
);
323 ::encode(max_bytes
, bl
);
324 ::encode(max_files
, bl
);
327 void decode(bufferlist::iterator
& p
) {
328 DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p
);
329 ::decode(max_bytes
, p
);
330 ::decode(max_files
, p
);
334 void dump(Formatter
*f
) const;
335 static void generate_test_instances(list
<quota_info_t
*>& ls
);
337 bool is_valid() const {
338 return max_bytes
>=0 && max_files
>=0;
340 bool is_enable() const {
341 return max_bytes
|| max_files
;
344 WRITE_CLASS_ENCODER(quota_info_t
)
346 inline bool operator==(const quota_info_t
&l
, const quota_info_t
&r
) {
347 return memcmp(&l
, &r
, sizeof(l
)) == 0;
350 ostream
& operator<<(ostream
&out
, const quota_info_t
&n
);
353 template<> struct hash
<vinodeno_t
> {
354 size_t operator()(const vinodeno_t
&vino
) const {
357 return H(vino
.ino
) ^ I(vino
.snapid
);
365 inline std::ostream
& operator<<(std::ostream
&out
, const vinodeno_t
&vino
) {
367 if (vino
.snapid
== CEPH_NOSNAP
)
369 else if (vino
.snapid
)
370 out
<< '.' << vino
.snapid
;
376 * client_writeable_range_t
378 struct client_writeable_range_t
{
379 struct byte_range_t
{
380 uint64_t first
, last
; // interval client can write to
381 byte_range_t() : first(0), last(0) {}
385 snapid_t follows
; // aka "data+metadata flushed thru"
387 client_writeable_range_t() : follows(0) {}
389 void encode(bufferlist
&bl
) const;
390 void decode(bufferlist::iterator
& bl
);
391 void dump(Formatter
*f
) const;
392 static void generate_test_instances(list
<client_writeable_range_t
*>& ls
);
395 inline void decode(client_writeable_range_t::byte_range_t
& range
, bufferlist::iterator
& bl
) {
396 ::decode(range
.first
, bl
);
397 ::decode(range
.last
, bl
);
400 WRITE_CLASS_ENCODER(client_writeable_range_t
)
402 std::ostream
& operator<<(std::ostream
& out
, const client_writeable_range_t
& r
);
404 inline bool operator==(const client_writeable_range_t
& l
,
405 const client_writeable_range_t
& r
) {
406 return l
.range
.first
== r
.range
.first
&& l
.range
.last
== r
.range
.last
&&
407 l
.follows
== r
.follows
;
410 struct inline_data_t
{
412 std::unique_ptr
<bufferlist
> blp
;
419 bufferlist
& get_data() {
421 blp
.reset(new bufferlist
);
424 size_t length() const { return blp
? blp
->length() : 0; }
426 inline_data_t() : version(1) {}
427 inline_data_t(const inline_data_t
& o
) : version(o
.version
) {
431 inline_data_t
& operator=(const inline_data_t
& o
) {
439 bool operator==(const inline_data_t
& o
) const {
440 return length() == o
.length() &&
442 (*const_cast<bufferlist
*>(blp
.get()) == *const_cast<bufferlist
*>(o
.blp
.get())));
444 bool operator!=(const inline_data_t
& o
) const {
445 return !(*this == o
);
447 void encode(bufferlist
&bl
) const;
448 void decode(bufferlist::iterator
& bl
);
450 WRITE_CLASS_ENCODER(inline_data_t
)
453 DAMAGE_STATS
, // statistics (dirstat, size, etc)
454 DAMAGE_RSTATS
, // recursive statistics (rstat, accounted_rstat)
455 DAMAGE_FRAGTREE
// fragtree -- repair by searching
457 typedef uint32_t damage_flags_t
;
465 * Do not forget to add any new fields to the compare() function.
470 uint32_t rdev
; // if special file
472 // affected by any inode change...
473 utime_t ctime
; // inode change time
474 utime_t btime
; // birth time
476 // perm (namespace permissions)
484 // file (data access)
485 ceph_dir_layout dir_layout
; // [dir only]
486 file_layout_t layout
;
487 compact_set
<int64_t> old_pools
;
488 uint64_t size
; // on directory, # dentries
489 uint64_t max_size_ever
; // max size the file has ever been
490 uint32_t truncate_seq
;
491 uint64_t truncate_size
, truncate_from
;
492 uint32_t truncate_pending
;
493 utime_t mtime
; // file data modify time.
494 utime_t atime
; // file data access time.
495 uint32_t time_warp_seq
; // count of (potential) mtime/atime timewarps (i.e., utimes())
496 inline_data_t inline_data
;
499 uint64_t change_attr
;
501 std::map
<client_t
,client_writeable_range_t
> client_ranges
; // client(s) can write to these ranges
503 // dirfrag, recursive accounting
504 frag_info_t dirstat
; // protected by my filelock
505 nest_info_t rstat
; // protected by my nestlock
506 nest_info_t accounted_rstat
; // protected by parent's nestlock
510 mds_rank_t export_pin
;
513 version_t version
; // auth only
514 version_t file_data_version
; // auth only
515 version_t xattr_version
;
517 utime_t last_scrub_stamp
; // start time of last complete scrub
518 version_t last_scrub_version
;// (parent) start version of last complete scrub
520 version_t backtrace_version
;
522 snapid_t oldest_snap
;
524 string stray_prior_path
; //stores path before unlink
526 inode_t() : ino(0), rdev(0),
527 mode(0), uid(0), gid(0), nlink(0),
528 size(0), max_size_ever(0),
529 truncate_seq(0), truncate_size(0), truncate_from(0),
531 time_warp_seq(0), change_attr(0),
532 export_pin(MDS_RANK_NONE
),
533 version(0), file_data_version(0), xattr_version(0),
534 last_scrub_version(0), backtrace_version(0) {
536 memset(&dir_layout
, 0, sizeof(dir_layout
));
537 memset("a
, 0, sizeof(quota
));
541 bool is_symlink() const { return (mode
& S_IFMT
) == S_IFLNK
; }
542 bool is_dir() const { return (mode
& S_IFMT
) == S_IFDIR
; }
543 bool is_file() const { return (mode
& S_IFMT
) == S_IFREG
; }
545 bool is_truncating() const { return (truncate_pending
> 0); }
546 void truncate(uint64_t old_size
, uint64_t new_size
) {
547 assert(new_size
< old_size
);
548 if (old_size
> max_size_ever
)
549 max_size_ever
= old_size
;
550 truncate_from
= old_size
;
552 rstat
.rbytes
= new_size
;
553 truncate_size
= size
;
558 bool has_layout() const {
559 return layout
!= file_layout_t();
562 void clear_layout() {
563 layout
= file_layout_t();
566 uint64_t get_layout_size_increment() const {
567 return layout
.get_period();
570 bool is_dirty_rstat() const { return !(rstat
== accounted_rstat
); }
572 uint64_t get_max_size() const {
574 for (std::map
<client_t
,client_writeable_range_t
>::const_iterator p
= client_ranges
.begin();
575 p
!= client_ranges
.end();
577 if (p
->second
.range
.last
> max
)
578 max
= p
->second
.range
.last
;
581 void set_max_size(uint64_t new_max
) {
583 client_ranges
.clear();
585 for (std::map
<client_t
,client_writeable_range_t
>::iterator p
= client_ranges
.begin();
586 p
!= client_ranges
.end();
588 p
->second
.range
.last
= new_max
;
592 void trim_client_ranges(snapid_t last
) {
593 std::map
<client_t
, client_writeable_range_t
>::iterator p
= client_ranges
.begin();
594 while (p
!= client_ranges
.end()) {
595 if (p
->second
.follows
>= last
)
596 client_ranges
.erase(p
++);
602 bool is_backtrace_updated() const {
603 return backtrace_version
== version
;
605 void update_backtrace(version_t pv
=0) {
606 backtrace_version
= pv
? pv
: version
;
609 void add_old_pool(int64_t l
) {
610 backtrace_version
= version
;
614 void encode(bufferlist
&bl
, uint64_t features
) const;
615 void decode(bufferlist::iterator
& bl
);
616 void dump(Formatter
*f
) const;
617 static void generate_test_instances(list
<inode_t
*>& ls
);
619 * Compare this inode_t with another that represent *the same inode*
620 * at different points in time.
621 * @pre The inodes are the same ino
623 * @param other The inode_t to compare ourselves with
624 * @param divergent A bool pointer which will be set to true
625 * if the values are different in a way that can't be explained
626 * by one being a newer version than the other.
628 * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
630 int compare(const inode_t
&other
, bool *divergent
) const;
632 bool older_is_consistent(const inode_t
&other
) const;
634 WRITE_CLASS_ENCODER_FEATURES(inode_t
)
643 std::map
<string
,bufferptr
> xattrs
;
645 void encode(bufferlist
&bl
, uint64_t features
) const;
646 void decode(bufferlist::iterator
& bl
);
647 void dump(Formatter
*f
) const;
648 static void generate_test_instances(list
<old_inode_t
*>& ls
);
650 WRITE_CLASS_ENCODER_FEATURES(old_inode_t
)
654 * like an inode, but for a dir frag
658 snapid_t snap_purged_thru
; // the max_last_destroy snapid we've been purged thru
659 frag_info_t fragstat
, accounted_fragstat
;
660 nest_info_t rstat
, accounted_rstat
;
661 damage_flags_t damage_flags
;
663 // we know we and all our descendants have been scrubbed since this version
664 version_t recursive_scrub_version
;
665 utime_t recursive_scrub_stamp
;
666 // version at which we last scrubbed our personal data structures
667 version_t localized_scrub_version
;
668 utime_t localized_scrub_stamp
;
670 void encode(bufferlist
&bl
) const;
671 void decode(bufferlist::iterator
& bl
);
672 void dump(Formatter
*f
) const;
673 static void generate_test_instances(list
<fnode_t
*>& ls
);
674 fnode_t() : version(0), damage_flags(0),
675 recursive_scrub_version(0), localized_scrub_version(0) {}
677 WRITE_CLASS_ENCODER(fnode_t
)
682 nest_info_t rstat
, accounted_rstat
;
684 void encode(bufferlist
& bl
) const;
685 void decode(bufferlist::iterator
& p
);
686 void dump(Formatter
*f
) const;
687 static void generate_test_instances(list
<old_rstat_t
*>& ls
);
689 WRITE_CLASS_ENCODER(old_rstat_t
)
691 inline std::ostream
& operator<<(std::ostream
& out
, const old_rstat_t
& o
) {
692 return out
<< "old_rstat(first " << o
.first
<< " " << o
.rstat
<< " " << o
.accounted_rstat
<< ")";
700 struct session_info_t
{
702 std::map
<ceph_tid_t
,inodeno_t
> completed_requests
;
703 interval_set
<inodeno_t
> prealloc_inos
; // preallocated, ready to use.
704 interval_set
<inodeno_t
> used_inos
; // journaling use
705 std::map
<std::string
, std::string
> client_metadata
;
706 std::set
<ceph_tid_t
> completed_flushes
;
707 EntityName auth_name
;
709 client_t
get_client() const { return client_t(inst
.name
.num()); }
710 const entity_name_t
& get_source() const { return inst
.name
; }
713 prealloc_inos
.clear();
715 completed_requests
.clear();
716 completed_flushes
.clear();
719 void encode(bufferlist
& bl
, uint64_t features
) const;
720 void decode(bufferlist::iterator
& p
);
721 void dump(Formatter
*f
) const;
722 static void generate_test_instances(list
<session_info_t
*>& ls
);
724 WRITE_CLASS_ENCODER_FEATURES(session_info_t
)
730 struct dentry_key_t
{
734 dentry_key_t() : snapid(0), name(0), hash(0) {}
735 dentry_key_t(snapid_t s
, const char *n
, __u32 h
=0) :
736 snapid(s
), name(n
), hash(h
) {}
738 bool is_valid() { return name
|| snapid
; }
740 // encode into something that can be decoded as a string.
741 // name_ (head) or name_%x (!head)
742 void encode(bufferlist
& bl
) const {
747 void encode(string
& key
) const {
749 if (snapid
!= CEPH_NOSNAP
) {
750 uint64_t val(snapid
);
751 snprintf(b
, sizeof(b
), "%" PRIx64
, val
);
753 snprintf(b
, sizeof(b
), "%s", "head");
756 oss
<< name
<< "_" << b
;
759 static void decode_helper(bufferlist::iterator
& bl
, string
& nm
, snapid_t
& sn
) {
762 decode_helper(key
, nm
, sn
);
764 static void decode_helper(const string
& key
, string
& nm
, snapid_t
& sn
) {
765 size_t i
= key
.find_last_of('_');
766 assert(i
!= string::npos
);
767 if (key
.compare(i
+1, string::npos
, "head") == 0) {
772 long long unsigned x
= 0;
773 sscanf(key
.c_str() + i
+ 1, "%llx", &x
);
776 nm
= string(key
.c_str(), i
);
780 inline std::ostream
& operator<<(std::ostream
& out
, const dentry_key_t
&k
)
782 return out
<< "(" << k
.name
<< "," << k
.snapid
<< ")";
785 inline bool operator<(const dentry_key_t
& k1
, const dentry_key_t
& k2
)
788 * order by hash, name, snap
790 int c
= ceph_frag_value(k1
.hash
) - ceph_frag_value(k2
.hash
);
793 c
= strcmp(k1
.name
, k2
.name
);
796 return k1
.snapid
< k2
.snapid
;
801 * string_snap_t is a simple (string, snapid_t) pair
803 struct string_snap_t
{
807 string_snap_t(const string
& n
, snapid_t s
) : name(n
), snapid(s
) {}
808 string_snap_t(const char *n
, snapid_t s
) : name(n
), snapid(s
) {}
810 void encode(bufferlist
& bl
) const;
811 void decode(bufferlist::iterator
& p
);
812 void dump(Formatter
*f
) const;
813 static void generate_test_instances(list
<string_snap_t
*>& ls
);
815 WRITE_CLASS_ENCODER(string_snap_t
)
817 inline bool operator<(const string_snap_t
& l
, const string_snap_t
& r
) {
818 int c
= strcmp(l
.name
.c_str(), r
.name
.c_str());
819 return c
< 0 || (c
== 0 && l
.snapid
< r
.snapid
);
822 inline std::ostream
& operator<<(std::ostream
& out
, const string_snap_t
&k
)
824 return out
<< "(" << k
.name
<< "," << k
.snapid
<< ")";
828 * mds_table_pending_t
830 * mds's requesting any pending ops. child needs to encode the corresponding
831 * pending mutation state in the table.
833 struct mds_table_pending_t
{
837 mds_table_pending_t() : reqid(0), mds(0), tid(0) {}
838 void encode(bufferlist
& bl
) const;
839 void decode(bufferlist::iterator
& bl
);
840 void dump(Formatter
*f
) const;
841 static void generate_test_instances(list
<mds_table_pending_t
*>& ls
);
843 WRITE_CLASS_ENCODER(mds_table_pending_t
)
852 metareqid_t() : tid(0) {}
853 metareqid_t(entity_name_t n
, ceph_tid_t t
) : name(n
), tid(t
) {}
854 void encode(bufferlist
& bl
) const {
858 void decode(bufferlist::iterator
&p
) {
863 WRITE_CLASS_ENCODER(metareqid_t
)
865 inline std::ostream
& operator<<(std::ostream
& out
, const metareqid_t
& r
) {
866 return out
<< r
.name
<< ":" << r
.tid
;
869 inline bool operator==(const metareqid_t
& l
, const metareqid_t
& r
) {
870 return (l
.name
== r
.name
) && (l
.tid
== r
.tid
);
872 inline bool operator!=(const metareqid_t
& l
, const metareqid_t
& r
) {
873 return (l
.name
!= r
.name
) || (l
.tid
!= r
.tid
);
875 inline bool operator<(const metareqid_t
& l
, const metareqid_t
& r
) {
876 return (l
.name
< r
.name
) ||
877 (l
.name
== r
.name
&& l
.tid
< r
.tid
);
879 inline bool operator<=(const metareqid_t
& l
, const metareqid_t
& r
) {
880 return (l
.name
< r
.name
) ||
881 (l
.name
== r
.name
&& l
.tid
<= r
.tid
);
883 inline bool operator>(const metareqid_t
& l
, const metareqid_t
& r
) { return !(l
<= r
); }
884 inline bool operator>=(const metareqid_t
& l
, const metareqid_t
& r
) { return !(l
< r
); }
887 template<> struct hash
<metareqid_t
> {
888 size_t operator()(const metareqid_t
&r
) const {
890 return H(r
.name
.num()) ^ H(r
.name
.type()) ^ H(r
.tid
);
896 // cap info for client reconnect
897 struct cap_reconnect_t
{
899 mutable ceph_mds_cap_reconnect capinfo
;
900 snapid_t snap_follows
;
904 memset(&capinfo
, 0, sizeof(capinfo
));
907 cap_reconnect_t(uint64_t cap_id
, inodeno_t pino
, const string
& p
, int w
, int i
,
908 inodeno_t sr
, snapid_t sf
, bufferlist
& lb
) :
910 capinfo
.cap_id
= cap_id
;
913 capinfo
.snaprealm
= sr
;
914 capinfo
.pathbase
= pino
;
915 capinfo
.flock_len
= 0;
919 void encode(bufferlist
& bl
) const;
920 void decode(bufferlist::iterator
& bl
);
921 void encode_old(bufferlist
& bl
) const;
922 void decode_old(bufferlist::iterator
& bl
);
924 void dump(Formatter
*f
) const;
925 static void generate_test_instances(list
<cap_reconnect_t
*>& ls
);
927 WRITE_CLASS_ENCODER(cap_reconnect_t
)
930 // compat for pre-FLOCK feature
931 struct old_ceph_mds_cap_reconnect
{
936 struct ceph_timespec old_mtime
, old_atime
;
938 __le64 pathbase
; /* base ino for our path to this ino */
939 } __attribute__ ((packed
));
940 WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect
)
942 struct old_cap_reconnect_t
{
944 old_ceph_mds_cap_reconnect capinfo
;
946 const old_cap_reconnect_t
& operator=(const cap_reconnect_t
& n
) {
948 capinfo
.cap_id
= n
.capinfo
.cap_id
;
949 capinfo
.wanted
= n
.capinfo
.wanted
;
950 capinfo
.issued
= n
.capinfo
.issued
;
951 capinfo
.snaprealm
= n
.capinfo
.snaprealm
;
952 capinfo
.pathbase
= n
.capinfo
.pathbase
;
955 operator cap_reconnect_t() {
958 n
.capinfo
.cap_id
= capinfo
.cap_id
;
959 n
.capinfo
.wanted
= capinfo
.wanted
;
960 n
.capinfo
.issued
= capinfo
.issued
;
961 n
.capinfo
.snaprealm
= capinfo
.snaprealm
;
962 n
.capinfo
.pathbase
= capinfo
.pathbase
;
966 void encode(bufferlist
& bl
) const {
968 ::encode(capinfo
, bl
);
970 void decode(bufferlist::iterator
& bl
) {
972 ::decode(capinfo
, bl
);
975 WRITE_CLASS_ENCODER(old_cap_reconnect_t
)
978 // ================================================================
985 dirfrag_t() : ino(0) { }
986 dirfrag_t(inodeno_t i
, frag_t f
) : ino(i
), frag(f
) { }
988 void encode(bufferlist
& bl
) const {
992 void decode(bufferlist::iterator
& bl
) {
997 WRITE_CLASS_ENCODER(dirfrag_t
)
1000 inline std::ostream
& operator<<(std::ostream
& out
, const dirfrag_t
&df
) {
1002 if (!df
.frag
.is_root()) out
<< "." << df
.frag
;
1005 inline bool operator<(dirfrag_t l
, dirfrag_t r
) {
1006 if (l
.ino
< r
.ino
) return true;
1007 if (l
.ino
== r
.ino
&& l
.frag
< r
.frag
) return true;
1010 inline bool operator==(dirfrag_t l
, dirfrag_t r
) {
1011 return l
.ino
== r
.ino
&& l
.frag
== r
.frag
;
1015 template<> struct hash
<dirfrag_t
> {
1016 size_t operator()(const dirfrag_t
&df
) const {
1017 static rjhash
<uint64_t> H
;
1018 static rjhash
<uint32_t> I
;
1019 return H(df
.ino
) ^ I(df
.frag
);
1026 // ================================================================
1028 #define META_POP_IRD 0
1029 #define META_POP_IWR 1
1030 #define META_POP_READDIR 2
1031 #define META_POP_FETCH 3
1032 #define META_POP_STORE 4
1035 class inode_load_vec_t
{
1036 static const int NUM
= 2;
1037 std::vector
< DecayCounter
> vec
;
1039 explicit inode_load_vec_t(const utime_t
&now
)
1040 : vec(NUM
, DecayCounter(now
))
1042 // for dencoder infrastructure
1043 inode_load_vec_t() :
1044 vec(NUM
, DecayCounter())
1046 DecayCounter
&get(int t
) {
1050 void zero(utime_t now
) {
1051 for (int i
=0; i
<NUM
; i
++)
1054 void encode(bufferlist
&bl
) const;
1055 void decode(const utime_t
&t
, bufferlist::iterator
&p
);
1057 void decode(bufferlist::iterator
& p
) { utime_t sample
; decode(sample
, p
); }
1058 void dump(Formatter
*f
);
1059 static void generate_test_instances(list
<inode_load_vec_t
*>& ls
);
1061 inline void encode(const inode_load_vec_t
&c
, bufferlist
&bl
) { c
.encode(bl
); }
1062 inline void decode(inode_load_vec_t
& c
, const utime_t
&t
, bufferlist::iterator
&p
) {
1066 inline void decode(inode_load_vec_t
& c
, bufferlist::iterator
&p
) {
1068 c
.decode(sample
, p
);
1071 class dirfrag_load_vec_t
{
1073 static const int NUM
= 5;
1074 std::vector
< DecayCounter
> vec
;
1075 explicit dirfrag_load_vec_t(const utime_t
&now
)
1076 : vec(NUM
, DecayCounter(now
))
1078 // for dencoder infrastructure
1079 dirfrag_load_vec_t()
1080 : vec(NUM
, DecayCounter())
1082 void encode(bufferlist
&bl
) const {
1083 ENCODE_START(2, 2, bl
);
1084 for (int i
=0; i
<NUM
; i
++)
1085 ::encode(vec
[i
], bl
);
1088 void decode(const utime_t
&t
, bufferlist::iterator
&p
) {
1089 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p
);
1090 for (int i
=0; i
<NUM
; i
++)
1091 ::decode(vec
[i
], t
, p
);
1094 // for dencoder infrastructure
1095 void decode(bufferlist::iterator
& p
) {
1099 void dump(Formatter
*f
) const;
1100 static void generate_test_instances(list
<dirfrag_load_vec_t
*>& ls
);
1102 DecayCounter
&get(int t
) {
1106 void adjust(utime_t now
, const DecayRate
& rate
, double d
) {
1107 for (int i
=0; i
<NUM
; i
++)
1108 vec
[i
].adjust(now
, rate
, d
);
1110 void zero(utime_t now
) {
1111 for (int i
=0; i
<NUM
; i
++)
1114 double meta_load(utime_t now
, const DecayRate
& rate
) {
1116 1*vec
[META_POP_IRD
].get(now
, rate
) +
1117 2*vec
[META_POP_IWR
].get(now
, rate
) +
1118 1*vec
[META_POP_READDIR
].get(now
, rate
) +
1119 2*vec
[META_POP_FETCH
].get(now
, rate
) +
1120 4*vec
[META_POP_STORE
].get(now
, rate
);
1122 double meta_load() {
1124 1*vec
[META_POP_IRD
].get_last() +
1125 2*vec
[META_POP_IWR
].get_last() +
1126 1*vec
[META_POP_READDIR
].get_last() +
1127 2*vec
[META_POP_FETCH
].get_last() +
1128 4*vec
[META_POP_STORE
].get_last();
1131 void add(utime_t now
, DecayRate
& rate
, dirfrag_load_vec_t
& r
) {
1132 for (int i
=0; i
<dirfrag_load_vec_t::NUM
; i
++)
1133 vec
[i
].adjust(r
.vec
[i
].get(now
, rate
));
1135 void sub(utime_t now
, DecayRate
& rate
, dirfrag_load_vec_t
& r
) {
1136 for (int i
=0; i
<dirfrag_load_vec_t::NUM
; i
++)
1137 vec
[i
].adjust(-r
.vec
[i
].get(now
, rate
));
1139 void scale(double f
) {
1140 for (int i
=0; i
<dirfrag_load_vec_t::NUM
; i
++)
1145 inline void encode(const dirfrag_load_vec_t
&c
, bufferlist
&bl
) { c
.encode(bl
); }
1146 inline void decode(dirfrag_load_vec_t
& c
, const utime_t
&t
, bufferlist::iterator
&p
) {
1149 // this for dencoder
1150 inline void decode(dirfrag_load_vec_t
& c
, bufferlist::iterator
&p
) {
1152 c
.decode(sample
, p
);
1155 inline std::ostream
& operator<<(std::ostream
& out
, dirfrag_load_vec_t
& dl
)
1158 utime_t now
= ceph_clock_now();
1159 DecayRate
rate(g_conf
->mds_decay_halflife
);
1160 return out
<< "[" << dl
.vec
[0].get(now
, rate
) << "," << dl
.vec
[1].get(now
, rate
)
1161 << " " << dl
.meta_load(now
, rate
)
1175 dirfrag_load_vec_t auth
;
1176 dirfrag_load_vec_t all
;
1179 double cache_hit_rate
;
1182 double cpu_load_avg
;
1184 explicit mds_load_t(const utime_t
&t
) :
1185 auth(t
), all(t
), req_rate(0), cache_hit_rate(0),
1186 queue_len(0), cpu_load_avg(0)
1188 // mostly for the dencoder infrastructure
1191 req_rate(0), cache_hit_rate(0), queue_len(0), cpu_load_avg(0)
1194 double mds_load(); // defined in MDBalancer.cc
1195 void encode(bufferlist
& bl
) const;
1196 void decode(const utime_t
& now
, bufferlist::iterator
& bl
);
1197 //this one is for dencoder infrastructure
1198 void decode(bufferlist::iterator
& bl
) { utime_t sample
; decode(sample
, bl
); }
1199 void dump(Formatter
*f
) const;
1200 static void generate_test_instances(list
<mds_load_t
*>& ls
);
1202 inline void encode(const mds_load_t
&c
, bufferlist
&bl
) { c
.encode(bl
); }
1203 inline void decode(mds_load_t
&c
, const utime_t
&t
, bufferlist::iterator
&p
) {
1206 // this one is for dencoder
1207 inline void decode(mds_load_t
&c
, bufferlist::iterator
&p
) {
1209 c
.decode(sample
, p
);
1212 inline std::ostream
& operator<<( std::ostream
& out
, mds_load_t
& load
)
1214 return out
<< "mdsload<" << load
.auth
<< "/" << load
.all
1215 << ", req " << load
.req_rate
1216 << ", hr " << load
.cache_hit_rate
1217 << ", qlen " << load
.queue_len
1218 << ", cpu " << load
.cpu_load_avg
1222 class load_spread_t
{
1224 static const int MAX
= 4;
1230 load_spread_t() : p(0), n(0), count(ceph_clock_now())
1232 for (int i
=0; i
<MAX
; i
++)
1236 double hit(utime_t now
, const DecayRate
& rate
, int who
) {
1237 for (int i
=0; i
<n
; i
++)
1239 return count
.get_last();
1244 if (n
== 1) return 0.0;
1246 if (p
== MAX
) p
= 0;
1248 return count
.hit(now
, rate
);
1250 double get(utime_t now
, const DecayRate
& rate
) {
1251 return count
.get(now
, rate
);
1257 // ================================================================
1258 typedef std::pair
<mds_rank_t
, mds_rank_t
> mds_authority_t
;
1260 // -- authority delegation --
1261 // directory authority types
1262 // >= 0 is the auth mds
1263 #define CDIR_AUTH_PARENT mds_rank_t(-1) // default
1264 #define CDIR_AUTH_UNKNOWN mds_rank_t(-2)
1265 #define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
1266 #define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
1267 //#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
1269 class MDSCacheObjectInfo
{
1276 MDSCacheObjectInfo() : ino(0) {}
1278 void encode(bufferlist
& bl
) const;
1279 void decode(bufferlist::iterator
& bl
);
1280 void dump(Formatter
*f
) const;
1281 static void generate_test_instances(list
<MDSCacheObjectInfo
*>& ls
);
1284 inline std::ostream
& operator<<(std::ostream
& out
, const MDSCacheObjectInfo
&info
) {
1285 if (info
.ino
) return out
<< info
.ino
<< "." << info
.snapid
;
1286 if (info
.dname
.length()) return out
<< info
.dirfrag
<< "/" << info
.dname
1287 << " snap " << info
.snapid
;
1288 return out
<< info
.dirfrag
;
1291 inline bool operator==(const MDSCacheObjectInfo
& l
, const MDSCacheObjectInfo
& r
) {
1293 return l
.ino
== r
.ino
&& l
.snapid
== r
.snapid
;
1295 return l
.dirfrag
== r
.dirfrag
&& l
.dname
== r
.dname
;
1297 WRITE_CLASS_ENCODER(MDSCacheObjectInfo
)
1300 // parse a map of keys/values.
1301 namespace qi
= boost::spirit::qi
;
1303 template <typename Iterator
>
1304 struct keys_and_values
1305 : qi::grammar
<Iterator
, std::map
<string
, string
>()>
1308 : keys_and_values::base_type(query
)
1310 query
= pair
>> *(qi::lit(' ') >> pair
);
1311 pair
= key
>> '=' >> value
;
1312 key
= qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
1313 value
= +qi::char_("a-zA-Z_0-9");
1315 qi::rule
<Iterator
, std::map
<string
, string
>()> query
;
1316 qi::rule
<Iterator
, std::pair
<string
, string
>()> pair
;
1317 qi::rule
<Iterator
, string()> key
, value
;