]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/mdstypes.h
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / mds / mdstypes.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3#ifndef CEPH_MDSTYPES_H
4#define CEPH_MDSTYPES_H
5
6#include "include/int_types.h"
7
8#include <math.h>
9#include <ostream>
10#include <set>
11#include <map>
12
13#include "common/config.h"
14#include "common/Clock.h"
15#include "common/DecayCounter.h"
16#include "common/entity_name.h"
17
18#include "include/Context.h"
19#include "include/frag.h"
20#include "include/xlist.h"
21#include "include/interval_set.h"
22#include "include/compact_map.h"
23#include "include/compact_set.h"
24#include "include/fs_types.h"
25
26#include "inode_backtrace.h"
27
28#include <boost/spirit/include/qi.hpp>
29#include <boost/pool/pool.hpp>
30#include "include/assert.h"
31#include <boost/serialization/strong_typedef.hpp>
32
// Magic string written on disk.
#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"

#define MDS_PORT_CACHE    0x200
#define MDS_PORT_LOCKER   0x300
#define MDS_PORT_MIGRATOR 0x400

// Size of the per-rank slots used by the inode-number ranges below, and the
// number of stray slots reserved per rank.
#define MAX_MDS   0x100
#define NUM_STRAY 10

#define MDS_INO_ROOT 1

// No longer created but recognised in existing filesystems
// so that we don't try to fragment it.
#define MDS_INO_CEPH 2

#define MDS_INO_MDSDIR_OFFSET (1*MAX_MDS)
#define MDS_INO_STRAY_OFFSET  (6*MAX_MDS)

// Locations for journal data
#define MDS_INO_LOG_OFFSET         (2*MAX_MDS)
#define MDS_INO_LOG_BACKUP_OFFSET  (3*MAX_MDS)
#define MDS_INO_LOG_POINTER_OFFSET (4*MAX_MDS)
#define MDS_INO_PURGE_QUEUE        (5*MAX_MDS)

// First value past the stray range (stray offset + MAX_MDS*NUM_STRAY slots).
#define MDS_INO_SYSTEM_BASE ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))

// Inode number of stray slot i of rank x, and of the mdsdir of rank x.
// Every macro argument is parenthesised before it is cast so that an
// expression argument (e.g. `r < 0 ? 0 : r`) is cast as a whole.
#define MDS_INO_STRAY(x,i) (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
#define MDS_INO_MDSDIR(x) (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))

// Range tests and the inverse mappings of the two macros above.
#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
#define MDS_INO_IS_BASE(i) (MDS_INO_ROOT == (i) || MDS_INO_IS_MDSDIR(i))
#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)

#define MDS_TRAVERSE_FORWARD       1
#define MDS_TRAVERSE_DISCOVER      2 // skips permissions checks etc.
#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries.
72
73
74typedef int32_t mds_rank_t;
75typedef int32_t fs_cluster_id_t;
76
77BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
78extern const mds_gid_t MDS_GID_NONE;
79constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = {-1};
80// The namespace ID of the anonymous default filesystem from legacy systems
81constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = {0};
82extern const mds_rank_t MDS_RANK_NONE;
83
84class mds_role_t
85{
86 public:
87 fs_cluster_id_t fscid;
88 mds_rank_t rank;
89
90 mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_)
91 : fscid(fscid_), rank(rank_)
92 {}
93 mds_role_t()
94 : fscid(FS_CLUSTER_ID_NONE), rank(MDS_RANK_NONE)
95 {}
96 bool operator<(mds_role_t const &rhs) const
97 {
98 if (fscid < rhs.fscid) {
99 return true;
100 } else if (fscid == rhs.fscid) {
101 return rank < rhs.rank;
102 } else {
103 return false;
104 }
105 }
106
107 bool is_none() const
108 {
109 return (rank == MDS_RANK_NONE);
110 }
111};
112std::ostream& operator<<(std::ostream &out, const mds_role_t &role);
113
114
115// CAPS
116
117inline string gcap_string(int cap)
118{
119 string s;
120 if (cap & CEPH_CAP_GSHARED) s += "s";
121 if (cap & CEPH_CAP_GEXCL) s += "x";
122 if (cap & CEPH_CAP_GCACHE) s += "c";
123 if (cap & CEPH_CAP_GRD) s += "r";
124 if (cap & CEPH_CAP_GWR) s += "w";
125 if (cap & CEPH_CAP_GBUFFER) s += "b";
126 if (cap & CEPH_CAP_GWREXTEND) s += "a";
127 if (cap & CEPH_CAP_GLAZYIO) s += "l";
128 return s;
129}
130inline string ccap_string(int cap)
131{
132 string s;
133 if (cap & CEPH_CAP_PIN) s += "p";
134
135 int a = (cap >> CEPH_CAP_SAUTH) & 3;
136 if (a) s += 'A' + gcap_string(a);
137
138 a = (cap >> CEPH_CAP_SLINK) & 3;
139 if (a) s += 'L' + gcap_string(a);
140
141 a = (cap >> CEPH_CAP_SXATTR) & 3;
142 if (a) s += 'X' + gcap_string(a);
143
144 a = cap >> CEPH_CAP_SFILE;
145 if (a) s += 'F' + gcap_string(a);
146
147 if (s.length() == 0)
148 s = "-";
149 return s;
150}
151
152
153struct scatter_info_t {
154 version_t version;
155
156 scatter_info_t() : version(0) {}
157};
158
159struct frag_info_t : public scatter_info_t {
160 // this frag
161 utime_t mtime;
162 uint64_t change_attr;
163 int64_t nfiles; // files
164 int64_t nsubdirs; // subdirs
165
166 frag_info_t() : change_attr(0), nfiles(0), nsubdirs(0) {}
167
168 int64_t size() const { return nfiles + nsubdirs; }
169
170 void zero() {
171 *this = frag_info_t();
172 }
173
174 // *this += cur - acc;
175 void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
176 if (cur.mtime > mtime) {
177 mtime = cur.mtime;
178 if (touched_mtime)
179 *touched_mtime = true;
180 }
181 if (cur.change_attr > change_attr) {
182 change_attr = cur.change_attr;
183 if (touched_chattr)
184 *touched_chattr = true;
185 }
186 nfiles += cur.nfiles - acc.nfiles;
187 nsubdirs += cur.nsubdirs - acc.nsubdirs;
188 }
189
190 void add(const frag_info_t& other) {
191 if (other.mtime > mtime)
192 mtime = other.mtime;
193 if (other.change_attr > change_attr)
194 change_attr = other.change_attr;
195 nfiles += other.nfiles;
196 nsubdirs += other.nsubdirs;
197 }
198
199 bool same_sums(const frag_info_t &o) const {
200 return mtime <= o.mtime &&
201 nfiles == o.nfiles &&
202 nsubdirs == o.nsubdirs;
203 }
204
205 void encode(bufferlist &bl) const;
206 void decode(bufferlist::iterator& bl);
207 void dump(Formatter *f) const;
208 static void generate_test_instances(list<frag_info_t*>& ls);
209};
210WRITE_CLASS_ENCODER(frag_info_t)
211
212inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
213 return memcmp(&l, &r, sizeof(l)) == 0;
214}
215inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
216 return !(l == r);
217}
218
219std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
220
221
222struct nest_info_t : public scatter_info_t {
223 // this frag + children
224 utime_t rctime;
225 int64_t rbytes;
226 int64_t rfiles;
227 int64_t rsubdirs;
228 int64_t rsize() const { return rfiles + rsubdirs; }
229
230 int64_t rsnaprealms;
231
232 nest_info_t() : rbytes(0), rfiles(0), rsubdirs(0), rsnaprealms(0) {}
233
234 void zero() {
235 *this = nest_info_t();
236 }
237
238 void sub(const nest_info_t &other) {
239 add(other, -1);
240 }
241 void add(const nest_info_t &other, int fac=1) {
242 if (other.rctime > rctime)
243 rctime = other.rctime;
244 rbytes += fac*other.rbytes;
245 rfiles += fac*other.rfiles;
246 rsubdirs += fac*other.rsubdirs;
247 rsnaprealms += fac*other.rsnaprealms;
248 }
249
250 // *this += cur - acc;
251 void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
252 if (cur.rctime > rctime)
253 rctime = cur.rctime;
254 rbytes += cur.rbytes - acc.rbytes;
255 rfiles += cur.rfiles - acc.rfiles;
256 rsubdirs += cur.rsubdirs - acc.rsubdirs;
257 rsnaprealms += cur.rsnaprealms - acc.rsnaprealms;
258 }
259
260 bool same_sums(const nest_info_t &o) const {
261 return rctime <= o.rctime &&
262 rbytes == o.rbytes &&
263 rfiles == o.rfiles &&
264 rsubdirs == o.rsubdirs &&
265 rsnaprealms == o.rsnaprealms;
266 }
267
268 void encode(bufferlist &bl) const;
269 void decode(bufferlist::iterator& bl);
270 void dump(Formatter *f) const;
271 static void generate_test_instances(list<nest_info_t*>& ls);
272};
273WRITE_CLASS_ENCODER(nest_info_t)
274
275inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
276 return memcmp(&l, &r, sizeof(l)) == 0;
277}
278inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
279 return !(l == r);
280}
281
282std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
283
284
285struct vinodeno_t {
286 inodeno_t ino;
287 snapid_t snapid;
288 vinodeno_t() {}
289 vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
290
291 void encode(bufferlist& bl) const {
292 ::encode(ino, bl);
293 ::encode(snapid, bl);
294 }
295 void decode(bufferlist::iterator& p) {
296 ::decode(ino, p);
297 ::decode(snapid, p);
298 }
299};
300WRITE_CLASS_ENCODER(vinodeno_t)
301
302inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
303 return l.ino == r.ino && l.snapid == r.snapid;
304}
305inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
306 return !(l == r);
307}
308inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
309 return
310 l.ino < r.ino ||
311 (l.ino == r.ino && l.snapid < r.snapid);
312}
313
314struct quota_info_t
315{
316 int64_t max_bytes;
317 int64_t max_files;
318
319 quota_info_t() : max_bytes(0), max_files(0) {}
320
321 void encode(bufferlist& bl) const {
322 ENCODE_START(1, 1, bl);
323 ::encode(max_bytes, bl);
324 ::encode(max_files, bl);
325 ENCODE_FINISH(bl);
326 }
327 void decode(bufferlist::iterator& p) {
328 DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
329 ::decode(max_bytes, p);
330 ::decode(max_files, p);
331 DECODE_FINISH(p);
332 }
333
334 void dump(Formatter *f) const;
335 static void generate_test_instances(list<quota_info_t *>& ls);
336
337 bool is_valid() const {
338 return max_bytes >=0 && max_files >=0;
339 }
340 bool is_enable() const {
341 return max_bytes || max_files;
342 }
343};
344WRITE_CLASS_ENCODER(quota_info_t)
345
346inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
347 return memcmp(&l, &r, sizeof(l)) == 0;
348}
349
350ostream& operator<<(ostream &out, const quota_info_t &n);
351
352namespace std {
353 template<> struct hash<vinodeno_t> {
354 size_t operator()(const vinodeno_t &vino) const {
355 hash<inodeno_t> H;
356 hash<uint64_t> I;
357 return H(vino.ino) ^ I(vino.snapid);
358 }
359 };
360} // namespace std
361
362
363
364
365inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) {
366 out << vino.ino;
367 if (vino.snapid == CEPH_NOSNAP)
368 out << ".head";
369 else if (vino.snapid)
370 out << '.' << vino.snapid;
371 return out;
372}
373
374
375/*
376 * client_writeable_range_t
377 */
378struct client_writeable_range_t {
379 struct byte_range_t {
380 uint64_t first, last; // interval client can write to
381 byte_range_t() : first(0), last(0) {}
382 };
383
384 byte_range_t range;
385 snapid_t follows; // aka "data+metadata flushed thru"
386
387 client_writeable_range_t() : follows(0) {}
388
389 void encode(bufferlist &bl) const;
390 void decode(bufferlist::iterator& bl);
391 void dump(Formatter *f) const;
392 static void generate_test_instances(list<client_writeable_range_t*>& ls);
393};
394
395inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::iterator& bl) {
396 ::decode(range.first, bl);
397 ::decode(range.last, bl);
398}
399
400WRITE_CLASS_ENCODER(client_writeable_range_t)
401
402std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
403
404inline bool operator==(const client_writeable_range_t& l,
405 const client_writeable_range_t& r) {
406 return l.range.first == r.range.first && l.range.last == r.range.last &&
407 l.follows == r.follows;
408}
409
410struct inline_data_t {
411private:
412 std::unique_ptr<bufferlist> blp;
413public:
414 version_t version;
415
416 void free_data() {
417 blp.reset();
418 }
419 bufferlist& get_data() {
420 if (!blp)
421 blp.reset(new bufferlist);
422 return *blp;
423 }
424 size_t length() const { return blp ? blp->length() : 0; }
425
426 inline_data_t() : version(1) {}
427 inline_data_t(const inline_data_t& o) : version(o.version) {
428 if (o.blp)
429 get_data() = *o.blp;
430 }
431 inline_data_t& operator=(const inline_data_t& o) {
432 version = o.version;
433 if (o.blp)
434 get_data() = *o.blp;
435 else
436 free_data();
437 return *this;
438 }
439 bool operator==(const inline_data_t& o) const {
440 return length() == o.length() &&
441 (length() == 0 ||
442 (*const_cast<bufferlist*>(blp.get()) == *const_cast<bufferlist*>(o.blp.get())));
443 }
444 bool operator!=(const inline_data_t& o) const {
445 return !(*this == o);
446 }
447 void encode(bufferlist &bl) const;
448 void decode(bufferlist::iterator& bl);
449};
450WRITE_CLASS_ENCODER(inline_data_t)
451
// Identifiers for kinds of damage; values are consecutive starting at 0.
enum {
  DAMAGE_STATS = 0,     // statistics (dirstat, size, etc)
  DAMAGE_RSTATS = 1,    // recursive statistics (rstat, accounted_rstat)
  DAMAGE_FRAGTREE = 2   // fragtree -- repair by searching
};
using damage_flags_t = uint32_t;
458
459/*
460 * inode_t
461 */
462struct inode_t {
463 /**
464 * ***************
465 * Do not forget to add any new fields to the compare() function.
466 * ***************
467 */
468 // base (immutable)
469 inodeno_t ino;
470 uint32_t rdev; // if special file
471
472 // affected by any inode change...
473 utime_t ctime; // inode change time
474 utime_t btime; // birth time
475
476 // perm (namespace permissions)
477 uint32_t mode;
478 uid_t uid;
479 gid_t gid;
480
481 // nlink
482 int32_t nlink;
483
484 // file (data access)
485 ceph_dir_layout dir_layout; // [dir only]
486 file_layout_t layout;
487 compact_set <int64_t> old_pools;
488 uint64_t size; // on directory, # dentries
489 uint64_t max_size_ever; // max size the file has ever been
490 uint32_t truncate_seq;
491 uint64_t truncate_size, truncate_from;
492 uint32_t truncate_pending;
493 utime_t mtime; // file data modify time.
494 utime_t atime; // file data access time.
495 uint32_t time_warp_seq; // count of (potential) mtime/atime timewarps (i.e., utimes())
496 inline_data_t inline_data;
497
498 // change attribute
499 uint64_t change_attr;
500
501 std::map<client_t,client_writeable_range_t> client_ranges; // client(s) can write to these ranges
502
503 // dirfrag, recursive accountin
504 frag_info_t dirstat; // protected by my filelock
505 nest_info_t rstat; // protected by my nestlock
506 nest_info_t accounted_rstat; // protected by parent's nestlock
507
508 quota_info_t quota;
509
510 mds_rank_t export_pin;
511
512 // special stuff
513 version_t version; // auth only
514 version_t file_data_version; // auth only
515 version_t xattr_version;
516
517 utime_t last_scrub_stamp; // start time of last complete scrub
518 version_t last_scrub_version;// (parent) start version of last complete scrub
519
520 version_t backtrace_version;
521
522 snapid_t oldest_snap;
523
524 string stray_prior_path; //stores path before unlink
525
526 inode_t() : ino(0), rdev(0),
527 mode(0), uid(0), gid(0), nlink(0),
528 size(0), max_size_ever(0),
529 truncate_seq(0), truncate_size(0), truncate_from(0),
530 truncate_pending(0),
531 time_warp_seq(0), change_attr(0),
532 export_pin(MDS_RANK_NONE),
533 version(0), file_data_version(0), xattr_version(0),
534 last_scrub_version(0), backtrace_version(0) {
535 clear_layout();
536 memset(&dir_layout, 0, sizeof(dir_layout));
537 memset(&quota, 0, sizeof(quota));
538 }
539
540 // file type
541 bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
542 bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; }
543 bool is_file() const { return (mode & S_IFMT) == S_IFREG; }
544
545 bool is_truncating() const { return (truncate_pending > 0); }
546 void truncate(uint64_t old_size, uint64_t new_size) {
547 assert(new_size < old_size);
548 if (old_size > max_size_ever)
549 max_size_ever = old_size;
550 truncate_from = old_size;
551 size = new_size;
552 rstat.rbytes = new_size;
553 truncate_size = size;
554 truncate_seq++;
555 truncate_pending++;
556 }
557
558 bool has_layout() const {
559 return layout != file_layout_t();
560 }
561
562 void clear_layout() {
563 layout = file_layout_t();
564 }
565
566 uint64_t get_layout_size_increment() const {
567 return layout.get_period();
568 }
569
570 bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
571
572 uint64_t get_max_size() const {
573 uint64_t max = 0;
574 for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin();
575 p != client_ranges.end();
576 ++p)
577 if (p->second.range.last > max)
578 max = p->second.range.last;
579 return max;
580 }
581 void set_max_size(uint64_t new_max) {
582 if (new_max == 0) {
583 client_ranges.clear();
584 } else {
585 for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin();
586 p != client_ranges.end();
587 ++p)
588 p->second.range.last = new_max;
589 }
590 }
591
592 void trim_client_ranges(snapid_t last) {
593 std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin();
594 while (p != client_ranges.end()) {
595 if (p->second.follows >= last)
596 client_ranges.erase(p++);
597 else
598 ++p;
599 }
600 }
601
602 bool is_backtrace_updated() const {
603 return backtrace_version == version;
604 }
605 void update_backtrace(version_t pv=0) {
606 backtrace_version = pv ? pv : version;
607 }
608
609 void add_old_pool(int64_t l) {
610 backtrace_version = version;
611 old_pools.insert(l);
612 }
613
614 void encode(bufferlist &bl, uint64_t features) const;
615 void decode(bufferlist::iterator& bl);
616 void dump(Formatter *f) const;
617 static void generate_test_instances(list<inode_t*>& ls);
618 /**
619 * Compare this inode_t with another that represent *the same inode*
620 * at different points in time.
621 * @pre The inodes are the same ino
622 *
623 * @param other The inode_t to compare ourselves with
624 * @param divergent A bool pointer which will be set to true
625 * if the values are different in a way that can't be explained
626 * by one being a newer version than the other.
627 *
628 * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
629 */
630 int compare(const inode_t &other, bool *divergent) const;
631private:
632 bool older_is_consistent(const inode_t &other) const;
633};
634WRITE_CLASS_ENCODER_FEATURES(inode_t)
635
636
637/*
638 * old_inode_t
639 */
640struct old_inode_t {
641 snapid_t first;
642 inode_t inode;
643 std::map<string,bufferptr> xattrs;
644
645 void encode(bufferlist &bl, uint64_t features) const;
646 void decode(bufferlist::iterator& bl);
647 void dump(Formatter *f) const;
648 static void generate_test_instances(list<old_inode_t*>& ls);
649};
650WRITE_CLASS_ENCODER_FEATURES(old_inode_t)
651
652
653/*
654 * like an inode, but for a dir frag
655 */
656struct fnode_t {
657 version_t version;
658 snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru
659 frag_info_t fragstat, accounted_fragstat;
660 nest_info_t rstat, accounted_rstat;
661 damage_flags_t damage_flags;
662
663 // we know we and all our descendants have been scrubbed since this version
664 version_t recursive_scrub_version;
665 utime_t recursive_scrub_stamp;
666 // version at which we last scrubbed our personal data structures
667 version_t localized_scrub_version;
668 utime_t localized_scrub_stamp;
669
670 void encode(bufferlist &bl) const;
671 void decode(bufferlist::iterator& bl);
672 void dump(Formatter *f) const;
673 static void generate_test_instances(list<fnode_t*>& ls);
674 fnode_t() : version(0), damage_flags(0),
675 recursive_scrub_version(0), localized_scrub_version(0) {}
676};
677WRITE_CLASS_ENCODER(fnode_t)
678
679
680struct old_rstat_t {
681 snapid_t first;
682 nest_info_t rstat, accounted_rstat;
683
684 void encode(bufferlist& bl) const;
685 void decode(bufferlist::iterator& p);
686 void dump(Formatter *f) const;
687 static void generate_test_instances(list<old_rstat_t*>& ls);
688};
689WRITE_CLASS_ENCODER(old_rstat_t)
690
691inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
692 return out << "old_rstat(first " << o.first << " " << o.rstat << " " << o.accounted_rstat << ")";
693}
694
695
696/*
697 * session_info_t
698 */
699
700struct session_info_t {
701 entity_inst_t inst;
702 std::map<ceph_tid_t,inodeno_t> completed_requests;
703 interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use.
704 interval_set<inodeno_t> used_inos; // journaling use
705 std::map<std::string, std::string> client_metadata;
706 std::set<ceph_tid_t> completed_flushes;
707 EntityName auth_name;
708
709 client_t get_client() const { return client_t(inst.name.num()); }
710 const entity_name_t& get_source() const { return inst.name; }
711
712 void clear_meta() {
713 prealloc_inos.clear();
714 used_inos.clear();
715 completed_requests.clear();
716 completed_flushes.clear();
717 }
718
719 void encode(bufferlist& bl, uint64_t features) const;
720 void decode(bufferlist::iterator& p);
721 void dump(Formatter *f) const;
722 static void generate_test_instances(list<session_info_t*>& ls);
723};
724WRITE_CLASS_ENCODER_FEATURES(session_info_t)
725
726
727// =======
728// dentries
729
730struct dentry_key_t {
731 snapid_t snapid;
732 const char *name;
733 __u32 hash;
734 dentry_key_t() : snapid(0), name(0), hash(0) {}
735 dentry_key_t(snapid_t s, const char *n, __u32 h=0) :
736 snapid(s), name(n), hash(h) {}
737
738 bool is_valid() { return name || snapid; }
739
740 // encode into something that can be decoded as a string.
741 // name_ (head) or name_%x (!head)
742 void encode(bufferlist& bl) const {
743 string key;
744 encode(key);
745 ::encode(key, bl);
746 }
747 void encode(string& key) const {
748 char b[20];
749 if (snapid != CEPH_NOSNAP) {
750 uint64_t val(snapid);
751 snprintf(b, sizeof(b), "%" PRIx64, val);
752 } else {
753 snprintf(b, sizeof(b), "%s", "head");
754 }
755 ostringstream oss;
756 oss << name << "_" << b;
757 key = oss.str();
758 }
759 static void decode_helper(bufferlist::iterator& bl, string& nm, snapid_t& sn) {
760 string key;
761 ::decode(key, bl);
762 decode_helper(key, nm, sn);
763 }
764 static void decode_helper(const string& key, string& nm, snapid_t& sn) {
765 size_t i = key.find_last_of('_');
766 assert(i != string::npos);
767 if (key.compare(i+1, string::npos, "head") == 0) {
768 // name_head
769 sn = CEPH_NOSNAP;
770 } else {
771 // name_%x
772 long long unsigned x = 0;
773 sscanf(key.c_str() + i + 1, "%llx", &x);
774 sn = x;
775 }
776 nm = string(key.c_str(), i);
777 }
778};
779
780inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
781{
782 return out << "(" << k.name << "," << k.snapid << ")";
783}
784
785inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
786{
787 /*
788 * order by hash, name, snap
789 */
790 int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
791 if (c)
792 return c < 0;
793 c = strcmp(k1.name, k2.name);
794 if (c)
795 return c < 0;
796 return k1.snapid < k2.snapid;
797}
798
799
800/*
801 * string_snap_t is a simple (string, snapid_t) pair
802 */
803struct string_snap_t {
804 string name;
805 snapid_t snapid;
806 string_snap_t() {}
807 string_snap_t(const string& n, snapid_t s) : name(n), snapid(s) {}
808 string_snap_t(const char *n, snapid_t s) : name(n), snapid(s) {}
809
810 void encode(bufferlist& bl) const;
811 void decode(bufferlist::iterator& p);
812 void dump(Formatter *f) const;
813 static void generate_test_instances(list<string_snap_t*>& ls);
814};
815WRITE_CLASS_ENCODER(string_snap_t)
816
817inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
818 int c = strcmp(l.name.c_str(), r.name.c_str());
819 return c < 0 || (c == 0 && l.snapid < r.snapid);
820}
821
822inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
823{
824 return out << "(" << k.name << "," << k.snapid << ")";
825}
826
827/*
828 * mds_table_pending_t
829 *
830 * mds's requesting any pending ops. child needs to encode the corresponding
831 * pending mutation state in the table.
832 */
833struct mds_table_pending_t {
834 uint64_t reqid;
835 __s32 mds;
836 version_t tid;
837 mds_table_pending_t() : reqid(0), mds(0), tid(0) {}
838 void encode(bufferlist& bl) const;
839 void decode(bufferlist::iterator& bl);
840 void dump(Formatter *f) const;
841 static void generate_test_instances(list<mds_table_pending_t*>& ls);
842};
843WRITE_CLASS_ENCODER(mds_table_pending_t)
844
845
846// =========
847// requests
848
849struct metareqid_t {
850 entity_name_t name;
851 uint64_t tid;
852 metareqid_t() : tid(0) {}
853 metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
854 void encode(bufferlist& bl) const {
855 ::encode(name, bl);
856 ::encode(tid, bl);
857 }
858 void decode(bufferlist::iterator &p) {
859 ::decode(name, p);
860 ::decode(tid, p);
861 }
862};
863WRITE_CLASS_ENCODER(metareqid_t)
864
865inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
866 return out << r.name << ":" << r.tid;
867}
868
869inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
870 return (l.name == r.name) && (l.tid == r.tid);
871}
872inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
873 return (l.name != r.name) || (l.tid != r.tid);
874}
875inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
876 return (l.name < r.name) ||
877 (l.name == r.name && l.tid < r.tid);
878}
879inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
880 return (l.name < r.name) ||
881 (l.name == r.name && l.tid <= r.tid);
882}
883inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); }
884inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); }
885
886namespace std {
887 template<> struct hash<metareqid_t> {
888 size_t operator()(const metareqid_t &r) const {
889 hash<uint64_t> H;
890 return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid);
891 }
892 };
893} // namespace std
894
895
896// cap info for client reconnect
897struct cap_reconnect_t {
898 string path;
899 mutable ceph_mds_cap_reconnect capinfo;
900 snapid_t snap_follows;
901 bufferlist flockbl;
902
903 cap_reconnect_t() {
904 memset(&capinfo, 0, sizeof(capinfo));
905 snap_follows = 0;
906 }
907 cap_reconnect_t(uint64_t cap_id, inodeno_t pino, const string& p, int w, int i,
908 inodeno_t sr, snapid_t sf, bufferlist& lb) :
909 path(p) {
910 capinfo.cap_id = cap_id;
911 capinfo.wanted = w;
912 capinfo.issued = i;
913 capinfo.snaprealm = sr;
914 capinfo.pathbase = pino;
915 capinfo.flock_len = 0;
916 snap_follows = sf;
917 flockbl.claim(lb);
918 }
919 void encode(bufferlist& bl) const;
920 void decode(bufferlist::iterator& bl);
921 void encode_old(bufferlist& bl) const;
922 void decode_old(bufferlist::iterator& bl);
923
924 void dump(Formatter *f) const;
925 static void generate_test_instances(list<cap_reconnect_t*>& ls);
926};
927WRITE_CLASS_ENCODER(cap_reconnect_t)
928
929
930// compat for pre-FLOCK feature
931struct old_ceph_mds_cap_reconnect {
932 __le64 cap_id;
933 __le32 wanted;
934 __le32 issued;
935 __le64 old_size;
936 struct ceph_timespec old_mtime, old_atime;
937 __le64 snaprealm;
938 __le64 pathbase; /* base ino for our path to this ino */
939} __attribute__ ((packed));
940WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect)
941
942struct old_cap_reconnect_t {
943 string path;
944 old_ceph_mds_cap_reconnect capinfo;
945
946 const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) {
947 path = n.path;
948 capinfo.cap_id = n.capinfo.cap_id;
949 capinfo.wanted = n.capinfo.wanted;
950 capinfo.issued = n.capinfo.issued;
951 capinfo.snaprealm = n.capinfo.snaprealm;
952 capinfo.pathbase = n.capinfo.pathbase;
953 return *this;
954 }
955 operator cap_reconnect_t() {
956 cap_reconnect_t n;
957 n.path = path;
958 n.capinfo.cap_id = capinfo.cap_id;
959 n.capinfo.wanted = capinfo.wanted;
960 n.capinfo.issued = capinfo.issued;
961 n.capinfo.snaprealm = capinfo.snaprealm;
962 n.capinfo.pathbase = capinfo.pathbase;
963 return n;
964 }
965
966 void encode(bufferlist& bl) const {
967 ::encode(path, bl);
968 ::encode(capinfo, bl);
969 }
970 void decode(bufferlist::iterator& bl) {
971 ::decode(path, bl);
972 ::decode(capinfo, bl);
973 }
974};
975WRITE_CLASS_ENCODER(old_cap_reconnect_t)
976
977
978// ================================================================
979// dir frag
980
981struct dirfrag_t {
982 inodeno_t ino;
983 frag_t frag;
984
985 dirfrag_t() : ino(0) { }
986 dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
987
988 void encode(bufferlist& bl) const {
989 ::encode(ino, bl);
990 ::encode(frag, bl);
991 }
992 void decode(bufferlist::iterator& bl) {
993 ::decode(ino, bl);
994 ::decode(frag, bl);
995 }
996};
997WRITE_CLASS_ENCODER(dirfrag_t)
998
999
1000inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) {
1001 out << df.ino;
1002 if (!df.frag.is_root()) out << "." << df.frag;
1003 return out;
1004}
1005inline bool operator<(dirfrag_t l, dirfrag_t r) {
1006 if (l.ino < r.ino) return true;
1007 if (l.ino == r.ino && l.frag < r.frag) return true;
1008 return false;
1009}
1010inline bool operator==(dirfrag_t l, dirfrag_t r) {
1011 return l.ino == r.ino && l.frag == r.frag;
1012}
1013
1014namespace std {
1015 template<> struct hash<dirfrag_t> {
1016 size_t operator()(const dirfrag_t &df) const {
1017 static rjhash<uint64_t> H;
1018 static rjhash<uint32_t> I;
1019 return H(df.ino) ^ I(df.frag);
1020 }
1021 };
1022} // namespace std
1023
1024
1025
1026// ================================================================
1027
// Indices of the popularity counters; META_NPOP is the number of them.
#define META_POP_IRD      0
#define META_POP_IWR      1
#define META_POP_READDIR  2
#define META_POP_FETCH    3
#define META_POP_STORE    4
#define META_NPOP         5
1034
1035class inode_load_vec_t {
1036 static const int NUM = 2;
1037 std::vector < DecayCounter > vec;
1038public:
1039 explicit inode_load_vec_t(const utime_t &now)
1040 : vec(NUM, DecayCounter(now))
1041 {}
1042 // for dencoder infrastructure
1043 inode_load_vec_t() :
1044 vec(NUM, DecayCounter())
1045 {}
1046 DecayCounter &get(int t) {
1047 assert(t < NUM);
1048 return vec[t];
1049 }
1050 void zero(utime_t now) {
1051 for (int i=0; i<NUM; i++)
1052 vec[i].reset(now);
1053 }
1054 void encode(bufferlist &bl) const;
1055 void decode(const utime_t &t, bufferlist::iterator &p);
1056 // for dencoder
1057 void decode(bufferlist::iterator& p) { utime_t sample; decode(sample, p); }
1058 void dump(Formatter *f);
1059 static void generate_test_instances(list<inode_load_vec_t*>& ls);
1060};
1061inline void encode(const inode_load_vec_t &c, bufferlist &bl) { c.encode(bl); }
1062inline void decode(inode_load_vec_t & c, const utime_t &t, bufferlist::iterator &p) {
1063 c.decode(t, p);
1064}
1065// for dencoder
1066inline void decode(inode_load_vec_t & c, bufferlist::iterator &p) {
1067 utime_t sample;
1068 c.decode(sample, p);
1069}
1070
1071class dirfrag_load_vec_t {
1072public:
1073 static const int NUM = 5;
1074 std::vector < DecayCounter > vec;
1075 explicit dirfrag_load_vec_t(const utime_t &now)
1076 : vec(NUM, DecayCounter(now))
1077 { }
1078 // for dencoder infrastructure
1079 dirfrag_load_vec_t()
1080 : vec(NUM, DecayCounter())
1081 {}
1082 void encode(bufferlist &bl) const {
1083 ENCODE_START(2, 2, bl);
1084 for (int i=0; i<NUM; i++)
1085 ::encode(vec[i], bl);
1086 ENCODE_FINISH(bl);
1087 }
1088 void decode(const utime_t &t, bufferlist::iterator &p) {
1089 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
1090 for (int i=0; i<NUM; i++)
1091 ::decode(vec[i], t, p);
1092 DECODE_FINISH(p);
1093 }
1094 // for dencoder infrastructure
1095 void decode(bufferlist::iterator& p) {
1096 utime_t sample;
1097 decode(sample, p);
1098 }
1099 void dump(Formatter *f) const;
1100 static void generate_test_instances(list<dirfrag_load_vec_t*>& ls);
1101
1102 DecayCounter &get(int t) {
1103 assert(t < NUM);
1104 return vec[t];
1105 }
1106 void adjust(utime_t now, const DecayRate& rate, double d) {
1107 for (int i=0; i<NUM; i++)
1108 vec[i].adjust(now, rate, d);
1109 }
1110 void zero(utime_t now) {
1111 for (int i=0; i<NUM; i++)
1112 vec[i].reset(now);
1113 }
1114 double meta_load(utime_t now, const DecayRate& rate) {
1115 return
1116 1*vec[META_POP_IRD].get(now, rate) +
1117 2*vec[META_POP_IWR].get(now, rate) +
1118 1*vec[META_POP_READDIR].get(now, rate) +
1119 2*vec[META_POP_FETCH].get(now, rate) +
1120 4*vec[META_POP_STORE].get(now, rate);
1121 }
1122 double meta_load() {
1123 return
1124 1*vec[META_POP_IRD].get_last() +
1125 2*vec[META_POP_IWR].get_last() +
1126 1*vec[META_POP_READDIR].get_last() +
1127 2*vec[META_POP_FETCH].get_last() +
1128 4*vec[META_POP_STORE].get_last();
1129 }
1130
1131 void add(utime_t now, DecayRate& rate, dirfrag_load_vec_t& r) {
1132 for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
1133 vec[i].adjust(r.vec[i].get(now, rate));
1134 }
1135 void sub(utime_t now, DecayRate& rate, dirfrag_load_vec_t& r) {
1136 for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
1137 vec[i].adjust(-r.vec[i].get(now, rate));
1138 }
1139 void scale(double f) {
1140 for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
1141 vec[i].scale(f);
1142 }
1143};
1144
1145inline void encode(const dirfrag_load_vec_t &c, bufferlist &bl) { c.encode(bl); }
1146inline void decode(dirfrag_load_vec_t& c, const utime_t &t, bufferlist::iterator &p) {
1147 c.decode(t, p);
1148}
1149// this for dencoder
1150inline void decode(dirfrag_load_vec_t& c, bufferlist::iterator &p) {
1151 utime_t sample;
1152 c.decode(sample, p);
1153}
1154
1155inline std::ostream& operator<<(std::ostream& out, dirfrag_load_vec_t& dl)
1156{
1157 // ugliness!
1158 utime_t now = ceph_clock_now();
1159 DecayRate rate(g_conf->mds_decay_halflife);
1160 return out << "[" << dl.vec[0].get(now, rate) << "," << dl.vec[1].get(now, rate)
1161 << " " << dl.meta_load(now, rate)
1162 << "]";
1163}
1164
1165
1166
1167
1168
1169
1170/* mds_load_t
1171 * mds load
1172 */
1173
1174struct mds_load_t {
1175 dirfrag_load_vec_t auth;
1176 dirfrag_load_vec_t all;
1177
1178 double req_rate;
1179 double cache_hit_rate;
1180 double queue_len;
1181
1182 double cpu_load_avg;
1183
1184 explicit mds_load_t(const utime_t &t) :
1185 auth(t), all(t), req_rate(0), cache_hit_rate(0),
1186 queue_len(0), cpu_load_avg(0)
1187 {}
1188 // mostly for the dencoder infrastructure
1189 mds_load_t() :
1190 auth(), all(),
1191 req_rate(0), cache_hit_rate(0), queue_len(0), cpu_load_avg(0)
1192 {}
1193
1194 double mds_load(); // defiend in MDBalancer.cc
1195 void encode(bufferlist& bl) const;
1196 void decode(const utime_t& now, bufferlist::iterator& bl);
1197 //this one is for dencoder infrastructure
1198 void decode(bufferlist::iterator& bl) { utime_t sample; decode(sample, bl); }
1199 void dump(Formatter *f) const;
1200 static void generate_test_instances(list<mds_load_t*>& ls);
1201};
1202inline void encode(const mds_load_t &c, bufferlist &bl) { c.encode(bl); }
1203inline void decode(mds_load_t &c, const utime_t &t, bufferlist::iterator &p) {
1204 c.decode(t, p);
1205}
1206// this one is for dencoder
1207inline void decode(mds_load_t &c, bufferlist::iterator &p) {
1208 utime_t sample;
1209 c.decode(sample, p);
1210}
1211
1212inline std::ostream& operator<<( std::ostream& out, mds_load_t& load )
1213{
1214 return out << "mdsload<" << load.auth << "/" << load.all
1215 << ", req " << load.req_rate
1216 << ", hr " << load.cache_hit_rate
1217 << ", qlen " << load.queue_len
1218 << ", cpu " << load.cpu_load_avg
1219 << ">";
1220}
1221
1222class load_spread_t {
1223public:
1224 static const int MAX = 4;
1225 int last[MAX];
1226 int p, n;
1227 DecayCounter count;
1228
1229public:
1230 load_spread_t() : p(0), n(0), count(ceph_clock_now())
1231 {
1232 for (int i=0; i<MAX; i++)
1233 last[i] = -1;
1234 }
1235
1236 double hit(utime_t now, const DecayRate& rate, int who) {
1237 for (int i=0; i<n; i++)
1238 if (last[i] == who)
1239 return count.get_last();
1240
1241 // we're new(ish)
1242 last[p++] = who;
1243 if (n < MAX) n++;
1244 if (n == 1) return 0.0;
1245
1246 if (p == MAX) p = 0;
1247
1248 return count.hit(now, rate);
1249 }
1250 double get(utime_t now, const DecayRate& rate) {
1251 return count.get(now, rate);
1252 }
1253};
1254
1255
1256
1257// ================================================================
1258typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t;
1259
1260// -- authority delegation --
1261// directory authority types
1262// >= 0 is the auth mds
1263#define CDIR_AUTH_PARENT mds_rank_t(-1) // default
1264#define CDIR_AUTH_UNKNOWN mds_rank_t(-2)
1265#define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
1266#define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
1267//#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
1268
1269class MDSCacheObjectInfo {
1270public:
1271 inodeno_t ino;
1272 dirfrag_t dirfrag;
1273 string dname;
1274 snapid_t snapid;
1275
1276 MDSCacheObjectInfo() : ino(0) {}
1277
1278 void encode(bufferlist& bl) const;
1279 void decode(bufferlist::iterator& bl);
1280 void dump(Formatter *f) const;
1281 static void generate_test_instances(list<MDSCacheObjectInfo*>& ls);
1282};
1283
1284inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) {
1285 if (info.ino) return out << info.ino << "." << info.snapid;
1286 if (info.dname.length()) return out << info.dirfrag << "/" << info.dname
1287 << " snap " << info.snapid;
1288 return out << info.dirfrag;
1289}
1290
1291inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) {
1292 if (l.ino || r.ino)
1293 return l.ino == r.ino && l.snapid == r.snapid;
1294 else
1295 return l.dirfrag == r.dirfrag && l.dname == r.dname;
1296}
1297WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
1298
1299
1300// parse a map of keys/values.
1301namespace qi = boost::spirit::qi;
1302
1303template <typename Iterator>
1304struct keys_and_values
1305 : qi::grammar<Iterator, std::map<string, string>()>
1306{
1307 keys_and_values()
1308 : keys_and_values::base_type(query)
1309 {
1310 query = pair >> *(qi::lit(' ') >> pair);
1311 pair = key >> '=' >> value;
1312 key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
1313 value = +qi::char_("a-zA-Z_0-9");
1314 }
1315 qi::rule<Iterator, std::map<string, string>()> query;
1316 qi::rule<Iterator, std::pair<string, string>()> pair;
1317 qi::rule<Iterator, string()> key, value;
1318};
1319
1320#endif