]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/mdstypes.h
import ceph 14.2.5
[ceph.git] / ceph / src / mds / mdstypes.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_MDSTYPES_H
4 #define CEPH_MDSTYPES_H
5
6 #include "include/int_types.h"
7
8 #include <math.h>
9 #include <ostream>
10 #include <set>
11 #include <map>
12 #include <string_view>
13
14 #include "common/config.h"
15 #include "common/Clock.h"
16 #include "common/DecayCounter.h"
17 #include "common/entity_name.h"
18
19 #include "include/Context.h"
20 #include "include/frag.h"
21 #include "include/xlist.h"
22 #include "include/interval_set.h"
23 #include "include/compact_map.h"
24 #include "include/compact_set.h"
25 #include "include/fs_types.h"
26
27 #include "inode_backtrace.h"
28
29 #include <boost/spirit/include/qi.hpp>
30 #include <boost/pool/pool.hpp>
31 #include "include/ceph_assert.h"
32 #include <boost/serialization/strong_typedef.hpp>
33
#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"

#define MDS_PORT_CACHE    0x200
#define MDS_PORT_LOCKER   0x300
#define MDS_PORT_MIGRATOR 0x400

#define MAX_MDS   0x100
#define NUM_STRAY 10

#define MDS_INO_ROOT              1

// No longer created but recognised in existing filesystems
// so that we don't try to fragment it.
#define MDS_INO_CEPH              2

#define MDS_INO_GLOBAL_SNAPREALM  3

#define MDS_INO_MDSDIR_OFFSET     (1*MAX_MDS)
#define MDS_INO_STRAY_OFFSET      (6*MAX_MDS)

// Locations for journal data
#define MDS_INO_LOG_OFFSET         (2*MAX_MDS)
#define MDS_INO_LOG_BACKUP_OFFSET  (3*MAX_MDS)
#define MDS_INO_LOG_POINTER_OFFSET (4*MAX_MDS)
#define MDS_INO_PURGE_QUEUE        (5*MAX_MDS)

// First value past the stray range (MAX_MDS * NUM_STRAY entries starting at
// MDS_INO_STRAY_OFFSET).
#define MDS_INO_SYSTEM_BASE       ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))

// Stray number i of owner x: NUM_STRAY consecutive values per owner.
#define MDS_INO_STRAY(x,i)  (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
// The argument is parenthesised before the cast so that an expression
// argument (e.g. "a / b") is converted as a whole rather than only its
// first operand.
#define MDS_INO_MDSDIR(x)   (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))

// Range checks and inverse mappings for the two helpers above.
#define MDS_INO_IS_STRAY(i)     ((i) >= MDS_INO_STRAY_OFFSET  && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
#define MDS_INO_IS_MDSDIR(i)    ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
#define MDS_INO_IS_BASE(i)      ((i) == MDS_INO_ROOT || (i) == MDS_INO_GLOBAL_SNAPREALM || MDS_INO_IS_MDSDIR(i))
#define MDS_INO_STRAY_OWNER(i)  (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
#define MDS_INO_STRAY_INDEX(i)  (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)

#define MDS_TRAVERSE_FORWARD       1
#define MDS_TRAVERSE_DISCOVER      2    // skips permissions checks etc.
#define MDS_TRAVERSE_DISCOVERXLOCK 3    // succeeds on (foreign?) null, xlocked dentries.
75
76
77 typedef int32_t mds_rank_t;
78 constexpr mds_rank_t MDS_RANK_NONE = -1;
79
80 BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
81 extern const mds_gid_t MDS_GID_NONE;
82
83 typedef int32_t fs_cluster_id_t;
84 constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
85 // The namespace ID of the anonymous default filesystem from legacy systems
86 constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0;
87
88 class mds_role_t
89 {
90 public:
91 fs_cluster_id_t fscid;
92 mds_rank_t rank;
93
94 mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_)
95 : fscid(fscid_), rank(rank_)
96 {}
97 mds_role_t()
98 : fscid(FS_CLUSTER_ID_NONE), rank(MDS_RANK_NONE)
99 {}
100 bool operator<(mds_role_t const &rhs) const
101 {
102 if (fscid < rhs.fscid) {
103 return true;
104 } else if (fscid == rhs.fscid) {
105 return rank < rhs.rank;
106 } else {
107 return false;
108 }
109 }
110
111 bool is_none() const
112 {
113 return (rank == MDS_RANK_NONE);
114 }
115 };
116 std::ostream& operator<<(std::ostream &out, const mds_role_t &role);
117
118
119 // CAPS
120
121 inline string gcap_string(int cap)
122 {
123 string s;
124 if (cap & CEPH_CAP_GSHARED) s += "s";
125 if (cap & CEPH_CAP_GEXCL) s += "x";
126 if (cap & CEPH_CAP_GCACHE) s += "c";
127 if (cap & CEPH_CAP_GRD) s += "r";
128 if (cap & CEPH_CAP_GWR) s += "w";
129 if (cap & CEPH_CAP_GBUFFER) s += "b";
130 if (cap & CEPH_CAP_GWREXTEND) s += "a";
131 if (cap & CEPH_CAP_GLAZYIO) s += "l";
132 return s;
133 }
134 inline string ccap_string(int cap)
135 {
136 string s;
137 if (cap & CEPH_CAP_PIN) s += "p";
138
139 int a = (cap >> CEPH_CAP_SAUTH) & 3;
140 if (a) s += 'A' + gcap_string(a);
141
142 a = (cap >> CEPH_CAP_SLINK) & 3;
143 if (a) s += 'L' + gcap_string(a);
144
145 a = (cap >> CEPH_CAP_SXATTR) & 3;
146 if (a) s += 'X' + gcap_string(a);
147
148 a = cap >> CEPH_CAP_SFILE;
149 if (a) s += 'F' + gcap_string(a);
150
151 if (s.length() == 0)
152 s = "-";
153 return s;
154 }
155
156
157 struct scatter_info_t {
158 version_t version = 0;
159
160 scatter_info_t() {}
161 };
162
163 struct frag_info_t : public scatter_info_t {
164 // this frag
165 utime_t mtime;
166 uint64_t change_attr = 0;
167 int64_t nfiles = 0; // files
168 int64_t nsubdirs = 0; // subdirs
169
170 frag_info_t() {}
171
172 int64_t size() const { return nfiles + nsubdirs; }
173
174 void zero() {
175 *this = frag_info_t();
176 }
177
178 // *this += cur - acc;
179 void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
180 if (cur.mtime > mtime) {
181 mtime = cur.mtime;
182 if (touched_mtime)
183 *touched_mtime = true;
184 }
185 if (cur.change_attr > change_attr) {
186 change_attr = cur.change_attr;
187 if (touched_chattr)
188 *touched_chattr = true;
189 }
190 nfiles += cur.nfiles - acc.nfiles;
191 nsubdirs += cur.nsubdirs - acc.nsubdirs;
192 }
193
194 void add(const frag_info_t& other) {
195 if (other.mtime > mtime)
196 mtime = other.mtime;
197 if (other.change_attr > change_attr)
198 change_attr = other.change_attr;
199 nfiles += other.nfiles;
200 nsubdirs += other.nsubdirs;
201 }
202
203 bool same_sums(const frag_info_t &o) const {
204 return mtime <= o.mtime &&
205 nfiles == o.nfiles &&
206 nsubdirs == o.nsubdirs;
207 }
208
209 void encode(bufferlist &bl) const;
210 void decode(bufferlist::const_iterator& bl);
211 void dump(Formatter *f) const;
212 static void generate_test_instances(list<frag_info_t*>& ls);
213 };
214 WRITE_CLASS_ENCODER(frag_info_t)
215
216 inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
217 return memcmp(&l, &r, sizeof(l)) == 0;
218 }
219 inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
220 return !(l == r);
221 }
222
223 std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
224
225
226 struct nest_info_t : public scatter_info_t {
227 // this frag + children
228 utime_t rctime;
229 int64_t rbytes = 0;
230 int64_t rfiles = 0;
231 int64_t rsubdirs = 0;
232 int64_t rsize() const { return rfiles + rsubdirs; }
233
234 int64_t rsnaps = 0;
235
236 nest_info_t() {}
237
238 void zero() {
239 *this = nest_info_t();
240 }
241
242 void sub(const nest_info_t &other) {
243 add(other, -1);
244 }
245 void add(const nest_info_t &other, int fac=1) {
246 if (other.rctime > rctime)
247 rctime = other.rctime;
248 rbytes += fac*other.rbytes;
249 rfiles += fac*other.rfiles;
250 rsubdirs += fac*other.rsubdirs;
251 rsnaps += fac*other.rsnaps;
252 }
253
254 // *this += cur - acc;
255 void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
256 if (cur.rctime > rctime)
257 rctime = cur.rctime;
258 rbytes += cur.rbytes - acc.rbytes;
259 rfiles += cur.rfiles - acc.rfiles;
260 rsubdirs += cur.rsubdirs - acc.rsubdirs;
261 rsnaps += cur.rsnaps - acc.rsnaps;
262 }
263
264 bool same_sums(const nest_info_t &o) const {
265 return rctime <= o.rctime &&
266 rbytes == o.rbytes &&
267 rfiles == o.rfiles &&
268 rsubdirs == o.rsubdirs &&
269 rsnaps == o.rsnaps;
270 }
271
272 void encode(bufferlist &bl) const;
273 void decode(bufferlist::const_iterator& bl);
274 void dump(Formatter *f) const;
275 static void generate_test_instances(list<nest_info_t*>& ls);
276 };
277 WRITE_CLASS_ENCODER(nest_info_t)
278
279 inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
280 return memcmp(&l, &r, sizeof(l)) == 0;
281 }
282 inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
283 return !(l == r);
284 }
285
286 std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
287
288
289 struct vinodeno_t {
290 inodeno_t ino;
291 snapid_t snapid;
292 vinodeno_t() {}
293 vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
294
295 void encode(bufferlist& bl) const {
296 using ceph::encode;
297 encode(ino, bl);
298 encode(snapid, bl);
299 }
300 void decode(bufferlist::const_iterator& p) {
301 using ceph::decode;
302 decode(ino, p);
303 decode(snapid, p);
304 }
305 };
306 WRITE_CLASS_ENCODER(vinodeno_t)
307
308 inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
309 return l.ino == r.ino && l.snapid == r.snapid;
310 }
311 inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
312 return !(l == r);
313 }
314 inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
315 return
316 l.ino < r.ino ||
317 (l.ino == r.ino && l.snapid < r.snapid);
318 }
319
320 struct quota_info_t
321 {
322 int64_t max_bytes = 0;
323 int64_t max_files = 0;
324
325 quota_info_t() {}
326
327 void encode(bufferlist& bl) const {
328 ENCODE_START(1, 1, bl);
329 encode(max_bytes, bl);
330 encode(max_files, bl);
331 ENCODE_FINISH(bl);
332 }
333 void decode(bufferlist::const_iterator& p) {
334 DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
335 decode(max_bytes, p);
336 decode(max_files, p);
337 DECODE_FINISH(p);
338 }
339
340 void dump(Formatter *f) const;
341 static void generate_test_instances(list<quota_info_t *>& ls);
342
343 bool is_valid() const {
344 return max_bytes >=0 && max_files >=0;
345 }
346 bool is_enable() const {
347 return max_bytes || max_files;
348 }
349 };
350 WRITE_CLASS_ENCODER(quota_info_t)
351
352 inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
353 return memcmp(&l, &r, sizeof(l)) == 0;
354 }
355
356 ostream& operator<<(ostream &out, const quota_info_t &n);
357
358 namespace std {
359 template<> struct hash<vinodeno_t> {
360 size_t operator()(const vinodeno_t &vino) const {
361 hash<inodeno_t> H;
362 hash<uint64_t> I;
363 return H(vino.ino) ^ I(vino.snapid);
364 }
365 };
366 } // namespace std
367
368
369
370
371 inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) {
372 out << vino.ino;
373 if (vino.snapid == CEPH_NOSNAP)
374 out << ".head";
375 else if (vino.snapid)
376 out << '.' << vino.snapid;
377 return out;
378 }
379
380
381 /*
382 * client_writeable_range_t
383 */
384 struct client_writeable_range_t {
385 struct byte_range_t {
386 uint64_t first = 0, last = 0; // interval client can write to
387 byte_range_t() {}
388 };
389
390 byte_range_t range;
391 snapid_t follows = 0; // aka "data+metadata flushed thru"
392
393 client_writeable_range_t() {}
394
395 void encode(bufferlist &bl) const;
396 void decode(bufferlist::const_iterator& bl);
397 void dump(Formatter *f) const;
398 static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
399 };
400
401 inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::const_iterator& bl) {
402 decode(range.first, bl);
403 decode(range.last, bl);
404 }
405
406 WRITE_CLASS_ENCODER(client_writeable_range_t)
407
408 std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
409
410 inline bool operator==(const client_writeable_range_t& l,
411 const client_writeable_range_t& r) {
412 return l.range.first == r.range.first && l.range.last == r.range.last &&
413 l.follows == r.follows;
414 }
415
416 struct inline_data_t {
417 private:
418 std::unique_ptr<bufferlist> blp;
419 public:
420 version_t version = 1;
421
422 void free_data() {
423 blp.reset();
424 }
425 bufferlist& get_data() {
426 if (!blp)
427 blp.reset(new bufferlist);
428 return *blp;
429 }
430 size_t length() const { return blp ? blp->length() : 0; }
431
432 inline_data_t() {}
433 inline_data_t(const inline_data_t& o) : version(o.version) {
434 if (o.blp)
435 get_data() = *o.blp;
436 }
437 inline_data_t& operator=(const inline_data_t& o) {
438 version = o.version;
439 if (o.blp)
440 get_data() = *o.blp;
441 else
442 free_data();
443 return *this;
444 }
445 bool operator==(const inline_data_t& o) const {
446 return length() == o.length() &&
447 (length() == 0 ||
448 (*const_cast<bufferlist*>(blp.get()) == *const_cast<bufferlist*>(o.blp.get())));
449 }
450 bool operator!=(const inline_data_t& o) const {
451 return !(*this == o);
452 }
453 void encode(bufferlist &bl) const;
454 void decode(bufferlist::const_iterator& bl);
455 };
456 WRITE_CLASS_ENCODER(inline_data_t)
457
// Damage categories, numbered consecutively from 0.
enum {
  DAMAGE_STATS = 0,     // statistics (dirstat, size, etc)
  DAMAGE_RSTATS = 1,    // recursive statistics (rstat, accounted_rstat)
  DAMAGE_FRAGTREE = 2   // fragtree -- repair by searching
};
using damage_flags_t = uint32_t;
464
465 /*
466 * inode_t
467 */
468 template<template<typename> class Allocator = std::allocator>
469 struct inode_t {
470 /**
471 * ***************
472 * Do not forget to add any new fields to the compare() function.
473 * ***************
474 */
475 // base (immutable)
476 inodeno_t ino = 0;
477 uint32_t rdev = 0; // if special file
478
479 // affected by any inode change...
480 utime_t ctime; // inode change time
481 utime_t btime; // birth time
482
483 // perm (namespace permissions)
484 uint32_t mode = 0;
485 uid_t uid = 0;
486 gid_t gid = 0;
487
488 // nlink
489 int32_t nlink = 0;
490
491 // file (data access)
492 ceph_dir_layout dir_layout; // [dir only]
493 file_layout_t layout;
494 compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
495 uint64_t size = 0; // on directory, # dentries
496 uint64_t max_size_ever = 0; // max size the file has ever been
497 uint32_t truncate_seq = 0;
498 uint64_t truncate_size = 0, truncate_from = 0;
499 uint32_t truncate_pending = 0;
500 utime_t mtime; // file data modify time.
501 utime_t atime; // file data access time.
502 uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes())
503 inline_data_t inline_data; // FIXME check
504
505 // change attribute
506 uint64_t change_attr = 0;
507
508 using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
509 client_range_map client_ranges; // client(s) can write to these ranges
510
511 // dirfrag, recursive accountin
512 frag_info_t dirstat; // protected by my filelock
513 nest_info_t rstat; // protected by my nestlock
514 nest_info_t accounted_rstat; // protected by parent's nestlock
515
516 quota_info_t quota;
517
518 mds_rank_t export_pin = MDS_RANK_NONE;
519
520 // special stuff
521 version_t version = 0; // auth only
522 version_t file_data_version = 0; // auth only
523 version_t xattr_version = 0;
524
525 utime_t last_scrub_stamp; // start time of last complete scrub
526 version_t last_scrub_version = 0;// (parent) start version of last complete scrub
527
528 version_t backtrace_version = 0;
529
530 snapid_t oldest_snap;
531
532 std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
533
534 inode_t()
535 {
536 clear_layout();
537 memset(&dir_layout, 0, sizeof(dir_layout));
538 }
539
540 // file type
541 bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
542 bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; }
543 bool is_file() const { return (mode & S_IFMT) == S_IFREG; }
544
545 bool is_truncating() const { return (truncate_pending > 0); }
546 void truncate(uint64_t old_size, uint64_t new_size) {
547 ceph_assert(new_size < old_size);
548 if (old_size > max_size_ever)
549 max_size_ever = old_size;
550 truncate_from = old_size;
551 size = new_size;
552 rstat.rbytes = new_size;
553 truncate_size = size;
554 truncate_seq++;
555 truncate_pending++;
556 }
557
558 bool has_layout() const {
559 return layout != file_layout_t();
560 }
561
562 void clear_layout() {
563 layout = file_layout_t();
564 }
565
566 uint64_t get_layout_size_increment() const {
567 return layout.get_period();
568 }
569
570 bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
571
572 uint64_t get_max_size() const {
573 uint64_t max = 0;
574 for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin();
575 p != client_ranges.end();
576 ++p)
577 if (p->second.range.last > max)
578 max = p->second.range.last;
579 return max;
580 }
581 void set_max_size(uint64_t new_max) {
582 if (new_max == 0) {
583 client_ranges.clear();
584 } else {
585 for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin();
586 p != client_ranges.end();
587 ++p)
588 p->second.range.last = new_max;
589 }
590 }
591
592 void trim_client_ranges(snapid_t last) {
593 std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin();
594 while (p != client_ranges.end()) {
595 if (p->second.follows >= last)
596 client_ranges.erase(p++);
597 else
598 ++p;
599 }
600 }
601
602 bool is_backtrace_updated() const {
603 return backtrace_version == version;
604 }
605 void update_backtrace(version_t pv=0) {
606 backtrace_version = pv ? pv : version;
607 }
608
609 void add_old_pool(int64_t l) {
610 backtrace_version = version;
611 old_pools.insert(l);
612 }
613
614 void encode(bufferlist &bl, uint64_t features) const;
615 void decode(bufferlist::const_iterator& bl);
616 void dump(Formatter *f) const;
617 static void generate_test_instances(std::list<inode_t*>& ls);
618 /**
619 * Compare this inode_t with another that represent *the same inode*
620 * at different points in time.
621 * @pre The inodes are the same ino
622 *
623 * @param other The inode_t to compare ourselves with
624 * @param divergent A bool pointer which will be set to true
625 * if the values are different in a way that can't be explained
626 * by one being a newer version than the other.
627 *
628 * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
629 */
630 int compare(const inode_t &other, bool *divergent) const;
631 private:
632 bool older_is_consistent(const inode_t &other) const;
633 };
634
635 // These methods may be moved back to mdstypes.cc when we have pmr
636 template<template<typename> class Allocator>
637 void inode_t<Allocator>::encode(bufferlist &bl, uint64_t features) const
638 {
639 ENCODE_START(15, 6, bl);
640
641 encode(ino, bl);
642 encode(rdev, bl);
643 encode(ctime, bl);
644
645 encode(mode, bl);
646 encode(uid, bl);
647 encode(gid, bl);
648
649 encode(nlink, bl);
650 {
651 // removed field
652 bool anchored = 0;
653 encode(anchored, bl);
654 }
655
656 encode(dir_layout, bl);
657 encode(layout, bl, features);
658 encode(size, bl);
659 encode(truncate_seq, bl);
660 encode(truncate_size, bl);
661 encode(truncate_from, bl);
662 encode(truncate_pending, bl);
663 encode(mtime, bl);
664 encode(atime, bl);
665 encode(time_warp_seq, bl);
666 encode(client_ranges, bl);
667
668 encode(dirstat, bl);
669 encode(rstat, bl);
670 encode(accounted_rstat, bl);
671
672 encode(version, bl);
673 encode(file_data_version, bl);
674 encode(xattr_version, bl);
675 encode(backtrace_version, bl);
676 encode(old_pools, bl);
677 encode(max_size_ever, bl);
678 encode(inline_data, bl);
679 encode(quota, bl);
680
681 encode(stray_prior_path, bl);
682
683 encode(last_scrub_version, bl);
684 encode(last_scrub_stamp, bl);
685
686 encode(btime, bl);
687 encode(change_attr, bl);
688
689 encode(export_pin, bl);
690
691 ENCODE_FINISH(bl);
692 }
693
694 template<template<typename> class Allocator>
695 void inode_t<Allocator>::decode(bufferlist::const_iterator &p)
696 {
697 DECODE_START_LEGACY_COMPAT_LEN(15, 6, 6, p);
698
699 decode(ino, p);
700 decode(rdev, p);
701 decode(ctime, p);
702
703 decode(mode, p);
704 decode(uid, p);
705 decode(gid, p);
706
707 decode(nlink, p);
708 {
709 bool anchored;
710 decode(anchored, p);
711 }
712
713 if (struct_v >= 4)
714 decode(dir_layout, p);
715 else
716 memset(&dir_layout, 0, sizeof(dir_layout));
717 decode(layout, p);
718 decode(size, p);
719 decode(truncate_seq, p);
720 decode(truncate_size, p);
721 decode(truncate_from, p);
722 if (struct_v >= 5)
723 decode(truncate_pending, p);
724 else
725 truncate_pending = 0;
726 decode(mtime, p);
727 decode(atime, p);
728 decode(time_warp_seq, p);
729 if (struct_v >= 3) {
730 decode(client_ranges, p);
731 } else {
732 map<client_t, client_writeable_range_t::byte_range_t> m;
733 decode(m, p);
734 for (map<client_t, client_writeable_range_t::byte_range_t>::iterator
735 q = m.begin(); q != m.end(); ++q)
736 client_ranges[q->first].range = q->second;
737 }
738
739 decode(dirstat, p);
740 decode(rstat, p);
741 decode(accounted_rstat, p);
742
743 decode(version, p);
744 decode(file_data_version, p);
745 decode(xattr_version, p);
746 if (struct_v >= 2)
747 decode(backtrace_version, p);
748 if (struct_v >= 7)
749 decode(old_pools, p);
750 if (struct_v >= 8)
751 decode(max_size_ever, p);
752 if (struct_v >= 9) {
753 decode(inline_data, p);
754 } else {
755 inline_data.version = CEPH_INLINE_NONE;
756 }
757 if (struct_v < 10)
758 backtrace_version = 0; // force update backtrace
759 if (struct_v >= 11)
760 decode(quota, p);
761
762 if (struct_v >= 12) {
763 std::string tmp;
764 decode(tmp, p);
765 stray_prior_path = std::string_view(tmp);
766 }
767
768 if (struct_v >= 13) {
769 decode(last_scrub_version, p);
770 decode(last_scrub_stamp, p);
771 }
772 if (struct_v >= 14) {
773 decode(btime, p);
774 decode(change_attr, p);
775 } else {
776 btime = utime_t();
777 change_attr = 0;
778 }
779
780 if (struct_v >= 15) {
781 decode(export_pin, p);
782 } else {
783 export_pin = MDS_RANK_NONE;
784 }
785
786 DECODE_FINISH(p);
787 }
788
789 template<template<typename> class Allocator>
790 void inode_t<Allocator>::dump(Formatter *f) const
791 {
792 f->dump_unsigned("ino", ino);
793 f->dump_unsigned("rdev", rdev);
794 f->dump_stream("ctime") << ctime;
795 f->dump_stream("btime") << btime;
796 f->dump_unsigned("mode", mode);
797 f->dump_unsigned("uid", uid);
798 f->dump_unsigned("gid", gid);
799 f->dump_unsigned("nlink", nlink);
800
801 f->open_object_section("dir_layout");
802 ::dump(dir_layout, f);
803 f->close_section();
804
805 f->dump_object("layout", layout);
806
807 f->open_array_section("old_pools");
808 for (const auto &p : old_pools) {
809 f->dump_int("pool", p);
810 }
811 f->close_section();
812
813 f->dump_unsigned("size", size);
814 f->dump_unsigned("truncate_seq", truncate_seq);
815 f->dump_unsigned("truncate_size", truncate_size);
816 f->dump_unsigned("truncate_from", truncate_from);
817 f->dump_unsigned("truncate_pending", truncate_pending);
818 f->dump_stream("mtime") << mtime;
819 f->dump_stream("atime") << atime;
820 f->dump_unsigned("time_warp_seq", time_warp_seq);
821 f->dump_unsigned("change_attr", change_attr);
822 f->dump_int("export_pin", export_pin);
823
824 f->open_array_section("client_ranges");
825 for (const auto &p : client_ranges) {
826 f->open_object_section("client");
827 f->dump_unsigned("client", p.first.v);
828 p.second.dump(f);
829 f->close_section();
830 }
831 f->close_section();
832
833 f->open_object_section("dirstat");
834 dirstat.dump(f);
835 f->close_section();
836
837 f->open_object_section("rstat");
838 rstat.dump(f);
839 f->close_section();
840
841 f->open_object_section("accounted_rstat");
842 accounted_rstat.dump(f);
843 f->close_section();
844
845 f->dump_unsigned("version", version);
846 f->dump_unsigned("file_data_version", file_data_version);
847 f->dump_unsigned("xattr_version", xattr_version);
848 f->dump_unsigned("backtrace_version", backtrace_version);
849
850 f->dump_string("stray_prior_path", stray_prior_path);
851 }
852
853 template<template<typename> class Allocator>
854 void inode_t<Allocator>::generate_test_instances(list<inode_t*>& ls)
855 {
856 ls.push_back(new inode_t<Allocator>);
857 ls.push_back(new inode_t<Allocator>);
858 ls.back()->ino = 1;
859 // i am lazy.
860 }
861
862 template<template<typename> class Allocator>
863 int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
864 {
865 ceph_assert(ino == other.ino);
866 *divergent = false;
867 if (version == other.version) {
868 if (rdev != other.rdev ||
869 ctime != other.ctime ||
870 btime != other.btime ||
871 mode != other.mode ||
872 uid != other.uid ||
873 gid != other.gid ||
874 nlink != other.nlink ||
875 memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
876 layout != other.layout ||
877 old_pools != other.old_pools ||
878 size != other.size ||
879 max_size_ever != other.max_size_ever ||
880 truncate_seq != other.truncate_seq ||
881 truncate_size != other.truncate_size ||
882 truncate_from != other.truncate_from ||
883 truncate_pending != other.truncate_pending ||
884 change_attr != other.change_attr ||
885 mtime != other.mtime ||
886 atime != other.atime ||
887 time_warp_seq != other.time_warp_seq ||
888 inline_data != other.inline_data ||
889 client_ranges != other.client_ranges ||
890 !(dirstat == other.dirstat) ||
891 !(rstat == other.rstat) ||
892 !(accounted_rstat == other.accounted_rstat) ||
893 file_data_version != other.file_data_version ||
894 xattr_version != other.xattr_version ||
895 backtrace_version != other.backtrace_version) {
896 *divergent = true;
897 }
898 return 0;
899 } else if (version > other.version) {
900 *divergent = !older_is_consistent(other);
901 return 1;
902 } else {
903 ceph_assert(version < other.version);
904 *divergent = !other.older_is_consistent(*this);
905 return -1;
906 }
907 }
908
909 template<template<typename> class Allocator>
910 bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
911 {
912 if (max_size_ever < other.max_size_ever ||
913 truncate_seq < other.truncate_seq ||
914 time_warp_seq < other.time_warp_seq ||
915 inline_data.version < other.inline_data.version ||
916 dirstat.version < other.dirstat.version ||
917 rstat.version < other.rstat.version ||
918 accounted_rstat.version < other.accounted_rstat.version ||
919 file_data_version < other.file_data_version ||
920 xattr_version < other.xattr_version ||
921 backtrace_version < other.backtrace_version) {
922 return false;
923 }
924 return true;
925 }
926
927 template<template<typename> class Allocator>
928 inline void encode(const inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
929 {
930 ENCODE_DUMP_PRE();
931 c.encode(bl, features);
932 ENCODE_DUMP_POST(cl);
933 }
934 template<template<typename> class Allocator>
935 inline void decode(inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
936 {
937 c.decode(p);
938 }
939
940 template<template<typename> class Allocator>
941 using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>;
942
943 template<template<typename> class Allocator>
944 using xattr_map = compact_map<alloc_string<Allocator>, bufferptr, std::less<alloc_string<Allocator>>, Allocator<std::pair<const alloc_string<Allocator>, bufferptr>>>; // FIXME bufferptr not in mempool
945
946 /*
947 * old_inode_t
948 */
949 template<template<typename> class Allocator = std::allocator>
950 struct old_inode_t {
951 snapid_t first;
952 inode_t<Allocator> inode;
953 xattr_map<Allocator> xattrs;
954
955 void encode(bufferlist &bl, uint64_t features) const;
956 void decode(bufferlist::const_iterator& bl);
957 void dump(Formatter *f) const;
958 static void generate_test_instances(std::list<old_inode_t*>& ls);
959 };
960
961 // These methods may be moved back to mdstypes.cc when we have pmr
962 template<template<typename> class Allocator>
963 void old_inode_t<Allocator>::encode(bufferlist& bl, uint64_t features) const
964 {
965 ENCODE_START(2, 2, bl);
966 encode(first, bl);
967 encode(inode, bl, features);
968 encode(xattrs, bl);
969 ENCODE_FINISH(bl);
970 }
971
972 template<template<typename> class Allocator>
973 void old_inode_t<Allocator>::decode(bufferlist::const_iterator& bl)
974 {
975 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
976 decode(first, bl);
977 decode(inode, bl);
978 decode(xattrs, bl);
979 DECODE_FINISH(bl);
980 }
981
982 template<template<typename> class Allocator>
983 void old_inode_t<Allocator>::dump(Formatter *f) const
984 {
985 f->dump_unsigned("first", first);
986 inode.dump(f);
987 f->open_object_section("xattrs");
988 for (const auto &p : xattrs) {
989 std::string v(p.second.c_str(), p.second.length());
990 f->dump_string(p.first.c_str(), v);
991 }
992 f->close_section();
993 }
994
995 template<template<typename> class Allocator>
996 void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls)
997 {
998 ls.push_back(new old_inode_t<Allocator>);
999 ls.push_back(new old_inode_t<Allocator>);
1000 ls.back()->first = 2;
1001 std::list<inode_t<Allocator>*> ils;
1002 inode_t<Allocator>::generate_test_instances(ils);
1003 ls.back()->inode = *ils.back();
1004 ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4);
1005 ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3);
1006 }
1007
1008 template<template<typename> class Allocator>
1009 inline void encode(const old_inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
1010 {
1011 ENCODE_DUMP_PRE();
1012 c.encode(bl, features);
1013 ENCODE_DUMP_POST(cl);
1014 }
1015 template<template<typename> class Allocator>
1016 inline void decode(old_inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
1017 {
1018 c.decode(p);
1019 }
1020
1021
1022 /*
1023 * like an inode, but for a dir frag
1024 */
1025 struct fnode_t {
1026 version_t version = 0;
1027 snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru
1028 frag_info_t fragstat, accounted_fragstat;
1029 nest_info_t rstat, accounted_rstat;
1030 damage_flags_t damage_flags = 0;
1031
1032 // we know we and all our descendants have been scrubbed since this version
1033 version_t recursive_scrub_version = 0;
1034 utime_t recursive_scrub_stamp;
1035 // version at which we last scrubbed our personal data structures
1036 version_t localized_scrub_version = 0;
1037 utime_t localized_scrub_stamp;
1038
1039 void encode(bufferlist &bl) const;
1040 void decode(bufferlist::const_iterator& bl);
1041 void dump(Formatter *f) const;
1042 static void generate_test_instances(list<fnode_t*>& ls);
1043 fnode_t() {}
1044 };
1045 WRITE_CLASS_ENCODER(fnode_t)
1046
1047
1048 struct old_rstat_t {
1049 snapid_t first;
1050 nest_info_t rstat, accounted_rstat;
1051
1052 void encode(bufferlist& bl) const;
1053 void decode(bufferlist::const_iterator& p);
1054 void dump(Formatter *f) const;
1055 static void generate_test_instances(list<old_rstat_t*>& ls);
1056 };
1057 WRITE_CLASS_ENCODER(old_rstat_t)
1058
1059 inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
1060 return out << "old_rstat(first " << o.first << " " << o.rstat << " " << o.accounted_rstat << ")";
1061 }
1062
1063 /*
1064 * feature_bitset_t
1065 */
1066 class feature_bitset_t {
1067 public:
1068 typedef uint64_t block_type;
1069 static const size_t bits_per_block = sizeof(block_type) * 8;
1070
1071 feature_bitset_t(const feature_bitset_t& other) : _vec(other._vec) {}
1072 feature_bitset_t(feature_bitset_t&& other) : _vec(std::move(other._vec)) {}
1073 feature_bitset_t(unsigned long value = 0);
1074 feature_bitset_t(const vector<size_t>& array);
1075 feature_bitset_t& operator=(const feature_bitset_t& other) {
1076 _vec = other._vec;
1077 return *this;
1078 }
1079 feature_bitset_t& operator=(feature_bitset_t&& other) {
1080 _vec = std::move(other._vec);
1081 return *this;
1082 }
1083 bool empty() const {
1084 for (auto& v : _vec) {
1085 if (v)
1086 return false;
1087 }
1088 return true;
1089 }
1090 bool test(size_t bit) const {
1091 if (bit >= bits_per_block * _vec.size())
1092 return false;
1093 return _vec[bit / bits_per_block] & ((block_type)1 << (bit % bits_per_block));
1094 }
1095 void clear() {
1096 _vec.clear();
1097 }
1098 feature_bitset_t& operator-=(const feature_bitset_t& other);
1099 void encode(bufferlist& bl) const;
1100 void decode(bufferlist::const_iterator &p);
1101 void print(ostream& out) const;
1102 private:
1103 vector<block_type> _vec;
1104 };
1105 WRITE_CLASS_ENCODER(feature_bitset_t)
1106
1107 inline std::ostream& operator<<(std::ostream& out, const feature_bitset_t& s) {
1108 s.print(out);
1109 return out;
1110 }
1111
1112 /*
1113 * client_metadata_t
1114 */
1115 struct client_metadata_t {
1116 using kv_map_t = std::map<std::string,std::string>;
1117 using iterator = kv_map_t::const_iterator;
1118
1119 kv_map_t kv_map;
1120 feature_bitset_t features;
1121
1122 client_metadata_t() {}
1123 client_metadata_t(const client_metadata_t& other) :
1124 kv_map(other.kv_map), features(other.features) {}
1125 client_metadata_t(client_metadata_t&& other) :
1126 kv_map(std::move(other.kv_map)), features(std::move(other.features)) {}
1127 client_metadata_t(kv_map_t&& kv, feature_bitset_t &&f) :
1128 kv_map(std::move(kv)), features(std::move(f)) {}
1129 client_metadata_t(const kv_map_t& kv, const feature_bitset_t &f) :
1130 kv_map(kv), features(f) {}
1131 client_metadata_t& operator=(const client_metadata_t& other) {
1132 kv_map = other.kv_map;
1133 features = other.features;
1134 return *this;
1135 }
1136
1137 bool empty() const { return kv_map.empty() && features.empty(); }
1138 iterator find(const std::string& key) const { return kv_map.find(key); }
1139 iterator begin() const { return kv_map.begin(); }
1140 iterator end() const { return kv_map.end(); }
1141 std::string& operator[](const std::string& key) { return kv_map[key]; }
1142 void merge(const client_metadata_t& other) {
1143 kv_map.insert(other.kv_map.begin(), other.kv_map.end());
1144 features = other.features;
1145 }
1146 void clear() {
1147 kv_map.clear();
1148 features.clear();
1149 }
1150
1151 void encode(bufferlist& bl) const;
1152 void decode(bufferlist::const_iterator& p);
1153 void dump(Formatter *f) const;
1154 };
1155 WRITE_CLASS_ENCODER(client_metadata_t)
1156
1157 /*
1158 * session_info_t
1159 */
1160 struct session_info_t {
1161 entity_inst_t inst;
1162 std::map<ceph_tid_t,inodeno_t> completed_requests;
1163 interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use.
1164 interval_set<inodeno_t> used_inos; // journaling use
1165 client_metadata_t client_metadata;
1166 std::set<ceph_tid_t> completed_flushes;
1167 EntityName auth_name;
1168
1169 client_t get_client() const { return client_t(inst.name.num()); }
1170 bool has_feature(size_t bit) const { return client_metadata.features.test(bit); }
1171 const entity_name_t& get_source() const { return inst.name; }
1172
1173 void clear_meta() {
1174 prealloc_inos.clear();
1175 used_inos.clear();
1176 completed_requests.clear();
1177 completed_flushes.clear();
1178 client_metadata.clear();
1179 }
1180
1181 void encode(bufferlist& bl, uint64_t features) const;
1182 void decode(bufferlist::const_iterator& p);
1183 void dump(Formatter *f) const;
1184 static void generate_test_instances(list<session_info_t*>& ls);
1185 };
1186 WRITE_CLASS_ENCODER_FEATURES(session_info_t)
1187
1188
1189 // =======
1190 // dentries
1191
1192 struct dentry_key_t {
1193 snapid_t snapid = 0;
1194 std::string_view name;
1195 __u32 hash = 0;
1196 dentry_key_t() {}
1197 dentry_key_t(snapid_t s, std::string_view n, __u32 h=0) :
1198 snapid(s), name(n), hash(h) {}
1199
1200 bool is_valid() { return name.length() || snapid; }
1201
1202 // encode into something that can be decoded as a string.
1203 // name_ (head) or name_%x (!head)
1204 void encode(bufferlist& bl) const {
1205 string key;
1206 encode(key);
1207 using ceph::encode;
1208 encode(key, bl);
1209 }
1210 void encode(string& key) const {
1211 char b[20];
1212 if (snapid != CEPH_NOSNAP) {
1213 uint64_t val(snapid);
1214 snprintf(b, sizeof(b), "%" PRIx64, val);
1215 } else {
1216 snprintf(b, sizeof(b), "%s", "head");
1217 }
1218 ostringstream oss;
1219 oss << name << "_" << b;
1220 key = oss.str();
1221 }
1222 static void decode_helper(bufferlist::const_iterator& bl, string& nm, snapid_t& sn) {
1223 string key;
1224 decode(key, bl);
1225 decode_helper(key, nm, sn);
1226 }
1227 static void decode_helper(std::string_view key, string& nm, snapid_t& sn) {
1228 size_t i = key.find_last_of('_');
1229 ceph_assert(i != string::npos);
1230 if (key.compare(i+1, std::string_view::npos, "head") == 0) {
1231 // name_head
1232 sn = CEPH_NOSNAP;
1233 } else {
1234 // name_%x
1235 long long unsigned x = 0;
1236 std::string x_str(key.substr(i+1));
1237 sscanf(x_str.c_str(), "%llx", &x);
1238 sn = x;
1239 }
1240 nm = key.substr(0, i);
1241 }
1242 };
1243
1244 inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
1245 {
1246 return out << "(" << k.name << "," << k.snapid << ")";
1247 }
1248
1249 inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
1250 {
1251 /*
1252 * order by hash, name, snap
1253 */
1254 int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
1255 if (c)
1256 return c < 0;
1257 c = k1.name.compare(k2.name);
1258 if (c)
1259 return c < 0;
1260 return k1.snapid < k2.snapid;
1261 }
1262
1263
1264 /*
1265 * string_snap_t is a simple (string, snapid_t) pair
1266 */
1267 struct string_snap_t {
1268 string name;
1269 snapid_t snapid;
1270 string_snap_t() {}
1271 string_snap_t(std::string_view n, snapid_t s) : name(n), snapid(s) {}
1272
1273 void encode(bufferlist& bl) const;
1274 void decode(bufferlist::const_iterator& p);
1275 void dump(Formatter *f) const;
1276 static void generate_test_instances(list<string_snap_t*>& ls);
1277 };
1278 WRITE_CLASS_ENCODER(string_snap_t)
1279
1280 inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
1281 int c = l.name.compare(r.name);
1282 return c < 0 || (c == 0 && l.snapid < r.snapid);
1283 }
1284
1285 inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
1286 {
1287 return out << "(" << k.name << "," << k.snapid << ")";
1288 }
1289
1290 /*
1291 * mds_table_pending_t
1292 *
1293 * mds's requesting any pending ops. child needs to encode the corresponding
1294 * pending mutation state in the table.
1295 */
1296 struct mds_table_pending_t {
1297 uint64_t reqid = 0;
1298 __s32 mds = 0;
1299 version_t tid = 0;
1300 mds_table_pending_t() {}
1301 void encode(bufferlist& bl) const;
1302 void decode(bufferlist::const_iterator& bl);
1303 void dump(Formatter *f) const;
1304 static void generate_test_instances(list<mds_table_pending_t*>& ls);
1305 };
1306 WRITE_CLASS_ENCODER(mds_table_pending_t)
1307
1308
1309 // =========
1310 // requests
1311
1312 struct metareqid_t {
1313 entity_name_t name;
1314 uint64_t tid = 0;
1315 metareqid_t() {}
1316 metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
1317 void encode(bufferlist& bl) const {
1318 using ceph::encode;
1319 encode(name, bl);
1320 encode(tid, bl);
1321 }
1322 void decode(bufferlist::const_iterator &p) {
1323 using ceph::decode;
1324 decode(name, p);
1325 decode(tid, p);
1326 }
1327 };
1328 WRITE_CLASS_ENCODER(metareqid_t)
1329
1330 inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
1331 return out << r.name << ":" << r.tid;
1332 }
1333
1334 inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
1335 return (l.name == r.name) && (l.tid == r.tid);
1336 }
1337 inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
1338 return (l.name != r.name) || (l.tid != r.tid);
1339 }
1340 inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
1341 return (l.name < r.name) ||
1342 (l.name == r.name && l.tid < r.tid);
1343 }
1344 inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
1345 return (l.name < r.name) ||
1346 (l.name == r.name && l.tid <= r.tid);
1347 }
1348 inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); }
1349 inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); }
1350
1351 namespace std {
1352 template<> struct hash<metareqid_t> {
1353 size_t operator()(const metareqid_t &r) const {
1354 hash<uint64_t> H;
1355 return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid);
1356 }
1357 };
1358 } // namespace std
1359
1360
1361 // cap info for client reconnect
1362 struct cap_reconnect_t {
1363 string path;
1364 mutable ceph_mds_cap_reconnect capinfo;
1365 snapid_t snap_follows;
1366 bufferlist flockbl;
1367
1368 cap_reconnect_t() {
1369 memset(&capinfo, 0, sizeof(capinfo));
1370 snap_follows = 0;
1371 }
1372 cap_reconnect_t(uint64_t cap_id, inodeno_t pino, std::string_view p, int w, int i,
1373 inodeno_t sr, snapid_t sf, bufferlist& lb) :
1374 path(p) {
1375 capinfo.cap_id = cap_id;
1376 capinfo.wanted = w;
1377 capinfo.issued = i;
1378 capinfo.snaprealm = sr;
1379 capinfo.pathbase = pino;
1380 capinfo.flock_len = 0;
1381 snap_follows = sf;
1382 flockbl.claim(lb);
1383 }
1384 void encode(bufferlist& bl) const;
1385 void decode(bufferlist::const_iterator& bl);
1386 void encode_old(bufferlist& bl) const;
1387 void decode_old(bufferlist::const_iterator& bl);
1388
1389 void dump(Formatter *f) const;
1390 static void generate_test_instances(list<cap_reconnect_t*>& ls);
1391 };
1392 WRITE_CLASS_ENCODER(cap_reconnect_t)
1393
1394 struct snaprealm_reconnect_t {
1395 mutable ceph_mds_snaprealm_reconnect realm;
1396
1397 snaprealm_reconnect_t() {
1398 memset(&realm, 0, sizeof(realm));
1399 }
1400 snaprealm_reconnect_t(inodeno_t ino, snapid_t seq, inodeno_t parent) {
1401 realm.ino = ino;
1402 realm.seq = seq;
1403 realm.parent = parent;
1404 }
1405 void encode(bufferlist& bl) const;
1406 void decode(bufferlist::const_iterator& bl);
1407 void encode_old(bufferlist& bl) const;
1408 void decode_old(bufferlist::const_iterator& bl);
1409
1410 void dump(Formatter *f) const;
1411 static void generate_test_instances(list<snaprealm_reconnect_t*>& ls);
1412 };
1413 WRITE_CLASS_ENCODER(snaprealm_reconnect_t)
1414
1415 // compat for pre-FLOCK feature
1416 struct old_ceph_mds_cap_reconnect {
1417 ceph_le64 cap_id;
1418 ceph_le32 wanted;
1419 ceph_le32 issued;
1420 ceph_le64 old_size;
1421 struct ceph_timespec old_mtime, old_atime;
1422 ceph_le64 snaprealm;
1423 ceph_le64 pathbase; /* base ino for our path to this ino */
1424 } __attribute__ ((packed));
1425 WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect)
1426
1427 struct old_cap_reconnect_t {
1428 string path;
1429 old_ceph_mds_cap_reconnect capinfo;
1430
1431 const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) {
1432 path = n.path;
1433 capinfo.cap_id = n.capinfo.cap_id;
1434 capinfo.wanted = n.capinfo.wanted;
1435 capinfo.issued = n.capinfo.issued;
1436 capinfo.snaprealm = n.capinfo.snaprealm;
1437 capinfo.pathbase = n.capinfo.pathbase;
1438 return *this;
1439 }
1440 operator cap_reconnect_t() {
1441 cap_reconnect_t n;
1442 n.path = path;
1443 n.capinfo.cap_id = capinfo.cap_id;
1444 n.capinfo.wanted = capinfo.wanted;
1445 n.capinfo.issued = capinfo.issued;
1446 n.capinfo.snaprealm = capinfo.snaprealm;
1447 n.capinfo.pathbase = capinfo.pathbase;
1448 return n;
1449 }
1450
1451 void encode(bufferlist& bl) const {
1452 using ceph::encode;
1453 encode(path, bl);
1454 encode(capinfo, bl);
1455 }
1456 void decode(bufferlist::const_iterator& bl) {
1457 using ceph::decode;
1458 decode(path, bl);
1459 decode(capinfo, bl);
1460 }
1461 };
1462 WRITE_CLASS_ENCODER(old_cap_reconnect_t)
1463
1464
1465 // ================================================================
1466 // dir frag
1467
1468 struct dirfrag_t {
1469 inodeno_t ino = 0;
1470 frag_t frag;
1471
1472 dirfrag_t() {}
1473 dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
1474
1475 void encode(bufferlist& bl) const {
1476 using ceph::encode;
1477 encode(ino, bl);
1478 encode(frag, bl);
1479 }
1480 void decode(bufferlist::const_iterator& bl) {
1481 using ceph::decode;
1482 decode(ino, bl);
1483 decode(frag, bl);
1484 }
1485 };
1486 WRITE_CLASS_ENCODER(dirfrag_t)
1487
1488
1489 inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) {
1490 out << df.ino;
1491 if (!df.frag.is_root()) out << "." << df.frag;
1492 return out;
1493 }
1494 inline bool operator<(dirfrag_t l, dirfrag_t r) {
1495 if (l.ino < r.ino) return true;
1496 if (l.ino == r.ino && l.frag < r.frag) return true;
1497 return false;
1498 }
1499 inline bool operator==(dirfrag_t l, dirfrag_t r) {
1500 return l.ino == r.ino && l.frag == r.frag;
1501 }
1502
1503 namespace std {
1504 template<> struct hash<dirfrag_t> {
1505 size_t operator()(const dirfrag_t &df) const {
1506 static rjhash<uint64_t> H;
1507 static rjhash<uint32_t> I;
1508 return H(df.ino) ^ I(df.frag);
1509 }
1510 };
1511 } // namespace std
1512
1513
1514
1515 // ================================================================
1516
// Indices of the individual counters inside dirfrag_load_vec_t
// (see dirfrag_load_vec_t::meta_load() for how they are weighted).
#define META_POP_IRD      0
#define META_POP_IWR      1
#define META_POP_READDIR  2
#define META_POP_FETCH    3
#define META_POP_STORE    4
// One past the last index above, i.e. the number of META_POP_* counters.
#define META_NPOP         5
1523
1524 class inode_load_vec_t {
1525 public:
1526 using time = DecayCounter::time;
1527 using clock = DecayCounter::clock;
1528 static const size_t NUM = 2;
1529
1530 inode_load_vec_t() : vec{DecayCounter(DecayRate()), DecayCounter(DecayRate())} {}
1531 inode_load_vec_t(const DecayRate &rate) : vec{DecayCounter(rate), DecayCounter(rate)} {}
1532
1533 DecayCounter &get(int t) {
1534 return vec[t];
1535 }
1536 void zero() {
1537 for (auto &d : vec) {
1538 d.reset();
1539 }
1540 }
1541 void encode(bufferlist &bl) const;
1542 void decode(bufferlist::const_iterator& p);
1543 void dump(Formatter *f) const;
1544 static void generate_test_instances(list<inode_load_vec_t*>& ls);
1545
1546 private:
1547 std::array<DecayCounter, NUM> vec;
1548 };
1549 inline void encode(const inode_load_vec_t &c, bufferlist &bl) {
1550 c.encode(bl);
1551 }
1552 inline void decode(inode_load_vec_t & c, bufferlist::const_iterator &p) {
1553 c.decode(p);
1554 }
1555
1556 class dirfrag_load_vec_t {
1557 public:
1558 using time = DecayCounter::time;
1559 using clock = DecayCounter::clock;
1560 static const size_t NUM = 5;
1561
1562 dirfrag_load_vec_t() :
1563 vec{DecayCounter(DecayRate()),
1564 DecayCounter(DecayRate()),
1565 DecayCounter(DecayRate()),
1566 DecayCounter(DecayRate()),
1567 DecayCounter(DecayRate())
1568 }
1569 {}
1570 dirfrag_load_vec_t(const DecayRate &rate) :
1571 vec{DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate)}
1572 {}
1573
1574 void encode(bufferlist &bl) const {
1575 ENCODE_START(2, 2, bl);
1576 for (const auto &i : vec) {
1577 encode(i, bl);
1578 }
1579 ENCODE_FINISH(bl);
1580 }
1581 void decode(bufferlist::const_iterator &p) {
1582 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
1583 for (auto &i : vec) {
1584 decode(i, p);
1585 }
1586 DECODE_FINISH(p);
1587 }
1588 void dump(Formatter *f) const;
1589 void dump(Formatter *f, const DecayRate& rate) const;
1590 static void generate_test_instances(std::list<dirfrag_load_vec_t*>& ls);
1591
1592 const DecayCounter &get(int t) const {
1593 return vec[t];
1594 }
1595 DecayCounter &get(int t) {
1596 return vec[t];
1597 }
1598 void adjust(double d) {
1599 for (auto &i : vec) {
1600 i.adjust(d);
1601 }
1602 }
1603 void zero() {
1604 for (auto &i : vec) {
1605 i.reset();
1606 }
1607 }
1608 double meta_load() const {
1609 return
1610 1*vec[META_POP_IRD].get() +
1611 2*vec[META_POP_IWR].get() +
1612 1*vec[META_POP_READDIR].get() +
1613 2*vec[META_POP_FETCH].get() +
1614 4*vec[META_POP_STORE].get();
1615 }
1616
1617 void add(dirfrag_load_vec_t& r) {
1618 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1619 vec[i].adjust(r.vec[i].get());
1620 }
1621 void sub(dirfrag_load_vec_t& r) {
1622 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1623 vec[i].adjust(-r.vec[i].get());
1624 }
1625 void scale(double f) {
1626 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1627 vec[i].scale(f);
1628 }
1629
1630 private:
1631 friend inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl);
1632 std::array<DecayCounter, NUM> vec;
1633 };
1634
1635 inline void encode(const dirfrag_load_vec_t &c, bufferlist &bl) {
1636 c.encode(bl);
1637 }
1638 inline void decode(dirfrag_load_vec_t& c, bufferlist::const_iterator &p) {
1639 c.decode(p);
1640 }
1641
1642 inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
1643 {
1644 std::ostringstream ss;
1645 ss << std::setprecision(1) << std::fixed
1646 << "[pop"
1647 " IRD:" << dl.vec[0]
1648 << " IWR:" << dl.vec[1]
1649 << " RDR:" << dl.vec[2]
1650 << " FET:" << dl.vec[3]
1651 << " STR:" << dl.vec[4]
1652 << " *LOAD:" << dl.meta_load() << "]";
1653 return out << ss.str() << std::endl;
1654 }
1655
1656
1657 /* mds_load_t
1658 * mds load
1659 */
1660
1661 struct mds_load_t {
1662 using clock = dirfrag_load_vec_t::clock;
1663 using time = dirfrag_load_vec_t::time;
1664
1665 dirfrag_load_vec_t auth;
1666 dirfrag_load_vec_t all;
1667
1668 mds_load_t() : auth(DecayRate()), all(DecayRate()) {}
1669 mds_load_t(const DecayRate &rate) : auth(rate), all(rate) {}
1670
1671 double req_rate = 0.0;
1672 double cache_hit_rate = 0.0;
1673 double queue_len = 0.0;
1674
1675 double cpu_load_avg = 0.0;
1676
1677 double mds_load() const; // defiend in MDBalancer.cc
1678 void encode(bufferlist& bl) const;
1679 void decode(bufferlist::const_iterator& bl);
1680 void dump(Formatter *f) const;
1681 static void generate_test_instances(std::list<mds_load_t*>& ls);
1682 };
1683 inline void encode(const mds_load_t &c, bufferlist &bl) {
1684 c.encode(bl);
1685 }
1686 inline void decode(mds_load_t &c, bufferlist::const_iterator &p) {
1687 c.decode(p);
1688 }
1689
1690 inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
1691 {
1692 return out << "mdsload<" << load.auth << "/" << load.all
1693 << ", req " << load.req_rate
1694 << ", hr " << load.cache_hit_rate
1695 << ", qlen " << load.queue_len
1696 << ", cpu " << load.cpu_load_avg
1697 << ">";
1698 }
1699
1700 class load_spread_t {
1701 public:
1702 using time = DecayCounter::time;
1703 using clock = DecayCounter::clock;
1704 static const int MAX = 4;
1705 int last[MAX];
1706 int p = 0, n = 0;
1707 DecayCounter count;
1708
1709 public:
1710 load_spread_t() = delete;
1711 load_spread_t(const DecayRate &rate) : count(rate)
1712 {
1713 for (int i=0; i<MAX; i++)
1714 last[i] = -1;
1715 }
1716
1717 double hit(int who) {
1718 for (int i=0; i<n; i++)
1719 if (last[i] == who)
1720 return count.get_last();
1721
1722 // we're new(ish)
1723 last[p++] = who;
1724 if (n < MAX) n++;
1725 if (n == 1) return 0.0;
1726
1727 if (p == MAX) p = 0;
1728
1729 return count.hit();
1730 }
1731 double get() const {
1732 return count.get();
1733 }
1734 };
1735
1736
1737
1738 // ================================================================
1739 typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t;
1740
1741 // -- authority delegation --
1742 // directory authority types
1743 // >= 0 is the auth mds
1744 #define CDIR_AUTH_PARENT mds_rank_t(-1) // default
1745 #define CDIR_AUTH_UNKNOWN mds_rank_t(-2)
1746 #define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
1747 #define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
1748 //#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
1749
1750 class MDSCacheObjectInfo {
1751 public:
1752 inodeno_t ino = 0;
1753 dirfrag_t dirfrag;
1754 string dname;
1755 snapid_t snapid;
1756
1757 MDSCacheObjectInfo() {}
1758
1759 void encode(bufferlist& bl) const;
1760 void decode(bufferlist::const_iterator& bl);
1761 void dump(Formatter *f) const;
1762 static void generate_test_instances(list<MDSCacheObjectInfo*>& ls);
1763 };
1764
1765 inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) {
1766 if (info.ino) return out << info.ino << "." << info.snapid;
1767 if (info.dname.length()) return out << info.dirfrag << "/" << info.dname
1768 << " snap " << info.snapid;
1769 return out << info.dirfrag;
1770 }
1771
1772 inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) {
1773 if (l.ino || r.ino)
1774 return l.ino == r.ino && l.snapid == r.snapid;
1775 else
1776 return l.dirfrag == r.dirfrag && l.dname == r.dname;
1777 }
1778 WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
1779
1780
1781 // parse a map of keys/values.
1782 namespace qi = boost::spirit::qi;
1783
1784 template <typename Iterator>
1785 struct keys_and_values
1786 : qi::grammar<Iterator, std::map<string, string>()>
1787 {
1788 keys_and_values()
1789 : keys_and_values::base_type(query)
1790 {
1791 query = pair >> *(qi::lit(' ') >> pair);
1792 pair = key >> '=' >> value;
1793 key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
1794 value = +qi::char_("a-zA-Z_0-9");
1795 }
1796 qi::rule<Iterator, std::map<string, string>()> query;
1797 qi::rule<Iterator, std::pair<string, string>()> pair;
1798 qi::rule<Iterator, string()> key, value;
1799 };
1800
1801 #endif