]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/mdstypes.h
import 15.2.5
[ceph.git] / ceph / src / mds / mdstypes.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_MDSTYPES_H
4 #define CEPH_MDSTYPES_H
5
6 #include "include/int_types.h"
7
8 #include <math.h>
9 #include <ostream>
10 #include <set>
11 #include <map>
12 #include <string_view>
13
14 #include "common/config.h"
15 #include "common/Clock.h"
16 #include "common/DecayCounter.h"
17 #include "common/entity_name.h"
18
19 #include "include/Context.h"
20 #include "include/frag.h"
21 #include "include/xlist.h"
22 #include "include/interval_set.h"
23 #include "include/compact_map.h"
24 #include "include/compact_set.h"
25 #include "include/fs_types.h"
26
27 #include "inode_backtrace.h"
28
29 #include <boost/spirit/include/qi.hpp>
30 #include <boost/pool/pool.hpp>
31 #include "include/ceph_assert.h"
32 #include <boost/serialization/strong_typedef.hpp>
33
#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"

#define MDS_PORT_CACHE    0x200
#define MDS_PORT_LOCKER   0x300
#define MDS_PORT_MIGRATOR 0x400

#define MAX_MDS   0x100
#define NUM_STRAY 10

#define MDS_INO_ROOT 1

// No longer created but recognised in existing filesystems
// so that we don't try to fragment it.
#define MDS_INO_CEPH 2

#define MDS_INO_GLOBAL_SNAPREALM 3

// Reserved inode-number regions; every offset is a multiple of MAX_MDS.
#define MDS_INO_MDSDIR_OFFSET (1*MAX_MDS)
#define MDS_INO_STRAY_OFFSET  (6*MAX_MDS)

// Locations for journal data
#define MDS_INO_LOG_OFFSET         (2*MAX_MDS)
#define MDS_INO_LOG_BACKUP_OFFSET  (3*MAX_MDS)
#define MDS_INO_LOG_POINTER_OFFSET (4*MAX_MDS)
#define MDS_INO_PURGE_QUEUE        (5*MAX_MDS)

// First value past the stray region (which holds MAX_MDS * NUM_STRAY entries).
#define MDS_INO_SYSTEM_BASE ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))

// Stray inode number i of owner x, and the mdsdir inode number of owner x.
// Every macro argument is parenthesised so expression arguments are cast
// as a whole rather than operand by operand.
#define MDS_INO_STRAY(x,i) (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
#define MDS_INO_MDSDIR(x) (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))

// Range predicates and inverse mappings for the regions above.
#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
#define MDS_INO_IS_BASE(i) ((i) == MDS_INO_ROOT || (i) == MDS_INO_GLOBAL_SNAPREALM || MDS_INO_IS_MDSDIR(i))
#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)
71
72 typedef int32_t mds_rank_t;
73 constexpr mds_rank_t MDS_RANK_NONE = -1;
74
75 BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
76 extern const mds_gid_t MDS_GID_NONE;
77
78 typedef int32_t fs_cluster_id_t;
79 constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
80 // The namespace ID of the anonymous default filesystem from legacy systems
81 constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0;
82
83 class mds_role_t {
84 public:
85 mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_)
86 : fscid(fscid_), rank(rank_)
87 {}
88 mds_role_t() {}
89
90 bool operator<(mds_role_t const &rhs) const {
91 if (fscid < rhs.fscid) {
92 return true;
93 } else if (fscid == rhs.fscid) {
94 return rank < rhs.rank;
95 } else {
96 return false;
97 }
98 }
99
100 bool is_none() const {
101 return (rank == MDS_RANK_NONE);
102 }
103
104 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
105 mds_rank_t rank = MDS_RANK_NONE;
106 };
107 inline std::ostream& operator<<(std::ostream& out, const mds_role_t& role) {
108 return out << role.fscid << ":" << role.rank;
109 }
110
111 // CAPS
112 inline string gcap_string(int cap)
113 {
114 string s;
115 if (cap & CEPH_CAP_GSHARED) s += "s";
116 if (cap & CEPH_CAP_GEXCL) s += "x";
117 if (cap & CEPH_CAP_GCACHE) s += "c";
118 if (cap & CEPH_CAP_GRD) s += "r";
119 if (cap & CEPH_CAP_GWR) s += "w";
120 if (cap & CEPH_CAP_GBUFFER) s += "b";
121 if (cap & CEPH_CAP_GWREXTEND) s += "a";
122 if (cap & CEPH_CAP_GLAZYIO) s += "l";
123 return s;
124 }
125 inline string ccap_string(int cap)
126 {
127 string s;
128 if (cap & CEPH_CAP_PIN) s += "p";
129
130 int a = (cap >> CEPH_CAP_SAUTH) & 3;
131 if (a) s += 'A' + gcap_string(a);
132
133 a = (cap >> CEPH_CAP_SLINK) & 3;
134 if (a) s += 'L' + gcap_string(a);
135
136 a = (cap >> CEPH_CAP_SXATTR) & 3;
137 if (a) s += 'X' + gcap_string(a);
138
139 a = cap >> CEPH_CAP_SFILE;
140 if (a) s += 'F' + gcap_string(a);
141
142 if (s.length() == 0)
143 s = "-";
144 return s;
145 }
146
147 struct scatter_info_t {
148 version_t version = 0;
149 };
150
151 struct frag_info_t : public scatter_info_t {
152 int64_t size() const { return nfiles + nsubdirs; }
153
154 void zero() {
155 *this = frag_info_t();
156 }
157
158 // *this += cur - acc;
159 void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
160 if (cur.mtime > mtime) {
161 mtime = cur.mtime;
162 if (touched_mtime)
163 *touched_mtime = true;
164 }
165 if (cur.change_attr > change_attr) {
166 change_attr = cur.change_attr;
167 if (touched_chattr)
168 *touched_chattr = true;
169 }
170 nfiles += cur.nfiles - acc.nfiles;
171 nsubdirs += cur.nsubdirs - acc.nsubdirs;
172 }
173
174 void add(const frag_info_t& other) {
175 if (other.mtime > mtime)
176 mtime = other.mtime;
177 if (other.change_attr > change_attr)
178 change_attr = other.change_attr;
179 nfiles += other.nfiles;
180 nsubdirs += other.nsubdirs;
181 }
182
183 bool same_sums(const frag_info_t &o) const {
184 return mtime <= o.mtime &&
185 nfiles == o.nfiles &&
186 nsubdirs == o.nsubdirs;
187 }
188
189 void encode(bufferlist &bl) const;
190 void decode(bufferlist::const_iterator& bl);
191 void dump(Formatter *f) const;
192 static void generate_test_instances(std::list<frag_info_t*>& ls);
193
194 // this frag
195 utime_t mtime;
196 uint64_t change_attr = 0;
197 int64_t nfiles = 0; // files
198 int64_t nsubdirs = 0; // subdirs
199 };
200 WRITE_CLASS_ENCODER(frag_info_t)
201
202 inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
203 return memcmp(&l, &r, sizeof(l)) == 0;
204 }
205 inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
206 return !(l == r);
207 }
208
209 std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
210
211
212 struct nest_info_t : public scatter_info_t {
213 int64_t rsize() const { return rfiles + rsubdirs; }
214
215 void zero() {
216 *this = nest_info_t();
217 }
218
219 void sub(const nest_info_t &other) {
220 add(other, -1);
221 }
222 void add(const nest_info_t &other, int fac=1) {
223 if (other.rctime > rctime)
224 rctime = other.rctime;
225 rbytes += fac*other.rbytes;
226 rfiles += fac*other.rfiles;
227 rsubdirs += fac*other.rsubdirs;
228 rsnaps += fac*other.rsnaps;
229 }
230
231 // *this += cur - acc;
232 void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
233 if (cur.rctime > rctime)
234 rctime = cur.rctime;
235 rbytes += cur.rbytes - acc.rbytes;
236 rfiles += cur.rfiles - acc.rfiles;
237 rsubdirs += cur.rsubdirs - acc.rsubdirs;
238 rsnaps += cur.rsnaps - acc.rsnaps;
239 }
240
241 bool same_sums(const nest_info_t &o) const {
242 return rctime <= o.rctime &&
243 rbytes == o.rbytes &&
244 rfiles == o.rfiles &&
245 rsubdirs == o.rsubdirs &&
246 rsnaps == o.rsnaps;
247 }
248
249 void encode(bufferlist &bl) const;
250 void decode(bufferlist::const_iterator& bl);
251 void dump(Formatter *f) const;
252 static void generate_test_instances(std::list<nest_info_t*>& ls);
253
254 // this frag + children
255 utime_t rctime;
256 int64_t rbytes = 0;
257 int64_t rfiles = 0;
258 int64_t rsubdirs = 0;
259 int64_t rsnaps = 0;
260 };
261 WRITE_CLASS_ENCODER(nest_info_t)
262
263 inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
264 return memcmp(&l, &r, sizeof(l)) == 0;
265 }
266 inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
267 return !(l == r);
268 }
269
270 std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
271
272 struct vinodeno_t {
273 vinodeno_t() {}
274 vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
275
276 void encode(bufferlist& bl) const {
277 using ceph::encode;
278 encode(ino, bl);
279 encode(snapid, bl);
280 }
281 void decode(bufferlist::const_iterator& p) {
282 using ceph::decode;
283 decode(ino, p);
284 decode(snapid, p);
285 }
286
287 inodeno_t ino;
288 snapid_t snapid;
289 };
290 WRITE_CLASS_ENCODER(vinodeno_t)
291
292 inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
293 return l.ino == r.ino && l.snapid == r.snapid;
294 }
295 inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
296 return !(l == r);
297 }
298 inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
299 return
300 l.ino < r.ino ||
301 (l.ino == r.ino && l.snapid < r.snapid);
302 }
303
304 struct quota_info_t
305 {
306 void encode(bufferlist& bl) const {
307 ENCODE_START(1, 1, bl);
308 encode(max_bytes, bl);
309 encode(max_files, bl);
310 ENCODE_FINISH(bl);
311 }
312 void decode(bufferlist::const_iterator& p) {
313 DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
314 decode(max_bytes, p);
315 decode(max_files, p);
316 DECODE_FINISH(p);
317 }
318
319 void dump(Formatter *f) const;
320 static void generate_test_instances(std::list<quota_info_t *>& ls);
321
322 bool is_valid() const {
323 return max_bytes >=0 && max_files >=0;
324 }
325 bool is_enable() const {
326 return max_bytes || max_files;
327 }
328
329 int64_t max_bytes = 0;
330 int64_t max_files = 0;
331 };
332 WRITE_CLASS_ENCODER(quota_info_t)
333
334 inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
335 return memcmp(&l, &r, sizeof(l)) == 0;
336 }
337
338 ostream& operator<<(ostream &out, const quota_info_t &n);
339
340 namespace std {
341 template<> struct hash<vinodeno_t> {
342 size_t operator()(const vinodeno_t &vino) const {
343 hash<inodeno_t> H;
344 hash<uint64_t> I;
345 return H(vino.ino) ^ I(vino.snapid);
346 }
347 };
348 }
349
350 inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) {
351 out << vino.ino;
352 if (vino.snapid == CEPH_NOSNAP)
353 out << ".head";
354 else if (vino.snapid)
355 out << '.' << vino.snapid;
356 return out;
357 }
358
359 struct client_writeable_range_t {
360 struct byte_range_t {
361 uint64_t first = 0, last = 0; // interval client can write to
362 };
363
364 void encode(bufferlist &bl) const;
365 void decode(bufferlist::const_iterator& bl);
366 void dump(Formatter *f) const;
367 static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
368
369 byte_range_t range;
370 snapid_t follows = 0; // aka "data+metadata flushed thru"
371 };
372
// Decode a bare byte range: first, then last.  inode_t::decode() uses this
// for struct_v < 3 input, where each client maps to a byte range only.
inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::const_iterator& bl) {
  decode(range.first, bl);
  decode(range.last, bl);
}

WRITE_CLASS_ENCODER(client_writeable_range_t)
379
380 std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
381
382 inline bool operator==(const client_writeable_range_t& l,
383 const client_writeable_range_t& r) {
384 return l.range.first == r.range.first && l.range.last == r.range.last &&
385 l.follows == r.follows;
386 }
387
388 struct inline_data_t {
389 public:
390 inline_data_t() {}
391 inline_data_t(const inline_data_t& o) : version(o.version) {
392 if (o.blp)
393 get_data() = *o.blp;
394 }
395 inline_data_t& operator=(const inline_data_t& o) {
396 version = o.version;
397 if (o.blp)
398 get_data() = *o.blp;
399 else
400 free_data();
401 return *this;
402 }
403
404 void free_data() {
405 blp.reset();
406 }
407 bufferlist& get_data() {
408 if (!blp)
409 blp.reset(new bufferlist);
410 return *blp;
411 }
412 size_t length() const { return blp ? blp->length() : 0; }
413
414 bool operator==(const inline_data_t& o) const {
415 return length() == o.length() &&
416 (length() == 0 ||
417 (*const_cast<bufferlist*>(blp.get()) == *const_cast<bufferlist*>(o.blp.get())));
418 }
419 bool operator!=(const inline_data_t& o) const {
420 return !(*this == o);
421 }
422 void encode(bufferlist &bl) const;
423 void decode(bufferlist::const_iterator& bl);
424
425 version_t version = 1;
426
427 private:
428 std::unique_ptr<bufferlist> blp;
429 };
430 WRITE_CLASS_ENCODER(inline_data_t)
431
// Kinds of damage; the numeric values are spelled out explicitly.
enum {
  DAMAGE_STATS    = 0,  // statistics (dirstat, size, etc)
  DAMAGE_RSTATS   = 1,  // recursive statistics (rstat, accounted_rstat)
  DAMAGE_FRAGTREE = 2   // fragtree -- repair by searching
};
using damage_flags_t = uint32_t;
438
439 template<template<typename> class Allocator = std::allocator>
440 struct inode_t {
441 /**
442 * ***************
443 * Do not forget to add any new fields to the compare() function.
444 * ***************
445 */
446 using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
447
448 inode_t()
449 {
450 clear_layout();
451 }
452
453 // file type
454 bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
455 bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; }
456 bool is_file() const { return (mode & S_IFMT) == S_IFREG; }
457
458 bool is_truncating() const { return (truncate_pending > 0); }
459 void truncate(uint64_t old_size, uint64_t new_size) {
460 ceph_assert(new_size < old_size);
461 if (old_size > max_size_ever)
462 max_size_ever = old_size;
463 truncate_from = old_size;
464 size = new_size;
465 rstat.rbytes = new_size;
466 truncate_size = size;
467 truncate_seq++;
468 truncate_pending++;
469 }
470
471 bool has_layout() const {
472 return layout != file_layout_t();
473 }
474
475 void clear_layout() {
476 layout = file_layout_t();
477 }
478
479 uint64_t get_layout_size_increment() const {
480 return layout.get_period();
481 }
482
483 bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
484
485 uint64_t get_max_size() const {
486 uint64_t max = 0;
487 for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin();
488 p != client_ranges.end();
489 ++p)
490 if (p->second.range.last > max)
491 max = p->second.range.last;
492 return max;
493 }
494 void set_max_size(uint64_t new_max) {
495 if (new_max == 0) {
496 client_ranges.clear();
497 } else {
498 for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin();
499 p != client_ranges.end();
500 ++p)
501 p->second.range.last = new_max;
502 }
503 }
504
505 void trim_client_ranges(snapid_t last) {
506 std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin();
507 while (p != client_ranges.end()) {
508 if (p->second.follows >= last)
509 client_ranges.erase(p++);
510 else
511 ++p;
512 }
513 }
514
515 bool is_backtrace_updated() const {
516 return backtrace_version == version;
517 }
518 void update_backtrace(version_t pv=0) {
519 backtrace_version = pv ? pv : version;
520 }
521
522 void add_old_pool(int64_t l) {
523 backtrace_version = version;
524 old_pools.insert(l);
525 }
526
527 void encode(bufferlist &bl, uint64_t features) const;
528 void decode(bufferlist::const_iterator& bl);
529 void dump(Formatter *f) const;
530 static void generate_test_instances(std::list<inode_t*>& ls);
531 /**
532 * Compare this inode_t with another that represent *the same inode*
533 * at different points in time.
534 * @pre The inodes are the same ino
535 *
536 * @param other The inode_t to compare ourselves with
537 * @param divergent A bool pointer which will be set to true
538 * if the values are different in a way that can't be explained
539 * by one being a newer version than the other.
540 *
541 * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
542 */
543 int compare(const inode_t &other, bool *divergent) const;
544
545 // base (immutable)
546 inodeno_t ino = 0;
547 uint32_t rdev = 0; // if special file
548
549 // affected by any inode change...
550 utime_t ctime; // inode change time
551 utime_t btime; // birth time
552
553 // perm (namespace permissions)
554 uint32_t mode = 0;
555 uid_t uid = 0;
556 gid_t gid = 0;
557
558 // nlink
559 int32_t nlink = 0;
560
561 // file (data access)
562 ceph_dir_layout dir_layout = {}; // [dir only]
563 file_layout_t layout;
564 compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
565 uint64_t size = 0; // on directory, # dentries
566 uint64_t max_size_ever = 0; // max size the file has ever been
567 uint32_t truncate_seq = 0;
568 uint64_t truncate_size = 0, truncate_from = 0;
569 uint32_t truncate_pending = 0;
570 utime_t mtime; // file data modify time.
571 utime_t atime; // file data access time.
572 uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes())
573 inline_data_t inline_data; // FIXME check
574
575 // change attribute
576 uint64_t change_attr = 0;
577
578 client_range_map client_ranges; // client(s) can write to these ranges
579
580 // dirfrag, recursive accountin
581 frag_info_t dirstat; // protected by my filelock
582 nest_info_t rstat; // protected by my nestlock
583 nest_info_t accounted_rstat; // protected by parent's nestlock
584
585 quota_info_t quota;
586
587 mds_rank_t export_pin = MDS_RANK_NONE;
588
589 double export_ephemeral_random_pin = 0;
590 bool export_ephemeral_distributed_pin = false;
591
592 // special stuff
593 version_t version = 0; // auth only
594 version_t file_data_version = 0; // auth only
595 version_t xattr_version = 0;
596
597 utime_t last_scrub_stamp; // start time of last complete scrub
598 version_t last_scrub_version = 0;// (parent) start version of last complete scrub
599
600 version_t backtrace_version = 0;
601
602 snapid_t oldest_snap;
603
604 std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
605
606 private:
607 bool older_is_consistent(const inode_t &other) const;
608 };
609
610 // These methods may be moved back to mdstypes.cc when we have pmr
// Serialise the inode as struct version 16 (compat 6).  The field order
// below is the wire format; decode() reads the fields back in this order.
template<template<typename> class Allocator>
void inode_t<Allocator>::encode(bufferlist &bl, uint64_t features) const
{
  ENCODE_START(16, 6, bl);

  encode(ino, bl);
  encode(rdev, bl);
  encode(ctime, bl);

  encode(mode, bl);
  encode(uid, bl);
  encode(gid, bl);

  encode(nlink, bl);
  {
    // removed field: a placeholder is still written so the layout is unchanged
    bool anchored = 0;
    encode(anchored, bl);
  }

  encode(dir_layout, bl);
  // layout is the only field encoded with the feature bits
  encode(layout, bl, features);
  encode(size, bl);
  encode(truncate_seq, bl);
  encode(truncate_size, bl);
  encode(truncate_from, bl);
  encode(truncate_pending, bl);
  encode(mtime, bl);
  encode(atime, bl);
  encode(time_warp_seq, bl);
  encode(client_ranges, bl);

  encode(dirstat, bl);
  encode(rstat, bl);
  encode(accounted_rstat, bl);

  encode(version, bl);
  encode(file_data_version, bl);
  encode(xattr_version, bl);
  encode(backtrace_version, bl);
  encode(old_pools, bl);
  encode(max_size_ever, bl);
  encode(inline_data, bl);
  encode(quota, bl);

  encode(stray_prior_path, bl);

  encode(last_scrub_version, bl);
  encode(last_scrub_stamp, bl);

  encode(btime, bl);
  encode(change_attr, bl);

  encode(export_pin, bl);

  encode(export_ephemeral_random_pin, bl);
  encode(export_ephemeral_distributed_pin, bl);

  ENCODE_FINISH(bl);
}
671
672 template<template<typename> class Allocator>
673 void inode_t<Allocator>::decode(bufferlist::const_iterator &p)
674 {
675 DECODE_START_LEGACY_COMPAT_LEN(16, 6, 6, p);
676
677 decode(ino, p);
678 decode(rdev, p);
679 decode(ctime, p);
680
681 decode(mode, p);
682 decode(uid, p);
683 decode(gid, p);
684
685 decode(nlink, p);
686 {
687 bool anchored;
688 decode(anchored, p);
689 }
690
691 if (struct_v >= 4)
692 decode(dir_layout, p);
693 else {
694 // FIPS zeroization audit 20191117: this memset is not security related.
695 memset(&dir_layout, 0, sizeof(dir_layout));
696 }
697 decode(layout, p);
698 decode(size, p);
699 decode(truncate_seq, p);
700 decode(truncate_size, p);
701 decode(truncate_from, p);
702 if (struct_v >= 5)
703 decode(truncate_pending, p);
704 else
705 truncate_pending = 0;
706 decode(mtime, p);
707 decode(atime, p);
708 decode(time_warp_seq, p);
709 if (struct_v >= 3) {
710 decode(client_ranges, p);
711 } else {
712 map<client_t, client_writeable_range_t::byte_range_t> m;
713 decode(m, p);
714 for (map<client_t, client_writeable_range_t::byte_range_t>::iterator
715 q = m.begin(); q != m.end(); ++q)
716 client_ranges[q->first].range = q->second;
717 }
718
719 decode(dirstat, p);
720 decode(rstat, p);
721 decode(accounted_rstat, p);
722
723 decode(version, p);
724 decode(file_data_version, p);
725 decode(xattr_version, p);
726 if (struct_v >= 2)
727 decode(backtrace_version, p);
728 if (struct_v >= 7)
729 decode(old_pools, p);
730 if (struct_v >= 8)
731 decode(max_size_ever, p);
732 if (struct_v >= 9) {
733 decode(inline_data, p);
734 } else {
735 inline_data.version = CEPH_INLINE_NONE;
736 }
737 if (struct_v < 10)
738 backtrace_version = 0; // force update backtrace
739 if (struct_v >= 11)
740 decode(quota, p);
741
742 if (struct_v >= 12) {
743 std::string tmp;
744 decode(tmp, p);
745 stray_prior_path = std::string_view(tmp);
746 }
747
748 if (struct_v >= 13) {
749 decode(last_scrub_version, p);
750 decode(last_scrub_stamp, p);
751 }
752 if (struct_v >= 14) {
753 decode(btime, p);
754 decode(change_attr, p);
755 } else {
756 btime = utime_t();
757 change_attr = 0;
758 }
759
760 if (struct_v >= 15) {
761 decode(export_pin, p);
762 } else {
763 export_pin = MDS_RANK_NONE;
764 }
765
766 if (struct_v >= 16) {
767 decode(export_ephemeral_random_pin, p);
768 decode(export_ephemeral_distributed_pin, p);
769 } else {
770 export_ephemeral_random_pin = 0;
771 export_ephemeral_distributed_pin = false;
772 }
773
774 DECODE_FINISH(p);
775 }
776
777 template<template<typename> class Allocator>
778 void inode_t<Allocator>::dump(Formatter *f) const
779 {
780 f->dump_unsigned("ino", ino);
781 f->dump_unsigned("rdev", rdev);
782 f->dump_stream("ctime") << ctime;
783 f->dump_stream("btime") << btime;
784 f->dump_unsigned("mode", mode);
785 f->dump_unsigned("uid", uid);
786 f->dump_unsigned("gid", gid);
787 f->dump_unsigned("nlink", nlink);
788
789 f->open_object_section("dir_layout");
790 ::dump(dir_layout, f);
791 f->close_section();
792
793 f->dump_object("layout", layout);
794
795 f->open_array_section("old_pools");
796 for (const auto &p : old_pools) {
797 f->dump_int("pool", p);
798 }
799 f->close_section();
800
801 f->dump_unsigned("size", size);
802 f->dump_unsigned("truncate_seq", truncate_seq);
803 f->dump_unsigned("truncate_size", truncate_size);
804 f->dump_unsigned("truncate_from", truncate_from);
805 f->dump_unsigned("truncate_pending", truncate_pending);
806 f->dump_stream("mtime") << mtime;
807 f->dump_stream("atime") << atime;
808 f->dump_unsigned("time_warp_seq", time_warp_seq);
809 f->dump_unsigned("change_attr", change_attr);
810 f->dump_int("export_pin", export_pin);
811 f->dump_int("export_ephemeral_random_pin", export_ephemeral_random_pin);
812 f->dump_bool("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin);
813
814 f->open_array_section("client_ranges");
815 for (const auto &p : client_ranges) {
816 f->open_object_section("client");
817 f->dump_unsigned("client", p.first.v);
818 p.second.dump(f);
819 f->close_section();
820 }
821 f->close_section();
822
823 f->open_object_section("dirstat");
824 dirstat.dump(f);
825 f->close_section();
826
827 f->open_object_section("rstat");
828 rstat.dump(f);
829 f->close_section();
830
831 f->open_object_section("accounted_rstat");
832 accounted_rstat.dump(f);
833 f->close_section();
834
835 f->dump_unsigned("version", version);
836 f->dump_unsigned("file_data_version", file_data_version);
837 f->dump_unsigned("xattr_version", xattr_version);
838 f->dump_unsigned("backtrace_version", backtrace_version);
839
840 f->dump_string("stray_prior_path", stray_prior_path);
841 f->dump_unsigned("max_size_ever", max_size_ever);
842
843 f->open_object_section("quota");
844 quota.dump(f);
845 f->close_section();
846
847 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
848 f->dump_unsigned("last_scrub_version", last_scrub_version);
849 }
850
851 template<template<typename> class Allocator>
852 void inode_t<Allocator>::generate_test_instances(std::list<inode_t*>& ls)
853 {
854 ls.push_back(new inode_t<Allocator>);
855 ls.push_back(new inode_t<Allocator>);
856 ls.back()->ino = 1;
857 // i am lazy.
858 }
859
860 template<template<typename> class Allocator>
861 int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
862 {
863 ceph_assert(ino == other.ino);
864 *divergent = false;
865 if (version == other.version) {
866 if (rdev != other.rdev ||
867 ctime != other.ctime ||
868 btime != other.btime ||
869 mode != other.mode ||
870 uid != other.uid ||
871 gid != other.gid ||
872 nlink != other.nlink ||
873 memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
874 layout != other.layout ||
875 old_pools != other.old_pools ||
876 size != other.size ||
877 max_size_ever != other.max_size_ever ||
878 truncate_seq != other.truncate_seq ||
879 truncate_size != other.truncate_size ||
880 truncate_from != other.truncate_from ||
881 truncate_pending != other.truncate_pending ||
882 change_attr != other.change_attr ||
883 mtime != other.mtime ||
884 atime != other.atime ||
885 time_warp_seq != other.time_warp_seq ||
886 inline_data != other.inline_data ||
887 client_ranges != other.client_ranges ||
888 !(dirstat == other.dirstat) ||
889 !(rstat == other.rstat) ||
890 !(accounted_rstat == other.accounted_rstat) ||
891 file_data_version != other.file_data_version ||
892 xattr_version != other.xattr_version ||
893 backtrace_version != other.backtrace_version) {
894 *divergent = true;
895 }
896 return 0;
897 } else if (version > other.version) {
898 *divergent = !older_is_consistent(other);
899 return 1;
900 } else {
901 ceph_assert(version < other.version);
902 *divergent = !other.older_is_consistent(*this);
903 return -1;
904 }
905 }
906
907 template<template<typename> class Allocator>
908 bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
909 {
910 if (max_size_ever < other.max_size_ever ||
911 truncate_seq < other.truncate_seq ||
912 time_warp_seq < other.time_warp_seq ||
913 inline_data.version < other.inline_data.version ||
914 dirstat.version < other.dirstat.version ||
915 rstat.version < other.rstat.version ||
916 accounted_rstat.version < other.accounted_rstat.version ||
917 file_data_version < other.file_data_version ||
918 xattr_version < other.xattr_version ||
919 backtrace_version < other.backtrace_version) {
920 return false;
921 }
922 return true;
923 }
924
// Free-function encode for inode_t: forwards to the member encode(),
// bracketed by the ENCODE_DUMP_PRE / ENCODE_DUMP_POST macros.
template<template<typename> class Allocator>
inline void encode(const inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
{
  ENCODE_DUMP_PRE();
  c.encode(bl, features);
  ENCODE_DUMP_POST(cl);
}
// Free-function decode for inode_t: forwards to the member decode().
template<template<typename> class Allocator>
inline void decode(inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
{
  c.decode(p);
}
937
// A std::basic_string<char> that allocates through Allocator<char>.
template<template<typename> class Allocator>
using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>;

// Map from alloc_string keys to bufferptr values; both the keys and the
// map nodes use Allocator.
template<template<typename> class Allocator>
using xattr_map = compact_map<alloc_string<Allocator>, bufferptr, std::less<alloc_string<Allocator>>, Allocator<std::pair<const alloc_string<Allocator>, bufferptr>>>; // FIXME bufferptr not in mempool
943
944 template<template<typename> class Allocator>
945 inline void decode_noshare(xattr_map<Allocator>& xattrs, ceph::buffer::list::const_iterator &p)
946 {
947 __u32 n;
948 decode(n, p);
949 while (n-- > 0) {
950 alloc_string<Allocator> key;
951 decode(key, p);
952 __u32 len;
953 decode(len, p);
954 p.copy_deep(len, xattrs[key]);
955 }
956 }
957
// An inode_t with its xattrs and a "first" snapid, parameterised on the
// same Allocator as the inode_t and xattr_map it contains.
template<template<typename> class Allocator = std::allocator>
struct old_inode_t {
  snapid_t first;
  inode_t<Allocator> inode;
  xattr_map<Allocator> xattrs;

  void encode(bufferlist &bl, uint64_t features) const;
  void decode(bufferlist::const_iterator& bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<old_inode_t*>& ls);
};
969
970 // These methods may be moved back to mdstypes.cc when we have pmr
// Versioned encoding (v2, compat 2): first, inode (with features), xattrs.
template<template<typename> class Allocator>
void old_inode_t<Allocator>::encode(bufferlist& bl, uint64_t features) const
{
  ENCODE_START(2, 2, bl);
  encode(first, bl);
  encode(inode, bl, features);
  encode(xattrs, bl);
  ENCODE_FINISH(bl);
}
980
// Decode in the order written by encode(): first, inode, xattrs.
template<template<typename> class Allocator>
void old_inode_t<Allocator>::decode(bufferlist::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  decode(first, bl);
  decode(inode, bl);
  // xattr values are read through decode_noshare(), which uses copy_deep()
  decode_noshare<Allocator>(xattrs, bl);
  DECODE_FINISH(bl);
}
990
991 template<template<typename> class Allocator>
992 void old_inode_t<Allocator>::dump(Formatter *f) const
993 {
994 f->dump_unsigned("first", first);
995 inode.dump(f);
996 f->open_object_section("xattrs");
997 for (const auto &p : xattrs) {
998 std::string v(p.second.c_str(), p.second.length());
999 f->dump_string(p.first.c_str(), v);
1000 }
1001 f->close_section();
1002 }
1003
1004 template<template<typename> class Allocator>
1005 void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls)
1006 {
1007 ls.push_back(new old_inode_t<Allocator>);
1008 ls.push_back(new old_inode_t<Allocator>);
1009 ls.back()->first = 2;
1010 std::list<inode_t<Allocator>*> ils;
1011 inode_t<Allocator>::generate_test_instances(ils);
1012 ls.back()->inode = *ils.back();
1013 ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4);
1014 ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3);
1015 }
1016
// Free-function encode for old_inode_t: forwards to the member encode(),
// bracketed by the ENCODE_DUMP_PRE / ENCODE_DUMP_POST macros.
template<template<typename> class Allocator>
inline void encode(const old_inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
{
  ENCODE_DUMP_PRE();
  c.encode(bl, features);
  ENCODE_DUMP_POST(cl);
}
// Free-function decode for old_inode_t: forwards to the member decode().
template<template<typename> class Allocator>
inline void decode(old_inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
{
  c.decode(p);
}
1029
1030 /*
1031 * like an inode, but for a dir frag
1032 */
struct fnode_t {
  void encode(bufferlist &bl) const;
  void decode(bufferlist::const_iterator& bl);
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<fnode_t*>& ls);

  version_t version = 0;
  snapid_t snap_purged_thru;   // the max_last_destroy snapid we've been purged thru
  // each statistic is kept twice: the current value and the accounted one
  frag_info_t fragstat, accounted_fragstat;
  nest_info_t rstat, accounted_rstat;
  damage_flags_t damage_flags = 0;

  // we know we and all our descendants have been scrubbed since this version
  version_t recursive_scrub_version = 0;
  utime_t recursive_scrub_stamp;
  // version at which we last scrubbed our personal data structures
  version_t localized_scrub_version = 0;
  utime_t localized_scrub_stamp;
};
WRITE_CLASS_ENCODER(fnode_t)
1053
1054
// A pair of nest_info_t values (rstat and accounted_rstat) tagged with a
// "first" snapid.
struct old_rstat_t {
  void encode(bufferlist& bl) const;
  void decode(bufferlist::const_iterator& p);
  void dump(Formatter *f) const;
  static void generate_test_instances(std::list<old_rstat_t*>& ls);

  snapid_t first;
  nest_info_t rstat, accounted_rstat;
};
WRITE_CLASS_ENCODER(old_rstat_t)
1065
1066 inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
1067 return out << "old_rstat(first " << o.first << " " << o.rstat << " " << o.accounted_rstat << ")";
1068 }
1069
1070 class feature_bitset_t {
1071 public:
1072 typedef uint64_t block_type;
1073 static const size_t bits_per_block = sizeof(block_type) * 8;
1074
1075 feature_bitset_t(const feature_bitset_t& other) : _vec(other._vec) {}
1076 feature_bitset_t(feature_bitset_t&& other) : _vec(std::move(other._vec)) {}
1077 feature_bitset_t(unsigned long value = 0);
1078 feature_bitset_t(const vector<size_t>& array);
1079 feature_bitset_t& operator=(const feature_bitset_t& other) {
1080 _vec = other._vec;
1081 return *this;
1082 }
1083 feature_bitset_t& operator=(feature_bitset_t&& other) {
1084 _vec = std::move(other._vec);
1085 return *this;
1086 }
1087 feature_bitset_t& operator-=(const feature_bitset_t& other);
1088 bool empty() const {
1089 //block_type is a uint64_t. If the vector is only composed of 0s, then it's still "empty"
1090 for (auto& v : _vec) {
1091 if (v)
1092 return false;
1093 }
1094 return true;
1095 }
1096 bool test(size_t bit) const {
1097 if (bit >= bits_per_block * _vec.size())
1098 return false;
1099 return _vec[bit / bits_per_block] & ((block_type)1 << (bit % bits_per_block));
1100 }
1101 void clear() {
1102 _vec.clear();
1103 }
1104 void encode(bufferlist& bl) const;
1105 void decode(bufferlist::const_iterator &p);
1106 void dump(Formatter *f) const;
1107 void print(ostream& out) const;
1108 private:
1109 vector<block_type> _vec;
1110 };
1111 WRITE_CLASS_ENCODER(feature_bitset_t)
1112
1113 inline std::ostream& operator<<(std::ostream& out, const feature_bitset_t& s) {
1114 s.print(out);
1115 return out;
1116 }
1117
1118 struct metric_spec_t {
1119 metric_spec_t() {}
1120 metric_spec_t(const metric_spec_t& other) :
1121 metric_flags(other.metric_flags) {}
1122 metric_spec_t(metric_spec_t&& other) :
1123 metric_flags(std::move(other.metric_flags)) {}
1124 metric_spec_t(const feature_bitset_t& mf) :
1125 metric_flags(mf) {}
1126 metric_spec_t(feature_bitset_t&& mf) :
1127 metric_flags(std::move(mf)) {}
1128
1129 metric_spec_t& operator=(const metric_spec_t& other) {
1130 metric_flags = other.metric_flags;
1131 return *this;
1132 }
1133 metric_spec_t& operator=(metric_spec_t&& other) {
1134 metric_flags = std::move(other.metric_flags);
1135 return *this;
1136 }
1137
1138 bool empty() const {
1139 return metric_flags.empty();
1140 }
1141
1142 void clear() {
1143 metric_flags.clear();
1144 }
1145
1146 void encode(bufferlist& bl) const;
1147 void decode(bufferlist::const_iterator& p);
1148 void dump(Formatter *f) const;
1149 void print(ostream& out) const;
1150
1151 // set of metrics that a client is capable of forwarding
1152 feature_bitset_t metric_flags;
1153 };
1154 WRITE_CLASS_ENCODER(metric_spec_t)
1155
1156 inline std::ostream& operator<<(std::ostream& out, const metric_spec_t& mst) {
1157 mst.print(out);
1158 return out;
1159 }
1160
1161 /*
1162 * client_metadata_t
1163 */
1164 struct client_metadata_t {
1165 using kv_map_t = std::map<std::string,std::string>;
1166 using iterator = kv_map_t::const_iterator;
1167
1168 client_metadata_t() {}
1169 client_metadata_t(const kv_map_t& kv, const feature_bitset_t &f, const metric_spec_t &mst) :
1170 kv_map(kv),
1171 features(f),
1172 metric_spec(mst) {}
1173 client_metadata_t& operator=(const client_metadata_t& other) {
1174 kv_map = other.kv_map;
1175 features = other.features;
1176 metric_spec = other.metric_spec;
1177 return *this;
1178 }
1179
1180 bool empty() const { return kv_map.empty() && features.empty() && metric_spec.empty(); }
1181 iterator find(const std::string& key) const { return kv_map.find(key); }
1182 iterator begin() const { return kv_map.begin(); }
1183 iterator end() const { return kv_map.end(); }
1184 void erase(iterator it) { kv_map.erase(it); }
1185 std::string& operator[](const std::string& key) { return kv_map[key]; }
1186 void merge(const client_metadata_t& other) {
1187 kv_map.insert(other.kv_map.begin(), other.kv_map.end());
1188 features = other.features;
1189 metric_spec = other.metric_spec;
1190 }
1191 void clear() {
1192 kv_map.clear();
1193 features.clear();
1194 metric_spec.clear();
1195 }
1196
1197 void encode(bufferlist& bl) const;
1198 void decode(bufferlist::const_iterator& p);
1199 void dump(Formatter *f) const;
1200
1201 kv_map_t kv_map;
1202 feature_bitset_t features;
1203 metric_spec_t metric_spec;
1204 };
1205 WRITE_CLASS_ENCODER(client_metadata_t)
1206
1207 /*
1208 * session_info_t - durable part of a Session
1209 */
1210 struct session_info_t {
1211 client_t get_client() const { return client_t(inst.name.num()); }
1212 bool has_feature(size_t bit) const { return client_metadata.features.test(bit); }
1213 const entity_name_t& get_source() const { return inst.name; }
1214
1215 void clear_meta() {
1216 prealloc_inos.clear();
1217 used_inos.clear();
1218 completed_requests.clear();
1219 completed_flushes.clear();
1220 client_metadata.clear();
1221 }
1222
1223 void encode(bufferlist& bl, uint64_t features) const;
1224 void decode(bufferlist::const_iterator& p);
1225 void dump(Formatter *f) const;
1226 static void generate_test_instances(std::list<session_info_t*>& ls);
1227
1228 entity_inst_t inst;
1229 std::map<ceph_tid_t,inodeno_t> completed_requests;
1230 interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use.
1231 interval_set<inodeno_t> used_inos; // journaling use
1232 client_metadata_t client_metadata;
1233 std::set<ceph_tid_t> completed_flushes;
1234 EntityName auth_name;
1235 };
1236 WRITE_CLASS_ENCODER_FEATURES(session_info_t)
1237
1238 // dentries
1239 struct dentry_key_t {
1240 dentry_key_t() {}
1241 dentry_key_t(snapid_t s, std::string_view n, __u32 h=0) :
1242 snapid(s), name(n), hash(h) {}
1243
1244 bool is_valid() { return name.length() || snapid; }
1245
1246 // encode into something that can be decoded as a string.
1247 // name_ (head) or name_%x (!head)
1248 void encode(bufferlist& bl) const {
1249 string key;
1250 encode(key);
1251 using ceph::encode;
1252 encode(key, bl);
1253 }
1254 void encode(string& key) const {
1255 char b[20];
1256 if (snapid != CEPH_NOSNAP) {
1257 uint64_t val(snapid);
1258 snprintf(b, sizeof(b), "%" PRIx64, val);
1259 } else {
1260 snprintf(b, sizeof(b), "%s", "head");
1261 }
1262 ostringstream oss;
1263 oss << name << "_" << b;
1264 key = oss.str();
1265 }
1266 static void decode_helper(bufferlist::const_iterator& bl, string& nm, snapid_t& sn) {
1267 string key;
1268 decode(key, bl);
1269 decode_helper(key, nm, sn);
1270 }
1271 static void decode_helper(std::string_view key, string& nm, snapid_t& sn) {
1272 size_t i = key.find_last_of('_');
1273 ceph_assert(i != string::npos);
1274 if (key.compare(i+1, std::string_view::npos, "head") == 0) {
1275 // name_head
1276 sn = CEPH_NOSNAP;
1277 } else {
1278 // name_%x
1279 long long unsigned x = 0;
1280 std::string x_str(key.substr(i+1));
1281 sscanf(x_str.c_str(), "%llx", &x);
1282 sn = x;
1283 }
1284 nm = key.substr(0, i);
1285 }
1286
1287 snapid_t snapid = 0;
1288 std::string_view name;
1289 __u32 hash = 0;
1290 };
1291
1292 inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
1293 {
1294 return out << "(" << k.name << "," << k.snapid << ")";
1295 }
1296
1297 inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
1298 {
1299 /*
1300 * order by hash, name, snap
1301 */
1302 int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
1303 if (c)
1304 return c < 0;
1305 c = k1.name.compare(k2.name);
1306 if (c)
1307 return c < 0;
1308 return k1.snapid < k2.snapid;
1309 }
1310
1311 /*
1312 * string_snap_t is a simple (string, snapid_t) pair
1313 */
1314 struct string_snap_t {
1315 string_snap_t() {}
1316 string_snap_t(std::string_view n, snapid_t s) : name(n), snapid(s) {}
1317
1318 void encode(bufferlist& bl) const;
1319 void decode(bufferlist::const_iterator& p);
1320 void dump(Formatter *f) const;
1321 static void generate_test_instances(std::list<string_snap_t*>& ls);
1322
1323 string name;
1324 snapid_t snapid;
1325 };
1326 WRITE_CLASS_ENCODER(string_snap_t)
1327
1328 inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
1329 int c = l.name.compare(r.name);
1330 return c < 0 || (c == 0 && l.snapid < r.snapid);
1331 }
1332
1333 inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
1334 {
1335 return out << "(" << k.name << "," << k.snapid << ")";
1336 }
1337
1338 /*
1339 * mds_table_pending_t
1340 *
1341 * For mds's requesting any pending ops, child needs to encode the corresponding
1342 * pending mutation state in the table.
1343 */
1344 struct mds_table_pending_t {
1345 void encode(bufferlist& bl) const;
1346 void decode(bufferlist::const_iterator& bl);
1347 void dump(Formatter *f) const;
1348 static void generate_test_instances(std::list<mds_table_pending_t*>& ls);
1349
1350 uint64_t reqid = 0;
1351 __s32 mds = 0;
1352 version_t tid = 0;
1353 };
1354 WRITE_CLASS_ENCODER(mds_table_pending_t)
1355
1356 // requests
1357 struct metareqid_t {
1358 metareqid_t() {}
1359 metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
1360 void encode(bufferlist& bl) const {
1361 using ceph::encode;
1362 encode(name, bl);
1363 encode(tid, bl);
1364 }
1365 void decode(bufferlist::const_iterator &p) {
1366 using ceph::decode;
1367 decode(name, p);
1368 decode(tid, p);
1369 }
1370
1371 entity_name_t name;
1372 uint64_t tid = 0;
1373 };
1374 WRITE_CLASS_ENCODER(metareqid_t)
1375
1376 inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
1377 return out << r.name << ":" << r.tid;
1378 }
1379
1380 inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
1381 return (l.name == r.name) && (l.tid == r.tid);
1382 }
1383 inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
1384 return (l.name != r.name) || (l.tid != r.tid);
1385 }
1386 inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
1387 return (l.name < r.name) ||
1388 (l.name == r.name && l.tid < r.tid);
1389 }
1390 inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
1391 return (l.name < r.name) ||
1392 (l.name == r.name && l.tid <= r.tid);
1393 }
1394 inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); }
1395 inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); }
1396
1397 namespace std {
1398 template<> struct hash<metareqid_t> {
1399 size_t operator()(const metareqid_t &r) const {
1400 hash<uint64_t> H;
1401 return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid);
1402 }
1403 };
1404 } // namespace std
1405
1406 // cap info for client reconnect
1407 struct cap_reconnect_t {
1408 cap_reconnect_t() {}
1409 cap_reconnect_t(uint64_t cap_id, inodeno_t pino, std::string_view p, int w, int i,
1410 inodeno_t sr, snapid_t sf, bufferlist& lb) :
1411 path(p) {
1412 capinfo.cap_id = cap_id;
1413 capinfo.wanted = w;
1414 capinfo.issued = i;
1415 capinfo.snaprealm = sr;
1416 capinfo.pathbase = pino;
1417 capinfo.flock_len = 0;
1418 snap_follows = sf;
1419 flockbl.claim(lb);
1420 }
1421 void encode(bufferlist& bl) const;
1422 void decode(bufferlist::const_iterator& bl);
1423 void encode_old(bufferlist& bl) const;
1424 void decode_old(bufferlist::const_iterator& bl);
1425
1426 void dump(Formatter *f) const;
1427 static void generate_test_instances(std::list<cap_reconnect_t*>& ls);
1428
1429 string path;
1430 mutable ceph_mds_cap_reconnect capinfo = {};
1431 snapid_t snap_follows = 0;
1432 bufferlist flockbl;
1433 };
1434 WRITE_CLASS_ENCODER(cap_reconnect_t)
1435
1436 struct snaprealm_reconnect_t {
1437 snaprealm_reconnect_t() {}
1438 snaprealm_reconnect_t(inodeno_t ino, snapid_t seq, inodeno_t parent) {
1439 realm.ino = ino;
1440 realm.seq = seq;
1441 realm.parent = parent;
1442 }
1443 void encode(bufferlist& bl) const;
1444 void decode(bufferlist::const_iterator& bl);
1445 void encode_old(bufferlist& bl) const;
1446 void decode_old(bufferlist::const_iterator& bl);
1447
1448 void dump(Formatter *f) const;
1449 static void generate_test_instances(std::list<snaprealm_reconnect_t*>& ls);
1450
1451 mutable ceph_mds_snaprealm_reconnect realm = {};
1452 };
1453 WRITE_CLASS_ENCODER(snaprealm_reconnect_t)
1454
1455 // compat for pre-FLOCK feature
1456 struct old_ceph_mds_cap_reconnect {
1457 ceph_le64 cap_id;
1458 ceph_le32 wanted;
1459 ceph_le32 issued;
1460 ceph_le64 old_size;
1461 struct ceph_timespec old_mtime, old_atime;
1462 ceph_le64 snaprealm;
1463 ceph_le64 pathbase; /* base ino for our path to this ino */
1464 } __attribute__ ((packed));
1465 WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect)
1466
1467 struct old_cap_reconnect_t {
1468 const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) {
1469 path = n.path;
1470 capinfo.cap_id = n.capinfo.cap_id;
1471 capinfo.wanted = n.capinfo.wanted;
1472 capinfo.issued = n.capinfo.issued;
1473 capinfo.snaprealm = n.capinfo.snaprealm;
1474 capinfo.pathbase = n.capinfo.pathbase;
1475 return *this;
1476 }
1477 operator cap_reconnect_t() {
1478 cap_reconnect_t n;
1479 n.path = path;
1480 n.capinfo.cap_id = capinfo.cap_id;
1481 n.capinfo.wanted = capinfo.wanted;
1482 n.capinfo.issued = capinfo.issued;
1483 n.capinfo.snaprealm = capinfo.snaprealm;
1484 n.capinfo.pathbase = capinfo.pathbase;
1485 return n;
1486 }
1487
1488 void encode(bufferlist& bl) const {
1489 using ceph::encode;
1490 encode(path, bl);
1491 encode(capinfo, bl);
1492 }
1493 void decode(bufferlist::const_iterator& bl) {
1494 using ceph::decode;
1495 decode(path, bl);
1496 decode(capinfo, bl);
1497 }
1498
1499 string path;
1500 old_ceph_mds_cap_reconnect capinfo;
1501 };
1502 WRITE_CLASS_ENCODER(old_cap_reconnect_t)
1503
1504 // dir frag
1505 struct dirfrag_t {
1506 dirfrag_t() {}
1507 dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
1508
1509 void encode(bufferlist& bl) const {
1510 using ceph::encode;
1511 encode(ino, bl);
1512 encode(frag, bl);
1513 }
1514 void decode(bufferlist::const_iterator& bl) {
1515 using ceph::decode;
1516 decode(ino, bl);
1517 decode(frag, bl);
1518 }
1519
1520 inodeno_t ino = 0;
1521 frag_t frag;
1522 };
1523 WRITE_CLASS_ENCODER(dirfrag_t)
1524
1525 inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) {
1526 out << df.ino;
1527 if (!df.frag.is_root()) out << "." << df.frag;
1528 return out;
1529 }
1530 inline bool operator<(dirfrag_t l, dirfrag_t r) {
1531 if (l.ino < r.ino) return true;
1532 if (l.ino == r.ino && l.frag < r.frag) return true;
1533 return false;
1534 }
1535 inline bool operator==(dirfrag_t l, dirfrag_t r) {
1536 return l.ino == r.ino && l.frag == r.frag;
1537 }
1538
1539 namespace std {
1540 template<> struct hash<dirfrag_t> {
1541 size_t operator()(const dirfrag_t &df) const {
1542 static rjhash<uint64_t> H;
1543 static rjhash<uint32_t> I;
1544 return H(df.ino) ^ I(df.frag);
1545 }
1546 };
1547 } // namespace std
1548
1549 // ================================================================
1550 #define META_POP_IRD 0
1551 #define META_POP_IWR 1
1552 #define META_POP_READDIR 2
1553 #define META_POP_FETCH 3
1554 #define META_POP_STORE 4
1555 #define META_NPOP 5
1556
1557 class inode_load_vec_t {
1558 public:
1559 using time = DecayCounter::time;
1560 using clock = DecayCounter::clock;
1561 static const size_t NUM = 2;
1562
1563 inode_load_vec_t() : vec{DecayCounter(DecayRate()), DecayCounter(DecayRate())} {}
1564 inode_load_vec_t(const DecayRate &rate) : vec{DecayCounter(rate), DecayCounter(rate)} {}
1565
1566 DecayCounter &get(int t) {
1567 return vec[t];
1568 }
1569 void zero() {
1570 for (auto &d : vec) {
1571 d.reset();
1572 }
1573 }
1574 void encode(bufferlist &bl) const;
1575 void decode(bufferlist::const_iterator& p);
1576 void dump(Formatter *f) const;
1577 static void generate_test_instances(std::list<inode_load_vec_t*>& ls);
1578
1579 private:
1580 std::array<DecayCounter, NUM> vec;
1581 };
1582 inline void encode(const inode_load_vec_t &c, bufferlist &bl) {
1583 c.encode(bl);
1584 }
1585 inline void decode(inode_load_vec_t & c, bufferlist::const_iterator &p) {
1586 c.decode(p);
1587 }
1588
1589 class dirfrag_load_vec_t {
1590 public:
1591 using time = DecayCounter::time;
1592 using clock = DecayCounter::clock;
1593 static const size_t NUM = 5;
1594
1595 dirfrag_load_vec_t() :
1596 vec{DecayCounter(DecayRate()),
1597 DecayCounter(DecayRate()),
1598 DecayCounter(DecayRate()),
1599 DecayCounter(DecayRate()),
1600 DecayCounter(DecayRate())
1601 }
1602 {}
1603 dirfrag_load_vec_t(const DecayRate &rate) :
1604 vec{DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate)}
1605 {}
1606
1607 void encode(bufferlist &bl) const {
1608 ENCODE_START(2, 2, bl);
1609 for (const auto &i : vec) {
1610 encode(i, bl);
1611 }
1612 ENCODE_FINISH(bl);
1613 }
1614 void decode(bufferlist::const_iterator &p) {
1615 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
1616 for (auto &i : vec) {
1617 decode(i, p);
1618 }
1619 DECODE_FINISH(p);
1620 }
1621 void dump(Formatter *f) const;
1622 void dump(Formatter *f, const DecayRate& rate) const;
1623 static void generate_test_instances(std::list<dirfrag_load_vec_t*>& ls);
1624
1625 const DecayCounter &get(int t) const {
1626 return vec[t];
1627 }
1628 DecayCounter &get(int t) {
1629 return vec[t];
1630 }
1631 void adjust(double d) {
1632 for (auto &i : vec) {
1633 i.adjust(d);
1634 }
1635 }
1636 void zero() {
1637 for (auto &i : vec) {
1638 i.reset();
1639 }
1640 }
1641 double meta_load() const {
1642 return
1643 1*vec[META_POP_IRD].get() +
1644 2*vec[META_POP_IWR].get() +
1645 1*vec[META_POP_READDIR].get() +
1646 2*vec[META_POP_FETCH].get() +
1647 4*vec[META_POP_STORE].get();
1648 }
1649
1650 void add(dirfrag_load_vec_t& r) {
1651 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1652 vec[i].adjust(r.vec[i].get());
1653 }
1654 void sub(dirfrag_load_vec_t& r) {
1655 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1656 vec[i].adjust(-r.vec[i].get());
1657 }
1658 void scale(double f) {
1659 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1660 vec[i].scale(f);
1661 }
1662
1663 private:
1664 friend inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl);
1665 std::array<DecayCounter, NUM> vec;
1666 };
1667
1668 inline void encode(const dirfrag_load_vec_t &c, bufferlist &bl) {
1669 c.encode(bl);
1670 }
1671 inline void decode(dirfrag_load_vec_t& c, bufferlist::const_iterator &p) {
1672 c.decode(p);
1673 }
1674
1675 inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
1676 {
1677 std::ostringstream ss;
1678 ss << std::setprecision(1) << std::fixed
1679 << "[pop"
1680 " IRD:" << dl.vec[0]
1681 << " IWR:" << dl.vec[1]
1682 << " RDR:" << dl.vec[2]
1683 << " FET:" << dl.vec[3]
1684 << " STR:" << dl.vec[4]
1685 << " *LOAD:" << dl.meta_load() << "]";
1686 return out << ss.str() << std::endl;
1687 }
1688
1689 struct mds_load_t {
1690 using clock = dirfrag_load_vec_t::clock;
1691 using time = dirfrag_load_vec_t::time;
1692
1693 dirfrag_load_vec_t auth;
1694 dirfrag_load_vec_t all;
1695
1696 mds_load_t() : auth(DecayRate()), all(DecayRate()) {}
1697 mds_load_t(const DecayRate &rate) : auth(rate), all(rate) {}
1698
1699 double req_rate = 0.0;
1700 double cache_hit_rate = 0.0;
1701 double queue_len = 0.0;
1702
1703 double cpu_load_avg = 0.0;
1704
1705 double mds_load() const; // defiend in MDBalancer.cc
1706 void encode(bufferlist& bl) const;
1707 void decode(bufferlist::const_iterator& bl);
1708 void dump(Formatter *f) const;
1709 static void generate_test_instances(std::list<mds_load_t*>& ls);
1710 };
1711 inline void encode(const mds_load_t &c, bufferlist &bl) {
1712 c.encode(bl);
1713 }
1714 inline void decode(mds_load_t &c, bufferlist::const_iterator &p) {
1715 c.decode(p);
1716 }
1717
1718 inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
1719 {
1720 return out << "mdsload<" << load.auth << "/" << load.all
1721 << ", req " << load.req_rate
1722 << ", hr " << load.cache_hit_rate
1723 << ", qlen " << load.queue_len
1724 << ", cpu " << load.cpu_load_avg
1725 << ">";
1726 }
1727
1728 class load_spread_t {
1729 public:
1730 using time = DecayCounter::time;
1731 using clock = DecayCounter::clock;
1732 static const int MAX = 4;
1733
1734 load_spread_t(const DecayRate &rate) : count(rate)
1735 {}
1736
1737 load_spread_t() = delete;
1738
1739 double hit(int who) {
1740 for (int i=0; i<n; i++)
1741 if (last[i] == who)
1742 return count.get_last();
1743
1744 // we're new(ish)
1745 last[p++] = who;
1746 if (n < MAX) n++;
1747 if (n == 1) return 0.0;
1748
1749 if (p == MAX) p = 0;
1750
1751 return count.hit();
1752 }
1753 double get() const {
1754 return count.get();
1755 }
1756
1757 std::array<int, MAX> last = {-1, -1, -1, -1};
1758 int p = 0, n = 0;
1759 DecayCounter count;
1760 };
1761
1762 // ================================================================
1763 typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t;
1764
1765 // -- authority delegation --
1766 // directory authority types
1767 // >= 0 is the auth mds
1768 #define CDIR_AUTH_PARENT mds_rank_t(-1) // default
1769 #define CDIR_AUTH_UNKNOWN mds_rank_t(-2)
1770 #define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
1771 #define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
1772 //#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
1773
1774 class MDSCacheObjectInfo {
1775 public:
1776 void encode(bufferlist& bl) const;
1777 void decode(bufferlist::const_iterator& bl);
1778 void dump(Formatter *f) const;
1779 static void generate_test_instances(std::list<MDSCacheObjectInfo*>& ls);
1780
1781 inodeno_t ino = 0;
1782 dirfrag_t dirfrag;
1783 string dname;
1784 snapid_t snapid;
1785 };
1786
1787 inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) {
1788 if (info.ino) return out << info.ino << "." << info.snapid;
1789 if (info.dname.length()) return out << info.dirfrag << "/" << info.dname
1790 << " snap " << info.snapid;
1791 return out << info.dirfrag;
1792 }
1793
1794 inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) {
1795 if (l.ino || r.ino)
1796 return l.ino == r.ino && l.snapid == r.snapid;
1797 else
1798 return l.dirfrag == r.dirfrag && l.dname == r.dname;
1799 }
1800 WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
1801
1802 // parse a map of keys/values.
1803 namespace qi = boost::spirit::qi;
1804
1805 template <typename Iterator>
1806 struct keys_and_values
1807 : qi::grammar<Iterator, std::map<string, string>()>
1808 {
1809 keys_and_values()
1810 : keys_and_values::base_type(query)
1811 {
1812 query = pair >> *(qi::lit(' ') >> pair);
1813 pair = key >> '=' >> value;
1814 key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
1815 value = +qi::char_("a-zA-Z0-9-_.");
1816 }
1817 qi::rule<Iterator, std::map<string, string>()> query;
1818 qi::rule<Iterator, std::pair<string, string>()> pair;
1819 qi::rule<Iterator, string()> key, value;
1820 };
1821
1822 #endif