]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/mdstypes.h
3f0b05a095284122fa04c9022a1f47a651ac4b2b
[ceph.git] / ceph / src / mds / mdstypes.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_MDSTYPES_H
4 #define CEPH_MDSTYPES_H
5
6 #include "include/int_types.h"
7
8 #include <math.h>
9 #include <ostream>
10 #include <set>
11 #include <map>
12 #include <string_view>
13
14 #include "common/config.h"
15 #include "common/Clock.h"
16 #include "common/DecayCounter.h"
17 #include "common/entity_name.h"
18
19 #include "include/Context.h"
20 #include "include/frag.h"
21 #include "include/xlist.h"
22 #include "include/interval_set.h"
23 #include "include/compact_map.h"
24 #include "include/compact_set.h"
25 #include "include/fs_types.h"
26
27 #include "inode_backtrace.h"
28
29 #include <boost/spirit/include/qi.hpp>
30 #include <boost/pool/pool.hpp>
31 #include "include/ceph_assert.h"
32 #include <boost/serialization/strong_typedef.hpp>
33
// Magic string used to recognise an on-disk Ceph filesystem volume.
#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"

#define MDS_PORT_CACHE 0x200
#define MDS_PORT_LOCKER 0x300
#define MDS_PORT_MIGRATOR 0x400

#define MAX_MDS 0x100
#define NUM_STRAY 10

#define MDS_INO_ROOT 1

// No longer created but recognised in existing filesystems
// so that we don't try to fragment it.
#define MDS_INO_CEPH 2

#define MDS_INO_GLOBAL_SNAPREALM 3

// Each reserved inode range below is MAX_MDS wide, except the stray
// range which is MAX_MDS * NUM_STRAY wide.
#define MDS_INO_MDSDIR_OFFSET (1*MAX_MDS)
#define MDS_INO_STRAY_OFFSET (6*MAX_MDS)

// Locations for journal data
#define MDS_INO_LOG_OFFSET (2*MAX_MDS)
#define MDS_INO_LOG_BACKUP_OFFSET (3*MAX_MDS)
#define MDS_INO_LOG_POINTER_OFFSET (4*MAX_MDS)
#define MDS_INO_PURGE_QUEUE (5*MAX_MDS)

// First inode number past the stray range.
#define MDS_INO_SYSTEM_BASE ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))

// Inode number of stray directory i belonging to rank x.
#define MDS_INO_STRAY(x,i) (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
// Inode number of the mdsdir belonging to rank x.  The argument is
// parenthesised before the cast so that expression arguments such as
// `a - b` are converted as a whole rather than operand by operand.
#define MDS_INO_MDSDIR(x) (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))

// NOTE: the range-test macros evaluate their argument more than once.
#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
#define MDS_INO_IS_BASE(i) ((i) == MDS_INO_ROOT || (i) == MDS_INO_GLOBAL_SNAPREALM || MDS_INO_IS_MDSDIR(i))
#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)
71
72 typedef int32_t mds_rank_t;
73 constexpr mds_rank_t MDS_RANK_NONE = -1;
74
75 BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
76 extern const mds_gid_t MDS_GID_NONE;
77
78 typedef int32_t fs_cluster_id_t;
79 constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
80 // The namespace ID of the anonymous default filesystem from legacy systems
81 constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0;
82
83 class mds_role_t {
84 public:
85 mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_)
86 : fscid(fscid_), rank(rank_)
87 {}
88 mds_role_t() {}
89
90 bool operator<(mds_role_t const &rhs) const {
91 if (fscid < rhs.fscid) {
92 return true;
93 } else if (fscid == rhs.fscid) {
94 return rank < rhs.rank;
95 } else {
96 return false;
97 }
98 }
99
100 bool is_none() const {
101 return (rank == MDS_RANK_NONE);
102 }
103
104 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
105 mds_rank_t rank = MDS_RANK_NONE;
106 };
107 inline std::ostream& operator<<(std::ostream& out, const mds_role_t& role) {
108 return out << role.fscid << ":" << role.rank;
109 }
110
111 // CAPS
112 inline string gcap_string(int cap)
113 {
114 string s;
115 if (cap & CEPH_CAP_GSHARED) s += "s";
116 if (cap & CEPH_CAP_GEXCL) s += "x";
117 if (cap & CEPH_CAP_GCACHE) s += "c";
118 if (cap & CEPH_CAP_GRD) s += "r";
119 if (cap & CEPH_CAP_GWR) s += "w";
120 if (cap & CEPH_CAP_GBUFFER) s += "b";
121 if (cap & CEPH_CAP_GWREXTEND) s += "a";
122 if (cap & CEPH_CAP_GLAZYIO) s += "l";
123 return s;
124 }
125 inline string ccap_string(int cap)
126 {
127 string s;
128 if (cap & CEPH_CAP_PIN) s += "p";
129
130 int a = (cap >> CEPH_CAP_SAUTH) & 3;
131 if (a) s += 'A' + gcap_string(a);
132
133 a = (cap >> CEPH_CAP_SLINK) & 3;
134 if (a) s += 'L' + gcap_string(a);
135
136 a = (cap >> CEPH_CAP_SXATTR) & 3;
137 if (a) s += 'X' + gcap_string(a);
138
139 a = cap >> CEPH_CAP_SFILE;
140 if (a) s += 'F' + gcap_string(a);
141
142 if (s.length() == 0)
143 s = "-";
144 return s;
145 }
146
147 struct scatter_info_t {
148 version_t version = 0;
149 };
150
151 struct frag_info_t : public scatter_info_t {
152 int64_t size() const { return nfiles + nsubdirs; }
153
154 void zero() {
155 *this = frag_info_t();
156 }
157
158 // *this += cur - acc;
159 void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
160 if (cur.mtime > mtime) {
161 mtime = cur.mtime;
162 if (touched_mtime)
163 *touched_mtime = true;
164 }
165 if (cur.change_attr > change_attr) {
166 change_attr = cur.change_attr;
167 if (touched_chattr)
168 *touched_chattr = true;
169 }
170 nfiles += cur.nfiles - acc.nfiles;
171 nsubdirs += cur.nsubdirs - acc.nsubdirs;
172 }
173
174 void add(const frag_info_t& other) {
175 if (other.mtime > mtime)
176 mtime = other.mtime;
177 if (other.change_attr > change_attr)
178 change_attr = other.change_attr;
179 nfiles += other.nfiles;
180 nsubdirs += other.nsubdirs;
181 }
182
183 bool same_sums(const frag_info_t &o) const {
184 return mtime <= o.mtime &&
185 nfiles == o.nfiles &&
186 nsubdirs == o.nsubdirs;
187 }
188
189 void encode(bufferlist &bl) const;
190 void decode(bufferlist::const_iterator& bl);
191 void dump(Formatter *f) const;
192 static void generate_test_instances(std::list<frag_info_t*>& ls);
193
194 // this frag
195 utime_t mtime;
196 uint64_t change_attr = 0;
197 int64_t nfiles = 0; // files
198 int64_t nsubdirs = 0; // subdirs
199 };
200 WRITE_CLASS_ENCODER(frag_info_t)
201
202 inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
203 return memcmp(&l, &r, sizeof(l)) == 0;
204 }
205 inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
206 return !(l == r);
207 }
208
209 std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
210
211
212 struct nest_info_t : public scatter_info_t {
213 int64_t rsize() const { return rfiles + rsubdirs; }
214
215 void zero() {
216 *this = nest_info_t();
217 }
218
219 void sub(const nest_info_t &other) {
220 add(other, -1);
221 }
222 void add(const nest_info_t &other, int fac=1) {
223 if (other.rctime > rctime)
224 rctime = other.rctime;
225 rbytes += fac*other.rbytes;
226 rfiles += fac*other.rfiles;
227 rsubdirs += fac*other.rsubdirs;
228 rsnaps += fac*other.rsnaps;
229 }
230
231 // *this += cur - acc;
232 void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
233 if (cur.rctime > rctime)
234 rctime = cur.rctime;
235 rbytes += cur.rbytes - acc.rbytes;
236 rfiles += cur.rfiles - acc.rfiles;
237 rsubdirs += cur.rsubdirs - acc.rsubdirs;
238 rsnaps += cur.rsnaps - acc.rsnaps;
239 }
240
241 bool same_sums(const nest_info_t &o) const {
242 return rctime <= o.rctime &&
243 rbytes == o.rbytes &&
244 rfiles == o.rfiles &&
245 rsubdirs == o.rsubdirs &&
246 rsnaps == o.rsnaps;
247 }
248
249 void encode(bufferlist &bl) const;
250 void decode(bufferlist::const_iterator& bl);
251 void dump(Formatter *f) const;
252 static void generate_test_instances(std::list<nest_info_t*>& ls);
253
254 // this frag + children
255 utime_t rctime;
256 int64_t rbytes = 0;
257 int64_t rfiles = 0;
258 int64_t rsubdirs = 0;
259 int64_t rsnaps = 0;
260 };
261 WRITE_CLASS_ENCODER(nest_info_t)
262
263 inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
264 return memcmp(&l, &r, sizeof(l)) == 0;
265 }
266 inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
267 return !(l == r);
268 }
269
270 std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
271
272 struct vinodeno_t {
273 vinodeno_t() {}
274 vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
275
276 void encode(bufferlist& bl) const {
277 using ceph::encode;
278 encode(ino, bl);
279 encode(snapid, bl);
280 }
281 void decode(bufferlist::const_iterator& p) {
282 using ceph::decode;
283 decode(ino, p);
284 decode(snapid, p);
285 }
286
287 inodeno_t ino;
288 snapid_t snapid;
289 };
290 WRITE_CLASS_ENCODER(vinodeno_t)
291
292 inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
293 return l.ino == r.ino && l.snapid == r.snapid;
294 }
295 inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
296 return !(l == r);
297 }
298 inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
299 return
300 l.ino < r.ino ||
301 (l.ino == r.ino && l.snapid < r.snapid);
302 }
303
304 struct quota_info_t
305 {
306 void encode(bufferlist& bl) const {
307 ENCODE_START(1, 1, bl);
308 encode(max_bytes, bl);
309 encode(max_files, bl);
310 ENCODE_FINISH(bl);
311 }
312 void decode(bufferlist::const_iterator& p) {
313 DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
314 decode(max_bytes, p);
315 decode(max_files, p);
316 DECODE_FINISH(p);
317 }
318
319 void dump(Formatter *f) const;
320 static void generate_test_instances(std::list<quota_info_t *>& ls);
321
322 bool is_valid() const {
323 return max_bytes >=0 && max_files >=0;
324 }
325 bool is_enable() const {
326 return max_bytes || max_files;
327 }
328
329 int64_t max_bytes = 0;
330 int64_t max_files = 0;
331 };
332 WRITE_CLASS_ENCODER(quota_info_t)
333
334 inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
335 return memcmp(&l, &r, sizeof(l)) == 0;
336 }
337
338 ostream& operator<<(ostream &out, const quota_info_t &n);
339
340 namespace std {
341 template<> struct hash<vinodeno_t> {
342 size_t operator()(const vinodeno_t &vino) const {
343 hash<inodeno_t> H;
344 hash<uint64_t> I;
345 return H(vino.ino) ^ I(vino.snapid);
346 }
347 };
348 }
349
350 inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) {
351 out << vino.ino;
352 if (vino.snapid == CEPH_NOSNAP)
353 out << ".head";
354 else if (vino.snapid)
355 out << '.' << vino.snapid;
356 return out;
357 }
358
359 struct client_writeable_range_t {
360 struct byte_range_t {
361 uint64_t first = 0, last = 0; // interval client can write to
362 };
363
364 void encode(bufferlist &bl) const;
365 void decode(bufferlist::const_iterator& bl);
366 void dump(Formatter *f) const;
367 static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
368
369 byte_range_t range;
370 snapid_t follows = 0; // aka "data+metadata flushed thru"
371 };
372
373 inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::const_iterator& bl) {
374 decode(range.first, bl);
375 decode(range.last, bl);
376 }
377
378 WRITE_CLASS_ENCODER(client_writeable_range_t)
379
380 std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
381
382 inline bool operator==(const client_writeable_range_t& l,
383 const client_writeable_range_t& r) {
384 return l.range.first == r.range.first && l.range.last == r.range.last &&
385 l.follows == r.follows;
386 }
387
388 struct inline_data_t {
389 public:
390 inline_data_t() {}
391 inline_data_t(const inline_data_t& o) : version(o.version) {
392 if (o.blp)
393 get_data() = *o.blp;
394 }
395 inline_data_t& operator=(const inline_data_t& o) {
396 version = o.version;
397 if (o.blp)
398 get_data() = *o.blp;
399 else
400 free_data();
401 return *this;
402 }
403
404 void free_data() {
405 blp.reset();
406 }
407 bufferlist& get_data() {
408 if (!blp)
409 blp.reset(new bufferlist);
410 return *blp;
411 }
412 size_t length() const { return blp ? blp->length() : 0; }
413
414 bool operator==(const inline_data_t& o) const {
415 return length() == o.length() &&
416 (length() == 0 ||
417 (*const_cast<bufferlist*>(blp.get()) == *const_cast<bufferlist*>(o.blp.get())));
418 }
419 bool operator!=(const inline_data_t& o) const {
420 return !(*this == o);
421 }
422 void encode(bufferlist &bl) const;
423 void decode(bufferlist::const_iterator& bl);
424
425 version_t version = 1;
426
427 private:
428 std::unique_ptr<bufferlist> blp;
429 };
430 WRITE_CLASS_ENCODER(inline_data_t)
431
// Damage categories; values are consecutive starting at 0.
enum {
  DAMAGE_STATS = 0,     // statistics (dirstat, size, etc)
  DAMAGE_RSTATS = 1,    // recursive statistics (rstat, accounted_rstat)
  DAMAGE_FRAGTREE = 2   // fragtree -- repair by searching
};
using damage_flags_t = uint32_t;
438
439 template<template<typename> class Allocator = std::allocator>
440 struct inode_t {
441 /**
442 * ***************
443 * Do not forget to add any new fields to the compare() function.
444 * ***************
445 */
446 using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
447
448 inode_t()
449 {
450 clear_layout();
451 }
452
453 // file type
454 bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
455 bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; }
456 bool is_file() const { return (mode & S_IFMT) == S_IFREG; }
457
458 bool is_truncating() const { return (truncate_pending > 0); }
459 void truncate(uint64_t old_size, uint64_t new_size) {
460 ceph_assert(new_size < old_size);
461 if (old_size > max_size_ever)
462 max_size_ever = old_size;
463 truncate_from = old_size;
464 size = new_size;
465 rstat.rbytes = new_size;
466 truncate_size = size;
467 truncate_seq++;
468 truncate_pending++;
469 }
470
471 bool has_layout() const {
472 return layout != file_layout_t();
473 }
474
475 void clear_layout() {
476 layout = file_layout_t();
477 }
478
479 uint64_t get_layout_size_increment() const {
480 return layout.get_period();
481 }
482
483 bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
484
485 uint64_t get_client_range(client_t client) const {
486 auto it = client_ranges.find(client);
487 return it != client_ranges.end() ? it->second.range.last : 0;
488 }
489
490 uint64_t get_max_size() const {
491 uint64_t max = 0;
492 for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin();
493 p != client_ranges.end();
494 ++p)
495 if (p->second.range.last > max)
496 max = p->second.range.last;
497 return max;
498 }
499 void set_max_size(uint64_t new_max) {
500 if (new_max == 0) {
501 client_ranges.clear();
502 } else {
503 for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin();
504 p != client_ranges.end();
505 ++p)
506 p->second.range.last = new_max;
507 }
508 }
509
510 void trim_client_ranges(snapid_t last) {
511 std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin();
512 while (p != client_ranges.end()) {
513 if (p->second.follows >= last)
514 client_ranges.erase(p++);
515 else
516 ++p;
517 }
518 }
519
520 bool is_backtrace_updated() const {
521 return backtrace_version == version;
522 }
523 void update_backtrace(version_t pv=0) {
524 backtrace_version = pv ? pv : version;
525 }
526
527 void add_old_pool(int64_t l) {
528 backtrace_version = version;
529 old_pools.insert(l);
530 }
531
532 void encode(bufferlist &bl, uint64_t features) const;
533 void decode(bufferlist::const_iterator& bl);
534 void dump(Formatter *f) const;
535 static void generate_test_instances(std::list<inode_t*>& ls);
536 /**
537 * Compare this inode_t with another that represent *the same inode*
538 * at different points in time.
539 * @pre The inodes are the same ino
540 *
541 * @param other The inode_t to compare ourselves with
542 * @param divergent A bool pointer which will be set to true
543 * if the values are different in a way that can't be explained
544 * by one being a newer version than the other.
545 *
546 * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
547 */
548 int compare(const inode_t &other, bool *divergent) const;
549
550 // base (immutable)
551 inodeno_t ino = 0;
552 uint32_t rdev = 0; // if special file
553
554 // affected by any inode change...
555 utime_t ctime; // inode change time
556 utime_t btime; // birth time
557
558 // perm (namespace permissions)
559 uint32_t mode = 0;
560 uid_t uid = 0;
561 gid_t gid = 0;
562
563 // nlink
564 int32_t nlink = 0;
565
566 // file (data access)
567 ceph_dir_layout dir_layout = {}; // [dir only]
568 file_layout_t layout;
569 compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
570 uint64_t size = 0; // on directory, # dentries
571 uint64_t max_size_ever = 0; // max size the file has ever been
572 uint32_t truncate_seq = 0;
573 uint64_t truncate_size = 0, truncate_from = 0;
574 uint32_t truncate_pending = 0;
575 utime_t mtime; // file data modify time.
576 utime_t atime; // file data access time.
577 uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes())
578 inline_data_t inline_data; // FIXME check
579
580 // change attribute
581 uint64_t change_attr = 0;
582
583 client_range_map client_ranges; // client(s) can write to these ranges
584
585 // dirfrag, recursive accountin
586 frag_info_t dirstat; // protected by my filelock
587 nest_info_t rstat; // protected by my nestlock
588 nest_info_t accounted_rstat; // protected by parent's nestlock
589
590 quota_info_t quota;
591
592 mds_rank_t export_pin = MDS_RANK_NONE;
593
594 double export_ephemeral_random_pin = 0;
595 bool export_ephemeral_distributed_pin = false;
596
597 // special stuff
598 version_t version = 0; // auth only
599 version_t file_data_version = 0; // auth only
600 version_t xattr_version = 0;
601
602 utime_t last_scrub_stamp; // start time of last complete scrub
603 version_t last_scrub_version = 0;// (parent) start version of last complete scrub
604
605 version_t backtrace_version = 0;
606
607 snapid_t oldest_snap;
608
609 std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
610
611 private:
612 bool older_is_consistent(const inode_t &other) const;
613 };
614
615 // These methods may be moved back to mdstypes.cc when we have pmr
616 template<template<typename> class Allocator>
617 void inode_t<Allocator>::encode(bufferlist &bl, uint64_t features) const
618 {
619 ENCODE_START(16, 6, bl);
620
621 encode(ino, bl);
622 encode(rdev, bl);
623 encode(ctime, bl);
624
625 encode(mode, bl);
626 encode(uid, bl);
627 encode(gid, bl);
628
629 encode(nlink, bl);
630 {
631 // removed field
632 bool anchored = 0;
633 encode(anchored, bl);
634 }
635
636 encode(dir_layout, bl);
637 encode(layout, bl, features);
638 encode(size, bl);
639 encode(truncate_seq, bl);
640 encode(truncate_size, bl);
641 encode(truncate_from, bl);
642 encode(truncate_pending, bl);
643 encode(mtime, bl);
644 encode(atime, bl);
645 encode(time_warp_seq, bl);
646 encode(client_ranges, bl);
647
648 encode(dirstat, bl);
649 encode(rstat, bl);
650 encode(accounted_rstat, bl);
651
652 encode(version, bl);
653 encode(file_data_version, bl);
654 encode(xattr_version, bl);
655 encode(backtrace_version, bl);
656 encode(old_pools, bl);
657 encode(max_size_ever, bl);
658 encode(inline_data, bl);
659 encode(quota, bl);
660
661 encode(stray_prior_path, bl);
662
663 encode(last_scrub_version, bl);
664 encode(last_scrub_stamp, bl);
665
666 encode(btime, bl);
667 encode(change_attr, bl);
668
669 encode(export_pin, bl);
670
671 encode(export_ephemeral_random_pin, bl);
672 encode(export_ephemeral_distributed_pin, bl);
673
674 ENCODE_FINISH(bl);
675 }
676
677 template<template<typename> class Allocator>
678 void inode_t<Allocator>::decode(bufferlist::const_iterator &p)
679 {
680 DECODE_START_LEGACY_COMPAT_LEN(16, 6, 6, p);
681
682 decode(ino, p);
683 decode(rdev, p);
684 decode(ctime, p);
685
686 decode(mode, p);
687 decode(uid, p);
688 decode(gid, p);
689
690 decode(nlink, p);
691 {
692 bool anchored;
693 decode(anchored, p);
694 }
695
696 if (struct_v >= 4)
697 decode(dir_layout, p);
698 else {
699 // FIPS zeroization audit 20191117: this memset is not security related.
700 memset(&dir_layout, 0, sizeof(dir_layout));
701 }
702 decode(layout, p);
703 decode(size, p);
704 decode(truncate_seq, p);
705 decode(truncate_size, p);
706 decode(truncate_from, p);
707 if (struct_v >= 5)
708 decode(truncate_pending, p);
709 else
710 truncate_pending = 0;
711 decode(mtime, p);
712 decode(atime, p);
713 decode(time_warp_seq, p);
714 if (struct_v >= 3) {
715 decode(client_ranges, p);
716 } else {
717 map<client_t, client_writeable_range_t::byte_range_t> m;
718 decode(m, p);
719 for (map<client_t, client_writeable_range_t::byte_range_t>::iterator
720 q = m.begin(); q != m.end(); ++q)
721 client_ranges[q->first].range = q->second;
722 }
723
724 decode(dirstat, p);
725 decode(rstat, p);
726 decode(accounted_rstat, p);
727
728 decode(version, p);
729 decode(file_data_version, p);
730 decode(xattr_version, p);
731 if (struct_v >= 2)
732 decode(backtrace_version, p);
733 if (struct_v >= 7)
734 decode(old_pools, p);
735 if (struct_v >= 8)
736 decode(max_size_ever, p);
737 if (struct_v >= 9) {
738 decode(inline_data, p);
739 } else {
740 inline_data.version = CEPH_INLINE_NONE;
741 }
742 if (struct_v < 10)
743 backtrace_version = 0; // force update backtrace
744 if (struct_v >= 11)
745 decode(quota, p);
746
747 if (struct_v >= 12) {
748 std::string tmp;
749 decode(tmp, p);
750 stray_prior_path = std::string_view(tmp);
751 }
752
753 if (struct_v >= 13) {
754 decode(last_scrub_version, p);
755 decode(last_scrub_stamp, p);
756 }
757 if (struct_v >= 14) {
758 decode(btime, p);
759 decode(change_attr, p);
760 } else {
761 btime = utime_t();
762 change_attr = 0;
763 }
764
765 if (struct_v >= 15) {
766 decode(export_pin, p);
767 } else {
768 export_pin = MDS_RANK_NONE;
769 }
770
771 if (struct_v >= 16) {
772 decode(export_ephemeral_random_pin, p);
773 decode(export_ephemeral_distributed_pin, p);
774 } else {
775 export_ephemeral_random_pin = 0;
776 export_ephemeral_distributed_pin = false;
777 }
778
779 DECODE_FINISH(p);
780 }
781
782 template<template<typename> class Allocator>
783 void inode_t<Allocator>::dump(Formatter *f) const
784 {
785 f->dump_unsigned("ino", ino);
786 f->dump_unsigned("rdev", rdev);
787 f->dump_stream("ctime") << ctime;
788 f->dump_stream("btime") << btime;
789 f->dump_unsigned("mode", mode);
790 f->dump_unsigned("uid", uid);
791 f->dump_unsigned("gid", gid);
792 f->dump_unsigned("nlink", nlink);
793
794 f->open_object_section("dir_layout");
795 ::dump(dir_layout, f);
796 f->close_section();
797
798 f->dump_object("layout", layout);
799
800 f->open_array_section("old_pools");
801 for (const auto &p : old_pools) {
802 f->dump_int("pool", p);
803 }
804 f->close_section();
805
806 f->dump_unsigned("size", size);
807 f->dump_unsigned("truncate_seq", truncate_seq);
808 f->dump_unsigned("truncate_size", truncate_size);
809 f->dump_unsigned("truncate_from", truncate_from);
810 f->dump_unsigned("truncate_pending", truncate_pending);
811 f->dump_stream("mtime") << mtime;
812 f->dump_stream("atime") << atime;
813 f->dump_unsigned("time_warp_seq", time_warp_seq);
814 f->dump_unsigned("change_attr", change_attr);
815 f->dump_int("export_pin", export_pin);
816 f->dump_int("export_ephemeral_random_pin", export_ephemeral_random_pin);
817 f->dump_bool("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin);
818
819 f->open_array_section("client_ranges");
820 for (const auto &p : client_ranges) {
821 f->open_object_section("client");
822 f->dump_unsigned("client", p.first.v);
823 p.second.dump(f);
824 f->close_section();
825 }
826 f->close_section();
827
828 f->open_object_section("dirstat");
829 dirstat.dump(f);
830 f->close_section();
831
832 f->open_object_section("rstat");
833 rstat.dump(f);
834 f->close_section();
835
836 f->open_object_section("accounted_rstat");
837 accounted_rstat.dump(f);
838 f->close_section();
839
840 f->dump_unsigned("version", version);
841 f->dump_unsigned("file_data_version", file_data_version);
842 f->dump_unsigned("xattr_version", xattr_version);
843 f->dump_unsigned("backtrace_version", backtrace_version);
844
845 f->dump_string("stray_prior_path", stray_prior_path);
846 f->dump_unsigned("max_size_ever", max_size_ever);
847
848 f->open_object_section("quota");
849 quota.dump(f);
850 f->close_section();
851
852 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
853 f->dump_unsigned("last_scrub_version", last_scrub_version);
854 }
855
856 template<template<typename> class Allocator>
857 void inode_t<Allocator>::generate_test_instances(std::list<inode_t*>& ls)
858 {
859 ls.push_back(new inode_t<Allocator>);
860 ls.push_back(new inode_t<Allocator>);
861 ls.back()->ino = 1;
862 // i am lazy.
863 }
864
865 template<template<typename> class Allocator>
866 int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
867 {
868 ceph_assert(ino == other.ino);
869 *divergent = false;
870 if (version == other.version) {
871 if (rdev != other.rdev ||
872 ctime != other.ctime ||
873 btime != other.btime ||
874 mode != other.mode ||
875 uid != other.uid ||
876 gid != other.gid ||
877 nlink != other.nlink ||
878 memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
879 layout != other.layout ||
880 old_pools != other.old_pools ||
881 size != other.size ||
882 max_size_ever != other.max_size_ever ||
883 truncate_seq != other.truncate_seq ||
884 truncate_size != other.truncate_size ||
885 truncate_from != other.truncate_from ||
886 truncate_pending != other.truncate_pending ||
887 change_attr != other.change_attr ||
888 mtime != other.mtime ||
889 atime != other.atime ||
890 time_warp_seq != other.time_warp_seq ||
891 inline_data != other.inline_data ||
892 client_ranges != other.client_ranges ||
893 !(dirstat == other.dirstat) ||
894 !(rstat == other.rstat) ||
895 !(accounted_rstat == other.accounted_rstat) ||
896 file_data_version != other.file_data_version ||
897 xattr_version != other.xattr_version ||
898 backtrace_version != other.backtrace_version) {
899 *divergent = true;
900 }
901 return 0;
902 } else if (version > other.version) {
903 *divergent = !older_is_consistent(other);
904 return 1;
905 } else {
906 ceph_assert(version < other.version);
907 *divergent = !other.older_is_consistent(*this);
908 return -1;
909 }
910 }
911
912 template<template<typename> class Allocator>
913 bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
914 {
915 if (max_size_ever < other.max_size_ever ||
916 truncate_seq < other.truncate_seq ||
917 time_warp_seq < other.time_warp_seq ||
918 inline_data.version < other.inline_data.version ||
919 dirstat.version < other.dirstat.version ||
920 rstat.version < other.rstat.version ||
921 accounted_rstat.version < other.accounted_rstat.version ||
922 file_data_version < other.file_data_version ||
923 xattr_version < other.xattr_version ||
924 backtrace_version < other.backtrace_version) {
925 return false;
926 }
927 return true;
928 }
929
930 template<template<typename> class Allocator>
931 inline void encode(const inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
932 {
933 ENCODE_DUMP_PRE();
934 c.encode(bl, features);
935 ENCODE_DUMP_POST(cl);
936 }
937 template<template<typename> class Allocator>
938 inline void decode(inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
939 {
940 c.decode(p);
941 }
942
943 template<template<typename> class Allocator>
944 using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>;
945
946 template<template<typename> class Allocator>
947 using xattr_map = compact_map<alloc_string<Allocator>, bufferptr, std::less<alloc_string<Allocator>>, Allocator<std::pair<const alloc_string<Allocator>, bufferptr>>>; // FIXME bufferptr not in mempool
948
949 template<template<typename> class Allocator>
950 inline void decode_noshare(xattr_map<Allocator>& xattrs, ceph::buffer::list::const_iterator &p)
951 {
952 __u32 n;
953 decode(n, p);
954 while (n-- > 0) {
955 alloc_string<Allocator> key;
956 decode(key, p);
957 __u32 len;
958 decode(len, p);
959 p.copy_deep(len, xattrs[key]);
960 }
961 }
962
963 template<template<typename> class Allocator = std::allocator>
964 struct old_inode_t {
965 snapid_t first;
966 inode_t<Allocator> inode;
967 xattr_map<Allocator> xattrs;
968
969 void encode(bufferlist &bl, uint64_t features) const;
970 void decode(bufferlist::const_iterator& bl);
971 void dump(Formatter *f) const;
972 static void generate_test_instances(std::list<old_inode_t*>& ls);
973 };
974
975 // These methods may be moved back to mdstypes.cc when we have pmr
976 template<template<typename> class Allocator>
977 void old_inode_t<Allocator>::encode(bufferlist& bl, uint64_t features) const
978 {
979 ENCODE_START(2, 2, bl);
980 encode(first, bl);
981 encode(inode, bl, features);
982 encode(xattrs, bl);
983 ENCODE_FINISH(bl);
984 }
985
986 template<template<typename> class Allocator>
987 void old_inode_t<Allocator>::decode(bufferlist::const_iterator& bl)
988 {
989 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
990 decode(first, bl);
991 decode(inode, bl);
992 decode_noshare<Allocator>(xattrs, bl);
993 DECODE_FINISH(bl);
994 }
995
996 template<template<typename> class Allocator>
997 void old_inode_t<Allocator>::dump(Formatter *f) const
998 {
999 f->dump_unsigned("first", first);
1000 inode.dump(f);
1001 f->open_object_section("xattrs");
1002 for (const auto &p : xattrs) {
1003 std::string v(p.second.c_str(), p.second.length());
1004 f->dump_string(p.first.c_str(), v);
1005 }
1006 f->close_section();
1007 }
1008
1009 template<template<typename> class Allocator>
1010 void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls)
1011 {
1012 ls.push_back(new old_inode_t<Allocator>);
1013 ls.push_back(new old_inode_t<Allocator>);
1014 ls.back()->first = 2;
1015 std::list<inode_t<Allocator>*> ils;
1016 inode_t<Allocator>::generate_test_instances(ils);
1017 ls.back()->inode = *ils.back();
1018 ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4);
1019 ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3);
1020 }
1021
1022 template<template<typename> class Allocator>
1023 inline void encode(const old_inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
1024 {
1025 ENCODE_DUMP_PRE();
1026 c.encode(bl, features);
1027 ENCODE_DUMP_POST(cl);
1028 }
1029 template<template<typename> class Allocator>
1030 inline void decode(old_inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
1031 {
1032 c.decode(p);
1033 }
1034
1035 /*
1036 * like an inode, but for a dir frag
1037 */
1038 struct fnode_t {
1039 void encode(bufferlist &bl) const;
1040 void decode(bufferlist::const_iterator& bl);
1041 void dump(Formatter *f) const;
1042 static void generate_test_instances(std::list<fnode_t*>& ls);
1043
1044 version_t version = 0;
1045 snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru
1046 frag_info_t fragstat, accounted_fragstat;
1047 nest_info_t rstat, accounted_rstat;
1048 damage_flags_t damage_flags = 0;
1049
1050 // we know we and all our descendants have been scrubbed since this version
1051 version_t recursive_scrub_version = 0;
1052 utime_t recursive_scrub_stamp;
1053 // version at which we last scrubbed our personal data structures
1054 version_t localized_scrub_version = 0;
1055 utime_t localized_scrub_stamp;
1056 };
1057 WRITE_CLASS_ENCODER(fnode_t)
1058
1059
1060 struct old_rstat_t {
1061 void encode(bufferlist& bl) const;
1062 void decode(bufferlist::const_iterator& p);
1063 void dump(Formatter *f) const;
1064 static void generate_test_instances(std::list<old_rstat_t*>& ls);
1065
1066 snapid_t first;
1067 nest_info_t rstat, accounted_rstat;
1068 };
1069 WRITE_CLASS_ENCODER(old_rstat_t)
1070
1071 inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
1072 return out << "old_rstat(first " << o.first << " " << o.rstat << " " << o.accounted_rstat << ")";
1073 }
1074
1075 class feature_bitset_t {
1076 public:
1077 typedef uint64_t block_type;
1078 static const size_t bits_per_block = sizeof(block_type) * 8;
1079
1080 feature_bitset_t(const feature_bitset_t& other) : _vec(other._vec) {}
1081 feature_bitset_t(feature_bitset_t&& other) : _vec(std::move(other._vec)) {}
1082 feature_bitset_t(unsigned long value = 0);
1083 feature_bitset_t(const vector<size_t>& array);
1084 feature_bitset_t& operator=(const feature_bitset_t& other) {
1085 _vec = other._vec;
1086 return *this;
1087 }
1088 feature_bitset_t& operator=(feature_bitset_t&& other) {
1089 _vec = std::move(other._vec);
1090 return *this;
1091 }
1092 feature_bitset_t& operator-=(const feature_bitset_t& other);
1093 bool empty() const {
1094 //block_type is a uint64_t. If the vector is only composed of 0s, then it's still "empty"
1095 for (auto& v : _vec) {
1096 if (v)
1097 return false;
1098 }
1099 return true;
1100 }
1101 bool test(size_t bit) const {
1102 if (bit >= bits_per_block * _vec.size())
1103 return false;
1104 return _vec[bit / bits_per_block] & ((block_type)1 << (bit % bits_per_block));
1105 }
1106 void clear() {
1107 _vec.clear();
1108 }
1109 void encode(bufferlist& bl) const;
1110 void decode(bufferlist::const_iterator &p);
1111 void dump(Formatter *f) const;
1112 void print(ostream& out) const;
1113 private:
1114 vector<block_type> _vec;
1115 };
1116 WRITE_CLASS_ENCODER(feature_bitset_t)
1117
1118 inline std::ostream& operator<<(std::ostream& out, const feature_bitset_t& s) {
1119 s.print(out);
1120 return out;
1121 }
1122
1123 struct metric_spec_t {
1124 metric_spec_t() {}
1125 metric_spec_t(const metric_spec_t& other) :
1126 metric_flags(other.metric_flags) {}
1127 metric_spec_t(metric_spec_t&& other) :
1128 metric_flags(std::move(other.metric_flags)) {}
1129 metric_spec_t(const feature_bitset_t& mf) :
1130 metric_flags(mf) {}
1131 metric_spec_t(feature_bitset_t&& mf) :
1132 metric_flags(std::move(mf)) {}
1133
1134 metric_spec_t& operator=(const metric_spec_t& other) {
1135 metric_flags = other.metric_flags;
1136 return *this;
1137 }
1138 metric_spec_t& operator=(metric_spec_t&& other) {
1139 metric_flags = std::move(other.metric_flags);
1140 return *this;
1141 }
1142
1143 bool empty() const {
1144 return metric_flags.empty();
1145 }
1146
1147 void clear() {
1148 metric_flags.clear();
1149 }
1150
1151 void encode(bufferlist& bl) const;
1152 void decode(bufferlist::const_iterator& p);
1153 void dump(Formatter *f) const;
1154 void print(ostream& out) const;
1155
1156 // set of metrics that a client is capable of forwarding
1157 feature_bitset_t metric_flags;
1158 };
1159 WRITE_CLASS_ENCODER(metric_spec_t)
1160
1161 inline std::ostream& operator<<(std::ostream& out, const metric_spec_t& mst) {
1162 mst.print(out);
1163 return out;
1164 }
1165
1166 /*
1167 * client_metadata_t
1168 */
1169 struct client_metadata_t {
1170 using kv_map_t = std::map<std::string,std::string>;
1171 using iterator = kv_map_t::const_iterator;
1172
1173 client_metadata_t() {}
1174 client_metadata_t(const kv_map_t& kv, const feature_bitset_t &f, const metric_spec_t &mst) :
1175 kv_map(kv),
1176 features(f),
1177 metric_spec(mst) {}
1178 client_metadata_t& operator=(const client_metadata_t& other) {
1179 kv_map = other.kv_map;
1180 features = other.features;
1181 metric_spec = other.metric_spec;
1182 return *this;
1183 }
1184
1185 bool empty() const { return kv_map.empty() && features.empty() && metric_spec.empty(); }
1186 iterator find(const std::string& key) const { return kv_map.find(key); }
1187 iterator begin() const { return kv_map.begin(); }
1188 iterator end() const { return kv_map.end(); }
1189 void erase(iterator it) { kv_map.erase(it); }
1190 std::string& operator[](const std::string& key) { return kv_map[key]; }
1191 void merge(const client_metadata_t& other) {
1192 kv_map.insert(other.kv_map.begin(), other.kv_map.end());
1193 features = other.features;
1194 metric_spec = other.metric_spec;
1195 }
1196 void clear() {
1197 kv_map.clear();
1198 features.clear();
1199 metric_spec.clear();
1200 }
1201
1202 void encode(bufferlist& bl) const;
1203 void decode(bufferlist::const_iterator& p);
1204 void dump(Formatter *f) const;
1205
1206 kv_map_t kv_map;
1207 feature_bitset_t features;
1208 metric_spec_t metric_spec;
1209 };
1210 WRITE_CLASS_ENCODER(client_metadata_t)
1211
1212 /*
1213 * session_info_t - durable part of a Session
1214 */
1215 struct session_info_t {
1216 client_t get_client() const { return client_t(inst.name.num()); }
1217 bool has_feature(size_t bit) const { return client_metadata.features.test(bit); }
1218 const entity_name_t& get_source() const { return inst.name; }
1219
1220 void clear_meta() {
1221 prealloc_inos.clear();
1222 used_inos.clear();
1223 completed_requests.clear();
1224 completed_flushes.clear();
1225 client_metadata.clear();
1226 }
1227
1228 void encode(bufferlist& bl, uint64_t features) const;
1229 void decode(bufferlist::const_iterator& p);
1230 void dump(Formatter *f) const;
1231 static void generate_test_instances(std::list<session_info_t*>& ls);
1232
1233 entity_inst_t inst;
1234 std::map<ceph_tid_t,inodeno_t> completed_requests;
1235 interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use.
1236 interval_set<inodeno_t> used_inos; // journaling use
1237 client_metadata_t client_metadata;
1238 std::set<ceph_tid_t> completed_flushes;
1239 EntityName auth_name;
1240 };
1241 WRITE_CLASS_ENCODER_FEATURES(session_info_t)
1242
1243 // dentries
1244 struct dentry_key_t {
1245 dentry_key_t() {}
1246 dentry_key_t(snapid_t s, std::string_view n, __u32 h=0) :
1247 snapid(s), name(n), hash(h) {}
1248
1249 bool is_valid() { return name.length() || snapid; }
1250
1251 // encode into something that can be decoded as a string.
1252 // name_ (head) or name_%x (!head)
1253 void encode(bufferlist& bl) const {
1254 string key;
1255 encode(key);
1256 using ceph::encode;
1257 encode(key, bl);
1258 }
1259 void encode(string& key) const {
1260 char b[20];
1261 if (snapid != CEPH_NOSNAP) {
1262 uint64_t val(snapid);
1263 snprintf(b, sizeof(b), "%" PRIx64, val);
1264 } else {
1265 snprintf(b, sizeof(b), "%s", "head");
1266 }
1267 ostringstream oss;
1268 oss << name << "_" << b;
1269 key = oss.str();
1270 }
1271 static void decode_helper(bufferlist::const_iterator& bl, string& nm, snapid_t& sn) {
1272 string key;
1273 decode(key, bl);
1274 decode_helper(key, nm, sn);
1275 }
1276 static void decode_helper(std::string_view key, string& nm, snapid_t& sn) {
1277 size_t i = key.find_last_of('_');
1278 ceph_assert(i != string::npos);
1279 if (key.compare(i+1, std::string_view::npos, "head") == 0) {
1280 // name_head
1281 sn = CEPH_NOSNAP;
1282 } else {
1283 // name_%x
1284 long long unsigned x = 0;
1285 std::string x_str(key.substr(i+1));
1286 sscanf(x_str.c_str(), "%llx", &x);
1287 sn = x;
1288 }
1289 nm = key.substr(0, i);
1290 }
1291
1292 snapid_t snapid = 0;
1293 std::string_view name;
1294 __u32 hash = 0;
1295 };
1296
1297 inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
1298 {
1299 return out << "(" << k.name << "," << k.snapid << ")";
1300 }
1301
1302 inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
1303 {
1304 /*
1305 * order by hash, name, snap
1306 */
1307 int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
1308 if (c)
1309 return c < 0;
1310 c = k1.name.compare(k2.name);
1311 if (c)
1312 return c < 0;
1313 return k1.snapid < k2.snapid;
1314 }
1315
1316 /*
1317 * string_snap_t is a simple (string, snapid_t) pair
1318 */
1319 struct string_snap_t {
1320 string_snap_t() {}
1321 string_snap_t(std::string_view n, snapid_t s) : name(n), snapid(s) {}
1322
1323 void encode(bufferlist& bl) const;
1324 void decode(bufferlist::const_iterator& p);
1325 void dump(Formatter *f) const;
1326 static void generate_test_instances(std::list<string_snap_t*>& ls);
1327
1328 string name;
1329 snapid_t snapid;
1330 };
1331 WRITE_CLASS_ENCODER(string_snap_t)
1332
1333 inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
1334 int c = l.name.compare(r.name);
1335 return c < 0 || (c == 0 && l.snapid < r.snapid);
1336 }
1337
1338 inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
1339 {
1340 return out << "(" << k.name << "," << k.snapid << ")";
1341 }
1342
1343 /*
1344 * mds_table_pending_t
1345 *
1346 * For mds's requesting any pending ops, child needs to encode the corresponding
1347 * pending mutation state in the table.
1348 */
1349 struct mds_table_pending_t {
1350 void encode(bufferlist& bl) const;
1351 void decode(bufferlist::const_iterator& bl);
1352 void dump(Formatter *f) const;
1353 static void generate_test_instances(std::list<mds_table_pending_t*>& ls);
1354
1355 uint64_t reqid = 0;
1356 __s32 mds = 0;
1357 version_t tid = 0;
1358 };
1359 WRITE_CLASS_ENCODER(mds_table_pending_t)
1360
1361 // requests
1362 struct metareqid_t {
1363 metareqid_t() {}
1364 metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
1365 void encode(bufferlist& bl) const {
1366 using ceph::encode;
1367 encode(name, bl);
1368 encode(tid, bl);
1369 }
1370 void decode(bufferlist::const_iterator &p) {
1371 using ceph::decode;
1372 decode(name, p);
1373 decode(tid, p);
1374 }
1375
1376 entity_name_t name;
1377 uint64_t tid = 0;
1378 };
1379 WRITE_CLASS_ENCODER(metareqid_t)
1380
1381 inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
1382 return out << r.name << ":" << r.tid;
1383 }
1384
1385 inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
1386 return (l.name == r.name) && (l.tid == r.tid);
1387 }
1388 inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
1389 return (l.name != r.name) || (l.tid != r.tid);
1390 }
1391 inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
1392 return (l.name < r.name) ||
1393 (l.name == r.name && l.tid < r.tid);
1394 }
1395 inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
1396 return (l.name < r.name) ||
1397 (l.name == r.name && l.tid <= r.tid);
1398 }
1399 inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); }
1400 inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); }
1401
1402 namespace std {
1403 template<> struct hash<metareqid_t> {
1404 size_t operator()(const metareqid_t &r) const {
1405 hash<uint64_t> H;
1406 return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid);
1407 }
1408 };
1409 } // namespace std
1410
1411 // cap info for client reconnect
1412 struct cap_reconnect_t {
1413 cap_reconnect_t() {}
1414 cap_reconnect_t(uint64_t cap_id, inodeno_t pino, std::string_view p, int w, int i,
1415 inodeno_t sr, snapid_t sf, bufferlist& lb) :
1416 path(p) {
1417 capinfo.cap_id = cap_id;
1418 capinfo.wanted = w;
1419 capinfo.issued = i;
1420 capinfo.snaprealm = sr;
1421 capinfo.pathbase = pino;
1422 capinfo.flock_len = 0;
1423 snap_follows = sf;
1424 flockbl.claim(lb);
1425 }
1426 void encode(bufferlist& bl) const;
1427 void decode(bufferlist::const_iterator& bl);
1428 void encode_old(bufferlist& bl) const;
1429 void decode_old(bufferlist::const_iterator& bl);
1430
1431 void dump(Formatter *f) const;
1432 static void generate_test_instances(std::list<cap_reconnect_t*>& ls);
1433
1434 string path;
1435 mutable ceph_mds_cap_reconnect capinfo = {};
1436 snapid_t snap_follows = 0;
1437 bufferlist flockbl;
1438 };
1439 WRITE_CLASS_ENCODER(cap_reconnect_t)
1440
1441 struct snaprealm_reconnect_t {
1442 snaprealm_reconnect_t() {}
1443 snaprealm_reconnect_t(inodeno_t ino, snapid_t seq, inodeno_t parent) {
1444 realm.ino = ino;
1445 realm.seq = seq;
1446 realm.parent = parent;
1447 }
1448 void encode(bufferlist& bl) const;
1449 void decode(bufferlist::const_iterator& bl);
1450 void encode_old(bufferlist& bl) const;
1451 void decode_old(bufferlist::const_iterator& bl);
1452
1453 void dump(Formatter *f) const;
1454 static void generate_test_instances(std::list<snaprealm_reconnect_t*>& ls);
1455
1456 mutable ceph_mds_snaprealm_reconnect realm = {};
1457 };
1458 WRITE_CLASS_ENCODER(snaprealm_reconnect_t)
1459
1460 // compat for pre-FLOCK feature
1461 struct old_ceph_mds_cap_reconnect {
1462 ceph_le64 cap_id;
1463 ceph_le32 wanted;
1464 ceph_le32 issued;
1465 ceph_le64 old_size;
1466 struct ceph_timespec old_mtime, old_atime;
1467 ceph_le64 snaprealm;
1468 ceph_le64 pathbase; /* base ino for our path to this ino */
1469 } __attribute__ ((packed));
1470 WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect)
1471
1472 struct old_cap_reconnect_t {
1473 const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) {
1474 path = n.path;
1475 capinfo.cap_id = n.capinfo.cap_id;
1476 capinfo.wanted = n.capinfo.wanted;
1477 capinfo.issued = n.capinfo.issued;
1478 capinfo.snaprealm = n.capinfo.snaprealm;
1479 capinfo.pathbase = n.capinfo.pathbase;
1480 return *this;
1481 }
1482 operator cap_reconnect_t() {
1483 cap_reconnect_t n;
1484 n.path = path;
1485 n.capinfo.cap_id = capinfo.cap_id;
1486 n.capinfo.wanted = capinfo.wanted;
1487 n.capinfo.issued = capinfo.issued;
1488 n.capinfo.snaprealm = capinfo.snaprealm;
1489 n.capinfo.pathbase = capinfo.pathbase;
1490 return n;
1491 }
1492
1493 void encode(bufferlist& bl) const {
1494 using ceph::encode;
1495 encode(path, bl);
1496 encode(capinfo, bl);
1497 }
1498 void decode(bufferlist::const_iterator& bl) {
1499 using ceph::decode;
1500 decode(path, bl);
1501 decode(capinfo, bl);
1502 }
1503
1504 string path;
1505 old_ceph_mds_cap_reconnect capinfo;
1506 };
1507 WRITE_CLASS_ENCODER(old_cap_reconnect_t)
1508
1509 // dir frag
1510 struct dirfrag_t {
1511 dirfrag_t() {}
1512 dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
1513
1514 void encode(bufferlist& bl) const {
1515 using ceph::encode;
1516 encode(ino, bl);
1517 encode(frag, bl);
1518 }
1519 void decode(bufferlist::const_iterator& bl) {
1520 using ceph::decode;
1521 decode(ino, bl);
1522 decode(frag, bl);
1523 }
1524
1525 inodeno_t ino = 0;
1526 frag_t frag;
1527 };
1528 WRITE_CLASS_ENCODER(dirfrag_t)
1529
1530 inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) {
1531 out << df.ino;
1532 if (!df.frag.is_root()) out << "." << df.frag;
1533 return out;
1534 }
1535 inline bool operator<(dirfrag_t l, dirfrag_t r) {
1536 if (l.ino < r.ino) return true;
1537 if (l.ino == r.ino && l.frag < r.frag) return true;
1538 return false;
1539 }
1540 inline bool operator==(dirfrag_t l, dirfrag_t r) {
1541 return l.ino == r.ino && l.frag == r.frag;
1542 }
1543
1544 namespace std {
1545 template<> struct hash<dirfrag_t> {
1546 size_t operator()(const dirfrag_t &df) const {
1547 static rjhash<uint64_t> H;
1548 static rjhash<uint32_t> I;
1549 return H(df.ino) ^ I(df.frag);
1550 }
1551 };
1552 } // namespace std
1553
1554 // ================================================================
// Indices into the counter array of dirfrag_load_vec_t.
// dirfrag_load_vec_t::meta_load() weights them 1, 2, 1, 2 and 4 respectively.
#define META_POP_IRD     0
#define META_POP_IWR     1
#define META_POP_READDIR 2
#define META_POP_FETCH   3
#define META_POP_STORE   4
// number of META_POP_* indices above
#define META_NPOP        5
1561
1562 class inode_load_vec_t {
1563 public:
1564 using time = DecayCounter::time;
1565 using clock = DecayCounter::clock;
1566 static const size_t NUM = 2;
1567
1568 inode_load_vec_t() : vec{DecayCounter(DecayRate()), DecayCounter(DecayRate())} {}
1569 inode_load_vec_t(const DecayRate &rate) : vec{DecayCounter(rate), DecayCounter(rate)} {}
1570
1571 DecayCounter &get(int t) {
1572 return vec[t];
1573 }
1574 void zero() {
1575 for (auto &d : vec) {
1576 d.reset();
1577 }
1578 }
1579 void encode(bufferlist &bl) const;
1580 void decode(bufferlist::const_iterator& p);
1581 void dump(Formatter *f) const;
1582 static void generate_test_instances(std::list<inode_load_vec_t*>& ls);
1583
1584 private:
1585 std::array<DecayCounter, NUM> vec;
1586 };
1587 inline void encode(const inode_load_vec_t &c, bufferlist &bl) {
1588 c.encode(bl);
1589 }
1590 inline void decode(inode_load_vec_t & c, bufferlist::const_iterator &p) {
1591 c.decode(p);
1592 }
1593
1594 class dirfrag_load_vec_t {
1595 public:
1596 using time = DecayCounter::time;
1597 using clock = DecayCounter::clock;
1598 static const size_t NUM = 5;
1599
1600 dirfrag_load_vec_t() :
1601 vec{DecayCounter(DecayRate()),
1602 DecayCounter(DecayRate()),
1603 DecayCounter(DecayRate()),
1604 DecayCounter(DecayRate()),
1605 DecayCounter(DecayRate())
1606 }
1607 {}
1608 dirfrag_load_vec_t(const DecayRate &rate) :
1609 vec{DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate)}
1610 {}
1611
1612 void encode(bufferlist &bl) const {
1613 ENCODE_START(2, 2, bl);
1614 for (const auto &i : vec) {
1615 encode(i, bl);
1616 }
1617 ENCODE_FINISH(bl);
1618 }
1619 void decode(bufferlist::const_iterator &p) {
1620 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
1621 for (auto &i : vec) {
1622 decode(i, p);
1623 }
1624 DECODE_FINISH(p);
1625 }
1626 void dump(Formatter *f) const;
1627 void dump(Formatter *f, const DecayRate& rate) const;
1628 static void generate_test_instances(std::list<dirfrag_load_vec_t*>& ls);
1629
1630 const DecayCounter &get(int t) const {
1631 return vec[t];
1632 }
1633 DecayCounter &get(int t) {
1634 return vec[t];
1635 }
1636 void adjust(double d) {
1637 for (auto &i : vec) {
1638 i.adjust(d);
1639 }
1640 }
1641 void zero() {
1642 for (auto &i : vec) {
1643 i.reset();
1644 }
1645 }
1646 double meta_load() const {
1647 return
1648 1*vec[META_POP_IRD].get() +
1649 2*vec[META_POP_IWR].get() +
1650 1*vec[META_POP_READDIR].get() +
1651 2*vec[META_POP_FETCH].get() +
1652 4*vec[META_POP_STORE].get();
1653 }
1654
1655 void add(dirfrag_load_vec_t& r) {
1656 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1657 vec[i].adjust(r.vec[i].get());
1658 }
1659 void sub(dirfrag_load_vec_t& r) {
1660 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1661 vec[i].adjust(-r.vec[i].get());
1662 }
1663 void scale(double f) {
1664 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1665 vec[i].scale(f);
1666 }
1667
1668 private:
1669 friend inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl);
1670 std::array<DecayCounter, NUM> vec;
1671 };
1672
1673 inline void encode(const dirfrag_load_vec_t &c, bufferlist &bl) {
1674 c.encode(bl);
1675 }
1676 inline void decode(dirfrag_load_vec_t& c, bufferlist::const_iterator &p) {
1677 c.decode(p);
1678 }
1679
1680 inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
1681 {
1682 std::ostringstream ss;
1683 ss << std::setprecision(1) << std::fixed
1684 << "[pop"
1685 " IRD:" << dl.vec[0]
1686 << " IWR:" << dl.vec[1]
1687 << " RDR:" << dl.vec[2]
1688 << " FET:" << dl.vec[3]
1689 << " STR:" << dl.vec[4]
1690 << " *LOAD:" << dl.meta_load() << "]";
1691 return out << ss.str() << std::endl;
1692 }
1693
1694 struct mds_load_t {
1695 using clock = dirfrag_load_vec_t::clock;
1696 using time = dirfrag_load_vec_t::time;
1697
1698 dirfrag_load_vec_t auth;
1699 dirfrag_load_vec_t all;
1700
1701 mds_load_t() : auth(DecayRate()), all(DecayRate()) {}
1702 mds_load_t(const DecayRate &rate) : auth(rate), all(rate) {}
1703
1704 double req_rate = 0.0;
1705 double cache_hit_rate = 0.0;
1706 double queue_len = 0.0;
1707
1708 double cpu_load_avg = 0.0;
1709
1710 double mds_load() const; // defiend in MDBalancer.cc
1711 void encode(bufferlist& bl) const;
1712 void decode(bufferlist::const_iterator& bl);
1713 void dump(Formatter *f) const;
1714 static void generate_test_instances(std::list<mds_load_t*>& ls);
1715 };
1716 inline void encode(const mds_load_t &c, bufferlist &bl) {
1717 c.encode(bl);
1718 }
1719 inline void decode(mds_load_t &c, bufferlist::const_iterator &p) {
1720 c.decode(p);
1721 }
1722
1723 inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
1724 {
1725 return out << "mdsload<" << load.auth << "/" << load.all
1726 << ", req " << load.req_rate
1727 << ", hr " << load.cache_hit_rate
1728 << ", qlen " << load.queue_len
1729 << ", cpu " << load.cpu_load_avg
1730 << ">";
1731 }
1732
1733 class load_spread_t {
1734 public:
1735 using time = DecayCounter::time;
1736 using clock = DecayCounter::clock;
1737 static const int MAX = 4;
1738
1739 load_spread_t(const DecayRate &rate) : count(rate)
1740 {}
1741
1742 load_spread_t() = delete;
1743
1744 double hit(int who) {
1745 for (int i=0; i<n; i++)
1746 if (last[i] == who)
1747 return count.get_last();
1748
1749 // we're new(ish)
1750 last[p++] = who;
1751 if (n < MAX) n++;
1752 if (n == 1) return 0.0;
1753
1754 if (p == MAX) p = 0;
1755
1756 return count.hit();
1757 }
1758 double get() const {
1759 return count.get();
1760 }
1761
1762 std::array<int, MAX> last = {-1, -1, -1, -1};
1763 int p = 0, n = 0;
1764 DecayCounter count;
1765 };
1766
1767 // ================================================================
1768 typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t;
1769
1770 // -- authority delegation --
1771 // directory authority types
1772 // >= 0 is the auth mds
1773 #define CDIR_AUTH_PARENT mds_rank_t(-1) // default
1774 #define CDIR_AUTH_UNKNOWN mds_rank_t(-2)
1775 #define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
1776 #define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
1777 //#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
1778
1779 class MDSCacheObjectInfo {
1780 public:
1781 void encode(bufferlist& bl) const;
1782 void decode(bufferlist::const_iterator& bl);
1783 void dump(Formatter *f) const;
1784 static void generate_test_instances(std::list<MDSCacheObjectInfo*>& ls);
1785
1786 inodeno_t ino = 0;
1787 dirfrag_t dirfrag;
1788 string dname;
1789 snapid_t snapid;
1790 };
1791
1792 inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) {
1793 if (info.ino) return out << info.ino << "." << info.snapid;
1794 if (info.dname.length()) return out << info.dirfrag << "/" << info.dname
1795 << " snap " << info.snapid;
1796 return out << info.dirfrag;
1797 }
1798
1799 inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) {
1800 if (l.ino || r.ino)
1801 return l.ino == r.ino && l.snapid == r.snapid;
1802 else
1803 return l.dirfrag == r.dirfrag && l.dname == r.dname;
1804 }
1805 WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
1806
1807 // parse a map of keys/values.
1808 namespace qi = boost::spirit::qi;
1809
1810 template <typename Iterator>
1811 struct keys_and_values
1812 : qi::grammar<Iterator, std::map<string, string>()>
1813 {
1814 keys_and_values()
1815 : keys_and_values::base_type(query)
1816 {
1817 query = pair >> *(qi::lit(' ') >> pair);
1818 pair = key >> '=' >> value;
1819 key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
1820 value = +qi::char_("a-zA-Z0-9-_.");
1821 }
1822 qi::rule<Iterator, std::map<string, string>()> query;
1823 qi::rule<Iterator, std::pair<string, string>()> pair;
1824 qi::rule<Iterator, string()> key, value;
1825 };
1826
1827 #endif