]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/mdstypes.h
import ceph 15.2.14
[ceph.git] / ceph / src / mds / mdstypes.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_MDSTYPES_H
4 #define CEPH_MDSTYPES_H
5
6 #include "include/int_types.h"
7
8 #include <math.h>
9 #include <ostream>
10 #include <set>
11 #include <map>
12 #include <string_view>
13
14 #include "common/config.h"
15 #include "common/Clock.h"
16 #include "common/DecayCounter.h"
17 #include "common/entity_name.h"
18
19 #include "include/Context.h"
20 #include "include/frag.h"
21 #include "include/xlist.h"
22 #include "include/interval_set.h"
23 #include "include/compact_map.h"
24 #include "include/compact_set.h"
25 #include "include/fs_types.h"
26 #include "include/ceph_fs.h"
27
28 #include "inode_backtrace.h"
29
30 #include <boost/spirit/include/qi.hpp>
31 #include <boost/pool/pool.hpp>
32 #include "include/ceph_assert.h"
33 #include <boost/serialization/strong_typedef.hpp>
34
#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"

#define MDS_PORT_CACHE 0x200
#define MDS_PORT_LOCKER 0x300
#define MDS_PORT_MIGRATOR 0x400

#define MAX_MDS 0x100
#define NUM_STRAY 10

// Inode numbers 1, 2 and 4: please see CEPH_INO_* in include/ceph_fs.h

// Each of the ranges below is MAX_MDS inodes wide (the stray range is
// MAX_MDS * NUM_STRAY wide), laid out back to back.
#define MDS_INO_MDSDIR_OFFSET (1*MAX_MDS)
#define MDS_INO_STRAY_OFFSET (6*MAX_MDS)

// Locations for journal data
#define MDS_INO_LOG_OFFSET (2*MAX_MDS)
#define MDS_INO_LOG_BACKUP_OFFSET (3*MAX_MDS)
#define MDS_INO_LOG_POINTER_OFFSET (4*MAX_MDS)
#define MDS_INO_PURGE_QUEUE (5*MAX_MDS)

// First inode number past the stray range.
#define MDS_INO_SYSTEM_BASE ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))

// Stray directory `i` of rank `x`, and the mdsdir of rank `x`.
// The arguments are fully parenthesised before the cast so that expression
// arguments (e.g. `a + b`) are converted as a whole.
#define MDS_INO_STRAY(x,i) (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
#define MDS_INO_MDSDIR(x) (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))

#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
#define MDS_INO_IS_BASE(i) ((i) == CEPH_INO_ROOT || (i) == CEPH_INO_GLOBAL_SNAPREALM || MDS_INO_IS_MDSDIR(i))
#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)

// True for every inode number in [MDS_INO_MDSDIR_OFFSET, MDS_INO_SYSTEM_BASE).
#define MDS_IS_PRIVATE_INO(i) ((i) < MDS_INO_SYSTEM_BASE && (i) >= MDS_INO_MDSDIR_OFFSET)
68
69 typedef int32_t mds_rank_t;
70 constexpr mds_rank_t MDS_RANK_NONE = -1;
71
72 BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
73 extern const mds_gid_t MDS_GID_NONE;
74
75 typedef int32_t fs_cluster_id_t;
76 constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
77 // The namespace ID of the anonymous default filesystem from legacy systems
78 constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0;
79
80 class mds_role_t {
81 public:
82 mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_)
83 : fscid(fscid_), rank(rank_)
84 {}
85 mds_role_t() {}
86
87 bool operator<(mds_role_t const &rhs) const {
88 if (fscid < rhs.fscid) {
89 return true;
90 } else if (fscid == rhs.fscid) {
91 return rank < rhs.rank;
92 } else {
93 return false;
94 }
95 }
96
97 bool is_none() const {
98 return (rank == MDS_RANK_NONE);
99 }
100
101 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
102 mds_rank_t rank = MDS_RANK_NONE;
103 };
104 inline std::ostream& operator<<(std::ostream& out, const mds_role_t& role) {
105 return out << role.fscid << ":" << role.rank;
106 }
107
108 // CAPS
109 inline string gcap_string(int cap)
110 {
111 string s;
112 if (cap & CEPH_CAP_GSHARED) s += "s";
113 if (cap & CEPH_CAP_GEXCL) s += "x";
114 if (cap & CEPH_CAP_GCACHE) s += "c";
115 if (cap & CEPH_CAP_GRD) s += "r";
116 if (cap & CEPH_CAP_GWR) s += "w";
117 if (cap & CEPH_CAP_GBUFFER) s += "b";
118 if (cap & CEPH_CAP_GWREXTEND) s += "a";
119 if (cap & CEPH_CAP_GLAZYIO) s += "l";
120 return s;
121 }
122 inline string ccap_string(int cap)
123 {
124 string s;
125 if (cap & CEPH_CAP_PIN) s += "p";
126
127 int a = (cap >> CEPH_CAP_SAUTH) & 3;
128 if (a) s += 'A' + gcap_string(a);
129
130 a = (cap >> CEPH_CAP_SLINK) & 3;
131 if (a) s += 'L' + gcap_string(a);
132
133 a = (cap >> CEPH_CAP_SXATTR) & 3;
134 if (a) s += 'X' + gcap_string(a);
135
136 a = cap >> CEPH_CAP_SFILE;
137 if (a) s += 'F' + gcap_string(a);
138
139 if (s.length() == 0)
140 s = "-";
141 return s;
142 }
143
144 struct scatter_info_t {
145 version_t version = 0;
146 };
147
148 struct frag_info_t : public scatter_info_t {
149 int64_t size() const { return nfiles + nsubdirs; }
150
151 void zero() {
152 *this = frag_info_t();
153 }
154
155 // *this += cur - acc;
156 void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
157 if (cur.mtime > mtime) {
158 mtime = cur.mtime;
159 if (touched_mtime)
160 *touched_mtime = true;
161 }
162 if (cur.change_attr > change_attr) {
163 change_attr = cur.change_attr;
164 if (touched_chattr)
165 *touched_chattr = true;
166 }
167 nfiles += cur.nfiles - acc.nfiles;
168 nsubdirs += cur.nsubdirs - acc.nsubdirs;
169 }
170
171 void add(const frag_info_t& other) {
172 if (other.mtime > mtime)
173 mtime = other.mtime;
174 if (other.change_attr > change_attr)
175 change_attr = other.change_attr;
176 nfiles += other.nfiles;
177 nsubdirs += other.nsubdirs;
178 }
179
180 bool same_sums(const frag_info_t &o) const {
181 return mtime <= o.mtime &&
182 nfiles == o.nfiles &&
183 nsubdirs == o.nsubdirs;
184 }
185
186 void encode(bufferlist &bl) const;
187 void decode(bufferlist::const_iterator& bl);
188 void dump(Formatter *f) const;
189 static void generate_test_instances(std::list<frag_info_t*>& ls);
190
191 // this frag
192 utime_t mtime;
193 uint64_t change_attr = 0;
194 int64_t nfiles = 0; // files
195 int64_t nsubdirs = 0; // subdirs
196 };
197 WRITE_CLASS_ENCODER(frag_info_t)
198
199 inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
200 return memcmp(&l, &r, sizeof(l)) == 0;
201 }
202 inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
203 return !(l == r);
204 }
205
206 std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
207
208
209 struct nest_info_t : public scatter_info_t {
210 int64_t rsize() const { return rfiles + rsubdirs; }
211
212 void zero() {
213 *this = nest_info_t();
214 }
215
216 void sub(const nest_info_t &other) {
217 add(other, -1);
218 }
219 void add(const nest_info_t &other, int fac=1) {
220 if (other.rctime > rctime)
221 rctime = other.rctime;
222 rbytes += fac*other.rbytes;
223 rfiles += fac*other.rfiles;
224 rsubdirs += fac*other.rsubdirs;
225 rsnaps += fac*other.rsnaps;
226 }
227
228 // *this += cur - acc;
229 void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
230 if (cur.rctime > rctime)
231 rctime = cur.rctime;
232 rbytes += cur.rbytes - acc.rbytes;
233 rfiles += cur.rfiles - acc.rfiles;
234 rsubdirs += cur.rsubdirs - acc.rsubdirs;
235 rsnaps += cur.rsnaps - acc.rsnaps;
236 }
237
238 bool same_sums(const nest_info_t &o) const {
239 return rctime <= o.rctime &&
240 rbytes == o.rbytes &&
241 rfiles == o.rfiles &&
242 rsubdirs == o.rsubdirs &&
243 rsnaps == o.rsnaps;
244 }
245
246 void encode(bufferlist &bl) const;
247 void decode(bufferlist::const_iterator& bl);
248 void dump(Formatter *f) const;
249 static void generate_test_instances(std::list<nest_info_t*>& ls);
250
251 // this frag + children
252 utime_t rctime;
253 int64_t rbytes = 0;
254 int64_t rfiles = 0;
255 int64_t rsubdirs = 0;
256 int64_t rsnaps = 0;
257 };
258 WRITE_CLASS_ENCODER(nest_info_t)
259
260 inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
261 return memcmp(&l, &r, sizeof(l)) == 0;
262 }
263 inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
264 return !(l == r);
265 }
266
267 std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
268
269 struct vinodeno_t {
270 vinodeno_t() {}
271 vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
272
273 void encode(bufferlist& bl) const {
274 using ceph::encode;
275 encode(ino, bl);
276 encode(snapid, bl);
277 }
278 void decode(bufferlist::const_iterator& p) {
279 using ceph::decode;
280 decode(ino, p);
281 decode(snapid, p);
282 }
283
284 inodeno_t ino;
285 snapid_t snapid;
286 };
287 WRITE_CLASS_ENCODER(vinodeno_t)
288
289 inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
290 return l.ino == r.ino && l.snapid == r.snapid;
291 }
292 inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
293 return !(l == r);
294 }
295 inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
296 return
297 l.ino < r.ino ||
298 (l.ino == r.ino && l.snapid < r.snapid);
299 }
300
301 struct quota_info_t
302 {
303 void encode(bufferlist& bl) const {
304 ENCODE_START(1, 1, bl);
305 encode(max_bytes, bl);
306 encode(max_files, bl);
307 ENCODE_FINISH(bl);
308 }
309 void decode(bufferlist::const_iterator& p) {
310 DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
311 decode(max_bytes, p);
312 decode(max_files, p);
313 DECODE_FINISH(p);
314 }
315
316 void dump(Formatter *f) const;
317 static void generate_test_instances(std::list<quota_info_t *>& ls);
318
319 bool is_valid() const {
320 return max_bytes >=0 && max_files >=0;
321 }
322 bool is_enable() const {
323 return max_bytes || max_files;
324 }
325
326 int64_t max_bytes = 0;
327 int64_t max_files = 0;
328 };
329 WRITE_CLASS_ENCODER(quota_info_t)
330
331 inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
332 return memcmp(&l, &r, sizeof(l)) == 0;
333 }
334
335 ostream& operator<<(ostream &out, const quota_info_t &n);
336
337 namespace std {
338 template<> struct hash<vinodeno_t> {
339 size_t operator()(const vinodeno_t &vino) const {
340 hash<inodeno_t> H;
341 hash<uint64_t> I;
342 return H(vino.ino) ^ I(vino.snapid);
343 }
344 };
345 }
346
347 inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) {
348 out << vino.ino;
349 if (vino.snapid == CEPH_NOSNAP)
350 out << ".head";
351 else if (vino.snapid)
352 out << '.' << vino.snapid;
353 return out;
354 }
355
356 struct client_writeable_range_t {
357 struct byte_range_t {
358 uint64_t first = 0, last = 0; // interval client can write to
359 };
360
361 void encode(bufferlist &bl) const;
362 void decode(bufferlist::const_iterator& bl);
363 void dump(Formatter *f) const;
364 static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
365
366 byte_range_t range;
367 snapid_t follows = 0; // aka "data+metadata flushed thru"
368 };
369
370 inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::const_iterator& bl) {
371 decode(range.first, bl);
372 decode(range.last, bl);
373 }
374
375 WRITE_CLASS_ENCODER(client_writeable_range_t)
376
377 std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
378
379 inline bool operator==(const client_writeable_range_t& l,
380 const client_writeable_range_t& r) {
381 return l.range.first == r.range.first && l.range.last == r.range.last &&
382 l.follows == r.follows;
383 }
384
385 struct inline_data_t {
386 public:
387 inline_data_t() {}
388 inline_data_t(const inline_data_t& o) : version(o.version) {
389 if (o.blp)
390 get_data() = *o.blp;
391 }
392 inline_data_t& operator=(const inline_data_t& o) {
393 version = o.version;
394 if (o.blp)
395 get_data() = *o.blp;
396 else
397 free_data();
398 return *this;
399 }
400
401 void free_data() {
402 blp.reset();
403 }
404 bufferlist& get_data() {
405 if (!blp)
406 blp.reset(new bufferlist);
407 return *blp;
408 }
409 size_t length() const { return blp ? blp->length() : 0; }
410
411 bool operator==(const inline_data_t& o) const {
412 return length() == o.length() &&
413 (length() == 0 ||
414 (*const_cast<bufferlist*>(blp.get()) == *const_cast<bufferlist*>(o.blp.get())));
415 }
416 bool operator!=(const inline_data_t& o) const {
417 return !(*this == o);
418 }
419 void encode(bufferlist &bl) const;
420 void decode(bufferlist::const_iterator& bl);
421
422 version_t version = 1;
423
424 private:
425 std::unique_ptr<bufferlist> blp;
426 };
427 WRITE_CLASS_ENCODER(inline_data_t)
428
// Kinds of damage; values are consecutive starting at 0.
enum {
  DAMAGE_STATS = 0,    // statistics (dirstat, size, etc)
  DAMAGE_RSTATS = 1,   // recursive statistics (rstat, accounted_rstat)
  DAMAGE_FRAGTREE = 2  // fragtree -- repair by searching
};
using damage_flags_t = uint32_t;
435
436 template<template<typename> class Allocator = std::allocator>
437 struct inode_t {
438 /**
439 * ***************
440 * Do not forget to add any new fields to the compare() function.
441 * ***************
442 */
443 using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
444
445 inode_t()
446 {
447 clear_layout();
448 }
449
450 // file type
451 bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
452 bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; }
453 bool is_file() const { return (mode & S_IFMT) == S_IFREG; }
454
455 bool is_truncating() const { return (truncate_pending > 0); }
456 void truncate(uint64_t old_size, uint64_t new_size) {
457 ceph_assert(new_size < old_size);
458 if (old_size > max_size_ever)
459 max_size_ever = old_size;
460 truncate_from = old_size;
461 size = new_size;
462 rstat.rbytes = new_size;
463 truncate_size = size;
464 truncate_seq++;
465 truncate_pending++;
466 }
467
468 bool has_layout() const {
469 return layout != file_layout_t();
470 }
471
472 void clear_layout() {
473 layout = file_layout_t();
474 }
475
476 uint64_t get_layout_size_increment() const {
477 return layout.get_period();
478 }
479
480 bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
481
482 uint64_t get_client_range(client_t client) const {
483 auto it = client_ranges.find(client);
484 return it != client_ranges.end() ? it->second.range.last : 0;
485 }
486
487 uint64_t get_max_size() const {
488 uint64_t max = 0;
489 for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin();
490 p != client_ranges.end();
491 ++p)
492 if (p->second.range.last > max)
493 max = p->second.range.last;
494 return max;
495 }
496 void set_max_size(uint64_t new_max) {
497 if (new_max == 0) {
498 client_ranges.clear();
499 } else {
500 for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin();
501 p != client_ranges.end();
502 ++p)
503 p->second.range.last = new_max;
504 }
505 }
506
507 void trim_client_ranges(snapid_t last) {
508 std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin();
509 while (p != client_ranges.end()) {
510 if (p->second.follows >= last)
511 client_ranges.erase(p++);
512 else
513 ++p;
514 }
515 }
516
517 bool is_backtrace_updated() const {
518 return backtrace_version == version;
519 }
520 void update_backtrace(version_t pv=0) {
521 backtrace_version = pv ? pv : version;
522 }
523
524 void add_old_pool(int64_t l) {
525 backtrace_version = version;
526 old_pools.insert(l);
527 }
528
529 void encode(bufferlist &bl, uint64_t features) const;
530 void decode(bufferlist::const_iterator& bl);
531 void dump(Formatter *f) const;
532 static void generate_test_instances(std::list<inode_t*>& ls);
533 /**
534 * Compare this inode_t with another that represent *the same inode*
535 * at different points in time.
536 * @pre The inodes are the same ino
537 *
538 * @param other The inode_t to compare ourselves with
539 * @param divergent A bool pointer which will be set to true
540 * if the values are different in a way that can't be explained
541 * by one being a newer version than the other.
542 *
543 * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
544 */
545 int compare(const inode_t &other, bool *divergent) const;
546
547 // base (immutable)
548 inodeno_t ino = 0;
549 uint32_t rdev = 0; // if special file
550
551 // affected by any inode change...
552 utime_t ctime; // inode change time
553 utime_t btime; // birth time
554
555 // perm (namespace permissions)
556 uint32_t mode = 0;
557 uid_t uid = 0;
558 gid_t gid = 0;
559
560 // nlink
561 int32_t nlink = 0;
562
563 // file (data access)
564 ceph_dir_layout dir_layout = {}; // [dir only]
565 file_layout_t layout;
566 compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
567 uint64_t size = 0; // on directory, # dentries
568 uint64_t max_size_ever = 0; // max size the file has ever been
569 uint32_t truncate_seq = 0;
570 uint64_t truncate_size = 0, truncate_from = 0;
571 uint32_t truncate_pending = 0;
572 utime_t mtime; // file data modify time.
573 utime_t atime; // file data access time.
574 uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes())
575 inline_data_t inline_data; // FIXME check
576
577 // change attribute
578 uint64_t change_attr = 0;
579
580 client_range_map client_ranges; // client(s) can write to these ranges
581
582 // dirfrag, recursive accountin
583 frag_info_t dirstat; // protected by my filelock
584 nest_info_t rstat; // protected by my nestlock
585 nest_info_t accounted_rstat; // protected by parent's nestlock
586
587 quota_info_t quota;
588
589 mds_rank_t export_pin = MDS_RANK_NONE;
590
591 double export_ephemeral_random_pin = 0;
592 bool export_ephemeral_distributed_pin = false;
593
594 // special stuff
595 version_t version = 0; // auth only
596 version_t file_data_version = 0; // auth only
597 version_t xattr_version = 0;
598
599 utime_t last_scrub_stamp; // start time of last complete scrub
600 version_t last_scrub_version = 0;// (parent) start version of last complete scrub
601
602 version_t backtrace_version = 0;
603
604 snapid_t oldest_snap;
605
606 std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
607
608 private:
609 bool older_is_consistent(const inode_t &other) const;
610 };
611
612 // These methods may be moved back to mdstypes.cc when we have pmr
613 template<template<typename> class Allocator>
614 void inode_t<Allocator>::encode(bufferlist &bl, uint64_t features) const
615 {
616 ENCODE_START(16, 6, bl);
617
618 encode(ino, bl);
619 encode(rdev, bl);
620 encode(ctime, bl);
621
622 encode(mode, bl);
623 encode(uid, bl);
624 encode(gid, bl);
625
626 encode(nlink, bl);
627 {
628 // removed field
629 bool anchored = 0;
630 encode(anchored, bl);
631 }
632
633 encode(dir_layout, bl);
634 encode(layout, bl, features);
635 encode(size, bl);
636 encode(truncate_seq, bl);
637 encode(truncate_size, bl);
638 encode(truncate_from, bl);
639 encode(truncate_pending, bl);
640 encode(mtime, bl);
641 encode(atime, bl);
642 encode(time_warp_seq, bl);
643 encode(client_ranges, bl);
644
645 encode(dirstat, bl);
646 encode(rstat, bl);
647 encode(accounted_rstat, bl);
648
649 encode(version, bl);
650 encode(file_data_version, bl);
651 encode(xattr_version, bl);
652 encode(backtrace_version, bl);
653 encode(old_pools, bl);
654 encode(max_size_ever, bl);
655 encode(inline_data, bl);
656 encode(quota, bl);
657
658 encode(stray_prior_path, bl);
659
660 encode(last_scrub_version, bl);
661 encode(last_scrub_stamp, bl);
662
663 encode(btime, bl);
664 encode(change_attr, bl);
665
666 encode(export_pin, bl);
667
668 encode(export_ephemeral_random_pin, bl);
669 encode(export_ephemeral_distributed_pin, bl);
670
671 ENCODE_FINISH(bl);
672 }
673
674 template<template<typename> class Allocator>
675 void inode_t<Allocator>::decode(bufferlist::const_iterator &p)
676 {
677 DECODE_START_LEGACY_COMPAT_LEN(16, 6, 6, p);
678
679 decode(ino, p);
680 decode(rdev, p);
681 decode(ctime, p);
682
683 decode(mode, p);
684 decode(uid, p);
685 decode(gid, p);
686
687 decode(nlink, p);
688 {
689 bool anchored;
690 decode(anchored, p);
691 }
692
693 if (struct_v >= 4)
694 decode(dir_layout, p);
695 else {
696 // FIPS zeroization audit 20191117: this memset is not security related.
697 memset(&dir_layout, 0, sizeof(dir_layout));
698 }
699 decode(layout, p);
700 decode(size, p);
701 decode(truncate_seq, p);
702 decode(truncate_size, p);
703 decode(truncate_from, p);
704 if (struct_v >= 5)
705 decode(truncate_pending, p);
706 else
707 truncate_pending = 0;
708 decode(mtime, p);
709 decode(atime, p);
710 decode(time_warp_seq, p);
711 if (struct_v >= 3) {
712 decode(client_ranges, p);
713 } else {
714 map<client_t, client_writeable_range_t::byte_range_t> m;
715 decode(m, p);
716 for (map<client_t, client_writeable_range_t::byte_range_t>::iterator
717 q = m.begin(); q != m.end(); ++q)
718 client_ranges[q->first].range = q->second;
719 }
720
721 decode(dirstat, p);
722 decode(rstat, p);
723 decode(accounted_rstat, p);
724
725 decode(version, p);
726 decode(file_data_version, p);
727 decode(xattr_version, p);
728 if (struct_v >= 2)
729 decode(backtrace_version, p);
730 if (struct_v >= 7)
731 decode(old_pools, p);
732 if (struct_v >= 8)
733 decode(max_size_ever, p);
734 if (struct_v >= 9) {
735 decode(inline_data, p);
736 } else {
737 inline_data.version = CEPH_INLINE_NONE;
738 }
739 if (struct_v < 10)
740 backtrace_version = 0; // force update backtrace
741 if (struct_v >= 11)
742 decode(quota, p);
743
744 if (struct_v >= 12) {
745 std::string tmp;
746 decode(tmp, p);
747 stray_prior_path = std::string_view(tmp);
748 }
749
750 if (struct_v >= 13) {
751 decode(last_scrub_version, p);
752 decode(last_scrub_stamp, p);
753 }
754 if (struct_v >= 14) {
755 decode(btime, p);
756 decode(change_attr, p);
757 } else {
758 btime = utime_t();
759 change_attr = 0;
760 }
761
762 if (struct_v >= 15) {
763 decode(export_pin, p);
764 } else {
765 export_pin = MDS_RANK_NONE;
766 }
767
768 if (struct_v >= 16) {
769 decode(export_ephemeral_random_pin, p);
770 decode(export_ephemeral_distributed_pin, p);
771 } else {
772 export_ephemeral_random_pin = 0;
773 export_ephemeral_distributed_pin = false;
774 }
775
776 DECODE_FINISH(p);
777 }
778
779 template<template<typename> class Allocator>
780 void inode_t<Allocator>::dump(Formatter *f) const
781 {
782 f->dump_unsigned("ino", ino);
783 f->dump_unsigned("rdev", rdev);
784 f->dump_stream("ctime") << ctime;
785 f->dump_stream("btime") << btime;
786 f->dump_unsigned("mode", mode);
787 f->dump_unsigned("uid", uid);
788 f->dump_unsigned("gid", gid);
789 f->dump_unsigned("nlink", nlink);
790
791 f->open_object_section("dir_layout");
792 ::dump(dir_layout, f);
793 f->close_section();
794
795 f->dump_object("layout", layout);
796
797 f->open_array_section("old_pools");
798 for (const auto &p : old_pools) {
799 f->dump_int("pool", p);
800 }
801 f->close_section();
802
803 f->dump_unsigned("size", size);
804 f->dump_unsigned("truncate_seq", truncate_seq);
805 f->dump_unsigned("truncate_size", truncate_size);
806 f->dump_unsigned("truncate_from", truncate_from);
807 f->dump_unsigned("truncate_pending", truncate_pending);
808 f->dump_stream("mtime") << mtime;
809 f->dump_stream("atime") << atime;
810 f->dump_unsigned("time_warp_seq", time_warp_seq);
811 f->dump_unsigned("change_attr", change_attr);
812 f->dump_int("export_pin", export_pin);
813 f->dump_int("export_ephemeral_random_pin", export_ephemeral_random_pin);
814 f->dump_bool("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin);
815
816 f->open_array_section("client_ranges");
817 for (const auto &p : client_ranges) {
818 f->open_object_section("client");
819 f->dump_unsigned("client", p.first.v);
820 p.second.dump(f);
821 f->close_section();
822 }
823 f->close_section();
824
825 f->open_object_section("dirstat");
826 dirstat.dump(f);
827 f->close_section();
828
829 f->open_object_section("rstat");
830 rstat.dump(f);
831 f->close_section();
832
833 f->open_object_section("accounted_rstat");
834 accounted_rstat.dump(f);
835 f->close_section();
836
837 f->dump_unsigned("version", version);
838 f->dump_unsigned("file_data_version", file_data_version);
839 f->dump_unsigned("xattr_version", xattr_version);
840 f->dump_unsigned("backtrace_version", backtrace_version);
841
842 f->dump_string("stray_prior_path", stray_prior_path);
843 f->dump_unsigned("max_size_ever", max_size_ever);
844
845 f->open_object_section("quota");
846 quota.dump(f);
847 f->close_section();
848
849 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
850 f->dump_unsigned("last_scrub_version", last_scrub_version);
851 }
852
853 template<template<typename> class Allocator>
854 void inode_t<Allocator>::generate_test_instances(std::list<inode_t*>& ls)
855 {
856 ls.push_back(new inode_t<Allocator>);
857 ls.push_back(new inode_t<Allocator>);
858 ls.back()->ino = 1;
859 // i am lazy.
860 }
861
862 template<template<typename> class Allocator>
863 int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
864 {
865 ceph_assert(ino == other.ino);
866 *divergent = false;
867 if (version == other.version) {
868 if (rdev != other.rdev ||
869 ctime != other.ctime ||
870 btime != other.btime ||
871 mode != other.mode ||
872 uid != other.uid ||
873 gid != other.gid ||
874 nlink != other.nlink ||
875 memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
876 layout != other.layout ||
877 old_pools != other.old_pools ||
878 size != other.size ||
879 max_size_ever != other.max_size_ever ||
880 truncate_seq != other.truncate_seq ||
881 truncate_size != other.truncate_size ||
882 truncate_from != other.truncate_from ||
883 truncate_pending != other.truncate_pending ||
884 change_attr != other.change_attr ||
885 mtime != other.mtime ||
886 atime != other.atime ||
887 time_warp_seq != other.time_warp_seq ||
888 inline_data != other.inline_data ||
889 client_ranges != other.client_ranges ||
890 !(dirstat == other.dirstat) ||
891 !(rstat == other.rstat) ||
892 !(accounted_rstat == other.accounted_rstat) ||
893 file_data_version != other.file_data_version ||
894 xattr_version != other.xattr_version ||
895 backtrace_version != other.backtrace_version) {
896 *divergent = true;
897 }
898 return 0;
899 } else if (version > other.version) {
900 *divergent = !older_is_consistent(other);
901 return 1;
902 } else {
903 ceph_assert(version < other.version);
904 *divergent = !other.older_is_consistent(*this);
905 return -1;
906 }
907 }
908
909 template<template<typename> class Allocator>
910 bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
911 {
912 if (max_size_ever < other.max_size_ever ||
913 truncate_seq < other.truncate_seq ||
914 time_warp_seq < other.time_warp_seq ||
915 inline_data.version < other.inline_data.version ||
916 dirstat.version < other.dirstat.version ||
917 rstat.version < other.rstat.version ||
918 accounted_rstat.version < other.accounted_rstat.version ||
919 file_data_version < other.file_data_version ||
920 xattr_version < other.xattr_version ||
921 backtrace_version < other.backtrace_version) {
922 return false;
923 }
924 return true;
925 }
926
927 template<template<typename> class Allocator>
928 inline void encode(const inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
929 {
930 ENCODE_DUMP_PRE();
931 c.encode(bl, features);
932 ENCODE_DUMP_POST(cl);
933 }
934 template<template<typename> class Allocator>
935 inline void decode(inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
936 {
937 c.decode(p);
938 }
939
940 template<template<typename> class Allocator>
941 using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>;
942
943 template<template<typename> class Allocator>
944 using xattr_map = compact_map<alloc_string<Allocator>, bufferptr, std::less<alloc_string<Allocator>>, Allocator<std::pair<const alloc_string<Allocator>, bufferptr>>>; // FIXME bufferptr not in mempool
945
946 template<template<typename> class Allocator>
947 inline void decode_noshare(xattr_map<Allocator>& xattrs, ceph::buffer::list::const_iterator &p)
948 {
949 __u32 n;
950 decode(n, p);
951 while (n-- > 0) {
952 alloc_string<Allocator> key;
953 decode(key, p);
954 __u32 len;
955 decode(len, p);
956 p.copy_deep(len, xattrs[key]);
957 }
958 }
959
960 template<template<typename> class Allocator = std::allocator>
961 struct old_inode_t {
962 snapid_t first;
963 inode_t<Allocator> inode;
964 xattr_map<Allocator> xattrs;
965
966 void encode(bufferlist &bl, uint64_t features) const;
967 void decode(bufferlist::const_iterator& bl);
968 void dump(Formatter *f) const;
969 static void generate_test_instances(std::list<old_inode_t*>& ls);
970 };
971
972 // These methods may be moved back to mdstypes.cc when we have pmr
973 template<template<typename> class Allocator>
974 void old_inode_t<Allocator>::encode(bufferlist& bl, uint64_t features) const
975 {
976 ENCODE_START(2, 2, bl);
977 encode(first, bl);
978 encode(inode, bl, features);
979 encode(xattrs, bl);
980 ENCODE_FINISH(bl);
981 }
982
983 template<template<typename> class Allocator>
984 void old_inode_t<Allocator>::decode(bufferlist::const_iterator& bl)
985 {
986 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
987 decode(first, bl);
988 decode(inode, bl);
989 decode_noshare<Allocator>(xattrs, bl);
990 DECODE_FINISH(bl);
991 }
992
993 template<template<typename> class Allocator>
994 void old_inode_t<Allocator>::dump(Formatter *f) const
995 {
996 f->dump_unsigned("first", first);
997 inode.dump(f);
998 f->open_object_section("xattrs");
999 for (const auto &p : xattrs) {
1000 std::string v(p.second.c_str(), p.second.length());
1001 f->dump_string(p.first.c_str(), v);
1002 }
1003 f->close_section();
1004 }
1005
1006 template<template<typename> class Allocator>
1007 void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls)
1008 {
1009 ls.push_back(new old_inode_t<Allocator>);
1010 ls.push_back(new old_inode_t<Allocator>);
1011 ls.back()->first = 2;
1012 std::list<inode_t<Allocator>*> ils;
1013 inode_t<Allocator>::generate_test_instances(ils);
1014 ls.back()->inode = *ils.back();
1015 ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4);
1016 ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3);
1017 }
1018
1019 template<template<typename> class Allocator>
1020 inline void encode(const old_inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
1021 {
1022 ENCODE_DUMP_PRE();
1023 c.encode(bl, features);
1024 ENCODE_DUMP_POST(cl);
1025 }
1026 template<template<typename> class Allocator>
1027 inline void decode(old_inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
1028 {
1029 c.decode(p);
1030 }
1031
1032 /*
1033 * like an inode, but for a dir frag
1034 */
1035 struct fnode_t {
1036 void encode(bufferlist &bl) const;
1037 void decode(bufferlist::const_iterator& bl);
1038 void dump(Formatter *f) const;
1039 static void generate_test_instances(std::list<fnode_t*>& ls);
1040
1041 version_t version = 0;
1042 snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru
1043 frag_info_t fragstat, accounted_fragstat;
1044 nest_info_t rstat, accounted_rstat;
1045 damage_flags_t damage_flags = 0;
1046
1047 // we know we and all our descendants have been scrubbed since this version
1048 version_t recursive_scrub_version = 0;
1049 utime_t recursive_scrub_stamp;
1050 // version at which we last scrubbed our personal data structures
1051 version_t localized_scrub_version = 0;
1052 utime_t localized_scrub_stamp;
1053 };
1054 WRITE_CLASS_ENCODER(fnode_t)
1055
1056
1057 struct old_rstat_t {
1058 void encode(bufferlist& bl) const;
1059 void decode(bufferlist::const_iterator& p);
1060 void dump(Formatter *f) const;
1061 static void generate_test_instances(std::list<old_rstat_t*>& ls);
1062
1063 snapid_t first;
1064 nest_info_t rstat, accounted_rstat;
1065 };
1066 WRITE_CLASS_ENCODER(old_rstat_t)
1067
1068 inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
1069 return out << "old_rstat(first " << o.first << " " << o.rstat << " " << o.accounted_rstat << ")";
1070 }
1071
1072 class feature_bitset_t {
1073 public:
1074 typedef uint64_t block_type;
1075 static const size_t bits_per_block = sizeof(block_type) * 8;
1076
1077 feature_bitset_t(const feature_bitset_t& other) : _vec(other._vec) {}
1078 feature_bitset_t(feature_bitset_t&& other) : _vec(std::move(other._vec)) {}
1079 feature_bitset_t(unsigned long value = 0);
1080 feature_bitset_t(const vector<size_t>& array);
1081 feature_bitset_t& operator=(const feature_bitset_t& other) {
1082 _vec = other._vec;
1083 return *this;
1084 }
1085 feature_bitset_t& operator=(feature_bitset_t&& other) {
1086 _vec = std::move(other._vec);
1087 return *this;
1088 }
1089 feature_bitset_t& operator-=(const feature_bitset_t& other);
1090 bool empty() const {
1091 //block_type is a uint64_t. If the vector is only composed of 0s, then it's still "empty"
1092 for (auto& v : _vec) {
1093 if (v)
1094 return false;
1095 }
1096 return true;
1097 }
1098 bool test(size_t bit) const {
1099 if (bit >= bits_per_block * _vec.size())
1100 return false;
1101 return _vec[bit / bits_per_block] & ((block_type)1 << (bit % bits_per_block));
1102 }
1103 void clear() {
1104 _vec.clear();
1105 }
1106 void encode(bufferlist& bl) const;
1107 void decode(bufferlist::const_iterator &p);
1108 void dump(Formatter *f) const;
1109 void print(ostream& out) const;
1110 private:
1111 vector<block_type> _vec;
1112 };
1113 WRITE_CLASS_ENCODER(feature_bitset_t)
1114
1115 inline std::ostream& operator<<(std::ostream& out, const feature_bitset_t& s) {
1116 s.print(out);
1117 return out;
1118 }
1119
1120 struct metric_spec_t {
1121 metric_spec_t() {}
1122 metric_spec_t(const metric_spec_t& other) :
1123 metric_flags(other.metric_flags) {}
1124 metric_spec_t(metric_spec_t&& other) :
1125 metric_flags(std::move(other.metric_flags)) {}
1126 metric_spec_t(const feature_bitset_t& mf) :
1127 metric_flags(mf) {}
1128 metric_spec_t(feature_bitset_t&& mf) :
1129 metric_flags(std::move(mf)) {}
1130
1131 metric_spec_t& operator=(const metric_spec_t& other) {
1132 metric_flags = other.metric_flags;
1133 return *this;
1134 }
1135 metric_spec_t& operator=(metric_spec_t&& other) {
1136 metric_flags = std::move(other.metric_flags);
1137 return *this;
1138 }
1139
1140 bool empty() const {
1141 return metric_flags.empty();
1142 }
1143
1144 void clear() {
1145 metric_flags.clear();
1146 }
1147
1148 void encode(bufferlist& bl) const;
1149 void decode(bufferlist::const_iterator& p);
1150 void dump(Formatter *f) const;
1151 void print(ostream& out) const;
1152
1153 // set of metrics that a client is capable of forwarding
1154 feature_bitset_t metric_flags;
1155 };
1156 WRITE_CLASS_ENCODER(metric_spec_t)
1157
1158 inline std::ostream& operator<<(std::ostream& out, const metric_spec_t& mst) {
1159 mst.print(out);
1160 return out;
1161 }
1162
1163 /*
1164 * client_metadata_t
1165 */
1166 struct client_metadata_t {
1167 using kv_map_t = std::map<std::string,std::string>;
1168 using iterator = kv_map_t::const_iterator;
1169
1170 client_metadata_t() {}
1171 client_metadata_t(const kv_map_t& kv, const feature_bitset_t &f, const metric_spec_t &mst) :
1172 kv_map(kv),
1173 features(f),
1174 metric_spec(mst) {}
1175 client_metadata_t& operator=(const client_metadata_t& other) {
1176 kv_map = other.kv_map;
1177 features = other.features;
1178 metric_spec = other.metric_spec;
1179 return *this;
1180 }
1181
1182 bool empty() const { return kv_map.empty() && features.empty() && metric_spec.empty(); }
1183 iterator find(const std::string& key) const { return kv_map.find(key); }
1184 iterator begin() const { return kv_map.begin(); }
1185 iterator end() const { return kv_map.end(); }
1186 void erase(iterator it) { kv_map.erase(it); }
1187 std::string& operator[](const std::string& key) { return kv_map[key]; }
1188 void merge(const client_metadata_t& other) {
1189 kv_map.insert(other.kv_map.begin(), other.kv_map.end());
1190 features = other.features;
1191 metric_spec = other.metric_spec;
1192 }
1193 void clear() {
1194 kv_map.clear();
1195 features.clear();
1196 metric_spec.clear();
1197 }
1198
1199 void encode(bufferlist& bl) const;
1200 void decode(bufferlist::const_iterator& p);
1201 void dump(Formatter *f) const;
1202
1203 kv_map_t kv_map;
1204 feature_bitset_t features;
1205 metric_spec_t metric_spec;
1206 };
1207 WRITE_CLASS_ENCODER(client_metadata_t)
1208
1209 /*
1210 * session_info_t - durable part of a Session
1211 */
1212 struct session_info_t {
1213 client_t get_client() const { return client_t(inst.name.num()); }
1214 bool has_feature(size_t bit) const { return client_metadata.features.test(bit); }
1215 const entity_name_t& get_source() const { return inst.name; }
1216
1217 void clear_meta() {
1218 prealloc_inos.clear();
1219 used_inos.clear();
1220 completed_requests.clear();
1221 completed_flushes.clear();
1222 client_metadata.clear();
1223 }
1224
1225 void encode(bufferlist& bl, uint64_t features) const;
1226 void decode(bufferlist::const_iterator& p);
1227 void dump(Formatter *f) const;
1228 static void generate_test_instances(std::list<session_info_t*>& ls);
1229
1230 entity_inst_t inst;
1231 std::map<ceph_tid_t,inodeno_t> completed_requests;
1232 interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use.
1233 interval_set<inodeno_t> used_inos; // journaling use
1234 client_metadata_t client_metadata;
1235 std::set<ceph_tid_t> completed_flushes;
1236 EntityName auth_name;
1237 };
1238 WRITE_CLASS_ENCODER_FEATURES(session_info_t)
1239
1240 // dentries
1241 struct dentry_key_t {
1242 dentry_key_t() {}
1243 dentry_key_t(snapid_t s, std::string_view n, __u32 h=0) :
1244 snapid(s), name(n), hash(h) {}
1245
1246 bool is_valid() { return name.length() || snapid; }
1247
1248 // encode into something that can be decoded as a string.
1249 // name_ (head) or name_%x (!head)
1250 void encode(bufferlist& bl) const {
1251 string key;
1252 encode(key);
1253 using ceph::encode;
1254 encode(key, bl);
1255 }
1256 void encode(string& key) const {
1257 char b[20];
1258 if (snapid != CEPH_NOSNAP) {
1259 uint64_t val(snapid);
1260 snprintf(b, sizeof(b), "%" PRIx64, val);
1261 } else {
1262 snprintf(b, sizeof(b), "%s", "head");
1263 }
1264 ostringstream oss;
1265 oss << name << "_" << b;
1266 key = oss.str();
1267 }
1268 static void decode_helper(bufferlist::const_iterator& bl, string& nm, snapid_t& sn) {
1269 string key;
1270 decode(key, bl);
1271 decode_helper(key, nm, sn);
1272 }
1273 static void decode_helper(std::string_view key, string& nm, snapid_t& sn) {
1274 size_t i = key.find_last_of('_');
1275 ceph_assert(i != string::npos);
1276 if (key.compare(i+1, std::string_view::npos, "head") == 0) {
1277 // name_head
1278 sn = CEPH_NOSNAP;
1279 } else {
1280 // name_%x
1281 long long unsigned x = 0;
1282 std::string x_str(key.substr(i+1));
1283 sscanf(x_str.c_str(), "%llx", &x);
1284 sn = x;
1285 }
1286 nm = key.substr(0, i);
1287 }
1288
1289 snapid_t snapid = 0;
1290 std::string_view name;
1291 __u32 hash = 0;
1292 };
1293
1294 inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
1295 {
1296 return out << "(" << k.name << "," << k.snapid << ")";
1297 }
1298
1299 inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
1300 {
1301 /*
1302 * order by hash, name, snap
1303 */
1304 int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
1305 if (c)
1306 return c < 0;
1307 c = k1.name.compare(k2.name);
1308 if (c)
1309 return c < 0;
1310 return k1.snapid < k2.snapid;
1311 }
1312
1313 /*
1314 * string_snap_t is a simple (string, snapid_t) pair
1315 */
1316 struct string_snap_t {
1317 string_snap_t() {}
1318 string_snap_t(std::string_view n, snapid_t s) : name(n), snapid(s) {}
1319
1320 void encode(bufferlist& bl) const;
1321 void decode(bufferlist::const_iterator& p);
1322 void dump(Formatter *f) const;
1323 static void generate_test_instances(std::list<string_snap_t*>& ls);
1324
1325 string name;
1326 snapid_t snapid;
1327 };
1328 WRITE_CLASS_ENCODER(string_snap_t)
1329
1330 inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
1331 int c = l.name.compare(r.name);
1332 return c < 0 || (c == 0 && l.snapid < r.snapid);
1333 }
1334
1335 inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
1336 {
1337 return out << "(" << k.name << "," << k.snapid << ")";
1338 }
1339
1340 /*
1341 * mds_table_pending_t
1342 *
1343 * For mds's requesting any pending ops, child needs to encode the corresponding
1344 * pending mutation state in the table.
1345 */
1346 struct mds_table_pending_t {
1347 void encode(bufferlist& bl) const;
1348 void decode(bufferlist::const_iterator& bl);
1349 void dump(Formatter *f) const;
1350 static void generate_test_instances(std::list<mds_table_pending_t*>& ls);
1351
1352 uint64_t reqid = 0;
1353 __s32 mds = 0;
1354 version_t tid = 0;
1355 };
1356 WRITE_CLASS_ENCODER(mds_table_pending_t)
1357
1358 // requests
1359 struct metareqid_t {
1360 metareqid_t() {}
1361 metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
1362 void encode(bufferlist& bl) const {
1363 using ceph::encode;
1364 encode(name, bl);
1365 encode(tid, bl);
1366 }
1367 void decode(bufferlist::const_iterator &p) {
1368 using ceph::decode;
1369 decode(name, p);
1370 decode(tid, p);
1371 }
1372
1373 entity_name_t name;
1374 uint64_t tid = 0;
1375 };
1376 WRITE_CLASS_ENCODER(metareqid_t)
1377
1378 inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
1379 return out << r.name << ":" << r.tid;
1380 }
1381
1382 inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
1383 return (l.name == r.name) && (l.tid == r.tid);
1384 }
1385 inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
1386 return (l.name != r.name) || (l.tid != r.tid);
1387 }
1388 inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
1389 return (l.name < r.name) ||
1390 (l.name == r.name && l.tid < r.tid);
1391 }
1392 inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
1393 return (l.name < r.name) ||
1394 (l.name == r.name && l.tid <= r.tid);
1395 }
1396 inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); }
1397 inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); }
1398
1399 namespace std {
1400 template<> struct hash<metareqid_t> {
1401 size_t operator()(const metareqid_t &r) const {
1402 hash<uint64_t> H;
1403 return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid);
1404 }
1405 };
1406 } // namespace std
1407
1408 // cap info for client reconnect
1409 struct cap_reconnect_t {
1410 cap_reconnect_t() {}
1411 cap_reconnect_t(uint64_t cap_id, inodeno_t pino, std::string_view p, int w, int i,
1412 inodeno_t sr, snapid_t sf, bufferlist& lb) :
1413 path(p) {
1414 capinfo.cap_id = cap_id;
1415 capinfo.wanted = w;
1416 capinfo.issued = i;
1417 capinfo.snaprealm = sr;
1418 capinfo.pathbase = pino;
1419 capinfo.flock_len = 0;
1420 snap_follows = sf;
1421 flockbl.claim(lb);
1422 }
1423 void encode(bufferlist& bl) const;
1424 void decode(bufferlist::const_iterator& bl);
1425 void encode_old(bufferlist& bl) const;
1426 void decode_old(bufferlist::const_iterator& bl);
1427
1428 void dump(Formatter *f) const;
1429 static void generate_test_instances(std::list<cap_reconnect_t*>& ls);
1430
1431 string path;
1432 mutable ceph_mds_cap_reconnect capinfo = {};
1433 snapid_t snap_follows = 0;
1434 bufferlist flockbl;
1435 };
1436 WRITE_CLASS_ENCODER(cap_reconnect_t)
1437
1438 struct snaprealm_reconnect_t {
1439 snaprealm_reconnect_t() {}
1440 snaprealm_reconnect_t(inodeno_t ino, snapid_t seq, inodeno_t parent) {
1441 realm.ino = ino;
1442 realm.seq = seq;
1443 realm.parent = parent;
1444 }
1445 void encode(bufferlist& bl) const;
1446 void decode(bufferlist::const_iterator& bl);
1447 void encode_old(bufferlist& bl) const;
1448 void decode_old(bufferlist::const_iterator& bl);
1449
1450 void dump(Formatter *f) const;
1451 static void generate_test_instances(std::list<snaprealm_reconnect_t*>& ls);
1452
1453 mutable ceph_mds_snaprealm_reconnect realm = {};
1454 };
1455 WRITE_CLASS_ENCODER(snaprealm_reconnect_t)
1456
1457 // compat for pre-FLOCK feature
1458 struct old_ceph_mds_cap_reconnect {
1459 ceph_le64 cap_id;
1460 ceph_le32 wanted;
1461 ceph_le32 issued;
1462 ceph_le64 old_size;
1463 struct ceph_timespec old_mtime, old_atime;
1464 ceph_le64 snaprealm;
1465 ceph_le64 pathbase; /* base ino for our path to this ino */
1466 } __attribute__ ((packed));
1467 WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect)
1468
1469 struct old_cap_reconnect_t {
1470 const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) {
1471 path = n.path;
1472 capinfo.cap_id = n.capinfo.cap_id;
1473 capinfo.wanted = n.capinfo.wanted;
1474 capinfo.issued = n.capinfo.issued;
1475 capinfo.snaprealm = n.capinfo.snaprealm;
1476 capinfo.pathbase = n.capinfo.pathbase;
1477 return *this;
1478 }
1479 operator cap_reconnect_t() {
1480 cap_reconnect_t n;
1481 n.path = path;
1482 n.capinfo.cap_id = capinfo.cap_id;
1483 n.capinfo.wanted = capinfo.wanted;
1484 n.capinfo.issued = capinfo.issued;
1485 n.capinfo.snaprealm = capinfo.snaprealm;
1486 n.capinfo.pathbase = capinfo.pathbase;
1487 return n;
1488 }
1489
1490 void encode(bufferlist& bl) const {
1491 using ceph::encode;
1492 encode(path, bl);
1493 encode(capinfo, bl);
1494 }
1495 void decode(bufferlist::const_iterator& bl) {
1496 using ceph::decode;
1497 decode(path, bl);
1498 decode(capinfo, bl);
1499 }
1500
1501 string path;
1502 old_ceph_mds_cap_reconnect capinfo;
1503 };
1504 WRITE_CLASS_ENCODER(old_cap_reconnect_t)
1505
1506 // dir frag
1507 struct dirfrag_t {
1508 dirfrag_t() {}
1509 dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
1510
1511 void encode(bufferlist& bl) const {
1512 using ceph::encode;
1513 encode(ino, bl);
1514 encode(frag, bl);
1515 }
1516 void decode(bufferlist::const_iterator& bl) {
1517 using ceph::decode;
1518 decode(ino, bl);
1519 decode(frag, bl);
1520 }
1521
1522 inodeno_t ino = 0;
1523 frag_t frag;
1524 };
1525 WRITE_CLASS_ENCODER(dirfrag_t)
1526
1527 inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) {
1528 out << df.ino;
1529 if (!df.frag.is_root()) out << "." << df.frag;
1530 return out;
1531 }
1532 inline bool operator<(dirfrag_t l, dirfrag_t r) {
1533 if (l.ino < r.ino) return true;
1534 if (l.ino == r.ino && l.frag < r.frag) return true;
1535 return false;
1536 }
1537 inline bool operator==(dirfrag_t l, dirfrag_t r) {
1538 return l.ino == r.ino && l.frag == r.frag;
1539 }
1540
1541 namespace std {
1542 template<> struct hash<dirfrag_t> {
1543 size_t operator()(const dirfrag_t &df) const {
1544 static rjhash<uint64_t> H;
1545 static rjhash<uint32_t> I;
1546 return H(df.ino) ^ I(df.frag);
1547 }
1548 };
1549 } // namespace std
1550
1551 // ================================================================
// Indices into dirfrag_load_vec_t's counter array (see meta_load());
// META_NPOP is the number of indices.
#define META_POP_IRD     0
#define META_POP_IWR     1
#define META_POP_READDIR 2
#define META_POP_FETCH   3
#define META_POP_STORE   4
#define META_NPOP        5
1558
1559 class inode_load_vec_t {
1560 public:
1561 using time = DecayCounter::time;
1562 using clock = DecayCounter::clock;
1563 static const size_t NUM = 2;
1564
1565 inode_load_vec_t() : vec{DecayCounter(DecayRate()), DecayCounter(DecayRate())} {}
1566 inode_load_vec_t(const DecayRate &rate) : vec{DecayCounter(rate), DecayCounter(rate)} {}
1567
1568 DecayCounter &get(int t) {
1569 return vec[t];
1570 }
1571 void zero() {
1572 for (auto &d : vec) {
1573 d.reset();
1574 }
1575 }
1576 void encode(bufferlist &bl) const;
1577 void decode(bufferlist::const_iterator& p);
1578 void dump(Formatter *f) const;
1579 static void generate_test_instances(std::list<inode_load_vec_t*>& ls);
1580
1581 private:
1582 std::array<DecayCounter, NUM> vec;
1583 };
1584 inline void encode(const inode_load_vec_t &c, bufferlist &bl) {
1585 c.encode(bl);
1586 }
1587 inline void decode(inode_load_vec_t & c, bufferlist::const_iterator &p) {
1588 c.decode(p);
1589 }
1590
1591 class dirfrag_load_vec_t {
1592 public:
1593 using time = DecayCounter::time;
1594 using clock = DecayCounter::clock;
1595 static const size_t NUM = 5;
1596
1597 dirfrag_load_vec_t() :
1598 vec{DecayCounter(DecayRate()),
1599 DecayCounter(DecayRate()),
1600 DecayCounter(DecayRate()),
1601 DecayCounter(DecayRate()),
1602 DecayCounter(DecayRate())
1603 }
1604 {}
1605 dirfrag_load_vec_t(const DecayRate &rate) :
1606 vec{DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate)}
1607 {}
1608
1609 void encode(bufferlist &bl) const {
1610 ENCODE_START(2, 2, bl);
1611 for (const auto &i : vec) {
1612 encode(i, bl);
1613 }
1614 ENCODE_FINISH(bl);
1615 }
1616 void decode(bufferlist::const_iterator &p) {
1617 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
1618 for (auto &i : vec) {
1619 decode(i, p);
1620 }
1621 DECODE_FINISH(p);
1622 }
1623 void dump(Formatter *f) const;
1624 void dump(Formatter *f, const DecayRate& rate) const;
1625 static void generate_test_instances(std::list<dirfrag_load_vec_t*>& ls);
1626
1627 const DecayCounter &get(int t) const {
1628 return vec[t];
1629 }
1630 DecayCounter &get(int t) {
1631 return vec[t];
1632 }
1633 void adjust(double d) {
1634 for (auto &i : vec) {
1635 i.adjust(d);
1636 }
1637 }
1638 void zero() {
1639 for (auto &i : vec) {
1640 i.reset();
1641 }
1642 }
1643 double meta_load() const {
1644 return
1645 1*vec[META_POP_IRD].get() +
1646 2*vec[META_POP_IWR].get() +
1647 1*vec[META_POP_READDIR].get() +
1648 2*vec[META_POP_FETCH].get() +
1649 4*vec[META_POP_STORE].get();
1650 }
1651
1652 void add(dirfrag_load_vec_t& r) {
1653 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1654 vec[i].adjust(r.vec[i].get());
1655 }
1656 void sub(dirfrag_load_vec_t& r) {
1657 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1658 vec[i].adjust(-r.vec[i].get());
1659 }
1660 void scale(double f) {
1661 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1662 vec[i].scale(f);
1663 }
1664
1665 private:
1666 friend inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl);
1667 std::array<DecayCounter, NUM> vec;
1668 };
1669
1670 inline void encode(const dirfrag_load_vec_t &c, bufferlist &bl) {
1671 c.encode(bl);
1672 }
1673 inline void decode(dirfrag_load_vec_t& c, bufferlist::const_iterator &p) {
1674 c.decode(p);
1675 }
1676
1677 inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
1678 {
1679 std::ostringstream ss;
1680 ss << std::setprecision(1) << std::fixed
1681 << "[pop"
1682 " IRD:" << dl.vec[0]
1683 << " IWR:" << dl.vec[1]
1684 << " RDR:" << dl.vec[2]
1685 << " FET:" << dl.vec[3]
1686 << " STR:" << dl.vec[4]
1687 << " *LOAD:" << dl.meta_load() << "]";
1688 return out << ss.str() << std::endl;
1689 }
1690
1691 struct mds_load_t {
1692 using clock = dirfrag_load_vec_t::clock;
1693 using time = dirfrag_load_vec_t::time;
1694
1695 dirfrag_load_vec_t auth;
1696 dirfrag_load_vec_t all;
1697
1698 mds_load_t() : auth(DecayRate()), all(DecayRate()) {}
1699 mds_load_t(const DecayRate &rate) : auth(rate), all(rate) {}
1700
1701 double req_rate = 0.0;
1702 double cache_hit_rate = 0.0;
1703 double queue_len = 0.0;
1704
1705 double cpu_load_avg = 0.0;
1706
1707 double mds_load() const; // defiend in MDBalancer.cc
1708 void encode(bufferlist& bl) const;
1709 void decode(bufferlist::const_iterator& bl);
1710 void dump(Formatter *f) const;
1711 static void generate_test_instances(std::list<mds_load_t*>& ls);
1712 };
1713 inline void encode(const mds_load_t &c, bufferlist &bl) {
1714 c.encode(bl);
1715 }
1716 inline void decode(mds_load_t &c, bufferlist::const_iterator &p) {
1717 c.decode(p);
1718 }
1719
1720 inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
1721 {
1722 return out << "mdsload<" << load.auth << "/" << load.all
1723 << ", req " << load.req_rate
1724 << ", hr " << load.cache_hit_rate
1725 << ", qlen " << load.queue_len
1726 << ", cpu " << load.cpu_load_avg
1727 << ">";
1728 }
1729
1730 class load_spread_t {
1731 public:
1732 using time = DecayCounter::time;
1733 using clock = DecayCounter::clock;
1734 static const int MAX = 4;
1735
1736 load_spread_t(const DecayRate &rate) : count(rate)
1737 {}
1738
1739 load_spread_t() = delete;
1740
1741 double hit(int who) {
1742 for (int i=0; i<n; i++)
1743 if (last[i] == who)
1744 return count.get_last();
1745
1746 // we're new(ish)
1747 last[p++] = who;
1748 if (n < MAX) n++;
1749 if (n == 1) return 0.0;
1750
1751 if (p == MAX) p = 0;
1752
1753 return count.hit();
1754 }
1755 double get() const {
1756 return count.get();
1757 }
1758
1759 std::array<int, MAX> last = {-1, -1, -1, -1};
1760 int p = 0, n = 0;
1761 DecayCounter count;
1762 };
1763
1764 // ================================================================
1765 typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t;
1766
1767 // -- authority delegation --
1768 // directory authority types
1769 // >= 0 is the auth mds
1770 #define CDIR_AUTH_PARENT mds_rank_t(-1) // default
1771 #define CDIR_AUTH_UNKNOWN mds_rank_t(-2)
1772 #define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
1773 #define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
1774 //#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
1775
1776 class MDSCacheObjectInfo {
1777 public:
1778 void encode(bufferlist& bl) const;
1779 void decode(bufferlist::const_iterator& bl);
1780 void dump(Formatter *f) const;
1781 static void generate_test_instances(std::list<MDSCacheObjectInfo*>& ls);
1782
1783 inodeno_t ino = 0;
1784 dirfrag_t dirfrag;
1785 string dname;
1786 snapid_t snapid;
1787 };
1788
1789 inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) {
1790 if (info.ino) return out << info.ino << "." << info.snapid;
1791 if (info.dname.length()) return out << info.dirfrag << "/" << info.dname
1792 << " snap " << info.snapid;
1793 return out << info.dirfrag;
1794 }
1795
1796 inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) {
1797 if (l.ino || r.ino)
1798 return l.ino == r.ino && l.snapid == r.snapid;
1799 else
1800 return l.dirfrag == r.dirfrag && l.dname == r.dname;
1801 }
1802 WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
1803
1804 // parse a map of keys/values.
1805 namespace qi = boost::spirit::qi;
1806
1807 template <typename Iterator>
1808 struct keys_and_values
1809 : qi::grammar<Iterator, std::map<string, string>()>
1810 {
1811 keys_and_values()
1812 : keys_and_values::base_type(query)
1813 {
1814 query = pair >> *(qi::lit(' ') >> pair);
1815 pair = key >> '=' >> value;
1816 key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
1817 value = +qi::char_("a-zA-Z0-9-_.");
1818 }
1819 qi::rule<Iterator, std::map<string, string>()> query;
1820 qi::rule<Iterator, std::pair<string, string>()> pair;
1821 qi::rule<Iterator, string()> key, value;
1822 };
1823
1824 #endif