]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/mdstypes.h
d/control: depend on python3-yaml for ceph-mgr
[ceph.git] / ceph / src / mds / mdstypes.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3#ifndef CEPH_MDSTYPES_H
4#define CEPH_MDSTYPES_H
5
6#include "include/int_types.h"
7
8#include <math.h>
9#include <ostream>
10#include <set>
11#include <map>
11fdf7f2 12#include <string_view>
7c673cae
FG
13
14#include "common/config.h"
15#include "common/Clock.h"
16#include "common/DecayCounter.h"
17#include "common/entity_name.h"
18
19#include "include/Context.h"
20#include "include/frag.h"
21#include "include/xlist.h"
22#include "include/interval_set.h"
23#include "include/compact_map.h"
24#include "include/compact_set.h"
25#include "include/fs_types.h"
26
27#include "inode_backtrace.h"
28
29#include <boost/spirit/include/qi.hpp>
30#include <boost/pool/pool.hpp>
11fdf7f2 31#include "include/ceph_assert.h"
7c673cae
FG
32#include <boost/serialization/strong_typedef.hpp>
33
34#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"
35
36#define MDS_PORT_CACHE 0x200
37#define MDS_PORT_LOCKER 0x300
38#define MDS_PORT_MIGRATOR 0x400
39
40#define MAX_MDS 0x100
41#define NUM_STRAY 10
42
43#define MDS_INO_ROOT 1
44
45// No longer created but recognised in existing filesystems
46// so that we don't try to fragment it.
47#define MDS_INO_CEPH 2
48
11fdf7f2
TL
49#define MDS_INO_GLOBAL_SNAPREALM 3
50
7c673cae
FG
51#define MDS_INO_MDSDIR_OFFSET (1*MAX_MDS)
52#define MDS_INO_STRAY_OFFSET (6*MAX_MDS)
53
54// Locations for journal data
55#define MDS_INO_LOG_OFFSET (2*MAX_MDS)
56#define MDS_INO_LOG_BACKUP_OFFSET (3*MAX_MDS)
57#define MDS_INO_LOG_POINTER_OFFSET (4*MAX_MDS)
58#define MDS_INO_PURGE_QUEUE (5*MAX_MDS)
59
60#define MDS_INO_SYSTEM_BASE ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))
61
// Inode number of stray slot `i` owned by `x`; each owner has NUM_STRAY
// consecutive slots starting at MDS_INO_STRAY_OFFSET.
#define MDS_INO_STRAY(x,i) (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
// Inode number of the mdsdir owned by `x`.  The argument is parenthesised
// so that a compound expression is cast as a whole rather than only its
// first operand.
#define MDS_INO_MDSDIR(x) (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))
64
65#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
66#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
67#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
11fdf7f2 68#define MDS_INO_IS_BASE(i) ((i) == MDS_INO_ROOT || (i) == MDS_INO_GLOBAL_SNAPREALM || MDS_INO_IS_MDSDIR(i))
7c673cae
FG
69#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
70#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)
71
7c673cae 72typedef int32_t mds_rank_t;
11fdf7f2 73constexpr mds_rank_t MDS_RANK_NONE = -1;
7c673cae
FG
74
75BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
76extern const mds_gid_t MDS_GID_NONE;
11fdf7f2
TL
77
78typedef int32_t fs_cluster_id_t;
79constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
7c673cae 80// The namespace ID of the anonymous default filesystem from legacy systems
11fdf7f2 81constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0;
7c673cae 82
9f95a23c
TL
83class mds_role_t {
84public:
7c673cae
FG
85 mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_)
86 : fscid(fscid_), rank(rank_)
87 {}
9f95a23c
TL
88 mds_role_t() {}
89
90 bool operator<(mds_role_t const &rhs) const {
7c673cae
FG
91 if (fscid < rhs.fscid) {
92 return true;
93 } else if (fscid == rhs.fscid) {
94 return rank < rhs.rank;
95 } else {
96 return false;
97 }
98 }
99
9f95a23c 100 bool is_none() const {
7c673cae
FG
101 return (rank == MDS_RANK_NONE);
102 }
7c673cae 103
9f95a23c
TL
104 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
105 mds_rank_t rank = MDS_RANK_NONE;
106};
107inline std::ostream& operator<<(std::ostream& out, const mds_role_t& role) {
108 return out << role.fscid << ":" << role.rank;
109}
7c673cae
FG
110
111// CAPS
7c673cae
FG
112inline string gcap_string(int cap)
113{
114 string s;
115 if (cap & CEPH_CAP_GSHARED) s += "s";
116 if (cap & CEPH_CAP_GEXCL) s += "x";
117 if (cap & CEPH_CAP_GCACHE) s += "c";
118 if (cap & CEPH_CAP_GRD) s += "r";
119 if (cap & CEPH_CAP_GWR) s += "w";
120 if (cap & CEPH_CAP_GBUFFER) s += "b";
121 if (cap & CEPH_CAP_GWREXTEND) s += "a";
122 if (cap & CEPH_CAP_GLAZYIO) s += "l";
123 return s;
124}
125inline string ccap_string(int cap)
126{
127 string s;
128 if (cap & CEPH_CAP_PIN) s += "p";
129
130 int a = (cap >> CEPH_CAP_SAUTH) & 3;
131 if (a) s += 'A' + gcap_string(a);
132
133 a = (cap >> CEPH_CAP_SLINK) & 3;
134 if (a) s += 'L' + gcap_string(a);
135
136 a = (cap >> CEPH_CAP_SXATTR) & 3;
137 if (a) s += 'X' + gcap_string(a);
138
139 a = cap >> CEPH_CAP_SFILE;
140 if (a) s += 'F' + gcap_string(a);
141
142 if (s.length() == 0)
143 s = "-";
144 return s;
145}
146
7c673cae 147struct scatter_info_t {
94b18763 148 version_t version = 0;
7c673cae
FG
149};
150
151struct frag_info_t : public scatter_info_t {
7c673cae
FG
152 int64_t size() const { return nfiles + nsubdirs; }
153
154 void zero() {
155 *this = frag_info_t();
156 }
157
158 // *this += cur - acc;
159 void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
160 if (cur.mtime > mtime) {
161 mtime = cur.mtime;
162 if (touched_mtime)
163 *touched_mtime = true;
164 }
165 if (cur.change_attr > change_attr) {
166 change_attr = cur.change_attr;
167 if (touched_chattr)
168 *touched_chattr = true;
169 }
170 nfiles += cur.nfiles - acc.nfiles;
171 nsubdirs += cur.nsubdirs - acc.nsubdirs;
172 }
173
174 void add(const frag_info_t& other) {
175 if (other.mtime > mtime)
176 mtime = other.mtime;
177 if (other.change_attr > change_attr)
178 change_attr = other.change_attr;
179 nfiles += other.nfiles;
180 nsubdirs += other.nsubdirs;
181 }
182
183 bool same_sums(const frag_info_t &o) const {
184 return mtime <= o.mtime &&
185 nfiles == o.nfiles &&
186 nsubdirs == o.nsubdirs;
187 }
188
189 void encode(bufferlist &bl) const;
11fdf7f2 190 void decode(bufferlist::const_iterator& bl);
7c673cae 191 void dump(Formatter *f) const;
9f95a23c
TL
192 static void generate_test_instances(std::list<frag_info_t*>& ls);
193
194 // this frag
195 utime_t mtime;
196 uint64_t change_attr = 0;
197 int64_t nfiles = 0; // files
198 int64_t nsubdirs = 0; // subdirs
7c673cae
FG
199};
200WRITE_CLASS_ENCODER(frag_info_t)
201
202inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
203 return memcmp(&l, &r, sizeof(l)) == 0;
204}
205inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
206 return !(l == r);
207}
208
209std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
210
211
212struct nest_info_t : public scatter_info_t {
7c673cae
FG
213 int64_t rsize() const { return rfiles + rsubdirs; }
214
7c673cae
FG
215 void zero() {
216 *this = nest_info_t();
217 }
218
219 void sub(const nest_info_t &other) {
220 add(other, -1);
221 }
222 void add(const nest_info_t &other, int fac=1) {
223 if (other.rctime > rctime)
224 rctime = other.rctime;
225 rbytes += fac*other.rbytes;
226 rfiles += fac*other.rfiles;
227 rsubdirs += fac*other.rsubdirs;
11fdf7f2 228 rsnaps += fac*other.rsnaps;
7c673cae
FG
229 }
230
231 // *this += cur - acc;
232 void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
233 if (cur.rctime > rctime)
234 rctime = cur.rctime;
235 rbytes += cur.rbytes - acc.rbytes;
236 rfiles += cur.rfiles - acc.rfiles;
237 rsubdirs += cur.rsubdirs - acc.rsubdirs;
11fdf7f2 238 rsnaps += cur.rsnaps - acc.rsnaps;
7c673cae
FG
239 }
240
241 bool same_sums(const nest_info_t &o) const {
242 return rctime <= o.rctime &&
243 rbytes == o.rbytes &&
244 rfiles == o.rfiles &&
245 rsubdirs == o.rsubdirs &&
11fdf7f2 246 rsnaps == o.rsnaps;
7c673cae
FG
247 }
248
249 void encode(bufferlist &bl) const;
11fdf7f2 250 void decode(bufferlist::const_iterator& bl);
7c673cae 251 void dump(Formatter *f) const;
9f95a23c
TL
252 static void generate_test_instances(std::list<nest_info_t*>& ls);
253
254 // this frag + children
255 utime_t rctime;
256 int64_t rbytes = 0;
257 int64_t rfiles = 0;
258 int64_t rsubdirs = 0;
259 int64_t rsnaps = 0;
7c673cae
FG
260};
261WRITE_CLASS_ENCODER(nest_info_t)
262
263inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
264 return memcmp(&l, &r, sizeof(l)) == 0;
265}
266inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
267 return !(l == r);
268}
269
270std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
271
7c673cae 272struct vinodeno_t {
7c673cae
FG
273 vinodeno_t() {}
274 vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
275
276 void encode(bufferlist& bl) const {
11fdf7f2
TL
277 using ceph::encode;
278 encode(ino, bl);
279 encode(snapid, bl);
7c673cae 280 }
11fdf7f2
TL
281 void decode(bufferlist::const_iterator& p) {
282 using ceph::decode;
283 decode(ino, p);
284 decode(snapid, p);
7c673cae 285 }
9f95a23c
TL
286
287 inodeno_t ino;
288 snapid_t snapid;
7c673cae
FG
289};
290WRITE_CLASS_ENCODER(vinodeno_t)
291
292inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
293 return l.ino == r.ino && l.snapid == r.snapid;
294}
295inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
296 return !(l == r);
297}
298inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
299 return
300 l.ino < r.ino ||
301 (l.ino == r.ino && l.snapid < r.snapid);
302}
303
304struct quota_info_t
305{
7c673cae
FG
306 void encode(bufferlist& bl) const {
307 ENCODE_START(1, 1, bl);
11fdf7f2
TL
308 encode(max_bytes, bl);
309 encode(max_files, bl);
7c673cae
FG
310 ENCODE_FINISH(bl);
311 }
11fdf7f2 312 void decode(bufferlist::const_iterator& p) {
7c673cae 313 DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
11fdf7f2
TL
314 decode(max_bytes, p);
315 decode(max_files, p);
7c673cae
FG
316 DECODE_FINISH(p);
317 }
318
319 void dump(Formatter *f) const;
9f95a23c 320 static void generate_test_instances(std::list<quota_info_t *>& ls);
7c673cae
FG
321
322 bool is_valid() const {
323 return max_bytes >=0 && max_files >=0;
324 }
325 bool is_enable() const {
326 return max_bytes || max_files;
327 }
9f95a23c
TL
328
329 int64_t max_bytes = 0;
330 int64_t max_files = 0;
7c673cae
FG
331};
332WRITE_CLASS_ENCODER(quota_info_t)
333
334inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
335 return memcmp(&l, &r, sizeof(l)) == 0;
336}
337
338ostream& operator<<(ostream &out, const quota_info_t &n);
339
340namespace std {
341 template<> struct hash<vinodeno_t> {
342 size_t operator()(const vinodeno_t &vino) const {
343 hash<inodeno_t> H;
344 hash<uint64_t> I;
345 return H(vino.ino) ^ I(vino.snapid);
346 }
347 };
9f95a23c 348}
7c673cae
FG
349
350inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) {
351 out << vino.ino;
352 if (vino.snapid == CEPH_NOSNAP)
353 out << ".head";
354 else if (vino.snapid)
355 out << '.' << vino.snapid;
356 return out;
357}
358
7c673cae
FG
359struct client_writeable_range_t {
360 struct byte_range_t {
94b18763 361 uint64_t first = 0, last = 0; // interval client can write to
7c673cae
FG
362 };
363
7c673cae 364 void encode(bufferlist &bl) const;
11fdf7f2 365 void decode(bufferlist::const_iterator& bl);
7c673cae 366 void dump(Formatter *f) const;
94b18763 367 static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
9f95a23c
TL
368
369 byte_range_t range;
370 snapid_t follows = 0; // aka "data+metadata flushed thru"
7c673cae
FG
371};
372
11fdf7f2
TL
373inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::const_iterator& bl) {
374 decode(range.first, bl);
375 decode(range.last, bl);
7c673cae
FG
376}
377
378WRITE_CLASS_ENCODER(client_writeable_range_t)
379
380std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
381
382inline bool operator==(const client_writeable_range_t& l,
383 const client_writeable_range_t& r) {
384 return l.range.first == r.range.first && l.range.last == r.range.last &&
385 l.follows == r.follows;
386}
387
388struct inline_data_t {
7c673cae 389public:
94b18763 390 inline_data_t() {}
7c673cae
FG
391 inline_data_t(const inline_data_t& o) : version(o.version) {
392 if (o.blp)
393 get_data() = *o.blp;
394 }
395 inline_data_t& operator=(const inline_data_t& o) {
396 version = o.version;
397 if (o.blp)
398 get_data() = *o.blp;
399 else
400 free_data();
401 return *this;
402 }
9f95a23c
TL
403
404 void free_data() {
405 blp.reset();
406 }
407 bufferlist& get_data() {
408 if (!blp)
409 blp.reset(new bufferlist);
410 return *blp;
411 }
412 size_t length() const { return blp ? blp->length() : 0; }
413
7c673cae
FG
414 bool operator==(const inline_data_t& o) const {
415 return length() == o.length() &&
416 (length() == 0 ||
417 (*const_cast<bufferlist*>(blp.get()) == *const_cast<bufferlist*>(o.blp.get())));
418 }
419 bool operator!=(const inline_data_t& o) const {
420 return !(*this == o);
421 }
422 void encode(bufferlist &bl) const;
11fdf7f2 423 void decode(bufferlist::const_iterator& bl);
9f95a23c
TL
424
425 version_t version = 1;
426
427private:
428 std::unique_ptr<bufferlist> blp;
7c673cae
FG
429};
430WRITE_CLASS_ENCODER(inline_data_t)
431
// Damage categories.  NOTE(review): presumably used in combination with
// damage_flags_t below -- confirm how callers map these onto flag bits.
enum {
  DAMAGE_STATS = 0,     // statistics (dirstat, size, etc)
  DAMAGE_RSTATS = 1,    // recursive statistics (rstat, accounted_rstat)
  DAMAGE_FRAGTREE = 2   // fragtree -- repair by searching
};
using damage_flags_t = uint32_t;
438
94b18763 439template<template<typename> class Allocator = std::allocator>
7c673cae
FG
440struct inode_t {
441 /**
442 * ***************
443 * Do not forget to add any new fields to the compare() function.
444 * ***************
445 */
94b18763 446 using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
94b18763
FG
447
448 inode_t()
449 {
7c673cae 450 clear_layout();
7c673cae
FG
451 }
452
453 // file type
454 bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
455 bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; }
456 bool is_file() const { return (mode & S_IFMT) == S_IFREG; }
457
458 bool is_truncating() const { return (truncate_pending > 0); }
459 void truncate(uint64_t old_size, uint64_t new_size) {
11fdf7f2 460 ceph_assert(new_size < old_size);
7c673cae
FG
461 if (old_size > max_size_ever)
462 max_size_ever = old_size;
463 truncate_from = old_size;
464 size = new_size;
465 rstat.rbytes = new_size;
466 truncate_size = size;
467 truncate_seq++;
468 truncate_pending++;
469 }
470
471 bool has_layout() const {
472 return layout != file_layout_t();
473 }
474
475 void clear_layout() {
476 layout = file_layout_t();
477 }
478
479 uint64_t get_layout_size_increment() const {
480 return layout.get_period();
481 }
482
483 bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
484
485 uint64_t get_max_size() const {
486 uint64_t max = 0;
487 for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin();
488 p != client_ranges.end();
489 ++p)
490 if (p->second.range.last > max)
491 max = p->second.range.last;
492 return max;
493 }
494 void set_max_size(uint64_t new_max) {
495 if (new_max == 0) {
496 client_ranges.clear();
497 } else {
498 for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin();
499 p != client_ranges.end();
500 ++p)
501 p->second.range.last = new_max;
502 }
503 }
504
505 void trim_client_ranges(snapid_t last) {
506 std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin();
507 while (p != client_ranges.end()) {
508 if (p->second.follows >= last)
509 client_ranges.erase(p++);
510 else
511 ++p;
512 }
513 }
514
515 bool is_backtrace_updated() const {
516 return backtrace_version == version;
517 }
518 void update_backtrace(version_t pv=0) {
519 backtrace_version = pv ? pv : version;
520 }
521
522 void add_old_pool(int64_t l) {
523 backtrace_version = version;
524 old_pools.insert(l);
525 }
526
527 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 528 void decode(bufferlist::const_iterator& bl);
7c673cae 529 void dump(Formatter *f) const;
94b18763 530 static void generate_test_instances(std::list<inode_t*>& ls);
7c673cae
FG
531 /**
532 * Compare this inode_t with another that represent *the same inode*
533 * at different points in time.
534 * @pre The inodes are the same ino
535 *
536 * @param other The inode_t to compare ourselves with
537 * @param divergent A bool pointer which will be set to true
538 * if the values are different in a way that can't be explained
539 * by one being a newer version than the other.
540 *
541 * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
542 */
543 int compare(const inode_t &other, bool *divergent) const;
9f95a23c
TL
544
545 // base (immutable)
546 inodeno_t ino = 0;
547 uint32_t rdev = 0; // if special file
548
549 // affected by any inode change...
550 utime_t ctime; // inode change time
551 utime_t btime; // birth time
552
553 // perm (namespace permissions)
554 uint32_t mode = 0;
555 uid_t uid = 0;
556 gid_t gid = 0;
557
558 // nlink
559 int32_t nlink = 0;
560
561 // file (data access)
562 ceph_dir_layout dir_layout = {}; // [dir only]
563 file_layout_t layout;
564 compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
565 uint64_t size = 0; // on directory, # dentries
566 uint64_t max_size_ever = 0; // max size the file has ever been
567 uint32_t truncate_seq = 0;
568 uint64_t truncate_size = 0, truncate_from = 0;
569 uint32_t truncate_pending = 0;
570 utime_t mtime; // file data modify time.
571 utime_t atime; // file data access time.
572 uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes())
573 inline_data_t inline_data; // FIXME check
574
575 // change attribute
576 uint64_t change_attr = 0;
577
578 client_range_map client_ranges; // client(s) can write to these ranges
579
580 // dirfrag, recursive accountin
581 frag_info_t dirstat; // protected by my filelock
582 nest_info_t rstat; // protected by my nestlock
583 nest_info_t accounted_rstat; // protected by parent's nestlock
584
585 quota_info_t quota;
586
587 mds_rank_t export_pin = MDS_RANK_NONE;
588
589 // special stuff
590 version_t version = 0; // auth only
591 version_t file_data_version = 0; // auth only
592 version_t xattr_version = 0;
593
594 utime_t last_scrub_stamp; // start time of last complete scrub
595 version_t last_scrub_version = 0;// (parent) start version of last complete scrub
596
597 version_t backtrace_version = 0;
598
599 snapid_t oldest_snap;
600
601 std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
602
7c673cae
FG
603private:
604 bool older_is_consistent(const inode_t &other) const;
605};
7c673cae 606
94b18763
FG
607// These methods may be moved back to mdstypes.cc when we have pmr
608template<template<typename> class Allocator>
609void inode_t<Allocator>::encode(bufferlist &bl, uint64_t features) const
610{
611 ENCODE_START(15, 6, bl);
612
11fdf7f2
TL
613 encode(ino, bl);
614 encode(rdev, bl);
615 encode(ctime, bl);
94b18763 616
11fdf7f2
TL
617 encode(mode, bl);
618 encode(uid, bl);
619 encode(gid, bl);
94b18763 620
11fdf7f2 621 encode(nlink, bl);
94b18763
FG
622 {
623 // removed field
624 bool anchored = 0;
11fdf7f2 625 encode(anchored, bl);
94b18763
FG
626 }
627
11fdf7f2
TL
628 encode(dir_layout, bl);
629 encode(layout, bl, features);
630 encode(size, bl);
631 encode(truncate_seq, bl);
632 encode(truncate_size, bl);
633 encode(truncate_from, bl);
634 encode(truncate_pending, bl);
635 encode(mtime, bl);
636 encode(atime, bl);
637 encode(time_warp_seq, bl);
638 encode(client_ranges, bl);
94b18763 639
11fdf7f2
TL
640 encode(dirstat, bl);
641 encode(rstat, bl);
642 encode(accounted_rstat, bl);
94b18763 643
11fdf7f2
TL
644 encode(version, bl);
645 encode(file_data_version, bl);
646 encode(xattr_version, bl);
647 encode(backtrace_version, bl);
648 encode(old_pools, bl);
649 encode(max_size_ever, bl);
650 encode(inline_data, bl);
651 encode(quota, bl);
94b18763 652
11fdf7f2 653 encode(stray_prior_path, bl);
94b18763 654
11fdf7f2
TL
655 encode(last_scrub_version, bl);
656 encode(last_scrub_stamp, bl);
94b18763 657
11fdf7f2
TL
658 encode(btime, bl);
659 encode(change_attr, bl);
94b18763 660
11fdf7f2 661 encode(export_pin, bl);
94b18763
FG
662
663 ENCODE_FINISH(bl);
664}
665
666template<template<typename> class Allocator>
11fdf7f2 667void inode_t<Allocator>::decode(bufferlist::const_iterator &p)
94b18763
FG
668{
669 DECODE_START_LEGACY_COMPAT_LEN(15, 6, 6, p);
670
11fdf7f2
TL
671 decode(ino, p);
672 decode(rdev, p);
673 decode(ctime, p);
94b18763 674
11fdf7f2
TL
675 decode(mode, p);
676 decode(uid, p);
677 decode(gid, p);
94b18763 678
11fdf7f2 679 decode(nlink, p);
94b18763
FG
680 {
681 bool anchored;
11fdf7f2 682 decode(anchored, p);
94b18763
FG
683 }
684
685 if (struct_v >= 4)
11fdf7f2 686 decode(dir_layout, p);
92f5a8d4
TL
687 else {
688 // FIPS zeroization audit 20191117: this memset is not security related.
94b18763 689 memset(&dir_layout, 0, sizeof(dir_layout));
92f5a8d4 690 }
11fdf7f2
TL
691 decode(layout, p);
692 decode(size, p);
693 decode(truncate_seq, p);
694 decode(truncate_size, p);
695 decode(truncate_from, p);
94b18763 696 if (struct_v >= 5)
11fdf7f2 697 decode(truncate_pending, p);
94b18763
FG
698 else
699 truncate_pending = 0;
11fdf7f2
TL
700 decode(mtime, p);
701 decode(atime, p);
702 decode(time_warp_seq, p);
94b18763 703 if (struct_v >= 3) {
11fdf7f2 704 decode(client_ranges, p);
94b18763
FG
705 } else {
706 map<client_t, client_writeable_range_t::byte_range_t> m;
11fdf7f2 707 decode(m, p);
94b18763
FG
708 for (map<client_t, client_writeable_range_t::byte_range_t>::iterator
709 q = m.begin(); q != m.end(); ++q)
710 client_ranges[q->first].range = q->second;
711 }
712
11fdf7f2
TL
713 decode(dirstat, p);
714 decode(rstat, p);
715 decode(accounted_rstat, p);
94b18763 716
11fdf7f2
TL
717 decode(version, p);
718 decode(file_data_version, p);
719 decode(xattr_version, p);
94b18763 720 if (struct_v >= 2)
11fdf7f2 721 decode(backtrace_version, p);
94b18763 722 if (struct_v >= 7)
11fdf7f2 723 decode(old_pools, p);
94b18763 724 if (struct_v >= 8)
11fdf7f2 725 decode(max_size_ever, p);
94b18763 726 if (struct_v >= 9) {
11fdf7f2 727 decode(inline_data, p);
94b18763
FG
728 } else {
729 inline_data.version = CEPH_INLINE_NONE;
730 }
731 if (struct_v < 10)
732 backtrace_version = 0; // force update backtrace
733 if (struct_v >= 11)
11fdf7f2 734 decode(quota, p);
94b18763
FG
735
736 if (struct_v >= 12) {
737 std::string tmp;
11fdf7f2
TL
738 decode(tmp, p);
739 stray_prior_path = std::string_view(tmp);
94b18763
FG
740 }
741
742 if (struct_v >= 13) {
11fdf7f2
TL
743 decode(last_scrub_version, p);
744 decode(last_scrub_stamp, p);
94b18763
FG
745 }
746 if (struct_v >= 14) {
11fdf7f2
TL
747 decode(btime, p);
748 decode(change_attr, p);
94b18763
FG
749 } else {
750 btime = utime_t();
751 change_attr = 0;
752 }
753
754 if (struct_v >= 15) {
11fdf7f2 755 decode(export_pin, p);
94b18763
FG
756 } else {
757 export_pin = MDS_RANK_NONE;
758 }
759
760 DECODE_FINISH(p);
761}
762
763template<template<typename> class Allocator>
764void inode_t<Allocator>::dump(Formatter *f) const
765{
766 f->dump_unsigned("ino", ino);
767 f->dump_unsigned("rdev", rdev);
768 f->dump_stream("ctime") << ctime;
769 f->dump_stream("btime") << btime;
770 f->dump_unsigned("mode", mode);
771 f->dump_unsigned("uid", uid);
772 f->dump_unsigned("gid", gid);
773 f->dump_unsigned("nlink", nlink);
774
775 f->open_object_section("dir_layout");
776 ::dump(dir_layout, f);
777 f->close_section();
778
779 f->dump_object("layout", layout);
780
781 f->open_array_section("old_pools");
782 for (const auto &p : old_pools) {
783 f->dump_int("pool", p);
784 }
785 f->close_section();
786
787 f->dump_unsigned("size", size);
788 f->dump_unsigned("truncate_seq", truncate_seq);
789 f->dump_unsigned("truncate_size", truncate_size);
790 f->dump_unsigned("truncate_from", truncate_from);
791 f->dump_unsigned("truncate_pending", truncate_pending);
792 f->dump_stream("mtime") << mtime;
793 f->dump_stream("atime") << atime;
794 f->dump_unsigned("time_warp_seq", time_warp_seq);
795 f->dump_unsigned("change_attr", change_attr);
796 f->dump_int("export_pin", export_pin);
797
798 f->open_array_section("client_ranges");
799 for (const auto &p : client_ranges) {
800 f->open_object_section("client");
801 f->dump_unsigned("client", p.first.v);
802 p.second.dump(f);
803 f->close_section();
804 }
805 f->close_section();
806
807 f->open_object_section("dirstat");
808 dirstat.dump(f);
809 f->close_section();
810
811 f->open_object_section("rstat");
812 rstat.dump(f);
813 f->close_section();
814
815 f->open_object_section("accounted_rstat");
816 accounted_rstat.dump(f);
817 f->close_section();
818
819 f->dump_unsigned("version", version);
820 f->dump_unsigned("file_data_version", file_data_version);
821 f->dump_unsigned("xattr_version", xattr_version);
822 f->dump_unsigned("backtrace_version", backtrace_version);
823
824 f->dump_string("stray_prior_path", stray_prior_path);
9f95a23c
TL
825 f->dump_unsigned("max_size_ever", max_size_ever);
826
827 f->open_object_section("quota");
828 quota.dump(f);
829 f->close_section();
830
831 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
832 f->dump_unsigned("last_scrub_version", last_scrub_version);
94b18763
FG
833}
834
835template<template<typename> class Allocator>
9f95a23c 836void inode_t<Allocator>::generate_test_instances(std::list<inode_t*>& ls)
94b18763
FG
837{
838 ls.push_back(new inode_t<Allocator>);
839 ls.push_back(new inode_t<Allocator>);
840 ls.back()->ino = 1;
841 // i am lazy.
842}
843
844template<template<typename> class Allocator>
845int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
846{
11fdf7f2 847 ceph_assert(ino == other.ino);
94b18763
FG
848 *divergent = false;
849 if (version == other.version) {
850 if (rdev != other.rdev ||
851 ctime != other.ctime ||
852 btime != other.btime ||
853 mode != other.mode ||
854 uid != other.uid ||
855 gid != other.gid ||
856 nlink != other.nlink ||
857 memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
858 layout != other.layout ||
859 old_pools != other.old_pools ||
860 size != other.size ||
861 max_size_ever != other.max_size_ever ||
862 truncate_seq != other.truncate_seq ||
863 truncate_size != other.truncate_size ||
864 truncate_from != other.truncate_from ||
865 truncate_pending != other.truncate_pending ||
866 change_attr != other.change_attr ||
867 mtime != other.mtime ||
868 atime != other.atime ||
869 time_warp_seq != other.time_warp_seq ||
870 inline_data != other.inline_data ||
871 client_ranges != other.client_ranges ||
872 !(dirstat == other.dirstat) ||
873 !(rstat == other.rstat) ||
874 !(accounted_rstat == other.accounted_rstat) ||
875 file_data_version != other.file_data_version ||
876 xattr_version != other.xattr_version ||
877 backtrace_version != other.backtrace_version) {
878 *divergent = true;
879 }
880 return 0;
881 } else if (version > other.version) {
882 *divergent = !older_is_consistent(other);
883 return 1;
884 } else {
11fdf7f2 885 ceph_assert(version < other.version);
94b18763
FG
886 *divergent = !other.older_is_consistent(*this);
887 return -1;
888 }
889}
890
891template<template<typename> class Allocator>
892bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
893{
894 if (max_size_ever < other.max_size_ever ||
895 truncate_seq < other.truncate_seq ||
896 time_warp_seq < other.time_warp_seq ||
897 inline_data.version < other.inline_data.version ||
898 dirstat.version < other.dirstat.version ||
899 rstat.version < other.rstat.version ||
900 accounted_rstat.version < other.accounted_rstat.version ||
901 file_data_version < other.file_data_version ||
902 xattr_version < other.xattr_version ||
903 backtrace_version < other.backtrace_version) {
904 return false;
905 }
906 return true;
907}
908
909template<template<typename> class Allocator>
910inline void encode(const inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
911{
912 ENCODE_DUMP_PRE();
913 c.encode(bl, features);
914 ENCODE_DUMP_POST(cl);
915}
916template<template<typename> class Allocator>
11fdf7f2 917inline void decode(inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
94b18763
FG
918{
919 c.decode(p);
920}
921
922template<template<typename> class Allocator>
923using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>;
924
925template<template<typename> class Allocator>
926using xattr_map = compact_map<alloc_string<Allocator>, bufferptr, std::less<alloc_string<Allocator>>, Allocator<std::pair<const alloc_string<Allocator>, bufferptr>>>; // FIXME bufferptr not in mempool
7c673cae 927
94b18763 928template<template<typename> class Allocator = std::allocator>
7c673cae
FG
929struct old_inode_t {
930 snapid_t first;
94b18763
FG
931 inode_t<Allocator> inode;
932 xattr_map<Allocator> xattrs;
7c673cae
FG
933
934 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 935 void decode(bufferlist::const_iterator& bl);
7c673cae 936 void dump(Formatter *f) const;
94b18763 937 static void generate_test_instances(std::list<old_inode_t*>& ls);
7c673cae 938};
94b18763
FG
939
940// These methods may be moved back to mdstypes.cc when we have pmr
941template<template<typename> class Allocator>
942void old_inode_t<Allocator>::encode(bufferlist& bl, uint64_t features) const
943{
944 ENCODE_START(2, 2, bl);
11fdf7f2
TL
945 encode(first, bl);
946 encode(inode, bl, features);
947 encode(xattrs, bl);
94b18763
FG
948 ENCODE_FINISH(bl);
949}
950
951template<template<typename> class Allocator>
11fdf7f2 952void old_inode_t<Allocator>::decode(bufferlist::const_iterator& bl)
94b18763
FG
953{
954 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
11fdf7f2
TL
955 decode(first, bl);
956 decode(inode, bl);
957 decode(xattrs, bl);
94b18763
FG
958 DECODE_FINISH(bl);
959}
960
961template<template<typename> class Allocator>
962void old_inode_t<Allocator>::dump(Formatter *f) const
963{
964 f->dump_unsigned("first", first);
965 inode.dump(f);
966 f->open_object_section("xattrs");
967 for (const auto &p : xattrs) {
968 std::string v(p.second.c_str(), p.second.length());
969 f->dump_string(p.first.c_str(), v);
970 }
971 f->close_section();
972}
973
974template<template<typename> class Allocator>
975void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls)
976{
977 ls.push_back(new old_inode_t<Allocator>);
978 ls.push_back(new old_inode_t<Allocator>);
979 ls.back()->first = 2;
980 std::list<inode_t<Allocator>*> ils;
981 inode_t<Allocator>::generate_test_instances(ils);
982 ls.back()->inode = *ils.back();
983 ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4);
984 ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3);
985}
986
987template<template<typename> class Allocator>
988inline void encode(const old_inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
989{
990 ENCODE_DUMP_PRE();
991 c.encode(bl, features);
992 ENCODE_DUMP_POST(cl);
993}
994template<template<typename> class Allocator>
11fdf7f2 995inline void decode(old_inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
94b18763
FG
996{
997 c.decode(p);
998}
7c673cae 999
7c673cae
FG
1000/*
1001 * like an inode, but for a dir frag
1002 */
1003struct fnode_t {
9f95a23c
TL
1004 void encode(bufferlist &bl) const;
1005 void decode(bufferlist::const_iterator& bl);
1006 void dump(Formatter *f) const;
1007 static void generate_test_instances(std::list<fnode_t*>& ls);
1008
94b18763 1009 version_t version = 0;
7c673cae
FG
1010 snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru
1011 frag_info_t fragstat, accounted_fragstat;
1012 nest_info_t rstat, accounted_rstat;
94b18763 1013 damage_flags_t damage_flags = 0;
7c673cae
FG
1014
1015 // we know we and all our descendants have been scrubbed since this version
94b18763 1016 version_t recursive_scrub_version = 0;
7c673cae
FG
1017 utime_t recursive_scrub_stamp;
1018 // version at which we last scrubbed our personal data structures
94b18763 1019 version_t localized_scrub_version = 0;
7c673cae 1020 utime_t localized_scrub_stamp;
7c673cae
FG
1021};
1022WRITE_CLASS_ENCODER(fnode_t)
1023
1024
1025struct old_rstat_t {
7c673cae 1026 void encode(bufferlist& bl) const;
11fdf7f2 1027 void decode(bufferlist::const_iterator& p);
7c673cae 1028 void dump(Formatter *f) const;
9f95a23c
TL
1029 static void generate_test_instances(std::list<old_rstat_t*>& ls);
1030
1031 snapid_t first;
1032 nest_info_t rstat, accounted_rstat;
7c673cae
FG
1033};
1034WRITE_CLASS_ENCODER(old_rstat_t)
1035
1036inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
1037 return out << "old_rstat(first " << o.first << " " << o.rstat << " " << o.accounted_rstat << ")";
1038}
1039
11fdf7f2
TL
1040class feature_bitset_t {
1041public:
1042 typedef uint64_t block_type;
1043 static const size_t bits_per_block = sizeof(block_type) * 8;
1044
1045 feature_bitset_t(const feature_bitset_t& other) : _vec(other._vec) {}
1046 feature_bitset_t(feature_bitset_t&& other) : _vec(std::move(other._vec)) {}
1047 feature_bitset_t(unsigned long value = 0);
1048 feature_bitset_t(const vector<size_t>& array);
1049 feature_bitset_t& operator=(const feature_bitset_t& other) {
1050 _vec = other._vec;
1051 return *this;
1052 }
1053 feature_bitset_t& operator=(feature_bitset_t&& other) {
1054 _vec = std::move(other._vec);
1055 return *this;
1056 }
9f95a23c 1057 feature_bitset_t& operator-=(const feature_bitset_t& other);
11fdf7f2 1058 bool empty() const {
9f95a23c 1059 //block_type is a uint64_t. If the vector is only composed of 0s, then it's still "empty"
11fdf7f2
TL
1060 for (auto& v : _vec) {
1061 if (v)
1062 return false;
1063 }
1064 return true;
1065 }
1066 bool test(size_t bit) const {
1067 if (bit >= bits_per_block * _vec.size())
1068 return false;
1069 return _vec[bit / bits_per_block] & ((block_type)1 << (bit % bits_per_block));
1070 }
1071 void clear() {
1072 _vec.clear();
1073 }
11fdf7f2
TL
1074 void encode(bufferlist& bl) const;
1075 void decode(bufferlist::const_iterator &p);
9f95a23c 1076 void dump(Formatter *f) const;
11fdf7f2
TL
1077 void print(ostream& out) const;
1078private:
1079 vector<block_type> _vec;
1080};
1081WRITE_CLASS_ENCODER(feature_bitset_t)
1082
1083inline std::ostream& operator<<(std::ostream& out, const feature_bitset_t& s) {
1084 s.print(out);
1085 return out;
1086}
1087
9f95a23c
TL
1088struct metric_spec_t {
1089 metric_spec_t() {}
1090 metric_spec_t(const metric_spec_t& other) :
1091 metric_flags(other.metric_flags) {}
1092 metric_spec_t(metric_spec_t&& other) :
1093 metric_flags(std::move(other.metric_flags)) {}
1094 metric_spec_t(const feature_bitset_t& mf) :
1095 metric_flags(mf) {}
1096 metric_spec_t(feature_bitset_t&& mf) :
1097 metric_flags(std::move(mf)) {}
1098
1099 metric_spec_t& operator=(const metric_spec_t& other) {
1100 metric_flags = other.metric_flags;
1101 return *this;
1102 }
1103 metric_spec_t& operator=(metric_spec_t&& other) {
1104 metric_flags = std::move(other.metric_flags);
1105 return *this;
1106 }
1107
1108 bool empty() const {
1109 return metric_flags.empty();
1110 }
1111
1112 void clear() {
1113 metric_flags.clear();
1114 }
1115
1116 void encode(bufferlist& bl) const;
1117 void decode(bufferlist::const_iterator& p);
1118 void dump(Formatter *f) const;
1119 void print(ostream& out) const;
1120
1121 // set of metrics that a client is capable of forwarding
1122 feature_bitset_t metric_flags;
1123};
1124WRITE_CLASS_ENCODER(metric_spec_t)
1125
1126inline std::ostream& operator<<(std::ostream& out, const metric_spec_t& mst) {
1127 mst.print(out);
1128 return out;
1129}
1130
11fdf7f2
TL
1131/*
1132 * client_metadata_t
1133 */
1134struct client_metadata_t {
1135 using kv_map_t = std::map<std::string,std::string>;
1136 using iterator = kv_map_t::const_iterator;
1137
11fdf7f2 1138 client_metadata_t() {}
9f95a23c
TL
1139 client_metadata_t(const kv_map_t& kv, const feature_bitset_t &f, const metric_spec_t &mst) :
1140 kv_map(kv),
1141 features(f),
1142 metric_spec(mst) {}
11fdf7f2
TL
1143 client_metadata_t& operator=(const client_metadata_t& other) {
1144 kv_map = other.kv_map;
1145 features = other.features;
9f95a23c 1146 metric_spec = other.metric_spec;
11fdf7f2
TL
1147 return *this;
1148 }
1149
9f95a23c 1150 bool empty() const { return kv_map.empty() && features.empty() && metric_spec.empty(); }
11fdf7f2
TL
1151 iterator find(const std::string& key) const { return kv_map.find(key); }
1152 iterator begin() const { return kv_map.begin(); }
1153 iterator end() const { return kv_map.end(); }
92f5a8d4 1154 void erase(iterator it) { kv_map.erase(it); }
11fdf7f2
TL
1155 std::string& operator[](const std::string& key) { return kv_map[key]; }
1156 void merge(const client_metadata_t& other) {
1157 kv_map.insert(other.kv_map.begin(), other.kv_map.end());
1158 features = other.features;
9f95a23c 1159 metric_spec = other.metric_spec;
11fdf7f2
TL
1160 }
1161 void clear() {
1162 kv_map.clear();
1163 features.clear();
9f95a23c 1164 metric_spec.clear();
11fdf7f2
TL
1165 }
1166
1167 void encode(bufferlist& bl) const;
1168 void decode(bufferlist::const_iterator& p);
1169 void dump(Formatter *f) const;
9f95a23c
TL
1170
1171 kv_map_t kv_map;
1172 feature_bitset_t features;
1173 metric_spec_t metric_spec;
11fdf7f2
TL
1174};
1175WRITE_CLASS_ENCODER(client_metadata_t)
7c673cae
FG
1176
1177/*
9f95a23c 1178 * session_info_t - durable part of a Session
7c673cae 1179 */
7c673cae 1180struct session_info_t {
7c673cae 1181 client_t get_client() const { return client_t(inst.name.num()); }
11fdf7f2 1182 bool has_feature(size_t bit) const { return client_metadata.features.test(bit); }
7c673cae
FG
1183 const entity_name_t& get_source() const { return inst.name; }
1184
1185 void clear_meta() {
1186 prealloc_inos.clear();
1187 used_inos.clear();
1188 completed_requests.clear();
1189 completed_flushes.clear();
11fdf7f2 1190 client_metadata.clear();
7c673cae
FG
1191 }
1192
1193 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 1194 void decode(bufferlist::const_iterator& p);
7c673cae 1195 void dump(Formatter *f) const;
9f95a23c
TL
1196 static void generate_test_instances(std::list<session_info_t*>& ls);
1197
1198 entity_inst_t inst;
1199 std::map<ceph_tid_t,inodeno_t> completed_requests;
1200 interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use.
1201 interval_set<inodeno_t> used_inos; // journaling use
1202 client_metadata_t client_metadata;
1203 std::set<ceph_tid_t> completed_flushes;
1204 EntityName auth_name;
7c673cae
FG
1205};
1206WRITE_CLASS_ENCODER_FEATURES(session_info_t)
1207
7c673cae 1208// dentries
7c673cae 1209struct dentry_key_t {
94b18763 1210 dentry_key_t() {}
11fdf7f2 1211 dentry_key_t(snapid_t s, std::string_view n, __u32 h=0) :
7c673cae
FG
1212 snapid(s), name(n), hash(h) {}
1213
94b18763 1214 bool is_valid() { return name.length() || snapid; }
7c673cae
FG
1215
1216 // encode into something that can be decoded as a string.
1217 // name_ (head) or name_%x (!head)
1218 void encode(bufferlist& bl) const {
1219 string key;
1220 encode(key);
11fdf7f2
TL
1221 using ceph::encode;
1222 encode(key, bl);
7c673cae
FG
1223 }
1224 void encode(string& key) const {
1225 char b[20];
1226 if (snapid != CEPH_NOSNAP) {
1227 uint64_t val(snapid);
1228 snprintf(b, sizeof(b), "%" PRIx64, val);
1229 } else {
1230 snprintf(b, sizeof(b), "%s", "head");
1231 }
1232 ostringstream oss;
1233 oss << name << "_" << b;
1234 key = oss.str();
1235 }
11fdf7f2 1236 static void decode_helper(bufferlist::const_iterator& bl, string& nm, snapid_t& sn) {
7c673cae 1237 string key;
11fdf7f2 1238 decode(key, bl);
7c673cae
FG
1239 decode_helper(key, nm, sn);
1240 }
11fdf7f2 1241 static void decode_helper(std::string_view key, string& nm, snapid_t& sn) {
7c673cae 1242 size_t i = key.find_last_of('_');
11fdf7f2
TL
1243 ceph_assert(i != string::npos);
1244 if (key.compare(i+1, std::string_view::npos, "head") == 0) {
7c673cae
FG
1245 // name_head
1246 sn = CEPH_NOSNAP;
1247 } else {
1248 // name_%x
1249 long long unsigned x = 0;
94b18763
FG
1250 std::string x_str(key.substr(i+1));
1251 sscanf(x_str.c_str(), "%llx", &x);
7c673cae 1252 sn = x;
9f95a23c 1253 }
11fdf7f2 1254 nm = key.substr(0, i);
7c673cae 1255 }
9f95a23c
TL
1256
1257 snapid_t snapid = 0;
1258 std::string_view name;
1259 __u32 hash = 0;
7c673cae
FG
1260};
1261
1262inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
1263{
1264 return out << "(" << k.name << "," << k.snapid << ")";
1265}
1266
1267inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
1268{
1269 /*
1270 * order by hash, name, snap
1271 */
1272 int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
1273 if (c)
1274 return c < 0;
94b18763 1275 c = k1.name.compare(k2.name);
7c673cae
FG
1276 if (c)
1277 return c < 0;
1278 return k1.snapid < k2.snapid;
1279}
1280
7c673cae
FG
1281/*
1282 * string_snap_t is a simple (string, snapid_t) pair
1283 */
1284struct string_snap_t {
7c673cae 1285 string_snap_t() {}
11fdf7f2 1286 string_snap_t(std::string_view n, snapid_t s) : name(n), snapid(s) {}
7c673cae
FG
1287
1288 void encode(bufferlist& bl) const;
11fdf7f2 1289 void decode(bufferlist::const_iterator& p);
7c673cae 1290 void dump(Formatter *f) const;
9f95a23c
TL
1291 static void generate_test_instances(std::list<string_snap_t*>& ls);
1292
1293 string name;
1294 snapid_t snapid;
7c673cae
FG
1295};
1296WRITE_CLASS_ENCODER(string_snap_t)
1297
1298inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
94b18763 1299 int c = l.name.compare(r.name);
7c673cae
FG
1300 return c < 0 || (c == 0 && l.snapid < r.snapid);
1301}
1302
1303inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
1304{
1305 return out << "(" << k.name << "," << k.snapid << ")";
1306}
1307
1308/*
1309 * mds_table_pending_t
1310 *
9f95a23c 1311 * For mds's requesting any pending ops, child needs to encode the corresponding
7c673cae
FG
1312 * pending mutation state in the table.
1313 */
1314struct mds_table_pending_t {
7c673cae 1315 void encode(bufferlist& bl) const;
11fdf7f2 1316 void decode(bufferlist::const_iterator& bl);
7c673cae 1317 void dump(Formatter *f) const;
9f95a23c
TL
1318 static void generate_test_instances(std::list<mds_table_pending_t*>& ls);
1319
1320 uint64_t reqid = 0;
1321 __s32 mds = 0;
1322 version_t tid = 0;
7c673cae
FG
1323};
1324WRITE_CLASS_ENCODER(mds_table_pending_t)
1325
7c673cae 1326// requests
7c673cae 1327struct metareqid_t {
94b18763 1328 metareqid_t() {}
7c673cae
FG
1329 metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
1330 void encode(bufferlist& bl) const {
11fdf7f2
TL
1331 using ceph::encode;
1332 encode(name, bl);
1333 encode(tid, bl);
7c673cae 1334 }
11fdf7f2
TL
1335 void decode(bufferlist::const_iterator &p) {
1336 using ceph::decode;
1337 decode(name, p);
1338 decode(tid, p);
7c673cae 1339 }
9f95a23c
TL
1340
1341 entity_name_t name;
1342 uint64_t tid = 0;
7c673cae
FG
1343};
1344WRITE_CLASS_ENCODER(metareqid_t)
1345
1346inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
1347 return out << r.name << ":" << r.tid;
1348}
1349
1350inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
1351 return (l.name == r.name) && (l.tid == r.tid);
1352}
1353inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
1354 return (l.name != r.name) || (l.tid != r.tid);
1355}
1356inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
1357 return (l.name < r.name) ||
1358 (l.name == r.name && l.tid < r.tid);
1359}
1360inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
1361 return (l.name < r.name) ||
1362 (l.name == r.name && l.tid <= r.tid);
1363}
1364inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); }
1365inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); }
1366
1367namespace std {
1368 template<> struct hash<metareqid_t> {
1369 size_t operator()(const metareqid_t &r) const {
1370 hash<uint64_t> H;
1371 return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid);
1372 }
1373 };
1374} // namespace std
1375
7c673cae
FG
1376// cap info for client reconnect
1377struct cap_reconnect_t {
9f95a23c 1378 cap_reconnect_t() {}
11fdf7f2 1379 cap_reconnect_t(uint64_t cap_id, inodeno_t pino, std::string_view p, int w, int i,
7c673cae
FG
1380 inodeno_t sr, snapid_t sf, bufferlist& lb) :
1381 path(p) {
1382 capinfo.cap_id = cap_id;
1383 capinfo.wanted = w;
1384 capinfo.issued = i;
1385 capinfo.snaprealm = sr;
1386 capinfo.pathbase = pino;
1387 capinfo.flock_len = 0;
1388 snap_follows = sf;
1389 flockbl.claim(lb);
1390 }
1391 void encode(bufferlist& bl) const;
11fdf7f2 1392 void decode(bufferlist::const_iterator& bl);
7c673cae 1393 void encode_old(bufferlist& bl) const;
11fdf7f2 1394 void decode_old(bufferlist::const_iterator& bl);
7c673cae
FG
1395
1396 void dump(Formatter *f) const;
9f95a23c
TL
1397 static void generate_test_instances(std::list<cap_reconnect_t*>& ls);
1398
1399 string path;
1400 mutable ceph_mds_cap_reconnect capinfo = {};
1401 snapid_t snap_follows = 0;
1402 bufferlist flockbl;
7c673cae
FG
1403};
1404WRITE_CLASS_ENCODER(cap_reconnect_t)
1405
11fdf7f2 1406struct snaprealm_reconnect_t {
9f95a23c 1407 snaprealm_reconnect_t() {}
11fdf7f2
TL
1408 snaprealm_reconnect_t(inodeno_t ino, snapid_t seq, inodeno_t parent) {
1409 realm.ino = ino;
1410 realm.seq = seq;
1411 realm.parent = parent;
1412 }
1413 void encode(bufferlist& bl) const;
1414 void decode(bufferlist::const_iterator& bl);
1415 void encode_old(bufferlist& bl) const;
1416 void decode_old(bufferlist::const_iterator& bl);
1417
1418 void dump(Formatter *f) const;
9f95a23c
TL
1419 static void generate_test_instances(std::list<snaprealm_reconnect_t*>& ls);
1420
1421 mutable ceph_mds_snaprealm_reconnect realm = {};
11fdf7f2
TL
1422};
1423WRITE_CLASS_ENCODER(snaprealm_reconnect_t)
7c673cae
FG
1424
1425// compat for pre-FLOCK feature
1426struct old_ceph_mds_cap_reconnect {
eafe8130
TL
1427 ceph_le64 cap_id;
1428 ceph_le32 wanted;
1429 ceph_le32 issued;
1430 ceph_le64 old_size;
7c673cae 1431 struct ceph_timespec old_mtime, old_atime;
eafe8130
TL
1432 ceph_le64 snaprealm;
1433 ceph_le64 pathbase; /* base ino for our path to this ino */
7c673cae
FG
1434} __attribute__ ((packed));
1435WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect)
1436
1437struct old_cap_reconnect_t {
7c673cae
FG
1438 const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) {
1439 path = n.path;
1440 capinfo.cap_id = n.capinfo.cap_id;
1441 capinfo.wanted = n.capinfo.wanted;
1442 capinfo.issued = n.capinfo.issued;
1443 capinfo.snaprealm = n.capinfo.snaprealm;
1444 capinfo.pathbase = n.capinfo.pathbase;
1445 return *this;
1446 }
1447 operator cap_reconnect_t() {
1448 cap_reconnect_t n;
1449 n.path = path;
1450 n.capinfo.cap_id = capinfo.cap_id;
1451 n.capinfo.wanted = capinfo.wanted;
1452 n.capinfo.issued = capinfo.issued;
1453 n.capinfo.snaprealm = capinfo.snaprealm;
1454 n.capinfo.pathbase = capinfo.pathbase;
1455 return n;
1456 }
1457
1458 void encode(bufferlist& bl) const {
11fdf7f2
TL
1459 using ceph::encode;
1460 encode(path, bl);
1461 encode(capinfo, bl);
7c673cae 1462 }
11fdf7f2
TL
1463 void decode(bufferlist::const_iterator& bl) {
1464 using ceph::decode;
1465 decode(path, bl);
1466 decode(capinfo, bl);
7c673cae 1467 }
9f95a23c
TL
1468
1469 string path;
1470 old_ceph_mds_cap_reconnect capinfo;
7c673cae
FG
1471};
1472WRITE_CLASS_ENCODER(old_cap_reconnect_t)
1473
7c673cae 1474// dir frag
7c673cae 1475struct dirfrag_t {
94b18763 1476 dirfrag_t() {}
7c673cae
FG
1477 dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
1478
1479 void encode(bufferlist& bl) const {
11fdf7f2
TL
1480 using ceph::encode;
1481 encode(ino, bl);
1482 encode(frag, bl);
7c673cae 1483 }
11fdf7f2
TL
1484 void decode(bufferlist::const_iterator& bl) {
1485 using ceph::decode;
1486 decode(ino, bl);
1487 decode(frag, bl);
7c673cae 1488 }
9f95a23c
TL
1489
1490 inodeno_t ino = 0;
1491 frag_t frag;
7c673cae
FG
1492};
1493WRITE_CLASS_ENCODER(dirfrag_t)
1494
7c673cae
FG
1495inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) {
1496 out << df.ino;
1497 if (!df.frag.is_root()) out << "." << df.frag;
1498 return out;
1499}
1500inline bool operator<(dirfrag_t l, dirfrag_t r) {
1501 if (l.ino < r.ino) return true;
1502 if (l.ino == r.ino && l.frag < r.frag) return true;
1503 return false;
1504}
1505inline bool operator==(dirfrag_t l, dirfrag_t r) {
1506 return l.ino == r.ino && l.frag == r.frag;
1507}
1508
1509namespace std {
1510 template<> struct hash<dirfrag_t> {
1511 size_t operator()(const dirfrag_t &df) const {
1512 static rjhash<uint64_t> H;
1513 static rjhash<uint32_t> I;
1514 return H(df.ino) ^ I(df.frag);
1515 }
1516 };
1517} // namespace std
1518
7c673cae 1519// ================================================================
7c673cae
FG
// Indices of the per-dirfrag popularity counters; dirfrag_load_vec_t uses
// them to index its counter array.
#define META_POP_IRD     0   // printed as "IRD"
#define META_POP_IWR     1   // printed as "IWR"
#define META_POP_READDIR 2   // printed as "RDR"
#define META_POP_FETCH   3   // printed as "FET"
#define META_POP_STORE   4   // printed as "STR"
#define META_NPOP        5   // presumably the number of indices above
1526
1527class inode_load_vec_t {
7c673cae 1528public:
11fdf7f2
TL
1529 using time = DecayCounter::time;
1530 using clock = DecayCounter::clock;
1531 static const size_t NUM = 2;
1532
1533 inode_load_vec_t() : vec{DecayCounter(DecayRate()), DecayCounter(DecayRate())} {}
1534 inode_load_vec_t(const DecayRate &rate) : vec{DecayCounter(rate), DecayCounter(rate)} {}
1535
7c673cae 1536 DecayCounter &get(int t) {
7c673cae
FG
1537 return vec[t];
1538 }
11fdf7f2
TL
1539 void zero() {
1540 for (auto &d : vec) {
1541 d.reset();
1542 }
7c673cae
FG
1543 }
1544 void encode(bufferlist &bl) const;
11fdf7f2
TL
1545 void decode(bufferlist::const_iterator& p);
1546 void dump(Formatter *f) const;
9f95a23c 1547 static void generate_test_instances(std::list<inode_load_vec_t*>& ls);
11fdf7f2
TL
1548
1549private:
1550 std::array<DecayCounter, NUM> vec;
7c673cae 1551};
11fdf7f2
TL
1552inline void encode(const inode_load_vec_t &c, bufferlist &bl) {
1553 c.encode(bl);
7c673cae 1554}
11fdf7f2
TL
1555inline void decode(inode_load_vec_t & c, bufferlist::const_iterator &p) {
1556 c.decode(p);
7c673cae
FG
1557}
1558
1559class dirfrag_load_vec_t {
1560public:
11fdf7f2
TL
1561 using time = DecayCounter::time;
1562 using clock = DecayCounter::clock;
1563 static const size_t NUM = 5;
1564
1565 dirfrag_load_vec_t() :
1566 vec{DecayCounter(DecayRate()),
1567 DecayCounter(DecayRate()),
1568 DecayCounter(DecayRate()),
1569 DecayCounter(DecayRate()),
1570 DecayCounter(DecayRate())
1571 }
7c673cae 1572 {}
11fdf7f2
TL
1573 dirfrag_load_vec_t(const DecayRate &rate) :
1574 vec{DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate)}
1575 {}
1576
7c673cae
FG
1577 void encode(bufferlist &bl) const {
1578 ENCODE_START(2, 2, bl);
94b18763 1579 for (const auto &i : vec) {
11fdf7f2 1580 encode(i, bl);
94b18763 1581 }
7c673cae
FG
1582 ENCODE_FINISH(bl);
1583 }
11fdf7f2 1584 void decode(bufferlist::const_iterator &p) {
7c673cae 1585 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
94b18763 1586 for (auto &i : vec) {
11fdf7f2 1587 decode(i, p);
94b18763 1588 }
7c673cae
FG
1589 DECODE_FINISH(p);
1590 }
7c673cae 1591 void dump(Formatter *f) const;
11fdf7f2
TL
1592 void dump(Formatter *f, const DecayRate& rate) const;
1593 static void generate_test_instances(std::list<dirfrag_load_vec_t*>& ls);
7c673cae 1594
11fdf7f2
TL
1595 const DecayCounter &get(int t) const {
1596 return vec[t];
7c673cae 1597 }
11fdf7f2
TL
1598 DecayCounter &get(int t) {
1599 return vec[t];
1600 }
1601 void adjust(double d) {
94b18763 1602 for (auto &i : vec) {
11fdf7f2 1603 i.adjust(d);
94b18763 1604 }
7c673cae 1605 }
11fdf7f2 1606 void zero() {
94b18763 1607 for (auto &i : vec) {
11fdf7f2 1608 i.reset();
94b18763 1609 }
7c673cae 1610 }
28e407b8 1611 double meta_load() const {
7c673cae 1612 return
11fdf7f2
TL
1613 1*vec[META_POP_IRD].get() +
1614 2*vec[META_POP_IWR].get() +
1615 1*vec[META_POP_READDIR].get() +
1616 2*vec[META_POP_FETCH].get() +
1617 4*vec[META_POP_STORE].get();
7c673cae
FG
1618 }
1619
11fdf7f2
TL
1620 void add(dirfrag_load_vec_t& r) {
1621 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1622 vec[i].adjust(r.vec[i].get());
7c673cae 1623 }
11fdf7f2
TL
1624 void sub(dirfrag_load_vec_t& r) {
1625 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1626 vec[i].adjust(-r.vec[i].get());
7c673cae
FG
1627 }
1628 void scale(double f) {
11fdf7f2 1629 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
7c673cae
FG
1630 vec[i].scale(f);
1631 }
11fdf7f2
TL
1632
1633private:
1634 friend inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl);
1635 std::array<DecayCounter, NUM> vec;
7c673cae
FG
1636};
1637
11fdf7f2
TL
1638inline void encode(const dirfrag_load_vec_t &c, bufferlist &bl) {
1639 c.encode(bl);
7c673cae 1640}
11fdf7f2
TL
1641inline void decode(dirfrag_load_vec_t& c, bufferlist::const_iterator &p) {
1642 c.decode(p);
7c673cae
FG
1643}
1644
28e407b8 1645inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
7c673cae 1646{
11fdf7f2
TL
1647 std::ostringstream ss;
1648 ss << std::setprecision(1) << std::fixed
1649 << "[pop"
1650 " IRD:" << dl.vec[0]
1651 << " IWR:" << dl.vec[1]
1652 << " RDR:" << dl.vec[2]
1653 << " FET:" << dl.vec[3]
1654 << " STR:" << dl.vec[4]
1655 << " *LOAD:" << dl.meta_load() << "]";
1656 return out << ss.str() << std::endl;
7c673cae
FG
1657}
1658
7c673cae 1659struct mds_load_t {
11fdf7f2
TL
1660 using clock = dirfrag_load_vec_t::clock;
1661 using time = dirfrag_load_vec_t::time;
1662
7c673cae
FG
1663 dirfrag_load_vec_t auth;
1664 dirfrag_load_vec_t all;
1665
11fdf7f2
TL
1666 mds_load_t() : auth(DecayRate()), all(DecayRate()) {}
1667 mds_load_t(const DecayRate &rate) : auth(rate), all(rate) {}
1668
94b18763
FG
1669 double req_rate = 0.0;
1670 double cache_hit_rate = 0.0;
1671 double queue_len = 0.0;
7c673cae 1672
94b18763 1673 double cpu_load_avg = 0.0;
7c673cae 1674
11fdf7f2 1675 double mds_load() const; // defiend in MDBalancer.cc
7c673cae 1676 void encode(bufferlist& bl) const;
11fdf7f2 1677 void decode(bufferlist::const_iterator& bl);
7c673cae 1678 void dump(Formatter *f) const;
11fdf7f2 1679 static void generate_test_instances(std::list<mds_load_t*>& ls);
7c673cae 1680};
11fdf7f2
TL
1681inline void encode(const mds_load_t &c, bufferlist &bl) {
1682 c.encode(bl);
7c673cae 1683}
11fdf7f2
TL
1684inline void decode(mds_load_t &c, bufferlist::const_iterator &p) {
1685 c.decode(p);
7c673cae
FG
1686}
1687
28e407b8 1688inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
7c673cae
FG
1689{
1690 return out << "mdsload<" << load.auth << "/" << load.all
1691 << ", req " << load.req_rate
1692 << ", hr " << load.cache_hit_rate
1693 << ", qlen " << load.queue_len
1694 << ", cpu " << load.cpu_load_avg
1695 << ">";
1696}
1697
1698class load_spread_t {
1699public:
11fdf7f2
TL
1700 using time = DecayCounter::time;
1701 using clock = DecayCounter::clock;
7c673cae 1702 static const int MAX = 4;
7c673cae 1703
11fdf7f2 1704 load_spread_t(const DecayRate &rate) : count(rate)
9f95a23c
TL
1705 {}
1706
1707 load_spread_t() = delete;
7c673cae 1708
11fdf7f2 1709 double hit(int who) {
7c673cae
FG
1710 for (int i=0; i<n; i++)
1711 if (last[i] == who)
1712 return count.get_last();
1713
1714 // we're new(ish)
1715 last[p++] = who;
1716 if (n < MAX) n++;
1717 if (n == 1) return 0.0;
1718
1719 if (p == MAX) p = 0;
1720
11fdf7f2 1721 return count.hit();
7c673cae 1722 }
11fdf7f2
TL
1723 double get() const {
1724 return count.get();
7c673cae 1725 }
7c673cae 1726
9f95a23c
TL
1727 std::array<int, MAX> last = {-1, -1, -1, -1};
1728 int p = 0, n = 0;
1729 DecayCounter count;
1730};
7c673cae
FG
1731
1732// ================================================================
1733typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t;
1734
1735// -- authority delegation --
1736// directory authority types
1737// >= 0 is the auth mds
1738#define CDIR_AUTH_PARENT mds_rank_t(-1) // default
1739#define CDIR_AUTH_UNKNOWN mds_rank_t(-2)
1740#define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
1741#define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
1742//#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
1743
1744class MDSCacheObjectInfo {
1745public:
9f95a23c
TL
1746 void encode(bufferlist& bl) const;
1747 void decode(bufferlist::const_iterator& bl);
1748 void dump(Formatter *f) const;
1749 static void generate_test_instances(std::list<MDSCacheObjectInfo*>& ls);
1750
94b18763 1751 inodeno_t ino = 0;
7c673cae
FG
1752 dirfrag_t dirfrag;
1753 string dname;
1754 snapid_t snapid;
7c673cae
FG
1755};
1756
1757inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) {
1758 if (info.ino) return out << info.ino << "." << info.snapid;
1759 if (info.dname.length()) return out << info.dirfrag << "/" << info.dname
1760 << " snap " << info.snapid;
1761 return out << info.dirfrag;
1762}
1763
1764inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) {
1765 if (l.ino || r.ino)
1766 return l.ino == r.ino && l.snapid == r.snapid;
1767 else
1768 return l.dirfrag == r.dirfrag && l.dname == r.dname;
1769}
1770WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
1771
7c673cae
FG
1772// parse a map of keys/values.
1773namespace qi = boost::spirit::qi;
1774
1775template <typename Iterator>
1776struct keys_and_values
1777 : qi::grammar<Iterator, std::map<string, string>()>
1778{
1779 keys_and_values()
1780 : keys_and_values::base_type(query)
1781 {
1782 query = pair >> *(qi::lit(' ') >> pair);
1783 pair = key >> '=' >> value;
1784 key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
1785 value = +qi::char_("a-zA-Z_0-9");
1786 }
1787 qi::rule<Iterator, std::map<string, string>()> query;
1788 qi::rule<Iterator, std::pair<string, string>()> pair;
1789 qi::rule<Iterator, string()> key, value;
1790};
1791
1792#endif