]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/mdstypes.h
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / mds / mdstypes.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3#ifndef CEPH_MDSTYPES_H
4#define CEPH_MDSTYPES_H
5
6#include "include/int_types.h"
7
8#include <math.h>
9#include <ostream>
10#include <set>
11#include <map>
11fdf7f2 12#include <string_view>
7c673cae
FG
13
14#include "common/config.h"
15#include "common/Clock.h"
16#include "common/DecayCounter.h"
17#include "common/entity_name.h"
18
19#include "include/Context.h"
20#include "include/frag.h"
21#include "include/xlist.h"
22#include "include/interval_set.h"
23#include "include/compact_map.h"
24#include "include/compact_set.h"
25#include "include/fs_types.h"
26
27#include "inode_backtrace.h"
28
29#include <boost/spirit/include/qi.hpp>
30#include <boost/pool/pool.hpp>
11fdf7f2 31#include "include/ceph_assert.h"
7c673cae
FG
32#include <boost/serialization/strong_typedef.hpp>
33
34#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"
35
36#define MDS_PORT_CACHE 0x200
37#define MDS_PORT_LOCKER 0x300
38#define MDS_PORT_MIGRATOR 0x400
39
#define MAX_MDS 0x100
#define NUM_STRAY 10

#define MDS_INO_ROOT 1

// No longer created but recognised in existing filesystems
// so that we don't try to fragment it.
#define MDS_INO_CEPH 2

#define MDS_INO_GLOBAL_SNAPREALM 3

// Base offsets of the per-rank inode-number ranges (MAX_MDS entries each,
// except strays which take MAX_MDS * NUM_STRAY entries).
#define MDS_INO_MDSDIR_OFFSET (1*MAX_MDS)
#define MDS_INO_STRAY_OFFSET (6*MAX_MDS)

// Locations for journal data
#define MDS_INO_LOG_OFFSET (2*MAX_MDS)
#define MDS_INO_LOG_BACKUP_OFFSET (3*MAX_MDS)
#define MDS_INO_LOG_POINTER_OFFSET (4*MAX_MDS)
#define MDS_INO_PURGE_QUEUE (5*MAX_MDS)

// First inode number past the stray range.
#define MDS_INO_SYSTEM_BASE ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))

// Inode number of stray directory i belonging to rank x.
#define MDS_INO_STRAY(x,i) (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
// Inode number of the mdsdir belonging to rank x.  The argument is
// parenthesised before the cast so that an expression argument is
// converted as a whole rather than only its first operand.
#define MDS_INO_MDSDIR(x) (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))

// Range tests and inverse mappings for the macros above.
// Note: the argument i is evaluated more than once in the IS_* tests.
#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
#define MDS_INO_IS_BASE(i) ((i) == MDS_INO_ROOT || (i) == MDS_INO_GLOBAL_SNAPREALM || MDS_INO_IS_MDSDIR(i))
#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)
71
72#define MDS_TRAVERSE_FORWARD 1
73#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc.
74#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries.
75
76
77typedef int32_t mds_rank_t;
11fdf7f2 78constexpr mds_rank_t MDS_RANK_NONE = -1;
7c673cae
FG
79
80BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
81extern const mds_gid_t MDS_GID_NONE;
11fdf7f2
TL
82
83typedef int32_t fs_cluster_id_t;
84constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = -1;
7c673cae 85// The namespace ID of the anonymous default filesystem from legacy systems
11fdf7f2 86constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = 0;
7c673cae
FG
87
88class mds_role_t
89{
90 public:
91 fs_cluster_id_t fscid;
92 mds_rank_t rank;
93
94 mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_)
95 : fscid(fscid_), rank(rank_)
96 {}
97 mds_role_t()
98 : fscid(FS_CLUSTER_ID_NONE), rank(MDS_RANK_NONE)
99 {}
100 bool operator<(mds_role_t const &rhs) const
101 {
102 if (fscid < rhs.fscid) {
103 return true;
104 } else if (fscid == rhs.fscid) {
105 return rank < rhs.rank;
106 } else {
107 return false;
108 }
109 }
110
111 bool is_none() const
112 {
113 return (rank == MDS_RANK_NONE);
114 }
115};
116std::ostream& operator<<(std::ostream &out, const mds_role_t &role);
117
118
119// CAPS
120
121inline string gcap_string(int cap)
122{
123 string s;
124 if (cap & CEPH_CAP_GSHARED) s += "s";
125 if (cap & CEPH_CAP_GEXCL) s += "x";
126 if (cap & CEPH_CAP_GCACHE) s += "c";
127 if (cap & CEPH_CAP_GRD) s += "r";
128 if (cap & CEPH_CAP_GWR) s += "w";
129 if (cap & CEPH_CAP_GBUFFER) s += "b";
130 if (cap & CEPH_CAP_GWREXTEND) s += "a";
131 if (cap & CEPH_CAP_GLAZYIO) s += "l";
132 return s;
133}
134inline string ccap_string(int cap)
135{
136 string s;
137 if (cap & CEPH_CAP_PIN) s += "p";
138
139 int a = (cap >> CEPH_CAP_SAUTH) & 3;
140 if (a) s += 'A' + gcap_string(a);
141
142 a = (cap >> CEPH_CAP_SLINK) & 3;
143 if (a) s += 'L' + gcap_string(a);
144
145 a = (cap >> CEPH_CAP_SXATTR) & 3;
146 if (a) s += 'X' + gcap_string(a);
147
148 a = cap >> CEPH_CAP_SFILE;
149 if (a) s += 'F' + gcap_string(a);
150
151 if (s.length() == 0)
152 s = "-";
153 return s;
154}
155
156
157struct scatter_info_t {
94b18763 158 version_t version = 0;
7c673cae 159
94b18763 160 scatter_info_t() {}
7c673cae
FG
161};
162
163struct frag_info_t : public scatter_info_t {
164 // this frag
165 utime_t mtime;
94b18763
FG
166 uint64_t change_attr = 0;
167 int64_t nfiles = 0; // files
168 int64_t nsubdirs = 0; // subdirs
7c673cae 169
94b18763 170 frag_info_t() {}
7c673cae
FG
171
172 int64_t size() const { return nfiles + nsubdirs; }
173
174 void zero() {
175 *this = frag_info_t();
176 }
177
178 // *this += cur - acc;
179 void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
180 if (cur.mtime > mtime) {
181 mtime = cur.mtime;
182 if (touched_mtime)
183 *touched_mtime = true;
184 }
185 if (cur.change_attr > change_attr) {
186 change_attr = cur.change_attr;
187 if (touched_chattr)
188 *touched_chattr = true;
189 }
190 nfiles += cur.nfiles - acc.nfiles;
191 nsubdirs += cur.nsubdirs - acc.nsubdirs;
192 }
193
194 void add(const frag_info_t& other) {
195 if (other.mtime > mtime)
196 mtime = other.mtime;
197 if (other.change_attr > change_attr)
198 change_attr = other.change_attr;
199 nfiles += other.nfiles;
200 nsubdirs += other.nsubdirs;
201 }
202
203 bool same_sums(const frag_info_t &o) const {
204 return mtime <= o.mtime &&
205 nfiles == o.nfiles &&
206 nsubdirs == o.nsubdirs;
207 }
208
209 void encode(bufferlist &bl) const;
11fdf7f2 210 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
211 void dump(Formatter *f) const;
212 static void generate_test_instances(list<frag_info_t*>& ls);
213};
214WRITE_CLASS_ENCODER(frag_info_t)
215
216inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
217 return memcmp(&l, &r, sizeof(l)) == 0;
218}
219inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
220 return !(l == r);
221}
222
223std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
224
225
226struct nest_info_t : public scatter_info_t {
227 // this frag + children
228 utime_t rctime;
94b18763
FG
229 int64_t rbytes = 0;
230 int64_t rfiles = 0;
231 int64_t rsubdirs = 0;
7c673cae
FG
232 int64_t rsize() const { return rfiles + rsubdirs; }
233
11fdf7f2 234 int64_t rsnaps = 0;
7c673cae 235
94b18763 236 nest_info_t() {}
7c673cae
FG
237
238 void zero() {
239 *this = nest_info_t();
240 }
241
242 void sub(const nest_info_t &other) {
243 add(other, -1);
244 }
245 void add(const nest_info_t &other, int fac=1) {
246 if (other.rctime > rctime)
247 rctime = other.rctime;
248 rbytes += fac*other.rbytes;
249 rfiles += fac*other.rfiles;
250 rsubdirs += fac*other.rsubdirs;
11fdf7f2 251 rsnaps += fac*other.rsnaps;
7c673cae
FG
252 }
253
254 // *this += cur - acc;
255 void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
256 if (cur.rctime > rctime)
257 rctime = cur.rctime;
258 rbytes += cur.rbytes - acc.rbytes;
259 rfiles += cur.rfiles - acc.rfiles;
260 rsubdirs += cur.rsubdirs - acc.rsubdirs;
11fdf7f2 261 rsnaps += cur.rsnaps - acc.rsnaps;
7c673cae
FG
262 }
263
264 bool same_sums(const nest_info_t &o) const {
265 return rctime <= o.rctime &&
266 rbytes == o.rbytes &&
267 rfiles == o.rfiles &&
268 rsubdirs == o.rsubdirs &&
11fdf7f2 269 rsnaps == o.rsnaps;
7c673cae
FG
270 }
271
272 void encode(bufferlist &bl) const;
11fdf7f2 273 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
274 void dump(Formatter *f) const;
275 static void generate_test_instances(list<nest_info_t*>& ls);
276};
277WRITE_CLASS_ENCODER(nest_info_t)
278
279inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
280 return memcmp(&l, &r, sizeof(l)) == 0;
281}
282inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
283 return !(l == r);
284}
285
286std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
287
288
289struct vinodeno_t {
290 inodeno_t ino;
291 snapid_t snapid;
292 vinodeno_t() {}
293 vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
294
295 void encode(bufferlist& bl) const {
11fdf7f2
TL
296 using ceph::encode;
297 encode(ino, bl);
298 encode(snapid, bl);
7c673cae 299 }
11fdf7f2
TL
300 void decode(bufferlist::const_iterator& p) {
301 using ceph::decode;
302 decode(ino, p);
303 decode(snapid, p);
7c673cae
FG
304 }
305};
306WRITE_CLASS_ENCODER(vinodeno_t)
307
308inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
309 return l.ino == r.ino && l.snapid == r.snapid;
310}
311inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
312 return !(l == r);
313}
314inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
315 return
316 l.ino < r.ino ||
317 (l.ino == r.ino && l.snapid < r.snapid);
318}
319
320struct quota_info_t
321{
94b18763
FG
322 int64_t max_bytes = 0;
323 int64_t max_files = 0;
7c673cae 324
94b18763 325 quota_info_t() {}
7c673cae
FG
326
327 void encode(bufferlist& bl) const {
328 ENCODE_START(1, 1, bl);
11fdf7f2
TL
329 encode(max_bytes, bl);
330 encode(max_files, bl);
7c673cae
FG
331 ENCODE_FINISH(bl);
332 }
11fdf7f2 333 void decode(bufferlist::const_iterator& p) {
7c673cae 334 DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
11fdf7f2
TL
335 decode(max_bytes, p);
336 decode(max_files, p);
7c673cae
FG
337 DECODE_FINISH(p);
338 }
339
340 void dump(Formatter *f) const;
341 static void generate_test_instances(list<quota_info_t *>& ls);
342
343 bool is_valid() const {
344 return max_bytes >=0 && max_files >=0;
345 }
346 bool is_enable() const {
347 return max_bytes || max_files;
348 }
349};
350WRITE_CLASS_ENCODER(quota_info_t)
351
352inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
353 return memcmp(&l, &r, sizeof(l)) == 0;
354}
355
356ostream& operator<<(ostream &out, const quota_info_t &n);
357
358namespace std {
359 template<> struct hash<vinodeno_t> {
360 size_t operator()(const vinodeno_t &vino) const {
361 hash<inodeno_t> H;
362 hash<uint64_t> I;
363 return H(vino.ino) ^ I(vino.snapid);
364 }
365 };
366} // namespace std
367
368
369
370
371inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) {
372 out << vino.ino;
373 if (vino.snapid == CEPH_NOSNAP)
374 out << ".head";
375 else if (vino.snapid)
376 out << '.' << vino.snapid;
377 return out;
378}
379
380
381/*
382 * client_writeable_range_t
383 */
384struct client_writeable_range_t {
385 struct byte_range_t {
94b18763
FG
386 uint64_t first = 0, last = 0; // interval client can write to
387 byte_range_t() {}
7c673cae
FG
388 };
389
390 byte_range_t range;
94b18763 391 snapid_t follows = 0; // aka "data+metadata flushed thru"
7c673cae 392
94b18763 393 client_writeable_range_t() {}
7c673cae
FG
394
395 void encode(bufferlist &bl) const;
11fdf7f2 396 void decode(bufferlist::const_iterator& bl);
7c673cae 397 void dump(Formatter *f) const;
94b18763 398 static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
7c673cae
FG
399};
400
11fdf7f2
TL
401inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::const_iterator& bl) {
402 decode(range.first, bl);
403 decode(range.last, bl);
7c673cae
FG
404}
405
406WRITE_CLASS_ENCODER(client_writeable_range_t)
407
408std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
409
410inline bool operator==(const client_writeable_range_t& l,
411 const client_writeable_range_t& r) {
412 return l.range.first == r.range.first && l.range.last == r.range.last &&
413 l.follows == r.follows;
414}
415
416struct inline_data_t {
417private:
418 std::unique_ptr<bufferlist> blp;
419public:
94b18763 420 version_t version = 1;
7c673cae
FG
421
422 void free_data() {
423 blp.reset();
424 }
425 bufferlist& get_data() {
426 if (!blp)
427 blp.reset(new bufferlist);
428 return *blp;
429 }
430 size_t length() const { return blp ? blp->length() : 0; }
431
94b18763 432 inline_data_t() {}
7c673cae
FG
433 inline_data_t(const inline_data_t& o) : version(o.version) {
434 if (o.blp)
435 get_data() = *o.blp;
436 }
437 inline_data_t& operator=(const inline_data_t& o) {
438 version = o.version;
439 if (o.blp)
440 get_data() = *o.blp;
441 else
442 free_data();
443 return *this;
444 }
445 bool operator==(const inline_data_t& o) const {
446 return length() == o.length() &&
447 (length() == 0 ||
448 (*const_cast<bufferlist*>(blp.get()) == *const_cast<bufferlist*>(o.blp.get())));
449 }
450 bool operator!=(const inline_data_t& o) const {
451 return !(*this == o);
452 }
453 void encode(bufferlist &bl) const;
11fdf7f2 454 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
455};
456WRITE_CLASS_ENCODER(inline_data_t)
457
// Kinds of damage that can be recorded; the values are the implicit
// sequential ones, spelled out here.
enum {
  DAMAGE_STATS = 0,     // statistics (dirstat, size, etc)
  DAMAGE_RSTATS = 1,    // recursive statistics (rstat, accounted_rstat)
  DAMAGE_FRAGTREE = 2   // fragtree -- repair by searching
};
typedef uint32_t damage_flags_t;
464
465/*
466 * inode_t
467 */
94b18763 468template<template<typename> class Allocator = std::allocator>
7c673cae
FG
469struct inode_t {
470 /**
471 * ***************
472 * Do not forget to add any new fields to the compare() function.
473 * ***************
474 */
475 // base (immutable)
94b18763
FG
476 inodeno_t ino = 0;
477 uint32_t rdev = 0; // if special file
7c673cae
FG
478
479 // affected by any inode change...
480 utime_t ctime; // inode change time
481 utime_t btime; // birth time
482
483 // perm (namespace permissions)
94b18763
FG
484 uint32_t mode = 0;
485 uid_t uid = 0;
486 gid_t gid = 0;
7c673cae
FG
487
488 // nlink
94b18763 489 int32_t nlink = 0;
7c673cae
FG
490
491 // file (data access)
492 ceph_dir_layout dir_layout; // [dir only]
493 file_layout_t layout;
94b18763
FG
494 compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
495 uint64_t size = 0; // on directory, # dentries
496 uint64_t max_size_ever = 0; // max size the file has ever been
497 uint32_t truncate_seq = 0;
498 uint64_t truncate_size = 0, truncate_from = 0;
499 uint32_t truncate_pending = 0;
7c673cae
FG
500 utime_t mtime; // file data modify time.
501 utime_t atime; // file data access time.
94b18763
FG
502 uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes())
503 inline_data_t inline_data; // FIXME check
7c673cae
FG
504
505 // change attribute
94b18763 506 uint64_t change_attr = 0;
7c673cae 507
94b18763
FG
508 using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
509 client_range_map client_ranges; // client(s) can write to these ranges
7c673cae
FG
510
511 // dirfrag, recursive accountin
512 frag_info_t dirstat; // protected by my filelock
513 nest_info_t rstat; // protected by my nestlock
514 nest_info_t accounted_rstat; // protected by parent's nestlock
515
516 quota_info_t quota;
517
94b18763 518 mds_rank_t export_pin = MDS_RANK_NONE;
7c673cae
FG
519
520 // special stuff
94b18763
FG
521 version_t version = 0; // auth only
522 version_t file_data_version = 0; // auth only
523 version_t xattr_version = 0;
7c673cae
FG
524
525 utime_t last_scrub_stamp; // start time of last complete scrub
94b18763 526 version_t last_scrub_version = 0;// (parent) start version of last complete scrub
7c673cae 527
94b18763 528 version_t backtrace_version = 0;
7c673cae
FG
529
530 snapid_t oldest_snap;
531
94b18763
FG
532 std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
533
534 inode_t()
535 {
7c673cae
FG
536 clear_layout();
537 memset(&dir_layout, 0, sizeof(dir_layout));
7c673cae
FG
538 }
539
540 // file type
541 bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
542 bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; }
543 bool is_file() const { return (mode & S_IFMT) == S_IFREG; }
544
545 bool is_truncating() const { return (truncate_pending > 0); }
546 void truncate(uint64_t old_size, uint64_t new_size) {
11fdf7f2 547 ceph_assert(new_size < old_size);
7c673cae
FG
548 if (old_size > max_size_ever)
549 max_size_ever = old_size;
550 truncate_from = old_size;
551 size = new_size;
552 rstat.rbytes = new_size;
553 truncate_size = size;
554 truncate_seq++;
555 truncate_pending++;
556 }
557
558 bool has_layout() const {
559 return layout != file_layout_t();
560 }
561
562 void clear_layout() {
563 layout = file_layout_t();
564 }
565
566 uint64_t get_layout_size_increment() const {
567 return layout.get_period();
568 }
569
570 bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
571
572 uint64_t get_max_size() const {
573 uint64_t max = 0;
574 for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin();
575 p != client_ranges.end();
576 ++p)
577 if (p->second.range.last > max)
578 max = p->second.range.last;
579 return max;
580 }
581 void set_max_size(uint64_t new_max) {
582 if (new_max == 0) {
583 client_ranges.clear();
584 } else {
585 for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin();
586 p != client_ranges.end();
587 ++p)
588 p->second.range.last = new_max;
589 }
590 }
591
592 void trim_client_ranges(snapid_t last) {
593 std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin();
594 while (p != client_ranges.end()) {
595 if (p->second.follows >= last)
596 client_ranges.erase(p++);
597 else
598 ++p;
599 }
600 }
601
602 bool is_backtrace_updated() const {
603 return backtrace_version == version;
604 }
605 void update_backtrace(version_t pv=0) {
606 backtrace_version = pv ? pv : version;
607 }
608
609 void add_old_pool(int64_t l) {
610 backtrace_version = version;
611 old_pools.insert(l);
612 }
613
614 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 615 void decode(bufferlist::const_iterator& bl);
7c673cae 616 void dump(Formatter *f) const;
94b18763 617 static void generate_test_instances(std::list<inode_t*>& ls);
7c673cae
FG
618 /**
619 * Compare this inode_t with another that represent *the same inode*
620 * at different points in time.
621 * @pre The inodes are the same ino
622 *
623 * @param other The inode_t to compare ourselves with
624 * @param divergent A bool pointer which will be set to true
625 * if the values are different in a way that can't be explained
626 * by one being a newer version than the other.
627 *
628 * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
629 */
630 int compare(const inode_t &other, bool *divergent) const;
631private:
632 bool older_is_consistent(const inode_t &other) const;
633};
7c673cae 634
94b18763
FG
635// These methods may be moved back to mdstypes.cc when we have pmr
636template<template<typename> class Allocator>
637void inode_t<Allocator>::encode(bufferlist &bl, uint64_t features) const
638{
639 ENCODE_START(15, 6, bl);
640
11fdf7f2
TL
641 encode(ino, bl);
642 encode(rdev, bl);
643 encode(ctime, bl);
94b18763 644
11fdf7f2
TL
645 encode(mode, bl);
646 encode(uid, bl);
647 encode(gid, bl);
94b18763 648
11fdf7f2 649 encode(nlink, bl);
94b18763
FG
650 {
651 // removed field
652 bool anchored = 0;
11fdf7f2 653 encode(anchored, bl);
94b18763
FG
654 }
655
11fdf7f2
TL
656 encode(dir_layout, bl);
657 encode(layout, bl, features);
658 encode(size, bl);
659 encode(truncate_seq, bl);
660 encode(truncate_size, bl);
661 encode(truncate_from, bl);
662 encode(truncate_pending, bl);
663 encode(mtime, bl);
664 encode(atime, bl);
665 encode(time_warp_seq, bl);
666 encode(client_ranges, bl);
94b18763 667
11fdf7f2
TL
668 encode(dirstat, bl);
669 encode(rstat, bl);
670 encode(accounted_rstat, bl);
94b18763 671
11fdf7f2
TL
672 encode(version, bl);
673 encode(file_data_version, bl);
674 encode(xattr_version, bl);
675 encode(backtrace_version, bl);
676 encode(old_pools, bl);
677 encode(max_size_ever, bl);
678 encode(inline_data, bl);
679 encode(quota, bl);
94b18763 680
11fdf7f2 681 encode(stray_prior_path, bl);
94b18763 682
11fdf7f2
TL
683 encode(last_scrub_version, bl);
684 encode(last_scrub_stamp, bl);
94b18763 685
11fdf7f2
TL
686 encode(btime, bl);
687 encode(change_attr, bl);
94b18763 688
11fdf7f2 689 encode(export_pin, bl);
94b18763
FG
690
691 ENCODE_FINISH(bl);
692}
693
694template<template<typename> class Allocator>
11fdf7f2 695void inode_t<Allocator>::decode(bufferlist::const_iterator &p)
94b18763
FG
696{
697 DECODE_START_LEGACY_COMPAT_LEN(15, 6, 6, p);
698
11fdf7f2
TL
699 decode(ino, p);
700 decode(rdev, p);
701 decode(ctime, p);
94b18763 702
11fdf7f2
TL
703 decode(mode, p);
704 decode(uid, p);
705 decode(gid, p);
94b18763 706
11fdf7f2 707 decode(nlink, p);
94b18763
FG
708 {
709 bool anchored;
11fdf7f2 710 decode(anchored, p);
94b18763
FG
711 }
712
713 if (struct_v >= 4)
11fdf7f2 714 decode(dir_layout, p);
94b18763
FG
715 else
716 memset(&dir_layout, 0, sizeof(dir_layout));
11fdf7f2
TL
717 decode(layout, p);
718 decode(size, p);
719 decode(truncate_seq, p);
720 decode(truncate_size, p);
721 decode(truncate_from, p);
94b18763 722 if (struct_v >= 5)
11fdf7f2 723 decode(truncate_pending, p);
94b18763
FG
724 else
725 truncate_pending = 0;
11fdf7f2
TL
726 decode(mtime, p);
727 decode(atime, p);
728 decode(time_warp_seq, p);
94b18763 729 if (struct_v >= 3) {
11fdf7f2 730 decode(client_ranges, p);
94b18763
FG
731 } else {
732 map<client_t, client_writeable_range_t::byte_range_t> m;
11fdf7f2 733 decode(m, p);
94b18763
FG
734 for (map<client_t, client_writeable_range_t::byte_range_t>::iterator
735 q = m.begin(); q != m.end(); ++q)
736 client_ranges[q->first].range = q->second;
737 }
738
11fdf7f2
TL
739 decode(dirstat, p);
740 decode(rstat, p);
741 decode(accounted_rstat, p);
94b18763 742
11fdf7f2
TL
743 decode(version, p);
744 decode(file_data_version, p);
745 decode(xattr_version, p);
94b18763 746 if (struct_v >= 2)
11fdf7f2 747 decode(backtrace_version, p);
94b18763 748 if (struct_v >= 7)
11fdf7f2 749 decode(old_pools, p);
94b18763 750 if (struct_v >= 8)
11fdf7f2 751 decode(max_size_ever, p);
94b18763 752 if (struct_v >= 9) {
11fdf7f2 753 decode(inline_data, p);
94b18763
FG
754 } else {
755 inline_data.version = CEPH_INLINE_NONE;
756 }
757 if (struct_v < 10)
758 backtrace_version = 0; // force update backtrace
759 if (struct_v >= 11)
11fdf7f2 760 decode(quota, p);
94b18763
FG
761
762 if (struct_v >= 12) {
763 std::string tmp;
11fdf7f2
TL
764 decode(tmp, p);
765 stray_prior_path = std::string_view(tmp);
94b18763
FG
766 }
767
768 if (struct_v >= 13) {
11fdf7f2
TL
769 decode(last_scrub_version, p);
770 decode(last_scrub_stamp, p);
94b18763
FG
771 }
772 if (struct_v >= 14) {
11fdf7f2
TL
773 decode(btime, p);
774 decode(change_attr, p);
94b18763
FG
775 } else {
776 btime = utime_t();
777 change_attr = 0;
778 }
779
780 if (struct_v >= 15) {
11fdf7f2 781 decode(export_pin, p);
94b18763
FG
782 } else {
783 export_pin = MDS_RANK_NONE;
784 }
785
786 DECODE_FINISH(p);
787}
788
789template<template<typename> class Allocator>
790void inode_t<Allocator>::dump(Formatter *f) const
791{
792 f->dump_unsigned("ino", ino);
793 f->dump_unsigned("rdev", rdev);
794 f->dump_stream("ctime") << ctime;
795 f->dump_stream("btime") << btime;
796 f->dump_unsigned("mode", mode);
797 f->dump_unsigned("uid", uid);
798 f->dump_unsigned("gid", gid);
799 f->dump_unsigned("nlink", nlink);
800
801 f->open_object_section("dir_layout");
802 ::dump(dir_layout, f);
803 f->close_section();
804
805 f->dump_object("layout", layout);
806
807 f->open_array_section("old_pools");
808 for (const auto &p : old_pools) {
809 f->dump_int("pool", p);
810 }
811 f->close_section();
812
813 f->dump_unsigned("size", size);
814 f->dump_unsigned("truncate_seq", truncate_seq);
815 f->dump_unsigned("truncate_size", truncate_size);
816 f->dump_unsigned("truncate_from", truncate_from);
817 f->dump_unsigned("truncate_pending", truncate_pending);
818 f->dump_stream("mtime") << mtime;
819 f->dump_stream("atime") << atime;
820 f->dump_unsigned("time_warp_seq", time_warp_seq);
821 f->dump_unsigned("change_attr", change_attr);
822 f->dump_int("export_pin", export_pin);
823
824 f->open_array_section("client_ranges");
825 for (const auto &p : client_ranges) {
826 f->open_object_section("client");
827 f->dump_unsigned("client", p.first.v);
828 p.second.dump(f);
829 f->close_section();
830 }
831 f->close_section();
832
833 f->open_object_section("dirstat");
834 dirstat.dump(f);
835 f->close_section();
836
837 f->open_object_section("rstat");
838 rstat.dump(f);
839 f->close_section();
840
841 f->open_object_section("accounted_rstat");
842 accounted_rstat.dump(f);
843 f->close_section();
844
845 f->dump_unsigned("version", version);
846 f->dump_unsigned("file_data_version", file_data_version);
847 f->dump_unsigned("xattr_version", xattr_version);
848 f->dump_unsigned("backtrace_version", backtrace_version);
849
850 f->dump_string("stray_prior_path", stray_prior_path);
851}
852
853template<template<typename> class Allocator>
854void inode_t<Allocator>::generate_test_instances(list<inode_t*>& ls)
855{
856 ls.push_back(new inode_t<Allocator>);
857 ls.push_back(new inode_t<Allocator>);
858 ls.back()->ino = 1;
859 // i am lazy.
860}
861
862template<template<typename> class Allocator>
863int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
864{
11fdf7f2 865 ceph_assert(ino == other.ino);
94b18763
FG
866 *divergent = false;
867 if (version == other.version) {
868 if (rdev != other.rdev ||
869 ctime != other.ctime ||
870 btime != other.btime ||
871 mode != other.mode ||
872 uid != other.uid ||
873 gid != other.gid ||
874 nlink != other.nlink ||
875 memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
876 layout != other.layout ||
877 old_pools != other.old_pools ||
878 size != other.size ||
879 max_size_ever != other.max_size_ever ||
880 truncate_seq != other.truncate_seq ||
881 truncate_size != other.truncate_size ||
882 truncate_from != other.truncate_from ||
883 truncate_pending != other.truncate_pending ||
884 change_attr != other.change_attr ||
885 mtime != other.mtime ||
886 atime != other.atime ||
887 time_warp_seq != other.time_warp_seq ||
888 inline_data != other.inline_data ||
889 client_ranges != other.client_ranges ||
890 !(dirstat == other.dirstat) ||
891 !(rstat == other.rstat) ||
892 !(accounted_rstat == other.accounted_rstat) ||
893 file_data_version != other.file_data_version ||
894 xattr_version != other.xattr_version ||
895 backtrace_version != other.backtrace_version) {
896 *divergent = true;
897 }
898 return 0;
899 } else if (version > other.version) {
900 *divergent = !older_is_consistent(other);
901 return 1;
902 } else {
11fdf7f2 903 ceph_assert(version < other.version);
94b18763
FG
904 *divergent = !other.older_is_consistent(*this);
905 return -1;
906 }
907}
908
909template<template<typename> class Allocator>
910bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
911{
912 if (max_size_ever < other.max_size_ever ||
913 truncate_seq < other.truncate_seq ||
914 time_warp_seq < other.time_warp_seq ||
915 inline_data.version < other.inline_data.version ||
916 dirstat.version < other.dirstat.version ||
917 rstat.version < other.rstat.version ||
918 accounted_rstat.version < other.accounted_rstat.version ||
919 file_data_version < other.file_data_version ||
920 xattr_version < other.xattr_version ||
921 backtrace_version < other.backtrace_version) {
922 return false;
923 }
924 return true;
925}
926
927template<template<typename> class Allocator>
928inline void encode(const inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
929{
930 ENCODE_DUMP_PRE();
931 c.encode(bl, features);
932 ENCODE_DUMP_POST(cl);
933}
934template<template<typename> class Allocator>
11fdf7f2 935inline void decode(inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
94b18763
FG
936{
937 c.decode(p);
938}
939
940template<template<typename> class Allocator>
941using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>;
942
943template<template<typename> class Allocator>
944using xattr_map = compact_map<alloc_string<Allocator>, bufferptr, std::less<alloc_string<Allocator>>, Allocator<std::pair<const alloc_string<Allocator>, bufferptr>>>; // FIXME bufferptr not in mempool
7c673cae
FG
945
946/*
947 * old_inode_t
948 */
94b18763 949template<template<typename> class Allocator = std::allocator>
7c673cae
FG
950struct old_inode_t {
951 snapid_t first;
94b18763
FG
952 inode_t<Allocator> inode;
953 xattr_map<Allocator> xattrs;
7c673cae
FG
954
955 void encode(bufferlist &bl, uint64_t features) const;
11fdf7f2 956 void decode(bufferlist::const_iterator& bl);
7c673cae 957 void dump(Formatter *f) const;
94b18763 958 static void generate_test_instances(std::list<old_inode_t*>& ls);
7c673cae 959};
94b18763
FG
960
961// These methods may be moved back to mdstypes.cc when we have pmr
962template<template<typename> class Allocator>
963void old_inode_t<Allocator>::encode(bufferlist& bl, uint64_t features) const
964{
965 ENCODE_START(2, 2, bl);
11fdf7f2
TL
966 encode(first, bl);
967 encode(inode, bl, features);
968 encode(xattrs, bl);
94b18763
FG
969 ENCODE_FINISH(bl);
970}
971
972template<template<typename> class Allocator>
11fdf7f2 973void old_inode_t<Allocator>::decode(bufferlist::const_iterator& bl)
94b18763
FG
974{
975 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
11fdf7f2
TL
976 decode(first, bl);
977 decode(inode, bl);
978 decode(xattrs, bl);
94b18763
FG
979 DECODE_FINISH(bl);
980}
981
982template<template<typename> class Allocator>
983void old_inode_t<Allocator>::dump(Formatter *f) const
984{
985 f->dump_unsigned("first", first);
986 inode.dump(f);
987 f->open_object_section("xattrs");
988 for (const auto &p : xattrs) {
989 std::string v(p.second.c_str(), p.second.length());
990 f->dump_string(p.first.c_str(), v);
991 }
992 f->close_section();
993}
994
995template<template<typename> class Allocator>
996void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls)
997{
998 ls.push_back(new old_inode_t<Allocator>);
999 ls.push_back(new old_inode_t<Allocator>);
1000 ls.back()->first = 2;
1001 std::list<inode_t<Allocator>*> ils;
1002 inode_t<Allocator>::generate_test_instances(ils);
1003 ls.back()->inode = *ils.back();
1004 ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4);
1005 ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3);
1006}
1007
1008template<template<typename> class Allocator>
1009inline void encode(const old_inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
1010{
1011 ENCODE_DUMP_PRE();
1012 c.encode(bl, features);
1013 ENCODE_DUMP_POST(cl);
1014}
1015template<template<typename> class Allocator>
11fdf7f2 1016inline void decode(old_inode_t<Allocator> &c, ::ceph::bufferlist::const_iterator &p)
94b18763
FG
1017{
1018 c.decode(p);
1019}
7c673cae
FG
1020
1021
1022/*
1023 * like an inode, but for a dir frag
1024 */
1025struct fnode_t {
94b18763 1026 version_t version = 0;
7c673cae
FG
1027 snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru
1028 frag_info_t fragstat, accounted_fragstat;
1029 nest_info_t rstat, accounted_rstat;
94b18763 1030 damage_flags_t damage_flags = 0;
7c673cae
FG
1031
1032 // we know we and all our descendants have been scrubbed since this version
94b18763 1033 version_t recursive_scrub_version = 0;
7c673cae
FG
1034 utime_t recursive_scrub_stamp;
1035 // version at which we last scrubbed our personal data structures
94b18763 1036 version_t localized_scrub_version = 0;
7c673cae
FG
1037 utime_t localized_scrub_stamp;
1038
1039 void encode(bufferlist &bl) const;
11fdf7f2 1040 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
1041 void dump(Formatter *f) const;
1042 static void generate_test_instances(list<fnode_t*>& ls);
94b18763 1043 fnode_t() {}
7c673cae
FG
1044};
1045WRITE_CLASS_ENCODER(fnode_t)
1046
1047
1048struct old_rstat_t {
1049 snapid_t first;
1050 nest_info_t rstat, accounted_rstat;
1051
1052 void encode(bufferlist& bl) const;
11fdf7f2 1053 void decode(bufferlist::const_iterator& p);
7c673cae
FG
1054 void dump(Formatter *f) const;
1055 static void generate_test_instances(list<old_rstat_t*>& ls);
1056};
1057WRITE_CLASS_ENCODER(old_rstat_t)
1058
1059inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
1060 return out << "old_rstat(first " << o.first << " " << o.rstat << " " << o.accounted_rstat << ")";
1061}
1062
11fdf7f2
TL
1063/*
1064 * feature_bitset_t
1065 */
1066class feature_bitset_t {
1067public:
1068 typedef uint64_t block_type;
1069 static const size_t bits_per_block = sizeof(block_type) * 8;
1070
1071 feature_bitset_t(const feature_bitset_t& other) : _vec(other._vec) {}
1072 feature_bitset_t(feature_bitset_t&& other) : _vec(std::move(other._vec)) {}
1073 feature_bitset_t(unsigned long value = 0);
1074 feature_bitset_t(const vector<size_t>& array);
1075 feature_bitset_t& operator=(const feature_bitset_t& other) {
1076 _vec = other._vec;
1077 return *this;
1078 }
1079 feature_bitset_t& operator=(feature_bitset_t&& other) {
1080 _vec = std::move(other._vec);
1081 return *this;
1082 }
1083 bool empty() const {
1084 for (auto& v : _vec) {
1085 if (v)
1086 return false;
1087 }
1088 return true;
1089 }
1090 bool test(size_t bit) const {
1091 if (bit >= bits_per_block * _vec.size())
1092 return false;
1093 return _vec[bit / bits_per_block] & ((block_type)1 << (bit % bits_per_block));
1094 }
1095 void clear() {
1096 _vec.clear();
1097 }
1098 feature_bitset_t& operator-=(const feature_bitset_t& other);
1099 void encode(bufferlist& bl) const;
1100 void decode(bufferlist::const_iterator &p);
1101 void print(ostream& out) const;
1102private:
1103 vector<block_type> _vec;
1104};
1105WRITE_CLASS_ENCODER(feature_bitset_t)
1106
1107inline std::ostream& operator<<(std::ostream& out, const feature_bitset_t& s) {
1108 s.print(out);
1109 return out;
1110}
1111
1112/*
1113 * client_metadata_t
1114 */
1115struct client_metadata_t {
1116 using kv_map_t = std::map<std::string,std::string>;
1117 using iterator = kv_map_t::const_iterator;
1118
1119 kv_map_t kv_map;
1120 feature_bitset_t features;
1121
1122 client_metadata_t() {}
1123 client_metadata_t(const client_metadata_t& other) :
1124 kv_map(other.kv_map), features(other.features) {}
1125 client_metadata_t(client_metadata_t&& other) :
1126 kv_map(std::move(other.kv_map)), features(std::move(other.features)) {}
1127 client_metadata_t(kv_map_t&& kv, feature_bitset_t &&f) :
1128 kv_map(std::move(kv)), features(std::move(f)) {}
1129 client_metadata_t(const kv_map_t& kv, const feature_bitset_t &f) :
1130 kv_map(kv), features(f) {}
1131 client_metadata_t& operator=(const client_metadata_t& other) {
1132 kv_map = other.kv_map;
1133 features = other.features;
1134 return *this;
1135 }
1136
1137 bool empty() const { return kv_map.empty() && features.empty(); }
1138 iterator find(const std::string& key) const { return kv_map.find(key); }
1139 iterator begin() const { return kv_map.begin(); }
1140 iterator end() const { return kv_map.end(); }
1141 std::string& operator[](const std::string& key) { return kv_map[key]; }
1142 void merge(const client_metadata_t& other) {
1143 kv_map.insert(other.kv_map.begin(), other.kv_map.end());
1144 features = other.features;
1145 }
1146 void clear() {
1147 kv_map.clear();
1148 features.clear();
1149 }
1150
1151 void encode(bufferlist& bl) const;
1152 void decode(bufferlist::const_iterator& p);
1153 void dump(Formatter *f) const;
1154};
1155WRITE_CLASS_ENCODER(client_metadata_t)
7c673cae
FG
1156
1157/*
1158 * session_info_t
1159 */
7c673cae
FG
1160struct session_info_t {
1161 entity_inst_t inst;
1162 std::map<ceph_tid_t,inodeno_t> completed_requests;
1163 interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use.
1164 interval_set<inodeno_t> used_inos; // journaling use
11fdf7f2 1165 client_metadata_t client_metadata;
7c673cae
FG
1166 std::set<ceph_tid_t> completed_flushes;
1167 EntityName auth_name;
1168
1169 client_t get_client() const { return client_t(inst.name.num()); }
11fdf7f2 1170 bool has_feature(size_t bit) const { return client_metadata.features.test(bit); }
7c673cae
FG
1171 const entity_name_t& get_source() const { return inst.name; }
1172
1173 void clear_meta() {
1174 prealloc_inos.clear();
1175 used_inos.clear();
1176 completed_requests.clear();
1177 completed_flushes.clear();
11fdf7f2 1178 client_metadata.clear();
7c673cae
FG
1179 }
1180
1181 void encode(bufferlist& bl, uint64_t features) const;
11fdf7f2 1182 void decode(bufferlist::const_iterator& p);
7c673cae
FG
1183 void dump(Formatter *f) const;
1184 static void generate_test_instances(list<session_info_t*>& ls);
1185};
1186WRITE_CLASS_ENCODER_FEATURES(session_info_t)
1187
1188
1189// =======
1190// dentries
1191
1192struct dentry_key_t {
94b18763 1193 snapid_t snapid = 0;
11fdf7f2 1194 std::string_view name;
94b18763
FG
1195 __u32 hash = 0;
1196 dentry_key_t() {}
11fdf7f2 1197 dentry_key_t(snapid_t s, std::string_view n, __u32 h=0) :
7c673cae
FG
1198 snapid(s), name(n), hash(h) {}
1199
94b18763 1200 bool is_valid() { return name.length() || snapid; }
7c673cae
FG
1201
1202 // encode into something that can be decoded as a string.
1203 // name_ (head) or name_%x (!head)
1204 void encode(bufferlist& bl) const {
1205 string key;
1206 encode(key);
11fdf7f2
TL
1207 using ceph::encode;
1208 encode(key, bl);
7c673cae
FG
1209 }
1210 void encode(string& key) const {
1211 char b[20];
1212 if (snapid != CEPH_NOSNAP) {
1213 uint64_t val(snapid);
1214 snprintf(b, sizeof(b), "%" PRIx64, val);
1215 } else {
1216 snprintf(b, sizeof(b), "%s", "head");
1217 }
1218 ostringstream oss;
1219 oss << name << "_" << b;
1220 key = oss.str();
1221 }
11fdf7f2 1222 static void decode_helper(bufferlist::const_iterator& bl, string& nm, snapid_t& sn) {
7c673cae 1223 string key;
11fdf7f2 1224 decode(key, bl);
7c673cae
FG
1225 decode_helper(key, nm, sn);
1226 }
11fdf7f2 1227 static void decode_helper(std::string_view key, string& nm, snapid_t& sn) {
7c673cae 1228 size_t i = key.find_last_of('_');
11fdf7f2
TL
1229 ceph_assert(i != string::npos);
1230 if (key.compare(i+1, std::string_view::npos, "head") == 0) {
7c673cae
FG
1231 // name_head
1232 sn = CEPH_NOSNAP;
1233 } else {
1234 // name_%x
1235 long long unsigned x = 0;
94b18763
FG
1236 std::string x_str(key.substr(i+1));
1237 sscanf(x_str.c_str(), "%llx", &x);
7c673cae
FG
1238 sn = x;
1239 }
11fdf7f2 1240 nm = key.substr(0, i);
7c673cae
FG
1241 }
1242};
1243
1244inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
1245{
1246 return out << "(" << k.name << "," << k.snapid << ")";
1247}
1248
1249inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
1250{
1251 /*
1252 * order by hash, name, snap
1253 */
1254 int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
1255 if (c)
1256 return c < 0;
94b18763 1257 c = k1.name.compare(k2.name);
7c673cae
FG
1258 if (c)
1259 return c < 0;
1260 return k1.snapid < k2.snapid;
1261}
1262
1263
1264/*
1265 * string_snap_t is a simple (string, snapid_t) pair
1266 */
1267struct string_snap_t {
1268 string name;
1269 snapid_t snapid;
1270 string_snap_t() {}
11fdf7f2 1271 string_snap_t(std::string_view n, snapid_t s) : name(n), snapid(s) {}
7c673cae
FG
1272
1273 void encode(bufferlist& bl) const;
11fdf7f2 1274 void decode(bufferlist::const_iterator& p);
7c673cae
FG
1275 void dump(Formatter *f) const;
1276 static void generate_test_instances(list<string_snap_t*>& ls);
1277};
1278WRITE_CLASS_ENCODER(string_snap_t)
1279
1280inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
94b18763 1281 int c = l.name.compare(r.name);
7c673cae
FG
1282 return c < 0 || (c == 0 && l.snapid < r.snapid);
1283}
1284
1285inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
1286{
1287 return out << "(" << k.name << "," << k.snapid << ")";
1288}
1289
1290/*
1291 * mds_table_pending_t
1292 *
1293 * mds's requesting any pending ops. child needs to encode the corresponding
1294 * pending mutation state in the table.
1295 */
1296struct mds_table_pending_t {
94b18763
FG
1297 uint64_t reqid = 0;
1298 __s32 mds = 0;
1299 version_t tid = 0;
1300 mds_table_pending_t() {}
7c673cae 1301 void encode(bufferlist& bl) const;
11fdf7f2 1302 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
1303 void dump(Formatter *f) const;
1304 static void generate_test_instances(list<mds_table_pending_t*>& ls);
1305};
1306WRITE_CLASS_ENCODER(mds_table_pending_t)
1307
1308
1309// =========
1310// requests
1311
1312struct metareqid_t {
1313 entity_name_t name;
94b18763
FG
1314 uint64_t tid = 0;
1315 metareqid_t() {}
7c673cae
FG
1316 metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
1317 void encode(bufferlist& bl) const {
11fdf7f2
TL
1318 using ceph::encode;
1319 encode(name, bl);
1320 encode(tid, bl);
7c673cae 1321 }
11fdf7f2
TL
1322 void decode(bufferlist::const_iterator &p) {
1323 using ceph::decode;
1324 decode(name, p);
1325 decode(tid, p);
7c673cae
FG
1326 }
1327};
1328WRITE_CLASS_ENCODER(metareqid_t)
1329
1330inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
1331 return out << r.name << ":" << r.tid;
1332}
1333
1334inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
1335 return (l.name == r.name) && (l.tid == r.tid);
1336}
1337inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
1338 return (l.name != r.name) || (l.tid != r.tid);
1339}
1340inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
1341 return (l.name < r.name) ||
1342 (l.name == r.name && l.tid < r.tid);
1343}
1344inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
1345 return (l.name < r.name) ||
1346 (l.name == r.name && l.tid <= r.tid);
1347}
1348inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); }
1349inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); }
1350
1351namespace std {
1352 template<> struct hash<metareqid_t> {
1353 size_t operator()(const metareqid_t &r) const {
1354 hash<uint64_t> H;
1355 return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid);
1356 }
1357 };
1358} // namespace std
1359
1360
1361// cap info for client reconnect
1362struct cap_reconnect_t {
1363 string path;
1364 mutable ceph_mds_cap_reconnect capinfo;
1365 snapid_t snap_follows;
1366 bufferlist flockbl;
1367
1368 cap_reconnect_t() {
1369 memset(&capinfo, 0, sizeof(capinfo));
1370 snap_follows = 0;
1371 }
11fdf7f2 1372 cap_reconnect_t(uint64_t cap_id, inodeno_t pino, std::string_view p, int w, int i,
7c673cae
FG
1373 inodeno_t sr, snapid_t sf, bufferlist& lb) :
1374 path(p) {
1375 capinfo.cap_id = cap_id;
1376 capinfo.wanted = w;
1377 capinfo.issued = i;
1378 capinfo.snaprealm = sr;
1379 capinfo.pathbase = pino;
1380 capinfo.flock_len = 0;
1381 snap_follows = sf;
1382 flockbl.claim(lb);
1383 }
1384 void encode(bufferlist& bl) const;
11fdf7f2 1385 void decode(bufferlist::const_iterator& bl);
7c673cae 1386 void encode_old(bufferlist& bl) const;
11fdf7f2 1387 void decode_old(bufferlist::const_iterator& bl);
7c673cae
FG
1388
1389 void dump(Formatter *f) const;
1390 static void generate_test_instances(list<cap_reconnect_t*>& ls);
1391};
1392WRITE_CLASS_ENCODER(cap_reconnect_t)
1393
11fdf7f2
TL
1394struct snaprealm_reconnect_t {
1395 mutable ceph_mds_snaprealm_reconnect realm;
1396
1397 snaprealm_reconnect_t() {
1398 memset(&realm, 0, sizeof(realm));
1399 }
1400 snaprealm_reconnect_t(inodeno_t ino, snapid_t seq, inodeno_t parent) {
1401 realm.ino = ino;
1402 realm.seq = seq;
1403 realm.parent = parent;
1404 }
1405 void encode(bufferlist& bl) const;
1406 void decode(bufferlist::const_iterator& bl);
1407 void encode_old(bufferlist& bl) const;
1408 void decode_old(bufferlist::const_iterator& bl);
1409
1410 void dump(Formatter *f) const;
1411 static void generate_test_instances(list<snaprealm_reconnect_t*>& ls);
1412};
1413WRITE_CLASS_ENCODER(snaprealm_reconnect_t)
7c673cae
FG
1414
1415// compat for pre-FLOCK feature
1416struct old_ceph_mds_cap_reconnect {
1417 __le64 cap_id;
1418 __le32 wanted;
1419 __le32 issued;
1420 __le64 old_size;
1421 struct ceph_timespec old_mtime, old_atime;
1422 __le64 snaprealm;
1423 __le64 pathbase; /* base ino for our path to this ino */
1424} __attribute__ ((packed));
1425WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect)
1426
1427struct old_cap_reconnect_t {
1428 string path;
1429 old_ceph_mds_cap_reconnect capinfo;
1430
1431 const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) {
1432 path = n.path;
1433 capinfo.cap_id = n.capinfo.cap_id;
1434 capinfo.wanted = n.capinfo.wanted;
1435 capinfo.issued = n.capinfo.issued;
1436 capinfo.snaprealm = n.capinfo.snaprealm;
1437 capinfo.pathbase = n.capinfo.pathbase;
1438 return *this;
1439 }
1440 operator cap_reconnect_t() {
1441 cap_reconnect_t n;
1442 n.path = path;
1443 n.capinfo.cap_id = capinfo.cap_id;
1444 n.capinfo.wanted = capinfo.wanted;
1445 n.capinfo.issued = capinfo.issued;
1446 n.capinfo.snaprealm = capinfo.snaprealm;
1447 n.capinfo.pathbase = capinfo.pathbase;
1448 return n;
1449 }
1450
1451 void encode(bufferlist& bl) const {
11fdf7f2
TL
1452 using ceph::encode;
1453 encode(path, bl);
1454 encode(capinfo, bl);
7c673cae 1455 }
11fdf7f2
TL
1456 void decode(bufferlist::const_iterator& bl) {
1457 using ceph::decode;
1458 decode(path, bl);
1459 decode(capinfo, bl);
7c673cae
FG
1460 }
1461};
1462WRITE_CLASS_ENCODER(old_cap_reconnect_t)
1463
1464
1465// ================================================================
1466// dir frag
1467
1468struct dirfrag_t {
94b18763 1469 inodeno_t ino = 0;
7c673cae
FG
1470 frag_t frag;
1471
94b18763 1472 dirfrag_t() {}
7c673cae
FG
1473 dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
1474
1475 void encode(bufferlist& bl) const {
11fdf7f2
TL
1476 using ceph::encode;
1477 encode(ino, bl);
1478 encode(frag, bl);
7c673cae 1479 }
11fdf7f2
TL
1480 void decode(bufferlist::const_iterator& bl) {
1481 using ceph::decode;
1482 decode(ino, bl);
1483 decode(frag, bl);
7c673cae
FG
1484 }
1485};
1486WRITE_CLASS_ENCODER(dirfrag_t)
1487
1488
1489inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) {
1490 out << df.ino;
1491 if (!df.frag.is_root()) out << "." << df.frag;
1492 return out;
1493}
1494inline bool operator<(dirfrag_t l, dirfrag_t r) {
1495 if (l.ino < r.ino) return true;
1496 if (l.ino == r.ino && l.frag < r.frag) return true;
1497 return false;
1498}
1499inline bool operator==(dirfrag_t l, dirfrag_t r) {
1500 return l.ino == r.ino && l.frag == r.frag;
1501}
1502
1503namespace std {
1504 template<> struct hash<dirfrag_t> {
1505 size_t operator()(const dirfrag_t &df) const {
1506 static rjhash<uint64_t> H;
1507 static rjhash<uint32_t> I;
1508 return H(df.ino) ^ I(df.frag);
1509 }
1510 };
1511} // namespace std
1512
1513
1514
1515// ================================================================
1516
// Popularity counter kinds.  META_NPOP is the number of kinds.
#define META_POP_IRD      0
#define META_POP_IWR      1
#define META_POP_READDIR  2
#define META_POP_FETCH    3
#define META_POP_STORE    4
#define META_NPOP         5
1523
1524class inode_load_vec_t {
7c673cae 1525public:
11fdf7f2
TL
1526 using time = DecayCounter::time;
1527 using clock = DecayCounter::clock;
1528 static const size_t NUM = 2;
1529
1530 inode_load_vec_t() : vec{DecayCounter(DecayRate()), DecayCounter(DecayRate())} {}
1531 inode_load_vec_t(const DecayRate &rate) : vec{DecayCounter(rate), DecayCounter(rate)} {}
1532
7c673cae 1533 DecayCounter &get(int t) {
7c673cae
FG
1534 return vec[t];
1535 }
11fdf7f2
TL
1536 void zero() {
1537 for (auto &d : vec) {
1538 d.reset();
1539 }
7c673cae
FG
1540 }
1541 void encode(bufferlist &bl) const;
11fdf7f2
TL
1542 void decode(bufferlist::const_iterator& p);
1543 void dump(Formatter *f) const;
7c673cae 1544 static void generate_test_instances(list<inode_load_vec_t*>& ls);
11fdf7f2
TL
1545
1546private:
1547 std::array<DecayCounter, NUM> vec;
7c673cae 1548};
11fdf7f2
TL
1549inline void encode(const inode_load_vec_t &c, bufferlist &bl) {
1550 c.encode(bl);
7c673cae 1551}
11fdf7f2
TL
1552inline void decode(inode_load_vec_t & c, bufferlist::const_iterator &p) {
1553 c.decode(p);
7c673cae
FG
1554}
1555
1556class dirfrag_load_vec_t {
1557public:
11fdf7f2
TL
1558 using time = DecayCounter::time;
1559 using clock = DecayCounter::clock;
1560 static const size_t NUM = 5;
1561
1562 dirfrag_load_vec_t() :
1563 vec{DecayCounter(DecayRate()),
1564 DecayCounter(DecayRate()),
1565 DecayCounter(DecayRate()),
1566 DecayCounter(DecayRate()),
1567 DecayCounter(DecayRate())
1568 }
7c673cae 1569 {}
11fdf7f2
TL
1570 dirfrag_load_vec_t(const DecayRate &rate) :
1571 vec{DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate)}
1572 {}
1573
7c673cae
FG
1574 void encode(bufferlist &bl) const {
1575 ENCODE_START(2, 2, bl);
94b18763 1576 for (const auto &i : vec) {
11fdf7f2 1577 encode(i, bl);
94b18763 1578 }
7c673cae
FG
1579 ENCODE_FINISH(bl);
1580 }
11fdf7f2 1581 void decode(bufferlist::const_iterator &p) {
7c673cae 1582 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
94b18763 1583 for (auto &i : vec) {
11fdf7f2 1584 decode(i, p);
94b18763 1585 }
7c673cae
FG
1586 DECODE_FINISH(p);
1587 }
7c673cae 1588 void dump(Formatter *f) const;
11fdf7f2
TL
1589 void dump(Formatter *f, const DecayRate& rate) const;
1590 static void generate_test_instances(std::list<dirfrag_load_vec_t*>& ls);
7c673cae 1591
11fdf7f2
TL
1592 const DecayCounter &get(int t) const {
1593 return vec[t];
7c673cae 1594 }
11fdf7f2
TL
1595 DecayCounter &get(int t) {
1596 return vec[t];
1597 }
1598 void adjust(double d) {
94b18763 1599 for (auto &i : vec) {
11fdf7f2 1600 i.adjust(d);
94b18763 1601 }
7c673cae 1602 }
11fdf7f2 1603 void zero() {
94b18763 1604 for (auto &i : vec) {
11fdf7f2 1605 i.reset();
94b18763 1606 }
7c673cae 1607 }
28e407b8 1608 double meta_load() const {
7c673cae 1609 return
11fdf7f2
TL
1610 1*vec[META_POP_IRD].get() +
1611 2*vec[META_POP_IWR].get() +
1612 1*vec[META_POP_READDIR].get() +
1613 2*vec[META_POP_FETCH].get() +
1614 4*vec[META_POP_STORE].get();
7c673cae
FG
1615 }
1616
11fdf7f2
TL
1617 void add(dirfrag_load_vec_t& r) {
1618 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1619 vec[i].adjust(r.vec[i].get());
7c673cae 1620 }
11fdf7f2
TL
1621 void sub(dirfrag_load_vec_t& r) {
1622 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1623 vec[i].adjust(-r.vec[i].get());
7c673cae
FG
1624 }
1625 void scale(double f) {
11fdf7f2 1626 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
7c673cae
FG
1627 vec[i].scale(f);
1628 }
11fdf7f2
TL
1629
1630private:
1631 friend inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl);
1632 std::array<DecayCounter, NUM> vec;
7c673cae
FG
1633};
1634
11fdf7f2
TL
1635inline void encode(const dirfrag_load_vec_t &c, bufferlist &bl) {
1636 c.encode(bl);
7c673cae 1637}
11fdf7f2
TL
1638inline void decode(dirfrag_load_vec_t& c, bufferlist::const_iterator &p) {
1639 c.decode(p);
7c673cae
FG
1640}
1641
28e407b8 1642inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
7c673cae 1643{
11fdf7f2
TL
1644 std::ostringstream ss;
1645 ss << std::setprecision(1) << std::fixed
1646 << "[pop"
1647 " IRD:" << dl.vec[0]
1648 << " IWR:" << dl.vec[1]
1649 << " RDR:" << dl.vec[2]
1650 << " FET:" << dl.vec[3]
1651 << " STR:" << dl.vec[4]
1652 << " *LOAD:" << dl.meta_load() << "]";
1653 return out << ss.str() << std::endl;
7c673cae
FG
1654}
1655
1656
7c673cae
FG
1657/* mds_load_t
1658 * mds load
1659 */
1660
1661struct mds_load_t {
11fdf7f2
TL
1662 using clock = dirfrag_load_vec_t::clock;
1663 using time = dirfrag_load_vec_t::time;
1664
7c673cae
FG
1665 dirfrag_load_vec_t auth;
1666 dirfrag_load_vec_t all;
1667
11fdf7f2
TL
1668 mds_load_t() : auth(DecayRate()), all(DecayRate()) {}
1669 mds_load_t(const DecayRate &rate) : auth(rate), all(rate) {}
1670
94b18763
FG
1671 double req_rate = 0.0;
1672 double cache_hit_rate = 0.0;
1673 double queue_len = 0.0;
7c673cae 1674
94b18763 1675 double cpu_load_avg = 0.0;
7c673cae 1676
11fdf7f2 1677 double mds_load() const; // defiend in MDBalancer.cc
7c673cae 1678 void encode(bufferlist& bl) const;
11fdf7f2 1679 void decode(bufferlist::const_iterator& bl);
7c673cae 1680 void dump(Formatter *f) const;
11fdf7f2 1681 static void generate_test_instances(std::list<mds_load_t*>& ls);
7c673cae 1682};
11fdf7f2
TL
1683inline void encode(const mds_load_t &c, bufferlist &bl) {
1684 c.encode(bl);
7c673cae 1685}
11fdf7f2
TL
1686inline void decode(mds_load_t &c, bufferlist::const_iterator &p) {
1687 c.decode(p);
7c673cae
FG
1688}
1689
28e407b8 1690inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
7c673cae
FG
1691{
1692 return out << "mdsload<" << load.auth << "/" << load.all
1693 << ", req " << load.req_rate
1694 << ", hr " << load.cache_hit_rate
1695 << ", qlen " << load.queue_len
1696 << ", cpu " << load.cpu_load_avg
1697 << ">";
1698}
1699
1700class load_spread_t {
1701public:
11fdf7f2
TL
1702 using time = DecayCounter::time;
1703 using clock = DecayCounter::clock;
7c673cae
FG
1704 static const int MAX = 4;
1705 int last[MAX];
94b18763 1706 int p = 0, n = 0;
7c673cae
FG
1707 DecayCounter count;
1708
1709public:
11fdf7f2
TL
1710 load_spread_t() = delete;
1711 load_spread_t(const DecayRate &rate) : count(rate)
7c673cae
FG
1712 {
1713 for (int i=0; i<MAX; i++)
1714 last[i] = -1;
1715 }
1716
11fdf7f2 1717 double hit(int who) {
7c673cae
FG
1718 for (int i=0; i<n; i++)
1719 if (last[i] == who)
1720 return count.get_last();
1721
1722 // we're new(ish)
1723 last[p++] = who;
1724 if (n < MAX) n++;
1725 if (n == 1) return 0.0;
1726
1727 if (p == MAX) p = 0;
1728
11fdf7f2 1729 return count.hit();
7c673cae 1730 }
11fdf7f2
TL
1731 double get() const {
1732 return count.get();
7c673cae
FG
1733 }
1734};
1735
1736
1737
1738// ================================================================
1739typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t;
1740
1741// -- authority delegation --
1742// directory authority types
1743// >= 0 is the auth mds
1744#define CDIR_AUTH_PARENT mds_rank_t(-1) // default
1745#define CDIR_AUTH_UNKNOWN mds_rank_t(-2)
1746#define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
1747#define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
1748//#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
1749
1750class MDSCacheObjectInfo {
1751public:
94b18763 1752 inodeno_t ino = 0;
7c673cae
FG
1753 dirfrag_t dirfrag;
1754 string dname;
1755 snapid_t snapid;
1756
94b18763 1757 MDSCacheObjectInfo() {}
7c673cae
FG
1758
1759 void encode(bufferlist& bl) const;
11fdf7f2 1760 void decode(bufferlist::const_iterator& bl);
7c673cae
FG
1761 void dump(Formatter *f) const;
1762 static void generate_test_instances(list<MDSCacheObjectInfo*>& ls);
1763};
1764
1765inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) {
1766 if (info.ino) return out << info.ino << "." << info.snapid;
1767 if (info.dname.length()) return out << info.dirfrag << "/" << info.dname
1768 << " snap " << info.snapid;
1769 return out << info.dirfrag;
1770}
1771
1772inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) {
1773 if (l.ino || r.ino)
1774 return l.ino == r.ino && l.snapid == r.snapid;
1775 else
1776 return l.dirfrag == r.dirfrag && l.dname == r.dname;
1777}
1778WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
1779
1780
1781// parse a map of keys/values.
1782namespace qi = boost::spirit::qi;
1783
1784template <typename Iterator>
1785struct keys_and_values
1786 : qi::grammar<Iterator, std::map<string, string>()>
1787{
1788 keys_and_values()
1789 : keys_and_values::base_type(query)
1790 {
1791 query = pair >> *(qi::lit(' ') >> pair);
1792 pair = key >> '=' >> value;
1793 key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
1794 value = +qi::char_("a-zA-Z_0-9");
1795 }
1796 qi::rule<Iterator, std::map<string, string>()> query;
1797 qi::rule<Iterator, std::pair<string, string>()> pair;
1798 qi::rule<Iterator, string()> key, value;
1799};
1800
1801#endif