]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/mdstypes.h
import ceph pacific 16.2.5
[ceph.git] / ceph / src / mds / mdstypes.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef CEPH_MDSTYPES_H
4 #define CEPH_MDSTYPES_H
5
6 #include "include/int_types.h"
7
8 #include <ostream>
9 #include <set>
10 #include <map>
11 #include <string_view>
12
13 #include "common/config.h"
14 #include "common/Clock.h"
15 #include "common/DecayCounter.h"
16 #include "common/StackStringStream.h"
17 #include "common/entity_name.h"
18
19 #include "include/compat.h"
20 #include "include/Context.h"
21 #include "include/frag.h"
22 #include "include/xlist.h"
23 #include "include/interval_set.h"
24 #include "include/compact_set.h"
25 #include "include/fs_types.h"
26 #include "include/ceph_fs.h"
27
28 #include "inode_backtrace.h"
29
30 #include <boost/spirit/include/qi.hpp>
31 #include <boost/pool/pool.hpp>
32 #include "include/ceph_assert.h"
33 #include <boost/serialization/strong_typedef.hpp>
34 #include "common/ceph_json.h"
35
#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"

#define MDS_PORT_CACHE 0x200
#define MDS_PORT_LOCKER 0x300
#define MDS_PORT_MIGRATOR 0x400

#define MAX_MDS 0x100
#define NUM_STRAY 10

// Inode numbers 1, 2 and 4: please see CEPH_INO_* in include/ceph_fs.h

// Per-rank inode number ranges. With MAX_MDS == 0x100 and NUM_STRAY == 10:
//   mdsdir inodes occupy [0x100, 0x200)   (one per rank)
//   stray inodes occupy  [0x600, 0x1000)  (NUM_STRAY per rank)
#define MDS_INO_MDSDIR_OFFSET (1*MAX_MDS)
#define MDS_INO_STRAY_OFFSET (6*MAX_MDS)

// Locations for journal data
#define MDS_INO_LOG_OFFSET (2*MAX_MDS)
#define MDS_INO_LOG_BACKUP_OFFSET (3*MAX_MDS)
#define MDS_INO_LOG_POINTER_OFFSET (4*MAX_MDS)
#define MDS_INO_PURGE_QUEUE (5*MAX_MDS)

// First inode number past the stray range.
#define MDS_INO_SYSTEM_BASE ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))

// Inode number of stray directory `i` of rank `x`, and of the mdsdir of rank `x`.
// The argument is parenthesized before the cast so that a compound expression
// (e.g. `a / b`) is converted as a whole rather than only its first operand.
#define MDS_INO_STRAY(x,i) (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
#define MDS_INO_MDSDIR(x) (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))

// Range predicates and inverse mappings. NOTE: these evaluate `i` more than
// once, so do not pass expressions with side effects.
#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
#define MDS_INO_IS_BASE(i) ((i) == CEPH_INO_ROOT || (i) == CEPH_INO_GLOBAL_SNAPREALM || MDS_INO_IS_MDSDIR(i))
#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)

// True for inode numbers in [MDS_INO_MDSDIR_OFFSET, MDS_INO_SYSTEM_BASE).
#define MDS_IS_PRIVATE_INO(i) ((i) < MDS_INO_SYSTEM_BASE && (i) >= MDS_INO_MDSDIR_OFFSET)
69
// Rank of an MDS daemon; negative values are reserved sentinels.
using mds_rank_t = int32_t;
constexpr mds_rank_t MDS_RANK_NONE{-1};
constexpr mds_rank_t MDS_RANK_EPHEMERAL_DIST{-2};
constexpr mds_rank_t MDS_RANK_EPHEMERAL_RAND{-3};
74
// mds_gid_t is declared through Boost's strong-typedef macro over uint64_t.
// MDS_GID_NONE is only declared here; its definition lives outside this header.
BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
extern const mds_gid_t MDS_GID_NONE;
77
// Identifier of a filesystem within the cluster.
using fs_cluster_id_t = int32_t;
constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE{-1};

// The namespace ID of the anonymous default filesystem from legacy systems
constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS{0};
83
84 class mds_role_t {
85 public:
86 mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_)
87 : fscid(fscid_), rank(rank_)
88 {}
89 mds_role_t() {}
90
91 bool operator<(mds_role_t const &rhs) const {
92 if (fscid < rhs.fscid) {
93 return true;
94 } else if (fscid == rhs.fscid) {
95 return rank < rhs.rank;
96 } else {
97 return false;
98 }
99 }
100
101 bool is_none() const {
102 return (rank == MDS_RANK_NONE);
103 }
104
105 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
106 mds_rank_t rank = MDS_RANK_NONE;
107 };
108 inline std::ostream& operator<<(std::ostream& out, const mds_role_t& role) {
109 return out << role.fscid << ":" << role.rank;
110 }
111
112 // CAPS
113 inline std::string gcap_string(int cap)
114 {
115 std::string s;
116 if (cap & CEPH_CAP_GSHARED) s += "s";
117 if (cap & CEPH_CAP_GEXCL) s += "x";
118 if (cap & CEPH_CAP_GCACHE) s += "c";
119 if (cap & CEPH_CAP_GRD) s += "r";
120 if (cap & CEPH_CAP_GWR) s += "w";
121 if (cap & CEPH_CAP_GBUFFER) s += "b";
122 if (cap & CEPH_CAP_GWREXTEND) s += "a";
123 if (cap & CEPH_CAP_GLAZYIO) s += "l";
124 return s;
125 }
126 inline std::string ccap_string(int cap)
127 {
128 std::string s;
129 if (cap & CEPH_CAP_PIN) s += "p";
130
131 int a = (cap >> CEPH_CAP_SAUTH) & 3;
132 if (a) s += 'A' + gcap_string(a);
133
134 a = (cap >> CEPH_CAP_SLINK) & 3;
135 if (a) s += 'L' + gcap_string(a);
136
137 a = (cap >> CEPH_CAP_SXATTR) & 3;
138 if (a) s += 'X' + gcap_string(a);
139
140 a = cap >> CEPH_CAP_SFILE;
141 if (a) s += 'F' + gcap_string(a);
142
143 if (s.length() == 0)
144 s = "-";
145 return s;
146 }
147
148 struct scatter_info_t {
149 version_t version = 0;
150 };
151
152 struct frag_info_t : public scatter_info_t {
153 int64_t size() const { return nfiles + nsubdirs; }
154
155 void zero() {
156 *this = frag_info_t();
157 }
158
159 // *this += cur - acc;
160 void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
161 if (cur.mtime > mtime) {
162 mtime = cur.mtime;
163 if (touched_mtime)
164 *touched_mtime = true;
165 }
166 if (cur.change_attr > change_attr) {
167 change_attr = cur.change_attr;
168 if (touched_chattr)
169 *touched_chattr = true;
170 }
171 nfiles += cur.nfiles - acc.nfiles;
172 nsubdirs += cur.nsubdirs - acc.nsubdirs;
173 }
174
175 void add(const frag_info_t& other) {
176 if (other.mtime > mtime)
177 mtime = other.mtime;
178 if (other.change_attr > change_attr)
179 change_attr = other.change_attr;
180 nfiles += other.nfiles;
181 nsubdirs += other.nsubdirs;
182 }
183
184 bool same_sums(const frag_info_t &o) const {
185 return mtime <= o.mtime &&
186 nfiles == o.nfiles &&
187 nsubdirs == o.nsubdirs;
188 }
189
190 void encode(ceph::buffer::list &bl) const;
191 void decode(ceph::buffer::list::const_iterator& bl);
192 void dump(ceph::Formatter *f) const;
193 void decode_json(JSONObj *obj);
194 static void generate_test_instances(std::list<frag_info_t*>& ls);
195
196 // this frag
197 utime_t mtime;
198 uint64_t change_attr = 0;
199 int64_t nfiles = 0; // files
200 int64_t nsubdirs = 0; // subdirs
201 };
202 WRITE_CLASS_ENCODER(frag_info_t)
203
204 inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
205 return memcmp(&l, &r, sizeof(l)) == 0;
206 }
207 inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
208 return !(l == r);
209 }
210
211 std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
212
213
214 struct nest_info_t : public scatter_info_t {
215 int64_t rsize() const { return rfiles + rsubdirs; }
216
217 void zero() {
218 *this = nest_info_t();
219 }
220
221 void sub(const nest_info_t &other) {
222 add(other, -1);
223 }
224 void add(const nest_info_t &other, int fac=1) {
225 if (other.rctime > rctime)
226 rctime = other.rctime;
227 rbytes += fac*other.rbytes;
228 rfiles += fac*other.rfiles;
229 rsubdirs += fac*other.rsubdirs;
230 rsnaps += fac*other.rsnaps;
231 }
232
233 // *this += cur - acc;
234 void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
235 if (cur.rctime > rctime)
236 rctime = cur.rctime;
237 rbytes += cur.rbytes - acc.rbytes;
238 rfiles += cur.rfiles - acc.rfiles;
239 rsubdirs += cur.rsubdirs - acc.rsubdirs;
240 rsnaps += cur.rsnaps - acc.rsnaps;
241 }
242
243 bool same_sums(const nest_info_t &o) const {
244 return rctime <= o.rctime &&
245 rbytes == o.rbytes &&
246 rfiles == o.rfiles &&
247 rsubdirs == o.rsubdirs &&
248 rsnaps == o.rsnaps;
249 }
250
251 void encode(ceph::buffer::list &bl) const;
252 void decode(ceph::buffer::list::const_iterator& bl);
253 void dump(ceph::Formatter *f) const;
254 void decode_json(JSONObj *obj);
255 static void generate_test_instances(std::list<nest_info_t*>& ls);
256
257 // this frag + children
258 utime_t rctime;
259 int64_t rbytes = 0;
260 int64_t rfiles = 0;
261 int64_t rsubdirs = 0;
262 int64_t rsnaps = 0;
263 };
264 WRITE_CLASS_ENCODER(nest_info_t)
265
266 inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
267 return memcmp(&l, &r, sizeof(l)) == 0;
268 }
269 inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
270 return !(l == r);
271 }
272
273 std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
274
275 struct vinodeno_t {
276 vinodeno_t() {}
277 vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
278
279 void encode(ceph::buffer::list& bl) const {
280 using ceph::encode;
281 encode(ino, bl);
282 encode(snapid, bl);
283 }
284 void decode(ceph::buffer::list::const_iterator& p) {
285 using ceph::decode;
286 decode(ino, p);
287 decode(snapid, p);
288 }
289
290 inodeno_t ino;
291 snapid_t snapid;
292 };
293 WRITE_CLASS_ENCODER(vinodeno_t)
294
295 inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
296 return l.ino == r.ino && l.snapid == r.snapid;
297 }
298 inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
299 return !(l == r);
300 }
301 inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
302 return
303 l.ino < r.ino ||
304 (l.ino == r.ino && l.snapid < r.snapid);
305 }
306
307 struct quota_info_t
308 {
309 void encode(ceph::buffer::list& bl) const {
310 ENCODE_START(1, 1, bl);
311 encode(max_bytes, bl);
312 encode(max_files, bl);
313 ENCODE_FINISH(bl);
314 }
315 void decode(ceph::buffer::list::const_iterator& p) {
316 DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
317 decode(max_bytes, p);
318 decode(max_files, p);
319 DECODE_FINISH(p);
320 }
321
322 void dump(ceph::Formatter *f) const;
323 static void generate_test_instances(std::list<quota_info_t *>& ls);
324
325 bool is_valid() const {
326 return max_bytes >=0 && max_files >=0;
327 }
328 bool is_enable() const {
329 return max_bytes || max_files;
330 }
331 void decode_json(JSONObj *obj);
332
333 int64_t max_bytes = 0;
334 int64_t max_files = 0;
335 };
336 WRITE_CLASS_ENCODER(quota_info_t)
337
338 inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
339 return memcmp(&l, &r, sizeof(l)) == 0;
340 }
341
342 std::ostream& operator<<(std::ostream &out, const quota_info_t &n);
343
344 namespace std {
345 template<> struct hash<vinodeno_t> {
346 size_t operator()(const vinodeno_t &vino) const {
347 hash<inodeno_t> H;
348 hash<uint64_t> I;
349 return H(vino.ino) ^ I(vino.snapid);
350 }
351 };
352 }
353
354 inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) {
355 out << vino.ino;
356 if (vino.snapid == CEPH_NOSNAP)
357 out << ".head";
358 else if (vino.snapid)
359 out << '.' << vino.snapid;
360 return out;
361 }
362
363 struct client_writeable_range_t {
364 struct byte_range_t {
365 uint64_t first = 0, last = 0; // interval client can write to
366 byte_range_t() {}
367 void decode_json(JSONObj *obj);
368 };
369
370 void encode(ceph::buffer::list &bl) const;
371 void decode(ceph::buffer::list::const_iterator& bl);
372 void dump(ceph::Formatter *f) const;
373 static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
374
375 byte_range_t range;
376 snapid_t follows = 0; // aka "data+metadata flushed thru"
377 };
378
// Decodes a bare byte_range_t: `first` followed by `last`, with no version
// header. Used by inode_t::decode() for struct versions older than 3, where
// client ranges were stored as plain byte ranges.
inline void decode(client_writeable_range_t::byte_range_t& range, ceph::buffer::list::const_iterator& bl) {
  using ceph::decode;
  decode(range.first, bl);
  decode(range.last, bl);
}
384
385 WRITE_CLASS_ENCODER(client_writeable_range_t)
386
387 std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
388
389 inline bool operator==(const client_writeable_range_t& l,
390 const client_writeable_range_t& r) {
391 return l.range.first == r.range.first && l.range.last == r.range.last &&
392 l.follows == r.follows;
393 }
394
395 struct inline_data_t {
396 public:
397 inline_data_t() {}
398 inline_data_t(const inline_data_t& o) : version(o.version) {
399 if (o.blp)
400 set_data(*o.blp);
401 }
402 inline_data_t& operator=(const inline_data_t& o) {
403 version = o.version;
404 if (o.blp)
405 set_data(*o.blp);
406 else
407 free_data();
408 return *this;
409 }
410
411 void free_data() {
412 blp.reset();
413 }
414 void get_data(ceph::buffer::list& ret) const {
415 if (blp)
416 ret = *blp;
417 else
418 ret.clear();
419 }
420 void set_data(const ceph::buffer::list& bl) {
421 if (!blp)
422 blp.reset(new ceph::buffer::list);
423 *blp = bl;
424 }
425 size_t length() const { return blp ? blp->length() : 0; }
426
427 bool operator==(const inline_data_t& o) const {
428 return length() == o.length() &&
429 (length() == 0 ||
430 (*const_cast<ceph::buffer::list*>(blp.get()) == *const_cast<ceph::buffer::list*>(o.blp.get())));
431 }
432 bool operator!=(const inline_data_t& o) const {
433 return !(*this == o);
434 }
435 void encode(ceph::buffer::list &bl) const;
436 void decode(ceph::buffer::list::const_iterator& bl);
437
438 version_t version = 1;
439
440 private:
441 std::unique_ptr<ceph::buffer::list> blp;
442 };
443 WRITE_CLASS_ENCODER(inline_data_t)
444
// Kinds of damage; the enumerators are numbered consecutively from 0.
enum {
  DAMAGE_STATS = 0,     // statistics (dirstat, size, etc)
  DAMAGE_RSTATS = 1,    // recursive statistics (rstat, accounted_rstat)
  DAMAGE_FRAGTREE = 2   // fragtree -- repair by searching
};
using damage_flags_t = uint32_t;
451
452 template<template<typename> class Allocator = std::allocator>
453 struct inode_t {
454 /**
455 * ***************
456 * Do not forget to add any new fields to the compare() function.
457 * ***************
458 */
459 using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
460
461 inode_t()
462 {
463 clear_layout();
464 }
465
466 // file type
467 bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
468 bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; }
469 bool is_file() const { return (mode & S_IFMT) == S_IFREG; }
470
471 bool is_truncating() const { return (truncate_pending > 0); }
472 void truncate(uint64_t old_size, uint64_t new_size) {
473 ceph_assert(new_size < old_size);
474 if (old_size > max_size_ever)
475 max_size_ever = old_size;
476 truncate_from = old_size;
477 size = new_size;
478 rstat.rbytes = new_size;
479 truncate_size = size;
480 truncate_seq++;
481 truncate_pending++;
482 }
483
484 bool has_layout() const {
485 return layout != file_layout_t();
486 }
487
488 void clear_layout() {
489 layout = file_layout_t();
490 }
491
492 uint64_t get_layout_size_increment() const {
493 return layout.get_period();
494 }
495
496 bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
497
498 uint64_t get_client_range(client_t client) const {
499 auto it = client_ranges.find(client);
500 return it != client_ranges.end() ? it->second.range.last : 0;
501 }
502
503 uint64_t get_max_size() const {
504 uint64_t max = 0;
505 for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin();
506 p != client_ranges.end();
507 ++p)
508 if (p->second.range.last > max)
509 max = p->second.range.last;
510 return max;
511 }
512 void set_max_size(uint64_t new_max) {
513 if (new_max == 0) {
514 client_ranges.clear();
515 } else {
516 for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin();
517 p != client_ranges.end();
518 ++p)
519 p->second.range.last = new_max;
520 }
521 }
522
523 void trim_client_ranges(snapid_t last) {
524 std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin();
525 while (p != client_ranges.end()) {
526 if (p->second.follows >= last)
527 client_ranges.erase(p++);
528 else
529 ++p;
530 }
531 }
532
533 bool is_backtrace_updated() const {
534 return backtrace_version == version;
535 }
536 void update_backtrace(version_t pv=0) {
537 backtrace_version = pv ? pv : version;
538 }
539
540 void add_old_pool(int64_t l) {
541 backtrace_version = version;
542 old_pools.insert(l);
543 }
544
545 void encode(ceph::buffer::list &bl, uint64_t features) const;
546 void decode(ceph::buffer::list::const_iterator& bl);
547 void dump(ceph::Formatter *f) const;
548 static void client_ranges_cb(client_range_map& c, JSONObj *obj);
549 static void old_pools_cb(compact_set<int64_t, std::less<int64_t>, Allocator<int64_t> >& c, JSONObj *obj);
550 void decode_json(JSONObj *obj);
551 static void generate_test_instances(std::list<inode_t*>& ls);
552 /**
553 * Compare this inode_t with another that represent *the same inode*
554 * at different points in time.
555 * @pre The inodes are the same ino
556 *
557 * @param other The inode_t to compare ourselves with
558 * @param divergent A bool pointer which will be set to true
559 * if the values are different in a way that can't be explained
560 * by one being a newer version than the other.
561 *
562 * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
563 */
564 int compare(const inode_t &other, bool *divergent) const;
565
566 // base (immutable)
567 inodeno_t ino = 0;
568 uint32_t rdev = 0; // if special file
569
570 // affected by any inode change...
571 utime_t ctime; // inode change time
572 utime_t btime; // birth time
573
574 // perm (namespace permissions)
575 uint32_t mode = 0;
576 uid_t uid = 0;
577 gid_t gid = 0;
578
579 // nlink
580 int32_t nlink = 0;
581
582 // file (data access)
583 ceph_dir_layout dir_layout = {}; // [dir only]
584 file_layout_t layout;
585 compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
586 uint64_t size = 0; // on directory, # dentries
587 uint64_t max_size_ever = 0; // max size the file has ever been
588 uint32_t truncate_seq = 0;
589 uint64_t truncate_size = 0, truncate_from = 0;
590 uint32_t truncate_pending = 0;
591 utime_t mtime; // file data modify time.
592 utime_t atime; // file data access time.
593 uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes())
594 inline_data_t inline_data; // FIXME check
595
596 // change attribute
597 uint64_t change_attr = 0;
598
599 client_range_map client_ranges; // client(s) can write to these ranges
600
601 // dirfrag, recursive accountin
602 frag_info_t dirstat; // protected by my filelock
603 nest_info_t rstat; // protected by my nestlock
604 nest_info_t accounted_rstat; // protected by parent's nestlock
605
606 quota_info_t quota;
607
608 mds_rank_t export_pin = MDS_RANK_NONE;
609
610 double export_ephemeral_random_pin = 0;
611 bool export_ephemeral_distributed_pin = false;
612
613 // special stuff
614 version_t version = 0; // auth only
615 version_t file_data_version = 0; // auth only
616 version_t xattr_version = 0;
617
618 utime_t last_scrub_stamp; // start time of last complete scrub
619 version_t last_scrub_version = 0;// (parent) start version of last complete scrub
620
621 version_t backtrace_version = 0;
622
623 snapid_t oldest_snap;
624
625 std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
626
627 bool fscrypt = false; // fscrypt enabled ?
628
629 private:
630 bool older_is_consistent(const inode_t &other) const;
631 };
632
633 // These methods may be moved back to mdstypes.cc when we have pmr
/**
 * Serialize this inode into `bl` as struct version 17 (compat version 6).
 *
 * The order of the encode() calls below is the wire format and must stay in
 * step with inode_t::decode(). `features` is forwarded only to the layout
 * encoder.
 */
template<template<typename> class Allocator>
void inode_t<Allocator>::encode(ceph::buffer::list &bl, uint64_t features) const
{
  ENCODE_START(17, 6, bl);

  encode(ino, bl);
  encode(rdev, bl);
  encode(ctime, bl);

  encode(mode, bl);
  encode(uid, bl);
  encode(gid, bl);

  encode(nlink, bl);
  {
    // removed field
    // (a placeholder bool is still written so the layout is unchanged)
    bool anchored = 0;
    encode(anchored, bl);
  }

  encode(dir_layout, bl);
  encode(layout, bl, features);
  encode(size, bl);
  encode(truncate_seq, bl);
  encode(truncate_size, bl);
  encode(truncate_from, bl);
  encode(truncate_pending, bl);
  encode(mtime, bl);
  encode(atime, bl);
  encode(time_warp_seq, bl);
  encode(client_ranges, bl);

  encode(dirstat, bl);
  encode(rstat, bl);
  encode(accounted_rstat, bl);

  encode(version, bl);
  encode(file_data_version, bl);
  encode(xattr_version, bl);
  encode(backtrace_version, bl);
  encode(old_pools, bl);
  encode(max_size_ever, bl);
  encode(inline_data, bl);
  encode(quota, bl);

  encode(stray_prior_path, bl);

  encode(last_scrub_version, bl);
  encode(last_scrub_stamp, bl);

  encode(btime, bl);
  encode(change_attr, bl);

  encode(export_pin, bl);

  encode(export_ephemeral_random_pin, bl);
  encode(export_ephemeral_distributed_pin, bl);

  encode(fscrypt, bl);

  ENCODE_FINISH(bl);
}
696
/**
 * Deserialize an inode from `p`, accepting struct versions up to 17.
 *
 * Fields are read in the order inode_t::encode() writes them. Each
 * `struct_v >= N` guard marks the version in which that field was added;
 * where an else-branch exists, older encodings get an explicit default.
 */
template<template<typename> class Allocator>
void inode_t<Allocator>::decode(ceph::buffer::list::const_iterator &p)
{
  DECODE_START_LEGACY_COMPAT_LEN(17, 6, 6, p);

  decode(ino, p);
  decode(rdev, p);
  decode(ctime, p);

  decode(mode, p);
  decode(uid, p);
  decode(gid, p);

  decode(nlink, p);
  {
    // removed field: read and discard the placeholder bool
    bool anchored;
    decode(anchored, p);
  }

  if (struct_v >= 4)
    decode(dir_layout, p);
  else {
    // FIPS zeroization audit 20191117: this memset is not security related.
    memset(&dir_layout, 0, sizeof(dir_layout));
  }
  decode(layout, p);
  decode(size, p);
  decode(truncate_seq, p);
  decode(truncate_size, p);
  decode(truncate_from, p);
  if (struct_v >= 5)
    decode(truncate_pending, p);
  else
    truncate_pending = 0;
  decode(mtime, p);
  decode(atime, p);
  decode(time_warp_seq, p);
  if (struct_v >= 3) {
    decode(client_ranges, p);
  } else {
    // Before v3 only the byte range was stored per client; `follows` keeps
    // its default for entries created here.
    std::map<client_t, client_writeable_range_t::byte_range_t> m;
    decode(m, p);
    for (auto q = m.begin(); q != m.end(); ++q)
      client_ranges[q->first].range = q->second;
  }

  decode(dirstat, p);
  decode(rstat, p);
  decode(accounted_rstat, p);

  decode(version, p);
  decode(file_data_version, p);
  decode(xattr_version, p);
  if (struct_v >= 2)
    decode(backtrace_version, p);
  if (struct_v >= 7)
    decode(old_pools, p);
  if (struct_v >= 8)
    decode(max_size_ever, p);
  if (struct_v >= 9) {
    decode(inline_data, p);
  } else {
    inline_data.version = CEPH_INLINE_NONE;
  }
  // Any backtrace_version read from a pre-v10 encoding is discarded.
  if (struct_v < 10)
    backtrace_version = 0; // force update backtrace
  if (struct_v >= 11)
    decode(quota, p);

  if (struct_v >= 12) {
    // Decoded through a plain std::string, then copied into the
    // allocator-aware string member.
    std::string tmp;
    decode(tmp, p);
    stray_prior_path = std::string_view(tmp);
  }

  if (struct_v >= 13) {
    decode(last_scrub_version, p);
    decode(last_scrub_stamp, p);
  }
  if (struct_v >= 14) {
    decode(btime, p);
    decode(change_attr, p);
  } else {
    btime = utime_t();
    change_attr = 0;
  }

  if (struct_v >= 15) {
    decode(export_pin, p);
  } else {
    export_pin = MDS_RANK_NONE;
  }

  if (struct_v >= 16) {
    decode(export_ephemeral_random_pin, p);
    decode(export_ephemeral_distributed_pin, p);
  } else {
    export_ephemeral_random_pin = 0;
    export_ephemeral_distributed_pin = false;
  }

  if (struct_v >= 17) {
    decode(fscrypt, p);
  } else {
    fscrypt = 0;
  }

  DECODE_FINISH(p);
}
806
807 template<template<typename> class Allocator>
808 void inode_t<Allocator>::dump(ceph::Formatter *f) const
809 {
810 f->dump_unsigned("ino", ino);
811 f->dump_unsigned("rdev", rdev);
812 f->dump_stream("ctime") << ctime;
813 f->dump_stream("btime") << btime;
814 f->dump_unsigned("mode", mode);
815 f->dump_unsigned("uid", uid);
816 f->dump_unsigned("gid", gid);
817 f->dump_unsigned("nlink", nlink);
818
819 f->open_object_section("dir_layout");
820 ::dump(dir_layout, f);
821 f->close_section();
822
823 f->dump_object("layout", layout);
824
825 f->open_array_section("old_pools");
826 for (const auto &p : old_pools) {
827 f->dump_int("pool", p);
828 }
829 f->close_section();
830
831 f->dump_unsigned("size", size);
832 f->dump_unsigned("truncate_seq", truncate_seq);
833 f->dump_unsigned("truncate_size", truncate_size);
834 f->dump_unsigned("truncate_from", truncate_from);
835 f->dump_unsigned("truncate_pending", truncate_pending);
836 f->dump_stream("mtime") << mtime;
837 f->dump_stream("atime") << atime;
838 f->dump_unsigned("time_warp_seq", time_warp_seq);
839 f->dump_unsigned("change_attr", change_attr);
840 f->dump_int("export_pin", export_pin);
841 f->dump_int("export_ephemeral_random_pin", export_ephemeral_random_pin);
842 f->dump_bool("export_ephemeral_distributed_pin", export_ephemeral_distributed_pin);
843
844 f->open_array_section("client_ranges");
845 for (const auto &p : client_ranges) {
846 f->open_object_section("client");
847 f->dump_unsigned("client", p.first.v);
848 p.second.dump(f);
849 f->close_section();
850 }
851 f->close_section();
852
853 f->open_object_section("dirstat");
854 dirstat.dump(f);
855 f->close_section();
856
857 f->open_object_section("rstat");
858 rstat.dump(f);
859 f->close_section();
860
861 f->open_object_section("accounted_rstat");
862 accounted_rstat.dump(f);
863 f->close_section();
864
865 f->dump_unsigned("version", version);
866 f->dump_unsigned("file_data_version", file_data_version);
867 f->dump_unsigned("xattr_version", xattr_version);
868 f->dump_unsigned("backtrace_version", backtrace_version);
869
870 f->dump_string("stray_prior_path", stray_prior_path);
871 f->dump_unsigned("max_size_ever", max_size_ever);
872
873 f->open_object_section("quota");
874 quota.dump(f);
875 f->close_section();
876
877 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
878 f->dump_unsigned("last_scrub_version", last_scrub_version);
879 }
880
881 template<template<typename> class Allocator>
882 void inode_t<Allocator>::client_ranges_cb(typename inode_t<Allocator>::client_range_map& c, JSONObj *obj){
883
884 int64_t client;
885 JSONDecoder::decode_json("client", client, obj, true);
886 client_writeable_range_t client_range_tmp;
887 JSONDecoder::decode_json("byte range", client_range_tmp.range, obj, true);
888 JSONDecoder::decode_json("follows", client_range_tmp.follows.val, obj, true);
889 c[client] = client_range_tmp;
890 }
891
892 template<template<typename> class Allocator>
893 void inode_t<Allocator>::old_pools_cb(compact_set<int64_t, std::less<int64_t>, Allocator<int64_t> >& c, JSONObj *obj){
894
895 int64_t tmp;
896 decode_json_obj(tmp, obj);
897 c.insert(tmp);
898 }
899
/**
 * Populate this inode from JSON object `obj`.
 *
 * Every key is requested with the final argument set to true.
 * NOTE(review): presumably that flag marks the key as mandatory -- confirm
 * against JSONDecoder::decode_json.
 *
 * Not read here: ctime, btime, mtime and atime (their calls are commented
 * out), export_ephemeral_random_pin, export_ephemeral_distributed_pin,
 * oldest_snap, inline_data and fscrypt.
 */
template<template<typename> class Allocator>
void inode_t<Allocator>::decode_json(JSONObj *obj)
{

  JSONDecoder::decode_json("ino", ino.val, obj, true);
  JSONDecoder::decode_json("rdev", rdev, obj, true);
  //JSONDecoder::decode_json("ctime", ctime, obj, true);
  //JSONDecoder::decode_json("btime", btime, obj, true);
  JSONDecoder::decode_json("mode", mode, obj, true);
  JSONDecoder::decode_json("uid", uid, obj, true);
  JSONDecoder::decode_json("gid", gid, obj, true);
  JSONDecoder::decode_json("nlink", nlink, obj, true);
  JSONDecoder::decode_json("dir_layout", dir_layout, obj, true);
  JSONDecoder::decode_json("layout", layout, obj, true);
  // Array-valued keys are decoded element by element through the callbacks.
  JSONDecoder::decode_json("old_pools", old_pools, inode_t<Allocator>::old_pools_cb, obj, true);
  JSONDecoder::decode_json("size", size, obj, true);
  JSONDecoder::decode_json("truncate_seq", truncate_seq, obj, true);
  JSONDecoder::decode_json("truncate_size", truncate_size, obj, true);
  JSONDecoder::decode_json("truncate_from", truncate_from, obj, true);
  JSONDecoder::decode_json("truncate_pending", truncate_pending, obj, true);
  //JSONDecoder::decode_json("mtime", mtime, obj, true);
  //JSONDecoder::decode_json("atime", atime, obj, true);
  JSONDecoder::decode_json("time_warp_seq", time_warp_seq, obj, true);
  JSONDecoder::decode_json("change_attr", change_attr, obj, true);
  JSONDecoder::decode_json("export_pin", export_pin, obj, true);
  JSONDecoder::decode_json("client_ranges", client_ranges, inode_t<Allocator>::client_ranges_cb, obj, true);
  JSONDecoder::decode_json("dirstat", dirstat, obj, true);
  JSONDecoder::decode_json("rstat", rstat, obj, true);
  JSONDecoder::decode_json("accounted_rstat", accounted_rstat, obj, true);
  JSONDecoder::decode_json("version", version, obj, true);
  JSONDecoder::decode_json("file_data_version", file_data_version, obj, true);
  JSONDecoder::decode_json("xattr_version", xattr_version, obj, true);
  JSONDecoder::decode_json("backtrace_version", backtrace_version, obj, true);
  JSONDecoder::decode_json("stray_prior_path", stray_prior_path, obj, true);
  JSONDecoder::decode_json("max_size_ever", max_size_ever, obj, true);
  JSONDecoder::decode_json("quota", quota, obj, true);
  JSONDecoder::decode_json("last_scrub_stamp", last_scrub_stamp, obj, true);
  JSONDecoder::decode_json("last_scrub_version", last_scrub_version, obj, true);
}
939
940 template<template<typename> class Allocator>
941 void inode_t<Allocator>::generate_test_instances(std::list<inode_t*>& ls)
942 {
943 ls.push_back(new inode_t<Allocator>);
944 ls.push_back(new inode_t<Allocator>);
945 ls.back()->ino = 1;
946 // i am lazy.
947 }
948
949 template<template<typename> class Allocator>
950 int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
951 {
952 ceph_assert(ino == other.ino);
953 *divergent = false;
954 if (version == other.version) {
955 if (rdev != other.rdev ||
956 ctime != other.ctime ||
957 btime != other.btime ||
958 mode != other.mode ||
959 uid != other.uid ||
960 gid != other.gid ||
961 nlink != other.nlink ||
962 memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
963 layout != other.layout ||
964 old_pools != other.old_pools ||
965 size != other.size ||
966 max_size_ever != other.max_size_ever ||
967 truncate_seq != other.truncate_seq ||
968 truncate_size != other.truncate_size ||
969 truncate_from != other.truncate_from ||
970 truncate_pending != other.truncate_pending ||
971 change_attr != other.change_attr ||
972 mtime != other.mtime ||
973 atime != other.atime ||
974 time_warp_seq != other.time_warp_seq ||
975 inline_data != other.inline_data ||
976 client_ranges != other.client_ranges ||
977 !(dirstat == other.dirstat) ||
978 !(rstat == other.rstat) ||
979 !(accounted_rstat == other.accounted_rstat) ||
980 file_data_version != other.file_data_version ||
981 xattr_version != other.xattr_version ||
982 backtrace_version != other.backtrace_version) {
983 *divergent = true;
984 }
985 return 0;
986 } else if (version > other.version) {
987 *divergent = !older_is_consistent(other);
988 return 1;
989 } else {
990 ceph_assert(version < other.version);
991 *divergent = !other.older_is_consistent(*this);
992 return -1;
993 }
994 }
995
996 template<template<typename> class Allocator>
997 bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
998 {
999 if (max_size_ever < other.max_size_ever ||
1000 truncate_seq < other.truncate_seq ||
1001 time_warp_seq < other.time_warp_seq ||
1002 inline_data.version < other.inline_data.version ||
1003 dirstat.version < other.dirstat.version ||
1004 rstat.version < other.rstat.version ||
1005 accounted_rstat.version < other.accounted_rstat.version ||
1006 file_data_version < other.file_data_version ||
1007 xattr_version < other.xattr_version ||
1008 backtrace_version < other.backtrace_version) {
1009 return false;
1010 }
1011 return true;
1012 }
1013
// Free-function encoder for inode_t: forwards to the member encode() with
// the given feature bits, bracketed by the ENCODE_DUMP_PRE/POST hooks.
// NOTE(review): `cl` is not declared in this function; this presumably only
// works because ENCODE_DUMP_POST is a macro -- confirm its expansion.
template<template<typename> class Allocator>
inline void encode(const inode_t<Allocator> &c, ::ceph::buffer::list &bl, uint64_t features)
{
  ENCODE_DUMP_PRE();
  c.encode(bl, features);
  ENCODE_DUMP_POST(cl);
}
// Free-function decoder for inode_t: forwards to the member decode().
template<template<typename> class Allocator>
inline void decode(inode_t<Allocator> &c, ::ceph::buffer::list::const_iterator &p)
{
  c.decode(p);
}
1026
// A std::basic_string<char> whose storage comes from Allocator<char>.
template<template<typename> class Allocator>
using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>;
1029
// Ordered map from xattr name to value, with keys and map nodes allocated
// through Allocator.
template<template<typename> class Allocator>
using xattr_map = std::map<alloc_string<Allocator>,
			   ceph::bufferptr,
			   std::less<alloc_string<Allocator>>,
			   Allocator<std::pair<const alloc_string<Allocator>,
					       ceph::bufferptr>>>; // FIXME bufferptr not in mempool
1036
1037 template<template<typename> class Allocator>
1038 inline void decode_noshare(xattr_map<Allocator>& xattrs, ceph::buffer::list::const_iterator &p)
1039 {
1040 __u32 n;
1041 decode(n, p);
1042 while (n-- > 0) {
1043 alloc_string<Allocator> key;
1044 decode(key, p);
1045 __u32 len;
1046 decode(len, p);
1047 p.copy_deep(len, xattrs[key]);
1048 }
1049 }
1050
1051 template<template<typename> class Allocator = std::allocator>
1052 struct old_inode_t {
1053 snapid_t first;
1054 inode_t<Allocator> inode;
1055 xattr_map<Allocator> xattrs;
1056
1057 void encode(ceph::buffer::list &bl, uint64_t features) const;
1058 void decode(ceph::buffer::list::const_iterator& bl);
1059 void dump(ceph::Formatter *f) const;
1060 static void generate_test_instances(std::list<old_inode_t*>& ls);
1061 };
1062
// These methods may be moved back to mdstypes.cc when we have pmr

// Serialize `first`, `inode` and `xattrs`, in that order, inside an
// ENCODE_START/ENCODE_FINISH frame. `features` is passed on to the inode
// encoder only.
// NOTE(review): ENCODE_START(2, 2, bl) presumably means struct version 2,
// compat version 2 -- confirm against the encoding macros.
template<template<typename> class Allocator>
void old_inode_t<Allocator>::encode(ceph::buffer::list& bl, uint64_t features) const
{
  ENCODE_START(2, 2, bl);
  encode(first, bl);
  encode(inode, bl, features);
  encode(xattrs, bl);
  ENCODE_FINISH(bl);
}
1073
// Inverse of encode(): reads `first`, `inode` and then the xattrs inside a
// DECODE_START_LEGACY_COMPAT_LEN/DECODE_FINISH frame. The xattrs are read
// with decode_noshare(), which deep-copies each value out of the source
// buffer.
template<template<typename> class Allocator>
void old_inode_t<Allocator>::decode(ceph::buffer::list::const_iterator& bl)
{
  DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
  decode(first, bl);
  decode(inode, bl);
  decode_noshare<Allocator>(xattrs, bl);
  DECODE_FINISH(bl);
}
1083
1084 template<template<typename> class Allocator>
1085 void old_inode_t<Allocator>::dump(ceph::Formatter *f) const
1086 {
1087 f->dump_unsigned("first", first);
1088 inode.dump(f);
1089 f->open_object_section("xattrs");
1090 for (const auto &p : xattrs) {
1091 std::string v(p.second.c_str(), p.second.length());
1092 f->dump_string(p.first.c_str(), v);
1093 }
1094 f->close_section();
1095 }
1096
1097 template<template<typename> class Allocator>
1098 void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls)
1099 {
1100 ls.push_back(new old_inode_t<Allocator>);
1101 ls.push_back(new old_inode_t<Allocator>);
1102 ls.back()->first = 2;
1103 std::list<inode_t<Allocator>*> ils;
1104 inode_t<Allocator>::generate_test_instances(ils);
1105 ls.back()->inode = *ils.back();
1106 ls.back()->xattrs["user.foo"] = ceph::buffer::copy("asdf", 4);
1107 ls.back()->xattrs["user.unprintable"] = ceph::buffer::copy("\000\001\002", 3);
1108 }
1109
// Free-function encoder for old_inode_t: forwards to the member encode(),
// bracketed by the ENCODE_DUMP_PRE / ENCODE_DUMP_POST macros (defined
// outside this header section).
template<template<typename> class Allocator>
inline void encode(const old_inode_t<Allocator> &c, ::ceph::buffer::list &bl, uint64_t features)
{
  ENCODE_DUMP_PRE();
  c.encode(bl, features);
  ENCODE_DUMP_POST(cl);
}
// Free-function decoder for old_inode_t: forwards to the member decode().
template<template<typename> class Allocator>
inline void decode(old_inode_t<Allocator> &c, ::ceph::buffer::list::const_iterator &p)
{
  c.decode(p);
}
1122
1123 /*
1124 * like an inode, but for a dir frag
1125 */
1126 struct fnode_t {
1127 void encode(ceph::buffer::list &bl) const;
1128 void decode(ceph::buffer::list::const_iterator& bl);
1129 void dump(ceph::Formatter *f) const;
1130 void decode_json(JSONObj *obj);
1131 static void generate_test_instances(std::list<fnode_t*>& ls);
1132
1133 version_t version = 0;
1134 snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru
1135 frag_info_t fragstat, accounted_fragstat;
1136 nest_info_t rstat, accounted_rstat;
1137 damage_flags_t damage_flags = 0;
1138
1139 // we know we and all our descendants have been scrubbed since this version
1140 version_t recursive_scrub_version = 0;
1141 utime_t recursive_scrub_stamp;
1142 // version at which we last scrubbed our personal data structures
1143 version_t localized_scrub_version = 0;
1144 utime_t localized_scrub_stamp;
1145 };
1146 WRITE_CLASS_ENCODER(fnode_t)
1147
1148
1149 struct old_rstat_t {
1150 void encode(ceph::buffer::list& bl) const;
1151 void decode(ceph::buffer::list::const_iterator& p);
1152 void dump(ceph::Formatter *f) const;
1153 static void generate_test_instances(std::list<old_rstat_t*>& ls);
1154
1155 snapid_t first;
1156 nest_info_t rstat, accounted_rstat;
1157 };
1158 WRITE_CLASS_ENCODER(old_rstat_t)
1159
1160 inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
1161 return out << "old_rstat(first " << o.first << " " << o.rstat << " " << o.accounted_rstat << ")";
1162 }
1163
1164 class feature_bitset_t {
1165 public:
1166 typedef uint64_t block_type;
1167 static const size_t bits_per_block = sizeof(block_type) * 8;
1168
1169 feature_bitset_t(const feature_bitset_t& other) : _vec(other._vec) {}
1170 feature_bitset_t(feature_bitset_t&& other) : _vec(std::move(other._vec)) {}
1171 feature_bitset_t(unsigned long value = 0);
1172 feature_bitset_t(const std::vector<size_t>& array);
1173 feature_bitset_t& operator=(const feature_bitset_t& other) {
1174 _vec = other._vec;
1175 return *this;
1176 }
1177 feature_bitset_t& operator=(feature_bitset_t&& other) {
1178 _vec = std::move(other._vec);
1179 return *this;
1180 }
1181 feature_bitset_t& operator-=(const feature_bitset_t& other);
1182 bool empty() const {
1183 //block_type is a uint64_t. If the vector is only composed of 0s, then it's still "empty"
1184 for (auto& v : _vec) {
1185 if (v)
1186 return false;
1187 }
1188 return true;
1189 }
1190 bool test(size_t bit) const {
1191 if (bit >= bits_per_block * _vec.size())
1192 return false;
1193 return _vec[bit / bits_per_block] & ((block_type)1 << (bit % bits_per_block));
1194 }
1195 void insert(size_t bit) {
1196 size_t n = bit / bits_per_block;
1197 if (n >= _vec.size())
1198 _vec.resize(n + 1);
1199 _vec[n] |= ((block_type)1 << (bit % bits_per_block));
1200 }
1201 void erase(size_t bit) {
1202 size_t n = bit / bits_per_block;
1203 if (n >= _vec.size())
1204 return;
1205 _vec[n] &= ~((block_type)1 << (bit % bits_per_block));
1206 if (n + 1 == _vec.size()) {
1207 while (!_vec.empty() && _vec.back() == 0)
1208 _vec.pop_back();
1209 }
1210 }
1211 void clear() {
1212 _vec.clear();
1213 }
1214 bool operator==(const feature_bitset_t& other) const {
1215 return _vec == other._vec;
1216 }
1217 bool operator!=(const feature_bitset_t& other) const {
1218 return _vec != other._vec;
1219 }
1220 void encode(ceph::buffer::list& bl) const;
1221 void decode(ceph::buffer::list::const_iterator &p);
1222 void dump(ceph::Formatter *f) const;
1223 void print(std::ostream& out) const;
1224 private:
1225 std::vector<block_type> _vec;
1226 };
1227 WRITE_CLASS_ENCODER(feature_bitset_t)
1228
1229 inline std::ostream& operator<<(std::ostream& out, const feature_bitset_t& s) {
1230 s.print(out);
1231 return out;
1232 }
1233
1234 struct metric_spec_t {
1235 metric_spec_t() {}
1236 metric_spec_t(const metric_spec_t& other) :
1237 metric_flags(other.metric_flags) {}
1238 metric_spec_t(metric_spec_t&& other) :
1239 metric_flags(std::move(other.metric_flags)) {}
1240 metric_spec_t(const feature_bitset_t& mf) :
1241 metric_flags(mf) {}
1242 metric_spec_t(feature_bitset_t&& mf) :
1243 metric_flags(std::move(mf)) {}
1244
1245 metric_spec_t& operator=(const metric_spec_t& other) {
1246 metric_flags = other.metric_flags;
1247 return *this;
1248 }
1249 metric_spec_t& operator=(metric_spec_t&& other) {
1250 metric_flags = std::move(other.metric_flags);
1251 return *this;
1252 }
1253
1254 bool empty() const {
1255 return metric_flags.empty();
1256 }
1257
1258 void clear() {
1259 metric_flags.clear();
1260 }
1261
1262 void encode(ceph::buffer::list& bl) const;
1263 void decode(ceph::buffer::list::const_iterator& p);
1264 void dump(ceph::Formatter *f) const;
1265 void print(std::ostream& out) const;
1266
1267 // set of metrics that a client is capable of forwarding
1268 feature_bitset_t metric_flags;
1269 };
1270 WRITE_CLASS_ENCODER(metric_spec_t)
1271
1272 inline std::ostream& operator<<(std::ostream& out, const metric_spec_t& mst) {
1273 mst.print(out);
1274 return out;
1275 }
1276
1277 /*
1278 * client_metadata_t
1279 */
1280 struct client_metadata_t {
1281 using kv_map_t = std::map<std::string,std::string>;
1282 using iterator = kv_map_t::const_iterator;
1283
1284 client_metadata_t() {}
1285 client_metadata_t(const kv_map_t& kv, const feature_bitset_t &f, const metric_spec_t &mst) :
1286 kv_map(kv),
1287 features(f),
1288 metric_spec(mst) {}
1289 client_metadata_t& operator=(const client_metadata_t& other) {
1290 kv_map = other.kv_map;
1291 features = other.features;
1292 metric_spec = other.metric_spec;
1293 return *this;
1294 }
1295
1296 bool empty() const { return kv_map.empty() && features.empty() && metric_spec.empty(); }
1297 iterator find(const std::string& key) const { return kv_map.find(key); }
1298 iterator begin() const { return kv_map.begin(); }
1299 iterator end() const { return kv_map.end(); }
1300 void erase(iterator it) { kv_map.erase(it); }
1301 std::string& operator[](const std::string& key) { return kv_map[key]; }
1302 void merge(const client_metadata_t& other) {
1303 kv_map.insert(other.kv_map.begin(), other.kv_map.end());
1304 features = other.features;
1305 metric_spec = other.metric_spec;
1306 }
1307 void clear() {
1308 kv_map.clear();
1309 features.clear();
1310 metric_spec.clear();
1311 }
1312
1313 void encode(ceph::buffer::list& bl) const;
1314 void decode(ceph::buffer::list::const_iterator& p);
1315 void dump(ceph::Formatter *f) const;
1316
1317 kv_map_t kv_map;
1318 feature_bitset_t features;
1319 metric_spec_t metric_spec;
1320 };
1321 WRITE_CLASS_ENCODER(client_metadata_t)
1322
1323 /*
1324 * session_info_t - durable part of a Session
1325 */
1326 struct session_info_t {
1327 client_t get_client() const { return client_t(inst.name.num()); }
1328 bool has_feature(size_t bit) const { return client_metadata.features.test(bit); }
1329 const entity_name_t& get_source() const { return inst.name; }
1330
1331 void clear_meta() {
1332 prealloc_inos.clear();
1333 completed_requests.clear();
1334 completed_flushes.clear();
1335 client_metadata.clear();
1336 }
1337
1338 void encode(ceph::buffer::list& bl, uint64_t features) const;
1339 void decode(ceph::buffer::list::const_iterator& p);
1340 void dump(ceph::Formatter *f) const;
1341 static void generate_test_instances(std::list<session_info_t*>& ls);
1342
1343 entity_inst_t inst;
1344 std::map<ceph_tid_t,inodeno_t> completed_requests;
1345 interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use.
1346 client_metadata_t client_metadata;
1347 std::set<ceph_tid_t> completed_flushes;
1348 EntityName auth_name;
1349 };
1350 WRITE_CLASS_ENCODER_FEATURES(session_info_t)
1351
1352 // dentries
1353 struct dentry_key_t {
1354 dentry_key_t() {}
1355 dentry_key_t(snapid_t s, std::string_view n, __u32 h=0) :
1356 snapid(s), name(n), hash(h) {}
1357
1358 bool is_valid() { return name.length() || snapid; }
1359
1360 // encode into something that can be decoded as a string.
1361 // name_ (head) or name_%x (!head)
1362 void encode(ceph::buffer::list& bl) const {
1363 std::string key;
1364 encode(key);
1365 using ceph::encode;
1366 encode(key, bl);
1367 }
1368 void encode(std::string& key) const {
1369 char b[20];
1370 if (snapid != CEPH_NOSNAP) {
1371 uint64_t val(snapid);
1372 snprintf(b, sizeof(b), "%" PRIx64, val);
1373 } else {
1374 snprintf(b, sizeof(b), "%s", "head");
1375 }
1376 CachedStackStringStream css;
1377 *css << name << "_" << b;
1378 key = css->strv();
1379 }
1380 static void decode_helper(ceph::buffer::list::const_iterator& bl, std::string& nm,
1381 snapid_t& sn) {
1382 std::string key;
1383 using ceph::decode;
1384 decode(key, bl);
1385 decode_helper(key, nm, sn);
1386 }
1387 static void decode_helper(std::string_view key, std::string& nm, snapid_t& sn) {
1388 size_t i = key.find_last_of('_');
1389 ceph_assert(i != std::string::npos);
1390 if (key.compare(i+1, std::string_view::npos, "head") == 0) {
1391 // name_head
1392 sn = CEPH_NOSNAP;
1393 } else {
1394 // name_%x
1395 long long unsigned x = 0;
1396 std::string x_str(key.substr(i+1));
1397 sscanf(x_str.c_str(), "%llx", &x);
1398 sn = x;
1399 }
1400 nm = key.substr(0, i);
1401 }
1402
1403 snapid_t snapid = 0;
1404 std::string_view name;
1405 __u32 hash = 0;
1406 };
1407
1408 inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
1409 {
1410 return out << "(" << k.name << "," << k.snapid << ")";
1411 }
1412
1413 inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
1414 {
1415 /*
1416 * order by hash, name, snap
1417 */
1418 int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
1419 if (c)
1420 return c < 0;
1421 c = k1.name.compare(k2.name);
1422 if (c)
1423 return c < 0;
1424 return k1.snapid < k2.snapid;
1425 }
1426
1427 /*
1428 * string_snap_t is a simple (string, snapid_t) pair
1429 */
1430 struct string_snap_t {
1431 string_snap_t() {}
1432 string_snap_t(std::string_view n, snapid_t s) : name(n), snapid(s) {}
1433
1434 void encode(ceph::buffer::list& bl) const;
1435 void decode(ceph::buffer::list::const_iterator& p);
1436 void dump(ceph::Formatter *f) const;
1437 static void generate_test_instances(std::list<string_snap_t*>& ls);
1438
1439 std::string name;
1440 snapid_t snapid;
1441 };
1442 WRITE_CLASS_ENCODER(string_snap_t)
1443
1444 inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
1445 int c = l.name.compare(r.name);
1446 return c < 0 || (c == 0 && l.snapid < r.snapid);
1447 }
1448
1449 inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
1450 {
1451 return out << "(" << k.name << "," << k.snapid << ")";
1452 }
1453
1454 /*
1455 * mds_table_pending_t
1456 *
1457 * For mds's requesting any pending ops, child needs to encode the corresponding
1458 * pending mutation state in the table.
1459 */
1460 struct mds_table_pending_t {
1461 void encode(ceph::buffer::list& bl) const;
1462 void decode(ceph::buffer::list::const_iterator& bl);
1463 void dump(ceph::Formatter *f) const;
1464 static void generate_test_instances(std::list<mds_table_pending_t*>& ls);
1465
1466 uint64_t reqid = 0;
1467 __s32 mds = 0;
1468 version_t tid = 0;
1469 };
1470 WRITE_CLASS_ENCODER(mds_table_pending_t)
1471
1472 // requests
1473 struct metareqid_t {
1474 metareqid_t() {}
1475 metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
1476 void encode(ceph::buffer::list& bl) const {
1477 using ceph::encode;
1478 encode(name, bl);
1479 encode(tid, bl);
1480 }
1481 void decode(ceph::buffer::list::const_iterator &p) {
1482 using ceph::decode;
1483 decode(name, p);
1484 decode(tid, p);
1485 }
1486
1487 entity_name_t name;
1488 uint64_t tid = 0;
1489 };
1490 WRITE_CLASS_ENCODER(metareqid_t)
1491
1492 inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
1493 return out << r.name << ":" << r.tid;
1494 }
1495
1496 inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
1497 return (l.name == r.name) && (l.tid == r.tid);
1498 }
1499 inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
1500 return (l.name != r.name) || (l.tid != r.tid);
1501 }
1502 inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
1503 return (l.name < r.name) ||
1504 (l.name == r.name && l.tid < r.tid);
1505 }
1506 inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
1507 return (l.name < r.name) ||
1508 (l.name == r.name && l.tid <= r.tid);
1509 }
1510 inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); }
1511 inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); }
1512
1513 namespace std {
1514 template<> struct hash<metareqid_t> {
1515 size_t operator()(const metareqid_t &r) const {
1516 hash<uint64_t> H;
1517 return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid);
1518 }
1519 };
1520 } // namespace std
1521
1522 // cap info for client reconnect
1523 struct cap_reconnect_t {
1524 cap_reconnect_t() {}
1525 cap_reconnect_t(uint64_t cap_id, inodeno_t pino, std::string_view p, int w, int i,
1526 inodeno_t sr, snapid_t sf, ceph::buffer::list& lb) :
1527 path(p) {
1528 capinfo.cap_id = cap_id;
1529 capinfo.wanted = w;
1530 capinfo.issued = i;
1531 capinfo.snaprealm = sr;
1532 capinfo.pathbase = pino;
1533 capinfo.flock_len = 0;
1534 snap_follows = sf;
1535 flockbl = std::move(lb);
1536 }
1537 void encode(ceph::buffer::list& bl) const;
1538 void decode(ceph::buffer::list::const_iterator& bl);
1539 void encode_old(ceph::buffer::list& bl) const;
1540 void decode_old(ceph::buffer::list::const_iterator& bl);
1541
1542 void dump(ceph::Formatter *f) const;
1543 static void generate_test_instances(std::list<cap_reconnect_t*>& ls);
1544
1545 std::string path;
1546 mutable ceph_mds_cap_reconnect capinfo = {};
1547 snapid_t snap_follows = 0;
1548 ceph::buffer::list flockbl;
1549 };
1550 WRITE_CLASS_ENCODER(cap_reconnect_t)
1551
1552 struct snaprealm_reconnect_t {
1553 snaprealm_reconnect_t() {}
1554 snaprealm_reconnect_t(inodeno_t ino, snapid_t seq, inodeno_t parent) {
1555 realm.ino = ino;
1556 realm.seq = seq;
1557 realm.parent = parent;
1558 }
1559 void encode(ceph::buffer::list& bl) const;
1560 void decode(ceph::buffer::list::const_iterator& bl);
1561 void encode_old(ceph::buffer::list& bl) const;
1562 void decode_old(ceph::buffer::list::const_iterator& bl);
1563
1564 void dump(ceph::Formatter *f) const;
1565 static void generate_test_instances(std::list<snaprealm_reconnect_t*>& ls);
1566
1567 mutable ceph_mds_snaprealm_reconnect realm = {};
1568 };
1569 WRITE_CLASS_ENCODER(snaprealm_reconnect_t)
1570
// compat for pre-FLOCK feature
//
// Packed record of little-endian fields. Field order and the packed
// attribute define its byte layout, so neither must change.
// NOTE(review): WRITE_RAW_ENCODER presumably encodes the struct's bytes
// as-is -- confirm against the encoding macros.
struct old_ceph_mds_cap_reconnect {
  ceph_le64 cap_id;
  ceph_le32 wanted;
  ceph_le32 issued;
  ceph_le64 old_size;
  struct ceph_timespec old_mtime, old_atime;
  ceph_le64 snaprealm;
  ceph_le64 pathbase; /* base ino for our path to this ino */
} __attribute__ ((packed));
WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect)
1582
1583 struct old_cap_reconnect_t {
1584 const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) {
1585 path = n.path;
1586 capinfo.cap_id = n.capinfo.cap_id;
1587 capinfo.wanted = n.capinfo.wanted;
1588 capinfo.issued = n.capinfo.issued;
1589 capinfo.snaprealm = n.capinfo.snaprealm;
1590 capinfo.pathbase = n.capinfo.pathbase;
1591 return *this;
1592 }
1593 operator cap_reconnect_t() {
1594 cap_reconnect_t n;
1595 n.path = path;
1596 n.capinfo.cap_id = capinfo.cap_id;
1597 n.capinfo.wanted = capinfo.wanted;
1598 n.capinfo.issued = capinfo.issued;
1599 n.capinfo.snaprealm = capinfo.snaprealm;
1600 n.capinfo.pathbase = capinfo.pathbase;
1601 return n;
1602 }
1603
1604 void encode(ceph::buffer::list& bl) const {
1605 using ceph::encode;
1606 encode(path, bl);
1607 encode(capinfo, bl);
1608 }
1609 void decode(ceph::buffer::list::const_iterator& bl) {
1610 using ceph::decode;
1611 decode(path, bl);
1612 decode(capinfo, bl);
1613 }
1614
1615 std::string path;
1616 old_ceph_mds_cap_reconnect capinfo;
1617 };
1618 WRITE_CLASS_ENCODER(old_cap_reconnect_t)
1619
1620 // dir frag
1621 struct dirfrag_t {
1622 dirfrag_t() {}
1623 dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
1624
1625 void encode(ceph::buffer::list& bl) const {
1626 using ceph::encode;
1627 encode(ino, bl);
1628 encode(frag, bl);
1629 }
1630 void decode(ceph::buffer::list::const_iterator& bl) {
1631 using ceph::decode;
1632 decode(ino, bl);
1633 decode(frag, bl);
1634 }
1635
1636 inodeno_t ino = 0;
1637 frag_t frag;
1638 };
1639 WRITE_CLASS_ENCODER(dirfrag_t)
1640
1641 inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) {
1642 out << df.ino;
1643 if (!df.frag.is_root()) out << "." << df.frag;
1644 return out;
1645 }
1646 inline bool operator<(dirfrag_t l, dirfrag_t r) {
1647 if (l.ino < r.ino) return true;
1648 if (l.ino == r.ino && l.frag < r.frag) return true;
1649 return false;
1650 }
1651 inline bool operator==(dirfrag_t l, dirfrag_t r) {
1652 return l.ino == r.ino && l.frag == r.frag;
1653 }
1654
1655 namespace std {
1656 template<> struct hash<dirfrag_t> {
1657 size_t operator()(const dirfrag_t &df) const {
1658 static rjhash<uint64_t> H;
1659 static rjhash<uint32_t> I;
1660 return H(df.ino) ^ I(df.frag);
1661 }
1662 };
1663 } // namespace std
1664
1665 // ================================================================
// Slot indices of the dirfrag popularity counters (see dirfrag_load_vec_t).
#define META_POP_IRD      0
#define META_POP_IWR      1
#define META_POP_READDIR  2
#define META_POP_FETCH    3
#define META_POP_STORE    4
// number of slots above
#define META_NPOP         5
1672
1673 class inode_load_vec_t {
1674 public:
1675 using time = DecayCounter::time;
1676 using clock = DecayCounter::clock;
1677 static const size_t NUM = 2;
1678
1679 inode_load_vec_t() : vec{DecayCounter(DecayRate()), DecayCounter(DecayRate())} {}
1680 inode_load_vec_t(const DecayRate &rate) : vec{DecayCounter(rate), DecayCounter(rate)} {}
1681
1682 DecayCounter &get(int t) {
1683 return vec[t];
1684 }
1685 void zero() {
1686 for (auto &d : vec) {
1687 d.reset();
1688 }
1689 }
1690 void encode(ceph::buffer::list &bl) const;
1691 void decode(ceph::buffer::list::const_iterator& p);
1692 void dump(ceph::Formatter *f) const;
1693 static void generate_test_instances(std::list<inode_load_vec_t*>& ls);
1694
1695 private:
1696 std::array<DecayCounter, NUM> vec;
1697 };
1698 inline void encode(const inode_load_vec_t &c, ceph::buffer::list &bl) {
1699 c.encode(bl);
1700 }
1701 inline void decode(inode_load_vec_t & c, ceph::buffer::list::const_iterator &p) {
1702 c.decode(p);
1703 }
1704
1705 class dirfrag_load_vec_t {
1706 public:
1707 using time = DecayCounter::time;
1708 using clock = DecayCounter::clock;
1709 static const size_t NUM = 5;
1710
1711 dirfrag_load_vec_t() :
1712 vec{DecayCounter(DecayRate()),
1713 DecayCounter(DecayRate()),
1714 DecayCounter(DecayRate()),
1715 DecayCounter(DecayRate()),
1716 DecayCounter(DecayRate())
1717 }
1718 {}
1719 dirfrag_load_vec_t(const DecayRate &rate) :
1720 vec{DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate), DecayCounter(rate)}
1721 {}
1722
1723 void encode(ceph::buffer::list &bl) const {
1724 ENCODE_START(2, 2, bl);
1725 for (const auto &i : vec) {
1726 encode(i, bl);
1727 }
1728 ENCODE_FINISH(bl);
1729 }
1730 void decode(ceph::buffer::list::const_iterator &p) {
1731 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
1732 for (auto &i : vec) {
1733 decode(i, p);
1734 }
1735 DECODE_FINISH(p);
1736 }
1737 void dump(ceph::Formatter *f) const;
1738 void dump(ceph::Formatter *f, const DecayRate& rate) const;
1739 static void generate_test_instances(std::list<dirfrag_load_vec_t*>& ls);
1740
1741 const DecayCounter &get(int t) const {
1742 return vec[t];
1743 }
1744 DecayCounter &get(int t) {
1745 return vec[t];
1746 }
1747 void adjust(double d) {
1748 for (auto &i : vec) {
1749 i.adjust(d);
1750 }
1751 }
1752 void zero() {
1753 for (auto &i : vec) {
1754 i.reset();
1755 }
1756 }
1757 double meta_load() const {
1758 return
1759 1*vec[META_POP_IRD].get() +
1760 2*vec[META_POP_IWR].get() +
1761 1*vec[META_POP_READDIR].get() +
1762 2*vec[META_POP_FETCH].get() +
1763 4*vec[META_POP_STORE].get();
1764 }
1765
1766 void add(dirfrag_load_vec_t& r) {
1767 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1768 vec[i].adjust(r.vec[i].get());
1769 }
1770 void sub(dirfrag_load_vec_t& r) {
1771 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1772 vec[i].adjust(-r.vec[i].get());
1773 }
1774 void scale(double f) {
1775 for (size_t i=0; i<dirfrag_load_vec_t::NUM; i++)
1776 vec[i].scale(f);
1777 }
1778
1779 private:
1780 friend inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl);
1781 std::array<DecayCounter, NUM> vec;
1782 };
1783
1784 inline void encode(const dirfrag_load_vec_t &c, ceph::buffer::list &bl) {
1785 c.encode(bl);
1786 }
1787 inline void decode(dirfrag_load_vec_t& c, ceph::buffer::list::const_iterator &p) {
1788 c.decode(p);
1789 }
1790
1791 inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
1792 {
1793 CachedStackStringStream css;
1794 *css << std::setprecision(1) << std::fixed
1795 << "[pop"
1796 " IRD:" << dl.vec[0]
1797 << " IWR:" << dl.vec[1]
1798 << " RDR:" << dl.vec[2]
1799 << " FET:" << dl.vec[3]
1800 << " STR:" << dl.vec[4]
1801 << " *LOAD:" << dl.meta_load() << "]";
1802 return out << css->strv() << std::endl;
1803 }
1804
1805 struct mds_load_t {
1806 using clock = dirfrag_load_vec_t::clock;
1807 using time = dirfrag_load_vec_t::time;
1808
1809 dirfrag_load_vec_t auth;
1810 dirfrag_load_vec_t all;
1811
1812 mds_load_t() : auth(DecayRate()), all(DecayRate()) {}
1813 mds_load_t(const DecayRate &rate) : auth(rate), all(rate) {}
1814
1815 double req_rate = 0.0;
1816 double cache_hit_rate = 0.0;
1817 double queue_len = 0.0;
1818
1819 double cpu_load_avg = 0.0;
1820
1821 double mds_load() const; // defiend in MDBalancer.cc
1822 void encode(ceph::buffer::list& bl) const;
1823 void decode(ceph::buffer::list::const_iterator& bl);
1824 void dump(ceph::Formatter *f) const;
1825 static void generate_test_instances(std::list<mds_load_t*>& ls);
1826 };
1827 inline void encode(const mds_load_t &c, ceph::buffer::list &bl) {
1828 c.encode(bl);
1829 }
1830 inline void decode(mds_load_t &c, ceph::buffer::list::const_iterator &p) {
1831 c.decode(p);
1832 }
1833
1834 inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
1835 {
1836 return out << "mdsload<" << load.auth << "/" << load.all
1837 << ", req " << load.req_rate
1838 << ", hr " << load.cache_hit_rate
1839 << ", qlen " << load.queue_len
1840 << ", cpu " << load.cpu_load_avg
1841 << ">";
1842 }
1843
1844 class load_spread_t {
1845 public:
1846 using time = DecayCounter::time;
1847 using clock = DecayCounter::clock;
1848 static const int MAX = 4;
1849
1850 load_spread_t(const DecayRate &rate) : count(rate)
1851 {}
1852
1853 load_spread_t() = delete;
1854
1855 double hit(int who) {
1856 for (int i=0; i<n; i++)
1857 if (last[i] == who)
1858 return count.get_last();
1859
1860 // we're new(ish)
1861 last[p++] = who;
1862 if (n < MAX) n++;
1863 if (n == 1) return 0.0;
1864
1865 if (p == MAX) p = 0;
1866
1867 return count.hit();
1868 }
1869 double get() const {
1870 return count.get();
1871 }
1872
1873 std::array<int, MAX> last = {-1, -1, -1, -1};
1874 int p = 0, n = 0;
1875 DecayCounter count;
1876 };
1877
1878 // ================================================================
1879 typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t;
1880
1881 // -- authority delegation --
1882 // directory authority types
1883 // >= 0 is the auth mds
1884 #define CDIR_AUTH_PARENT mds_rank_t(-1) // default
1885 #define CDIR_AUTH_UNKNOWN mds_rank_t(-2)
1886 #define CDIR_AUTH_DEFAULT mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
1887 #define CDIR_AUTH_UNDEF mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
1888 //#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
1889
1890 class MDSCacheObjectInfo {
1891 public:
1892 void encode(ceph::buffer::list& bl) const;
1893 void decode(ceph::buffer::list::const_iterator& bl);
1894 void dump(ceph::Formatter *f) const;
1895 static void generate_test_instances(std::list<MDSCacheObjectInfo*>& ls);
1896
1897 inodeno_t ino = 0;
1898 dirfrag_t dirfrag;
1899 std::string dname;
1900 snapid_t snapid;
1901 };
1902
1903 inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) {
1904 if (info.ino) return out << info.ino << "." << info.snapid;
1905 if (info.dname.length()) return out << info.dirfrag << "/" << info.dname
1906 << " snap " << info.snapid;
1907 return out << info.dirfrag;
1908 }
1909
1910 inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) {
1911 if (l.ino || r.ino)
1912 return l.ino == r.ino && l.snapid == r.snapid;
1913 else
1914 return l.dirfrag == r.dirfrag && l.dname == r.dname;
1915 }
1916 WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
1917
// parse a map of keys/values.
namespace qi = boost::spirit::qi;

// boost::spirit::qi grammar producing a std::map<std::string, std::string>
// from text of the form "key=value key=value ...":
//  - pairs are separated by a single literal space
//  - a key starts with a letter or '_' and continues with letters, digits
//    or '_'
//  - a value is one or more characters from the set given to qi::char_
//    in the `value` rule below
template <typename Iterator>
struct keys_and_values
  : qi::grammar<Iterator, std::map<std::string, std::string>()>
{
  keys_and_values()
    : keys_and_values::base_type(query)
  {
    query = pair >> *(qi::lit(' ') >> pair);
    pair = key >> '=' >> value;
    key = qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
    value = +qi::char_("a-zA-Z0-9-_.");
  }
  // start rule: the whole map
  qi::rule<Iterator, std::map<std::string, std::string>()> query;
  // one key=value entry
  qi::rule<Iterator, std::pair<std::string, std::string>()> pair;
  qi::rule<Iterator, std::string()> key, value;
};
1937
1938 #endif