]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/mdstypes.h
update sources to 12.2.7
[ceph.git] / ceph / src / mds / mdstypes.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3#ifndef CEPH_MDSTYPES_H
4#define CEPH_MDSTYPES_H
5
6#include "include/int_types.h"
7
8#include <math.h>
9#include <ostream>
10#include <set>
11#include <map>
94b18763 12#include <boost/utility/string_view.hpp>
7c673cae
FG
13
14#include "common/config.h"
15#include "common/Clock.h"
16#include "common/DecayCounter.h"
17#include "common/entity_name.h"
18
19#include "include/Context.h"
20#include "include/frag.h"
21#include "include/xlist.h"
22#include "include/interval_set.h"
23#include "include/compact_map.h"
24#include "include/compact_set.h"
25#include "include/fs_types.h"
26
27#include "inode_backtrace.h"
28
29#include <boost/spirit/include/qi.hpp>
30#include <boost/pool/pool.hpp>
31#include "include/assert.h"
32#include <boost/serialization/strong_typedef.hpp>
33
#define CEPH_FS_ONDISK_MAGIC "ceph fs volume v011"

#define MDS_PORT_CACHE   0x200
#define MDS_PORT_LOCKER  0x300
#define MDS_PORT_MIGRATOR 0x400

#define MAX_MDS   0x100
#define NUM_STRAY 10

#define MDS_INO_ROOT              1

// No longer created but recognised in existing filesystems
// so that we don't try to fragment it.
#define MDS_INO_CEPH              2

#define MDS_INO_MDSDIR_OFFSET     (1*MAX_MDS)
#define MDS_INO_STRAY_OFFSET      (6*MAX_MDS)

// Locations for journal data
#define MDS_INO_LOG_OFFSET        (2*MAX_MDS)
#define MDS_INO_LOG_BACKUP_OFFSET (3*MAX_MDS)
#define MDS_INO_LOG_POINTER_OFFSET    (4*MAX_MDS)
#define MDS_INO_PURGE_QUEUE       (5*MAX_MDS)

#define MDS_INO_SYSTEM_BASE       ((6*MAX_MDS) + (MAX_MDS * NUM_STRAY))

// Inode number of stray directory index `i` for value `x`
// (each `x` owns NUM_STRAY consecutive inode numbers).
#define MDS_INO_STRAY(x,i)  (MDS_INO_STRAY_OFFSET+((((unsigned)(x))*NUM_STRAY)+((unsigned)(i))))
// Inode number at offset `x` inside the mdsdir range.  The argument is
// parenthesized before the cast so that an expression argument is converted
// as a whole instead of having only its first operand cast.
#define MDS_INO_MDSDIR(x)   (MDS_INO_MDSDIR_OFFSET+((unsigned)(x)))

// Range checks and inverse mappings for the two macros above.
#define MDS_INO_IS_STRAY(i)  ((i) >= MDS_INO_STRAY_OFFSET  && (i) < (MDS_INO_STRAY_OFFSET+(MAX_MDS*NUM_STRAY)))
#define MDS_INO_IS_MDSDIR(i) ((i) >= MDS_INO_MDSDIR_OFFSET && (i) < (MDS_INO_MDSDIR_OFFSET+MAX_MDS))
#define MDS_INO_MDSDIR_OWNER(i) (signed ((unsigned (i)) - MDS_INO_MDSDIR_OFFSET))
#define MDS_INO_IS_BASE(i)   (MDS_INO_ROOT == (i) || MDS_INO_IS_MDSDIR(i))
#define MDS_INO_STRAY_OWNER(i) (signed (((unsigned (i)) - MDS_INO_STRAY_OFFSET) / NUM_STRAY))
#define MDS_INO_STRAY_INDEX(i) (((unsigned (i)) - MDS_INO_STRAY_OFFSET) % NUM_STRAY)

#define MDS_TRAVERSE_FORWARD       1
#define MDS_TRAVERSE_DISCOVER      2    // skips permissions checks etc.
#define MDS_TRAVERSE_DISCOVERXLOCK 3    // succeeds on (foreign?) null, xlocked dentries.
73
74
75typedef int32_t mds_rank_t;
76typedef int32_t fs_cluster_id_t;
77
78BOOST_STRONG_TYPEDEF(uint64_t, mds_gid_t)
79extern const mds_gid_t MDS_GID_NONE;
80constexpr fs_cluster_id_t FS_CLUSTER_ID_NONE = {-1};
81// The namespace ID of the anonymous default filesystem from legacy systems
82constexpr fs_cluster_id_t FS_CLUSTER_ID_ANONYMOUS = {0};
83extern const mds_rank_t MDS_RANK_NONE;
84
85class mds_role_t
86{
87 public:
88 fs_cluster_id_t fscid;
89 mds_rank_t rank;
90
91 mds_role_t(fs_cluster_id_t fscid_, mds_rank_t rank_)
92 : fscid(fscid_), rank(rank_)
93 {}
94 mds_role_t()
95 : fscid(FS_CLUSTER_ID_NONE), rank(MDS_RANK_NONE)
96 {}
97 bool operator<(mds_role_t const &rhs) const
98 {
99 if (fscid < rhs.fscid) {
100 return true;
101 } else if (fscid == rhs.fscid) {
102 return rank < rhs.rank;
103 } else {
104 return false;
105 }
106 }
107
108 bool is_none() const
109 {
110 return (rank == MDS_RANK_NONE);
111 }
112};
113std::ostream& operator<<(std::ostream &out, const mds_role_t &role);
114
115
116// CAPS
117
118inline string gcap_string(int cap)
119{
120 string s;
121 if (cap & CEPH_CAP_GSHARED) s += "s";
122 if (cap & CEPH_CAP_GEXCL) s += "x";
123 if (cap & CEPH_CAP_GCACHE) s += "c";
124 if (cap & CEPH_CAP_GRD) s += "r";
125 if (cap & CEPH_CAP_GWR) s += "w";
126 if (cap & CEPH_CAP_GBUFFER) s += "b";
127 if (cap & CEPH_CAP_GWREXTEND) s += "a";
128 if (cap & CEPH_CAP_GLAZYIO) s += "l";
129 return s;
130}
131inline string ccap_string(int cap)
132{
133 string s;
134 if (cap & CEPH_CAP_PIN) s += "p";
135
136 int a = (cap >> CEPH_CAP_SAUTH) & 3;
137 if (a) s += 'A' + gcap_string(a);
138
139 a = (cap >> CEPH_CAP_SLINK) & 3;
140 if (a) s += 'L' + gcap_string(a);
141
142 a = (cap >> CEPH_CAP_SXATTR) & 3;
143 if (a) s += 'X' + gcap_string(a);
144
145 a = cap >> CEPH_CAP_SFILE;
146 if (a) s += 'F' + gcap_string(a);
147
148 if (s.length() == 0)
149 s = "-";
150 return s;
151}
152
153
154struct scatter_info_t {
94b18763 155 version_t version = 0;
7c673cae 156
94b18763 157 scatter_info_t() {}
7c673cae
FG
158};
159
160struct frag_info_t : public scatter_info_t {
161 // this frag
162 utime_t mtime;
94b18763
FG
163 uint64_t change_attr = 0;
164 int64_t nfiles = 0; // files
165 int64_t nsubdirs = 0; // subdirs
7c673cae 166
94b18763 167 frag_info_t() {}
7c673cae
FG
168
169 int64_t size() const { return nfiles + nsubdirs; }
170
171 void zero() {
172 *this = frag_info_t();
173 }
174
175 // *this += cur - acc;
176 void add_delta(const frag_info_t &cur, const frag_info_t &acc, bool *touched_mtime=0, bool *touched_chattr=0) {
177 if (cur.mtime > mtime) {
178 mtime = cur.mtime;
179 if (touched_mtime)
180 *touched_mtime = true;
181 }
182 if (cur.change_attr > change_attr) {
183 change_attr = cur.change_attr;
184 if (touched_chattr)
185 *touched_chattr = true;
186 }
187 nfiles += cur.nfiles - acc.nfiles;
188 nsubdirs += cur.nsubdirs - acc.nsubdirs;
189 }
190
191 void add(const frag_info_t& other) {
192 if (other.mtime > mtime)
193 mtime = other.mtime;
194 if (other.change_attr > change_attr)
195 change_attr = other.change_attr;
196 nfiles += other.nfiles;
197 nsubdirs += other.nsubdirs;
198 }
199
200 bool same_sums(const frag_info_t &o) const {
201 return mtime <= o.mtime &&
202 nfiles == o.nfiles &&
203 nsubdirs == o.nsubdirs;
204 }
205
206 void encode(bufferlist &bl) const;
207 void decode(bufferlist::iterator& bl);
208 void dump(Formatter *f) const;
209 static void generate_test_instances(list<frag_info_t*>& ls);
210};
211WRITE_CLASS_ENCODER(frag_info_t)
212
213inline bool operator==(const frag_info_t &l, const frag_info_t &r) {
214 return memcmp(&l, &r, sizeof(l)) == 0;
215}
216inline bool operator!=(const frag_info_t &l, const frag_info_t &r) {
217 return !(l == r);
218}
219
220std::ostream& operator<<(std::ostream &out, const frag_info_t &f);
221
222
223struct nest_info_t : public scatter_info_t {
224 // this frag + children
225 utime_t rctime;
94b18763
FG
226 int64_t rbytes = 0;
227 int64_t rfiles = 0;
228 int64_t rsubdirs = 0;
7c673cae
FG
229 int64_t rsize() const { return rfiles + rsubdirs; }
230
94b18763 231 int64_t rsnaprealms = 0;
7c673cae 232
94b18763 233 nest_info_t() {}
7c673cae
FG
234
235 void zero() {
236 *this = nest_info_t();
237 }
238
239 void sub(const nest_info_t &other) {
240 add(other, -1);
241 }
242 void add(const nest_info_t &other, int fac=1) {
243 if (other.rctime > rctime)
244 rctime = other.rctime;
245 rbytes += fac*other.rbytes;
246 rfiles += fac*other.rfiles;
247 rsubdirs += fac*other.rsubdirs;
248 rsnaprealms += fac*other.rsnaprealms;
249 }
250
251 // *this += cur - acc;
252 void add_delta(const nest_info_t &cur, const nest_info_t &acc) {
253 if (cur.rctime > rctime)
254 rctime = cur.rctime;
255 rbytes += cur.rbytes - acc.rbytes;
256 rfiles += cur.rfiles - acc.rfiles;
257 rsubdirs += cur.rsubdirs - acc.rsubdirs;
258 rsnaprealms += cur.rsnaprealms - acc.rsnaprealms;
259 }
260
261 bool same_sums(const nest_info_t &o) const {
262 return rctime <= o.rctime &&
263 rbytes == o.rbytes &&
264 rfiles == o.rfiles &&
265 rsubdirs == o.rsubdirs &&
266 rsnaprealms == o.rsnaprealms;
267 }
268
269 void encode(bufferlist &bl) const;
270 void decode(bufferlist::iterator& bl);
271 void dump(Formatter *f) const;
272 static void generate_test_instances(list<nest_info_t*>& ls);
273};
274WRITE_CLASS_ENCODER(nest_info_t)
275
276inline bool operator==(const nest_info_t &l, const nest_info_t &r) {
277 return memcmp(&l, &r, sizeof(l)) == 0;
278}
279inline bool operator!=(const nest_info_t &l, const nest_info_t &r) {
280 return !(l == r);
281}
282
283std::ostream& operator<<(std::ostream &out, const nest_info_t &n);
284
285
286struct vinodeno_t {
287 inodeno_t ino;
288 snapid_t snapid;
289 vinodeno_t() {}
290 vinodeno_t(inodeno_t i, snapid_t s) : ino(i), snapid(s) {}
291
292 void encode(bufferlist& bl) const {
293 ::encode(ino, bl);
294 ::encode(snapid, bl);
295 }
296 void decode(bufferlist::iterator& p) {
297 ::decode(ino, p);
298 ::decode(snapid, p);
299 }
300};
301WRITE_CLASS_ENCODER(vinodeno_t)
302
303inline bool operator==(const vinodeno_t &l, const vinodeno_t &r) {
304 return l.ino == r.ino && l.snapid == r.snapid;
305}
306inline bool operator!=(const vinodeno_t &l, const vinodeno_t &r) {
307 return !(l == r);
308}
309inline bool operator<(const vinodeno_t &l, const vinodeno_t &r) {
310 return
311 l.ino < r.ino ||
312 (l.ino == r.ino && l.snapid < r.snapid);
313}
314
315struct quota_info_t
316{
94b18763
FG
317 int64_t max_bytes = 0;
318 int64_t max_files = 0;
7c673cae 319
94b18763 320 quota_info_t() {}
7c673cae
FG
321
322 void encode(bufferlist& bl) const {
323 ENCODE_START(1, 1, bl);
324 ::encode(max_bytes, bl);
325 ::encode(max_files, bl);
326 ENCODE_FINISH(bl);
327 }
328 void decode(bufferlist::iterator& p) {
329 DECODE_START_LEGACY_COMPAT_LEN(1, 1, 1, p);
330 ::decode(max_bytes, p);
331 ::decode(max_files, p);
332 DECODE_FINISH(p);
333 }
334
335 void dump(Formatter *f) const;
336 static void generate_test_instances(list<quota_info_t *>& ls);
337
338 bool is_valid() const {
339 return max_bytes >=0 && max_files >=0;
340 }
341 bool is_enable() const {
342 return max_bytes || max_files;
343 }
344};
345WRITE_CLASS_ENCODER(quota_info_t)
346
347inline bool operator==(const quota_info_t &l, const quota_info_t &r) {
348 return memcmp(&l, &r, sizeof(l)) == 0;
349}
350
351ostream& operator<<(ostream &out, const quota_info_t &n);
352
353namespace std {
354 template<> struct hash<vinodeno_t> {
355 size_t operator()(const vinodeno_t &vino) const {
356 hash<inodeno_t> H;
357 hash<uint64_t> I;
358 return H(vino.ino) ^ I(vino.snapid);
359 }
360 };
361} // namespace std
362
363
364
365
366inline std::ostream& operator<<(std::ostream &out, const vinodeno_t &vino) {
367 out << vino.ino;
368 if (vino.snapid == CEPH_NOSNAP)
369 out << ".head";
370 else if (vino.snapid)
371 out << '.' << vino.snapid;
372 return out;
373}
374
375
376/*
377 * client_writeable_range_t
378 */
379struct client_writeable_range_t {
380 struct byte_range_t {
94b18763
FG
381 uint64_t first = 0, last = 0; // interval client can write to
382 byte_range_t() {}
7c673cae
FG
383 };
384
385 byte_range_t range;
94b18763 386 snapid_t follows = 0; // aka "data+metadata flushed thru"
7c673cae 387
94b18763 388 client_writeable_range_t() {}
7c673cae
FG
389
390 void encode(bufferlist &bl) const;
391 void decode(bufferlist::iterator& bl);
392 void dump(Formatter *f) const;
94b18763 393 static void generate_test_instances(std::list<client_writeable_range_t*>& ls);
7c673cae
FG
394};
395
396inline void decode(client_writeable_range_t::byte_range_t& range, bufferlist::iterator& bl) {
397 ::decode(range.first, bl);
398 ::decode(range.last, bl);
399}
400
401WRITE_CLASS_ENCODER(client_writeable_range_t)
402
403std::ostream& operator<<(std::ostream& out, const client_writeable_range_t& r);
404
405inline bool operator==(const client_writeable_range_t& l,
406 const client_writeable_range_t& r) {
407 return l.range.first == r.range.first && l.range.last == r.range.last &&
408 l.follows == r.follows;
409}
410
411struct inline_data_t {
412private:
413 std::unique_ptr<bufferlist> blp;
414public:
94b18763 415 version_t version = 1;
7c673cae
FG
416
417 void free_data() {
418 blp.reset();
419 }
420 bufferlist& get_data() {
421 if (!blp)
422 blp.reset(new bufferlist);
423 return *blp;
424 }
425 size_t length() const { return blp ? blp->length() : 0; }
426
94b18763 427 inline_data_t() {}
7c673cae
FG
428 inline_data_t(const inline_data_t& o) : version(o.version) {
429 if (o.blp)
430 get_data() = *o.blp;
431 }
432 inline_data_t& operator=(const inline_data_t& o) {
433 version = o.version;
434 if (o.blp)
435 get_data() = *o.blp;
436 else
437 free_data();
438 return *this;
439 }
440 bool operator==(const inline_data_t& o) const {
441 return length() == o.length() &&
442 (length() == 0 ||
443 (*const_cast<bufferlist*>(blp.get()) == *const_cast<bufferlist*>(o.blp.get())));
444 }
445 bool operator!=(const inline_data_t& o) const {
446 return !(*this == o);
447 }
448 void encode(bufferlist &bl) const;
449 void decode(bufferlist::iterator& bl);
450};
451WRITE_CLASS_ENCODER(inline_data_t)
452
enum {
  DAMAGE_STATS,     // statistics (dirstat, size, etc)
  DAMAGE_RSTATS,    // recursive statistics (rstat, accounted_rstat)
  DAMAGE_FRAGTREE   // fragtree -- repair by searching
};
using damage_flags_t = uint32_t;
459
460/*
461 * inode_t
462 */
94b18763 463template<template<typename> class Allocator = std::allocator>
7c673cae
FG
464struct inode_t {
465 /**
466 * ***************
467 * Do not forget to add any new fields to the compare() function.
468 * ***************
469 */
470 // base (immutable)
94b18763
FG
471 inodeno_t ino = 0;
472 uint32_t rdev = 0; // if special file
7c673cae
FG
473
474 // affected by any inode change...
475 utime_t ctime; // inode change time
476 utime_t btime; // birth time
477
478 // perm (namespace permissions)
94b18763
FG
479 uint32_t mode = 0;
480 uid_t uid = 0;
481 gid_t gid = 0;
7c673cae
FG
482
483 // nlink
94b18763 484 int32_t nlink = 0;
7c673cae
FG
485
486 // file (data access)
487 ceph_dir_layout dir_layout; // [dir only]
488 file_layout_t layout;
94b18763
FG
489 compact_set<int64_t, std::less<int64_t>, Allocator<int64_t>> old_pools;
490 uint64_t size = 0; // on directory, # dentries
491 uint64_t max_size_ever = 0; // max size the file has ever been
492 uint32_t truncate_seq = 0;
493 uint64_t truncate_size = 0, truncate_from = 0;
494 uint32_t truncate_pending = 0;
7c673cae
FG
495 utime_t mtime; // file data modify time.
496 utime_t atime; // file data access time.
94b18763
FG
497 uint32_t time_warp_seq = 0; // count of (potential) mtime/atime timewarps (i.e., utimes())
498 inline_data_t inline_data; // FIXME check
7c673cae
FG
499
500 // change attribute
94b18763 501 uint64_t change_attr = 0;
7c673cae 502
94b18763
FG
503 using client_range_map = std::map<client_t,client_writeable_range_t,std::less<client_t>,Allocator<std::pair<const client_t,client_writeable_range_t>>>;
504 client_range_map client_ranges; // client(s) can write to these ranges
7c673cae
FG
505
506 // dirfrag, recursive accountin
507 frag_info_t dirstat; // protected by my filelock
508 nest_info_t rstat; // protected by my nestlock
509 nest_info_t accounted_rstat; // protected by parent's nestlock
510
511 quota_info_t quota;
512
94b18763 513 mds_rank_t export_pin = MDS_RANK_NONE;
7c673cae
FG
514
515 // special stuff
94b18763
FG
516 version_t version = 0; // auth only
517 version_t file_data_version = 0; // auth only
518 version_t xattr_version = 0;
7c673cae
FG
519
520 utime_t last_scrub_stamp; // start time of last complete scrub
94b18763 521 version_t last_scrub_version = 0;// (parent) start version of last complete scrub
7c673cae 522
94b18763 523 version_t backtrace_version = 0;
7c673cae
FG
524
525 snapid_t oldest_snap;
526
94b18763
FG
527 std::basic_string<char,std::char_traits<char>,Allocator<char>> stray_prior_path; //stores path before unlink
528
529 inode_t()
530 {
7c673cae
FG
531 clear_layout();
532 memset(&dir_layout, 0, sizeof(dir_layout));
533 memset(&quota, 0, sizeof(quota));
534 }
535
536 // file type
537 bool is_symlink() const { return (mode & S_IFMT) == S_IFLNK; }
538 bool is_dir() const { return (mode & S_IFMT) == S_IFDIR; }
539 bool is_file() const { return (mode & S_IFMT) == S_IFREG; }
540
541 bool is_truncating() const { return (truncate_pending > 0); }
542 void truncate(uint64_t old_size, uint64_t new_size) {
543 assert(new_size < old_size);
544 if (old_size > max_size_ever)
545 max_size_ever = old_size;
546 truncate_from = old_size;
547 size = new_size;
548 rstat.rbytes = new_size;
549 truncate_size = size;
550 truncate_seq++;
551 truncate_pending++;
552 }
553
554 bool has_layout() const {
555 return layout != file_layout_t();
556 }
557
558 void clear_layout() {
559 layout = file_layout_t();
560 }
561
562 uint64_t get_layout_size_increment() const {
563 return layout.get_period();
564 }
565
566 bool is_dirty_rstat() const { return !(rstat == accounted_rstat); }
567
568 uint64_t get_max_size() const {
569 uint64_t max = 0;
570 for (std::map<client_t,client_writeable_range_t>::const_iterator p = client_ranges.begin();
571 p != client_ranges.end();
572 ++p)
573 if (p->second.range.last > max)
574 max = p->second.range.last;
575 return max;
576 }
577 void set_max_size(uint64_t new_max) {
578 if (new_max == 0) {
579 client_ranges.clear();
580 } else {
581 for (std::map<client_t,client_writeable_range_t>::iterator p = client_ranges.begin();
582 p != client_ranges.end();
583 ++p)
584 p->second.range.last = new_max;
585 }
586 }
587
588 void trim_client_ranges(snapid_t last) {
589 std::map<client_t, client_writeable_range_t>::iterator p = client_ranges.begin();
590 while (p != client_ranges.end()) {
591 if (p->second.follows >= last)
592 client_ranges.erase(p++);
593 else
594 ++p;
595 }
596 }
597
598 bool is_backtrace_updated() const {
599 return backtrace_version == version;
600 }
601 void update_backtrace(version_t pv=0) {
602 backtrace_version = pv ? pv : version;
603 }
604
605 void add_old_pool(int64_t l) {
606 backtrace_version = version;
607 old_pools.insert(l);
608 }
609
610 void encode(bufferlist &bl, uint64_t features) const;
611 void decode(bufferlist::iterator& bl);
612 void dump(Formatter *f) const;
94b18763 613 static void generate_test_instances(std::list<inode_t*>& ls);
7c673cae
FG
614 /**
615 * Compare this inode_t with another that represent *the same inode*
616 * at different points in time.
617 * @pre The inodes are the same ino
618 *
619 * @param other The inode_t to compare ourselves with
620 * @param divergent A bool pointer which will be set to true
621 * if the values are different in a way that can't be explained
622 * by one being a newer version than the other.
623 *
624 * @returns 1 if we are newer than the other, 0 if equal, -1 if older.
625 */
626 int compare(const inode_t &other, bool *divergent) const;
627private:
628 bool older_is_consistent(const inode_t &other) const;
629};
7c673cae 630
94b18763
FG
631// These methods may be moved back to mdstypes.cc when we have pmr
632template<template<typename> class Allocator>
633void inode_t<Allocator>::encode(bufferlist &bl, uint64_t features) const
634{
635 ENCODE_START(15, 6, bl);
636
637 ::encode(ino, bl);
638 ::encode(rdev, bl);
639 ::encode(ctime, bl);
640
641 ::encode(mode, bl);
642 ::encode(uid, bl);
643 ::encode(gid, bl);
644
645 ::encode(nlink, bl);
646 {
647 // removed field
648 bool anchored = 0;
649 ::encode(anchored, bl);
650 }
651
652 ::encode(dir_layout, bl);
653 ::encode(layout, bl, features);
654 ::encode(size, bl);
655 ::encode(truncate_seq, bl);
656 ::encode(truncate_size, bl);
657 ::encode(truncate_from, bl);
658 ::encode(truncate_pending, bl);
659 ::encode(mtime, bl);
660 ::encode(atime, bl);
661 ::encode(time_warp_seq, bl);
662 ::encode(client_ranges, bl);
663
664 ::encode(dirstat, bl);
665 ::encode(rstat, bl);
666 ::encode(accounted_rstat, bl);
667
668 ::encode(version, bl);
669 ::encode(file_data_version, bl);
670 ::encode(xattr_version, bl);
671 ::encode(backtrace_version, bl);
672 ::encode(old_pools, bl);
673 ::encode(max_size_ever, bl);
674 ::encode(inline_data, bl);
675 ::encode(quota, bl);
676
677 ::encode(stray_prior_path, bl);
678
679 ::encode(last_scrub_version, bl);
680 ::encode(last_scrub_stamp, bl);
681
682 ::encode(btime, bl);
683 ::encode(change_attr, bl);
684
685 ::encode(export_pin, bl);
686
687 ENCODE_FINISH(bl);
688}
689
690template<template<typename> class Allocator>
691void inode_t<Allocator>::decode(bufferlist::iterator &p)
692{
693 DECODE_START_LEGACY_COMPAT_LEN(15, 6, 6, p);
694
695 ::decode(ino, p);
696 ::decode(rdev, p);
697 ::decode(ctime, p);
698
699 ::decode(mode, p);
700 ::decode(uid, p);
701 ::decode(gid, p);
702
703 ::decode(nlink, p);
704 {
705 bool anchored;
706 ::decode(anchored, p);
707 }
708
709 if (struct_v >= 4)
710 ::decode(dir_layout, p);
711 else
712 memset(&dir_layout, 0, sizeof(dir_layout));
713 ::decode(layout, p);
714 ::decode(size, p);
715 ::decode(truncate_seq, p);
716 ::decode(truncate_size, p);
717 ::decode(truncate_from, p);
718 if (struct_v >= 5)
719 ::decode(truncate_pending, p);
720 else
721 truncate_pending = 0;
722 ::decode(mtime, p);
723 ::decode(atime, p);
724 ::decode(time_warp_seq, p);
725 if (struct_v >= 3) {
726 ::decode(client_ranges, p);
727 } else {
728 map<client_t, client_writeable_range_t::byte_range_t> m;
729 ::decode(m, p);
730 for (map<client_t, client_writeable_range_t::byte_range_t>::iterator
731 q = m.begin(); q != m.end(); ++q)
732 client_ranges[q->first].range = q->second;
733 }
734
735 ::decode(dirstat, p);
736 ::decode(rstat, p);
737 ::decode(accounted_rstat, p);
738
739 ::decode(version, p);
740 ::decode(file_data_version, p);
741 ::decode(xattr_version, p);
742 if (struct_v >= 2)
743 ::decode(backtrace_version, p);
744 if (struct_v >= 7)
745 ::decode(old_pools, p);
746 if (struct_v >= 8)
747 ::decode(max_size_ever, p);
748 if (struct_v >= 9) {
749 ::decode(inline_data, p);
750 } else {
751 inline_data.version = CEPH_INLINE_NONE;
752 }
753 if (struct_v < 10)
754 backtrace_version = 0; // force update backtrace
755 if (struct_v >= 11)
756 ::decode(quota, p);
757
758 if (struct_v >= 12) {
759 std::string tmp;
760 ::decode(tmp, p);
761 stray_prior_path = std::basic_string<char,std::char_traits<char>,Allocator<char>>(boost::string_view(tmp));
762 }
763
764 if (struct_v >= 13) {
765 ::decode(last_scrub_version, p);
766 ::decode(last_scrub_stamp, p);
767 }
768 if (struct_v >= 14) {
769 ::decode(btime, p);
770 ::decode(change_attr, p);
771 } else {
772 btime = utime_t();
773 change_attr = 0;
774 }
775
776 if (struct_v >= 15) {
777 ::decode(export_pin, p);
778 } else {
779 export_pin = MDS_RANK_NONE;
780 }
781
782 DECODE_FINISH(p);
783}
784
785template<template<typename> class Allocator>
786void inode_t<Allocator>::dump(Formatter *f) const
787{
788 f->dump_unsigned("ino", ino);
789 f->dump_unsigned("rdev", rdev);
790 f->dump_stream("ctime") << ctime;
791 f->dump_stream("btime") << btime;
792 f->dump_unsigned("mode", mode);
793 f->dump_unsigned("uid", uid);
794 f->dump_unsigned("gid", gid);
795 f->dump_unsigned("nlink", nlink);
796
797 f->open_object_section("dir_layout");
798 ::dump(dir_layout, f);
799 f->close_section();
800
801 f->dump_object("layout", layout);
802
803 f->open_array_section("old_pools");
804 for (const auto &p : old_pools) {
805 f->dump_int("pool", p);
806 }
807 f->close_section();
808
809 f->dump_unsigned("size", size);
810 f->dump_unsigned("truncate_seq", truncate_seq);
811 f->dump_unsigned("truncate_size", truncate_size);
812 f->dump_unsigned("truncate_from", truncate_from);
813 f->dump_unsigned("truncate_pending", truncate_pending);
814 f->dump_stream("mtime") << mtime;
815 f->dump_stream("atime") << atime;
816 f->dump_unsigned("time_warp_seq", time_warp_seq);
817 f->dump_unsigned("change_attr", change_attr);
818 f->dump_int("export_pin", export_pin);
819
820 f->open_array_section("client_ranges");
821 for (const auto &p : client_ranges) {
822 f->open_object_section("client");
823 f->dump_unsigned("client", p.first.v);
824 p.second.dump(f);
825 f->close_section();
826 }
827 f->close_section();
828
829 f->open_object_section("dirstat");
830 dirstat.dump(f);
831 f->close_section();
832
833 f->open_object_section("rstat");
834 rstat.dump(f);
835 f->close_section();
836
837 f->open_object_section("accounted_rstat");
838 accounted_rstat.dump(f);
839 f->close_section();
840
841 f->dump_unsigned("version", version);
842 f->dump_unsigned("file_data_version", file_data_version);
843 f->dump_unsigned("xattr_version", xattr_version);
844 f->dump_unsigned("backtrace_version", backtrace_version);
845
846 f->dump_string("stray_prior_path", stray_prior_path);
847}
848
849template<template<typename> class Allocator>
850void inode_t<Allocator>::generate_test_instances(list<inode_t*>& ls)
851{
852 ls.push_back(new inode_t<Allocator>);
853 ls.push_back(new inode_t<Allocator>);
854 ls.back()->ino = 1;
855 // i am lazy.
856}
857
858template<template<typename> class Allocator>
859int inode_t<Allocator>::compare(const inode_t<Allocator> &other, bool *divergent) const
860{
861 assert(ino == other.ino);
862 *divergent = false;
863 if (version == other.version) {
864 if (rdev != other.rdev ||
865 ctime != other.ctime ||
866 btime != other.btime ||
867 mode != other.mode ||
868 uid != other.uid ||
869 gid != other.gid ||
870 nlink != other.nlink ||
871 memcmp(&dir_layout, &other.dir_layout, sizeof(dir_layout)) ||
872 layout != other.layout ||
873 old_pools != other.old_pools ||
874 size != other.size ||
875 max_size_ever != other.max_size_ever ||
876 truncate_seq != other.truncate_seq ||
877 truncate_size != other.truncate_size ||
878 truncate_from != other.truncate_from ||
879 truncate_pending != other.truncate_pending ||
880 change_attr != other.change_attr ||
881 mtime != other.mtime ||
882 atime != other.atime ||
883 time_warp_seq != other.time_warp_seq ||
884 inline_data != other.inline_data ||
885 client_ranges != other.client_ranges ||
886 !(dirstat == other.dirstat) ||
887 !(rstat == other.rstat) ||
888 !(accounted_rstat == other.accounted_rstat) ||
889 file_data_version != other.file_data_version ||
890 xattr_version != other.xattr_version ||
891 backtrace_version != other.backtrace_version) {
892 *divergent = true;
893 }
894 return 0;
895 } else if (version > other.version) {
896 *divergent = !older_is_consistent(other);
897 return 1;
898 } else {
899 assert(version < other.version);
900 *divergent = !other.older_is_consistent(*this);
901 return -1;
902 }
903}
904
905template<template<typename> class Allocator>
906bool inode_t<Allocator>::older_is_consistent(const inode_t<Allocator> &other) const
907{
908 if (max_size_ever < other.max_size_ever ||
909 truncate_seq < other.truncate_seq ||
910 time_warp_seq < other.time_warp_seq ||
911 inline_data.version < other.inline_data.version ||
912 dirstat.version < other.dirstat.version ||
913 rstat.version < other.rstat.version ||
914 accounted_rstat.version < other.accounted_rstat.version ||
915 file_data_version < other.file_data_version ||
916 xattr_version < other.xattr_version ||
917 backtrace_version < other.backtrace_version) {
918 return false;
919 }
920 return true;
921}
922
923template<template<typename> class Allocator>
924inline void encode(const inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
925{
926 ENCODE_DUMP_PRE();
927 c.encode(bl, features);
928 ENCODE_DUMP_POST(cl);
929}
930template<template<typename> class Allocator>
931inline void decode(inode_t<Allocator> &c, ::ceph::bufferlist::iterator &p)
932{
933 c.decode(p);
934}
935
936template<template<typename> class Allocator>
937using alloc_string = std::basic_string<char,std::char_traits<char>,Allocator<char>>;
938
939template<template<typename> class Allocator>
940using xattr_map = compact_map<alloc_string<Allocator>, bufferptr, std::less<alloc_string<Allocator>>, Allocator<std::pair<const alloc_string<Allocator>, bufferptr>>>; // FIXME bufferptr not in mempool
7c673cae
FG
941
942/*
943 * old_inode_t
944 */
94b18763 945template<template<typename> class Allocator = std::allocator>
7c673cae
FG
946struct old_inode_t {
947 snapid_t first;
94b18763
FG
948 inode_t<Allocator> inode;
949 xattr_map<Allocator> xattrs;
7c673cae
FG
950
951 void encode(bufferlist &bl, uint64_t features) const;
952 void decode(bufferlist::iterator& bl);
953 void dump(Formatter *f) const;
94b18763 954 static void generate_test_instances(std::list<old_inode_t*>& ls);
7c673cae 955};
94b18763
FG
956
957// These methods may be moved back to mdstypes.cc when we have pmr
958template<template<typename> class Allocator>
959void old_inode_t<Allocator>::encode(bufferlist& bl, uint64_t features) const
960{
961 ENCODE_START(2, 2, bl);
962 ::encode(first, bl);
963 ::encode(inode, bl, features);
964 ::encode(xattrs, bl);
965 ENCODE_FINISH(bl);
966}
967
968template<template<typename> class Allocator>
969void old_inode_t<Allocator>::decode(bufferlist::iterator& bl)
970{
971 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
972 ::decode(first, bl);
973 ::decode(inode, bl);
974 ::decode(xattrs, bl);
975 DECODE_FINISH(bl);
976}
977
978template<template<typename> class Allocator>
979void old_inode_t<Allocator>::dump(Formatter *f) const
980{
981 f->dump_unsigned("first", first);
982 inode.dump(f);
983 f->open_object_section("xattrs");
984 for (const auto &p : xattrs) {
985 std::string v(p.second.c_str(), p.second.length());
986 f->dump_string(p.first.c_str(), v);
987 }
988 f->close_section();
989}
990
991template<template<typename> class Allocator>
992void old_inode_t<Allocator>::generate_test_instances(std::list<old_inode_t<Allocator>*>& ls)
993{
994 ls.push_back(new old_inode_t<Allocator>);
995 ls.push_back(new old_inode_t<Allocator>);
996 ls.back()->first = 2;
997 std::list<inode_t<Allocator>*> ils;
998 inode_t<Allocator>::generate_test_instances(ils);
999 ls.back()->inode = *ils.back();
1000 ls.back()->xattrs["user.foo"] = buffer::copy("asdf", 4);
1001 ls.back()->xattrs["user.unprintable"] = buffer::copy("\000\001\002", 3);
1002}
1003
1004template<template<typename> class Allocator>
1005inline void encode(const old_inode_t<Allocator> &c, ::ceph::bufferlist &bl, uint64_t features)
1006{
1007 ENCODE_DUMP_PRE();
1008 c.encode(bl, features);
1009 ENCODE_DUMP_POST(cl);
1010}
1011template<template<typename> class Allocator>
1012inline void decode(old_inode_t<Allocator> &c, ::ceph::bufferlist::iterator &p)
1013{
1014 c.decode(p);
1015}
7c673cae
FG
1016
1017
1018/*
1019 * like an inode, but for a dir frag
1020 */
1021struct fnode_t {
94b18763 1022 version_t version = 0;
7c673cae
FG
1023 snapid_t snap_purged_thru; // the max_last_destroy snapid we've been purged thru
1024 frag_info_t fragstat, accounted_fragstat;
1025 nest_info_t rstat, accounted_rstat;
94b18763 1026 damage_flags_t damage_flags = 0;
7c673cae
FG
1027
1028 // we know we and all our descendants have been scrubbed since this version
94b18763 1029 version_t recursive_scrub_version = 0;
7c673cae
FG
1030 utime_t recursive_scrub_stamp;
1031 // version at which we last scrubbed our personal data structures
94b18763 1032 version_t localized_scrub_version = 0;
7c673cae
FG
1033 utime_t localized_scrub_stamp;
1034
1035 void encode(bufferlist &bl) const;
1036 void decode(bufferlist::iterator& bl);
1037 void dump(Formatter *f) const;
1038 static void generate_test_instances(list<fnode_t*>& ls);
94b18763 1039 fnode_t() {}
7c673cae
FG
1040};
1041WRITE_CLASS_ENCODER(fnode_t)
1042
1043
1044struct old_rstat_t {
1045 snapid_t first;
1046 nest_info_t rstat, accounted_rstat;
1047
1048 void encode(bufferlist& bl) const;
1049 void decode(bufferlist::iterator& p);
1050 void dump(Formatter *f) const;
1051 static void generate_test_instances(list<old_rstat_t*>& ls);
1052};
1053WRITE_CLASS_ENCODER(old_rstat_t)
1054
1055inline std::ostream& operator<<(std::ostream& out, const old_rstat_t& o) {
1056 return out << "old_rstat(first " << o.first << " " << o.rstat << " " << o.accounted_rstat << ")";
1057}
1058
1059
1060/*
1061 * session_info_t
1062 */
1063
1064struct session_info_t {
1065 entity_inst_t inst;
1066 std::map<ceph_tid_t,inodeno_t> completed_requests;
1067 interval_set<inodeno_t> prealloc_inos; // preallocated, ready to use.
1068 interval_set<inodeno_t> used_inos; // journaling use
1069 std::map<std::string, std::string> client_metadata;
1070 std::set<ceph_tid_t> completed_flushes;
1071 EntityName auth_name;
1072
1073 client_t get_client() const { return client_t(inst.name.num()); }
1074 const entity_name_t& get_source() const { return inst.name; }
1075
1076 void clear_meta() {
1077 prealloc_inos.clear();
1078 used_inos.clear();
1079 completed_requests.clear();
1080 completed_flushes.clear();
1081 }
1082
1083 void encode(bufferlist& bl, uint64_t features) const;
1084 void decode(bufferlist::iterator& p);
1085 void dump(Formatter *f) const;
1086 static void generate_test_instances(list<session_info_t*>& ls);
1087};
1088WRITE_CLASS_ENCODER_FEATURES(session_info_t)
1089
1090
1091// =======
1092// dentries
1093
1094struct dentry_key_t {
94b18763
FG
1095 snapid_t snapid = 0;
1096 boost::string_view name;
1097 __u32 hash = 0;
1098 dentry_key_t() {}
1099 dentry_key_t(snapid_t s, boost::string_view n, __u32 h=0) :
7c673cae
FG
1100 snapid(s), name(n), hash(h) {}
1101
94b18763 1102 bool is_valid() { return name.length() || snapid; }
7c673cae
FG
1103
1104 // encode into something that can be decoded as a string.
1105 // name_ (head) or name_%x (!head)
1106 void encode(bufferlist& bl) const {
1107 string key;
1108 encode(key);
1109 ::encode(key, bl);
1110 }
1111 void encode(string& key) const {
1112 char b[20];
1113 if (snapid != CEPH_NOSNAP) {
1114 uint64_t val(snapid);
1115 snprintf(b, sizeof(b), "%" PRIx64, val);
1116 } else {
1117 snprintf(b, sizeof(b), "%s", "head");
1118 }
1119 ostringstream oss;
1120 oss << name << "_" << b;
1121 key = oss.str();
1122 }
1123 static void decode_helper(bufferlist::iterator& bl, string& nm, snapid_t& sn) {
1124 string key;
1125 ::decode(key, bl);
1126 decode_helper(key, nm, sn);
1127 }
94b18763 1128 static void decode_helper(boost::string_view key, string& nm, snapid_t& sn) {
7c673cae
FG
1129 size_t i = key.find_last_of('_');
1130 assert(i != string::npos);
94b18763 1131 if (key.compare(i+1, boost::string_view::npos, "head") == 0) {
7c673cae
FG
1132 // name_head
1133 sn = CEPH_NOSNAP;
1134 } else {
1135 // name_%x
1136 long long unsigned x = 0;
94b18763
FG
1137 std::string x_str(key.substr(i+1));
1138 sscanf(x_str.c_str(), "%llx", &x);
7c673cae
FG
1139 sn = x;
1140 }
94b18763 1141 nm = std::string(key.substr(0, i));
7c673cae
FG
1142 }
1143};
1144
1145inline std::ostream& operator<<(std::ostream& out, const dentry_key_t &k)
1146{
1147 return out << "(" << k.name << "," << k.snapid << ")";
1148}
1149
1150inline bool operator<(const dentry_key_t& k1, const dentry_key_t& k2)
1151{
1152 /*
1153 * order by hash, name, snap
1154 */
1155 int c = ceph_frag_value(k1.hash) - ceph_frag_value(k2.hash);
1156 if (c)
1157 return c < 0;
94b18763 1158 c = k1.name.compare(k2.name);
7c673cae
FG
1159 if (c)
1160 return c < 0;
1161 return k1.snapid < k2.snapid;
1162}
1163
1164
1165/*
1166 * string_snap_t is a simple (string, snapid_t) pair
1167 */
1168struct string_snap_t {
1169 string name;
1170 snapid_t snapid;
1171 string_snap_t() {}
94b18763 1172 string_snap_t(boost::string_view n, snapid_t s) : name(n), snapid(s) {}
7c673cae
FG
1173 string_snap_t(const char *n, snapid_t s) : name(n), snapid(s) {}
1174
1175 void encode(bufferlist& bl) const;
1176 void decode(bufferlist::iterator& p);
1177 void dump(Formatter *f) const;
1178 static void generate_test_instances(list<string_snap_t*>& ls);
1179};
1180WRITE_CLASS_ENCODER(string_snap_t)
1181
1182inline bool operator<(const string_snap_t& l, const string_snap_t& r) {
94b18763 1183 int c = l.name.compare(r.name);
7c673cae
FG
1184 return c < 0 || (c == 0 && l.snapid < r.snapid);
1185}
1186
1187inline std::ostream& operator<<(std::ostream& out, const string_snap_t &k)
1188{
1189 return out << "(" << k.name << "," << k.snapid << ")";
1190}
1191
1192/*
1193 * mds_table_pending_t
1194 *
1195 * mds's requesting any pending ops. child needs to encode the corresponding
1196 * pending mutation state in the table.
1197 */
1198struct mds_table_pending_t {
94b18763
FG
1199 uint64_t reqid = 0;
1200 __s32 mds = 0;
1201 version_t tid = 0;
1202 mds_table_pending_t() {}
7c673cae
FG
1203 void encode(bufferlist& bl) const;
1204 void decode(bufferlist::iterator& bl);
1205 void dump(Formatter *f) const;
1206 static void generate_test_instances(list<mds_table_pending_t*>& ls);
1207};
1208WRITE_CLASS_ENCODER(mds_table_pending_t)
1209
1210
1211// =========
1212// requests
1213
1214struct metareqid_t {
1215 entity_name_t name;
94b18763
FG
1216 uint64_t tid = 0;
1217 metareqid_t() {}
7c673cae
FG
1218 metareqid_t(entity_name_t n, ceph_tid_t t) : name(n), tid(t) {}
1219 void encode(bufferlist& bl) const {
1220 ::encode(name, bl);
1221 ::encode(tid, bl);
1222 }
1223 void decode(bufferlist::iterator &p) {
1224 ::decode(name, p);
1225 ::decode(tid, p);
1226 }
1227};
1228WRITE_CLASS_ENCODER(metareqid_t)
1229
1230inline std::ostream& operator<<(std::ostream& out, const metareqid_t& r) {
1231 return out << r.name << ":" << r.tid;
1232}
1233
1234inline bool operator==(const metareqid_t& l, const metareqid_t& r) {
1235 return (l.name == r.name) && (l.tid == r.tid);
1236}
1237inline bool operator!=(const metareqid_t& l, const metareqid_t& r) {
1238 return (l.name != r.name) || (l.tid != r.tid);
1239}
1240inline bool operator<(const metareqid_t& l, const metareqid_t& r) {
1241 return (l.name < r.name) ||
1242 (l.name == r.name && l.tid < r.tid);
1243}
1244inline bool operator<=(const metareqid_t& l, const metareqid_t& r) {
1245 return (l.name < r.name) ||
1246 (l.name == r.name && l.tid <= r.tid);
1247}
1248inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); }
1249inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); }
1250
1251namespace std {
1252 template<> struct hash<metareqid_t> {
1253 size_t operator()(const metareqid_t &r) const {
1254 hash<uint64_t> H;
1255 return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid);
1256 }
1257 };
1258} // namespace std
1259
1260
1261// cap info for client reconnect
1262struct cap_reconnect_t {
1263 string path;
1264 mutable ceph_mds_cap_reconnect capinfo;
1265 snapid_t snap_follows;
1266 bufferlist flockbl;
1267
1268 cap_reconnect_t() {
1269 memset(&capinfo, 0, sizeof(capinfo));
1270 snap_follows = 0;
1271 }
94b18763 1272 cap_reconnect_t(uint64_t cap_id, inodeno_t pino, boost::string_view p, int w, int i,
7c673cae
FG
1273 inodeno_t sr, snapid_t sf, bufferlist& lb) :
1274 path(p) {
1275 capinfo.cap_id = cap_id;
1276 capinfo.wanted = w;
1277 capinfo.issued = i;
1278 capinfo.snaprealm = sr;
1279 capinfo.pathbase = pino;
1280 capinfo.flock_len = 0;
1281 snap_follows = sf;
1282 flockbl.claim(lb);
1283 }
1284 void encode(bufferlist& bl) const;
1285 void decode(bufferlist::iterator& bl);
1286 void encode_old(bufferlist& bl) const;
1287 void decode_old(bufferlist::iterator& bl);
1288
1289 void dump(Formatter *f) const;
1290 static void generate_test_instances(list<cap_reconnect_t*>& ls);
1291};
1292WRITE_CLASS_ENCODER(cap_reconnect_t)
1293
1294
1295// compat for pre-FLOCK feature
1296struct old_ceph_mds_cap_reconnect {
1297 __le64 cap_id;
1298 __le32 wanted;
1299 __le32 issued;
1300 __le64 old_size;
1301 struct ceph_timespec old_mtime, old_atime;
1302 __le64 snaprealm;
1303 __le64 pathbase; /* base ino for our path to this ino */
1304} __attribute__ ((packed));
1305WRITE_RAW_ENCODER(old_ceph_mds_cap_reconnect)
1306
1307struct old_cap_reconnect_t {
1308 string path;
1309 old_ceph_mds_cap_reconnect capinfo;
1310
1311 const old_cap_reconnect_t& operator=(const cap_reconnect_t& n) {
1312 path = n.path;
1313 capinfo.cap_id = n.capinfo.cap_id;
1314 capinfo.wanted = n.capinfo.wanted;
1315 capinfo.issued = n.capinfo.issued;
1316 capinfo.snaprealm = n.capinfo.snaprealm;
1317 capinfo.pathbase = n.capinfo.pathbase;
1318 return *this;
1319 }
1320 operator cap_reconnect_t() {
1321 cap_reconnect_t n;
1322 n.path = path;
1323 n.capinfo.cap_id = capinfo.cap_id;
1324 n.capinfo.wanted = capinfo.wanted;
1325 n.capinfo.issued = capinfo.issued;
1326 n.capinfo.snaprealm = capinfo.snaprealm;
1327 n.capinfo.pathbase = capinfo.pathbase;
1328 return n;
1329 }
1330
1331 void encode(bufferlist& bl) const {
1332 ::encode(path, bl);
1333 ::encode(capinfo, bl);
1334 }
1335 void decode(bufferlist::iterator& bl) {
1336 ::decode(path, bl);
1337 ::decode(capinfo, bl);
1338 }
1339};
1340WRITE_CLASS_ENCODER(old_cap_reconnect_t)
1341
1342
1343// ================================================================
1344// dir frag
1345
1346struct dirfrag_t {
94b18763 1347 inodeno_t ino = 0;
7c673cae
FG
1348 frag_t frag;
1349
94b18763 1350 dirfrag_t() {}
7c673cae
FG
1351 dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { }
1352
1353 void encode(bufferlist& bl) const {
1354 ::encode(ino, bl);
1355 ::encode(frag, bl);
1356 }
1357 void decode(bufferlist::iterator& bl) {
1358 ::decode(ino, bl);
1359 ::decode(frag, bl);
1360 }
1361};
1362WRITE_CLASS_ENCODER(dirfrag_t)
1363
1364
1365inline std::ostream& operator<<(std::ostream& out, const dirfrag_t &df) {
1366 out << df.ino;
1367 if (!df.frag.is_root()) out << "." << df.frag;
1368 return out;
1369}
1370inline bool operator<(dirfrag_t l, dirfrag_t r) {
1371 if (l.ino < r.ino) return true;
1372 if (l.ino == r.ino && l.frag < r.frag) return true;
1373 return false;
1374}
1375inline bool operator==(dirfrag_t l, dirfrag_t r) {
1376 return l.ino == r.ino && l.frag == r.frag;
1377}
1378
1379namespace std {
1380 template<> struct hash<dirfrag_t> {
1381 size_t operator()(const dirfrag_t &df) const {
1382 static rjhash<uint64_t> H;
1383 static rjhash<uint32_t> I;
1384 return H(df.ino) ^ I(df.frag);
1385 }
1386 };
1387} // namespace std
1388
1389
1390
1391// ================================================================
1392
// Popularity counter slots. dirfrag_load_vec_t::meta_load() uses these as
// indices into its counter array; META_NPOP is the number of slots.
#define META_POP_IRD     0
#define META_POP_IWR     1
#define META_POP_READDIR 2
#define META_POP_FETCH   3
#define META_POP_STORE   4
#define META_NPOP        5
1399
1400class inode_load_vec_t {
1401 static const int NUM = 2;
94b18763 1402 std::array<DecayCounter, NUM> vec;
7c673cae
FG
1403public:
1404 explicit inode_load_vec_t(const utime_t &now)
94b18763 1405 : vec{DecayCounter(now), DecayCounter(now)}
7c673cae
FG
1406 {}
1407 // for dencoder infrastructure
94b18763 1408 inode_load_vec_t() {}
7c673cae
FG
1409 DecayCounter &get(int t) {
1410 assert(t < NUM);
1411 return vec[t];
1412 }
1413 void zero(utime_t now) {
1414 for (int i=0; i<NUM; i++)
1415 vec[i].reset(now);
1416 }
1417 void encode(bufferlist &bl) const;
1418 void decode(const utime_t &t, bufferlist::iterator &p);
1419 // for dencoder
1420 void decode(bufferlist::iterator& p) { utime_t sample; decode(sample, p); }
1421 void dump(Formatter *f);
1422 static void generate_test_instances(list<inode_load_vec_t*>& ls);
1423};
1424inline void encode(const inode_load_vec_t &c, bufferlist &bl) { c.encode(bl); }
1425inline void decode(inode_load_vec_t & c, const utime_t &t, bufferlist::iterator &p) {
1426 c.decode(t, p);
1427}
1428// for dencoder
1429inline void decode(inode_load_vec_t & c, bufferlist::iterator &p) {
1430 utime_t sample;
1431 c.decode(sample, p);
1432}
1433
1434class dirfrag_load_vec_t {
1435public:
1436 static const int NUM = 5;
94b18763 1437 std::array<DecayCounter, NUM> vec;
7c673cae 1438 explicit dirfrag_load_vec_t(const utime_t &now)
94b18763
FG
1439 : vec{
1440 DecayCounter(now),
1441 DecayCounter(now),
1442 DecayCounter(now),
1443 DecayCounter(now),
1444 DecayCounter(now)
1445 }
7c673cae 1446 {}
94b18763
FG
1447 // for dencoder infrastructure
1448 dirfrag_load_vec_t() {}
7c673cae
FG
1449 void encode(bufferlist &bl) const {
1450 ENCODE_START(2, 2, bl);
94b18763
FG
1451 for (const auto &i : vec) {
1452 ::encode(i, bl);
1453 }
7c673cae
FG
1454 ENCODE_FINISH(bl);
1455 }
1456 void decode(const utime_t &t, bufferlist::iterator &p) {
1457 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, p);
94b18763
FG
1458 for (auto &i : vec) {
1459 ::decode(i, t, p);
1460 }
7c673cae
FG
1461 DECODE_FINISH(p);
1462 }
1463 // for dencoder infrastructure
1464 void decode(bufferlist::iterator& p) {
1465 utime_t sample;
1466 decode(sample, p);
1467 }
1468 void dump(Formatter *f) const;
28e407b8 1469 void dump(Formatter *f, utime_t now, const DecayRate& rate);
7c673cae
FG
1470 static void generate_test_instances(list<dirfrag_load_vec_t*>& ls);
1471
1472 DecayCounter &get(int t) {
1473 assert(t < NUM);
1474 return vec[t];
1475 }
1476 void adjust(utime_t now, const DecayRate& rate, double d) {
94b18763
FG
1477 for (auto &i : vec) {
1478 i.adjust(now, rate, d);
1479 }
7c673cae
FG
1480 }
1481 void zero(utime_t now) {
94b18763
FG
1482 for (auto &i : vec) {
1483 i.reset(now);
1484 }
7c673cae
FG
1485 }
1486 double meta_load(utime_t now, const DecayRate& rate) {
1487 return
1488 1*vec[META_POP_IRD].get(now, rate) +
1489 2*vec[META_POP_IWR].get(now, rate) +
1490 1*vec[META_POP_READDIR].get(now, rate) +
1491 2*vec[META_POP_FETCH].get(now, rate) +
1492 4*vec[META_POP_STORE].get(now, rate);
1493 }
28e407b8 1494 double meta_load() const {
7c673cae
FG
1495 return
1496 1*vec[META_POP_IRD].get_last() +
1497 2*vec[META_POP_IWR].get_last() +
1498 1*vec[META_POP_READDIR].get_last() +
1499 2*vec[META_POP_FETCH].get_last() +
1500 4*vec[META_POP_STORE].get_last();
1501 }
1502
1503 void add(utime_t now, DecayRate& rate, dirfrag_load_vec_t& r) {
1504 for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
1505 vec[i].adjust(r.vec[i].get(now, rate));
1506 }
1507 void sub(utime_t now, DecayRate& rate, dirfrag_load_vec_t& r) {
1508 for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
1509 vec[i].adjust(-r.vec[i].get(now, rate));
1510 }
1511 void scale(double f) {
1512 for (int i=0; i<dirfrag_load_vec_t::NUM; i++)
1513 vec[i].scale(f);
1514 }
1515};
1516
1517inline void encode(const dirfrag_load_vec_t &c, bufferlist &bl) { c.encode(bl); }
1518inline void decode(dirfrag_load_vec_t& c, const utime_t &t, bufferlist::iterator &p) {
1519 c.decode(t, p);
1520}
1521// this for dencoder
1522inline void decode(dirfrag_load_vec_t& c, bufferlist::iterator &p) {
1523 utime_t sample;
1524 c.decode(sample, p);
1525}
1526
28e407b8 1527inline std::ostream& operator<<(std::ostream& out, const dirfrag_load_vec_t& dl)
7c673cae 1528{
28e407b8
AA
1529 return out << "[" << dl.vec[0].get_last() << "," << dl.vec[1].get_last()
1530 << " " << dl.meta_load() << "]";
7c673cae
FG
1531}
1532
1533
1534
1535
1536
1537
1538/* mds_load_t
1539 * mds load
1540 */
1541
1542struct mds_load_t {
1543 dirfrag_load_vec_t auth;
1544 dirfrag_load_vec_t all;
1545
94b18763
FG
1546 double req_rate = 0.0;
1547 double cache_hit_rate = 0.0;
1548 double queue_len = 0.0;
7c673cae 1549
94b18763 1550 double cpu_load_avg = 0.0;
7c673cae 1551
94b18763 1552 explicit mds_load_t(const utime_t &t) : auth(t), all(t) {}
7c673cae 1553 // mostly for the dencoder infrastructure
94b18763 1554 mds_load_t() : auth(), all() {}
7c673cae
FG
1555
1556 double mds_load(); // defiend in MDBalancer.cc
1557 void encode(bufferlist& bl) const;
1558 void decode(const utime_t& now, bufferlist::iterator& bl);
1559 //this one is for dencoder infrastructure
1560 void decode(bufferlist::iterator& bl) { utime_t sample; decode(sample, bl); }
1561 void dump(Formatter *f) const;
1562 static void generate_test_instances(list<mds_load_t*>& ls);
1563};
1564inline void encode(const mds_load_t &c, bufferlist &bl) { c.encode(bl); }
1565inline void decode(mds_load_t &c, const utime_t &t, bufferlist::iterator &p) {
1566 c.decode(t, p);
1567}
1568// this one is for dencoder
1569inline void decode(mds_load_t &c, bufferlist::iterator &p) {
1570 utime_t sample;
1571 c.decode(sample, p);
1572}
1573
28e407b8 1574inline std::ostream& operator<<(std::ostream& out, const mds_load_t& load)
7c673cae
FG
1575{
1576 return out << "mdsload<" << load.auth << "/" << load.all
1577 << ", req " << load.req_rate
1578 << ", hr " << load.cache_hit_rate
1579 << ", qlen " << load.queue_len
1580 << ", cpu " << load.cpu_load_avg
1581 << ">";
1582}
1583
1584class load_spread_t {
1585public:
1586 static const int MAX = 4;
1587 int last[MAX];
94b18763 1588 int p = 0, n = 0;
7c673cae
FG
1589 DecayCounter count;
1590
1591public:
94b18763 1592 load_spread_t() : count(ceph_clock_now())
7c673cae
FG
1593 {
1594 for (int i=0; i<MAX; i++)
1595 last[i] = -1;
1596 }
1597
1598 double hit(utime_t now, const DecayRate& rate, int who) {
1599 for (int i=0; i<n; i++)
1600 if (last[i] == who)
1601 return count.get_last();
1602
1603 // we're new(ish)
1604 last[p++] = who;
1605 if (n < MAX) n++;
1606 if (n == 1) return 0.0;
1607
1608 if (p == MAX) p = 0;
1609
1610 return count.hit(now, rate);
1611 }
1612 double get(utime_t now, const DecayRate& rate) {
1613 return count.get(now, rate);
1614 }
1615};
1616
1617
1618
1619// ================================================================
// A pair of mds ranks describing directory authority.
typedef std::pair<mds_rank_t, mds_rank_t> mds_authority_t;

// -- authority delegation --
// directory authority types
//  >= 0 is the auth mds
// Negative ranks are sentinels; the two pair macros combine them.
#define CDIR_AUTH_PARENT   mds_rank_t(-1)   // default
#define CDIR_AUTH_UNKNOWN  mds_rank_t(-2)
#define CDIR_AUTH_DEFAULT  mds_authority_t(CDIR_AUTH_PARENT, CDIR_AUTH_UNKNOWN)
#define CDIR_AUTH_UNDEF    mds_authority_t(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)
//#define CDIR_AUTH_ROOTINODE pair<int,int>( 0, -2)
1630
1631class MDSCacheObjectInfo {
1632public:
94b18763 1633 inodeno_t ino = 0;
7c673cae
FG
1634 dirfrag_t dirfrag;
1635 string dname;
1636 snapid_t snapid;
1637
94b18763 1638 MDSCacheObjectInfo() {}
7c673cae
FG
1639
1640 void encode(bufferlist& bl) const;
1641 void decode(bufferlist::iterator& bl);
1642 void dump(Formatter *f) const;
1643 static void generate_test_instances(list<MDSCacheObjectInfo*>& ls);
1644};
1645
1646inline std::ostream& operator<<(std::ostream& out, const MDSCacheObjectInfo &info) {
1647 if (info.ino) return out << info.ino << "." << info.snapid;
1648 if (info.dname.length()) return out << info.dirfrag << "/" << info.dname
1649 << " snap " << info.snapid;
1650 return out << info.dirfrag;
1651}
1652
1653inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) {
1654 if (l.ino || r.ino)
1655 return l.ino == r.ino && l.snapid == r.snapid;
1656 else
1657 return l.dirfrag == r.dirfrag && l.dname == r.dname;
1658}
1659WRITE_CLASS_ENCODER(MDSCacheObjectInfo)
1660
1661
1662// parse a map of keys/values.
1663namespace qi = boost::spirit::qi;
1664
// Spirit Qi grammar whose attribute is a std::map<string, string>, parsed
// from space-separated "key=value" pairs.
template <typename Iterator>
struct keys_and_values
  : qi::grammar<Iterator, std::map<string, string>()>
{
    keys_and_values()
      : keys_and_values::base_type(query)
    {
      // one pair, then any number of further pairs each preceded by one space
      query =  pair >> *(qi::lit(' ') >> pair);
      pair  =  key >> '=' >> value;
      // key: a letter or underscore, then letters, digits or underscores
      key   =  qi::char_("a-zA-Z_") >> *qi::char_("a-zA-Z_0-9");
      // value: one or more letters, digits or underscores
      value = +qi::char_("a-zA-Z_0-9");
    }
    qi::rule<Iterator, std::map<string, string>()> query;
    qi::rule<Iterator, std::pair<string, string>()> pair;
    qi::rule<Iterator, string()> key, value;
};
1681
1682#endif