1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #ifndef CEPH_MDS_EMETABLOB_H
16 #define CEPH_MDS_EMETABLOB_H
18 #include <string_view>
22 #include "../CInode.h"
24 #include "../CDentry.h"
25 #include "../LogSegment.h"
27 #include "include/interval_set.h"
35 * a bunch of metadata in the journal
40 * - make sure you adjust the inode.version for any modified inode you
41 * journal. CDir and CDentry maintain a projected_version, but CInode
42 * doesn't, since the journaled inode usually has to be modified
43 * manually anyway (to delay the change in the MDS's cache until after
52 /* fullbit - a regular dentry + inode
54 * We encode this one a bit weirdly, just because (also, it's marginally faster
55 * on multiple encodes, which I think can happen):
56 * Encode a bufferlist on struct creation with all data members, without a struct_v.
57 * When encode is called, encode struct_v and then append the bufferlist.
58 * Decode straight into the appropriate variables.
60 * So, if you add members, encode them in the constructor and then change
61 * the struct_v in the encode function!
// Bit flags stored in fullbit::state. The numeric values must not change:
// they are tested by the is_*() predicates below and carried in `state`.
// constexpr (implicitly inline since C++17) gives each flag a definition in
// the header, so binding one to a const reference cannot fail at link time.
static constexpr int STATE_DIRTY            = (1<<0);
static constexpr int STATE_DIRTYPARENT      = (1<<1);
static constexpr int STATE_DIRTYPOOL        = (1<<2);
static constexpr int STATE_NEED_SNAPFLUSH   = (1<<3);
static constexpr int STATE_EPHEMERAL_RANDOM = (1<<4);
69 std::string dn
; // dentry
70 snapid_t dnfirst
, dnlast
;
72 CInode::mempool_inode inode
; // XXX should not be part of mempool; wait for std::pmr to simplify
73 fragtree_t dirfragtree
;
74 CInode::mempool_xattr_map xattrs
;
79 CInode::mempool_old_inode_map old_inodes
; // XXX should not be part of mempool; wait for std::pmr to simplify
81 fullbit(std::string_view d
, snapid_t df
, snapid_t dl
,
82 version_t v
, const CInode::mempool_inode
& i
, const fragtree_t
&dft
,
83 const CInode::mempool_xattr_map
&xa
, std::string_view sym
,
84 snapid_t os
, const bufferlist
&sbl
, __u8 st
,
85 const CInode::mempool_old_inode_map
*oi
= NULL
) :
86 dn(d
), dnfirst(df
), dnlast(dl
), dnv(v
), inode(i
), xattrs(xa
),
87 oldest_snap(os
), state(st
)
97 explicit fullbit(bufferlist::const_iterator
&p
) {
101 fullbit(const fullbit
&) = delete;
103 fullbit
& operator=(const fullbit
&) = delete;
105 void encode(bufferlist
& bl
, uint64_t features
) const;
106 void decode(bufferlist::const_iterator
&bl
);
107 void dump(Formatter
*f
) const;
108 static void generate_test_instances(std::list
<EMetaBlob::fullbit
*>& ls
);
110 void update_inode(MDSRank
*mds
, CInode
*in
);
111 bool is_dirty() const { return (state
& STATE_DIRTY
); }
112 bool is_dirty_parent() const { return (state
& STATE_DIRTYPARENT
); }
113 bool is_dirty_pool() const { return (state
& STATE_DIRTYPOOL
); }
114 bool need_snapflush() const { return (state
& STATE_NEED_SNAPFLUSH
); }
115 bool is_export_ephemeral_random() const { return (state
& STATE_EPHEMERAL_RANDOM
); }
117 void print(ostream
& out
) const {
118 out
<< " fullbit dn " << dn
<< " [" << dnfirst
<< "," << dnlast
<< "] dnv " << dnv
119 << " inode " << inode
.ino
120 << " state=" << state
<< std::endl
;
122 string
state_string() const {
124 bool marked_already
= false;
126 state_string
.append("dirty");
127 marked_already
= true;
129 if (is_dirty_parent()) {
130 state_string
.append(marked_already
? "+dirty_parent" : "dirty_parent");
132 state_string
.append("+dirty_pool");
137 WRITE_CLASS_ENCODER_FEATURES(fullbit
)
139 /* remotebit - a dentry + remote inode link (i.e. just an ino)
143 snapid_t dnfirst
, dnlast
;
146 unsigned char d_type
;
149 remotebit(std::string_view d
, snapid_t df
, snapid_t dl
, version_t v
, inodeno_t i
, unsigned char dt
, bool dr
) :
150 dn(d
), dnfirst(df
), dnlast(dl
), dnv(v
), ino(i
), d_type(dt
), dirty(dr
) { }
151 explicit remotebit(bufferlist::const_iterator
&p
) { decode(p
); }
152 remotebit(): dnfirst(0), dnlast(0), dnv(0), ino(0),
153 d_type('\0'), dirty(false) {}
155 void encode(bufferlist
& bl
) const;
156 void decode(bufferlist::const_iterator
&bl
);
157 void print(ostream
& out
) const {
158 out
<< " remotebit dn " << dn
<< " [" << dnfirst
<< "," << dnlast
<< "] dnv " << dnv
160 << " dirty=" << dirty
<< std::endl
;
162 void dump(Formatter
*f
) const;
163 static void generate_test_instances(std::list
<remotebit
*>& ls
);
165 WRITE_CLASS_ENCODER(remotebit
)
168 * nullbit - a null dentry
172 snapid_t dnfirst
, dnlast
;
176 nullbit(std::string_view d
, snapid_t df
, snapid_t dl
, version_t v
, bool dr
) :
177 dn(d
), dnfirst(df
), dnlast(dl
), dnv(v
), dirty(dr
) { }
178 explicit nullbit(bufferlist::const_iterator
&p
) { decode(p
); }
179 nullbit(): dnfirst(0), dnlast(0), dnv(0), dirty(false) {}
181 void encode(bufferlist
& bl
) const;
182 void decode(bufferlist::const_iterator
&bl
);
183 void dump(Formatter
*f
) const;
184 static void generate_test_instances(std::list
<nullbit
*>& ls
);
185 void print(ostream
& out
) const {
186 out
<< " nullbit dn " << dn
<< " [" << dnfirst
<< "," << dnlast
<< "] dnv " << dnv
187 << " dirty=" << dirty
<< std::endl
;
190 WRITE_CLASS_ENCODER(nullbit
)
193 /* dirlump - contains metadata for any dir we have contents for.
// Bit flags stored in dirlump::state; the values must stay as they are.
// constexpr (implicitly inline since C++17) gives each flag a definition in
// the header, so binding one to a const reference cannot fail at link time.
static constexpr int STATE_COMPLETE  = (1<<1);
static constexpr int STATE_DIRTY     = (1<<2);  // dirty due to THIS journal item, that is!
static constexpr int STATE_NEW       = (1<<3);  // new directory
static constexpr int STATE_IMPORTING = (1<<4);  // importing directory
static constexpr int STATE_DIRTYDFT  = (1<<5);  // dirty dirfragtree
206 __u32 nfull
, nremote
, nnull
;
209 mutable bufferlist dnbl
;
210 mutable bool dn_decoded
;
211 mutable list
<fullbit
> dfull
;
212 mutable vector
<remotebit
> dremote
;
213 mutable vector
<nullbit
> dnull
;
216 dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { }
217 dirlump(const dirlump
&) = delete;
218 dirlump
& operator=(const dirlump
&) = delete;
220 bool is_complete() const { return state
& STATE_COMPLETE
; }
221 void mark_complete() { state
|= STATE_COMPLETE
; }
222 bool is_dirty() const { return state
& STATE_DIRTY
; }
223 void mark_dirty() { state
|= STATE_DIRTY
; }
224 bool is_new() const { return state
& STATE_NEW
; }
225 void mark_new() { state
|= STATE_NEW
; }
226 bool is_importing() { return state
& STATE_IMPORTING
; }
227 void mark_importing() { state
|= STATE_IMPORTING
; }
228 bool is_dirty_dft() { return state
& STATE_DIRTYDFT
; }
229 void mark_dirty_dft() { state
|= STATE_DIRTYDFT
; }
231 const list
<fullbit
> &get_dfull() const { return dfull
; }
232 list
<fullbit
> &_get_dfull() { return dfull
; }
233 const vector
<remotebit
> &get_dremote() const { return dremote
; }
234 const vector
<nullbit
> &get_dnull() const { return dnull
; }
236 template< class... Args
>
237 void add_dfull(Args
&&... args
) {
238 dfull
.emplace_back(std::forward
<Args
>(args
)...);
240 template< class... Args
>
241 void add_dremote(Args
&&... args
) {
242 dremote
.emplace_back(std::forward
<Args
>(args
)...);
244 template< class... Args
>
245 void add_dnull(Args
&&... args
) {
246 dnull
.emplace_back(std::forward
<Args
>(args
)...);
249 void print(dirfrag_t dirfrag
, ostream
& out
) const {
250 out
<< "dirlump " << dirfrag
<< " v " << fnode
.version
251 << " state " << state
252 << " num " << nfull
<< "/" << nremote
<< "/" << nnull
255 for (const auto& p
: dfull
)
257 for (const auto& p
: dremote
)
259 for (const auto& p
: dnull
)
263 string
state_string() const {
265 bool marked_already
= false;
267 state_string
.append("complete");
268 marked_already
= true;
271 state_string
.append(marked_already
? "+dirty" : "dirty");
272 marked_already
= true;
275 state_string
.append(marked_already
? "+new" : "new");
280 // if this changes, update the versioning in encode for it!
281 void _encode_bits(uint64_t features
) const {
283 if (!dn_decoded
) return;
284 encode(dfull
, dnbl
, features
);
285 encode(dremote
, dnbl
);
288 void _decode_bits() const {
290 if (dn_decoded
) return;
291 auto p
= dnbl
.cbegin();
298 void encode(bufferlist
& bl
, uint64_t features
) const;
299 void decode(bufferlist::const_iterator
&bl
);
300 void dump(Formatter
*f
) const;
301 static void generate_test_instances(std::list
<dirlump
*>& ls
);
303 WRITE_CLASS_ENCODER_FEATURES(dirlump
)
305 // my lumps. preserve the order we added them in a list.
306 vector
<dirfrag_t
> lump_order
;
307 map
<dirfrag_t
, dirlump
> lump_map
;
310 vector
<pair
<__u8
,version_t
> > table_tids
; // tableclient transactions
312 inodeno_t opened_ino
;
314 inodeno_t renamed_dirino
;
315 vector
<frag_t
> renamed_dir_frags
;
318 // ino (pre)allocation. may involve both inotable AND session state.
319 version_t inotablev
, sessionmapv
;
320 inodeno_t allocated_ino
; // inotable
321 interval_set
<inodeno_t
> preallocated_inos
; // inotable + session
322 inodeno_t used_preallocated_ino
; // session
323 entity_name_t client_name
; // session
325 // inodes i've truncated
326 vector
<inodeno_t
> truncate_start
; // start truncate
327 map
<inodeno_t
, LogSegment::seq_t
> truncate_finish
; // finished truncate (started in segment blah)
330 vector
<inodeno_t
> destroyed_inodes
;
334 vector
<pair
<metareqid_t
,uint64_t> > client_reqs
;
335 vector
<pair
<metareqid_t
,uint64_t> > client_flushes
;
338 void encode(bufferlist
& bl
, uint64_t features
) const;
339 void decode(bufferlist::const_iterator
& bl
);
340 void get_inodes(std::set
<inodeno_t
> &inodes
) const;
341 void get_paths(std::vector
<std::string
> &paths
) const;
342 void get_dentries(std::map
<dirfrag_t
, std::set
<std::string
> > &dentries
) const;
343 entity_name_t
get_client_name() const {return client_name
;}
345 void dump(Formatter
*f
) const;
346 static void generate_test_instances(std::list
<EMetaBlob
*>& ls
);
348 uint64_t last_subtree_map
;
351 // for replay, in certain cases
352 //LogSegment *_segment;
354 EMetaBlob() : opened_ino(0), renamed_dirino(0),
355 inotablev(0), sessionmapv(0), allocated_ino(0),
356 last_subtree_map(0), event_seq(0)
358 EMetaBlob(const EMetaBlob
&) = delete;
360 EMetaBlob
& operator=(const EMetaBlob
&) = delete;
362 void print(ostream
& out
) {
363 for (const auto &p
: lump_order
)
364 lump_map
[p
].print(p
, out
);
367 void add_client_req(metareqid_t r
, uint64_t tid
=0) {
368 client_reqs
.push_back(pair
<metareqid_t
,uint64_t>(r
, tid
));
370 void add_client_flush(metareqid_t r
, uint64_t tid
=0) {
371 client_flushes
.push_back(pair
<metareqid_t
,uint64_t>(r
, tid
));
374 void add_table_transaction(int table
, version_t tid
) {
375 table_tids
.push_back(pair
<__u8
, version_t
>(table
, tid
));
378 void add_opened_ino(inodeno_t ino
) {
379 ceph_assert(!opened_ino
);
383 void set_ino_alloc(inodeno_t alloc
,
384 inodeno_t used_prealloc
,
385 interval_set
<inodeno_t
>& prealloc
,
386 entity_name_t client
,
387 version_t sv
, version_t iv
) {
388 allocated_ino
= alloc
;
389 used_preallocated_ino
= used_prealloc
;
390 preallocated_inos
= prealloc
;
391 client_name
= client
;
396 void add_truncate_start(inodeno_t ino
) {
397 truncate_start
.push_back(ino
);
399 void add_truncate_finish(inodeno_t ino
, uint64_t segoff
) {
400 truncate_finish
[ino
] = segoff
;
403 bool rewrite_truncate_finish(MDSRank
const *mds
, std::map
<uint64_t, uint64_t> const &old_to_new
);
405 void add_destroyed_inode(inodeno_t ino
) {
406 destroyed_inodes
.push_back(ino
);
409 void add_null_dentry(CDentry
*dn
, bool dirty
) {
410 add_null_dentry(add_dir(dn
->get_dir(), false), dn
, dirty
);
412 void add_null_dentry(dirlump
& lump
, CDentry
*dn
, bool dirty
) {
415 lump
.add_dnull(dn
->get_name(), dn
->first
, dn
->last
,
416 dn
->get_projected_version(), dirty
);
419 void add_remote_dentry(CDentry
*dn
, bool dirty
) {
420 add_remote_dentry(add_dir(dn
->get_dir(), false), dn
, dirty
, 0, 0);
422 void add_remote_dentry(CDentry
*dn
, bool dirty
, inodeno_t rino
, int rdt
) {
423 add_remote_dentry(add_dir(dn
->get_dir(), false), dn
, dirty
, rino
, rdt
);
425 void add_remote_dentry(dirlump
& lump
, CDentry
*dn
, bool dirty
,
426 inodeno_t rino
=0, unsigned char rdt
=0) {
428 rino
= dn
->get_projected_linkage()->get_remote_ino();
429 rdt
= dn
->get_projected_linkage()->get_remote_d_type();
432 lump
.add_dremote(dn
->get_name(), dn
->first
, dn
->last
,
433 dn
->get_projected_version(), rino
, rdt
, dirty
);
436 // journal a primary dentry together with its inode (returns nothing)
437 void add_primary_dentry(CDentry
*dn
, CInode
*in
, bool dirty
,
438 bool dirty_parent
=false, bool dirty_pool
=false,
439 bool need_snapflush
=false) {
441 if (dirty
) state
|= fullbit::STATE_DIRTY
;
442 if (dirty_parent
) state
|= fullbit::STATE_DIRTYPARENT
;
443 if (dirty_pool
) state
|= fullbit::STATE_DIRTYPOOL
;
444 if (need_snapflush
) state
|= fullbit::STATE_NEED_SNAPFLUSH
;
445 add_primary_dentry(add_dir(dn
->get_dir(), false), dn
, in
, state
);
447 void add_primary_dentry(dirlump
& lump
, CDentry
*dn
, CInode
*in
, __u8 state
) {
449 in
= dn
->get_projected_linkage()->get_inode();
451 if (in
->is_ephemeral_rand()) {
452 state
|= fullbit::STATE_EPHEMERAL_RANDOM
;
455 // make note of where this inode was last journaled
456 in
->last_journaled
= event_seq
;
457 //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
459 const auto pi
= in
->get_projected_inode();
460 if ((state
& fullbit::STATE_DIRTY
) && pi
->is_backtrace_updated())
461 state
|= fullbit::STATE_DIRTYPARENT
;
464 const sr_t
*sr
= in
->get_projected_srnode();
469 lump
.add_dfull(dn
->get_name(), dn
->first
, dn
->last
, dn
->get_projected_version(),
470 *pi
, in
->dirfragtree
, *in
->get_projected_xattrs(), in
->symlink
,
471 in
->oldest_snap
, snapbl
, state
, &in
->old_inodes
);
474 // convenience: primary or remote? figure it out.
475 void add_dentry(CDentry
*dn
, bool dirty
) {
476 dirlump
& lump
= add_dir(dn
->get_dir(), false);
477 add_dentry(lump
, dn
, dirty
, false, false);
479 void add_import_dentry(CDentry
*dn
) {
480 bool dirty_parent
= false;
481 bool dirty_pool
= false;
482 if (dn
->get_linkage()->is_primary()) {
483 dirty_parent
= dn
->get_linkage()->get_inode()->is_dirty_parent();
484 dirty_pool
= dn
->get_linkage()->get_inode()->is_dirty_pool();
486 dirlump
& lump
= add_dir(dn
->get_dir(), false);
487 add_dentry(lump
, dn
, dn
->is_dirty(), dirty_parent
, dirty_pool
);
489 void add_dentry(dirlump
& lump
, CDentry
*dn
, bool dirty
, bool dirty_parent
, bool dirty_pool
) {
491 if (dn
->get_projected_linkage()->is_remote()) {
492 add_remote_dentry(dn
, dirty
);
494 } else if (dn
->get_projected_linkage()->is_null()) {
495 add_null_dentry(dn
, dirty
);
498 ceph_assert(dn
->get_projected_linkage()->is_primary());
499 add_primary_dentry(dn
, 0, dirty
, dirty_parent
, dirty_pool
);
502 void add_root(bool dirty
, CInode
*in
) {
503 in
->last_journaled
= event_seq
;
504 //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
506 const auto& pi
= *(in
->get_projected_inode());
507 const auto& pdft
= in
->dirfragtree
;
508 const auto& px
= *(in
->get_projected_xattrs());
511 const sr_t
*sr
= in
->get_projected_srnode();
515 for (auto p
= roots
.begin(); p
!= roots
.end(); ++p
) {
516 if (p
->inode
.ino
== in
->ino()) {
523 roots
.emplace_back(empty
, in
->first
, in
->last
, 0, pi
, pdft
, px
, in
->symlink
,
524 in
->oldest_snap
, snapbl
, (dirty
? fullbit::STATE_DIRTY
: 0),
528 dirlump
& add_dir(CDir
*dir
, bool dirty
, bool complete
=false) {
529 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(), dir
->get_projected_version(),
532 dirlump
& add_new_dir(CDir
*dir
) {
533 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(), dir
->get_projected_version(),
534 true, true, true); // dirty AND complete AND new
536 dirlump
& add_import_dir(CDir
*dir
) {
537 // dirty=false would be okay in some cases
538 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(), dir
->get_projected_version(),
539 dir
->is_dirty(), dir
->is_complete(), false, true, dir
->is_dirty_dft());
541 dirlump
& add_fragmented_dir(CDir
*dir
, bool dirty
, bool dirtydft
) {
542 return add_dir(dir
->dirfrag(), dir
->get_projected_fnode(), dir
->get_projected_version(),
543 dirty
, false, false, false, dirtydft
);
545 dirlump
& add_dir(dirfrag_t df
, const fnode_t
*pf
, version_t pv
, bool dirty
,
546 bool complete
=false, bool isnew
=false,
547 bool importing
=false, bool dirty_dft
=false) {
548 if (lump_map
.count(df
) == 0)
549 lump_order
.push_back(df
);
551 dirlump
& l
= lump_map
[df
];
553 l
.fnode
.version
= pv
;
554 if (complete
) l
.mark_complete();
555 if (dirty
) l
.mark_dirty();
556 if (isnew
) l
.mark_new();
557 if (importing
) l
.mark_importing();
558 if (dirty_dft
) l
.mark_dirty_dft();
// Modes accepted by add_dir_context(). constexpr (implicitly inline since
// C++17) gives each constant a definition in the header, so ODR-use of
// either one cannot fail at link time.
static constexpr int TO_AUTH_SUBTREE_ROOT = 0;  // default.
static constexpr int TO_ROOT = 1;
565 void add_dir_context(CDir
*dir
, int mode
= TO_AUTH_SUBTREE_ROOT
);
568 return roots
.empty() && lump_order
.empty() && table_tids
.empty() &&
569 truncate_start
.empty() && truncate_finish
.empty() &&
570 destroyed_inodes
.empty() && client_reqs
.empty() &&
571 opened_ino
== 0 && inotablev
== 0 && sessionmapv
== 0;
574 void print(ostream
& out
) const {
576 if (!lump_order
.empty())
577 out
<< " " << lump_order
.front() << ", " << lump_map
.size() << " dirs";
578 if (!table_tids
.empty())
579 out
<< " table_tids=" << table_tids
;
580 if (allocated_ino
|| preallocated_inos
.size()) {
582 out
<< " alloc_ino=" << allocated_ino
;
583 if (preallocated_inos
.size())
584 out
<< " prealloc_ino=" << preallocated_inos
;
585 if (used_preallocated_ino
)
586 out
<< " used_prealloc_ino=" << used_preallocated_ino
;
587 out
<< " v" << inotablev
;
592 void update_segment(LogSegment
*ls
);
593 void replay(MDSRank
*mds
, LogSegment
*ls
, MDSlaveUpdate
*su
=NULL
);
595 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob
)
596 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::fullbit
)
597 WRITE_CLASS_ENCODER(EMetaBlob::remotebit
)
598 WRITE_CLASS_ENCODER(EMetaBlob::nullbit
)
599 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::dirlump
)
601 inline ostream
& operator<<(ostream
& out
, const EMetaBlob
& t
) {