]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/events/EMetaBlob.h
import 15.2.5
[ceph.git] / ceph / src / mds / events / EMetaBlob.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_MDS_EMETABLOB_H
16 #define CEPH_MDS_EMETABLOB_H
17
18 #include <string_view>
19
20 #include <stdlib.h>
21
22 #include "../CInode.h"
23 #include "../CDir.h"
24 #include "../CDentry.h"
25 #include "../LogSegment.h"
26
27 #include "include/interval_set.h"
28
29 class MDSRank;
30 class MDLog;
31 class LogSegment;
32 struct MDSlaveUpdate;
33
34 /*
35 * a bunch of metadata in the journal
36 */
37
38 /* notes:
39 *
40 * - make sure you adjust the inode.version for any modified inode you
41 * journal. CDir and CDentry maintain a projected_version, but CInode
42 * doesn't, since the journaled inode usually has to be modified
43 * manually anyway (to delay the change in the MDS's cache until after
44 * it is journaled).
45 *
46 */
47
48
49 class EMetaBlob {
50
51 public:
52 /* fullbit - a regular dentry + inode
53 *
54 * We encode this one a bit weirdly, just because (also, it's marginally faster
55 * on multiple encodes, which I think can happen):
56 * Encode a bufferlist on struct creation with all data members, without a struct_v.
57 * When encode is called, encode struct_v and then append the bufferlist.
58 * Decode straight into the appropriate variables.
59 *
60 * So, if you add members, encode them in the constructor and then change
61 * the struct_v in the encode function!
62 */
63 struct fullbit {
64 static const int STATE_DIRTY = (1<<0);
65 static const int STATE_DIRTYPARENT = (1<<1);
66 static const int STATE_DIRTYPOOL = (1<<2);
67 static const int STATE_NEED_SNAPFLUSH = (1<<3);
68 static const int STATE_EPHEMERAL_RANDOM = (1<<4);
69 std::string dn; // dentry
70 snapid_t dnfirst, dnlast;
71 version_t dnv{0};
72 CInode::mempool_inode inode; // if it's not XXX should not be part of mempool; wait for std::pmr to simplify
73 fragtree_t dirfragtree;
74 CInode::mempool_xattr_map xattrs;
75 std::string symlink;
76 snapid_t oldest_snap;
77 bufferlist snapbl;
78 __u8 state{0};
79 CInode::mempool_old_inode_map old_inodes; // XXX should not be part of mempool; wait for std::pmr to simplify
80
81 fullbit(std::string_view d, snapid_t df, snapid_t dl,
82 version_t v, const CInode::mempool_inode& i, const fragtree_t &dft,
83 const CInode::mempool_xattr_map &xa, std::string_view sym,
84 snapid_t os, const bufferlist &sbl, __u8 st,
85 const CInode::mempool_old_inode_map *oi = NULL) :
86 dn(d), dnfirst(df), dnlast(dl), dnv(v), inode(i), xattrs(xa),
87 oldest_snap(os), state(st)
88 {
89 if (i.is_symlink())
90 symlink = sym;
91 if (i.is_dir())
92 dirfragtree = dft;
93 if (oi)
94 old_inodes = *oi;
95 snapbl = sbl;
96 }
97 explicit fullbit(bufferlist::const_iterator &p) {
98 decode(p);
99 }
100 fullbit() {}
101 fullbit(const fullbit&) = delete;
102 ~fullbit() {}
103 fullbit& operator=(const fullbit&) = delete;
104
105 void encode(bufferlist& bl, uint64_t features) const;
106 void decode(bufferlist::const_iterator &bl);
107 void dump(Formatter *f) const;
108 static void generate_test_instances(std::list<EMetaBlob::fullbit*>& ls);
109
110 void update_inode(MDSRank *mds, CInode *in);
111 bool is_dirty() const { return (state & STATE_DIRTY); }
112 bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); }
113 bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); }
114 bool need_snapflush() const { return (state & STATE_NEED_SNAPFLUSH); }
115 bool is_export_ephemeral_random() const { return (state & STATE_EPHEMERAL_RANDOM); }
116
117 void print(ostream& out) const {
118 out << " fullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
119 << " inode " << inode.ino
120 << " state=" << state << std::endl;
121 }
122 string state_string() const {
123 string state_string;
124 bool marked_already = false;
125 if (is_dirty()) {
126 state_string.append("dirty");
127 marked_already = true;
128 }
129 if (is_dirty_parent()) {
130 state_string.append(marked_already ? "+dirty_parent" : "dirty_parent");
131 if (is_dirty_pool())
132 state_string.append("+dirty_pool");
133 }
134 return state_string;
135 }
136 };
137 WRITE_CLASS_ENCODER_FEATURES(fullbit)
138
139 /* remotebit - a dentry + remote inode link (i.e. just an ino)
140 */
141 struct remotebit {
142 std::string dn;
143 snapid_t dnfirst, dnlast;
144 version_t dnv;
145 inodeno_t ino;
146 unsigned char d_type;
147 bool dirty;
148
149 remotebit(std::string_view d, snapid_t df, snapid_t dl, version_t v, inodeno_t i, unsigned char dt, bool dr) :
150 dn(d), dnfirst(df), dnlast(dl), dnv(v), ino(i), d_type(dt), dirty(dr) { }
151 explicit remotebit(bufferlist::const_iterator &p) { decode(p); }
152 remotebit(): dnfirst(0), dnlast(0), dnv(0), ino(0),
153 d_type('\0'), dirty(false) {}
154
155 void encode(bufferlist& bl) const;
156 void decode(bufferlist::const_iterator &bl);
157 void print(ostream& out) const {
158 out << " remotebit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
159 << " ino " << ino
160 << " dirty=" << dirty << std::endl;
161 }
162 void dump(Formatter *f) const;
163 static void generate_test_instances(std::list<remotebit*>& ls);
164 };
165 WRITE_CLASS_ENCODER(remotebit)
166
167 /*
168 * nullbit - a null dentry
169 */
170 struct nullbit {
171 std::string dn;
172 snapid_t dnfirst, dnlast;
173 version_t dnv;
174 bool dirty;
175
176 nullbit(std::string_view d, snapid_t df, snapid_t dl, version_t v, bool dr) :
177 dn(d), dnfirst(df), dnlast(dl), dnv(v), dirty(dr) { }
178 explicit nullbit(bufferlist::const_iterator &p) { decode(p); }
179 nullbit(): dnfirst(0), dnlast(0), dnv(0), dirty(false) {}
180
181 void encode(bufferlist& bl) const;
182 void decode(bufferlist::const_iterator &bl);
183 void dump(Formatter *f) const;
184 static void generate_test_instances(std::list<nullbit*>& ls);
185 void print(ostream& out) const {
186 out << " nullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
187 << " dirty=" << dirty << std::endl;
188 }
189 };
190 WRITE_CLASS_ENCODER(nullbit)
191
192
193 /* dirlump - contains metadata for any dir we have contents for.
194 */
195 public:
196 struct dirlump {
197 static const int STATE_COMPLETE = (1<<1);
198 static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is!
199 static const int STATE_NEW = (1<<3); // new directory
200 static const int STATE_IMPORTING = (1<<4); // importing directory
201 static const int STATE_DIRTYDFT = (1<<5); // dirty dirfragtree
202
203 //version_t dirv;
204 fnode_t fnode;
205 __u32 state;
206 __u32 nfull, nremote, nnull;
207
208 private:
209 mutable bufferlist dnbl;
210 mutable bool dn_decoded;
211 mutable list<fullbit> dfull;
212 mutable vector<remotebit> dremote;
213 mutable vector<nullbit> dnull;
214
215 public:
216 dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { }
217 dirlump(const dirlump&) = delete;
218 dirlump& operator=(const dirlump&) = delete;
219
220 bool is_complete() const { return state & STATE_COMPLETE; }
221 void mark_complete() { state |= STATE_COMPLETE; }
222 bool is_dirty() const { return state & STATE_DIRTY; }
223 void mark_dirty() { state |= STATE_DIRTY; }
224 bool is_new() const { return state & STATE_NEW; }
225 void mark_new() { state |= STATE_NEW; }
226 bool is_importing() { return state & STATE_IMPORTING; }
227 void mark_importing() { state |= STATE_IMPORTING; }
228 bool is_dirty_dft() { return state & STATE_DIRTYDFT; }
229 void mark_dirty_dft() { state |= STATE_DIRTYDFT; }
230
231 const list<fullbit> &get_dfull() const { return dfull; }
232 list<fullbit> &_get_dfull() { return dfull; }
233 const vector<remotebit> &get_dremote() const { return dremote; }
234 const vector<nullbit> &get_dnull() const { return dnull; }
235
236 template< class... Args>
237 void add_dfull(Args&&... args) {
238 dfull.emplace_back(std::forward<Args>(args)...);
239 }
240 template< class... Args>
241 void add_dremote(Args&&... args) {
242 dremote.emplace_back(std::forward<Args>(args)...);
243 }
244 template< class... Args>
245 void add_dnull(Args&&... args) {
246 dnull.emplace_back(std::forward<Args>(args)...);
247 }
248
249 void print(dirfrag_t dirfrag, ostream& out) const {
250 out << "dirlump " << dirfrag << " v " << fnode.version
251 << " state " << state
252 << " num " << nfull << "/" << nremote << "/" << nnull
253 << std::endl;
254 _decode_bits();
255 for (const auto& p : dfull)
256 p.print(out);
257 for (const auto& p : dremote)
258 p.print(out);
259 for (const auto& p : dnull)
260 p.print(out);
261 }
262
263 string state_string() const {
264 string state_string;
265 bool marked_already = false;
266 if (is_complete()) {
267 state_string.append("complete");
268 marked_already = true;
269 }
270 if (is_dirty()) {
271 state_string.append(marked_already ? "+dirty" : "dirty");
272 marked_already = true;
273 }
274 if (is_new()) {
275 state_string.append(marked_already ? "+new" : "new");
276 }
277 return state_string;
278 }
279
280 // if this changes, update the versioning in encode for it!
281 void _encode_bits(uint64_t features) const {
282 using ceph::encode;
283 if (!dn_decoded) return;
284 encode(dfull, dnbl, features);
285 encode(dremote, dnbl);
286 encode(dnull, dnbl);
287 }
288 void _decode_bits() const {
289 using ceph::decode;
290 if (dn_decoded) return;
291 auto p = dnbl.cbegin();
292 decode(dfull, p);
293 decode(dremote, p);
294 decode(dnull, p);
295 dn_decoded = true;
296 }
297
298 void encode(bufferlist& bl, uint64_t features) const;
299 void decode(bufferlist::const_iterator &bl);
300 void dump(Formatter *f) const;
301 static void generate_test_instances(std::list<dirlump*>& ls);
302 };
303 WRITE_CLASS_ENCODER_FEATURES(dirlump)
304
305 // my lumps. preserve the order we added them in a list.
306 vector<dirfrag_t> lump_order;
307 map<dirfrag_t, dirlump> lump_map;
308 list<fullbit> roots;
309 public:
310 vector<pair<__u8,version_t> > table_tids; // tableclient transactions
311
312 inodeno_t opened_ino;
313 public:
314 inodeno_t renamed_dirino;
315 vector<frag_t> renamed_dir_frags;
316 private:
317
318 // ino (pre)allocation. may involve both inotable AND session state.
319 version_t inotablev, sessionmapv;
320 inodeno_t allocated_ino; // inotable
321 interval_set<inodeno_t> preallocated_inos; // inotable + session
322 inodeno_t used_preallocated_ino; // session
323 entity_name_t client_name; // session
324
325 // inodes i've truncated
326 vector<inodeno_t> truncate_start; // start truncate
327 map<inodeno_t, LogSegment::seq_t> truncate_finish; // finished truncate (started in segment blah)
328
329 public:
330 vector<inodeno_t> destroyed_inodes;
331 private:
332
333 // idempotent op(s)
334 vector<pair<metareqid_t,uint64_t> > client_reqs;
335 vector<pair<metareqid_t,uint64_t> > client_flushes;
336
337 public:
338 void encode(bufferlist& bl, uint64_t features) const;
339 void decode(bufferlist::const_iterator& bl);
340 void get_inodes(std::set<inodeno_t> &inodes) const;
341 void get_paths(std::vector<std::string> &paths) const;
342 void get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const;
343 entity_name_t get_client_name() const {return client_name;}
344
345 void dump(Formatter *f) const;
346 static void generate_test_instances(std::list<EMetaBlob*>& ls);
347 // soft stateadd
348 uint64_t last_subtree_map;
349 uint64_t event_seq;
350
351 // for replay, in certain cases
352 //LogSegment *_segment;
353
354 EMetaBlob() : opened_ino(0), renamed_dirino(0),
355 inotablev(0), sessionmapv(0), allocated_ino(0),
356 last_subtree_map(0), event_seq(0)
357 {}
358 EMetaBlob(const EMetaBlob&) = delete;
359 ~EMetaBlob() { }
360 EMetaBlob& operator=(const EMetaBlob&) = delete;
361
362 void print(ostream& out) {
363 for (const auto &p : lump_order)
364 lump_map[p].print(p, out);
365 }
366
367 void add_client_req(metareqid_t r, uint64_t tid=0) {
368 client_reqs.push_back(pair<metareqid_t,uint64_t>(r, tid));
369 }
370 void add_client_flush(metareqid_t r, uint64_t tid=0) {
371 client_flushes.push_back(pair<metareqid_t,uint64_t>(r, tid));
372 }
373
374 void add_table_transaction(int table, version_t tid) {
375 table_tids.push_back(pair<__u8, version_t>(table, tid));
376 }
377
378 void add_opened_ino(inodeno_t ino) {
379 ceph_assert(!opened_ino);
380 opened_ino = ino;
381 }
382
383 void set_ino_alloc(inodeno_t alloc,
384 inodeno_t used_prealloc,
385 interval_set<inodeno_t>& prealloc,
386 entity_name_t client,
387 version_t sv, version_t iv) {
388 allocated_ino = alloc;
389 used_preallocated_ino = used_prealloc;
390 preallocated_inos = prealloc;
391 client_name = client;
392 sessionmapv = sv;
393 inotablev = iv;
394 }
395
396 void add_truncate_start(inodeno_t ino) {
397 truncate_start.push_back(ino);
398 }
399 void add_truncate_finish(inodeno_t ino, uint64_t segoff) {
400 truncate_finish[ino] = segoff;
401 }
402
403 bool rewrite_truncate_finish(MDSRank const *mds, std::map<uint64_t, uint64_t> const &old_to_new);
404
405 void add_destroyed_inode(inodeno_t ino) {
406 destroyed_inodes.push_back(ino);
407 }
408
409 void add_null_dentry(CDentry *dn, bool dirty) {
410 add_null_dentry(add_dir(dn->get_dir(), false), dn, dirty);
411 }
412 void add_null_dentry(dirlump& lump, CDentry *dn, bool dirty) {
413 // add the dir
414 lump.nnull++;
415 lump.add_dnull(dn->get_name(), dn->first, dn->last,
416 dn->get_projected_version(), dirty);
417 }
418
419 void add_remote_dentry(CDentry *dn, bool dirty) {
420 add_remote_dentry(add_dir(dn->get_dir(), false), dn, dirty, 0, 0);
421 }
422 void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino, int rdt) {
423 add_remote_dentry(add_dir(dn->get_dir(), false), dn, dirty, rino, rdt);
424 }
425 void add_remote_dentry(dirlump& lump, CDentry *dn, bool dirty,
426 inodeno_t rino=0, unsigned char rdt=0) {
427 if (!rino) {
428 rino = dn->get_projected_linkage()->get_remote_ino();
429 rdt = dn->get_projected_linkage()->get_remote_d_type();
430 }
431 lump.nremote++;
432 lump.add_dremote(dn->get_name(), dn->first, dn->last,
433 dn->get_projected_version(), rino, rdt, dirty);
434 }
435
436 // return remote pointer to to-be-journaled inode
437 void add_primary_dentry(CDentry *dn, CInode *in, bool dirty,
438 bool dirty_parent=false, bool dirty_pool=false,
439 bool need_snapflush=false) {
440 __u8 state = 0;
441 if (dirty) state |= fullbit::STATE_DIRTY;
442 if (dirty_parent) state |= fullbit::STATE_DIRTYPARENT;
443 if (dirty_pool) state |= fullbit::STATE_DIRTYPOOL;
444 if (need_snapflush) state |= fullbit::STATE_NEED_SNAPFLUSH;
445 add_primary_dentry(add_dir(dn->get_dir(), false), dn, in, state);
446 }
447 void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, __u8 state) {
448 if (!in)
449 in = dn->get_projected_linkage()->get_inode();
450
451 if (in->is_ephemeral_rand()) {
452 state |= fullbit::STATE_EPHEMERAL_RANDOM;
453 }
454
455 // make note of where this inode was last journaled
456 in->last_journaled = event_seq;
457 //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
458
459 const auto pi = in->get_projected_inode();
460 if ((state & fullbit::STATE_DIRTY) && pi->is_backtrace_updated())
461 state |= fullbit::STATE_DIRTYPARENT;
462
463 bufferlist snapbl;
464 const sr_t *sr = in->get_projected_srnode();
465 if (sr)
466 sr->encode(snapbl);
467
468 lump.nfull++;
469 lump.add_dfull(dn->get_name(), dn->first, dn->last, dn->get_projected_version(),
470 *pi, in->dirfragtree, *in->get_projected_xattrs(), in->symlink,
471 in->oldest_snap, snapbl, state, &in->old_inodes);
472 }
473
474 // convenience: primary or remote? figure it out.
475 void add_dentry(CDentry *dn, bool dirty) {
476 dirlump& lump = add_dir(dn->get_dir(), false);
477 add_dentry(lump, dn, dirty, false, false);
478 }
479 void add_import_dentry(CDentry *dn) {
480 bool dirty_parent = false;
481 bool dirty_pool = false;
482 if (dn->get_linkage()->is_primary()) {
483 dirty_parent = dn->get_linkage()->get_inode()->is_dirty_parent();
484 dirty_pool = dn->get_linkage()->get_inode()->is_dirty_pool();
485 }
486 dirlump& lump = add_dir(dn->get_dir(), false);
487 add_dentry(lump, dn, dn->is_dirty(), dirty_parent, dirty_pool);
488 }
489 void add_dentry(dirlump& lump, CDentry *dn, bool dirty, bool dirty_parent, bool dirty_pool) {
490 // primary or remote
491 if (dn->get_projected_linkage()->is_remote()) {
492 add_remote_dentry(dn, dirty);
493 return;
494 } else if (dn->get_projected_linkage()->is_null()) {
495 add_null_dentry(dn, dirty);
496 return;
497 }
498 ceph_assert(dn->get_projected_linkage()->is_primary());
499 add_primary_dentry(dn, 0, dirty, dirty_parent, dirty_pool);
500 }
501
502 void add_root(bool dirty, CInode *in) {
503 in->last_journaled = event_seq;
504 //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
505
506 const auto& pi = *(in->get_projected_inode());
507 const auto& pdft = in->dirfragtree;
508 const auto& px = *(in->get_projected_xattrs());
509
510 bufferlist snapbl;
511 const sr_t *sr = in->get_projected_srnode();
512 if (sr)
513 sr->encode(snapbl);
514
515 for (auto p = roots.begin(); p != roots.end(); ++p) {
516 if (p->inode.ino == in->ino()) {
517 roots.erase(p);
518 break;
519 }
520 }
521
522 string empty;
523 roots.emplace_back(empty, in->first, in->last, 0, pi, pdft, px, in->symlink,
524 in->oldest_snap, snapbl, (dirty ? fullbit::STATE_DIRTY : 0),
525 &in->old_inodes);
526 }
527
528 dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) {
529 return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
530 dirty, complete);
531 }
532 dirlump& add_new_dir(CDir *dir) {
533 return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
534 true, true, true); // dirty AND complete AND new
535 }
536 dirlump& add_import_dir(CDir *dir) {
537 // dirty=false would be okay in some cases
538 return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
539 dir->is_dirty(), dir->is_complete(), false, true, dir->is_dirty_dft());
540 }
541 dirlump& add_fragmented_dir(CDir *dir, bool dirty, bool dirtydft) {
542 return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
543 dirty, false, false, false, dirtydft);
544 }
545 dirlump& add_dir(dirfrag_t df, const fnode_t *pf, version_t pv, bool dirty,
546 bool complete=false, bool isnew=false,
547 bool importing=false, bool dirty_dft=false) {
548 if (lump_map.count(df) == 0)
549 lump_order.push_back(df);
550
551 dirlump& l = lump_map[df];
552 l.fnode = *pf;
553 l.fnode.version = pv;
554 if (complete) l.mark_complete();
555 if (dirty) l.mark_dirty();
556 if (isnew) l.mark_new();
557 if (importing) l.mark_importing();
558 if (dirty_dft) l.mark_dirty_dft();
559 return l;
560 }
561
562 static const int TO_AUTH_SUBTREE_ROOT = 0; // default.
563 static const int TO_ROOT = 1;
564
565 void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT);
566
567 bool empty() {
568 return roots.empty() && lump_order.empty() && table_tids.empty() &&
569 truncate_start.empty() && truncate_finish.empty() &&
570 destroyed_inodes.empty() && client_reqs.empty() &&
571 opened_ino == 0 && inotablev == 0 && sessionmapv == 0;
572 }
573
574 void print(ostream& out) const {
575 out << "[metablob";
576 if (!lump_order.empty())
577 out << " " << lump_order.front() << ", " << lump_map.size() << " dirs";
578 if (!table_tids.empty())
579 out << " table_tids=" << table_tids;
580 if (allocated_ino || preallocated_inos.size()) {
581 if (allocated_ino)
582 out << " alloc_ino=" << allocated_ino;
583 if (preallocated_inos.size())
584 out << " prealloc_ino=" << preallocated_inos;
585 if (used_preallocated_ino)
586 out << " used_prealloc_ino=" << used_preallocated_ino;
587 out << " v" << inotablev;
588 }
589 out << "]";
590 }
591
592 void update_segment(LogSegment *ls);
593 void replay(MDSRank *mds, LogSegment *ls, MDSlaveUpdate *su=NULL);
594 };
595 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob)
596 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::fullbit)
597 WRITE_CLASS_ENCODER(EMetaBlob::remotebit)
598 WRITE_CLASS_ENCODER(EMetaBlob::nullbit)
599 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::dirlump)
600
601 inline ostream& operator<<(ostream& out, const EMetaBlob& t) {
602 t.print(out);
603 return out;
604 }
605
606 #endif