]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/CInode.h
update sources to v12.2.3
[ceph.git] / ceph / src / mds / CInode.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16
17 #ifndef CEPH_CINODE_H
18 #define CEPH_CINODE_H
19
20 #include "common/config.h"
21 #include "include/counter.h"
22 #include "include/elist.h"
23 #include "include/types.h"
24 #include "include/lru.h"
25 #include "include/compact_set.h"
26
27 #include "MDSCacheObject.h"
28 #include "flock.h"
29
30 #include "CDentry.h"
31 #include "SimpleLock.h"
32 #include "ScatterLock.h"
33 #include "LocalLock.h"
34 #include "Capability.h"
35 #include "SnapRealm.h"
36 #include "Mutation.h"
37
38 #include <list>
39 #include <set>
40 #include <map>
41
42 #define dout_context g_ceph_context
43
44 class Context;
45 class CDentry;
46 class CDir;
47 class Message;
48 class CInode;
49 class MDCache;
50 class LogSegment;
51 struct SnapRealm;
52 class Session;
53 class MClientCaps;
54 struct ObjectOperation;
55 class EMetaBlob;
56
57
58 ostream& operator<<(ostream& out, const CInode& in);
59
/// One row of the cinode_lock_info[] table declared below: a lock id
/// paired with a capability mask.
struct cinode_lock_info_t {
  int lock;      ///< lock identifier
  int wr_caps;   ///< capability bits (write caps, going by the field name)
};
64
65 extern cinode_lock_info_t cinode_lock_info[];
66 extern int num_cinode_locks;
67
68
69 /**
70 * Base class for CInode, containing the backing store data and
71 * serialization methods. This exists so that we can read and
72 * handle CInodes from the backing store without hitting all
73 * the business logic in CInode proper.
74 */
75 class InodeStoreBase {
76 public:
77 inode_t inode; // the inode itself
78 std::string symlink; // symlink dest, if symlink
79 std::map<std::string, bufferptr> xattrs;
80 fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map.
81 compact_map<snapid_t, old_inode_t> old_inodes; // key = last, value.first = first
82 snapid_t oldest_snap;
83 damage_flags_t damage_flags;
84
85 InodeStoreBase() : oldest_snap(CEPH_NOSNAP), damage_flags(0) { }
86
87 /* Helpers */
88 bool is_file() const { return inode.is_file(); }
89 bool is_symlink() const { return inode.is_symlink(); }
90 bool is_dir() const { return inode.is_dir(); }
91 static object_t get_object_name(inodeno_t ino, frag_t fg, const char *suffix);
92
93 /* Full serialization for use in ".inode" root inode objects */
94 void encode(bufferlist &bl, uint64_t features, const bufferlist *snap_blob=NULL) const;
95 void decode(bufferlist::iterator &bl, bufferlist& snap_blob);
96
97 /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */
98 void encode_bare(bufferlist &bl, uint64_t features, const bufferlist *snap_blob=NULL) const;
99 void decode_bare(bufferlist::iterator &bl, bufferlist &snap_blob, __u8 struct_v=5);
100
101 /* For test/debug output */
102 void dump(Formatter *f) const;
103
104 /* For use by offline tools */
105 __u32 hash_dentry_name(const std::string &dn);
106 frag_t pick_dirfrag(const std::string &dn);
107 };
108
109 class InodeStore : public InodeStoreBase {
110 public:
111 bufferlist snap_blob; // Encoded copy of SnapRealm, because we can't
112 // rehydrate it without full MDCache
113 void encode(bufferlist &bl, uint64_t features) const {
114 InodeStoreBase::encode(bl, features, &snap_blob);
115 }
116 void decode(bufferlist::iterator &bl) {
117 InodeStoreBase::decode(bl, snap_blob);
118 }
119 void encode_bare(bufferlist &bl, uint64_t features) const {
120 InodeStoreBase::encode_bare(bl, features, &snap_blob);
121 }
122 void decode_bare(bufferlist::iterator &bl) {
123 InodeStoreBase::decode_bare(bl, snap_blob);
124 }
125
126 static void generate_test_instances(std::list<InodeStore*>& ls);
127 };
128 WRITE_CLASS_ENCODER_FEATURES(InodeStore)
129
130 // cached inode wrapper
131 class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CInode> {
132 public:
133 MEMPOOL_CLASS_HELPERS();
134 // -- pins --
135 static const int PIN_DIRFRAG = -1;
136 static const int PIN_CAPS = 2; // client caps
137 static const int PIN_IMPORTING = -4; // importing
138 static const int PIN_OPENINGDIR = 7;
139 static const int PIN_REMOTEPARENT = 8;
140 static const int PIN_BATCHOPENJOURNAL = 9;
141 static const int PIN_SCATTERED = 10;
142 static const int PIN_STICKYDIRS = 11;
143 //static const int PIN_PURGING = -12;
144 static const int PIN_FREEZING = 13;
145 static const int PIN_FROZEN = 14;
146 static const int PIN_IMPORTINGCAPS = -15;
147 static const int PIN_PASTSNAPPARENT = -16;
148 static const int PIN_OPENINGSNAPPARENTS = 17;
149 static const int PIN_TRUNCATING = 18;
150 static const int PIN_STRAY = 19; // we pin our stray inode while active
151 static const int PIN_NEEDSNAPFLUSH = 20;
152 static const int PIN_DIRTYRSTAT = 21;
153 static const int PIN_EXPORTINGCAPS = 22;
154 static const int PIN_DIRTYPARENT = 23;
155 static const int PIN_DIRWAITER = 24;
156 static const int PIN_SCRUBQUEUE = 25;
157
158 const char *pin_name(int p) const override {
159 switch (p) {
160 case PIN_DIRFRAG: return "dirfrag";
161 case PIN_CAPS: return "caps";
162 case PIN_IMPORTING: return "importing";
163 case PIN_OPENINGDIR: return "openingdir";
164 case PIN_REMOTEPARENT: return "remoteparent";
165 case PIN_BATCHOPENJOURNAL: return "batchopenjournal";
166 case PIN_SCATTERED: return "scattered";
167 case PIN_STICKYDIRS: return "stickydirs";
168 //case PIN_PURGING: return "purging";
169 case PIN_FREEZING: return "freezing";
170 case PIN_FROZEN: return "frozen";
171 case PIN_IMPORTINGCAPS: return "importingcaps";
172 case PIN_EXPORTINGCAPS: return "exportingcaps";
173 case PIN_PASTSNAPPARENT: return "pastsnapparent";
174 case PIN_OPENINGSNAPPARENTS: return "openingsnapparents";
175 case PIN_TRUNCATING: return "truncating";
176 case PIN_STRAY: return "stray";
177 case PIN_NEEDSNAPFLUSH: return "needsnapflush";
178 case PIN_DIRTYRSTAT: return "dirtyrstat";
179 case PIN_DIRTYPARENT: return "dirtyparent";
180 case PIN_DIRWAITER: return "dirwaiter";
181 case PIN_SCRUBQUEUE: return "scrubqueue";
182 default: return generic_pin_name(p);
183 }
184 }
185
186 // -- state --
187 static const int STATE_EXPORTING = (1<<2); // on nonauth bystander.
188 static const int STATE_OPENINGDIR = (1<<5);
189 static const int STATE_FREEZING = (1<<7);
190 static const int STATE_FROZEN = (1<<8);
191 static const int STATE_AMBIGUOUSAUTH = (1<<9);
192 static const int STATE_EXPORTINGCAPS = (1<<10);
193 static const int STATE_NEEDSRECOVER = (1<<11);
194 static const int STATE_RECOVERING = (1<<12);
195 static const int STATE_PURGING = (1<<13);
196 static const int STATE_DIRTYPARENT = (1<<14);
197 static const int STATE_DIRTYRSTAT = (1<<15);
198 static const int STATE_STRAYPINNED = (1<<16);
199 static const int STATE_FROZENAUTHPIN = (1<<17);
200 static const int STATE_DIRTYPOOL = (1<<18);
201 static const int STATE_REPAIRSTATS = (1<<19);
202 static const int STATE_MISSINGOBJS = (1<<20);
203 static const int STATE_EVALSTALECAPS = (1<<21);
204 static const int STATE_QUEUEDEXPORTPIN = (1<<22);
205 // orphan inode needs notification of releasing reference
206 static const int STATE_ORPHAN = STATE_NOTIFYREF;
207
208 static const int MASK_STATE_EXPORTED =
209 (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
210 static const int MASK_STATE_EXPORT_KEPT =
211 (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS|STATE_QUEUEDEXPORTPIN);
212
213 // -- waiters --
214 static const uint64_t WAIT_DIR = (1<<0);
215 static const uint64_t WAIT_FROZEN = (1<<1);
216 static const uint64_t WAIT_TRUNC = (1<<2);
217 static const uint64_t WAIT_FLOCK = (1<<3);
218
219 static const uint64_t WAIT_ANY_MASK = (uint64_t)(-1);
220
221 // misc
222 static const unsigned EXPORT_NONCE = 1; // nonce given to replicas created by export
223
224 ostream& print_db_line_prefix(ostream& out) override;
225
226 public:
227 MDCache *mdcache;
228
229 SnapRealm *snaprealm;
230 SnapRealm *containing_realm;
231 snapid_t first, last;
232 compact_set<snapid_t> dirty_old_rstats;
233
234 class scrub_stamp_info_t {
235 public:
236 /// version we started our latest scrub (whether in-progress or finished)
237 version_t scrub_start_version;
238 /// time we started our latest scrub (whether in-progress or finished)
239 utime_t scrub_start_stamp;
240 /// version we started our most recent finished scrub
241 version_t last_scrub_version;
242 /// time we started our most recent finished scrub
243 utime_t last_scrub_stamp;
244 scrub_stamp_info_t() : scrub_start_version(0), last_scrub_version(0) {}
245 void reset() {
246 scrub_start_version = last_scrub_version = 0;
247 scrub_start_stamp = last_scrub_stamp = utime_t();
248 }
249 };
250
251 class scrub_info_t : public scrub_stamp_info_t {
252 public:
253 CDentry *scrub_parent;
254 MDSInternalContextBase *on_finish;
255
256 bool last_scrub_dirty; /// are our stamps dirty with respect to disk state?
257 bool scrub_in_progress; /// are we currently scrubbing?
258 bool children_scrubbed;
259
260 /// my own (temporary) stamps and versions for each dirfrag we have
261 std::map<frag_t, scrub_stamp_info_t> dirfrag_stamps;
262
263 ScrubHeaderRef header;
264
265 scrub_info_t() : scrub_stamp_info_t(),
266 scrub_parent(NULL), on_finish(NULL),
267 last_scrub_dirty(false), scrub_in_progress(false),
268 children_scrubbed(false) {}
269 };
270
271 const scrub_info_t *scrub_info() const{
272 if (!scrub_infop)
273 scrub_info_create();
274 return scrub_infop;
275 }
276
277 ScrubHeaderRef get_scrub_header() {
278 if (scrub_infop == nullptr) {
279 return nullptr;
280 } else {
281 return scrub_infop->header;
282 }
283 }
284
285 bool scrub_is_in_progress() const {
286 return (scrub_infop && scrub_infop->scrub_in_progress);
287 }
288 /**
289 * Start scrubbing on this inode. That could be very short if it's
290 * a file, or take a long time if we're recursively scrubbing a directory.
291 * @pre It is not currently scrubbing
292 * @post it has set up internal scrubbing state
293 * @param scrub_version What version are we scrubbing at (usually, parent
294 * directory's get_projected_version())
295 */
296 void scrub_initialize(CDentry *scrub_parent,
297 ScrubHeaderRef& header,
298 MDSInternalContextBase *f);
299 /**
300 * Get the next dirfrag to scrub. Gives you a frag_t in output param which
301 * you must convert to a CDir (and possibly load off disk).
302 * @param dir A pointer to frag_t, will be filled in with the next dirfrag to
303 * scrub if there is one.
304 * @returns 0 on success, you should scrub the passed-out frag_t right now;
305 * ENOENT: There are no remaining dirfrags to scrub
306 * <0 There was some other error (It will return -ENOTDIR if not a directory)
307 */
308 int scrub_dirfrag_next(frag_t* out_dirfrag);
309 /**
310 * Get the currently scrubbing dirfrags. When returned, the
311 * passed-in list will be filled in with all frag_ts which have
312 * been returned from scrub_dirfrag_next but not sent back
313 * via scrub_dirfrag_finished.
314 */
315 void scrub_dirfrags_scrubbing(list<frag_t> *out_dirfrags);
316 /**
317 * Report to the CInode that a dirfrag it owns has been scrubbed. Call
318 * this for every frag_t returned from scrub_dirfrag_next().
319 * @param dirfrag The frag_t that was scrubbed
320 */
321 void scrub_dirfrag_finished(frag_t dirfrag);
322 /**
323 * Call this once the scrub has been completed, whether it's a full
324 * recursive scrub on a directory or simply the data on a file (or
325 * anything in between).
326 * @param c An out param which is filled in with a Context* that must
327 * be complete()ed.
328 */
329 void scrub_finished(MDSInternalContextBase **c);
330 /**
331 * Report to the CInode that all dirfrags it owns have been scrubbed.
332 */
333 void scrub_children_finished() {
334 scrub_infop->children_scrubbed = true;
335 }
336 void scrub_set_finisher(MDSInternalContextBase *c) {
337 assert(!scrub_infop->on_finish);
338 scrub_infop->on_finish = c;
339 }
340
341 private:
342 /**
343 * Create a scrub_info_t struct for the scrub_infop pointer.
344 */
345 void scrub_info_create() const;
346 /**
347 * Delete the scrub_info_t struct if it's not got any useful data
348 */
349 void scrub_maybe_delete_info();
350 public:
351
352 bool is_multiversion() const {
353 return snaprealm || // other snaprealms will link to me
354 inode.is_dir() || // links to me in other snaps
355 inode.nlink > 1 || // there are remote links, possibly snapped, that will need to find me
356 !old_inodes.empty(); // once multiversion, always multiversion. until old_inodes gets cleaned out.
357 }
358 snapid_t get_oldest_snap();
359
360 uint64_t last_journaled; // log offset for the last time i was journaled
361 //loff_t last_open_journaled; // log offset for the last journaled EOpen
362 utime_t last_dirstat_prop;
363
364
365 // list item node for when we have unpropagated rstat data
366 elist<CInode*>::item dirty_rstat_item;
367
368 bool is_dirty_rstat() {
369 return state_test(STATE_DIRTYRSTAT);
370 }
371 void mark_dirty_rstat();
372 void clear_dirty_rstat();
373
374 //bool hack_accessed;
375 //utime_t hack_load_stamp;
376
377 /**
378 * Projection methods, used to store inode changes until they have been journaled,
379 * at which point they are popped.
380 * Usage:
381 * project_inode as needed. If you're also projecting xattrs, pass
382 * in an xattr map (by pointer), then edit the map.
383 * If you're also projecting the snaprealm, call project_snaprealm after
384 * calling project_inode, and modify the snaprealm as necessary.
385 *
386 * Then, journal. Once journaling is done, pop_and_dirty_projected_inode.
387 * This function will take care of the inode itself, the xattrs, and the snaprealm.
388 */
389
390 struct projected_inode_t {
391 inode_t *inode;
392 std::map<std::string,bufferptr> *xattrs;
393 sr_t *snapnode;
394
395 projected_inode_t()
396 : inode(NULL), xattrs(NULL), snapnode(NULL) {}
397 projected_inode_t(inode_t *in, sr_t *sn)
398 : inode(in), xattrs(NULL), snapnode(sn) {}
399 projected_inode_t(inode_t *in, std::map<std::string, bufferptr> *xp = NULL, sr_t *sn = NULL)
400 : inode(in), xattrs(xp), snapnode(sn) {}
401 };
402 std::list<projected_inode_t*> projected_nodes; // projected values (only defined while dirty)
403 int num_projected_xattrs;
404 int num_projected_srnodes;
405
406 inode_t *project_inode(std::map<std::string,bufferptr> *px=0);
407 void pop_and_dirty_projected_inode(LogSegment *ls);
408
409 projected_inode_t *get_projected_node() {
410 if (projected_nodes.empty())
411 return NULL;
412 else
413 return projected_nodes.back();
414 }
415
416 version_t get_projected_version() const {
417 if (projected_nodes.empty())
418 return inode.version;
419 else
420 return projected_nodes.back()->inode->version;
421 }
422 bool is_projected() const {
423 return !projected_nodes.empty();
424 }
425
426 const inode_t *get_projected_inode() const {
427 if (projected_nodes.empty())
428 return &inode;
429 else
430 return projected_nodes.back()->inode;
431 }
432 inode_t *get_projected_inode() {
433 if (projected_nodes.empty())
434 return &inode;
435 else
436 return projected_nodes.back()->inode;
437 }
438 inode_t *get_previous_projected_inode() {
439 assert(!projected_nodes.empty());
440 std::list<projected_inode_t*>::reverse_iterator p = projected_nodes.rbegin();
441 ++p;
442 if (p != projected_nodes.rend())
443 return (*p)->inode;
444 else
445 return &inode;
446 }
447
448 std::map<std::string,bufferptr> *get_projected_xattrs() {
449 if (num_projected_xattrs > 0) {
450 for (std::list<projected_inode_t*>::reverse_iterator p = projected_nodes.rbegin();
451 p != projected_nodes.rend();
452 ++p)
453 if ((*p)->xattrs)
454 return (*p)->xattrs;
455 }
456 return &xattrs;
457 }
458 std::map<std::string,bufferptr> *get_previous_projected_xattrs() {
459 std::list<projected_inode_t*>::reverse_iterator p = projected_nodes.rbegin();
460 for (++p; // skip the most recent projected value
461 p != projected_nodes.rend();
462 ++p)
463 if ((*p)->xattrs)
464 return (*p)->xattrs;
465 return &xattrs;
466 }
467
468 sr_t *project_snaprealm(snapid_t snapid=0);
469 const sr_t *get_projected_srnode() const {
470 if (num_projected_srnodes > 0) {
471 for (std::list<projected_inode_t*>::const_reverse_iterator p = projected_nodes.rbegin();
472 p != projected_nodes.rend();
473 ++p)
474 if ((*p)->snapnode)
475 return (*p)->snapnode;
476 }
477 if (snaprealm)
478 return &snaprealm->srnode;
479 else
480 return NULL;
481 }
482 sr_t *get_projected_srnode() {
483 if (num_projected_srnodes > 0) {
484 for (std::list<projected_inode_t*>::reverse_iterator p = projected_nodes.rbegin();
485 p != projected_nodes.rend();
486 ++p)
487 if ((*p)->snapnode)
488 return (*p)->snapnode;
489 }
490 if (snaprealm)
491 return &snaprealm->srnode;
492 else
493 return NULL;
494 }
495 void project_past_snaprealm_parent(SnapRealm *newparent);
496
497 private:
498 void pop_projected_snaprealm(sr_t *next_snaprealm);
499
500 public:
501 old_inode_t& cow_old_inode(snapid_t follows, bool cow_head);
502 void split_old_inode(snapid_t snap);
503 old_inode_t *pick_old_inode(snapid_t last);
504 void pre_cow_old_inode();
505 void purge_stale_snap_data(const std::set<snapid_t>& snaps);
506
507 // -- cache infrastructure --
508 private:
509 compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode
510 int stickydir_ref;
511 scrub_info_t *scrub_infop;
512
513 public:
514 bool has_dirfrags() { return !dirfrags.empty(); }
515 CDir* get_dirfrag(frag_t fg) {
516 if (dirfrags.count(fg)) {
517 //assert(g_conf->debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME
518 return dirfrags[fg];
519 } else
520 return NULL;
521 }
522 bool get_dirfrags_under(frag_t fg, std::list<CDir*>& ls);
523 CDir* get_approx_dirfrag(frag_t fg);
524 void get_dirfrags(std::list<CDir*>& ls);
525 void get_nested_dirfrags(std::list<CDir*>& ls);
526 void get_subtree_dirfrags(std::list<CDir*>& ls);
527 CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg);
528 CDir *add_dirfrag(CDir *dir);
529 void close_dirfrag(frag_t fg);
530 void close_dirfrags();
531 bool has_subtree_root_dirfrag(int auth=-1);
532 bool has_subtree_or_exporting_dirfrag();
533
534 void force_dirfrags();
535 void verify_dirfrags();
536
537 void get_stickydirs();
538 void put_stickydirs();
539
540 protected:
541 // parent dentries in cache
542 CDentry *parent; // primary link
543 compact_set<CDentry*> remote_parents; // if hard linked
544
545 std::list<CDentry*> projected_parent; // for in-progress rename, (un)link, etc.
546
547 mds_authority_t inode_auth;
548
549 // -- distributed state --
550 protected:
551 // file capabilities
552 std::map<client_t, Capability*> client_caps; // client -> caps
553 compact_map<int32_t, int32_t> mds_caps_wanted; // [auth] mds -> caps wanted
554 int replica_caps_wanted; // [replica] what i've requested from auth
555
556 public:
557 compact_map<int, std::set<client_t> > client_snap_caps; // [auth] [snap] dirty metadata we still need from the head
558 compact_map<snapid_t, std::set<client_t> > client_need_snapflush;
559
560 void add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
561 void remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
562 bool split_need_snapflush(CInode *cowin, CInode *in);
563
564 protected:
565
566 ceph_lock_state_t *fcntl_locks;
567 ceph_lock_state_t *flock_locks;
568
569 ceph_lock_state_t *get_fcntl_lock_state() {
570 if (!fcntl_locks)
571 fcntl_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FCNTL);
572 return fcntl_locks;
573 }
574 void clear_fcntl_lock_state() {
575 delete fcntl_locks;
576 fcntl_locks = NULL;
577 }
578 ceph_lock_state_t *get_flock_lock_state() {
579 if (!flock_locks)
580 flock_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FLOCK);
581 return flock_locks;
582 }
583 void clear_flock_lock_state() {
584 delete flock_locks;
585 flock_locks = NULL;
586 }
587 void clear_file_locks() {
588 clear_fcntl_lock_state();
589 clear_flock_lock_state();
590 }
591 void _encode_file_locks(bufferlist& bl) const {
592 bool has_fcntl_locks = fcntl_locks && !fcntl_locks->empty();
593 ::encode(has_fcntl_locks, bl);
594 if (has_fcntl_locks)
595 ::encode(*fcntl_locks, bl);
596 bool has_flock_locks = flock_locks && !flock_locks->empty();
597 ::encode(has_flock_locks, bl);
598 if (has_flock_locks)
599 ::encode(*flock_locks, bl);
600 }
601 void _decode_file_locks(bufferlist::iterator& p) {
602 bool has_fcntl_locks;
603 ::decode(has_fcntl_locks, p);
604 if (has_fcntl_locks)
605 ::decode(*get_fcntl_lock_state(), p);
606 else
607 clear_fcntl_lock_state();
608 bool has_flock_locks;
609 ::decode(has_flock_locks, p);
610 if (has_flock_locks)
611 ::decode(*get_flock_lock_state(), p);
612 else
613 clear_flock_lock_state();
614 }
615
616 // LogSegment lists i (may) belong to
617 public:
618 elist<CInode*>::item item_dirty;
619 elist<CInode*>::item item_caps;
620 elist<CInode*>::item item_open_file;
621 elist<CInode*>::item item_dirty_parent;
622 elist<CInode*>::item item_dirty_dirfrag_dir;
623 elist<CInode*>::item item_dirty_dirfrag_nest;
624 elist<CInode*>::item item_dirty_dirfrag_dirfragtree;
625 elist<CInode*>::item item_scrub;
626
627 // also update RecoveryQueue::RecoveryQueue() if you change this
628 elist<CInode*>::item& item_recover_queue = item_dirty_dirfrag_dir;
629 elist<CInode*>::item& item_recover_queue_front = item_dirty_dirfrag_nest;
630
631 public:
632 int auth_pin_freeze_allowance;
633
634 inode_load_vec_t pop;
635
636 // friends
637 friend class Server;
638 friend class Locker;
639 friend class Migrator;
640 friend class MDCache;
641 friend class StrayManager;
642 friend class CDir;
643 friend class CInodeExport;
644
645 // ---------------------------
646 CInode(MDCache *c, bool auth=true, snapid_t f=2, snapid_t l=CEPH_NOSNAP) :
647 mdcache(c),
648 snaprealm(0), containing_realm(0),
649 first(f), last(l),
650 last_journaled(0), //last_open_journaled(0),
651 //hack_accessed(true),
652 num_projected_xattrs(0),
653 num_projected_srnodes(0),
654 stickydir_ref(0),
655 scrub_infop(NULL),
656 parent(0),
657 inode_auth(CDIR_AUTH_DEFAULT),
658 replica_caps_wanted(0),
659 fcntl_locks(0), flock_locks(0),
660 item_dirty(this), item_caps(this), item_open_file(this), item_dirty_parent(this),
661 item_dirty_dirfrag_dir(this),
662 item_dirty_dirfrag_nest(this),
663 item_dirty_dirfrag_dirfragtree(this),
664 auth_pin_freeze_allowance(0),
665 pop(ceph_clock_now()),
666 versionlock(this, &versionlock_type),
667 authlock(this, &authlock_type),
668 linklock(this, &linklock_type),
669 dirfragtreelock(this, &dirfragtreelock_type),
670 filelock(this, &filelock_type),
671 xattrlock(this, &xattrlock_type),
672 snaplock(this, &snaplock_type),
673 nestlock(this, &nestlock_type),
674 flocklock(this, &flocklock_type),
675 policylock(this, &policylock_type),
676 loner_cap(-1), want_loner_cap(-1)
677 {
678 state = 0;
679 if (auth) state_set(STATE_AUTH);
680 }
681 ~CInode() override {
682 close_dirfrags();
683 close_snaprealm();
684 clear_file_locks();
685 assert(num_projected_xattrs == 0);
686 assert(num_projected_srnodes == 0);
687 }
688
689
690 // -- accessors --
691 bool is_root() const { return inode.ino == MDS_INO_ROOT; }
692 bool is_stray() const { return MDS_INO_IS_STRAY(inode.ino); }
693 mds_rank_t get_stray_owner() const {
694 return (mds_rank_t)MDS_INO_STRAY_OWNER(inode.ino);
695 }
696 bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(inode.ino); }
697 bool is_base() const { return is_root() || is_mdsdir(); }
698 bool is_system() const { return inode.ino < MDS_INO_SYSTEM_BASE; }
699 bool is_normal() const { return !(is_base() || is_system() || is_stray()); }
700
701 bool is_head() const { return last == CEPH_NOSNAP; }
702
703 // note: this overloads MDSCacheObject
704 bool is_ambiguous_auth() const {
705 return state_test(STATE_AMBIGUOUSAUTH) ||
706 MDSCacheObject::is_ambiguous_auth();
707 }
708 void set_ambiguous_auth() {
709 state_set(STATE_AMBIGUOUSAUTH);
710 }
711 void clear_ambiguous_auth(std::list<MDSInternalContextBase*>& finished);
712 void clear_ambiguous_auth();
713
714 inodeno_t ino() const { return inode.ino; }
715 vinodeno_t vino() const { return vinodeno_t(inode.ino, last); }
716 int d_type() const { return IFTODT(inode.mode); }
717
718 inode_t& get_inode() { return inode; }
719 CDentry* get_parent_dn() { return parent; }
720 const CDentry* get_parent_dn() const { return parent; }
721 const CDentry* get_projected_parent_dn() const { return !projected_parent.empty() ? projected_parent.back() : parent; }
722 CDentry* get_projected_parent_dn() { return !projected_parent.empty() ? projected_parent.back() : parent; }
723 CDir *get_parent_dir();
724 const CDir *get_projected_parent_dir() const;
725 CDir *get_projected_parent_dir();
726 CInode *get_parent_inode();
727
728 bool is_lt(const MDSCacheObject *r) const override {
729 const CInode *o = static_cast<const CInode*>(r);
730 return ino() < o->ino() ||
731 (ino() == o->ino() && last < o->last);
732 }
733
734 // -- misc --
735 bool is_projected_ancestor_of(CInode *other);
736
737 void make_path_string(std::string& s, bool projected=false, const CDentry *use_parent=NULL) const;
738 void make_path(filepath& s, bool projected=false) const;
739 void name_stray_dentry(std::string& dname);
740
741 // -- dirtyness --
742 version_t get_version() const { return inode.version; }
743
744 version_t pre_dirty();
745 void _mark_dirty(LogSegment *ls);
746 void mark_dirty(version_t projected_dirv, LogSegment *ls);
747 void mark_clean();
748
749 void store(MDSInternalContextBase *fin);
750 void _stored(int r, version_t cv, Context *fin);
751 /**
752 * Flush a CInode to disk. This includes the backtrace, the parent
753 * directory's link, and the Inode object itself (if a base directory).
754 * @pre is_auth() on both the inode and its containing directory
755 * @pre can_auth_pin()
756 * @param fin The Context to call when the flush is completed.
757 */
758 void flush(MDSInternalContextBase *fin);
759 void fetch(MDSInternalContextBase *fin);
760 void _fetched(bufferlist& bl, bufferlist& bl2, Context *fin);
761
762
763 void build_backtrace(int64_t pool, inode_backtrace_t& bt);
764 void store_backtrace(MDSInternalContextBase *fin, int op_prio=-1);
765 void _stored_backtrace(int r, version_t v, Context *fin);
766 void fetch_backtrace(Context *fin, bufferlist *backtrace);
767 protected:
768 /**
769 * Return the pool ID where we currently write backtraces for
770 * this inode (in addition to inode.old_pools)
771 *
772 * @returns a pool ID >=0
773 */
774 int64_t get_backtrace_pool() const;
775 public:
776 void _mark_dirty_parent(LogSegment *ls, bool dirty_pool=false);
777 void clear_dirty_parent();
778 void verify_diri_backtrace(bufferlist &bl, int err);
779 bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); }
780 bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); }
781
782 void encode_snap_blob(bufferlist &bl);
783 void decode_snap_blob(bufferlist &bl);
784 void encode_store(bufferlist& bl, uint64_t features);
785 void decode_store(bufferlist::iterator& bl);
786
787 void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features, bool need_recover) {
788 assert(is_auth());
789
790 // relax locks?
791 if (!is_replicated())
792 replicate_relax_locks();
793
794 __u32 nonce = add_replica(rep);
795 ::encode(nonce, bl);
796
797 _encode_base(bl, features);
798 _encode_locks_state_for_replica(bl, need_recover);
799 }
800 void decode_replica(bufferlist::iterator& p, bool is_new) {
801 __u32 nonce;
802 ::decode(nonce, p);
803 replica_nonce = nonce;
804
805 _decode_base(p);
806 _decode_locks_state(p, is_new);
807 }
808
809 // -- waiting --
810 protected:
811 compact_map<frag_t, std::list<MDSInternalContextBase*> > waiting_on_dir;
812 public:
813 void add_dir_waiter(frag_t fg, MDSInternalContextBase *c);
814 void take_dir_waiting(frag_t fg, std::list<MDSInternalContextBase*>& ls);
815 bool is_waiting_for_dir(frag_t fg) {
816 return waiting_on_dir.count(fg);
817 }
818 void add_waiter(uint64_t tag, MDSInternalContextBase *c) override;
819 void take_waiting(uint64_t tag, std::list<MDSInternalContextBase*>& ls) override;
820
821 // -- encode/decode helpers --
822 void _encode_base(bufferlist& bl, uint64_t features);
823 void _decode_base(bufferlist::iterator& p);
824 void _encode_locks_full(bufferlist& bl);
825 void _decode_locks_full(bufferlist::iterator& p);
826 void _encode_locks_state_for_replica(bufferlist& bl, bool need_recover);
827 void _encode_locks_state_for_rejoin(bufferlist& bl, int rep);
828 void _decode_locks_state(bufferlist::iterator& p, bool is_new);
829 void _decode_locks_rejoin(bufferlist::iterator& p, std::list<MDSInternalContextBase*>& waiters,
830 std::list<SimpleLock*>& eval_locks, bool survivor);
831
832 // -- import/export --
833 void encode_export(bufferlist& bl);
834 void finish_export(utime_t now);
835 void abort_export() {
836 put(PIN_TEMPEXPORTING);
837 assert(state_test(STATE_EXPORTINGCAPS));
838 state_clear(STATE_EXPORTINGCAPS);
839 put(PIN_EXPORTINGCAPS);
840 }
841 void decode_import(bufferlist::iterator& p, LogSegment *ls);
842
843
844 // for giving to clients
845 int encode_inodestat(bufferlist& bl, Session *session, SnapRealm *realm,
846 snapid_t snapid=CEPH_NOSNAP, unsigned max_bytes=0,
847 int getattr_wants=0);
848 void encode_cap_message(MClientCaps *m, Capability *cap);
849
850
851 // -- locks --
852 public:
853 static LockType versionlock_type;
854 static LockType authlock_type;
855 static LockType linklock_type;
856 static LockType dirfragtreelock_type;
857 static LockType filelock_type;
858 static LockType xattrlock_type;
859 static LockType snaplock_type;
860 static LockType nestlock_type;
861 static LockType flocklock_type;
862 static LockType policylock_type;
863
864 LocalLock versionlock;
865 SimpleLock authlock;
866 SimpleLock linklock;
867 ScatterLock dirfragtreelock;
868 ScatterLock filelock;
869 SimpleLock xattrlock;
870 SimpleLock snaplock;
871 ScatterLock nestlock;
872 SimpleLock flocklock;
873 SimpleLock policylock;
874
875 SimpleLock* get_lock(int type) override {
876 switch (type) {
877 case CEPH_LOCK_IFILE: return &filelock;
878 case CEPH_LOCK_IAUTH: return &authlock;
879 case CEPH_LOCK_ILINK: return &linklock;
880 case CEPH_LOCK_IDFT: return &dirfragtreelock;
881 case CEPH_LOCK_IXATTR: return &xattrlock;
882 case CEPH_LOCK_ISNAP: return &snaplock;
883 case CEPH_LOCK_INEST: return &nestlock;
884 case CEPH_LOCK_IFLOCK: return &flocklock;
885 case CEPH_LOCK_IPOLICY: return &policylock;
886 }
887 return 0;
888 }
889
890 void set_object_info(MDSCacheObjectInfo &info) override;
891 void encode_lock_state(int type, bufferlist& bl) override;
892 void decode_lock_state(int type, bufferlist& bl) override;
893
894 void _finish_frag_update(CDir *dir, MutationRef& mut);
895
896 void clear_dirty_scattered(int type) override;
897 bool is_dirty_scattered();
898 void clear_scatter_dirty(); // on rejoin ack
899
900 void start_scatter(ScatterLock *lock);
901 void finish_scatter_update(ScatterLock *lock, CDir *dir,
902 version_t inode_version, version_t dir_accounted_version);
903 void finish_scatter_gather_update(int type);
904 void finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob);
905
906 // -- snap --
907 void open_snaprealm(bool no_split=false);
908 void close_snaprealm(bool no_join=false);
909 SnapRealm *find_snaprealm() const;
910 void encode_snap(bufferlist& bl);
911 void decode_snap(bufferlist::iterator& p);
912
913 // -- caps -- (new)
914 // client caps
915 client_t loner_cap, want_loner_cap;
916
917 client_t get_loner() const { return loner_cap; }
918 client_t get_wanted_loner() const { return want_loner_cap; }
919
920 // this is the loner state our locks should aim for
921 client_t get_target_loner() const {
922 if (loner_cap == want_loner_cap)
923 return loner_cap;
924 else
925 return -1;
926 }
927
// Loner selection helpers.  Declarations only; see the definitions for the
// exact policy.
928 client_t calc_ideal_loner();
929 void set_loner_cap(client_t l);
// The three bool-returning helpers below report a result to the caller.
// NOTE(review): the meaning of true/false is not visible here.
930 bool choose_ideal_loner();
931 bool try_set_loner();
932 bool try_drop_loner();
933
934 // choose new lock state during recovery, based on issued caps
935 void choose_lock_state(SimpleLock *lock, int allissued);
936 void choose_lock_states(int dirty_caps);
937
938 int count_nonstale_caps() {
939 int n = 0;
940 for (std::map<client_t,Capability*>::iterator it = client_caps.begin();
941 it != client_caps.end();
942 ++it)
943 if (!it->second->is_stale())
944 n++;
945 return n;
946 }
947 bool multiple_nonstale_caps() {
948 int n = 0;
949 for (std::map<client_t,Capability*>::iterator it = client_caps.begin();
950 it != client_caps.end();
951 ++it)
952 if (!it->second->is_stale()) {
953 if (n)
954 return true;
955 n++;
956 }
957 return false;
958 }
959
960 bool is_any_caps() { return !client_caps.empty(); }
961 bool is_any_nonstale_caps() { return count_nonstale_caps(); }
962
963 const compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; }
964 compact_map<int32_t,int32_t>& get_mds_caps_wanted() { return mds_caps_wanted; }
965
966 const std::map<client_t,Capability*>& get_client_caps() const { return client_caps; }
967 Capability *get_client_cap(client_t client) {
968 auto client_caps_entry = client_caps.find(client);
969 if (client_caps_entry != client_caps.end())
970 return client_caps_entry->second;
971 return 0;
972 }
973 int get_client_cap_pending(client_t client) const {
974 auto client_caps_entry = client_caps.find(client);
975 if (client_caps_entry != client_caps.end()) {
976 return client_caps_entry->second->pending();
977 } else {
978 return 0;
979 }
980 }
981
// Client capability lifecycle.  Declarations only.
// add_client_cap()/reconnect_cap() return a Capability pointer; conrealm
// defaults to a null pointer.
982 Capability *add_client_cap(client_t client, Session *session, SnapRealm *conrealm=0);
983 void remove_client_cap(client_t client);
984 void move_to_realm(SnapRealm *realm);
985
986 Capability *reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session);
987 void clear_client_caps_after_export();
// Fills the caller-supplied map `cl` (passed by non-const reference).
// NOTE(review): exact contents are determined by the definition.
988 void export_client_caps(std::map<client_t,Capability::Export>& cl);
989
990 // caps allowed
// All of these are const queries returning an int.
// NOTE(review): presumably a capability bit mask — confirm.
991 int get_caps_liked() const;
992 int get_caps_allowed_ever() const;
993 int get_caps_allowed_by_type(int type) const;
994 int get_caps_careful() const;
995 int get_xlocker_mask(client_t client) const;
996 int get_caps_allowed_for_client(Session *s, inode_t *file_i) const;
997
998 // caps issued, wanted
// ploner / pother / pxlocker are optional out-pointers (default null);
// shift defaults to 0 and mask to -1.  Note get_caps_issued() is non-const
// while get_caps_wanted() is const.
999 int get_caps_issued(int *ploner = 0, int *pother = 0, int *pxlocker = 0,
1000 int shift = 0, int mask = -1);
1001 bool is_any_caps_wanted() const;
1002 int get_caps_wanted(int *ploner = 0, int *pother = 0, int shift = 0, int mask = -1) const;
1003 bool issued_caps_need_gather(SimpleLock *lock);
1004 void replicate_relax_locks();
1005
1006 // -- authority --
// Override of the base-class authority query.
1007 mds_authority_t authority() const override;
1008
1009 // -- auth pins --
// `by` is an opaque pointer supplied by the caller.
// NOTE(review): presumably identifies the pin holder — confirm.
// adjust_nested_auth_pins() is CInode-specific; the other three override
// base-class virtuals.
1010 void adjust_nested_auth_pins(int a, void *by);
1011 bool can_auth_pin() const override;
1012 void auth_pin(void *by) override;
1013 void auth_unpin(void *by) override;
1014
1015 // -- freeze --
1016 bool is_freezing_inode() const { return state_test(STATE_FREEZING); }
1017 bool is_frozen_inode() const { return state_test(STATE_FROZEN); }
1018 bool is_frozen_auth_pin() const { return state_test(STATE_FROZENAUTHPIN); }
// is_frozen()/is_freezing() override base-class virtuals; is_frozen_dir()
// is CInode-specific.  Unlike the inline *_inode() predicates above these
// are defined out of line.
1019 bool is_frozen() const override;
1020 bool is_frozen_dir() const;
1021 bool is_freezing() const override;
1022
1023 /* Freeze the inode. auth_pin_allowance lets the caller account for any
1024 * auth_pins it is itself holding/responsible for. */
1025 bool freeze_inode(int auth_pin_allowance=0);
// Two overloads: one takes a caller-supplied list of contexts, the other
// takes nothing.
// NOTE(review): presumably `finished` receives the waiters to run — confirm.
1026 void unfreeze_inode(std::list<MDSInternalContextBase*>& finished);
1027 void unfreeze_inode();
1028
1029 void freeze_auth_pin();
1030 void unfreeze_auth_pin();
1031
1032 // -- reference counting --
// Override of the base-class bad_put() hook.
// Emits a generic_dout(0) message describing this inode, the pin id `by`,
// its pin_name() and the current ref value (plus ref_map when MDS_REF_SET
// is defined), then asserts.
// NOTE(review): presumably invoked when a put() is detected as invalid —
// confirm against the base class.
1033 void bad_put(int by) override {
1034 generic_dout(0) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref
1035 #ifdef MDS_REF_SET
1036 << " (" << ref_map << ")"
1037 #endif
1038 << dendl;
1039 #ifdef MDS_REF_SET
// only checked when per-pin reference tracking is compiled in
1040 assert(ref_map[by] > 0);
1041 #endif
// always checked: the overall reference count must be positive
1042 assert(ref > 0);
1043 }
// Override of the base-class bad_get() hook.
// Emits the same kind of generic_dout(0) message as bad_put() (with the
// text " bad get ").  Unlike bad_put() it has no unconditional assert: the
// only check, ref_map[by] >= 0, is compiled in under MDS_REF_SET.
1044 void bad_get(int by) override {
1045 generic_dout(0) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref
1046 #ifdef MDS_REF_SET
1047 << " (" << ref_map << ")"
1048 #endif
1049 << dendl;
1050 #ifdef MDS_REF_SET
1051 assert(ref_map[by] >= 0);
1052 #endif
1053 }
// Reference-count hooks overriding base-class virtuals; bodies are defined
// out of line.
// NOTE(review): by name, first_get()/last_put() run on the first reference
// taken / last reference dropped — confirm in the base class.
1054 void first_get() override;
1055 void last_put() override;
1056 void _put() override;
1057
1058
1059 // -- hierarchy stuff --
1060 public:
1061 void set_primary_parent(CDentry *p) {
1062 assert(parent == 0);
1063 parent = p;
1064 }
1065 void remove_primary_parent(CDentry *dn) {
1066 assert(dn == parent);
1067 parent = 0;
1068 }
// Remote parent bookkeeping; declarations only.  num_remote_parents()
// reports the size of remote_parents.
// NOTE(review): presumably these two insert into / erase from
// remote_parents — confirm.
1069 void add_remote_parent(CDentry *p);
1070 void remove_remote_parent(CDentry *p);
1071 int num_remote_parents() {
1072 return remote_parents.size();
1073 }
1074
1075 void push_projected_parent(CDentry *dn) {
1076 projected_parent.push_back(dn);
1077 }
1078 void pop_projected_parent() {
1079 assert(projected_parent.size());
1080 parent = projected_parent.front();
1081 projected_parent.pop_front();
1082 }
1083
1084 public:
// Export pin handling; declarations only.  `update` defaults to false and
// `inherit` defaults to true.
1085 void maybe_export_pin(bool update=false);
1086 void set_export_pin(mds_rank_t rank);
1087 mds_rank_t get_export_pin(bool inherit=true) const;
1088 bool is_exportable(mds_rank_t dest) const;
1089
// print() overrides a base-class virtual; dump() is a const, non-virtual
// member that writes through the supplied Formatter.
1090 void print(ostream& out) override;
1091 void dump(Formatter *f) const;
1092
1093 /**
1094 * @defgroup Scrubbing and fsck
1095 * @{
1096 */
1097
1098 /**
1099 * Report the results of validation against a particular inode.
1100 * Each member is a pair of bools.
1101 * <member>.first represents if validation was performed against the member.
1102 * <member.second represents if the member passed validation.
1103 * performed_validation is set to true if the validation was actually
1104 * run. It might not be run if, for instance, the inode is marked as dirty.
1105 * passed_validation is set to true if everything that was checked
1106 * passed its validation.
1107 */
1108 struct validated_data {
1109 template<typename T>struct member_status {
1110 bool checked = false;
1111 bool passed = false;
1112 bool repaired = false;
1113 int ondisk_read_retval = 0;
1114 T ondisk_value;
1115 T memory_value;
1116 std::stringstream error_str;
1117 };
1118
1119 bool performed_validation;
1120 bool passed_validation;
1121
1122 struct raw_stats_t {
1123 frag_info_t dirstat;
1124 nest_info_t rstat;
1125 };
1126
1127 member_status<inode_backtrace_t> backtrace;
1128 member_status<inode_t> inode;
1129 member_status<raw_stats_t> raw_stats;
1130
1131 validated_data() : performed_validation(false),
1132 passed_validation(false) {}
1133
1134 void dump(Formatter *f) const;
1135
1136 bool all_damage_repaired() const;
1137 };
1138
1139 /**
 * Validate that the on-disk state of an inode matches what
 * we expect from our memory state. Currently this checks that:
 * 1) The backtrace associated with the file data exists and is correct
 * 2) For directories, the actual inode metadata matches our memory state,
 * 3) For directories, the rstats match
 *
 * @param results A freshly-created validated_data struct, with values set
 * as described in the struct documentation.
 * @param fin Context to call back on completion (or NULL)
 *
 * NOTE(review): this comment used to document an `mdr` request parameter
 * ("the request to be responded to upon completion"); the declaration
 * below takes no such argument.
 */
1152 void validate_disk_state(validated_data *results,
1153 MDSInternalContext *fin);
// Static helper: takes already-collected results and a Formatter.
// NOTE(review): presumably writes the results to `f` — confirm.
1154 static void dump_validation_results(const validated_data& results,
1155 Formatter *f);
1156 private:
// Internal step function used together with ValidationContinuation (which
// is granted friend access below); `rval` and `stage` are supplied by it.
1157 bool _validate_disk_state(class ValidationContinuation *c,
1158 int rval, int stage);
1159 friend class ValidationContinuation;
1160 /** @} Scrubbing and fsck */
1161 };
1162
// Stream-insertion operator for CInode::scrub_stamp_info_t; declared here,
// defined elsewhere.
1163 ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si);
1164
1165 #undef dout_context
1166 #endif