]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/CInode.h
import ceph 15.2.14
[ceph.git] / ceph / src / mds / CInode.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_CINODE_H
16 #define CEPH_CINODE_H
17
18 #include <list>
19 #include <map>
20 #include <set>
21 #include <string_view>
22
23 #include "common/config.h"
24 #include "include/counter.h"
25 #include "include/elist.h"
26 #include "include/types.h"
27 #include "include/lru.h"
28 #include "include/compact_set.h"
29
30 #include "MDSCacheObject.h"
31 #include "MDSContext.h"
32 #include "flock.h"
33
34 #include "BatchOp.h"
35 #include "CDentry.h"
36 #include "SimpleLock.h"
37 #include "ScatterLock.h"
38 #include "LocalLock.h"
39 #include "Capability.h"
40 #include "SnapRealm.h"
41 #include "Mutation.h"
42
43 #include "messages/MClientCaps.h"
44
45 #define dout_context g_ceph_context
46
47 class Context;
48 class CDir;
49 class CInode;
50 class MDCache;
51 class LogSegment;
52 struct SnapRealm;
53 class Session;
54 struct ObjectOperation;
55 class EMetaBlob;
56
/**
 * Associates an MDS lock type with a set of capability bits.
 * Presumably wr_caps are the client caps that imply writes covered by
 * that lock — confirm against the cinode_lock_info table's users.
 */
struct cinode_lock_info_t {
  int lock;     ///< lock type (CEPH_LOCK_* value)
  int wr_caps;  ///< cap bits associated with writes under this lock
};
61
62 /**
63 * Base class for CInode, containing the backing store data and
64 * serialization methods. This exists so that we can read and
65 * handle CInodes from the backing store without hitting all
66 * the business logic in CInode proper.
67 */
68 class InodeStoreBase {
69 public:
70 typedef inode_t<mempool::mds_co::pool_allocator> mempool_inode;
71 typedef old_inode_t<mempool::mds_co::pool_allocator> mempool_old_inode;
72 typedef mempool::mds_co::compact_map<snapid_t, mempool_old_inode> mempool_old_inode_map;
73 typedef xattr_map<mempool::mds_co::pool_allocator> mempool_xattr_map; // FIXME bufferptr not in mempool
74
75 InodeStoreBase() {}
76
77 /* Helpers */
78 bool is_file() const { return inode.is_file(); }
79 bool is_symlink() const { return inode.is_symlink(); }
80 bool is_dir() const { return inode.is_dir(); }
81 static object_t get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix);
82
83 /* Full serialization for use in ".inode" root inode objects */
84 void encode(bufferlist &bl, uint64_t features, const bufferlist *snap_blob=NULL) const;
85 void decode(bufferlist::const_iterator &bl, bufferlist& snap_blob);
86
87 /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */
88 void encode_bare(bufferlist &bl, uint64_t features, const bufferlist *snap_blob=NULL) const;
89 void decode_bare(bufferlist::const_iterator &bl, bufferlist &snap_blob, __u8 struct_v=5);
90
91 /* For test/debug output */
92 void dump(Formatter *f) const;
93
94 /* For use by offline tools */
95 __u32 hash_dentry_name(std::string_view dn);
96 frag_t pick_dirfrag(std::string_view dn);
97
98 mempool_inode inode; // the inode itself
99 mempool::mds_co::string symlink; // symlink dest, if symlink
100 mempool_xattr_map xattrs;
101 fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map.
102 mempool_old_inode_map old_inodes; // key = last, value.first = first
103 snapid_t oldest_snap = CEPH_NOSNAP;
104 damage_flags_t damage_flags = 0;
105 };
106
107 inline void decode_noshare(InodeStoreBase::mempool_xattr_map& xattrs,
108 ceph::buffer::list::const_iterator &p)
109 {
110 decode_noshare<mempool::mds_co::pool_allocator>(xattrs, p);
111 }
112
113 class InodeStore : public InodeStoreBase {
114 public:
115 void encode(bufferlist &bl, uint64_t features) const {
116 InodeStoreBase::encode(bl, features, &snap_blob);
117 }
118 void decode(bufferlist::const_iterator &bl) {
119 InodeStoreBase::decode(bl, snap_blob);
120 }
121 void encode_bare(bufferlist &bl, uint64_t features) const {
122 InodeStoreBase::encode_bare(bl, features, &snap_blob);
123 }
124 void decode_bare(bufferlist::const_iterator &bl) {
125 InodeStoreBase::decode_bare(bl, snap_blob);
126 }
127
128 static void generate_test_instances(std::list<InodeStore*>& ls);
129
130 // FIXME bufferlist not part of mempool
131 bufferlist snap_blob; // Encoded copy of SnapRealm, because we can't
132 // rehydrate it without full MDCache
133 };
134 WRITE_CLASS_ENCODER_FEATURES(InodeStore)
135
136 // just for ceph-dencoder
137 class InodeStoreBare : public InodeStore {
138 public:
139 void encode(bufferlist &bl, uint64_t features) const {
140 InodeStore::encode_bare(bl, features);
141 }
142 void decode(bufferlist::const_iterator &bl) {
143 InodeStore::decode_bare(bl);
144 }
145 static void generate_test_instances(std::list<InodeStoreBare*>& ls);
146 };
147 WRITE_CLASS_ENCODER_FEATURES(InodeStoreBare)
148
149 // cached inode wrapper
150 class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CInode> {
151 public:
152 MEMPOOL_CLASS_HELPERS();
153
154 using mempool_cap_map = mempool::mds_co::map<client_t, Capability>;
155 /**
156 * @defgroup Scrubbing and fsck
157 */
158
159 /**
160 * Report the results of validation against a particular inode.
161 * Each member is a pair of bools.
162 * <member>.first represents if validation was performed against the member.
163 * <member.second represents if the member passed validation.
164 * performed_validation is set to true if the validation was actually
165 * run. It might not be run if, for instance, the inode is marked as dirty.
166 * passed_validation is set to true if everything that was checked
167 * passed its validation.
168 */
169 struct validated_data {
170 template<typename T>struct member_status {
171 bool checked = false;
172 bool passed = false;
173 bool repaired = false;
174 int ondisk_read_retval = 0;
175 T ondisk_value;
176 T memory_value;
177 std::stringstream error_str;
178 };
179
180 struct raw_stats_t {
181 frag_info_t dirstat;
182 nest_info_t rstat;
183 };
184
185 validated_data() {}
186
187 void dump(Formatter *f) const;
188
189 bool all_damage_repaired() const;
190
191 bool performed_validation = false;
192 bool passed_validation = false;
193
194 member_status<inode_backtrace_t> backtrace;
195 member_status<mempool_inode> inode; // XXX should not be in mempool; wait for pmr
196 member_status<raw_stats_t> raw_stats;
197 };
198
199 // friends
200 friend class Server;
201 friend class Locker;
202 friend class Migrator;
203 friend class MDCache;
204 friend class StrayManager;
205 friend class CDir;
206 friend ostream& operator<<(ostream&, const CInode&);
207
208 class scrub_stamp_info_t {
209 public:
210 scrub_stamp_info_t() {}
211 void reset() {
212 scrub_start_version = last_scrub_version = 0;
213 scrub_start_stamp = last_scrub_stamp = utime_t();
214 }
215 /// version we started our latest scrub (whether in-progress or finished)
216 version_t scrub_start_version = 0;
217 /// time we started our latest scrub (whether in-progress or finished)
218 utime_t scrub_start_stamp;
219 /// version we started our most recent finished scrub
220 version_t last_scrub_version = 0;
221 /// time we started our most recent finished scrub
222 utime_t last_scrub_stamp;
223 };
224
225 class scrub_info_t : public scrub_stamp_info_t {
226 public:
227 scrub_info_t() {}
228
229 CDentry *scrub_parent = nullptr;
230 MDSContext *on_finish = nullptr;
231
232 bool last_scrub_dirty = false; /// are our stamps dirty with respect to disk state?
233 bool scrub_in_progress = false; /// are we currently scrubbing?
234 bool children_scrubbed = false;
235
236 /// my own (temporary) stamps and versions for each dirfrag we have
237 std::map<frag_t, scrub_stamp_info_t> dirfrag_stamps; // XXX not part of mempool
238
239 ScrubHeaderRef header;
240 };
241
  /**
   * Projection methods, used to store inode changes until they have been
   * journaled, at which point they are popped.
   *
   * Usage:
   * Call project_inode as needed; if you're changing xattrs or sr_t, pass
   * true for the corresponding argument, then modify the xattrs/snapnode
   * member of the returned projection as needed. (Dirty exception:
   * project_past_snaprealm_parent allows you to project the snapnode after
   * doing project_inode, i.e. you don't need to pass snap=true.)
   *
   * Then, journal. Once journaling is done, call pop_and_dirty_projected_inode.
   * That function takes care of the inode itself, the xattrs, and the snaprealm.
   */
255
256 class projected_inode {
257 public:
258 static sr_t* const UNDEF_SRNODE;
259
260 projected_inode() = delete;
261 explicit projected_inode(const mempool_inode &in) : inode(in) {}
262
263 mempool_inode inode;
264 std::unique_ptr<mempool_xattr_map> xattrs;
265 sr_t *snapnode = UNDEF_SRNODE;
266 };
267
268 // -- pins --
269 static const int PIN_DIRFRAG = -1;
270 static const int PIN_CAPS = 2; // client caps
271 static const int PIN_IMPORTING = -4; // importing
272 static const int PIN_OPENINGDIR = 7;
273 static const int PIN_REMOTEPARENT = 8;
274 static const int PIN_BATCHOPENJOURNAL = 9;
275 static const int PIN_SCATTERED = 10;
276 static const int PIN_STICKYDIRS = 11;
277 //static const int PIN_PURGING = -12;
278 static const int PIN_FREEZING = 13;
279 static const int PIN_FROZEN = 14;
280 static const int PIN_IMPORTINGCAPS = -15;
281 static const int PIN_PASTSNAPPARENT = -16;
282 static const int PIN_OPENINGSNAPPARENTS = 17;
283 static const int PIN_TRUNCATING = 18;
284 static const int PIN_STRAY = 19; // we pin our stray inode while active
285 static const int PIN_NEEDSNAPFLUSH = 20;
286 static const int PIN_DIRTYRSTAT = 21;
287 static const int PIN_EXPORTINGCAPS = 22;
288 static const int PIN_DIRTYPARENT = 23;
289 static const int PIN_DIRWAITER = 24;
290 static const int PIN_SCRUBQUEUE = 25;
291
292 // -- dump flags --
293 static const int DUMP_INODE_STORE_BASE = (1 << 0);
294 static const int DUMP_MDS_CACHE_OBJECT = (1 << 1);
295 static const int DUMP_LOCKS = (1 << 2);
296 static const int DUMP_STATE = (1 << 3);
297 static const int DUMP_CAPS = (1 << 4);
298 static const int DUMP_PATH = (1 << 5);
299 static const int DUMP_DIRFRAGS = (1 << 6);
300 static const int DUMP_ALL = (-1);
301 static const int DUMP_DEFAULT = DUMP_ALL & (~DUMP_PATH) & (~DUMP_DIRFRAGS);
302
303 // -- state --
304 static const int STATE_EXPORTING = (1<<0); // on nonauth bystander.
305 static const int STATE_OPENINGDIR = (1<<1);
306 static const int STATE_FREEZING = (1<<2);
307 static const int STATE_FROZEN = (1<<3);
308 static const int STATE_AMBIGUOUSAUTH = (1<<4);
309 static const int STATE_EXPORTINGCAPS = (1<<5);
310 static const int STATE_NEEDSRECOVER = (1<<6);
311 static const int STATE_RECOVERING = (1<<7);
312 static const int STATE_PURGING = (1<<8);
313 static const int STATE_DIRTYPARENT = (1<<9);
314 static const int STATE_DIRTYRSTAT = (1<<10);
315 static const int STATE_STRAYPINNED = (1<<11);
316 static const int STATE_FROZENAUTHPIN = (1<<12);
317 static const int STATE_DIRTYPOOL = (1<<13);
318 static const int STATE_REPAIRSTATS = (1<<14);
319 static const int STATE_MISSINGOBJS = (1<<15);
320 static const int STATE_EVALSTALECAPS = (1<<16);
321 static const int STATE_QUEUEDEXPORTPIN = (1<<17);
322 static const int STATE_TRACKEDBYOFT = (1<<18); // tracked by open file table
323 static const int STATE_DELAYEDEXPORTPIN = (1<<19);
324 static const int STATE_DISTEPHEMERALPIN = (1<<20);
325 static const int STATE_RANDEPHEMERALPIN = (1<<21);
326 static const int STATE_CLIENTWRITEABLE = (1<<22);
327
328 // orphan inode needs notification of releasing reference
329 static const int STATE_ORPHAN = STATE_NOTIFYREF;
330
331 static const int MASK_STATE_EXPORTED =
332 (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL|
333 STATE_DISTEPHEMERALPIN|STATE_RANDEPHEMERALPIN);
334 static const int MASK_STATE_EXPORT_KEPT =
335 (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS|
336 STATE_QUEUEDEXPORTPIN|STATE_TRACKEDBYOFT|STATE_DELAYEDEXPORTPIN|
337 STATE_DISTEPHEMERALPIN|STATE_RANDEPHEMERALPIN);
338
339 /* These are for "permanent" state markers that are passed around between
340 * MDS. Nothing protects/updates it like a typical MDS lock.
341 *
342 * Currently, we just use this for REPLICATED inodes. The reason we need to
343 * replicate the random epin state is because the directory inode is still
344 * under the authority of the parent subtree. So it's not exported normally
345 * and we can't pass around the state that way. The importer of the dirfrags
346 * still needs to know that the inode is random pinned though otherwise it
347 * doesn't know that the dirfrags are pinned.
348 */
349 static const int MASK_STATE_REPLICATED = STATE_RANDEPHEMERALPIN;
350
351 // -- waiters --
352 static const uint64_t WAIT_DIR = (1<<0);
353 static const uint64_t WAIT_FROZEN = (1<<1);
354 static const uint64_t WAIT_TRUNC = (1<<2);
355 static const uint64_t WAIT_FLOCK = (1<<3);
356
357 static const uint64_t WAIT_ANY_MASK = (uint64_t)(-1);
358
359 // misc
360 static const unsigned EXPORT_NONCE = 1; // nonce given to replicas created by export
361
362 // ---------------------------
363 CInode() = delete;
364 CInode(MDCache *c, bool auth=true, snapid_t f=2, snapid_t l=CEPH_NOSNAP);
365 ~CInode() override {
366 close_dirfrags();
367 close_snaprealm();
368 clear_file_locks();
369 ceph_assert(num_projected_xattrs == 0);
370 ceph_assert(num_projected_srnodes == 0);
371 ceph_assert(num_caps_notable == 0);
372 ceph_assert(num_subtree_roots == 0);
373 ceph_assert(num_exporting_dirs == 0);
374 ceph_assert(batch_ops.empty());
375 }
376
377 std::map<int, std::unique_ptr<BatchOp>> batch_ops;
378
379 std::string_view pin_name(int p) const override;
380
381 ostream& print_db_line_prefix(ostream& out) override;
382
383 const scrub_info_t *scrub_info() const{
384 if (!scrub_infop)
385 scrub_info_create();
386 return scrub_infop;
387 }
388
389 ScrubHeaderRef get_scrub_header() {
390 if (scrub_infop == nullptr) {
391 return nullptr;
392 } else {
393 return scrub_infop->header;
394 }
395 }
396
397 bool scrub_is_in_progress() const {
398 return (scrub_infop && scrub_infop->scrub_in_progress);
399 }
400 /**
401 * Start scrubbing on this inode. That could be very short if it's
402 * a file, or take a long time if we're recursively scrubbing a directory.
403 * @pre It is not currently scrubbing
404 * @post it has set up internal scrubbing state
405 * @param scrub_version What version are we scrubbing at (usually, parent
406 * directory's get_projected_version())
407 */
408 void scrub_initialize(CDentry *scrub_parent,
409 ScrubHeaderRef& header,
410 MDSContext *f);
411 /**
412 * Get the next dirfrag to scrub. Gives you a frag_t in output param which
413 * you must convert to a CDir (and possibly load off disk).
414 * @param dir A pointer to frag_t, will be filled in with the next dirfrag to
415 * scrub if there is one.
416 * @returns 0 on success, you should scrub the passed-out frag_t right now;
417 * ENOENT: There are no remaining dirfrags to scrub
418 * <0 There was some other error (It will return -ENOTDIR if not a directory)
419 */
420 int scrub_dirfrag_next(frag_t* out_dirfrag);
421 /**
422 * Get the currently scrubbing dirfrags. When returned, the
423 * passed-in list will be filled in with all frag_ts which have
424 * been returned from scrub_dirfrag_next but not sent back
425 * via scrub_dirfrag_finished.
426 */
427 void scrub_dirfrags_scrubbing(frag_vec_t *out_dirfrags);
428 /**
429 * Report to the CInode that a dirfrag it owns has been scrubbed. Call
430 * this for every frag_t returned from scrub_dirfrag_next().
431 * @param dirfrag The frag_t that was scrubbed
432 */
433 void scrub_dirfrag_finished(frag_t dirfrag);
434 /**
435 * Call this once the scrub has been completed, whether it's a full
436 * recursive scrub on a directory or simply the data on a file (or
437 * anything in between).
438 * @param c An out param which is filled in with a Context* that must
439 * be complete()ed.
440 */
441 void scrub_finished(MDSContext **c);
442
443 void scrub_aborted(MDSContext **c);
444
445 /**
446 * Report to the CInode that alldirfrags it owns have been scrubbed.
447 */
448 void scrub_children_finished() {
449 scrub_infop->children_scrubbed = true;
450 }
451 void scrub_set_finisher(MDSContext *c) {
452 ceph_assert(!scrub_infop->on_finish);
453 scrub_infop->on_finish = c;
454 }
455
456 bool is_multiversion() const {
457 return snaprealm || // other snaprealms will link to me
458 inode.is_dir() || // links to me in other snaps
459 inode.nlink > 1 || // there are remote links, possibly snapped, that will need to find me
460 !old_inodes.empty(); // once multiversion, always multiversion. until old_inodes gets cleaned out.
461 }
462 snapid_t get_oldest_snap();
463
464 bool is_dirty_rstat() {
465 return state_test(STATE_DIRTYRSTAT);
466 }
467 void mark_dirty_rstat();
468 void clear_dirty_rstat();
469
470 CInode::projected_inode &project_inode(bool xattr = false, bool snap = false);
471 void pop_and_dirty_projected_inode(LogSegment *ls);
472
473 projected_inode *get_projected_node() {
474 if (projected_nodes.empty())
475 return NULL;
476 else
477 return &projected_nodes.back();
478 }
479
480 version_t get_projected_version() const {
481 if (projected_nodes.empty())
482 return inode.version;
483 else
484 return projected_nodes.back().inode.version;
485 }
486 bool is_projected() const {
487 return !projected_nodes.empty();
488 }
489
490 const mempool_inode *get_projected_inode() const {
491 if (projected_nodes.empty())
492 return &inode;
493 else
494 return &projected_nodes.back().inode;
495 }
496 mempool_inode *get_projected_inode() {
497 if (projected_nodes.empty())
498 return &inode;
499 else
500 return &projected_nodes.back().inode;
501 }
502 mempool_inode *get_previous_projected_inode() {
503 ceph_assert(!projected_nodes.empty());
504 auto it = projected_nodes.rbegin();
505 ++it;
506 if (it != projected_nodes.rend())
507 return &it->inode;
508 else
509 return &inode;
510 }
511
512 mempool_xattr_map *get_projected_xattrs();
513 mempool_xattr_map *get_previous_projected_xattrs();
514
515 sr_t *prepare_new_srnode(snapid_t snapid);
516 void project_snaprealm(sr_t *new_srnode);
517 sr_t *project_snaprealm(snapid_t snapid=0) {
518 sr_t* new_srnode = prepare_new_srnode(snapid);
519 project_snaprealm(new_srnode);
520 return new_srnode;
521 }
522 const sr_t *get_projected_srnode() const;
523
524 void mark_snaprealm_global(sr_t *new_srnode);
525 void clear_snaprealm_global(sr_t *new_srnode);
526 bool is_projected_snaprealm_global() const;
527
528 void record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent);
529 void record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *newparent,
530 CDentry *dn, bool primary_dn);
531 void project_snaprealm_past_parent(SnapRealm *newparent);
532 void early_pop_projected_snaprealm();
533
534 mempool_old_inode& cow_old_inode(snapid_t follows, bool cow_head);
535 void split_old_inode(snapid_t snap);
536 mempool_old_inode *pick_old_inode(snapid_t last);
537 void pre_cow_old_inode();
538 bool has_snap_data(snapid_t s);
539 void purge_stale_snap_data(const std::set<snapid_t>& snaps);
540
541 size_t get_num_dirfrags() const { return dirfrags.size(); }
542 CDir* get_dirfrag(frag_t fg) {
543 auto pi = dirfrags.find(fg);
544 if (pi != dirfrags.end()) {
545 //assert(g_conf()->debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME
546 return pi->second;
547 }
548 return NULL;
549 }
550 std::pair<bool, std::vector<CDir*>> get_dirfrags_under(frag_t fg);
551 CDir* get_approx_dirfrag(frag_t fg);
552
553 template<typename Container>
554 void get_dirfrags(Container& ls) const {
555 // all dirfrags
556 if constexpr (std::is_same_v<Container, std::vector<CDir*>>)
557 ls.reserve(ls.size() + dirfrags.size());
558 for (const auto &p : dirfrags)
559 ls.push_back(p.second);
560 }
561
562 auto get_dirfrags() const {
563 std::vector<CDir*> result;
564 get_dirfrags(result);
565 return result;
566 }
567
568 void get_nested_dirfrags(std::vector<CDir*>&) const;
569 std::vector<CDir*> get_nested_dirfrags() const {
570 std::vector<CDir*> v;
571 get_nested_dirfrags(v);
572 return v;
573 }
574 void get_subtree_dirfrags(std::vector<CDir*>&) const;
575 std::vector<CDir*> get_subtree_dirfrags() const {
576 std::vector<CDir*> v;
577 get_subtree_dirfrags(v);
578 return v;
579 }
580 int get_num_subtree_roots() const {
581 return num_subtree_roots;
582 }
583
584 CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg);
585 CDir *add_dirfrag(CDir *dir);
586 void close_dirfrag(frag_t fg);
587 void close_dirfrags();
588 bool has_subtree_root_dirfrag(int auth=-1);
589 bool has_subtree_or_exporting_dirfrag();
590
591 void force_dirfrags();
592 void verify_dirfrags();
593
594 void get_stickydirs();
595 void put_stickydirs();
596
597 void add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
598 void remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
599 pair<bool,bool> split_need_snapflush(CInode *cowin, CInode *in);
600
601 // -- accessors --
602 bool is_root() const { return ino() == CEPH_INO_ROOT; }
603 bool is_stray() const { return MDS_INO_IS_STRAY(inode.ino); }
604 mds_rank_t get_stray_owner() const {
605 return (mds_rank_t)MDS_INO_STRAY_OWNER(inode.ino);
606 }
607 bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(inode.ino); }
608 bool is_base() const { return MDS_INO_IS_BASE(inode.ino); }
609 bool is_system() const { return inode.ino < MDS_INO_SYSTEM_BASE; }
610 bool is_normal() const { return !(is_base() || is_system() || is_stray()); }
611
612 bool is_head() const { return last == CEPH_NOSNAP; }
613
614 // note: this overloads MDSCacheObject
615 bool is_ambiguous_auth() const {
616 return state_test(STATE_AMBIGUOUSAUTH) ||
617 MDSCacheObject::is_ambiguous_auth();
618 }
619 void set_ambiguous_auth() {
620 state_set(STATE_AMBIGUOUSAUTH);
621 }
622 void clear_ambiguous_auth(MDSContext::vec& finished);
623 void clear_ambiguous_auth();
624
625 inodeno_t ino() const { return inode.ino; }
626 vinodeno_t vino() const { return vinodeno_t(inode.ino, last); }
627 int d_type() const { return IFTODT(inode.mode); }
628
629 mempool_inode& get_inode() { return inode; }
630 const mempool_inode& get_inode() const { return inode; }
631 CDentry* get_parent_dn() { return parent; }
632 const CDentry* get_parent_dn() const { return parent; }
633 CDentry* get_projected_parent_dn() { return !projected_parent.empty() ? projected_parent.back() : parent; }
634 const CDentry* get_projected_parent_dn() const { return !projected_parent.empty() ? projected_parent.back() : parent; }
635 const CDentry* get_oldest_parent_dn() const {
636 if (parent)
637 return parent;
638 return !projected_parent.empty() ? projected_parent.front(): NULL;
639 }
640 CDir *get_parent_dir();
641 const CDir *get_projected_parent_dir() const;
642 CDir *get_projected_parent_dir();
643 CInode *get_parent_inode();
644
645 bool is_lt(const MDSCacheObject *r) const override {
646 const CInode *o = static_cast<const CInode*>(r);
647 return ino() < o->ino() ||
648 (ino() == o->ino() && last < o->last);
649 }
650
651 // -- misc --
652 bool is_ancestor_of(const CInode *other) const;
653 bool is_projected_ancestor_of(const CInode *other) const;
654
655 void make_path_string(std::string& s, bool projected=false, const CDentry *use_parent=NULL) const;
656 void make_path(filepath& s, bool projected=false) const;
657 void name_stray_dentry(std::string& dname);
658
659 // -- dirtyness --
660 version_t get_version() const { return inode.version; }
661
662 version_t pre_dirty();
663 void _mark_dirty(LogSegment *ls);
664 void mark_dirty(version_t projected_dirv, LogSegment *ls);
665 void mark_clean();
666
667 void store(MDSContext *fin);
668 void _stored(int r, version_t cv, Context *fin);
669 /**
670 * Flush a CInode to disk. This includes the backtrace, the parent
671 * directory's link, and the Inode object itself (if a base directory).
672 * @pre is_auth() on both the inode and its containing directory
673 * @pre can_auth_pin()
674 * @param fin The Context to call when the flush is completed.
675 */
676 void flush(MDSContext *fin);
677 void fetch(MDSContext *fin);
678 void _fetched(bufferlist& bl, bufferlist& bl2, Context *fin);
679
680 void build_backtrace(int64_t pool, inode_backtrace_t& bt);
681 void store_backtrace(MDSContext *fin, int op_prio=-1);
682 void _stored_backtrace(int r, version_t v, Context *fin);
683 void fetch_backtrace(Context *fin, bufferlist *backtrace);
684
685 void mark_dirty_parent(LogSegment *ls, bool dirty_pool=false);
686 void clear_dirty_parent();
687 void verify_diri_backtrace(bufferlist &bl, int err);
688 bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); }
689 bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); }
690
691 void encode_snap_blob(bufferlist &bl);
692 void decode_snap_blob(const bufferlist &bl);
693 void encode_store(bufferlist& bl, uint64_t features);
694 void decode_store(bufferlist::const_iterator& bl);
695
696 void add_dir_waiter(frag_t fg, MDSContext *c);
697 void take_dir_waiting(frag_t fg, MDSContext::vec& ls);
698 bool is_waiting_for_dir(frag_t fg) {
699 return waiting_on_dir.count(fg);
700 }
701 void add_waiter(uint64_t tag, MDSContext *c) override;
702 void take_waiting(uint64_t tag, MDSContext::vec& ls) override;
703
704 // -- encode/decode helpers --
705 void _encode_base(bufferlist& bl, uint64_t features);
706 void _decode_base(bufferlist::const_iterator& p);
707 void _encode_locks_full(bufferlist& bl);
708 void _decode_locks_full(bufferlist::const_iterator& p);
709 void _encode_locks_state_for_replica(bufferlist& bl, bool need_recover);
710 void _encode_locks_state_for_rejoin(bufferlist& bl, int rep);
711 void _decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new);
712 void _decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
713 std::list<SimpleLock*>& eval_locks, bool survivor);
714
715 // -- import/export --
716 void encode_export(bufferlist& bl);
717 void finish_export();
718 void abort_export() {
719 put(PIN_TEMPEXPORTING);
720 ceph_assert(state_test(STATE_EXPORTINGCAPS));
721 state_clear(STATE_EXPORTINGCAPS);
722 put(PIN_EXPORTINGCAPS);
723 }
724 void decode_import(bufferlist::const_iterator& p, LogSegment *ls);
725
726 // for giving to clients
727 int encode_inodestat(bufferlist& bl, Session *session, SnapRealm *realm,
728 snapid_t snapid=CEPH_NOSNAP, unsigned max_bytes=0,
729 int getattr_wants=0);
730 void encode_cap_message(const ref_t<MClientCaps> &m, Capability *cap);
731
732 SimpleLock* get_lock(int type) override;
733
734 void set_object_info(MDSCacheObjectInfo &info) override;
735
736 void encode_lock_state(int type, bufferlist& bl) override;
737 void decode_lock_state(int type, const bufferlist& bl) override;
738 void encode_lock_iauth(bufferlist& bl);
739 void decode_lock_iauth(bufferlist::const_iterator& p);
740 void encode_lock_ilink(bufferlist& bl);
741 void decode_lock_ilink(bufferlist::const_iterator& p);
742 void encode_lock_idft(bufferlist& bl);
743 void decode_lock_idft(bufferlist::const_iterator& p);
744 void encode_lock_ifile(bufferlist& bl);
745 void decode_lock_ifile(bufferlist::const_iterator& p);
746 void encode_lock_inest(bufferlist& bl);
747 void decode_lock_inest(bufferlist::const_iterator& p);
748 void encode_lock_ixattr(bufferlist& bl);
749 void decode_lock_ixattr(bufferlist::const_iterator& p);
750 void encode_lock_isnap(bufferlist& bl);
751 void decode_lock_isnap(bufferlist::const_iterator& p);
752 void encode_lock_iflock(bufferlist& bl);
753 void decode_lock_iflock(bufferlist::const_iterator& p);
754 void encode_lock_ipolicy(bufferlist& bl);
755 void decode_lock_ipolicy(bufferlist::const_iterator& p);
756
757 void _finish_frag_update(CDir *dir, MutationRef& mut);
758
759 void clear_dirty_scattered(int type) override;
760 bool is_dirty_scattered();
761 void clear_scatter_dirty(); // on rejoin ack
762
763 void start_scatter(ScatterLock *lock);
764 void finish_scatter_update(ScatterLock *lock, CDir *dir,
765 version_t inode_version, version_t dir_accounted_version);
766 void finish_scatter_gather_update(int type);
767 void finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob);
768
769 // -- snap --
770 void open_snaprealm(bool no_split=false);
771 void close_snaprealm(bool no_join=false);
772 SnapRealm *find_snaprealm() const;
773 void encode_snap(bufferlist& bl);
774 void decode_snap(bufferlist::const_iterator& p);
775
776 client_t get_loner() const { return loner_cap; }
777 client_t get_wanted_loner() const { return want_loner_cap; }
778
779 // this is the loner state our locks should aim for
780 client_t get_target_loner() const {
781 if (loner_cap == want_loner_cap)
782 return loner_cap;
783 else
784 return -1;
785 }
786
787 client_t calc_ideal_loner();
788 void set_loner_cap(client_t l);
789 bool choose_ideal_loner();
790 bool try_set_loner();
791 bool try_drop_loner();
792
793 // choose new lock state during recovery, based on issued caps
794 void choose_lock_state(SimpleLock *lock, int allissued);
795 void choose_lock_states(int dirty_caps);
796
797 int count_nonstale_caps();
798 bool multiple_nonstale_caps();
799
800 bool is_any_caps() { return !client_caps.empty(); }
801 bool is_any_nonstale_caps() { return count_nonstale_caps(); }
802
803 const mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; }
804 void set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m);
805 void set_mds_caps_wanted(mds_rank_t mds, int32_t wanted);
806
807 const mempool_cap_map& get_client_caps() const { return client_caps; }
808 Capability *get_client_cap(client_t client) {
809 auto client_caps_entry = client_caps.find(client);
810 if (client_caps_entry != client_caps.end())
811 return &client_caps_entry->second;
812 return 0;
813 }
814 int get_client_cap_pending(client_t client) const {
815 auto client_caps_entry = client_caps.find(client);
816 if (client_caps_entry != client_caps.end()) {
817 return client_caps_entry->second.pending();
818 } else {
819 return 0;
820 }
821 }
822
823 int get_num_caps_notable() const { return num_caps_notable; }
824 void adjust_num_caps_notable(int d);
825
826 Capability *add_client_cap(client_t client, Session *session,
827 SnapRealm *conrealm=nullptr, bool new_inode=false);
828 void remove_client_cap(client_t client);
829 void move_to_realm(SnapRealm *realm);
830
831 Capability *reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session);
832 void clear_client_caps_after_export();
833 void export_client_caps(std::map<client_t,Capability::Export>& cl);
834
835 // caps allowed
836 int get_caps_liked() const;
837 int get_caps_allowed_ever() const;
838 int get_caps_allowed_by_type(int type) const;
839 int get_caps_careful() const;
840 int get_xlocker_mask(client_t client) const;
841 int get_caps_allowed_for_client(Session *s, Capability *cap, mempool_inode *file_i) const;
842
843 // caps issued, wanted
844 int get_caps_issued(int *ploner = 0, int *pother = 0, int *pxlocker = 0,
845 int shift = 0, int mask = -1);
846 bool is_any_caps_wanted() const;
847 int get_caps_wanted(int *ploner = 0, int *pother = 0, int shift = 0, int mask = -1) const;
848 bool issued_caps_need_gather(SimpleLock *lock);
849
850 // client writeable
851 bool is_clientwriteable() const { return state & STATE_CLIENTWRITEABLE; }
852 void mark_clientwriteable();
853 void clear_clientwriteable();
854
  // -- authority --
  // Which MDS rank(s) are authoritative for this inode.
  mds_authority_t authority() const override;

  // -- auth pins --
  bool can_auth_pin(int *err_ret=nullptr) const override;
  void auth_pin(void *by) override;
  void auth_unpin(void *by) override;

  // -- freeze --
  // Freezing/frozen status is tracked via STATE_* flags on the object.
  bool is_freezing_inode() const { return state_test(STATE_FREEZING); }
  bool is_frozen_inode() const { return state_test(STATE_FROZEN); }
  bool is_frozen_auth_pin() const { return state_test(STATE_FROZENAUTHPIN); }
  bool is_frozen() const override;
  bool is_frozen_dir() const;
  bool is_freezing() const override;

  /* Freeze the inode. auth_pin_allowance lets the caller account for any
   * auth_pins it is itself holding/responsible for. */
  bool freeze_inode(int auth_pin_allowance=0);
  void unfreeze_inode(MDSContext::vec& finished);
  void unfreeze_inode();

  void freeze_auth_pin();
  void unfreeze_auth_pin();
879
880 // -- reference counting --
  // Debug hook for an unbalanced put of pin type `by`: log the offending
  // pin at level 0, then assert the counts are still positive (per-pin
  // map only when built with MDS_REF_SET).
  void bad_put(int by) override {
    generic_dout(0) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref
#ifdef MDS_REF_SET
                    << " (" << ref_map << ")"
#endif
                    << dendl;
#ifdef MDS_REF_SET
    ceph_assert(ref_map[by] > 0);
#endif
    ceph_assert(ref > 0);
  }
  // Debug hook for a suspicious get of pin type `by`: log at level 0;
  // with MDS_REF_SET, assert the per-pin count has not gone negative.
  void bad_get(int by) override {
    generic_dout(0) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref
#ifdef MDS_REF_SET
                    << " (" << ref_map << ")"
#endif
                    << dendl;
#ifdef MDS_REF_SET
    ceph_assert(ref_map[by] >= 0);
#endif
  }
  // Hooks fired when the first reference is taken / the last is dropped
  // (definitions elsewhere).
  void first_get() override;
  void last_put() override;
  void _put() override;
905
906 // -- hierarchy stuff --
907 void set_primary_parent(CDentry *p) {
908 ceph_assert(parent == 0 ||
909 g_conf().get_val<bool>("mds_hack_allow_loading_invalid_metadata"));
910 parent = p;
911 }
912 void remove_primary_parent(CDentry *dn) {
913 ceph_assert(dn == parent);
914 parent = 0;
915 }
  // Remote (hard-link) parents: dentries that refer to this inode
  // remotely; tracked in the remote_parents set.
  void add_remote_parent(CDentry *p);
  void remove_remote_parent(CDentry *p);
  int num_remote_parents() {
    return remote_parents.size();
  }

  // Queue a projected primary parent (in-progress rename/(un)link).
  void push_projected_parent(CDentry *dn) {
    projected_parent.push_back(dn);
  }
  // Commit the oldest projected parent as the real primary parent.
  void pop_projected_parent() {
    ceph_assert(projected_parent.size());
    parent = projected_parent.front();
    projected_parent.pop_front();
  }
  bool is_parent_projected() const {
    return !projected_parent.empty();
  }
933
  // -- export pinning (bodies elsewhere) --
  mds_rank_t get_export_pin(bool inherit=true, bool ephemeral=true) const;
  void set_export_pin(mds_rank_t rank);
  void queue_export_pin(mds_rank_t target);
  void maybe_export_pin(bool update=false);

  void check_pin_policy();

  // Ephemeral distributed pinning; status is the STATE_DISTEPHEMERALPIN flag.
  void set_ephemeral_dist(bool yes);
  void maybe_ephemeral_dist(bool update=false);
  void maybe_ephemeral_dist_children(bool update=false);
  void setxattr_ephemeral_dist(bool val=false);
  bool is_ephemeral_dist() const {
    return state_test(STATE_DISTEPHEMERALPIN);
  }

  // Ephemeral random pinning; status is the STATE_RANDEPHEMERALPIN flag.
  double get_ephemeral_rand(bool inherit=true) const;
  void set_ephemeral_rand(bool yes);
  void maybe_ephemeral_rand(bool fresh=false, double threshold=-1.0);
  void setxattr_ephemeral_rand(double prob=0.0);
  bool is_ephemeral_rand() const {
    return state_test(STATE_RANDEPHEMERALPIN);
  }
956
957 bool has_ephemeral_policy() const {
958 return get_inode().export_ephemeral_random_pin > 0.0 ||
959 get_inode().export_ephemeral_distributed_pin;
960 }
  // True if either ephemeral pin flag is currently set on this inode.
  bool is_ephemerally_pinned() const {
    return state_test(STATE_DISTEPHEMERALPIN) ||
           state_test(STATE_RANDEPHEMERALPIN);
  }
  bool is_exportable(mds_rank_t dest) const;

  // Re-evaluate every pinning policy (export pin, ephemeral dist/rand).
  void maybe_pin() {
    maybe_export_pin();
    maybe_ephemeral_dist();
    maybe_ephemeral_rand();
  }
972
  void print(ostream& out) override;
  void dump(Formatter *f, int flags = DUMP_DEFAULT) const;

  /**
   * Validate that the on-disk state of an inode matches what
   * we expect from our memory state. Currently this checks that:
   * 1) The backtrace associated with the file data exists and is correct
   * 2) For directories, the actual inode metadata matches our memory state,
   * 3) For directories, the rstats match
   *
   * @param results A freshly-created validated_data struct, with values set
   * as described in the struct documentation.
   * @param fin Context to call back on completion (or NULL)
   */
  void validate_disk_state(validated_data *results,
                           MDSContext *fin);
  static void dump_validation_results(const validated_data& results,
                                      Formatter *f);
993
  //bool hack_accessed = false;
  //utime_t hack_load_stamp;

  MDCache *mdcache;   // back-pointer to the owning cache (non-owning)

  // snap realm rooted at this inode (if any), and the realm containing it
  SnapRealm *snaprealm = nullptr;
  SnapRealm *containing_realm = nullptr;
  snapid_t first, last;   // snapid range covered by this in-memory inode
  mempool::mds_co::compact_set<snapid_t> dirty_old_rstats;

  uint64_t last_journaled = 0;       // log offset for the last time i was journaled
  //loff_t last_open_journaled;  // log offset for the last journaled EOpen
  utime_t last_dirstat_prop;

  // list item node for when we have unpropagated rstat data
  elist<CInode*>::item dirty_rstat_item;

  // clients with caps tied to snapshot state; presumably clients in
  // client_need_snapflush owe a snapflush for the given snapid — verify
  // against the .cc
  mempool::mds_co::set<client_t> client_snap_caps;
  mempool::mds_co::compact_map<snapid_t, mempool::mds_co::set<client_t> > client_need_snapflush;

  // LogSegment lists i (may) belong to
  elist<CInode*>::item item_dirty;
  elist<CInode*>::item item_caps;
  elist<CInode*>::item item_open_file;
  elist<CInode*>::item item_dirty_parent;
  elist<CInode*>::item item_dirty_dirfrag_dir;
  elist<CInode*>::item item_dirty_dirfrag_nest;
  elist<CInode*>::item item_dirty_dirfrag_dirfragtree;
  elist<CInode*>::item item_scrub;

  // NOTE: the recovery queue aliases the two dirfrag items above.
  // also update RecoveryQueue::RecoveryQueue() if you change this
  elist<CInode*>::item& item_recover_queue = item_dirty_dirfrag_dir;
  elist<CInode*>::item& item_recover_queue_front = item_dirty_dirfrag_nest;

  inode_load_vec_t pop;   // load/popularity statistics
  elist<CInode*>::item item_pop_lru;

  // -- locks --
  static LockType versionlock_type;
  static LockType authlock_type;
  static LockType linklock_type;
  static LockType dirfragtreelock_type;
  static LockType filelock_type;
  static LockType xattrlock_type;
  static LockType snaplock_type;
  static LockType nestlock_type;
  static LockType flocklock_type;
  static LockType policylock_type;

  // FIXME not part of mempool
  LocalLock versionlock;
  SimpleLock authlock;
  SimpleLock linklock;
  ScatterLock dirfragtreelock;
  ScatterLock filelock;
  SimpleLock xattrlock;
  SimpleLock snaplock;
  ScatterLock nestlock;
  SimpleLock flocklock;
  SimpleLock policylock;

  // -- caps -- (new)
  // client caps; -1 means "no (wanted) loner client"
  client_t loner_cap = -1, want_loner_cap = -1;
1058
1059 protected:
1060 ceph_lock_state_t *get_fcntl_lock_state() {
1061 if (!fcntl_locks)
1062 fcntl_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FCNTL);
1063 return fcntl_locks;
1064 }
1065 void clear_fcntl_lock_state() {
1066 delete fcntl_locks;
1067 fcntl_locks = NULL;
1068 }
1069 ceph_lock_state_t *get_flock_lock_state() {
1070 if (!flock_locks)
1071 flock_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FLOCK);
1072 return flock_locks;
1073 }
1074 void clear_flock_lock_state() {
1075 delete flock_locks;
1076 flock_locks = NULL;
1077 }
  // Drop both the fcntl and flock tables.
  void clear_file_locks() {
    clear_fcntl_lock_state();
    clear_flock_lock_state();
  }
  // Serialize both lock tables. Wire format: a presence bool followed by
  // the table itself (only when allocated and non-empty), fcntl first,
  // then flock. _decode_file_locks() must mirror this layout exactly.
  void _encode_file_locks(bufferlist& bl) const {
    using ceph::encode;
    bool has_fcntl_locks = fcntl_locks && !fcntl_locks->empty();
    encode(has_fcntl_locks, bl);
    if (has_fcntl_locks)
      encode(*fcntl_locks, bl);
    bool has_flock_locks = flock_locks && !flock_locks->empty();
    encode(has_flock_locks, bl);
    if (has_flock_locks)
      encode(*flock_locks, bl);
  }
  // Deserialize the lock tables written by _encode_file_locks(): a
  // presence bool, then (when set) the table — fcntl first, then flock.
  // When a table is absent in the stream, any existing in-memory state
  // is cleared so this inode matches the encoded source.
  void _decode_file_locks(bufferlist::const_iterator& p) {
    using ceph::decode;
    bool has_fcntl_locks;
    decode(has_fcntl_locks, p);
    if (has_fcntl_locks)
      decode(*get_fcntl_lock_state(), p);
    else
      clear_fcntl_lock_state();
    bool has_flock_locks;
    decode(has_flock_locks, p);
    if (has_flock_locks)
      decode(*get_flock_lock_state(), p);
    else
      clear_flock_lock_state();
  }
1108
  /**
   * Return the pool ID where we currently write backtraces for
   * this inode (in addition to inode.old_pools)
   *
   * @returns a pool ID >=0
   */
  int64_t get_backtrace_pool() const;

  // parent dentries in cache
  CDentry *parent = nullptr;             // primary link
  mempool::mds_co::compact_set<CDentry*> remote_parents;     // if hard linked

  mempool::mds_co::list<CDentry*> projected_parent;   // for in-progress rename, (un)link, etc.

  mds_authority_t inode_auth = CDIR_AUTH_DEFAULT;

  // -- distributed state --
  // file capabilities
  mempool_cap_map client_caps;         // client -> caps
  mempool::mds_co::compact_map<int32_t, int32_t> mds_caps_wanted;     // [auth] mds -> caps wanted
  int replica_caps_wanted = 0; // [replica] what i've requested from auth
  int num_caps_notable = 0;    // maintained by adjust_num_caps_notable()

  // lazily-allocated file lock tables; see get_*_lock_state()
  ceph_lock_state_t *fcntl_locks = nullptr;
  ceph_lock_state_t *flock_locks = nullptr;

  // -- waiting --
  // contexts waiting on a specific dirfrag of this inode
  mempool::mds_co::compact_map<frag_t, MDSContext::vec > waiting_on_dir;


  // -- freezing inode --
  // auth-pin count the freezer is allowed to disregard; see freeze_inode()
  int auth_pin_freeze_allowance = 0;
  elist<CInode*>::item item_freezing_inode;
  void maybe_finish_freeze_inode();
 private:

  friend class ValidationContinuation;

  /**
   * Create a scrub_info_t struct for the scrub_infop pointer.
   */
  void scrub_info_create() const;
  /**
   * Delete the scrub_info_t struct if it's not got any useful data
   */
  void scrub_maybe_delete_info();

  void pop_projected_snaprealm(sr_t *next_snaprealm, bool early);

  // staged continuation body of validate_disk_state(); driven by
  // ValidationContinuation (a friend, above)
  bool _validate_disk_state(class ValidationContinuation *c,
                            int rval, int stage);

  mempool::mds_co::list<projected_inode> projected_nodes;   // projected values (only defined while dirty)
  size_t num_projected_xattrs = 0;
  size_t num_projected_srnodes = 0;

  // -- cache infrastructure --
  mempool::mds_co::compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode

  //for the purpose of quickly determining whether there's a subtree root or exporting dir
  int num_subtree_roots = 0;
  int num_exporting_dirs = 0;

  int stickydir_ref = 0;                 // presumably a sticky-dirfrag pin count — verify in .cc
  scrub_info_t *scrub_infop = nullptr;   // allocated by scrub_info_create()
  /** @} Scrubbing and fsck */
1175 };
1176
1177 ostream& operator<<(ostream& out, const CInode& in);
1178 ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si);
1179
1180 extern cinode_lock_info_t cinode_lock_info[];
1181 extern int num_cinode_locks;
1182 #undef dout_context
1183 #endif