]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/CInode.h
update sources to 12.2.10
[ceph.git] / ceph / src / mds / CInode.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16
17#ifndef CEPH_CINODE_H
18#define CEPH_CINODE_H
19
94b18763
FG
20#include <list>
21#include <map>
22#include <set>
23#include <boost/utility/string_view.hpp>
24
7c673cae
FG
25#include "common/config.h"
26#include "include/counter.h"
27#include "include/elist.h"
28#include "include/types.h"
29#include "include/lru.h"
30#include "include/compact_set.h"
31
32#include "MDSCacheObject.h"
33#include "flock.h"
34
35#include "CDentry.h"
36#include "SimpleLock.h"
37#include "ScatterLock.h"
38#include "LocalLock.h"
39#include "Capability.h"
40#include "SnapRealm.h"
41#include "Mutation.h"
42
7c673cae
FG
43#define dout_context g_ceph_context
44
45class Context;
46class CDentry;
47class CDir;
48class Message;
49class CInode;
50class MDCache;
51class LogSegment;
52struct SnapRealm;
53class Session;
54class MClientCaps;
55struct ObjectOperation;
56class EMetaBlob;
57
58
59ostream& operator<<(ostream& out, const CInode& in);
60
/// One entry of the cinode_lock_info[] table: a lock id and a caps mask.
// NOTE(review): presumably `lock` is a CEPH_LOCK_* id and `wr_caps` the write
// capability bits tied to that lock -- confirm against the table definition.
struct cinode_lock_info_t {
  int lock;     // lock identifier
  int wr_caps;  // capability bits paired with `lock`
};
65
66extern cinode_lock_info_t cinode_lock_info[];
67extern int num_cinode_locks;
68
69
70/**
71 * Base class for CInode, containing the backing store data and
72 * serialization methods. This exists so that we can read and
73 * handle CInodes from the backing store without hitting all
74 * the business logic in CInode proper.
75 */
76class InodeStoreBase {
77public:
94b18763
FG
78 typedef inode_t<mempool::mds_co::pool_allocator> mempool_inode;
79 typedef old_inode_t<mempool::mds_co::pool_allocator> mempool_old_inode;
80 typedef mempool::mds_co::compact_map<snapid_t, mempool_old_inode> mempool_old_inode_map;
81 typedef xattr_map<mempool::mds_co::pool_allocator> mempool_xattr_map; // FIXME bufferptr not in mempool
82
83 mempool_inode inode; // the inode itself
84 mempool::mds_co::string symlink; // symlink dest, if symlink
85 mempool_xattr_map xattrs;
7c673cae 86 fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map.
94b18763
FG
87 mempool_old_inode_map old_inodes; // key = last, value.first = first
88 snapid_t oldest_snap = CEPH_NOSNAP;
89 damage_flags_t damage_flags = 0;
7c673cae 90
94b18763 91 InodeStoreBase() {}
7c673cae
FG
92
93 /* Helpers */
94 bool is_file() const { return inode.is_file(); }
95 bool is_symlink() const { return inode.is_symlink(); }
96 bool is_dir() const { return inode.is_dir(); }
97 static object_t get_object_name(inodeno_t ino, frag_t fg, const char *suffix);
98
99 /* Full serialization for use in ".inode" root inode objects */
100 void encode(bufferlist &bl, uint64_t features, const bufferlist *snap_blob=NULL) const;
101 void decode(bufferlist::iterator &bl, bufferlist& snap_blob);
102
103 /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */
104 void encode_bare(bufferlist &bl, uint64_t features, const bufferlist *snap_blob=NULL) const;
105 void decode_bare(bufferlist::iterator &bl, bufferlist &snap_blob, __u8 struct_v=5);
106
107 /* For test/debug output */
108 void dump(Formatter *f) const;
109
110 /* For use by offline tools */
94b18763
FG
111 __u32 hash_dentry_name(boost::string_view dn);
112 frag_t pick_dirfrag(boost::string_view dn);
7c673cae
FG
113};
114
115class InodeStore : public InodeStoreBase {
116public:
94b18763 117 // FIXME bufferlist not part of mempool
7c673cae
FG
118 bufferlist snap_blob; // Encoded copy of SnapRealm, because we can't
119 // rehydrate it without full MDCache
120 void encode(bufferlist &bl, uint64_t features) const {
121 InodeStoreBase::encode(bl, features, &snap_blob);
122 }
123 void decode(bufferlist::iterator &bl) {
124 InodeStoreBase::decode(bl, snap_blob);
125 }
126 void encode_bare(bufferlist &bl, uint64_t features) const {
127 InodeStoreBase::encode_bare(bl, features, &snap_blob);
128 }
129 void decode_bare(bufferlist::iterator &bl) {
130 InodeStoreBase::decode_bare(bl, snap_blob);
131 }
132
133 static void generate_test_instances(std::list<InodeStore*>& ls);
134};
135WRITE_CLASS_ENCODER_FEATURES(InodeStore)
136
137// cached inode wrapper
138class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CInode> {
139 public:
181888fb 140 MEMPOOL_CLASS_HELPERS();
7c673cae
FG
141 // -- pins --
142 static const int PIN_DIRFRAG = -1;
143 static const int PIN_CAPS = 2; // client caps
144 static const int PIN_IMPORTING = -4; // importing
145 static const int PIN_OPENINGDIR = 7;
146 static const int PIN_REMOTEPARENT = 8;
147 static const int PIN_BATCHOPENJOURNAL = 9;
148 static const int PIN_SCATTERED = 10;
149 static const int PIN_STICKYDIRS = 11;
150 //static const int PIN_PURGING = -12;
151 static const int PIN_FREEZING = 13;
152 static const int PIN_FROZEN = 14;
153 static const int PIN_IMPORTINGCAPS = -15;
154 static const int PIN_PASTSNAPPARENT = -16;
155 static const int PIN_OPENINGSNAPPARENTS = 17;
156 static const int PIN_TRUNCATING = 18;
157 static const int PIN_STRAY = 19; // we pin our stray inode while active
158 static const int PIN_NEEDSNAPFLUSH = 20;
159 static const int PIN_DIRTYRSTAT = 21;
160 static const int PIN_EXPORTINGCAPS = 22;
161 static const int PIN_DIRTYPARENT = 23;
162 static const int PIN_DIRWAITER = 24;
163 static const int PIN_SCRUBQUEUE = 25;
164
165 const char *pin_name(int p) const override {
166 switch (p) {
167 case PIN_DIRFRAG: return "dirfrag";
168 case PIN_CAPS: return "caps";
169 case PIN_IMPORTING: return "importing";
170 case PIN_OPENINGDIR: return "openingdir";
171 case PIN_REMOTEPARENT: return "remoteparent";
172 case PIN_BATCHOPENJOURNAL: return "batchopenjournal";
173 case PIN_SCATTERED: return "scattered";
174 case PIN_STICKYDIRS: return "stickydirs";
175 //case PIN_PURGING: return "purging";
176 case PIN_FREEZING: return "freezing";
177 case PIN_FROZEN: return "frozen";
178 case PIN_IMPORTINGCAPS: return "importingcaps";
179 case PIN_EXPORTINGCAPS: return "exportingcaps";
180 case PIN_PASTSNAPPARENT: return "pastsnapparent";
181 case PIN_OPENINGSNAPPARENTS: return "openingsnapparents";
182 case PIN_TRUNCATING: return "truncating";
183 case PIN_STRAY: return "stray";
184 case PIN_NEEDSNAPFLUSH: return "needsnapflush";
185 case PIN_DIRTYRSTAT: return "dirtyrstat";
186 case PIN_DIRTYPARENT: return "dirtyparent";
187 case PIN_DIRWAITER: return "dirwaiter";
188 case PIN_SCRUBQUEUE: return "scrubqueue";
189 default: return generic_pin_name(p);
190 }
191 }
192
193 // -- state --
194 static const int STATE_EXPORTING = (1<<2); // on nonauth bystander.
195 static const int STATE_OPENINGDIR = (1<<5);
196 static const int STATE_FREEZING = (1<<7);
197 static const int STATE_FROZEN = (1<<8);
198 static const int STATE_AMBIGUOUSAUTH = (1<<9);
199 static const int STATE_EXPORTINGCAPS = (1<<10);
200 static const int STATE_NEEDSRECOVER = (1<<11);
201 static const int STATE_RECOVERING = (1<<12);
202 static const int STATE_PURGING = (1<<13);
203 static const int STATE_DIRTYPARENT = (1<<14);
204 static const int STATE_DIRTYRSTAT = (1<<15);
205 static const int STATE_STRAYPINNED = (1<<16);
206 static const int STATE_FROZENAUTHPIN = (1<<17);
207 static const int STATE_DIRTYPOOL = (1<<18);
208 static const int STATE_REPAIRSTATS = (1<<19);
209 static const int STATE_MISSINGOBJS = (1<<20);
210 static const int STATE_EVALSTALECAPS = (1<<21);
31f18b77 211 static const int STATE_QUEUEDEXPORTPIN = (1<<22);
7c673cae
FG
212 // orphan inode needs notification of releasing reference
213 static const int STATE_ORPHAN = STATE_NOTIFYREF;
214
215 static const int MASK_STATE_EXPORTED =
216 (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL);
217 static const int MASK_STATE_EXPORT_KEPT =
3efd9988 218 (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS|STATE_QUEUEDEXPORTPIN);
7c673cae
FG
219
220 // -- waiters --
221 static const uint64_t WAIT_DIR = (1<<0);
222 static const uint64_t WAIT_FROZEN = (1<<1);
223 static const uint64_t WAIT_TRUNC = (1<<2);
224 static const uint64_t WAIT_FLOCK = (1<<3);
225
226 static const uint64_t WAIT_ANY_MASK = (uint64_t)(-1);
227
228 // misc
229 static const unsigned EXPORT_NONCE = 1; // nonce given to replicas created by export
230
231 ostream& print_db_line_prefix(ostream& out) override;
232
233 public:
234 MDCache *mdcache;
235
94b18763
FG
236 SnapRealm *snaprealm = nullptr;
237 SnapRealm *containing_realm = nullptr;
7c673cae 238 snapid_t first, last;
94b18763 239 mempool::mds_co::compact_set<snapid_t> dirty_old_rstats;
7c673cae
FG
240
241 class scrub_stamp_info_t {
242 public:
243 /// version we started our latest scrub (whether in-progress or finished)
94b18763 244 version_t scrub_start_version = 0;
7c673cae
FG
245 /// time we started our latest scrub (whether in-progress or finished)
246 utime_t scrub_start_stamp;
247 /// version we started our most recent finished scrub
94b18763 248 version_t last_scrub_version = 0;
7c673cae
FG
249 /// time we started our most recent finished scrub
250 utime_t last_scrub_stamp;
94b18763 251 scrub_stamp_info_t() {}
7c673cae 252 void reset() {
b32b8144
FG
253 scrub_start_version = last_scrub_version = 0;
254 scrub_start_stamp = last_scrub_stamp = utime_t();
7c673cae
FG
255 }
256 };
257
258 class scrub_info_t : public scrub_stamp_info_t {
259 public:
94b18763
FG
260 CDentry *scrub_parent = nullptr;
261 MDSInternalContextBase *on_finish = nullptr;
7c673cae 262
94b18763
FG
263 bool last_scrub_dirty = false; /// are our stamps dirty with respect to disk state?
264 bool scrub_in_progress = false; /// are we currently scrubbing?
265 bool children_scrubbed = false;
7c673cae
FG
266
267 /// my own (temporary) stamps and versions for each dirfrag we have
94b18763 268 std::map<frag_t, scrub_stamp_info_t> dirfrag_stamps; // XXX not part of mempool
7c673cae 269
b32b8144 270 ScrubHeaderRef header;
7c673cae 271
94b18763 272 scrub_info_t() {}
7c673cae
FG
273 };
274
275 const scrub_info_t *scrub_info() const{
276 if (!scrub_infop)
277 scrub_info_create();
278 return scrub_infop;
279 }
280
b32b8144
FG
281 ScrubHeaderRef get_scrub_header() {
282 if (scrub_infop == nullptr) {
283 return nullptr;
284 } else {
285 return scrub_infop->header;
286 }
287 }
288
7c673cae
FG
289 bool scrub_is_in_progress() const {
290 return (scrub_infop && scrub_infop->scrub_in_progress);
291 }
292 /**
293 * Start scrubbing on this inode. That could be very short if it's
294 * a file, or take a long time if we're recursively scrubbing a directory.
295 * @pre It is not currently scrubbing
296 * @post it has set up internal scrubbing state
297 * @param scrub_version What version are we scrubbing at (usually, parent
298 * directory's get_projected_version())
299 */
300 void scrub_initialize(CDentry *scrub_parent,
b32b8144 301 ScrubHeaderRef& header,
7c673cae
FG
302 MDSInternalContextBase *f);
303 /**
304 * Get the next dirfrag to scrub. Gives you a frag_t in output param which
305 * you must convert to a CDir (and possibly load off disk).
306 * @param dir A pointer to frag_t, will be filled in with the next dirfrag to
307 * scrub if there is one.
308 * @returns 0 on success, you should scrub the passed-out frag_t right now;
309 * ENOENT: There are no remaining dirfrags to scrub
310 * <0 There was some other error (It will return -ENOTDIR if not a directory)
311 */
312 int scrub_dirfrag_next(frag_t* out_dirfrag);
313 /**
314 * Get the currently scrubbing dirfrags. When returned, the
315 * passed-in list will be filled in with all frag_ts which have
316 * been returned from scrub_dirfrag_next but not sent back
317 * via scrub_dirfrag_finished.
318 */
319 void scrub_dirfrags_scrubbing(list<frag_t> *out_dirfrags);
320 /**
321 * Report to the CInode that a dirfrag it owns has been scrubbed. Call
322 * this for every frag_t returned from scrub_dirfrag_next().
323 * @param dirfrag The frag_t that was scrubbed
324 */
325 void scrub_dirfrag_finished(frag_t dirfrag);
326 /**
327 * Call this once the scrub has been completed, whether it's a full
328 * recursive scrub on a directory or simply the data on a file (or
329 * anything in between).
330 * @param c An out param which is filled in with a Context* that must
331 * be complete()ed.
332 */
333 void scrub_finished(MDSInternalContextBase **c);
334 /**
335 * Report to the CInode that alldirfrags it owns have been scrubbed.
336 */
337 void scrub_children_finished() {
338 scrub_infop->children_scrubbed = true;
339 }
340 void scrub_set_finisher(MDSInternalContextBase *c) {
341 assert(!scrub_infop->on_finish);
342 scrub_infop->on_finish = c;
343 }
344
345private:
346 /**
347 * Create a scrub_info_t struct for the scrub_infop pointer.
348 */
349 void scrub_info_create() const;
350 /**
351 * Delete the scrub_info_t struct if it's not got any useful data
352 */
353 void scrub_maybe_delete_info();
354public:
355
356 bool is_multiversion() const {
357 return snaprealm || // other snaprealms will link to me
358 inode.is_dir() || // links to me in other snaps
359 inode.nlink > 1 || // there are remote links, possibly snapped, that will need to find me
360 !old_inodes.empty(); // once multiversion, always multiversion. until old_inodes gets cleaned out.
361 }
362 snapid_t get_oldest_snap();
363
94b18763 364 uint64_t last_journaled = 0; // log offset for the last time i was journaled
7c673cae
FG
365 //loff_t last_open_journaled; // log offset for the last journaled EOpen
366 utime_t last_dirstat_prop;
367
368
369 // list item node for when we have unpropagated rstat data
370 elist<CInode*>::item dirty_rstat_item;
371
372 bool is_dirty_rstat() {
373 return state_test(STATE_DIRTYRSTAT);
374 }
375 void mark_dirty_rstat();
376 void clear_dirty_rstat();
377
94b18763 378 //bool hack_accessed = false;
7c673cae
FG
379 //utime_t hack_load_stamp;
380
381 /**
382 * Projection methods, used to store inode changes until they have been journaled,
383 * at which point they are popped.
384 * Usage:
94b18763
FG
385 * project_inode as needed. If you're changing xattrs or sr_t, then pass
386 * xattr=true and/or snap=true, then change the xattrs/snapnode member as
387 * needed. (Dirty exception: project_past_snaprealm_parent allows you to
388 * project the snapnode after doing project_inode, i.e. you don't need to
389 * pass snap=true.)
7c673cae
FG
390 *
391 * Then, journal. Once journaling is done, pop_and_dirty_projected_inode.
392 * This function will take care of the inode itself, the xattrs, and the snaprealm.
393 */
394
94b18763
FG
395 class projected_inode {
396 public:
397 mempool_inode inode;
398 std::unique_ptr<mempool_xattr_map> xattrs;
399 std::unique_ptr<sr_t> snapnode;
400
401 projected_inode() = delete;
402 projected_inode(const mempool_inode &in) : inode(in) {}
7c673cae 403 };
94b18763
FG
404
405private:
406 mempool::mds_co::list<projected_inode> projected_nodes; // projected values (only defined while dirty)
407 size_t num_projected_xattrs = 0;
408 size_t num_projected_srnodes = 0;
409
410 sr_t &project_snaprealm(projected_inode &pi);
411public:
412 CInode::projected_inode &project_inode(bool xattr = false, bool snap = false);
7c673cae
FG
413 void pop_and_dirty_projected_inode(LogSegment *ls);
414
94b18763 415 projected_inode *get_projected_node() {
7c673cae
FG
416 if (projected_nodes.empty())
417 return NULL;
418 else
94b18763 419 return &projected_nodes.back();
7c673cae
FG
420 }
421
422 version_t get_projected_version() const {
423 if (projected_nodes.empty())
424 return inode.version;
425 else
94b18763 426 return projected_nodes.back().inode.version;
7c673cae
FG
427 }
428 bool is_projected() const {
429 return !projected_nodes.empty();
430 }
431
94b18763 432 const mempool_inode *get_projected_inode() const {
7c673cae
FG
433 if (projected_nodes.empty())
434 return &inode;
435 else
94b18763 436 return &projected_nodes.back().inode;
7c673cae 437 }
94b18763 438 mempool_inode *get_projected_inode() {
7c673cae
FG
439 if (projected_nodes.empty())
440 return &inode;
441 else
94b18763 442 return &projected_nodes.back().inode;
7c673cae 443 }
94b18763 444 mempool_inode *get_previous_projected_inode() {
7c673cae 445 assert(!projected_nodes.empty());
94b18763
FG
446 auto it = projected_nodes.rbegin();
447 ++it;
448 if (it != projected_nodes.rend())
449 return &it->inode;
7c673cae
FG
450 else
451 return &inode;
452 }
453
94b18763 454 mempool_xattr_map *get_projected_xattrs() {
7c673cae 455 if (num_projected_xattrs > 0) {
94b18763
FG
456 for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
457 if (it->xattrs)
458 return it->xattrs.get();
7c673cae
FG
459 }
460 return &xattrs;
461 }
94b18763
FG
462 mempool_xattr_map *get_previous_projected_xattrs() {
463 if (num_projected_xattrs > 0) {
464 for (auto it = ++projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
465 if (it->xattrs)
466 return it->xattrs.get();
467 }
7c673cae
FG
468 return &xattrs;
469 }
470
7c673cae
FG
471 const sr_t *get_projected_srnode() const {
472 if (num_projected_srnodes > 0) {
94b18763
FG
473 for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
474 if (it->snapnode)
475 return it->snapnode.get();
7c673cae
FG
476 }
477 if (snaprealm)
478 return &snaprealm->srnode;
479 else
480 return NULL;
481 }
482 sr_t *get_projected_srnode() {
483 if (num_projected_srnodes > 0) {
94b18763
FG
484 for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
485 if (it->snapnode)
486 return it->snapnode.get();
7c673cae
FG
487 }
488 if (snaprealm)
489 return &snaprealm->srnode;
490 else
491 return NULL;
492 }
493 void project_past_snaprealm_parent(SnapRealm *newparent);
494
495private:
496 void pop_projected_snaprealm(sr_t *next_snaprealm);
497
498public:
94b18763 499 mempool_old_inode& cow_old_inode(snapid_t follows, bool cow_head);
7c673cae 500 void split_old_inode(snapid_t snap);
94b18763 501 mempool_old_inode *pick_old_inode(snapid_t last);
7c673cae
FG
502 void pre_cow_old_inode();
503 void purge_stale_snap_data(const std::set<snapid_t>& snaps);
504
505 // -- cache infrastructure --
506private:
94b18763 507 mempool::mds_co::compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode
1adf2230
AA
508
509 //for the purpose of quickly determining whether there's a subtree root or exporting dir
510 int num_subtree_roots = 0;
511 int num_exporting_dirs = 0;
512
94b18763
FG
513 int stickydir_ref = 0;
514 scrub_info_t *scrub_infop = nullptr;
7c673cae
FG
515
516public:
517 bool has_dirfrags() { return !dirfrags.empty(); }
518 CDir* get_dirfrag(frag_t fg) {
519 if (dirfrags.count(fg)) {
520 //assert(g_conf->debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME
521 return dirfrags[fg];
522 } else
523 return NULL;
524 }
525 bool get_dirfrags_under(frag_t fg, std::list<CDir*>& ls);
526 CDir* get_approx_dirfrag(frag_t fg);
91327a77
AA
527
528 template<typename Container>
529 void get_dirfrags(Container& ls) const {
530 // all dirfrags
531 for (const auto &p : dirfrags)
532 ls.push_back(p.second);
533 }
534 template<typename Container>
535 void get_nested_dirfrags(Container& ls) const {
536 // dirfrags in same subtree
537 for (const auto &p : dirfrags) {
538 typename Container::value_type dir = p.second;
539 if (!dir->is_subtree_root())
540 ls.push_back(dir);
541 }
542 }
543 template<typename Container>
544 void get_subtree_dirfrags(Container& ls) {
545 // dirfrags that are roots of new subtrees
546 for (const auto &p : dirfrags) {
547 typename Container::value_type dir = p.second;
548 if (dir->is_subtree_root())
549 ls.push_back(dir);
550 }
551 }
552
7c673cae
FG
553 CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg);
554 CDir *add_dirfrag(CDir *dir);
555 void close_dirfrag(frag_t fg);
556 void close_dirfrags();
557 bool has_subtree_root_dirfrag(int auth=-1);
558 bool has_subtree_or_exporting_dirfrag();
559
560 void force_dirfrags();
561 void verify_dirfrags();
562
563 void get_stickydirs();
564 void put_stickydirs();
565
566 protected:
567 // parent dentries in cache
94b18763
FG
568 CDentry *parent = nullptr; // primary link
569 mempool::mds_co::compact_set<CDentry*> remote_parents; // if hard linked
7c673cae 570
94b18763 571 mempool::mds_co::list<CDentry*> projected_parent; // for in-progress rename, (un)link, etc.
7c673cae 572
94b18763 573 mds_authority_t inode_auth = CDIR_AUTH_DEFAULT;
7c673cae
FG
574
575 // -- distributed state --
576protected:
577 // file capabilities
94b18763
FG
578 using cap_map = mempool::mds_co::map<client_t, Capability*>;
579 cap_map client_caps; // client -> caps
580 mempool::mds_co::compact_map<int32_t, int32_t> mds_caps_wanted; // [auth] mds -> caps wanted
581 int replica_caps_wanted = 0; // [replica] what i've requested from auth
7c673cae
FG
582
583public:
94b18763
FG
584 mempool::mds_co::compact_map<int, mempool::mds_co::set<client_t> > client_snap_caps; // [auth] [snap] dirty metadata we still need from the head
585 mempool::mds_co::compact_map<snapid_t, mempool::mds_co::set<client_t> > client_need_snapflush;
7c673cae
FG
586
587 void add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
588 void remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client);
589 bool split_need_snapflush(CInode *cowin, CInode *in);
590
591protected:
592
94b18763
FG
593 ceph_lock_state_t *fcntl_locks = nullptr;
594 ceph_lock_state_t *flock_locks = nullptr;
7c673cae
FG
595
596 ceph_lock_state_t *get_fcntl_lock_state() {
597 if (!fcntl_locks)
598 fcntl_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FCNTL);
599 return fcntl_locks;
600 }
601 void clear_fcntl_lock_state() {
602 delete fcntl_locks;
603 fcntl_locks = NULL;
604 }
605 ceph_lock_state_t *get_flock_lock_state() {
606 if (!flock_locks)
607 flock_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FLOCK);
608 return flock_locks;
609 }
610 void clear_flock_lock_state() {
611 delete flock_locks;
612 flock_locks = NULL;
613 }
614 void clear_file_locks() {
615 clear_fcntl_lock_state();
616 clear_flock_lock_state();
617 }
618 void _encode_file_locks(bufferlist& bl) const {
619 bool has_fcntl_locks = fcntl_locks && !fcntl_locks->empty();
620 ::encode(has_fcntl_locks, bl);
621 if (has_fcntl_locks)
622 ::encode(*fcntl_locks, bl);
623 bool has_flock_locks = flock_locks && !flock_locks->empty();
624 ::encode(has_flock_locks, bl);
625 if (has_flock_locks)
626 ::encode(*flock_locks, bl);
627 }
628 void _decode_file_locks(bufferlist::iterator& p) {
629 bool has_fcntl_locks;
630 ::decode(has_fcntl_locks, p);
631 if (has_fcntl_locks)
632 ::decode(*get_fcntl_lock_state(), p);
633 else
634 clear_fcntl_lock_state();
635 bool has_flock_locks;
636 ::decode(has_flock_locks, p);
637 if (has_flock_locks)
638 ::decode(*get_flock_lock_state(), p);
639 else
640 clear_flock_lock_state();
641 }
642
643 // LogSegment lists i (may) belong to
644public:
645 elist<CInode*>::item item_dirty;
646 elist<CInode*>::item item_caps;
647 elist<CInode*>::item item_open_file;
648 elist<CInode*>::item item_dirty_parent;
649 elist<CInode*>::item item_dirty_dirfrag_dir;
650 elist<CInode*>::item item_dirty_dirfrag_nest;
651 elist<CInode*>::item item_dirty_dirfrag_dirfragtree;
652 elist<CInode*>::item item_scrub;
653
b32b8144
FG
654 // also update RecoveryQueue::RecoveryQueue() if you change this
655 elist<CInode*>::item& item_recover_queue = item_dirty_dirfrag_dir;
656 elist<CInode*>::item& item_recover_queue_front = item_dirty_dirfrag_nest;
657
7c673cae 658public:
94b18763 659 int auth_pin_freeze_allowance = 0;
7c673cae
FG
660
661 inode_load_vec_t pop;
28e407b8 662 elist<CInode*>::item item_pop_lru;
7c673cae
FG
663
664 // friends
665 friend class Server;
666 friend class Locker;
667 friend class Migrator;
668 friend class MDCache;
669 friend class StrayManager;
670 friend class CDir;
671 friend class CInodeExport;
7c673cae
FG
672
673 // ---------------------------
94b18763 674 CInode() = delete;
7c673cae
FG
675 CInode(MDCache *c, bool auth=true, snapid_t f=2, snapid_t l=CEPH_NOSNAP) :
676 mdcache(c),
7c673cae 677 first(f), last(l),
94b18763
FG
678 item_dirty(this),
679 item_caps(this),
680 item_open_file(this),
681 item_dirty_parent(this),
7c673cae
FG
682 item_dirty_dirfrag_dir(this),
683 item_dirty_dirfrag_nest(this),
684 item_dirty_dirfrag_dirfragtree(this),
7c673cae
FG
685 pop(ceph_clock_now()),
686 versionlock(this, &versionlock_type),
687 authlock(this, &authlock_type),
688 linklock(this, &linklock_type),
689 dirfragtreelock(this, &dirfragtreelock_type),
690 filelock(this, &filelock_type),
691 xattrlock(this, &xattrlock_type),
692 snaplock(this, &snaplock_type),
693 nestlock(this, &nestlock_type),
694 flocklock(this, &flocklock_type),
94b18763 695 policylock(this, &policylock_type)
7c673cae 696 {
7c673cae
FG
697 if (auth) state_set(STATE_AUTH);
698 }
699 ~CInode() override {
700 close_dirfrags();
701 close_snaprealm();
702 clear_file_locks();
703 assert(num_projected_xattrs == 0);
704 assert(num_projected_srnodes == 0);
1adf2230
AA
705 assert(num_subtree_roots == 0);
706 assert(num_exporting_dirs == 0);
7c673cae
FG
707 }
708
709
710 // -- accessors --
711 bool is_root() const { return inode.ino == MDS_INO_ROOT; }
712 bool is_stray() const { return MDS_INO_IS_STRAY(inode.ino); }
713 mds_rank_t get_stray_owner() const {
714 return (mds_rank_t)MDS_INO_STRAY_OWNER(inode.ino);
715 }
716 bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(inode.ino); }
717 bool is_base() const { return is_root() || is_mdsdir(); }
718 bool is_system() const { return inode.ino < MDS_INO_SYSTEM_BASE; }
719 bool is_normal() const { return !(is_base() || is_system() || is_stray()); }
720
721 bool is_head() const { return last == CEPH_NOSNAP; }
722
723 // note: this overloads MDSCacheObject
724 bool is_ambiguous_auth() const {
725 return state_test(STATE_AMBIGUOUSAUTH) ||
726 MDSCacheObject::is_ambiguous_auth();
727 }
728 void set_ambiguous_auth() {
729 state_set(STATE_AMBIGUOUSAUTH);
730 }
731 void clear_ambiguous_auth(std::list<MDSInternalContextBase*>& finished);
732 void clear_ambiguous_auth();
733
734 inodeno_t ino() const { return inode.ino; }
735 vinodeno_t vino() const { return vinodeno_t(inode.ino, last); }
736 int d_type() const { return IFTODT(inode.mode); }
737
94b18763 738 mempool_inode& get_inode() { return inode; }
7c673cae
FG
739 CDentry* get_parent_dn() { return parent; }
740 const CDentry* get_parent_dn() const { return parent; }
741 const CDentry* get_projected_parent_dn() const { return !projected_parent.empty() ? projected_parent.back() : parent; }
742 CDentry* get_projected_parent_dn() { return !projected_parent.empty() ? projected_parent.back() : parent; }
743 CDir *get_parent_dir();
744 const CDir *get_projected_parent_dir() const;
745 CDir *get_projected_parent_dir();
746 CInode *get_parent_inode();
747
748 bool is_lt(const MDSCacheObject *r) const override {
749 const CInode *o = static_cast<const CInode*>(r);
750 return ino() < o->ino() ||
751 (ino() == o->ino() && last < o->last);
752 }
753
754 // -- misc --
755 bool is_projected_ancestor_of(CInode *other);
756
757 void make_path_string(std::string& s, bool projected=false, const CDentry *use_parent=NULL) const;
758 void make_path(filepath& s, bool projected=false) const;
759 void name_stray_dentry(std::string& dname);
760
761 // -- dirtyness --
762 version_t get_version() const { return inode.version; }
763
764 version_t pre_dirty();
765 void _mark_dirty(LogSegment *ls);
766 void mark_dirty(version_t projected_dirv, LogSegment *ls);
767 void mark_clean();
768
769 void store(MDSInternalContextBase *fin);
770 void _stored(int r, version_t cv, Context *fin);
771 /**
772 * Flush a CInode to disk. This includes the backtrace, the parent
773 * directory's link, and the Inode object itself (if a base directory).
774 * @pre is_auth() on both the inode and its containing directory
775 * @pre can_auth_pin()
776 * @param fin The Context to call when the flush is completed.
777 */
778 void flush(MDSInternalContextBase *fin);
779 void fetch(MDSInternalContextBase *fin);
780 void _fetched(bufferlist& bl, bufferlist& bl2, Context *fin);
781
782
783 void build_backtrace(int64_t pool, inode_backtrace_t& bt);
784 void store_backtrace(MDSInternalContextBase *fin, int op_prio=-1);
785 void _stored_backtrace(int r, version_t v, Context *fin);
786 void fetch_backtrace(Context *fin, bufferlist *backtrace);
787protected:
788 /**
789 * Return the pool ID where we currently write backtraces for
790 * this inode (in addition to inode.old_pools)
791 *
792 * @returns a pool ID >=0
793 */
794 int64_t get_backtrace_pool() const;
795public:
28e407b8 796 void mark_dirty_parent(LogSegment *ls, bool dirty_pool=false);
7c673cae
FG
797 void clear_dirty_parent();
798 void verify_diri_backtrace(bufferlist &bl, int err);
799 bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); }
800 bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); }
801
802 void encode_snap_blob(bufferlist &bl);
803 void decode_snap_blob(bufferlist &bl);
804 void encode_store(bufferlist& bl, uint64_t features);
805 void decode_store(bufferlist::iterator& bl);
806
b32b8144 807 void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features, bool need_recover) {
7c673cae
FG
808 assert(is_auth());
809
810 // relax locks?
811 if (!is_replicated())
812 replicate_relax_locks();
813
814 __u32 nonce = add_replica(rep);
815 ::encode(nonce, bl);
816
817 _encode_base(bl, features);
b32b8144 818 _encode_locks_state_for_replica(bl, need_recover);
7c673cae
FG
819 }
820 void decode_replica(bufferlist::iterator& p, bool is_new) {
821 __u32 nonce;
822 ::decode(nonce, p);
823 replica_nonce = nonce;
824
825 _decode_base(p);
826 _decode_locks_state(p, is_new);
827 }
828
829 // -- waiting --
830protected:
94b18763 831 mempool::mds_co::compact_map<frag_t, std::list<MDSInternalContextBase*> > waiting_on_dir;
7c673cae
FG
832public:
833 void add_dir_waiter(frag_t fg, MDSInternalContextBase *c);
834 void take_dir_waiting(frag_t fg, std::list<MDSInternalContextBase*>& ls);
835 bool is_waiting_for_dir(frag_t fg) {
836 return waiting_on_dir.count(fg);
837 }
838 void add_waiter(uint64_t tag, MDSInternalContextBase *c) override;
839 void take_waiting(uint64_t tag, std::list<MDSInternalContextBase*>& ls) override;
840
841 // -- encode/decode helpers --
842 void _encode_base(bufferlist& bl, uint64_t features);
843 void _decode_base(bufferlist::iterator& p);
844 void _encode_locks_full(bufferlist& bl);
845 void _decode_locks_full(bufferlist::iterator& p);
b32b8144 846 void _encode_locks_state_for_replica(bufferlist& bl, bool need_recover);
7c673cae
FG
847 void _encode_locks_state_for_rejoin(bufferlist& bl, int rep);
848 void _decode_locks_state(bufferlist::iterator& p, bool is_new);
849 void _decode_locks_rejoin(bufferlist::iterator& p, std::list<MDSInternalContextBase*>& waiters,
b32b8144 850 std::list<SimpleLock*>& eval_locks, bool survivor);
7c673cae
FG
851
852 // -- import/export --
853 void encode_export(bufferlist& bl);
854 void finish_export(utime_t now);
855 void abort_export() {
856 put(PIN_TEMPEXPORTING);
857 assert(state_test(STATE_EXPORTINGCAPS));
858 state_clear(STATE_EXPORTINGCAPS);
859 put(PIN_EXPORTINGCAPS);
860 }
861 void decode_import(bufferlist::iterator& p, LogSegment *ls);
862
863
864 // for giving to clients
865 int encode_inodestat(bufferlist& bl, Session *session, SnapRealm *realm,
866 snapid_t snapid=CEPH_NOSNAP, unsigned max_bytes=0,
867 int getattr_wants=0);
868 void encode_cap_message(MClientCaps *m, Capability *cap);
869
870
871 // -- locks --
872public:
// One static LockType per lock member declared further down
// (<name>lock_type pairs with <name>lock).
873 static LockType versionlock_type;
874 static LockType authlock_type;
875 static LockType linklock_type;
876 static LockType dirfragtreelock_type;
877 static LockType filelock_type;
878 static LockType xattrlock_type;
879 static LockType snaplock_type;
880 static LockType nestlock_type;
881 static LockType flocklock_type;
882 static LockType policylock_type;
883
94b18763 884 // FIXME not part of mempool
7c673cae
FG
// The lock objects themselves. get_lock() hands out pointers to all of these
// except versionlock, which has no case in that switch.
885 LocalLock versionlock;
886 SimpleLock authlock;
887 SimpleLock linklock;
888 ScatterLock dirfragtreelock;
889 ScatterLock filelock;
890 SimpleLock xattrlock;
891 SimpleLock snaplock;
892 ScatterLock nestlock;
893 SimpleLock flocklock;
894 SimpleLock policylock;
895
896 SimpleLock* get_lock(int type) override {
897 switch (type) {
898 case CEPH_LOCK_IFILE: return &filelock;
899 case CEPH_LOCK_IAUTH: return &authlock;
900 case CEPH_LOCK_ILINK: return &linklock;
901 case CEPH_LOCK_IDFT: return &dirfragtreelock;
902 case CEPH_LOCK_IXATTR: return &xattrlock;
903 case CEPH_LOCK_ISNAP: return &snaplock;
904 case CEPH_LOCK_INEST: return &nestlock;
905 case CEPH_LOCK_IFLOCK: return &flocklock;
906 case CEPH_LOCK_IPOLICY: return &policylock;
907 }
908 return 0;
909 }
910
// Overrides (marked `override`) of the inherited object-info / lock-state
// interface; the bodies are outside this chunk.
911 void set_object_info(MDSCacheObjectInfo &info) override;
912 void encode_lock_state(int type, bufferlist& bl) override;
913 void decode_lock_state(int type, bufferlist& bl) override;
914
915 void _finish_frag_update(CDir *dir, MutationRef& mut);
916
917 void clear_dirty_scattered(int type) override;
918 bool is_dirty_scattered();
919 void clear_scatter_dirty(); // on rejoin ack
920
// Scatter-related declarations. The `int type` parameters presumably take a
// CEPH_LOCK_I* value like get_lock() does -- TODO confirm.
921 void start_scatter(ScatterLock *lock);
922 void finish_scatter_update(ScatterLock *lock, CDir *dir,
923 version_t inode_version, version_t dir_accounted_version);
924 void finish_scatter_gather_update(int type);
925 void finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob);
926
927 // -- snap --
// Snap realm declarations; the bodies are outside this chunk.
// Both open/close take an optional flag that defaults to false.
928 void open_snaprealm(bool no_split=false);
929 void close_snaprealm(bool no_join=false);
930 SnapRealm *find_snaprealm() const;
931 void encode_snap(bufferlist& bl);
932 void decode_snap(bufferlist::iterator& p);
933
934 // -- caps -- (new)
935 // client caps
// Current and wanted loner client ids; both start out as -1.
94b18763 936 client_t loner_cap = -1, want_loner_cap = -1;
7c673cae
FG
937
// Plain read accessors for the two fields above.
938 client_t get_loner() const { return loner_cap; }
939 client_t get_wanted_loner() const { return want_loner_cap; }
940
941 // this is the loner state our locks should aim for
942 client_t get_target_loner() const {
943 if (loner_cap == want_loner_cap)
944 return loner_cap;
945 else
946 return -1;
947 }
948
// Loner management declarations; the bodies are outside this chunk.
// NOTE(review): the meaning of the bool results is not visible here.
949 client_t calc_ideal_loner();
7c673cae 950 void set_loner_cap(client_t l);
b32b8144
FG
951 bool choose_ideal_loner();
952 bool try_set_loner();
7c673cae
FG
953 bool try_drop_loner();
954
955 // choose new lock state during recovery, based on issued caps
956 void choose_lock_state(SimpleLock *lock, int allissued);
957 void choose_lock_states(int dirty_caps);
958
959 int count_nonstale_caps() {
960 int n = 0;
94b18763
FG
961 for (const auto &p : client_caps) {
962 if (!p.second->is_stale())
7c673cae 963 n++;
94b18763 964 }
7c673cae
FG
965 return n;
966 }
967 bool multiple_nonstale_caps() {
968 int n = 0;
94b18763
FG
969 for (const auto &p : client_caps) {
970 if (!p.second->is_stale()) {
7c673cae
FG
971 if (n)
972 return true;
973 n++;
974 }
94b18763 975 }
7c673cae
FG
976 return false;
977 }
978
979 bool is_any_caps() { return !client_caps.empty(); }
980 bool is_any_nonstale_caps() { return count_nonstale_caps(); }
981
94b18763
FG
// Const and mutable reference accessors for mds_caps_wanted
// (a compact_map from int32_t to int32_t).
982 const mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; }
983 mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() { return mds_caps_wanted; }
7c673cae 984
// Read-only view of the client_caps map.
94b18763 985 const cap_map& get_client_caps() const { return client_caps; }
7c673cae
FG
986 Capability *get_client_cap(client_t client) {
987 auto client_caps_entry = client_caps.find(client);
988 if (client_caps_entry != client_caps.end())
989 return client_caps_entry->second;
990 return 0;
991 }
992 int get_client_cap_pending(client_t client) const {
993 auto client_caps_entry = client_caps.find(client);
994 if (client_caps_entry != client_caps.end()) {
995 return client_caps_entry->second->pending();
996 } else {
997 return 0;
998 }
999 }
1000
// Client capability management declarations; the bodies are outside this
// chunk. conrealm defaults to a null pointer.
1001 Capability *add_client_cap(client_t client, Session *session, SnapRealm *conrealm=0);
1002 void remove_client_cap(client_t client);
1003 void move_to_realm(SnapRealm *realm);
1004
1005 Capability *reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session);
1006 void clear_client_caps_after_export();
1007 void export_client_caps(std::map<client_t,Capability::Export>& cl);
1008
1009 // caps allowed
// NOTE(review): the int results are presumably capability bit masks --
// confirm against the definitions.
1010 int get_caps_liked() const;
1011 int get_caps_allowed_ever() const;
1012 int get_caps_allowed_by_type(int type) const;
1013 int get_caps_careful() const;
1014 int get_xlocker_mask(client_t client) const;
94b18763 1015 int get_caps_allowed_for_client(Session *s, mempool_inode *file_i) const;
7c673cae
FG
1016
1017 // caps issued, wanted
// The pointer parameters are optional (default null); shift defaults to 0
// and mask to -1.
1018 int get_caps_issued(int *ploner = 0, int *pother = 0, int *pxlocker = 0,
1019 int shift = 0, int mask = -1);
1020 bool is_any_caps_wanted() const;
1021 int get_caps_wanted(int *ploner = 0, int *pother = 0, int shift = 0, int mask = -1) const;
1022 bool issued_caps_need_gather(SimpleLock *lock);
1023 void replicate_relax_locks();
1024
1025 // -- authority --
// Override of the inherited authority() query; the body is outside this chunk.
1026 mds_authority_t authority() const override;
1027
1028 // -- auth pins --
1029 void adjust_nested_auth_pins(int a, void *by);
// err_ret is optional (defaults to nullptr).
// NOTE(review): presumably receives a reason code when the result is
// false -- confirm.
91327a77 1030 bool can_auth_pin(int *err_ret=nullptr) const override;
7c673cae
FG
1031 void auth_pin(void *by) override;
1032 void auth_unpin(void *by) override;
1033
1034 // -- freeze --
// State-bit predicates: each one tests a single STATE_* flag via state_test().
1035 bool is_freezing_inode() const { return state_test(STATE_FREEZING); }
1036 bool is_frozen_inode() const { return state_test(STATE_FROZEN); }
1037 bool is_frozen_auth_pin() const { return state_test(STATE_FROZENAUTHPIN); }
// Declared here, defined outside this chunk.
1038 bool is_frozen() const override;
1039 bool is_frozen_dir() const;
1040 bool is_freezing() const override;
1041
1042 /* Freeze the inode. auth_pin_allowance lets the caller account for any
1043 * auth_pins it is itself holding/responsible for. */
1044 bool freeze_inode(int auth_pin_allowance=0);
// Two overloads: one takes a list of contexts by reference, one takes nothing.
1045 void unfreeze_inode(std::list<MDSInternalContextBase*>& finished);
1046 void unfreeze_inode();
1047
1048 void freeze_auth_pin();
1049 void unfreeze_auth_pin();
1050
1051 // -- reference counting --
// Override of the inherited bad_put() hook. Logs this inode, the pin id
// `by`, its pin_name() and the current ref count through generic_dout(0)
// (plus ref_map when MDS_REF_SET is defined), then asserts ref > 0
// (and ref_map[by] > 0 under MDS_REF_SET).
// NOTE(review): the condition under which the base class calls this is not
// visible in this chunk.
1052 void bad_put(int by) override {
1053 generic_dout(0) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref
1054#ifdef MDS_REF_SET
1055 << " (" << ref_map << ")"
1056#endif
1057 << dendl;
1058#ifdef MDS_REF_SET
1059 assert(ref_map[by] > 0);
1060#endif
1061 assert(ref > 0);
1062 }
// Override of the inherited bad_get() hook. Logs the same fields as
// bad_put() with a " bad get " prefix. Its only assertion
// (ref_map[by] >= 0) is compiled in when MDS_REF_SET is defined; otherwise
// this function just logs.
1063 void bad_get(int by) override {
1064 generic_dout(0) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref
1065#ifdef MDS_REF_SET
1066 << " (" << ref_map << ")"
1067#endif
1068 << dendl;
1069#ifdef MDS_REF_SET
1070 assert(ref_map[by] >= 0);
1071#endif
1072 }
// Reference-count hook overrides; the bodies are outside this chunk.
1073 void first_get() override;
1074 void last_put() override;
1075 void _put() override;
1076
1077
1078 // -- hierarchy stuff --
1079public:
// Set the primary parent dentry to p. Asserts that no parent is currently
// set, unless the "mds_hack_allow_loading_invalid_metadata" config option
// is true, in which case an existing parent is overwritten.
1080 void set_primary_parent(CDentry *p) {
94b18763
FG
1081 assert(parent == 0 ||
1082 g_conf->get_val<bool>("mds_hack_allow_loading_invalid_metadata"));
7c673cae
FG
1083 parent = p;
1084 }
// Clear the primary parent. The caller must pass the dentry that is
// currently the parent; this is asserted before parent is reset to null.
1085 void remove_primary_parent(CDentry *dn) {
1086 assert(dn == parent);
1087 parent = 0;
1088 }
// Declarations only; the bodies are outside this chunk.
// NOTE(review): presumably these maintain remote_parents, whose size
// num_remote_parents() reports -- confirm.
1089 void add_remote_parent(CDentry *p);
1090 void remove_remote_parent(CDentry *p);
1091 int num_remote_parents() {
1092 return remote_parents.size();
1093 }
1094
// Append dn to the back of projected_parent. pop_projected_parent()
// consumes from the front, so entries are taken in insertion order.
1095 void push_projected_parent(CDentry *dn) {
1096 projected_parent.push_back(dn);
1097 }
// Make the oldest projected parent (the front of projected_parent) the
// current parent and remove it from the container. Asserts that the
// container is non-empty first.
1098 void pop_projected_parent() {
1099 assert(projected_parent.size());
1100 parent = projected_parent.front();
1101 projected_parent.pop_front();
1102 }
1103
7c673cae 1104public:
// Export pin declarations; the bodies are outside this chunk.
// Defaults: update=false, inherit=true.
31f18b77 1105 void maybe_export_pin(bool update=false);
7c673cae
FG
1106 void set_export_pin(mds_rank_t rank);
1107 mds_rank_t get_export_pin(bool inherit=true) const;
1108 bool is_exportable(mds_rank_t dest) const;
1109
// print() overrides the inherited printer; dump() writes to a Formatter.
1110 void print(ostream& out) override;
1111 void dump(Formatter *f) const;
1112
1113 /**
1114 * @defgroup Scrubbing and fsck
1115 * @{
1116 */
1117
1118 /**
1119 * Report the results of validation against a particular inode.
1120 * Each member is a pair of bools.
1121 * <member>.first represents if validation was performed against the member.
1122 * <member.second represents if the member passed validation.
1123 * performed_validation is set to true if the validation was actually
1124 * run. It might not be run if, for instance, the inode is marked as dirty.
1125 * passed_validation is set to true if everything that was checked
1126 * passed its validation.
1127 */
1128 struct validated_data {
1129 template<typename T>struct member_status {
b32b8144
FG
1130 bool checked = false;
1131 bool passed = false;
1132 bool repaired = false;
1133 int ondisk_read_retval = 0;
7c673cae
FG
1134 T ondisk_value;
1135 T memory_value;
1136 std::stringstream error_str;
7c673cae
FG
1137 };
1138
94b18763
FG
1139 bool performed_validation = false;
1140 bool passed_validation = false;
7c673cae
FG
1141
1142 struct raw_stats_t {
1143 frag_info_t dirstat;
1144 nest_info_t rstat;
1145 };
1146
1147 member_status<inode_backtrace_t> backtrace;
94b18763 1148 member_status<mempool_inode> inode; // XXX should not be in mempool; wait for pmr
7c673cae
FG
1149 member_status<raw_stats_t> raw_stats;
1150
94b18763 1151 validated_data() {}
7c673cae
FG
1152
1153 void dump(Formatter *f) const;
b32b8144
FG
1154
1155 bool all_damage_repaired() const;
7c673cae
FG
1156 };
1157
1158 /**
1159 * Validate that the on-disk state of an inode matches what
1160 * we expect from our memory state. Currently this checks that:
1161 * 1) The backtrace associated with the file data exists and is correct
1162 * 2) For directories, the actual inode metadata matches our memory state,
1163 * 3) For directories, the rstats match
1164 *
1165 * @param results A freshly-created validated_data struct, with values set
1166 * as described in the struct documentation.
1167 * @param mdr The request to be responded to upon the completion of the
1168 * validation (or NULL)
1169 * @param fin Context to call back on completion (or NULL)
1170 */
// NOTE(review): the doc comment above lists an @param mdr, but this
// declaration takes only `results` and `fin`.
1171 void validate_disk_state(validated_data *results,
1172 MDSInternalContext *fin);
// Static helper that takes a validated_data by const reference and a Formatter.
1173 static void dump_validation_results(const validated_data& results,
1174 Formatter *f);
1175private:
// Private worker taking a ValidationContinuation plus rval and stage;
// ValidationContinuation is granted friend access below.
1176 bool _validate_disk_state(class ValidationContinuation *c,
1177 int rval, int stage);
1178 friend class ValidationContinuation;
1179 /** @} Scrubbing and fsck */
1180};
1181
// Stream-insertion overload for CInode::scrub_stamp_info_t; declared here,
// with no body in this header chunk.
1182ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si);
1183
1184#undef dout_context
1185#endif