]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | ||
17 | #ifndef CEPH_CINODE_H | |
18 | #define CEPH_CINODE_H | |
19 | ||
94b18763 FG |
20 | #include <list> |
21 | #include <map> | |
22 | #include <set> | |
11fdf7f2 | 23 | #include <string_view> |
94b18763 | 24 | |
7c673cae FG |
25 | #include "common/config.h" |
26 | #include "include/counter.h" | |
27 | #include "include/elist.h" | |
28 | #include "include/types.h" | |
29 | #include "include/lru.h" | |
30 | #include "include/compact_set.h" | |
31 | ||
32 | #include "MDSCacheObject.h" | |
11fdf7f2 | 33 | #include "MDSContext.h" |
7c673cae FG |
34 | #include "flock.h" |
35 | ||
36 | #include "CDentry.h" | |
37 | #include "SimpleLock.h" | |
38 | #include "ScatterLock.h" | |
39 | #include "LocalLock.h" | |
40 | #include "Capability.h" | |
41 | #include "SnapRealm.h" | |
42 | #include "Mutation.h" | |
43 | ||
11fdf7f2 TL |
44 | #include "messages/MClientCaps.h" |
45 | ||
7c673cae FG |
46 | #define dout_context g_ceph_context |
47 | ||
48 | class Context; | |
49 | class CDentry; | |
50 | class CDir; | |
7c673cae FG |
51 | class CInode; |
52 | class MDCache; | |
53 | class LogSegment; | |
54 | struct SnapRealm; | |
55 | class Session; | |
7c673cae FG |
56 | struct ObjectOperation; |
57 | class EMetaBlob; | |
58 | ||
59 | ||
60 | ostream& operator<<(ostream& out, const CInode& in); | |
61 | ||
// One entry of the cinode_lock_info[] table: a lock id paired with a
// caps mask (presumably the write caps guarded by that lock; see the
// table's definition for the authoritative meaning).
struct cinode_lock_info_t {
  int lock;
  int wr_caps;
};
66 | ||
67 | extern cinode_lock_info_t cinode_lock_info[]; | |
68 | extern int num_cinode_locks; | |
69 | ||
70 | ||
71 | /** | |
72 | * Base class for CInode, containing the backing store data and | |
73 | * serialization methods. This exists so that we can read and | |
74 | * handle CInodes from the backing store without hitting all | |
75 | * the business logic in CInode proper. | |
76 | */ | |
77 | class InodeStoreBase { | |
78 | public: | |
94b18763 FG |
79 | typedef inode_t<mempool::mds_co::pool_allocator> mempool_inode; |
80 | typedef old_inode_t<mempool::mds_co::pool_allocator> mempool_old_inode; | |
81 | typedef mempool::mds_co::compact_map<snapid_t, mempool_old_inode> mempool_old_inode_map; | |
82 | typedef xattr_map<mempool::mds_co::pool_allocator> mempool_xattr_map; // FIXME bufferptr not in mempool | |
83 | ||
84 | mempool_inode inode; // the inode itself | |
85 | mempool::mds_co::string symlink; // symlink dest, if symlink | |
86 | mempool_xattr_map xattrs; | |
7c673cae | 87 | fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map. |
94b18763 FG |
88 | mempool_old_inode_map old_inodes; // key = last, value.first = first |
89 | snapid_t oldest_snap = CEPH_NOSNAP; | |
90 | damage_flags_t damage_flags = 0; | |
7c673cae | 91 | |
94b18763 | 92 | InodeStoreBase() {} |
7c673cae FG |
93 | |
94 | /* Helpers */ | |
95 | bool is_file() const { return inode.is_file(); } | |
96 | bool is_symlink() const { return inode.is_symlink(); } | |
97 | bool is_dir() const { return inode.is_dir(); } | |
11fdf7f2 | 98 | static object_t get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix); |
7c673cae FG |
99 | |
100 | /* Full serialization for use in ".inode" root inode objects */ | |
101 | void encode(bufferlist &bl, uint64_t features, const bufferlist *snap_blob=NULL) const; | |
11fdf7f2 | 102 | void decode(bufferlist::const_iterator &bl, bufferlist& snap_blob); |
7c673cae FG |
103 | |
104 | /* Serialization without ENCODE_START/FINISH blocks for use embedded in dentry */ | |
105 | void encode_bare(bufferlist &bl, uint64_t features, const bufferlist *snap_blob=NULL) const; | |
11fdf7f2 | 106 | void decode_bare(bufferlist::const_iterator &bl, bufferlist &snap_blob, __u8 struct_v=5); |
7c673cae FG |
107 | |
108 | /* For test/debug output */ | |
109 | void dump(Formatter *f) const; | |
110 | ||
111 | /* For use by offline tools */ | |
11fdf7f2 TL |
112 | __u32 hash_dentry_name(std::string_view dn); |
113 | frag_t pick_dirfrag(std::string_view dn); | |
7c673cae FG |
114 | }; |
115 | ||
116 | class InodeStore : public InodeStoreBase { | |
117 | public: | |
94b18763 | 118 | // FIXME bufferlist not part of mempool |
7c673cae FG |
119 | bufferlist snap_blob; // Encoded copy of SnapRealm, because we can't |
120 | // rehydrate it without full MDCache | |
121 | void encode(bufferlist &bl, uint64_t features) const { | |
122 | InodeStoreBase::encode(bl, features, &snap_blob); | |
123 | } | |
11fdf7f2 | 124 | void decode(bufferlist::const_iterator &bl) { |
7c673cae FG |
125 | InodeStoreBase::decode(bl, snap_blob); |
126 | } | |
127 | void encode_bare(bufferlist &bl, uint64_t features) const { | |
128 | InodeStoreBase::encode_bare(bl, features, &snap_blob); | |
129 | } | |
11fdf7f2 | 130 | void decode_bare(bufferlist::const_iterator &bl) { |
7c673cae FG |
131 | InodeStoreBase::decode_bare(bl, snap_blob); |
132 | } | |
133 | ||
134 | static void generate_test_instances(std::list<InodeStore*>& ls); | |
135 | }; | |
136 | WRITE_CLASS_ENCODER_FEATURES(InodeStore) | |
137 | ||
11fdf7f2 TL |
138 | // just for ceph-dencoder |
139 | class InodeStoreBare : public InodeStore { | |
140 | public: | |
141 | void encode(bufferlist &bl, uint64_t features) const { | |
142 | InodeStore::encode_bare(bl, features); | |
143 | } | |
144 | void decode(bufferlist::const_iterator &bl) { | |
145 | InodeStore::decode_bare(bl); | |
146 | } | |
147 | static void generate_test_instances(std::list<InodeStoreBare*>& ls); | |
148 | }; | |
149 | WRITE_CLASS_ENCODER_FEATURES(InodeStoreBare) | |
150 | ||
7c673cae FG |
151 | // cached inode wrapper |
152 | class CInode : public MDSCacheObject, public InodeStoreBase, public Counter<CInode> { | |
153 | public: | |
181888fb | 154 | MEMPOOL_CLASS_HELPERS(); |
7c673cae FG |
155 | // -- pins -- |
// Pin identifiers; pin_name() maps each one to a printable label.
static const int PIN_DIRFRAG            = -1;
static const int PIN_CAPS               = 2;   // client caps
static const int PIN_IMPORTING          = -4;  // importing
static const int PIN_OPENINGDIR         = 7;
static const int PIN_REMOTEPARENT       = 8;
static const int PIN_BATCHOPENJOURNAL   = 9;
static const int PIN_SCATTERED          = 10;
static const int PIN_STICKYDIRS         = 11;
//static const int PIN_PURGING          = -12;
static const int PIN_FREEZING           = 13;
static const int PIN_FROZEN             = 14;
static const int PIN_IMPORTINGCAPS      = -15;
static const int PIN_PASTSNAPPARENT     = -16;
static const int PIN_OPENINGSNAPPARENTS = 17;
static const int PIN_TRUNCATING         = 18;
static const int PIN_STRAY              = 19;  // we pin our stray inode while active
static const int PIN_NEEDSNAPFLUSH      = 20;
static const int PIN_DIRTYRSTAT         = 21;
static const int PIN_EXPORTINGCAPS      = 22;
static const int PIN_DIRTYPARENT        = 23;
static const int PIN_DIRWAITER          = 24;
static const int PIN_SCRUBQUEUE         = 25;
178 | ||
11fdf7f2 | 179 | std::string_view pin_name(int p) const override { |
7c673cae FG |
180 | switch (p) { |
181 | case PIN_DIRFRAG: return "dirfrag"; | |
182 | case PIN_CAPS: return "caps"; | |
183 | case PIN_IMPORTING: return "importing"; | |
184 | case PIN_OPENINGDIR: return "openingdir"; | |
185 | case PIN_REMOTEPARENT: return "remoteparent"; | |
186 | case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; | |
187 | case PIN_SCATTERED: return "scattered"; | |
188 | case PIN_STICKYDIRS: return "stickydirs"; | |
189 | //case PIN_PURGING: return "purging"; | |
190 | case PIN_FREEZING: return "freezing"; | |
191 | case PIN_FROZEN: return "frozen"; | |
192 | case PIN_IMPORTINGCAPS: return "importingcaps"; | |
193 | case PIN_EXPORTINGCAPS: return "exportingcaps"; | |
194 | case PIN_PASTSNAPPARENT: return "pastsnapparent"; | |
195 | case PIN_OPENINGSNAPPARENTS: return "openingsnapparents"; | |
196 | case PIN_TRUNCATING: return "truncating"; | |
197 | case PIN_STRAY: return "stray"; | |
198 | case PIN_NEEDSNAPFLUSH: return "needsnapflush"; | |
199 | case PIN_DIRTYRSTAT: return "dirtyrstat"; | |
200 | case PIN_DIRTYPARENT: return "dirtyparent"; | |
201 | case PIN_DIRWAITER: return "dirwaiter"; | |
202 | case PIN_SCRUBQUEUE: return "scrubqueue"; | |
203 | default: return generic_pin_name(p); | |
204 | } | |
205 | } | |
206 | ||
11fdf7f2 TL |
207 | // -- dump flags -- |
// Bit flags selecting which sections are included in a dump.
static const int DUMP_INODE_STORE_BASE = 1 << 0;
static const int DUMP_MDS_CACHE_OBJECT = 1 << 1;
static const int DUMP_LOCKS            = 1 << 2;
static const int DUMP_STATE            = 1 << 3;
static const int DUMP_CAPS             = 1 << 4;
static const int DUMP_PATH             = 1 << 5;
static const int DUMP_DIRFRAGS         = 1 << 6;
static const int DUMP_ALL              = -1;
// everything except the path and dirfrag sections
static const int DUMP_DEFAULT          = DUMP_ALL & ~(DUMP_PATH | DUMP_DIRFRAGS);
217 | ||
7c673cae | 218 | // -- state -- |
11fdf7f2 TL |
219 | static const int STATE_EXPORTING = (1<<0); // on nonauth bystander. |
220 | static const int STATE_OPENINGDIR = (1<<1); | |
221 | static const int STATE_FREEZING = (1<<2); | |
222 | static const int STATE_FROZEN = (1<<3); | |
223 | static const int STATE_AMBIGUOUSAUTH = (1<<4); | |
224 | static const int STATE_EXPORTINGCAPS = (1<<5); | |
225 | static const int STATE_NEEDSRECOVER = (1<<6); | |
226 | static const int STATE_RECOVERING = (1<<7); | |
227 | static const int STATE_PURGING = (1<<8); | |
228 | static const int STATE_DIRTYPARENT = (1<<9); | |
229 | static const int STATE_DIRTYRSTAT = (1<<10); | |
230 | static const int STATE_STRAYPINNED = (1<<11); | |
231 | static const int STATE_FROZENAUTHPIN = (1<<12); | |
232 | static const int STATE_DIRTYPOOL = (1<<13); | |
233 | static const int STATE_REPAIRSTATS = (1<<14); | |
234 | static const int STATE_MISSINGOBJS = (1<<15); | |
235 | static const int STATE_EVALSTALECAPS = (1<<16); | |
236 | static const int STATE_QUEUEDEXPORTPIN = (1<<17); | |
237 | static const int STATE_TRACKEDBYOFT = (1<<18); // tracked by open file table | |
eafe8130 | 238 | static const int STATE_DELAYEDEXPORTPIN = (1<<19); |
7c673cae FG |
239 | // orphan inode needs notification of releasing reference |
240 | static const int STATE_ORPHAN = STATE_NOTIFYREF; | |
241 | ||
242 | static const int MASK_STATE_EXPORTED = | |
243 | (STATE_DIRTY|STATE_NEEDSRECOVER|STATE_DIRTYPARENT|STATE_DIRTYPOOL); | |
244 | static const int MASK_STATE_EXPORT_KEPT = | |
11fdf7f2 | 245 | (STATE_FROZEN|STATE_AMBIGUOUSAUTH|STATE_EXPORTINGCAPS| |
eafe8130 | 246 | STATE_QUEUEDEXPORTPIN|STATE_TRACKEDBYOFT|STATE_DELAYEDEXPORTPIN); |
7c673cae FG |
247 | |
248 | // -- waiters -- | |
// Waiter mask bits specific to CInode.
static const uint64_t WAIT_DIR    = 1 << 0;
static const uint64_t WAIT_FROZEN = 1 << 1;
static const uint64_t WAIT_TRUNC  = 1 << 2;
static const uint64_t WAIT_FLOCK  = 1 << 3;

// all bits set
static const uint64_t WAIT_ANY_MASK = static_cast<uint64_t>(-1);
255 | ||
256 | // misc | |
257 | static const unsigned EXPORT_NONCE = 1; // nonce given to replicas created by export | |
258 | ||
259 | ostream& print_db_line_prefix(ostream& out) override; | |
260 | ||
261 | public: | |
262 | MDCache *mdcache; | |
263 | ||
94b18763 FG |
264 | SnapRealm *snaprealm = nullptr; |
265 | SnapRealm *containing_realm = nullptr; | |
7c673cae | 266 | snapid_t first, last; |
94b18763 | 267 | mempool::mds_co::compact_set<snapid_t> dirty_old_rstats; |
7c673cae FG |
268 | |
269 | class scrub_stamp_info_t { | |
270 | public: | |
271 | /// version we started our latest scrub (whether in-progress or finished) | |
94b18763 | 272 | version_t scrub_start_version = 0; |
7c673cae FG |
273 | /// time we started our latest scrub (whether in-progress or finished) |
274 | utime_t scrub_start_stamp; | |
275 | /// version we started our most recent finished scrub | |
94b18763 | 276 | version_t last_scrub_version = 0; |
7c673cae FG |
277 | /// time we started our most recent finished scrub |
278 | utime_t last_scrub_stamp; | |
94b18763 | 279 | scrub_stamp_info_t() {} |
7c673cae | 280 | void reset() { |
b32b8144 FG |
281 | scrub_start_version = last_scrub_version = 0; |
282 | scrub_start_stamp = last_scrub_stamp = utime_t(); | |
7c673cae FG |
283 | } |
284 | }; | |
285 | ||
286 | class scrub_info_t : public scrub_stamp_info_t { | |
287 | public: | |
94b18763 | 288 | CDentry *scrub_parent = nullptr; |
11fdf7f2 | 289 | MDSContext *on_finish = nullptr; |
7c673cae | 290 | |
94b18763 FG |
291 | bool last_scrub_dirty = false; /// are our stamps dirty with respect to disk state? |
292 | bool scrub_in_progress = false; /// are we currently scrubbing? | |
293 | bool children_scrubbed = false; | |
7c673cae FG |
294 | |
295 | /// my own (temporary) stamps and versions for each dirfrag we have | |
94b18763 | 296 | std::map<frag_t, scrub_stamp_info_t> dirfrag_stamps; // XXX not part of mempool |
7c673cae | 297 | |
b32b8144 | 298 | ScrubHeaderRef header; |
7c673cae | 299 | |
94b18763 | 300 | scrub_info_t() {} |
7c673cae FG |
301 | }; |
302 | ||
303 | const scrub_info_t *scrub_info() const{ | |
304 | if (!scrub_infop) | |
305 | scrub_info_create(); | |
306 | return scrub_infop; | |
307 | } | |
308 | ||
b32b8144 FG |
309 | ScrubHeaderRef get_scrub_header() { |
310 | if (scrub_infop == nullptr) { | |
311 | return nullptr; | |
312 | } else { | |
313 | return scrub_infop->header; | |
314 | } | |
315 | } | |
316 | ||
7c673cae FG |
317 | bool scrub_is_in_progress() const { |
318 | return (scrub_infop && scrub_infop->scrub_in_progress); | |
319 | } | |
320 | /** | |
321 | * Start scrubbing on this inode. That could be very short if it's | |
322 | * a file, or take a long time if we're recursively scrubbing a directory. | |
323 | * @pre It is not currently scrubbing | |
324 | * @post it has set up internal scrubbing state | |
325 | * @param scrub_version What version are we scrubbing at (usually, parent | |
326 | * directory's get_projected_version()) | |
327 | */ | |
328 | void scrub_initialize(CDentry *scrub_parent, | |
b32b8144 | 329 | ScrubHeaderRef& header, |
11fdf7f2 | 330 | MDSContext *f); |
7c673cae FG |
331 | /** |
332 | * Get the next dirfrag to scrub. Gives you a frag_t in output param which | |
333 | * you must convert to a CDir (and possibly load off disk). | |
334 | * @param dir A pointer to frag_t, will be filled in with the next dirfrag to | |
335 | * scrub if there is one. | |
336 | * @returns 0 on success, you should scrub the passed-out frag_t right now; | |
337 | * ENOENT: There are no remaining dirfrags to scrub | |
338 | * <0 There was some other error (It will return -ENOTDIR if not a directory) | |
339 | */ | |
340 | int scrub_dirfrag_next(frag_t* out_dirfrag); | |
341 | /** | |
342 | * Get the currently scrubbing dirfrags. When returned, the | |
343 | * passed-in list will be filled in with all frag_ts which have | |
344 | * been returned from scrub_dirfrag_next but not sent back | |
345 | * via scrub_dirfrag_finished. | |
346 | */ | |
11fdf7f2 | 347 | void scrub_dirfrags_scrubbing(frag_vec_t *out_dirfrags); |
7c673cae FG |
348 | /** |
349 | * Report to the CInode that a dirfrag it owns has been scrubbed. Call | |
350 | * this for every frag_t returned from scrub_dirfrag_next(). | |
351 | * @param dirfrag The frag_t that was scrubbed | |
352 | */ | |
353 | void scrub_dirfrag_finished(frag_t dirfrag); | |
354 | /** | |
355 | * Call this once the scrub has been completed, whether it's a full | |
356 | * recursive scrub on a directory or simply the data on a file (or | |
357 | * anything in between). | |
358 | * @param c An out param which is filled in with a Context* that must | |
359 | * be complete()ed. | |
360 | */ | |
11fdf7f2 TL |
361 | void scrub_finished(MDSContext **c); |
362 | ||
363 | void scrub_aborted(MDSContext **c); | |
364 | ||
7c673cae FG |
365 | /** |
366 | * Report to the CInode that all dirfrags it owns have been scrubbed. | |
367 | */ | |
368 | void scrub_children_finished() { | |
369 | scrub_infop->children_scrubbed = true; | |
370 | } | |
11fdf7f2 TL |
371 | void scrub_set_finisher(MDSContext *c) { |
372 | ceph_assert(!scrub_infop->on_finish); | |
7c673cae FG |
373 | scrub_infop->on_finish = c; |
374 | } | |
375 | ||
376 | private: | |
377 | /** | |
11fdf7f2 | 378 | * Create a scrub_info_t struct for the scrub_infop pointer. |
7c673cae FG |
379 | */ |
380 | void scrub_info_create() const; | |
381 | /** | |
382 | * Delete the scrub_info_t struct if it's not got any useful data | |
383 | */ | |
384 | void scrub_maybe_delete_info(); | |
385 | public: | |
386 | ||
387 | bool is_multiversion() const { | |
388 | return snaprealm || // other snaprealms will link to me | |
389 | inode.is_dir() || // links to me in other snaps | |
390 | inode.nlink > 1 || // there are remote links, possibly snapped, that will need to find me | |
391 | !old_inodes.empty(); // once multiversion, always multiversion. until old_inodes gets cleaned out. | |
392 | } | |
393 | snapid_t get_oldest_snap(); | |
394 | ||
94b18763 | 395 | uint64_t last_journaled = 0; // log offset for the last time i was journaled |
7c673cae FG |
396 | //loff_t last_open_journaled; // log offset for the last journaled EOpen |
397 | utime_t last_dirstat_prop; | |
398 | ||
399 | ||
400 | // list item node for when we have unpropagated rstat data | |
401 | elist<CInode*>::item dirty_rstat_item; | |
402 | ||
403 | bool is_dirty_rstat() { | |
404 | return state_test(STATE_DIRTYRSTAT); | |
405 | } | |
406 | void mark_dirty_rstat(); | |
407 | void clear_dirty_rstat(); | |
408 | ||
94b18763 | 409 | //bool hack_accessed = false; |
7c673cae FG |
410 | //utime_t hack_load_stamp; |
411 | ||
412 | /** | |
413 | * Projection methods, used to store inode changes until they have been journaled, | |
414 | * at which point they are popped. | |
415 | * Usage: | |
94b18763 FG |
416 | * project_inode as needed. If you're changing xattrs or sr_t, then pass true |
417 | * for the corresponding argument, then change the xattrs/snapnode member as needed. (Dirty |
418 | * exception: project_snaprealm_past_parent allows you to project the |
419 | * snapnode after doing project_inode, i.e. you don't need to pass |
420 | * snap=true.) |
7c673cae FG |
421 | * |
422 | * Then, journal. Once journaling is done, pop_and_dirty_projected_inode. | |
423 | * This function will take care of the inode itself, the xattrs, and the snaprealm. | |
424 | */ | |
425 | ||
94b18763 FG |
426 | class projected_inode { |
427 | public: | |
11fdf7f2 TL |
428 | static sr_t* const UNDEF_SRNODE; |
429 | ||
94b18763 FG |
430 | mempool_inode inode; |
431 | std::unique_ptr<mempool_xattr_map> xattrs; | |
11fdf7f2 | 432 | sr_t *snapnode = UNDEF_SRNODE; |
94b18763 FG |
433 | |
434 | projected_inode() = delete; | |
11fdf7f2 | 435 | explicit projected_inode(const mempool_inode &in) : inode(in) {} |
7c673cae | 436 | }; |
94b18763 FG |
437 | |
438 | private: | |
439 | mempool::mds_co::list<projected_inode> projected_nodes; // projected values (only defined while dirty) | |
440 | size_t num_projected_xattrs = 0; | |
441 | size_t num_projected_srnodes = 0; | |
442 | ||
94b18763 FG |
443 | public: |
444 | CInode::projected_inode &project_inode(bool xattr = false, bool snap = false); | |
7c673cae FG |
445 | void pop_and_dirty_projected_inode(LogSegment *ls); |
446 | ||
94b18763 | 447 | projected_inode *get_projected_node() { |
7c673cae FG |
448 | if (projected_nodes.empty()) |
449 | return NULL; | |
450 | else | |
94b18763 | 451 | return &projected_nodes.back(); |
7c673cae FG |
452 | } |
453 | ||
454 | version_t get_projected_version() const { | |
455 | if (projected_nodes.empty()) | |
456 | return inode.version; | |
457 | else | |
94b18763 | 458 | return projected_nodes.back().inode.version; |
7c673cae FG |
459 | } |
460 | bool is_projected() const { | |
461 | return !projected_nodes.empty(); | |
462 | } | |
463 | ||
94b18763 | 464 | const mempool_inode *get_projected_inode() const { |
7c673cae FG |
465 | if (projected_nodes.empty()) |
466 | return &inode; | |
467 | else | |
94b18763 | 468 | return &projected_nodes.back().inode; |
7c673cae | 469 | } |
94b18763 | 470 | mempool_inode *get_projected_inode() { |
7c673cae FG |
471 | if (projected_nodes.empty()) |
472 | return &inode; | |
473 | else | |
94b18763 | 474 | return &projected_nodes.back().inode; |
7c673cae | 475 | } |
94b18763 | 476 | mempool_inode *get_previous_projected_inode() { |
11fdf7f2 | 477 | ceph_assert(!projected_nodes.empty()); |
94b18763 FG |
478 | auto it = projected_nodes.rbegin(); |
479 | ++it; | |
480 | if (it != projected_nodes.rend()) | |
481 | return &it->inode; | |
7c673cae FG |
482 | else |
483 | return &inode; | |
484 | } | |
485 | ||
94b18763 | 486 | mempool_xattr_map *get_projected_xattrs() { |
7c673cae | 487 | if (num_projected_xattrs > 0) { |
94b18763 FG |
488 | for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it) |
489 | if (it->xattrs) | |
490 | return it->xattrs.get(); | |
7c673cae FG |
491 | } |
492 | return &xattrs; | |
493 | } | |
94b18763 FG |
494 | mempool_xattr_map *get_previous_projected_xattrs() { |
495 | if (num_projected_xattrs > 0) { | |
496 | for (auto it = ++projected_nodes.rbegin(); it != projected_nodes.rend(); ++it) | |
497 | if (it->xattrs) | |
498 | return it->xattrs.get(); | |
499 | } | |
7c673cae FG |
500 | return &xattrs; |
501 | } | |
502 | ||
11fdf7f2 TL |
503 | sr_t *prepare_new_srnode(snapid_t snapid); |
504 | void project_snaprealm(sr_t *new_srnode); | |
505 | sr_t *project_snaprealm(snapid_t snapid=0) { | |
506 | sr_t* new_srnode = prepare_new_srnode(snapid); | |
507 | project_snaprealm(new_srnode); | |
508 | return new_srnode; | |
7c673cae | 509 | } |
11fdf7f2 | 510 | const sr_t *get_projected_srnode() const { |
7c673cae | 511 | if (num_projected_srnodes > 0) { |
94b18763 | 512 | for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it) |
11fdf7f2 TL |
513 | if (it->snapnode != projected_inode::UNDEF_SRNODE) |
514 | return it->snapnode; | |
7c673cae FG |
515 | } |
516 | if (snaprealm) | |
517 | return &snaprealm->srnode; | |
518 | else | |
519 | return NULL; | |
520 | } | |
11fdf7f2 TL |
521 | |
522 | void mark_snaprealm_global(sr_t *new_srnode); | |
523 | void clear_snaprealm_global(sr_t *new_srnode); | |
524 | bool is_projected_snaprealm_global() const; | |
525 | ||
526 | void record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent); | |
527 | void record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *newparent, | |
528 | CDentry *dn, bool primary_dn); | |
529 | void project_snaprealm_past_parent(SnapRealm *newparent); | |
530 | void early_pop_projected_snaprealm(); | |
7c673cae FG |
531 | |
532 | private: | |
11fdf7f2 | 533 | void pop_projected_snaprealm(sr_t *next_snaprealm, bool early); |
7c673cae FG |
534 | |
535 | public: | |
94b18763 | 536 | mempool_old_inode& cow_old_inode(snapid_t follows, bool cow_head); |
7c673cae | 537 | void split_old_inode(snapid_t snap); |
94b18763 | 538 | mempool_old_inode *pick_old_inode(snapid_t last); |
7c673cae | 539 | void pre_cow_old_inode(); |
11fdf7f2 | 540 | bool has_snap_data(snapid_t s); |
7c673cae FG |
541 | void purge_stale_snap_data(const std::set<snapid_t>& snaps); |
542 | ||
543 | // -- cache infrastructure -- | |
544 | private: | |
94b18763 | 545 | mempool::mds_co::compact_map<frag_t,CDir*> dirfrags; // cached dir fragments under this Inode |
1adf2230 AA |
546 | |
547 | //for the purpose of quickly determining whether there's a subtree root or exporting dir | |
548 | int num_subtree_roots = 0; | |
549 | int num_exporting_dirs = 0; | |
550 | ||
94b18763 FG |
551 | int stickydir_ref = 0; |
552 | scrub_info_t *scrub_infop = nullptr; | |
7c673cae FG |
553 | |
554 | public: | |
555 | bool has_dirfrags() { return !dirfrags.empty(); } | |
556 | CDir* get_dirfrag(frag_t fg) { | |
11fdf7f2 TL |
557 | auto pi = dirfrags.find(fg); |
558 | if (pi != dirfrags.end()) { | |
559 | //assert(g_conf()->debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME | |
560 | return pi->second; | |
561 | } | |
562 | return NULL; | |
7c673cae FG |
563 | } |
564 | bool get_dirfrags_under(frag_t fg, std::list<CDir*>& ls); | |
565 | CDir* get_approx_dirfrag(frag_t fg); | |
91327a77 AA |
566 | |
567 | template<typename Container> | |
568 | void get_dirfrags(Container& ls) const { | |
569 | // all dirfrags | |
11fdf7f2 TL |
570 | if constexpr (std::is_same_v<Container, std::vector<CDir*>>) |
571 | ls.reserve(ls.size() + dirfrags.size()); | |
91327a77 AA |
572 | for (const auto &p : dirfrags) |
573 | ls.push_back(p.second); | |
574 | } | |
575 | template<typename Container> | |
576 | void get_nested_dirfrags(Container& ls) const { | |
577 | // dirfrags in same subtree | |
11fdf7f2 TL |
578 | if constexpr (std::is_same_v<Container, std::vector<CDir*>>) |
579 | ls.reserve(ls.size() + dirfrags.size() - num_subtree_roots); | |
91327a77 AA |
580 | for (const auto &p : dirfrags) { |
581 | typename Container::value_type dir = p.second; | |
582 | if (!dir->is_subtree_root()) | |
583 | ls.push_back(dir); | |
584 | } | |
585 | } | |
586 | template<typename Container> | |
587 | void get_subtree_dirfrags(Container& ls) { | |
588 | // dirfrags that are roots of new subtrees | |
11fdf7f2 TL |
589 | if constexpr (std::is_same_v<Container, std::vector<CDir*>>) |
590 | ls.reserve(ls.size() + num_subtree_roots); | |
91327a77 AA |
591 | for (const auto &p : dirfrags) { |
592 | typename Container::value_type dir = p.second; | |
593 | if (dir->is_subtree_root()) | |
594 | ls.push_back(dir); | |
595 | } | |
596 | } | |
597 | ||
7c673cae FG |
598 | CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg); |
599 | CDir *add_dirfrag(CDir *dir); | |
600 | void close_dirfrag(frag_t fg); | |
601 | void close_dirfrags(); | |
602 | bool has_subtree_root_dirfrag(int auth=-1); | |
603 | bool has_subtree_or_exporting_dirfrag(); | |
604 | ||
605 | void force_dirfrags(); | |
606 | void verify_dirfrags(); | |
607 | ||
608 | void get_stickydirs(); | |
609 | void put_stickydirs(); | |
610 | ||
611 | protected: | |
612 | // parent dentries in cache | |
94b18763 FG |
613 | CDentry *parent = nullptr; // primary link |
614 | mempool::mds_co::compact_set<CDentry*> remote_parents; // if hard linked | |
7c673cae | 615 | |
94b18763 | 616 | mempool::mds_co::list<CDentry*> projected_parent; // for in-progress rename, (un)link, etc. |
7c673cae | 617 | |
94b18763 | 618 | mds_authority_t inode_auth = CDIR_AUTH_DEFAULT; |
7c673cae FG |
619 | |
620 | // -- distributed state -- | |
621 | protected: | |
622 | // file capabilities | |
11fdf7f2 TL |
623 | using mempool_cap_map = mempool::mds_co::map<client_t, Capability>; |
624 | mempool_cap_map client_caps; // client -> caps | |
94b18763 | 625 | mempool::mds_co::compact_map<int32_t, int32_t> mds_caps_wanted; // [auth] mds -> caps wanted |
11fdf7f2 TL |
626 | int replica_caps_wanted = 0; // [replica] what i've requested from auth |
627 | int num_caps_wanted = 0; | |
7c673cae FG |
628 | |
629 | public: | |
eafe8130 | 630 | mempool::mds_co::set<client_t> client_snap_caps; |
94b18763 | 631 | mempool::mds_co::compact_map<snapid_t, mempool::mds_co::set<client_t> > client_need_snapflush; |
7c673cae FG |
632 | |
633 | void add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client); | |
634 | void remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client); | |
494da23a | 635 | pair<bool,bool> split_need_snapflush(CInode *cowin, CInode *in); |
7c673cae FG |
636 | |
637 | protected: | |
638 | ||
94b18763 FG |
639 | ceph_lock_state_t *fcntl_locks = nullptr; |
640 | ceph_lock_state_t *flock_locks = nullptr; | |
7c673cae FG |
641 | |
642 | ceph_lock_state_t *get_fcntl_lock_state() { | |
643 | if (!fcntl_locks) | |
644 | fcntl_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FCNTL); | |
645 | return fcntl_locks; | |
646 | } | |
647 | void clear_fcntl_lock_state() { | |
648 | delete fcntl_locks; | |
649 | fcntl_locks = NULL; | |
650 | } | |
651 | ceph_lock_state_t *get_flock_lock_state() { | |
652 | if (!flock_locks) | |
653 | flock_locks = new ceph_lock_state_t(g_ceph_context, CEPH_LOCK_FLOCK); | |
654 | return flock_locks; | |
655 | } | |
656 | void clear_flock_lock_state() { | |
657 | delete flock_locks; | |
658 | flock_locks = NULL; | |
659 | } | |
660 | void clear_file_locks() { | |
661 | clear_fcntl_lock_state(); | |
662 | clear_flock_lock_state(); | |
663 | } | |
664 | void _encode_file_locks(bufferlist& bl) const { | |
11fdf7f2 | 665 | using ceph::encode; |
7c673cae | 666 | bool has_fcntl_locks = fcntl_locks && !fcntl_locks->empty(); |
11fdf7f2 | 667 | encode(has_fcntl_locks, bl); |
7c673cae | 668 | if (has_fcntl_locks) |
11fdf7f2 | 669 | encode(*fcntl_locks, bl); |
7c673cae | 670 | bool has_flock_locks = flock_locks && !flock_locks->empty(); |
11fdf7f2 | 671 | encode(has_flock_locks, bl); |
7c673cae | 672 | if (has_flock_locks) |
11fdf7f2 | 673 | encode(*flock_locks, bl); |
7c673cae | 674 | } |
11fdf7f2 TL |
675 | void _decode_file_locks(bufferlist::const_iterator& p) { |
676 | using ceph::decode; | |
7c673cae | 677 | bool has_fcntl_locks; |
11fdf7f2 | 678 | decode(has_fcntl_locks, p); |
7c673cae | 679 | if (has_fcntl_locks) |
11fdf7f2 | 680 | decode(*get_fcntl_lock_state(), p); |
7c673cae FG |
681 | else |
682 | clear_fcntl_lock_state(); | |
683 | bool has_flock_locks; | |
11fdf7f2 | 684 | decode(has_flock_locks, p); |
7c673cae | 685 | if (has_flock_locks) |
11fdf7f2 | 686 | decode(*get_flock_lock_state(), p); |
7c673cae FG |
687 | else |
688 | clear_flock_lock_state(); | |
689 | } | |
690 | ||
691 | // LogSegment lists i (may) belong to | |
692 | public: | |
693 | elist<CInode*>::item item_dirty; | |
694 | elist<CInode*>::item item_caps; | |
695 | elist<CInode*>::item item_open_file; | |
696 | elist<CInode*>::item item_dirty_parent; | |
697 | elist<CInode*>::item item_dirty_dirfrag_dir; | |
698 | elist<CInode*>::item item_dirty_dirfrag_nest; | |
699 | elist<CInode*>::item item_dirty_dirfrag_dirfragtree; | |
700 | elist<CInode*>::item item_scrub; | |
701 | ||
b32b8144 FG |
702 | // also update RecoveryQueue::RecoveryQueue() if you change this |
703 | elist<CInode*>::item& item_recover_queue = item_dirty_dirfrag_dir; | |
704 | elist<CInode*>::item& item_recover_queue_front = item_dirty_dirfrag_nest; | |
705 | ||
7c673cae | 706 | public: |
94b18763 | 707 | int auth_pin_freeze_allowance = 0; |
7c673cae FG |
708 | |
709 | inode_load_vec_t pop; | |
28e407b8 | 710 | elist<CInode*>::item item_pop_lru; |
7c673cae FG |
711 | |
712 | // friends | |
713 | friend class Server; | |
714 | friend class Locker; | |
715 | friend class Migrator; | |
716 | friend class MDCache; | |
717 | friend class StrayManager; | |
718 | friend class CDir; | |
719 | friend class CInodeExport; | |
7c673cae FG |
720 | |
721 | // --------------------------- | |
94b18763 | 722 | CInode() = delete; |
11fdf7f2 | 723 | CInode(MDCache *c, bool auth=true, snapid_t f=2, snapid_t l=CEPH_NOSNAP); |
7c673cae FG |
724 | ~CInode() override { |
725 | close_dirfrags(); | |
726 | close_snaprealm(); | |
727 | clear_file_locks(); | |
11fdf7f2 TL |
728 | ceph_assert(num_projected_xattrs == 0); |
729 | ceph_assert(num_projected_srnodes == 0); | |
730 | ceph_assert(num_caps_wanted == 0); | |
731 | ceph_assert(num_subtree_roots == 0); | |
732 | ceph_assert(num_exporting_dirs == 0); | |
7c673cae FG |
733 | } |
734 | ||
735 | ||
736 | // -- accessors -- | |
737 | bool is_root() const { return inode.ino == MDS_INO_ROOT; } | |
738 | bool is_stray() const { return MDS_INO_IS_STRAY(inode.ino); } | |
739 | mds_rank_t get_stray_owner() const { | |
740 | return (mds_rank_t)MDS_INO_STRAY_OWNER(inode.ino); | |
741 | } | |
742 | bool is_mdsdir() const { return MDS_INO_IS_MDSDIR(inode.ino); } | |
11fdf7f2 | 743 | bool is_base() const { return MDS_INO_IS_BASE(inode.ino); } |
7c673cae FG |
744 | bool is_system() const { return inode.ino < MDS_INO_SYSTEM_BASE; } |
745 | bool is_normal() const { return !(is_base() || is_system() || is_stray()); } | |
746 | ||
747 | bool is_head() const { return last == CEPH_NOSNAP; } | |
748 | ||
749 | // note: this overloads MDSCacheObject | |
750 | bool is_ambiguous_auth() const { | |
751 | return state_test(STATE_AMBIGUOUSAUTH) || | |
752 | MDSCacheObject::is_ambiguous_auth(); | |
753 | } | |
754 | void set_ambiguous_auth() { | |
755 | state_set(STATE_AMBIGUOUSAUTH); | |
756 | } | |
11fdf7f2 | 757 | void clear_ambiguous_auth(MDSContext::vec& finished); |
7c673cae FG |
758 | void clear_ambiguous_auth(); |
759 | ||
760 | inodeno_t ino() const { return inode.ino; } | |
761 | vinodeno_t vino() const { return vinodeno_t(inode.ino, last); } | |
762 | int d_type() const { return IFTODT(inode.mode); } | |
763 | ||
94b18763 | 764 | mempool_inode& get_inode() { return inode; } |
f64942e4 | 765 | const mempool_inode& get_inode() const { return inode; } |
7c673cae FG |
766 | CDentry* get_parent_dn() { return parent; } |
767 | const CDentry* get_parent_dn() const { return parent; } | |
7c673cae | 768 | CDentry* get_projected_parent_dn() { return !projected_parent.empty() ? projected_parent.back() : parent; } |
11fdf7f2 TL |
769 | const CDentry* get_projected_parent_dn() const { return !projected_parent.empty() ? projected_parent.back() : parent; } |
770 | const CDentry* get_oldest_parent_dn() const { | |
771 | if (parent) | |
772 | return parent; | |
773 | return !projected_parent.empty() ? projected_parent.front(): NULL; | |
774 | } | |
7c673cae FG |
775 | CDir *get_parent_dir(); |
776 | const CDir *get_projected_parent_dir() const; | |
777 | CDir *get_projected_parent_dir(); | |
778 | CInode *get_parent_inode(); | |
779 | ||
780 | bool is_lt(const MDSCacheObject *r) const override { | |
781 | const CInode *o = static_cast<const CInode*>(r); | |
782 | return ino() < o->ino() || | |
783 | (ino() == o->ino() && last < o->last); | |
784 | } | |
785 | ||
786 | // -- misc -- | |
11fdf7f2 TL |
787 | bool is_ancestor_of(const CInode *other) const; |
788 | bool is_projected_ancestor_of(const CInode *other) const; | |
7c673cae FG |
789 | |
790 | void make_path_string(std::string& s, bool projected=false, const CDentry *use_parent=NULL) const; | |
791 | void make_path(filepath& s, bool projected=false) const; | |
792 | void name_stray_dentry(std::string& dname); | |
793 | ||
794 | // -- dirtyness -- | |
795 | version_t get_version() const { return inode.version; } | |
796 | ||
797 | version_t pre_dirty(); | |
798 | void _mark_dirty(LogSegment *ls); | |
799 | void mark_dirty(version_t projected_dirv, LogSegment *ls); | |
800 | void mark_clean(); | |
801 | ||
11fdf7f2 | 802 | void store(MDSContext *fin); |
7c673cae FG |
803 | void _stored(int r, version_t cv, Context *fin); |
804 | /** | |
805 | * Flush a CInode to disk. This includes the backtrace, the parent | |
806 | * directory's link, and the Inode object itself (if a base directory). | |
807 | * @pre is_auth() on both the inode and its containing directory | |
808 | * @pre can_auth_pin() | |
809 | * @param fin The Context to call when the flush is completed. | |
810 | */ | |
11fdf7f2 TL |
811 | void flush(MDSContext *fin); |
812 | void fetch(MDSContext *fin); | |
7c673cae FG |
813 | void _fetched(bufferlist& bl, bufferlist& bl2, Context *fin); |
814 | ||
815 | ||
816 | void build_backtrace(int64_t pool, inode_backtrace_t& bt); | |
11fdf7f2 | 817 | void store_backtrace(MDSContext *fin, int op_prio=-1); |
7c673cae FG |
818 | void _stored_backtrace(int r, version_t v, Context *fin); |
819 | void fetch_backtrace(Context *fin, bufferlist *backtrace); | |
820 | protected: | |
821 | /** | |
822 | * Return the pool ID where we currently write backtraces for | |
823 | * this inode (in addition to inode.old_pools) | |
824 | * | |
825 | * @returns a pool ID >=0 | |
826 | */ | |
827 | int64_t get_backtrace_pool() const; | |
828 | public: | |
28e407b8 | 829 | void mark_dirty_parent(LogSegment *ls, bool dirty_pool=false); |
7c673cae FG |
830 | void clear_dirty_parent(); |
831 | void verify_diri_backtrace(bufferlist &bl, int err); | |
832 | bool is_dirty_parent() { return state_test(STATE_DIRTYPARENT); } | |
833 | bool is_dirty_pool() { return state_test(STATE_DIRTYPOOL); } | |
834 | ||
835 | void encode_snap_blob(bufferlist &bl); | |
11fdf7f2 | 836 | void decode_snap_blob(const bufferlist &bl); |
7c673cae | 837 | void encode_store(bufferlist& bl, uint64_t features); |
11fdf7f2 | 838 | void decode_store(bufferlist::const_iterator& bl); |
7c673cae | 839 | |
b32b8144 | 840 | void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features, bool need_recover) { |
11fdf7f2 | 841 | ceph_assert(is_auth()); |
7c673cae FG |
842 | |
843 | __u32 nonce = add_replica(rep); | |
11fdf7f2 TL |
844 | using ceph::encode; |
845 | encode(nonce, bl); | |
7c673cae FG |
846 | |
847 | _encode_base(bl, features); | |
b32b8144 | 848 | _encode_locks_state_for_replica(bl, need_recover); |
7c673cae | 849 | } |
11fdf7f2 TL |
850 | void decode_replica(bufferlist::const_iterator& p, bool is_new) { |
851 | using ceph::decode; | |
7c673cae | 852 | __u32 nonce; |
11fdf7f2 | 853 | decode(nonce, p); |
7c673cae FG |
854 | replica_nonce = nonce; |
855 | ||
856 | _decode_base(p); | |
857 | _decode_locks_state(p, is_new); | |
858 | } | |
859 | ||
860 | // -- waiting -- | |
861 | protected: | |
11fdf7f2 | 862 | mempool::mds_co::compact_map<frag_t, MDSContext::vec > waiting_on_dir; |
7c673cae | 863 | public: |
11fdf7f2 TL |
864 | void add_dir_waiter(frag_t fg, MDSContext *c); |
865 | void take_dir_waiting(frag_t fg, MDSContext::vec& ls); | |
7c673cae FG |
866 | bool is_waiting_for_dir(frag_t fg) { |
867 | return waiting_on_dir.count(fg); | |
868 | } | |
11fdf7f2 TL |
869 | void add_waiter(uint64_t tag, MDSContext *c) override; |
870 | void take_waiting(uint64_t tag, MDSContext::vec& ls) override; | |
7c673cae FG |
871 | |
872 | // -- encode/decode helpers -- | |
873 | void _encode_base(bufferlist& bl, uint64_t features); | |
11fdf7f2 | 874 | void _decode_base(bufferlist::const_iterator& p); |
7c673cae | 875 | void _encode_locks_full(bufferlist& bl); |
11fdf7f2 | 876 | void _decode_locks_full(bufferlist::const_iterator& p); |
b32b8144 | 877 | void _encode_locks_state_for_replica(bufferlist& bl, bool need_recover); |
7c673cae | 878 | void _encode_locks_state_for_rejoin(bufferlist& bl, int rep); |
11fdf7f2 TL |
879 | void _decode_locks_state(bufferlist::const_iterator& p, bool is_new); |
880 | void _decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters, | |
b32b8144 | 881 | std::list<SimpleLock*>& eval_locks, bool survivor); |
7c673cae FG |
882 | |
883 | // -- import/export -- | |
884 | void encode_export(bufferlist& bl); | |
11fdf7f2 | 885 | void finish_export(); |
7c673cae FG |
886 | void abort_export() { |
887 | put(PIN_TEMPEXPORTING); | |
11fdf7f2 | 888 | ceph_assert(state_test(STATE_EXPORTINGCAPS)); |
7c673cae FG |
889 | state_clear(STATE_EXPORTINGCAPS); |
890 | put(PIN_EXPORTINGCAPS); | |
891 | } | |
11fdf7f2 | 892 | void decode_import(bufferlist::const_iterator& p, LogSegment *ls); |
7c673cae FG |
893 | |
894 | ||
895 | // for giving to clients | |
896 | int encode_inodestat(bufferlist& bl, Session *session, SnapRealm *realm, | |
897 | snapid_t snapid=CEPH_NOSNAP, unsigned max_bytes=0, | |
898 | int getattr_wants=0); | |
11fdf7f2 | 899 | void encode_cap_message(const MClientCaps::ref &m, Capability *cap); |
7c673cae FG |
900 | |
901 | ||
902 | // -- locks -- | |
903 | public: | |
904 | static LockType versionlock_type; | |
905 | static LockType authlock_type; | |
906 | static LockType linklock_type; | |
907 | static LockType dirfragtreelock_type; | |
908 | static LockType filelock_type; | |
909 | static LockType xattrlock_type; | |
910 | static LockType snaplock_type; | |
911 | static LockType nestlock_type; | |
912 | static LockType flocklock_type; | |
913 | static LockType policylock_type; | |
914 | ||
94b18763 | 915 | // FIXME not part of mempool |
7c673cae FG |
916 | LocalLock versionlock; |
917 | SimpleLock authlock; | |
918 | SimpleLock linklock; | |
919 | ScatterLock dirfragtreelock; | |
920 | ScatterLock filelock; | |
921 | SimpleLock xattrlock; | |
922 | SimpleLock snaplock; | |
923 | ScatterLock nestlock; | |
924 | SimpleLock flocklock; | |
925 | SimpleLock policylock; | |
926 | ||
927 | SimpleLock* get_lock(int type) override { | |
928 | switch (type) { | |
929 | case CEPH_LOCK_IFILE: return &filelock; | |
930 | case CEPH_LOCK_IAUTH: return &authlock; | |
931 | case CEPH_LOCK_ILINK: return &linklock; | |
932 | case CEPH_LOCK_IDFT: return &dirfragtreelock; | |
933 | case CEPH_LOCK_IXATTR: return &xattrlock; | |
934 | case CEPH_LOCK_ISNAP: return &snaplock; | |
935 | case CEPH_LOCK_INEST: return &nestlock; | |
936 | case CEPH_LOCK_IFLOCK: return &flocklock; | |
937 | case CEPH_LOCK_IPOLICY: return &policylock; | |
938 | } | |
939 | return 0; | |
940 | } | |
941 | ||
942 | void set_object_info(MDSCacheObjectInfo &info) override; | |
943 | void encode_lock_state(int type, bufferlist& bl) override; | |
11fdf7f2 | 944 | void decode_lock_state(int type, const bufferlist& bl) override; |
7c673cae FG |
945 | |
946 | void _finish_frag_update(CDir *dir, MutationRef& mut); | |
947 | ||
948 | void clear_dirty_scattered(int type) override; | |
949 | bool is_dirty_scattered(); | |
950 | void clear_scatter_dirty(); // on rejoin ack | |
951 | ||
952 | void start_scatter(ScatterLock *lock); | |
953 | void finish_scatter_update(ScatterLock *lock, CDir *dir, | |
954 | version_t inode_version, version_t dir_accounted_version); | |
955 | void finish_scatter_gather_update(int type); | |
956 | void finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob); | |
957 | ||
958 | // -- snap -- | |
959 | void open_snaprealm(bool no_split=false); | |
960 | void close_snaprealm(bool no_join=false); | |
961 | SnapRealm *find_snaprealm() const; | |
962 | void encode_snap(bufferlist& bl); | |
11fdf7f2 | 963 | void decode_snap(bufferlist::const_iterator& p); |
7c673cae FG |
964 | |
965 | // -- caps -- (new) | |
966 | // client caps | |
94b18763 | 967 | client_t loner_cap = -1, want_loner_cap = -1; |
7c673cae FG |
968 | |
969 | client_t get_loner() const { return loner_cap; } | |
970 | client_t get_wanted_loner() const { return want_loner_cap; } | |
971 | ||
972 | // this is the loner state our locks should aim for | |
973 | client_t get_target_loner() const { | |
974 | if (loner_cap == want_loner_cap) | |
975 | return loner_cap; | |
976 | else | |
977 | return -1; | |
978 | } | |
979 | ||
980 | client_t calc_ideal_loner(); | |
7c673cae | 981 | void set_loner_cap(client_t l); |
b32b8144 FG |
982 | bool choose_ideal_loner(); |
983 | bool try_set_loner(); | |
7c673cae FG |
984 | bool try_drop_loner(); |
985 | ||
986 | // choose new lock state during recovery, based on issued caps | |
987 | void choose_lock_state(SimpleLock *lock, int allissued); | |
988 | void choose_lock_states(int dirty_caps); | |
989 | ||
990 | int count_nonstale_caps() { | |
991 | int n = 0; | |
94b18763 | 992 | for (const auto &p : client_caps) { |
11fdf7f2 | 993 | if (!p.second.is_stale()) |
7c673cae | 994 | n++; |
94b18763 | 995 | } |
7c673cae FG |
996 | return n; |
997 | } | |
998 | bool multiple_nonstale_caps() { | |
999 | int n = 0; | |
94b18763 | 1000 | for (const auto &p : client_caps) { |
11fdf7f2 | 1001 | if (!p.second.is_stale()) { |
7c673cae FG |
1002 | if (n) |
1003 | return true; | |
1004 | n++; | |
1005 | } | |
94b18763 | 1006 | } |
7c673cae FG |
1007 | return false; |
1008 | } | |
1009 | ||
1010 | bool is_any_caps() { return !client_caps.empty(); } | |
1011 | bool is_any_nonstale_caps() { return count_nonstale_caps(); } | |
1012 | ||
94b18763 | 1013 | const mempool::mds_co::compact_map<int32_t,int32_t>& get_mds_caps_wanted() const { return mds_caps_wanted; } |
11fdf7f2 TL |
1014 | void set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m); |
1015 | void set_mds_caps_wanted(mds_rank_t mds, int32_t wanted); | |
7c673cae | 1016 | |
11fdf7f2 | 1017 | const mempool_cap_map& get_client_caps() const { return client_caps; } |
7c673cae FG |
1018 | Capability *get_client_cap(client_t client) { |
1019 | auto client_caps_entry = client_caps.find(client); | |
1020 | if (client_caps_entry != client_caps.end()) | |
11fdf7f2 | 1021 | return &client_caps_entry->second; |
7c673cae FG |
1022 | return 0; |
1023 | } | |
1024 | int get_client_cap_pending(client_t client) const { | |
1025 | auto client_caps_entry = client_caps.find(client); | |
1026 | if (client_caps_entry != client_caps.end()) { | |
11fdf7f2 | 1027 | return client_caps_entry->second.pending(); |
7c673cae FG |
1028 | } else { |
1029 | return 0; | |
1030 | } | |
1031 | } | |
1032 | ||
11fdf7f2 TL |
1033 | int get_num_caps_wanted() const { return num_caps_wanted; } |
1034 | void adjust_num_caps_wanted(int d); | |
1035 | ||
7c673cae FG |
1036 | Capability *add_client_cap(client_t client, Session *session, SnapRealm *conrealm=0); |
1037 | void remove_client_cap(client_t client); | |
1038 | void move_to_realm(SnapRealm *realm); | |
1039 | ||
1040 | Capability *reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session); | |
1041 | void clear_client_caps_after_export(); | |
1042 | void export_client_caps(std::map<client_t,Capability::Export>& cl); | |
1043 | ||
1044 | // caps allowed | |
1045 | int get_caps_liked() const; | |
1046 | int get_caps_allowed_ever() const; | |
1047 | int get_caps_allowed_by_type(int type) const; | |
1048 | int get_caps_careful() const; | |
1049 | int get_xlocker_mask(client_t client) const; | |
11fdf7f2 | 1050 | int get_caps_allowed_for_client(Session *s, Capability *cap, mempool_inode *file_i) const; |
7c673cae FG |
1051 | |
1052 | // caps issued, wanted | |
1053 | int get_caps_issued(int *ploner = 0, int *pother = 0, int *pxlocker = 0, | |
1054 | int shift = 0, int mask = -1); | |
1055 | bool is_any_caps_wanted() const; | |
1056 | int get_caps_wanted(int *ploner = 0, int *pother = 0, int shift = 0, int mask = -1) const; | |
1057 | bool issued_caps_need_gather(SimpleLock *lock); | |
7c673cae FG |
1058 | |
1059 | // -- authority -- | |
1060 | mds_authority_t authority() const override; | |
1061 | ||
1062 | // -- auth pins -- | |
91327a77 | 1063 | bool can_auth_pin(int *err_ret=nullptr) const override; |
7c673cae FG |
1064 | void auth_pin(void *by) override; |
1065 | void auth_unpin(void *by) override; | |
1066 | ||
1067 | // -- freeze -- | |
1068 | bool is_freezing_inode() const { return state_test(STATE_FREEZING); } | |
1069 | bool is_frozen_inode() const { return state_test(STATE_FROZEN); } | |
1070 | bool is_frozen_auth_pin() const { return state_test(STATE_FROZENAUTHPIN); } | |
1071 | bool is_frozen() const override; | |
1072 | bool is_frozen_dir() const; | |
1073 | bool is_freezing() const override; | |
1074 | ||
1075 | /* Freeze the inode. auth_pin_allowance lets the caller account for any | |
1076 | * auth_pins it is itself holding/responsible for. */ | |
1077 | bool freeze_inode(int auth_pin_allowance=0); | |
11fdf7f2 | 1078 | void unfreeze_inode(MDSContext::vec& finished); |
7c673cae FG |
1079 | void unfreeze_inode(); |
1080 | ||
1081 | void freeze_auth_pin(); | |
1082 | void unfreeze_auth_pin(); | |
1083 | ||
1084 | // -- reference counting -- | |
1085 | void bad_put(int by) override { | |
1086 | generic_dout(0) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref | |
1087 | #ifdef MDS_REF_SET | |
1088 | << " (" << ref_map << ")" | |
1089 | #endif | |
1090 | << dendl; | |
1091 | #ifdef MDS_REF_SET | |
11fdf7f2 | 1092 | ceph_assert(ref_map[by] > 0); |
7c673cae | 1093 | #endif |
11fdf7f2 | 1094 | ceph_assert(ref > 0); |
7c673cae FG |
1095 | } |
1096 | void bad_get(int by) override { | |
1097 | generic_dout(0) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref | |
1098 | #ifdef MDS_REF_SET | |
1099 | << " (" << ref_map << ")" | |
1100 | #endif | |
1101 | << dendl; | |
1102 | #ifdef MDS_REF_SET | |
11fdf7f2 | 1103 | ceph_assert(ref_map[by] >= 0); |
7c673cae FG |
1104 | #endif |
1105 | } | |
1106 | void first_get() override; | |
1107 | void last_put() override; | |
1108 | void _put() override; | |
1109 | ||
1110 | ||
1111 | // -- hierarchy stuff -- | |
1112 | public: | |
1113 | void set_primary_parent(CDentry *p) { | |
11fdf7f2 TL |
1114 | ceph_assert(parent == 0 || |
1115 | g_conf().get_val<bool>("mds_hack_allow_loading_invalid_metadata")); | |
7c673cae FG |
1116 | parent = p; |
1117 | } | |
1118 | void remove_primary_parent(CDentry *dn) { | |
11fdf7f2 | 1119 | ceph_assert(dn == parent); |
7c673cae FG |
1120 | parent = 0; |
1121 | } | |
1122 | void add_remote_parent(CDentry *p); | |
1123 | void remove_remote_parent(CDentry *p); | |
1124 | int num_remote_parents() { | |
1125 | return remote_parents.size(); | |
1126 | } | |
1127 | ||
1128 | void push_projected_parent(CDentry *dn) { | |
1129 | projected_parent.push_back(dn); | |
1130 | } | |
1131 | void pop_projected_parent() { | |
11fdf7f2 | 1132 | ceph_assert(projected_parent.size()); |
7c673cae FG |
1133 | parent = projected_parent.front(); |
1134 | projected_parent.pop_front(); | |
1135 | } | |
1136 | ||
7c673cae | 1137 | public: |
31f18b77 | 1138 | void maybe_export_pin(bool update=false); |
7c673cae FG |
1139 | void set_export_pin(mds_rank_t rank); |
1140 | mds_rank_t get_export_pin(bool inherit=true) const; | |
1141 | bool is_exportable(mds_rank_t dest) const; | |
1142 | ||
1143 | void print(ostream& out) override; | |
11fdf7f2 | 1144 | void dump(Formatter *f, int flags = DUMP_DEFAULT) const; |
7c673cae FG |
1145 | |
1146 | /** | |
1147 | * @defgroup Scrubbing and fsck | |
1148 | * @{ | |
1149 | */ | |
1150 | ||
1151 | /** | |
1152 | * Report the results of validation against a particular inode. | |
1153 | * Each member is a member_status. | |
1154 | * <member>.checked represents if validation was performed against the member. | |
1155 | * <member>.passed represents if the member passed validation. | |
1156 | * performed_validation is set to true if the validation was actually | |
1157 | * run. It might not be run if, for instance, the inode is marked as dirty. | |
1158 | * passed_validation is set to true if everything that was checked | |
1159 | * passed its validation. | |
1160 | */ | |
1161 | struct validated_data { | |
1162 | template<typename T>struct member_status { | |
b32b8144 FG |
1163 | bool checked = false; |
1164 | bool passed = false; | |
1165 | bool repaired = false; | |
1166 | int ondisk_read_retval = 0; | |
7c673cae FG |
1167 | T ondisk_value; |
1168 | T memory_value; | |
1169 | std::stringstream error_str; | |
7c673cae FG |
1170 | }; |
1171 | ||
94b18763 FG |
1172 | bool performed_validation = false; |
1173 | bool passed_validation = false; | |
7c673cae FG |
1174 | |
1175 | struct raw_stats_t { | |
1176 | frag_info_t dirstat; | |
1177 | nest_info_t rstat; | |
1178 | }; | |
1179 | ||
1180 | member_status<inode_backtrace_t> backtrace; | |
94b18763 | 1181 | member_status<mempool_inode> inode; // XXX should not be in mempool; wait for pmr |
7c673cae FG |
1182 | member_status<raw_stats_t> raw_stats; |
1183 | ||
94b18763 | 1184 | validated_data() {} |
7c673cae FG |
1185 | |
1186 | void dump(Formatter *f) const; | |
b32b8144 FG |
1187 | |
1188 | bool all_damage_repaired() const; | |
7c673cae FG |
1189 | }; |
1190 | ||
1191 | /** | |
1192 | * Validate that the on-disk state of an inode matches what | |
1193 | * we expect from our memory state. Currently this checks that: | |
1194 | * 1) The backtrace associated with the file data exists and is correct | |
1195 | * 2) For directories, the actual inode metadata matches our memory state, | |
1196 | * 3) For directories, the rstats match | |
1197 | * | |
1198 | * @param results A freshly-created validated_data struct, with values set | |
1199 | * as described in the struct documentation. | |
1200 | * @param mdr The request to be responded to upon the completion of the | |
1201 | * validation (or NULL) | |
1202 | * @param fin Context to call back on completion (or NULL) | |
1203 | */ | |
1204 | void validate_disk_state(validated_data *results, | |
11fdf7f2 | 1205 | MDSContext *fin); |
7c673cae FG |
1206 | static void dump_validation_results(const validated_data& results, |
1207 | Formatter *f); | |
1208 | private: | |
1209 | bool _validate_disk_state(class ValidationContinuation *c, | |
1210 | int rval, int stage); | |
1211 | friend class ValidationContinuation; | |
1212 | /** @} Scrubbing and fsck */ | |
1213 | }; | |
1214 | ||
1215 | ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si); | |
1216 | ||
1217 | #undef dout_context | |
1218 | #endif |