]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "include/int_types.h" | |
16 | #include "common/errno.h" | |
17 | ||
18 | #include <string> | |
7c673cae FG |
19 | |
20 | #include "CInode.h" | |
21 | #include "CDir.h" | |
22 | #include "CDentry.h" | |
23 | ||
24 | #include "MDSRank.h" | |
25 | #include "MDCache.h" | |
26 | #include "MDLog.h" | |
27 | #include "Locker.h" | |
28 | #include "Mutation.h" | |
29 | ||
30 | #include "events/EUpdate.h" | |
31 | ||
32 | #include "osdc/Objecter.h" | |
33 | ||
34 | #include "snap.h" | |
35 | ||
36 | #include "LogSegment.h" | |
37 | ||
38 | #include "common/Clock.h" | |
39 | ||
7c673cae FG |
40 | #include "common/config.h" |
41 | #include "global/global_context.h" | |
11fdf7f2 | 42 | #include "include/ceph_assert.h" |
7c673cae FG |
43 | |
44 | #include "mds/MDSContinuation.h" | |
45 | #include "mds/InoTable.h" | |
11fdf7f2 | 46 | #include "cephfs_features.h" |
f67539c2 | 47 | #include "osdc/Objecter.h" |
7c673cae FG |
48 | |
49 | #define dout_context g_ceph_context | |
50 | #define dout_subsys ceph_subsys_mds | |
51 | #undef dout_prefix | |
f67539c2 TL |
52 | #define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") " |
53 | ||
20effc67 TL |
54 | using namespace std; |
55 | ||
f67539c2 TL |
56 | void CInodeCommitOperation::update(ObjectOperation &op, inode_backtrace_t &bt) { |
57 | using ceph::encode; | |
58 | ||
59 | op.priority = priority; | |
60 | op.create(false); | |
61 | ||
62 | bufferlist parent_bl; | |
63 | encode(bt, parent_bl); | |
64 | op.setxattr("parent", parent_bl); | |
7c673cae | 65 | |
20effc67 TL |
66 | // for the old pool there is no need to update the layout and symlink |
67 | if (!update_layout_symlink) | |
f67539c2 TL |
68 | return; |
69 | ||
70 | bufferlist layout_bl; | |
71 | encode(_layout, layout_bl, _features); | |
72 | op.setxattr("layout", layout_bl); | |
20effc67 TL |
73 | |
74 | if (!_symlink.empty()) { | |
75 | bufferlist symlink_bl; | |
76 | encode(_symlink, symlink_bl); | |
77 | op.setxattr("symlink", symlink_bl); | |
78 | } | |
f67539c2 | 79 | } |
7c673cae FG |
80 | |
81 | class CInodeIOContext : public MDSIOContextBase | |
82 | { | |
83 | protected: | |
84 | CInode *in; | |
85 | MDSRank *get_mds() override {return in->mdcache->mds;} | |
86 | public: | |
87 | explicit CInodeIOContext(CInode *in_) : in(in_) { | |
11fdf7f2 | 88 | ceph_assert(in != NULL); |
7c673cae FG |
89 | } |
90 | }; | |
91 | ||
11fdf7f2 | 92 | sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1; |
7c673cae FG |
93 | |
94 | LockType CInode::versionlock_type(CEPH_LOCK_IVERSION); | |
95 | LockType CInode::authlock_type(CEPH_LOCK_IAUTH); | |
96 | LockType CInode::linklock_type(CEPH_LOCK_ILINK); | |
97 | LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT); | |
98 | LockType CInode::filelock_type(CEPH_LOCK_IFILE); | |
99 | LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR); | |
100 | LockType CInode::snaplock_type(CEPH_LOCK_ISNAP); | |
101 | LockType CInode::nestlock_type(CEPH_LOCK_INEST); | |
102 | LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK); | |
103 | LockType CInode::policylock_type(CEPH_LOCK_IPOLICY); | |
104 | ||
9f95a23c TL |
105 | std::string_view CInode::pin_name(int p) const |
106 | { | |
107 | switch (p) { | |
108 | case PIN_DIRFRAG: return "dirfrag"; | |
109 | case PIN_CAPS: return "caps"; | |
110 | case PIN_IMPORTING: return "importing"; | |
111 | case PIN_OPENINGDIR: return "openingdir"; | |
112 | case PIN_REMOTEPARENT: return "remoteparent"; | |
113 | case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; | |
114 | case PIN_SCATTERED: return "scattered"; | |
115 | case PIN_STICKYDIRS: return "stickydirs"; | |
116 | //case PIN_PURGING: return "purging"; | |
117 | case PIN_FREEZING: return "freezing"; | |
118 | case PIN_FROZEN: return "frozen"; | |
119 | case PIN_IMPORTINGCAPS: return "importingcaps"; | |
120 | case PIN_EXPORTINGCAPS: return "exportingcaps"; | |
121 | case PIN_PASTSNAPPARENT: return "pastsnapparent"; | |
122 | case PIN_OPENINGSNAPPARENTS: return "openingsnapparents"; | |
123 | case PIN_TRUNCATING: return "truncating"; | |
124 | case PIN_STRAY: return "stray"; | |
125 | case PIN_NEEDSNAPFLUSH: return "needsnapflush"; | |
126 | case PIN_DIRTYRSTAT: return "dirtyrstat"; | |
127 | case PIN_DIRTYPARENT: return "dirtyparent"; | |
128 | case PIN_DIRWAITER: return "dirwaiter"; | |
9f95a23c TL |
129 | default: return generic_pin_name(p); |
130 | } | |
131 | } | |
132 | ||
7c673cae FG |
133 | //int cinode_pins[CINODE_NUM_PINS]; // counts |
134 | ostream& CInode::print_db_line_prefix(ostream& out) | |
135 | { | |
f67539c2 | 136 | return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "; |
7c673cae FG |
137 | } |
138 | ||
139 | /* | |
140 | * write caps and lock ids | |
141 | */ | |
142 | struct cinode_lock_info_t cinode_lock_info[] = { | |
143 | { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR }, | |
144 | { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL }, | |
145 | { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL }, | |
146 | { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL }, | |
147 | }; | |
148 | int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]); | |
149 | ||
7c673cae FG |
150 | ostream& operator<<(ostream& out, const CInode& in) |
151 | { | |
152 | string path; | |
153 | in.make_path_string(path, true); | |
154 | ||
f67539c2 | 155 | out << "[inode " << in.ino(); |
7c673cae FG |
156 | out << " [" |
157 | << (in.is_multiversion() ? "...":"") | |
158 | << in.first << "," << in.last << "]"; | |
159 | out << " " << path << (in.is_dir() ? "/":""); | |
160 | ||
161 | if (in.is_auth()) { | |
162 | out << " auth"; | |
163 | if (in.is_replicated()) | |
164 | out << in.get_replicas(); | |
165 | } else { | |
166 | mds_authority_t a = in.authority(); | |
167 | out << " rep@" << a.first; | |
168 | if (a.second != CDIR_AUTH_UNKNOWN) | |
169 | out << "," << a.second; | |
170 | out << "." << in.get_replica_nonce(); | |
171 | } | |
172 | ||
173 | if (in.is_symlink()) | |
174 | out << " symlink='" << in.symlink << "'"; | |
175 | if (in.is_dir() && !in.dirfragtree.empty()) | |
176 | out << " " << in.dirfragtree; | |
177 | ||
178 | out << " v" << in.get_version(); | |
179 | if (in.get_projected_version() > in.get_version()) | |
180 | out << " pv" << in.get_projected_version(); | |
181 | ||
11fdf7f2 TL |
182 | if (in.get_num_auth_pins()) { |
183 | out << " ap=" << in.get_num_auth_pins(); | |
7c673cae | 184 | #ifdef MDS_AUTHPIN_SET |
11fdf7f2 | 185 | in.print_authpin_set(out); |
7c673cae FG |
186 | #endif |
187 | } | |
188 | ||
189 | if (in.snaprealm) | |
190 | out << " snaprealm=" << in.snaprealm; | |
191 | ||
192 | if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; | |
f67539c2 TL |
193 | if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " NEEDSRECOVER"; |
194 | if (in.state_test(CInode::STATE_RECOVERING)) out << " RECOVERING"; | |
195 | if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " DIRTYPARENT"; | |
196 | if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " MISSINGOBJS"; | |
197 | if (in.is_ephemeral_dist()) out << " DISTEPHEMERALPIN"; | |
198 | if (in.is_ephemeral_rand()) out << " RANDEPHEMERALPIN"; | |
7c673cae FG |
199 | if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; |
200 | if (in.is_frozen_inode()) out << " FROZEN"; | |
201 | if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN"; | |
202 | ||
f67539c2 | 203 | const auto& pi = in.get_projected_inode(); |
7c673cae FG |
204 | if (pi->is_truncating()) |
205 | out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")"; | |
206 | ||
f67539c2 TL |
207 | if (in.is_dir()) { |
208 | out << " " << in.get_inode()->dirstat; | |
11fdf7f2 | 209 | if (g_conf()->mds_debug_scatterstat && in.is_projected()) { |
7c673cae FG |
210 | out << "->" << pi->dirstat; |
211 | } | |
212 | } else { | |
f67539c2 TL |
213 | out << " s=" << in.get_inode()->size; |
214 | if (in.get_inode()->nlink != 1) | |
215 | out << " nl=" << in.get_inode()->nlink; | |
7c673cae FG |
216 | } |
217 | ||
218 | // rstat | |
f67539c2 TL |
219 | out << " " << in.get_inode()->rstat; |
220 | if (!(in.get_inode()->rstat == in.get_inode()->accounted_rstat)) | |
221 | out << "/" << in.get_inode()->accounted_rstat; | |
11fdf7f2 | 222 | if (g_conf()->mds_debug_scatterstat && in.is_projected()) { |
7c673cae FG |
223 | out << "->" << pi->rstat; |
224 | if (!(pi->rstat == pi->accounted_rstat)) | |
225 | out << "/" << pi->accounted_rstat; | |
226 | } | |
227 | ||
f67539c2 TL |
228 | if (in.is_any_old_inodes()) { |
229 | out << " old_inodes=" << in.get_old_inodes()->size(); | |
230 | } | |
231 | ||
7c673cae FG |
232 | if (!in.client_need_snapflush.empty()) |
233 | out << " need_snapflush=" << in.client_need_snapflush; | |
234 | ||
7c673cae FG |
235 | // locks |
236 | if (!in.authlock.is_sync_and_unlocked()) | |
237 | out << " " << in.authlock; | |
238 | if (!in.linklock.is_sync_and_unlocked()) | |
239 | out << " " << in.linklock; | |
f67539c2 | 240 | if (in.get_inode()->is_dir()) { |
7c673cae FG |
241 | if (!in.dirfragtreelock.is_sync_and_unlocked()) |
242 | out << " " << in.dirfragtreelock; | |
243 | if (!in.snaplock.is_sync_and_unlocked()) | |
244 | out << " " << in.snaplock; | |
245 | if (!in.nestlock.is_sync_and_unlocked()) | |
246 | out << " " << in.nestlock; | |
247 | if (!in.policylock.is_sync_and_unlocked()) | |
248 | out << " " << in.policylock; | |
249 | } else { | |
250 | if (!in.flocklock.is_sync_and_unlocked()) | |
251 | out << " " << in.flocklock; | |
252 | } | |
253 | if (!in.filelock.is_sync_and_unlocked()) | |
254 | out << " " << in.filelock; | |
255 | if (!in.xattrlock.is_sync_and_unlocked()) | |
256 | out << " " << in.xattrlock; | |
257 | if (!in.versionlock.is_sync_and_unlocked()) | |
258 | out << " " << in.versionlock; | |
259 | ||
260 | // hack: spit out crap on which clients have caps | |
f67539c2 TL |
261 | if (in.get_inode()->client_ranges.size()) |
262 | out << " cr=" << in.get_inode()->client_ranges; | |
7c673cae FG |
263 | |
264 | if (!in.get_client_caps().empty()) { | |
265 | out << " caps={"; | |
11fdf7f2 TL |
266 | bool first = true; |
267 | for (const auto &p : in.get_client_caps()) { | |
268 | if (!first) out << ","; | |
269 | out << p.first << "=" | |
270 | << ccap_string(p.second.pending()); | |
271 | if (p.second.issued() != p.second.pending()) | |
272 | out << "/" << ccap_string(p.second.issued()); | |
273 | out << "/" << ccap_string(p.second.wanted()) | |
274 | << "@" << p.second.get_last_seq(); | |
275 | first = false; | |
7c673cae FG |
276 | } |
277 | out << "}"; | |
278 | if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) { | |
279 | out << ",l=" << in.get_loner(); | |
280 | if (in.get_loner() != in.get_wanted_loner()) | |
281 | out << "(" << in.get_wanted_loner() << ")"; | |
282 | } | |
283 | } | |
284 | if (!in.get_mds_caps_wanted().empty()) { | |
285 | out << " mcw={"; | |
94b18763 FG |
286 | bool first = true; |
287 | for (const auto &p : in.get_mds_caps_wanted()) { | |
288 | if (!first) | |
7c673cae | 289 | out << ','; |
94b18763 FG |
290 | out << p.first << '=' << ccap_string(p.second); |
291 | first = false; | |
7c673cae FG |
292 | } |
293 | out << '}'; | |
294 | } | |
295 | ||
296 | if (in.get_num_ref()) { | |
297 | out << " |"; | |
298 | in.print_pin_set(out); | |
299 | } | |
300 | ||
f67539c2 TL |
301 | if (in.get_inode()->export_pin != MDS_RANK_NONE) { |
302 | out << " export_pin=" << in.get_inode()->export_pin; | |
7c673cae | 303 | } |
f6b5b4d7 TL |
304 | if (in.state_test(CInode::STATE_DISTEPHEMERALPIN)) { |
305 | out << " distepin"; | |
306 | } | |
307 | if (in.state_test(CInode::STATE_RANDEPHEMERALPIN)) { | |
308 | out << " randepin"; | |
309 | } | |
7c673cae FG |
310 | |
311 | out << " " << ∈ | |
312 | out << "]"; | |
313 | return out; | |
314 | } | |
315 | ||
f67539c2 TL |
316 | CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l) : |
317 | mdcache(c), first(f), last(l), | |
11fdf7f2 TL |
318 | item_dirty(this), |
319 | item_caps(this), | |
320 | item_open_file(this), | |
321 | item_dirty_parent(this), | |
322 | item_dirty_dirfrag_dir(this), | |
323 | item_dirty_dirfrag_nest(this), | |
324 | item_dirty_dirfrag_dirfragtree(this), | |
325 | pop(c->decayrate), | |
326 | versionlock(this, &versionlock_type), | |
327 | authlock(this, &authlock_type), | |
328 | linklock(this, &linklock_type), | |
329 | dirfragtreelock(this, &dirfragtreelock_type), | |
330 | filelock(this, &filelock_type), | |
331 | xattrlock(this, &xattrlock_type), | |
332 | snaplock(this, &snaplock_type), | |
333 | nestlock(this, &nestlock_type), | |
334 | flocklock(this, &flocklock_type), | |
335 | policylock(this, &policylock_type) | |
336 | { | |
f67539c2 TL |
337 | if (auth) |
338 | state_set(STATE_AUTH); | |
11fdf7f2 | 339 | } |
7c673cae FG |
340 | |
341 | void CInode::print(ostream& out) | |
342 | { | |
343 | out << *this; | |
344 | } | |
345 | ||
7c673cae FG |
346 | void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client) |
347 | { | |
11fdf7f2 | 348 | dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl; |
7c673cae FG |
349 | |
350 | if (client_need_snapflush.empty()) { | |
351 | get(CInode::PIN_NEEDSNAPFLUSH); | |
352 | ||
353 | // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially | |
354 | // long periods waiting for clients to flush their snaps. | |
f67539c2 | 355 | auth_pin(this); // pin head get_inode()->.. |
7c673cae FG |
356 | } |
357 | ||
94b18763 | 358 | auto &clients = client_need_snapflush[snapid]; |
7c673cae FG |
359 | if (clients.empty()) |
360 | snapin->auth_pin(this); // ...and pin snapped/old inode! | |
361 | ||
362 | clients.insert(client); | |
363 | } | |
364 | ||
365 | void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client) | |
366 | { | |
94b18763 FG |
367 | dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl; |
368 | auto it = client_need_snapflush.find(snapid); | |
369 | if (it == client_need_snapflush.end()) { | |
7c673cae FG |
370 | dout(10) << " snapid not found" << dendl; |
371 | return; | |
372 | } | |
94b18763 FG |
373 | size_t n = it->second.erase(client); |
374 | if (n == 0) { | |
7c673cae FG |
375 | dout(10) << " client not found" << dendl; |
376 | return; | |
377 | } | |
94b18763 FG |
378 | if (it->second.empty()) { |
379 | client_need_snapflush.erase(it); | |
7c673cae FG |
380 | snapin->auth_unpin(this); |
381 | ||
382 | if (client_need_snapflush.empty()) { | |
383 | put(CInode::PIN_NEEDSNAPFLUSH); | |
384 | auth_unpin(this); | |
385 | } | |
386 | } | |
387 | } | |
388 | ||
494da23a | 389 | pair<bool,bool> CInode::split_need_snapflush(CInode *cowin, CInode *in) |
7c673cae | 390 | { |
11fdf7f2 | 391 | dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl; |
494da23a TL |
392 | bool cowin_need_flush = false; |
393 | bool orig_need_flush = false; | |
394 | auto it = client_need_snapflush.lower_bound(cowin->first); | |
395 | while (it != client_need_snapflush.end() && it->first < in->first) { | |
11fdf7f2 | 396 | ceph_assert(!it->second.empty()); |
94b18763 | 397 | if (cowin->last >= it->first) { |
7c673cae | 398 | cowin->auth_pin(this); |
494da23a | 399 | cowin_need_flush = true; |
94b18763 FG |
400 | ++it; |
401 | } else { | |
402 | it = client_need_snapflush.erase(it); | |
403 | } | |
7c673cae FG |
404 | in->auth_unpin(this); |
405 | } | |
494da23a TL |
406 | |
407 | if (it != client_need_snapflush.end() && it->first <= in->last) | |
408 | orig_need_flush = true; | |
409 | ||
410 | return make_pair(cowin_need_flush, orig_need_flush); | |
7c673cae FG |
411 | } |
412 | ||
413 | void CInode::mark_dirty_rstat() | |
414 | { | |
415 | if (!state_test(STATE_DIRTYRSTAT)) { | |
11fdf7f2 | 416 | dout(10) << __func__ << dendl; |
7c673cae FG |
417 | state_set(STATE_DIRTYRSTAT); |
418 | get(PIN_DIRTYRSTAT); | |
224ce89b WB |
419 | CDentry *pdn = get_projected_parent_dn(); |
420 | if (pdn->is_auth()) { | |
421 | CDir *pdir = pdn->dir; | |
422 | pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item); | |
423 | mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock); | |
424 | } else { | |
425 | // under cross-MDS rename. | |
426 | // DIRTYRSTAT flag will get cleared when rename finishes | |
11fdf7f2 | 427 | ceph_assert(state_test(STATE_AMBIGUOUSAUTH)); |
224ce89b | 428 | } |
7c673cae FG |
429 | } |
430 | } | |
431 | void CInode::clear_dirty_rstat() | |
432 | { | |
433 | if (state_test(STATE_DIRTYRSTAT)) { | |
11fdf7f2 | 434 | dout(10) << __func__ << dendl; |
7c673cae FG |
435 | state_clear(STATE_DIRTYRSTAT); |
436 | put(PIN_DIRTYRSTAT); | |
437 | dirty_rstat_item.remove_myself(); | |
438 | } | |
439 | } | |
440 | ||
f67539c2 TL |
441 | CInode::projected_inode CInode::project_inode(const MutationRef& mut, |
442 | bool xattr, bool snap) | |
94b18763 | 443 | { |
f67539c2 TL |
444 | if (mut && mut->is_projected(this)) { |
445 | ceph_assert(!xattr && !snap); | |
446 | auto _inode = std::const_pointer_cast<mempool_inode>(projected_nodes.back().inode); | |
447 | return projected_inode(std::move(_inode), xattr_map_ptr()); | |
448 | } | |
449 | ||
450 | auto pi = allocate_inode(*get_projected_inode()); | |
7c673cae FG |
451 | |
452 | if (scrub_infop && scrub_infop->last_scrub_dirty) { | |
f67539c2 TL |
453 | pi->last_scrub_stamp = scrub_infop->last_scrub_stamp; |
454 | pi->last_scrub_version = scrub_infop->last_scrub_version; | |
7c673cae FG |
455 | scrub_infop->last_scrub_dirty = false; |
456 | scrub_maybe_delete_info(); | |
457 | } | |
94b18763 | 458 | |
f67539c2 TL |
459 | const auto& ox = get_projected_xattrs(); |
460 | xattr_map_ptr px; | |
94b18763 | 461 | if (xattr) { |
f67539c2 TL |
462 | px = allocate_xattr_map(); |
463 | if (ox) | |
464 | *px = *ox; | |
94b18763 FG |
465 | } |
466 | ||
f67539c2 | 467 | sr_t* ps = projected_inode::UNDEF_SRNODE; |
94b18763 | 468 | if (snap) { |
f67539c2 TL |
469 | ps = prepare_new_srnode(0); |
470 | ++num_projected_srnodes; | |
94b18763 FG |
471 | } |
472 | ||
f67539c2 TL |
473 | projected_nodes.emplace_back(pi, xattr ? px : ox , ps); |
474 | if (mut) | |
475 | mut->add_projected_node(this); | |
476 | dout(15) << __func__ << " " << pi->ino << dendl; | |
477 | return projected_inode(std::move(pi), std::move(px), ps); | |
7c673cae FG |
478 | } |
479 | ||
f67539c2 | 480 | void CInode::pop_and_dirty_projected_inode(LogSegment *ls, const MutationRef& mut) |
7c673cae | 481 | { |
11fdf7f2 | 482 | ceph_assert(!projected_nodes.empty()); |
f67539c2 TL |
483 | auto front = std::move(projected_nodes.front()); |
484 | dout(15) << __func__ << " v" << front.inode->version << dendl; | |
f6b5b4d7 | 485 | |
f67539c2 TL |
486 | projected_nodes.pop_front(); |
487 | if (mut) | |
488 | mut->remove_projected_node(this); | |
7c673cae | 489 | |
f67539c2 TL |
490 | bool pool_updated = get_inode()->layout.pool_id != front.inode->layout.pool_id; |
491 | bool pin_updated = (get_inode()->export_pin != front.inode->export_pin) || | |
492 | (get_inode()->export_ephemeral_distributed_pin != | |
493 | front.inode->export_ephemeral_distributed_pin); | |
7c673cae | 494 | |
f67539c2 TL |
495 | reset_inode(std::move(front.inode)); |
496 | if (front.xattrs != get_xattrs()) | |
497 | reset_xattrs(std::move(front.xattrs)); | |
7c673cae | 498 | |
f67539c2 | 499 | if (front.snapnode != projected_inode::UNDEF_SRNODE) { |
7c673cae | 500 | --num_projected_srnodes; |
f67539c2 | 501 | pop_projected_snaprealm(front.snapnode, false); |
7c673cae FG |
502 | } |
503 | ||
f67539c2 TL |
504 | mark_dirty(ls); |
505 | if (get_inode()->is_backtrace_updated()) | |
506 | mark_dirty_parent(ls, pool_updated); | |
7c673cae | 507 | |
f67539c2 TL |
508 | if (pin_updated) |
509 | maybe_export_pin(true); | |
9f95a23c TL |
510 | } |
511 | ||
11fdf7f2 TL |
512 | sr_t *CInode::prepare_new_srnode(snapid_t snapid) |
513 | { | |
514 | const sr_t *cur_srnode = get_projected_srnode(); | |
515 | sr_t *new_srnode; | |
516 | ||
517 | if (cur_srnode) { | |
518 | new_srnode = new sr_t(*cur_srnode); | |
11fdf7f2 TL |
519 | } else { |
520 | if (snapid == 0) | |
521 | snapid = mdcache->get_global_snaprealm()->get_newest_seq(); | |
522 | new_srnode = new sr_t(); | |
523 | new_srnode->seq = snapid; | |
524 | new_srnode->created = snapid; | |
525 | new_srnode->current_parent_since = get_oldest_snap(); | |
526 | } | |
527 | return new_srnode; | |
528 | } | |
529 | ||
9f95a23c TL |
530 | const sr_t *CInode::get_projected_srnode() const { |
531 | if (num_projected_srnodes > 0) { | |
532 | for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it) | |
533 | if (it->snapnode != projected_inode::UNDEF_SRNODE) | |
534 | return it->snapnode; | |
535 | } | |
536 | if (snaprealm) | |
537 | return &snaprealm->srnode; | |
538 | else | |
539 | return NULL; | |
540 | } | |
541 | ||
11fdf7f2 TL |
542 | void CInode::project_snaprealm(sr_t *new_srnode) |
543 | { | |
544 | dout(10) << __func__ << " " << new_srnode << dendl; | |
545 | ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE); | |
546 | projected_nodes.back().snapnode = new_srnode; | |
547 | ++num_projected_srnodes; | |
548 | } | |
549 | ||
550 | void CInode::mark_snaprealm_global(sr_t *new_srnode) | |
551 | { | |
552 | ceph_assert(!is_dir()); | |
553 | // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since' | |
554 | new_srnode->last_destroyed = new_srnode->current_parent_since; | |
555 | new_srnode->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1; | |
556 | new_srnode->mark_parent_global(); | |
557 | } | |
558 | ||
559 | void CInode::clear_snaprealm_global(sr_t *new_srnode) | |
560 | { | |
561 | // restore 'current_parent_since' | |
562 | new_srnode->current_parent_since = new_srnode->last_destroyed; | |
563 | new_srnode->last_destroyed = 0; | |
564 | new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq(); | |
565 | new_srnode->clear_parent_global(); | |
566 | } | |
567 | ||
568 | bool CInode::is_projected_snaprealm_global() const | |
569 | { | |
570 | const sr_t *srnode = get_projected_srnode(); | |
571 | if (srnode && srnode->is_parent_global()) | |
572 | return true; | |
573 | return false; | |
574 | } | |
575 | ||
576 | void CInode::project_snaprealm_past_parent(SnapRealm *newparent) | |
577 | { | |
578 | sr_t *new_snap = project_snaprealm(); | |
579 | record_snaprealm_past_parent(new_snap, newparent); | |
580 | } | |
581 | ||
582 | ||
7c673cae FG |
583 | /* if newparent != parent, add parent to past_parents |
584 | if parent DNE, we need to find what the parent actually is and fill that in */ | |
11fdf7f2 | 585 | void CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent) |
7c673cae | 586 | { |
11fdf7f2 | 587 | ceph_assert(!new_snap->is_parent_global()); |
7c673cae FG |
588 | SnapRealm *oldparent; |
589 | if (!snaprealm) { | |
590 | oldparent = find_snaprealm(); | |
11fdf7f2 | 591 | } else { |
7c673cae | 592 | oldparent = snaprealm->parent; |
11fdf7f2 | 593 | } |
7c673cae FG |
594 | |
595 | if (newparent != oldparent) { | |
596 | snapid_t oldparentseq = oldparent->get_newest_seq(); | |
11fdf7f2 TL |
597 | if (oldparentseq + 1 > new_snap->current_parent_since) { |
598 | // copy old parent's snaps | |
599 | const set<snapid_t>& snaps = oldparent->get_snaps(); | |
600 | auto p = snaps.lower_bound(new_snap->current_parent_since); | |
601 | if (p != snaps.end()) | |
602 | new_snap->past_parent_snaps.insert(p, snaps.end()); | |
603 | if (oldparentseq > new_snap->seq) | |
604 | new_snap->seq = oldparentseq; | |
7c673cae | 605 | } |
11fdf7f2 | 606 | new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1; |
7c673cae FG |
607 | } |
608 | } | |
609 | ||
adb31ebb | 610 | void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *oldparent, |
11fdf7f2 | 611 | CDentry *dn, bool primary_dn) |
7c673cae | 612 | { |
11fdf7f2 | 613 | ceph_assert(new_snap->is_parent_global()); |
adb31ebb TL |
614 | |
615 | if (!oldparent) | |
616 | oldparent = dn->get_dir()->inode->find_snaprealm(); | |
11fdf7f2 TL |
617 | auto& snaps = oldparent->get_snaps(); |
618 | ||
619 | if (!primary_dn) { | |
620 | auto p = snaps.lower_bound(dn->first); | |
621 | if (p != snaps.end()) | |
622 | new_snap->past_parent_snaps.insert(p, snaps.end()); | |
adb31ebb | 623 | } else { |
11fdf7f2 TL |
624 | // 'last_destroyed' is used as 'current_parent_since' |
625 | auto p = snaps.lower_bound(new_snap->last_destroyed); | |
626 | if (p != snaps.end()) | |
627 | new_snap->past_parent_snaps.insert(p, snaps.end()); | |
628 | new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1; | |
629 | } | |
630 | } | |
7c673cae | 631 | |
11fdf7f2 TL |
632 | void CInode::early_pop_projected_snaprealm() |
633 | { | |
634 | ceph_assert(!projected_nodes.empty()); | |
635 | if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) { | |
636 | pop_projected_snaprealm(projected_nodes.front().snapnode, true); | |
637 | projected_nodes.front().snapnode = projected_inode::UNDEF_SRNODE; | |
638 | --num_projected_srnodes; | |
7c673cae | 639 | } |
11fdf7f2 | 640 | } |
7c673cae | 641 | |
11fdf7f2 TL |
642 | void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early) |
643 | { | |
644 | if (next_snaprealm) { | |
645 | dout(10) << __func__ << (early ? " (early) " : " ") | |
646 | << next_snaprealm << " seq " << next_snaprealm->seq << dendl; | |
f67539c2 | 647 | if (!snaprealm) |
11fdf7f2 | 648 | open_snaprealm(); |
11fdf7f2 | 649 | |
11fdf7f2 TL |
650 | auto old_flags = snaprealm->srnode.flags; |
651 | snaprealm->srnode = *next_snaprealm; | |
652 | delete next_snaprealm; | |
7c673cae | 653 | |
11fdf7f2 | 654 | if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) { |
11fdf7f2 TL |
655 | snaprealm->adjust_parent(); |
656 | } | |
7c673cae | 657 | |
11fdf7f2 TL |
658 | if (snaprealm->parent) |
659 | dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl; | |
660 | } else { | |
661 | dout(10) << __func__ << (early ? " (early) null" : " null") << dendl; | |
662 | ceph_assert(snaprealm); | |
663 | snaprealm->merge_to(NULL); | |
664 | } | |
7c673cae FG |
665 | } |
666 | ||
667 | ||
668 | // ====== CInode ======= | |
669 | ||
670 | // dirfrags | |
671 | ||
f67539c2 TL |
672 | InodeStoreBase::inode_const_ptr InodeStoreBase::empty_inode = InodeStoreBase::allocate_inode(); |
673 | ||
11fdf7f2 | 674 | __u32 InodeStoreBase::hash_dentry_name(std::string_view dn) |
7c673cae | 675 | { |
f67539c2 | 676 | int which = inode->dir_layout.dl_dir_hash; |
7c673cae FG |
677 | if (!which) |
678 | which = CEPH_STR_HASH_LINUX; | |
11fdf7f2 | 679 | ceph_assert(ceph_str_hash_valid(which)); |
7c673cae FG |
680 | return ceph_str_hash(which, dn.data(), dn.length()); |
681 | } | |
682 | ||
11fdf7f2 | 683 | frag_t InodeStoreBase::pick_dirfrag(std::string_view dn) |
7c673cae FG |
684 | { |
685 | if (dirfragtree.empty()) | |
686 | return frag_t(); // avoid the string hash if we can. | |
687 | ||
688 | __u32 h = hash_dentry_name(dn); | |
689 | return dirfragtree[h]; | |
690 | } | |
691 | ||
9f95a23c | 692 | std::pair<bool, std::vector<CDir*>> CInode::get_dirfrags_under(frag_t fg) |
7c673cae | 693 | { |
9f95a23c TL |
694 | std::pair<bool, std::vector<CDir*>> result; |
695 | auto& all = result.first; | |
696 | auto& dirs = result.second; | |
697 | all = false; | |
698 | ||
699 | if (auto it = dirfrags.find(fg); it != dirfrags.end()){ | |
700 | all = true; | |
701 | dirs.push_back(it->second); | |
702 | return result; | |
7c673cae | 703 | } |
9f95a23c TL |
704 | |
705 | int total = 0; | |
706 | for(auto &[_fg, _dir] : dirfrags){ | |
707 | // frag_t.bits() can indicate the depth of the partition in the directory tree | |
708 | // e.g. | |
709 | // 01* : bit = 2, on the second floor | |
710 | // * | |
711 | // 0* 1* | |
712 | // 00* 01* 10* 11* -- > level 2, bit = 2 | |
713 | // so fragA.bits > fragB.bits means fragA is deeper than fragB | |
714 | ||
715 | if (fg.bits() >= _fg.bits()) { | |
716 | if (_fg.contains(fg)) { | |
717 | all = true; | |
718 | return result; | |
719 | } | |
720 | } else { | |
721 | if (fg.contains(_fg)) { | |
722 | dirs.push_back(_dir); | |
723 | // we can calculate how many sub slices a slice can be divided into | |
724 | // frag_t(*) can be divided into two frags belonging to the first layer(0* 1*) | |
725 | // or 2^2 frags belonging to the second layer(00* 01* 10* 11*) | |
726 | // or (1 << (24 - frag_t(*).bits)) frags belonging to the 24th level | |
727 | total += 1 << (24 - _fg.bits()); | |
11fdf7f2 | 728 | } |
7c673cae | 729 | } |
94b18763 | 730 | } |
7c673cae | 731 | |
9f95a23c TL |
732 | // we convert all the frags into the frags of 24th layer to calculate whether all the frags are included in the memory cache |
733 | all = ((1<<(24-fg.bits())) == total); | |
734 | return result; | |
7c673cae FG |
735 | } |
736 | ||
737 | void CInode::verify_dirfrags() | |
738 | { | |
739 | bool bad = false; | |
94b18763 FG |
740 | for (const auto &p : dirfrags) { |
741 | if (!dirfragtree.is_leaf(p.first)) { | |
742 | dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree | |
743 | << ": " << *p.second << dendl; | |
7c673cae FG |
744 | bad = true; |
745 | } | |
746 | } | |
11fdf7f2 | 747 | ceph_assert(!bad); |
7c673cae FG |
748 | } |
749 | ||
750 | void CInode::force_dirfrags() | |
751 | { | |
752 | bool bad = false; | |
94b18763 FG |
753 | for (auto &p : dirfrags) { |
754 | if (!dirfragtree.is_leaf(p.first)) { | |
755 | dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree | |
756 | << ": " << *p.second << dendl; | |
7c673cae FG |
757 | bad = true; |
758 | } | |
759 | } | |
760 | ||
761 | if (bad) { | |
11fdf7f2 | 762 | frag_vec_t leaves; |
7c673cae | 763 | dirfragtree.get_leaves(leaves); |
11fdf7f2 TL |
764 | for (const auto& leaf : leaves) { |
765 | mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true); | |
766 | } | |
7c673cae FG |
767 | } |
768 | ||
769 | verify_dirfrags(); | |
770 | } | |
771 | ||
772 | CDir *CInode::get_approx_dirfrag(frag_t fg) | |
773 | { | |
774 | CDir *dir = get_dirfrag(fg); | |
775 | if (dir) return dir; | |
776 | ||
777 | // find a child? | |
9f95a23c TL |
778 | auto&& p = get_dirfrags_under(fg); |
779 | if (!p.second.empty()) | |
780 | return p.second.front(); | |
7c673cae FG |
781 | |
782 | // try parents? | |
783 | while (fg.bits() > 0) { | |
784 | fg = fg.parent(); | |
785 | dir = get_dirfrag(fg); | |
786 | if (dir) return dir; | |
787 | } | |
788 | return NULL; | |
789 | } | |
790 | ||
7c673cae FG |
791 | CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) |
792 | { | |
11fdf7f2 | 793 | ceph_assert(is_dir()); |
7c673cae FG |
794 | |
795 | // have it? | |
796 | CDir *dir = get_dirfrag(fg); | |
797 | if (!dir) { | |
798 | // create it. | |
11fdf7f2 | 799 | ceph_assert(is_auth() || mdcache->mds->is_any_replay()); |
7c673cae FG |
800 | dir = new CDir(this, fg, mdcache, is_auth()); |
801 | add_dirfrag(dir); | |
802 | } | |
803 | return dir; | |
804 | } | |
805 | ||
806 | CDir *CInode::add_dirfrag(CDir *dir) | |
807 | { | |
11fdf7f2 TL |
808 | auto em = dirfrags.emplace(std::piecewise_construct, std::forward_as_tuple(dir->dirfrag().frag), std::forward_as_tuple(dir)); |
809 | ceph_assert(em.second); | |
7c673cae FG |
810 | |
811 | if (stickydir_ref > 0) { | |
812 | dir->state_set(CDir::STATE_STICKY); | |
813 | dir->get(CDir::PIN_STICKY); | |
814 | } | |
815 | ||
f67539c2 | 816 | maybe_export_pin(); |
7c673cae FG |
817 | |
818 | return dir; | |
819 | } | |
820 | ||
821 | void CInode::close_dirfrag(frag_t fg) | |
822 | { | |
11fdf7f2 TL |
823 | dout(14) << __func__ << " " << fg << dendl; |
824 | ceph_assert(dirfrags.count(fg)); | |
7c673cae FG |
825 | |
826 | CDir *dir = dirfrags[fg]; | |
827 | dir->remove_null_dentries(); | |
828 | ||
829 | // clear dirty flag | |
830 | if (dir->is_dirty()) | |
831 | dir->mark_clean(); | |
832 | ||
833 | if (stickydir_ref > 0) { | |
834 | dir->state_clear(CDir::STATE_STICKY); | |
835 | dir->put(CDir::PIN_STICKY); | |
836 | } | |
1adf2230 AA |
837 | |
838 | if (dir->is_subtree_root()) | |
839 | num_subtree_roots--; | |
7c673cae FG |
840 | |
841 | // dump any remaining dentries, for debugging purposes | |
94b18763 FG |
842 | for (const auto &p : dir->items) |
843 | dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl; | |
7c673cae | 844 | |
11fdf7f2 | 845 | ceph_assert(dir->get_num_ref() == 0); |
7c673cae FG |
846 | delete dir; |
847 | dirfrags.erase(fg); | |
848 | } | |
849 | ||
850 | void CInode::close_dirfrags() | |
851 | { | |
852 | while (!dirfrags.empty()) | |
853 | close_dirfrag(dirfrags.begin()->first); | |
854 | } | |
855 | ||
856 | bool CInode::has_subtree_root_dirfrag(int auth) | |
857 | { | |
1adf2230 AA |
858 | if (num_subtree_roots > 0) { |
859 | if (auth == -1) | |
7c673cae | 860 | return true; |
1adf2230 AA |
861 | for (const auto &p : dirfrags) { |
862 | if (p.second->is_subtree_root() && | |
863 | p.second->dir_auth.first == auth) | |
864 | return true; | |
865 | } | |
94b18763 | 866 | } |
7c673cae FG |
867 | return false; |
868 | } | |
869 | ||
870 | bool CInode::has_subtree_or_exporting_dirfrag() | |
871 | { | |
1adf2230 AA |
872 | if (num_subtree_roots > 0 || num_exporting_dirs > 0) |
873 | return true; | |
7c673cae FG |
874 | return false; |
875 | } | |
876 | ||
877 | void CInode::get_stickydirs() | |
878 | { | |
879 | if (stickydir_ref == 0) { | |
880 | get(PIN_STICKYDIRS); | |
94b18763 FG |
881 | for (const auto &p : dirfrags) { |
882 | p.second->state_set(CDir::STATE_STICKY); | |
883 | p.second->get(CDir::PIN_STICKY); | |
7c673cae FG |
884 | } |
885 | } | |
886 | stickydir_ref++; | |
887 | } | |
888 | ||
889 | void CInode::put_stickydirs() | |
890 | { | |
11fdf7f2 | 891 | ceph_assert(stickydir_ref > 0); |
7c673cae FG |
892 | stickydir_ref--; |
893 | if (stickydir_ref == 0) { | |
894 | put(PIN_STICKYDIRS); | |
94b18763 FG |
895 | for (const auto &p : dirfrags) { |
896 | p.second->state_clear(CDir::STATE_STICKY); | |
897 | p.second->put(CDir::PIN_STICKY); | |
7c673cae FG |
898 | } |
899 | } | |
900 | } | |
901 | ||
902 | ||
903 | ||
904 | ||
905 | ||
906 | // pins | |
907 | ||
908 | void CInode::first_get() | |
909 | { | |
910 | // pin my dentry? | |
911 | if (parent) | |
912 | parent->get(CDentry::PIN_INODEPIN); | |
913 | } | |
914 | ||
915 | void CInode::last_put() | |
916 | { | |
917 | // unpin my dentry? | |
918 | if (parent) | |
919 | parent->put(CDentry::PIN_INODEPIN); | |
920 | } | |
921 | ||
922 | void CInode::_put() | |
923 | { | |
924 | if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent()) | |
925 | mdcache->maybe_eval_stray(this, true); | |
926 | } | |
927 | ||
928 | void CInode::add_remote_parent(CDentry *p) | |
929 | { | |
930 | if (remote_parents.empty()) | |
931 | get(PIN_REMOTEPARENT); | |
932 | remote_parents.insert(p); | |
933 | } | |
934 | void CInode::remove_remote_parent(CDentry *p) | |
935 | { | |
936 | remote_parents.erase(p); | |
937 | if (remote_parents.empty()) | |
938 | put(PIN_REMOTEPARENT); | |
939 | } | |
940 | ||
941 | ||
942 | ||
943 | ||
944 | CDir *CInode::get_parent_dir() | |
945 | { | |
946 | if (parent) | |
947 | return parent->dir; | |
948 | return NULL; | |
949 | } | |
950 | CDir *CInode::get_projected_parent_dir() | |
951 | { | |
952 | CDentry *p = get_projected_parent_dn(); | |
953 | if (p) | |
954 | return p->dir; | |
955 | return NULL; | |
956 | } | |
957 | CInode *CInode::get_parent_inode() | |
958 | { | |
959 | if (parent) | |
960 | return parent->dir->inode; | |
961 | return NULL; | |
962 | } | |
963 | ||
11fdf7f2 | 964 | bool CInode::is_ancestor_of(const CInode *other) const |
7c673cae FG |
965 | { |
966 | while (other) { | |
967 | if (other == this) | |
968 | return true; | |
11fdf7f2 TL |
969 | const CDentry *pdn = other->get_oldest_parent_dn(); |
970 | if (!pdn) { | |
971 | ceph_assert(other->is_base()); | |
7c673cae | 972 | break; |
11fdf7f2 TL |
973 | } |
974 | other = pdn->get_dir()->get_inode(); | |
975 | } | |
976 | return false; | |
977 | } | |
978 | ||
979 | bool CInode::is_projected_ancestor_of(const CInode *other) const | |
980 | { | |
981 | while (other) { | |
982 | if (other == this) | |
983 | return true; | |
984 | const CDentry *pdn = other->get_projected_parent_dn(); | |
985 | if (!pdn) { | |
986 | ceph_assert(other->is_base()); | |
987 | break; | |
988 | } | |
989 | other = pdn->get_dir()->get_inode(); | |
7c673cae FG |
990 | } |
991 | return false; | |
992 | } | |
993 | ||
994 | /* | |
995 | * Because a non-directory inode may have multiple links, the use_parent | |
996 | * argument allows selecting which parent to use for path construction. This | |
997 | * argument is only meaningful for the final component (i.e. the first of the | |
998 | * nested calls) because directories cannot have multiple hard links. If | |
999 | * use_parent is NULL and projected is true, the primary parent's projected | |
1000 | * inode is used all the way up the path chain. Otherwise the primary parent | |
1001 | * stable inode is used. | |
1002 | */ | |
1003 | void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const | |
1004 | { | |
1005 | if (!use_parent) { | |
1006 | use_parent = projected ? get_projected_parent_dn() : parent; | |
1007 | } | |
1008 | ||
1009 | if (use_parent) { | |
1010 | use_parent->make_path_string(s, projected); | |
1011 | } else if (is_root()) { | |
1012 | s = ""; | |
1013 | } else if (is_mdsdir()) { | |
1014 | char t[40]; | |
1015 | uint64_t eino(ino()); | |
1016 | eino -= MDS_INO_MDSDIR_OFFSET; | |
1017 | snprintf(t, sizeof(t), "~mds%" PRId64, eino); | |
1018 | s = t; | |
1019 | } else { | |
1020 | char n[40]; | |
1021 | uint64_t eino(ino()); | |
1022 | snprintf(n, sizeof(n), "#%" PRIx64, eino); | |
1023 | s += n; | |
1024 | } | |
1025 | } | |
1026 | ||
1027 | void CInode::make_path(filepath& fp, bool projected) const | |
1028 | { | |
1029 | const CDentry *use_parent = projected ? get_projected_parent_dn() : parent; | |
1030 | if (use_parent) { | |
11fdf7f2 | 1031 | ceph_assert(!is_base()); |
7c673cae FG |
1032 | use_parent->make_path(fp, projected); |
1033 | } else { | |
1034 | fp = filepath(ino()); | |
1035 | } | |
1036 | } | |
1037 | ||
1038 | void CInode::name_stray_dentry(string& dname) | |
1039 | { | |
1040 | char s[20]; | |
f67539c2 | 1041 | snprintf(s, sizeof(s), "%llx", (unsigned long long)ino().val); |
7c673cae FG |
1042 | dname = s; |
1043 | } | |
1044 | ||
1045 | version_t CInode::pre_dirty() | |
1046 | { | |
1047 | version_t pv; | |
1048 | CDentry* _cdentry = get_projected_parent_dn(); | |
1049 | if (_cdentry) { | |
1050 | pv = _cdentry->pre_dirty(get_projected_version()); | |
f67539c2 | 1051 | dout(10) << "pre_dirty " << pv << " (current v " << get_inode()->version << ")" << dendl; |
7c673cae | 1052 | } else { |
11fdf7f2 | 1053 | ceph_assert(is_base()); |
7c673cae FG |
1054 | pv = get_projected_version() + 1; |
1055 | } | |
94b18763 | 1056 | // force update backtrace for old format inode (see mempool_inode::decode) |
f67539c2 TL |
1057 | if (get_inode()->backtrace_version == 0 && !projected_nodes.empty()) { |
1058 | auto pi = _get_projected_inode(); | |
1059 | if (pi->backtrace_version == 0) | |
1060 | pi->update_backtrace(pv); | |
7c673cae FG |
1061 | } |
1062 | return pv; | |
1063 | } | |
1064 | ||
1065 | void CInode::_mark_dirty(LogSegment *ls) | |
1066 | { | |
1067 | if (!state_test(STATE_DIRTY)) { | |
1068 | state_set(STATE_DIRTY); | |
1069 | get(PIN_DIRTY); | |
11fdf7f2 | 1070 | ceph_assert(ls); |
7c673cae FG |
1071 | } |
1072 | ||
1073 | // move myself to this segment's dirty list | |
1074 | if (ls) | |
1075 | ls->dirty_inodes.push_back(&item_dirty); | |
1076 | } | |
1077 | ||
f67539c2 | 1078 | void CInode::mark_dirty(LogSegment *ls) { |
7c673cae | 1079 | |
11fdf7f2 | 1080 | dout(10) << __func__ << " " << *this << dendl; |
7c673cae FG |
1081 | |
1082 | /* | |
1083 | NOTE: I may already be dirty, but this fn _still_ needs to be called so that | |
1084 | the directory is (perhaps newly) dirtied, and so that parent_dir_version is | |
1085 | updated below. | |
1086 | */ | |
1087 | ||
1088 | // only auth can get dirty. "dirty" async data in replicas is relative to | |
1089 | // filelock state, not the dirty flag. | |
11fdf7f2 | 1090 | ceph_assert(is_auth()); |
7c673cae FG |
1091 | |
1092 | // touch my private version | |
7c673cae FG |
1093 | _mark_dirty(ls); |
1094 | ||
1095 | // mark dentry too | |
1096 | if (parent) | |
f67539c2 | 1097 | parent->mark_dirty(get_version(), ls); |
7c673cae FG |
1098 | } |
1099 | ||
1100 | ||
1101 | void CInode::mark_clean() | |
1102 | { | |
11fdf7f2 | 1103 | dout(10) << __func__ << " " << *this << dendl; |
7c673cae FG |
1104 | if (state_test(STATE_DIRTY)) { |
1105 | state_clear(STATE_DIRTY); | |
1106 | put(PIN_DIRTY); | |
1107 | ||
1108 | // remove myself from ls dirty list | |
1109 | item_dirty.remove_myself(); | |
1110 | } | |
1111 | } | |
1112 | ||
1113 | ||
1114 | // -------------- | |
1115 | // per-inode storage | |
1116 | // (currently for root inode only) | |
1117 | ||
1118 | struct C_IO_Inode_Stored : public CInodeIOContext { | |
1119 | version_t version; | |
1120 | Context *fin; | |
1121 | C_IO_Inode_Stored(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {} | |
1122 | void finish(int r) override { | |
1123 | in->_stored(r, version, fin); | |
1124 | } | |
91327a77 AA |
1125 | void print(ostream& out) const override { |
1126 | out << "inode_store(" << in->ino() << ")"; | |
1127 | } | |
7c673cae FG |
1128 | }; |
1129 | ||
11fdf7f2 | 1130 | object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix) |
7c673cae FG |
1131 | { |
1132 | char n[60]; | |
11fdf7f2 TL |
1133 | snprintf(n, sizeof(n), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)fg); |
1134 | ceph_assert(strlen(n) + suffix.size() < sizeof n); | |
1135 | strncat(n, suffix.data(), suffix.size()); | |
7c673cae FG |
1136 | return object_t(n); |
1137 | } | |
1138 | ||
11fdf7f2 | 1139 | void CInode::store(MDSContext *fin) |
7c673cae | 1140 | { |
11fdf7f2 TL |
1141 | dout(10) << __func__ << " " << get_version() << dendl; |
1142 | ceph_assert(is_base()); | |
7c673cae FG |
1143 | |
1144 | if (snaprealm) | |
1145 | purge_stale_snap_data(snaprealm->get_snaps()); | |
1146 | ||
1147 | // encode | |
1148 | bufferlist bl; | |
1149 | string magic = CEPH_FS_ONDISK_MAGIC; | |
11fdf7f2 TL |
1150 | using ceph::encode; |
1151 | encode(magic, bl); | |
7c673cae FG |
1152 | encode_store(bl, mdcache->mds->mdsmap->get_up_features()); |
1153 | ||
1154 | // write it. | |
1155 | SnapContext snapc; | |
1156 | ObjectOperation m; | |
1157 | m.write_full(bl); | |
1158 | ||
1159 | object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode"); | |
b3b6e05e | 1160 | object_locator_t oloc(mdcache->mds->get_metadata_pool()); |
7c673cae FG |
1161 | |
1162 | Context *newfin = | |
1163 | new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin), | |
1164 | mdcache->mds->finisher); | |
1165 | mdcache->mds->objecter->mutate(oid, oloc, m, snapc, | |
1166 | ceph::real_clock::now(), 0, | |
1167 | newfin); | |
1168 | } | |
1169 | ||
1170 | void CInode::_stored(int r, version_t v, Context *fin) | |
1171 | { | |
1172 | if (r < 0) { | |
1173 | dout(1) << "store error " << r << " v " << v << " on " << *this << dendl; | |
d2e6a577 FG |
1174 | mdcache->mds->clog->error() << "failed to store inode " << ino() |
1175 | << " object: " << cpp_strerror(r); | |
7c673cae FG |
1176 | mdcache->mds->handle_write_error(r); |
1177 | fin->complete(r); | |
1178 | return; | |
1179 | } | |
1180 | ||
11fdf7f2 | 1181 | dout(10) << __func__ << " " << v << " on " << *this << dendl; |
7c673cae FG |
1182 | if (v == get_projected_version()) |
1183 | mark_clean(); | |
1184 | ||
1185 | fin->complete(0); | |
1186 | } | |
1187 | ||
11fdf7f2 | 1188 | void CInode::flush(MDSContext *fin) |
7c673cae | 1189 | { |
11fdf7f2 TL |
1190 | dout(10) << __func__ << " " << *this << dendl; |
1191 | ceph_assert(is_auth() && can_auth_pin()); | |
7c673cae FG |
1192 | |
1193 | MDSGatherBuilder gather(g_ceph_context); | |
1194 | ||
1195 | if (is_dirty_parent()) { | |
1196 | store_backtrace(gather.new_sub()); | |
1197 | } | |
1198 | if (is_dirty()) { | |
1199 | if (is_base()) { | |
1200 | store(gather.new_sub()); | |
1201 | } else { | |
1202 | parent->dir->commit(0, gather.new_sub()); | |
1203 | } | |
1204 | } | |
1205 | ||
1206 | if (gather.has_subs()) { | |
1207 | gather.set_finisher(fin); | |
1208 | gather.activate(); | |
1209 | } else { | |
1210 | fin->complete(0); | |
1211 | } | |
1212 | } | |
1213 | ||
1214 | struct C_IO_Inode_Fetched : public CInodeIOContext { | |
1215 | bufferlist bl, bl2; | |
1216 | Context *fin; | |
1217 | C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {} | |
1218 | void finish(int r) override { | |
f67539c2 | 1219 | // Ignore 'r', because we fetch from two places, so r is usually CEPHFS_ENOENT |
7c673cae FG |
1220 | in->_fetched(bl, bl2, fin); |
1221 | } | |
91327a77 AA |
1222 | void print(ostream& out) const override { |
1223 | out << "inode_fetch(" << in->ino() << ")"; | |
1224 | } | |
7c673cae FG |
1225 | }; |
1226 | ||
11fdf7f2 | 1227 | void CInode::fetch(MDSContext *fin) |
7c673cae | 1228 | { |
11fdf7f2 | 1229 | dout(10) << __func__ << dendl; |
7c673cae FG |
1230 | |
1231 | C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin); | |
1232 | C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, mdcache->mds->finisher)); | |
1233 | ||
1234 | object_t oid = CInode::get_object_name(ino(), frag_t(), ""); | |
b3b6e05e | 1235 | object_locator_t oloc(mdcache->mds->get_metadata_pool()); |
7c673cae FG |
1236 | |
1237 | // Old on-disk format: inode stored in xattr of a dirfrag | |
1238 | ObjectOperation rd; | |
1239 | rd.getxattr("inode", &c->bl, NULL); | |
1240 | mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub()); | |
1241 | ||
1242 | // Current on-disk format: inode stored in a .inode object | |
1243 | object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode"); | |
1244 | mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub()); | |
1245 | ||
1246 | gather.activate(); | |
1247 | } | |
1248 | ||
1249 | void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin) | |
1250 | { | |
11fdf7f2 TL |
1251 | dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl; |
1252 | bufferlist::const_iterator p; | |
7c673cae | 1253 | if (bl2.length()) { |
11fdf7f2 | 1254 | p = bl2.cbegin(); |
7c673cae | 1255 | } else if (bl.length()) { |
11fdf7f2 | 1256 | p = bl.cbegin(); |
7c673cae | 1257 | } else { |
d2e6a577 | 1258 | derr << "No data while reading inode " << ino() << dendl; |
f67539c2 | 1259 | fin->complete(-CEPHFS_ENOENT); |
7c673cae FG |
1260 | return; |
1261 | } | |
1262 | ||
11fdf7f2 | 1263 | using ceph::decode; |
7c673cae FG |
1264 | // Attempt decode |
1265 | try { | |
1266 | string magic; | |
11fdf7f2 | 1267 | decode(magic, p); |
7c673cae FG |
1268 | dout(10) << " magic is '" << magic << "' (expecting '" |
1269 | << CEPH_FS_ONDISK_MAGIC << "')" << dendl; | |
1270 | if (magic != CEPH_FS_ONDISK_MAGIC) { | |
1271 | dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC | |
1272 | << "'" << dendl; | |
f67539c2 | 1273 | fin->complete(-CEPHFS_EINVAL); |
7c673cae FG |
1274 | } else { |
1275 | decode_store(p); | |
1276 | dout(10) << "_fetched " << *this << dendl; | |
1277 | fin->complete(0); | |
1278 | } | |
1279 | } catch (buffer::error &err) { | |
f67539c2 TL |
1280 | derr << "Corrupt inode " << ino() << ": " << err.what() << dendl; |
1281 | fin->complete(-CEPHFS_EINVAL); | |
7c673cae FG |
1282 | return; |
1283 | } | |
1284 | } | |
1285 | ||
1286 | void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt) | |
1287 | { | |
f67539c2 | 1288 | bt.ino = ino(); |
7c673cae FG |
1289 | bt.ancestors.clear(); |
1290 | bt.pool = pool; | |
1291 | ||
1292 | CInode *in = this; | |
1293 | CDentry *pdn = get_parent_dn(); | |
1294 | while (pdn) { | |
1295 | CInode *diri = pdn->get_dir()->get_inode(); | |
f67539c2 | 1296 | bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), in->get_inode()->version)); |
7c673cae FG |
1297 | in = diri; |
1298 | pdn = in->get_parent_dn(); | |
1299 | } | |
f67539c2 TL |
1300 | bt.old_pools.reserve(get_inode()->old_pools.size()); |
1301 | for (auto &p : get_inode()->old_pools) { | |
7c673cae | 1302 | // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0) |
94b18763 | 1303 | if (p != pool) |
f67539c2 | 1304 | bt.old_pools.push_back(p); |
7c673cae FG |
1305 | } |
1306 | } | |
1307 | ||
1308 | struct C_IO_Inode_StoredBacktrace : public CInodeIOContext { | |
1309 | version_t version; | |
1310 | Context *fin; | |
1311 | C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {} | |
1312 | void finish(int r) override { | |
1313 | in->_stored_backtrace(r, version, fin); | |
1314 | } | |
91327a77 AA |
1315 | void print(ostream& out) const override { |
1316 | out << "backtrace_store(" << in->ino() << ")"; | |
1317 | } | |
7c673cae FG |
1318 | }; |
1319 | ||
f67539c2 TL |
1320 | |
1321 | void CInode::_commit_ops(int r, C_GatherBuilder &gather_bld, | |
1322 | std::vector<CInodeCommitOperation> &ops_vec, | |
1323 | inode_backtrace_t &bt) | |
1324 | { | |
1325 | dout(10) << __func__ << dendl; | |
1326 | ||
1327 | if (r < 0) { | |
1328 | mdcache->mds->handle_write_error_with_lock(r); | |
1329 | return; | |
1330 | } | |
1331 | ||
1332 | SnapContext snapc; | |
1333 | object_t oid = get_object_name(ino(), frag_t(), ""); | |
1334 | ||
1335 | for (auto &op : ops_vec) { | |
1336 | ObjectOperation obj_op; | |
1337 | object_locator_t oloc(op.get_pool()); | |
1338 | op.update(obj_op, bt); | |
1339 | mdcache->mds->objecter->mutate(oid, oloc, obj_op, snapc, | |
1340 | ceph::real_clock::now(), | |
1341 | 0, gather_bld.new_sub()); | |
1342 | } | |
1343 | } | |
1344 | ||
1345 | void CInode::_store_backtrace(std::vector<CInodeCommitOperation> &ops_vec, | |
1346 | inode_backtrace_t &bt, int op_prio) | |
7c673cae | 1347 | { |
11fdf7f2 TL |
1348 | dout(10) << __func__ << " on " << *this << dendl; |
1349 | ceph_assert(is_dirty_parent()); | |
7c673cae FG |
1350 | |
1351 | if (op_prio < 0) | |
1352 | op_prio = CEPH_MSG_PRIO_DEFAULT; | |
1353 | ||
1354 | auth_pin(this); | |
1355 | ||
1356 | const int64_t pool = get_backtrace_pool(); | |
7c673cae | 1357 | build_backtrace(pool, bt); |
7c673cae | 1358 | |
20effc67 TL |
1359 | std::string_view slink = ""; |
1360 | if (is_symlink() && mdcache->get_symlink_recovery()) { | |
1361 | slink = symlink; | |
1362 | } | |
1363 | ||
f67539c2 | 1364 | ops_vec.emplace_back(op_prio, pool, get_inode()->layout, |
20effc67 | 1365 | mdcache->mds->mdsmap->get_up_features(), slink); |
7c673cae | 1366 | |
f67539c2 | 1367 | if (!state_test(STATE_DIRTYPOOL) || get_inode()->old_pools.empty()) { |
7c673cae | 1368 | dout(20) << __func__ << ": no dirtypool or no old pools" << dendl; |
7c673cae FG |
1369 | return; |
1370 | } | |
1371 | ||
7c673cae FG |
1372 | // In the case where DIRTYPOOL is set, we update all old pools backtraces |
1373 | // such that anyone reading them will see the new pool ID in | |
1374 | // inode_backtrace_t::pool and go read everything else from there. | |
f67539c2 | 1375 | for (const auto &p : get_inode()->old_pools) { |
94b18763 | 1376 | if (p == pool) |
7c673cae FG |
1377 | continue; |
1378 | ||
94b18763 | 1379 | dout(20) << __func__ << ": updating old pool " << p << dendl; |
7c673cae | 1380 | |
f67539c2 | 1381 | ops_vec.emplace_back(op_prio, p); |
7c673cae | 1382 | } |
f67539c2 TL |
1383 | } |
1384 | ||
1385 | void CInode::store_backtrace(MDSContext *fin, int op_prio) | |
1386 | { | |
1387 | std::vector<CInodeCommitOperation> ops_vec; | |
1388 | inode_backtrace_t bt; | |
1389 | auto version = get_inode()->backtrace_version; | |
1390 | ||
1391 | _store_backtrace(ops_vec, bt, op_prio); | |
1392 | ||
1393 | C_GatherBuilder gather(g_ceph_context, | |
1394 | new C_OnFinisher( | |
1395 | new C_IO_Inode_StoredBacktrace(this, version, fin), | |
1396 | mdcache->mds->finisher)); | |
1397 | _commit_ops(0, gather, ops_vec, bt); | |
1398 | ceph_assert(gather.has_subs()); | |
7c673cae FG |
1399 | gather.activate(); |
1400 | } | |
1401 | ||
f67539c2 TL |
1402 | void CInode::store_backtrace(CInodeCommitOperations &op, int op_prio) |
1403 | { | |
1404 | op.version = get_inode()->backtrace_version; | |
1405 | op.in = this; | |
1406 | ||
1407 | _store_backtrace(op.ops_vec, op.bt, op_prio); | |
1408 | } | |
1409 | ||
7c673cae FG |
1410 | void CInode::_stored_backtrace(int r, version_t v, Context *fin) |
1411 | { | |
f67539c2 | 1412 | if (r == -CEPHFS_ENOENT) { |
7c673cae FG |
1413 | const int64_t pool = get_backtrace_pool(); |
1414 | bool exists = mdcache->mds->objecter->with_osdmap( | |
1415 | [pool](const OSDMap &osd_map) { | |
1416 | return osd_map.have_pg_pool(pool); | |
1417 | }); | |
1418 | ||
f67539c2 | 1419 | // This CEPHFS_ENOENT is because the pool doesn't exist (the user deleted it |
7c673cae FG |
1420 | // out from under us), so the backtrace can never be written, so pretend |
1421 | // to succeed so that the user can proceed to e.g. delete the file. | |
1422 | if (!exists) { | |
f67539c2 | 1423 | dout(4) << __func__ << " got CEPHFS_ENOENT: a data pool was deleted " |
7c673cae FG |
1424 | "beneath us!" << dendl; |
1425 | r = 0; | |
1426 | } | |
1427 | } | |
1428 | ||
1429 | if (r < 0) { | |
1430 | dout(1) << "store backtrace error " << r << " v " << v << dendl; | |
1431 | mdcache->mds->clog->error() << "failed to store backtrace on ino " | |
1432 | << ino() << " object" | |
1433 | << ", pool " << get_backtrace_pool() | |
1434 | << ", errno " << r; | |
1435 | mdcache->mds->handle_write_error(r); | |
1436 | if (fin) | |
1437 | fin->complete(r); | |
1438 | return; | |
1439 | } | |
1440 | ||
11fdf7f2 | 1441 | dout(10) << __func__ << " v " << v << dendl; |
7c673cae FG |
1442 | |
1443 | auth_unpin(this); | |
f67539c2 | 1444 | if (v == get_inode()->backtrace_version) |
7c673cae FG |
1445 | clear_dirty_parent(); |
1446 | if (fin) | |
1447 | fin->complete(0); | |
1448 | } | |
1449 | ||
1450 | void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace) | |
1451 | { | |
f67539c2 | 1452 | mdcache->fetch_backtrace(ino(), get_backtrace_pool(), *backtrace, fin); |
7c673cae FG |
1453 | } |
1454 | ||
28e407b8 | 1455 | void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool) |
7c673cae FG |
1456 | { |
1457 | if (!state_test(STATE_DIRTYPARENT)) { | |
11fdf7f2 | 1458 | dout(10) << __func__ << dendl; |
7c673cae FG |
1459 | state_set(STATE_DIRTYPARENT); |
1460 | get(PIN_DIRTYPARENT); | |
11fdf7f2 | 1461 | ceph_assert(ls); |
7c673cae FG |
1462 | } |
1463 | if (dirty_pool) | |
1464 | state_set(STATE_DIRTYPOOL); | |
1465 | if (ls) | |
1466 | ls->dirty_parent_inodes.push_back(&item_dirty_parent); | |
1467 | } | |
1468 | ||
1469 | void CInode::clear_dirty_parent() | |
1470 | { | |
1471 | if (state_test(STATE_DIRTYPARENT)) { | |
11fdf7f2 | 1472 | dout(10) << __func__ << dendl; |
7c673cae FG |
1473 | state_clear(STATE_DIRTYPARENT); |
1474 | state_clear(STATE_DIRTYPOOL); | |
1475 | put(PIN_DIRTYPARENT); | |
1476 | item_dirty_parent.remove_myself(); | |
1477 | } | |
1478 | } | |
1479 | ||
1480 | void CInode::verify_diri_backtrace(bufferlist &bl, int err) | |
1481 | { | |
1482 | if (is_base() || is_dirty_parent() || !is_auth()) | |
1483 | return; | |
1484 | ||
11fdf7f2 | 1485 | dout(10) << __func__ << dendl; |
7c673cae FG |
1486 | |
1487 | if (err == 0) { | |
1488 | inode_backtrace_t backtrace; | |
11fdf7f2 TL |
1489 | using ceph::decode; |
1490 | decode(backtrace, bl); | |
7c673cae FG |
1491 | CDentry *pdn = get_parent_dn(); |
1492 | if (backtrace.ancestors.empty() || | |
94b18763 | 1493 | backtrace.ancestors[0].dname != pdn->get_name() || |
7c673cae | 1494 | backtrace.ancestors[0].dirino != pdn->get_dir()->ino()) |
f67539c2 | 1495 | err = -CEPHFS_EINVAL; |
7c673cae FG |
1496 | } |
1497 | ||
1498 | if (err) { | |
1499 | MDSRank *mds = mdcache->mds; | |
d2e6a577 | 1500 | mds->clog->error() << "bad backtrace on directory inode " << ino(); |
11fdf7f2 | 1501 | ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1)); |
7c673cae | 1502 | |
28e407b8 | 1503 | mark_dirty_parent(mds->mdlog->get_current_segment(), false); |
7c673cae FG |
1504 | mds->mdlog->flush(); |
1505 | } | |
1506 | } | |
1507 | ||
1508 | // ------------------ | |
1509 | // parent dir | |
1510 | ||
1511 | ||
f67539c2 TL |
1512 | void InodeStoreBase::encode_xattrs(bufferlist &bl) const { |
1513 | using ceph::encode; | |
1514 | if (xattrs) | |
1515 | encode(*xattrs, bl); | |
1516 | else | |
1517 | encode((__u32)0, bl); | |
1518 | } | |
1519 | ||
1520 | void InodeStoreBase::decode_xattrs(bufferlist::const_iterator &p) { | |
1521 | using ceph::decode; | |
1522 | mempool_xattr_map tmp; | |
1523 | decode_noshare(tmp, p); | |
1524 | if (tmp.empty()) { | |
1525 | reset_xattrs(xattr_map_ptr()); | |
1526 | } else { | |
1527 | reset_xattrs(allocate_xattr_map(std::move(tmp))); | |
1528 | } | |
1529 | } | |
1530 | ||
1531 | void InodeStoreBase::encode_old_inodes(bufferlist &bl, uint64_t features) const { | |
1532 | using ceph::encode; | |
1533 | if (old_inodes) | |
1534 | encode(*old_inodes, bl, features); | |
1535 | else | |
1536 | encode((__u32)0, bl); | |
1537 | } | |
1538 | ||
1539 | void InodeStoreBase::decode_old_inodes(bufferlist::const_iterator &p) { | |
1540 | using ceph::decode; | |
1541 | mempool_old_inode_map tmp; | |
1542 | decode(tmp, p); | |
1543 | if (tmp.empty()) { | |
1544 | reset_old_inodes(old_inode_map_ptr()); | |
1545 | } else { | |
1546 | reset_old_inodes(allocate_old_inode_map(std::move(tmp))); | |
1547 | } | |
1548 | } | |
1549 | ||
7c673cae FG |
1550 | void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features, |
1551 | const bufferlist *snap_blob) const | |
1552 | { | |
11fdf7f2 | 1553 | using ceph::encode; |
f67539c2 TL |
1554 | encode(*inode, bl, features); |
1555 | if (inode->is_symlink()) | |
11fdf7f2 TL |
1556 | encode(symlink, bl); |
1557 | encode(dirfragtree, bl); | |
f67539c2 TL |
1558 | encode_xattrs(bl); |
1559 | ||
7c673cae | 1560 | if (snap_blob) |
11fdf7f2 | 1561 | encode(*snap_blob, bl); |
7c673cae | 1562 | else |
11fdf7f2 | 1563 | encode(bufferlist(), bl); |
f67539c2 | 1564 | encode_old_inodes(bl, features); |
11fdf7f2 TL |
1565 | encode(oldest_snap, bl); |
1566 | encode(damage_flags, bl); | |
7c673cae FG |
1567 | } |
1568 | ||
1569 | void InodeStoreBase::encode(bufferlist &bl, uint64_t features, | |
1570 | const bufferlist *snap_blob) const | |
1571 | { | |
1572 | ENCODE_START(6, 4, bl); | |
1573 | encode_bare(bl, features, snap_blob); | |
1574 | ENCODE_FINISH(bl); | |
1575 | } | |
1576 | ||
1577 | void CInode::encode_store(bufferlist& bl, uint64_t features) | |
1578 | { | |
1579 | bufferlist snap_blob; | |
1580 | encode_snap_blob(snap_blob); | |
1581 | InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(), | |
1582 | &snap_blob); | |
1583 | } | |
1584 | ||
11fdf7f2 | 1585 | void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl, |
7c673cae FG |
1586 | bufferlist& snap_blob, __u8 struct_v) |
1587 | { | |
11fdf7f2 | 1588 | using ceph::decode; |
f67539c2 TL |
1589 | |
1590 | auto _inode = allocate_inode(); | |
1591 | decode(*_inode, bl); | |
1592 | ||
1593 | if (_inode->is_symlink()) { | |
94b18763 | 1594 | std::string tmp; |
11fdf7f2 TL |
1595 | decode(tmp, bl); |
1596 | symlink = std::string_view(tmp); | |
94b18763 | 1597 | } |
11fdf7f2 | 1598 | decode(dirfragtree, bl); |
f67539c2 | 1599 | decode_xattrs(bl); |
11fdf7f2 | 1600 | decode(snap_blob, bl); |
7c673cae | 1601 | |
f67539c2 TL |
1602 | decode_old_inodes(bl); |
1603 | if (struct_v == 2 && _inode->is_dir()) { | |
7c673cae | 1604 | bool default_layout_exists; |
11fdf7f2 | 1605 | decode(default_layout_exists, bl); |
7c673cae | 1606 | if (default_layout_exists) { |
11fdf7f2 | 1607 | decode(struct_v, bl); // this was a default_file_layout |
f67539c2 | 1608 | decode(_inode->layout, bl); // but we only care about the layout portion |
7c673cae FG |
1609 | } |
1610 | } | |
1611 | ||
1612 | if (struct_v >= 5) { | |
1613 | // InodeStore is embedded in dentries without proper versioning, so | |
1614 | // we consume up to the end of the buffer | |
1615 | if (!bl.end()) { | |
11fdf7f2 | 1616 | decode(oldest_snap, bl); |
7c673cae FG |
1617 | } |
1618 | ||
1619 | if (!bl.end()) { | |
11fdf7f2 | 1620 | decode(damage_flags, bl); |
7c673cae FG |
1621 | } |
1622 | } | |
f67539c2 TL |
1623 | |
1624 | reset_inode(std::move(_inode)); | |
7c673cae FG |
1625 | } |
1626 | ||
1627 | ||
11fdf7f2 | 1628 | void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob) |
7c673cae FG |
1629 | { |
1630 | DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl); | |
1631 | decode_bare(bl, snap_blob, struct_v); | |
1632 | DECODE_FINISH(bl); | |
1633 | } | |
1634 | ||
11fdf7f2 | 1635 | void CInode::decode_store(bufferlist::const_iterator& bl) |
7c673cae FG |
1636 | { |
1637 | bufferlist snap_blob; | |
1638 | InodeStoreBase::decode(bl, snap_blob); | |
1639 | decode_snap_blob(snap_blob); | |
1640 | } | |
1641 | ||
1642 | // ------------------ | |
1643 | // locking | |
1644 | ||
9f95a23c TL |
1645 | SimpleLock* CInode::get_lock(int type) |
1646 | { | |
1647 | switch (type) { | |
1648 | case CEPH_LOCK_IVERSION: return &versionlock; | |
1649 | case CEPH_LOCK_IFILE: return &filelock; | |
1650 | case CEPH_LOCK_IAUTH: return &authlock; | |
1651 | case CEPH_LOCK_ILINK: return &linklock; | |
1652 | case CEPH_LOCK_IDFT: return &dirfragtreelock; | |
1653 | case CEPH_LOCK_IXATTR: return &xattrlock; | |
1654 | case CEPH_LOCK_ISNAP: return &snaplock; | |
1655 | case CEPH_LOCK_INEST: return &nestlock; | |
1656 | case CEPH_LOCK_IFLOCK: return &flocklock; | |
1657 | case CEPH_LOCK_IPOLICY: return &policylock; | |
1658 | } | |
1659 | return 0; | |
1660 | } | |
1661 | ||
7c673cae FG |
1662 | void CInode::set_object_info(MDSCacheObjectInfo &info) |
1663 | { | |
1664 | info.ino = ino(); | |
1665 | info.snapid = last; | |
1666 | } | |
1667 | ||
9f95a23c | 1668 | void CInode::encode_lock_iauth(bufferlist& bl) |
7c673cae | 1669 | { |
9f95a23c | 1670 | ENCODE_START(1, 1, bl); |
f67539c2 TL |
1671 | encode(get_inode()->version, bl); |
1672 | encode(get_inode()->ctime, bl); | |
1673 | encode(get_inode()->mode, bl); | |
1674 | encode(get_inode()->uid, bl); | |
1675 | encode(get_inode()->gid, bl); | |
9f95a23c TL |
1676 | ENCODE_FINISH(bl); |
1677 | } | |
7c673cae | 1678 | |
9f95a23c TL |
1679 | void CInode::decode_lock_iauth(bufferlist::const_iterator& p) |
1680 | { | |
f67539c2 TL |
1681 | ceph_assert(!is_auth()); |
1682 | auto _inode = allocate_inode(*get_inode()); | |
9f95a23c | 1683 | DECODE_START(1, p); |
f67539c2 | 1684 | decode(_inode->version, p); |
9f95a23c TL |
1685 | utime_t tm; |
1686 | decode(tm, p); | |
f67539c2 TL |
1687 | if (_inode->ctime < tm) _inode->ctime = tm; |
1688 | decode(_inode->mode, p); | |
1689 | decode(_inode->uid, p); | |
1690 | decode(_inode->gid, p); | |
9f95a23c | 1691 | DECODE_FINISH(p); |
f67539c2 | 1692 | reset_inode(std::move(_inode)); |
9f95a23c TL |
1693 | } |
1694 | ||
1695 | void CInode::encode_lock_ilink(bufferlist& bl) | |
1696 | { | |
1697 | ENCODE_START(1, 1, bl); | |
f67539c2 TL |
1698 | encode(get_inode()->version, bl); |
1699 | encode(get_inode()->ctime, bl); | |
1700 | encode(get_inode()->nlink, bl); | |
9f95a23c TL |
1701 | ENCODE_FINISH(bl); |
1702 | } | |
1703 | ||
1704 | void CInode::decode_lock_ilink(bufferlist::const_iterator& p) | |
1705 | { | |
f67539c2 TL |
1706 | ceph_assert(!is_auth()); |
1707 | auto _inode = allocate_inode(*get_inode()); | |
9f95a23c | 1708 | DECODE_START(1, p); |
f67539c2 | 1709 | decode(_inode->version, p); |
9f95a23c TL |
1710 | utime_t tm; |
1711 | decode(tm, p); | |
f67539c2 TL |
1712 | if (_inode->ctime < tm) _inode->ctime = tm; |
1713 | decode(_inode->nlink, p); | |
9f95a23c | 1714 | DECODE_FINISH(p); |
f67539c2 | 1715 | reset_inode(std::move(_inode)); |
9f95a23c TL |
1716 | } |
1717 | ||
1718 | void CInode::encode_lock_idft(bufferlist& bl) | |
1719 | { | |
1720 | ENCODE_START(1, 1, bl); | |
1721 | if (is_auth()) { | |
f67539c2 | 1722 | encode(get_inode()->version, bl); |
9f95a23c TL |
1723 | } else { |
1724 | // treat flushing as dirty when rejoining cache | |
1725 | bool dirty = dirfragtreelock.is_dirty_or_flushing(); | |
1726 | encode(dirty, bl); | |
1727 | } | |
1728 | { | |
1729 | // encode the raw tree | |
1730 | encode(dirfragtree, bl); | |
1731 | ||
1732 | // also specify which frags are mine | |
1733 | set<frag_t> myfrags; | |
1734 | auto&& dfls = get_dirfrags(); | |
1735 | for (const auto& dir : dfls) { | |
1736 | if (dir->is_auth()) { | |
1737 | frag_t fg = dir->get_frag(); | |
1738 | myfrags.insert(fg); | |
1739 | } | |
1740 | } | |
1741 | encode(myfrags, bl); | |
1742 | } | |
1743 | ENCODE_FINISH(bl); | |
1744 | } | |
1745 | ||
1746 | void CInode::decode_lock_idft(bufferlist::const_iterator& p) | |
1747 | { | |
f67539c2 TL |
1748 | inode_ptr _inode; |
1749 | ||
9f95a23c TL |
1750 | DECODE_START(1, p); |
1751 | if (is_auth()) { | |
1752 | bool replica_dirty; | |
1753 | decode(replica_dirty, p); | |
1754 | if (replica_dirty) { | |
1755 | dout(10) << __func__ << " setting dftlock dirty flag" << dendl; | |
1756 | dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle | |
1757 | } | |
1758 | } else { | |
f67539c2 TL |
1759 | _inode = allocate_inode(*get_inode()); |
1760 | decode(_inode->version, p); | |
9f95a23c TL |
1761 | } |
1762 | { | |
1763 | fragtree_t temp; | |
1764 | decode(temp, p); | |
1765 | set<frag_t> authfrags; | |
1766 | decode(authfrags, p); | |
7c673cae | 1767 | if (is_auth()) { |
9f95a23c TL |
1768 | // auth. believe replica's auth frags only. |
1769 | for (auto fg : authfrags) { | |
1770 | if (!dirfragtree.is_leaf(fg)) { | |
1771 | dout(10) << " forcing frag " << fg << " to leaf (split|merge)" << dendl; | |
1772 | dirfragtree.force_to_leaf(g_ceph_context, fg); | |
1773 | dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle | |
1774 | } | |
1775 | } | |
7c673cae | 1776 | } else { |
9f95a23c TL |
1777 | // replica. take the new tree, BUT make sure any open |
1778 | // dirfrags remain leaves (they may have split _after_ this | |
1779 | // dft was scattered, or we may still be be waiting on the | |
1780 | // notify from the auth) | |
1781 | dirfragtree.swap(temp); | |
1782 | for (const auto &p : dirfrags) { | |
1783 | if (!dirfragtree.is_leaf(p.first)) { | |
1784 | dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl; | |
1785 | dirfragtree.force_to_leaf(g_ceph_context, p.first); | |
1786 | } | |
1787 | if (p.second->is_auth()) | |
1788 | p.second->state_clear(CDir::STATE_DIRTYDFT); | |
1789 | } | |
7c673cae | 1790 | } |
9f95a23c TL |
1791 | if (g_conf()->mds_debug_frag) |
1792 | verify_dirfrags(); | |
1793 | } | |
1794 | DECODE_FINISH(p); | |
f67539c2 TL |
1795 | |
1796 | if (_inode) | |
1797 | reset_inode(std::move(_inode)); | |
9f95a23c TL |
1798 | } |
1799 | ||
1800 | void CInode::encode_lock_ifile(bufferlist& bl) | |
1801 | { | |
1802 | ENCODE_START(1, 1, bl); | |
1803 | if (is_auth()) { | |
f67539c2 TL |
1804 | encode(get_inode()->version, bl); |
1805 | encode(get_inode()->ctime, bl); | |
1806 | encode(get_inode()->mtime, bl); | |
1807 | encode(get_inode()->atime, bl); | |
1808 | encode(get_inode()->time_warp_seq, bl); | |
9f95a23c | 1809 | if (!is_dir()) { |
f67539c2 TL |
1810 | encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features()); |
1811 | encode(get_inode()->size, bl); | |
1812 | encode(get_inode()->truncate_seq, bl); | |
1813 | encode(get_inode()->truncate_size, bl); | |
1814 | encode(get_inode()->client_ranges, bl); | |
1815 | encode(get_inode()->inline_data, bl); | |
9f95a23c TL |
1816 | } |
1817 | } else { | |
1818 | // treat flushing as dirty when rejoining cache | |
1819 | bool dirty = filelock.is_dirty_or_flushing(); | |
1820 | encode(dirty, bl); | |
1821 | } | |
f67539c2 TL |
1822 | dout(15) << __func__ << " inode.dirstat is " << get_inode()->dirstat << dendl; |
1823 | encode(get_inode()->dirstat, bl); // only meaningful if i am auth. | |
9f95a23c TL |
1824 | bufferlist tmp; |
1825 | __u32 n = 0; | |
1826 | for (const auto &p : dirfrags) { | |
1827 | frag_t fg = p.first; | |
1828 | CDir *dir = p.second; | |
1829 | if (is_auth() || dir->is_auth()) { | |
f67539c2 | 1830 | const auto& pf = dir->get_projected_fnode(); |
9f95a23c TL |
1831 | dout(15) << fg << " " << *dir << dendl; |
1832 | dout(20) << fg << " fragstat " << pf->fragstat << dendl; | |
1833 | dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl; | |
1834 | encode(fg, tmp); | |
1835 | encode(dir->first, tmp); | |
1836 | encode(pf->fragstat, tmp); | |
1837 | encode(pf->accounted_fragstat, tmp); | |
1838 | n++; | |
7c673cae | 1839 | } |
9f95a23c TL |
1840 | } |
1841 | encode(n, bl); | |
1842 | bl.claim_append(tmp); | |
1843 | ENCODE_FINISH(bl); | |
1844 | } | |
1845 | ||
1846 | void CInode::decode_lock_ifile(bufferlist::const_iterator& p) | |
1847 | { | |
f67539c2 TL |
1848 | inode_ptr _inode; |
1849 | ||
9f95a23c TL |
1850 | DECODE_START(1, p); |
1851 | if (!is_auth()) { | |
f67539c2 TL |
1852 | _inode = allocate_inode(*get_inode()); |
1853 | ||
1854 | decode(_inode->version, p); | |
9f95a23c TL |
1855 | utime_t tm; |
1856 | decode(tm, p); | |
f67539c2 TL |
1857 | if (_inode->ctime < tm) _inode->ctime = tm; |
1858 | decode(_inode->mtime, p); | |
1859 | decode(_inode->atime, p); | |
1860 | decode(_inode->time_warp_seq, p); | |
9f95a23c | 1861 | if (!is_dir()) { |
f67539c2 TL |
1862 | decode(_inode->layout, p); |
1863 | decode(_inode->size, p); | |
1864 | decode(_inode->truncate_seq, p); | |
1865 | decode(_inode->truncate_size, p); | |
1866 | decode(_inode->client_ranges, p); | |
1867 | decode(_inode->inline_data, p); | |
9f95a23c TL |
1868 | } |
1869 | } else { | |
1870 | bool replica_dirty; | |
1871 | decode(replica_dirty, p); | |
1872 | if (replica_dirty) { | |
1873 | dout(10) << __func__ << " setting filelock dirty flag" << dendl; | |
1874 | filelock.mark_dirty(); // ok bc we're auth and caller will handle | |
1875 | } | |
1876 | } | |
1877 | ||
1878 | frag_info_t dirstat; | |
1879 | decode(dirstat, p); | |
1880 | if (!is_auth()) { | |
1881 | dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl; | |
f67539c2 | 1882 | _inode->dirstat = dirstat; // take inode summation if replica |
9f95a23c TL |
1883 | } |
1884 | __u32 n; | |
1885 | decode(n, p); | |
1886 | dout(10) << " ...got " << n << " fragstats on " << *this << dendl; | |
1887 | while (n--) { | |
1888 | frag_t fg; | |
1889 | snapid_t fgfirst; | |
1890 | frag_info_t fragstat; | |
1891 | frag_info_t accounted_fragstat; | |
1892 | decode(fg, p); | |
1893 | decode(fgfirst, p); | |
1894 | decode(fragstat, p); | |
1895 | decode(accounted_fragstat, p); | |
1896 | dout(10) << fg << " [" << fgfirst << ",head] " << dendl; | |
1897 | dout(10) << fg << " fragstat " << fragstat << dendl; | |
1898 | dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl; | |
1899 | ||
1900 | CDir *dir = get_dirfrag(fg); | |
7c673cae | 1901 | if (is_auth()) { |
9f95a23c TL |
1902 | ceph_assert(dir); // i am auth; i had better have this dir open |
1903 | dout(10) << fg << " first " << dir->first << " -> " << fgfirst | |
1904 | << " on " << *dir << dendl; | |
1905 | dir->first = fgfirst; | |
f67539c2 TL |
1906 | auto _fnode = CDir::allocate_fnode(*dir->get_fnode()); |
1907 | _fnode->fragstat = fragstat; | |
1908 | _fnode->accounted_fragstat = accounted_fragstat; | |
1909 | dir->reset_fnode(std::move(_fnode)); | |
9f95a23c TL |
1910 | if (!(fragstat == accounted_fragstat)) { |
1911 | dout(10) << fg << " setting filelock updated flag" << dendl; | |
1912 | filelock.mark_dirty(); // ok bc we're auth and caller will handle | |
7c673cae FG |
1913 | } |
1914 | } else { | |
9f95a23c TL |
1915 | if (dir && dir->is_auth()) { |
1916 | dout(10) << fg << " first " << dir->first << " -> " << fgfirst | |
1917 | << " on " << *dir << dendl; | |
1918 | dir->first = fgfirst; | |
f67539c2 | 1919 | const auto& pf = dir->get_projected_fnode(); |
9f95a23c | 1920 | finish_scatter_update(&filelock, dir, |
f67539c2 | 1921 | _inode->dirstat.version, pf->accounted_fragstat.version); |
9f95a23c | 1922 | } |
7c673cae | 1923 | } |
9f95a23c TL |
1924 | } |
1925 | DECODE_FINISH(p); | |
f67539c2 TL |
1926 | |
1927 | if (_inode) | |
1928 | reset_inode(std::move(_inode)); | |
9f95a23c | 1929 | } |
7c673cae | 1930 | |
9f95a23c TL |
1931 | void CInode::encode_lock_inest(bufferlist& bl) |
1932 | { | |
1933 | ENCODE_START(1, 1, bl); | |
1934 | if (is_auth()) { | |
f67539c2 | 1935 | encode(get_inode()->version, bl); |
9f95a23c TL |
1936 | } else { |
1937 | // treat flushing as dirty when rejoining cache | |
1938 | bool dirty = nestlock.is_dirty_or_flushing(); | |
1939 | encode(dirty, bl); | |
1940 | } | |
f67539c2 TL |
1941 | dout(15) << __func__ << " inode.rstat is " << get_inode()->rstat << dendl; |
1942 | encode(get_inode()->rstat, bl); // only meaningful if i am auth. | |
9f95a23c TL |
1943 | bufferlist tmp; |
1944 | __u32 n = 0; | |
1945 | for (const auto &p : dirfrags) { | |
1946 | frag_t fg = p.first; | |
1947 | CDir *dir = p.second; | |
1948 | if (is_auth() || dir->is_auth()) { | |
f67539c2 | 1949 | const auto& pf = dir->get_projected_fnode(); |
9f95a23c TL |
1950 | dout(10) << __func__ << " " << fg << " dir " << *dir << dendl; |
1951 | dout(10) << __func__ << " " << fg << " rstat " << pf->rstat << dendl; | |
1952 | dout(10) << __func__ << " " << fg << " accounted_rstat " << pf->rstat << dendl; | |
1953 | dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl; | |
1954 | encode(fg, tmp); | |
1955 | encode(dir->first, tmp); | |
1956 | encode(pf->rstat, tmp); | |
1957 | encode(pf->accounted_rstat, tmp); | |
1958 | encode(dir->dirty_old_rstat, tmp); | |
1959 | n++; | |
7c673cae | 1960 | } |
9f95a23c TL |
1961 | } |
1962 | encode(n, bl); | |
1963 | bl.claim_append(tmp); | |
1964 | ENCODE_FINISH(bl); | |
1965 | } | |
7c673cae | 1966 | |
9f95a23c TL |
1967 | void CInode::decode_lock_inest(bufferlist::const_iterator& p) |
1968 | { | |
f67539c2 TL |
1969 | inode_ptr _inode; |
1970 | ||
9f95a23c TL |
1971 | DECODE_START(1, p); |
1972 | if (is_auth()) { | |
1973 | bool replica_dirty; | |
1974 | decode(replica_dirty, p); | |
1975 | if (replica_dirty) { | |
1976 | dout(10) << __func__ << " setting nestlock dirty flag" << dendl; | |
1977 | nestlock.mark_dirty(); // ok bc we're auth and caller will handle | |
1978 | } | |
1979 | } else { | |
f67539c2 TL |
1980 | _inode = allocate_inode(*get_inode()); |
1981 | decode(_inode->version, p); | |
9f95a23c TL |
1982 | } |
1983 | nest_info_t rstat; | |
1984 | decode(rstat, p); | |
1985 | if (!is_auth()) { | |
1986 | dout(10) << __func__ << " taking inode rstat " << rstat << " for " << *this << dendl; | |
f67539c2 | 1987 | _inode->rstat = rstat; // take inode summation if replica |
9f95a23c TL |
1988 | } |
1989 | __u32 n; | |
1990 | decode(n, p); | |
1991 | while (n--) { | |
1992 | frag_t fg; | |
1993 | snapid_t fgfirst; | |
1994 | nest_info_t rstat; | |
1995 | nest_info_t accounted_rstat; | |
1996 | decltype(CDir::dirty_old_rstat) dirty_old_rstat; | |
1997 | decode(fg, p); | |
1998 | decode(fgfirst, p); | |
1999 | decode(rstat, p); | |
2000 | decode(accounted_rstat, p); | |
2001 | decode(dirty_old_rstat, p); | |
2002 | dout(10) << __func__ << " " << fg << " [" << fgfirst << ",head]" << dendl; | |
2003 | dout(10) << __func__ << " " << fg << " rstat " << rstat << dendl; | |
2004 | dout(10) << __func__ << " " << fg << " accounted_rstat " << accounted_rstat << dendl; | |
2005 | dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dirty_old_rstat << dendl; | |
2006 | CDir *dir = get_dirfrag(fg); | |
7c673cae | 2007 | if (is_auth()) { |
9f95a23c TL |
2008 | ceph_assert(dir); // i am auth; i had better have this dir open |
2009 | dout(10) << fg << " first " << dir->first << " -> " << fgfirst | |
2010 | << " on " << *dir << dendl; | |
2011 | dir->first = fgfirst; | |
f67539c2 TL |
2012 | auto _fnode = CDir::allocate_fnode(*dir->get_fnode()); |
2013 | _fnode->rstat = rstat; | |
2014 | _fnode->accounted_rstat = accounted_rstat; | |
2015 | dir->reset_fnode(std::move(_fnode)); | |
9f95a23c TL |
2016 | dir->dirty_old_rstat.swap(dirty_old_rstat); |
2017 | if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) { | |
2018 | dout(10) << fg << " setting nestlock updated flag" << dendl; | |
2019 | nestlock.mark_dirty(); // ok bc we're auth and caller will handle | |
2020 | } | |
7c673cae | 2021 | } else { |
9f95a23c TL |
2022 | if (dir && dir->is_auth()) { |
2023 | dout(10) << fg << " first " << dir->first << " -> " << fgfirst | |
2024 | << " on " << *dir << dendl; | |
2025 | dir->first = fgfirst; | |
f67539c2 | 2026 | const auto& pf = dir->get_projected_fnode(); |
9f95a23c | 2027 | finish_scatter_update(&nestlock, dir, |
f67539c2 | 2028 | _inode->rstat.version, pf->accounted_rstat.version); |
7c673cae | 2029 | } |
7c673cae | 2030 | } |
9f95a23c TL |
2031 | } |
2032 | DECODE_FINISH(p); | |
f67539c2 TL |
2033 | |
2034 | if (_inode) | |
2035 | reset_inode(std::move(_inode)); | |
9f95a23c TL |
2036 | } |
2037 | ||
2038 | void CInode::encode_lock_ixattr(bufferlist& bl) | |
2039 | { | |
2040 | ENCODE_START(1, 1, bl); | |
f67539c2 TL |
2041 | encode(get_inode()->version, bl); |
2042 | encode(get_inode()->ctime, bl); | |
2043 | encode_xattrs(bl); | |
9f95a23c TL |
2044 | ENCODE_FINISH(bl); |
2045 | } | |
2046 | ||
2047 | void CInode::decode_lock_ixattr(bufferlist::const_iterator& p) | |
2048 | { | |
f67539c2 TL |
2049 | ceph_assert(!is_auth()); |
2050 | auto _inode = allocate_inode(*get_inode()); | |
9f95a23c | 2051 | DECODE_START(1, p); |
f67539c2 | 2052 | decode(_inode->version, p); |
9f95a23c TL |
2053 | utime_t tm; |
2054 | decode(tm, p); | |
f67539c2 TL |
2055 | if (_inode->ctime < tm) |
2056 | _inode->ctime = tm; | |
2057 | decode_xattrs(p); | |
9f95a23c | 2058 | DECODE_FINISH(p); |
f67539c2 | 2059 | reset_inode(std::move(_inode)); |
9f95a23c TL |
2060 | } |
2061 | ||
2062 | void CInode::encode_lock_isnap(bufferlist& bl) | |
2063 | { | |
2064 | ENCODE_START(1, 1, bl); | |
f67539c2 TL |
2065 | encode(get_inode()->version, bl); |
2066 | encode(get_inode()->ctime, bl); | |
9f95a23c TL |
2067 | encode_snap(bl); |
2068 | ENCODE_FINISH(bl); | |
2069 | } | |
2070 | ||
2071 | void CInode::decode_lock_isnap(bufferlist::const_iterator& p) | |
2072 | { | |
f67539c2 TL |
2073 | ceph_assert(!is_auth()); |
2074 | auto _inode = allocate_inode(*get_inode()); | |
9f95a23c | 2075 | DECODE_START(1, p); |
f67539c2 | 2076 | decode(_inode->version, p); |
9f95a23c TL |
2077 | utime_t tm; |
2078 | decode(tm, p); | |
f67539c2 | 2079 | if (_inode->ctime < tm) _inode->ctime = tm; |
9f95a23c TL |
2080 | decode_snap(p); |
2081 | DECODE_FINISH(p); | |
f67539c2 | 2082 | reset_inode(std::move(_inode)); |
9f95a23c TL |
2083 | } |
2084 | ||
2085 | void CInode::encode_lock_iflock(bufferlist& bl) | |
2086 | { | |
2087 | ENCODE_START(1, 1, bl); | |
f67539c2 | 2088 | encode(get_inode()->version, bl); |
9f95a23c TL |
2089 | _encode_file_locks(bl); |
2090 | ENCODE_FINISH(bl); | |
2091 | } | |
2092 | ||
2093 | void CInode::decode_lock_iflock(bufferlist::const_iterator& p) | |
2094 | { | |
f67539c2 TL |
2095 | ceph_assert(!is_auth()); |
2096 | auto _inode = allocate_inode(*get_inode()); | |
9f95a23c | 2097 | DECODE_START(1, p); |
f67539c2 | 2098 | decode(_inode->version, p); |
9f95a23c TL |
2099 | _decode_file_locks(p); |
2100 | DECODE_FINISH(p); | |
f67539c2 | 2101 | reset_inode(std::move(_inode)); |
9f95a23c TL |
2102 | } |
2103 | ||
2104 | void CInode::encode_lock_ipolicy(bufferlist& bl) | |
2105 | { | |
f6b5b4d7 | 2106 | ENCODE_START(2, 1, bl); |
f67539c2 TL |
2107 | if (is_dir()) { |
2108 | encode(get_inode()->version, bl); | |
2109 | encode(get_inode()->ctime, bl); | |
2110 | encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features()); | |
2111 | encode(get_inode()->quota, bl); | |
2112 | encode(get_inode()->export_pin, bl); | |
2113 | encode(get_inode()->export_ephemeral_distributed_pin, bl); | |
2114 | encode(get_inode()->export_ephemeral_random_pin, bl); | |
9f95a23c TL |
2115 | } |
2116 | ENCODE_FINISH(bl); | |
2117 | } | |
2118 | ||
2119 | void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p) | |
2120 | { | |
f67539c2 TL |
2121 | ceph_assert(!is_auth()); |
2122 | auto _inode = allocate_inode(*get_inode()); | |
2123 | DECODE_START(1, p); | |
2124 | if (is_dir()) { | |
2125 | decode(_inode->version, p); | |
9f95a23c TL |
2126 | utime_t tm; |
2127 | decode(tm, p); | |
f67539c2 TL |
2128 | if (_inode->ctime < tm) |
2129 | _inode->ctime = tm; | |
2130 | decode(_inode->layout, p); | |
2131 | decode(_inode->quota, p); | |
2132 | decode(_inode->export_pin, p); | |
f6b5b4d7 | 2133 | if (struct_v >= 2) { |
f67539c2 TL |
2134 | decode(_inode->export_ephemeral_distributed_pin, p); |
2135 | decode(_inode->export_ephemeral_random_pin, p); | |
f6b5b4d7 | 2136 | } |
9f95a23c TL |
2137 | } |
2138 | DECODE_FINISH(p); | |
f67539c2 TL |
2139 | |
2140 | bool pin_updated = (get_inode()->export_pin != _inode->export_pin) || | |
2141 | (get_inode()->export_ephemeral_distributed_pin != | |
2142 | _inode->export_ephemeral_distributed_pin); | |
2143 | reset_inode(std::move(_inode)); | |
2144 | maybe_export_pin(pin_updated); | |
9f95a23c TL |
2145 | } |
2146 | ||
2147 | void CInode::encode_lock_state(int type, bufferlist& bl) | |
2148 | { | |
2149 | ENCODE_START(1, 1, bl); | |
2150 | encode(first, bl); | |
2151 | if (!is_base()) | |
2152 | encode(parent->first, bl); | |
2153 | ||
2154 | switch (type) { | |
2155 | case CEPH_LOCK_IAUTH: | |
2156 | encode_lock_iauth(bl); | |
2157 | break; | |
2158 | ||
2159 | case CEPH_LOCK_ILINK: | |
2160 | encode_lock_ilink(bl); | |
2161 | break; | |
2162 | ||
2163 | case CEPH_LOCK_IDFT: | |
2164 | encode_lock_idft(bl); | |
2165 | break; | |
2166 | ||
2167 | case CEPH_LOCK_IFILE: | |
2168 | encode_lock_ifile(bl); | |
2169 | break; | |
2170 | ||
2171 | case CEPH_LOCK_INEST: | |
2172 | encode_lock_inest(bl); | |
7c673cae FG |
2173 | break; |
2174 | ||
2175 | case CEPH_LOCK_IXATTR: | |
9f95a23c | 2176 | encode_lock_ixattr(bl); |
7c673cae FG |
2177 | break; |
2178 | ||
2179 | case CEPH_LOCK_ISNAP: | |
9f95a23c | 2180 | encode_lock_isnap(bl); |
7c673cae FG |
2181 | break; |
2182 | ||
2183 | case CEPH_LOCK_IFLOCK: | |
9f95a23c | 2184 | encode_lock_iflock(bl); |
7c673cae FG |
2185 | break; |
2186 | ||
2187 | case CEPH_LOCK_IPOLICY: | |
9f95a23c | 2188 | encode_lock_ipolicy(bl); |
7c673cae FG |
2189 | break; |
2190 | ||
2191 | default: | |
2192 | ceph_abort(); | |
2193 | } | |
9f95a23c | 2194 | ENCODE_FINISH(bl); |
7c673cae FG |
2195 | } |
2196 | ||
7c673cae FG |
2197 | /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ |
2198 | ||
11fdf7f2 | 2199 | void CInode::decode_lock_state(int type, const bufferlist& bl) |
7c673cae | 2200 | { |
11fdf7f2 | 2201 | auto p = bl.cbegin(); |
9f95a23c TL |
2202 | |
2203 | DECODE_START(1, p); | |
7c673cae FG |
2204 | utime_t tm; |
2205 | ||
2206 | snapid_t newfirst; | |
11fdf7f2 TL |
2207 | using ceph::decode; |
2208 | decode(newfirst, p); | |
7c673cae | 2209 | if (!is_auth() && newfirst != first) { |
11fdf7f2 TL |
2210 | dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl; |
2211 | first = newfirst; | |
2212 | } | |
2213 | if (!is_base()) { | |
2214 | decode(newfirst, p); | |
2215 | if (!parent->is_auth() && newfirst != parent->first) { | |
2216 | dout(10) << __func__ << " parent first " << first << " -> " << newfirst << dendl; | |
7c673cae FG |
2217 | parent->first = newfirst; |
2218 | } | |
7c673cae FG |
2219 | } |
2220 | ||
2221 | switch (type) { | |
2222 | case CEPH_LOCK_IAUTH: | |
9f95a23c | 2223 | decode_lock_iauth(p); |
7c673cae FG |
2224 | break; |
2225 | ||
2226 | case CEPH_LOCK_ILINK: | |
9f95a23c | 2227 | decode_lock_ilink(p); |
7c673cae FG |
2228 | break; |
2229 | ||
2230 | case CEPH_LOCK_IDFT: | |
9f95a23c | 2231 | decode_lock_idft(p); |
7c673cae FG |
2232 | break; |
2233 | ||
2234 | case CEPH_LOCK_IFILE: | |
9f95a23c | 2235 | decode_lock_ifile(p); |
7c673cae FG |
2236 | break; |
2237 | ||
2238 | case CEPH_LOCK_INEST: | |
9f95a23c | 2239 | decode_lock_inest(p); |
7c673cae FG |
2240 | break; |
2241 | ||
2242 | case CEPH_LOCK_IXATTR: | |
9f95a23c | 2243 | decode_lock_ixattr(p); |
7c673cae FG |
2244 | break; |
2245 | ||
2246 | case CEPH_LOCK_ISNAP: | |
9f95a23c | 2247 | decode_lock_isnap(p); |
7c673cae FG |
2248 | break; |
2249 | ||
2250 | case CEPH_LOCK_IFLOCK: | |
9f95a23c | 2251 | decode_lock_iflock(p); |
7c673cae FG |
2252 | break; |
2253 | ||
2254 | case CEPH_LOCK_IPOLICY: | |
9f95a23c | 2255 | decode_lock_ipolicy(p); |
7c673cae FG |
2256 | break; |
2257 | ||
2258 | default: | |
2259 | ceph_abort(); | |
2260 | } | |
9f95a23c | 2261 | DECODE_FINISH(p); |
7c673cae FG |
2262 | } |
2263 | ||
2264 | ||
2265 | bool CInode::is_dirty_scattered() | |
2266 | { | |
2267 | return | |
2268 | filelock.is_dirty_or_flushing() || | |
2269 | nestlock.is_dirty_or_flushing() || | |
2270 | dirfragtreelock.is_dirty_or_flushing(); | |
2271 | } | |
2272 | ||
2273 | void CInode::clear_scatter_dirty() | |
2274 | { | |
2275 | filelock.remove_dirty(); | |
2276 | nestlock.remove_dirty(); | |
2277 | dirfragtreelock.remove_dirty(); | |
2278 | } | |
2279 | ||
2280 | void CInode::clear_dirty_scattered(int type) | |
2281 | { | |
11fdf7f2 TL |
2282 | dout(10) << __func__ << " " << type << " on " << *this << dendl; |
2283 | ceph_assert(is_dir()); | |
7c673cae FG |
2284 | switch (type) { |
2285 | case CEPH_LOCK_IFILE: | |
2286 | item_dirty_dirfrag_dir.remove_myself(); | |
2287 | break; | |
2288 | ||
2289 | case CEPH_LOCK_INEST: | |
2290 | item_dirty_dirfrag_nest.remove_myself(); | |
2291 | break; | |
2292 | ||
2293 | case CEPH_LOCK_IDFT: | |
2294 | item_dirty_dirfrag_dirfragtree.remove_myself(); | |
2295 | break; | |
2296 | ||
2297 | default: | |
2298 | ceph_abort(); | |
2299 | } | |
2300 | } | |
2301 | ||
2302 | ||
2303 | /* | |
2304 | * when we initially scatter a lock, we need to check if any of the dirfrags | |
2305 | * have out of date accounted_rstat/fragstat. if so, mark the lock stale. | |
2306 | */ | |
2307 | /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ | |
2308 | void CInode::start_scatter(ScatterLock *lock) | |
2309 | { | |
11fdf7f2 TL |
2310 | dout(10) << __func__ << " " << *lock << " on " << *this << dendl; |
2311 | ceph_assert(is_auth()); | |
f67539c2 | 2312 | const auto& pi = get_projected_inode(); |
7c673cae | 2313 | |
94b18763 FG |
2314 | for (const auto &p : dirfrags) { |
2315 | frag_t fg = p.first; | |
2316 | CDir *dir = p.second; | |
f67539c2 | 2317 | const auto& pf = dir->get_projected_fnode(); |
7c673cae FG |
2318 | dout(20) << fg << " " << *dir << dendl; |
2319 | ||
2320 | if (!dir->is_auth()) | |
2321 | continue; | |
2322 | ||
2323 | switch (lock->get_type()) { | |
2324 | case CEPH_LOCK_IFILE: | |
2325 | finish_scatter_update(lock, dir, pi->dirstat.version, pf->accounted_fragstat.version); | |
2326 | break; | |
2327 | ||
2328 | case CEPH_LOCK_INEST: | |
2329 | finish_scatter_update(lock, dir, pi->rstat.version, pf->accounted_rstat.version); | |
2330 | break; | |
2331 | ||
2332 | case CEPH_LOCK_IDFT: | |
2333 | dir->state_clear(CDir::STATE_DIRTYDFT); | |
2334 | break; | |
2335 | } | |
2336 | } | |
2337 | } | |
2338 | ||
2339 | ||
2340 | class C_Inode_FragUpdate : public MDSLogContextBase { | |
2341 | protected: | |
2342 | CInode *in; | |
2343 | CDir *dir; | |
2344 | MutationRef mut; | |
2345 | MDSRank *get_mds() override {return in->mdcache->mds;} | |
2346 | void finish(int r) override { | |
2347 | in->_finish_frag_update(dir, mut); | |
2348 | } | |
2349 | ||
2350 | public: | |
2351 | C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {} | |
2352 | }; | |
2353 | ||
2354 | void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir, | |
2355 | version_t inode_version, version_t dir_accounted_version) | |
2356 | { | |
2357 | frag_t fg = dir->get_frag(); | |
11fdf7f2 | 2358 | ceph_assert(dir->is_auth()); |
7c673cae FG |
2359 | |
2360 | if (dir->is_frozen()) { | |
11fdf7f2 | 2361 | dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl; |
7c673cae | 2362 | } else if (dir->get_version() == 0) { |
11fdf7f2 | 2363 | dout(10) << __func__ << " " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl; |
7c673cae FG |
2364 | } else { |
2365 | if (dir_accounted_version != inode_version) { | |
11fdf7f2 | 2366 | dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl; |
7c673cae FG |
2367 | |
2368 | MDLog *mdlog = mdcache->mds->mdlog; | |
2369 | MutationRef mut(new MutationImpl()); | |
2370 | mut->ls = mdlog->get_current_segment(); | |
2371 | ||
f67539c2 | 2372 | auto pf = dir->project_fnode(mut); |
7c673cae | 2373 | |
9f95a23c | 2374 | std::string_view ename; |
7c673cae FG |
2375 | switch (lock->get_type()) { |
2376 | case CEPH_LOCK_IFILE: | |
f67539c2 | 2377 | pf->fragstat.version = inode_version; |
7c673cae FG |
2378 | pf->accounted_fragstat = pf->fragstat; |
2379 | ename = "lock ifile accounted scatter stat update"; | |
2380 | break; | |
2381 | case CEPH_LOCK_INEST: | |
f67539c2 | 2382 | pf->rstat.version = inode_version; |
7c673cae FG |
2383 | pf->accounted_rstat = pf->rstat; |
2384 | ename = "lock inest accounted scatter stat update"; | |
c07f9fc5 FG |
2385 | |
2386 | if (!is_auth() && lock->get_state() == LOCK_MIX) { | |
11fdf7f2 | 2387 | dout(10) << __func__ << " try to assimilate dirty rstat on " |
c07f9fc5 | 2388 | << *dir << dendl; |
f67539c2 | 2389 | dir->assimilate_dirty_rstat_inodes(mut); |
c07f9fc5 FG |
2390 | } |
2391 | ||
7c673cae FG |
2392 | break; |
2393 | default: | |
2394 | ceph_abort(); | |
2395 | } | |
2396 | ||
7c673cae FG |
2397 | EUpdate *le = new EUpdate(mdlog, ename); |
2398 | mdlog->start_entry(le); | |
2399 | le->metablob.add_dir_context(dir); | |
2400 | le->metablob.add_dir(dir, true); | |
2401 | ||
11fdf7f2 | 2402 | ceph_assert(!dir->is_frozen()); |
7c673cae | 2403 | mut->auth_pin(dir); |
c07f9fc5 FG |
2404 | |
2405 | if (lock->get_type() == CEPH_LOCK_INEST && | |
2406 | !is_auth() && lock->get_state() == LOCK_MIX) { | |
11fdf7f2 | 2407 | dout(10) << __func__ << " finish assimilating dirty rstat on " |
c07f9fc5 | 2408 | << *dir << dendl; |
f67539c2 | 2409 | dir->assimilate_dirty_rstat_inodes_finish(&le->metablob); |
c07f9fc5 FG |
2410 | |
2411 | if (!(pf->rstat == pf->accounted_rstat)) { | |
11fdf7f2 | 2412 | if (!mut->is_wrlocked(&nestlock)) { |
c07f9fc5 FG |
2413 | mdcache->mds->locker->wrlock_force(&nestlock, mut); |
2414 | } | |
2415 | ||
2416 | mdcache->mds->locker->mark_updated_scatterlock(&nestlock); | |
2417 | mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest); | |
2418 | } | |
2419 | } | |
f67539c2 TL |
2420 | |
2421 | pf->version = dir->pre_dirty(); | |
7c673cae FG |
2422 | |
2423 | mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut)); | |
2424 | } else { | |
11fdf7f2 | 2425 | dout(10) << __func__ << " " << fg << " accounted " << *lock |
7c673cae FG |
2426 | << " scatter stat unchanged at v" << dir_accounted_version << dendl; |
2427 | } | |
2428 | } | |
2429 | } | |
2430 | ||
2431 | void CInode::_finish_frag_update(CDir *dir, MutationRef& mut) | |
2432 | { | |
11fdf7f2 | 2433 | dout(10) << __func__ << " on " << *dir << dendl; |
7c673cae | 2434 | mut->apply(); |
c07f9fc5 | 2435 | mdcache->mds->locker->drop_locks(mut.get()); |
7c673cae FG |
2436 | mut->cleanup(); |
2437 | } | |
2438 | ||
2439 | ||
2440 | /* | |
2441 | * when we gather a lock, we need to assimilate dirfrag changes into the inode | |
2442 | * state. it's possible we can't update the dirfrag accounted_rstat/fragstat | |
2443 | * because the frag is auth and frozen, or that the replica couldn't for the same | |
2444 | * reason. hopefully it will get updated the next time the lock cycles. | |
2445 | * | |
2446 | * we have two dimensions of behavior: | |
2447 | * - we may be (auth and !frozen), and able to update, or not. | |
2448 | * - the frag may be stale, or not. | |
2449 | * | |
2450 | * if the frag is non-stale, we want to assimilate the diff into the | |
2451 | * inode, regardless of whether it's auth or updateable. | |
2452 | * | |
2453 | * if we update the frag, we want to set accounted_fragstat = frag, | |
2454 | * both if we took the diff or it was stale and we are making it | |
2455 | * un-stale. | |
2456 | */ | |
2457 | /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */ | |
f67539c2 | 2458 | void CInode::finish_scatter_gather_update(int type, MutationRef& mut) |
7c673cae FG |
2459 | { |
2460 | LogChannelRef clog = mdcache->mds->clog; | |
2461 | ||
11fdf7f2 TL |
2462 | dout(10) << __func__ << " " << type << " on " << *this << dendl; |
2463 | ceph_assert(is_auth()); | |
7c673cae FG |
2464 | |
2465 | switch (type) { | |
2466 | case CEPH_LOCK_IFILE: | |
2467 | { | |
2468 | fragtree_t tmpdft = dirfragtree; | |
2469 | struct frag_info_t dirstat; | |
2470 | bool dirstat_valid = true; | |
2471 | ||
2472 | // adjust summation | |
11fdf7f2 | 2473 | ceph_assert(is_auth()); |
f67539c2 | 2474 | auto pi = _get_projected_inode(); |
7c673cae FG |
2475 | |
2476 | bool touched_mtime = false, touched_chattr = false; | |
2477 | dout(20) << " orig dirstat " << pi->dirstat << dendl; | |
2478 | pi->dirstat.version++; | |
94b18763 FG |
2479 | for (const auto &p : dirfrags) { |
2480 | frag_t fg = p.first; | |
2481 | CDir *dir = p.second; | |
7c673cae FG |
2482 | dout(20) << fg << " " << *dir << dendl; |
2483 | ||
2484 | bool update; | |
2485 | if (dir->get_version() != 0) { | |
2486 | update = dir->is_auth() && !dir->is_frozen(); | |
2487 | } else { | |
2488 | update = false; | |
2489 | dirstat_valid = false; | |
2490 | } | |
2491 | ||
f67539c2 TL |
2492 | CDir::fnode_const_ptr pf; |
2493 | if (update) { | |
2494 | mut->auth_pin(dir); | |
2495 | pf = dir->project_fnode(mut); | |
2496 | } else { | |
2497 | pf = dir->get_projected_fnode(); | |
2498 | } | |
7c673cae FG |
2499 | |
2500 | if (pf->accounted_fragstat.version == pi->dirstat.version - 1) { | |
2501 | dout(20) << fg << " fragstat " << pf->fragstat << dendl; | |
2502 | dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl; | |
2503 | pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr); | |
2504 | } else { | |
2505 | dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl; | |
2506 | } | |
2507 | ||
2508 | if (pf->fragstat.nfiles < 0 || | |
2509 | pf->fragstat.nsubdirs < 0) { | |
2510 | clog->error() << "bad/negative dir size on " | |
f67539c2 | 2511 | << dir->dirfrag() << " " << pf->fragstat; |
11fdf7f2 | 2512 | ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter); |
f67539c2 TL |
2513 | |
2514 | auto _pf = const_cast<fnode_t*>(pf.get()); | |
7c673cae | 2515 | if (pf->fragstat.nfiles < 0) |
f67539c2 | 2516 | _pf->fragstat.nfiles = 0; |
7c673cae | 2517 | if (pf->fragstat.nsubdirs < 0) |
f67539c2 | 2518 | _pf->fragstat.nsubdirs = 0; |
7c673cae FG |
2519 | } |
2520 | ||
2521 | if (update) { | |
f67539c2 TL |
2522 | auto _pf = const_cast<fnode_t*>(pf.get()); |
2523 | _pf->accounted_fragstat = _pf->fragstat; | |
2524 | _pf->fragstat.version = _pf->accounted_fragstat.version = pi->dirstat.version; | |
2525 | _pf->version = dir->pre_dirty(); | |
7c673cae FG |
2526 | dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl; |
2527 | } | |
2528 | ||
2529 | tmpdft.force_to_leaf(g_ceph_context, fg); | |
2530 | dirstat.add(pf->fragstat); | |
2531 | } | |
2532 | if (touched_mtime) | |
2533 | pi->mtime = pi->ctime = pi->dirstat.mtime; | |
2534 | if (touched_chattr) | |
2535 | pi->change_attr = pi->dirstat.change_attr; | |
2536 | dout(20) << " final dirstat " << pi->dirstat << dendl; | |
2537 | ||
2538 | if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) { | |
11fdf7f2 TL |
2539 | frag_vec_t leaves; |
2540 | tmpdft.get_leaves_under(frag_t(), leaves); | |
2541 | for (const auto& leaf : leaves) { | |
2542 | if (!dirfrags.count(leaf)) { | |
7c673cae FG |
2543 | dirstat_valid = false; |
2544 | break; | |
2545 | } | |
11fdf7f2 | 2546 | } |
7c673cae FG |
2547 | if (dirstat_valid) { |
2548 | if (state_test(CInode::STATE_REPAIRSTATS)) { | |
2549 | dout(20) << " dirstat mismatch, fixing" << dendl; | |
2550 | } else { | |
2551 | clog->error() << "unmatched fragstat on " << ino() << ", inode has " | |
2552 | << pi->dirstat << ", dirfrags have " << dirstat; | |
11fdf7f2 | 2553 | ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter); |
7c673cae FG |
2554 | } |
2555 | // trust the dirfrags for now | |
2556 | version_t v = pi->dirstat.version; | |
2557 | if (pi->dirstat.mtime > dirstat.mtime) | |
2558 | dirstat.mtime = pi->dirstat.mtime; | |
2559 | if (pi->dirstat.change_attr > dirstat.change_attr) | |
2560 | dirstat.change_attr = pi->dirstat.change_attr; | |
2561 | pi->dirstat = dirstat; | |
2562 | pi->dirstat.version = v; | |
2563 | } | |
2564 | } | |
2565 | ||
f67539c2 | 2566 | if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0) { |
d2e6a577 FG |
2567 | std::string path; |
2568 | make_path_string(path); | |
2569 | clog->error() << "Inconsistent statistics detected: fragstat on inode " | |
2570 | << ino() << " (" << path << "), inode has " << pi->dirstat; | |
11fdf7f2 | 2571 | ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter); |
7c673cae FG |
2572 | |
2573 | if (pi->dirstat.nfiles < 0) | |
2574 | pi->dirstat.nfiles = 0; | |
2575 | if (pi->dirstat.nsubdirs < 0) | |
2576 | pi->dirstat.nsubdirs = 0; | |
2577 | } | |
2578 | } | |
2579 | break; | |
2580 | ||
2581 | case CEPH_LOCK_INEST: | |
2582 | { | |
11fdf7f2 TL |
2583 | // adjust summation |
2584 | ceph_assert(is_auth()); | |
2585 | ||
7c673cae FG |
2586 | fragtree_t tmpdft = dirfragtree; |
2587 | nest_info_t rstat; | |
7c673cae FG |
2588 | bool rstat_valid = true; |
2589 | ||
11fdf7f2 TL |
2590 | rstat.rsubdirs = 1; |
2591 | if (const sr_t *srnode = get_projected_srnode(); srnode) | |
2592 | rstat.rsnaps = srnode->snaps.size(); | |
2593 | ||
f67539c2 | 2594 | auto pi = _get_projected_inode(); |
7c673cae FG |
2595 | dout(20) << " orig rstat " << pi->rstat << dendl; |
2596 | pi->rstat.version++; | |
94b18763 FG |
2597 | for (const auto &p : dirfrags) { |
2598 | frag_t fg = p.first; | |
2599 | CDir *dir = p.second; | |
7c673cae FG |
2600 | dout(20) << fg << " " << *dir << dendl; |
2601 | ||
2602 | bool update; | |
2603 | if (dir->get_version() != 0) { | |
2604 | update = dir->is_auth() && !dir->is_frozen(); | |
2605 | } else { | |
2606 | update = false; | |
2607 | rstat_valid = false; | |
2608 | } | |
2609 | ||
f67539c2 TL |
2610 | CDir::fnode_const_ptr pf; |
2611 | if (update) { | |
2612 | mut->auth_pin(dir); | |
2613 | pf = dir->project_fnode(mut); | |
2614 | } else { | |
2615 | pf = dir->get_projected_fnode(); | |
2616 | } | |
7c673cae FG |
2617 | |
2618 | if (pf->accounted_rstat.version == pi->rstat.version-1) { | |
2619 | // only pull this frag's dirty rstat inodes into the frag if | |
2620 | // the frag is non-stale and updateable. if it's stale, | |
2621 | // that info will just get thrown out! | |
2622 | if (update) | |
f67539c2 | 2623 | dir->assimilate_dirty_rstat_inodes(mut); |
7c673cae FG |
2624 | |
2625 | dout(20) << fg << " rstat " << pf->rstat << dendl; | |
2626 | dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl; | |
2627 | dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl; | |
2628 | mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, | |
2629 | dir->first, CEPH_NOSNAP, this, true); | |
94b18763 FG |
2630 | for (auto &p : dir->dirty_old_rstat) { |
2631 | mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, | |
2632 | p.second.first, p.first, this, true); | |
2633 | } | |
7c673cae FG |
2634 | if (update) // dir contents not valid if frozen or non-auth |
2635 | dir->check_rstats(); | |
2636 | } else { | |
2637 | dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl; | |
2638 | } | |
2639 | if (update) { | |
f67539c2 TL |
2640 | auto _pf = const_cast<fnode_t*>(pf.get()); |
2641 | _pf->accounted_rstat = pf->rstat; | |
2642 | _pf->rstat.version = _pf->accounted_rstat.version = pi->rstat.version; | |
2643 | _pf->version = dir->pre_dirty(); | |
7c673cae | 2644 | dir->dirty_old_rstat.clear(); |
7c673cae FG |
2645 | dir->check_rstats(); |
2646 | dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl; | |
2647 | } | |
2648 | ||
2649 | tmpdft.force_to_leaf(g_ceph_context, fg); | |
2650 | rstat.add(pf->rstat); | |
2651 | } | |
2652 | dout(20) << " final rstat " << pi->rstat << dendl; | |
2653 | ||
2654 | if (rstat_valid && !rstat.same_sums(pi->rstat)) { | |
11fdf7f2 TL |
2655 | frag_vec_t leaves; |
2656 | tmpdft.get_leaves_under(frag_t(), leaves); | |
2657 | for (const auto& leaf : leaves) { | |
2658 | if (!dirfrags.count(leaf)) { | |
7c673cae FG |
2659 | rstat_valid = false; |
2660 | break; | |
2661 | } | |
11fdf7f2 | 2662 | } |
7c673cae FG |
2663 | if (rstat_valid) { |
2664 | if (state_test(CInode::STATE_REPAIRSTATS)) { | |
2665 | dout(20) << " rstat mismatch, fixing" << dendl; | |
2666 | } else { | |
d2e6a577 FG |
2667 | clog->error() << "inconsistent rstat on inode " << ino() |
2668 | << ", inode has " << pi->rstat | |
2669 | << ", directory fragments have " << rstat; | |
11fdf7f2 | 2670 | ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter); |
7c673cae FG |
2671 | } |
2672 | // trust the dirfrag for now | |
2673 | version_t v = pi->rstat.version; | |
2674 | if (pi->rstat.rctime > rstat.rctime) | |
2675 | rstat.rctime = pi->rstat.rctime; | |
2676 | pi->rstat = rstat; | |
2677 | pi->rstat.version = v; | |
2678 | } | |
2679 | } | |
2680 | ||
2681 | mdcache->broadcast_quota_to_client(this); | |
2682 | } | |
2683 | break; | |
2684 | ||
2685 | case CEPH_LOCK_IDFT: | |
2686 | break; | |
2687 | ||
2688 | default: | |
2689 | ceph_abort(); | |
2690 | } | |
2691 | } | |
2692 | ||
f67539c2 | 2693 | void CInode::finish_scatter_gather_update_accounted(int type, EMetaBlob *metablob) |
7c673cae | 2694 | { |
11fdf7f2 TL |
2695 | dout(10) << __func__ << " " << type << " on " << *this << dendl; |
2696 | ceph_assert(is_auth()); | |
7c673cae | 2697 | |
94b18763 FG |
2698 | for (const auto &p : dirfrags) { |
2699 | CDir *dir = p.second; | |
7c673cae FG |
2700 | if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen()) |
2701 | continue; | |
2702 | ||
2703 | if (type == CEPH_LOCK_IDFT) | |
2704 | continue; // nothing to do. | |
2705 | ||
f67539c2 TL |
2706 | if (type == CEPH_LOCK_INEST) |
2707 | dir->assimilate_dirty_rstat_inodes_finish(metablob); | |
2708 | ||
7c673cae | 2709 | dout(10) << " journaling updated frag accounted_ on " << *dir << dendl; |
11fdf7f2 | 2710 | ceph_assert(dir->is_projected()); |
7c673cae | 2711 | metablob->add_dir(dir, true); |
7c673cae FG |
2712 | } |
2713 | } | |
2714 | ||
2715 | // waiting | |
2716 | ||
2717 | bool CInode::is_frozen() const | |
2718 | { | |
2719 | if (is_frozen_inode()) return true; | |
2720 | if (parent && parent->dir->is_frozen()) return true; | |
2721 | return false; | |
2722 | } | |
2723 | ||
2724 | bool CInode::is_frozen_dir() const | |
2725 | { | |
2726 | if (parent && parent->dir->is_frozen_dir()) return true; | |
2727 | return false; | |
2728 | } | |
2729 | ||
2730 | bool CInode::is_freezing() const | |
2731 | { | |
2732 | if (is_freezing_inode()) return true; | |
2733 | if (parent && parent->dir->is_freezing()) return true; | |
2734 | return false; | |
2735 | } | |
2736 | ||
11fdf7f2 | 2737 | void CInode::add_dir_waiter(frag_t fg, MDSContext *c) |
7c673cae FG |
2738 | { |
2739 | if (waiting_on_dir.empty()) | |
2740 | get(PIN_DIRWAITER); | |
2741 | waiting_on_dir[fg].push_back(c); | |
11fdf7f2 | 2742 | dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl; |
7c673cae FG |
2743 | } |
2744 | ||
11fdf7f2 | 2745 | void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls) |
7c673cae FG |
2746 | { |
2747 | if (waiting_on_dir.empty()) | |
2748 | return; | |
2749 | ||
94b18763 FG |
2750 | auto it = waiting_on_dir.find(fg); |
2751 | if (it != waiting_on_dir.end()) { | |
2752 | dout(10) << __func__ << " frag " << fg << " on " << *this << dendl; | |
11fdf7f2 TL |
2753 | auto& waiting = it->second; |
2754 | ls.insert(ls.end(), waiting.begin(), waiting.end()); | |
94b18763 | 2755 | waiting_on_dir.erase(it); |
7c673cae FG |
2756 | |
2757 | if (waiting_on_dir.empty()) | |
2758 | put(PIN_DIRWAITER); | |
2759 | } | |
2760 | } | |
2761 | ||
11fdf7f2 | 2762 | void CInode::add_waiter(uint64_t tag, MDSContext *c) |
7c673cae | 2763 | { |
11fdf7f2 | 2764 | dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c |
7c673cae FG |
2765 | << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH) |
2766 | << " !frozen " << !is_frozen_inode() | |
2767 | << " !freezing " << !is_freezing_inode() | |
2768 | << dendl; | |
2769 | // wait on the directory? | |
2770 | // make sure its not the inode that is explicitly ambiguous|freezing|frozen | |
2771 | if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) || | |
2772 | ((tag & WAIT_UNFREEZE) && | |
2773 | !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) { | |
2774 | dout(15) << "passing waiter up tree" << dendl; | |
2775 | parent->dir->add_waiter(tag, c); | |
2776 | return; | |
2777 | } | |
2778 | dout(15) << "taking waiter here" << dendl; | |
2779 | MDSCacheObject::add_waiter(tag, c); | |
2780 | } | |
2781 | ||
11fdf7f2 | 2782 | void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls) |
7c673cae FG |
2783 | { |
2784 | if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) { | |
2785 | // take all dentry waiters | |
2786 | while (!waiting_on_dir.empty()) { | |
94b18763 FG |
2787 | auto it = waiting_on_dir.begin(); |
2788 | dout(10) << __func__ << " dirfrag " << it->first << " on " << *this << dendl; | |
11fdf7f2 TL |
2789 | auto& waiting = it->second; |
2790 | ls.insert(ls.end(), waiting.begin(), waiting.end()); | |
94b18763 | 2791 | waiting_on_dir.erase(it); |
7c673cae FG |
2792 | } |
2793 | put(PIN_DIRWAITER); | |
2794 | } | |
2795 | ||
2796 | // waiting | |
2797 | MDSCacheObject::take_waiting(mask, ls); | |
2798 | } | |
2799 | ||
9f95a23c TL |
2800 | void CInode::maybe_finish_freeze_inode() |
2801 | { | |
2802 | CDir *dir = get_parent_dir(); | |
2803 | if (auth_pins > auth_pin_freeze_allowance || dir->frozen_inode_suppressed) | |
2804 | return; | |
2805 | ||
2806 | dout(10) << "maybe_finish_freeze_inode - frozen" << dendl; | |
2807 | ceph_assert(auth_pins == auth_pin_freeze_allowance); | |
2808 | get(PIN_FROZEN); | |
2809 | put(PIN_FREEZING); | |
2810 | state_clear(STATE_FREEZING); | |
2811 | state_set(STATE_FROZEN); | |
2812 | ||
2813 | item_freezing_inode.remove_myself(); | |
2814 | dir->num_frozen_inodes++; | |
2815 | ||
2816 | finish_waiting(WAIT_FROZEN); | |
2817 | } | |
2818 | ||
7c673cae FG |
2819 | bool CInode::freeze_inode(int auth_pin_allowance) |
2820 | { | |
9f95a23c TL |
2821 | CDir *dir = get_parent_dir(); |
2822 | ceph_assert(dir); | |
2823 | ||
11fdf7f2 TL |
2824 | ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins |
2825 | ceph_assert(auth_pins >= auth_pin_allowance); | |
9f95a23c TL |
2826 | if (auth_pins == auth_pin_allowance && !dir->frozen_inode_suppressed) { |
2827 | dout(10) << "freeze_inode - frozen" << dendl; | |
2828 | if (!state_test(STATE_FROZEN)) { | |
2829 | get(PIN_FROZEN); | |
2830 | state_set(STATE_FROZEN); | |
2831 | dir->num_frozen_inodes++; | |
2832 | } | |
2833 | return true; | |
7c673cae FG |
2834 | } |
2835 | ||
9f95a23c TL |
2836 | dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl; |
2837 | auth_pin_freeze_allowance = auth_pin_allowance; | |
2838 | dir->freezing_inodes.push_back(&item_freezing_inode); | |
2839 | ||
2840 | get(PIN_FREEZING); | |
2841 | state_set(STATE_FREEZING); | |
2842 | ||
2843 | if (!dir->lock_caches_with_auth_pins.empty()) | |
2844 | mdcache->mds->locker->invalidate_lock_caches(dir); | |
2845 | ||
2846 | const static int lock_types[] = { | |
2847 | CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT, | |
2848 | CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0 | |
2849 | }; | |
2850 | for (int i = 0; lock_types[i]; ++i) { | |
2851 | auto lock = get_lock(lock_types[i]); | |
2852 | if (lock->is_cached()) | |
2853 | mdcache->mds->locker->invalidate_lock_caches(lock); | |
7c673cae | 2854 | } |
9f95a23c TL |
2855 | // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed |
2856 | // and finish freezing the inode | |
2857 | return state_test(STATE_FROZEN); | |
7c673cae FG |
2858 | } |
2859 | ||
11fdf7f2 | 2860 | void CInode::unfreeze_inode(MDSContext::vec& finished) |
7c673cae | 2861 | { |
11fdf7f2 | 2862 | dout(10) << __func__ << dendl; |
7c673cae FG |
2863 | if (state_test(STATE_FREEZING)) { |
2864 | state_clear(STATE_FREEZING); | |
2865 | put(PIN_FREEZING); | |
9f95a23c | 2866 | item_freezing_inode.remove_myself(); |
7c673cae FG |
2867 | } else if (state_test(STATE_FROZEN)) { |
2868 | state_clear(STATE_FROZEN); | |
2869 | put(PIN_FROZEN); | |
9f95a23c | 2870 | get_parent_dir()->num_frozen_inodes--; |
7c673cae FG |
2871 | } else |
2872 | ceph_abort(); | |
2873 | take_waiting(WAIT_UNFREEZE, finished); | |
2874 | } | |
2875 | ||
2876 | void CInode::unfreeze_inode() | |
2877 | { | |
11fdf7f2 | 2878 | MDSContext::vec finished; |
7c673cae FG |
2879 | unfreeze_inode(finished); |
2880 | mdcache->mds->queue_waiters(finished); | |
2881 | } | |
2882 | ||
2883 | void CInode::freeze_auth_pin() | |
2884 | { | |
11fdf7f2 | 2885 | ceph_assert(state_test(CInode::STATE_FROZEN)); |
7c673cae | 2886 | state_set(CInode::STATE_FROZENAUTHPIN); |
9f95a23c | 2887 | get_parent_dir()->num_frozen_inodes++; |
7c673cae FG |
2888 | } |
2889 | ||
2890 | void CInode::unfreeze_auth_pin() | |
2891 | { | |
11fdf7f2 | 2892 | ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN)); |
7c673cae | 2893 | state_clear(CInode::STATE_FROZENAUTHPIN); |
9f95a23c | 2894 | get_parent_dir()->num_frozen_inodes--; |
7c673cae | 2895 | if (!state_test(STATE_FREEZING|STATE_FROZEN)) { |
11fdf7f2 | 2896 | MDSContext::vec finished; |
7c673cae FG |
2897 | take_waiting(WAIT_UNFREEZE, finished); |
2898 | mdcache->mds->queue_waiters(finished); | |
2899 | } | |
2900 | } | |
2901 | ||
11fdf7f2 | 2902 | void CInode::clear_ambiguous_auth(MDSContext::vec& finished) |
7c673cae | 2903 | { |
11fdf7f2 | 2904 | ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH)); |
7c673cae FG |
2905 | state_clear(CInode::STATE_AMBIGUOUSAUTH); |
2906 | take_waiting(CInode::WAIT_SINGLEAUTH, finished); | |
2907 | } | |
2908 | ||
2909 | void CInode::clear_ambiguous_auth() | |
2910 | { | |
11fdf7f2 | 2911 | MDSContext::vec finished; |
7c673cae FG |
2912 | clear_ambiguous_auth(finished); |
2913 | mdcache->mds->queue_waiters(finished); | |
2914 | } | |
2915 | ||
2916 | // auth_pins | |
91327a77 AA |
2917 | bool CInode::can_auth_pin(int *err_ret) const { |
2918 | int err; | |
2919 | if (!is_auth()) { | |
2920 | err = ERR_NOT_AUTH; | |
2921 | } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) { | |
2922 | err = ERR_EXPORTING_INODE; | |
2923 | } else { | |
2924 | if (parent) | |
2925 | return parent->can_auth_pin(err_ret); | |
2926 | err = 0; | |
2927 | } | |
2928 | if (err && err_ret) | |
2929 | *err_ret = err; | |
2930 | return !err; | |
7c673cae FG |
2931 | } |
2932 | ||
2933 | void CInode::auth_pin(void *by) | |
2934 | { | |
2935 | if (auth_pins == 0) | |
2936 | get(PIN_AUTHPIN); | |
2937 | auth_pins++; | |
2938 | ||
2939 | #ifdef MDS_AUTHPIN_SET | |
2940 | auth_pin_set.insert(by); | |
2941 | #endif | |
2942 | ||
11fdf7f2 | 2943 | dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl; |
7c673cae FG |
2944 | |
2945 | if (parent) | |
11fdf7f2 | 2946 | parent->adjust_nested_auth_pins(1, this); |
7c673cae FG |
2947 | } |
2948 | ||
2949 | void CInode::auth_unpin(void *by) | |
2950 | { | |
2951 | auth_pins--; | |
2952 | ||
2953 | #ifdef MDS_AUTHPIN_SET | |
11fdf7f2 TL |
2954 | { |
2955 | auto it = auth_pin_set.find(by); | |
2956 | ceph_assert(it != auth_pin_set.end()); | |
2957 | auth_pin_set.erase(it); | |
2958 | } | |
7c673cae FG |
2959 | #endif |
2960 | ||
2961 | if (auth_pins == 0) | |
2962 | put(PIN_AUTHPIN); | |
2963 | ||
11fdf7f2 | 2964 | dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl; |
7c673cae | 2965 | |
11fdf7f2 | 2966 | ceph_assert(auth_pins >= 0); |
7c673cae FG |
2967 | |
2968 | if (parent) | |
11fdf7f2 | 2969 | parent->adjust_nested_auth_pins(-1, by); |
7c673cae | 2970 | |
9f95a23c TL |
2971 | if (is_freezing_inode()) |
2972 | maybe_finish_freeze_inode(); | |
7c673cae FG |
2973 | } |
2974 | ||
7c673cae FG |
2975 | // authority |
2976 | ||
2977 | mds_authority_t CInode::authority() const | |
2978 | { | |
2979 | if (inode_auth.first >= 0) | |
2980 | return inode_auth; | |
2981 | ||
2982 | if (parent) | |
2983 | return parent->dir->authority(); | |
2984 | ||
2985 | // new items that are not yet linked in (in the committed plane) belong | |
2986 | // to their first parent. | |
2987 | if (!projected_parent.empty()) | |
2988 | return projected_parent.front()->dir->authority(); | |
2989 | ||
2990 | return CDIR_AUTH_UNDEF; | |
2991 | } | |
2992 | ||
2993 | ||
2994 | // SNAP | |
2995 | ||
2996 | snapid_t CInode::get_oldest_snap() | |
2997 | { | |
2998 | snapid_t t = first; | |
f67539c2 TL |
2999 | if (is_any_old_inodes()) |
3000 | t = get_old_inodes()->begin()->second.first; | |
11fdf7f2 | 3001 | return std::min(t, oldest_snap); |
7c673cae FG |
3002 | } |
3003 | ||
f67539c2 | 3004 | const CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head) |
7c673cae | 3005 | { |
11fdf7f2 | 3006 | ceph_assert(follows >= first); |
7c673cae | 3007 | |
f67539c2 TL |
3008 | const auto& pi = cow_head ? get_projected_inode() : get_previous_projected_inode(); |
3009 | const auto& px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs(); | |
3010 | ||
3011 | auto _old_inodes = allocate_old_inode_map(); | |
3012 | if (old_inodes) | |
3013 | *_old_inodes = *old_inodes; | |
7c673cae | 3014 | |
f67539c2 | 3015 | mempool_old_inode &old = (*_old_inodes)[follows]; |
7c673cae FG |
3016 | old.first = first; |
3017 | old.inode = *pi; | |
f67539c2 TL |
3018 | if (px) { |
3019 | dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl; | |
3020 | old.xattrs = *px; | |
3021 | } | |
7c673cae FG |
3022 | |
3023 | if (first < oldest_snap) | |
3024 | oldest_snap = first; | |
7c673cae FG |
3025 | |
3026 | old.inode.trim_client_ranges(follows); | |
3027 | ||
11fdf7f2 | 3028 | if (g_conf()->mds_snap_rstat && |
7c673cae FG |
3029 | !(old.inode.rstat == old.inode.accounted_rstat)) |
3030 | dirty_old_rstats.insert(follows); | |
3031 | ||
3032 | first = follows+1; | |
3033 | ||
11fdf7f2 | 3034 | dout(10) << __func__ << " " << (cow_head ? "head" : "previous_head" ) |
7c673cae FG |
3035 | << " to [" << old.first << "," << follows << "] on " |
3036 | << *this << dendl; | |
3037 | ||
f67539c2 | 3038 | reset_old_inodes(std::move(_old_inodes)); |
7c673cae FG |
3039 | return old; |
3040 | } | |
3041 | ||
7c673cae FG |
3042 | void CInode::pre_cow_old_inode() |
3043 | { | |
11fdf7f2 | 3044 | snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq(); |
7c673cae FG |
3045 | if (first <= follows) |
3046 | cow_old_inode(follows, true); | |
3047 | } | |
3048 | ||
11fdf7f2 TL |
3049 | bool CInode::has_snap_data(snapid_t snapid) |
3050 | { | |
3051 | bool found = snapid >= first && snapid <= last; | |
f67539c2 TL |
3052 | if (!found && is_any_old_inodes()) { |
3053 | auto p = old_inodes->lower_bound(snapid); | |
3054 | if (p != old_inodes->end()) { | |
11fdf7f2 | 3055 | if (p->second.first > snapid) { |
f67539c2 | 3056 | if (p != old_inodes->begin()) |
11fdf7f2 TL |
3057 | --p; |
3058 | } | |
3059 | if (p->second.first <= snapid && snapid <= p->first) { | |
3060 | found = true; | |
3061 | } | |
3062 | } | |
3063 | } | |
3064 | return found; | |
3065 | } | |
3066 | ||
7c673cae FG |
3067 | void CInode::purge_stale_snap_data(const set<snapid_t>& snaps) |
3068 | { | |
11fdf7f2 | 3069 | dout(10) << __func__ << " " << snaps << dendl; |
7c673cae | 3070 | |
f67539c2 TL |
3071 | if (!get_old_inodes()) |
3072 | return; | |
3073 | ||
3074 | std::vector<snapid_t> to_remove; | |
3075 | for (auto p : *get_old_inodes()) { | |
3076 | const snapid_t &id = p.first; | |
3077 | const auto &s = snaps.lower_bound(p.second.first); | |
94b18763 | 3078 | if (s == snaps.end() || *s > id) { |
f67539c2 TL |
3079 | dout(10) << " purging old_inode [" << p.second.first << "," << id << "]" << dendl; |
3080 | to_remove.push_back(id); | |
94b18763 | 3081 | } |
7c673cae | 3082 | } |
f67539c2 TL |
3083 | |
3084 | if (to_remove.size() == get_old_inodes()->size()) { | |
3085 | reset_old_inodes(old_inode_map_ptr()); | |
3086 | } else if (!to_remove.empty()) { | |
3087 | auto _old_inodes = allocate_old_inode_map(*get_old_inodes()); | |
3088 | for (auto id : to_remove) | |
3089 | _old_inodes->erase(id); | |
3090 | reset_old_inodes(std::move(_old_inodes)); | |
3091 | } | |
7c673cae FG |
3092 | } |
3093 | ||
3094 | /* | |
3095 | * pick/create an old_inode | |
3096 | */ | |
f67539c2 | 3097 | snapid_t CInode::pick_old_inode(snapid_t snap) const |
7c673cae | 3098 | { |
f67539c2 TL |
3099 | if (is_any_old_inodes()) { |
3100 | auto it = old_inodes->lower_bound(snap); // p is first key >= to snap | |
3101 | if (it != old_inodes->end() && it->second.first <= snap) { | |
3102 | dout(10) << __func__ << " snap " << snap << " -> [" << it->second.first << "," << it->first << "]" << dendl; | |
3103 | return it->first; | |
3104 | } | |
7c673cae | 3105 | } |
11fdf7f2 | 3106 | dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl; |
f67539c2 | 3107 | return 0; |
7c673cae FG |
3108 | } |
3109 | ||
3110 | void CInode::open_snaprealm(bool nosplit) | |
3111 | { | |
3112 | if (!snaprealm) { | |
3113 | SnapRealm *parent = find_snaprealm(); | |
3114 | snaprealm = new SnapRealm(mdcache, this); | |
3115 | if (parent) { | |
11fdf7f2 | 3116 | dout(10) << __func__ << " " << snaprealm |
7c673cae FG |
3117 | << " parent is " << parent |
3118 | << dendl; | |
3119 | dout(30) << " siblings are " << parent->open_children << dendl; | |
3120 | snaprealm->parent = parent; | |
3121 | if (!nosplit) | |
3122 | parent->split_at(snaprealm); | |
3123 | parent->open_children.insert(snaprealm); | |
3124 | } | |
3125 | } | |
3126 | } | |
3127 | void CInode::close_snaprealm(bool nojoin) | |
3128 | { | |
3129 | if (snaprealm) { | |
11fdf7f2 | 3130 | dout(15) << __func__ << " " << *snaprealm << dendl; |
7c673cae FG |
3131 | if (snaprealm->parent) { |
3132 | snaprealm->parent->open_children.erase(snaprealm); | |
3133 | //if (!nojoin) | |
3134 | //snaprealm->parent->join(snaprealm); | |
3135 | } | |
3136 | delete snaprealm; | |
3137 | snaprealm = 0; | |
3138 | } | |
3139 | } | |
3140 | ||
3141 | SnapRealm *CInode::find_snaprealm() const | |
3142 | { | |
3143 | const CInode *cur = this; | |
3144 | while (!cur->snaprealm) { | |
11fdf7f2 TL |
3145 | const CDentry *pdn = cur->get_oldest_parent_dn(); |
3146 | if (!pdn) | |
7c673cae | 3147 | break; |
11fdf7f2 | 3148 | cur = pdn->get_dir()->get_inode(); |
7c673cae FG |
3149 | } |
3150 | return cur->snaprealm; | |
3151 | } | |
3152 | ||
3153 | void CInode::encode_snap_blob(bufferlist &snapbl) | |
3154 | { | |
3155 | if (snaprealm) { | |
11fdf7f2 TL |
3156 | using ceph::encode; |
3157 | encode(snaprealm->srnode, snapbl); | |
3158 | dout(20) << __func__ << " " << *snaprealm << dendl; | |
7c673cae FG |
3159 | } |
3160 | } | |
11fdf7f2 | 3161 | void CInode::decode_snap_blob(const bufferlist& snapbl) |
7c673cae | 3162 | { |
11fdf7f2 | 3163 | using ceph::decode; |
7c673cae FG |
3164 | if (snapbl.length()) { |
3165 | open_snaprealm(); | |
11fdf7f2 TL |
3166 | auto old_flags = snaprealm->srnode.flags; |
3167 | auto p = snapbl.cbegin(); | |
3168 | decode(snaprealm->srnode, p); | |
f67539c2 | 3169 | if (!is_base()) { |
11fdf7f2 | 3170 | if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) { |
11fdf7f2 TL |
3171 | snaprealm->adjust_parent(); |
3172 | } | |
7c673cae | 3173 | } |
11fdf7f2 | 3174 | dout(20) << __func__ << " " << *snaprealm << dendl; |
92f5a8d4 TL |
3175 | } else if (snaprealm && |
3176 | !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675 | |
11fdf7f2 TL |
3177 | ceph_assert(mdcache->mds->is_any_replay()); |
3178 | snaprealm->merge_to(NULL); | |
7c673cae FG |
3179 | } |
3180 | } | |
3181 | ||
3182 | void CInode::encode_snap(bufferlist& bl) | |
3183 | { | |
9f95a23c | 3184 | ENCODE_START(1, 1, bl); |
7c673cae FG |
3185 | bufferlist snapbl; |
3186 | encode_snap_blob(snapbl); | |
11fdf7f2 TL |
3187 | encode(snapbl, bl); |
3188 | encode(oldest_snap, bl); | |
9f95a23c | 3189 | ENCODE_FINISH(bl); |
11fdf7f2 | 3190 | } |
7c673cae | 3191 | |
11fdf7f2 | 3192 | void CInode::decode_snap(bufferlist::const_iterator& p) |
7c673cae | 3193 | { |
9f95a23c | 3194 | DECODE_START(1, p); |
7c673cae | 3195 | bufferlist snapbl; |
11fdf7f2 TL |
3196 | decode(snapbl, p); |
3197 | decode(oldest_snap, p); | |
7c673cae | 3198 | decode_snap_blob(snapbl); |
9f95a23c | 3199 | DECODE_FINISH(p); |
7c673cae FG |
3200 | } |
3201 | ||
3202 | // ============================================= | |
3203 | ||
3204 | client_t CInode::calc_ideal_loner() | |
3205 | { | |
3206 | if (mdcache->is_readonly()) | |
3207 | return -1; | |
11fdf7f2 | 3208 | if (!get_mds_caps_wanted().empty()) |
7c673cae FG |
3209 | return -1; |
3210 | ||
3211 | int n = 0; | |
3212 | client_t loner = -1; | |
11fdf7f2 TL |
3213 | for (const auto &p : client_caps) { |
3214 | if (!p.second.is_stale() && | |
9f95a23c TL |
3215 | (is_dir() ? |
3216 | !has_subtree_or_exporting_dirfrag() : | |
3217 | (p.second.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_RD)))) { | |
7c673cae FG |
3218 | if (n) |
3219 | return -1; | |
3220 | n++; | |
11fdf7f2 | 3221 | loner = p.first; |
7c673cae | 3222 | } |
11fdf7f2 | 3223 | } |
7c673cae FG |
3224 | return loner; |
3225 | } | |
3226 | ||
b32b8144 | 3227 | bool CInode::choose_ideal_loner() |
7c673cae FG |
3228 | { |
3229 | want_loner_cap = calc_ideal_loner(); | |
b32b8144 FG |
3230 | int changed = false; |
3231 | if (loner_cap >= 0 && loner_cap != want_loner_cap) { | |
3232 | if (!try_drop_loner()) | |
3233 | return false; | |
3234 | changed = true; | |
3235 | } | |
3236 | ||
3237 | if (want_loner_cap >= 0) { | |
3238 | if (loner_cap < 0) { | |
3239 | set_loner_cap(want_loner_cap); | |
3240 | changed = true; | |
3241 | } else | |
11fdf7f2 | 3242 | ceph_assert(loner_cap == want_loner_cap); |
b32b8144 FG |
3243 | } |
3244 | return changed; | |
7c673cae FG |
3245 | } |
3246 | ||
3247 | bool CInode::try_set_loner() | |
3248 | { | |
11fdf7f2 | 3249 | ceph_assert(want_loner_cap >= 0); |
7c673cae FG |
3250 | if (loner_cap >= 0 && loner_cap != want_loner_cap) |
3251 | return false; | |
3252 | set_loner_cap(want_loner_cap); | |
3253 | return true; | |
3254 | } | |
3255 | ||
3256 | void CInode::set_loner_cap(client_t l) | |
3257 | { | |
3258 | loner_cap = l; | |
3259 | authlock.set_excl_client(loner_cap); | |
3260 | filelock.set_excl_client(loner_cap); | |
3261 | linklock.set_excl_client(loner_cap); | |
3262 | xattrlock.set_excl_client(loner_cap); | |
3263 | } | |
3264 | ||
3265 | bool CInode::try_drop_loner() | |
3266 | { | |
3267 | if (loner_cap < 0) | |
3268 | return true; | |
3269 | ||
3270 | int other_allowed = get_caps_allowed_by_type(CAP_ANY); | |
3271 | Capability *cap = get_client_cap(loner_cap); | |
3272 | if (!cap || | |
3273 | (cap->issued() & ~other_allowed) == 0) { | |
3274 | set_loner_cap(-1); | |
3275 | return true; | |
3276 | } | |
3277 | return false; | |
3278 | } | |
3279 | ||
3280 | ||
3281 | // choose new lock state during recovery, based on issued caps | |
3282 | void CInode::choose_lock_state(SimpleLock *lock, int allissued) | |
3283 | { | |
3284 | int shift = lock->get_cap_shift(); | |
3285 | int issued = (allissued >> shift) & lock->get_cap_mask(); | |
3286 | if (is_auth()) { | |
3287 | if (lock->is_xlocked()) { | |
3288 | // do nothing here | |
3289 | } else if (lock->get_state() != LOCK_MIX) { | |
3290 | if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER)) | |
3291 | lock->set_state(LOCK_EXCL); | |
f6b5b4d7 TL |
3292 | else if (issued & CEPH_CAP_GWR) { |
3293 | if (issued & (CEPH_CAP_GCACHE | CEPH_CAP_GSHARED)) | |
3294 | lock->set_state(LOCK_EXCL); | |
3295 | else | |
3296 | lock->set_state(LOCK_MIX); | |
3297 | } else if (lock->is_dirty()) { | |
7c673cae FG |
3298 | if (is_replicated()) |
3299 | lock->set_state(LOCK_MIX); | |
3300 | else | |
3301 | lock->set_state(LOCK_LOCK); | |
3302 | } else | |
3303 | lock->set_state(LOCK_SYNC); | |
3304 | } | |
3305 | } else { | |
3306 | // our states have already been chosen during rejoin. | |
3307 | if (lock->is_xlocked()) | |
11fdf7f2 | 3308 | ceph_assert(lock->get_state() == LOCK_LOCK); |
7c673cae FG |
3309 | } |
3310 | } | |
3311 | ||
3312 | void CInode::choose_lock_states(int dirty_caps) | |
3313 | { | |
3314 | int issued = get_caps_issued() | dirty_caps; | |
b32b8144 FG |
3315 | if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR))) |
3316 | choose_ideal_loner(); | |
7c673cae FG |
3317 | choose_lock_state(&filelock, issued); |
3318 | choose_lock_state(&nestlock, issued); | |
3319 | choose_lock_state(&dirfragtreelock, issued); | |
3320 | choose_lock_state(&authlock, issued); | |
3321 | choose_lock_state(&xattrlock, issued); | |
3322 | choose_lock_state(&linklock, issued); | |
3323 | } | |
3324 | ||
9f95a23c TL |
3325 | int CInode::count_nonstale_caps() |
3326 | { | |
3327 | int n = 0; | |
3328 | for (const auto &p : client_caps) { | |
3329 | if (!p.second.is_stale()) | |
3330 | n++; | |
3331 | } | |
3332 | return n; | |
3333 | } | |
3334 | ||
3335 | bool CInode::multiple_nonstale_caps() | |
3336 | { | |
3337 | int n = 0; | |
3338 | for (const auto &p : client_caps) { | |
3339 | if (!p.second.is_stale()) { | |
3340 | if (n) | |
3341 | return true; | |
3342 | n++; | |
3343 | } | |
3344 | } | |
3345 | return false; | |
3346 | } | |
3347 | ||
11fdf7f2 TL |
3348 | void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m) |
3349 | { | |
3350 | bool old_empty = mds_caps_wanted.empty(); | |
3351 | mds_caps_wanted.swap(m); | |
3352 | if (old_empty != (bool)mds_caps_wanted.empty()) { | |
3353 | if (old_empty) | |
f91f0fd5 | 3354 | adjust_num_caps_notable(1); |
11fdf7f2 | 3355 | else |
f91f0fd5 | 3356 | adjust_num_caps_notable(-1); |
11fdf7f2 TL |
3357 | } |
3358 | } | |
3359 | ||
3360 | void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted) | |
3361 | { | |
3362 | bool old_empty = mds_caps_wanted.empty(); | |
3363 | if (wanted) { | |
3364 | mds_caps_wanted[mds] = wanted; | |
3365 | if (old_empty) | |
f91f0fd5 | 3366 | adjust_num_caps_notable(1); |
11fdf7f2 TL |
3367 | } else if (!old_empty) { |
3368 | mds_caps_wanted.erase(mds); | |
3369 | if (mds_caps_wanted.empty()) | |
f91f0fd5 | 3370 | adjust_num_caps_notable(-1); |
11fdf7f2 TL |
3371 | } |
3372 | } | |
3373 | ||
9f95a23c TL |
3374 | Capability *CInode::add_client_cap(client_t client, Session *session, |
3375 | SnapRealm *conrealm, bool new_inode) | |
7c673cae | 3376 | { |
11fdf7f2 | 3377 | ceph_assert(last == CEPH_NOSNAP); |
7c673cae FG |
3378 | if (client_caps.empty()) { |
3379 | get(PIN_CAPS); | |
3380 | if (conrealm) | |
3381 | containing_realm = conrealm; | |
3382 | else | |
3383 | containing_realm = find_snaprealm(); | |
3384 | containing_realm->inodes_with_caps.push_back(&item_caps); | |
11fdf7f2 | 3385 | dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl; |
7c673cae | 3386 | |
7c673cae | 3387 | mdcache->num_inodes_with_caps++; |
11fdf7f2 TL |
3388 | if (parent) |
3389 | parent->dir->adjust_num_inodes_with_caps(1); | |
3390 | } | |
3391 | ||
9f95a23c | 3392 | uint64_t cap_id = new_inode ? 1 : ++mdcache->last_cap_id; |
11fdf7f2 TL |
3393 | auto ret = client_caps.emplace(std::piecewise_construct, std::forward_as_tuple(client), |
3394 | std::forward_as_tuple(this, session, cap_id)); | |
3395 | ceph_assert(ret.second == true); | |
3396 | Capability *cap = &ret.first->second; | |
7c673cae | 3397 | |
7c673cae | 3398 | cap->client_follows = first-1; |
7c673cae | 3399 | containing_realm->add_cap(client, cap); |
11fdf7f2 | 3400 | |
7c673cae FG |
3401 | return cap; |
3402 | } | |
3403 | ||
3404 | void CInode::remove_client_cap(client_t client) | |
3405 | { | |
11fdf7f2 TL |
3406 | auto it = client_caps.find(client); |
3407 | ceph_assert(it != client_caps.end()); | |
3408 | Capability *cap = &it->second; | |
7c673cae FG |
3409 | |
3410 | cap->item_session_caps.remove_myself(); | |
3411 | cap->item_revoking_caps.remove_myself(); | |
3412 | cap->item_client_revoking_caps.remove_myself(); | |
3413 | containing_realm->remove_cap(client, cap); | |
3414 | ||
3415 | if (client == loner_cap) | |
3416 | loner_cap = -1; | |
3417 | ||
f91f0fd5 TL |
3418 | if (cap->is_wanted_notable()) |
3419 | adjust_num_caps_notable(-1); | |
11fdf7f2 TL |
3420 | |
3421 | client_caps.erase(it); | |
7c673cae | 3422 | if (client_caps.empty()) { |
11fdf7f2 | 3423 | dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl; |
7c673cae FG |
3424 | put(PIN_CAPS); |
3425 | item_caps.remove_myself(); | |
3426 | containing_realm = NULL; | |
7c673cae | 3427 | mdcache->num_inodes_with_caps--; |
11fdf7f2 TL |
3428 | if (parent) |
3429 | parent->dir->adjust_num_inodes_with_caps(-1); | |
7c673cae FG |
3430 | } |
3431 | ||
3432 | //clean up advisory locks | |
3433 | bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false; | |
3434 | bool flock_removed = flock_locks ? flock_locks->remove_all_from(client) : false; | |
3435 | if (fcntl_removed || flock_removed) { | |
11fdf7f2 | 3436 | MDSContext::vec waiters; |
7c673cae FG |
3437 | take_waiting(CInode::WAIT_FLOCK, waiters); |
3438 | mdcache->mds->queue_waiters(waiters); | |
3439 | } | |
3440 | } | |
3441 | ||
3442 | void CInode::move_to_realm(SnapRealm *realm) | |
3443 | { | |
11fdf7f2 | 3444 | dout(10) << __func__ << " joining realm " << *realm |
7c673cae | 3445 | << ", leaving realm " << *containing_realm << dendl; |
11fdf7f2 TL |
3446 | for (auto& p : client_caps) { |
3447 | containing_realm->remove_cap(p.first, &p.second); | |
3448 | realm->add_cap(p.first, &p.second); | |
7c673cae FG |
3449 | } |
3450 | item_caps.remove_myself(); | |
3451 | realm->inodes_with_caps.push_back(&item_caps); | |
3452 | containing_realm = realm; | |
3453 | } | |
3454 | ||
3455 | Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session) | |
3456 | { | |
3457 | Capability *cap = get_client_cap(client); | |
3458 | if (cap) { | |
3459 | // FIXME? | |
3460 | cap->merge(icr.capinfo.wanted, icr.capinfo.issued); | |
3461 | } else { | |
3462 | cap = add_client_cap(client, session); | |
3463 | cap->set_cap_id(icr.capinfo.cap_id); | |
3464 | cap->set_wanted(icr.capinfo.wanted); | |
3465 | cap->issue_norevoke(icr.capinfo.issued); | |
3466 | cap->reset_seq(); | |
3467 | } | |
3468 | cap->set_last_issue_stamp(ceph_clock_now()); | |
3469 | return cap; | |
3470 | } | |
3471 | ||
3472 | void CInode::clear_client_caps_after_export() | |
3473 | { | |
3474 | while (!client_caps.empty()) | |
3475 | remove_client_cap(client_caps.begin()->first); | |
3476 | loner_cap = -1; | |
3477 | want_loner_cap = -1; | |
11fdf7f2 TL |
3478 | if (!get_mds_caps_wanted().empty()) { |
3479 | mempool::mds_co::compact_map<int32_t,int32_t> empty; | |
3480 | set_mds_caps_wanted(empty); | |
3481 | } | |
7c673cae FG |
3482 | } |
3483 | ||
3484 | void CInode::export_client_caps(map<client_t,Capability::Export>& cl) | |
3485 | { | |
11fdf7f2 TL |
3486 | for (const auto &p : client_caps) { |
3487 | cl[p.first] = p.second.make_export(); | |
7c673cae FG |
3488 | } |
3489 | } | |
3490 | ||
3491 | // caps allowed | |
3492 | int CInode::get_caps_liked() const | |
3493 | { | |
3494 | if (is_dir()) | |
3495 | return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER | |
3496 | else | |
3497 | return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO; | |
3498 | } | |
3499 | ||
3500 | int CInode::get_caps_allowed_ever() const | |
3501 | { | |
3502 | int allowed; | |
3503 | if (is_dir()) | |
3504 | allowed = CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; | |
3505 | else | |
3506 | allowed = CEPH_CAP_ANY; | |
3507 | return allowed & | |
3508 | (CEPH_CAP_PIN | | |
3509 | (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) | | |
3510 | (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) | | |
3511 | (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) | | |
3512 | (linklock.gcaps_allowed_ever() << linklock.get_cap_shift())); | |
3513 | } | |
3514 | ||
3515 | int CInode::get_caps_allowed_by_type(int type) const | |
3516 | { | |
3517 | return | |
3518 | CEPH_CAP_PIN | | |
3519 | (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) | | |
3520 | (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) | | |
3521 | (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) | | |
3522 | (linklock.gcaps_allowed(type) << linklock.get_cap_shift()); | |
3523 | } | |
3524 | ||
3525 | int CInode::get_caps_careful() const | |
3526 | { | |
3527 | return | |
3528 | (filelock.gcaps_careful() << filelock.get_cap_shift()) | | |
3529 | (authlock.gcaps_careful() << authlock.get_cap_shift()) | | |
3530 | (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) | | |
3531 | (linklock.gcaps_careful() << linklock.get_cap_shift()); | |
3532 | } | |
3533 | ||
3534 | int CInode::get_xlocker_mask(client_t client) const | |
3535 | { | |
3536 | return | |
3537 | (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) | | |
3538 | (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) | | |
3539 | (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) | | |
3540 | (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift()); | |
3541 | } | |
3542 | ||
11fdf7f2 | 3543 | int CInode::get_caps_allowed_for_client(Session *session, Capability *cap, |
f67539c2 | 3544 | const mempool_inode *file_i) const |
7c673cae | 3545 | { |
11fdf7f2 | 3546 | client_t client = session->get_client(); |
7c673cae FG |
3547 | int allowed; |
3548 | if (client == get_loner()) { | |
3549 | // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked | |
3550 | allowed = | |
3551 | get_caps_allowed_by_type(CAP_LONER) | | |
3552 | (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client)); | |
3553 | } else { | |
3554 | allowed = get_caps_allowed_by_type(CAP_ANY); | |
3555 | } | |
3556 | ||
9f95a23c TL |
3557 | if (is_dir()) { |
3558 | allowed &= ~CEPH_CAP_ANY_DIR_OPS; | |
3559 | if (cap && (allowed & CEPH_CAP_FILE_EXCL)) | |
3560 | allowed |= cap->get_lock_cache_allowed(); | |
3561 | } else { | |
11fdf7f2 TL |
3562 | if (file_i->inline_data.version == CEPH_INLINE_NONE && |
3563 | file_i->layout.pool_ns.empty()) { | |
3564 | // noop | |
3565 | } else if (cap) { | |
3566 | if ((file_i->inline_data.version != CEPH_INLINE_NONE && | |
3567 | cap->is_noinline()) || | |
3568 | (!file_i->layout.pool_ns.empty() && | |
3569 | cap->is_nopoolns())) | |
3570 | allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR); | |
3571 | } else { | |
3572 | auto& conn = session->get_connection(); | |
3573 | if ((file_i->inline_data.version != CEPH_INLINE_NONE && | |
3574 | !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) || | |
3575 | (!file_i->layout.pool_ns.empty() && | |
3576 | !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))) | |
3577 | allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR); | |
3578 | } | |
7c673cae FG |
3579 | } |
3580 | return allowed; | |
3581 | } | |
3582 | ||
3583 | // caps issued, wanted | |
3584 | int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker, | |
3585 | int shift, int mask) | |
3586 | { | |
3587 | int c = 0; | |
3588 | int loner = 0, other = 0, xlocker = 0; | |
3589 | if (!is_auth()) { | |
3590 | loner_cap = -1; | |
3591 | } | |
3592 | ||
11fdf7f2 TL |
3593 | for (const auto &p : client_caps) { |
3594 | int i = p.second.issued(); | |
7c673cae | 3595 | c |= i; |
11fdf7f2 | 3596 | if (p.first == loner_cap) |
7c673cae FG |
3597 | loner |= i; |
3598 | else | |
3599 | other |= i; | |
11fdf7f2 | 3600 | xlocker |= get_xlocker_mask(p.first) & i; |
7c673cae FG |
3601 | } |
3602 | if (ploner) *ploner = (loner >> shift) & mask; | |
3603 | if (pother) *pother = (other >> shift) & mask; | |
3604 | if (pxlocker) *pxlocker = (xlocker >> shift) & mask; | |
3605 | return (c >> shift) & mask; | |
3606 | } | |
3607 | ||
3608 | bool CInode::is_any_caps_wanted() const | |
3609 | { | |
11fdf7f2 TL |
3610 | for (const auto &p : client_caps) { |
3611 | if (p.second.wanted()) | |
7c673cae | 3612 | return true; |
11fdf7f2 | 3613 | } |
7c673cae FG |
3614 | return false; |
3615 | } | |
3616 | ||
3617 | int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const | |
3618 | { | |
3619 | int w = 0; | |
3620 | int loner = 0, other = 0; | |
11fdf7f2 TL |
3621 | for (const auto &p : client_caps) { |
3622 | if (!p.second.is_stale()) { | |
3623 | int t = p.second.wanted(); | |
7c673cae | 3624 | w |= t; |
11fdf7f2 | 3625 | if (p.first == loner_cap) |
7c673cae FG |
3626 | loner |= t; |
3627 | else | |
3628 | other |= t; | |
3629 | } | |
3630 | //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl; | |
3631 | } | |
3632 | if (is_auth()) | |
94b18763 FG |
3633 | for (const auto &p : mds_caps_wanted) { |
3634 | w |= p.second; | |
3635 | other |= p.second; | |
7c673cae FG |
3636 | //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl; |
3637 | } | |
3638 | if (ploner) *ploner = (loner >> shift) & mask; | |
3639 | if (pother) *pother = (other >> shift) & mask; | |
3640 | return (w >> shift) & mask; | |
3641 | } | |
3642 | ||
3643 | bool CInode::issued_caps_need_gather(SimpleLock *lock) | |
3644 | { | |
3645 | int loner_issued, other_issued, xlocker_issued; | |
3646 | get_caps_issued(&loner_issued, &other_issued, &xlocker_issued, | |
3647 | lock->get_cap_shift(), lock->get_cap_mask()); | |
3648 | if ((loner_issued & ~lock->gcaps_allowed(CAP_LONER)) || | |
3649 | (other_issued & ~lock->gcaps_allowed(CAP_ANY)) || | |
3650 | (xlocker_issued & ~lock->gcaps_allowed(CAP_XLOCKER))) | |
3651 | return true; | |
3652 | return false; | |
3653 | } | |
3654 | ||
f91f0fd5 TL |
3655 | void CInode::adjust_num_caps_notable(int d) |
3656 | { | |
3657 | if (!is_clientwriteable()) { | |
3658 | if (!num_caps_notable && d > 0) | |
3659 | mdcache->open_file_table.add_inode(this); | |
3660 | else if (num_caps_notable > 0 && num_caps_notable == -d) | |
3661 | mdcache->open_file_table.remove_inode(this); | |
3662 | } | |
3663 | ||
3664 | num_caps_notable +=d; | |
3665 | ceph_assert(num_caps_notable >= 0); | |
3666 | } | |
3667 | ||
3668 | void CInode::mark_clientwriteable() | |
3669 | { | |
3670 | if (last != CEPH_NOSNAP) | |
3671 | return; | |
3672 | if (!state_test(STATE_CLIENTWRITEABLE)) { | |
3673 | if (num_caps_notable == 0) | |
3674 | mdcache->open_file_table.add_inode(this); | |
3675 | state_set(STATE_CLIENTWRITEABLE); | |
3676 | } | |
3677 | } | |
3678 | ||
3679 | void CInode::clear_clientwriteable() | |
3680 | { | |
3681 | if (state_test(STATE_CLIENTWRITEABLE)) { | |
3682 | if (num_caps_notable == 0) | |
3683 | mdcache->open_file_table.remove_inode(this); | |
3684 | state_clear(STATE_CLIENTWRITEABLE); | |
3685 | } | |
3686 | } | |
7c673cae FG |
3687 | |
3688 | // ============================================= | |
3689 | ||
3690 | int CInode::encode_inodestat(bufferlist& bl, Session *session, | |
3691 | SnapRealm *dir_realm, | |
3692 | snapid_t snapid, | |
3693 | unsigned max_bytes, | |
3694 | int getattr_caps) | |
3695 | { | |
11fdf7f2 TL |
3696 | client_t client = session->get_client(); |
3697 | ceph_assert(snapid); | |
7c673cae FG |
3698 | |
3699 | bool valid = true; | |
3700 | ||
3701 | // pick a version! | |
f67539c2 TL |
3702 | const mempool_inode *oi = get_inode().get(); |
3703 | const mempool_inode *pi = get_projected_inode().get(); | |
7c673cae | 3704 | |
f67539c2 | 3705 | const mempool_xattr_map *pxattrs = nullptr; |
7c673cae FG |
3706 | |
3707 | if (snapid != CEPH_NOSNAP) { | |
3708 | ||
3709 | // for now at least, old_inodes is only defined/valid on the auth | |
3710 | if (!is_auth()) | |
3711 | valid = false; | |
3712 | ||
f67539c2 TL |
3713 | if (is_any_old_inodes()) { |
3714 | auto it = old_inodes->lower_bound(snapid); | |
3715 | if (it != old_inodes->end()) { | |
94b18763 | 3716 | if (it->second.first > snapid) { |
f67539c2 | 3717 | if (it != old_inodes->begin()) |
94b18763 | 3718 | --it; |
7c673cae | 3719 | } |
94b18763 FG |
3720 | if (it->second.first <= snapid && snapid <= it->first) { |
3721 | dout(15) << __func__ << " snapid " << snapid | |
3722 | << " to old_inode [" << it->second.first << "," << it->first << "]" | |
3723 | << " " << it->second.inode.rstat | |
7c673cae | 3724 | << dendl; |
f67539c2 TL |
3725 | pi = oi = &it->second.inode; |
3726 | pxattrs = &it->second.xattrs; | |
7c673cae FG |
3727 | } else { |
3728 | // snapshoted remote dentry can result this | |
11fdf7f2 | 3729 | dout(0) << __func__ << " old_inode for snapid " << snapid |
7c673cae FG |
3730 | << " not found" << dendl; |
3731 | } | |
3732 | } | |
3733 | } else if (snapid < first || snapid > last) { | |
3734 | // snapshoted remote dentry can result this | |
11fdf7f2 | 3735 | dout(0) << __func__ << " [" << first << "," << last << "]" |
7c673cae FG |
3736 | << " not match snapid " << snapid << dendl; |
3737 | } | |
3738 | } | |
3739 | ||
81eedcae | 3740 | utime_t snap_btime; |
f67539c2 | 3741 | std::map<std::string, std::string> snap_metadata; |
7c673cae | 3742 | SnapRealm *realm = find_snaprealm(); |
81eedcae TL |
3743 | if (snapid != CEPH_NOSNAP && realm) { |
3744 | // add snapshot timestamp vxattr | |
3745 | map<snapid_t,const SnapInfo*> infomap; | |
3746 | realm->get_snap_info(infomap, | |
3747 | snapid, // min | |
3748 | snapid); // max | |
3749 | if (!infomap.empty()) { | |
3750 | ceph_assert(infomap.size() == 1); | |
3751 | const SnapInfo *si = infomap.begin()->second; | |
3752 | snap_btime = si->stamp; | |
f67539c2 | 3753 | snap_metadata = si->metadata; |
81eedcae TL |
3754 | } |
3755 | } | |
3756 | ||
7c673cae FG |
3757 | |
3758 | bool no_caps = !valid || | |
3759 | session->is_stale() || | |
3760 | (dir_realm && realm != dir_realm) || | |
3761 | is_frozen() || | |
3762 | state_test(CInode::STATE_EXPORTINGCAPS); | |
3763 | if (no_caps) | |
11fdf7f2 | 3764 | dout(20) << __func__ << " no caps" |
7c673cae FG |
3765 | << (!valid?", !valid":"") |
3766 | << (session->is_stale()?", session stale ":"") | |
3767 | << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"") | |
3768 | << (is_frozen()?", frozen inode":"") | |
3769 | << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"") | |
3770 | << dendl; | |
3771 | ||
3772 | ||
3773 | // "fake" a version that is old (stable) version, +1 if projected. | |
3774 | version_t version = (oi->version * 2) + is_projected(); | |
3775 | ||
3776 | Capability *cap = get_client_cap(client); | |
3777 | bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client; | |
3778 | //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL)); | |
3779 | bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client; | |
3780 | bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client; | |
3781 | bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client; | |
3782 | ||
3783 | bool plocal = versionlock.get_last_wrlock_client() == client; | |
3784 | bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client; | |
3785 | ||
f67539c2 | 3786 | const mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi; |
7c673cae FG |
3787 | |
3788 | dout(20) << " pfile " << pfile << " pauth " << pauth | |
3789 | << " plink " << plink << " pxattr " << pxattr | |
3790 | << " plocal " << plocal | |
3791 | << " ctime " << any_i->ctime | |
3792 | << " valid=" << valid << dendl; | |
3793 | ||
3794 | // file | |
f67539c2 | 3795 | const mempool_inode *file_i = pfile ? pi:oi; |
7c673cae FG |
3796 | file_layout_t layout; |
3797 | if (is_dir()) { | |
3798 | layout = (ppolicy ? pi : oi)->layout; | |
3799 | } else { | |
3800 | layout = file_i->layout; | |
3801 | } | |
3802 | ||
3803 | // max_size is min of projected, actual | |
3804 | uint64_t max_size = | |
f91f0fd5 TL |
3805 | std::min(oi->get_client_range(client), |
3806 | pi->get_client_range(client)); | |
7c673cae FG |
3807 | |
3808 | // inline data | |
3809 | version_t inline_version = 0; | |
3810 | bufferlist inline_data; | |
3811 | if (file_i->inline_data.version == CEPH_INLINE_NONE) { | |
3812 | inline_version = CEPH_INLINE_NONE; | |
3813 | } else if ((!cap && !no_caps) || | |
3814 | (cap && cap->client_inline_version < file_i->inline_data.version) || | |
3815 | (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data | |
3816 | inline_version = file_i->inline_data.version; | |
3817 | if (file_i->inline_data.length() > 0) | |
f67539c2 | 3818 | file_i->inline_data.get_data(inline_data); |
7c673cae FG |
3819 | } |
3820 | ||
3821 | // nest (do same as file... :/) | |
3822 | if (cap) { | |
3823 | cap->last_rbytes = file_i->rstat.rbytes; | |
3824 | cap->last_rsize = file_i->rstat.rsize(); | |
3825 | } | |
3826 | ||
3827 | // auth | |
f67539c2 | 3828 | const mempool_inode *auth_i = pauth ? pi:oi; |
7c673cae FG |
3829 | |
3830 | // link | |
f67539c2 | 3831 | const mempool_inode *link_i = plink ? pi:oi; |
7c673cae FG |
3832 | |
3833 | // xattr | |
f67539c2 | 3834 | const mempool_inode *xattr_i = pxattr ? pi:oi; |
7c673cae | 3835 | |
11fdf7f2 | 3836 | using ceph::encode; |
7c673cae | 3837 | // xattr |
7c673cae FG |
3838 | version_t xattr_version; |
3839 | if ((!cap && !no_caps) || | |
3840 | (cap && cap->client_xattr_version < xattr_i->xattr_version) || | |
3841 | (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs | |
3842 | if (!pxattrs) | |
f67539c2 | 3843 | pxattrs = pxattr ? get_projected_xattrs().get() : get_xattrs().get(); |
7c673cae FG |
3844 | xattr_version = xattr_i->xattr_version; |
3845 | } else { | |
3846 | xattr_version = 0; | |
3847 | } | |
3848 | ||
3849 | // do we have room? | |
3850 | if (max_bytes) { | |
11fdf7f2 TL |
3851 | unsigned bytes = |
3852 | 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) + | |
3853 | sizeof(struct ceph_file_layout) + | |
3854 | sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq | |
3855 | 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink | |
3856 | 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime | |
3857 | sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree | |
3858 | sizeof(__u32) + symlink.length() + // symlink | |
3859 | sizeof(struct ceph_dir_layout); // dir_layout | |
3860 | ||
3861 | if (xattr_version) { | |
3862 | bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries | |
3863 | if (pxattrs) { | |
3864 | for (const auto &p : *pxattrs) | |
3865 | bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length(); | |
3866 | } | |
3867 | } else { | |
3868 | bytes += sizeof(__u32); // xattr buffer len | |
3869 | } | |
3870 | bytes += | |
3871 | sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data | |
3872 | 1 + 1 + 8 + 8 + 4 + // quota | |
3873 | 4 + layout.pool_ns.size() + // pool ns | |
3874 | sizeof(struct ceph_timespec) + 8; // btime + change_attr | |
3875 | ||
7c673cae | 3876 | if (bytes > max_bytes) |
f67539c2 | 3877 | return -CEPHFS_ENOSPC; |
7c673cae FG |
3878 | } |
3879 | ||
3880 | ||
3881 | // encode caps | |
3882 | struct ceph_mds_reply_cap ecap; | |
3883 | if (snapid != CEPH_NOSNAP) { | |
3884 | /* | |
3885 | * snapped inodes (files or dirs) only get read-only caps. always | |
3886 | * issue everything possible, since it is read only. | |
3887 | * | |
3888 | * if a snapped inode has caps, limit issued caps based on the | |
3889 | * lock state. | |
3890 | * | |
3891 | * if it is a live inode, limit issued caps based on the lock | |
3892 | * state. | |
3893 | * | |
3894 | * do NOT adjust cap issued state, because the client always | |
3895 | * tracks caps per-snap and the mds does either per-interval or | |
3896 | * multiversion. | |
3897 | */ | |
3898 | ecap.caps = valid ? get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE; | |
3899 | if (last == CEPH_NOSNAP || is_any_caps()) | |
11fdf7f2 | 3900 | ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i); |
7c673cae FG |
3901 | ecap.seq = 0; |
3902 | ecap.mseq = 0; | |
3903 | ecap.realm = 0; | |
3904 | } else { | |
3905 | if (!no_caps && !cap) { | |
3906 | // add a new cap | |
3907 | cap = add_client_cap(client, session, realm); | |
b32b8144 FG |
3908 | if (is_auth()) |
3909 | choose_ideal_loner(); | |
7c673cae FG |
3910 | } |
3911 | ||
3912 | int issue = 0; | |
3913 | if (!no_caps && cap) { | |
3914 | int likes = get_caps_liked(); | |
11fdf7f2 | 3915 | int allowed = get_caps_allowed_for_client(session, cap, file_i); |
7c673cae | 3916 | issue = (cap->wanted() | likes) & allowed; |
494da23a | 3917 | cap->issue_norevoke(issue, true); |
7c673cae FG |
3918 | issue = cap->pending(); |
3919 | dout(10) << "encode_inodestat issuing " << ccap_string(issue) | |
3920 | << " seq " << cap->get_last_seq() << dendl; | |
3921 | } else if (cap && cap->is_new() && !dir_realm) { | |
3922 | // alway issue new caps to client, otherwise the caps get lost | |
11fdf7f2 | 3923 | ceph_assert(cap->is_stale()); |
494da23a TL |
3924 | ceph_assert(!cap->pending()); |
3925 | issue = CEPH_CAP_PIN; | |
3926 | cap->issue_norevoke(issue, true); | |
7c673cae FG |
3927 | dout(10) << "encode_inodestat issuing " << ccap_string(issue) |
3928 | << " seq " << cap->get_last_seq() | |
494da23a | 3929 | << "(stale&new caps)" << dendl; |
7c673cae FG |
3930 | } |
3931 | ||
3932 | if (issue) { | |
3933 | cap->set_last_issue(); | |
3934 | cap->set_last_issue_stamp(ceph_clock_now()); | |
7c673cae FG |
3935 | ecap.caps = issue; |
3936 | ecap.wanted = cap->wanted(); | |
3937 | ecap.cap_id = cap->get_cap_id(); | |
3938 | ecap.seq = cap->get_last_seq(); | |
3939 | ecap.mseq = cap->get_mseq(); | |
3940 | ecap.realm = realm->inode->ino(); | |
3941 | } else { | |
3942 | ecap.cap_id = 0; | |
3943 | ecap.caps = 0; | |
3944 | ecap.seq = 0; | |
3945 | ecap.mseq = 0; | |
3946 | ecap.realm = 0; | |
3947 | ecap.wanted = 0; | |
3948 | } | |
3949 | } | |
3950 | ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0; | |
3951 | dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps) | |
3952 | << " seq " << ecap.seq << " mseq " << ecap.mseq | |
11fdf7f2 | 3953 | << " xattrv " << xattr_version << dendl; |
7c673cae FG |
3954 | |
3955 | if (inline_data.length() && cap) { | |
3956 | if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) { | |
3957 | dout(10) << "including inline version " << inline_version << dendl; | |
3958 | cap->client_inline_version = inline_version; | |
3959 | } else { | |
3960 | dout(10) << "dropping inline version " << inline_version << dendl; | |
3961 | inline_version = 0; | |
3962 | inline_data.clear(); | |
3963 | } | |
3964 | } | |
3965 | ||
3966 | // include those xattrs? | |
11fdf7f2 | 3967 | if (xattr_version && cap) { |
7c673cae | 3968 | if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) { |
11fdf7f2 TL |
3969 | dout(10) << "including xattrs version " << xattr_version << dendl; |
3970 | cap->client_xattr_version = xattr_version; | |
7c673cae | 3971 | } else { |
11fdf7f2 | 3972 | dout(10) << "dropping xattrs version " << xattr_version << dendl; |
7c673cae FG |
3973 | xattr_version = 0; |
3974 | } | |
3975 | } | |
3976 | ||
11fdf7f2 TL |
3977 | // The end result of encode_xattrs() is equivalent to: |
3978 | // { | |
3979 | // bufferlist xbl; | |
3980 | // if (xattr_version) { | |
3981 | // if (pxattrs) | |
3982 | // encode(*pxattrs, bl); | |
3983 | // else | |
3984 | // encode((__u32)0, bl); | |
3985 | // } | |
3986 | // encode(xbl, bl); | |
3987 | // } | |
3988 | // | |
3989 | // But encoding xattrs into the 'xbl' requires a memory allocation. | |
3990 | // The 'bl' should have enough pre-allocated memory in most cases. | |
3991 | // Encoding xattrs directly into it can avoid the extra allocation. | |
3992 | auto encode_xattrs = [xattr_version, pxattrs, &bl]() { | |
3993 | using ceph::encode; | |
3994 | if (xattr_version) { | |
3995 | ceph_le32 xbl_len; | |
3996 | auto filler = bl.append_hole(sizeof(xbl_len)); | |
3997 | const auto starting_bl_len = bl.length(); | |
3998 | if (pxattrs) | |
3999 | encode(*pxattrs, bl); | |
4000 | else | |
4001 | encode((__u32)0, bl); | |
4002 | xbl_len = bl.length() - starting_bl_len; | |
4003 | filler.copy_in(sizeof(xbl_len), (char *)&xbl_len); | |
4004 | } else { | |
4005 | encode((__u32)0, bl); | |
4006 | } | |
4007 | }; | |
4008 | ||
7c673cae FG |
4009 | /* |
4010 | * note: encoding matches MClientReply::InodeStat | |
4011 | */ | |
11fdf7f2 | 4012 | if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) { |
f67539c2 | 4013 | ENCODE_START(6, 1, bl); |
11fdf7f2 TL |
4014 | encode(oi->ino, bl); |
4015 | encode(snapid, bl); | |
4016 | encode(oi->rdev, bl); | |
4017 | encode(version, bl); | |
4018 | encode(xattr_version, bl); | |
4019 | encode(ecap, bl); | |
4020 | { | |
4021 | ceph_file_layout legacy_layout; | |
4022 | layout.to_legacy(&legacy_layout); | |
4023 | encode(legacy_layout, bl); | |
4024 | } | |
4025 | encode(any_i->ctime, bl); | |
4026 | encode(file_i->mtime, bl); | |
4027 | encode(file_i->atime, bl); | |
4028 | encode(file_i->time_warp_seq, bl); | |
4029 | encode(file_i->size, bl); | |
4030 | encode(max_size, bl); | |
4031 | encode(file_i->truncate_size, bl); | |
4032 | encode(file_i->truncate_seq, bl); | |
4033 | encode(auth_i->mode, bl); | |
4034 | encode((uint32_t)auth_i->uid, bl); | |
4035 | encode((uint32_t)auth_i->gid, bl); | |
4036 | encode(link_i->nlink, bl); | |
4037 | encode(file_i->dirstat.nfiles, bl); | |
4038 | encode(file_i->dirstat.nsubdirs, bl); | |
4039 | encode(file_i->rstat.rbytes, bl); | |
4040 | encode(file_i->rstat.rfiles, bl); | |
4041 | encode(file_i->rstat.rsubdirs, bl); | |
4042 | encode(file_i->rstat.rctime, bl); | |
4043 | dirfragtree.encode(bl); | |
4044 | encode(symlink, bl); | |
4045 | encode(file_i->dir_layout, bl); | |
4046 | encode_xattrs(); | |
4047 | encode(inline_version, bl); | |
4048 | encode(inline_data, bl); | |
f67539c2 | 4049 | const mempool_inode *policy_i = ppolicy ? pi : oi; |
11fdf7f2 TL |
4050 | encode(policy_i->quota, bl); |
4051 | encode(layout.pool_ns, bl); | |
4052 | encode(any_i->btime, bl); | |
4053 | encode(any_i->change_attr, bl); | |
4054 | encode(file_i->export_pin, bl); | |
81eedcae | 4055 | encode(snap_btime, bl); |
f67539c2 TL |
4056 | encode(file_i->rstat.rsnaps, bl); |
4057 | encode(snap_metadata, bl); | |
4058 | encode(file_i->fscrypt, bl); | |
11fdf7f2 TL |
4059 | ENCODE_FINISH(bl); |
4060 | } | |
4061 | else { | |
4062 | ceph_assert(session->get_connection()); | |
4063 | ||
4064 | encode(oi->ino, bl); | |
4065 | encode(snapid, bl); | |
4066 | encode(oi->rdev, bl); | |
4067 | encode(version, bl); | |
4068 | encode(xattr_version, bl); | |
4069 | encode(ecap, bl); | |
4070 | { | |
4071 | ceph_file_layout legacy_layout; | |
4072 | layout.to_legacy(&legacy_layout); | |
4073 | encode(legacy_layout, bl); | |
4074 | } | |
4075 | encode(any_i->ctime, bl); | |
4076 | encode(file_i->mtime, bl); | |
4077 | encode(file_i->atime, bl); | |
4078 | encode(file_i->time_warp_seq, bl); | |
4079 | encode(file_i->size, bl); | |
4080 | encode(max_size, bl); | |
4081 | encode(file_i->truncate_size, bl); | |
4082 | encode(file_i->truncate_seq, bl); | |
4083 | encode(auth_i->mode, bl); | |
4084 | encode((uint32_t)auth_i->uid, bl); | |
4085 | encode((uint32_t)auth_i->gid, bl); | |
4086 | encode(link_i->nlink, bl); | |
4087 | encode(file_i->dirstat.nfiles, bl); | |
4088 | encode(file_i->dirstat.nsubdirs, bl); | |
4089 | encode(file_i->rstat.rbytes, bl); | |
4090 | encode(file_i->rstat.rfiles, bl); | |
4091 | encode(file_i->rstat.rsubdirs, bl); | |
4092 | encode(file_i->rstat.rctime, bl); | |
4093 | dirfragtree.encode(bl); | |
4094 | encode(symlink, bl); | |
4095 | auto& conn = session->get_connection(); | |
4096 | if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) { | |
4097 | encode(file_i->dir_layout, bl); | |
4098 | } | |
4099 | encode_xattrs(); | |
4100 | if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) { | |
4101 | encode(inline_version, bl); | |
4102 | encode(inline_data, bl); | |
4103 | } | |
4104 | if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) { | |
f67539c2 | 4105 | const mempool_inode *policy_i = ppolicy ? pi : oi; |
11fdf7f2 TL |
4106 | encode(policy_i->quota, bl); |
4107 | } | |
4108 | if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) { | |
4109 | encode(layout.pool_ns, bl); | |
4110 | } | |
4111 | if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) { | |
4112 | encode(any_i->btime, bl); | |
4113 | encode(any_i->change_attr, bl); | |
4114 | } | |
7c673cae FG |
4115 | } |
4116 | ||
4117 | return valid; | |
4118 | } | |
4119 | ||
9f95a23c | 4120 | void CInode::encode_cap_message(const ref_t<MClientCaps> &m, Capability *cap) |
7c673cae | 4121 | { |
11fdf7f2 | 4122 | ceph_assert(cap); |
7c673cae FG |
4123 | |
4124 | client_t client = cap->get_client(); | |
4125 | ||
4126 | bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL); | |
4127 | bool pauth = authlock.is_xlocked_by_client(client); | |
4128 | bool plink = linklock.is_xlocked_by_client(client); | |
4129 | bool pxattr = xattrlock.is_xlocked_by_client(client); | |
4130 | ||
f67539c2 TL |
4131 | const mempool_inode *oi = get_inode().get(); |
4132 | const mempool_inode *pi = get_projected_inode().get(); | |
4133 | const mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi; | |
7c673cae | 4134 | |
11fdf7f2 | 4135 | dout(20) << __func__ << " pfile " << pfile |
7c673cae FG |
4136 | << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr |
4137 | << " ctime " << i->ctime << dendl; | |
4138 | ||
4139 | i = pfile ? pi:oi; | |
4140 | m->set_layout(i->layout); | |
4141 | m->size = i->size; | |
4142 | m->truncate_seq = i->truncate_seq; | |
4143 | m->truncate_size = i->truncate_size; | |
4144 | m->mtime = i->mtime; | |
4145 | m->atime = i->atime; | |
4146 | m->ctime = i->ctime; | |
20effc67 | 4147 | m->btime = i->btime; |
7c673cae FG |
4148 | m->change_attr = i->change_attr; |
4149 | m->time_warp_seq = i->time_warp_seq; | |
28e407b8 AA |
4150 | m->nfiles = i->dirstat.nfiles; |
4151 | m->nsubdirs = i->dirstat.nsubdirs; | |
7c673cae FG |
4152 | |
4153 | if (cap->client_inline_version < i->inline_data.version) { | |
4154 | m->inline_version = cap->client_inline_version = i->inline_data.version; | |
4155 | if (i->inline_data.length() > 0) | |
f67539c2 | 4156 | i->inline_data.get_data(m->inline_data); |
7c673cae FG |
4157 | } else { |
4158 | m->inline_version = 0; | |
4159 | } | |
4160 | ||
4161 | // max_size is min of projected, actual. | |
f91f0fd5 TL |
4162 | uint64_t oldms = oi->get_client_range(client); |
4163 | uint64_t newms = pi->get_client_range(client); | |
11fdf7f2 | 4164 | m->max_size = std::min(oldms, newms); |
7c673cae FG |
4165 | |
4166 | i = pauth ? pi:oi; | |
4167 | m->head.mode = i->mode; | |
4168 | m->head.uid = i->uid; | |
4169 | m->head.gid = i->gid; | |
4170 | ||
4171 | i = plink ? pi:oi; | |
4172 | m->head.nlink = i->nlink; | |
4173 | ||
11fdf7f2 | 4174 | using ceph::encode; |
7c673cae | 4175 | i = pxattr ? pi:oi; |
f67539c2 | 4176 | const auto& ix = pxattr ? get_projected_xattrs() : get_xattrs(); |
7c673cae FG |
4177 | if ((cap->pending() & CEPH_CAP_XATTR_SHARED) && |
4178 | i->xattr_version > cap->client_xattr_version) { | |
4179 | dout(10) << " including xattrs v " << i->xattr_version << dendl; | |
f67539c2 TL |
4180 | if (ix) |
4181 | encode(*ix, m->xattrbl); | |
4182 | else | |
4183 | encode((__u32)0, m->xattrbl); | |
7c673cae FG |
4184 | m->head.xattr_version = i->xattr_version; |
4185 | cap->client_xattr_version = i->xattr_version; | |
4186 | } | |
4187 | } | |
4188 | ||
4189 | ||
4190 | ||
4191 | void CInode::_encode_base(bufferlist& bl, uint64_t features) | |
4192 | { | |
9f95a23c | 4193 | ENCODE_START(1, 1, bl); |
11fdf7f2 | 4194 | encode(first, bl); |
f67539c2 | 4195 | encode(*get_inode(), bl, features); |
11fdf7f2 TL |
4196 | encode(symlink, bl); |
4197 | encode(dirfragtree, bl); | |
f67539c2 TL |
4198 | encode_xattrs(bl); |
4199 | encode_old_inodes(bl, features); | |
11fdf7f2 | 4200 | encode(damage_flags, bl); |
7c673cae | 4201 | encode_snap(bl); |
9f95a23c | 4202 | ENCODE_FINISH(bl); |
7c673cae | 4203 | } |
11fdf7f2 | 4204 | void CInode::_decode_base(bufferlist::const_iterator& p) |
7c673cae | 4205 | { |
9f95a23c | 4206 | DECODE_START(1, p); |
11fdf7f2 | 4207 | decode(first, p); |
f67539c2 TL |
4208 | { |
4209 | auto _inode = allocate_inode(); | |
4210 | decode(*_inode, p); | |
4211 | reset_inode(std::move(_inode)); | |
4212 | } | |
94b18763 FG |
4213 | { |
4214 | std::string tmp; | |
11fdf7f2 TL |
4215 | decode(tmp, p); |
4216 | symlink = std::string_view(tmp); | |
94b18763 | 4217 | } |
11fdf7f2 | 4218 | decode(dirfragtree, p); |
f67539c2 TL |
4219 | decode_xattrs(p); |
4220 | decode_old_inodes(p); | |
11fdf7f2 | 4221 | decode(damage_flags, p); |
7c673cae | 4222 | decode_snap(p); |
9f95a23c | 4223 | DECODE_FINISH(p); |
7c673cae FG |
4224 | } |
4225 | ||
4226 | void CInode::_encode_locks_full(bufferlist& bl) | |
4227 | { | |
11fdf7f2 TL |
4228 | using ceph::encode; |
4229 | encode(authlock, bl); | |
4230 | encode(linklock, bl); | |
4231 | encode(dirfragtreelock, bl); | |
4232 | encode(filelock, bl); | |
4233 | encode(xattrlock, bl); | |
4234 | encode(snaplock, bl); | |
4235 | encode(nestlock, bl); | |
4236 | encode(flocklock, bl); | |
4237 | encode(policylock, bl); | |
4238 | ||
4239 | encode(loner_cap, bl); | |
4240 | } | |
4241 | void CInode::_decode_locks_full(bufferlist::const_iterator& p) | |
4242 | { | |
4243 | using ceph::decode; | |
4244 | decode(authlock, p); | |
4245 | decode(linklock, p); | |
4246 | decode(dirfragtreelock, p); | |
4247 | decode(filelock, p); | |
4248 | decode(xattrlock, p); | |
4249 | decode(snaplock, p); | |
4250 | decode(nestlock, p); | |
4251 | decode(flocklock, p); | |
4252 | decode(policylock, p); | |
4253 | ||
4254 | decode(loner_cap, p); | |
7c673cae FG |
4255 | set_loner_cap(loner_cap); |
4256 | want_loner_cap = loner_cap; // for now, we'll eval() shortly. | |
4257 | } | |
4258 | ||
b32b8144 | 4259 | void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover) |
7c673cae | 4260 | { |
9f95a23c | 4261 | ENCODE_START(1, 1, bl); |
7c673cae FG |
4262 | authlock.encode_state_for_replica(bl); |
4263 | linklock.encode_state_for_replica(bl); | |
4264 | dirfragtreelock.encode_state_for_replica(bl); | |
4265 | filelock.encode_state_for_replica(bl); | |
4266 | nestlock.encode_state_for_replica(bl); | |
4267 | xattrlock.encode_state_for_replica(bl); | |
4268 | snaplock.encode_state_for_replica(bl); | |
4269 | flocklock.encode_state_for_replica(bl); | |
4270 | policylock.encode_state_for_replica(bl); | |
11fdf7f2 | 4271 | encode(need_recover, bl); |
9f95a23c | 4272 | ENCODE_FINISH(bl); |
7c673cae | 4273 | } |
b32b8144 | 4274 | |
7c673cae FG |
4275 | void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep) |
4276 | { | |
4277 | authlock.encode_state_for_replica(bl); | |
4278 | linklock.encode_state_for_replica(bl); | |
4279 | dirfragtreelock.encode_state_for_rejoin(bl, rep); | |
4280 | filelock.encode_state_for_rejoin(bl, rep); | |
4281 | nestlock.encode_state_for_rejoin(bl, rep); | |
4282 | xattrlock.encode_state_for_replica(bl); | |
4283 | snaplock.encode_state_for_replica(bl); | |
4284 | flocklock.encode_state_for_replica(bl); | |
4285 | policylock.encode_state_for_replica(bl); | |
4286 | } | |
b32b8144 | 4287 | |
9f95a23c | 4288 | void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new) |
7c673cae | 4289 | { |
9f95a23c | 4290 | DECODE_START(1, p); |
7c673cae FG |
4291 | authlock.decode_state(p, is_new); |
4292 | linklock.decode_state(p, is_new); | |
4293 | dirfragtreelock.decode_state(p, is_new); | |
4294 | filelock.decode_state(p, is_new); | |
4295 | nestlock.decode_state(p, is_new); | |
4296 | xattrlock.decode_state(p, is_new); | |
4297 | snaplock.decode_state(p, is_new); | |
4298 | flocklock.decode_state(p, is_new); | |
4299 | policylock.decode_state(p, is_new); | |
b32b8144 FG |
4300 | |
4301 | bool need_recover; | |
11fdf7f2 | 4302 | decode(need_recover, p); |
b32b8144 FG |
4303 | if (need_recover && is_new) { |
4304 | // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock | |
4305 | // and change the object when replaying unsafe requests. | |
4306 | authlock.mark_need_recover(); | |
4307 | linklock.mark_need_recover(); | |
4308 | dirfragtreelock.mark_need_recover(); | |
4309 | filelock.mark_need_recover(); | |
4310 | nestlock.mark_need_recover(); | |
4311 | xattrlock.mark_need_recover(); | |
4312 | snaplock.mark_need_recover(); | |
4313 | flocklock.mark_need_recover(); | |
4314 | policylock.mark_need_recover(); | |
4315 | } | |
9f95a23c | 4316 | DECODE_FINISH(p); |
7c673cae | 4317 | } |
11fdf7f2 | 4318 | void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters, |
b32b8144 FG |
4319 | list<SimpleLock*>& eval_locks, bool survivor) |
4320 | { | |
4321 | authlock.decode_state_rejoin(p, waiters, survivor); | |
4322 | linklock.decode_state_rejoin(p, waiters, survivor); | |
4323 | dirfragtreelock.decode_state_rejoin(p, waiters, survivor); | |
4324 | filelock.decode_state_rejoin(p, waiters, survivor); | |
4325 | nestlock.decode_state_rejoin(p, waiters, survivor); | |
4326 | xattrlock.decode_state_rejoin(p, waiters, survivor); | |
4327 | snaplock.decode_state_rejoin(p, waiters, survivor); | |
4328 | flocklock.decode_state_rejoin(p, waiters, survivor); | |
4329 | policylock.decode_state_rejoin(p, waiters, survivor); | |
7c673cae FG |
4330 | |
4331 | if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked()) | |
4332 | eval_locks.push_back(&dirfragtreelock); | |
4333 | if (!filelock.is_stable() && !filelock.is_wrlocked()) | |
4334 | eval_locks.push_back(&filelock); | |
4335 | if (!nestlock.is_stable() && !nestlock.is_wrlocked()) | |
4336 | eval_locks.push_back(&nestlock); | |
4337 | } | |
4338 | ||
4339 | ||
4340 | // IMPORT/EXPORT | |
4341 | ||
4342 | void CInode::encode_export(bufferlist& bl) | |
4343 | { | |
4344 | ENCODE_START(5, 4, bl); | |
4345 | _encode_base(bl, mdcache->mds->mdsmap->get_up_features()); | |
4346 | ||
11fdf7f2 | 4347 | encode(state, bl); |
7c673cae | 4348 | |
11fdf7f2 | 4349 | encode(pop, bl); |
7c673cae | 4350 | |
11fdf7f2 | 4351 | encode(get_replicas(), bl); |
7c673cae FG |
4352 | |
4353 | // include scatterlock info for any bounding CDirs | |
4354 | bufferlist bounding; | |
f67539c2 | 4355 | if (get_inode()->is_dir()) |
94b18763 FG |
4356 | for (const auto &p : dirfrags) { |
4357 | CDir *dir = p.second; | |
7c673cae | 4358 | if (dir->state_test(CDir::STATE_EXPORTBOUND)) { |
11fdf7f2 | 4359 | encode(p.first, bounding); |
f67539c2 TL |
4360 | encode(dir->get_fnode()->fragstat, bounding); |
4361 | encode(dir->get_fnode()->accounted_fragstat, bounding); | |
4362 | encode(dir->get_fnode()->rstat, bounding); | |
4363 | encode(dir->get_fnode()->accounted_rstat, bounding); | |
7c673cae FG |
4364 | dout(10) << " encoded fragstat/rstat info for " << *dir << dendl; |
4365 | } | |
4366 | } | |
11fdf7f2 | 4367 | encode(bounding, bl); |
7c673cae FG |
4368 | |
4369 | _encode_locks_full(bl); | |
4370 | ||
4371 | _encode_file_locks(bl); | |
4372 | ||
4373 | ENCODE_FINISH(bl); | |
4374 | ||
4375 | get(PIN_TEMPEXPORTING); | |
4376 | } | |
4377 | ||
11fdf7f2 | 4378 | void CInode::finish_export() |
7c673cae FG |
4379 | { |
4380 | state &= MASK_STATE_EXPORT_KEPT; | |
4381 | ||
11fdf7f2 | 4382 | pop.zero(); |
7c673cae FG |
4383 | |
4384 | // just in case! | |
4385 | //dirlock.clear_updated(); | |
4386 | ||
4387 | loner_cap = -1; | |
4388 | ||
4389 | put(PIN_TEMPEXPORTING); | |
4390 | } | |
4391 | ||
11fdf7f2 | 4392 | void CInode::decode_import(bufferlist::const_iterator& p, |
7c673cae FG |
4393 | LogSegment *ls) |
4394 | { | |
4395 | DECODE_START(5, p); | |
4396 | ||
4397 | _decode_base(p); | |
4398 | ||
f6b5b4d7 TL |
4399 | { |
4400 | unsigned s; | |
4401 | decode(s, p); | |
4402 | s &= MASK_STATE_EXPORTED; | |
4403 | ||
f67539c2 TL |
4404 | set_ephemeral_pin((s & STATE_DISTEPHEMERALPIN), |
4405 | (s & STATE_RANDEPHEMERALPIN)); | |
f6b5b4d7 TL |
4406 | state_set(STATE_AUTH | s); |
4407 | } | |
7c673cae FG |
4408 | |
4409 | if (is_dirty()) { | |
4410 | get(PIN_DIRTY); | |
4411 | _mark_dirty(ls); | |
4412 | } | |
4413 | if (is_dirty_parent()) { | |
4414 | get(PIN_DIRTYPARENT); | |
28e407b8 | 4415 | mark_dirty_parent(ls); |
7c673cae FG |
4416 | } |
4417 | ||
11fdf7f2 | 4418 | decode(pop, p); |
7c673cae | 4419 | |
11fdf7f2 | 4420 | decode(get_replicas(), p); |
181888fb | 4421 | if (is_replicated()) |
7c673cae FG |
4422 | get(PIN_REPLICATED); |
4423 | replica_nonce = 0; | |
4424 | ||
4425 | // decode fragstat info on bounding cdirs | |
4426 | bufferlist bounding; | |
11fdf7f2 TL |
4427 | decode(bounding, p); |
4428 | auto q = bounding.cbegin(); | |
7c673cae FG |
4429 | while (!q.end()) { |
4430 | frag_t fg; | |
11fdf7f2 | 4431 | decode(fg, q); |
7c673cae | 4432 | CDir *dir = get_dirfrag(fg); |
11fdf7f2 | 4433 | ceph_assert(dir); // we should have all bounds open |
7c673cae FG |
4434 | |
4435 | // Only take the remote's fragstat/rstat if we are non-auth for | |
4436 | // this dirfrag AND the lock is NOT in a scattered (MIX) state. | |
4437 | // We know lock is stable, and MIX is the only state in which | |
4438 | // the inode auth (who sent us this data) may not have the best | |
4439 | // info. | |
4440 | ||
4441 | // HMM: Are there cases where dir->is_auth() is an insufficient | |
4442 | // check because the dirfrag is under migration? That implies | |
4443 | // it is frozen (and in a SYNC or LOCK state). FIXME. | |
4444 | ||
f67539c2 | 4445 | auto _fnode = CDir::allocate_fnode(*dir->get_fnode()); |
7c673cae FG |
4446 | if (dir->is_auth() || |
4447 | filelock.get_state() == LOCK_MIX) { | |
4448 | dout(10) << " skipped fragstat info for " << *dir << dendl; | |
4449 | frag_info_t f; | |
11fdf7f2 TL |
4450 | decode(f, q); |
4451 | decode(f, q); | |
7c673cae | 4452 | } else { |
f67539c2 TL |
4453 | decode(_fnode->fragstat, q); |
4454 | decode(_fnode->accounted_fragstat, q); | |
7c673cae FG |
4455 | dout(10) << " took fragstat info for " << *dir << dendl; |
4456 | } | |
4457 | if (dir->is_auth() || | |
4458 | nestlock.get_state() == LOCK_MIX) { | |
4459 | dout(10) << " skipped rstat info for " << *dir << dendl; | |
4460 | nest_info_t n; | |
11fdf7f2 TL |
4461 | decode(n, q); |
4462 | decode(n, q); | |
7c673cae | 4463 | } else { |
f67539c2 TL |
4464 | decode(_fnode->rstat, q); |
4465 | decode(_fnode->accounted_rstat, q); | |
7c673cae FG |
4466 | dout(10) << " took rstat info for " << *dir << dendl; |
4467 | } | |
f67539c2 | 4468 | dir->reset_fnode(std::move(_fnode)); |
7c673cae FG |
4469 | } |
4470 | ||
4471 | _decode_locks_full(p); | |
4472 | ||
4473 | _decode_file_locks(p); | |
4474 | ||
4475 | DECODE_FINISH(p); | |
4476 | } | |
4477 | ||
4478 | ||
4479 | void InodeStoreBase::dump(Formatter *f) const | |
4480 | { | |
f67539c2 | 4481 | inode->dump(f); |
7c673cae | 4482 | f->dump_string("symlink", symlink); |
9f95a23c TL |
4483 | |
4484 | f->open_array_section("xattrs"); | |
f67539c2 TL |
4485 | if (xattrs) { |
4486 | for (const auto& [key, val] : *xattrs) { | |
4487 | f->open_object_section("xattr"); | |
4488 | f->dump_string("key", key); | |
4489 | std::string v(val.c_str(), val.length()); | |
4490 | f->dump_string("val", v); | |
4491 | f->close_section(); | |
4492 | } | |
9f95a23c TL |
4493 | } |
4494 | f->close_section(); | |
4495 | f->open_object_section("dirfragtree"); | |
4496 | dirfragtree.dump(f); | |
4497 | f->close_section(); // dirfragtree | |
4498 | ||
7c673cae | 4499 | f->open_array_section("old_inodes"); |
f67539c2 TL |
4500 | if (old_inodes) { |
4501 | for (const auto &p : *old_inodes) { | |
4502 | f->open_object_section("old_inode"); | |
4503 | // The key is the last snapid, the first is in the mempool_old_inode | |
4504 | f->dump_int("last", p.first); | |
4505 | p.second.dump(f); | |
4506 | f->close_section(); // old_inode | |
4507 | } | |
7c673cae FG |
4508 | } |
4509 | f->close_section(); // old_inodes | |
4510 | ||
9f95a23c TL |
4511 | f->dump_unsigned("oldest_snap", oldest_snap); |
4512 | f->dump_unsigned("damage_flags", damage_flags); | |
7c673cae FG |
4513 | } |
4514 | ||
f67539c2 TL |
4515 | template <> |
4516 | void decode_json_obj(mempool::mds_co::string& t, JSONObj *obj){ | |
4517 | ||
4518 | t = mempool::mds_co::string(std::string_view(obj->get_data())); | |
4519 | } | |
4520 | ||
4521 | void InodeStoreBase::decode_json(JSONObj *obj) | |
4522 | { | |
4523 | { | |
4524 | auto _inode = allocate_inode(); | |
4525 | _inode->decode_json(obj); | |
4526 | reset_inode(std::move(_inode)); | |
4527 | } | |
4528 | ||
4529 | JSONDecoder::decode_json("symlink", symlink, obj, true); | |
4530 | // JSONDecoder::decode_json("dirfragtree", dirfragtree, obj, true); // cann't decode it now | |
4531 | // | |
4532 | // | |
4533 | { | |
4534 | mempool_xattr_map tmp; | |
4535 | JSONDecoder::decode_json("xattrs", tmp, xattrs_cb, obj, true); | |
4536 | if (tmp.empty()) | |
4537 | reset_xattrs(xattr_map_ptr()); | |
4538 | else | |
4539 | reset_xattrs(allocate_xattr_map(std::move(tmp))); | |
4540 | } | |
4541 | // JSONDecoder::decode_json("old_inodes", old_inodes, InodeStoreBase::old_indoes_cb, obj, true); // cann't decode old_inodes now | |
4542 | JSONDecoder::decode_json("oldest_snap", oldest_snap.val, obj, true); | |
4543 | JSONDecoder::decode_json("damage_flags", damage_flags, obj, true); | |
4544 | //sr_t srnode; | |
4545 | //JSONDecoder::decode_json("snap_blob", srnode, obj, true); // cann't decode it now | |
4546 | //snap_blob = srnode; | |
4547 | } | |
4548 | ||
4549 | void InodeStoreBase::xattrs_cb(InodeStoreBase::mempool_xattr_map& c, JSONObj *obj){ | |
4550 | ||
4551 | string k; | |
4552 | JSONDecoder::decode_json("key", k, obj, true); | |
4553 | string v; | |
4554 | JSONDecoder::decode_json("val", v, obj, true); | |
4555 | c[k.c_str()] = buffer::copy(v.c_str(), v.size()); | |
4556 | } | |
4557 | ||
4558 | void InodeStoreBase::old_indoes_cb(InodeStoreBase::mempool_old_inode_map& c, JSONObj *obj){ | |
4559 | ||
4560 | snapid_t s; | |
4561 | JSONDecoder::decode_json("last", s.val, obj, true); | |
4562 | InodeStoreBase::mempool_old_inode i; | |
4563 | // i.decode_json(obj); // cann't decode now, simon | |
4564 | c[s] = i; | |
4565 | } | |
7c673cae | 4566 | |
9f95a23c | 4567 | void InodeStore::generate_test_instances(std::list<InodeStore*> &ls) |
7c673cae FG |
4568 | { |
4569 | InodeStore *populated = new InodeStore; | |
f67539c2 | 4570 | populated->get_inode()->ino = 0xdeadbeef; |
7c673cae FG |
4571 | populated->symlink = "rhubarb"; |
4572 | ls.push_back(populated); | |
4573 | } | |
4574 | ||
9f95a23c | 4575 | void InodeStoreBare::generate_test_instances(std::list<InodeStoreBare*> &ls) |
11fdf7f2 TL |
4576 | { |
4577 | InodeStoreBare *populated = new InodeStoreBare; | |
f67539c2 | 4578 | populated->get_inode()->ino = 0xdeadbeef; |
11fdf7f2 TL |
4579 | populated->symlink = "rhubarb"; |
4580 | ls.push_back(populated); | |
4581 | } | |
4582 | ||
7c673cae | 4583 | void CInode::validate_disk_state(CInode::validated_data *results, |
11fdf7f2 | 4584 | MDSContext *fin) |
7c673cae FG |
4585 | { |
4586 | class ValidationContinuation : public MDSContinuation { | |
4587 | public: | |
11fdf7f2 | 4588 | MDSContext *fin; |
7c673cae FG |
4589 | CInode *in; |
4590 | CInode::validated_data *results; | |
4591 | bufferlist bl; | |
4592 | CInode *shadow_in; | |
4593 | ||
4594 | enum { | |
4595 | START = 0, | |
4596 | BACKTRACE, | |
4597 | INODE, | |
11fdf7f2 TL |
4598 | DIRFRAGS, |
4599 | SNAPREALM, | |
7c673cae FG |
4600 | }; |
4601 | ||
4602 | ValidationContinuation(CInode *i, | |
4603 | CInode::validated_data *data_r, | |
11fdf7f2 | 4604 | MDSContext *fin_) : |
7c673cae FG |
4605 | MDSContinuation(i->mdcache->mds->server), |
4606 | fin(fin_), | |
4607 | in(i), | |
4608 | results(data_r), | |
4609 | shadow_in(NULL) { | |
4610 | set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start)); | |
4611 | set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace)); | |
4612 | set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk)); | |
4613 | set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags)); | |
4614 | } | |
4615 | ||
4616 | ~ValidationContinuation() override { | |
b32b8144 FG |
4617 | if (shadow_in) { |
4618 | delete shadow_in; | |
4619 | in->mdcache->num_shadow_inodes--; | |
4620 | } | |
7c673cae FG |
4621 | } |
4622 | ||
4623 | /** | |
4624 | * Fetch backtrace and set tag if tag is non-empty | |
4625 | */ | |
11fdf7f2 TL |
4626 | void fetch_backtrace_and_tag(CInode *in, |
4627 | std::string_view tag, bool is_internal, | |
7c673cae FG |
4628 | Context *fin, int *bt_r, bufferlist *bt) |
4629 | { | |
4630 | const int64_t pool = in->get_backtrace_pool(); | |
4631 | object_t oid = CInode::get_object_name(in->ino(), frag_t(), ""); | |
4632 | ||
4633 | ObjectOperation fetch; | |
4634 | fetch.getxattr("parent", bt, bt_r); | |
4635 | in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP, | |
4636 | NULL, 0, fin); | |
f67539c2 TL |
4637 | if (in->mdcache->mds->logger) { |
4638 | in->mdcache->mds->logger->inc(l_mds_openino_backtrace_fetch); | |
4639 | in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_fetch); | |
4640 | } | |
4641 | ||
11fdf7f2 TL |
4642 | using ceph::encode; |
4643 | if (!is_internal) { | |
4644 | ObjectOperation scrub_tag; | |
7c673cae | 4645 | bufferlist tag_bl; |
11fdf7f2 | 4646 | encode(tag, tag_bl); |
7c673cae FG |
4647 | scrub_tag.setxattr("scrub_tag", tag_bl); |
4648 | SnapContext snapc; | |
4649 | in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc, | |
4650 | ceph::real_clock::now(), | |
4651 | 0, NULL); | |
f67539c2 TL |
4652 | if (in->mdcache->mds->logger) |
4653 | in->mdcache->mds->logger->inc(l_mds_scrub_set_tag); | |
7c673cae FG |
4654 | } |
4655 | } | |
4656 | ||
4657 | bool _start(int rval) { | |
f67539c2 TL |
4658 | ceph_assert(in->can_auth_pin()); |
4659 | in->auth_pin(this); | |
4660 | ||
7c673cae | 4661 | if (in->is_dirty()) { |
f67539c2 TL |
4662 | MDCache *mdcache = in->mdcache; // For the benefit of dout |
4663 | auto ino = [this]() { return in->ino(); }; // For the benefit of dout | |
11fdf7f2 | 4664 | dout(20) << "validating a dirty CInode; results will be inconclusive" |
f67539c2 | 4665 | << dendl; |
7c673cae | 4666 | } |
11fdf7f2 | 4667 | |
7c673cae | 4668 | C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE), |
11fdf7f2 TL |
4669 | in->mdcache->mds->finisher); |
4670 | ||
4671 | std::string_view tag = in->scrub_infop->header->get_tag(); | |
4672 | bool is_internal = in->scrub_infop->header->is_internal_tag(); | |
4673 | // Rather than using the usual CInode::fetch_backtrace, | |
4674 | // use a special variant that optionally writes a tag in the same | |
4675 | // operation. | |
4676 | fetch_backtrace_and_tag(in, tag, is_internal, conf, &results->backtrace.ondisk_read_retval, &bl); | |
7c673cae FG |
4677 | return false; |
4678 | } | |
4679 | ||
4680 | bool _backtrace(int rval) { | |
4681 | // set up basic result reporting and make sure we got the data | |
4682 | results->performed_validation = true; // at least, some of it! | |
4683 | results->backtrace.checked = true; | |
4684 | ||
4685 | const int64_t pool = in->get_backtrace_pool(); | |
4686 | inode_backtrace_t& memory_backtrace = results->backtrace.memory_value; | |
4687 | in->build_backtrace(pool, memory_backtrace); | |
4688 | bool equivalent, divergent; | |
4689 | int memory_newer; | |
4690 | ||
4691 | MDCache *mdcache = in->mdcache; // For the benefit of dout | |
f67539c2 | 4692 | auto ino = [this]() { return in->ino(); }; // For the benefit of dout |
7c673cae FG |
4693 | |
4694 | // Ignore rval because it's the result of a FAILOK operation | |
4695 | // from fetch_backtrace_and_tag: the real result is in | |
4696 | // backtrace.ondisk_read_retval | |
4697 | dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl; | |
4698 | if (results->backtrace.ondisk_read_retval != 0) { | |
4699 | results->backtrace.error_str << "failed to read off disk; see retval"; | |
e306af50 TL |
4700 | // we probably have a new unwritten file! |
4701 | // so skip the backtrace scrub for this entry and say that all's well | |
f67539c2 TL |
4702 | if (in->is_dirty_parent()) { |
4703 | dout(20) << "forcing backtrace as passed since inode is dirty parent" << dendl; | |
e306af50 | 4704 | results->backtrace.passed = true; |
f67539c2 | 4705 | } |
e306af50 | 4706 | goto next; |
7c673cae FG |
4707 | } |
4708 | ||
4709 | // extract the backtrace, and compare it to a newly-constructed one | |
4710 | try { | |
11fdf7f2 TL |
4711 | auto p = bl.cbegin(); |
4712 | using ceph::decode; | |
4713 | decode(results->backtrace.ondisk_value, p); | |
7c673cae FG |
4714 | dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl; |
4715 | } catch (buffer::error&) { | |
4716 | if (results->backtrace.ondisk_read_retval == 0 && rval != 0) { | |
4717 | // Cases where something has clearly gone wrong with the overall | |
4718 | // fetch op, though we didn't get a nonzero rc from the getxattr | |
4719 | // operation. e.g. object missing. | |
4720 | results->backtrace.ondisk_read_retval = rval; | |
4721 | } | |
4722 | results->backtrace.error_str << "failed to decode on-disk backtrace (" | |
4723 | << bl.length() << " bytes)!"; | |
e306af50 TL |
4724 | // we probably have a new unwritten file! |
4725 | // so skip the backtrace scrub for this entry and say that all's well | |
f67539c2 TL |
4726 | if (in->is_dirty_parent()) { |
4727 | dout(20) << "decode failed; forcing backtrace as passed since " | |
4728 | "inode is dirty parent" << dendl; | |
e306af50 | 4729 | results->backtrace.passed = true; |
f67539c2 | 4730 | } |
e306af50 | 4731 | |
7c673cae FG |
4732 | goto next; |
4733 | } | |
4734 | ||
4735 | memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value, | |
4736 | &equivalent, &divergent); | |
4737 | ||
4738 | if (divergent || memory_newer < 0) { | |
e306af50 TL |
4739 | // we're divergent, or on-disk version is newer |
4740 | results->backtrace.error_str << "On-disk backtrace is divergent or newer"; | |
f67539c2 TL |
4741 | /* if the backtraces are divergent and the link count is 0, then |
4742 | * most likely its a stray entry that's being purged and things are | |
4743 | * well and there's no reason for alarm | |
4744 | */ | |
4745 | if (divergent && (in->is_dirty_parent() || in->get_inode()->nlink == 0)) { | |
e306af50 | 4746 | results->backtrace.passed = true; |
f67539c2 TL |
4747 | dout(20) << "divergent backtraces are acceptable when dn " |
4748 | "is being purged or has been renamed or moved to a " | |
4749 | "different directory " << *in << dendl; | |
4750 | } | |
7c673cae FG |
4751 | } else { |
4752 | results->backtrace.passed = true; | |
4753 | } | |
4754 | next: | |
4755 | ||
4756 | if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) { | |
4757 | std::string path; | |
4758 | in->make_path_string(path); | |
d2e6a577 FG |
4759 | in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino() |
4760 | << "(" << path << "), rewriting it"; | |
28e407b8 | 4761 | in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(), |
7c673cae | 4762 | false); |
b32b8144 FG |
4763 | // Flag that we repaired this BT so that it won't go into damagetable |
4764 | results->backtrace.repaired = true; | |
f67539c2 TL |
4765 | if (in->mdcache->mds->logger) |
4766 | in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired); | |
7c673cae FG |
4767 | } |
4768 | ||
4769 | // If the inode's number was free in the InoTable, fix that | |
4770 | // (#15619) | |
4771 | { | |
4772 | InoTable *inotable = mdcache->mds->inotable; | |
4773 | ||
f67539c2 | 4774 | dout(10) << "scrub: inotable ino = " << in->ino() << dendl; |
7c673cae | 4775 | dout(10) << "scrub: inotable free says " |
f67539c2 | 4776 | << inotable->is_marked_free(in->ino()) << dendl; |
7c673cae | 4777 | |
f67539c2 | 4778 | if (inotable->is_marked_free(in->ino())) { |
7c673cae | 4779 | LogChannelRef clog = in->mdcache->mds->clog; |
f67539c2 | 4780 | clog->error() << "scrub: inode wrongly marked free: " << in->ino(); |
7c673cae FG |
4781 | |
4782 | if (in->scrub_infop->header->get_repair()) { | |
f67539c2 | 4783 | bool repaired = inotable->repair(in->ino()); |
7c673cae | 4784 | if (repaired) { |
f67539c2 | 4785 | clog->error() << "inode table repaired for inode: " << in->ino(); |
7c673cae FG |
4786 | |
4787 | inotable->save(); | |
f67539c2 TL |
4788 | if (in->mdcache->mds->logger) |
4789 | in->mdcache->mds->logger->inc(l_mds_scrub_inotable_repaired); | |
7c673cae FG |
4790 | } else { |
4791 | clog->error() << "Cannot repair inotable while other operations" | |
4792 | " are in progress"; | |
4793 | } | |
4794 | } | |
4795 | } | |
4796 | } | |
4797 | ||
7c673cae | 4798 | |
11fdf7f2 | 4799 | if (in->is_dir()) { |
f67539c2 TL |
4800 | if (in->mdcache->mds->logger) |
4801 | in->mdcache->mds->logger->inc(l_mds_scrub_dir_inodes); | |
11fdf7f2 TL |
4802 | return validate_directory_data(); |
4803 | } else { | |
f67539c2 TL |
4804 | if (in->mdcache->mds->logger) |
4805 | in->mdcache->mds->logger->inc(l_mds_scrub_file_inodes); | |
11fdf7f2 | 4806 | // TODO: validate on-disk inode for normal files |
f67539c2 | 4807 | return true; |
11fdf7f2 | 4808 | } |
7c673cae FG |
4809 | } |
4810 | ||
4811 | bool validate_directory_data() { | |
11fdf7f2 | 4812 | ceph_assert(in->is_dir()); |
7c673cae FG |
4813 | |
4814 | if (in->is_base()) { | |
b32b8144 FG |
4815 | if (!shadow_in) { |
4816 | shadow_in = new CInode(in->mdcache); | |
f67539c2 | 4817 | in->mdcache->create_unlinked_system_inode(shadow_in, in->ino(), in->get_inode()->mode); |
b32b8144 FG |
4818 | in->mdcache->num_shadow_inodes++; |
4819 | } | |
7c673cae | 4820 | shadow_in->fetch(get_internal_callback(INODE)); |
f67539c2 TL |
4821 | if (in->mdcache->mds->logger) |
4822 | in->mdcache->mds->logger->inc(l_mds_scrub_dir_base_inodes); | |
7c673cae FG |
4823 | return false; |
4824 | } else { | |
11fdf7f2 | 4825 | // TODO: validate on-disk inode for non-base directories |
f67539c2 TL |
4826 | if (in->mdcache->mds->logger) |
4827 | in->mdcache->mds->logger->inc(l_mds_scrub_dirfrag_rstats); | |
7c673cae | 4828 | results->inode.passed = true; |
11fdf7f2 | 4829 | return check_dirfrag_rstats(); |
7c673cae FG |
4830 | } |
4831 | } | |
4832 | ||
4833 | bool _inode_disk(int rval) { | |
f67539c2 TL |
4834 | const auto& si = shadow_in->get_inode(); |
4835 | const auto& i = in->get_inode(); | |
4836 | ||
7c673cae FG |
4837 | results->inode.checked = true; |
4838 | results->inode.ondisk_read_retval = rval; | |
f67539c2 TL |
4839 | results->inode.ondisk_value = *si; |
4840 | results->inode.memory_value = *i; | |
7c673cae | 4841 | |
f67539c2 | 4842 | if (si->version > i->version) { |
7c673cae | 4843 | // uh, what? |
11fdf7f2 | 4844 | results->inode.error_str << "On-disk inode is newer than in-memory one; "; |
7c673cae FG |
4845 | goto next; |
4846 | } else { | |
4847 | bool divergent = false; | |
f67539c2 | 4848 | int r = i->compare(*si, &divergent); |
7c673cae FG |
4849 | results->inode.passed = !divergent && r >= 0; |
4850 | if (!results->inode.passed) { | |
4851 | results->inode.error_str << | |
11fdf7f2 | 4852 | "On-disk inode is divergent or newer than in-memory one; "; |
7c673cae FG |
4853 | goto next; |
4854 | } | |
4855 | } | |
4856 | next: | |
4857 | return check_dirfrag_rstats(); | |
4858 | } | |
4859 | ||
4860 | bool check_dirfrag_rstats() { | |
f67539c2 TL |
4861 | if (in->has_subtree_root_dirfrag()) { |
4862 | in->mdcache->rdlock_dirfrags_stats(in, get_internal_callback(DIRFRAGS)); | |
4863 | return false; | |
7c673cae | 4864 | } else { |
f67539c2 | 4865 | return immediate(DIRFRAGS, 0); |
7c673cae FG |
4866 | } |
4867 | } | |
4868 | ||
4869 | bool _dirfrags(int rval) { | |
7c673cae FG |
4870 | // basic reporting setup |
4871 | results->raw_stats.checked = true; | |
4872 | results->raw_stats.ondisk_read_retval = rval; | |
4873 | ||
f67539c2 TL |
4874 | results->raw_stats.memory_value.dirstat = in->get_inode()->dirstat; |
4875 | results->raw_stats.memory_value.rstat = in->get_inode()->rstat; | |
7c673cae FG |
4876 | frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat; |
4877 | nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat; | |
4878 | ||
4879 | if (rval != 0) { | |
4880 | results->raw_stats.error_str << "Failed to read dirfrags off disk"; | |
4881 | goto next; | |
4882 | } | |
4883 | ||
4884 | // check each dirfrag... | |
94b18763 FG |
4885 | for (const auto &p : in->dirfrags) { |
4886 | CDir *dir = p.second; | |
11fdf7f2 | 4887 | ceph_assert(dir->get_version() > 0); |
f67539c2 TL |
4888 | nest_info.add(dir->get_fnode()->accounted_rstat); |
4889 | dir_info.add(dir->get_fnode()->accounted_fragstat); | |
7c673cae FG |
4890 | } |
4891 | nest_info.rsubdirs++; // it gets one to account for self | |
11fdf7f2 TL |
4892 | if (const sr_t *srnode = in->get_projected_srnode(); srnode) |
4893 | nest_info.rsnaps += srnode->snaps.size(); | |
4894 | ||
7c673cae | 4895 | // ...and that their sum matches our inode settings |
f67539c2 TL |
4896 | if (!dir_info.same_sums(in->get_inode()->dirstat) || |
4897 | !nest_info.same_sums(in->get_inode()->rstat)) { | |
11fdf7f2 | 4898 | if (in->scrub_infop->header->get_repair()) { |
7c673cae FG |
4899 | results->raw_stats.error_str |
4900 | << "freshly-calculated rstats don't match existing ones (will be fixed)"; | |
4901 | in->mdcache->repair_inode_stats(in); | |
b32b8144 | 4902 | results->raw_stats.repaired = true; |
7c673cae FG |
4903 | } else { |
4904 | results->raw_stats.error_str | |
4905 | << "freshly-calculated rstats don't match existing ones"; | |
4906 | } | |
f67539c2 TL |
4907 | if (in->is_dirty()) { |
4908 | MDCache *mdcache = in->mdcache; // for dout() | |
4909 | auto ino = [this]() { return in->ino(); }; // for dout() | |
4910 | dout(20) << "raw stats most likely wont match since inode is dirty; " | |
4911 | "please rerun scrub when system is stable; " | |
4912 | "assuming passed for now;" << dendl; | |
4913 | results->raw_stats.passed = true; | |
4914 | } | |
7c673cae FG |
4915 | goto next; |
4916 | } | |
7c673cae FG |
4917 | |
4918 | results->raw_stats.passed = true; | |
f67539c2 TL |
4919 | { |
4920 | MDCache *mdcache = in->mdcache; // for dout() | |
4921 | auto ino = [this]() { return in->ino(); }; // for dout() | |
4922 | dout(20) << "raw stats check passed on " << *in << dendl; | |
11fdf7f2 | 4923 | } |
11fdf7f2 | 4924 | |
f67539c2 | 4925 | next: |
7c673cae FG |
4926 | return true; |
4927 | } | |
4928 | ||
4929 | void _done() override { | |
4930 | if ((!results->raw_stats.checked || results->raw_stats.passed) && | |
4931 | (!results->backtrace.checked || results->backtrace.passed) && | |
4932 | (!results->inode.checked || results->inode.passed)) | |
11fdf7f2 TL |
4933 | results->passed_validation = true; |
4934 | ||
4935 | // Flag that we did some repair work so that our repair operation | |
4936 | // can be flushed at end of scrub | |
4937 | if (results->backtrace.repaired || | |
4938 | results->inode.repaired || | |
4939 | results->raw_stats.repaired) | |
4940 | in->scrub_infop->header->set_repaired(); | |
4941 | if (fin) | |
4942 | fin->complete(get_rval()); | |
f67539c2 TL |
4943 | |
4944 | in->auth_unpin(this); | |
7c673cae FG |
4945 | } |
4946 | }; | |
4947 | ||
4948 | ||
4949 | dout(10) << "scrub starting validate_disk_state on " << *this << dendl; | |
4950 | ValidationContinuation *vc = new ValidationContinuation(this, | |
4951 | results, | |
4952 | fin); | |
4953 | vc->begin(); | |
4954 | } | |
4955 | ||
4956 | void CInode::validated_data::dump(Formatter *f) const | |
4957 | { | |
4958 | f->open_object_section("results"); | |
4959 | { | |
4960 | f->dump_bool("performed_validation", performed_validation); | |
4961 | f->dump_bool("passed_validation", passed_validation); | |
4962 | f->open_object_section("backtrace"); | |
4963 | { | |
4964 | f->dump_bool("checked", backtrace.checked); | |
4965 | f->dump_bool("passed", backtrace.passed); | |
4966 | f->dump_int("read_ret_val", backtrace.ondisk_read_retval); | |
4967 | f->dump_stream("ondisk_value") << backtrace.ondisk_value; | |
4968 | f->dump_stream("memoryvalue") << backtrace.memory_value; | |
4969 | f->dump_string("error_str", backtrace.error_str.str()); | |
4970 | } | |
4971 | f->close_section(); // backtrace | |
4972 | f->open_object_section("raw_stats"); | |
4973 | { | |
4974 | f->dump_bool("checked", raw_stats.checked); | |
4975 | f->dump_bool("passed", raw_stats.passed); | |
4976 | f->dump_int("read_ret_val", raw_stats.ondisk_read_retval); | |
4977 | f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat; | |
4978 | f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat; | |
f67539c2 | 4979 | f->dump_stream("memory_value.dirstat") << raw_stats.memory_value.dirstat; |
7c673cae FG |
4980 | f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat; |
4981 | f->dump_string("error_str", raw_stats.error_str.str()); | |
4982 | } | |
4983 | f->close_section(); // raw_stats | |
4984 | // dump failure return code | |
4985 | int rc = 0; | |
4986 | if (backtrace.checked && backtrace.ondisk_read_retval) | |
4987 | rc = backtrace.ondisk_read_retval; | |
4988 | if (inode.checked && inode.ondisk_read_retval) | |
4989 | rc = inode.ondisk_read_retval; | |
4990 | if (raw_stats.checked && raw_stats.ondisk_read_retval) | |
4991 | rc = raw_stats.ondisk_read_retval; | |
4992 | f->dump_int("return_code", rc); | |
4993 | } | |
4994 | f->close_section(); // results | |
4995 | } | |
4996 | ||
b32b8144 FG |
4997 | bool CInode::validated_data::all_damage_repaired() const |
4998 | { | |
4999 | bool unrepaired = | |
5000 | (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired) | |
5001 | || | |
5002 | (backtrace.checked && !backtrace.passed && !backtrace.repaired) | |
5003 | || | |
5004 | (inode.checked && !inode.passed && !inode.repaired); | |
5005 | ||
5006 | return !unrepaired; | |
5007 | } | |
5008 | ||
11fdf7f2 TL |
5009 | void CInode::dump(Formatter *f, int flags) const |
5010 | { | |
5011 | if (flags & DUMP_PATH) { | |
5012 | std::string path; | |
5013 | make_path_string(path, true); | |
5014 | if (path.empty()) | |
5015 | path = "/"; | |
5016 | f->dump_string("path", path); | |
5017 | } | |
5018 | ||
5019 | if (flags & DUMP_INODE_STORE_BASE) | |
5020 | InodeStoreBase::dump(f); | |
5021 | ||
5022 | if (flags & DUMP_MDS_CACHE_OBJECT) | |
5023 | MDSCacheObject::dump(f); | |
5024 | ||
5025 | if (flags & DUMP_LOCKS) { | |
5026 | f->open_object_section("versionlock"); | |
5027 | versionlock.dump(f); | |
5028 | f->close_section(); | |
5029 | ||
5030 | f->open_object_section("authlock"); | |
5031 | authlock.dump(f); | |
5032 | f->close_section(); | |
5033 | ||
5034 | f->open_object_section("linklock"); | |
5035 | linklock.dump(f); | |
5036 | f->close_section(); | |
5037 | ||
5038 | f->open_object_section("dirfragtreelock"); | |
5039 | dirfragtreelock.dump(f); | |
5040 | f->close_section(); | |
5041 | ||
5042 | f->open_object_section("filelock"); | |
5043 | filelock.dump(f); | |
5044 | f->close_section(); | |
5045 | ||
5046 | f->open_object_section("xattrlock"); | |
5047 | xattrlock.dump(f); | |
5048 | f->close_section(); | |
5049 | ||
5050 | f->open_object_section("snaplock"); | |
5051 | snaplock.dump(f); | |
5052 | f->close_section(); | |
5053 | ||
5054 | f->open_object_section("nestlock"); | |
5055 | nestlock.dump(f); | |
5056 | f->close_section(); | |
5057 | ||
5058 | f->open_object_section("flocklock"); | |
5059 | flocklock.dump(f); | |
5060 | f->close_section(); | |
5061 | ||
5062 | f->open_object_section("policylock"); | |
5063 | policylock.dump(f); | |
5064 | f->close_section(); | |
5065 | } | |
5066 | ||
5067 | if (flags & DUMP_STATE) { | |
5068 | f->open_array_section("states"); | |
5069 | MDSCacheObject::dump_states(f); | |
5070 | if (state_test(STATE_EXPORTING)) | |
5071 | f->dump_string("state", "exporting"); | |
5072 | if (state_test(STATE_OPENINGDIR)) | |
5073 | f->dump_string("state", "openingdir"); | |
5074 | if (state_test(STATE_FREEZING)) | |
5075 | f->dump_string("state", "freezing"); | |
5076 | if (state_test(STATE_FROZEN)) | |
5077 | f->dump_string("state", "frozen"); | |
5078 | if (state_test(STATE_AMBIGUOUSAUTH)) | |
5079 | f->dump_string("state", "ambiguousauth"); | |
5080 | if (state_test(STATE_EXPORTINGCAPS)) | |
5081 | f->dump_string("state", "exportingcaps"); | |
5082 | if (state_test(STATE_NEEDSRECOVER)) | |
5083 | f->dump_string("state", "needsrecover"); | |
5084 | if (state_test(STATE_PURGING)) | |
5085 | f->dump_string("state", "purging"); | |
5086 | if (state_test(STATE_DIRTYPARENT)) | |
5087 | f->dump_string("state", "dirtyparent"); | |
5088 | if (state_test(STATE_DIRTYRSTAT)) | |
5089 | f->dump_string("state", "dirtyrstat"); | |
5090 | if (state_test(STATE_STRAYPINNED)) | |
5091 | f->dump_string("state", "straypinned"); | |
5092 | if (state_test(STATE_FROZENAUTHPIN)) | |
5093 | f->dump_string("state", "frozenauthpin"); | |
5094 | if (state_test(STATE_DIRTYPOOL)) | |
5095 | f->dump_string("state", "dirtypool"); | |
5096 | if (state_test(STATE_ORPHAN)) | |
5097 | f->dump_string("state", "orphan"); | |
5098 | if (state_test(STATE_MISSINGOBJS)) | |
5099 | f->dump_string("state", "missingobjs"); | |
7c673cae FG |
5100 | f->close_section(); |
5101 | } | |
7c673cae | 5102 | |
11fdf7f2 TL |
5103 | if (flags & DUMP_CAPS) { |
5104 | f->open_array_section("client_caps"); | |
5105 | for (const auto &p : client_caps) { | |
5106 | auto &client = p.first; | |
5107 | auto cap = &p.second; | |
5108 | f->open_object_section("client_cap"); | |
5109 | f->dump_int("client_id", client.v); | |
5110 | f->dump_string("pending", ccap_string(cap->pending())); | |
5111 | f->dump_string("issued", ccap_string(cap->issued())); | |
5112 | f->dump_string("wanted", ccap_string(cap->wanted())); | |
5113 | f->dump_int("last_sent", cap->get_last_seq()); | |
5114 | f->close_section(); | |
5115 | } | |
5116 | f->close_section(); | |
5117 | ||
5118 | f->dump_int("loner", loner_cap.v); | |
5119 | f->dump_int("want_loner", want_loner_cap.v); | |
5120 | ||
5121 | f->open_array_section("mds_caps_wanted"); | |
5122 | for (const auto &p : mds_caps_wanted) { | |
5123 | f->open_object_section("mds_cap_wanted"); | |
5124 | f->dump_int("rank", p.first); | |
5125 | f->dump_string("cap", ccap_string(p.second)); | |
5126 | f->close_section(); | |
5127 | } | |
5128 | f->close_section(); | |
5129 | } | |
7c673cae | 5130 | |
11fdf7f2 TL |
5131 | if (flags & DUMP_DIRFRAGS) { |
5132 | f->open_array_section("dirfrags"); | |
9f95a23c | 5133 | auto&& dfs = get_dirfrags(); |
11fdf7f2 TL |
5134 | for(const auto &dir: dfs) { |
5135 | f->open_object_section("dir"); | |
5136 | dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS); | |
5137 | dir->check_rstats(); | |
5138 | f->close_section(); | |
5139 | } | |
7c673cae FG |
5140 | f->close_section(); |
5141 | } | |
7c673cae FG |
5142 | } |
5143 | ||
5144 | /****** Scrub Stuff *****/ | |
5145 | void CInode::scrub_info_create() const | |
5146 | { | |
5147 | dout(25) << __func__ << dendl; | |
11fdf7f2 | 5148 | ceph_assert(!scrub_infop); |
7c673cae FG |
5149 | |
5150 | // break out of const-land to set up implicit initial state | |
5151 | CInode *me = const_cast<CInode*>(this); | |
f67539c2 | 5152 | const auto& pi = me->get_projected_inode(); |
7c673cae | 5153 | |
f67539c2 TL |
5154 | std::unique_ptr<scrub_info_t> si(new scrub_info_t()); |
5155 | si->last_scrub_stamp = pi->last_scrub_stamp; | |
5156 | si->last_scrub_version = pi->last_scrub_version; | |
7c673cae | 5157 | |
f67539c2 | 5158 | me->scrub_infop.swap(si); |
7c673cae FG |
5159 | } |
5160 | ||
5161 | void CInode::scrub_maybe_delete_info() | |
5162 | { | |
5163 | if (scrub_infop && | |
5164 | !scrub_infop->scrub_in_progress && | |
5165 | !scrub_infop->last_scrub_dirty) { | |
f67539c2 | 5166 | scrub_infop.reset(); |
7c673cae FG |
5167 | } |
5168 | } | |
5169 | ||
f67539c2 | 5170 | void CInode::scrub_initialize(ScrubHeaderRef& header) |
7c673cae FG |
5171 | { |
5172 | dout(20) << __func__ << " with scrub_version " << get_version() << dendl; | |
7c673cae | 5173 | |
f67539c2 | 5174 | scrub_info(); |
7c673cae | 5175 | scrub_infop->scrub_in_progress = true; |
f67539c2 | 5176 | scrub_infop->queued_frags.clear(); |
7c673cae | 5177 | scrub_infop->header = header; |
f67539c2 | 5178 | header->inc_num_pending(); |
7c673cae FG |
5179 | // right now we don't handle remote inodes |
5180 | } | |
5181 | ||
f67539c2 | 5182 | void CInode::scrub_aborted() { |
11fdf7f2 TL |
5183 | dout(20) << __func__ << dendl; |
5184 | ceph_assert(scrub_is_in_progress()); | |
5185 | ||
f67539c2 TL |
5186 | scrub_infop->scrub_in_progress = false; |
5187 | scrub_infop->header->dec_num_pending(); | |
5188 | scrub_maybe_delete_info(); | |
11fdf7f2 TL |
5189 | } |
5190 | ||
f67539c2 | 5191 | void CInode::scrub_finished() { |
7c673cae | 5192 | dout(20) << __func__ << dendl; |
11fdf7f2 | 5193 | ceph_assert(scrub_is_in_progress()); |
7c673cae | 5194 | |
f67539c2 TL |
5195 | scrub_infop->last_scrub_version = get_version(); |
5196 | scrub_infop->last_scrub_stamp = ceph_clock_now(); | |
7c673cae FG |
5197 | scrub_infop->last_scrub_dirty = true; |
5198 | scrub_infop->scrub_in_progress = false; | |
f67539c2 | 5199 | scrub_infop->header->dec_num_pending(); |
7c673cae FG |
5200 | } |
5201 | ||
5202 | int64_t CInode::get_backtrace_pool() const | |
5203 | { | |
5204 | if (is_dir()) { | |
b3b6e05e | 5205 | return mdcache->mds->get_metadata_pool(); |
7c673cae FG |
5206 | } else { |
5207 | // Files are required to have an explicit layout that specifies | |
5208 | // a pool | |
f67539c2 TL |
5209 | ceph_assert(get_inode()->layout.pool_id != -1); |
5210 | return get_inode()->layout.pool_id; | |
7c673cae FG |
5211 | } |
5212 | } | |
5213 | ||
f67539c2 | 5214 | void CInode::queue_export_pin(mds_rank_t export_pin) |
31f18b77 | 5215 | { |
31f18b77 FG |
5216 | if (state_test(CInode::STATE_QUEUEDEXPORTPIN)) |
5217 | return; | |
5218 | ||
f67539c2 TL |
5219 | mds_rank_t target; |
5220 | if (export_pin >= 0) | |
5221 | target = export_pin; | |
5222 | else if (export_pin == MDS_RANK_EPHEMERAL_RAND) | |
5223 | target = mdcache->hash_into_rank_bucket(ino()); | |
5224 | else | |
5225 | target = MDS_RANK_NONE; | |
5226 | ||
5227 | unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits(); | |
31f18b77 | 5228 | bool queue = false; |
f6b5b4d7 TL |
5229 | for (auto& p : dirfrags) { |
5230 | CDir *dir = p.second; | |
31f18b77 FG |
5231 | if (!dir->is_auth()) |
5232 | continue; | |
f67539c2 TL |
5233 | |
5234 | if (export_pin == MDS_RANK_EPHEMERAL_DIST) { | |
5235 | if (dir->get_frag().bits() < min_frag_bits) { | |
5236 | // needs split | |
5237 | queue = true; | |
5238 | break; | |
5239 | } | |
5240 | target = mdcache->hash_into_rank_bucket(ino(), dir->get_frag()); | |
5241 | } | |
5242 | ||
f6b5b4d7 | 5243 | if (target != MDS_RANK_NONE) { |
31f18b77 FG |
5244 | if (dir->is_subtree_root()) { |
5245 | // set auxsubtree bit or export it | |
5246 | if (!dir->state_test(CDir::STATE_AUXSUBTREE) || | |
f6b5b4d7 | 5247 | target != dir->get_dir_auth().first) |
31f18b77 FG |
5248 | queue = true; |
5249 | } else { | |
5250 | // create aux subtree or export it | |
5251 | queue = true; | |
7c673cae | 5252 | } |
31f18b77 FG |
5253 | } else { |
5254 | // clear aux subtrees ? | |
5255 | queue = dir->state_test(CDir::STATE_AUXSUBTREE); | |
5256 | } | |
f67539c2 TL |
5257 | |
5258 | if (queue) | |
31f18b77 | 5259 | break; |
f67539c2 TL |
5260 | } |
5261 | if (queue) { | |
5262 | state_set(CInode::STATE_QUEUEDEXPORTPIN); | |
5263 | mdcache->export_pin_queue.insert(this); | |
7c673cae FG |
5264 | } |
5265 | } | |
5266 | ||
f6b5b4d7 TL |
5267 | void CInode::maybe_export_pin(bool update) |
5268 | { | |
5269 | if (!g_conf()->mds_bal_export_pin) | |
5270 | return; | |
5271 | if (!is_dir() || !is_normal()) | |
5272 | return; | |
5273 | ||
5274 | dout(15) << __func__ << " update=" << update << " " << *this << dendl; | |
5275 | ||
f67539c2 TL |
5276 | mds_rank_t export_pin = get_export_pin(false); |
5277 | if (export_pin == MDS_RANK_NONE && !update) | |
f6b5b4d7 | 5278 | return; |
f6b5b4d7 | 5279 | |
f67539c2 | 5280 | check_pin_policy(export_pin); |
f6b5b4d7 TL |
5281 | queue_export_pin(export_pin); |
5282 | } | |
5283 | ||
f67539c2 | 5284 | void CInode::set_ephemeral_pin(bool dist, bool rand) |
f6b5b4d7 | 5285 | { |
f67539c2 TL |
5286 | unsigned state = 0; |
5287 | if (dist) | |
5288 | state |= STATE_DISTEPHEMERALPIN; | |
5289 | if (rand) | |
5290 | state |= STATE_RANDEPHEMERALPIN; | |
5291 | if (!state) | |
f6b5b4d7 | 5292 | return; |
f6b5b4d7 | 5293 | |
f67539c2 TL |
5294 | if (state_test(state) != state) { |
5295 | dout(10) << "set ephemeral (" << (dist ? "dist" : "") | |
5296 | << (rand ? " rand" : "") << ") pin on " << *this << dendl; | |
5297 | if (!is_ephemerally_pinned()) { | |
5298 | auto p = mdcache->export_ephemeral_pins.insert(this); | |
5299 | ceph_assert(p.second); | |
f6b5b4d7 | 5300 | } |
f67539c2 | 5301 | state_set(state); |
f6b5b4d7 TL |
5302 | } |
5303 | } | |
5304 | ||
f67539c2 | 5305 | void CInode::clear_ephemeral_pin(bool dist, bool rand) |
f6b5b4d7 | 5306 | { |
f67539c2 TL |
5307 | unsigned state = 0; |
5308 | if (dist) | |
5309 | state |= STATE_DISTEPHEMERALPIN; | |
5310 | if (rand) | |
5311 | state |= STATE_RANDEPHEMERALPIN; | |
5312 | ||
5313 | if (state_test(state)) { | |
5314 | dout(10) << "clear ephemeral (" << (dist ? "dist" : "") | |
5315 | << (rand ? " rand" : "") << ") pin on " << *this << dendl; | |
5316 | state_clear(state); | |
5317 | if (!is_ephemerally_pinned()) { | |
5318 | auto count = mdcache->export_ephemeral_pins.erase(this); | |
f6b5b4d7 | 5319 | ceph_assert(count == 1); |
f6b5b4d7 TL |
5320 | } |
5321 | } | |
5322 | } | |
5323 | ||
f67539c2 | 5324 | void CInode::maybe_ephemeral_rand(double threshold) |
f6b5b4d7 TL |
5325 | { |
5326 | if (!mdcache->get_export_ephemeral_random_config()) { | |
5327 | dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl; | |
f67539c2 | 5328 | clear_ephemeral_pin(false, true); |
f6b5b4d7 TL |
5329 | return; |
5330 | } else if (!is_dir() || !is_normal()) { | |
5331 | dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl; | |
f67539c2 | 5332 | clear_ephemeral_pin(false, true); |
f6b5b4d7 | 5333 | return; |
f67539c2 | 5334 | } else if (get_inode()->nlink == 0) { |
f6b5b4d7 | 5335 | dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl; |
f67539c2 | 5336 | clear_ephemeral_pin(false, true); |
f6b5b4d7 TL |
5337 | return; |
5338 | } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) { | |
5339 | dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl; | |
f67539c2 | 5340 | queue_export_pin(MDS_RANK_EPHEMERAL_RAND); |
f6b5b4d7 TL |
5341 | return; |
5342 | } | |
5343 | ||
f91f0fd5 TL |
5344 | /* not precomputed? */ |
5345 | if (threshold < 0.0) { | |
5346 | threshold = get_ephemeral_rand(); | |
5347 | } | |
5348 | if (threshold <= 0.0) { | |
5349 | return; | |
5350 | } | |
f6b5b4d7 TL |
5351 | double n = ceph::util::generate_random_number(0.0, 1.0); |
5352 | ||
5353 | dout(15) << __func__ << " rand " << n << " <?= " << threshold | |
5354 | << " " << *this << dendl; | |
5355 | ||
5356 | if (n <= threshold) { | |
5357 | dout(10) << __func__ << " randomly export pinning " << *this << dendl; | |
f67539c2 TL |
5358 | set_ephemeral_pin(false, true); |
5359 | queue_export_pin(MDS_RANK_EPHEMERAL_RAND); | |
f6b5b4d7 TL |
5360 | } |
5361 | } | |
5362 | ||
5363 | void CInode::setxattr_ephemeral_rand(double probability) | |
5364 | { | |
5365 | ceph_assert(is_dir()); | |
f67539c2 | 5366 | _get_projected_inode()->export_ephemeral_random_pin = probability; |
f6b5b4d7 TL |
5367 | } |
5368 | ||
5369 | void CInode::setxattr_ephemeral_dist(bool val) | |
5370 | { | |
5371 | ceph_assert(is_dir()); | |
f67539c2 | 5372 | _get_projected_inode()->export_ephemeral_distributed_pin = val; |
f6b5b4d7 TL |
5373 | } |
5374 | ||
7c673cae FG |
5375 | void CInode::set_export_pin(mds_rank_t rank) |
5376 | { | |
11fdf7f2 | 5377 | ceph_assert(is_dir()); |
f67539c2 TL |
5378 | _get_projected_inode()->export_pin = rank; |
5379 | maybe_export_pin(true); | |
7c673cae FG |
5380 | } |
5381 | ||
f67539c2 | 5382 | mds_rank_t CInode::get_export_pin(bool inherit) const |
f6b5b4d7 | 5383 | { |
f67539c2 TL |
5384 | if (!g_conf()->mds_bal_export_pin) |
5385 | return MDS_RANK_NONE; | |
f6b5b4d7 | 5386 | |
7c673cae FG |
5387 | /* An inode that is export pinned may not necessarily be a subtree root, we |
5388 | * need to traverse the parents. A base or system inode cannot be pinned. | |
5389 | * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not | |
5390 | * have a parent yet. | |
5391 | */ | |
f67539c2 | 5392 | mds_rank_t r_target = MDS_RANK_NONE; |
b32b8144 | 5393 | const CInode *in = this; |
f67539c2 | 5394 | const CDir *dir = nullptr; |
f6b5b4d7 TL |
5395 | while (true) { |
5396 | if (in->is_system()) | |
5397 | break; | |
5398 | const CDentry *pdn = in->get_parent_dn(); | |
5399 | if (!pdn) | |
5400 | break; | |
f67539c2 | 5401 | if (in->get_inode()->nlink == 0) { |
f6b5b4d7 | 5402 | // ignore export pin for unlinked directory |
f67539c2 TL |
5403 | break; |
5404 | } | |
5405 | ||
5406 | if (in->get_inode()->export_pin >= 0) { | |
5407 | return in->get_inode()->export_pin; | |
5408 | } else if (in->get_inode()->export_ephemeral_distributed_pin && | |
5409 | mdcache->get_export_ephemeral_distributed_config()) { | |
5410 | if (in != this) | |
5411 | return mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag()); | |
5412 | return MDS_RANK_EPHEMERAL_DIST; | |
5413 | } else if (r_target != MDS_RANK_NONE && in->get_inode()->export_ephemeral_random_pin > 0.0) { | |
5414 | return r_target; | |
5415 | } else if (r_target == MDS_RANK_NONE && in->is_ephemeral_rand() && | |
5416 | mdcache->get_export_ephemeral_random_config()) { | |
f6b5b4d7 | 5417 | /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */ |
f67539c2 TL |
5418 | if (!inherit) |
5419 | return MDS_RANK_EPHEMERAL_RAND; | |
5420 | if (in == this) | |
5421 | r_target = MDS_RANK_EPHEMERAL_RAND; | |
5422 | else | |
5423 | r_target = mdcache->hash_into_rank_bucket(in->ino()); | |
f6b5b4d7 TL |
5424 | } |
5425 | ||
f67539c2 | 5426 | if (!inherit) |
f6b5b4d7 | 5427 | break; |
f67539c2 TL |
5428 | dir = pdn->get_dir(); |
5429 | in = dir->inode; | |
f6b5b4d7 TL |
5430 | } |
5431 | return MDS_RANK_NONE; | |
5432 | } | |
5433 | ||
f67539c2 TL |
5434 | void CInode::check_pin_policy(mds_rank_t export_pin) |
5435 | { | |
5436 | if (export_pin == MDS_RANK_EPHEMERAL_DIST) { | |
5437 | set_ephemeral_pin(true, false); | |
5438 | clear_ephemeral_pin(false, true); | |
5439 | } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) { | |
5440 | set_ephemeral_pin(false, true); | |
5441 | clear_ephemeral_pin(true, false); | |
5442 | } else if (is_ephemerally_pinned()) { | |
5443 | // export_pin >= 0 || export_pin == MDS_RANK_NONE | |
5444 | clear_ephemeral_pin(true, true); | |
5445 | if (export_pin != get_inode()->export_pin) // inherited export_pin | |
5446 | queue_export_pin(MDS_RANK_NONE); | |
5447 | } | |
5448 | } | |
5449 | ||
5450 | double CInode::get_ephemeral_rand() const | |
f6b5b4d7 TL |
5451 | { |
5452 | /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not | |
5453 | * have a parent yet. | |
5454 | */ | |
5455 | const CInode *in = this; | |
5456 | double max = mdcache->export_ephemeral_random_max; | |
b32b8144 FG |
5457 | while (true) { |
5458 | if (in->is_system()) | |
5459 | break; | |
f64942e4 | 5460 | const CDentry *pdn = in->get_parent_dn(); |
b32b8144 FG |
5461 | if (!pdn) |
5462 | break; | |
b32b8144 | 5463 | // ignore export pin for unlinked directory |
f67539c2 | 5464 | if (in->get_inode()->nlink == 0) |
b32b8144 | 5465 | break; |
f6b5b4d7 | 5466 | |
f67539c2 TL |
5467 | if (in->get_inode()->export_ephemeral_random_pin > 0.0) |
5468 | return std::min(in->get_inode()->export_ephemeral_random_pin, max); | |
f6b5b4d7 TL |
5469 | |
5470 | /* An export_pin overrides only if no closer parent (incl. this one) has a | |
5471 | * random pin set. | |
5472 | */ | |
f67539c2 TL |
5473 | if (in->get_inode()->export_pin >= 0 || |
5474 | in->get_inode()->export_ephemeral_distributed_pin) | |
f6b5b4d7 | 5475 | return 0.0; |
b32b8144 | 5476 | |
b32b8144 | 5477 | in = pdn->get_dir()->inode; |
7c673cae | 5478 | } |
f6b5b4d7 | 5479 | return 0.0; |
7c673cae FG |
5480 | } |
5481 | ||
9f95a23c TL |
5482 | void CInode::get_nested_dirfrags(std::vector<CDir*>& v) const |
5483 | { | |
5484 | for (const auto &p : dirfrags) { | |
5485 | const auto& dir = p.second; | |
5486 | if (!dir->is_subtree_root()) | |
5487 | v.push_back(dir); | |
5488 | } | |
5489 | } | |
5490 | ||
5491 | void CInode::get_subtree_dirfrags(std::vector<CDir*>& v) const | |
5492 | { | |
5493 | for (const auto &p : dirfrags) { | |
5494 | const auto& dir = p.second; | |
5495 | if (dir->is_subtree_root()) | |
5496 | v.push_back(dir); | |
5497 | } | |
5498 | } | |
5499 | ||
// Mempool object-factory definition for CInode.
// NOTE(review): the second and third arguments look like the factory name
// and the mempool it allocates from -- confirm against the macro definition.
181888fb | 5500 | MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);