]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/CInode.cc
Import ceph 15.2.8
[ceph.git] / ceph / src / mds / CInode.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "include/int_types.h"
16#include "common/errno.h"
17
18#include <string>
19#include <stdio.h>
20
21#include "CInode.h"
22#include "CDir.h"
23#include "CDentry.h"
24
25#include "MDSRank.h"
26#include "MDCache.h"
27#include "MDLog.h"
28#include "Locker.h"
29#include "Mutation.h"
30
31#include "events/EUpdate.h"
32
33#include "osdc/Objecter.h"
34
35#include "snap.h"
36
37#include "LogSegment.h"
38
39#include "common/Clock.h"
40
7c673cae
FG
41#include "common/config.h"
42#include "global/global_context.h"
11fdf7f2 43#include "include/ceph_assert.h"
7c673cae
FG
44
45#include "mds/MDSContinuation.h"
46#include "mds/InoTable.h"
11fdf7f2 47#include "cephfs_features.h"
7c673cae
FG
48
49#define dout_context g_ceph_context
50#define dout_subsys ceph_subsys_mds
51#undef dout_prefix
52#define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "
53
54
55class CInodeIOContext : public MDSIOContextBase
56{
57protected:
58 CInode *in;
59 MDSRank *get_mds() override {return in->mdcache->mds;}
60public:
61 explicit CInodeIOContext(CInode *in_) : in(in_) {
11fdf7f2 62 ceph_assert(in != NULL);
7c673cae
FG
63 }
64};
65
11fdf7f2 66sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1;
7c673cae
FG
67
68LockType CInode::versionlock_type(CEPH_LOCK_IVERSION);
69LockType CInode::authlock_type(CEPH_LOCK_IAUTH);
70LockType CInode::linklock_type(CEPH_LOCK_ILINK);
71LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT);
72LockType CInode::filelock_type(CEPH_LOCK_IFILE);
73LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR);
74LockType CInode::snaplock_type(CEPH_LOCK_ISNAP);
75LockType CInode::nestlock_type(CEPH_LOCK_INEST);
76LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK);
77LockType CInode::policylock_type(CEPH_LOCK_IPOLICY);
78
9f95a23c
TL
79std::string_view CInode::pin_name(int p) const
80{
81 switch (p) {
82 case PIN_DIRFRAG: return "dirfrag";
83 case PIN_CAPS: return "caps";
84 case PIN_IMPORTING: return "importing";
85 case PIN_OPENINGDIR: return "openingdir";
86 case PIN_REMOTEPARENT: return "remoteparent";
87 case PIN_BATCHOPENJOURNAL: return "batchopenjournal";
88 case PIN_SCATTERED: return "scattered";
89 case PIN_STICKYDIRS: return "stickydirs";
90 //case PIN_PURGING: return "purging";
91 case PIN_FREEZING: return "freezing";
92 case PIN_FROZEN: return "frozen";
93 case PIN_IMPORTINGCAPS: return "importingcaps";
94 case PIN_EXPORTINGCAPS: return "exportingcaps";
95 case PIN_PASTSNAPPARENT: return "pastsnapparent";
96 case PIN_OPENINGSNAPPARENTS: return "openingsnapparents";
97 case PIN_TRUNCATING: return "truncating";
98 case PIN_STRAY: return "stray";
99 case PIN_NEEDSNAPFLUSH: return "needsnapflush";
100 case PIN_DIRTYRSTAT: return "dirtyrstat";
101 case PIN_DIRTYPARENT: return "dirtyparent";
102 case PIN_DIRWAITER: return "dirwaiter";
103 case PIN_SCRUBQUEUE: return "scrubqueue";
104 default: return generic_pin_name(p);
105 }
106}
107
7c673cae
FG
108//int cinode_pins[CINODE_NUM_PINS]; // counts
109ostream& CInode::print_db_line_prefix(ostream& out)
110{
111 return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") ";
112}
113
114/*
115 * write caps and lock ids
116 */
117struct cinode_lock_info_t cinode_lock_info[] = {
118 { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR },
119 { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL },
120 { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL },
121 { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL },
122};
123int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]);
124
7c673cae
FG
125ostream& operator<<(ostream& out, const CInode& in)
126{
127 string path;
128 in.make_path_string(path, true);
129
130 out << "[inode " << in.inode.ino;
131 out << " ["
132 << (in.is_multiversion() ? "...":"")
133 << in.first << "," << in.last << "]";
134 out << " " << path << (in.is_dir() ? "/":"");
135
136 if (in.is_auth()) {
137 out << " auth";
138 if (in.is_replicated())
139 out << in.get_replicas();
140 } else {
141 mds_authority_t a = in.authority();
142 out << " rep@" << a.first;
143 if (a.second != CDIR_AUTH_UNKNOWN)
144 out << "," << a.second;
145 out << "." << in.get_replica_nonce();
146 }
147
148 if (in.is_symlink())
149 out << " symlink='" << in.symlink << "'";
150 if (in.is_dir() && !in.dirfragtree.empty())
151 out << " " << in.dirfragtree;
152
153 out << " v" << in.get_version();
154 if (in.get_projected_version() > in.get_version())
155 out << " pv" << in.get_projected_version();
156
11fdf7f2
TL
157 if (in.get_num_auth_pins()) {
158 out << " ap=" << in.get_num_auth_pins();
7c673cae 159#ifdef MDS_AUTHPIN_SET
11fdf7f2 160 in.print_authpin_set(out);
7c673cae
FG
161#endif
162 }
163
164 if (in.snaprealm)
165 out << " snaprealm=" << in.snaprealm;
166
167 if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
168 if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover";
169 if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering";
170 if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent";
171 if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " missingobjs";
172 if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
173 if (in.is_frozen_inode()) out << " FROZEN";
174 if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";
175
94b18763 176 const CInode::mempool_inode *pi = in.get_projected_inode();
7c673cae
FG
177 if (pi->is_truncating())
178 out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")";
179
180 if (in.inode.is_dir()) {
181 out << " " << in.inode.dirstat;
11fdf7f2 182 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
94b18763 183 const CInode::mempool_inode *pi = in.get_projected_inode();
7c673cae
FG
184 out << "->" << pi->dirstat;
185 }
186 } else {
187 out << " s=" << in.inode.size;
188 if (in.inode.nlink != 1)
189 out << " nl=" << in.inode.nlink;
190 }
191
192 // rstat
193 out << " " << in.inode.rstat;
194 if (!(in.inode.rstat == in.inode.accounted_rstat))
195 out << "/" << in.inode.accounted_rstat;
11fdf7f2 196 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
94b18763 197 const CInode::mempool_inode *pi = in.get_projected_inode();
7c673cae
FG
198 out << "->" << pi->rstat;
199 if (!(pi->rstat == pi->accounted_rstat))
200 out << "/" << pi->accounted_rstat;
201 }
202
203 if (!in.client_need_snapflush.empty())
204 out << " need_snapflush=" << in.client_need_snapflush;
205
206
207 // locks
208 if (!in.authlock.is_sync_and_unlocked())
209 out << " " << in.authlock;
210 if (!in.linklock.is_sync_and_unlocked())
211 out << " " << in.linklock;
212 if (in.inode.is_dir()) {
213 if (!in.dirfragtreelock.is_sync_and_unlocked())
214 out << " " << in.dirfragtreelock;
215 if (!in.snaplock.is_sync_and_unlocked())
216 out << " " << in.snaplock;
217 if (!in.nestlock.is_sync_and_unlocked())
218 out << " " << in.nestlock;
219 if (!in.policylock.is_sync_and_unlocked())
220 out << " " << in.policylock;
221 } else {
222 if (!in.flocklock.is_sync_and_unlocked())
223 out << " " << in.flocklock;
224 }
225 if (!in.filelock.is_sync_and_unlocked())
226 out << " " << in.filelock;
227 if (!in.xattrlock.is_sync_and_unlocked())
228 out << " " << in.xattrlock;
229 if (!in.versionlock.is_sync_and_unlocked())
230 out << " " << in.versionlock;
231
232 // hack: spit out crap on which clients have caps
233 if (in.inode.client_ranges.size())
234 out << " cr=" << in.inode.client_ranges;
235
236 if (!in.get_client_caps().empty()) {
237 out << " caps={";
11fdf7f2
TL
238 bool first = true;
239 for (const auto &p : in.get_client_caps()) {
240 if (!first) out << ",";
241 out << p.first << "="
242 << ccap_string(p.second.pending());
243 if (p.second.issued() != p.second.pending())
244 out << "/" << ccap_string(p.second.issued());
245 out << "/" << ccap_string(p.second.wanted())
246 << "@" << p.second.get_last_seq();
247 first = false;
7c673cae
FG
248 }
249 out << "}";
250 if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) {
251 out << ",l=" << in.get_loner();
252 if (in.get_loner() != in.get_wanted_loner())
253 out << "(" << in.get_wanted_loner() << ")";
254 }
255 }
256 if (!in.get_mds_caps_wanted().empty()) {
257 out << " mcw={";
94b18763
FG
258 bool first = true;
259 for (const auto &p : in.get_mds_caps_wanted()) {
260 if (!first)
7c673cae 261 out << ',';
94b18763
FG
262 out << p.first << '=' << ccap_string(p.second);
263 first = false;
7c673cae
FG
264 }
265 out << '}';
266 }
267
268 if (in.get_num_ref()) {
269 out << " |";
270 in.print_pin_set(out);
271 }
272
273 if (in.inode.export_pin != MDS_RANK_NONE) {
274 out << " export_pin=" << in.inode.export_pin;
275 }
f6b5b4d7
TL
276 if (in.state_test(CInode::STATE_DISTEPHEMERALPIN)) {
277 out << " distepin";
278 }
279 if (in.state_test(CInode::STATE_RANDEPHEMERALPIN)) {
280 out << " randepin";
281 }
7c673cae
FG
282
283 out << " " << &in;
284 out << "]";
285 return out;
286}
287
288ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si)
289{
290 out << "{scrub_start_version: " << si.scrub_start_version
291 << ", scrub_start_stamp: " << si.scrub_start_stamp
292 << ", last_scrub_version: " << si.last_scrub_version
293 << ", last_scrub_stamp: " << si.last_scrub_stamp;
294 return out;
295}
296
11fdf7f2
TL
297CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l)
298 :
299 mdcache(c),
300 first(f), last(l),
301 item_dirty(this),
302 item_caps(this),
303 item_open_file(this),
304 item_dirty_parent(this),
305 item_dirty_dirfrag_dir(this),
306 item_dirty_dirfrag_nest(this),
307 item_dirty_dirfrag_dirfragtree(this),
308 pop(c->decayrate),
309 versionlock(this, &versionlock_type),
310 authlock(this, &authlock_type),
311 linklock(this, &linklock_type),
312 dirfragtreelock(this, &dirfragtreelock_type),
313 filelock(this, &filelock_type),
314 xattrlock(this, &xattrlock_type),
315 snaplock(this, &snaplock_type),
316 nestlock(this, &nestlock_type),
317 flocklock(this, &flocklock_type),
318 policylock(this, &policylock_type)
319{
320 if (auth) state_set(STATE_AUTH);
321}
7c673cae
FG
322
323void CInode::print(ostream& out)
324{
325 out << *this;
326}
327
7c673cae
FG
328void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
329{
11fdf7f2 330 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
7c673cae
FG
331
332 if (client_need_snapflush.empty()) {
333 get(CInode::PIN_NEEDSNAPFLUSH);
334
335 // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
336 // long periods waiting for clients to flush their snaps.
337 auth_pin(this); // pin head inode...
338 }
339
94b18763 340 auto &clients = client_need_snapflush[snapid];
7c673cae
FG
341 if (clients.empty())
342 snapin->auth_pin(this); // ...and pin snapped/old inode!
343
344 clients.insert(client);
345}
346
347void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
348{
94b18763
FG
349 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
350 auto it = client_need_snapflush.find(snapid);
351 if (it == client_need_snapflush.end()) {
7c673cae
FG
352 dout(10) << " snapid not found" << dendl;
353 return;
354 }
94b18763
FG
355 size_t n = it->second.erase(client);
356 if (n == 0) {
7c673cae
FG
357 dout(10) << " client not found" << dendl;
358 return;
359 }
94b18763
FG
360 if (it->second.empty()) {
361 client_need_snapflush.erase(it);
7c673cae
FG
362 snapin->auth_unpin(this);
363
364 if (client_need_snapflush.empty()) {
365 put(CInode::PIN_NEEDSNAPFLUSH);
366 auth_unpin(this);
367 }
368 }
369}
370
494da23a 371pair<bool,bool> CInode::split_need_snapflush(CInode *cowin, CInode *in)
7c673cae 372{
11fdf7f2 373 dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
494da23a
TL
374 bool cowin_need_flush = false;
375 bool orig_need_flush = false;
376 auto it = client_need_snapflush.lower_bound(cowin->first);
377 while (it != client_need_snapflush.end() && it->first < in->first) {
11fdf7f2 378 ceph_assert(!it->second.empty());
94b18763 379 if (cowin->last >= it->first) {
7c673cae 380 cowin->auth_pin(this);
494da23a 381 cowin_need_flush = true;
94b18763
FG
382 ++it;
383 } else {
384 it = client_need_snapflush.erase(it);
385 }
7c673cae
FG
386 in->auth_unpin(this);
387 }
494da23a
TL
388
389 if (it != client_need_snapflush.end() && it->first <= in->last)
390 orig_need_flush = true;
391
392 return make_pair(cowin_need_flush, orig_need_flush);
7c673cae
FG
393}
394
395void CInode::mark_dirty_rstat()
396{
397 if (!state_test(STATE_DIRTYRSTAT)) {
11fdf7f2 398 dout(10) << __func__ << dendl;
7c673cae
FG
399 state_set(STATE_DIRTYRSTAT);
400 get(PIN_DIRTYRSTAT);
224ce89b
WB
401 CDentry *pdn = get_projected_parent_dn();
402 if (pdn->is_auth()) {
403 CDir *pdir = pdn->dir;
404 pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item);
405 mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock);
406 } else {
407 // under cross-MDS rename.
408 // DIRTYRSTAT flag will get cleared when rename finishes
11fdf7f2 409 ceph_assert(state_test(STATE_AMBIGUOUSAUTH));
224ce89b 410 }
7c673cae
FG
411 }
412}
413void CInode::clear_dirty_rstat()
414{
415 if (state_test(STATE_DIRTYRSTAT)) {
11fdf7f2 416 dout(10) << __func__ << dendl;
7c673cae
FG
417 state_clear(STATE_DIRTYRSTAT);
418 put(PIN_DIRTYRSTAT);
419 dirty_rstat_item.remove_myself();
420 }
421}
422
94b18763
FG
423CInode::projected_inode &CInode::project_inode(bool xattr, bool snap)
424{
11fdf7f2
TL
425 auto &pi = projected_nodes.empty() ?
426 projected_nodes.emplace_back(inode) :
94b18763 427 projected_nodes.emplace_back(projected_nodes.back().inode);
7c673cae
FG
428
429 if (scrub_infop && scrub_infop->last_scrub_dirty) {
94b18763
FG
430 pi.inode.last_scrub_stamp = scrub_infop->last_scrub_stamp;
431 pi.inode.last_scrub_version = scrub_infop->last_scrub_version;
7c673cae
FG
432 scrub_infop->last_scrub_dirty = false;
433 scrub_maybe_delete_info();
434 }
94b18763
FG
435
436 if (xattr) {
437 pi.xattrs.reset(new mempool_xattr_map(*get_projected_xattrs()));
438 ++num_projected_xattrs;
439 }
440
441 if (snap) {
11fdf7f2 442 project_snaprealm();
94b18763
FG
443 }
444
445 dout(15) << __func__ << " " << pi.inode.ino << dendl;
446 return pi;
7c673cae
FG
447}
448
449void CInode::pop_and_dirty_projected_inode(LogSegment *ls)
450{
11fdf7f2 451 ceph_assert(!projected_nodes.empty());
f6b5b4d7
TL
452 auto& front = projected_nodes.front();
453
94b18763
FG
454 dout(15) << __func__ << " " << front.inode.ino
455 << " v" << front.inode.version << dendl;
f6b5b4d7 456
7c673cae 457 int64_t old_pool = inode.layout.pool_id;
f6b5b4d7
TL
458 bool pin_update = inode.export_pin != front.inode.export_pin;
459 bool dist_update = inode.export_ephemeral_distributed_pin
460 != front.inode.export_ephemeral_distributed_pin;
7c673cae 461
94b18763 462 mark_dirty(front.inode.version, ls);
f6b5b4d7
TL
463
464 inode = std::move(front.inode);
465
466 if (pin_update)
f64942e4 467 maybe_export_pin(true);
f6b5b4d7
TL
468 if (dist_update)
469 maybe_ephemeral_dist_children(true);
7c673cae
FG
470
471 if (inode.is_backtrace_updated())
28e407b8 472 mark_dirty_parent(ls, old_pool != inode.layout.pool_id);
7c673cae 473
94b18763 474 if (front.xattrs) {
7c673cae 475 --num_projected_xattrs;
94b18763 476 xattrs = *front.xattrs;
7c673cae
FG
477 }
478
11fdf7f2
TL
479 if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) {
480 pop_projected_snaprealm(projected_nodes.front().snapnode, false);
7c673cae
FG
481 --num_projected_srnodes;
482 }
483
7c673cae
FG
484 projected_nodes.pop_front();
485}
486
9f95a23c
TL
487CInode::mempool_xattr_map *CInode::get_projected_xattrs()
488{
489 if (num_projected_xattrs > 0) {
490 for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
491 if (it->xattrs)
492 return it->xattrs.get();
493 }
494 return &xattrs;
495}
496
497CInode::mempool_xattr_map *CInode::get_previous_projected_xattrs()
498{
499 if (num_projected_xattrs > 0) {
500 for (auto it = ++projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
501 if (it->xattrs)
502 return it->xattrs.get();
503 }
504 return &xattrs;
505}
506
11fdf7f2
TL
507sr_t *CInode::prepare_new_srnode(snapid_t snapid)
508{
509 const sr_t *cur_srnode = get_projected_srnode();
510 sr_t *new_srnode;
511
512 if (cur_srnode) {
513 new_srnode = new sr_t(*cur_srnode);
514 if (!new_srnode->past_parents.empty()) {
515 // convert past_parents to past_parent_snaps
516 ceph_assert(snaprealm);
517 auto& snaps = snaprealm->get_snaps();
518 for (auto p : snaps) {
519 if (p >= new_srnode->current_parent_since)
520 break;
521 if (!new_srnode->snaps.count(p))
522 new_srnode->past_parent_snaps.insert(p);
523 }
524 new_srnode->seq = snaprealm->get_newest_seq();
525 new_srnode->past_parents.clear();
526 }
527 if (snaprealm)
528 snaprealm->past_parents_dirty = false;
529 } else {
530 if (snapid == 0)
531 snapid = mdcache->get_global_snaprealm()->get_newest_seq();
532 new_srnode = new sr_t();
533 new_srnode->seq = snapid;
534 new_srnode->created = snapid;
535 new_srnode->current_parent_since = get_oldest_snap();
536 }
537 return new_srnode;
538}
539
9f95a23c
TL
540const sr_t *CInode::get_projected_srnode() const {
541 if (num_projected_srnodes > 0) {
542 for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
543 if (it->snapnode != projected_inode::UNDEF_SRNODE)
544 return it->snapnode;
545 }
546 if (snaprealm)
547 return &snaprealm->srnode;
548 else
549 return NULL;
550}
551
11fdf7f2
TL
552void CInode::project_snaprealm(sr_t *new_srnode)
553{
554 dout(10) << __func__ << " " << new_srnode << dendl;
555 ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE);
556 projected_nodes.back().snapnode = new_srnode;
557 ++num_projected_srnodes;
558}
559
560void CInode::mark_snaprealm_global(sr_t *new_srnode)
561{
562 ceph_assert(!is_dir());
563 // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since'
564 new_srnode->last_destroyed = new_srnode->current_parent_since;
565 new_srnode->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
566 new_srnode->mark_parent_global();
567}
568
569void CInode::clear_snaprealm_global(sr_t *new_srnode)
570{
571 // restore 'current_parent_since'
572 new_srnode->current_parent_since = new_srnode->last_destroyed;
573 new_srnode->last_destroyed = 0;
574 new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq();
575 new_srnode->clear_parent_global();
576}
577
578bool CInode::is_projected_snaprealm_global() const
579{
580 const sr_t *srnode = get_projected_srnode();
581 if (srnode && srnode->is_parent_global())
582 return true;
583 return false;
584}
585
586void CInode::project_snaprealm_past_parent(SnapRealm *newparent)
587{
588 sr_t *new_snap = project_snaprealm();
589 record_snaprealm_past_parent(new_snap, newparent);
590}
591
592
7c673cae
FG
593/* if newparent != parent, add parent to past_parents
594 if parent DNE, we need to find what the parent actually is and fill that in */
11fdf7f2 595void CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent)
7c673cae 596{
11fdf7f2 597 ceph_assert(!new_snap->is_parent_global());
7c673cae
FG
598 SnapRealm *oldparent;
599 if (!snaprealm) {
600 oldparent = find_snaprealm();
11fdf7f2 601 } else {
7c673cae 602 oldparent = snaprealm->parent;
11fdf7f2 603 }
7c673cae
FG
604
605 if (newparent != oldparent) {
606 snapid_t oldparentseq = oldparent->get_newest_seq();
11fdf7f2
TL
607 if (oldparentseq + 1 > new_snap->current_parent_since) {
608 // copy old parent's snaps
609 const set<snapid_t>& snaps = oldparent->get_snaps();
610 auto p = snaps.lower_bound(new_snap->current_parent_since);
611 if (p != snaps.end())
612 new_snap->past_parent_snaps.insert(p, snaps.end());
613 if (oldparentseq > new_snap->seq)
614 new_snap->seq = oldparentseq;
7c673cae 615 }
11fdf7f2 616 new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae
FG
617 }
618}
619
11fdf7f2
TL
620void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *newparent,
621 CDentry *dn, bool primary_dn)
7c673cae 622{
11fdf7f2
TL
623 ceph_assert(new_snap->is_parent_global());
624 SnapRealm *oldparent = dn->get_dir()->inode->find_snaprealm();
625 auto& snaps = oldparent->get_snaps();
626
627 if (!primary_dn) {
628 auto p = snaps.lower_bound(dn->first);
629 if (p != snaps.end())
630 new_snap->past_parent_snaps.insert(p, snaps.end());
631 } else if (newparent != oldparent) {
632 // 'last_destroyed' is used as 'current_parent_since'
633 auto p = snaps.lower_bound(new_snap->last_destroyed);
634 if (p != snaps.end())
635 new_snap->past_parent_snaps.insert(p, snaps.end());
636 new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
637 }
638}
7c673cae 639
11fdf7f2
TL
640void CInode::early_pop_projected_snaprealm()
641{
642 ceph_assert(!projected_nodes.empty());
643 if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) {
644 pop_projected_snaprealm(projected_nodes.front().snapnode, true);
645 projected_nodes.front().snapnode = projected_inode::UNDEF_SRNODE;
646 --num_projected_srnodes;
7c673cae 647 }
11fdf7f2 648}
7c673cae 649
11fdf7f2
TL
650void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early)
651{
652 if (next_snaprealm) {
653 dout(10) << __func__ << (early ? " (early) " : " ")
654 << next_snaprealm << " seq " << next_snaprealm->seq << dendl;
655 bool invalidate_cached_snaps = false;
656 if (!snaprealm) {
657 open_snaprealm();
658 } else if (!snaprealm->srnode.past_parents.empty()) {
659 invalidate_cached_snaps = true;
660 // re-open past parents
661 snaprealm->close_parents();
662
663 dout(10) << " realm " << *snaprealm << " past_parents " << snaprealm->srnode.past_parents
664 << " -> " << next_snaprealm->past_parents << dendl;
665 }
666 auto old_flags = snaprealm->srnode.flags;
667 snaprealm->srnode = *next_snaprealm;
668 delete next_snaprealm;
7c673cae 669
11fdf7f2
TL
670 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
671 snaprealm->close_parents();
672 snaprealm->adjust_parent();
673 }
7c673cae 674
11fdf7f2
TL
675 // we should be able to open these up (or have them already be open).
676 bool ok = snaprealm->_open_parents(NULL);
677 ceph_assert(ok);
678
679 if (invalidate_cached_snaps)
680 snaprealm->invalidate_cached_snaps();
681
682 if (snaprealm->parent)
683 dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl;
684 } else {
685 dout(10) << __func__ << (early ? " (early) null" : " null") << dendl;
686 ceph_assert(snaprealm);
687 snaprealm->merge_to(NULL);
688 }
7c673cae
FG
689}
690
691
692// ====== CInode =======
693
694// dirfrags
695
11fdf7f2 696__u32 InodeStoreBase::hash_dentry_name(std::string_view dn)
7c673cae
FG
697{
698 int which = inode.dir_layout.dl_dir_hash;
699 if (!which)
700 which = CEPH_STR_HASH_LINUX;
11fdf7f2 701 ceph_assert(ceph_str_hash_valid(which));
7c673cae
FG
702 return ceph_str_hash(which, dn.data(), dn.length());
703}
704
11fdf7f2 705frag_t InodeStoreBase::pick_dirfrag(std::string_view dn)
7c673cae
FG
706{
707 if (dirfragtree.empty())
708 return frag_t(); // avoid the string hash if we can.
709
710 __u32 h = hash_dentry_name(dn);
711 return dirfragtree[h];
712}
713
9f95a23c 714std::pair<bool, std::vector<CDir*>> CInode::get_dirfrags_under(frag_t fg)
7c673cae 715{
9f95a23c
TL
716 std::pair<bool, std::vector<CDir*>> result;
717 auto& all = result.first;
718 auto& dirs = result.second;
719 all = false;
720
721 if (auto it = dirfrags.find(fg); it != dirfrags.end()){
722 all = true;
723 dirs.push_back(it->second);
724 return result;
7c673cae 725 }
9f95a23c
TL
726
727 int total = 0;
728 for(auto &[_fg, _dir] : dirfrags){
729 // frag_t.bits() can indicate the depth of the partition in the directory tree
730 // e.g.
731 // 01* : bit = 2, on the second floor
732 // *
733 // 0* 1*
734 // 00* 01* 10* 11* -- > level 2, bit = 2
735 // so fragA.bits > fragB.bits means fragA is deeper than fragB
736
737 if (fg.bits() >= _fg.bits()) {
738 if (_fg.contains(fg)) {
739 all = true;
740 return result;
741 }
742 } else {
743 if (fg.contains(_fg)) {
744 dirs.push_back(_dir);
745 // we can calculate how many sub slices a slice can be divided into
746 // frag_t(*) can be divided into two frags belonging to the first layer(0* 1*)
747 // or 2^2 frags belonging to the second layer(00* 01* 10* 11*)
748 // or (1 << (24 - frag_t(*).bits)) frags belonging to the 24th level
749 total += 1 << (24 - _fg.bits());
11fdf7f2 750 }
7c673cae 751 }
94b18763 752 }
7c673cae 753
9f95a23c
TL
754 // we convert all the frags into the frags of 24th layer to calculate whether all the frags are included in the memory cache
755 all = ((1<<(24-fg.bits())) == total);
756 return result;
7c673cae
FG
757}
758
759void CInode::verify_dirfrags()
760{
761 bool bad = false;
94b18763
FG
762 for (const auto &p : dirfrags) {
763 if (!dirfragtree.is_leaf(p.first)) {
764 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
765 << ": " << *p.second << dendl;
7c673cae
FG
766 bad = true;
767 }
768 }
11fdf7f2 769 ceph_assert(!bad);
7c673cae
FG
770}
771
772void CInode::force_dirfrags()
773{
774 bool bad = false;
94b18763
FG
775 for (auto &p : dirfrags) {
776 if (!dirfragtree.is_leaf(p.first)) {
777 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
778 << ": " << *p.second << dendl;
7c673cae
FG
779 bad = true;
780 }
781 }
782
783 if (bad) {
11fdf7f2 784 frag_vec_t leaves;
7c673cae 785 dirfragtree.get_leaves(leaves);
11fdf7f2
TL
786 for (const auto& leaf : leaves) {
787 mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true);
788 }
7c673cae
FG
789 }
790
791 verify_dirfrags();
792}
793
794CDir *CInode::get_approx_dirfrag(frag_t fg)
795{
796 CDir *dir = get_dirfrag(fg);
797 if (dir) return dir;
798
799 // find a child?
9f95a23c
TL
800 auto&& p = get_dirfrags_under(fg);
801 if (!p.second.empty())
802 return p.second.front();
7c673cae
FG
803
804 // try parents?
805 while (fg.bits() > 0) {
806 fg = fg.parent();
807 dir = get_dirfrag(fg);
808 if (dir) return dir;
809 }
810 return NULL;
811}
812
7c673cae
FG
813CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg)
814{
11fdf7f2 815 ceph_assert(is_dir());
7c673cae
FG
816
817 // have it?
818 CDir *dir = get_dirfrag(fg);
819 if (!dir) {
820 // create it.
11fdf7f2 821 ceph_assert(is_auth() || mdcache->mds->is_any_replay());
7c673cae
FG
822 dir = new CDir(this, fg, mdcache, is_auth());
823 add_dirfrag(dir);
824 }
825 return dir;
826}
827
828CDir *CInode::add_dirfrag(CDir *dir)
829{
11fdf7f2
TL
830 auto em = dirfrags.emplace(std::piecewise_construct, std::forward_as_tuple(dir->dirfrag().frag), std::forward_as_tuple(dir));
831 ceph_assert(em.second);
7c673cae
FG
832
833 if (stickydir_ref > 0) {
834 dir->state_set(CDir::STATE_STICKY);
835 dir->get(CDir::PIN_STICKY);
836 }
837
f6b5b4d7 838 maybe_pin();
7c673cae
FG
839
840 return dir;
841}
842
843void CInode::close_dirfrag(frag_t fg)
844{
11fdf7f2
TL
845 dout(14) << __func__ << " " << fg << dendl;
846 ceph_assert(dirfrags.count(fg));
7c673cae
FG
847
848 CDir *dir = dirfrags[fg];
849 dir->remove_null_dentries();
850
851 // clear dirty flag
852 if (dir->is_dirty())
853 dir->mark_clean();
854
855 if (stickydir_ref > 0) {
856 dir->state_clear(CDir::STATE_STICKY);
857 dir->put(CDir::PIN_STICKY);
858 }
1adf2230
AA
859
860 if (dir->is_subtree_root())
861 num_subtree_roots--;
7c673cae
FG
862
863 // dump any remaining dentries, for debugging purposes
94b18763
FG
864 for (const auto &p : dir->items)
865 dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl;
7c673cae 866
11fdf7f2 867 ceph_assert(dir->get_num_ref() == 0);
7c673cae
FG
868 delete dir;
869 dirfrags.erase(fg);
870}
871
872void CInode::close_dirfrags()
873{
874 while (!dirfrags.empty())
875 close_dirfrag(dirfrags.begin()->first);
876}
877
878bool CInode::has_subtree_root_dirfrag(int auth)
879{
1adf2230
AA
880 if (num_subtree_roots > 0) {
881 if (auth == -1)
7c673cae 882 return true;
1adf2230
AA
883 for (const auto &p : dirfrags) {
884 if (p.second->is_subtree_root() &&
885 p.second->dir_auth.first == auth)
886 return true;
887 }
94b18763 888 }
7c673cae
FG
889 return false;
890}
891
892bool CInode::has_subtree_or_exporting_dirfrag()
893{
1adf2230
AA
894 if (num_subtree_roots > 0 || num_exporting_dirs > 0)
895 return true;
7c673cae
FG
896 return false;
897}
898
899void CInode::get_stickydirs()
900{
901 if (stickydir_ref == 0) {
902 get(PIN_STICKYDIRS);
94b18763
FG
903 for (const auto &p : dirfrags) {
904 p.second->state_set(CDir::STATE_STICKY);
905 p.second->get(CDir::PIN_STICKY);
7c673cae
FG
906 }
907 }
908 stickydir_ref++;
909}
910
911void CInode::put_stickydirs()
912{
11fdf7f2 913 ceph_assert(stickydir_ref > 0);
7c673cae
FG
914 stickydir_ref--;
915 if (stickydir_ref == 0) {
916 put(PIN_STICKYDIRS);
94b18763
FG
917 for (const auto &p : dirfrags) {
918 p.second->state_clear(CDir::STATE_STICKY);
919 p.second->put(CDir::PIN_STICKY);
7c673cae
FG
920 }
921 }
922}
923
924
925
926
927
928// pins
929
930void CInode::first_get()
931{
932 // pin my dentry?
933 if (parent)
934 parent->get(CDentry::PIN_INODEPIN);
935}
936
937void CInode::last_put()
938{
939 // unpin my dentry?
940 if (parent)
941 parent->put(CDentry::PIN_INODEPIN);
942}
943
944void CInode::_put()
945{
946 if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
947 mdcache->maybe_eval_stray(this, true);
948}
949
950void CInode::add_remote_parent(CDentry *p)
951{
952 if (remote_parents.empty())
953 get(PIN_REMOTEPARENT);
954 remote_parents.insert(p);
955}
956void CInode::remove_remote_parent(CDentry *p)
957{
958 remote_parents.erase(p);
959 if (remote_parents.empty())
960 put(PIN_REMOTEPARENT);
961}
962
963
964
965
966CDir *CInode::get_parent_dir()
967{
968 if (parent)
969 return parent->dir;
970 return NULL;
971}
972CDir *CInode::get_projected_parent_dir()
973{
974 CDentry *p = get_projected_parent_dn();
975 if (p)
976 return p->dir;
977 return NULL;
978}
979CInode *CInode::get_parent_inode()
980{
981 if (parent)
982 return parent->dir->inode;
983 return NULL;
984}
985
11fdf7f2 986bool CInode::is_ancestor_of(const CInode *other) const
7c673cae
FG
987{
988 while (other) {
989 if (other == this)
990 return true;
11fdf7f2
TL
991 const CDentry *pdn = other->get_oldest_parent_dn();
992 if (!pdn) {
993 ceph_assert(other->is_base());
7c673cae 994 break;
11fdf7f2
TL
995 }
996 other = pdn->get_dir()->get_inode();
997 }
998 return false;
999}
1000
1001bool CInode::is_projected_ancestor_of(const CInode *other) const
1002{
1003 while (other) {
1004 if (other == this)
1005 return true;
1006 const CDentry *pdn = other->get_projected_parent_dn();
1007 if (!pdn) {
1008 ceph_assert(other->is_base());
1009 break;
1010 }
1011 other = pdn->get_dir()->get_inode();
7c673cae
FG
1012 }
1013 return false;
1014}
1015
1016/*
1017 * Because a non-directory inode may have multiple links, the use_parent
1018 * argument allows selecting which parent to use for path construction. This
1019 * argument is only meaningful for the final component (i.e. the first of the
1020 * nested calls) because directories cannot have multiple hard links. If
1021 * use_parent is NULL and projected is true, the primary parent's projected
1022 * inode is used all the way up the path chain. Otherwise the primary parent
1023 * stable inode is used.
1024 */
1025void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const
1026{
1027 if (!use_parent) {
1028 use_parent = projected ? get_projected_parent_dn() : parent;
1029 }
1030
1031 if (use_parent) {
1032 use_parent->make_path_string(s, projected);
1033 } else if (is_root()) {
1034 s = "";
1035 } else if (is_mdsdir()) {
1036 char t[40];
1037 uint64_t eino(ino());
1038 eino -= MDS_INO_MDSDIR_OFFSET;
1039 snprintf(t, sizeof(t), "~mds%" PRId64, eino);
1040 s = t;
1041 } else {
1042 char n[40];
1043 uint64_t eino(ino());
1044 snprintf(n, sizeof(n), "#%" PRIx64, eino);
1045 s += n;
1046 }
1047}
1048
1049void CInode::make_path(filepath& fp, bool projected) const
1050{
1051 const CDentry *use_parent = projected ? get_projected_parent_dn() : parent;
1052 if (use_parent) {
11fdf7f2 1053 ceph_assert(!is_base());
7c673cae
FG
1054 use_parent->make_path(fp, projected);
1055 } else {
1056 fp = filepath(ino());
1057 }
1058}
1059
1060void CInode::name_stray_dentry(string& dname)
1061{
1062 char s[20];
1063 snprintf(s, sizeof(s), "%llx", (unsigned long long)inode.ino.val);
1064 dname = s;
1065}
1066
1067version_t CInode::pre_dirty()
1068{
1069 version_t pv;
1070 CDentry* _cdentry = get_projected_parent_dn();
1071 if (_cdentry) {
1072 pv = _cdentry->pre_dirty(get_projected_version());
1073 dout(10) << "pre_dirty " << pv << " (current v " << inode.version << ")" << dendl;
1074 } else {
11fdf7f2 1075 ceph_assert(is_base());
7c673cae
FG
1076 pv = get_projected_version() + 1;
1077 }
94b18763 1078 // force update backtrace for old format inode (see mempool_inode::decode)
7c673cae 1079 if (inode.backtrace_version == 0 && !projected_nodes.empty()) {
94b18763
FG
1080 mempool_inode &pi = projected_nodes.back().inode;
1081 if (pi.backtrace_version == 0)
1082 pi.update_backtrace(pv);
7c673cae
FG
1083 }
1084 return pv;
1085}
1086
1087void CInode::_mark_dirty(LogSegment *ls)
1088{
1089 if (!state_test(STATE_DIRTY)) {
1090 state_set(STATE_DIRTY);
1091 get(PIN_DIRTY);
11fdf7f2 1092 ceph_assert(ls);
7c673cae
FG
1093 }
1094
1095 // move myself to this segment's dirty list
1096 if (ls)
1097 ls->dirty_inodes.push_back(&item_dirty);
1098}
1099
1100void CInode::mark_dirty(version_t pv, LogSegment *ls) {
1101
11fdf7f2 1102 dout(10) << __func__ << " " << *this << dendl;
7c673cae
FG
1103
1104 /*
1105 NOTE: I may already be dirty, but this fn _still_ needs to be called so that
1106 the directory is (perhaps newly) dirtied, and so that parent_dir_version is
1107 updated below.
1108 */
1109
1110 // only auth can get dirty. "dirty" async data in replicas is relative to
1111 // filelock state, not the dirty flag.
11fdf7f2 1112 ceph_assert(is_auth());
7c673cae
FG
1113
1114 // touch my private version
11fdf7f2 1115 ceph_assert(inode.version < pv);
7c673cae
FG
1116 inode.version = pv;
1117 _mark_dirty(ls);
1118
1119 // mark dentry too
1120 if (parent)
1121 parent->mark_dirty(pv, ls);
1122}
1123
1124
1125void CInode::mark_clean()
1126{
11fdf7f2 1127 dout(10) << __func__ << " " << *this << dendl;
7c673cae
FG
1128 if (state_test(STATE_DIRTY)) {
1129 state_clear(STATE_DIRTY);
1130 put(PIN_DIRTY);
1131
1132 // remove myself from ls dirty list
1133 item_dirty.remove_myself();
1134 }
1135}
1136
1137
1138// --------------
1139// per-inode storage
1140// (currently for root inode only)
1141
1142struct C_IO_Inode_Stored : public CInodeIOContext {
1143 version_t version;
1144 Context *fin;
1145 C_IO_Inode_Stored(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1146 void finish(int r) override {
1147 in->_stored(r, version, fin);
1148 }
91327a77
AA
1149 void print(ostream& out) const override {
1150 out << "inode_store(" << in->ino() << ")";
1151 }
7c673cae
FG
1152};
1153
11fdf7f2 1154object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix)
7c673cae
FG
1155{
1156 char n[60];
11fdf7f2
TL
1157 snprintf(n, sizeof(n), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)fg);
1158 ceph_assert(strlen(n) + suffix.size() < sizeof n);
1159 strncat(n, suffix.data(), suffix.size());
7c673cae
FG
1160 return object_t(n);
1161}
1162
11fdf7f2 1163void CInode::store(MDSContext *fin)
7c673cae 1164{
11fdf7f2
TL
1165 dout(10) << __func__ << " " << get_version() << dendl;
1166 ceph_assert(is_base());
7c673cae
FG
1167
1168 if (snaprealm)
1169 purge_stale_snap_data(snaprealm->get_snaps());
1170
1171 // encode
1172 bufferlist bl;
1173 string magic = CEPH_FS_ONDISK_MAGIC;
11fdf7f2
TL
1174 using ceph::encode;
1175 encode(magic, bl);
7c673cae
FG
1176 encode_store(bl, mdcache->mds->mdsmap->get_up_features());
1177
1178 // write it.
1179 SnapContext snapc;
1180 ObjectOperation m;
1181 m.write_full(bl);
1182
1183 object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode");
1184 object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());
1185
1186 Context *newfin =
1187 new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin),
1188 mdcache->mds->finisher);
1189 mdcache->mds->objecter->mutate(oid, oloc, m, snapc,
1190 ceph::real_clock::now(), 0,
1191 newfin);
1192}
1193
1194void CInode::_stored(int r, version_t v, Context *fin)
1195{
1196 if (r < 0) {
1197 dout(1) << "store error " << r << " v " << v << " on " << *this << dendl;
d2e6a577
FG
1198 mdcache->mds->clog->error() << "failed to store inode " << ino()
1199 << " object: " << cpp_strerror(r);
7c673cae
FG
1200 mdcache->mds->handle_write_error(r);
1201 fin->complete(r);
1202 return;
1203 }
1204
11fdf7f2 1205 dout(10) << __func__ << " " << v << " on " << *this << dendl;
7c673cae
FG
1206 if (v == get_projected_version())
1207 mark_clean();
1208
1209 fin->complete(0);
1210}
1211
11fdf7f2 1212void CInode::flush(MDSContext *fin)
7c673cae 1213{
11fdf7f2
TL
1214 dout(10) << __func__ << " " << *this << dendl;
1215 ceph_assert(is_auth() && can_auth_pin());
7c673cae
FG
1216
1217 MDSGatherBuilder gather(g_ceph_context);
1218
1219 if (is_dirty_parent()) {
1220 store_backtrace(gather.new_sub());
1221 }
1222 if (is_dirty()) {
1223 if (is_base()) {
1224 store(gather.new_sub());
1225 } else {
1226 parent->dir->commit(0, gather.new_sub());
1227 }
1228 }
1229
1230 if (gather.has_subs()) {
1231 gather.set_finisher(fin);
1232 gather.activate();
1233 } else {
1234 fin->complete(0);
1235 }
1236}
1237
1238struct C_IO_Inode_Fetched : public CInodeIOContext {
1239 bufferlist bl, bl2;
1240 Context *fin;
1241 C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {}
1242 void finish(int r) override {
1243 // Ignore 'r', because we fetch from two places, so r is usually ENOENT
1244 in->_fetched(bl, bl2, fin);
1245 }
91327a77
AA
1246 void print(ostream& out) const override {
1247 out << "inode_fetch(" << in->ino() << ")";
1248 }
7c673cae
FG
1249};
1250
11fdf7f2 1251void CInode::fetch(MDSContext *fin)
7c673cae 1252{
11fdf7f2 1253 dout(10) << __func__ << dendl;
7c673cae
FG
1254
1255 C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin);
1256 C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, mdcache->mds->finisher));
1257
1258 object_t oid = CInode::get_object_name(ino(), frag_t(), "");
1259 object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());
1260
1261 // Old on-disk format: inode stored in xattr of a dirfrag
1262 ObjectOperation rd;
1263 rd.getxattr("inode", &c->bl, NULL);
1264 mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub());
1265
1266 // Current on-disk format: inode stored in a .inode object
1267 object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode");
1268 mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub());
1269
1270 gather.activate();
1271}
1272
1273void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
1274{
11fdf7f2
TL
1275 dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl;
1276 bufferlist::const_iterator p;
7c673cae 1277 if (bl2.length()) {
11fdf7f2 1278 p = bl2.cbegin();
7c673cae 1279 } else if (bl.length()) {
11fdf7f2 1280 p = bl.cbegin();
7c673cae 1281 } else {
d2e6a577 1282 derr << "No data while reading inode " << ino() << dendl;
7c673cae
FG
1283 fin->complete(-ENOENT);
1284 return;
1285 }
1286
11fdf7f2 1287 using ceph::decode;
7c673cae
FG
1288 // Attempt decode
1289 try {
1290 string magic;
11fdf7f2 1291 decode(magic, p);
7c673cae
FG
1292 dout(10) << " magic is '" << magic << "' (expecting '"
1293 << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
1294 if (magic != CEPH_FS_ONDISK_MAGIC) {
1295 dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
1296 << "'" << dendl;
1297 fin->complete(-EINVAL);
1298 } else {
1299 decode_store(p);
1300 dout(10) << "_fetched " << *this << dendl;
1301 fin->complete(0);
1302 }
1303 } catch (buffer::error &err) {
d2e6a577 1304 derr << "Corrupt inode " << ino() << ": " << err << dendl;
7c673cae
FG
1305 fin->complete(-EINVAL);
1306 return;
1307 }
1308}
1309
1310void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt)
1311{
1312 bt.ino = inode.ino;
1313 bt.ancestors.clear();
1314 bt.pool = pool;
1315
1316 CInode *in = this;
1317 CDentry *pdn = get_parent_dn();
1318 while (pdn) {
1319 CInode *diri = pdn->get_dir()->get_inode();
94b18763 1320 bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), in->inode.version));
7c673cae
FG
1321 in = diri;
1322 pdn = in->get_parent_dn();
1323 }
94b18763 1324 for (auto &p : inode.old_pools) {
7c673cae 1325 // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
94b18763
FG
1326 if (p != pool)
1327 bt.old_pools.insert(p);
7c673cae
FG
1328 }
1329}
1330
1331struct C_IO_Inode_StoredBacktrace : public CInodeIOContext {
1332 version_t version;
1333 Context *fin;
1334 C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1335 void finish(int r) override {
1336 in->_stored_backtrace(r, version, fin);
1337 }
91327a77
AA
1338 void print(ostream& out) const override {
1339 out << "backtrace_store(" << in->ino() << ")";
1340 }
7c673cae
FG
1341};
1342
11fdf7f2 1343void CInode::store_backtrace(MDSContext *fin, int op_prio)
7c673cae 1344{
11fdf7f2
TL
1345 dout(10) << __func__ << " on " << *this << dendl;
1346 ceph_assert(is_dirty_parent());
7c673cae
FG
1347
1348 if (op_prio < 0)
1349 op_prio = CEPH_MSG_PRIO_DEFAULT;
1350
1351 auth_pin(this);
1352
1353 const int64_t pool = get_backtrace_pool();
1354 inode_backtrace_t bt;
1355 build_backtrace(pool, bt);
1356 bufferlist parent_bl;
11fdf7f2
TL
1357 using ceph::encode;
1358 encode(bt, parent_bl);
7c673cae
FG
1359
1360 ObjectOperation op;
1361 op.priority = op_prio;
1362 op.create(false);
1363 op.setxattr("parent", parent_bl);
1364
1365 bufferlist layout_bl;
11fdf7f2 1366 encode(inode.layout, layout_bl, mdcache->mds->mdsmap->get_up_features());
7c673cae
FG
1367 op.setxattr("layout", layout_bl);
1368
1369 SnapContext snapc;
1370 object_t oid = get_object_name(ino(), frag_t(), "");
1371 object_locator_t oloc(pool);
1372 Context *fin2 = new C_OnFinisher(
1373 new C_IO_Inode_StoredBacktrace(this, inode.backtrace_version, fin),
1374 mdcache->mds->finisher);
1375
1376 if (!state_test(STATE_DIRTYPOOL) || inode.old_pools.empty()) {
1377 dout(20) << __func__ << ": no dirtypool or no old pools" << dendl;
1378 mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
1379 ceph::real_clock::now(),
1380 0, fin2);
1381 return;
1382 }
1383
1384 C_GatherBuilder gather(g_ceph_context, fin2);
1385 mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
1386 ceph::real_clock::now(),
1387 0, gather.new_sub());
1388
1389 // In the case where DIRTYPOOL is set, we update all old pools backtraces
1390 // such that anyone reading them will see the new pool ID in
1391 // inode_backtrace_t::pool and go read everything else from there.
94b18763
FG
1392 for (const auto &p : inode.old_pools) {
1393 if (p == pool)
7c673cae
FG
1394 continue;
1395
94b18763 1396 dout(20) << __func__ << ": updating old pool " << p << dendl;
7c673cae
FG
1397
1398 ObjectOperation op;
1399 op.priority = op_prio;
1400 op.create(false);
1401 op.setxattr("parent", parent_bl);
1402
94b18763 1403 object_locator_t oloc(p);
7c673cae
FG
1404 mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
1405 ceph::real_clock::now(),
1406 0, gather.new_sub());
1407 }
1408 gather.activate();
1409}
1410
1411void CInode::_stored_backtrace(int r, version_t v, Context *fin)
1412{
1413 if (r == -ENOENT) {
1414 const int64_t pool = get_backtrace_pool();
1415 bool exists = mdcache->mds->objecter->with_osdmap(
1416 [pool](const OSDMap &osd_map) {
1417 return osd_map.have_pg_pool(pool);
1418 });
1419
1420 // This ENOENT is because the pool doesn't exist (the user deleted it
1421 // out from under us), so the backtrace can never be written, so pretend
1422 // to succeed so that the user can proceed to e.g. delete the file.
1423 if (!exists) {
11fdf7f2 1424 dout(4) << __func__ << " got ENOENT: a data pool was deleted "
7c673cae
FG
1425 "beneath us!" << dendl;
1426 r = 0;
1427 }
1428 }
1429
1430 if (r < 0) {
1431 dout(1) << "store backtrace error " << r << " v " << v << dendl;
1432 mdcache->mds->clog->error() << "failed to store backtrace on ino "
1433 << ino() << " object"
1434 << ", pool " << get_backtrace_pool()
1435 << ", errno " << r;
1436 mdcache->mds->handle_write_error(r);
1437 if (fin)
1438 fin->complete(r);
1439 return;
1440 }
1441
11fdf7f2 1442 dout(10) << __func__ << " v " << v << dendl;
7c673cae
FG
1443
1444 auth_unpin(this);
1445 if (v == inode.backtrace_version)
1446 clear_dirty_parent();
1447 if (fin)
1448 fin->complete(0);
1449}
1450
1451void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
1452{
1453 mdcache->fetch_backtrace(inode.ino, get_backtrace_pool(), *backtrace, fin);
1454}
1455
28e407b8 1456void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool)
7c673cae
FG
1457{
1458 if (!state_test(STATE_DIRTYPARENT)) {
11fdf7f2 1459 dout(10) << __func__ << dendl;
7c673cae
FG
1460 state_set(STATE_DIRTYPARENT);
1461 get(PIN_DIRTYPARENT);
11fdf7f2 1462 ceph_assert(ls);
7c673cae
FG
1463 }
1464 if (dirty_pool)
1465 state_set(STATE_DIRTYPOOL);
1466 if (ls)
1467 ls->dirty_parent_inodes.push_back(&item_dirty_parent);
1468}
1469
1470void CInode::clear_dirty_parent()
1471{
1472 if (state_test(STATE_DIRTYPARENT)) {
11fdf7f2 1473 dout(10) << __func__ << dendl;
7c673cae
FG
1474 state_clear(STATE_DIRTYPARENT);
1475 state_clear(STATE_DIRTYPOOL);
1476 put(PIN_DIRTYPARENT);
1477 item_dirty_parent.remove_myself();
1478 }
1479}
1480
1481void CInode::verify_diri_backtrace(bufferlist &bl, int err)
1482{
1483 if (is_base() || is_dirty_parent() || !is_auth())
1484 return;
1485
11fdf7f2 1486 dout(10) << __func__ << dendl;
7c673cae
FG
1487
1488 if (err == 0) {
1489 inode_backtrace_t backtrace;
11fdf7f2
TL
1490 using ceph::decode;
1491 decode(backtrace, bl);
7c673cae
FG
1492 CDentry *pdn = get_parent_dn();
1493 if (backtrace.ancestors.empty() ||
94b18763 1494 backtrace.ancestors[0].dname != pdn->get_name() ||
7c673cae
FG
1495 backtrace.ancestors[0].dirino != pdn->get_dir()->ino())
1496 err = -EINVAL;
1497 }
1498
1499 if (err) {
1500 MDSRank *mds = mdcache->mds;
d2e6a577 1501 mds->clog->error() << "bad backtrace on directory inode " << ino();
11fdf7f2 1502 ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1));
7c673cae 1503
28e407b8 1504 mark_dirty_parent(mds->mdlog->get_current_segment(), false);
7c673cae
FG
1505 mds->mdlog->flush();
1506 }
1507}
1508
1509// ------------------
1510// parent dir
1511
1512
1513void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features,
1514 const bufferlist *snap_blob) const
1515{
11fdf7f2
TL
1516 using ceph::encode;
1517 encode(inode, bl, features);
7c673cae 1518 if (is_symlink())
11fdf7f2
TL
1519 encode(symlink, bl);
1520 encode(dirfragtree, bl);
1521 encode(xattrs, bl);
7c673cae 1522 if (snap_blob)
11fdf7f2 1523 encode(*snap_blob, bl);
7c673cae 1524 else
11fdf7f2
TL
1525 encode(bufferlist(), bl);
1526 encode(old_inodes, bl, features);
1527 encode(oldest_snap, bl);
1528 encode(damage_flags, bl);
7c673cae
FG
1529}
1530
1531void InodeStoreBase::encode(bufferlist &bl, uint64_t features,
1532 const bufferlist *snap_blob) const
1533{
1534 ENCODE_START(6, 4, bl);
1535 encode_bare(bl, features, snap_blob);
1536 ENCODE_FINISH(bl);
1537}
1538
1539void CInode::encode_store(bufferlist& bl, uint64_t features)
1540{
1541 bufferlist snap_blob;
1542 encode_snap_blob(snap_blob);
1543 InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(),
1544 &snap_blob);
1545}
1546
11fdf7f2 1547void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl,
7c673cae
FG
1548 bufferlist& snap_blob, __u8 struct_v)
1549{
11fdf7f2
TL
1550 using ceph::decode;
1551 decode(inode, bl);
94b18763
FG
1552 if (is_symlink()) {
1553 std::string tmp;
11fdf7f2
TL
1554 decode(tmp, bl);
1555 symlink = std::string_view(tmp);
94b18763 1556 }
11fdf7f2 1557 decode(dirfragtree, bl);
e306af50 1558 decode_noshare(xattrs, bl);
11fdf7f2 1559 decode(snap_blob, bl);
7c673cae 1560
11fdf7f2 1561 decode(old_inodes, bl);
7c673cae
FG
1562 if (struct_v == 2 && inode.is_dir()) {
1563 bool default_layout_exists;
11fdf7f2 1564 decode(default_layout_exists, bl);
7c673cae 1565 if (default_layout_exists) {
11fdf7f2
TL
1566 decode(struct_v, bl); // this was a default_file_layout
1567 decode(inode.layout, bl); // but we only care about the layout portion
7c673cae
FG
1568 }
1569 }
1570
1571 if (struct_v >= 5) {
1572 // InodeStore is embedded in dentries without proper versioning, so
1573 // we consume up to the end of the buffer
1574 if (!bl.end()) {
11fdf7f2 1575 decode(oldest_snap, bl);
7c673cae
FG
1576 }
1577
1578 if (!bl.end()) {
11fdf7f2 1579 decode(damage_flags, bl);
7c673cae
FG
1580 }
1581 }
1582}
1583
1584
11fdf7f2 1585void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob)
7c673cae
FG
1586{
1587 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
1588 decode_bare(bl, snap_blob, struct_v);
1589 DECODE_FINISH(bl);
1590}
1591
11fdf7f2 1592void CInode::decode_store(bufferlist::const_iterator& bl)
7c673cae
FG
1593{
1594 bufferlist snap_blob;
1595 InodeStoreBase::decode(bl, snap_blob);
1596 decode_snap_blob(snap_blob);
1597}
1598
1599// ------------------
1600// locking
1601
9f95a23c
TL
1602SimpleLock* CInode::get_lock(int type)
1603{
1604 switch (type) {
1605 case CEPH_LOCK_IVERSION: return &versionlock;
1606 case CEPH_LOCK_IFILE: return &filelock;
1607 case CEPH_LOCK_IAUTH: return &authlock;
1608 case CEPH_LOCK_ILINK: return &linklock;
1609 case CEPH_LOCK_IDFT: return &dirfragtreelock;
1610 case CEPH_LOCK_IXATTR: return &xattrlock;
1611 case CEPH_LOCK_ISNAP: return &snaplock;
1612 case CEPH_LOCK_INEST: return &nestlock;
1613 case CEPH_LOCK_IFLOCK: return &flocklock;
1614 case CEPH_LOCK_IPOLICY: return &policylock;
1615 }
1616 return 0;
1617}
1618
7c673cae
FG
1619void CInode::set_object_info(MDSCacheObjectInfo &info)
1620{
1621 info.ino = ino();
1622 info.snapid = last;
1623}
1624
9f95a23c 1625void CInode::encode_lock_iauth(bufferlist& bl)
7c673cae 1626{
9f95a23c
TL
1627 ENCODE_START(1, 1, bl);
1628 encode(inode.version, bl);
1629 encode(inode.ctime, bl);
1630 encode(inode.mode, bl);
1631 encode(inode.uid, bl);
1632 encode(inode.gid, bl);
1633 ENCODE_FINISH(bl);
1634}
7c673cae 1635
9f95a23c
TL
1636void CInode::decode_lock_iauth(bufferlist::const_iterator& p)
1637{
1638 DECODE_START(1, p);
1639 decode(inode.version, p);
1640 utime_t tm;
1641 decode(tm, p);
1642 if (inode.ctime < tm) inode.ctime = tm;
1643 decode(inode.mode, p);
1644 decode(inode.uid, p);
1645 decode(inode.gid, p);
1646 DECODE_FINISH(p);
1647}
1648
1649void CInode::encode_lock_ilink(bufferlist& bl)
1650{
1651 ENCODE_START(1, 1, bl);
1652 encode(inode.version, bl);
1653 encode(inode.ctime, bl);
1654 encode(inode.nlink, bl);
1655 ENCODE_FINISH(bl);
1656}
1657
1658void CInode::decode_lock_ilink(bufferlist::const_iterator& p)
1659{
1660 DECODE_START(1, p);
1661 decode(inode.version, p);
1662 utime_t tm;
1663 decode(tm, p);
1664 if (inode.ctime < tm) inode.ctime = tm;
1665 decode(inode.nlink, p);
1666 DECODE_FINISH(p);
1667}
1668
1669void CInode::encode_lock_idft(bufferlist& bl)
1670{
1671 ENCODE_START(1, 1, bl);
1672 if (is_auth()) {
11fdf7f2 1673 encode(inode.version, bl);
9f95a23c
TL
1674 } else {
1675 // treat flushing as dirty when rejoining cache
1676 bool dirty = dirfragtreelock.is_dirty_or_flushing();
1677 encode(dirty, bl);
1678 }
1679 {
1680 // encode the raw tree
1681 encode(dirfragtree, bl);
1682
1683 // also specify which frags are mine
1684 set<frag_t> myfrags;
1685 auto&& dfls = get_dirfrags();
1686 for (const auto& dir : dfls) {
1687 if (dir->is_auth()) {
1688 frag_t fg = dir->get_frag();
1689 myfrags.insert(fg);
1690 }
1691 }
1692 encode(myfrags, bl);
1693 }
1694 ENCODE_FINISH(bl);
1695}
1696
1697void CInode::decode_lock_idft(bufferlist::const_iterator& p)
1698{
1699 DECODE_START(1, p);
1700 if (is_auth()) {
1701 bool replica_dirty;
1702 decode(replica_dirty, p);
1703 if (replica_dirty) {
1704 dout(10) << __func__ << " setting dftlock dirty flag" << dendl;
1705 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1706 }
1707 } else {
1708 decode(inode.version, p);
1709 }
1710 {
1711 fragtree_t temp;
1712 decode(temp, p);
1713 set<frag_t> authfrags;
1714 decode(authfrags, p);
7c673cae 1715 if (is_auth()) {
9f95a23c
TL
1716 // auth. believe replica's auth frags only.
1717 for (auto fg : authfrags) {
1718 if (!dirfragtree.is_leaf(fg)) {
1719 dout(10) << " forcing frag " << fg << " to leaf (split|merge)" << dendl;
1720 dirfragtree.force_to_leaf(g_ceph_context, fg);
1721 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1722 }
1723 }
7c673cae 1724 } else {
9f95a23c
TL
1725 // replica. take the new tree, BUT make sure any open
1726 // dirfrags remain leaves (they may have split _after_ this
1727 // dft was scattered, or we may still be be waiting on the
1728 // notify from the auth)
1729 dirfragtree.swap(temp);
1730 for (const auto &p : dirfrags) {
1731 if (!dirfragtree.is_leaf(p.first)) {
1732 dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl;
1733 dirfragtree.force_to_leaf(g_ceph_context, p.first);
1734 }
1735 if (p.second->is_auth())
1736 p.second->state_clear(CDir::STATE_DIRTYDFT);
1737 }
7c673cae 1738 }
9f95a23c
TL
1739 if (g_conf()->mds_debug_frag)
1740 verify_dirfrags();
1741 }
1742 DECODE_FINISH(p);
1743}
1744
1745void CInode::encode_lock_ifile(bufferlist& bl)
1746{
1747 ENCODE_START(1, 1, bl);
1748 if (is_auth()) {
1749 encode(inode.version, bl);
1750 encode(inode.ctime, bl);
1751 encode(inode.mtime, bl);
1752 encode(inode.atime, bl);
1753 encode(inode.time_warp_seq, bl);
1754 if (!is_dir()) {
1755 encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features());
1756 encode(inode.size, bl);
1757 encode(inode.truncate_seq, bl);
1758 encode(inode.truncate_size, bl);
1759 encode(inode.client_ranges, bl);
1760 encode(inode.inline_data, bl);
1761 }
1762 } else {
1763 // treat flushing as dirty when rejoining cache
1764 bool dirty = filelock.is_dirty_or_flushing();
1765 encode(dirty, bl);
1766 }
1767 dout(15) << __func__ << " inode.dirstat is " << inode.dirstat << dendl;
1768 encode(inode.dirstat, bl); // only meaningful if i am auth.
1769 bufferlist tmp;
1770 __u32 n = 0;
1771 for (const auto &p : dirfrags) {
1772 frag_t fg = p.first;
1773 CDir *dir = p.second;
1774 if (is_auth() || dir->is_auth()) {
1775 fnode_t *pf = dir->get_projected_fnode();
1776 dout(15) << fg << " " << *dir << dendl;
1777 dout(20) << fg << " fragstat " << pf->fragstat << dendl;
1778 dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
1779 encode(fg, tmp);
1780 encode(dir->first, tmp);
1781 encode(pf->fragstat, tmp);
1782 encode(pf->accounted_fragstat, tmp);
1783 n++;
7c673cae 1784 }
9f95a23c
TL
1785 }
1786 encode(n, bl);
1787 bl.claim_append(tmp);
1788 ENCODE_FINISH(bl);
1789}
1790
1791void CInode::decode_lock_ifile(bufferlist::const_iterator& p)
1792{
1793 DECODE_START(1, p);
1794 if (!is_auth()) {
1795 decode(inode.version, p);
1796 utime_t tm;
1797 decode(tm, p);
1798 if (inode.ctime < tm) inode.ctime = tm;
1799 decode(inode.mtime, p);
1800 decode(inode.atime, p);
1801 decode(inode.time_warp_seq, p);
1802 if (!is_dir()) {
1803 decode(inode.layout, p);
1804 decode(inode.size, p);
1805 decode(inode.truncate_seq, p);
1806 decode(inode.truncate_size, p);
1807 decode(inode.client_ranges, p);
1808 decode(inode.inline_data, p);
1809 }
1810 } else {
1811 bool replica_dirty;
1812 decode(replica_dirty, p);
1813 if (replica_dirty) {
1814 dout(10) << __func__ << " setting filelock dirty flag" << dendl;
1815 filelock.mark_dirty(); // ok bc we're auth and caller will handle
1816 }
1817 }
1818
1819 frag_info_t dirstat;
1820 decode(dirstat, p);
1821 if (!is_auth()) {
1822 dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl;
1823 inode.dirstat = dirstat; // take inode summation if replica
1824 }
1825 __u32 n;
1826 decode(n, p);
1827 dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
1828 while (n--) {
1829 frag_t fg;
1830 snapid_t fgfirst;
1831 frag_info_t fragstat;
1832 frag_info_t accounted_fragstat;
1833 decode(fg, p);
1834 decode(fgfirst, p);
1835 decode(fragstat, p);
1836 decode(accounted_fragstat, p);
1837 dout(10) << fg << " [" << fgfirst << ",head] " << dendl;
1838 dout(10) << fg << " fragstat " << fragstat << dendl;
1839 dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl;
1840
1841 CDir *dir = get_dirfrag(fg);
7c673cae 1842 if (is_auth()) {
9f95a23c
TL
1843 ceph_assert(dir); // i am auth; i had better have this dir open
1844 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1845 << " on " << *dir << dendl;
1846 dir->first = fgfirst;
1847 dir->fnode.fragstat = fragstat;
1848 dir->fnode.accounted_fragstat = accounted_fragstat;
1849 if (!(fragstat == accounted_fragstat)) {
1850 dout(10) << fg << " setting filelock updated flag" << dendl;
1851 filelock.mark_dirty(); // ok bc we're auth and caller will handle
7c673cae
FG
1852 }
1853 } else {
9f95a23c
TL
1854 if (dir && dir->is_auth()) {
1855 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1856 << " on " << *dir << dendl;
1857 dir->first = fgfirst;
1858 fnode_t *pf = dir->get_projected_fnode();
1859 finish_scatter_update(&filelock, dir,
1860 inode.dirstat.version, pf->accounted_fragstat.version);
1861 }
7c673cae 1862 }
9f95a23c
TL
1863 }
1864 DECODE_FINISH(p);
1865}
7c673cae 1866
9f95a23c
TL
1867void CInode::encode_lock_inest(bufferlist& bl)
1868{
1869 ENCODE_START(1, 1, bl);
1870 if (is_auth()) {
1871 encode(inode.version, bl);
1872 } else {
1873 // treat flushing as dirty when rejoining cache
1874 bool dirty = nestlock.is_dirty_or_flushing();
1875 encode(dirty, bl);
1876 }
1877 dout(15) << __func__ << " inode.rstat is " << inode.rstat << dendl;
1878 encode(inode.rstat, bl); // only meaningful if i am auth.
1879 bufferlist tmp;
1880 __u32 n = 0;
1881 for (const auto &p : dirfrags) {
1882 frag_t fg = p.first;
1883 CDir *dir = p.second;
1884 if (is_auth() || dir->is_auth()) {
1885 fnode_t *pf = dir->get_projected_fnode();
1886 dout(10) << __func__ << " " << fg << " dir " << *dir << dendl;
1887 dout(10) << __func__ << " " << fg << " rstat " << pf->rstat << dendl;
1888 dout(10) << __func__ << " " << fg << " accounted_rstat " << pf->rstat << dendl;
1889 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
1890 encode(fg, tmp);
1891 encode(dir->first, tmp);
1892 encode(pf->rstat, tmp);
1893 encode(pf->accounted_rstat, tmp);
1894 encode(dir->dirty_old_rstat, tmp);
1895 n++;
7c673cae 1896 }
9f95a23c
TL
1897 }
1898 encode(n, bl);
1899 bl.claim_append(tmp);
1900 ENCODE_FINISH(bl);
1901}
7c673cae 1902
9f95a23c
TL
1903void CInode::decode_lock_inest(bufferlist::const_iterator& p)
1904{
1905 DECODE_START(1, p);
1906 if (is_auth()) {
1907 bool replica_dirty;
1908 decode(replica_dirty, p);
1909 if (replica_dirty) {
1910 dout(10) << __func__ << " setting nestlock dirty flag" << dendl;
1911 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
1912 }
1913 } else {
1914 decode(inode.version, p);
1915 }
1916 nest_info_t rstat;
1917 decode(rstat, p);
1918 if (!is_auth()) {
1919 dout(10) << __func__ << " taking inode rstat " << rstat << " for " << *this << dendl;
1920 inode.rstat = rstat; // take inode summation if replica
1921 }
1922 __u32 n;
1923 decode(n, p);
1924 while (n--) {
1925 frag_t fg;
1926 snapid_t fgfirst;
1927 nest_info_t rstat;
1928 nest_info_t accounted_rstat;
1929 decltype(CDir::dirty_old_rstat) dirty_old_rstat;
1930 decode(fg, p);
1931 decode(fgfirst, p);
1932 decode(rstat, p);
1933 decode(accounted_rstat, p);
1934 decode(dirty_old_rstat, p);
1935 dout(10) << __func__ << " " << fg << " [" << fgfirst << ",head]" << dendl;
1936 dout(10) << __func__ << " " << fg << " rstat " << rstat << dendl;
1937 dout(10) << __func__ << " " << fg << " accounted_rstat " << accounted_rstat << dendl;
1938 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dirty_old_rstat << dendl;
1939 CDir *dir = get_dirfrag(fg);
7c673cae 1940 if (is_auth()) {
9f95a23c
TL
1941 ceph_assert(dir); // i am auth; i had better have this dir open
1942 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1943 << " on " << *dir << dendl;
1944 dir->first = fgfirst;
1945 dir->fnode.rstat = rstat;
1946 dir->fnode.accounted_rstat = accounted_rstat;
1947 dir->dirty_old_rstat.swap(dirty_old_rstat);
1948 if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) {
1949 dout(10) << fg << " setting nestlock updated flag" << dendl;
1950 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
1951 }
7c673cae 1952 } else {
9f95a23c
TL
1953 if (dir && dir->is_auth()) {
1954 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1955 << " on " << *dir << dendl;
1956 dir->first = fgfirst;
1957 fnode_t *pf = dir->get_projected_fnode();
1958 finish_scatter_update(&nestlock, dir,
1959 inode.rstat.version, pf->accounted_rstat.version);
7c673cae 1960 }
7c673cae 1961 }
9f95a23c
TL
1962 }
1963 DECODE_FINISH(p);
1964}
1965
1966void CInode::encode_lock_ixattr(bufferlist& bl)
1967{
1968 ENCODE_START(1, 1, bl);
1969 encode(inode.version, bl);
1970 encode(inode.ctime, bl);
1971 encode(xattrs, bl);
1972 ENCODE_FINISH(bl);
1973}
1974
1975void CInode::decode_lock_ixattr(bufferlist::const_iterator& p)
1976{
1977 DECODE_START(1, p);
1978 decode(inode.version, p);
1979 utime_t tm;
1980 decode(tm, p);
1981 if (inode.ctime < tm) inode.ctime = tm;
e306af50 1982 decode_noshare(xattrs, p);
9f95a23c
TL
1983 DECODE_FINISH(p);
1984}
1985
1986void CInode::encode_lock_isnap(bufferlist& bl)
1987{
1988 ENCODE_START(1, 1, bl);
1989 encode(inode.version, bl);
1990 encode(inode.ctime, bl);
1991 encode_snap(bl);
1992 ENCODE_FINISH(bl);
1993}
1994
1995void CInode::decode_lock_isnap(bufferlist::const_iterator& p)
1996{
1997 DECODE_START(1, p);
1998 decode(inode.version, p);
1999 utime_t tm;
2000 decode(tm, p);
2001 if (inode.ctime < tm) inode.ctime = tm;
2002 decode_snap(p);
2003 DECODE_FINISH(p);
2004}
2005
2006void CInode::encode_lock_iflock(bufferlist& bl)
2007{
2008 ENCODE_START(1, 1, bl);
2009 encode(inode.version, bl);
2010 _encode_file_locks(bl);
2011 ENCODE_FINISH(bl);
2012}
2013
2014void CInode::decode_lock_iflock(bufferlist::const_iterator& p)
2015{
2016 DECODE_START(1, p);
2017 decode(inode.version, p);
2018 _decode_file_locks(p);
2019 DECODE_FINISH(p);
2020}
2021
2022void CInode::encode_lock_ipolicy(bufferlist& bl)
2023{
f6b5b4d7 2024 ENCODE_START(2, 1, bl);
9f95a23c
TL
2025 if (inode.is_dir()) {
2026 encode(inode.version, bl);
2027 encode(inode.ctime, bl);
2028 encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features());
2029 encode(inode.quota, bl);
2030 encode(inode.export_pin, bl);
f6b5b4d7
TL
2031 encode(inode.export_ephemeral_distributed_pin, bl);
2032 encode(inode.export_ephemeral_random_pin, bl);
9f95a23c
TL
2033 }
2034 ENCODE_FINISH(bl);
2035}
2036
2037void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p)
2038{
f6b5b4d7 2039 DECODE_START(2, p);
9f95a23c
TL
2040 if (inode.is_dir()) {
2041 decode(inode.version, p);
2042 utime_t tm;
2043 decode(tm, p);
2044 if (inode.ctime < tm) inode.ctime = tm;
2045 decode(inode.layout, p);
2046 decode(inode.quota, p);
f6b5b4d7
TL
2047 {
2048 mds_rank_t old_pin = inode.export_pin;
2049 decode(inode.export_pin, p);
2050 maybe_export_pin(old_pin != inode.export_pin);
2051 }
2052 if (struct_v >= 2) {
2053 {
2054 bool old_ephemeral_pin = inode.export_ephemeral_distributed_pin;
2055 decode(inode.export_ephemeral_distributed_pin, p);
2056 maybe_ephemeral_dist_children(old_ephemeral_pin != inode.export_ephemeral_distributed_pin);
2057 }
2058 decode(inode.export_ephemeral_random_pin, p);
2059 }
9f95a23c
TL
2060 }
2061 DECODE_FINISH(p);
2062}
2063
2064void CInode::encode_lock_state(int type, bufferlist& bl)
2065{
2066 ENCODE_START(1, 1, bl);
2067 encode(first, bl);
2068 if (!is_base())
2069 encode(parent->first, bl);
2070
2071 switch (type) {
2072 case CEPH_LOCK_IAUTH:
2073 encode_lock_iauth(bl);
2074 break;
2075
2076 case CEPH_LOCK_ILINK:
2077 encode_lock_ilink(bl);
2078 break;
2079
2080 case CEPH_LOCK_IDFT:
2081 encode_lock_idft(bl);
2082 break;
2083
2084 case CEPH_LOCK_IFILE:
2085 encode_lock_ifile(bl);
2086 break;
2087
2088 case CEPH_LOCK_INEST:
2089 encode_lock_inest(bl);
7c673cae
FG
2090 break;
2091
2092 case CEPH_LOCK_IXATTR:
9f95a23c 2093 encode_lock_ixattr(bl);
7c673cae
FG
2094 break;
2095
2096 case CEPH_LOCK_ISNAP:
9f95a23c 2097 encode_lock_isnap(bl);
7c673cae
FG
2098 break;
2099
2100 case CEPH_LOCK_IFLOCK:
9f95a23c 2101 encode_lock_iflock(bl);
7c673cae
FG
2102 break;
2103
2104 case CEPH_LOCK_IPOLICY:
9f95a23c 2105 encode_lock_ipolicy(bl);
7c673cae
FG
2106 break;
2107
2108 default:
2109 ceph_abort();
2110 }
9f95a23c 2111 ENCODE_FINISH(bl);
7c673cae
FG
2112}
2113
7c673cae
FG
2114/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2115
11fdf7f2 2116void CInode::decode_lock_state(int type, const bufferlist& bl)
7c673cae 2117{
11fdf7f2 2118 auto p = bl.cbegin();
9f95a23c
TL
2119
2120 DECODE_START(1, p);
7c673cae
FG
2121 utime_t tm;
2122
2123 snapid_t newfirst;
11fdf7f2
TL
2124 using ceph::decode;
2125 decode(newfirst, p);
7c673cae 2126 if (!is_auth() && newfirst != first) {
11fdf7f2
TL
2127 dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl;
2128 first = newfirst;
2129 }
2130 if (!is_base()) {
2131 decode(newfirst, p);
2132 if (!parent->is_auth() && newfirst != parent->first) {
2133 dout(10) << __func__ << " parent first " << first << " -> " << newfirst << dendl;
7c673cae
FG
2134 parent->first = newfirst;
2135 }
7c673cae
FG
2136 }
2137
2138 switch (type) {
2139 case CEPH_LOCK_IAUTH:
9f95a23c 2140 decode_lock_iauth(p);
7c673cae
FG
2141 break;
2142
2143 case CEPH_LOCK_ILINK:
9f95a23c 2144 decode_lock_ilink(p);
7c673cae
FG
2145 break;
2146
2147 case CEPH_LOCK_IDFT:
9f95a23c 2148 decode_lock_idft(p);
7c673cae
FG
2149 break;
2150
2151 case CEPH_LOCK_IFILE:
9f95a23c 2152 decode_lock_ifile(p);
7c673cae
FG
2153 break;
2154
2155 case CEPH_LOCK_INEST:
9f95a23c 2156 decode_lock_inest(p);
7c673cae
FG
2157 break;
2158
2159 case CEPH_LOCK_IXATTR:
9f95a23c 2160 decode_lock_ixattr(p);
7c673cae
FG
2161 break;
2162
2163 case CEPH_LOCK_ISNAP:
9f95a23c 2164 decode_lock_isnap(p);
7c673cae
FG
2165 break;
2166
2167 case CEPH_LOCK_IFLOCK:
9f95a23c 2168 decode_lock_iflock(p);
7c673cae
FG
2169 break;
2170
2171 case CEPH_LOCK_IPOLICY:
9f95a23c 2172 decode_lock_ipolicy(p);
7c673cae
FG
2173 break;
2174
2175 default:
2176 ceph_abort();
2177 }
9f95a23c 2178 DECODE_FINISH(p);
7c673cae
FG
2179}
2180
2181
2182bool CInode::is_dirty_scattered()
2183{
2184 return
2185 filelock.is_dirty_or_flushing() ||
2186 nestlock.is_dirty_or_flushing() ||
2187 dirfragtreelock.is_dirty_or_flushing();
2188}
2189
2190void CInode::clear_scatter_dirty()
2191{
2192 filelock.remove_dirty();
2193 nestlock.remove_dirty();
2194 dirfragtreelock.remove_dirty();
2195}
2196
2197void CInode::clear_dirty_scattered(int type)
2198{
11fdf7f2
TL
2199 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2200 ceph_assert(is_dir());
7c673cae
FG
2201 switch (type) {
2202 case CEPH_LOCK_IFILE:
2203 item_dirty_dirfrag_dir.remove_myself();
2204 break;
2205
2206 case CEPH_LOCK_INEST:
2207 item_dirty_dirfrag_nest.remove_myself();
2208 break;
2209
2210 case CEPH_LOCK_IDFT:
2211 item_dirty_dirfrag_dirfragtree.remove_myself();
2212 break;
2213
2214 default:
2215 ceph_abort();
2216 }
2217}
2218
2219
2220/*
2221 * when we initially scatter a lock, we need to check if any of the dirfrags
2222 * have out of date accounted_rstat/fragstat. if so, mark the lock stale.
2223 */
2224/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2225void CInode::start_scatter(ScatterLock *lock)
2226{
11fdf7f2
TL
2227 dout(10) << __func__ << " " << *lock << " on " << *this << dendl;
2228 ceph_assert(is_auth());
94b18763 2229 mempool_inode *pi = get_projected_inode();
7c673cae 2230
94b18763
FG
2231 for (const auto &p : dirfrags) {
2232 frag_t fg = p.first;
2233 CDir *dir = p.second;
7c673cae
FG
2234 fnode_t *pf = dir->get_projected_fnode();
2235 dout(20) << fg << " " << *dir << dendl;
2236
2237 if (!dir->is_auth())
2238 continue;
2239
2240 switch (lock->get_type()) {
2241 case CEPH_LOCK_IFILE:
2242 finish_scatter_update(lock, dir, pi->dirstat.version, pf->accounted_fragstat.version);
2243 break;
2244
2245 case CEPH_LOCK_INEST:
2246 finish_scatter_update(lock, dir, pi->rstat.version, pf->accounted_rstat.version);
2247 break;
2248
2249 case CEPH_LOCK_IDFT:
2250 dir->state_clear(CDir::STATE_DIRTYDFT);
2251 break;
2252 }
2253 }
2254}
2255
2256
2257class C_Inode_FragUpdate : public MDSLogContextBase {
2258protected:
2259 CInode *in;
2260 CDir *dir;
2261 MutationRef mut;
2262 MDSRank *get_mds() override {return in->mdcache->mds;}
2263 void finish(int r) override {
2264 in->_finish_frag_update(dir, mut);
2265 }
2266
2267public:
2268 C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {}
2269};
2270
2271void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
2272 version_t inode_version, version_t dir_accounted_version)
2273{
2274 frag_t fg = dir->get_frag();
11fdf7f2 2275 ceph_assert(dir->is_auth());
7c673cae
FG
2276
2277 if (dir->is_frozen()) {
11fdf7f2 2278 dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl;
7c673cae 2279 } else if (dir->get_version() == 0) {
11fdf7f2 2280 dout(10) << __func__ << " " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl;
7c673cae
FG
2281 } else {
2282 if (dir_accounted_version != inode_version) {
11fdf7f2 2283 dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;
7c673cae
FG
2284
2285 MDLog *mdlog = mdcache->mds->mdlog;
2286 MutationRef mut(new MutationImpl());
2287 mut->ls = mdlog->get_current_segment();
2288
94b18763 2289 mempool_inode *pi = get_projected_inode();
7c673cae 2290 fnode_t *pf = dir->project_fnode();
7c673cae 2291
9f95a23c 2292 std::string_view ename;
7c673cae
FG
2293 switch (lock->get_type()) {
2294 case CEPH_LOCK_IFILE:
2295 pf->fragstat.version = pi->dirstat.version;
2296 pf->accounted_fragstat = pf->fragstat;
2297 ename = "lock ifile accounted scatter stat update";
2298 break;
2299 case CEPH_LOCK_INEST:
2300 pf->rstat.version = pi->rstat.version;
2301 pf->accounted_rstat = pf->rstat;
2302 ename = "lock inest accounted scatter stat update";
c07f9fc5
FG
2303
2304 if (!is_auth() && lock->get_state() == LOCK_MIX) {
11fdf7f2 2305 dout(10) << __func__ << " try to assimilate dirty rstat on "
c07f9fc5
FG
2306 << *dir << dendl;
2307 dir->assimilate_dirty_rstat_inodes();
2308 }
2309
7c673cae
FG
2310 break;
2311 default:
2312 ceph_abort();
2313 }
2314
c07f9fc5 2315 pf->version = dir->pre_dirty();
7c673cae
FG
2316 mut->add_projected_fnode(dir);
2317
2318 EUpdate *le = new EUpdate(mdlog, ename);
2319 mdlog->start_entry(le);
2320 le->metablob.add_dir_context(dir);
2321 le->metablob.add_dir(dir, true);
2322
11fdf7f2 2323 ceph_assert(!dir->is_frozen());
7c673cae 2324 mut->auth_pin(dir);
c07f9fc5
FG
2325
2326 if (lock->get_type() == CEPH_LOCK_INEST &&
2327 !is_auth() && lock->get_state() == LOCK_MIX) {
11fdf7f2 2328 dout(10) << __func__ << " finish assimilating dirty rstat on "
c07f9fc5
FG
2329 << *dir << dendl;
2330 dir->assimilate_dirty_rstat_inodes_finish(mut, &le->metablob);
2331
2332 if (!(pf->rstat == pf->accounted_rstat)) {
11fdf7f2 2333 if (!mut->is_wrlocked(&nestlock)) {
c07f9fc5
FG
2334 mdcache->mds->locker->wrlock_force(&nestlock, mut);
2335 }
2336
2337 mdcache->mds->locker->mark_updated_scatterlock(&nestlock);
2338 mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest);
2339 }
2340 }
7c673cae
FG
2341
2342 mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut));
2343 } else {
11fdf7f2 2344 dout(10) << __func__ << " " << fg << " accounted " << *lock
7c673cae
FG
2345 << " scatter stat unchanged at v" << dir_accounted_version << dendl;
2346 }
2347 }
2348}
2349
2350void CInode::_finish_frag_update(CDir *dir, MutationRef& mut)
2351{
11fdf7f2 2352 dout(10) << __func__ << " on " << *dir << dendl;
7c673cae 2353 mut->apply();
c07f9fc5 2354 mdcache->mds->locker->drop_locks(mut.get());
7c673cae
FG
2355 mut->cleanup();
2356}
2357
2358
2359/*
2360 * when we gather a lock, we need to assimilate dirfrag changes into the inode
2361 * state. it's possible we can't update the dirfrag accounted_rstat/fragstat
2362 * because the frag is auth and frozen, or that the replica couldn't for the same
2363 * reason. hopefully it will get updated the next time the lock cycles.
2364 *
2365 * we have two dimensions of behavior:
2366 * - we may be (auth and !frozen), and able to update, or not.
2367 * - the frag may be stale, or not.
2368 *
2369 * if the frag is non-stale, we want to assimilate the diff into the
2370 * inode, regardless of whether it's auth or updateable.
2371 *
2372 * if we update the frag, we want to set accounted_fragstat = frag,
2373 * both if we took the diff or it was stale and we are making it
2374 * un-stale.
2375 */
2376/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
// Fold per-dirfrag scatter state into the projected inode after a
// scatterlock of the given type has been gathered.
//
// type: CEPH_LOCK_IFILE (dirstat/fragstat), CEPH_LOCK_INEST (rstat) or
//       CEPH_LOCK_IDFT (nothing to do); any other value aborts.
// Must be called on the auth inode.
2377void CInode::finish_scatter_gather_update(int type)
2378{
2379 LogChannelRef clog = mdcache->mds->clog;
2380
11fdf7f2
TL
2381 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2382 ceph_assert(is_auth());
7c673cae
FG
2383
2384 switch (type) {
2385 case CEPH_LOCK_IFILE:
2386 {
// tmpdft: copy of the dirfragtree in which every frag we hold is forced to a leaf
// dirstat: independent sum of all dirfrag fragstats, compared with the inode below
2387 fragtree_t tmpdft = dirfragtree;
2388 struct frag_info_t dirstat;
2389 bool dirstat_valid = true;
2390
2391 // adjust summation
11fdf7f2 2392 ceph_assert(is_auth());
94b18763 2393 mempool_inode *pi = get_projected_inode();
7c673cae
FG
2394
2395 bool touched_mtime = false, touched_chattr = false;
2396 dout(20) << " orig dirstat " << pi->dirstat << dendl;
// bump the inode's dirstat version; frags whose accounted version equals the
// previous value are the non-stale ones
2397 pi->dirstat.version++;
94b18763
FG
2398 for (const auto &p : dirfrags) {
2399 frag_t fg = p.first;
2400 CDir *dir = p.second;
7c673cae
FG
2401 dout(20) << fg << " " << *dir << dendl;
2402
// a frag is updated only if it has a non-zero version, is auth and is not frozen;
// a frag with version 0 also makes the summed dirstat unusable
2403 bool update;
2404 if (dir->get_version() != 0) {
2405 update = dir->is_auth() && !dir->is_frozen();
2406 } else {
2407 update = false;
2408 dirstat_valid = false;
2409 }
2410
2411 fnode_t *pf = dir->get_projected_fnode();
2412 if (update)
2413 pf = dir->project_fnode();
2414
2415 if (pf->accounted_fragstat.version == pi->dirstat.version - 1) {
2416 dout(20) << fg << " fragstat " << pf->fragstat << dendl;
2417 dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
2418 pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2419 } else {
2420 dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl;
2421 }
2422
2423 if (pf->fragstat.nfiles < 0 ||
2424 pf->fragstat.nsubdirs < 0) {
2425 clog->error() << "bad/negative dir size on "
2426 << dir->dirfrag() << " " << pf->fragstat;
// !"literal" is always false, so this assertion fails exactly when
// mds_verify_scatter is enabled; otherwise the counts are clamped to 0 below
11fdf7f2 2427 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
7c673cae
FG
2428
2429 if (pf->fragstat.nfiles < 0)
2430 pf->fragstat.nfiles = 0;
2431 if (pf->fragstat.nsubdirs < 0)
2432 pf->fragstat.nsubdirs = 0;
2433 }
2434
2435 if (update) {
2436 pf->accounted_fragstat = pf->fragstat;
2437 pf->fragstat.version = pf->accounted_fragstat.version = pi->dirstat.version;
2438 dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl;
2439 }
2440
2441 tmpdft.force_to_leaf(g_ceph_context, fg);
2442 dirstat.add(pf->fragstat);
2443 }
2444 if (touched_mtime)
2445 pi->mtime = pi->ctime = pi->dirstat.mtime;
2446 if (touched_chattr)
2447 pi->change_attr = pi->dirstat.change_attr;
2448 dout(20) << " final dirstat " << pi->dirstat << dendl;
2449
// the dirfrag sum is only trusted if every leaf of tmpdft is a dirfrag we hold
2450 if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) {
11fdf7f2
TL
2451 frag_vec_t leaves;
2452 tmpdft.get_leaves_under(frag_t(), leaves);
2453 for (const auto& leaf : leaves) {
2454 if (!dirfrags.count(leaf)) {
7c673cae
FG
2455 dirstat_valid = false;
2456 break;
2457 }
11fdf7f2 2458 }
7c673cae
FG
2459 if (dirstat_valid) {
2460 if (state_test(CInode::STATE_REPAIRSTATS)) {
2461 dout(20) << " dirstat mismatch, fixing" << dendl;
2462 } else {
2463 clog->error() << "unmatched fragstat on " << ino() << ", inode has "
2464 << pi->dirstat << ", dirfrags have " << dirstat;
11fdf7f2 2465 ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter);
7c673cae
FG
2466 }
2467 // trust the dirfrags for now
// keep the inode's version, and keep its mtime/change_attr when they are newer
2468 version_t v = pi->dirstat.version;
2469 if (pi->dirstat.mtime > dirstat.mtime)
2470 dirstat.mtime = pi->dirstat.mtime;
2471 if (pi->dirstat.change_attr > dirstat.change_attr)
2472 dirstat.change_attr = pi->dirstat.change_attr;
2473 pi->dirstat = dirstat;
2474 pi->dirstat.version = v;
2475 }
2476 }
2477
d2e6a577
FG
2478 if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0)
2479 {
2480 std::string path;
2481 make_path_string(path);
2482 clog->error() << "Inconsistent statistics detected: fragstat on inode "
2483 << ino() << " (" << path << "), inode has " << pi->dirstat;
11fdf7f2 2484 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
7c673cae
FG
2485
2486 if (pi->dirstat.nfiles < 0)
2487 pi->dirstat.nfiles = 0;
2488 if (pi->dirstat.nsubdirs < 0)
2489 pi->dirstat.nsubdirs = 0;
2490 }
2491 }
2492 break;
2493
2494 case CEPH_LOCK_INEST:
2495 {
11fdf7f2
TL
2496 // adjust summation
2497 ceph_assert(is_auth());
2498
7c673cae
FG
2499 fragtree_t tmpdft = dirfragtree;
2500 nest_info_t rstat;
7c673cae
FG
2501 bool rstat_valid = true;
2502
// the independent sum starts with rsubdirs = 1 and, if a projected srnode
// exists, the number of snaps it holds
11fdf7f2
TL
2503 rstat.rsubdirs = 1;
2504 if (const sr_t *srnode = get_projected_srnode(); srnode)
2505 rstat.rsnaps = srnode->snaps.size();
2506
94b18763 2507 mempool_inode *pi = get_projected_inode();
7c673cae
FG
2508 dout(20) << " orig rstat " << pi->rstat << dendl;
2509 pi->rstat.version++;
94b18763
FG
2510 for (const auto &p : dirfrags) {
2511 frag_t fg = p.first;
2512 CDir *dir = p.second;
7c673cae
FG
2513 dout(20) << fg << " " << *dir << dendl;
2514
2515 bool update;
2516 if (dir->get_version() != 0) {
2517 update = dir->is_auth() && !dir->is_frozen();
2518 } else {
2519 update = false;
2520 rstat_valid = false;
2521 }
2522
2523 fnode_t *pf = dir->get_projected_fnode();
2524 if (update)
2525 pf = dir->project_fnode();
2526
2527 if (pf->accounted_rstat.version == pi->rstat.version-1) {
2528 // only pull this frag's dirty rstat inodes into the frag if
2529 // the frag is non-stale and updateable. if it's stale,
2530 // that info will just get thrown out!
2531 if (update)
2532 dir->assimilate_dirty_rstat_inodes();
2533
2534 dout(20) << fg << " rstat " << pf->rstat << dendl;
2535 dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
2536 dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
// project the head rstat first, then every entry of dirty_old_rstat
2537 mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
2538 dir->first, CEPH_NOSNAP, this, true);
94b18763
FG
2539 for (auto &p : dir->dirty_old_rstat) {
2540 mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat,
2541 p.second.first, p.first, this, true);
2542 }
7c673cae
FG
2543 if (update) // dir contents not valid if frozen or non-auth
2544 dir->check_rstats();
2545 } else {
2546 dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl;
2547 }
2548 if (update) {
2549 pf->accounted_rstat = pf->rstat;
2550 dir->dirty_old_rstat.clear();
2551 pf->rstat.version = pf->accounted_rstat.version = pi->rstat.version;
2552 dir->check_rstats();
2553 dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl;
2554 }
2555
2556 tmpdft.force_to_leaf(g_ceph_context, fg);
2557 rstat.add(pf->rstat);
2558 }
2559 dout(20) << " final rstat " << pi->rstat << dendl;
2560
// same cross-check as for dirstat: the sum is only trusted when every leaf is present
2561 if (rstat_valid && !rstat.same_sums(pi->rstat)) {
11fdf7f2
TL
2562 frag_vec_t leaves;
2563 tmpdft.get_leaves_under(frag_t(), leaves);
2564 for (const auto& leaf : leaves) {
2565 if (!dirfrags.count(leaf)) {
7c673cae
FG
2566 rstat_valid = false;
2567 break;
2568 }
11fdf7f2 2569 }
7c673cae
FG
2570 if (rstat_valid) {
2571 if (state_test(CInode::STATE_REPAIRSTATS)) {
2572 dout(20) << " rstat mismatch, fixing" << dendl;
2573 } else {
d2e6a577
FG
2574 clog->error() << "inconsistent rstat on inode " << ino()
2575 << ", inode has " << pi->rstat
2576 << ", directory fragments have " << rstat;
11fdf7f2 2577 ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter);
7c673cae
FG
2578 }
2579 // trust the dirfrag for now
2580 version_t v = pi->rstat.version;
2581 if (pi->rstat.rctime > rstat.rctime)
2582 rstat.rctime = pi->rstat.rctime;
2583 pi->rstat = rstat;
2584 pi->rstat.version = v;
2585 }
2586 }
2587
2588 mdcache->broadcast_quota_to_client(this);
2589 }
2590 break;
2591
2592 case CEPH_LOCK_IDFT:
2593 break;
2594
2595 default:
2596 ceph_abort();
2597 }
2598}
2599
2600void CInode::finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob)
2601{
11fdf7f2
TL
2602 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2603 ceph_assert(is_auth());
7c673cae 2604
94b18763
FG
2605 for (const auto &p : dirfrags) {
2606 CDir *dir = p.second;
7c673cae
FG
2607 if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen())
2608 continue;
2609
2610 if (type == CEPH_LOCK_IDFT)
2611 continue; // nothing to do.
2612
2613 dout(10) << " journaling updated frag accounted_ on " << *dir << dendl;
11fdf7f2 2614 ceph_assert(dir->is_projected());
7c673cae
FG
2615 fnode_t *pf = dir->get_projected_fnode();
2616 pf->version = dir->pre_dirty();
2617 mut->add_projected_fnode(dir);
2618 metablob->add_dir(dir, true);
2619 mut->auth_pin(dir);
2620
2621 if (type == CEPH_LOCK_INEST)
2622 dir->assimilate_dirty_rstat_inodes_finish(mut, metablob);
2623 }
2624}
2625
2626// waiting
2627
2628bool CInode::is_frozen() const
2629{
2630 if (is_frozen_inode()) return true;
2631 if (parent && parent->dir->is_frozen()) return true;
2632 return false;
2633}
2634
2635bool CInode::is_frozen_dir() const
2636{
2637 if (parent && parent->dir->is_frozen_dir()) return true;
2638 return false;
2639}
2640
2641bool CInode::is_freezing() const
2642{
2643 if (is_freezing_inode()) return true;
2644 if (parent && parent->dir->is_freezing()) return true;
2645 return false;
2646}
2647
11fdf7f2 2648void CInode::add_dir_waiter(frag_t fg, MDSContext *c)
7c673cae
FG
2649{
2650 if (waiting_on_dir.empty())
2651 get(PIN_DIRWAITER);
2652 waiting_on_dir[fg].push_back(c);
11fdf7f2 2653 dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl;
7c673cae
FG
2654}
2655
11fdf7f2 2656void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls)
7c673cae
FG
2657{
2658 if (waiting_on_dir.empty())
2659 return;
2660
94b18763
FG
2661 auto it = waiting_on_dir.find(fg);
2662 if (it != waiting_on_dir.end()) {
2663 dout(10) << __func__ << " frag " << fg << " on " << *this << dendl;
11fdf7f2
TL
2664 auto& waiting = it->second;
2665 ls.insert(ls.end(), waiting.begin(), waiting.end());
94b18763 2666 waiting_on_dir.erase(it);
7c673cae
FG
2667
2668 if (waiting_on_dir.empty())
2669 put(PIN_DIRWAITER);
2670 }
2671}
2672
11fdf7f2 2673void CInode::add_waiter(uint64_t tag, MDSContext *c)
7c673cae 2674{
11fdf7f2 2675 dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c
7c673cae
FG
2676 << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH)
2677 << " !frozen " << !is_frozen_inode()
2678 << " !freezing " << !is_freezing_inode()
2679 << dendl;
2680 // wait on the directory?
2681 // make sure its not the inode that is explicitly ambiguous|freezing|frozen
2682 if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) ||
2683 ((tag & WAIT_UNFREEZE) &&
2684 !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
2685 dout(15) << "passing waiter up tree" << dendl;
2686 parent->dir->add_waiter(tag, c);
2687 return;
2688 }
2689 dout(15) << "taking waiter here" << dendl;
2690 MDSCacheObject::add_waiter(tag, c);
2691}
2692
11fdf7f2 2693void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls)
7c673cae
FG
2694{
2695 if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) {
2696 // take all dentry waiters
2697 while (!waiting_on_dir.empty()) {
94b18763
FG
2698 auto it = waiting_on_dir.begin();
2699 dout(10) << __func__ << " dirfrag " << it->first << " on " << *this << dendl;
11fdf7f2
TL
2700 auto& waiting = it->second;
2701 ls.insert(ls.end(), waiting.begin(), waiting.end());
94b18763 2702 waiting_on_dir.erase(it);
7c673cae
FG
2703 }
2704 put(PIN_DIRWAITER);
2705 }
2706
2707 // waiting
2708 MDSCacheObject::take_waiting(mask, ls);
2709}
2710
9f95a23c
TL
2711void CInode::maybe_finish_freeze_inode()
2712{
2713 CDir *dir = get_parent_dir();
2714 if (auth_pins > auth_pin_freeze_allowance || dir->frozen_inode_suppressed)
2715 return;
2716
2717 dout(10) << "maybe_finish_freeze_inode - frozen" << dendl;
2718 ceph_assert(auth_pins == auth_pin_freeze_allowance);
2719 get(PIN_FROZEN);
2720 put(PIN_FREEZING);
2721 state_clear(STATE_FREEZING);
2722 state_set(STATE_FROZEN);
2723
2724 item_freezing_inode.remove_myself();
2725 dir->num_frozen_inodes++;
2726
2727 finish_waiting(WAIT_FROZEN);
2728}
2729
7c673cae
FG
2730bool CInode::freeze_inode(int auth_pin_allowance)
2731{
9f95a23c
TL
2732 CDir *dir = get_parent_dir();
2733 ceph_assert(dir);
2734
11fdf7f2
TL
2735 ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins
2736 ceph_assert(auth_pins >= auth_pin_allowance);
9f95a23c
TL
2737 if (auth_pins == auth_pin_allowance && !dir->frozen_inode_suppressed) {
2738 dout(10) << "freeze_inode - frozen" << dendl;
2739 if (!state_test(STATE_FROZEN)) {
2740 get(PIN_FROZEN);
2741 state_set(STATE_FROZEN);
2742 dir->num_frozen_inodes++;
2743 }
2744 return true;
7c673cae
FG
2745 }
2746
9f95a23c
TL
2747 dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl;
2748 auth_pin_freeze_allowance = auth_pin_allowance;
2749 dir->freezing_inodes.push_back(&item_freezing_inode);
2750
2751 get(PIN_FREEZING);
2752 state_set(STATE_FREEZING);
2753
2754 if (!dir->lock_caches_with_auth_pins.empty())
2755 mdcache->mds->locker->invalidate_lock_caches(dir);
2756
2757 const static int lock_types[] = {
2758 CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT,
2759 CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0
2760 };
2761 for (int i = 0; lock_types[i]; ++i) {
2762 auto lock = get_lock(lock_types[i]);
2763 if (lock->is_cached())
2764 mdcache->mds->locker->invalidate_lock_caches(lock);
7c673cae 2765 }
9f95a23c
TL
2766 // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
2767 // and finish freezing the inode
2768 return state_test(STATE_FROZEN);
7c673cae
FG
2769}
2770
11fdf7f2 2771void CInode::unfreeze_inode(MDSContext::vec& finished)
7c673cae 2772{
11fdf7f2 2773 dout(10) << __func__ << dendl;
7c673cae
FG
2774 if (state_test(STATE_FREEZING)) {
2775 state_clear(STATE_FREEZING);
2776 put(PIN_FREEZING);
9f95a23c 2777 item_freezing_inode.remove_myself();
7c673cae
FG
2778 } else if (state_test(STATE_FROZEN)) {
2779 state_clear(STATE_FROZEN);
2780 put(PIN_FROZEN);
9f95a23c 2781 get_parent_dir()->num_frozen_inodes--;
7c673cae
FG
2782 } else
2783 ceph_abort();
2784 take_waiting(WAIT_UNFREEZE, finished);
2785}
2786
2787void CInode::unfreeze_inode()
2788{
11fdf7f2 2789 MDSContext::vec finished;
7c673cae
FG
2790 unfreeze_inode(finished);
2791 mdcache->mds->queue_waiters(finished);
2792}
2793
2794void CInode::freeze_auth_pin()
2795{
11fdf7f2 2796 ceph_assert(state_test(CInode::STATE_FROZEN));
7c673cae 2797 state_set(CInode::STATE_FROZENAUTHPIN);
9f95a23c 2798 get_parent_dir()->num_frozen_inodes++;
7c673cae
FG
2799}
2800
2801void CInode::unfreeze_auth_pin()
2802{
11fdf7f2 2803 ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN));
7c673cae 2804 state_clear(CInode::STATE_FROZENAUTHPIN);
9f95a23c 2805 get_parent_dir()->num_frozen_inodes--;
7c673cae 2806 if (!state_test(STATE_FREEZING|STATE_FROZEN)) {
11fdf7f2 2807 MDSContext::vec finished;
7c673cae
FG
2808 take_waiting(WAIT_UNFREEZE, finished);
2809 mdcache->mds->queue_waiters(finished);
2810 }
2811}
2812
11fdf7f2 2813void CInode::clear_ambiguous_auth(MDSContext::vec& finished)
7c673cae 2814{
11fdf7f2 2815 ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH));
7c673cae
FG
2816 state_clear(CInode::STATE_AMBIGUOUSAUTH);
2817 take_waiting(CInode::WAIT_SINGLEAUTH, finished);
2818}
2819
2820void CInode::clear_ambiguous_auth()
2821{
11fdf7f2 2822 MDSContext::vec finished;
7c673cae
FG
2823 clear_ambiguous_auth(finished);
2824 mdcache->mds->queue_waiters(finished);
2825}
2826
2827// auth_pins
91327a77
AA
2828bool CInode::can_auth_pin(int *err_ret) const {
2829 int err;
2830 if (!is_auth()) {
2831 err = ERR_NOT_AUTH;
2832 } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
2833 err = ERR_EXPORTING_INODE;
2834 } else {
2835 if (parent)
2836 return parent->can_auth_pin(err_ret);
2837 err = 0;
2838 }
2839 if (err && err_ret)
2840 *err_ret = err;
2841 return !err;
7c673cae
FG
2842}
2843
2844void CInode::auth_pin(void *by)
2845{
2846 if (auth_pins == 0)
2847 get(PIN_AUTHPIN);
2848 auth_pins++;
2849
2850#ifdef MDS_AUTHPIN_SET
2851 auth_pin_set.insert(by);
2852#endif
2853
11fdf7f2 2854 dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;
7c673cae
FG
2855
2856 if (parent)
11fdf7f2 2857 parent->adjust_nested_auth_pins(1, this);
7c673cae
FG
2858}
2859
2860void CInode::auth_unpin(void *by)
2861{
2862 auth_pins--;
2863
2864#ifdef MDS_AUTHPIN_SET
11fdf7f2
TL
2865 {
2866 auto it = auth_pin_set.find(by);
2867 ceph_assert(it != auth_pin_set.end());
2868 auth_pin_set.erase(it);
2869 }
7c673cae
FG
2870#endif
2871
2872 if (auth_pins == 0)
2873 put(PIN_AUTHPIN);
2874
11fdf7f2 2875 dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;
7c673cae 2876
11fdf7f2 2877 ceph_assert(auth_pins >= 0);
7c673cae
FG
2878
2879 if (parent)
11fdf7f2 2880 parent->adjust_nested_auth_pins(-1, by);
7c673cae 2881
9f95a23c
TL
2882 if (is_freezing_inode())
2883 maybe_finish_freeze_inode();
7c673cae
FG
2884}
2885
7c673cae
FG
2886// authority
2887
2888mds_authority_t CInode::authority() const
2889{
2890 if (inode_auth.first >= 0)
2891 return inode_auth;
2892
2893 if (parent)
2894 return parent->dir->authority();
2895
2896 // new items that are not yet linked in (in the committed plane) belong
2897 // to their first parent.
2898 if (!projected_parent.empty())
2899 return projected_parent.front()->dir->authority();
2900
2901 return CDIR_AUTH_UNDEF;
2902}
2903
2904
2905// SNAP
2906
2907snapid_t CInode::get_oldest_snap()
2908{
2909 snapid_t t = first;
2910 if (!old_inodes.empty())
2911 t = old_inodes.begin()->second.first;
11fdf7f2 2912 return std::min(t, oldest_snap);
7c673cae
FG
2913}
2914
94b18763 2915CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head)
7c673cae 2916{
11fdf7f2 2917 ceph_assert(follows >= first);
7c673cae 2918
94b18763
FG
2919 mempool_inode *pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
2920 mempool_xattr_map *px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();
7c673cae 2921
94b18763 2922 mempool_old_inode &old = old_inodes[follows];
7c673cae
FG
2923 old.first = first;
2924 old.inode = *pi;
2925 old.xattrs = *px;
2926
2927 if (first < oldest_snap)
2928 oldest_snap = first;
2929
2930 dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl;
2931
2932 old.inode.trim_client_ranges(follows);
2933
11fdf7f2 2934 if (g_conf()->mds_snap_rstat &&
7c673cae
FG
2935 !(old.inode.rstat == old.inode.accounted_rstat))
2936 dirty_old_rstats.insert(follows);
2937
2938 first = follows+1;
2939
11fdf7f2 2940 dout(10) << __func__ << " " << (cow_head ? "head" : "previous_head" )
7c673cae
FG
2941 << " to [" << old.first << "," << follows << "] on "
2942 << *this << dendl;
2943
2944 return old;
2945}
2946
2947void CInode::split_old_inode(snapid_t snap)
2948{
94b18763 2949 auto it = old_inodes.lower_bound(snap);
11fdf7f2 2950 ceph_assert(it != old_inodes.end() && it->second.first < snap);
7c673cae 2951
94b18763
FG
2952 mempool_old_inode &old = old_inodes[snap - 1];
2953 old = it->second;
7c673cae 2954
94b18763
FG
2955 it->second.first = snap;
2956 dout(10) << __func__ << " " << "[" << old.first << "," << it->first
2957 << "] to [" << snap << "," << it->first << "] on " << *this << dendl;
7c673cae
FG
2958}
2959
2960void CInode::pre_cow_old_inode()
2961{
11fdf7f2 2962 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
7c673cae
FG
2963 if (first <= follows)
2964 cow_old_inode(follows, true);
2965}
2966
11fdf7f2
TL
2967bool CInode::has_snap_data(snapid_t snapid)
2968{
2969 bool found = snapid >= first && snapid <= last;
2970 if (!found && is_multiversion()) {
2971 auto p = old_inodes.lower_bound(snapid);
2972 if (p != old_inodes.end()) {
2973 if (p->second.first > snapid) {
2974 if (p != old_inodes.begin())
2975 --p;
2976 }
2977 if (p->second.first <= snapid && snapid <= p->first) {
2978 found = true;
2979 }
2980 }
2981 }
2982 return found;
2983}
2984
7c673cae
FG
2985void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
2986{
11fdf7f2 2987 dout(10) << __func__ << " " << snaps << dendl;
7c673cae 2988
94b18763
FG
2989 for (auto it = old_inodes.begin(); it != old_inodes.end(); ) {
2990 const snapid_t &id = it->first;
2991 const auto &s = snaps.lower_bound(it->second.first);
2992 if (s == snaps.end() || *s > id) {
2993 dout(10) << " purging old_inode [" << it->second.first << "," << id << "]" << dendl;
2994 it = old_inodes.erase(it);
2995 } else {
2996 ++it;
2997 }
7c673cae
FG
2998 }
2999}
3000
3001/*
3002 * pick/create an old_inode
3003 */
94b18763 3004CInode::mempool_old_inode * CInode::pick_old_inode(snapid_t snap)
7c673cae 3005{
94b18763
FG
3006 auto it = old_inodes.lower_bound(snap); // p is first key >= to snap
3007 if (it != old_inodes.end() && it->second.first <= snap) {
3008 dout(10) << __func__ << " snap " << snap << " -> [" << it->second.first << "," << it->first << "]" << dendl;
3009 return &it->second;
7c673cae 3010 }
11fdf7f2 3011 dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl;
7c673cae
FG
3012 return NULL;
3013}
3014
3015void CInode::open_snaprealm(bool nosplit)
3016{
3017 if (!snaprealm) {
3018 SnapRealm *parent = find_snaprealm();
3019 snaprealm = new SnapRealm(mdcache, this);
3020 if (parent) {
11fdf7f2 3021 dout(10) << __func__ << " " << snaprealm
7c673cae
FG
3022 << " parent is " << parent
3023 << dendl;
3024 dout(30) << " siblings are " << parent->open_children << dendl;
3025 snaprealm->parent = parent;
3026 if (!nosplit)
3027 parent->split_at(snaprealm);
3028 parent->open_children.insert(snaprealm);
3029 }
3030 }
3031}
3032void CInode::close_snaprealm(bool nojoin)
3033{
3034 if (snaprealm) {
11fdf7f2 3035 dout(15) << __func__ << " " << *snaprealm << dendl;
7c673cae
FG
3036 snaprealm->close_parents();
3037 if (snaprealm->parent) {
3038 snaprealm->parent->open_children.erase(snaprealm);
3039 //if (!nojoin)
3040 //snaprealm->parent->join(snaprealm);
3041 }
3042 delete snaprealm;
3043 snaprealm = 0;
3044 }
3045}
3046
3047SnapRealm *CInode::find_snaprealm() const
3048{
3049 const CInode *cur = this;
3050 while (!cur->snaprealm) {
11fdf7f2
TL
3051 const CDentry *pdn = cur->get_oldest_parent_dn();
3052 if (!pdn)
7c673cae 3053 break;
11fdf7f2 3054 cur = pdn->get_dir()->get_inode();
7c673cae
FG
3055 }
3056 return cur->snaprealm;
3057}
3058
3059void CInode::encode_snap_blob(bufferlist &snapbl)
3060{
3061 if (snaprealm) {
11fdf7f2
TL
3062 using ceph::encode;
3063 encode(snaprealm->srnode, snapbl);
3064 dout(20) << __func__ << " " << *snaprealm << dendl;
7c673cae
FG
3065 }
3066}
11fdf7f2 3067void CInode::decode_snap_blob(const bufferlist& snapbl)
7c673cae 3068{
11fdf7f2 3069 using ceph::decode;
7c673cae
FG
3070 if (snapbl.length()) {
3071 open_snaprealm();
11fdf7f2
TL
3072 auto old_flags = snaprealm->srnode.flags;
3073 auto p = snapbl.cbegin();
3074 decode(snaprealm->srnode, p);
7c673cae
FG
3075 if (is_base()) {
3076 bool ok = snaprealm->_open_parents(NULL);
11fdf7f2
TL
3077 ceph_assert(ok);
3078 } else {
3079 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
3080 snaprealm->close_parents();
3081 snaprealm->adjust_parent();
3082 }
7c673cae 3083 }
11fdf7f2 3084 dout(20) << __func__ << " " << *snaprealm << dendl;
92f5a8d4
TL
3085 } else if (snaprealm &&
3086 !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
11fdf7f2
TL
3087 ceph_assert(mdcache->mds->is_any_replay());
3088 snaprealm->merge_to(NULL);
7c673cae
FG
3089 }
3090}
3091
3092void CInode::encode_snap(bufferlist& bl)
3093{
9f95a23c 3094 ENCODE_START(1, 1, bl);
7c673cae
FG
3095 bufferlist snapbl;
3096 encode_snap_blob(snapbl);
11fdf7f2
TL
3097 encode(snapbl, bl);
3098 encode(oldest_snap, bl);
9f95a23c 3099 ENCODE_FINISH(bl);
11fdf7f2 3100}
7c673cae 3101
11fdf7f2 3102void CInode::decode_snap(bufferlist::const_iterator& p)
7c673cae 3103{
9f95a23c 3104 DECODE_START(1, p);
7c673cae 3105 bufferlist snapbl;
11fdf7f2
TL
3106 decode(snapbl, p);
3107 decode(oldest_snap, p);
7c673cae 3108 decode_snap_blob(snapbl);
9f95a23c 3109 DECODE_FINISH(p);
7c673cae
FG
3110}
3111
3112// =============================================
3113
3114client_t CInode::calc_ideal_loner()
3115{
3116 if (mdcache->is_readonly())
3117 return -1;
11fdf7f2 3118 if (!get_mds_caps_wanted().empty())
7c673cae
FG
3119 return -1;
3120
3121 int n = 0;
3122 client_t loner = -1;
11fdf7f2
TL
3123 for (const auto &p : client_caps) {
3124 if (!p.second.is_stale() &&
9f95a23c
TL
3125 (is_dir() ?
3126 !has_subtree_or_exporting_dirfrag() :
3127 (p.second.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_RD)))) {
7c673cae
FG
3128 if (n)
3129 return -1;
3130 n++;
11fdf7f2 3131 loner = p.first;
7c673cae 3132 }
11fdf7f2 3133 }
7c673cae
FG
3134 return loner;
3135}
3136
b32b8144 3137bool CInode::choose_ideal_loner()
7c673cae
FG
3138{
3139 want_loner_cap = calc_ideal_loner();
b32b8144
FG
3140 int changed = false;
3141 if (loner_cap >= 0 && loner_cap != want_loner_cap) {
3142 if (!try_drop_loner())
3143 return false;
3144 changed = true;
3145 }
3146
3147 if (want_loner_cap >= 0) {
3148 if (loner_cap < 0) {
3149 set_loner_cap(want_loner_cap);
3150 changed = true;
3151 } else
11fdf7f2 3152 ceph_assert(loner_cap == want_loner_cap);
b32b8144
FG
3153 }
3154 return changed;
7c673cae
FG
3155}
3156
3157bool CInode::try_set_loner()
3158{
11fdf7f2 3159 ceph_assert(want_loner_cap >= 0);
7c673cae
FG
3160 if (loner_cap >= 0 && loner_cap != want_loner_cap)
3161 return false;
3162 set_loner_cap(want_loner_cap);
3163 return true;
3164}
3165
3166void CInode::set_loner_cap(client_t l)
3167{
3168 loner_cap = l;
3169 authlock.set_excl_client(loner_cap);
3170 filelock.set_excl_client(loner_cap);
3171 linklock.set_excl_client(loner_cap);
3172 xattrlock.set_excl_client(loner_cap);
3173}
3174
3175bool CInode::try_drop_loner()
3176{
3177 if (loner_cap < 0)
3178 return true;
3179
3180 int other_allowed = get_caps_allowed_by_type(CAP_ANY);
3181 Capability *cap = get_client_cap(loner_cap);
3182 if (!cap ||
3183 (cap->issued() & ~other_allowed) == 0) {
3184 set_loner_cap(-1);
3185 return true;
3186 }
3187 return false;
3188}
3189
3190
3191// choose new lock state during recovery, based on issued caps
3192void CInode::choose_lock_state(SimpleLock *lock, int allissued)
3193{
3194 int shift = lock->get_cap_shift();
3195 int issued = (allissued >> shift) & lock->get_cap_mask();
3196 if (is_auth()) {
3197 if (lock->is_xlocked()) {
3198 // do nothing here
3199 } else if (lock->get_state() != LOCK_MIX) {
3200 if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER))
3201 lock->set_state(LOCK_EXCL);
f6b5b4d7
TL
3202 else if (issued & CEPH_CAP_GWR) {
3203 if (issued & (CEPH_CAP_GCACHE | CEPH_CAP_GSHARED))
3204 lock->set_state(LOCK_EXCL);
3205 else
3206 lock->set_state(LOCK_MIX);
3207 } else if (lock->is_dirty()) {
7c673cae
FG
3208 if (is_replicated())
3209 lock->set_state(LOCK_MIX);
3210 else
3211 lock->set_state(LOCK_LOCK);
3212 } else
3213 lock->set_state(LOCK_SYNC);
3214 }
3215 } else {
3216 // our states have already been chosen during rejoin.
3217 if (lock->is_xlocked())
11fdf7f2 3218 ceph_assert(lock->get_state() == LOCK_LOCK);
7c673cae
FG
3219 }
3220}
3221
3222void CInode::choose_lock_states(int dirty_caps)
3223{
3224 int issued = get_caps_issued() | dirty_caps;
b32b8144
FG
3225 if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR)))
3226 choose_ideal_loner();
7c673cae
FG
3227 choose_lock_state(&filelock, issued);
3228 choose_lock_state(&nestlock, issued);
3229 choose_lock_state(&dirfragtreelock, issued);
3230 choose_lock_state(&authlock, issued);
3231 choose_lock_state(&xattrlock, issued);
3232 choose_lock_state(&linklock, issued);
3233}
3234
9f95a23c
TL
3235int CInode::count_nonstale_caps()
3236{
3237 int n = 0;
3238 for (const auto &p : client_caps) {
3239 if (!p.second.is_stale())
3240 n++;
3241 }
3242 return n;
3243}
3244
3245bool CInode::multiple_nonstale_caps()
3246{
3247 int n = 0;
3248 for (const auto &p : client_caps) {
3249 if (!p.second.is_stale()) {
3250 if (n)
3251 return true;
3252 n++;
3253 }
3254 }
3255 return false;
3256}
3257
11fdf7f2
TL
3258void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m)
3259{
3260 bool old_empty = mds_caps_wanted.empty();
3261 mds_caps_wanted.swap(m);
3262 if (old_empty != (bool)mds_caps_wanted.empty()) {
3263 if (old_empty)
f91f0fd5 3264 adjust_num_caps_notable(1);
11fdf7f2 3265 else
f91f0fd5 3266 adjust_num_caps_notable(-1);
11fdf7f2
TL
3267 }
3268}
3269
3270void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted)
3271{
3272 bool old_empty = mds_caps_wanted.empty();
3273 if (wanted) {
3274 mds_caps_wanted[mds] = wanted;
3275 if (old_empty)
f91f0fd5 3276 adjust_num_caps_notable(1);
11fdf7f2
TL
3277 } else if (!old_empty) {
3278 mds_caps_wanted.erase(mds);
3279 if (mds_caps_wanted.empty())
f91f0fd5 3280 adjust_num_caps_notable(-1);
11fdf7f2
TL
3281 }
3282}
3283
9f95a23c
TL
3284Capability *CInode::add_client_cap(client_t client, Session *session,
3285 SnapRealm *conrealm, bool new_inode)
7c673cae 3286{
11fdf7f2 3287 ceph_assert(last == CEPH_NOSNAP);
7c673cae
FG
3288 if (client_caps.empty()) {
3289 get(PIN_CAPS);
3290 if (conrealm)
3291 containing_realm = conrealm;
3292 else
3293 containing_realm = find_snaprealm();
3294 containing_realm->inodes_with_caps.push_back(&item_caps);
11fdf7f2 3295 dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl;
7c673cae 3296
7c673cae 3297 mdcache->num_inodes_with_caps++;
11fdf7f2
TL
3298 if (parent)
3299 parent->dir->adjust_num_inodes_with_caps(1);
3300 }
3301
9f95a23c 3302 uint64_t cap_id = new_inode ? 1 : ++mdcache->last_cap_id;
11fdf7f2
TL
3303 auto ret = client_caps.emplace(std::piecewise_construct, std::forward_as_tuple(client),
3304 std::forward_as_tuple(this, session, cap_id));
3305 ceph_assert(ret.second == true);
3306 Capability *cap = &ret.first->second;
7c673cae 3307
7c673cae 3308 cap->client_follows = first-1;
7c673cae 3309 containing_realm->add_cap(client, cap);
11fdf7f2 3310
7c673cae
FG
3311 return cap;
3312}
3313
3314void CInode::remove_client_cap(client_t client)
3315{
11fdf7f2
TL
3316 auto it = client_caps.find(client);
3317 ceph_assert(it != client_caps.end());
3318 Capability *cap = &it->second;
7c673cae
FG
3319
3320 cap->item_session_caps.remove_myself();
3321 cap->item_revoking_caps.remove_myself();
3322 cap->item_client_revoking_caps.remove_myself();
3323 containing_realm->remove_cap(client, cap);
3324
3325 if (client == loner_cap)
3326 loner_cap = -1;
3327
f91f0fd5
TL
3328 if (cap->is_wanted_notable())
3329 adjust_num_caps_notable(-1);
11fdf7f2
TL
3330
3331 client_caps.erase(it);
7c673cae 3332 if (client_caps.empty()) {
11fdf7f2 3333 dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl;
7c673cae
FG
3334 put(PIN_CAPS);
3335 item_caps.remove_myself();
3336 containing_realm = NULL;
7c673cae 3337 mdcache->num_inodes_with_caps--;
11fdf7f2
TL
3338 if (parent)
3339 parent->dir->adjust_num_inodes_with_caps(-1);
7c673cae
FG
3340 }
3341
3342 //clean up advisory locks
3343 bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false;
3344 bool flock_removed = flock_locks ? flock_locks->remove_all_from(client) : false;
3345 if (fcntl_removed || flock_removed) {
11fdf7f2 3346 MDSContext::vec waiters;
7c673cae
FG
3347 take_waiting(CInode::WAIT_FLOCK, waiters);
3348 mdcache->mds->queue_waiters(waiters);
3349 }
3350}
3351
3352void CInode::move_to_realm(SnapRealm *realm)
3353{
11fdf7f2 3354 dout(10) << __func__ << " joining realm " << *realm
7c673cae 3355 << ", leaving realm " << *containing_realm << dendl;
11fdf7f2
TL
3356 for (auto& p : client_caps) {
3357 containing_realm->remove_cap(p.first, &p.second);
3358 realm->add_cap(p.first, &p.second);
7c673cae
FG
3359 }
3360 item_caps.remove_myself();
3361 realm->inodes_with_caps.push_back(&item_caps);
3362 containing_realm = realm;
3363}
3364
3365Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session)
3366{
3367 Capability *cap = get_client_cap(client);
3368 if (cap) {
3369 // FIXME?
3370 cap->merge(icr.capinfo.wanted, icr.capinfo.issued);
3371 } else {
3372 cap = add_client_cap(client, session);
3373 cap->set_cap_id(icr.capinfo.cap_id);
3374 cap->set_wanted(icr.capinfo.wanted);
3375 cap->issue_norevoke(icr.capinfo.issued);
3376 cap->reset_seq();
3377 }
3378 cap->set_last_issue_stamp(ceph_clock_now());
3379 return cap;
3380}
3381
3382void CInode::clear_client_caps_after_export()
3383{
3384 while (!client_caps.empty())
3385 remove_client_cap(client_caps.begin()->first);
3386 loner_cap = -1;
3387 want_loner_cap = -1;
11fdf7f2
TL
3388 if (!get_mds_caps_wanted().empty()) {
3389 mempool::mds_co::compact_map<int32_t,int32_t> empty;
3390 set_mds_caps_wanted(empty);
3391 }
7c673cae
FG
3392}
3393
3394void CInode::export_client_caps(map<client_t,Capability::Export>& cl)
3395{
11fdf7f2
TL
3396 for (const auto &p : client_caps) {
3397 cl[p.first] = p.second.make_export();
7c673cae
FG
3398 }
3399}
3400
3401 // caps allowed
3402int CInode::get_caps_liked() const
3403{
3404 if (is_dir())
3405 return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER
3406 else
3407 return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO;
3408}
3409
3410int CInode::get_caps_allowed_ever() const
3411{
3412 int allowed;
3413 if (is_dir())
3414 allowed = CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED;
3415 else
3416 allowed = CEPH_CAP_ANY;
3417 return allowed &
3418 (CEPH_CAP_PIN |
3419 (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) |
3420 (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) |
3421 (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) |
3422 (linklock.gcaps_allowed_ever() << linklock.get_cap_shift()));
3423}
3424
3425int CInode::get_caps_allowed_by_type(int type) const
3426{
3427 return
3428 CEPH_CAP_PIN |
3429 (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) |
3430 (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) |
3431 (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) |
3432 (linklock.gcaps_allowed(type) << linklock.get_cap_shift());
3433}
3434
3435int CInode::get_caps_careful() const
3436{
3437 return
3438 (filelock.gcaps_careful() << filelock.get_cap_shift()) |
3439 (authlock.gcaps_careful() << authlock.get_cap_shift()) |
3440 (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) |
3441 (linklock.gcaps_careful() << linklock.get_cap_shift());
3442}
3443
3444int CInode::get_xlocker_mask(client_t client) const
3445{
3446 return
3447 (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) |
3448 (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) |
3449 (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) |
3450 (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift());
3451}
3452
11fdf7f2
TL
3453int CInode::get_caps_allowed_for_client(Session *session, Capability *cap,
3454 mempool_inode *file_i) const
7c673cae 3455{
11fdf7f2 3456 client_t client = session->get_client();
7c673cae
FG
3457 int allowed;
3458 if (client == get_loner()) {
3459 // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
3460 allowed =
3461 get_caps_allowed_by_type(CAP_LONER) |
3462 (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client));
3463 } else {
3464 allowed = get_caps_allowed_by_type(CAP_ANY);
3465 }
3466
9f95a23c
TL
3467 if (is_dir()) {
3468 allowed &= ~CEPH_CAP_ANY_DIR_OPS;
3469 if (cap && (allowed & CEPH_CAP_FILE_EXCL))
3470 allowed |= cap->get_lock_cache_allowed();
3471 } else {
11fdf7f2
TL
3472 if (file_i->inline_data.version == CEPH_INLINE_NONE &&
3473 file_i->layout.pool_ns.empty()) {
3474 // noop
3475 } else if (cap) {
3476 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3477 cap->is_noinline()) ||
3478 (!file_i->layout.pool_ns.empty() &&
3479 cap->is_nopoolns()))
3480 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3481 } else {
3482 auto& conn = session->get_connection();
3483 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3484 !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) ||
3485 (!file_i->layout.pool_ns.empty() &&
3486 !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)))
3487 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3488 }
7c673cae
FG
3489 }
3490 return allowed;
3491}
3492
3493// caps issued, wanted
3494int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker,
3495 int shift, int mask)
3496{
3497 int c = 0;
3498 int loner = 0, other = 0, xlocker = 0;
3499 if (!is_auth()) {
3500 loner_cap = -1;
3501 }
3502
11fdf7f2
TL
3503 for (const auto &p : client_caps) {
3504 int i = p.second.issued();
7c673cae 3505 c |= i;
11fdf7f2 3506 if (p.first == loner_cap)
7c673cae
FG
3507 loner |= i;
3508 else
3509 other |= i;
11fdf7f2 3510 xlocker |= get_xlocker_mask(p.first) & i;
7c673cae
FG
3511 }
3512 if (ploner) *ploner = (loner >> shift) & mask;
3513 if (pother) *pother = (other >> shift) & mask;
3514 if (pxlocker) *pxlocker = (xlocker >> shift) & mask;
3515 return (c >> shift) & mask;
3516}
3517
3518bool CInode::is_any_caps_wanted() const
3519{
11fdf7f2
TL
3520 for (const auto &p : client_caps) {
3521 if (p.second.wanted())
7c673cae 3522 return true;
11fdf7f2 3523 }
7c673cae
FG
3524 return false;
3525}
3526
3527int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const
3528{
3529 int w = 0;
3530 int loner = 0, other = 0;
11fdf7f2
TL
3531 for (const auto &p : client_caps) {
3532 if (!p.second.is_stale()) {
3533 int t = p.second.wanted();
7c673cae 3534 w |= t;
11fdf7f2 3535 if (p.first == loner_cap)
7c673cae
FG
3536 loner |= t;
3537 else
3538 other |= t;
3539 }
3540 //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
3541 }
3542 if (is_auth())
94b18763
FG
3543 for (const auto &p : mds_caps_wanted) {
3544 w |= p.second;
3545 other |= p.second;
7c673cae
FG
3546 //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
3547 }
3548 if (ploner) *ploner = (loner >> shift) & mask;
3549 if (pother) *pother = (other >> shift) & mask;
3550 return (w >> shift) & mask;
3551}
3552
3553bool CInode::issued_caps_need_gather(SimpleLock *lock)
3554{
3555 int loner_issued, other_issued, xlocker_issued;
3556 get_caps_issued(&loner_issued, &other_issued, &xlocker_issued,
3557 lock->get_cap_shift(), lock->get_cap_mask());
3558 if ((loner_issued & ~lock->gcaps_allowed(CAP_LONER)) ||
3559 (other_issued & ~lock->gcaps_allowed(CAP_ANY)) ||
3560 (xlocker_issued & ~lock->gcaps_allowed(CAP_XLOCKER)))
3561 return true;
3562 return false;
3563}
3564
f91f0fd5
TL
3565void CInode::adjust_num_caps_notable(int d)
3566{
3567 if (!is_clientwriteable()) {
3568 if (!num_caps_notable && d > 0)
3569 mdcache->open_file_table.add_inode(this);
3570 else if (num_caps_notable > 0 && num_caps_notable == -d)
3571 mdcache->open_file_table.remove_inode(this);
3572 }
3573
3574 num_caps_notable +=d;
3575 ceph_assert(num_caps_notable >= 0);
3576}
3577
3578void CInode::mark_clientwriteable()
3579{
3580 if (last != CEPH_NOSNAP)
3581 return;
3582 if (!state_test(STATE_CLIENTWRITEABLE)) {
3583 if (num_caps_notable == 0)
3584 mdcache->open_file_table.add_inode(this);
3585 state_set(STATE_CLIENTWRITEABLE);
3586 }
3587}
3588
3589void CInode::clear_clientwriteable()
3590{
3591 if (state_test(STATE_CLIENTWRITEABLE)) {
3592 if (num_caps_notable == 0)
3593 mdcache->open_file_table.remove_inode(this);
3594 state_clear(STATE_CLIENTWRITEABLE);
3595 }
3596}
7c673cae
FG
3597
3598// =============================================
3599
3600int CInode::encode_inodestat(bufferlist& bl, Session *session,
3601 SnapRealm *dir_realm,
3602 snapid_t snapid,
3603 unsigned max_bytes,
3604 int getattr_caps)
3605{
11fdf7f2
TL
3606 client_t client = session->get_client();
3607 ceph_assert(snapid);
7c673cae
FG
3608
3609 bool valid = true;
3610
3611 // pick a version!
94b18763
FG
3612 mempool_inode *oi = &inode;
3613 mempool_inode *pi = get_projected_inode();
7c673cae 3614
94b18763 3615 CInode::mempool_xattr_map *pxattrs = nullptr;
7c673cae
FG
3616
3617 if (snapid != CEPH_NOSNAP) {
3618
3619 // for now at least, old_inodes is only defined/valid on the auth
3620 if (!is_auth())
3621 valid = false;
3622
3623 if (is_multiversion()) {
94b18763
FG
3624 auto it = old_inodes.lower_bound(snapid);
3625 if (it != old_inodes.end()) {
3626 if (it->second.first > snapid) {
3627 if (it != old_inodes.begin())
3628 --it;
7c673cae 3629 }
94b18763
FG
3630 if (it->second.first <= snapid && snapid <= it->first) {
3631 dout(15) << __func__ << " snapid " << snapid
3632 << " to old_inode [" << it->second.first << "," << it->first << "]"
3633 << " " << it->second.inode.rstat
7c673cae 3634 << dendl;
94b18763
FG
3635 auto &p = it->second;
3636 pi = oi = &p.inode;
3637 pxattrs = &p.xattrs;
7c673cae
FG
3638 } else {
3639 // snapshoted remote dentry can result this
11fdf7f2 3640 dout(0) << __func__ << " old_inode for snapid " << snapid
7c673cae
FG
3641 << " not found" << dendl;
3642 }
3643 }
3644 } else if (snapid < first || snapid > last) {
3645 // snapshoted remote dentry can result this
11fdf7f2 3646 dout(0) << __func__ << " [" << first << "," << last << "]"
7c673cae
FG
3647 << " not match snapid " << snapid << dendl;
3648 }
3649 }
3650
81eedcae 3651 utime_t snap_btime;
7c673cae 3652 SnapRealm *realm = find_snaprealm();
81eedcae
TL
3653 if (snapid != CEPH_NOSNAP && realm) {
3654 // add snapshot timestamp vxattr
3655 map<snapid_t,const SnapInfo*> infomap;
3656 realm->get_snap_info(infomap,
3657 snapid, // min
3658 snapid); // max
3659 if (!infomap.empty()) {
3660 ceph_assert(infomap.size() == 1);
3661 const SnapInfo *si = infomap.begin()->second;
3662 snap_btime = si->stamp;
3663 }
3664 }
3665
7c673cae
FG
3666
3667 bool no_caps = !valid ||
3668 session->is_stale() ||
3669 (dir_realm && realm != dir_realm) ||
3670 is_frozen() ||
3671 state_test(CInode::STATE_EXPORTINGCAPS);
3672 if (no_caps)
11fdf7f2 3673 dout(20) << __func__ << " no caps"
7c673cae
FG
3674 << (!valid?", !valid":"")
3675 << (session->is_stale()?", session stale ":"")
3676 << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"")
3677 << (is_frozen()?", frozen inode":"")
3678 << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"")
3679 << dendl;
3680
3681
3682 // "fake" a version that is old (stable) version, +1 if projected.
3683 version_t version = (oi->version * 2) + is_projected();
3684
3685 Capability *cap = get_client_cap(client);
3686 bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client;
3687 //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
3688 bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client;
3689 bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client;
3690 bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client;
3691
3692 bool plocal = versionlock.get_last_wrlock_client() == client;
3693 bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client;
3694
94b18763 3695 mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;
7c673cae
FG
3696
3697 dout(20) << " pfile " << pfile << " pauth " << pauth
3698 << " plink " << plink << " pxattr " << pxattr
3699 << " plocal " << plocal
3700 << " ctime " << any_i->ctime
3701 << " valid=" << valid << dendl;
3702
3703 // file
94b18763 3704 mempool_inode *file_i = pfile ? pi:oi;
7c673cae
FG
3705 file_layout_t layout;
3706 if (is_dir()) {
3707 layout = (ppolicy ? pi : oi)->layout;
3708 } else {
3709 layout = file_i->layout;
3710 }
3711
3712 // max_size is min of projected, actual
3713 uint64_t max_size =
f91f0fd5
TL
3714 std::min(oi->get_client_range(client),
3715 pi->get_client_range(client));
7c673cae
FG
3716
3717 // inline data
3718 version_t inline_version = 0;
3719 bufferlist inline_data;
3720 if (file_i->inline_data.version == CEPH_INLINE_NONE) {
3721 inline_version = CEPH_INLINE_NONE;
3722 } else if ((!cap && !no_caps) ||
3723 (cap && cap->client_inline_version < file_i->inline_data.version) ||
3724 (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data
3725 inline_version = file_i->inline_data.version;
3726 if (file_i->inline_data.length() > 0)
3727 inline_data = file_i->inline_data.get_data();
3728 }
3729
3730 // nest (do same as file... :/)
3731 if (cap) {
3732 cap->last_rbytes = file_i->rstat.rbytes;
3733 cap->last_rsize = file_i->rstat.rsize();
3734 }
3735
3736 // auth
94b18763 3737 mempool_inode *auth_i = pauth ? pi:oi;
7c673cae
FG
3738
3739 // link
94b18763 3740 mempool_inode *link_i = plink ? pi:oi;
7c673cae
FG
3741
3742 // xattr
94b18763 3743 mempool_inode *xattr_i = pxattr ? pi:oi;
7c673cae 3744
11fdf7f2 3745 using ceph::encode;
7c673cae 3746 // xattr
7c673cae
FG
3747 version_t xattr_version;
3748 if ((!cap && !no_caps) ||
3749 (cap && cap->client_xattr_version < xattr_i->xattr_version) ||
3750 (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs
3751 if (!pxattrs)
3752 pxattrs = pxattr ? get_projected_xattrs() : &xattrs;
7c673cae
FG
3753 xattr_version = xattr_i->xattr_version;
3754 } else {
3755 xattr_version = 0;
3756 }
3757
3758 // do we have room?
3759 if (max_bytes) {
11fdf7f2
TL
3760 unsigned bytes =
3761 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) +
3762 sizeof(struct ceph_file_layout) +
3763 sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq
3764 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
3765 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime
3766 sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree
3767 sizeof(__u32) + symlink.length() + // symlink
3768 sizeof(struct ceph_dir_layout); // dir_layout
3769
3770 if (xattr_version) {
3771 bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries
3772 if (pxattrs) {
3773 for (const auto &p : *pxattrs)
3774 bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length();
3775 }
3776 } else {
3777 bytes += sizeof(__u32); // xattr buffer len
3778 }
3779 bytes +=
3780 sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data
3781 1 + 1 + 8 + 8 + 4 + // quota
3782 4 + layout.pool_ns.size() + // pool ns
3783 sizeof(struct ceph_timespec) + 8; // btime + change_attr
3784
7c673cae
FG
3785 if (bytes > max_bytes)
3786 return -ENOSPC;
3787 }
3788
3789
3790 // encode caps
3791 struct ceph_mds_reply_cap ecap;
3792 if (snapid != CEPH_NOSNAP) {
3793 /*
3794 * snapped inodes (files or dirs) only get read-only caps. always
3795 * issue everything possible, since it is read only.
3796 *
3797 * if a snapped inode has caps, limit issued caps based on the
3798 * lock state.
3799 *
3800 * if it is a live inode, limit issued caps based on the lock
3801 * state.
3802 *
3803 * do NOT adjust cap issued state, because the client always
3804 * tracks caps per-snap and the mds does either per-interval or
3805 * multiversion.
3806 */
3807 ecap.caps = valid ? get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE;
3808 if (last == CEPH_NOSNAP || is_any_caps())
11fdf7f2 3809 ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i);
7c673cae
FG
3810 ecap.seq = 0;
3811 ecap.mseq = 0;
3812 ecap.realm = 0;
3813 } else {
3814 if (!no_caps && !cap) {
3815 // add a new cap
3816 cap = add_client_cap(client, session, realm);
b32b8144
FG
3817 if (is_auth())
3818 choose_ideal_loner();
7c673cae
FG
3819 }
3820
3821 int issue = 0;
3822 if (!no_caps && cap) {
3823 int likes = get_caps_liked();
11fdf7f2 3824 int allowed = get_caps_allowed_for_client(session, cap, file_i);
7c673cae 3825 issue = (cap->wanted() | likes) & allowed;
494da23a 3826 cap->issue_norevoke(issue, true);
7c673cae
FG
3827 issue = cap->pending();
3828 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3829 << " seq " << cap->get_last_seq() << dendl;
3830 } else if (cap && cap->is_new() && !dir_realm) {
3831 // alway issue new caps to client, otherwise the caps get lost
11fdf7f2 3832 ceph_assert(cap->is_stale());
494da23a
TL
3833 ceph_assert(!cap->pending());
3834 issue = CEPH_CAP_PIN;
3835 cap->issue_norevoke(issue, true);
7c673cae
FG
3836 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3837 << " seq " << cap->get_last_seq()
494da23a 3838 << "(stale&new caps)" << dendl;
7c673cae
FG
3839 }
3840
3841 if (issue) {
3842 cap->set_last_issue();
3843 cap->set_last_issue_stamp(ceph_clock_now());
7c673cae
FG
3844 ecap.caps = issue;
3845 ecap.wanted = cap->wanted();
3846 ecap.cap_id = cap->get_cap_id();
3847 ecap.seq = cap->get_last_seq();
3848 ecap.mseq = cap->get_mseq();
3849 ecap.realm = realm->inode->ino();
3850 } else {
3851 ecap.cap_id = 0;
3852 ecap.caps = 0;
3853 ecap.seq = 0;
3854 ecap.mseq = 0;
3855 ecap.realm = 0;
3856 ecap.wanted = 0;
3857 }
3858 }
3859 ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0;
3860 dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps)
3861 << " seq " << ecap.seq << " mseq " << ecap.mseq
11fdf7f2 3862 << " xattrv " << xattr_version << dendl;
7c673cae
FG
3863
3864 if (inline_data.length() && cap) {
3865 if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) {
3866 dout(10) << "including inline version " << inline_version << dendl;
3867 cap->client_inline_version = inline_version;
3868 } else {
3869 dout(10) << "dropping inline version " << inline_version << dendl;
3870 inline_version = 0;
3871 inline_data.clear();
3872 }
3873 }
3874
3875 // include those xattrs?
11fdf7f2 3876 if (xattr_version && cap) {
7c673cae 3877 if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) {
11fdf7f2
TL
3878 dout(10) << "including xattrs version " << xattr_version << dendl;
3879 cap->client_xattr_version = xattr_version;
7c673cae 3880 } else {
11fdf7f2 3881 dout(10) << "dropping xattrs version " << xattr_version << dendl;
7c673cae
FG
3882 xattr_version = 0;
3883 }
3884 }
3885
11fdf7f2
TL
3886 // The end result of encode_xattrs() is equivalent to:
3887 // {
3888 // bufferlist xbl;
3889 // if (xattr_version) {
3890 // if (pxattrs)
3891 // encode(*pxattrs, bl);
3892 // else
3893 // encode((__u32)0, bl);
3894 // }
3895 // encode(xbl, bl);
3896 // }
3897 //
3898 // But encoding xattrs into the 'xbl' requires a memory allocation.
3899 // The 'bl' should have enough pre-allocated memory in most cases.
3900 // Encoding xattrs directly into it can avoid the extra allocation.
3901 auto encode_xattrs = [xattr_version, pxattrs, &bl]() {
3902 using ceph::encode;
3903 if (xattr_version) {
3904 ceph_le32 xbl_len;
3905 auto filler = bl.append_hole(sizeof(xbl_len));
3906 const auto starting_bl_len = bl.length();
3907 if (pxattrs)
3908 encode(*pxattrs, bl);
3909 else
3910 encode((__u32)0, bl);
3911 xbl_len = bl.length() - starting_bl_len;
3912 filler.copy_in(sizeof(xbl_len), (char *)&xbl_len);
3913 } else {
3914 encode((__u32)0, bl);
3915 }
3916 };
3917
7c673cae
FG
3918 /*
3919 * note: encoding matches MClientReply::InodeStat
3920 */
11fdf7f2 3921 if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
81eedcae 3922 ENCODE_START(3, 1, bl);
11fdf7f2
TL
3923 encode(oi->ino, bl);
3924 encode(snapid, bl);
3925 encode(oi->rdev, bl);
3926 encode(version, bl);
3927 encode(xattr_version, bl);
3928 encode(ecap, bl);
3929 {
3930 ceph_file_layout legacy_layout;
3931 layout.to_legacy(&legacy_layout);
3932 encode(legacy_layout, bl);
3933 }
3934 encode(any_i->ctime, bl);
3935 encode(file_i->mtime, bl);
3936 encode(file_i->atime, bl);
3937 encode(file_i->time_warp_seq, bl);
3938 encode(file_i->size, bl);
3939 encode(max_size, bl);
3940 encode(file_i->truncate_size, bl);
3941 encode(file_i->truncate_seq, bl);
3942 encode(auth_i->mode, bl);
3943 encode((uint32_t)auth_i->uid, bl);
3944 encode((uint32_t)auth_i->gid, bl);
3945 encode(link_i->nlink, bl);
3946 encode(file_i->dirstat.nfiles, bl);
3947 encode(file_i->dirstat.nsubdirs, bl);
3948 encode(file_i->rstat.rbytes, bl);
3949 encode(file_i->rstat.rfiles, bl);
3950 encode(file_i->rstat.rsubdirs, bl);
3951 encode(file_i->rstat.rctime, bl);
3952 dirfragtree.encode(bl);
3953 encode(symlink, bl);
3954 encode(file_i->dir_layout, bl);
3955 encode_xattrs();
3956 encode(inline_version, bl);
3957 encode(inline_data, bl);
94b18763 3958 mempool_inode *policy_i = ppolicy ? pi : oi;
11fdf7f2
TL
3959 encode(policy_i->quota, bl);
3960 encode(layout.pool_ns, bl);
3961 encode(any_i->btime, bl);
3962 encode(any_i->change_attr, bl);
3963 encode(file_i->export_pin, bl);
81eedcae 3964 encode(snap_btime, bl);
11fdf7f2
TL
3965 ENCODE_FINISH(bl);
3966 }
3967 else {
3968 ceph_assert(session->get_connection());
3969
3970 encode(oi->ino, bl);
3971 encode(snapid, bl);
3972 encode(oi->rdev, bl);
3973 encode(version, bl);
3974 encode(xattr_version, bl);
3975 encode(ecap, bl);
3976 {
3977 ceph_file_layout legacy_layout;
3978 layout.to_legacy(&legacy_layout);
3979 encode(legacy_layout, bl);
3980 }
3981 encode(any_i->ctime, bl);
3982 encode(file_i->mtime, bl);
3983 encode(file_i->atime, bl);
3984 encode(file_i->time_warp_seq, bl);
3985 encode(file_i->size, bl);
3986 encode(max_size, bl);
3987 encode(file_i->truncate_size, bl);
3988 encode(file_i->truncate_seq, bl);
3989 encode(auth_i->mode, bl);
3990 encode((uint32_t)auth_i->uid, bl);
3991 encode((uint32_t)auth_i->gid, bl);
3992 encode(link_i->nlink, bl);
3993 encode(file_i->dirstat.nfiles, bl);
3994 encode(file_i->dirstat.nsubdirs, bl);
3995 encode(file_i->rstat.rbytes, bl);
3996 encode(file_i->rstat.rfiles, bl);
3997 encode(file_i->rstat.rsubdirs, bl);
3998 encode(file_i->rstat.rctime, bl);
3999 dirfragtree.encode(bl);
4000 encode(symlink, bl);
4001 auto& conn = session->get_connection();
4002 if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) {
4003 encode(file_i->dir_layout, bl);
4004 }
4005 encode_xattrs();
4006 if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4007 encode(inline_version, bl);
4008 encode(inline_data, bl);
4009 }
4010 if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) {
4011 mempool_inode *policy_i = ppolicy ? pi : oi;
4012 encode(policy_i->quota, bl);
4013 }
4014 if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
4015 encode(layout.pool_ns, bl);
4016 }
4017 if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) {
4018 encode(any_i->btime, bl);
4019 encode(any_i->change_attr, bl);
4020 }
7c673cae
FG
4021 }
4022
4023 return valid;
4024}
4025
9f95a23c 4026void CInode::encode_cap_message(const ref_t<MClientCaps> &m, Capability *cap)
7c673cae 4027{
11fdf7f2 4028 ceph_assert(cap);
7c673cae
FG
4029
4030 client_t client = cap->get_client();
4031
4032 bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL);
4033 bool pauth = authlock.is_xlocked_by_client(client);
4034 bool plink = linklock.is_xlocked_by_client(client);
4035 bool pxattr = xattrlock.is_xlocked_by_client(client);
4036
94b18763
FG
4037 mempool_inode *oi = &inode;
4038 mempool_inode *pi = get_projected_inode();
4039 mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi;
7c673cae 4040
11fdf7f2 4041 dout(20) << __func__ << " pfile " << pfile
7c673cae
FG
4042 << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
4043 << " ctime " << i->ctime << dendl;
4044
4045 i = pfile ? pi:oi;
4046 m->set_layout(i->layout);
4047 m->size = i->size;
4048 m->truncate_seq = i->truncate_seq;
4049 m->truncate_size = i->truncate_size;
4050 m->mtime = i->mtime;
4051 m->atime = i->atime;
4052 m->ctime = i->ctime;
4053 m->change_attr = i->change_attr;
4054 m->time_warp_seq = i->time_warp_seq;
28e407b8
AA
4055 m->nfiles = i->dirstat.nfiles;
4056 m->nsubdirs = i->dirstat.nsubdirs;
7c673cae
FG
4057
4058 if (cap->client_inline_version < i->inline_data.version) {
4059 m->inline_version = cap->client_inline_version = i->inline_data.version;
4060 if (i->inline_data.length() > 0)
4061 m->inline_data = i->inline_data.get_data();
4062 } else {
4063 m->inline_version = 0;
4064 }
4065
4066 // max_size is min of projected, actual.
f91f0fd5
TL
4067 uint64_t oldms = oi->get_client_range(client);
4068 uint64_t newms = pi->get_client_range(client);
11fdf7f2 4069 m->max_size = std::min(oldms, newms);
7c673cae
FG
4070
4071 i = pauth ? pi:oi;
4072 m->head.mode = i->mode;
4073 m->head.uid = i->uid;
4074 m->head.gid = i->gid;
4075
4076 i = plink ? pi:oi;
4077 m->head.nlink = i->nlink;
4078
11fdf7f2 4079 using ceph::encode;
7c673cae 4080 i = pxattr ? pi:oi;
94b18763 4081 auto ix = pxattr ? get_projected_xattrs() : &xattrs;
7c673cae
FG
4082 if ((cap->pending() & CEPH_CAP_XATTR_SHARED) &&
4083 i->xattr_version > cap->client_xattr_version) {
4084 dout(10) << " including xattrs v " << i->xattr_version << dendl;
11fdf7f2 4085 encode(*ix, m->xattrbl);
7c673cae
FG
4086 m->head.xattr_version = i->xattr_version;
4087 cap->client_xattr_version = i->xattr_version;
4088 }
4089}
4090
4091
4092
4093void CInode::_encode_base(bufferlist& bl, uint64_t features)
4094{
9f95a23c 4095 ENCODE_START(1, 1, bl);
11fdf7f2
TL
4096 encode(first, bl);
4097 encode(inode, bl, features);
4098 encode(symlink, bl);
4099 encode(dirfragtree, bl);
4100 encode(xattrs, bl);
4101 encode(old_inodes, bl, features);
4102 encode(damage_flags, bl);
7c673cae 4103 encode_snap(bl);
9f95a23c 4104 ENCODE_FINISH(bl);
7c673cae 4105}
11fdf7f2 4106void CInode::_decode_base(bufferlist::const_iterator& p)
7c673cae 4107{
9f95a23c 4108 DECODE_START(1, p);
11fdf7f2
TL
4109 decode(first, p);
4110 decode(inode, p);
94b18763
FG
4111 {
4112 std::string tmp;
11fdf7f2
TL
4113 decode(tmp, p);
4114 symlink = std::string_view(tmp);
94b18763 4115 }
11fdf7f2 4116 decode(dirfragtree, p);
e306af50 4117 decode_noshare(xattrs, p);
11fdf7f2
TL
4118 decode(old_inodes, p);
4119 decode(damage_flags, p);
7c673cae 4120 decode_snap(p);
9f95a23c 4121 DECODE_FINISH(p);
7c673cae
FG
4122}
4123
4124void CInode::_encode_locks_full(bufferlist& bl)
4125{
11fdf7f2
TL
4126 using ceph::encode;
4127 encode(authlock, bl);
4128 encode(linklock, bl);
4129 encode(dirfragtreelock, bl);
4130 encode(filelock, bl);
4131 encode(xattrlock, bl);
4132 encode(snaplock, bl);
4133 encode(nestlock, bl);
4134 encode(flocklock, bl);
4135 encode(policylock, bl);
4136
4137 encode(loner_cap, bl);
4138}
4139void CInode::_decode_locks_full(bufferlist::const_iterator& p)
4140{
4141 using ceph::decode;
4142 decode(authlock, p);
4143 decode(linklock, p);
4144 decode(dirfragtreelock, p);
4145 decode(filelock, p);
4146 decode(xattrlock, p);
4147 decode(snaplock, p);
4148 decode(nestlock, p);
4149 decode(flocklock, p);
4150 decode(policylock, p);
4151
4152 decode(loner_cap, p);
7c673cae
FG
4153 set_loner_cap(loner_cap);
4154 want_loner_cap = loner_cap; // for now, we'll eval() shortly.
4155}
4156
b32b8144 4157void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
7c673cae 4158{
9f95a23c 4159 ENCODE_START(1, 1, bl);
7c673cae
FG
4160 authlock.encode_state_for_replica(bl);
4161 linklock.encode_state_for_replica(bl);
4162 dirfragtreelock.encode_state_for_replica(bl);
4163 filelock.encode_state_for_replica(bl);
4164 nestlock.encode_state_for_replica(bl);
4165 xattrlock.encode_state_for_replica(bl);
4166 snaplock.encode_state_for_replica(bl);
4167 flocklock.encode_state_for_replica(bl);
4168 policylock.encode_state_for_replica(bl);
11fdf7f2 4169 encode(need_recover, bl);
9f95a23c 4170 ENCODE_FINISH(bl);
7c673cae 4171}
b32b8144 4172
7c673cae
FG
4173void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
4174{
4175 authlock.encode_state_for_replica(bl);
4176 linklock.encode_state_for_replica(bl);
4177 dirfragtreelock.encode_state_for_rejoin(bl, rep);
4178 filelock.encode_state_for_rejoin(bl, rep);
4179 nestlock.encode_state_for_rejoin(bl, rep);
4180 xattrlock.encode_state_for_replica(bl);
4181 snaplock.encode_state_for_replica(bl);
4182 flocklock.encode_state_for_replica(bl);
4183 policylock.encode_state_for_replica(bl);
4184}
b32b8144 4185
9f95a23c 4186void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new)
7c673cae 4187{
9f95a23c 4188 DECODE_START(1, p);
7c673cae
FG
4189 authlock.decode_state(p, is_new);
4190 linklock.decode_state(p, is_new);
4191 dirfragtreelock.decode_state(p, is_new);
4192 filelock.decode_state(p, is_new);
4193 nestlock.decode_state(p, is_new);
4194 xattrlock.decode_state(p, is_new);
4195 snaplock.decode_state(p, is_new);
4196 flocklock.decode_state(p, is_new);
4197 policylock.decode_state(p, is_new);
b32b8144
FG
4198
4199 bool need_recover;
11fdf7f2 4200 decode(need_recover, p);
b32b8144
FG
4201 if (need_recover && is_new) {
4202 // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
4203 // and change the object when replaying unsafe requests.
4204 authlock.mark_need_recover();
4205 linklock.mark_need_recover();
4206 dirfragtreelock.mark_need_recover();
4207 filelock.mark_need_recover();
4208 nestlock.mark_need_recover();
4209 xattrlock.mark_need_recover();
4210 snaplock.mark_need_recover();
4211 flocklock.mark_need_recover();
4212 policylock.mark_need_recover();
4213 }
9f95a23c 4214 DECODE_FINISH(p);
7c673cae 4215}
11fdf7f2 4216void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
b32b8144
FG
4217 list<SimpleLock*>& eval_locks, bool survivor)
4218{
4219 authlock.decode_state_rejoin(p, waiters, survivor);
4220 linklock.decode_state_rejoin(p, waiters, survivor);
4221 dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
4222 filelock.decode_state_rejoin(p, waiters, survivor);
4223 nestlock.decode_state_rejoin(p, waiters, survivor);
4224 xattrlock.decode_state_rejoin(p, waiters, survivor);
4225 snaplock.decode_state_rejoin(p, waiters, survivor);
4226 flocklock.decode_state_rejoin(p, waiters, survivor);
4227 policylock.decode_state_rejoin(p, waiters, survivor);
7c673cae
FG
4228
4229 if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
4230 eval_locks.push_back(&dirfragtreelock);
4231 if (!filelock.is_stable() && !filelock.is_wrlocked())
4232 eval_locks.push_back(&filelock);
4233 if (!nestlock.is_stable() && !nestlock.is_wrlocked())
4234 eval_locks.push_back(&nestlock);
4235}
4236
4237
4238// IMPORT/EXPORT
4239
4240void CInode::encode_export(bufferlist& bl)
4241{
4242 ENCODE_START(5, 4, bl);
4243 _encode_base(bl, mdcache->mds->mdsmap->get_up_features());
4244
11fdf7f2 4245 encode(state, bl);
7c673cae 4246
11fdf7f2 4247 encode(pop, bl);
7c673cae 4248
11fdf7f2 4249 encode(get_replicas(), bl);
7c673cae
FG
4250
4251 // include scatterlock info for any bounding CDirs
4252 bufferlist bounding;
4253 if (inode.is_dir())
94b18763
FG
4254 for (const auto &p : dirfrags) {
4255 CDir *dir = p.second;
7c673cae 4256 if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
11fdf7f2
TL
4257 encode(p.first, bounding);
4258 encode(dir->fnode.fragstat, bounding);
4259 encode(dir->fnode.accounted_fragstat, bounding);
4260 encode(dir->fnode.rstat, bounding);
4261 encode(dir->fnode.accounted_rstat, bounding);
7c673cae
FG
4262 dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
4263 }
4264 }
11fdf7f2 4265 encode(bounding, bl);
7c673cae
FG
4266
4267 _encode_locks_full(bl);
4268
4269 _encode_file_locks(bl);
4270
4271 ENCODE_FINISH(bl);
4272
4273 get(PIN_TEMPEXPORTING);
4274}
4275
11fdf7f2 4276void CInode::finish_export()
7c673cae
FG
4277{
4278 state &= MASK_STATE_EXPORT_KEPT;
4279
11fdf7f2 4280 pop.zero();
7c673cae
FG
4281
4282 // just in case!
4283 //dirlock.clear_updated();
4284
4285 loner_cap = -1;
4286
4287 put(PIN_TEMPEXPORTING);
4288}
4289
11fdf7f2 4290void CInode::decode_import(bufferlist::const_iterator& p,
7c673cae
FG
4291 LogSegment *ls)
4292{
4293 DECODE_START(5, p);
4294
4295 _decode_base(p);
4296
f6b5b4d7
TL
4297 {
4298 unsigned s;
4299 decode(s, p);
4300 s &= MASK_STATE_EXPORTED;
4301
4302 if (s & STATE_RANDEPHEMERALPIN) {
4303 set_ephemeral_rand(true);
4304 }
4305 if (s & STATE_DISTEPHEMERALPIN) {
4306 set_ephemeral_dist(true);
4307 }
4308
4309 state_set(STATE_AUTH | s);
4310 }
7c673cae
FG
4311
4312 if (is_dirty()) {
4313 get(PIN_DIRTY);
4314 _mark_dirty(ls);
4315 }
4316 if (is_dirty_parent()) {
4317 get(PIN_DIRTYPARENT);
28e407b8 4318 mark_dirty_parent(ls);
7c673cae
FG
4319 }
4320
11fdf7f2 4321 decode(pop, p);
7c673cae 4322
11fdf7f2 4323 decode(get_replicas(), p);
181888fb 4324 if (is_replicated())
7c673cae
FG
4325 get(PIN_REPLICATED);
4326 replica_nonce = 0;
4327
4328 // decode fragstat info on bounding cdirs
4329 bufferlist bounding;
11fdf7f2
TL
4330 decode(bounding, p);
4331 auto q = bounding.cbegin();
7c673cae
FG
4332 while (!q.end()) {
4333 frag_t fg;
11fdf7f2 4334 decode(fg, q);
7c673cae 4335 CDir *dir = get_dirfrag(fg);
11fdf7f2 4336 ceph_assert(dir); // we should have all bounds open
7c673cae
FG
4337
4338 // Only take the remote's fragstat/rstat if we are non-auth for
4339 // this dirfrag AND the lock is NOT in a scattered (MIX) state.
4340 // We know lock is stable, and MIX is the only state in which
4341 // the inode auth (who sent us this data) may not have the best
4342 // info.
4343
4344 // HMM: Are there cases where dir->is_auth() is an insufficient
4345 // check because the dirfrag is under migration? That implies
4346 // it is frozen (and in a SYNC or LOCK state). FIXME.
4347
4348 if (dir->is_auth() ||
4349 filelock.get_state() == LOCK_MIX) {
4350 dout(10) << " skipped fragstat info for " << *dir << dendl;
4351 frag_info_t f;
11fdf7f2
TL
4352 decode(f, q);
4353 decode(f, q);
7c673cae 4354 } else {
11fdf7f2
TL
4355 decode(dir->fnode.fragstat, q);
4356 decode(dir->fnode.accounted_fragstat, q);
7c673cae
FG
4357 dout(10) << " took fragstat info for " << *dir << dendl;
4358 }
4359 if (dir->is_auth() ||
4360 nestlock.get_state() == LOCK_MIX) {
4361 dout(10) << " skipped rstat info for " << *dir << dendl;
4362 nest_info_t n;
11fdf7f2
TL
4363 decode(n, q);
4364 decode(n, q);
7c673cae 4365 } else {
11fdf7f2
TL
4366 decode(dir->fnode.rstat, q);
4367 decode(dir->fnode.accounted_rstat, q);
7c673cae
FG
4368 dout(10) << " took rstat info for " << *dir << dendl;
4369 }
4370 }
4371
4372 _decode_locks_full(p);
4373
4374 _decode_file_locks(p);
4375
4376 DECODE_FINISH(p);
4377}
4378
4379
4380void InodeStoreBase::dump(Formatter *f) const
4381{
4382 inode.dump(f);
4383 f->dump_string("symlink", symlink);
9f95a23c
TL
4384
4385 f->open_array_section("xattrs");
4386 for (const auto& [key, val] : xattrs) {
4387 f->open_object_section("xattr");
4388 f->dump_string("key", key);
4389 std::string v(val.c_str(), val.length());
4390 f->dump_string("val", v);
4391 f->close_section();
4392 }
4393 f->close_section();
4394 f->open_object_section("dirfragtree");
4395 dirfragtree.dump(f);
4396 f->close_section(); // dirfragtree
4397
7c673cae 4398 f->open_array_section("old_inodes");
94b18763 4399 for (const auto &p : old_inodes) {
7c673cae 4400 f->open_object_section("old_inode");
94b18763
FG
4401 // The key is the last snapid, the first is in the mempool_old_inode
4402 f->dump_int("last", p.first);
4403 p.second.dump(f);
7c673cae
FG
4404 f->close_section(); // old_inode
4405 }
4406 f->close_section(); // old_inodes
4407
9f95a23c
TL
4408 f->dump_unsigned("oldest_snap", oldest_snap);
4409 f->dump_unsigned("damage_flags", damage_flags);
7c673cae
FG
4410}
4411
4412
9f95a23c 4413void InodeStore::generate_test_instances(std::list<InodeStore*> &ls)
7c673cae
FG
4414{
4415 InodeStore *populated = new InodeStore;
4416 populated->inode.ino = 0xdeadbeef;
4417 populated->symlink = "rhubarb";
4418 ls.push_back(populated);
4419}
4420
9f95a23c 4421void InodeStoreBare::generate_test_instances(std::list<InodeStoreBare*> &ls)
11fdf7f2
TL
4422{
4423 InodeStoreBare *populated = new InodeStoreBare;
4424 populated->inode.ino = 0xdeadbeef;
4425 populated->symlink = "rhubarb";
4426 ls.push_back(populated);
4427}
4428
7c673cae 4429void CInode::validate_disk_state(CInode::validated_data *results,
11fdf7f2 4430 MDSContext *fin)
7c673cae
FG
4431{
4432 class ValidationContinuation : public MDSContinuation {
4433 public:
11fdf7f2 4434 MDSContext *fin;
7c673cae
FG
4435 CInode *in;
4436 CInode::validated_data *results;
4437 bufferlist bl;
4438 CInode *shadow_in;
4439
4440 enum {
4441 START = 0,
4442 BACKTRACE,
4443 INODE,
11fdf7f2
TL
4444 DIRFRAGS,
4445 SNAPREALM,
7c673cae
FG
4446 };
4447
4448 ValidationContinuation(CInode *i,
4449 CInode::validated_data *data_r,
11fdf7f2 4450 MDSContext *fin_) :
7c673cae
FG
4451 MDSContinuation(i->mdcache->mds->server),
4452 fin(fin_),
4453 in(i),
4454 results(data_r),
4455 shadow_in(NULL) {
4456 set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start));
4457 set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace));
4458 set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk));
4459 set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags));
11fdf7f2 4460 set_callback(SNAPREALM, static_cast<Continuation::stagePtr>(&ValidationContinuation::_snaprealm));
7c673cae
FG
4461 }
4462
4463 ~ValidationContinuation() override {
b32b8144
FG
4464 if (shadow_in) {
4465 delete shadow_in;
4466 in->mdcache->num_shadow_inodes--;
4467 }
7c673cae
FG
4468 }
4469
4470 /**
4471 * Fetch backtrace and set tag if tag is non-empty
4472 */
11fdf7f2
TL
4473 void fetch_backtrace_and_tag(CInode *in,
4474 std::string_view tag, bool is_internal,
7c673cae
FG
4475 Context *fin, int *bt_r, bufferlist *bt)
4476 {
4477 const int64_t pool = in->get_backtrace_pool();
4478 object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");
4479
4480 ObjectOperation fetch;
4481 fetch.getxattr("parent", bt, bt_r);
4482 in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
4483 NULL, 0, fin);
11fdf7f2
TL
4484 using ceph::encode;
4485 if (!is_internal) {
4486 ObjectOperation scrub_tag;
7c673cae 4487 bufferlist tag_bl;
11fdf7f2 4488 encode(tag, tag_bl);
7c673cae
FG
4489 scrub_tag.setxattr("scrub_tag", tag_bl);
4490 SnapContext snapc;
4491 in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
4492 ceph::real_clock::now(),
4493 0, NULL);
4494 }
4495 }
4496
4497 bool _start(int rval) {
4498 if (in->is_dirty()) {
11fdf7f2
TL
4499 MDCache *mdcache = in->mdcache;
4500 mempool_inode& inode = in->inode;
4501 dout(20) << "validating a dirty CInode; results will be inconclusive"
4502 << dendl;
7c673cae
FG
4503 }
4504 if (in->is_symlink()) {
11fdf7f2
TL
4505 // there's nothing to do for symlinks!
4506 return true;
7c673cae
FG
4507 }
4508
11fdf7f2
TL
4509 // prefetch snaprealm's past parents
4510 if (in->snaprealm && !in->snaprealm->have_past_parents_open())
4511 in->snaprealm->open_parents(nullptr);
4512
7c673cae 4513 C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
11fdf7f2
TL
4514 in->mdcache->mds->finisher);
4515
4516 std::string_view tag = in->scrub_infop->header->get_tag();
4517 bool is_internal = in->scrub_infop->header->is_internal_tag();
4518 // Rather than using the usual CInode::fetch_backtrace,
4519 // use a special variant that optionally writes a tag in the same
4520 // operation.
4521 fetch_backtrace_and_tag(in, tag, is_internal, conf, &results->backtrace.ondisk_read_retval, &bl);
7c673cae
FG
4522 return false;
4523 }
4524
4525 bool _backtrace(int rval) {
4526 // set up basic result reporting and make sure we got the data
4527 results->performed_validation = true; // at least, some of it!
4528 results->backtrace.checked = true;
4529
4530 const int64_t pool = in->get_backtrace_pool();
4531 inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
4532 in->build_backtrace(pool, memory_backtrace);
4533 bool equivalent, divergent;
4534 int memory_newer;
4535
4536 MDCache *mdcache = in->mdcache; // For the benefit of dout
94b18763 4537 const mempool_inode& inode = in->inode; // For the benefit of dout
7c673cae
FG
4538
4539 // Ignore rval because it's the result of a FAILOK operation
4540 // from fetch_backtrace_and_tag: the real result is in
4541 // backtrace.ondisk_read_retval
4542 dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
4543 if (results->backtrace.ondisk_read_retval != 0) {
4544 results->backtrace.error_str << "failed to read off disk; see retval";
e306af50
TL
4545 // we probably have a new unwritten file!
4546 // so skip the backtrace scrub for this entry and say that all's well
4547 if (in->is_dirty_parent())
4548 results->backtrace.passed = true;
4549 goto next;
7c673cae
FG
4550 }
4551
4552 // extract the backtrace, and compare it to a newly-constructed one
4553 try {
11fdf7f2
TL
4554 auto p = bl.cbegin();
4555 using ceph::decode;
4556 decode(results->backtrace.ondisk_value, p);
7c673cae
FG
4557 dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
4558 } catch (buffer::error&) {
4559 if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
4560 // Cases where something has clearly gone wrong with the overall
4561 // fetch op, though we didn't get a nonzero rc from the getxattr
4562 // operation. e.g. object missing.
4563 results->backtrace.ondisk_read_retval = rval;
4564 }
4565 results->backtrace.error_str << "failed to decode on-disk backtrace ("
4566 << bl.length() << " bytes)!";
e306af50
TL
4567 // we probably have a new unwritten file!
4568 // so skip the backtrace scrub for this entry and say that all's well
4569 if (in->is_dirty_parent())
4570 results->backtrace.passed = true;
4571
7c673cae
FG
4572 goto next;
4573 }
4574
4575 memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
4576 &equivalent, &divergent);
4577
4578 if (divergent || memory_newer < 0) {
e306af50
TL
4579 // we're divergent, or on-disk version is newer
4580 results->backtrace.error_str << "On-disk backtrace is divergent or newer";
4581 // we probably have a new unwritten file!
4582 // so skip the backtrace scrub for this entry and say that all's well
4583 if (divergent && in->is_dirty_parent())
4584 results->backtrace.passed = true;
7c673cae
FG
4585 } else {
4586 results->backtrace.passed = true;
4587 }
4588next:
4589
4590 if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
4591 std::string path;
4592 in->make_path_string(path);
d2e6a577
FG
4593 in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
4594 << "(" << path << "), rewriting it";
28e407b8 4595 in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
7c673cae 4596 false);
b32b8144
FG
4597 // Flag that we repaired this BT so that it won't go into damagetable
4598 results->backtrace.repaired = true;
7c673cae
FG
4599 }
4600
4601 // If the inode's number was free in the InoTable, fix that
4602 // (#15619)
4603 {
4604 InoTable *inotable = mdcache->mds->inotable;
4605
d2e6a577 4606 dout(10) << "scrub: inotable ino = " << inode.ino << dendl;
7c673cae
FG
4607 dout(10) << "scrub: inotable free says "
4608 << inotable->is_marked_free(inode.ino) << dendl;
4609
4610 if (inotable->is_marked_free(inode.ino)) {
4611 LogChannelRef clog = in->mdcache->mds->clog;
11fdf7f2 4612 clog->error() << "scrub: inode wrongly marked free: " << inode.ino;
7c673cae
FG
4613
4614 if (in->scrub_infop->header->get_repair()) {
4615 bool repaired = inotable->repair(inode.ino);
4616 if (repaired) {
11fdf7f2 4617 clog->error() << "inode table repaired for inode: " << inode.ino;
7c673cae
FG
4618
4619 inotable->save();
4620 } else {
4621 clog->error() << "Cannot repair inotable while other operations"
4622 " are in progress";
4623 }
4624 }
4625 }
4626 }
4627
7c673cae 4628
11fdf7f2
TL
4629 if (in->is_dir()) {
4630 return validate_directory_data();
4631 } else {
4632 // TODO: validate on-disk inode for normal files
4633 return check_inode_snaprealm();
4634 }
7c673cae
FG
4635 }
4636
4637 bool validate_directory_data() {
11fdf7f2 4638 ceph_assert(in->is_dir());
7c673cae
FG
4639
4640 if (in->is_base()) {
b32b8144
FG
4641 if (!shadow_in) {
4642 shadow_in = new CInode(in->mdcache);
4643 in->mdcache->create_unlinked_system_inode(shadow_in, in->inode.ino, in->inode.mode);
4644 in->mdcache->num_shadow_inodes++;
4645 }
7c673cae
FG
4646 shadow_in->fetch(get_internal_callback(INODE));
4647 return false;
4648 } else {
11fdf7f2 4649 // TODO: validate on-disk inode for non-base directories
7c673cae 4650 results->inode.passed = true;
11fdf7f2 4651 return check_dirfrag_rstats();
7c673cae
FG
4652 }
4653 }
4654
4655 bool _inode_disk(int rval) {
4656 results->inode.checked = true;
4657 results->inode.ondisk_read_retval = rval;
4658 results->inode.ondisk_value = shadow_in->inode;
4659 results->inode.memory_value = in->inode;
4660
94b18763
FG
4661 mempool_inode& si = shadow_in->inode;
4662 mempool_inode& i = in->inode;
7c673cae
FG
4663 if (si.version > i.version) {
4664 // uh, what?
11fdf7f2 4665 results->inode.error_str << "On-disk inode is newer than in-memory one; ";
7c673cae
FG
4666 goto next;
4667 } else {
4668 bool divergent = false;
4669 int r = i.compare(si, &divergent);
4670 results->inode.passed = !divergent && r >= 0;
4671 if (!results->inode.passed) {
4672 results->inode.error_str <<
11fdf7f2 4673 "On-disk inode is divergent or newer than in-memory one; ";
7c673cae
FG
4674 goto next;
4675 }
4676 }
4677next:
4678 return check_dirfrag_rstats();
4679 }
4680
4681 bool check_dirfrag_rstats() {
4682 MDSGatherBuilder gather(g_ceph_context);
11fdf7f2
TL
4683 frag_vec_t leaves;
4684 in->dirfragtree.get_leaves(leaves);
4685 for (const auto& leaf : leaves) {
4686 CDir *dir = in->get_or_open_dirfrag(in->mdcache, leaf);
7c673cae
FG
4687 dir->scrub_info();
4688 if (!dir->scrub_infop->header)
4689 dir->scrub_infop->header = in->scrub_infop->header;
4690 if (dir->is_complete()) {
4691 dir->scrub_local();
4692 } else {
4693 dir->scrub_infop->need_scrub_local = true;
4694 dir->fetch(gather.new_sub(), false);
4695 }
4696 }
4697 if (gather.has_subs()) {
4698 gather.set_finisher(get_internal_callback(DIRFRAGS));
4699 gather.activate();
4700 return false;
4701 } else {
4702 return immediate(DIRFRAGS, 0);
4703 }
4704 }
4705
4706 bool _dirfrags(int rval) {
4707 int frags_errors = 0;
4708 // basic reporting setup
4709 results->raw_stats.checked = true;
4710 results->raw_stats.ondisk_read_retval = rval;
4711
4712 results->raw_stats.memory_value.dirstat = in->inode.dirstat;
4713 results->raw_stats.memory_value.rstat = in->inode.rstat;
4714 frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
4715 nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;
4716
4717 if (rval != 0) {
4718 results->raw_stats.error_str << "Failed to read dirfrags off disk";
4719 goto next;
4720 }
4721
4722 // check each dirfrag...
94b18763
FG
4723 for (const auto &p : in->dirfrags) {
4724 CDir *dir = p.second;
11fdf7f2 4725 ceph_assert(dir->get_version() > 0);
7c673cae
FG
4726 nest_info.add(dir->fnode.accounted_rstat);
4727 dir_info.add(dir->fnode.accounted_fragstat);
11fdf7f2 4728 if (dir->scrub_infop->pending_scrub_error) {
7c673cae
FG
4729 dir->scrub_infop->pending_scrub_error = false;
4730 if (dir->scrub_infop->header->get_repair()) {
b32b8144 4731 results->raw_stats.repaired = true;
7c673cae 4732 results->raw_stats.error_str
94b18763 4733 << "dirfrag(" << p.first << ") has bad stats (will be fixed); ";
7c673cae
FG
4734 } else {
4735 results->raw_stats.error_str
94b18763 4736 << "dirfrag(" << p.first << ") has bad stats; ";
7c673cae
FG
4737 }
4738 frags_errors++;
4739 }
4740 }
4741 nest_info.rsubdirs++; // it gets one to account for self
11fdf7f2
TL
4742 if (const sr_t *srnode = in->get_projected_srnode(); srnode)
4743 nest_info.rsnaps += srnode->snaps.size();
4744
7c673cae
FG
4745 // ...and that their sum matches our inode settings
4746 if (!dir_info.same_sums(in->inode.dirstat) ||
4747 !nest_info.same_sums(in->inode.rstat)) {
11fdf7f2 4748 if (in->scrub_infop->header->get_repair()) {
7c673cae
FG
4749 results->raw_stats.error_str
4750 << "freshly-calculated rstats don't match existing ones (will be fixed)";
4751 in->mdcache->repair_inode_stats(in);
b32b8144 4752 results->raw_stats.repaired = true;
7c673cae
FG
4753 } else {
4754 results->raw_stats.error_str
4755 << "freshly-calculated rstats don't match existing ones";
4756 }
4757 goto next;
4758 }
4759 if (frags_errors > 0)
4760 goto next;
4761
4762 results->raw_stats.passed = true;
4763next:
11fdf7f2
TL
4764 // snaprealm
4765 return check_inode_snaprealm();
4766 }
4767
4768 bool check_inode_snaprealm() {
4769 if (!in->snaprealm)
4770 return true;
4771
4772 if (!in->snaprealm->have_past_parents_open()) {
4773 in->snaprealm->open_parents(get_internal_callback(SNAPREALM));
4774 return false;
4775 } else {
4776 return immediate(SNAPREALM, 0);
4777 }
4778 }
4779
4780 bool _snaprealm(int rval) {
4781
4782 if (in->snaprealm->past_parents_dirty ||
4783 !in->get_projected_srnode()->past_parents.empty()) {
4784 // temporarily store error in field of on-disk inode validation temporarily
4785 results->inode.checked = true;
4786 results->inode.passed = false;
4787 if (in->scrub_infop->header->get_repair()) {
4788 results->inode.error_str << "Inode has old format snaprealm (will upgrade)";
4789 results->inode.repaired = true;
4790 in->mdcache->upgrade_inode_snaprealm(in);
4791 } else {
4792 results->inode.error_str << "Inode has old format snaprealm";
4793 }
4794 }
7c673cae
FG
4795 return true;
4796 }
4797
4798 void _done() override {
4799 if ((!results->raw_stats.checked || results->raw_stats.passed) &&
4800 (!results->backtrace.checked || results->backtrace.passed) &&
4801 (!results->inode.checked || results->inode.passed))
11fdf7f2
TL
4802 results->passed_validation = true;
4803
4804 // Flag that we did some repair work so that our repair operation
4805 // can be flushed at end of scrub
4806 if (results->backtrace.repaired ||
4807 results->inode.repaired ||
4808 results->raw_stats.repaired)
4809 in->scrub_infop->header->set_repaired();
4810 if (fin)
4811 fin->complete(get_rval());
7c673cae
FG
4812 }
4813 };
4814
4815
4816 dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
4817 ValidationContinuation *vc = new ValidationContinuation(this,
4818 results,
4819 fin);
4820 vc->begin();
4821}
4822
4823void CInode::validated_data::dump(Formatter *f) const
4824{
4825 f->open_object_section("results");
4826 {
4827 f->dump_bool("performed_validation", performed_validation);
4828 f->dump_bool("passed_validation", passed_validation);
4829 f->open_object_section("backtrace");
4830 {
4831 f->dump_bool("checked", backtrace.checked);
4832 f->dump_bool("passed", backtrace.passed);
4833 f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
4834 f->dump_stream("ondisk_value") << backtrace.ondisk_value;
4835 f->dump_stream("memoryvalue") << backtrace.memory_value;
4836 f->dump_string("error_str", backtrace.error_str.str());
4837 }
4838 f->close_section(); // backtrace
4839 f->open_object_section("raw_stats");
4840 {
4841 f->dump_bool("checked", raw_stats.checked);
4842 f->dump_bool("passed", raw_stats.passed);
4843 f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
4844 f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
4845 f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
4846 f->dump_stream("memory_value.dirrstat") << raw_stats.memory_value.dirstat;
4847 f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
4848 f->dump_string("error_str", raw_stats.error_str.str());
4849 }
4850 f->close_section(); // raw_stats
4851 // dump failure return code
4852 int rc = 0;
4853 if (backtrace.checked && backtrace.ondisk_read_retval)
4854 rc = backtrace.ondisk_read_retval;
4855 if (inode.checked && inode.ondisk_read_retval)
4856 rc = inode.ondisk_read_retval;
4857 if (raw_stats.checked && raw_stats.ondisk_read_retval)
4858 rc = raw_stats.ondisk_read_retval;
4859 f->dump_int("return_code", rc);
4860 }
4861 f->close_section(); // results
4862}
4863
b32b8144
FG
4864bool CInode::validated_data::all_damage_repaired() const
4865{
4866 bool unrepaired =
4867 (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired)
4868 ||
4869 (backtrace.checked && !backtrace.passed && !backtrace.repaired)
4870 ||
4871 (inode.checked && !inode.passed && !inode.repaired);
4872
4873 return !unrepaired;
4874}
4875
11fdf7f2
TL
4876void CInode::dump(Formatter *f, int flags) const
4877{
4878 if (flags & DUMP_PATH) {
4879 std::string path;
4880 make_path_string(path, true);
4881 if (path.empty())
4882 path = "/";
4883 f->dump_string("path", path);
4884 }
4885
4886 if (flags & DUMP_INODE_STORE_BASE)
4887 InodeStoreBase::dump(f);
4888
4889 if (flags & DUMP_MDS_CACHE_OBJECT)
4890 MDSCacheObject::dump(f);
4891
4892 if (flags & DUMP_LOCKS) {
4893 f->open_object_section("versionlock");
4894 versionlock.dump(f);
4895 f->close_section();
4896
4897 f->open_object_section("authlock");
4898 authlock.dump(f);
4899 f->close_section();
4900
4901 f->open_object_section("linklock");
4902 linklock.dump(f);
4903 f->close_section();
4904
4905 f->open_object_section("dirfragtreelock");
4906 dirfragtreelock.dump(f);
4907 f->close_section();
4908
4909 f->open_object_section("filelock");
4910 filelock.dump(f);
4911 f->close_section();
4912
4913 f->open_object_section("xattrlock");
4914 xattrlock.dump(f);
4915 f->close_section();
4916
4917 f->open_object_section("snaplock");
4918 snaplock.dump(f);
4919 f->close_section();
4920
4921 f->open_object_section("nestlock");
4922 nestlock.dump(f);
4923 f->close_section();
4924
4925 f->open_object_section("flocklock");
4926 flocklock.dump(f);
4927 f->close_section();
4928
4929 f->open_object_section("policylock");
4930 policylock.dump(f);
4931 f->close_section();
4932 }
4933
4934 if (flags & DUMP_STATE) {
4935 f->open_array_section("states");
4936 MDSCacheObject::dump_states(f);
4937 if (state_test(STATE_EXPORTING))
4938 f->dump_string("state", "exporting");
4939 if (state_test(STATE_OPENINGDIR))
4940 f->dump_string("state", "openingdir");
4941 if (state_test(STATE_FREEZING))
4942 f->dump_string("state", "freezing");
4943 if (state_test(STATE_FROZEN))
4944 f->dump_string("state", "frozen");
4945 if (state_test(STATE_AMBIGUOUSAUTH))
4946 f->dump_string("state", "ambiguousauth");
4947 if (state_test(STATE_EXPORTINGCAPS))
4948 f->dump_string("state", "exportingcaps");
4949 if (state_test(STATE_NEEDSRECOVER))
4950 f->dump_string("state", "needsrecover");
4951 if (state_test(STATE_PURGING))
4952 f->dump_string("state", "purging");
4953 if (state_test(STATE_DIRTYPARENT))
4954 f->dump_string("state", "dirtyparent");
4955 if (state_test(STATE_DIRTYRSTAT))
4956 f->dump_string("state", "dirtyrstat");
4957 if (state_test(STATE_STRAYPINNED))
4958 f->dump_string("state", "straypinned");
4959 if (state_test(STATE_FROZENAUTHPIN))
4960 f->dump_string("state", "frozenauthpin");
4961 if (state_test(STATE_DIRTYPOOL))
4962 f->dump_string("state", "dirtypool");
4963 if (state_test(STATE_ORPHAN))
4964 f->dump_string("state", "orphan");
4965 if (state_test(STATE_MISSINGOBJS))
4966 f->dump_string("state", "missingobjs");
7c673cae
FG
4967 f->close_section();
4968 }
7c673cae 4969
11fdf7f2
TL
4970 if (flags & DUMP_CAPS) {
4971 f->open_array_section("client_caps");
4972 for (const auto &p : client_caps) {
4973 auto &client = p.first;
4974 auto cap = &p.second;
4975 f->open_object_section("client_cap");
4976 f->dump_int("client_id", client.v);
4977 f->dump_string("pending", ccap_string(cap->pending()));
4978 f->dump_string("issued", ccap_string(cap->issued()));
4979 f->dump_string("wanted", ccap_string(cap->wanted()));
4980 f->dump_int("last_sent", cap->get_last_seq());
4981 f->close_section();
4982 }
4983 f->close_section();
4984
4985 f->dump_int("loner", loner_cap.v);
4986 f->dump_int("want_loner", want_loner_cap.v);
4987
4988 f->open_array_section("mds_caps_wanted");
4989 for (const auto &p : mds_caps_wanted) {
4990 f->open_object_section("mds_cap_wanted");
4991 f->dump_int("rank", p.first);
4992 f->dump_string("cap", ccap_string(p.second));
4993 f->close_section();
4994 }
4995 f->close_section();
4996 }
7c673cae 4997
11fdf7f2
TL
4998 if (flags & DUMP_DIRFRAGS) {
4999 f->open_array_section("dirfrags");
9f95a23c 5000 auto&& dfs = get_dirfrags();
11fdf7f2
TL
5001 for(const auto &dir: dfs) {
5002 f->open_object_section("dir");
5003 dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
5004 dir->check_rstats();
5005 f->close_section();
5006 }
7c673cae
FG
5007 f->close_section();
5008 }
7c673cae
FG
5009}
5010
5011/****** Scrub Stuff *****/
5012void CInode::scrub_info_create() const
5013{
5014 dout(25) << __func__ << dendl;
11fdf7f2 5015 ceph_assert(!scrub_infop);
7c673cae
FG
5016
5017 // break out of const-land to set up implicit initial state
5018 CInode *me = const_cast<CInode*>(this);
94b18763 5019 mempool_inode *in = me->get_projected_inode();
7c673cae
FG
5020
5021 scrub_info_t *si = new scrub_info_t();
5022 si->scrub_start_stamp = si->last_scrub_stamp = in->last_scrub_stamp;
5023 si->scrub_start_version = si->last_scrub_version = in->last_scrub_version;
5024
5025 me->scrub_infop = si;
5026}
5027
5028void CInode::scrub_maybe_delete_info()
5029{
5030 if (scrub_infop &&
5031 !scrub_infop->scrub_in_progress &&
5032 !scrub_infop->last_scrub_dirty) {
5033 delete scrub_infop;
5034 scrub_infop = NULL;
5035 }
5036}
5037
5038void CInode::scrub_initialize(CDentry *scrub_parent,
b32b8144 5039 ScrubHeaderRef& header,
11fdf7f2 5040 MDSContext *f)
7c673cae
FG
5041{
5042 dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
94b18763
FG
5043 if (scrub_is_in_progress()) {
5044 dout(20) << __func__ << " inode moved during scrub, reinitializing "
5045 << dendl;
11fdf7f2 5046 ceph_assert(scrub_infop->scrub_parent);
94b18763
FG
5047 CDentry *dn = scrub_infop->scrub_parent;
5048 CDir *dir = dn->dir;
5049 dn->put(CDentry::PIN_SCRUBPARENT);
11fdf7f2 5050 ceph_assert(dir->scrub_infop && dir->scrub_infop->directory_scrubbing);
94b18763
FG
5051 dir->scrub_infop->directories_scrubbing.erase(dn->key());
5052 dir->scrub_infop->others_scrubbing.erase(dn->key());
5053 }
7c673cae
FG
5054 scrub_info();
5055 if (!scrub_infop)
5056 scrub_infop = new scrub_info_t();
5057
5058 if (get_projected_inode()->is_dir()) {
5059 // fill in dirfrag_stamps with initial state
11fdf7f2
TL
5060 frag_vec_t leaves;
5061 dirfragtree.get_leaves(leaves);
5062 for (const auto& leaf : leaves) {
7c673cae 5063 if (header->get_force())
11fdf7f2 5064 scrub_infop->dirfrag_stamps[leaf].reset();
7c673cae 5065 else
11fdf7f2 5066 scrub_infop->dirfrag_stamps[leaf];
7c673cae
FG
5067 }
5068 }
5069
5070 if (scrub_parent)
5071 scrub_parent->get(CDentry::PIN_SCRUBPARENT);
5072 scrub_infop->scrub_parent = scrub_parent;
5073 scrub_infop->on_finish = f;
5074 scrub_infop->scrub_in_progress = true;
5075 scrub_infop->children_scrubbed = false;
5076 scrub_infop->header = header;
5077
5078 scrub_infop->scrub_start_version = get_version();
5079 scrub_infop->scrub_start_stamp = ceph_clock_now();
5080 // right now we don't handle remote inodes
5081}
5082
5083int CInode::scrub_dirfrag_next(frag_t* out_dirfrag)
5084{
5085 dout(20) << __func__ << dendl;
11fdf7f2 5086 ceph_assert(scrub_is_in_progress());
7c673cae
FG
5087
5088 if (!is_dir()) {
5089 return -ENOTDIR;
5090 }
5091
5092 std::map<frag_t, scrub_stamp_info_t>::iterator i =
5093 scrub_infop->dirfrag_stamps.begin();
5094
5095 while (i != scrub_infop->dirfrag_stamps.end()) {
5096 if (i->second.scrub_start_version < scrub_infop->scrub_start_version) {
5097 i->second.scrub_start_version = get_projected_version();
5098 i->second.scrub_start_stamp = ceph_clock_now();
5099 *out_dirfrag = i->first;
5100 dout(20) << " return frag " << *out_dirfrag << dendl;
5101 return 0;
5102 }
5103 ++i;
5104 }
5105
5106 dout(20) << " no frags left, ENOENT " << dendl;
5107 return ENOENT;
5108}
5109
11fdf7f2 5110void CInode::scrub_dirfrags_scrubbing(frag_vec_t* out_dirfrags)
7c673cae 5111{
11fdf7f2
TL
5112 ceph_assert(out_dirfrags != NULL);
5113 ceph_assert(scrub_infop != NULL);
7c673cae
FG
5114
5115 out_dirfrags->clear();
5116 std::map<frag_t, scrub_stamp_info_t>::iterator i =
5117 scrub_infop->dirfrag_stamps.begin();
5118
5119 while (i != scrub_infop->dirfrag_stamps.end()) {
5120 if (i->second.scrub_start_version >= scrub_infop->scrub_start_version) {
5121 if (i->second.last_scrub_version < scrub_infop->scrub_start_version)
5122 out_dirfrags->push_back(i->first);
5123 } else {
5124 return;
5125 }
5126
5127 ++i;
5128 }
5129}
5130
5131void CInode::scrub_dirfrag_finished(frag_t dirfrag)
5132{
5133 dout(20) << __func__ << " on frag " << dirfrag << dendl;
11fdf7f2 5134 ceph_assert(scrub_is_in_progress());
7c673cae
FG
5135
5136 std::map<frag_t, scrub_stamp_info_t>::iterator i =
5137 scrub_infop->dirfrag_stamps.find(dirfrag);
11fdf7f2 5138 ceph_assert(i != scrub_infop->dirfrag_stamps.end());
7c673cae
FG
5139
5140 scrub_stamp_info_t &si = i->second;
5141 si.last_scrub_stamp = si.scrub_start_stamp;
5142 si.last_scrub_version = si.scrub_start_version;
5143}
5144
11fdf7f2
TL
5145void CInode::scrub_aborted(MDSContext **c) {
5146 dout(20) << __func__ << dendl;
5147 ceph_assert(scrub_is_in_progress());
5148
5149 *c = nullptr;
5150 std::swap(*c, scrub_infop->on_finish);
5151
5152 if (scrub_infop->scrub_parent) {
5153 CDentry *dn = scrub_infop->scrub_parent;
5154 scrub_infop->scrub_parent = NULL;
5155 dn->dir->scrub_dentry_finished(dn);
5156 dn->put(CDentry::PIN_SCRUBPARENT);
5157 }
5158
5159 delete scrub_infop;
5160 scrub_infop = nullptr;
5161}
5162
5163void CInode::scrub_finished(MDSContext **c) {
7c673cae 5164 dout(20) << __func__ << dendl;
11fdf7f2 5165 ceph_assert(scrub_is_in_progress());
7c673cae
FG
5166 for (std::map<frag_t, scrub_stamp_info_t>::iterator i =
5167 scrub_infop->dirfrag_stamps.begin();
5168 i != scrub_infop->dirfrag_stamps.end();
5169 ++i) {
5170 if(i->second.last_scrub_version != i->second.scrub_start_version) {
5171 derr << i->second.last_scrub_version << " != "
5172 << i->second.scrub_start_version << dendl;
5173 }
11fdf7f2 5174 ceph_assert(i->second.last_scrub_version == i->second.scrub_start_version);
7c673cae
FG
5175 }
5176
5177 scrub_infop->last_scrub_version = scrub_infop->scrub_start_version;
5178 scrub_infop->last_scrub_stamp = scrub_infop->scrub_start_stamp;
5179 scrub_infop->last_scrub_dirty = true;
5180 scrub_infop->scrub_in_progress = false;
5181
5182 if (scrub_infop->scrub_parent) {
5183 CDentry *dn = scrub_infop->scrub_parent;
5184 scrub_infop->scrub_parent = NULL;
5185 dn->dir->scrub_dentry_finished(dn);
5186 dn->put(CDentry::PIN_SCRUBPARENT);
5187 }
5188
5189 *c = scrub_infop->on_finish;
5190 scrub_infop->on_finish = NULL;
5191
5192 if (scrub_infop->header->get_origin() == this) {
5193 // We are at the point that a tagging scrub was initiated
5194 LogChannelRef clog = mdcache->mds->clog;
11fdf7f2
TL
5195 clog->info() << "scrub complete with tag '"
5196 << scrub_infop->header->get_tag() << "'";
7c673cae
FG
5197 }
5198}
5199
5200int64_t CInode::get_backtrace_pool() const
5201{
5202 if (is_dir()) {
5203 return mdcache->mds->mdsmap->get_metadata_pool();
5204 } else {
5205 // Files are required to have an explicit layout that specifies
5206 // a pool
11fdf7f2 5207 ceph_assert(inode.layout.pool_id != -1);
7c673cae
FG
5208 return inode.layout.pool_id;
5209 }
5210}
5211
f6b5b4d7 5212void CInode::queue_export_pin(mds_rank_t target)
31f18b77 5213{
31f18b77
FG
5214 if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
5215 return;
5216
5217 bool queue = false;
f6b5b4d7
TL
5218 for (auto& p : dirfrags) {
5219 CDir *dir = p.second;
31f18b77
FG
5220 if (!dir->is_auth())
5221 continue;
f6b5b4d7 5222 if (target != MDS_RANK_NONE) {
31f18b77
FG
5223 if (dir->is_subtree_root()) {
5224 // set auxsubtree bit or export it
5225 if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
f6b5b4d7 5226 target != dir->get_dir_auth().first)
31f18b77
FG
5227 queue = true;
5228 } else {
5229 // create aux subtree or export it
5230 queue = true;
7c673cae 5231 }
31f18b77
FG
5232 } else {
5233 // clear aux subtrees ?
5234 queue = dir->state_test(CDir::STATE_AUXSUBTREE);
5235 }
5236 if (queue) {
5237 state_set(CInode::STATE_QUEUEDEXPORTPIN);
7c673cae 5238 mdcache->export_pin_queue.insert(this);
31f18b77 5239 break;
7c673cae
FG
5240 }
5241 }
5242}
5243
f6b5b4d7
TL
5244void CInode::maybe_export_pin(bool update)
5245{
5246 if (!g_conf()->mds_bal_export_pin)
5247 return;
5248 if (!is_dir() || !is_normal())
5249 return;
5250
5251 dout(15) << __func__ << " update=" << update << " " << *this << dendl;
5252
5253 mds_rank_t export_pin = get_export_pin(false, false);
5254 if (export_pin == MDS_RANK_NONE && !update) {
5255 return;
5256 }
5257
5258 /* disable ephemeral pins */
5259 set_ephemeral_dist(false);
5260 set_ephemeral_rand(false);
5261 queue_export_pin(export_pin);
5262}
5263
5264void CInode::set_ephemeral_dist(bool yes)
5265{
5266 if (yes) {
5267 if (!state_test(CInode::STATE_DISTEPHEMERALPIN)) {
5268 state_set(CInode::STATE_DISTEPHEMERALPIN);
5269 auto p = mdcache->dist_ephemeral_pins.insert(this);
5270 ceph_assert(p.second);
5271 }
5272 } else {
5273 /* avoid std::set::erase if unnecessary */
5274 if (state_test(CInode::STATE_DISTEPHEMERALPIN)) {
5275 dout(10) << "clearing ephemeral distributed pin on " << *this << dendl;
5276 state_clear(CInode::STATE_DISTEPHEMERALPIN);
5277 auto count = mdcache->dist_ephemeral_pins.erase(this);
5278 ceph_assert(count == 1);
5279 queue_export_pin(MDS_RANK_NONE);
5280 }
5281 }
5282}
5283
5284void CInode::maybe_ephemeral_dist(bool update)
5285{
5286 if (!mdcache->get_export_ephemeral_distributed_config()) {
5287 dout(15) << __func__ << " config false: cannot ephemeral distributed pin " << *this << dendl;
5288 set_ephemeral_dist(false);
5289 return;
5290 } else if (!is_dir() || !is_normal()) {
5291 dout(15) << __func__ << " !dir or !normal: cannot ephemeral distributed pin " << *this << dendl;
5292 set_ephemeral_dist(false);
5293 return;
5294 } else if (get_inode().nlink == 0) {
5295 dout(15) << __func__ << " unlinked directory: cannot ephemeral distributed pin " << *this << dendl;
5296 set_ephemeral_dist(false);
5297 return;
5298 } else if (!update && state_test(CInode::STATE_DISTEPHEMERALPIN)) {
5299 dout(15) << __func__ << " requeueing already pinned " << *this << dendl;
5300 queue_export_pin(mdcache->hash_into_rank_bucket(ino()));
5301 return;
5302 }
5303
5304 dout(15) << __func__ << " update=" << update << " " << *this << dendl;
5305
5306 auto dir = get_parent_dir();
5307 if (!dir) {
5308 return;
5309 }
5310
5311 bool pin = dir->get_inode()->get_inode().export_ephemeral_distributed_pin;
5312 if (pin) {
5313 dout(10) << __func__ << " ephemeral distributed pinning " << *this << dendl;
5314 set_ephemeral_dist(true);
5315 queue_export_pin(mdcache->hash_into_rank_bucket(ino()));
5316 } else if (update) {
5317 set_ephemeral_dist(false);
5318 queue_export_pin(MDS_RANK_NONE);
5319 }
5320}
5321
5322void CInode::maybe_ephemeral_dist_children(bool update)
5323{
5324 if (!mdcache->get_export_ephemeral_distributed_config()) {
5325 dout(15) << __func__ << " config false: cannot ephemeral distributed pin " << *this << dendl;
5326 return;
5327 } else if (!is_dir() || !is_normal()) {
5328 dout(15) << __func__ << " !dir or !normal: cannot ephemeral distributed pin " << *this << dendl;
5329 return;
5330 } else if (get_inode().nlink == 0) {
5331 dout(15) << __func__ << " unlinked directory: cannot ephemeral distributed pin " << *this << dendl;
5332 return;
5333 }
5334
5335 bool pin = get_inode().export_ephemeral_distributed_pin;
5336 /* FIXME: expensive to iterate children when not updating */
5337 if (!pin && !update) {
5338 return;
5339 }
5340
5341 dout(10) << __func__ << " maybe ephemerally pinning children of " << *this << dendl;
5342 for (auto& p : dirfrags) {
5343 auto& dir = p.second;
5344 for (auto& q : *dir) {
5345 auto& dn = q.second;
5346 auto&& in = dn->get_linkage()->get_inode();
5347 if (in && in->is_dir()) {
5348 in->maybe_ephemeral_dist(update);
5349 }
5350 }
5351 }
5352}
5353
5354void CInode::set_ephemeral_rand(bool yes)
5355{
5356 if (yes) {
5357 if (!state_test(CInode::STATE_RANDEPHEMERALPIN)) {
5358 state_set(CInode::STATE_RANDEPHEMERALPIN);
5359 auto p = mdcache->rand_ephemeral_pins.insert(this);
5360 ceph_assert(p.second);
5361 }
5362 } else {
5363 if (state_test(CInode::STATE_RANDEPHEMERALPIN)) {
5364 dout(10) << "clearing ephemeral random pin on " << *this << dendl;
5365 state_clear(CInode::STATE_RANDEPHEMERALPIN);
5366 auto count = mdcache->rand_ephemeral_pins.erase(this);
5367 ceph_assert(count == 1);
5368 queue_export_pin(MDS_RANK_NONE);
5369 }
5370 }
5371}
5372
f91f0fd5 5373void CInode::maybe_ephemeral_rand(bool fresh, double threshold)
f6b5b4d7
TL
5374{
5375 if (!mdcache->get_export_ephemeral_random_config()) {
5376 dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl;
5377 set_ephemeral_rand(false);
5378 return;
5379 } else if (!is_dir() || !is_normal()) {
5380 dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl;
5381 set_ephemeral_rand(false);
5382 return;
5383 } else if (get_inode().nlink == 0) {
5384 dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl;
5385 set_ephemeral_rand(false);
5386 return;
5387 } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) {
5388 dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl;
5389 queue_export_pin(mdcache->hash_into_rank_bucket(ino()));
5390 return;
5391 } else if (!fresh) {
5392 return;
5393 }
5394
f91f0fd5
TL
5395 /* not precomputed? */
5396 if (threshold < 0.0) {
5397 threshold = get_ephemeral_rand();
5398 }
5399 if (threshold <= 0.0) {
5400 return;
5401 }
f6b5b4d7
TL
5402 double n = ceph::util::generate_random_number(0.0, 1.0);
5403
5404 dout(15) << __func__ << " rand " << n << " <?= " << threshold
5405 << " " << *this << dendl;
5406
5407 if (n <= threshold) {
5408 dout(10) << __func__ << " randomly export pinning " << *this << dendl;
5409 set_ephemeral_rand(true);
5410 queue_export_pin(mdcache->hash_into_rank_bucket(ino()));
5411 }
5412}
5413
5414void CInode::setxattr_ephemeral_rand(double probability)
5415{
5416 ceph_assert(is_dir());
5417 ceph_assert(is_projected());
5418 get_projected_inode()->export_ephemeral_random_pin = probability;
5419}
5420
5421void CInode::setxattr_ephemeral_dist(bool val)
5422{
5423 ceph_assert(is_dir());
5424 ceph_assert(is_projected());
5425 get_projected_inode()->export_ephemeral_distributed_pin = val;
5426}
5427
7c673cae
FG
5428void CInode::set_export_pin(mds_rank_t rank)
5429{
11fdf7f2
TL
5430 ceph_assert(is_dir());
5431 ceph_assert(is_projected());
7c673cae 5432 get_projected_inode()->export_pin = rank;
7c673cae
FG
5433}
5434
f6b5b4d7
TL
5435void CInode::check_pin_policy()
5436{
5437 const CInode *in = this;
5438 mds_rank_t etarget = MDS_RANK_NONE;
5439 while (true) {
5440 if (in->is_system())
5441 break;
5442 const CDentry *pdn = in->get_parent_dn();
5443 if (!pdn)
5444 break;
5445 if (in->get_inode().nlink == 0) {
5446 // ignore export pin for unlinked directory
5447 return;
5448 } else if (etarget != MDS_RANK_NONE && in->has_ephemeral_policy()) {
5449 return;
5450 } else if (in->get_inode().export_pin >= 0) {
5451 /* clear any epin policy */
5452 set_ephemeral_dist(false);
5453 set_ephemeral_rand(false);
5454 return;
5455 } else if (etarget == MDS_RANK_NONE && in->is_ephemerally_pinned()) {
5456 /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
5457 etarget = mdcache->hash_into_rank_bucket(in->ino());
5458 }
5459 in = pdn->get_dir()->inode;
5460 }
5461}
5462
5463mds_rank_t CInode::get_export_pin(bool inherit, bool ephemeral) const
7c673cae
FG
5464{
5465 /* An inode that is export pinned may not necessarily be a subtree root, we
5466 * need to traverse the parents. A base or system inode cannot be pinned.
5467 * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5468 * have a parent yet.
5469 */
b32b8144 5470 const CInode *in = this;
f6b5b4d7
TL
5471 mds_rank_t etarget = MDS_RANK_NONE;
5472 while (true) {
5473 if (in->is_system())
5474 break;
5475 const CDentry *pdn = in->get_parent_dn();
5476 if (!pdn)
5477 break;
5478 if (in->get_inode().nlink == 0) {
5479 // ignore export pin for unlinked directory
5480 return MDS_RANK_NONE;
5481 } else if (etarget != MDS_RANK_NONE && in->has_ephemeral_policy()) {
5482 return etarget;
5483 } else if (in->get_inode().export_pin >= 0) {
5484 return in->get_inode().export_pin;
5485 } else if (etarget == MDS_RANK_NONE && ephemeral && in->is_ephemerally_pinned()) {
5486 /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
5487 etarget = mdcache->hash_into_rank_bucket(in->ino());
5488 if (!inherit) return etarget;
5489 }
5490
5491 if (!inherit) {
5492 break;
5493 }
5494 in = pdn->get_dir()->inode;
5495 }
5496 return MDS_RANK_NONE;
5497}
5498
5499double CInode::get_ephemeral_rand(bool inherit) const
5500{
5501 /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5502 * have a parent yet.
5503 */
5504 const CInode *in = this;
5505 double max = mdcache->export_ephemeral_random_max;
b32b8144
FG
5506 while (true) {
5507 if (in->is_system())
5508 break;
f64942e4 5509 const CDentry *pdn = in->get_parent_dn();
b32b8144
FG
5510 if (!pdn)
5511 break;
b32b8144 5512 // ignore export pin for unlinked directory
f64942e4 5513 if (in->get_inode().nlink == 0)
b32b8144 5514 break;
f6b5b4d7
TL
5515
5516 if (in->get_inode().export_ephemeral_random_pin > 0.0)
5517 return std::min(in->get_inode().export_ephemeral_random_pin, max);
5518
5519 /* An export_pin overrides only if no closer parent (incl. this one) has a
5520 * random pin set.
5521 */
f64942e4 5522 if (in->get_inode().export_pin >= 0)
f6b5b4d7 5523 return 0.0;
b32b8144
FG
5524
5525 if (!inherit)
5526 break;
5527 in = pdn->get_dir()->inode;
7c673cae 5528 }
f6b5b4d7 5529 return 0.0;
7c673cae
FG
5530}
5531
5532bool CInode::is_exportable(mds_rank_t dest) const
5533{
5534 mds_rank_t pin = get_export_pin();
5535 if (pin == dest) {
5536 return true;
5537 } else if (pin >= 0) {
5538 return false;
5539 } else {
5540 return true;
5541 }
5542}
181888fb 5543
9f95a23c
TL
5544void CInode::get_nested_dirfrags(std::vector<CDir*>& v) const
5545{
5546 for (const auto &p : dirfrags) {
5547 const auto& dir = p.second;
5548 if (!dir->is_subtree_root())
5549 v.push_back(dir);
5550 }
5551}
5552
5553void CInode::get_subtree_dirfrags(std::vector<CDir*>& v) const
5554{
5555 for (const auto &p : dirfrags) {
5556 const auto& dir = p.second;
5557 if (dir->is_subtree_root())
5558 v.push_back(dir);
5559 }
5560}
5561
181888fb 5562MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);