]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/CInode.cc
import 15.2.4
[ceph.git] / ceph / src / mds / CInode.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "include/int_types.h"
16#include "common/errno.h"
17
18#include <string>
19#include <stdio.h>
20
21#include "CInode.h"
22#include "CDir.h"
23#include "CDentry.h"
24
25#include "MDSRank.h"
26#include "MDCache.h"
27#include "MDLog.h"
28#include "Locker.h"
29#include "Mutation.h"
30
31#include "events/EUpdate.h"
32
33#include "osdc/Objecter.h"
34
35#include "snap.h"
36
37#include "LogSegment.h"
38
39#include "common/Clock.h"
40
7c673cae
FG
41#include "common/config.h"
42#include "global/global_context.h"
11fdf7f2 43#include "include/ceph_assert.h"
7c673cae
FG
44
45#include "mds/MDSContinuation.h"
46#include "mds/InoTable.h"
11fdf7f2 47#include "cephfs_features.h"
7c673cae
FG
48
49#define dout_context g_ceph_context
50#define dout_subsys ceph_subsys_mds
51#undef dout_prefix
52#define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "
53
54
55class CInodeIOContext : public MDSIOContextBase
56{
57protected:
58 CInode *in;
59 MDSRank *get_mds() override {return in->mdcache->mds;}
60public:
61 explicit CInodeIOContext(CInode *in_) : in(in_) {
11fdf7f2 62 ceph_assert(in != NULL);
7c673cae
FG
63 }
64};
65
11fdf7f2 66sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1;
7c673cae
FG
67
68LockType CInode::versionlock_type(CEPH_LOCK_IVERSION);
69LockType CInode::authlock_type(CEPH_LOCK_IAUTH);
70LockType CInode::linklock_type(CEPH_LOCK_ILINK);
71LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT);
72LockType CInode::filelock_type(CEPH_LOCK_IFILE);
73LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR);
74LockType CInode::snaplock_type(CEPH_LOCK_ISNAP);
75LockType CInode::nestlock_type(CEPH_LOCK_INEST);
76LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK);
77LockType CInode::policylock_type(CEPH_LOCK_IPOLICY);
78
9f95a23c
TL
79std::string_view CInode::pin_name(int p) const
80{
81 switch (p) {
82 case PIN_DIRFRAG: return "dirfrag";
83 case PIN_CAPS: return "caps";
84 case PIN_IMPORTING: return "importing";
85 case PIN_OPENINGDIR: return "openingdir";
86 case PIN_REMOTEPARENT: return "remoteparent";
87 case PIN_BATCHOPENJOURNAL: return "batchopenjournal";
88 case PIN_SCATTERED: return "scattered";
89 case PIN_STICKYDIRS: return "stickydirs";
90 //case PIN_PURGING: return "purging";
91 case PIN_FREEZING: return "freezing";
92 case PIN_FROZEN: return "frozen";
93 case PIN_IMPORTINGCAPS: return "importingcaps";
94 case PIN_EXPORTINGCAPS: return "exportingcaps";
95 case PIN_PASTSNAPPARENT: return "pastsnapparent";
96 case PIN_OPENINGSNAPPARENTS: return "openingsnapparents";
97 case PIN_TRUNCATING: return "truncating";
98 case PIN_STRAY: return "stray";
99 case PIN_NEEDSNAPFLUSH: return "needsnapflush";
100 case PIN_DIRTYRSTAT: return "dirtyrstat";
101 case PIN_DIRTYPARENT: return "dirtyparent";
102 case PIN_DIRWAITER: return "dirwaiter";
103 case PIN_SCRUBQUEUE: return "scrubqueue";
104 default: return generic_pin_name(p);
105 }
106}
107
7c673cae
FG
108//int cinode_pins[CINODE_NUM_PINS]; // counts
109ostream& CInode::print_db_line_prefix(ostream& out)
110{
111 return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") ";
112}
113
114/*
115 * write caps and lock ids
116 */
117struct cinode_lock_info_t cinode_lock_info[] = {
118 { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR },
119 { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL },
120 { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL },
121 { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL },
122};
123int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]);
124
7c673cae
FG
125ostream& operator<<(ostream& out, const CInode& in)
126{
127 string path;
128 in.make_path_string(path, true);
129
130 out << "[inode " << in.inode.ino;
131 out << " ["
132 << (in.is_multiversion() ? "...":"")
133 << in.first << "," << in.last << "]";
134 out << " " << path << (in.is_dir() ? "/":"");
135
136 if (in.is_auth()) {
137 out << " auth";
138 if (in.is_replicated())
139 out << in.get_replicas();
140 } else {
141 mds_authority_t a = in.authority();
142 out << " rep@" << a.first;
143 if (a.second != CDIR_AUTH_UNKNOWN)
144 out << "," << a.second;
145 out << "." << in.get_replica_nonce();
146 }
147
148 if (in.is_symlink())
149 out << " symlink='" << in.symlink << "'";
150 if (in.is_dir() && !in.dirfragtree.empty())
151 out << " " << in.dirfragtree;
152
153 out << " v" << in.get_version();
154 if (in.get_projected_version() > in.get_version())
155 out << " pv" << in.get_projected_version();
156
11fdf7f2
TL
157 if (in.get_num_auth_pins()) {
158 out << " ap=" << in.get_num_auth_pins();
7c673cae 159#ifdef MDS_AUTHPIN_SET
11fdf7f2 160 in.print_authpin_set(out);
7c673cae
FG
161#endif
162 }
163
164 if (in.snaprealm)
165 out << " snaprealm=" << in.snaprealm;
166
167 if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
168 if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " needsrecover";
169 if (in.state_test(CInode::STATE_RECOVERING)) out << " recovering";
170 if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent";
171 if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " missingobjs";
172 if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
173 if (in.is_frozen_inode()) out << " FROZEN";
174 if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";
175
94b18763 176 const CInode::mempool_inode *pi = in.get_projected_inode();
7c673cae
FG
177 if (pi->is_truncating())
178 out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")";
179
180 if (in.inode.is_dir()) {
181 out << " " << in.inode.dirstat;
11fdf7f2 182 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
94b18763 183 const CInode::mempool_inode *pi = in.get_projected_inode();
7c673cae
FG
184 out << "->" << pi->dirstat;
185 }
186 } else {
187 out << " s=" << in.inode.size;
188 if (in.inode.nlink != 1)
189 out << " nl=" << in.inode.nlink;
190 }
191
192 // rstat
193 out << " " << in.inode.rstat;
194 if (!(in.inode.rstat == in.inode.accounted_rstat))
195 out << "/" << in.inode.accounted_rstat;
11fdf7f2 196 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
94b18763 197 const CInode::mempool_inode *pi = in.get_projected_inode();
7c673cae
FG
198 out << "->" << pi->rstat;
199 if (!(pi->rstat == pi->accounted_rstat))
200 out << "/" << pi->accounted_rstat;
201 }
202
203 if (!in.client_need_snapflush.empty())
204 out << " need_snapflush=" << in.client_need_snapflush;
205
206
207 // locks
208 if (!in.authlock.is_sync_and_unlocked())
209 out << " " << in.authlock;
210 if (!in.linklock.is_sync_and_unlocked())
211 out << " " << in.linklock;
212 if (in.inode.is_dir()) {
213 if (!in.dirfragtreelock.is_sync_and_unlocked())
214 out << " " << in.dirfragtreelock;
215 if (!in.snaplock.is_sync_and_unlocked())
216 out << " " << in.snaplock;
217 if (!in.nestlock.is_sync_and_unlocked())
218 out << " " << in.nestlock;
219 if (!in.policylock.is_sync_and_unlocked())
220 out << " " << in.policylock;
221 } else {
222 if (!in.flocklock.is_sync_and_unlocked())
223 out << " " << in.flocklock;
224 }
225 if (!in.filelock.is_sync_and_unlocked())
226 out << " " << in.filelock;
227 if (!in.xattrlock.is_sync_and_unlocked())
228 out << " " << in.xattrlock;
229 if (!in.versionlock.is_sync_and_unlocked())
230 out << " " << in.versionlock;
231
232 // hack: spit out crap on which clients have caps
233 if (in.inode.client_ranges.size())
234 out << " cr=" << in.inode.client_ranges;
235
236 if (!in.get_client_caps().empty()) {
237 out << " caps={";
11fdf7f2
TL
238 bool first = true;
239 for (const auto &p : in.get_client_caps()) {
240 if (!first) out << ",";
241 out << p.first << "="
242 << ccap_string(p.second.pending());
243 if (p.second.issued() != p.second.pending())
244 out << "/" << ccap_string(p.second.issued());
245 out << "/" << ccap_string(p.second.wanted())
246 << "@" << p.second.get_last_seq();
247 first = false;
7c673cae
FG
248 }
249 out << "}";
250 if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) {
251 out << ",l=" << in.get_loner();
252 if (in.get_loner() != in.get_wanted_loner())
253 out << "(" << in.get_wanted_loner() << ")";
254 }
255 }
256 if (!in.get_mds_caps_wanted().empty()) {
257 out << " mcw={";
94b18763
FG
258 bool first = true;
259 for (const auto &p : in.get_mds_caps_wanted()) {
260 if (!first)
7c673cae 261 out << ',';
94b18763
FG
262 out << p.first << '=' << ccap_string(p.second);
263 first = false;
7c673cae
FG
264 }
265 out << '}';
266 }
267
268 if (in.get_num_ref()) {
269 out << " |";
270 in.print_pin_set(out);
271 }
272
273 if (in.inode.export_pin != MDS_RANK_NONE) {
274 out << " export_pin=" << in.inode.export_pin;
275 }
276
277 out << " " << &in;
278 out << "]";
279 return out;
280}
281
282ostream& operator<<(ostream& out, const CInode::scrub_stamp_info_t& si)
283{
284 out << "{scrub_start_version: " << si.scrub_start_version
285 << ", scrub_start_stamp: " << si.scrub_start_stamp
286 << ", last_scrub_version: " << si.last_scrub_version
287 << ", last_scrub_stamp: " << si.last_scrub_stamp;
288 return out;
289}
290
11fdf7f2
TL
291CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l)
292 :
293 mdcache(c),
294 first(f), last(l),
295 item_dirty(this),
296 item_caps(this),
297 item_open_file(this),
298 item_dirty_parent(this),
299 item_dirty_dirfrag_dir(this),
300 item_dirty_dirfrag_nest(this),
301 item_dirty_dirfrag_dirfragtree(this),
302 pop(c->decayrate),
303 versionlock(this, &versionlock_type),
304 authlock(this, &authlock_type),
305 linklock(this, &linklock_type),
306 dirfragtreelock(this, &dirfragtreelock_type),
307 filelock(this, &filelock_type),
308 xattrlock(this, &xattrlock_type),
309 snaplock(this, &snaplock_type),
310 nestlock(this, &nestlock_type),
311 flocklock(this, &flocklock_type),
312 policylock(this, &policylock_type)
313{
314 if (auth) state_set(STATE_AUTH);
315}
7c673cae
FG
316
317void CInode::print(ostream& out)
318{
319 out << *this;
320}
321
7c673cae
FG
322void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
323{
11fdf7f2 324 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
7c673cae
FG
325
326 if (client_need_snapflush.empty()) {
327 get(CInode::PIN_NEEDSNAPFLUSH);
328
329 // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
330 // long periods waiting for clients to flush their snaps.
331 auth_pin(this); // pin head inode...
332 }
333
94b18763 334 auto &clients = client_need_snapflush[snapid];
7c673cae
FG
335 if (clients.empty())
336 snapin->auth_pin(this); // ...and pin snapped/old inode!
337
338 clients.insert(client);
339}
340
341void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
342{
94b18763
FG
343 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
344 auto it = client_need_snapflush.find(snapid);
345 if (it == client_need_snapflush.end()) {
7c673cae
FG
346 dout(10) << " snapid not found" << dendl;
347 return;
348 }
94b18763
FG
349 size_t n = it->second.erase(client);
350 if (n == 0) {
7c673cae
FG
351 dout(10) << " client not found" << dendl;
352 return;
353 }
94b18763
FG
354 if (it->second.empty()) {
355 client_need_snapflush.erase(it);
7c673cae
FG
356 snapin->auth_unpin(this);
357
358 if (client_need_snapflush.empty()) {
359 put(CInode::PIN_NEEDSNAPFLUSH);
360 auth_unpin(this);
361 }
362 }
363}
364
494da23a 365pair<bool,bool> CInode::split_need_snapflush(CInode *cowin, CInode *in)
7c673cae 366{
11fdf7f2 367 dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
494da23a
TL
368 bool cowin_need_flush = false;
369 bool orig_need_flush = false;
370 auto it = client_need_snapflush.lower_bound(cowin->first);
371 while (it != client_need_snapflush.end() && it->first < in->first) {
11fdf7f2 372 ceph_assert(!it->second.empty());
94b18763 373 if (cowin->last >= it->first) {
7c673cae 374 cowin->auth_pin(this);
494da23a 375 cowin_need_flush = true;
94b18763
FG
376 ++it;
377 } else {
378 it = client_need_snapflush.erase(it);
379 }
7c673cae
FG
380 in->auth_unpin(this);
381 }
494da23a
TL
382
383 if (it != client_need_snapflush.end() && it->first <= in->last)
384 orig_need_flush = true;
385
386 return make_pair(cowin_need_flush, orig_need_flush);
7c673cae
FG
387}
388
389void CInode::mark_dirty_rstat()
390{
391 if (!state_test(STATE_DIRTYRSTAT)) {
11fdf7f2 392 dout(10) << __func__ << dendl;
7c673cae
FG
393 state_set(STATE_DIRTYRSTAT);
394 get(PIN_DIRTYRSTAT);
224ce89b
WB
395 CDentry *pdn = get_projected_parent_dn();
396 if (pdn->is_auth()) {
397 CDir *pdir = pdn->dir;
398 pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item);
399 mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock);
400 } else {
401 // under cross-MDS rename.
402 // DIRTYRSTAT flag will get cleared when rename finishes
11fdf7f2 403 ceph_assert(state_test(STATE_AMBIGUOUSAUTH));
224ce89b 404 }
7c673cae
FG
405 }
406}
407void CInode::clear_dirty_rstat()
408{
409 if (state_test(STATE_DIRTYRSTAT)) {
11fdf7f2 410 dout(10) << __func__ << dendl;
7c673cae
FG
411 state_clear(STATE_DIRTYRSTAT);
412 put(PIN_DIRTYRSTAT);
413 dirty_rstat_item.remove_myself();
414 }
415}
416
94b18763
FG
417CInode::projected_inode &CInode::project_inode(bool xattr, bool snap)
418{
11fdf7f2
TL
419 auto &pi = projected_nodes.empty() ?
420 projected_nodes.emplace_back(inode) :
94b18763 421 projected_nodes.emplace_back(projected_nodes.back().inode);
7c673cae
FG
422
423 if (scrub_infop && scrub_infop->last_scrub_dirty) {
94b18763
FG
424 pi.inode.last_scrub_stamp = scrub_infop->last_scrub_stamp;
425 pi.inode.last_scrub_version = scrub_infop->last_scrub_version;
7c673cae
FG
426 scrub_infop->last_scrub_dirty = false;
427 scrub_maybe_delete_info();
428 }
94b18763
FG
429
430 if (xattr) {
431 pi.xattrs.reset(new mempool_xattr_map(*get_projected_xattrs()));
432 ++num_projected_xattrs;
433 }
434
435 if (snap) {
11fdf7f2 436 project_snaprealm();
94b18763
FG
437 }
438
439 dout(15) << __func__ << " " << pi.inode.ino << dendl;
440 return pi;
7c673cae
FG
441}
442
443void CInode::pop_and_dirty_projected_inode(LogSegment *ls)
444{
11fdf7f2 445 ceph_assert(!projected_nodes.empty());
94b18763
FG
446 auto &front = projected_nodes.front();
447 dout(15) << __func__ << " " << front.inode.ino
448 << " v" << front.inode.version << dendl;
7c673cae
FG
449 int64_t old_pool = inode.layout.pool_id;
450
94b18763 451 mark_dirty(front.inode.version, ls);
f64942e4 452 bool new_export_pin = inode.export_pin != front.inode.export_pin;
94b18763 453 inode = front.inode;
f64942e4
AA
454 if (new_export_pin)
455 maybe_export_pin(true);
7c673cae
FG
456
457 if (inode.is_backtrace_updated())
28e407b8 458 mark_dirty_parent(ls, old_pool != inode.layout.pool_id);
7c673cae 459
94b18763 460 if (front.xattrs) {
7c673cae 461 --num_projected_xattrs;
94b18763 462 xattrs = *front.xattrs;
7c673cae
FG
463 }
464
11fdf7f2
TL
465 if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) {
466 pop_projected_snaprealm(projected_nodes.front().snapnode, false);
7c673cae
FG
467 --num_projected_srnodes;
468 }
469
7c673cae
FG
470 projected_nodes.pop_front();
471}
472
9f95a23c
TL
473CInode::mempool_xattr_map *CInode::get_projected_xattrs()
474{
475 if (num_projected_xattrs > 0) {
476 for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
477 if (it->xattrs)
478 return it->xattrs.get();
479 }
480 return &xattrs;
481}
482
483CInode::mempool_xattr_map *CInode::get_previous_projected_xattrs()
484{
485 if (num_projected_xattrs > 0) {
486 for (auto it = ++projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
487 if (it->xattrs)
488 return it->xattrs.get();
489 }
490 return &xattrs;
491}
492
11fdf7f2
TL
493sr_t *CInode::prepare_new_srnode(snapid_t snapid)
494{
495 const sr_t *cur_srnode = get_projected_srnode();
496 sr_t *new_srnode;
497
498 if (cur_srnode) {
499 new_srnode = new sr_t(*cur_srnode);
500 if (!new_srnode->past_parents.empty()) {
501 // convert past_parents to past_parent_snaps
502 ceph_assert(snaprealm);
503 auto& snaps = snaprealm->get_snaps();
504 for (auto p : snaps) {
505 if (p >= new_srnode->current_parent_since)
506 break;
507 if (!new_srnode->snaps.count(p))
508 new_srnode->past_parent_snaps.insert(p);
509 }
510 new_srnode->seq = snaprealm->get_newest_seq();
511 new_srnode->past_parents.clear();
512 }
513 if (snaprealm)
514 snaprealm->past_parents_dirty = false;
515 } else {
516 if (snapid == 0)
517 snapid = mdcache->get_global_snaprealm()->get_newest_seq();
518 new_srnode = new sr_t();
519 new_srnode->seq = snapid;
520 new_srnode->created = snapid;
521 new_srnode->current_parent_since = get_oldest_snap();
522 }
523 return new_srnode;
524}
525
9f95a23c
TL
526const sr_t *CInode::get_projected_srnode() const {
527 if (num_projected_srnodes > 0) {
528 for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
529 if (it->snapnode != projected_inode::UNDEF_SRNODE)
530 return it->snapnode;
531 }
532 if (snaprealm)
533 return &snaprealm->srnode;
534 else
535 return NULL;
536}
537
11fdf7f2
TL
538void CInode::project_snaprealm(sr_t *new_srnode)
539{
540 dout(10) << __func__ << " " << new_srnode << dendl;
541 ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE);
542 projected_nodes.back().snapnode = new_srnode;
543 ++num_projected_srnodes;
544}
545
546void CInode::mark_snaprealm_global(sr_t *new_srnode)
547{
548 ceph_assert(!is_dir());
549 // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since'
550 new_srnode->last_destroyed = new_srnode->current_parent_since;
551 new_srnode->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
552 new_srnode->mark_parent_global();
553}
554
555void CInode::clear_snaprealm_global(sr_t *new_srnode)
556{
557 // restore 'current_parent_since'
558 new_srnode->current_parent_since = new_srnode->last_destroyed;
559 new_srnode->last_destroyed = 0;
560 new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq();
561 new_srnode->clear_parent_global();
562}
563
564bool CInode::is_projected_snaprealm_global() const
565{
566 const sr_t *srnode = get_projected_srnode();
567 if (srnode && srnode->is_parent_global())
568 return true;
569 return false;
570}
571
572void CInode::project_snaprealm_past_parent(SnapRealm *newparent)
573{
574 sr_t *new_snap = project_snaprealm();
575 record_snaprealm_past_parent(new_snap, newparent);
576}
577
578
7c673cae
FG
579/* if newparent != parent, add parent to past_parents
580 if parent DNE, we need to find what the parent actually is and fill that in */
11fdf7f2 581void CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent)
7c673cae 582{
11fdf7f2 583 ceph_assert(!new_snap->is_parent_global());
7c673cae
FG
584 SnapRealm *oldparent;
585 if (!snaprealm) {
586 oldparent = find_snaprealm();
11fdf7f2 587 } else {
7c673cae 588 oldparent = snaprealm->parent;
11fdf7f2 589 }
7c673cae
FG
590
591 if (newparent != oldparent) {
592 snapid_t oldparentseq = oldparent->get_newest_seq();
11fdf7f2
TL
593 if (oldparentseq + 1 > new_snap->current_parent_since) {
594 // copy old parent's snaps
595 const set<snapid_t>& snaps = oldparent->get_snaps();
596 auto p = snaps.lower_bound(new_snap->current_parent_since);
597 if (p != snaps.end())
598 new_snap->past_parent_snaps.insert(p, snaps.end());
599 if (oldparentseq > new_snap->seq)
600 new_snap->seq = oldparentseq;
7c673cae 601 }
11fdf7f2 602 new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae
FG
603 }
604}
605
11fdf7f2
TL
606void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *newparent,
607 CDentry *dn, bool primary_dn)
7c673cae 608{
11fdf7f2
TL
609 ceph_assert(new_snap->is_parent_global());
610 SnapRealm *oldparent = dn->get_dir()->inode->find_snaprealm();
611 auto& snaps = oldparent->get_snaps();
612
613 if (!primary_dn) {
614 auto p = snaps.lower_bound(dn->first);
615 if (p != snaps.end())
616 new_snap->past_parent_snaps.insert(p, snaps.end());
617 } else if (newparent != oldparent) {
618 // 'last_destroyed' is used as 'current_parent_since'
619 auto p = snaps.lower_bound(new_snap->last_destroyed);
620 if (p != snaps.end())
621 new_snap->past_parent_snaps.insert(p, snaps.end());
622 new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
623 }
624}
7c673cae 625
11fdf7f2
TL
626void CInode::early_pop_projected_snaprealm()
627{
628 ceph_assert(!projected_nodes.empty());
629 if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) {
630 pop_projected_snaprealm(projected_nodes.front().snapnode, true);
631 projected_nodes.front().snapnode = projected_inode::UNDEF_SRNODE;
632 --num_projected_srnodes;
7c673cae 633 }
11fdf7f2 634}
7c673cae 635
11fdf7f2
TL
636void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early)
637{
638 if (next_snaprealm) {
639 dout(10) << __func__ << (early ? " (early) " : " ")
640 << next_snaprealm << " seq " << next_snaprealm->seq << dendl;
641 bool invalidate_cached_snaps = false;
642 if (!snaprealm) {
643 open_snaprealm();
644 } else if (!snaprealm->srnode.past_parents.empty()) {
645 invalidate_cached_snaps = true;
646 // re-open past parents
647 snaprealm->close_parents();
648
649 dout(10) << " realm " << *snaprealm << " past_parents " << snaprealm->srnode.past_parents
650 << " -> " << next_snaprealm->past_parents << dendl;
651 }
652 auto old_flags = snaprealm->srnode.flags;
653 snaprealm->srnode = *next_snaprealm;
654 delete next_snaprealm;
7c673cae 655
11fdf7f2
TL
656 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
657 snaprealm->close_parents();
658 snaprealm->adjust_parent();
659 }
7c673cae 660
11fdf7f2
TL
661 // we should be able to open these up (or have them already be open).
662 bool ok = snaprealm->_open_parents(NULL);
663 ceph_assert(ok);
664
665 if (invalidate_cached_snaps)
666 snaprealm->invalidate_cached_snaps();
667
668 if (snaprealm->parent)
669 dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl;
670 } else {
671 dout(10) << __func__ << (early ? " (early) null" : " null") << dendl;
672 ceph_assert(snaprealm);
673 snaprealm->merge_to(NULL);
674 }
7c673cae
FG
675}
676
677
678// ====== CInode =======
679
680// dirfrags
681
11fdf7f2 682__u32 InodeStoreBase::hash_dentry_name(std::string_view dn)
7c673cae
FG
683{
684 int which = inode.dir_layout.dl_dir_hash;
685 if (!which)
686 which = CEPH_STR_HASH_LINUX;
11fdf7f2 687 ceph_assert(ceph_str_hash_valid(which));
7c673cae
FG
688 return ceph_str_hash(which, dn.data(), dn.length());
689}
690
11fdf7f2 691frag_t InodeStoreBase::pick_dirfrag(std::string_view dn)
7c673cae
FG
692{
693 if (dirfragtree.empty())
694 return frag_t(); // avoid the string hash if we can.
695
696 __u32 h = hash_dentry_name(dn);
697 return dirfragtree[h];
698}
699
9f95a23c 700std::pair<bool, std::vector<CDir*>> CInode::get_dirfrags_under(frag_t fg)
7c673cae 701{
9f95a23c
TL
702 std::pair<bool, std::vector<CDir*>> result;
703 auto& all = result.first;
704 auto& dirs = result.second;
705 all = false;
706
707 if (auto it = dirfrags.find(fg); it != dirfrags.end()){
708 all = true;
709 dirs.push_back(it->second);
710 return result;
7c673cae 711 }
9f95a23c
TL
712
713 int total = 0;
714 for(auto &[_fg, _dir] : dirfrags){
715 // frag_t.bits() can indicate the depth of the partition in the directory tree
716 // e.g.
717 // 01* : bit = 2, on the second floor
718 // *
719 // 0* 1*
720 // 00* 01* 10* 11* -- > level 2, bit = 2
721 // so fragA.bits > fragB.bits means fragA is deeper than fragB
722
723 if (fg.bits() >= _fg.bits()) {
724 if (_fg.contains(fg)) {
725 all = true;
726 return result;
727 }
728 } else {
729 if (fg.contains(_fg)) {
730 dirs.push_back(_dir);
731 // we can calculate how many sub slices a slice can be divided into
732 // frag_t(*) can be divided into two frags belonging to the first layer(0* 1*)
733 // or 2^2 frags belonging to the second layer(00* 01* 10* 11*)
734 // or (1 << (24 - frag_t(*).bits)) frags belonging to the 24th level
735 total += 1 << (24 - _fg.bits());
11fdf7f2 736 }
7c673cae 737 }
94b18763 738 }
7c673cae 739
9f95a23c
TL
740 // we convert all the frags into the frags of 24th layer to calculate whether all the frags are included in the memory cache
741 all = ((1<<(24-fg.bits())) == total);
742 return result;
7c673cae
FG
743}
744
745void CInode::verify_dirfrags()
746{
747 bool bad = false;
94b18763
FG
748 for (const auto &p : dirfrags) {
749 if (!dirfragtree.is_leaf(p.first)) {
750 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
751 << ": " << *p.second << dendl;
7c673cae
FG
752 bad = true;
753 }
754 }
11fdf7f2 755 ceph_assert(!bad);
7c673cae
FG
756}
757
758void CInode::force_dirfrags()
759{
760 bool bad = false;
94b18763
FG
761 for (auto &p : dirfrags) {
762 if (!dirfragtree.is_leaf(p.first)) {
763 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
764 << ": " << *p.second << dendl;
7c673cae
FG
765 bad = true;
766 }
767 }
768
769 if (bad) {
11fdf7f2 770 frag_vec_t leaves;
7c673cae 771 dirfragtree.get_leaves(leaves);
11fdf7f2
TL
772 for (const auto& leaf : leaves) {
773 mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true);
774 }
7c673cae
FG
775 }
776
777 verify_dirfrags();
778}
779
780CDir *CInode::get_approx_dirfrag(frag_t fg)
781{
782 CDir *dir = get_dirfrag(fg);
783 if (dir) return dir;
784
785 // find a child?
9f95a23c
TL
786 auto&& p = get_dirfrags_under(fg);
787 if (!p.second.empty())
788 return p.second.front();
7c673cae
FG
789
790 // try parents?
791 while (fg.bits() > 0) {
792 fg = fg.parent();
793 dir = get_dirfrag(fg);
794 if (dir) return dir;
795 }
796 return NULL;
797}
798
7c673cae
FG
799CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg)
800{
11fdf7f2 801 ceph_assert(is_dir());
7c673cae
FG
802
803 // have it?
804 CDir *dir = get_dirfrag(fg);
805 if (!dir) {
806 // create it.
11fdf7f2 807 ceph_assert(is_auth() || mdcache->mds->is_any_replay());
7c673cae
FG
808 dir = new CDir(this, fg, mdcache, is_auth());
809 add_dirfrag(dir);
810 }
811 return dir;
812}
813
814CDir *CInode::add_dirfrag(CDir *dir)
815{
11fdf7f2
TL
816 auto em = dirfrags.emplace(std::piecewise_construct, std::forward_as_tuple(dir->dirfrag().frag), std::forward_as_tuple(dir));
817 ceph_assert(em.second);
7c673cae
FG
818
819 if (stickydir_ref > 0) {
820 dir->state_set(CDir::STATE_STICKY);
821 dir->get(CDir::PIN_STICKY);
822 }
823
824 maybe_export_pin();
825
826 return dir;
827}
828
829void CInode::close_dirfrag(frag_t fg)
830{
11fdf7f2
TL
831 dout(14) << __func__ << " " << fg << dendl;
832 ceph_assert(dirfrags.count(fg));
7c673cae
FG
833
834 CDir *dir = dirfrags[fg];
835 dir->remove_null_dentries();
836
837 // clear dirty flag
838 if (dir->is_dirty())
839 dir->mark_clean();
840
841 if (stickydir_ref > 0) {
842 dir->state_clear(CDir::STATE_STICKY);
843 dir->put(CDir::PIN_STICKY);
844 }
1adf2230
AA
845
846 if (dir->is_subtree_root())
847 num_subtree_roots--;
7c673cae
FG
848
849 // dump any remaining dentries, for debugging purposes
94b18763
FG
850 for (const auto &p : dir->items)
851 dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl;
7c673cae 852
11fdf7f2 853 ceph_assert(dir->get_num_ref() == 0);
7c673cae
FG
854 delete dir;
855 dirfrags.erase(fg);
856}
857
858void CInode::close_dirfrags()
859{
860 while (!dirfrags.empty())
861 close_dirfrag(dirfrags.begin()->first);
862}
863
864bool CInode::has_subtree_root_dirfrag(int auth)
865{
1adf2230
AA
866 if (num_subtree_roots > 0) {
867 if (auth == -1)
7c673cae 868 return true;
1adf2230
AA
869 for (const auto &p : dirfrags) {
870 if (p.second->is_subtree_root() &&
871 p.second->dir_auth.first == auth)
872 return true;
873 }
94b18763 874 }
7c673cae
FG
875 return false;
876}
877
878bool CInode::has_subtree_or_exporting_dirfrag()
879{
1adf2230
AA
880 if (num_subtree_roots > 0 || num_exporting_dirs > 0)
881 return true;
7c673cae
FG
882 return false;
883}
884
885void CInode::get_stickydirs()
886{
887 if (stickydir_ref == 0) {
888 get(PIN_STICKYDIRS);
94b18763
FG
889 for (const auto &p : dirfrags) {
890 p.second->state_set(CDir::STATE_STICKY);
891 p.second->get(CDir::PIN_STICKY);
7c673cae
FG
892 }
893 }
894 stickydir_ref++;
895}
896
897void CInode::put_stickydirs()
898{
11fdf7f2 899 ceph_assert(stickydir_ref > 0);
7c673cae
FG
900 stickydir_ref--;
901 if (stickydir_ref == 0) {
902 put(PIN_STICKYDIRS);
94b18763
FG
903 for (const auto &p : dirfrags) {
904 p.second->state_clear(CDir::STATE_STICKY);
905 p.second->put(CDir::PIN_STICKY);
7c673cae
FG
906 }
907 }
908}
909
910
911
912
913
914// pins
915
916void CInode::first_get()
917{
918 // pin my dentry?
919 if (parent)
920 parent->get(CDentry::PIN_INODEPIN);
921}
922
923void CInode::last_put()
924{
925 // unpin my dentry?
926 if (parent)
927 parent->put(CDentry::PIN_INODEPIN);
928}
929
930void CInode::_put()
931{
932 if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
933 mdcache->maybe_eval_stray(this, true);
934}
935
936void CInode::add_remote_parent(CDentry *p)
937{
938 if (remote_parents.empty())
939 get(PIN_REMOTEPARENT);
940 remote_parents.insert(p);
941}
942void CInode::remove_remote_parent(CDentry *p)
943{
944 remote_parents.erase(p);
945 if (remote_parents.empty())
946 put(PIN_REMOTEPARENT);
947}
948
949
950
951
952CDir *CInode::get_parent_dir()
953{
954 if (parent)
955 return parent->dir;
956 return NULL;
957}
958CDir *CInode::get_projected_parent_dir()
959{
960 CDentry *p = get_projected_parent_dn();
961 if (p)
962 return p->dir;
963 return NULL;
964}
965CInode *CInode::get_parent_inode()
966{
967 if (parent)
968 return parent->dir->inode;
969 return NULL;
970}
971
11fdf7f2 972bool CInode::is_ancestor_of(const CInode *other) const
7c673cae
FG
973{
974 while (other) {
975 if (other == this)
976 return true;
11fdf7f2
TL
977 const CDentry *pdn = other->get_oldest_parent_dn();
978 if (!pdn) {
979 ceph_assert(other->is_base());
7c673cae 980 break;
11fdf7f2
TL
981 }
982 other = pdn->get_dir()->get_inode();
983 }
984 return false;
985}
986
987bool CInode::is_projected_ancestor_of(const CInode *other) const
988{
989 while (other) {
990 if (other == this)
991 return true;
992 const CDentry *pdn = other->get_projected_parent_dn();
993 if (!pdn) {
994 ceph_assert(other->is_base());
995 break;
996 }
997 other = pdn->get_dir()->get_inode();
7c673cae
FG
998 }
999 return false;
1000}
1001
1002/*
1003 * Because a non-directory inode may have multiple links, the use_parent
1004 * argument allows selecting which parent to use for path construction. This
1005 * argument is only meaningful for the final component (i.e. the first of the
1006 * nested calls) because directories cannot have multiple hard links. If
1007 * use_parent is NULL and projected is true, the primary parent's projected
1008 * inode is used all the way up the path chain. Otherwise the primary parent
1009 * stable inode is used.
1010 */
1011void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const
1012{
1013 if (!use_parent) {
1014 use_parent = projected ? get_projected_parent_dn() : parent;
1015 }
1016
1017 if (use_parent) {
1018 use_parent->make_path_string(s, projected);
1019 } else if (is_root()) {
1020 s = "";
1021 } else if (is_mdsdir()) {
1022 char t[40];
1023 uint64_t eino(ino());
1024 eino -= MDS_INO_MDSDIR_OFFSET;
1025 snprintf(t, sizeof(t), "~mds%" PRId64, eino);
1026 s = t;
1027 } else {
1028 char n[40];
1029 uint64_t eino(ino());
1030 snprintf(n, sizeof(n), "#%" PRIx64, eino);
1031 s += n;
1032 }
1033}
1034
1035void CInode::make_path(filepath& fp, bool projected) const
1036{
1037 const CDentry *use_parent = projected ? get_projected_parent_dn() : parent;
1038 if (use_parent) {
11fdf7f2 1039 ceph_assert(!is_base());
7c673cae
FG
1040 use_parent->make_path(fp, projected);
1041 } else {
1042 fp = filepath(ino());
1043 }
1044}
1045
1046void CInode::name_stray_dentry(string& dname)
1047{
1048 char s[20];
1049 snprintf(s, sizeof(s), "%llx", (unsigned long long)inode.ino.val);
1050 dname = s;
1051}
1052
1053version_t CInode::pre_dirty()
1054{
1055 version_t pv;
1056 CDentry* _cdentry = get_projected_parent_dn();
1057 if (_cdentry) {
1058 pv = _cdentry->pre_dirty(get_projected_version());
1059 dout(10) << "pre_dirty " << pv << " (current v " << inode.version << ")" << dendl;
1060 } else {
11fdf7f2 1061 ceph_assert(is_base());
7c673cae
FG
1062 pv = get_projected_version() + 1;
1063 }
94b18763 1064 // force update backtrace for old format inode (see mempool_inode::decode)
7c673cae 1065 if (inode.backtrace_version == 0 && !projected_nodes.empty()) {
94b18763
FG
1066 mempool_inode &pi = projected_nodes.back().inode;
1067 if (pi.backtrace_version == 0)
1068 pi.update_backtrace(pv);
7c673cae
FG
1069 }
1070 return pv;
1071}
1072
1073void CInode::_mark_dirty(LogSegment *ls)
1074{
1075 if (!state_test(STATE_DIRTY)) {
1076 state_set(STATE_DIRTY);
1077 get(PIN_DIRTY);
11fdf7f2 1078 ceph_assert(ls);
7c673cae
FG
1079 }
1080
1081 // move myself to this segment's dirty list
1082 if (ls)
1083 ls->dirty_inodes.push_back(&item_dirty);
1084}
1085
1086void CInode::mark_dirty(version_t pv, LogSegment *ls) {
1087
11fdf7f2 1088 dout(10) << __func__ << " " << *this << dendl;
7c673cae
FG
1089
1090 /*
1091 NOTE: I may already be dirty, but this fn _still_ needs to be called so that
1092 the directory is (perhaps newly) dirtied, and so that parent_dir_version is
1093 updated below.
1094 */
1095
1096 // only auth can get dirty. "dirty" async data in replicas is relative to
1097 // filelock state, not the dirty flag.
11fdf7f2 1098 ceph_assert(is_auth());
7c673cae
FG
1099
1100 // touch my private version
11fdf7f2 1101 ceph_assert(inode.version < pv);
7c673cae
FG
1102 inode.version = pv;
1103 _mark_dirty(ls);
1104
1105 // mark dentry too
1106 if (parent)
1107 parent->mark_dirty(pv, ls);
1108}
1109
1110
1111void CInode::mark_clean()
1112{
11fdf7f2 1113 dout(10) << __func__ << " " << *this << dendl;
7c673cae
FG
1114 if (state_test(STATE_DIRTY)) {
1115 state_clear(STATE_DIRTY);
1116 put(PIN_DIRTY);
1117
1118 // remove myself from ls dirty list
1119 item_dirty.remove_myself();
1120 }
1121}
1122
1123
1124// --------------
1125// per-inode storage
1126// (currently for root inode only)
1127
1128struct C_IO_Inode_Stored : public CInodeIOContext {
1129 version_t version;
1130 Context *fin;
1131 C_IO_Inode_Stored(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1132 void finish(int r) override {
1133 in->_stored(r, version, fin);
1134 }
91327a77
AA
1135 void print(ostream& out) const override {
1136 out << "inode_store(" << in->ino() << ")";
1137 }
7c673cae
FG
1138};
1139
11fdf7f2 1140object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix)
7c673cae
FG
1141{
1142 char n[60];
11fdf7f2
TL
1143 snprintf(n, sizeof(n), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)fg);
1144 ceph_assert(strlen(n) + suffix.size() < sizeof n);
1145 strncat(n, suffix.data(), suffix.size());
7c673cae
FG
1146 return object_t(n);
1147}
1148
11fdf7f2 1149void CInode::store(MDSContext *fin)
7c673cae 1150{
11fdf7f2
TL
1151 dout(10) << __func__ << " " << get_version() << dendl;
1152 ceph_assert(is_base());
7c673cae
FG
1153
1154 if (snaprealm)
1155 purge_stale_snap_data(snaprealm->get_snaps());
1156
1157 // encode
1158 bufferlist bl;
1159 string magic = CEPH_FS_ONDISK_MAGIC;
11fdf7f2
TL
1160 using ceph::encode;
1161 encode(magic, bl);
7c673cae
FG
1162 encode_store(bl, mdcache->mds->mdsmap->get_up_features());
1163
1164 // write it.
1165 SnapContext snapc;
1166 ObjectOperation m;
1167 m.write_full(bl);
1168
1169 object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode");
1170 object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());
1171
1172 Context *newfin =
1173 new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin),
1174 mdcache->mds->finisher);
1175 mdcache->mds->objecter->mutate(oid, oloc, m, snapc,
1176 ceph::real_clock::now(), 0,
1177 newfin);
1178}
1179
1180void CInode::_stored(int r, version_t v, Context *fin)
1181{
1182 if (r < 0) {
1183 dout(1) << "store error " << r << " v " << v << " on " << *this << dendl;
d2e6a577
FG
1184 mdcache->mds->clog->error() << "failed to store inode " << ino()
1185 << " object: " << cpp_strerror(r);
7c673cae
FG
1186 mdcache->mds->handle_write_error(r);
1187 fin->complete(r);
1188 return;
1189 }
1190
11fdf7f2 1191 dout(10) << __func__ << " " << v << " on " << *this << dendl;
7c673cae
FG
1192 if (v == get_projected_version())
1193 mark_clean();
1194
1195 fin->complete(0);
1196}
1197
11fdf7f2 1198void CInode::flush(MDSContext *fin)
7c673cae 1199{
11fdf7f2
TL
1200 dout(10) << __func__ << " " << *this << dendl;
1201 ceph_assert(is_auth() && can_auth_pin());
7c673cae
FG
1202
1203 MDSGatherBuilder gather(g_ceph_context);
1204
1205 if (is_dirty_parent()) {
1206 store_backtrace(gather.new_sub());
1207 }
1208 if (is_dirty()) {
1209 if (is_base()) {
1210 store(gather.new_sub());
1211 } else {
1212 parent->dir->commit(0, gather.new_sub());
1213 }
1214 }
1215
1216 if (gather.has_subs()) {
1217 gather.set_finisher(fin);
1218 gather.activate();
1219 } else {
1220 fin->complete(0);
1221 }
1222}
1223
1224struct C_IO_Inode_Fetched : public CInodeIOContext {
1225 bufferlist bl, bl2;
1226 Context *fin;
1227 C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {}
1228 void finish(int r) override {
1229 // Ignore 'r', because we fetch from two places, so r is usually ENOENT
1230 in->_fetched(bl, bl2, fin);
1231 }
91327a77
AA
1232 void print(ostream& out) const override {
1233 out << "inode_fetch(" << in->ino() << ")";
1234 }
7c673cae
FG
1235};
1236
11fdf7f2 1237void CInode::fetch(MDSContext *fin)
7c673cae 1238{
11fdf7f2 1239 dout(10) << __func__ << dendl;
7c673cae
FG
1240
1241 C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin);
1242 C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, mdcache->mds->finisher));
1243
1244 object_t oid = CInode::get_object_name(ino(), frag_t(), "");
1245 object_locator_t oloc(mdcache->mds->mdsmap->get_metadata_pool());
1246
1247 // Old on-disk format: inode stored in xattr of a dirfrag
1248 ObjectOperation rd;
1249 rd.getxattr("inode", &c->bl, NULL);
1250 mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub());
1251
1252 // Current on-disk format: inode stored in a .inode object
1253 object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode");
1254 mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub());
1255
1256 gather.activate();
1257}
1258
1259void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
1260{
11fdf7f2
TL
1261 dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl;
1262 bufferlist::const_iterator p;
7c673cae 1263 if (bl2.length()) {
11fdf7f2 1264 p = bl2.cbegin();
7c673cae 1265 } else if (bl.length()) {
11fdf7f2 1266 p = bl.cbegin();
7c673cae 1267 } else {
d2e6a577 1268 derr << "No data while reading inode " << ino() << dendl;
7c673cae
FG
1269 fin->complete(-ENOENT);
1270 return;
1271 }
1272
11fdf7f2 1273 using ceph::decode;
7c673cae
FG
1274 // Attempt decode
1275 try {
1276 string magic;
11fdf7f2 1277 decode(magic, p);
7c673cae
FG
1278 dout(10) << " magic is '" << magic << "' (expecting '"
1279 << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
1280 if (magic != CEPH_FS_ONDISK_MAGIC) {
1281 dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
1282 << "'" << dendl;
1283 fin->complete(-EINVAL);
1284 } else {
1285 decode_store(p);
1286 dout(10) << "_fetched " << *this << dendl;
1287 fin->complete(0);
1288 }
1289 } catch (buffer::error &err) {
d2e6a577 1290 derr << "Corrupt inode " << ino() << ": " << err << dendl;
7c673cae
FG
1291 fin->complete(-EINVAL);
1292 return;
1293 }
1294}
1295
1296void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt)
1297{
1298 bt.ino = inode.ino;
1299 bt.ancestors.clear();
1300 bt.pool = pool;
1301
1302 CInode *in = this;
1303 CDentry *pdn = get_parent_dn();
1304 while (pdn) {
1305 CInode *diri = pdn->get_dir()->get_inode();
94b18763 1306 bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), in->inode.version));
7c673cae
FG
1307 in = diri;
1308 pdn = in->get_parent_dn();
1309 }
94b18763 1310 for (auto &p : inode.old_pools) {
7c673cae 1311 // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
94b18763
FG
1312 if (p != pool)
1313 bt.old_pools.insert(p);
7c673cae
FG
1314 }
1315}
1316
1317struct C_IO_Inode_StoredBacktrace : public CInodeIOContext {
1318 version_t version;
1319 Context *fin;
1320 C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1321 void finish(int r) override {
1322 in->_stored_backtrace(r, version, fin);
1323 }
91327a77
AA
1324 void print(ostream& out) const override {
1325 out << "backtrace_store(" << in->ino() << ")";
1326 }
7c673cae
FG
1327};
1328
11fdf7f2 1329void CInode::store_backtrace(MDSContext *fin, int op_prio)
7c673cae 1330{
11fdf7f2
TL
1331 dout(10) << __func__ << " on " << *this << dendl;
1332 ceph_assert(is_dirty_parent());
7c673cae
FG
1333
1334 if (op_prio < 0)
1335 op_prio = CEPH_MSG_PRIO_DEFAULT;
1336
1337 auth_pin(this);
1338
1339 const int64_t pool = get_backtrace_pool();
1340 inode_backtrace_t bt;
1341 build_backtrace(pool, bt);
1342 bufferlist parent_bl;
11fdf7f2
TL
1343 using ceph::encode;
1344 encode(bt, parent_bl);
7c673cae
FG
1345
1346 ObjectOperation op;
1347 op.priority = op_prio;
1348 op.create(false);
1349 op.setxattr("parent", parent_bl);
1350
1351 bufferlist layout_bl;
11fdf7f2 1352 encode(inode.layout, layout_bl, mdcache->mds->mdsmap->get_up_features());
7c673cae
FG
1353 op.setxattr("layout", layout_bl);
1354
1355 SnapContext snapc;
1356 object_t oid = get_object_name(ino(), frag_t(), "");
1357 object_locator_t oloc(pool);
1358 Context *fin2 = new C_OnFinisher(
1359 new C_IO_Inode_StoredBacktrace(this, inode.backtrace_version, fin),
1360 mdcache->mds->finisher);
1361
1362 if (!state_test(STATE_DIRTYPOOL) || inode.old_pools.empty()) {
1363 dout(20) << __func__ << ": no dirtypool or no old pools" << dendl;
1364 mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
1365 ceph::real_clock::now(),
1366 0, fin2);
1367 return;
1368 }
1369
1370 C_GatherBuilder gather(g_ceph_context, fin2);
1371 mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
1372 ceph::real_clock::now(),
1373 0, gather.new_sub());
1374
1375 // In the case where DIRTYPOOL is set, we update all old pools backtraces
1376 // such that anyone reading them will see the new pool ID in
1377 // inode_backtrace_t::pool and go read everything else from there.
94b18763
FG
1378 for (const auto &p : inode.old_pools) {
1379 if (p == pool)
7c673cae
FG
1380 continue;
1381
94b18763 1382 dout(20) << __func__ << ": updating old pool " << p << dendl;
7c673cae
FG
1383
1384 ObjectOperation op;
1385 op.priority = op_prio;
1386 op.create(false);
1387 op.setxattr("parent", parent_bl);
1388
94b18763 1389 object_locator_t oloc(p);
7c673cae
FG
1390 mdcache->mds->objecter->mutate(oid, oloc, op, snapc,
1391 ceph::real_clock::now(),
1392 0, gather.new_sub());
1393 }
1394 gather.activate();
1395}
1396
1397void CInode::_stored_backtrace(int r, version_t v, Context *fin)
1398{
1399 if (r == -ENOENT) {
1400 const int64_t pool = get_backtrace_pool();
1401 bool exists = mdcache->mds->objecter->with_osdmap(
1402 [pool](const OSDMap &osd_map) {
1403 return osd_map.have_pg_pool(pool);
1404 });
1405
1406 // This ENOENT is because the pool doesn't exist (the user deleted it
1407 // out from under us), so the backtrace can never be written, so pretend
1408 // to succeed so that the user can proceed to e.g. delete the file.
1409 if (!exists) {
11fdf7f2 1410 dout(4) << __func__ << " got ENOENT: a data pool was deleted "
7c673cae
FG
1411 "beneath us!" << dendl;
1412 r = 0;
1413 }
1414 }
1415
1416 if (r < 0) {
1417 dout(1) << "store backtrace error " << r << " v " << v << dendl;
1418 mdcache->mds->clog->error() << "failed to store backtrace on ino "
1419 << ino() << " object"
1420 << ", pool " << get_backtrace_pool()
1421 << ", errno " << r;
1422 mdcache->mds->handle_write_error(r);
1423 if (fin)
1424 fin->complete(r);
1425 return;
1426 }
1427
11fdf7f2 1428 dout(10) << __func__ << " v " << v << dendl;
7c673cae
FG
1429
1430 auth_unpin(this);
1431 if (v == inode.backtrace_version)
1432 clear_dirty_parent();
1433 if (fin)
1434 fin->complete(0);
1435}
1436
1437void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
1438{
1439 mdcache->fetch_backtrace(inode.ino, get_backtrace_pool(), *backtrace, fin);
1440}
1441
28e407b8 1442void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool)
7c673cae
FG
1443{
1444 if (!state_test(STATE_DIRTYPARENT)) {
11fdf7f2 1445 dout(10) << __func__ << dendl;
7c673cae
FG
1446 state_set(STATE_DIRTYPARENT);
1447 get(PIN_DIRTYPARENT);
11fdf7f2 1448 ceph_assert(ls);
7c673cae
FG
1449 }
1450 if (dirty_pool)
1451 state_set(STATE_DIRTYPOOL);
1452 if (ls)
1453 ls->dirty_parent_inodes.push_back(&item_dirty_parent);
1454}
1455
1456void CInode::clear_dirty_parent()
1457{
1458 if (state_test(STATE_DIRTYPARENT)) {
11fdf7f2 1459 dout(10) << __func__ << dendl;
7c673cae
FG
1460 state_clear(STATE_DIRTYPARENT);
1461 state_clear(STATE_DIRTYPOOL);
1462 put(PIN_DIRTYPARENT);
1463 item_dirty_parent.remove_myself();
1464 }
1465}
1466
1467void CInode::verify_diri_backtrace(bufferlist &bl, int err)
1468{
1469 if (is_base() || is_dirty_parent() || !is_auth())
1470 return;
1471
11fdf7f2 1472 dout(10) << __func__ << dendl;
7c673cae
FG
1473
1474 if (err == 0) {
1475 inode_backtrace_t backtrace;
11fdf7f2
TL
1476 using ceph::decode;
1477 decode(backtrace, bl);
7c673cae
FG
1478 CDentry *pdn = get_parent_dn();
1479 if (backtrace.ancestors.empty() ||
94b18763 1480 backtrace.ancestors[0].dname != pdn->get_name() ||
7c673cae
FG
1481 backtrace.ancestors[0].dirino != pdn->get_dir()->ino())
1482 err = -EINVAL;
1483 }
1484
1485 if (err) {
1486 MDSRank *mds = mdcache->mds;
d2e6a577 1487 mds->clog->error() << "bad backtrace on directory inode " << ino();
11fdf7f2 1488 ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1));
7c673cae 1489
28e407b8 1490 mark_dirty_parent(mds->mdlog->get_current_segment(), false);
7c673cae
FG
1491 mds->mdlog->flush();
1492 }
1493}
1494
1495// ------------------
1496// parent dir
1497
1498
1499void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features,
1500 const bufferlist *snap_blob) const
1501{
11fdf7f2
TL
1502 using ceph::encode;
1503 encode(inode, bl, features);
7c673cae 1504 if (is_symlink())
11fdf7f2
TL
1505 encode(symlink, bl);
1506 encode(dirfragtree, bl);
1507 encode(xattrs, bl);
7c673cae 1508 if (snap_blob)
11fdf7f2 1509 encode(*snap_blob, bl);
7c673cae 1510 else
11fdf7f2
TL
1511 encode(bufferlist(), bl);
1512 encode(old_inodes, bl, features);
1513 encode(oldest_snap, bl);
1514 encode(damage_flags, bl);
7c673cae
FG
1515}
1516
1517void InodeStoreBase::encode(bufferlist &bl, uint64_t features,
1518 const bufferlist *snap_blob) const
1519{
1520 ENCODE_START(6, 4, bl);
1521 encode_bare(bl, features, snap_blob);
1522 ENCODE_FINISH(bl);
1523}
1524
1525void CInode::encode_store(bufferlist& bl, uint64_t features)
1526{
1527 bufferlist snap_blob;
1528 encode_snap_blob(snap_blob);
1529 InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(),
1530 &snap_blob);
1531}
1532
11fdf7f2 1533void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl,
7c673cae
FG
1534 bufferlist& snap_blob, __u8 struct_v)
1535{
11fdf7f2
TL
1536 using ceph::decode;
1537 decode(inode, bl);
94b18763
FG
1538 if (is_symlink()) {
1539 std::string tmp;
11fdf7f2
TL
1540 decode(tmp, bl);
1541 symlink = std::string_view(tmp);
94b18763 1542 }
11fdf7f2 1543 decode(dirfragtree, bl);
e306af50 1544 decode_noshare(xattrs, bl);
11fdf7f2 1545 decode(snap_blob, bl);
7c673cae 1546
11fdf7f2 1547 decode(old_inodes, bl);
7c673cae
FG
1548 if (struct_v == 2 && inode.is_dir()) {
1549 bool default_layout_exists;
11fdf7f2 1550 decode(default_layout_exists, bl);
7c673cae 1551 if (default_layout_exists) {
11fdf7f2
TL
1552 decode(struct_v, bl); // this was a default_file_layout
1553 decode(inode.layout, bl); // but we only care about the layout portion
7c673cae
FG
1554 }
1555 }
1556
1557 if (struct_v >= 5) {
1558 // InodeStore is embedded in dentries without proper versioning, so
1559 // we consume up to the end of the buffer
1560 if (!bl.end()) {
11fdf7f2 1561 decode(oldest_snap, bl);
7c673cae
FG
1562 }
1563
1564 if (!bl.end()) {
11fdf7f2 1565 decode(damage_flags, bl);
7c673cae
FG
1566 }
1567 }
1568}
1569
1570
11fdf7f2 1571void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob)
7c673cae
FG
1572{
1573 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
1574 decode_bare(bl, snap_blob, struct_v);
1575 DECODE_FINISH(bl);
1576}
1577
11fdf7f2 1578void CInode::decode_store(bufferlist::const_iterator& bl)
7c673cae
FG
1579{
1580 bufferlist snap_blob;
1581 InodeStoreBase::decode(bl, snap_blob);
1582 decode_snap_blob(snap_blob);
1583}
1584
1585// ------------------
1586// locking
1587
9f95a23c
TL
1588SimpleLock* CInode::get_lock(int type)
1589{
1590 switch (type) {
1591 case CEPH_LOCK_IVERSION: return &versionlock;
1592 case CEPH_LOCK_IFILE: return &filelock;
1593 case CEPH_LOCK_IAUTH: return &authlock;
1594 case CEPH_LOCK_ILINK: return &linklock;
1595 case CEPH_LOCK_IDFT: return &dirfragtreelock;
1596 case CEPH_LOCK_IXATTR: return &xattrlock;
1597 case CEPH_LOCK_ISNAP: return &snaplock;
1598 case CEPH_LOCK_INEST: return &nestlock;
1599 case CEPH_LOCK_IFLOCK: return &flocklock;
1600 case CEPH_LOCK_IPOLICY: return &policylock;
1601 }
1602 return 0;
1603}
1604
7c673cae
FG
1605void CInode::set_object_info(MDSCacheObjectInfo &info)
1606{
1607 info.ino = ino();
1608 info.snapid = last;
1609}
1610
9f95a23c 1611void CInode::encode_lock_iauth(bufferlist& bl)
7c673cae 1612{
9f95a23c
TL
1613 ENCODE_START(1, 1, bl);
1614 encode(inode.version, bl);
1615 encode(inode.ctime, bl);
1616 encode(inode.mode, bl);
1617 encode(inode.uid, bl);
1618 encode(inode.gid, bl);
1619 ENCODE_FINISH(bl);
1620}
7c673cae 1621
9f95a23c
TL
1622void CInode::decode_lock_iauth(bufferlist::const_iterator& p)
1623{
1624 DECODE_START(1, p);
1625 decode(inode.version, p);
1626 utime_t tm;
1627 decode(tm, p);
1628 if (inode.ctime < tm) inode.ctime = tm;
1629 decode(inode.mode, p);
1630 decode(inode.uid, p);
1631 decode(inode.gid, p);
1632 DECODE_FINISH(p);
1633}
1634
1635void CInode::encode_lock_ilink(bufferlist& bl)
1636{
1637 ENCODE_START(1, 1, bl);
1638 encode(inode.version, bl);
1639 encode(inode.ctime, bl);
1640 encode(inode.nlink, bl);
1641 ENCODE_FINISH(bl);
1642}
1643
1644void CInode::decode_lock_ilink(bufferlist::const_iterator& p)
1645{
1646 DECODE_START(1, p);
1647 decode(inode.version, p);
1648 utime_t tm;
1649 decode(tm, p);
1650 if (inode.ctime < tm) inode.ctime = tm;
1651 decode(inode.nlink, p);
1652 DECODE_FINISH(p);
1653}
1654
1655void CInode::encode_lock_idft(bufferlist& bl)
1656{
1657 ENCODE_START(1, 1, bl);
1658 if (is_auth()) {
11fdf7f2 1659 encode(inode.version, bl);
9f95a23c
TL
1660 } else {
1661 // treat flushing as dirty when rejoining cache
1662 bool dirty = dirfragtreelock.is_dirty_or_flushing();
1663 encode(dirty, bl);
1664 }
1665 {
1666 // encode the raw tree
1667 encode(dirfragtree, bl);
1668
1669 // also specify which frags are mine
1670 set<frag_t> myfrags;
1671 auto&& dfls = get_dirfrags();
1672 for (const auto& dir : dfls) {
1673 if (dir->is_auth()) {
1674 frag_t fg = dir->get_frag();
1675 myfrags.insert(fg);
1676 }
1677 }
1678 encode(myfrags, bl);
1679 }
1680 ENCODE_FINISH(bl);
1681}
1682
1683void CInode::decode_lock_idft(bufferlist::const_iterator& p)
1684{
1685 DECODE_START(1, p);
1686 if (is_auth()) {
1687 bool replica_dirty;
1688 decode(replica_dirty, p);
1689 if (replica_dirty) {
1690 dout(10) << __func__ << " setting dftlock dirty flag" << dendl;
1691 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1692 }
1693 } else {
1694 decode(inode.version, p);
1695 }
1696 {
1697 fragtree_t temp;
1698 decode(temp, p);
1699 set<frag_t> authfrags;
1700 decode(authfrags, p);
7c673cae 1701 if (is_auth()) {
9f95a23c
TL
1702 // auth. believe replica's auth frags only.
1703 for (auto fg : authfrags) {
1704 if (!dirfragtree.is_leaf(fg)) {
1705 dout(10) << " forcing frag " << fg << " to leaf (split|merge)" << dendl;
1706 dirfragtree.force_to_leaf(g_ceph_context, fg);
1707 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1708 }
1709 }
7c673cae 1710 } else {
9f95a23c
TL
1711 // replica. take the new tree, BUT make sure any open
1712 // dirfrags remain leaves (they may have split _after_ this
1713 // dft was scattered, or we may still be be waiting on the
1714 // notify from the auth)
1715 dirfragtree.swap(temp);
1716 for (const auto &p : dirfrags) {
1717 if (!dirfragtree.is_leaf(p.first)) {
1718 dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl;
1719 dirfragtree.force_to_leaf(g_ceph_context, p.first);
1720 }
1721 if (p.second->is_auth())
1722 p.second->state_clear(CDir::STATE_DIRTYDFT);
1723 }
7c673cae 1724 }
9f95a23c
TL
1725 if (g_conf()->mds_debug_frag)
1726 verify_dirfrags();
1727 }
1728 DECODE_FINISH(p);
1729}
1730
1731void CInode::encode_lock_ifile(bufferlist& bl)
1732{
1733 ENCODE_START(1, 1, bl);
1734 if (is_auth()) {
1735 encode(inode.version, bl);
1736 encode(inode.ctime, bl);
1737 encode(inode.mtime, bl);
1738 encode(inode.atime, bl);
1739 encode(inode.time_warp_seq, bl);
1740 if (!is_dir()) {
1741 encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features());
1742 encode(inode.size, bl);
1743 encode(inode.truncate_seq, bl);
1744 encode(inode.truncate_size, bl);
1745 encode(inode.client_ranges, bl);
1746 encode(inode.inline_data, bl);
1747 }
1748 } else {
1749 // treat flushing as dirty when rejoining cache
1750 bool dirty = filelock.is_dirty_or_flushing();
1751 encode(dirty, bl);
1752 }
1753 dout(15) << __func__ << " inode.dirstat is " << inode.dirstat << dendl;
1754 encode(inode.dirstat, bl); // only meaningful if i am auth.
1755 bufferlist tmp;
1756 __u32 n = 0;
1757 for (const auto &p : dirfrags) {
1758 frag_t fg = p.first;
1759 CDir *dir = p.second;
1760 if (is_auth() || dir->is_auth()) {
1761 fnode_t *pf = dir->get_projected_fnode();
1762 dout(15) << fg << " " << *dir << dendl;
1763 dout(20) << fg << " fragstat " << pf->fragstat << dendl;
1764 dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
1765 encode(fg, tmp);
1766 encode(dir->first, tmp);
1767 encode(pf->fragstat, tmp);
1768 encode(pf->accounted_fragstat, tmp);
1769 n++;
7c673cae 1770 }
9f95a23c
TL
1771 }
1772 encode(n, bl);
1773 bl.claim_append(tmp);
1774 ENCODE_FINISH(bl);
1775}
1776
1777void CInode::decode_lock_ifile(bufferlist::const_iterator& p)
1778{
1779 DECODE_START(1, p);
1780 if (!is_auth()) {
1781 decode(inode.version, p);
1782 utime_t tm;
1783 decode(tm, p);
1784 if (inode.ctime < tm) inode.ctime = tm;
1785 decode(inode.mtime, p);
1786 decode(inode.atime, p);
1787 decode(inode.time_warp_seq, p);
1788 if (!is_dir()) {
1789 decode(inode.layout, p);
1790 decode(inode.size, p);
1791 decode(inode.truncate_seq, p);
1792 decode(inode.truncate_size, p);
1793 decode(inode.client_ranges, p);
1794 decode(inode.inline_data, p);
1795 }
1796 } else {
1797 bool replica_dirty;
1798 decode(replica_dirty, p);
1799 if (replica_dirty) {
1800 dout(10) << __func__ << " setting filelock dirty flag" << dendl;
1801 filelock.mark_dirty(); // ok bc we're auth and caller will handle
1802 }
1803 }
1804
1805 frag_info_t dirstat;
1806 decode(dirstat, p);
1807 if (!is_auth()) {
1808 dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl;
1809 inode.dirstat = dirstat; // take inode summation if replica
1810 }
1811 __u32 n;
1812 decode(n, p);
1813 dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
1814 while (n--) {
1815 frag_t fg;
1816 snapid_t fgfirst;
1817 frag_info_t fragstat;
1818 frag_info_t accounted_fragstat;
1819 decode(fg, p);
1820 decode(fgfirst, p);
1821 decode(fragstat, p);
1822 decode(accounted_fragstat, p);
1823 dout(10) << fg << " [" << fgfirst << ",head] " << dendl;
1824 dout(10) << fg << " fragstat " << fragstat << dendl;
1825 dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl;
1826
1827 CDir *dir = get_dirfrag(fg);
7c673cae 1828 if (is_auth()) {
9f95a23c
TL
1829 ceph_assert(dir); // i am auth; i had better have this dir open
1830 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1831 << " on " << *dir << dendl;
1832 dir->first = fgfirst;
1833 dir->fnode.fragstat = fragstat;
1834 dir->fnode.accounted_fragstat = accounted_fragstat;
1835 if (!(fragstat == accounted_fragstat)) {
1836 dout(10) << fg << " setting filelock updated flag" << dendl;
1837 filelock.mark_dirty(); // ok bc we're auth and caller will handle
7c673cae
FG
1838 }
1839 } else {
9f95a23c
TL
1840 if (dir && dir->is_auth()) {
1841 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1842 << " on " << *dir << dendl;
1843 dir->first = fgfirst;
1844 fnode_t *pf = dir->get_projected_fnode();
1845 finish_scatter_update(&filelock, dir,
1846 inode.dirstat.version, pf->accounted_fragstat.version);
1847 }
7c673cae 1848 }
9f95a23c
TL
1849 }
1850 DECODE_FINISH(p);
1851}
7c673cae 1852
9f95a23c
TL
1853void CInode::encode_lock_inest(bufferlist& bl)
1854{
1855 ENCODE_START(1, 1, bl);
1856 if (is_auth()) {
1857 encode(inode.version, bl);
1858 } else {
1859 // treat flushing as dirty when rejoining cache
1860 bool dirty = nestlock.is_dirty_or_flushing();
1861 encode(dirty, bl);
1862 }
1863 dout(15) << __func__ << " inode.rstat is " << inode.rstat << dendl;
1864 encode(inode.rstat, bl); // only meaningful if i am auth.
1865 bufferlist tmp;
1866 __u32 n = 0;
1867 for (const auto &p : dirfrags) {
1868 frag_t fg = p.first;
1869 CDir *dir = p.second;
1870 if (is_auth() || dir->is_auth()) {
1871 fnode_t *pf = dir->get_projected_fnode();
1872 dout(10) << __func__ << " " << fg << " dir " << *dir << dendl;
1873 dout(10) << __func__ << " " << fg << " rstat " << pf->rstat << dendl;
1874 dout(10) << __func__ << " " << fg << " accounted_rstat " << pf->rstat << dendl;
1875 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
1876 encode(fg, tmp);
1877 encode(dir->first, tmp);
1878 encode(pf->rstat, tmp);
1879 encode(pf->accounted_rstat, tmp);
1880 encode(dir->dirty_old_rstat, tmp);
1881 n++;
7c673cae 1882 }
9f95a23c
TL
1883 }
1884 encode(n, bl);
1885 bl.claim_append(tmp);
1886 ENCODE_FINISH(bl);
1887}
7c673cae 1888
9f95a23c
TL
1889void CInode::decode_lock_inest(bufferlist::const_iterator& p)
1890{
1891 DECODE_START(1, p);
1892 if (is_auth()) {
1893 bool replica_dirty;
1894 decode(replica_dirty, p);
1895 if (replica_dirty) {
1896 dout(10) << __func__ << " setting nestlock dirty flag" << dendl;
1897 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
1898 }
1899 } else {
1900 decode(inode.version, p);
1901 }
1902 nest_info_t rstat;
1903 decode(rstat, p);
1904 if (!is_auth()) {
1905 dout(10) << __func__ << " taking inode rstat " << rstat << " for " << *this << dendl;
1906 inode.rstat = rstat; // take inode summation if replica
1907 }
1908 __u32 n;
1909 decode(n, p);
1910 while (n--) {
1911 frag_t fg;
1912 snapid_t fgfirst;
1913 nest_info_t rstat;
1914 nest_info_t accounted_rstat;
1915 decltype(CDir::dirty_old_rstat) dirty_old_rstat;
1916 decode(fg, p);
1917 decode(fgfirst, p);
1918 decode(rstat, p);
1919 decode(accounted_rstat, p);
1920 decode(dirty_old_rstat, p);
1921 dout(10) << __func__ << " " << fg << " [" << fgfirst << ",head]" << dendl;
1922 dout(10) << __func__ << " " << fg << " rstat " << rstat << dendl;
1923 dout(10) << __func__ << " " << fg << " accounted_rstat " << accounted_rstat << dendl;
1924 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dirty_old_rstat << dendl;
1925 CDir *dir = get_dirfrag(fg);
7c673cae 1926 if (is_auth()) {
9f95a23c
TL
1927 ceph_assert(dir); // i am auth; i had better have this dir open
1928 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1929 << " on " << *dir << dendl;
1930 dir->first = fgfirst;
1931 dir->fnode.rstat = rstat;
1932 dir->fnode.accounted_rstat = accounted_rstat;
1933 dir->dirty_old_rstat.swap(dirty_old_rstat);
1934 if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) {
1935 dout(10) << fg << " setting nestlock updated flag" << dendl;
1936 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
1937 }
7c673cae 1938 } else {
9f95a23c
TL
1939 if (dir && dir->is_auth()) {
1940 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1941 << " on " << *dir << dendl;
1942 dir->first = fgfirst;
1943 fnode_t *pf = dir->get_projected_fnode();
1944 finish_scatter_update(&nestlock, dir,
1945 inode.rstat.version, pf->accounted_rstat.version);
7c673cae 1946 }
7c673cae 1947 }
9f95a23c
TL
1948 }
1949 DECODE_FINISH(p);
1950}
1951
1952void CInode::encode_lock_ixattr(bufferlist& bl)
1953{
1954 ENCODE_START(1, 1, bl);
1955 encode(inode.version, bl);
1956 encode(inode.ctime, bl);
1957 encode(xattrs, bl);
1958 ENCODE_FINISH(bl);
1959}
1960
1961void CInode::decode_lock_ixattr(bufferlist::const_iterator& p)
1962{
1963 DECODE_START(1, p);
1964 decode(inode.version, p);
1965 utime_t tm;
1966 decode(tm, p);
1967 if (inode.ctime < tm) inode.ctime = tm;
e306af50 1968 decode_noshare(xattrs, p);
9f95a23c
TL
1969 DECODE_FINISH(p);
1970}
1971
1972void CInode::encode_lock_isnap(bufferlist& bl)
1973{
1974 ENCODE_START(1, 1, bl);
1975 encode(inode.version, bl);
1976 encode(inode.ctime, bl);
1977 encode_snap(bl);
1978 ENCODE_FINISH(bl);
1979}
1980
1981void CInode::decode_lock_isnap(bufferlist::const_iterator& p)
1982{
1983 DECODE_START(1, p);
1984 decode(inode.version, p);
1985 utime_t tm;
1986 decode(tm, p);
1987 if (inode.ctime < tm) inode.ctime = tm;
1988 decode_snap(p);
1989 DECODE_FINISH(p);
1990}
1991
1992void CInode::encode_lock_iflock(bufferlist& bl)
1993{
1994 ENCODE_START(1, 1, bl);
1995 encode(inode.version, bl);
1996 _encode_file_locks(bl);
1997 ENCODE_FINISH(bl);
1998}
1999
2000void CInode::decode_lock_iflock(bufferlist::const_iterator& p)
2001{
2002 DECODE_START(1, p);
2003 decode(inode.version, p);
2004 _decode_file_locks(p);
2005 DECODE_FINISH(p);
2006}
2007
2008void CInode::encode_lock_ipolicy(bufferlist& bl)
2009{
2010 ENCODE_START(1, 1, bl);
2011 if (inode.is_dir()) {
2012 encode(inode.version, bl);
2013 encode(inode.ctime, bl);
2014 encode(inode.layout, bl, mdcache->mds->mdsmap->get_up_features());
2015 encode(inode.quota, bl);
2016 encode(inode.export_pin, bl);
2017 }
2018 ENCODE_FINISH(bl);
2019}
2020
2021void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p)
2022{
2023 DECODE_START(1, p);
2024 if (inode.is_dir()) {
2025 decode(inode.version, p);
2026 utime_t tm;
2027 decode(tm, p);
2028 if (inode.ctime < tm) inode.ctime = tm;
2029 decode(inode.layout, p);
2030 decode(inode.quota, p);
2031 mds_rank_t old_pin = inode.export_pin;
2032 decode(inode.export_pin, p);
2033 maybe_export_pin(old_pin != inode.export_pin);
2034 }
2035 DECODE_FINISH(p);
2036}
2037
2038void CInode::encode_lock_state(int type, bufferlist& bl)
2039{
2040 ENCODE_START(1, 1, bl);
2041 encode(first, bl);
2042 if (!is_base())
2043 encode(parent->first, bl);
2044
2045 switch (type) {
2046 case CEPH_LOCK_IAUTH:
2047 encode_lock_iauth(bl);
2048 break;
2049
2050 case CEPH_LOCK_ILINK:
2051 encode_lock_ilink(bl);
2052 break;
2053
2054 case CEPH_LOCK_IDFT:
2055 encode_lock_idft(bl);
2056 break;
2057
2058 case CEPH_LOCK_IFILE:
2059 encode_lock_ifile(bl);
2060 break;
2061
2062 case CEPH_LOCK_INEST:
2063 encode_lock_inest(bl);
7c673cae
FG
2064 break;
2065
2066 case CEPH_LOCK_IXATTR:
9f95a23c 2067 encode_lock_ixattr(bl);
7c673cae
FG
2068 break;
2069
2070 case CEPH_LOCK_ISNAP:
9f95a23c 2071 encode_lock_isnap(bl);
7c673cae
FG
2072 break;
2073
2074 case CEPH_LOCK_IFLOCK:
9f95a23c 2075 encode_lock_iflock(bl);
7c673cae
FG
2076 break;
2077
2078 case CEPH_LOCK_IPOLICY:
9f95a23c 2079 encode_lock_ipolicy(bl);
7c673cae
FG
2080 break;
2081
2082 default:
2083 ceph_abort();
2084 }
9f95a23c 2085 ENCODE_FINISH(bl);
7c673cae
FG
2086}
2087
7c673cae
FG
2088/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2089
11fdf7f2 2090void CInode::decode_lock_state(int type, const bufferlist& bl)
7c673cae 2091{
11fdf7f2 2092 auto p = bl.cbegin();
9f95a23c
TL
2093
2094 DECODE_START(1, p);
7c673cae
FG
2095 utime_t tm;
2096
2097 snapid_t newfirst;
11fdf7f2
TL
2098 using ceph::decode;
2099 decode(newfirst, p);
7c673cae 2100 if (!is_auth() && newfirst != first) {
11fdf7f2
TL
2101 dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl;
2102 first = newfirst;
2103 }
2104 if (!is_base()) {
2105 decode(newfirst, p);
2106 if (!parent->is_auth() && newfirst != parent->first) {
2107 dout(10) << __func__ << " parent first " << first << " -> " << newfirst << dendl;
7c673cae
FG
2108 parent->first = newfirst;
2109 }
7c673cae
FG
2110 }
2111
2112 switch (type) {
2113 case CEPH_LOCK_IAUTH:
9f95a23c 2114 decode_lock_iauth(p);
7c673cae
FG
2115 break;
2116
2117 case CEPH_LOCK_ILINK:
9f95a23c 2118 decode_lock_ilink(p);
7c673cae
FG
2119 break;
2120
2121 case CEPH_LOCK_IDFT:
9f95a23c 2122 decode_lock_idft(p);
7c673cae
FG
2123 break;
2124
2125 case CEPH_LOCK_IFILE:
9f95a23c 2126 decode_lock_ifile(p);
7c673cae
FG
2127 break;
2128
2129 case CEPH_LOCK_INEST:
9f95a23c 2130 decode_lock_inest(p);
7c673cae
FG
2131 break;
2132
2133 case CEPH_LOCK_IXATTR:
9f95a23c 2134 decode_lock_ixattr(p);
7c673cae
FG
2135 break;
2136
2137 case CEPH_LOCK_ISNAP:
9f95a23c 2138 decode_lock_isnap(p);
7c673cae
FG
2139 break;
2140
2141 case CEPH_LOCK_IFLOCK:
9f95a23c 2142 decode_lock_iflock(p);
7c673cae
FG
2143 break;
2144
2145 case CEPH_LOCK_IPOLICY:
9f95a23c 2146 decode_lock_ipolicy(p);
7c673cae
FG
2147 break;
2148
2149 default:
2150 ceph_abort();
2151 }
9f95a23c 2152 DECODE_FINISH(p);
7c673cae
FG
2153}
2154
2155
2156bool CInode::is_dirty_scattered()
2157{
2158 return
2159 filelock.is_dirty_or_flushing() ||
2160 nestlock.is_dirty_or_flushing() ||
2161 dirfragtreelock.is_dirty_or_flushing();
2162}
2163
2164void CInode::clear_scatter_dirty()
2165{
2166 filelock.remove_dirty();
2167 nestlock.remove_dirty();
2168 dirfragtreelock.remove_dirty();
2169}
2170
2171void CInode::clear_dirty_scattered(int type)
2172{
11fdf7f2
TL
2173 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2174 ceph_assert(is_dir());
7c673cae
FG
2175 switch (type) {
2176 case CEPH_LOCK_IFILE:
2177 item_dirty_dirfrag_dir.remove_myself();
2178 break;
2179
2180 case CEPH_LOCK_INEST:
2181 item_dirty_dirfrag_nest.remove_myself();
2182 break;
2183
2184 case CEPH_LOCK_IDFT:
2185 item_dirty_dirfrag_dirfragtree.remove_myself();
2186 break;
2187
2188 default:
2189 ceph_abort();
2190 }
2191}
2192
2193
2194/*
2195 * when we initially scatter a lock, we need to check if any of the dirfrags
2196 * have out of date accounted_rstat/fragstat. if so, mark the lock stale.
2197 */
2198/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2199void CInode::start_scatter(ScatterLock *lock)
2200{
11fdf7f2
TL
2201 dout(10) << __func__ << " " << *lock << " on " << *this << dendl;
2202 ceph_assert(is_auth());
94b18763 2203 mempool_inode *pi = get_projected_inode();
7c673cae 2204
94b18763
FG
2205 for (const auto &p : dirfrags) {
2206 frag_t fg = p.first;
2207 CDir *dir = p.second;
7c673cae
FG
2208 fnode_t *pf = dir->get_projected_fnode();
2209 dout(20) << fg << " " << *dir << dendl;
2210
2211 if (!dir->is_auth())
2212 continue;
2213
2214 switch (lock->get_type()) {
2215 case CEPH_LOCK_IFILE:
2216 finish_scatter_update(lock, dir, pi->dirstat.version, pf->accounted_fragstat.version);
2217 break;
2218
2219 case CEPH_LOCK_INEST:
2220 finish_scatter_update(lock, dir, pi->rstat.version, pf->accounted_rstat.version);
2221 break;
2222
2223 case CEPH_LOCK_IDFT:
2224 dir->state_clear(CDir::STATE_DIRTYDFT);
2225 break;
2226 }
2227 }
2228}
2229
2230
2231class C_Inode_FragUpdate : public MDSLogContextBase {
2232protected:
2233 CInode *in;
2234 CDir *dir;
2235 MutationRef mut;
2236 MDSRank *get_mds() override {return in->mdcache->mds;}
2237 void finish(int r) override {
2238 in->_finish_frag_update(dir, mut);
2239 }
2240
2241public:
2242 C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {}
2243};
2244
2245void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
2246 version_t inode_version, version_t dir_accounted_version)
2247{
2248 frag_t fg = dir->get_frag();
11fdf7f2 2249 ceph_assert(dir->is_auth());
7c673cae
FG
2250
2251 if (dir->is_frozen()) {
11fdf7f2 2252 dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl;
7c673cae 2253 } else if (dir->get_version() == 0) {
11fdf7f2 2254 dout(10) << __func__ << " " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl;
7c673cae
FG
2255 } else {
2256 if (dir_accounted_version != inode_version) {
11fdf7f2 2257 dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;
7c673cae
FG
2258
2259 MDLog *mdlog = mdcache->mds->mdlog;
2260 MutationRef mut(new MutationImpl());
2261 mut->ls = mdlog->get_current_segment();
2262
94b18763 2263 mempool_inode *pi = get_projected_inode();
7c673cae 2264 fnode_t *pf = dir->project_fnode();
7c673cae 2265
9f95a23c 2266 std::string_view ename;
7c673cae
FG
2267 switch (lock->get_type()) {
2268 case CEPH_LOCK_IFILE:
2269 pf->fragstat.version = pi->dirstat.version;
2270 pf->accounted_fragstat = pf->fragstat;
2271 ename = "lock ifile accounted scatter stat update";
2272 break;
2273 case CEPH_LOCK_INEST:
2274 pf->rstat.version = pi->rstat.version;
2275 pf->accounted_rstat = pf->rstat;
2276 ename = "lock inest accounted scatter stat update";
c07f9fc5
FG
2277
2278 if (!is_auth() && lock->get_state() == LOCK_MIX) {
11fdf7f2 2279 dout(10) << __func__ << " try to assimilate dirty rstat on "
c07f9fc5
FG
2280 << *dir << dendl;
2281 dir->assimilate_dirty_rstat_inodes();
2282 }
2283
7c673cae
FG
2284 break;
2285 default:
2286 ceph_abort();
2287 }
2288
c07f9fc5 2289 pf->version = dir->pre_dirty();
7c673cae
FG
2290 mut->add_projected_fnode(dir);
2291
2292 EUpdate *le = new EUpdate(mdlog, ename);
2293 mdlog->start_entry(le);
2294 le->metablob.add_dir_context(dir);
2295 le->metablob.add_dir(dir, true);
2296
11fdf7f2 2297 ceph_assert(!dir->is_frozen());
7c673cae 2298 mut->auth_pin(dir);
c07f9fc5
FG
2299
2300 if (lock->get_type() == CEPH_LOCK_INEST &&
2301 !is_auth() && lock->get_state() == LOCK_MIX) {
11fdf7f2 2302 dout(10) << __func__ << " finish assimilating dirty rstat on "
c07f9fc5
FG
2303 << *dir << dendl;
2304 dir->assimilate_dirty_rstat_inodes_finish(mut, &le->metablob);
2305
2306 if (!(pf->rstat == pf->accounted_rstat)) {
11fdf7f2 2307 if (!mut->is_wrlocked(&nestlock)) {
c07f9fc5
FG
2308 mdcache->mds->locker->wrlock_force(&nestlock, mut);
2309 }
2310
2311 mdcache->mds->locker->mark_updated_scatterlock(&nestlock);
2312 mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest);
2313 }
2314 }
7c673cae
FG
2315
2316 mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut));
2317 } else {
11fdf7f2 2318 dout(10) << __func__ << " " << fg << " accounted " << *lock
7c673cae
FG
2319 << " scatter stat unchanged at v" << dir_accounted_version << dendl;
2320 }
2321 }
2322}
2323
2324void CInode::_finish_frag_update(CDir *dir, MutationRef& mut)
2325{
11fdf7f2 2326 dout(10) << __func__ << " on " << *dir << dendl;
7c673cae 2327 mut->apply();
c07f9fc5 2328 mdcache->mds->locker->drop_locks(mut.get());
7c673cae
FG
2329 mut->cleanup();
2330}
2331
2332
2333/*
2334 * when we gather a lock, we need to assimilate dirfrag changes into the inode
2335 * state. it's possible we can't update the dirfrag accounted_rstat/fragstat
2336 * because the frag is auth and frozen, or that the replica couldn't for the same
2337 * reason. hopefully it will get updated the next time the lock cycles.
2338 *
2339 * we have two dimensions of behavior:
2340 * - we may be (auth and !frozen), and able to update, or not.
2341 * - the frag may be stale, or not.
2342 *
2343 * if the frag is non-stale, we want to assimilate the diff into the
2344 * inode, regardless of whether it's auth or updateable.
2345 *
2346 * if we update the frag, we want to set accounted_fragstat = frag,
2347 * both if we took the diff or it was stale and we are making it
2348 * un-stale.
2349 */
2350/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
/*
 * Fold the per-dirfrag stats into the projected inode for the given
 * scatter lock type.  Must be called on the auth inode.
 *
 *  CEPH_LOCK_IFILE: bumps pi->dirstat.version and merges each dirfrag's
 *                   fragstat delta into pi->dirstat.
 *  CEPH_LOCK_INEST: bumps pi->rstat.version and merges each dirfrag's
 *                   rstat into the inode, then broadcasts quota.
 *  CEPH_LOCK_IDFT:  nothing to do.
 *  anything else:   aborts.
 */
2351void CInode::finish_scatter_gather_update(int type)
2352{
2353 LogChannelRef clog = mdcache->mds->clog;
2354
11fdf7f2
TL
2355 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2356 ceph_assert(is_auth());
7c673cae
FG
2357
2358 switch (type) {
2359 case CEPH_LOCK_IFILE:
2360 {
2361 fragtree_t tmpdft = dirfragtree;
// independent sum of all visited dirfrags' fragstats, used below to
// cross-check the incrementally maintained pi->dirstat
2362 struct frag_info_t dirstat;
2363 bool dirstat_valid = true;
2364
2365 // adjust summation
11fdf7f2 2366 ceph_assert(is_auth());
94b18763 2367 mempool_inode *pi = get_projected_inode();
7c673cae
FG
2368
2369 bool touched_mtime = false, touched_chattr = false;
2370 dout(20) << " orig dirstat " << pi->dirstat << dendl;
2371 pi->dirstat.version++;
94b18763
FG
2372 for (const auto &p : dirfrags) {
2373 frag_t fg = p.first;
2374 CDir *dir = p.second;
7c673cae
FG
2375 dout(20) << fg << " " << *dir << dendl;
2376
// update: the frag is loaded, auth and not frozen, so its fnode may be
// projected and modified; an unloaded frag (version 0) also invalidates
// the cross-check sum
2377 bool update;
2378 if (dir->get_version() != 0) {
2379 update = dir->is_auth() && !dir->is_frozen();
2380 } else {
2381 update = false;
2382 dirstat_valid = false;
2383 }
2384
2385 fnode_t *pf = dir->get_projected_fnode();
2386 if (update)
2387 pf = dir->project_fnode();
2388
// the frag is non-stale only if it was accounted at the inode's previous
// dirstat version (the version was incremented just above)
2389 if (pf->accounted_fragstat.version == pi->dirstat.version - 1) {
2390 dout(20) << fg << " fragstat " << pf->fragstat << dendl;
2391 dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
2392 pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2393 } else {
2394 dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl;
2395 }
2396
2397 if (pf->fragstat.nfiles < 0 ||
2398 pf->fragstat.nsubdirs < 0) {
2399 clog->error() << "bad/negative dir size on "
2400 << dir->dirfrag() << " " << pf->fragstat;
// !"literal" is always false, so this assertion fails exactly when
// mds_verify_scatter is enabled; otherwise we fall through and clamp
11fdf7f2 2401 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
7c673cae
FG
2402
2403 if (pf->fragstat.nfiles < 0)
2404 pf->fragstat.nfiles = 0;
2405 if (pf->fragstat.nsubdirs < 0)
2406 pf->fragstat.nsubdirs = 0;
2407 }
2408
2409 if (update) {
2410 pf->accounted_fragstat = pf->fragstat;
2411 pf->fragstat.version = pf->accounted_fragstat.version = pi->dirstat.version;
2412 dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl;
2413 }
2414
2415 tmpdft.force_to_leaf(g_ceph_context, fg);
2416 dirstat.add(pf->fragstat);
2417 }
2418 if (touched_mtime)
2419 pi->mtime = pi->ctime = pi->dirstat.mtime;
2420 if (touched_chattr)
2421 pi->change_attr = pi->dirstat.change_attr;
2422 dout(20) << " final dirstat " << pi->dirstat << dendl;
2423
// the sums disagree: only treat that as a real mismatch if every leaf of
// the (forced) frag tree is actually present in dirfrags
2424 if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) {
11fdf7f2
TL
2425 frag_vec_t leaves;
2426 tmpdft.get_leaves_under(frag_t(), leaves);
2427 for (const auto& leaf : leaves) {
2428 if (!dirfrags.count(leaf)) {
7c673cae
FG
2429 dirstat_valid = false;
2430 break;
2431 }
11fdf7f2 2432 }
7c673cae
FG
2433 if (dirstat_valid) {
2434 if (state_test(CInode::STATE_REPAIRSTATS)) {
2435 dout(20) << " dirstat mismatch, fixing" << dendl;
2436 } else {
2437 clog->error() << "unmatched fragstat on " << ino() << ", inode has "
2438 << pi->dirstat << ", dirfrags have " << dirstat;
// fails only when mds_verify_scatter is enabled (see above)
11fdf7f2 2439 ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter);
7c673cae
FG
2440 }
2441 // trust the dirfrags for now
// (keeping the inode's version and the newer mtime/change_attr)
2442 version_t v = pi->dirstat.version;
2443 if (pi->dirstat.mtime > dirstat.mtime)
2444 dirstat.mtime = pi->dirstat.mtime;
2445 if (pi->dirstat.change_attr > dirstat.change_attr)
2446 dirstat.change_attr = pi->dirstat.change_attr;
2447 pi->dirstat = dirstat;
2448 pi->dirstat.version = v;
2449 }
2450 }
2451
d2e6a577
FG
2452 if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0)
2453 {
2454 std::string path;
2455 make_path_string(path);
2456 clog->error() << "Inconsistent statistics detected: fragstat on inode "
2457 << ino() << " (" << path << "), inode has " << pi->dirstat;
11fdf7f2 2458 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
7c673cae
FG
2459
2460 if (pi->dirstat.nfiles < 0)
2461 pi->dirstat.nfiles = 0;
2462 if (pi->dirstat.nsubdirs < 0)
2463 pi->dirstat.nsubdirs = 0;
2464 }
2465 }
2466 break;
2467
2468 case CEPH_LOCK_INEST:
2469 {
11fdf7f2
TL
2470 // adjust summation
2471 ceph_assert(is_auth());
2472
7c673cae
FG
2473 fragtree_t tmpdft = dirfragtree;
// independent sum of all visited dirfrags' rstats, used below to
// cross-check pi->rstat
2474 nest_info_t rstat;
7c673cae
FG
2475 bool rstat_valid = true;
2476
11fdf7f2
TL
// seed the cross-check sum with one subdir and the projected srnode's
// snapshot count, if there is one
2477 rstat.rsubdirs = 1;
2478 if (const sr_t *srnode = get_projected_srnode(); srnode)
2479 rstat.rsnaps = srnode->snaps.size();
2480
94b18763 2481 mempool_inode *pi = get_projected_inode();
7c673cae
FG
2482 dout(20) << " orig rstat " << pi->rstat << dendl;
2483 pi->rstat.version++;
94b18763
FG
2484 for (const auto &p : dirfrags) {
2485 frag_t fg = p.first;
2486 CDir *dir = p.second;
7c673cae
FG
2487 dout(20) << fg << " " << *dir << dendl;
2488
// same meaning as in the IFILE case: loaded, auth and not frozen
2489 bool update;
2490 if (dir->get_version() != 0) {
2491 update = dir->is_auth() && !dir->is_frozen();
2492 } else {
2493 update = false;
2494 rstat_valid = false;
2495 }
2496
2497 fnode_t *pf = dir->get_projected_fnode();
2498 if (update)
2499 pf = dir->project_fnode();
2500
2501 if (pf->accounted_rstat.version == pi->rstat.version-1) {
2502 // only pull this frag's dirty rstat inodes into the frag if
2503 // the frag is non-stale and updateable. if it's stale,
2504 // that info will just get thrown out!
2505 if (update)
2506 dir->assimilate_dirty_rstat_inodes();
2507
2508 dout(20) << fg << " rstat " << pf->rstat << dendl;
2509 dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
2510 dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
2511 mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
2512 dir->first, CEPH_NOSNAP, this, true);
94b18763
FG
// note: this inner `p` shadows the dirfrags loop variable of the same name
2513 for (auto &p : dir->dirty_old_rstat) {
2514 mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat,
2515 p.second.first, p.first, this, true);
2516 }
7c673cae
FG
2517 if (update) // dir contents not valid if frozen or non-auth
2518 dir->check_rstats();
2519 } else {
2520 dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl;
2521 }
2522 if (update) {
2523 pf->accounted_rstat = pf->rstat;
2524 dir->dirty_old_rstat.clear();
2525 pf->rstat.version = pf->accounted_rstat.version = pi->rstat.version;
2526 dir->check_rstats();
2527 dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl;
2528 }
2529
2530 tmpdft.force_to_leaf(g_ceph_context, fg);
2531 rstat.add(pf->rstat);
2532 }
2533 dout(20) << " final rstat " << pi->rstat << dendl;
2534
// as for dirstat: a disagreement only counts if every leaf frag is present
2535 if (rstat_valid && !rstat.same_sums(pi->rstat)) {
11fdf7f2
TL
2536 frag_vec_t leaves;
2537 tmpdft.get_leaves_under(frag_t(), leaves);
2538 for (const auto& leaf : leaves) {
2539 if (!dirfrags.count(leaf)) {
7c673cae
FG
2540 rstat_valid = false;
2541 break;
2542 }
11fdf7f2 2543 }
7c673cae
FG
2544 if (rstat_valid) {
2545 if (state_test(CInode::STATE_REPAIRSTATS)) {
2546 dout(20) << " rstat mismatch, fixing" << dendl;
2547 } else {
d2e6a577
FG
2548 clog->error() << "inconsistent rstat on inode " << ino()
2549 << ", inode has " << pi->rstat
2550 << ", directory fragments have " << rstat;
// fails only when mds_verify_scatter is enabled
11fdf7f2 2551 ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter);
7c673cae
FG
2552 }
2553 // trust the dirfrag for now
// (keeping the inode's version and the newer rctime)
2554 version_t v = pi->rstat.version;
2555 if (pi->rstat.rctime > rstat.rctime)
2556 rstat.rctime = pi->rstat.rctime;
2557 pi->rstat = rstat;
2558 pi->rstat.version = v;
2559 }
2560 }
2561
2562 mdcache->broadcast_quota_to_client(this);
2563 }
2564 break;
2565
2566 case CEPH_LOCK_IDFT:
2567 break;
2568
2569 default:
2570 ceph_abort();
2571 }
2572}
2573
2574void CInode::finish_scatter_gather_update_accounted(int type, MutationRef& mut, EMetaBlob *metablob)
2575{
11fdf7f2
TL
2576 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2577 ceph_assert(is_auth());
7c673cae 2578
94b18763
FG
2579 for (const auto &p : dirfrags) {
2580 CDir *dir = p.second;
7c673cae
FG
2581 if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen())
2582 continue;
2583
2584 if (type == CEPH_LOCK_IDFT)
2585 continue; // nothing to do.
2586
2587 dout(10) << " journaling updated frag accounted_ on " << *dir << dendl;
11fdf7f2 2588 ceph_assert(dir->is_projected());
7c673cae
FG
2589 fnode_t *pf = dir->get_projected_fnode();
2590 pf->version = dir->pre_dirty();
2591 mut->add_projected_fnode(dir);
2592 metablob->add_dir(dir, true);
2593 mut->auth_pin(dir);
2594
2595 if (type == CEPH_LOCK_INEST)
2596 dir->assimilate_dirty_rstat_inodes_finish(mut, metablob);
2597 }
2598}
2599
2600// waiting
2601
2602bool CInode::is_frozen() const
2603{
2604 if (is_frozen_inode()) return true;
2605 if (parent && parent->dir->is_frozen()) return true;
2606 return false;
2607}
2608
2609bool CInode::is_frozen_dir() const
2610{
2611 if (parent && parent->dir->is_frozen_dir()) return true;
2612 return false;
2613}
2614
2615bool CInode::is_freezing() const
2616{
2617 if (is_freezing_inode()) return true;
2618 if (parent && parent->dir->is_freezing()) return true;
2619 return false;
2620}
2621
11fdf7f2 2622void CInode::add_dir_waiter(frag_t fg, MDSContext *c)
7c673cae
FG
2623{
2624 if (waiting_on_dir.empty())
2625 get(PIN_DIRWAITER);
2626 waiting_on_dir[fg].push_back(c);
11fdf7f2 2627 dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl;
7c673cae
FG
2628}
2629
11fdf7f2 2630void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls)
7c673cae
FG
2631{
2632 if (waiting_on_dir.empty())
2633 return;
2634
94b18763
FG
2635 auto it = waiting_on_dir.find(fg);
2636 if (it != waiting_on_dir.end()) {
2637 dout(10) << __func__ << " frag " << fg << " on " << *this << dendl;
11fdf7f2
TL
2638 auto& waiting = it->second;
2639 ls.insert(ls.end(), waiting.begin(), waiting.end());
94b18763 2640 waiting_on_dir.erase(it);
7c673cae
FG
2641
2642 if (waiting_on_dir.empty())
2643 put(PIN_DIRWAITER);
2644 }
2645}
2646
11fdf7f2 2647void CInode::add_waiter(uint64_t tag, MDSContext *c)
7c673cae 2648{
11fdf7f2 2649 dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c
7c673cae
FG
2650 << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH)
2651 << " !frozen " << !is_frozen_inode()
2652 << " !freezing " << !is_freezing_inode()
2653 << dendl;
2654 // wait on the directory?
2655 // make sure its not the inode that is explicitly ambiguous|freezing|frozen
2656 if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) ||
2657 ((tag & WAIT_UNFREEZE) &&
2658 !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
2659 dout(15) << "passing waiter up tree" << dendl;
2660 parent->dir->add_waiter(tag, c);
2661 return;
2662 }
2663 dout(15) << "taking waiter here" << dendl;
2664 MDSCacheObject::add_waiter(tag, c);
2665}
2666
11fdf7f2 2667void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls)
7c673cae
FG
2668{
2669 if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) {
2670 // take all dentry waiters
2671 while (!waiting_on_dir.empty()) {
94b18763
FG
2672 auto it = waiting_on_dir.begin();
2673 dout(10) << __func__ << " dirfrag " << it->first << " on " << *this << dendl;
11fdf7f2
TL
2674 auto& waiting = it->second;
2675 ls.insert(ls.end(), waiting.begin(), waiting.end());
94b18763 2676 waiting_on_dir.erase(it);
7c673cae
FG
2677 }
2678 put(PIN_DIRWAITER);
2679 }
2680
2681 // waiting
2682 MDSCacheObject::take_waiting(mask, ls);
2683}
2684
9f95a23c
TL
2685void CInode::maybe_finish_freeze_inode()
2686{
2687 CDir *dir = get_parent_dir();
2688 if (auth_pins > auth_pin_freeze_allowance || dir->frozen_inode_suppressed)
2689 return;
2690
2691 dout(10) << "maybe_finish_freeze_inode - frozen" << dendl;
2692 ceph_assert(auth_pins == auth_pin_freeze_allowance);
2693 get(PIN_FROZEN);
2694 put(PIN_FREEZING);
2695 state_clear(STATE_FREEZING);
2696 state_set(STATE_FROZEN);
2697
2698 item_freezing_inode.remove_myself();
2699 dir->num_frozen_inodes++;
2700
2701 finish_waiting(WAIT_FROZEN);
2702}
2703
7c673cae
FG
2704bool CInode::freeze_inode(int auth_pin_allowance)
2705{
9f95a23c
TL
2706 CDir *dir = get_parent_dir();
2707 ceph_assert(dir);
2708
11fdf7f2
TL
2709 ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins
2710 ceph_assert(auth_pins >= auth_pin_allowance);
9f95a23c
TL
2711 if (auth_pins == auth_pin_allowance && !dir->frozen_inode_suppressed) {
2712 dout(10) << "freeze_inode - frozen" << dendl;
2713 if (!state_test(STATE_FROZEN)) {
2714 get(PIN_FROZEN);
2715 state_set(STATE_FROZEN);
2716 dir->num_frozen_inodes++;
2717 }
2718 return true;
7c673cae
FG
2719 }
2720
9f95a23c
TL
2721 dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl;
2722 auth_pin_freeze_allowance = auth_pin_allowance;
2723 dir->freezing_inodes.push_back(&item_freezing_inode);
2724
2725 get(PIN_FREEZING);
2726 state_set(STATE_FREEZING);
2727
2728 if (!dir->lock_caches_with_auth_pins.empty())
2729 mdcache->mds->locker->invalidate_lock_caches(dir);
2730
2731 const static int lock_types[] = {
2732 CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT,
2733 CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0
2734 };
2735 for (int i = 0; lock_types[i]; ++i) {
2736 auto lock = get_lock(lock_types[i]);
2737 if (lock->is_cached())
2738 mdcache->mds->locker->invalidate_lock_caches(lock);
7c673cae 2739 }
9f95a23c
TL
2740 // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
2741 // and finish freezing the inode
2742 return state_test(STATE_FROZEN);
7c673cae
FG
2743}
2744
11fdf7f2 2745void CInode::unfreeze_inode(MDSContext::vec& finished)
7c673cae 2746{
11fdf7f2 2747 dout(10) << __func__ << dendl;
7c673cae
FG
2748 if (state_test(STATE_FREEZING)) {
2749 state_clear(STATE_FREEZING);
2750 put(PIN_FREEZING);
9f95a23c 2751 item_freezing_inode.remove_myself();
7c673cae
FG
2752 } else if (state_test(STATE_FROZEN)) {
2753 state_clear(STATE_FROZEN);
2754 put(PIN_FROZEN);
9f95a23c 2755 get_parent_dir()->num_frozen_inodes--;
7c673cae
FG
2756 } else
2757 ceph_abort();
2758 take_waiting(WAIT_UNFREEZE, finished);
2759}
2760
2761void CInode::unfreeze_inode()
2762{
11fdf7f2 2763 MDSContext::vec finished;
7c673cae
FG
2764 unfreeze_inode(finished);
2765 mdcache->mds->queue_waiters(finished);
2766}
2767
2768void CInode::freeze_auth_pin()
2769{
11fdf7f2 2770 ceph_assert(state_test(CInode::STATE_FROZEN));
7c673cae 2771 state_set(CInode::STATE_FROZENAUTHPIN);
9f95a23c 2772 get_parent_dir()->num_frozen_inodes++;
7c673cae
FG
2773}
2774
2775void CInode::unfreeze_auth_pin()
2776{
11fdf7f2 2777 ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN));
7c673cae 2778 state_clear(CInode::STATE_FROZENAUTHPIN);
9f95a23c 2779 get_parent_dir()->num_frozen_inodes--;
7c673cae 2780 if (!state_test(STATE_FREEZING|STATE_FROZEN)) {
11fdf7f2 2781 MDSContext::vec finished;
7c673cae
FG
2782 take_waiting(WAIT_UNFREEZE, finished);
2783 mdcache->mds->queue_waiters(finished);
2784 }
2785}
2786
11fdf7f2 2787void CInode::clear_ambiguous_auth(MDSContext::vec& finished)
7c673cae 2788{
11fdf7f2 2789 ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH));
7c673cae
FG
2790 state_clear(CInode::STATE_AMBIGUOUSAUTH);
2791 take_waiting(CInode::WAIT_SINGLEAUTH, finished);
2792}
2793
2794void CInode::clear_ambiguous_auth()
2795{
11fdf7f2 2796 MDSContext::vec finished;
7c673cae
FG
2797 clear_ambiguous_auth(finished);
2798 mdcache->mds->queue_waiters(finished);
2799}
2800
2801// auth_pins
91327a77
AA
2802bool CInode::can_auth_pin(int *err_ret) const {
2803 int err;
2804 if (!is_auth()) {
2805 err = ERR_NOT_AUTH;
2806 } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
2807 err = ERR_EXPORTING_INODE;
2808 } else {
2809 if (parent)
2810 return parent->can_auth_pin(err_ret);
2811 err = 0;
2812 }
2813 if (err && err_ret)
2814 *err_ret = err;
2815 return !err;
7c673cae
FG
2816}
2817
2818void CInode::auth_pin(void *by)
2819{
2820 if (auth_pins == 0)
2821 get(PIN_AUTHPIN);
2822 auth_pins++;
2823
2824#ifdef MDS_AUTHPIN_SET
2825 auth_pin_set.insert(by);
2826#endif
2827
11fdf7f2 2828 dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;
7c673cae
FG
2829
2830 if (parent)
11fdf7f2 2831 parent->adjust_nested_auth_pins(1, this);
7c673cae
FG
2832}
2833
2834void CInode::auth_unpin(void *by)
2835{
2836 auth_pins--;
2837
2838#ifdef MDS_AUTHPIN_SET
11fdf7f2
TL
2839 {
2840 auto it = auth_pin_set.find(by);
2841 ceph_assert(it != auth_pin_set.end());
2842 auth_pin_set.erase(it);
2843 }
7c673cae
FG
2844#endif
2845
2846 if (auth_pins == 0)
2847 put(PIN_AUTHPIN);
2848
11fdf7f2 2849 dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;
7c673cae 2850
11fdf7f2 2851 ceph_assert(auth_pins >= 0);
7c673cae
FG
2852
2853 if (parent)
11fdf7f2 2854 parent->adjust_nested_auth_pins(-1, by);
7c673cae 2855
9f95a23c
TL
2856 if (is_freezing_inode())
2857 maybe_finish_freeze_inode();
7c673cae
FG
2858}
2859
7c673cae
FG
2860// authority
2861
2862mds_authority_t CInode::authority() const
2863{
2864 if (inode_auth.first >= 0)
2865 return inode_auth;
2866
2867 if (parent)
2868 return parent->dir->authority();
2869
2870 // new items that are not yet linked in (in the committed plane) belong
2871 // to their first parent.
2872 if (!projected_parent.empty())
2873 return projected_parent.front()->dir->authority();
2874
2875 return CDIR_AUTH_UNDEF;
2876}
2877
2878
2879// SNAP
2880
2881snapid_t CInode::get_oldest_snap()
2882{
2883 snapid_t t = first;
2884 if (!old_inodes.empty())
2885 t = old_inodes.begin()->second.first;
11fdf7f2 2886 return std::min(t, oldest_snap);
7c673cae
FG
2887}
2888
94b18763 2889CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head)
7c673cae 2890{
11fdf7f2 2891 ceph_assert(follows >= first);
7c673cae 2892
94b18763
FG
2893 mempool_inode *pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
2894 mempool_xattr_map *px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();
7c673cae 2895
94b18763 2896 mempool_old_inode &old = old_inodes[follows];
7c673cae
FG
2897 old.first = first;
2898 old.inode = *pi;
2899 old.xattrs = *px;
2900
2901 if (first < oldest_snap)
2902 oldest_snap = first;
2903
2904 dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl;
2905
2906 old.inode.trim_client_ranges(follows);
2907
11fdf7f2 2908 if (g_conf()->mds_snap_rstat &&
7c673cae
FG
2909 !(old.inode.rstat == old.inode.accounted_rstat))
2910 dirty_old_rstats.insert(follows);
2911
2912 first = follows+1;
2913
11fdf7f2 2914 dout(10) << __func__ << " " << (cow_head ? "head" : "previous_head" )
7c673cae
FG
2915 << " to [" << old.first << "," << follows << "] on "
2916 << *this << dendl;
2917
2918 return old;
2919}
2920
2921void CInode::split_old_inode(snapid_t snap)
2922{
94b18763 2923 auto it = old_inodes.lower_bound(snap);
11fdf7f2 2924 ceph_assert(it != old_inodes.end() && it->second.first < snap);
7c673cae 2925
94b18763
FG
2926 mempool_old_inode &old = old_inodes[snap - 1];
2927 old = it->second;
7c673cae 2928
94b18763
FG
2929 it->second.first = snap;
2930 dout(10) << __func__ << " " << "[" << old.first << "," << it->first
2931 << "] to [" << snap << "," << it->first << "] on " << *this << dendl;
7c673cae
FG
2932}
2933
2934void CInode::pre_cow_old_inode()
2935{
11fdf7f2 2936 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
7c673cae
FG
2937 if (first <= follows)
2938 cow_old_inode(follows, true);
2939}
2940
11fdf7f2
TL
2941bool CInode::has_snap_data(snapid_t snapid)
2942{
2943 bool found = snapid >= first && snapid <= last;
2944 if (!found && is_multiversion()) {
2945 auto p = old_inodes.lower_bound(snapid);
2946 if (p != old_inodes.end()) {
2947 if (p->second.first > snapid) {
2948 if (p != old_inodes.begin())
2949 --p;
2950 }
2951 if (p->second.first <= snapid && snapid <= p->first) {
2952 found = true;
2953 }
2954 }
2955 }
2956 return found;
2957}
2958
7c673cae
FG
2959void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
2960{
11fdf7f2 2961 dout(10) << __func__ << " " << snaps << dendl;
7c673cae 2962
94b18763
FG
2963 for (auto it = old_inodes.begin(); it != old_inodes.end(); ) {
2964 const snapid_t &id = it->first;
2965 const auto &s = snaps.lower_bound(it->second.first);
2966 if (s == snaps.end() || *s > id) {
2967 dout(10) << " purging old_inode [" << it->second.first << "," << id << "]" << dendl;
2968 it = old_inodes.erase(it);
2969 } else {
2970 ++it;
2971 }
7c673cae
FG
2972 }
2973}
2974
2975/*
2976 * pick/create an old_inode
2977 */
94b18763 2978CInode::mempool_old_inode * CInode::pick_old_inode(snapid_t snap)
7c673cae 2979{
94b18763
FG
2980 auto it = old_inodes.lower_bound(snap); // p is first key >= to snap
2981 if (it != old_inodes.end() && it->second.first <= snap) {
2982 dout(10) << __func__ << " snap " << snap << " -> [" << it->second.first << "," << it->first << "]" << dendl;
2983 return &it->second;
7c673cae 2984 }
11fdf7f2 2985 dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl;
7c673cae
FG
2986 return NULL;
2987}
2988
2989void CInode::open_snaprealm(bool nosplit)
2990{
2991 if (!snaprealm) {
2992 SnapRealm *parent = find_snaprealm();
2993 snaprealm = new SnapRealm(mdcache, this);
2994 if (parent) {
11fdf7f2 2995 dout(10) << __func__ << " " << snaprealm
7c673cae
FG
2996 << " parent is " << parent
2997 << dendl;
2998 dout(30) << " siblings are " << parent->open_children << dendl;
2999 snaprealm->parent = parent;
3000 if (!nosplit)
3001 parent->split_at(snaprealm);
3002 parent->open_children.insert(snaprealm);
3003 }
3004 }
3005}
3006void CInode::close_snaprealm(bool nojoin)
3007{
3008 if (snaprealm) {
11fdf7f2 3009 dout(15) << __func__ << " " << *snaprealm << dendl;
7c673cae
FG
3010 snaprealm->close_parents();
3011 if (snaprealm->parent) {
3012 snaprealm->parent->open_children.erase(snaprealm);
3013 //if (!nojoin)
3014 //snaprealm->parent->join(snaprealm);
3015 }
3016 delete snaprealm;
3017 snaprealm = 0;
3018 }
3019}
3020
3021SnapRealm *CInode::find_snaprealm() const
3022{
3023 const CInode *cur = this;
3024 while (!cur->snaprealm) {
11fdf7f2
TL
3025 const CDentry *pdn = cur->get_oldest_parent_dn();
3026 if (!pdn)
7c673cae 3027 break;
11fdf7f2 3028 cur = pdn->get_dir()->get_inode();
7c673cae
FG
3029 }
3030 return cur->snaprealm;
3031}
3032
3033void CInode::encode_snap_blob(bufferlist &snapbl)
3034{
3035 if (snaprealm) {
11fdf7f2
TL
3036 using ceph::encode;
3037 encode(snaprealm->srnode, snapbl);
3038 dout(20) << __func__ << " " << *snaprealm << dendl;
7c673cae
FG
3039 }
3040}
11fdf7f2 3041void CInode::decode_snap_blob(const bufferlist& snapbl)
7c673cae 3042{
11fdf7f2 3043 using ceph::decode;
7c673cae
FG
3044 if (snapbl.length()) {
3045 open_snaprealm();
11fdf7f2
TL
3046 auto old_flags = snaprealm->srnode.flags;
3047 auto p = snapbl.cbegin();
3048 decode(snaprealm->srnode, p);
7c673cae
FG
3049 if (is_base()) {
3050 bool ok = snaprealm->_open_parents(NULL);
11fdf7f2
TL
3051 ceph_assert(ok);
3052 } else {
3053 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
3054 snaprealm->close_parents();
3055 snaprealm->adjust_parent();
3056 }
7c673cae 3057 }
11fdf7f2 3058 dout(20) << __func__ << " " << *snaprealm << dendl;
92f5a8d4
TL
3059 } else if (snaprealm &&
3060 !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
11fdf7f2
TL
3061 ceph_assert(mdcache->mds->is_any_replay());
3062 snaprealm->merge_to(NULL);
7c673cae
FG
3063 }
3064}
3065
3066void CInode::encode_snap(bufferlist& bl)
3067{
9f95a23c 3068 ENCODE_START(1, 1, bl);
7c673cae
FG
3069 bufferlist snapbl;
3070 encode_snap_blob(snapbl);
11fdf7f2
TL
3071 encode(snapbl, bl);
3072 encode(oldest_snap, bl);
9f95a23c 3073 ENCODE_FINISH(bl);
11fdf7f2 3074}
7c673cae 3075
11fdf7f2 3076void CInode::decode_snap(bufferlist::const_iterator& p)
7c673cae 3077{
9f95a23c 3078 DECODE_START(1, p);
7c673cae 3079 bufferlist snapbl;
11fdf7f2
TL
3080 decode(snapbl, p);
3081 decode(oldest_snap, p);
7c673cae 3082 decode_snap_blob(snapbl);
9f95a23c 3083 DECODE_FINISH(p);
7c673cae
FG
3084}
3085
3086// =============================================
3087
3088client_t CInode::calc_ideal_loner()
3089{
3090 if (mdcache->is_readonly())
3091 return -1;
11fdf7f2 3092 if (!get_mds_caps_wanted().empty())
7c673cae
FG
3093 return -1;
3094
3095 int n = 0;
3096 client_t loner = -1;
11fdf7f2
TL
3097 for (const auto &p : client_caps) {
3098 if (!p.second.is_stale() &&
9f95a23c
TL
3099 (is_dir() ?
3100 !has_subtree_or_exporting_dirfrag() :
3101 (p.second.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_RD)))) {
7c673cae
FG
3102 if (n)
3103 return -1;
3104 n++;
11fdf7f2 3105 loner = p.first;
7c673cae 3106 }
11fdf7f2 3107 }
7c673cae
FG
3108 return loner;
3109}
3110
b32b8144 3111bool CInode::choose_ideal_loner()
7c673cae
FG
3112{
3113 want_loner_cap = calc_ideal_loner();
b32b8144
FG
3114 int changed = false;
3115 if (loner_cap >= 0 && loner_cap != want_loner_cap) {
3116 if (!try_drop_loner())
3117 return false;
3118 changed = true;
3119 }
3120
3121 if (want_loner_cap >= 0) {
3122 if (loner_cap < 0) {
3123 set_loner_cap(want_loner_cap);
3124 changed = true;
3125 } else
11fdf7f2 3126 ceph_assert(loner_cap == want_loner_cap);
b32b8144
FG
3127 }
3128 return changed;
7c673cae
FG
3129}
3130
3131bool CInode::try_set_loner()
3132{
11fdf7f2 3133 ceph_assert(want_loner_cap >= 0);
7c673cae
FG
3134 if (loner_cap >= 0 && loner_cap != want_loner_cap)
3135 return false;
3136 set_loner_cap(want_loner_cap);
3137 return true;
3138}
3139
3140void CInode::set_loner_cap(client_t l)
3141{
3142 loner_cap = l;
3143 authlock.set_excl_client(loner_cap);
3144 filelock.set_excl_client(loner_cap);
3145 linklock.set_excl_client(loner_cap);
3146 xattrlock.set_excl_client(loner_cap);
3147}
3148
3149bool CInode::try_drop_loner()
3150{
3151 if (loner_cap < 0)
3152 return true;
3153
3154 int other_allowed = get_caps_allowed_by_type(CAP_ANY);
3155 Capability *cap = get_client_cap(loner_cap);
3156 if (!cap ||
3157 (cap->issued() & ~other_allowed) == 0) {
3158 set_loner_cap(-1);
3159 return true;
3160 }
3161 return false;
3162}
3163
3164
3165// choose new lock state during recovery, based on issued caps
3166void CInode::choose_lock_state(SimpleLock *lock, int allissued)
3167{
3168 int shift = lock->get_cap_shift();
3169 int issued = (allissued >> shift) & lock->get_cap_mask();
3170 if (is_auth()) {
3171 if (lock->is_xlocked()) {
3172 // do nothing here
3173 } else if (lock->get_state() != LOCK_MIX) {
3174 if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER))
3175 lock->set_state(LOCK_EXCL);
3176 else if (issued & CEPH_CAP_GWR)
3177 lock->set_state(LOCK_MIX);
3178 else if (lock->is_dirty()) {
3179 if (is_replicated())
3180 lock->set_state(LOCK_MIX);
3181 else
3182 lock->set_state(LOCK_LOCK);
3183 } else
3184 lock->set_state(LOCK_SYNC);
3185 }
3186 } else {
3187 // our states have already been chosen during rejoin.
3188 if (lock->is_xlocked())
11fdf7f2 3189 ceph_assert(lock->get_state() == LOCK_LOCK);
7c673cae
FG
3190 }
3191}
3192
3193void CInode::choose_lock_states(int dirty_caps)
3194{
3195 int issued = get_caps_issued() | dirty_caps;
b32b8144
FG
3196 if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR)))
3197 choose_ideal_loner();
7c673cae
FG
3198 choose_lock_state(&filelock, issued);
3199 choose_lock_state(&nestlock, issued);
3200 choose_lock_state(&dirfragtreelock, issued);
3201 choose_lock_state(&authlock, issued);
3202 choose_lock_state(&xattrlock, issued);
3203 choose_lock_state(&linklock, issued);
3204}
3205
9f95a23c
TL
3206int CInode::count_nonstale_caps()
3207{
3208 int n = 0;
3209 for (const auto &p : client_caps) {
3210 if (!p.second.is_stale())
3211 n++;
3212 }
3213 return n;
3214}
3215
3216bool CInode::multiple_nonstale_caps()
3217{
3218 int n = 0;
3219 for (const auto &p : client_caps) {
3220 if (!p.second.is_stale()) {
3221 if (n)
3222 return true;
3223 n++;
3224 }
3225 }
3226 return false;
3227}
3228
11fdf7f2
TL
3229void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m)
3230{
3231 bool old_empty = mds_caps_wanted.empty();
3232 mds_caps_wanted.swap(m);
3233 if (old_empty != (bool)mds_caps_wanted.empty()) {
3234 if (old_empty)
3235 adjust_num_caps_wanted(1);
3236 else
3237 adjust_num_caps_wanted(-1);
3238 }
3239}
3240
3241void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted)
3242{
3243 bool old_empty = mds_caps_wanted.empty();
3244 if (wanted) {
3245 mds_caps_wanted[mds] = wanted;
3246 if (old_empty)
3247 adjust_num_caps_wanted(1);
3248 } else if (!old_empty) {
3249 mds_caps_wanted.erase(mds);
3250 if (mds_caps_wanted.empty())
3251 adjust_num_caps_wanted(-1);
3252 }
3253}
3254
3255void CInode::adjust_num_caps_wanted(int d)
3256{
3257 if (!num_caps_wanted && d > 0)
3258 mdcache->open_file_table.add_inode(this);
3259 else if (num_caps_wanted > 0 && num_caps_wanted == -d)
3260 mdcache->open_file_table.remove_inode(this);
3261
3262 num_caps_wanted +=d;
3263 ceph_assert(num_caps_wanted >= 0);
3264}
3265
9f95a23c
TL
3266Capability *CInode::add_client_cap(client_t client, Session *session,
3267 SnapRealm *conrealm, bool new_inode)
7c673cae 3268{
11fdf7f2 3269 ceph_assert(last == CEPH_NOSNAP);
7c673cae
FG
3270 if (client_caps.empty()) {
3271 get(PIN_CAPS);
3272 if (conrealm)
3273 containing_realm = conrealm;
3274 else
3275 containing_realm = find_snaprealm();
3276 containing_realm->inodes_with_caps.push_back(&item_caps);
11fdf7f2 3277 dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl;
7c673cae 3278
7c673cae 3279 mdcache->num_inodes_with_caps++;
11fdf7f2
TL
3280 if (parent)
3281 parent->dir->adjust_num_inodes_with_caps(1);
3282 }
3283
9f95a23c 3284 uint64_t cap_id = new_inode ? 1 : ++mdcache->last_cap_id;
11fdf7f2
TL
3285 auto ret = client_caps.emplace(std::piecewise_construct, std::forward_as_tuple(client),
3286 std::forward_as_tuple(this, session, cap_id));
3287 ceph_assert(ret.second == true);
3288 Capability *cap = &ret.first->second;
7c673cae 3289
7c673cae 3290 cap->client_follows = first-1;
7c673cae 3291 containing_realm->add_cap(client, cap);
11fdf7f2 3292
7c673cae
FG
3293 return cap;
3294}
3295
3296void CInode::remove_client_cap(client_t client)
3297{
11fdf7f2
TL
3298 auto it = client_caps.find(client);
3299 ceph_assert(it != client_caps.end());
3300 Capability *cap = &it->second;
7c673cae
FG
3301
3302 cap->item_session_caps.remove_myself();
3303 cap->item_revoking_caps.remove_myself();
3304 cap->item_client_revoking_caps.remove_myself();
3305 containing_realm->remove_cap(client, cap);
3306
3307 if (client == loner_cap)
3308 loner_cap = -1;
3309
11fdf7f2
TL
3310 if (cap->wanted())
3311 adjust_num_caps_wanted(-1);
3312
3313 client_caps.erase(it);
7c673cae 3314 if (client_caps.empty()) {
11fdf7f2 3315 dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl;
7c673cae
FG
3316 put(PIN_CAPS);
3317 item_caps.remove_myself();
3318 containing_realm = NULL;
7c673cae 3319 mdcache->num_inodes_with_caps--;
11fdf7f2
TL
3320 if (parent)
3321 parent->dir->adjust_num_inodes_with_caps(-1);
7c673cae
FG
3322 }
3323
3324 //clean up advisory locks
3325 bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false;
3326 bool flock_removed = flock_locks ? flock_locks->remove_all_from(client) : false;
3327 if (fcntl_removed || flock_removed) {
11fdf7f2 3328 MDSContext::vec waiters;
7c673cae
FG
3329 take_waiting(CInode::WAIT_FLOCK, waiters);
3330 mdcache->mds->queue_waiters(waiters);
3331 }
3332}
3333
3334void CInode::move_to_realm(SnapRealm *realm)
3335{
11fdf7f2 3336 dout(10) << __func__ << " joining realm " << *realm
7c673cae 3337 << ", leaving realm " << *containing_realm << dendl;
11fdf7f2
TL
3338 for (auto& p : client_caps) {
3339 containing_realm->remove_cap(p.first, &p.second);
3340 realm->add_cap(p.first, &p.second);
7c673cae
FG
3341 }
3342 item_caps.remove_myself();
3343 realm->inodes_with_caps.push_back(&item_caps);
3344 containing_realm = realm;
3345}
3346
3347Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session)
3348{
3349 Capability *cap = get_client_cap(client);
3350 if (cap) {
3351 // FIXME?
3352 cap->merge(icr.capinfo.wanted, icr.capinfo.issued);
3353 } else {
3354 cap = add_client_cap(client, session);
3355 cap->set_cap_id(icr.capinfo.cap_id);
3356 cap->set_wanted(icr.capinfo.wanted);
3357 cap->issue_norevoke(icr.capinfo.issued);
3358 cap->reset_seq();
3359 }
3360 cap->set_last_issue_stamp(ceph_clock_now());
3361 return cap;
3362}
3363
3364void CInode::clear_client_caps_after_export()
3365{
3366 while (!client_caps.empty())
3367 remove_client_cap(client_caps.begin()->first);
3368 loner_cap = -1;
3369 want_loner_cap = -1;
11fdf7f2
TL
3370 if (!get_mds_caps_wanted().empty()) {
3371 mempool::mds_co::compact_map<int32_t,int32_t> empty;
3372 set_mds_caps_wanted(empty);
3373 }
7c673cae
FG
3374}
3375
3376void CInode::export_client_caps(map<client_t,Capability::Export>& cl)
3377{
11fdf7f2
TL
3378 for (const auto &p : client_caps) {
3379 cl[p.first] = p.second.make_export();
7c673cae
FG
3380 }
3381}
3382
3383 // caps allowed
3384int CInode::get_caps_liked() const
3385{
3386 if (is_dir())
3387 return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER
3388 else
3389 return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO;
3390}
3391
3392int CInode::get_caps_allowed_ever() const
3393{
3394 int allowed;
3395 if (is_dir())
3396 allowed = CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED;
3397 else
3398 allowed = CEPH_CAP_ANY;
3399 return allowed &
3400 (CEPH_CAP_PIN |
3401 (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) |
3402 (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) |
3403 (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) |
3404 (linklock.gcaps_allowed_ever() << linklock.get_cap_shift()));
3405}
3406
3407int CInode::get_caps_allowed_by_type(int type) const
3408{
3409 return
3410 CEPH_CAP_PIN |
3411 (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) |
3412 (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) |
3413 (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) |
3414 (linklock.gcaps_allowed(type) << linklock.get_cap_shift());
3415}
3416
3417int CInode::get_caps_careful() const
3418{
3419 return
3420 (filelock.gcaps_careful() << filelock.get_cap_shift()) |
3421 (authlock.gcaps_careful() << authlock.get_cap_shift()) |
3422 (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) |
3423 (linklock.gcaps_careful() << linklock.get_cap_shift());
3424}
3425
3426int CInode::get_xlocker_mask(client_t client) const
3427{
3428 return
3429 (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) |
3430 (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) |
3431 (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) |
3432 (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift());
3433}
3434
11fdf7f2
TL
3435int CInode::get_caps_allowed_for_client(Session *session, Capability *cap,
3436 mempool_inode *file_i) const
7c673cae 3437{
11fdf7f2 3438 client_t client = session->get_client();
7c673cae
FG
3439 int allowed;
3440 if (client == get_loner()) {
3441 // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
3442 allowed =
3443 get_caps_allowed_by_type(CAP_LONER) |
3444 (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client));
3445 } else {
3446 allowed = get_caps_allowed_by_type(CAP_ANY);
3447 }
3448
9f95a23c
TL
3449 if (is_dir()) {
3450 allowed &= ~CEPH_CAP_ANY_DIR_OPS;
3451 if (cap && (allowed & CEPH_CAP_FILE_EXCL))
3452 allowed |= cap->get_lock_cache_allowed();
3453 } else {
11fdf7f2
TL
3454 if (file_i->inline_data.version == CEPH_INLINE_NONE &&
3455 file_i->layout.pool_ns.empty()) {
3456 // noop
3457 } else if (cap) {
3458 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3459 cap->is_noinline()) ||
3460 (!file_i->layout.pool_ns.empty() &&
3461 cap->is_nopoolns()))
3462 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3463 } else {
3464 auto& conn = session->get_connection();
3465 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3466 !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) ||
3467 (!file_i->layout.pool_ns.empty() &&
3468 !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)))
3469 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3470 }
7c673cae
FG
3471 }
3472 return allowed;
3473}
3474
3475// caps issued, wanted
3476int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker,
3477 int shift, int mask)
3478{
3479 int c = 0;
3480 int loner = 0, other = 0, xlocker = 0;
3481 if (!is_auth()) {
3482 loner_cap = -1;
3483 }
3484
11fdf7f2
TL
3485 for (const auto &p : client_caps) {
3486 int i = p.second.issued();
7c673cae 3487 c |= i;
11fdf7f2 3488 if (p.first == loner_cap)
7c673cae
FG
3489 loner |= i;
3490 else
3491 other |= i;
11fdf7f2 3492 xlocker |= get_xlocker_mask(p.first) & i;
7c673cae
FG
3493 }
3494 if (ploner) *ploner = (loner >> shift) & mask;
3495 if (pother) *pother = (other >> shift) & mask;
3496 if (pxlocker) *pxlocker = (xlocker >> shift) & mask;
3497 return (c >> shift) & mask;
3498}
3499
3500bool CInode::is_any_caps_wanted() const
3501{
11fdf7f2
TL
3502 for (const auto &p : client_caps) {
3503 if (p.second.wanted())
7c673cae 3504 return true;
11fdf7f2 3505 }
7c673cae
FG
3506 return false;
3507}
3508
3509int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const
3510{
3511 int w = 0;
3512 int loner = 0, other = 0;
11fdf7f2
TL
3513 for (const auto &p : client_caps) {
3514 if (!p.second.is_stale()) {
3515 int t = p.second.wanted();
7c673cae 3516 w |= t;
11fdf7f2 3517 if (p.first == loner_cap)
7c673cae
FG
3518 loner |= t;
3519 else
3520 other |= t;
3521 }
3522 //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
3523 }
3524 if (is_auth())
94b18763
FG
3525 for (const auto &p : mds_caps_wanted) {
3526 w |= p.second;
3527 other |= p.second;
7c673cae
FG
3528 //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
3529 }
3530 if (ploner) *ploner = (loner >> shift) & mask;
3531 if (pother) *pother = (other >> shift) & mask;
3532 return (w >> shift) & mask;
3533}
3534
3535bool CInode::issued_caps_need_gather(SimpleLock *lock)
3536{
3537 int loner_issued, other_issued, xlocker_issued;
3538 get_caps_issued(&loner_issued, &other_issued, &xlocker_issued,
3539 lock->get_cap_shift(), lock->get_cap_mask());
3540 if ((loner_issued & ~lock->gcaps_allowed(CAP_LONER)) ||
3541 (other_issued & ~lock->gcaps_allowed(CAP_ANY)) ||
3542 (xlocker_issued & ~lock->gcaps_allowed(CAP_XLOCKER)))
3543 return true;
3544 return false;
3545}
3546
7c673cae
FG
3547
3548// =============================================
3549
3550int CInode::encode_inodestat(bufferlist& bl, Session *session,
3551 SnapRealm *dir_realm,
3552 snapid_t snapid,
3553 unsigned max_bytes,
3554 int getattr_caps)
3555{
11fdf7f2
TL
3556 client_t client = session->get_client();
3557 ceph_assert(snapid);
7c673cae
FG
3558
3559 bool valid = true;
3560
3561 // pick a version!
94b18763
FG
3562 mempool_inode *oi = &inode;
3563 mempool_inode *pi = get_projected_inode();
7c673cae 3564
94b18763 3565 CInode::mempool_xattr_map *pxattrs = nullptr;
7c673cae
FG
3566
3567 if (snapid != CEPH_NOSNAP) {
3568
3569 // for now at least, old_inodes is only defined/valid on the auth
3570 if (!is_auth())
3571 valid = false;
3572
3573 if (is_multiversion()) {
94b18763
FG
3574 auto it = old_inodes.lower_bound(snapid);
3575 if (it != old_inodes.end()) {
3576 if (it->second.first > snapid) {
3577 if (it != old_inodes.begin())
3578 --it;
7c673cae 3579 }
94b18763
FG
3580 if (it->second.first <= snapid && snapid <= it->first) {
3581 dout(15) << __func__ << " snapid " << snapid
3582 << " to old_inode [" << it->second.first << "," << it->first << "]"
3583 << " " << it->second.inode.rstat
7c673cae 3584 << dendl;
94b18763
FG
3585 auto &p = it->second;
3586 pi = oi = &p.inode;
3587 pxattrs = &p.xattrs;
7c673cae
FG
3588 } else {
3589 // snapshoted remote dentry can result this
11fdf7f2 3590 dout(0) << __func__ << " old_inode for snapid " << snapid
7c673cae
FG
3591 << " not found" << dendl;
3592 }
3593 }
3594 } else if (snapid < first || snapid > last) {
3595 // snapshoted remote dentry can result this
11fdf7f2 3596 dout(0) << __func__ << " [" << first << "," << last << "]"
7c673cae
FG
3597 << " not match snapid " << snapid << dendl;
3598 }
3599 }
3600
81eedcae 3601 utime_t snap_btime;
7c673cae 3602 SnapRealm *realm = find_snaprealm();
81eedcae
TL
3603 if (snapid != CEPH_NOSNAP && realm) {
3604 // add snapshot timestamp vxattr
3605 map<snapid_t,const SnapInfo*> infomap;
3606 realm->get_snap_info(infomap,
3607 snapid, // min
3608 snapid); // max
3609 if (!infomap.empty()) {
3610 ceph_assert(infomap.size() == 1);
3611 const SnapInfo *si = infomap.begin()->second;
3612 snap_btime = si->stamp;
3613 }
3614 }
3615
7c673cae
FG
3616
3617 bool no_caps = !valid ||
3618 session->is_stale() ||
3619 (dir_realm && realm != dir_realm) ||
3620 is_frozen() ||
3621 state_test(CInode::STATE_EXPORTINGCAPS);
3622 if (no_caps)
11fdf7f2 3623 dout(20) << __func__ << " no caps"
7c673cae
FG
3624 << (!valid?", !valid":"")
3625 << (session->is_stale()?", session stale ":"")
3626 << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"")
3627 << (is_frozen()?", frozen inode":"")
3628 << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"")
3629 << dendl;
3630
3631
3632 // "fake" a version that is old (stable) version, +1 if projected.
3633 version_t version = (oi->version * 2) + is_projected();
3634
3635 Capability *cap = get_client_cap(client);
3636 bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client;
3637 //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
3638 bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client;
3639 bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client;
3640 bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client;
3641
3642 bool plocal = versionlock.get_last_wrlock_client() == client;
3643 bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client;
3644
94b18763 3645 mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;
7c673cae
FG
3646
3647 dout(20) << " pfile " << pfile << " pauth " << pauth
3648 << " plink " << plink << " pxattr " << pxattr
3649 << " plocal " << plocal
3650 << " ctime " << any_i->ctime
3651 << " valid=" << valid << dendl;
3652
3653 // file
94b18763 3654 mempool_inode *file_i = pfile ? pi:oi;
7c673cae
FG
3655 file_layout_t layout;
3656 if (is_dir()) {
3657 layout = (ppolicy ? pi : oi)->layout;
3658 } else {
3659 layout = file_i->layout;
3660 }
3661
3662 // max_size is min of projected, actual
3663 uint64_t max_size =
11fdf7f2 3664 std::min(oi->client_ranges.count(client) ?
7c673cae
FG
3665 oi->client_ranges[client].range.last : 0,
3666 pi->client_ranges.count(client) ?
3667 pi->client_ranges[client].range.last : 0);
3668
3669 // inline data
3670 version_t inline_version = 0;
3671 bufferlist inline_data;
3672 if (file_i->inline_data.version == CEPH_INLINE_NONE) {
3673 inline_version = CEPH_INLINE_NONE;
3674 } else if ((!cap && !no_caps) ||
3675 (cap && cap->client_inline_version < file_i->inline_data.version) ||
3676 (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data
3677 inline_version = file_i->inline_data.version;
3678 if (file_i->inline_data.length() > 0)
3679 inline_data = file_i->inline_data.get_data();
3680 }
3681
3682 // nest (do same as file... :/)
3683 if (cap) {
3684 cap->last_rbytes = file_i->rstat.rbytes;
3685 cap->last_rsize = file_i->rstat.rsize();
3686 }
3687
3688 // auth
94b18763 3689 mempool_inode *auth_i = pauth ? pi:oi;
7c673cae
FG
3690
3691 // link
94b18763 3692 mempool_inode *link_i = plink ? pi:oi;
7c673cae
FG
3693
3694 // xattr
94b18763 3695 mempool_inode *xattr_i = pxattr ? pi:oi;
7c673cae 3696
11fdf7f2 3697 using ceph::encode;
7c673cae 3698 // xattr
7c673cae
FG
3699 version_t xattr_version;
3700 if ((!cap && !no_caps) ||
3701 (cap && cap->client_xattr_version < xattr_i->xattr_version) ||
3702 (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs
3703 if (!pxattrs)
3704 pxattrs = pxattr ? get_projected_xattrs() : &xattrs;
7c673cae
FG
3705 xattr_version = xattr_i->xattr_version;
3706 } else {
3707 xattr_version = 0;
3708 }
3709
3710 // do we have room?
3711 if (max_bytes) {
11fdf7f2
TL
3712 unsigned bytes =
3713 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) +
3714 sizeof(struct ceph_file_layout) +
3715 sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq
3716 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
3717 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime
3718 sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree
3719 sizeof(__u32) + symlink.length() + // symlink
3720 sizeof(struct ceph_dir_layout); // dir_layout
3721
3722 if (xattr_version) {
3723 bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries
3724 if (pxattrs) {
3725 for (const auto &p : *pxattrs)
3726 bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length();
3727 }
3728 } else {
3729 bytes += sizeof(__u32); // xattr buffer len
3730 }
3731 bytes +=
3732 sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data
3733 1 + 1 + 8 + 8 + 4 + // quota
3734 4 + layout.pool_ns.size() + // pool ns
3735 sizeof(struct ceph_timespec) + 8; // btime + change_attr
3736
7c673cae
FG
3737 if (bytes > max_bytes)
3738 return -ENOSPC;
3739 }
3740
3741
3742 // encode caps
3743 struct ceph_mds_reply_cap ecap;
3744 if (snapid != CEPH_NOSNAP) {
3745 /*
3746 * snapped inodes (files or dirs) only get read-only caps. always
3747 * issue everything possible, since it is read only.
3748 *
3749 * if a snapped inode has caps, limit issued caps based on the
3750 * lock state.
3751 *
3752 * if it is a live inode, limit issued caps based on the lock
3753 * state.
3754 *
3755 * do NOT adjust cap issued state, because the client always
3756 * tracks caps per-snap and the mds does either per-interval or
3757 * multiversion.
3758 */
3759 ecap.caps = valid ? get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE;
3760 if (last == CEPH_NOSNAP || is_any_caps())
11fdf7f2 3761 ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i);
7c673cae
FG
3762 ecap.seq = 0;
3763 ecap.mseq = 0;
3764 ecap.realm = 0;
3765 } else {
3766 if (!no_caps && !cap) {
3767 // add a new cap
3768 cap = add_client_cap(client, session, realm);
b32b8144
FG
3769 if (is_auth())
3770 choose_ideal_loner();
7c673cae
FG
3771 }
3772
3773 int issue = 0;
3774 if (!no_caps && cap) {
3775 int likes = get_caps_liked();
11fdf7f2 3776 int allowed = get_caps_allowed_for_client(session, cap, file_i);
7c673cae 3777 issue = (cap->wanted() | likes) & allowed;
494da23a 3778 cap->issue_norevoke(issue, true);
7c673cae
FG
3779 issue = cap->pending();
3780 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3781 << " seq " << cap->get_last_seq() << dendl;
3782 } else if (cap && cap->is_new() && !dir_realm) {
3783 // alway issue new caps to client, otherwise the caps get lost
11fdf7f2 3784 ceph_assert(cap->is_stale());
494da23a
TL
3785 ceph_assert(!cap->pending());
3786 issue = CEPH_CAP_PIN;
3787 cap->issue_norevoke(issue, true);
7c673cae
FG
3788 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3789 << " seq " << cap->get_last_seq()
494da23a 3790 << "(stale&new caps)" << dendl;
7c673cae
FG
3791 }
3792
3793 if (issue) {
3794 cap->set_last_issue();
3795 cap->set_last_issue_stamp(ceph_clock_now());
7c673cae
FG
3796 ecap.caps = issue;
3797 ecap.wanted = cap->wanted();
3798 ecap.cap_id = cap->get_cap_id();
3799 ecap.seq = cap->get_last_seq();
3800 ecap.mseq = cap->get_mseq();
3801 ecap.realm = realm->inode->ino();
3802 } else {
3803 ecap.cap_id = 0;
3804 ecap.caps = 0;
3805 ecap.seq = 0;
3806 ecap.mseq = 0;
3807 ecap.realm = 0;
3808 ecap.wanted = 0;
3809 }
3810 }
3811 ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0;
3812 dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps)
3813 << " seq " << ecap.seq << " mseq " << ecap.mseq
11fdf7f2 3814 << " xattrv " << xattr_version << dendl;
7c673cae
FG
3815
3816 if (inline_data.length() && cap) {
3817 if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) {
3818 dout(10) << "including inline version " << inline_version << dendl;
3819 cap->client_inline_version = inline_version;
3820 } else {
3821 dout(10) << "dropping inline version " << inline_version << dendl;
3822 inline_version = 0;
3823 inline_data.clear();
3824 }
3825 }
3826
3827 // include those xattrs?
11fdf7f2 3828 if (xattr_version && cap) {
7c673cae 3829 if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) {
11fdf7f2
TL
3830 dout(10) << "including xattrs version " << xattr_version << dendl;
3831 cap->client_xattr_version = xattr_version;
7c673cae 3832 } else {
11fdf7f2 3833 dout(10) << "dropping xattrs version " << xattr_version << dendl;
7c673cae
FG
3834 xattr_version = 0;
3835 }
3836 }
3837
11fdf7f2
TL
3838 // The end result of encode_xattrs() is equivalent to:
3839 // {
3840 // bufferlist xbl;
3841 // if (xattr_version) {
3842 // if (pxattrs)
3843 // encode(*pxattrs, bl);
3844 // else
3845 // encode((__u32)0, bl);
3846 // }
3847 // encode(xbl, bl);
3848 // }
3849 //
3850 // But encoding xattrs into the 'xbl' requires a memory allocation.
3851 // The 'bl' should have enough pre-allocated memory in most cases.
3852 // Encoding xattrs directly into it can avoid the extra allocation.
3853 auto encode_xattrs = [xattr_version, pxattrs, &bl]() {
3854 using ceph::encode;
3855 if (xattr_version) {
3856 ceph_le32 xbl_len;
3857 auto filler = bl.append_hole(sizeof(xbl_len));
3858 const auto starting_bl_len = bl.length();
3859 if (pxattrs)
3860 encode(*pxattrs, bl);
3861 else
3862 encode((__u32)0, bl);
3863 xbl_len = bl.length() - starting_bl_len;
3864 filler.copy_in(sizeof(xbl_len), (char *)&xbl_len);
3865 } else {
3866 encode((__u32)0, bl);
3867 }
3868 };
3869
7c673cae
FG
3870 /*
3871 * note: encoding matches MClientReply::InodeStat
3872 */
11fdf7f2 3873 if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
81eedcae 3874 ENCODE_START(3, 1, bl);
11fdf7f2
TL
3875 encode(oi->ino, bl);
3876 encode(snapid, bl);
3877 encode(oi->rdev, bl);
3878 encode(version, bl);
3879 encode(xattr_version, bl);
3880 encode(ecap, bl);
3881 {
3882 ceph_file_layout legacy_layout;
3883 layout.to_legacy(&legacy_layout);
3884 encode(legacy_layout, bl);
3885 }
3886 encode(any_i->ctime, bl);
3887 encode(file_i->mtime, bl);
3888 encode(file_i->atime, bl);
3889 encode(file_i->time_warp_seq, bl);
3890 encode(file_i->size, bl);
3891 encode(max_size, bl);
3892 encode(file_i->truncate_size, bl);
3893 encode(file_i->truncate_seq, bl);
3894 encode(auth_i->mode, bl);
3895 encode((uint32_t)auth_i->uid, bl);
3896 encode((uint32_t)auth_i->gid, bl);
3897 encode(link_i->nlink, bl);
3898 encode(file_i->dirstat.nfiles, bl);
3899 encode(file_i->dirstat.nsubdirs, bl);
3900 encode(file_i->rstat.rbytes, bl);
3901 encode(file_i->rstat.rfiles, bl);
3902 encode(file_i->rstat.rsubdirs, bl);
3903 encode(file_i->rstat.rctime, bl);
3904 dirfragtree.encode(bl);
3905 encode(symlink, bl);
3906 encode(file_i->dir_layout, bl);
3907 encode_xattrs();
3908 encode(inline_version, bl);
3909 encode(inline_data, bl);
94b18763 3910 mempool_inode *policy_i = ppolicy ? pi : oi;
11fdf7f2
TL
3911 encode(policy_i->quota, bl);
3912 encode(layout.pool_ns, bl);
3913 encode(any_i->btime, bl);
3914 encode(any_i->change_attr, bl);
3915 encode(file_i->export_pin, bl);
81eedcae 3916 encode(snap_btime, bl);
11fdf7f2
TL
3917 ENCODE_FINISH(bl);
3918 }
3919 else {
3920 ceph_assert(session->get_connection());
3921
3922 encode(oi->ino, bl);
3923 encode(snapid, bl);
3924 encode(oi->rdev, bl);
3925 encode(version, bl);
3926 encode(xattr_version, bl);
3927 encode(ecap, bl);
3928 {
3929 ceph_file_layout legacy_layout;
3930 layout.to_legacy(&legacy_layout);
3931 encode(legacy_layout, bl);
3932 }
3933 encode(any_i->ctime, bl);
3934 encode(file_i->mtime, bl);
3935 encode(file_i->atime, bl);
3936 encode(file_i->time_warp_seq, bl);
3937 encode(file_i->size, bl);
3938 encode(max_size, bl);
3939 encode(file_i->truncate_size, bl);
3940 encode(file_i->truncate_seq, bl);
3941 encode(auth_i->mode, bl);
3942 encode((uint32_t)auth_i->uid, bl);
3943 encode((uint32_t)auth_i->gid, bl);
3944 encode(link_i->nlink, bl);
3945 encode(file_i->dirstat.nfiles, bl);
3946 encode(file_i->dirstat.nsubdirs, bl);
3947 encode(file_i->rstat.rbytes, bl);
3948 encode(file_i->rstat.rfiles, bl);
3949 encode(file_i->rstat.rsubdirs, bl);
3950 encode(file_i->rstat.rctime, bl);
3951 dirfragtree.encode(bl);
3952 encode(symlink, bl);
3953 auto& conn = session->get_connection();
3954 if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) {
3955 encode(file_i->dir_layout, bl);
3956 }
3957 encode_xattrs();
3958 if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
3959 encode(inline_version, bl);
3960 encode(inline_data, bl);
3961 }
3962 if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) {
3963 mempool_inode *policy_i = ppolicy ? pi : oi;
3964 encode(policy_i->quota, bl);
3965 }
3966 if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
3967 encode(layout.pool_ns, bl);
3968 }
3969 if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) {
3970 encode(any_i->btime, bl);
3971 encode(any_i->change_attr, bl);
3972 }
7c673cae
FG
3973 }
3974
3975 return valid;
3976}
3977
9f95a23c 3978void CInode::encode_cap_message(const ref_t<MClientCaps> &m, Capability *cap)
7c673cae 3979{
11fdf7f2 3980 ceph_assert(cap);
7c673cae
FG
3981
3982 client_t client = cap->get_client();
3983
3984 bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL);
3985 bool pauth = authlock.is_xlocked_by_client(client);
3986 bool plink = linklock.is_xlocked_by_client(client);
3987 bool pxattr = xattrlock.is_xlocked_by_client(client);
3988
94b18763
FG
3989 mempool_inode *oi = &inode;
3990 mempool_inode *pi = get_projected_inode();
3991 mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi;
7c673cae 3992
11fdf7f2 3993 dout(20) << __func__ << " pfile " << pfile
7c673cae
FG
3994 << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
3995 << " ctime " << i->ctime << dendl;
3996
3997 i = pfile ? pi:oi;
3998 m->set_layout(i->layout);
3999 m->size = i->size;
4000 m->truncate_seq = i->truncate_seq;
4001 m->truncate_size = i->truncate_size;
4002 m->mtime = i->mtime;
4003 m->atime = i->atime;
4004 m->ctime = i->ctime;
4005 m->change_attr = i->change_attr;
4006 m->time_warp_seq = i->time_warp_seq;
28e407b8
AA
4007 m->nfiles = i->dirstat.nfiles;
4008 m->nsubdirs = i->dirstat.nsubdirs;
7c673cae
FG
4009
4010 if (cap->client_inline_version < i->inline_data.version) {
4011 m->inline_version = cap->client_inline_version = i->inline_data.version;
4012 if (i->inline_data.length() > 0)
4013 m->inline_data = i->inline_data.get_data();
4014 } else {
4015 m->inline_version = 0;
4016 }
4017
4018 // max_size is min of projected, actual.
4019 uint64_t oldms = oi->client_ranges.count(client) ? oi->client_ranges[client].range.last : 0;
4020 uint64_t newms = pi->client_ranges.count(client) ? pi->client_ranges[client].range.last : 0;
11fdf7f2 4021 m->max_size = std::min(oldms, newms);
7c673cae
FG
4022
4023 i = pauth ? pi:oi;
4024 m->head.mode = i->mode;
4025 m->head.uid = i->uid;
4026 m->head.gid = i->gid;
4027
4028 i = plink ? pi:oi;
4029 m->head.nlink = i->nlink;
4030
11fdf7f2 4031 using ceph::encode;
7c673cae 4032 i = pxattr ? pi:oi;
94b18763 4033 auto ix = pxattr ? get_projected_xattrs() : &xattrs;
7c673cae
FG
4034 if ((cap->pending() & CEPH_CAP_XATTR_SHARED) &&
4035 i->xattr_version > cap->client_xattr_version) {
4036 dout(10) << " including xattrs v " << i->xattr_version << dendl;
11fdf7f2 4037 encode(*ix, m->xattrbl);
7c673cae
FG
4038 m->head.xattr_version = i->xattr_version;
4039 cap->client_xattr_version = i->xattr_version;
4040 }
4041}
4042
4043
4044
4045void CInode::_encode_base(bufferlist& bl, uint64_t features)
4046{
9f95a23c 4047 ENCODE_START(1, 1, bl);
11fdf7f2
TL
4048 encode(first, bl);
4049 encode(inode, bl, features);
4050 encode(symlink, bl);
4051 encode(dirfragtree, bl);
4052 encode(xattrs, bl);
4053 encode(old_inodes, bl, features);
4054 encode(damage_flags, bl);
7c673cae 4055 encode_snap(bl);
9f95a23c 4056 ENCODE_FINISH(bl);
7c673cae 4057}
11fdf7f2 4058void CInode::_decode_base(bufferlist::const_iterator& p)
7c673cae 4059{
9f95a23c 4060 DECODE_START(1, p);
11fdf7f2
TL
4061 decode(first, p);
4062 decode(inode, p);
94b18763
FG
4063 {
4064 std::string tmp;
11fdf7f2
TL
4065 decode(tmp, p);
4066 symlink = std::string_view(tmp);
94b18763 4067 }
11fdf7f2 4068 decode(dirfragtree, p);
e306af50 4069 decode_noshare(xattrs, p);
11fdf7f2
TL
4070 decode(old_inodes, p);
4071 decode(damage_flags, p);
7c673cae 4072 decode_snap(p);
9f95a23c 4073 DECODE_FINISH(p);
7c673cae
FG
4074}
4075
4076void CInode::_encode_locks_full(bufferlist& bl)
4077{
11fdf7f2
TL
4078 using ceph::encode;
4079 encode(authlock, bl);
4080 encode(linklock, bl);
4081 encode(dirfragtreelock, bl);
4082 encode(filelock, bl);
4083 encode(xattrlock, bl);
4084 encode(snaplock, bl);
4085 encode(nestlock, bl);
4086 encode(flocklock, bl);
4087 encode(policylock, bl);
4088
4089 encode(loner_cap, bl);
4090}
4091void CInode::_decode_locks_full(bufferlist::const_iterator& p)
4092{
4093 using ceph::decode;
4094 decode(authlock, p);
4095 decode(linklock, p);
4096 decode(dirfragtreelock, p);
4097 decode(filelock, p);
4098 decode(xattrlock, p);
4099 decode(snaplock, p);
4100 decode(nestlock, p);
4101 decode(flocklock, p);
4102 decode(policylock, p);
4103
4104 decode(loner_cap, p);
7c673cae
FG
4105 set_loner_cap(loner_cap);
4106 want_loner_cap = loner_cap; // for now, we'll eval() shortly.
4107}
4108
b32b8144 4109void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
7c673cae 4110{
9f95a23c 4111 ENCODE_START(1, 1, bl);
7c673cae
FG
4112 authlock.encode_state_for_replica(bl);
4113 linklock.encode_state_for_replica(bl);
4114 dirfragtreelock.encode_state_for_replica(bl);
4115 filelock.encode_state_for_replica(bl);
4116 nestlock.encode_state_for_replica(bl);
4117 xattrlock.encode_state_for_replica(bl);
4118 snaplock.encode_state_for_replica(bl);
4119 flocklock.encode_state_for_replica(bl);
4120 policylock.encode_state_for_replica(bl);
11fdf7f2 4121 encode(need_recover, bl);
9f95a23c 4122 ENCODE_FINISH(bl);
7c673cae 4123}
b32b8144 4124
7c673cae
FG
4125void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
4126{
4127 authlock.encode_state_for_replica(bl);
4128 linklock.encode_state_for_replica(bl);
4129 dirfragtreelock.encode_state_for_rejoin(bl, rep);
4130 filelock.encode_state_for_rejoin(bl, rep);
4131 nestlock.encode_state_for_rejoin(bl, rep);
4132 xattrlock.encode_state_for_replica(bl);
4133 snaplock.encode_state_for_replica(bl);
4134 flocklock.encode_state_for_replica(bl);
4135 policylock.encode_state_for_replica(bl);
4136}
b32b8144 4137
9f95a23c 4138void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new)
7c673cae 4139{
9f95a23c 4140 DECODE_START(1, p);
7c673cae
FG
4141 authlock.decode_state(p, is_new);
4142 linklock.decode_state(p, is_new);
4143 dirfragtreelock.decode_state(p, is_new);
4144 filelock.decode_state(p, is_new);
4145 nestlock.decode_state(p, is_new);
4146 xattrlock.decode_state(p, is_new);
4147 snaplock.decode_state(p, is_new);
4148 flocklock.decode_state(p, is_new);
4149 policylock.decode_state(p, is_new);
b32b8144
FG
4150
4151 bool need_recover;
11fdf7f2 4152 decode(need_recover, p);
b32b8144
FG
4153 if (need_recover && is_new) {
4154 // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
4155 // and change the object when replaying unsafe requests.
4156 authlock.mark_need_recover();
4157 linklock.mark_need_recover();
4158 dirfragtreelock.mark_need_recover();
4159 filelock.mark_need_recover();
4160 nestlock.mark_need_recover();
4161 xattrlock.mark_need_recover();
4162 snaplock.mark_need_recover();
4163 flocklock.mark_need_recover();
4164 policylock.mark_need_recover();
4165 }
9f95a23c 4166 DECODE_FINISH(p);
7c673cae 4167}
11fdf7f2 4168void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
b32b8144
FG
4169 list<SimpleLock*>& eval_locks, bool survivor)
4170{
4171 authlock.decode_state_rejoin(p, waiters, survivor);
4172 linklock.decode_state_rejoin(p, waiters, survivor);
4173 dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
4174 filelock.decode_state_rejoin(p, waiters, survivor);
4175 nestlock.decode_state_rejoin(p, waiters, survivor);
4176 xattrlock.decode_state_rejoin(p, waiters, survivor);
4177 snaplock.decode_state_rejoin(p, waiters, survivor);
4178 flocklock.decode_state_rejoin(p, waiters, survivor);
4179 policylock.decode_state_rejoin(p, waiters, survivor);
7c673cae
FG
4180
4181 if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
4182 eval_locks.push_back(&dirfragtreelock);
4183 if (!filelock.is_stable() && !filelock.is_wrlocked())
4184 eval_locks.push_back(&filelock);
4185 if (!nestlock.is_stable() && !nestlock.is_wrlocked())
4186 eval_locks.push_back(&nestlock);
4187}
4188
4189
4190// IMPORT/EXPORT
4191
4192void CInode::encode_export(bufferlist& bl)
4193{
4194 ENCODE_START(5, 4, bl);
4195 _encode_base(bl, mdcache->mds->mdsmap->get_up_features());
4196
11fdf7f2 4197 encode(state, bl);
7c673cae 4198
11fdf7f2 4199 encode(pop, bl);
7c673cae 4200
11fdf7f2 4201 encode(get_replicas(), bl);
7c673cae
FG
4202
4203 // include scatterlock info for any bounding CDirs
4204 bufferlist bounding;
4205 if (inode.is_dir())
94b18763
FG
4206 for (const auto &p : dirfrags) {
4207 CDir *dir = p.second;
7c673cae 4208 if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
11fdf7f2
TL
4209 encode(p.first, bounding);
4210 encode(dir->fnode.fragstat, bounding);
4211 encode(dir->fnode.accounted_fragstat, bounding);
4212 encode(dir->fnode.rstat, bounding);
4213 encode(dir->fnode.accounted_rstat, bounding);
7c673cae
FG
4214 dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
4215 }
4216 }
11fdf7f2 4217 encode(bounding, bl);
7c673cae
FG
4218
4219 _encode_locks_full(bl);
4220
4221 _encode_file_locks(bl);
4222
4223 ENCODE_FINISH(bl);
4224
4225 get(PIN_TEMPEXPORTING);
4226}
4227
11fdf7f2 4228void CInode::finish_export()
7c673cae
FG
4229{
4230 state &= MASK_STATE_EXPORT_KEPT;
4231
11fdf7f2 4232 pop.zero();
7c673cae
FG
4233
4234 // just in case!
4235 //dirlock.clear_updated();
4236
4237 loner_cap = -1;
4238
4239 put(PIN_TEMPEXPORTING);
4240}
4241
11fdf7f2 4242void CInode::decode_import(bufferlist::const_iterator& p,
7c673cae
FG
4243 LogSegment *ls)
4244{
4245 DECODE_START(5, p);
4246
4247 _decode_base(p);
4248
4249 unsigned s;
11fdf7f2 4250 decode(s, p);
7c673cae
FG
4251 state_set(STATE_AUTH | (s & MASK_STATE_EXPORTED));
4252
4253 if (is_dirty()) {
4254 get(PIN_DIRTY);
4255 _mark_dirty(ls);
4256 }
4257 if (is_dirty_parent()) {
4258 get(PIN_DIRTYPARENT);
28e407b8 4259 mark_dirty_parent(ls);
7c673cae
FG
4260 }
4261
11fdf7f2 4262 decode(pop, p);
7c673cae 4263
11fdf7f2 4264 decode(get_replicas(), p);
181888fb 4265 if (is_replicated())
7c673cae
FG
4266 get(PIN_REPLICATED);
4267 replica_nonce = 0;
4268
4269 // decode fragstat info on bounding cdirs
4270 bufferlist bounding;
11fdf7f2
TL
4271 decode(bounding, p);
4272 auto q = bounding.cbegin();
7c673cae
FG
4273 while (!q.end()) {
4274 frag_t fg;
11fdf7f2 4275 decode(fg, q);
7c673cae 4276 CDir *dir = get_dirfrag(fg);
11fdf7f2 4277 ceph_assert(dir); // we should have all bounds open
7c673cae
FG
4278
4279 // Only take the remote's fragstat/rstat if we are non-auth for
4280 // this dirfrag AND the lock is NOT in a scattered (MIX) state.
4281 // We know lock is stable, and MIX is the only state in which
4282 // the inode auth (who sent us this data) may not have the best
4283 // info.
4284
4285 // HMM: Are there cases where dir->is_auth() is an insufficient
4286 // check because the dirfrag is under migration? That implies
4287 // it is frozen (and in a SYNC or LOCK state). FIXME.
4288
4289 if (dir->is_auth() ||
4290 filelock.get_state() == LOCK_MIX) {
4291 dout(10) << " skipped fragstat info for " << *dir << dendl;
4292 frag_info_t f;
11fdf7f2
TL
4293 decode(f, q);
4294 decode(f, q);
7c673cae 4295 } else {
11fdf7f2
TL
4296 decode(dir->fnode.fragstat, q);
4297 decode(dir->fnode.accounted_fragstat, q);
7c673cae
FG
4298 dout(10) << " took fragstat info for " << *dir << dendl;
4299 }
4300 if (dir->is_auth() ||
4301 nestlock.get_state() == LOCK_MIX) {
4302 dout(10) << " skipped rstat info for " << *dir << dendl;
4303 nest_info_t n;
11fdf7f2
TL
4304 decode(n, q);
4305 decode(n, q);
7c673cae 4306 } else {
11fdf7f2
TL
4307 decode(dir->fnode.rstat, q);
4308 decode(dir->fnode.accounted_rstat, q);
7c673cae
FG
4309 dout(10) << " took rstat info for " << *dir << dendl;
4310 }
4311 }
4312
4313 _decode_locks_full(p);
4314
4315 _decode_file_locks(p);
4316
4317 DECODE_FINISH(p);
4318}
4319
4320
4321void InodeStoreBase::dump(Formatter *f) const
4322{
4323 inode.dump(f);
4324 f->dump_string("symlink", symlink);
9f95a23c
TL
4325
4326 f->open_array_section("xattrs");
4327 for (const auto& [key, val] : xattrs) {
4328 f->open_object_section("xattr");
4329 f->dump_string("key", key);
4330 std::string v(val.c_str(), val.length());
4331 f->dump_string("val", v);
4332 f->close_section();
4333 }
4334 f->close_section();
4335 f->open_object_section("dirfragtree");
4336 dirfragtree.dump(f);
4337 f->close_section(); // dirfragtree
4338
7c673cae 4339 f->open_array_section("old_inodes");
94b18763 4340 for (const auto &p : old_inodes) {
7c673cae 4341 f->open_object_section("old_inode");
94b18763
FG
4342 // The key is the last snapid, the first is in the mempool_old_inode
4343 f->dump_int("last", p.first);
4344 p.second.dump(f);
7c673cae
FG
4345 f->close_section(); // old_inode
4346 }
4347 f->close_section(); // old_inodes
4348
9f95a23c
TL
4349 f->dump_unsigned("oldest_snap", oldest_snap);
4350 f->dump_unsigned("damage_flags", damage_flags);
7c673cae
FG
4351}
4352
4353
9f95a23c 4354void InodeStore::generate_test_instances(std::list<InodeStore*> &ls)
7c673cae
FG
4355{
4356 InodeStore *populated = new InodeStore;
4357 populated->inode.ino = 0xdeadbeef;
4358 populated->symlink = "rhubarb";
4359 ls.push_back(populated);
4360}
4361
9f95a23c 4362void InodeStoreBare::generate_test_instances(std::list<InodeStoreBare*> &ls)
11fdf7f2
TL
4363{
4364 InodeStoreBare *populated = new InodeStoreBare;
4365 populated->inode.ino = 0xdeadbeef;
4366 populated->symlink = "rhubarb";
4367 ls.push_back(populated);
4368}
4369
7c673cae 4370void CInode::validate_disk_state(CInode::validated_data *results,
11fdf7f2 4371 MDSContext *fin)
7c673cae
FG
4372{
4373 class ValidationContinuation : public MDSContinuation {
4374 public:
11fdf7f2 4375 MDSContext *fin;
7c673cae
FG
4376 CInode *in;
4377 CInode::validated_data *results;
4378 bufferlist bl;
4379 CInode *shadow_in;
4380
4381 enum {
4382 START = 0,
4383 BACKTRACE,
4384 INODE,
11fdf7f2
TL
4385 DIRFRAGS,
4386 SNAPREALM,
7c673cae
FG
4387 };
4388
4389 ValidationContinuation(CInode *i,
4390 CInode::validated_data *data_r,
11fdf7f2 4391 MDSContext *fin_) :
7c673cae
FG
4392 MDSContinuation(i->mdcache->mds->server),
4393 fin(fin_),
4394 in(i),
4395 results(data_r),
4396 shadow_in(NULL) {
4397 set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start));
4398 set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace));
4399 set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk));
4400 set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags));
11fdf7f2 4401 set_callback(SNAPREALM, static_cast<Continuation::stagePtr>(&ValidationContinuation::_snaprealm));
7c673cae
FG
4402 }
4403
4404 ~ValidationContinuation() override {
b32b8144
FG
4405 if (shadow_in) {
4406 delete shadow_in;
4407 in->mdcache->num_shadow_inodes--;
4408 }
7c673cae
FG
4409 }
4410
4411 /**
4412 * Fetch backtrace and set tag if tag is non-empty
4413 */
11fdf7f2
TL
4414 void fetch_backtrace_and_tag(CInode *in,
4415 std::string_view tag, bool is_internal,
7c673cae
FG
4416 Context *fin, int *bt_r, bufferlist *bt)
4417 {
4418 const int64_t pool = in->get_backtrace_pool();
4419 object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");
4420
4421 ObjectOperation fetch;
4422 fetch.getxattr("parent", bt, bt_r);
4423 in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
4424 NULL, 0, fin);
11fdf7f2
TL
4425 using ceph::encode;
4426 if (!is_internal) {
4427 ObjectOperation scrub_tag;
7c673cae 4428 bufferlist tag_bl;
11fdf7f2 4429 encode(tag, tag_bl);
7c673cae
FG
4430 scrub_tag.setxattr("scrub_tag", tag_bl);
4431 SnapContext snapc;
4432 in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
4433 ceph::real_clock::now(),
4434 0, NULL);
4435 }
4436 }
4437
4438 bool _start(int rval) {
4439 if (in->is_dirty()) {
11fdf7f2
TL
4440 MDCache *mdcache = in->mdcache;
4441 mempool_inode& inode = in->inode;
4442 dout(20) << "validating a dirty CInode; results will be inconclusive"
4443 << dendl;
7c673cae
FG
4444 }
4445 if (in->is_symlink()) {
11fdf7f2
TL
4446 // there's nothing to do for symlinks!
4447 return true;
7c673cae
FG
4448 }
4449
11fdf7f2
TL
4450 // prefetch snaprealm's past parents
4451 if (in->snaprealm && !in->snaprealm->have_past_parents_open())
4452 in->snaprealm->open_parents(nullptr);
4453
7c673cae 4454 C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
11fdf7f2
TL
4455 in->mdcache->mds->finisher);
4456
4457 std::string_view tag = in->scrub_infop->header->get_tag();
4458 bool is_internal = in->scrub_infop->header->is_internal_tag();
4459 // Rather than using the usual CInode::fetch_backtrace,
4460 // use a special variant that optionally writes a tag in the same
4461 // operation.
4462 fetch_backtrace_and_tag(in, tag, is_internal, conf, &results->backtrace.ondisk_read_retval, &bl);
7c673cae
FG
4463 return false;
4464 }
4465
4466 bool _backtrace(int rval) {
4467 // set up basic result reporting and make sure we got the data
4468 results->performed_validation = true; // at least, some of it!
4469 results->backtrace.checked = true;
4470
4471 const int64_t pool = in->get_backtrace_pool();
4472 inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
4473 in->build_backtrace(pool, memory_backtrace);
4474 bool equivalent, divergent;
4475 int memory_newer;
4476
4477 MDCache *mdcache = in->mdcache; // For the benefit of dout
94b18763 4478 const mempool_inode& inode = in->inode; // For the benefit of dout
7c673cae
FG
4479
4480 // Ignore rval because it's the result of a FAILOK operation
4481 // from fetch_backtrace_and_tag: the real result is in
4482 // backtrace.ondisk_read_retval
4483 dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
4484 if (results->backtrace.ondisk_read_retval != 0) {
4485 results->backtrace.error_str << "failed to read off disk; see retval";
e306af50
TL
4486 // we probably have a new unwritten file!
4487 // so skip the backtrace scrub for this entry and say that all's well
4488 if (in->is_dirty_parent())
4489 results->backtrace.passed = true;
4490 goto next;
7c673cae
FG
4491 }
4492
4493 // extract the backtrace, and compare it to a newly-constructed one
4494 try {
11fdf7f2
TL
4495 auto p = bl.cbegin();
4496 using ceph::decode;
4497 decode(results->backtrace.ondisk_value, p);
7c673cae
FG
4498 dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
4499 } catch (buffer::error&) {
4500 if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
4501 // Cases where something has clearly gone wrong with the overall
4502 // fetch op, though we didn't get a nonzero rc from the getxattr
4503 // operation. e.g. object missing.
4504 results->backtrace.ondisk_read_retval = rval;
4505 }
4506 results->backtrace.error_str << "failed to decode on-disk backtrace ("
4507 << bl.length() << " bytes)!";
e306af50
TL
4508 // we probably have a new unwritten file!
4509 // so skip the backtrace scrub for this entry and say that all's well
4510 if (in->is_dirty_parent())
4511 results->backtrace.passed = true;
4512
7c673cae
FG
4513 goto next;
4514 }
4515
4516 memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
4517 &equivalent, &divergent);
4518
4519 if (divergent || memory_newer < 0) {
e306af50
TL
4520 // we're divergent, or on-disk version is newer
4521 results->backtrace.error_str << "On-disk backtrace is divergent or newer";
4522 // we probably have a new unwritten file!
4523 // so skip the backtrace scrub for this entry and say that all's well
4524 if (divergent && in->is_dirty_parent())
4525 results->backtrace.passed = true;
7c673cae
FG
4526 } else {
4527 results->backtrace.passed = true;
4528 }
4529next:
4530
4531 if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
4532 std::string path;
4533 in->make_path_string(path);
d2e6a577
FG
4534 in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
4535 << "(" << path << "), rewriting it";
28e407b8 4536 in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
7c673cae 4537 false);
b32b8144
FG
4538 // Flag that we repaired this BT so that it won't go into damagetable
4539 results->backtrace.repaired = true;
7c673cae
FG
4540 }
4541
4542 // If the inode's number was free in the InoTable, fix that
4543 // (#15619)
4544 {
4545 InoTable *inotable = mdcache->mds->inotable;
4546
d2e6a577 4547 dout(10) << "scrub: inotable ino = " << inode.ino << dendl;
7c673cae
FG
4548 dout(10) << "scrub: inotable free says "
4549 << inotable->is_marked_free(inode.ino) << dendl;
4550
4551 if (inotable->is_marked_free(inode.ino)) {
4552 LogChannelRef clog = in->mdcache->mds->clog;
11fdf7f2 4553 clog->error() << "scrub: inode wrongly marked free: " << inode.ino;
7c673cae
FG
4554
4555 if (in->scrub_infop->header->get_repair()) {
4556 bool repaired = inotable->repair(inode.ino);
4557 if (repaired) {
11fdf7f2 4558 clog->error() << "inode table repaired for inode: " << inode.ino;
7c673cae
FG
4559
4560 inotable->save();
4561 } else {
4562 clog->error() << "Cannot repair inotable while other operations"
4563 " are in progress";
4564 }
4565 }
4566 }
4567 }
4568
7c673cae 4569
11fdf7f2
TL
4570 if (in->is_dir()) {
4571 return validate_directory_data();
4572 } else {
4573 // TODO: validate on-disk inode for normal files
4574 return check_inode_snaprealm();
4575 }
7c673cae
FG
4576 }
4577
4578 bool validate_directory_data() {
11fdf7f2 4579 ceph_assert(in->is_dir());
7c673cae
FG
4580
4581 if (in->is_base()) {
b32b8144
FG
4582 if (!shadow_in) {
4583 shadow_in = new CInode(in->mdcache);
4584 in->mdcache->create_unlinked_system_inode(shadow_in, in->inode.ino, in->inode.mode);
4585 in->mdcache->num_shadow_inodes++;
4586 }
7c673cae
FG
4587 shadow_in->fetch(get_internal_callback(INODE));
4588 return false;
4589 } else {
11fdf7f2 4590 // TODO: validate on-disk inode for non-base directories
7c673cae 4591 results->inode.passed = true;
11fdf7f2 4592 return check_dirfrag_rstats();
7c673cae
FG
4593 }
4594 }
4595
4596 bool _inode_disk(int rval) {
4597 results->inode.checked = true;
4598 results->inode.ondisk_read_retval = rval;
4599 results->inode.ondisk_value = shadow_in->inode;
4600 results->inode.memory_value = in->inode;
4601
94b18763
FG
4602 mempool_inode& si = shadow_in->inode;
4603 mempool_inode& i = in->inode;
7c673cae
FG
4604 if (si.version > i.version) {
4605 // uh, what?
11fdf7f2 4606 results->inode.error_str << "On-disk inode is newer than in-memory one; ";
7c673cae
FG
4607 goto next;
4608 } else {
4609 bool divergent = false;
4610 int r = i.compare(si, &divergent);
4611 results->inode.passed = !divergent && r >= 0;
4612 if (!results->inode.passed) {
4613 results->inode.error_str <<
11fdf7f2 4614 "On-disk inode is divergent or newer than in-memory one; ";
7c673cae
FG
4615 goto next;
4616 }
4617 }
4618next:
4619 return check_dirfrag_rstats();
4620 }
4621
4622 bool check_dirfrag_rstats() {
4623 MDSGatherBuilder gather(g_ceph_context);
11fdf7f2
TL
4624 frag_vec_t leaves;
4625 in->dirfragtree.get_leaves(leaves);
4626 for (const auto& leaf : leaves) {
4627 CDir *dir = in->get_or_open_dirfrag(in->mdcache, leaf);
7c673cae
FG
4628 dir->scrub_info();
4629 if (!dir->scrub_infop->header)
4630 dir->scrub_infop->header = in->scrub_infop->header;
4631 if (dir->is_complete()) {
4632 dir->scrub_local();
4633 } else {
4634 dir->scrub_infop->need_scrub_local = true;
4635 dir->fetch(gather.new_sub(), false);
4636 }
4637 }
4638 if (gather.has_subs()) {
4639 gather.set_finisher(get_internal_callback(DIRFRAGS));
4640 gather.activate();
4641 return false;
4642 } else {
4643 return immediate(DIRFRAGS, 0);
4644 }
4645 }
4646
4647 bool _dirfrags(int rval) {
4648 int frags_errors = 0;
4649 // basic reporting setup
4650 results->raw_stats.checked = true;
4651 results->raw_stats.ondisk_read_retval = rval;
4652
4653 results->raw_stats.memory_value.dirstat = in->inode.dirstat;
4654 results->raw_stats.memory_value.rstat = in->inode.rstat;
4655 frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
4656 nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;
4657
4658 if (rval != 0) {
4659 results->raw_stats.error_str << "Failed to read dirfrags off disk";
4660 goto next;
4661 }
4662
4663 // check each dirfrag...
94b18763
FG
4664 for (const auto &p : in->dirfrags) {
4665 CDir *dir = p.second;
11fdf7f2 4666 ceph_assert(dir->get_version() > 0);
7c673cae
FG
4667 nest_info.add(dir->fnode.accounted_rstat);
4668 dir_info.add(dir->fnode.accounted_fragstat);
11fdf7f2 4669 if (dir->scrub_infop->pending_scrub_error) {
7c673cae
FG
4670 dir->scrub_infop->pending_scrub_error = false;
4671 if (dir->scrub_infop->header->get_repair()) {
b32b8144 4672 results->raw_stats.repaired = true;
7c673cae 4673 results->raw_stats.error_str
94b18763 4674 << "dirfrag(" << p.first << ") has bad stats (will be fixed); ";
7c673cae
FG
4675 } else {
4676 results->raw_stats.error_str
94b18763 4677 << "dirfrag(" << p.first << ") has bad stats; ";
7c673cae
FG
4678 }
4679 frags_errors++;
4680 }
4681 }
4682 nest_info.rsubdirs++; // it gets one to account for self
11fdf7f2
TL
4683 if (const sr_t *srnode = in->get_projected_srnode(); srnode)
4684 nest_info.rsnaps += srnode->snaps.size();
4685
7c673cae
FG
4686 // ...and that their sum matches our inode settings
4687 if (!dir_info.same_sums(in->inode.dirstat) ||
4688 !nest_info.same_sums(in->inode.rstat)) {
11fdf7f2 4689 if (in->scrub_infop->header->get_repair()) {
7c673cae
FG
4690 results->raw_stats.error_str
4691 << "freshly-calculated rstats don't match existing ones (will be fixed)";
4692 in->mdcache->repair_inode_stats(in);
b32b8144 4693 results->raw_stats.repaired = true;
7c673cae
FG
4694 } else {
4695 results->raw_stats.error_str
4696 << "freshly-calculated rstats don't match existing ones";
4697 }
4698 goto next;
4699 }
4700 if (frags_errors > 0)
4701 goto next;
4702
4703 results->raw_stats.passed = true;
4704next:
11fdf7f2
TL
4705 // snaprealm
4706 return check_inode_snaprealm();
4707 }
4708
4709 bool check_inode_snaprealm() {
4710 if (!in->snaprealm)
4711 return true;
4712
4713 if (!in->snaprealm->have_past_parents_open()) {
4714 in->snaprealm->open_parents(get_internal_callback(SNAPREALM));
4715 return false;
4716 } else {
4717 return immediate(SNAPREALM, 0);
4718 }
4719 }
4720
4721 bool _snaprealm(int rval) {
4722
4723 if (in->snaprealm->past_parents_dirty ||
4724 !in->get_projected_srnode()->past_parents.empty()) {
4725 // temporarily store error in field of on-disk inode validation temporarily
4726 results->inode.checked = true;
4727 results->inode.passed = false;
4728 if (in->scrub_infop->header->get_repair()) {
4729 results->inode.error_str << "Inode has old format snaprealm (will upgrade)";
4730 results->inode.repaired = true;
4731 in->mdcache->upgrade_inode_snaprealm(in);
4732 } else {
4733 results->inode.error_str << "Inode has old format snaprealm";
4734 }
4735 }
7c673cae
FG
4736 return true;
4737 }
4738
4739 void _done() override {
4740 if ((!results->raw_stats.checked || results->raw_stats.passed) &&
4741 (!results->backtrace.checked || results->backtrace.passed) &&
4742 (!results->inode.checked || results->inode.passed))
11fdf7f2
TL
4743 results->passed_validation = true;
4744
4745 // Flag that we did some repair work so that our repair operation
4746 // can be flushed at end of scrub
4747 if (results->backtrace.repaired ||
4748 results->inode.repaired ||
4749 results->raw_stats.repaired)
4750 in->scrub_infop->header->set_repaired();
4751 if (fin)
4752 fin->complete(get_rval());
7c673cae
FG
4753 }
4754 };
4755
4756
4757 dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
4758 ValidationContinuation *vc = new ValidationContinuation(this,
4759 results,
4760 fin);
4761 vc->begin();
4762}
4763
4764void CInode::validated_data::dump(Formatter *f) const
4765{
4766 f->open_object_section("results");
4767 {
4768 f->dump_bool("performed_validation", performed_validation);
4769 f->dump_bool("passed_validation", passed_validation);
4770 f->open_object_section("backtrace");
4771 {
4772 f->dump_bool("checked", backtrace.checked);
4773 f->dump_bool("passed", backtrace.passed);
4774 f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
4775 f->dump_stream("ondisk_value") << backtrace.ondisk_value;
4776 f->dump_stream("memoryvalue") << backtrace.memory_value;
4777 f->dump_string("error_str", backtrace.error_str.str());
4778 }
4779 f->close_section(); // backtrace
4780 f->open_object_section("raw_stats");
4781 {
4782 f->dump_bool("checked", raw_stats.checked);
4783 f->dump_bool("passed", raw_stats.passed);
4784 f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
4785 f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
4786 f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
4787 f->dump_stream("memory_value.dirrstat") << raw_stats.memory_value.dirstat;
4788 f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
4789 f->dump_string("error_str", raw_stats.error_str.str());
4790 }
4791 f->close_section(); // raw_stats
4792 // dump failure return code
4793 int rc = 0;
4794 if (backtrace.checked && backtrace.ondisk_read_retval)
4795 rc = backtrace.ondisk_read_retval;
4796 if (inode.checked && inode.ondisk_read_retval)
4797 rc = inode.ondisk_read_retval;
4798 if (raw_stats.checked && raw_stats.ondisk_read_retval)
4799 rc = raw_stats.ondisk_read_retval;
4800 f->dump_int("return_code", rc);
4801 }
4802 f->close_section(); // results
4803}
4804
b32b8144
FG
4805bool CInode::validated_data::all_damage_repaired() const
4806{
4807 bool unrepaired =
4808 (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired)
4809 ||
4810 (backtrace.checked && !backtrace.passed && !backtrace.repaired)
4811 ||
4812 (inode.checked && !inode.passed && !inode.repaired);
4813
4814 return !unrepaired;
4815}
4816
11fdf7f2
TL
4817void CInode::dump(Formatter *f, int flags) const
4818{
4819 if (flags & DUMP_PATH) {
4820 std::string path;
4821 make_path_string(path, true);
4822 if (path.empty())
4823 path = "/";
4824 f->dump_string("path", path);
4825 }
4826
4827 if (flags & DUMP_INODE_STORE_BASE)
4828 InodeStoreBase::dump(f);
4829
4830 if (flags & DUMP_MDS_CACHE_OBJECT)
4831 MDSCacheObject::dump(f);
4832
4833 if (flags & DUMP_LOCKS) {
4834 f->open_object_section("versionlock");
4835 versionlock.dump(f);
4836 f->close_section();
4837
4838 f->open_object_section("authlock");
4839 authlock.dump(f);
4840 f->close_section();
4841
4842 f->open_object_section("linklock");
4843 linklock.dump(f);
4844 f->close_section();
4845
4846 f->open_object_section("dirfragtreelock");
4847 dirfragtreelock.dump(f);
4848 f->close_section();
4849
4850 f->open_object_section("filelock");
4851 filelock.dump(f);
4852 f->close_section();
4853
4854 f->open_object_section("xattrlock");
4855 xattrlock.dump(f);
4856 f->close_section();
4857
4858 f->open_object_section("snaplock");
4859 snaplock.dump(f);
4860 f->close_section();
4861
4862 f->open_object_section("nestlock");
4863 nestlock.dump(f);
4864 f->close_section();
4865
4866 f->open_object_section("flocklock");
4867 flocklock.dump(f);
4868 f->close_section();
4869
4870 f->open_object_section("policylock");
4871 policylock.dump(f);
4872 f->close_section();
4873 }
4874
4875 if (flags & DUMP_STATE) {
4876 f->open_array_section("states");
4877 MDSCacheObject::dump_states(f);
4878 if (state_test(STATE_EXPORTING))
4879 f->dump_string("state", "exporting");
4880 if (state_test(STATE_OPENINGDIR))
4881 f->dump_string("state", "openingdir");
4882 if (state_test(STATE_FREEZING))
4883 f->dump_string("state", "freezing");
4884 if (state_test(STATE_FROZEN))
4885 f->dump_string("state", "frozen");
4886 if (state_test(STATE_AMBIGUOUSAUTH))
4887 f->dump_string("state", "ambiguousauth");
4888 if (state_test(STATE_EXPORTINGCAPS))
4889 f->dump_string("state", "exportingcaps");
4890 if (state_test(STATE_NEEDSRECOVER))
4891 f->dump_string("state", "needsrecover");
4892 if (state_test(STATE_PURGING))
4893 f->dump_string("state", "purging");
4894 if (state_test(STATE_DIRTYPARENT))
4895 f->dump_string("state", "dirtyparent");
4896 if (state_test(STATE_DIRTYRSTAT))
4897 f->dump_string("state", "dirtyrstat");
4898 if (state_test(STATE_STRAYPINNED))
4899 f->dump_string("state", "straypinned");
4900 if (state_test(STATE_FROZENAUTHPIN))
4901 f->dump_string("state", "frozenauthpin");
4902 if (state_test(STATE_DIRTYPOOL))
4903 f->dump_string("state", "dirtypool");
4904 if (state_test(STATE_ORPHAN))
4905 f->dump_string("state", "orphan");
4906 if (state_test(STATE_MISSINGOBJS))
4907 f->dump_string("state", "missingobjs");
7c673cae
FG
4908 f->close_section();
4909 }
7c673cae 4910
11fdf7f2
TL
4911 if (flags & DUMP_CAPS) {
4912 f->open_array_section("client_caps");
4913 for (const auto &p : client_caps) {
4914 auto &client = p.first;
4915 auto cap = &p.second;
4916 f->open_object_section("client_cap");
4917 f->dump_int("client_id", client.v);
4918 f->dump_string("pending", ccap_string(cap->pending()));
4919 f->dump_string("issued", ccap_string(cap->issued()));
4920 f->dump_string("wanted", ccap_string(cap->wanted()));
4921 f->dump_int("last_sent", cap->get_last_seq());
4922 f->close_section();
4923 }
4924 f->close_section();
4925
4926 f->dump_int("loner", loner_cap.v);
4927 f->dump_int("want_loner", want_loner_cap.v);
4928
4929 f->open_array_section("mds_caps_wanted");
4930 for (const auto &p : mds_caps_wanted) {
4931 f->open_object_section("mds_cap_wanted");
4932 f->dump_int("rank", p.first);
4933 f->dump_string("cap", ccap_string(p.second));
4934 f->close_section();
4935 }
4936 f->close_section();
4937 }
7c673cae 4938
11fdf7f2
TL
4939 if (flags & DUMP_DIRFRAGS) {
4940 f->open_array_section("dirfrags");
9f95a23c 4941 auto&& dfs = get_dirfrags();
11fdf7f2
TL
4942 for(const auto &dir: dfs) {
4943 f->open_object_section("dir");
4944 dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
4945 dir->check_rstats();
4946 f->close_section();
4947 }
7c673cae
FG
4948 f->close_section();
4949 }
7c673cae
FG
4950}
4951
4952/****** Scrub Stuff *****/
4953void CInode::scrub_info_create() const
4954{
4955 dout(25) << __func__ << dendl;
11fdf7f2 4956 ceph_assert(!scrub_infop);
7c673cae
FG
4957
4958 // break out of const-land to set up implicit initial state
4959 CInode *me = const_cast<CInode*>(this);
94b18763 4960 mempool_inode *in = me->get_projected_inode();
7c673cae
FG
4961
4962 scrub_info_t *si = new scrub_info_t();
4963 si->scrub_start_stamp = si->last_scrub_stamp = in->last_scrub_stamp;
4964 si->scrub_start_version = si->last_scrub_version = in->last_scrub_version;
4965
4966 me->scrub_infop = si;
4967}
4968
4969void CInode::scrub_maybe_delete_info()
4970{
4971 if (scrub_infop &&
4972 !scrub_infop->scrub_in_progress &&
4973 !scrub_infop->last_scrub_dirty) {
4974 delete scrub_infop;
4975 scrub_infop = NULL;
4976 }
4977}
4978
4979void CInode::scrub_initialize(CDentry *scrub_parent,
b32b8144 4980 ScrubHeaderRef& header,
11fdf7f2 4981 MDSContext *f)
7c673cae
FG
4982{
4983 dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
94b18763
FG
4984 if (scrub_is_in_progress()) {
4985 dout(20) << __func__ << " inode moved during scrub, reinitializing "
4986 << dendl;
11fdf7f2 4987 ceph_assert(scrub_infop->scrub_parent);
94b18763
FG
4988 CDentry *dn = scrub_infop->scrub_parent;
4989 CDir *dir = dn->dir;
4990 dn->put(CDentry::PIN_SCRUBPARENT);
11fdf7f2 4991 ceph_assert(dir->scrub_infop && dir->scrub_infop->directory_scrubbing);
94b18763
FG
4992 dir->scrub_infop->directories_scrubbing.erase(dn->key());
4993 dir->scrub_infop->others_scrubbing.erase(dn->key());
4994 }
7c673cae
FG
4995 scrub_info();
4996 if (!scrub_infop)
4997 scrub_infop = new scrub_info_t();
4998
4999 if (get_projected_inode()->is_dir()) {
5000 // fill in dirfrag_stamps with initial state
11fdf7f2
TL
5001 frag_vec_t leaves;
5002 dirfragtree.get_leaves(leaves);
5003 for (const auto& leaf : leaves) {
7c673cae 5004 if (header->get_force())
11fdf7f2 5005 scrub_infop->dirfrag_stamps[leaf].reset();
7c673cae 5006 else
11fdf7f2 5007 scrub_infop->dirfrag_stamps[leaf];
7c673cae
FG
5008 }
5009 }
5010
5011 if (scrub_parent)
5012 scrub_parent->get(CDentry::PIN_SCRUBPARENT);
5013 scrub_infop->scrub_parent = scrub_parent;
5014 scrub_infop->on_finish = f;
5015 scrub_infop->scrub_in_progress = true;
5016 scrub_infop->children_scrubbed = false;
5017 scrub_infop->header = header;
5018
5019 scrub_infop->scrub_start_version = get_version();
5020 scrub_infop->scrub_start_stamp = ceph_clock_now();
5021 // right now we don't handle remote inodes
5022}
5023
5024int CInode::scrub_dirfrag_next(frag_t* out_dirfrag)
5025{
5026 dout(20) << __func__ << dendl;
11fdf7f2 5027 ceph_assert(scrub_is_in_progress());
7c673cae
FG
5028
5029 if (!is_dir()) {
5030 return -ENOTDIR;
5031 }
5032
5033 std::map<frag_t, scrub_stamp_info_t>::iterator i =
5034 scrub_infop->dirfrag_stamps.begin();
5035
5036 while (i != scrub_infop->dirfrag_stamps.end()) {
5037 if (i->second.scrub_start_version < scrub_infop->scrub_start_version) {
5038 i->second.scrub_start_version = get_projected_version();
5039 i->second.scrub_start_stamp = ceph_clock_now();
5040 *out_dirfrag = i->first;
5041 dout(20) << " return frag " << *out_dirfrag << dendl;
5042 return 0;
5043 }
5044 ++i;
5045 }
5046
5047 dout(20) << " no frags left, ENOENT " << dendl;
5048 return ENOENT;
5049}
5050
11fdf7f2 5051void CInode::scrub_dirfrags_scrubbing(frag_vec_t* out_dirfrags)
7c673cae 5052{
11fdf7f2
TL
5053 ceph_assert(out_dirfrags != NULL);
5054 ceph_assert(scrub_infop != NULL);
7c673cae
FG
5055
5056 out_dirfrags->clear();
5057 std::map<frag_t, scrub_stamp_info_t>::iterator i =
5058 scrub_infop->dirfrag_stamps.begin();
5059
5060 while (i != scrub_infop->dirfrag_stamps.end()) {
5061 if (i->second.scrub_start_version >= scrub_infop->scrub_start_version) {
5062 if (i->second.last_scrub_version < scrub_infop->scrub_start_version)
5063 out_dirfrags->push_back(i->first);
5064 } else {
5065 return;
5066 }
5067
5068 ++i;
5069 }
5070}
5071
5072void CInode::scrub_dirfrag_finished(frag_t dirfrag)
5073{
5074 dout(20) << __func__ << " on frag " << dirfrag << dendl;
11fdf7f2 5075 ceph_assert(scrub_is_in_progress());
7c673cae
FG
5076
5077 std::map<frag_t, scrub_stamp_info_t>::iterator i =
5078 scrub_infop->dirfrag_stamps.find(dirfrag);
11fdf7f2 5079 ceph_assert(i != scrub_infop->dirfrag_stamps.end());
7c673cae
FG
5080
5081 scrub_stamp_info_t &si = i->second;
5082 si.last_scrub_stamp = si.scrub_start_stamp;
5083 si.last_scrub_version = si.scrub_start_version;
5084}
5085
11fdf7f2
TL
5086void CInode::scrub_aborted(MDSContext **c) {
5087 dout(20) << __func__ << dendl;
5088 ceph_assert(scrub_is_in_progress());
5089
5090 *c = nullptr;
5091 std::swap(*c, scrub_infop->on_finish);
5092
5093 if (scrub_infop->scrub_parent) {
5094 CDentry *dn = scrub_infop->scrub_parent;
5095 scrub_infop->scrub_parent = NULL;
5096 dn->dir->scrub_dentry_finished(dn);
5097 dn->put(CDentry::PIN_SCRUBPARENT);
5098 }
5099
5100 delete scrub_infop;
5101 scrub_infop = nullptr;
5102}
5103
5104void CInode::scrub_finished(MDSContext **c) {
7c673cae 5105 dout(20) << __func__ << dendl;
11fdf7f2 5106 ceph_assert(scrub_is_in_progress());
7c673cae
FG
5107 for (std::map<frag_t, scrub_stamp_info_t>::iterator i =
5108 scrub_infop->dirfrag_stamps.begin();
5109 i != scrub_infop->dirfrag_stamps.end();
5110 ++i) {
5111 if(i->second.last_scrub_version != i->second.scrub_start_version) {
5112 derr << i->second.last_scrub_version << " != "
5113 << i->second.scrub_start_version << dendl;
5114 }
11fdf7f2 5115 ceph_assert(i->second.last_scrub_version == i->second.scrub_start_version);
7c673cae
FG
5116 }
5117
5118 scrub_infop->last_scrub_version = scrub_infop->scrub_start_version;
5119 scrub_infop->last_scrub_stamp = scrub_infop->scrub_start_stamp;
5120 scrub_infop->last_scrub_dirty = true;
5121 scrub_infop->scrub_in_progress = false;
5122
5123 if (scrub_infop->scrub_parent) {
5124 CDentry *dn = scrub_infop->scrub_parent;
5125 scrub_infop->scrub_parent = NULL;
5126 dn->dir->scrub_dentry_finished(dn);
5127 dn->put(CDentry::PIN_SCRUBPARENT);
5128 }
5129
5130 *c = scrub_infop->on_finish;
5131 scrub_infop->on_finish = NULL;
5132
5133 if (scrub_infop->header->get_origin() == this) {
5134 // We are at the point that a tagging scrub was initiated
5135 LogChannelRef clog = mdcache->mds->clog;
11fdf7f2
TL
5136 clog->info() << "scrub complete with tag '"
5137 << scrub_infop->header->get_tag() << "'";
7c673cae
FG
5138 }
5139}
5140
5141int64_t CInode::get_backtrace_pool() const
5142{
5143 if (is_dir()) {
5144 return mdcache->mds->mdsmap->get_metadata_pool();
5145 } else {
5146 // Files are required to have an explicit layout that specifies
5147 // a pool
11fdf7f2 5148 ceph_assert(inode.layout.pool_id != -1);
7c673cae
FG
5149 return inode.layout.pool_id;
5150 }
5151}
5152
31f18b77
FG
5153void CInode::maybe_export_pin(bool update)
5154{
11fdf7f2 5155 if (!g_conf()->mds_bal_export_pin)
31f18b77
FG
5156 return;
5157 if (!is_dir() || !is_normal())
5158 return;
7c673cae 5159
31f18b77
FG
5160 mds_rank_t export_pin = get_export_pin(false);
5161 if (export_pin == MDS_RANK_NONE && !update)
5162 return;
7c673cae 5163
31f18b77
FG
5164 if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
5165 return;
5166
5167 bool queue = false;
5168 for (auto p = dirfrags.begin(); p != dirfrags.end(); p++) {
5169 CDir *dir = p->second;
5170 if (!dir->is_auth())
5171 continue;
5172 if (export_pin != MDS_RANK_NONE) {
5173 if (dir->is_subtree_root()) {
5174 // set auxsubtree bit or export it
5175 if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
5176 export_pin != dir->get_dir_auth().first)
5177 queue = true;
5178 } else {
5179 // create aux subtree or export it
5180 queue = true;
7c673cae 5181 }
31f18b77
FG
5182 } else {
5183 // clear aux subtrees ?
5184 queue = dir->state_test(CDir::STATE_AUXSUBTREE);
5185 }
5186 if (queue) {
5187 state_set(CInode::STATE_QUEUEDEXPORTPIN);
7c673cae 5188 mdcache->export_pin_queue.insert(this);
31f18b77 5189 break;
7c673cae
FG
5190 }
5191 }
5192}
5193
5194void CInode::set_export_pin(mds_rank_t rank)
5195{
11fdf7f2
TL
5196 ceph_assert(is_dir());
5197 ceph_assert(is_projected());
7c673cae 5198 get_projected_inode()->export_pin = rank;
7c673cae
FG
5199}
5200
5201mds_rank_t CInode::get_export_pin(bool inherit) const
5202{
5203 /* An inode that is export pinned may not necessarily be a subtree root, we
5204 * need to traverse the parents. A base or system inode cannot be pinned.
5205 * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5206 * have a parent yet.
5207 */
b32b8144
FG
5208 const CInode *in = this;
5209 while (true) {
5210 if (in->is_system())
5211 break;
f64942e4 5212 const CDentry *pdn = in->get_parent_dn();
b32b8144
FG
5213 if (!pdn)
5214 break;
b32b8144 5215 // ignore export pin for unlinked directory
f64942e4 5216 if (in->get_inode().nlink == 0)
b32b8144 5217 break;
f64942e4
AA
5218 if (in->get_inode().export_pin >= 0)
5219 return in->get_inode().export_pin;
b32b8144
FG
5220
5221 if (!inherit)
5222 break;
5223 in = pdn->get_dir()->inode;
7c673cae
FG
5224 }
5225 return MDS_RANK_NONE;
5226}
5227
5228bool CInode::is_exportable(mds_rank_t dest) const
5229{
5230 mds_rank_t pin = get_export_pin();
5231 if (pin == dest) {
5232 return true;
5233 } else if (pin >= 0) {
5234 return false;
5235 } else {
5236 return true;
5237 }
5238}
181888fb 5239
9f95a23c
TL
5240void CInode::get_nested_dirfrags(std::vector<CDir*>& v) const
5241{
5242 for (const auto &p : dirfrags) {
5243 const auto& dir = p.second;
5244 if (!dir->is_subtree_root())
5245 v.push_back(dir);
5246 }
5247}
5248
5249void CInode::get_subtree_dirfrags(std::vector<CDir*>& v) const
5250{
5251 for (const auto &p : dirfrags) {
5252 const auto& dir = p.second;
5253 if (dir->is_subtree_root())
5254 v.push_back(dir);
5255 }
5256}
5257
// Mempool object-factory definition for CInode.
// NOTE(review): presumably binds CInode allocations to the mds_co pool under
// the factory name co_inode -- confirm against the mempool macro definition.
181888fb 5258MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);