]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/CInode.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / mds / CInode.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "include/int_types.h"
16#include "common/errno.h"
17
18#include <string>
7c673cae
FG
19
20#include "CInode.h"
21#include "CDir.h"
22#include "CDentry.h"
23
24#include "MDSRank.h"
25#include "MDCache.h"
26#include "MDLog.h"
27#include "Locker.h"
28#include "Mutation.h"
29
30#include "events/EUpdate.h"
31
32#include "osdc/Objecter.h"
33
34#include "snap.h"
35
36#include "LogSegment.h"
37
38#include "common/Clock.h"
39
7c673cae
FG
40#include "common/config.h"
41#include "global/global_context.h"
11fdf7f2 42#include "include/ceph_assert.h"
7c673cae
FG
43
44#include "mds/MDSContinuation.h"
45#include "mds/InoTable.h"
11fdf7f2 46#include "cephfs_features.h"
f67539c2 47#include "osdc/Objecter.h"
7c673cae
FG
48
49#define dout_context g_ceph_context
50#define dout_subsys ceph_subsys_mds
51#undef dout_prefix
f67539c2
TL
52#define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "
53
20effc67
TL
54using namespace std;
55
f67539c2
TL
56void CInodeCommitOperation::update(ObjectOperation &op, inode_backtrace_t &bt) {
57 using ceph::encode;
58
59 op.priority = priority;
60 op.create(false);
61
62 bufferlist parent_bl;
63 encode(bt, parent_bl);
64 op.setxattr("parent", parent_bl);
7c673cae 65
20effc67
TL
66 // for the old pool there is no need to update the layout and symlink
67 if (!update_layout_symlink)
f67539c2
TL
68 return;
69
70 bufferlist layout_bl;
71 encode(_layout, layout_bl, _features);
72 op.setxattr("layout", layout_bl);
20effc67
TL
73
74 if (!_symlink.empty()) {
75 bufferlist symlink_bl;
76 encode(_symlink, symlink_bl);
77 op.setxattr("symlink", symlink_bl);
78 }
f67539c2 79}
7c673cae
FG
80
81class CInodeIOContext : public MDSIOContextBase
82{
83protected:
84 CInode *in;
85 MDSRank *get_mds() override {return in->mdcache->mds;}
86public:
87 explicit CInodeIOContext(CInode *in_) : in(in_) {
11fdf7f2 88 ceph_assert(in != NULL);
7c673cae
FG
89 }
90};
91
11fdf7f2 92sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1;
7c673cae
FG
93
94LockType CInode::versionlock_type(CEPH_LOCK_IVERSION);
95LockType CInode::authlock_type(CEPH_LOCK_IAUTH);
96LockType CInode::linklock_type(CEPH_LOCK_ILINK);
97LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT);
98LockType CInode::filelock_type(CEPH_LOCK_IFILE);
99LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR);
100LockType CInode::snaplock_type(CEPH_LOCK_ISNAP);
101LockType CInode::nestlock_type(CEPH_LOCK_INEST);
102LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK);
103LockType CInode::policylock_type(CEPH_LOCK_IPOLICY);
104
9f95a23c
TL
105std::string_view CInode::pin_name(int p) const
106{
107 switch (p) {
108 case PIN_DIRFRAG: return "dirfrag";
109 case PIN_CAPS: return "caps";
110 case PIN_IMPORTING: return "importing";
111 case PIN_OPENINGDIR: return "openingdir";
112 case PIN_REMOTEPARENT: return "remoteparent";
113 case PIN_BATCHOPENJOURNAL: return "batchopenjournal";
114 case PIN_SCATTERED: return "scattered";
115 case PIN_STICKYDIRS: return "stickydirs";
116 //case PIN_PURGING: return "purging";
117 case PIN_FREEZING: return "freezing";
118 case PIN_FROZEN: return "frozen";
119 case PIN_IMPORTINGCAPS: return "importingcaps";
120 case PIN_EXPORTINGCAPS: return "exportingcaps";
121 case PIN_PASTSNAPPARENT: return "pastsnapparent";
122 case PIN_OPENINGSNAPPARENTS: return "openingsnapparents";
123 case PIN_TRUNCATING: return "truncating";
124 case PIN_STRAY: return "stray";
125 case PIN_NEEDSNAPFLUSH: return "needsnapflush";
126 case PIN_DIRTYRSTAT: return "dirtyrstat";
127 case PIN_DIRTYPARENT: return "dirtyparent";
128 case PIN_DIRWAITER: return "dirwaiter";
9f95a23c
TL
129 default: return generic_pin_name(p);
130 }
131}
132
7c673cae
FG
133//int cinode_pins[CINODE_NUM_PINS]; // counts
134ostream& CInode::print_db_line_prefix(ostream& out)
135{
f67539c2 136 return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") ";
7c673cae
FG
137}
138
139/*
140 * write caps and lock ids
141 */
142struct cinode_lock_info_t cinode_lock_info[] = {
143 { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR },
144 { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL },
145 { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL },
146 { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL },
147};
148int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]);
149
7c673cae
FG
150ostream& operator<<(ostream& out, const CInode& in)
151{
152 string path;
153 in.make_path_string(path, true);
154
f67539c2 155 out << "[inode " << in.ino();
7c673cae
FG
156 out << " ["
157 << (in.is_multiversion() ? "...":"")
158 << in.first << "," << in.last << "]";
159 out << " " << path << (in.is_dir() ? "/":"");
160
161 if (in.is_auth()) {
162 out << " auth";
163 if (in.is_replicated())
164 out << in.get_replicas();
165 } else {
166 mds_authority_t a = in.authority();
167 out << " rep@" << a.first;
168 if (a.second != CDIR_AUTH_UNKNOWN)
169 out << "," << a.second;
170 out << "." << in.get_replica_nonce();
171 }
172
173 if (in.is_symlink())
174 out << " symlink='" << in.symlink << "'";
175 if (in.is_dir() && !in.dirfragtree.empty())
176 out << " " << in.dirfragtree;
177
178 out << " v" << in.get_version();
179 if (in.get_projected_version() > in.get_version())
180 out << " pv" << in.get_projected_version();
181
11fdf7f2
TL
182 if (in.get_num_auth_pins()) {
183 out << " ap=" << in.get_num_auth_pins();
7c673cae 184#ifdef MDS_AUTHPIN_SET
11fdf7f2 185 in.print_authpin_set(out);
7c673cae
FG
186#endif
187 }
188
189 if (in.snaprealm)
190 out << " snaprealm=" << in.snaprealm;
191
192 if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
f67539c2
TL
193 if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " NEEDSRECOVER";
194 if (in.state_test(CInode::STATE_RECOVERING)) out << " RECOVERING";
195 if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " DIRTYPARENT";
196 if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " MISSINGOBJS";
197 if (in.is_ephemeral_dist()) out << " DISTEPHEMERALPIN";
198 if (in.is_ephemeral_rand()) out << " RANDEPHEMERALPIN";
7c673cae
FG
199 if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
200 if (in.is_frozen_inode()) out << " FROZEN";
201 if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";
202
f67539c2 203 const auto& pi = in.get_projected_inode();
7c673cae
FG
204 if (pi->is_truncating())
205 out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")";
206
f67539c2
TL
207 if (in.is_dir()) {
208 out << " " << in.get_inode()->dirstat;
11fdf7f2 209 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
7c673cae
FG
210 out << "->" << pi->dirstat;
211 }
212 } else {
f67539c2
TL
213 out << " s=" << in.get_inode()->size;
214 if (in.get_inode()->nlink != 1)
215 out << " nl=" << in.get_inode()->nlink;
7c673cae
FG
216 }
217
218 // rstat
f67539c2
TL
219 out << " " << in.get_inode()->rstat;
220 if (!(in.get_inode()->rstat == in.get_inode()->accounted_rstat))
221 out << "/" << in.get_inode()->accounted_rstat;
11fdf7f2 222 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
7c673cae
FG
223 out << "->" << pi->rstat;
224 if (!(pi->rstat == pi->accounted_rstat))
225 out << "/" << pi->accounted_rstat;
226 }
227
f67539c2
TL
228 if (in.is_any_old_inodes()) {
229 out << " old_inodes=" << in.get_old_inodes()->size();
230 }
231
7c673cae
FG
232 if (!in.client_need_snapflush.empty())
233 out << " need_snapflush=" << in.client_need_snapflush;
234
7c673cae
FG
235 // locks
236 if (!in.authlock.is_sync_and_unlocked())
237 out << " " << in.authlock;
238 if (!in.linklock.is_sync_and_unlocked())
239 out << " " << in.linklock;
f67539c2 240 if (in.get_inode()->is_dir()) {
7c673cae
FG
241 if (!in.dirfragtreelock.is_sync_and_unlocked())
242 out << " " << in.dirfragtreelock;
243 if (!in.snaplock.is_sync_and_unlocked())
244 out << " " << in.snaplock;
245 if (!in.nestlock.is_sync_and_unlocked())
246 out << " " << in.nestlock;
247 if (!in.policylock.is_sync_and_unlocked())
248 out << " " << in.policylock;
249 } else {
250 if (!in.flocklock.is_sync_and_unlocked())
251 out << " " << in.flocklock;
252 }
253 if (!in.filelock.is_sync_and_unlocked())
254 out << " " << in.filelock;
255 if (!in.xattrlock.is_sync_and_unlocked())
256 out << " " << in.xattrlock;
257 if (!in.versionlock.is_sync_and_unlocked())
258 out << " " << in.versionlock;
259
260 // hack: spit out crap on which clients have caps
f67539c2
TL
261 if (in.get_inode()->client_ranges.size())
262 out << " cr=" << in.get_inode()->client_ranges;
7c673cae
FG
263
264 if (!in.get_client_caps().empty()) {
265 out << " caps={";
11fdf7f2
TL
266 bool first = true;
267 for (const auto &p : in.get_client_caps()) {
268 if (!first) out << ",";
269 out << p.first << "="
270 << ccap_string(p.second.pending());
271 if (p.second.issued() != p.second.pending())
272 out << "/" << ccap_string(p.second.issued());
273 out << "/" << ccap_string(p.second.wanted())
274 << "@" << p.second.get_last_seq();
275 first = false;
7c673cae
FG
276 }
277 out << "}";
278 if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) {
279 out << ",l=" << in.get_loner();
280 if (in.get_loner() != in.get_wanted_loner())
281 out << "(" << in.get_wanted_loner() << ")";
282 }
283 }
284 if (!in.get_mds_caps_wanted().empty()) {
285 out << " mcw={";
94b18763
FG
286 bool first = true;
287 for (const auto &p : in.get_mds_caps_wanted()) {
288 if (!first)
7c673cae 289 out << ',';
94b18763
FG
290 out << p.first << '=' << ccap_string(p.second);
291 first = false;
7c673cae
FG
292 }
293 out << '}';
294 }
295
296 if (in.get_num_ref()) {
297 out << " |";
298 in.print_pin_set(out);
299 }
300
f67539c2
TL
301 if (in.get_inode()->export_pin != MDS_RANK_NONE) {
302 out << " export_pin=" << in.get_inode()->export_pin;
7c673cae 303 }
f6b5b4d7
TL
304 if (in.state_test(CInode::STATE_DISTEPHEMERALPIN)) {
305 out << " distepin";
306 }
307 if (in.state_test(CInode::STATE_RANDEPHEMERALPIN)) {
308 out << " randepin";
309 }
7c673cae
FG
310
311 out << " " << &in;
312 out << "]";
313 return out;
314}
315
f67539c2
TL
316CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l) :
317 mdcache(c), first(f), last(l),
11fdf7f2
TL
318 item_dirty(this),
319 item_caps(this),
320 item_open_file(this),
321 item_dirty_parent(this),
322 item_dirty_dirfrag_dir(this),
323 item_dirty_dirfrag_nest(this),
324 item_dirty_dirfrag_dirfragtree(this),
325 pop(c->decayrate),
326 versionlock(this, &versionlock_type),
327 authlock(this, &authlock_type),
328 linklock(this, &linklock_type),
329 dirfragtreelock(this, &dirfragtreelock_type),
330 filelock(this, &filelock_type),
331 xattrlock(this, &xattrlock_type),
332 snaplock(this, &snaplock_type),
333 nestlock(this, &nestlock_type),
334 flocklock(this, &flocklock_type),
335 policylock(this, &policylock_type)
336{
f67539c2
TL
337 if (auth)
338 state_set(STATE_AUTH);
11fdf7f2 339}
7c673cae
FG
340
341void CInode::print(ostream& out)
342{
343 out << *this;
344}
345
7c673cae
FG
346void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
347{
11fdf7f2 348 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
7c673cae
FG
349
350 if (client_need_snapflush.empty()) {
351 get(CInode::PIN_NEEDSNAPFLUSH);
352
353 // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
354 // long periods waiting for clients to flush their snaps.
f67539c2 355 auth_pin(this); // pin head get_inode()->..
7c673cae
FG
356 }
357
94b18763 358 auto &clients = client_need_snapflush[snapid];
7c673cae
FG
359 if (clients.empty())
360 snapin->auth_pin(this); // ...and pin snapped/old inode!
361
362 clients.insert(client);
363}
364
365void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
366{
94b18763
FG
367 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
368 auto it = client_need_snapflush.find(snapid);
369 if (it == client_need_snapflush.end()) {
7c673cae
FG
370 dout(10) << " snapid not found" << dendl;
371 return;
372 }
94b18763
FG
373 size_t n = it->second.erase(client);
374 if (n == 0) {
7c673cae
FG
375 dout(10) << " client not found" << dendl;
376 return;
377 }
94b18763
FG
378 if (it->second.empty()) {
379 client_need_snapflush.erase(it);
7c673cae
FG
380 snapin->auth_unpin(this);
381
382 if (client_need_snapflush.empty()) {
383 put(CInode::PIN_NEEDSNAPFLUSH);
384 auth_unpin(this);
385 }
386 }
387}
388
494da23a 389pair<bool,bool> CInode::split_need_snapflush(CInode *cowin, CInode *in)
7c673cae 390{
11fdf7f2 391 dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
494da23a
TL
392 bool cowin_need_flush = false;
393 bool orig_need_flush = false;
394 auto it = client_need_snapflush.lower_bound(cowin->first);
395 while (it != client_need_snapflush.end() && it->first < in->first) {
11fdf7f2 396 ceph_assert(!it->second.empty());
94b18763 397 if (cowin->last >= it->first) {
7c673cae 398 cowin->auth_pin(this);
494da23a 399 cowin_need_flush = true;
94b18763
FG
400 ++it;
401 } else {
402 it = client_need_snapflush.erase(it);
403 }
7c673cae
FG
404 in->auth_unpin(this);
405 }
494da23a
TL
406
407 if (it != client_need_snapflush.end() && it->first <= in->last)
408 orig_need_flush = true;
409
410 return make_pair(cowin_need_flush, orig_need_flush);
7c673cae
FG
411}
412
413void CInode::mark_dirty_rstat()
414{
415 if (!state_test(STATE_DIRTYRSTAT)) {
11fdf7f2 416 dout(10) << __func__ << dendl;
7c673cae
FG
417 state_set(STATE_DIRTYRSTAT);
418 get(PIN_DIRTYRSTAT);
224ce89b
WB
419 CDentry *pdn = get_projected_parent_dn();
420 if (pdn->is_auth()) {
421 CDir *pdir = pdn->dir;
422 pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item);
423 mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock);
424 } else {
425 // under cross-MDS rename.
426 // DIRTYRSTAT flag will get cleared when rename finishes
11fdf7f2 427 ceph_assert(state_test(STATE_AMBIGUOUSAUTH));
224ce89b 428 }
7c673cae
FG
429 }
430}
431void CInode::clear_dirty_rstat()
432{
433 if (state_test(STATE_DIRTYRSTAT)) {
11fdf7f2 434 dout(10) << __func__ << dendl;
7c673cae
FG
435 state_clear(STATE_DIRTYRSTAT);
436 put(PIN_DIRTYRSTAT);
437 dirty_rstat_item.remove_myself();
438 }
439}
440
f67539c2
TL
441CInode::projected_inode CInode::project_inode(const MutationRef& mut,
442 bool xattr, bool snap)
94b18763 443{
f67539c2
TL
444 if (mut && mut->is_projected(this)) {
445 ceph_assert(!xattr && !snap);
446 auto _inode = std::const_pointer_cast<mempool_inode>(projected_nodes.back().inode);
447 return projected_inode(std::move(_inode), xattr_map_ptr());
448 }
449
450 auto pi = allocate_inode(*get_projected_inode());
7c673cae
FG
451
452 if (scrub_infop && scrub_infop->last_scrub_dirty) {
f67539c2
TL
453 pi->last_scrub_stamp = scrub_infop->last_scrub_stamp;
454 pi->last_scrub_version = scrub_infop->last_scrub_version;
7c673cae
FG
455 scrub_infop->last_scrub_dirty = false;
456 scrub_maybe_delete_info();
457 }
94b18763 458
f67539c2
TL
459 const auto& ox = get_projected_xattrs();
460 xattr_map_ptr px;
94b18763 461 if (xattr) {
f67539c2
TL
462 px = allocate_xattr_map();
463 if (ox)
464 *px = *ox;
94b18763
FG
465 }
466
f67539c2 467 sr_t* ps = projected_inode::UNDEF_SRNODE;
94b18763 468 if (snap) {
f67539c2
TL
469 ps = prepare_new_srnode(0);
470 ++num_projected_srnodes;
94b18763
FG
471 }
472
f67539c2
TL
473 projected_nodes.emplace_back(pi, xattr ? px : ox , ps);
474 if (mut)
475 mut->add_projected_node(this);
476 dout(15) << __func__ << " " << pi->ino << dendl;
477 return projected_inode(std::move(pi), std::move(px), ps);
7c673cae
FG
478}
479
f67539c2 480void CInode::pop_and_dirty_projected_inode(LogSegment *ls, const MutationRef& mut)
7c673cae 481{
11fdf7f2 482 ceph_assert(!projected_nodes.empty());
f67539c2
TL
483 auto front = std::move(projected_nodes.front());
484 dout(15) << __func__ << " v" << front.inode->version << dendl;
f6b5b4d7 485
f67539c2
TL
486 projected_nodes.pop_front();
487 if (mut)
488 mut->remove_projected_node(this);
7c673cae 489
f67539c2
TL
490 bool pool_updated = get_inode()->layout.pool_id != front.inode->layout.pool_id;
491 bool pin_updated = (get_inode()->export_pin != front.inode->export_pin) ||
492 (get_inode()->export_ephemeral_distributed_pin !=
493 front.inode->export_ephemeral_distributed_pin);
7c673cae 494
f67539c2
TL
495 reset_inode(std::move(front.inode));
496 if (front.xattrs != get_xattrs())
497 reset_xattrs(std::move(front.xattrs));
7c673cae 498
f67539c2 499 if (front.snapnode != projected_inode::UNDEF_SRNODE) {
7c673cae 500 --num_projected_srnodes;
f67539c2 501 pop_projected_snaprealm(front.snapnode, false);
7c673cae
FG
502 }
503
f67539c2
TL
504 mark_dirty(ls);
505 if (get_inode()->is_backtrace_updated())
506 mark_dirty_parent(ls, pool_updated);
7c673cae 507
f67539c2
TL
508 if (pin_updated)
509 maybe_export_pin(true);
9f95a23c
TL
510}
511
11fdf7f2
TL
512sr_t *CInode::prepare_new_srnode(snapid_t snapid)
513{
514 const sr_t *cur_srnode = get_projected_srnode();
515 sr_t *new_srnode;
516
517 if (cur_srnode) {
518 new_srnode = new sr_t(*cur_srnode);
11fdf7f2
TL
519 } else {
520 if (snapid == 0)
521 snapid = mdcache->get_global_snaprealm()->get_newest_seq();
522 new_srnode = new sr_t();
523 new_srnode->seq = snapid;
524 new_srnode->created = snapid;
525 new_srnode->current_parent_since = get_oldest_snap();
526 }
527 return new_srnode;
528}
529
9f95a23c
TL
530const sr_t *CInode::get_projected_srnode() const {
531 if (num_projected_srnodes > 0) {
532 for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
533 if (it->snapnode != projected_inode::UNDEF_SRNODE)
534 return it->snapnode;
535 }
536 if (snaprealm)
537 return &snaprealm->srnode;
538 else
539 return NULL;
540}
541
11fdf7f2
TL
542void CInode::project_snaprealm(sr_t *new_srnode)
543{
544 dout(10) << __func__ << " " << new_srnode << dendl;
545 ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE);
546 projected_nodes.back().snapnode = new_srnode;
547 ++num_projected_srnodes;
548}
549
550void CInode::mark_snaprealm_global(sr_t *new_srnode)
551{
552 ceph_assert(!is_dir());
553 // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since'
554 new_srnode->last_destroyed = new_srnode->current_parent_since;
555 new_srnode->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
556 new_srnode->mark_parent_global();
557}
558
559void CInode::clear_snaprealm_global(sr_t *new_srnode)
560{
561 // restore 'current_parent_since'
562 new_srnode->current_parent_since = new_srnode->last_destroyed;
563 new_srnode->last_destroyed = 0;
564 new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq();
565 new_srnode->clear_parent_global();
566}
567
568bool CInode::is_projected_snaprealm_global() const
569{
570 const sr_t *srnode = get_projected_srnode();
571 if (srnode && srnode->is_parent_global())
572 return true;
573 return false;
574}
575
576void CInode::project_snaprealm_past_parent(SnapRealm *newparent)
577{
578 sr_t *new_snap = project_snaprealm();
579 record_snaprealm_past_parent(new_snap, newparent);
580}
581
582
7c673cae
FG
583/* if newparent != parent, add parent to past_parents
584 if parent DNE, we need to find what the parent actually is and fill that in */
11fdf7f2 585void CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent)
7c673cae 586{
11fdf7f2 587 ceph_assert(!new_snap->is_parent_global());
7c673cae
FG
588 SnapRealm *oldparent;
589 if (!snaprealm) {
590 oldparent = find_snaprealm();
11fdf7f2 591 } else {
7c673cae 592 oldparent = snaprealm->parent;
11fdf7f2 593 }
7c673cae
FG
594
595 if (newparent != oldparent) {
596 snapid_t oldparentseq = oldparent->get_newest_seq();
11fdf7f2
TL
597 if (oldparentseq + 1 > new_snap->current_parent_since) {
598 // copy old parent's snaps
599 const set<snapid_t>& snaps = oldparent->get_snaps();
600 auto p = snaps.lower_bound(new_snap->current_parent_since);
601 if (p != snaps.end())
602 new_snap->past_parent_snaps.insert(p, snaps.end());
603 if (oldparentseq > new_snap->seq)
604 new_snap->seq = oldparentseq;
7c673cae 605 }
11fdf7f2 606 new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae
FG
607 }
608}
609
adb31ebb 610void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *oldparent,
11fdf7f2 611 CDentry *dn, bool primary_dn)
7c673cae 612{
11fdf7f2 613 ceph_assert(new_snap->is_parent_global());
adb31ebb
TL
614
615 if (!oldparent)
616 oldparent = dn->get_dir()->inode->find_snaprealm();
11fdf7f2
TL
617 auto& snaps = oldparent->get_snaps();
618
619 if (!primary_dn) {
620 auto p = snaps.lower_bound(dn->first);
621 if (p != snaps.end())
622 new_snap->past_parent_snaps.insert(p, snaps.end());
adb31ebb 623 } else {
11fdf7f2
TL
624 // 'last_destroyed' is used as 'current_parent_since'
625 auto p = snaps.lower_bound(new_snap->last_destroyed);
626 if (p != snaps.end())
627 new_snap->past_parent_snaps.insert(p, snaps.end());
628 new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
629 }
630}
7c673cae 631
11fdf7f2
TL
632void CInode::early_pop_projected_snaprealm()
633{
634 ceph_assert(!projected_nodes.empty());
635 if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) {
636 pop_projected_snaprealm(projected_nodes.front().snapnode, true);
637 projected_nodes.front().snapnode = projected_inode::UNDEF_SRNODE;
638 --num_projected_srnodes;
7c673cae 639 }
11fdf7f2 640}
7c673cae 641
11fdf7f2
TL
642void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early)
643{
644 if (next_snaprealm) {
645 dout(10) << __func__ << (early ? " (early) " : " ")
646 << next_snaprealm << " seq " << next_snaprealm->seq << dendl;
f67539c2 647 if (!snaprealm)
11fdf7f2 648 open_snaprealm();
11fdf7f2 649
11fdf7f2
TL
650 auto old_flags = snaprealm->srnode.flags;
651 snaprealm->srnode = *next_snaprealm;
652 delete next_snaprealm;
7c673cae 653
11fdf7f2 654 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
11fdf7f2
TL
655 snaprealm->adjust_parent();
656 }
7c673cae 657
11fdf7f2
TL
658 if (snaprealm->parent)
659 dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl;
660 } else {
661 dout(10) << __func__ << (early ? " (early) null" : " null") << dendl;
662 ceph_assert(snaprealm);
663 snaprealm->merge_to(NULL);
664 }
7c673cae
FG
665}
666
667
668// ====== CInode =======
669
670// dirfrags
671
f67539c2
TL
672InodeStoreBase::inode_const_ptr InodeStoreBase::empty_inode = InodeStoreBase::allocate_inode();
673
11fdf7f2 674__u32 InodeStoreBase::hash_dentry_name(std::string_view dn)
7c673cae 675{
f67539c2 676 int which = inode->dir_layout.dl_dir_hash;
7c673cae
FG
677 if (!which)
678 which = CEPH_STR_HASH_LINUX;
11fdf7f2 679 ceph_assert(ceph_str_hash_valid(which));
7c673cae
FG
680 return ceph_str_hash(which, dn.data(), dn.length());
681}
682
11fdf7f2 683frag_t InodeStoreBase::pick_dirfrag(std::string_view dn)
7c673cae
FG
684{
685 if (dirfragtree.empty())
686 return frag_t(); // avoid the string hash if we can.
687
688 __u32 h = hash_dentry_name(dn);
689 return dirfragtree[h];
690}
691
9f95a23c 692std::pair<bool, std::vector<CDir*>> CInode::get_dirfrags_under(frag_t fg)
7c673cae 693{
9f95a23c
TL
694 std::pair<bool, std::vector<CDir*>> result;
695 auto& all = result.first;
696 auto& dirs = result.second;
697 all = false;
698
699 if (auto it = dirfrags.find(fg); it != dirfrags.end()){
700 all = true;
701 dirs.push_back(it->second);
702 return result;
7c673cae 703 }
9f95a23c
TL
704
705 int total = 0;
706 for(auto &[_fg, _dir] : dirfrags){
707 // frag_t.bits() can indicate the depth of the partition in the directory tree
708 // e.g.
709 // 01* : bit = 2, on the second floor
710 // *
711 // 0* 1*
712 // 00* 01* 10* 11* -- > level 2, bit = 2
713 // so fragA.bits > fragB.bits means fragA is deeper than fragB
714
715 if (fg.bits() >= _fg.bits()) {
716 if (_fg.contains(fg)) {
717 all = true;
718 return result;
719 }
720 } else {
721 if (fg.contains(_fg)) {
722 dirs.push_back(_dir);
723 // we can calculate how many sub slices a slice can be divided into
724 // frag_t(*) can be divided into two frags belonging to the first layer(0* 1*)
725 // or 2^2 frags belonging to the second layer(00* 01* 10* 11*)
726 // or (1 << (24 - frag_t(*).bits)) frags belonging to the 24th level
727 total += 1 << (24 - _fg.bits());
11fdf7f2 728 }
7c673cae 729 }
94b18763 730 }
7c673cae 731
9f95a23c
TL
732 // we convert all the frags into the frags of 24th layer to calculate whether all the frags are included in the memory cache
733 all = ((1<<(24-fg.bits())) == total);
734 return result;
7c673cae
FG
735}
736
737void CInode::verify_dirfrags()
738{
739 bool bad = false;
94b18763
FG
740 for (const auto &p : dirfrags) {
741 if (!dirfragtree.is_leaf(p.first)) {
742 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
743 << ": " << *p.second << dendl;
7c673cae
FG
744 bad = true;
745 }
746 }
11fdf7f2 747 ceph_assert(!bad);
7c673cae
FG
748}
749
750void CInode::force_dirfrags()
751{
752 bool bad = false;
94b18763
FG
753 for (auto &p : dirfrags) {
754 if (!dirfragtree.is_leaf(p.first)) {
755 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
756 << ": " << *p.second << dendl;
7c673cae
FG
757 bad = true;
758 }
759 }
760
761 if (bad) {
11fdf7f2 762 frag_vec_t leaves;
7c673cae 763 dirfragtree.get_leaves(leaves);
11fdf7f2
TL
764 for (const auto& leaf : leaves) {
765 mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true);
766 }
7c673cae
FG
767 }
768
769 verify_dirfrags();
770}
771
772CDir *CInode::get_approx_dirfrag(frag_t fg)
773{
774 CDir *dir = get_dirfrag(fg);
775 if (dir) return dir;
776
777 // find a child?
9f95a23c
TL
778 auto&& p = get_dirfrags_under(fg);
779 if (!p.second.empty())
780 return p.second.front();
7c673cae
FG
781
782 // try parents?
783 while (fg.bits() > 0) {
784 fg = fg.parent();
785 dir = get_dirfrag(fg);
786 if (dir) return dir;
787 }
788 return NULL;
789}
790
7c673cae
FG
791CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg)
792{
11fdf7f2 793 ceph_assert(is_dir());
7c673cae
FG
794
795 // have it?
796 CDir *dir = get_dirfrag(fg);
797 if (!dir) {
798 // create it.
11fdf7f2 799 ceph_assert(is_auth() || mdcache->mds->is_any_replay());
7c673cae
FG
800 dir = new CDir(this, fg, mdcache, is_auth());
801 add_dirfrag(dir);
802 }
803 return dir;
804}
805
806CDir *CInode::add_dirfrag(CDir *dir)
807{
11fdf7f2
TL
808 auto em = dirfrags.emplace(std::piecewise_construct, std::forward_as_tuple(dir->dirfrag().frag), std::forward_as_tuple(dir));
809 ceph_assert(em.second);
7c673cae
FG
810
811 if (stickydir_ref > 0) {
812 dir->state_set(CDir::STATE_STICKY);
813 dir->get(CDir::PIN_STICKY);
814 }
815
f67539c2 816 maybe_export_pin();
7c673cae
FG
817
818 return dir;
819}
820
821void CInode::close_dirfrag(frag_t fg)
822{
11fdf7f2
TL
823 dout(14) << __func__ << " " << fg << dendl;
824 ceph_assert(dirfrags.count(fg));
7c673cae
FG
825
826 CDir *dir = dirfrags[fg];
827 dir->remove_null_dentries();
828
829 // clear dirty flag
830 if (dir->is_dirty())
831 dir->mark_clean();
832
833 if (stickydir_ref > 0) {
834 dir->state_clear(CDir::STATE_STICKY);
835 dir->put(CDir::PIN_STICKY);
836 }
1adf2230
AA
837
838 if (dir->is_subtree_root())
839 num_subtree_roots--;
7c673cae
FG
840
841 // dump any remaining dentries, for debugging purposes
94b18763
FG
842 for (const auto &p : dir->items)
843 dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl;
7c673cae 844
11fdf7f2 845 ceph_assert(dir->get_num_ref() == 0);
7c673cae
FG
846 delete dir;
847 dirfrags.erase(fg);
848}
849
850void CInode::close_dirfrags()
851{
852 while (!dirfrags.empty())
853 close_dirfrag(dirfrags.begin()->first);
854}
855
856bool CInode::has_subtree_root_dirfrag(int auth)
857{
1adf2230
AA
858 if (num_subtree_roots > 0) {
859 if (auth == -1)
7c673cae 860 return true;
1adf2230
AA
861 for (const auto &p : dirfrags) {
862 if (p.second->is_subtree_root() &&
863 p.second->dir_auth.first == auth)
864 return true;
865 }
94b18763 866 }
7c673cae
FG
867 return false;
868}
869
870bool CInode::has_subtree_or_exporting_dirfrag()
871{
1adf2230
AA
872 if (num_subtree_roots > 0 || num_exporting_dirs > 0)
873 return true;
7c673cae
FG
874 return false;
875}
876
877void CInode::get_stickydirs()
878{
879 if (stickydir_ref == 0) {
880 get(PIN_STICKYDIRS);
94b18763
FG
881 for (const auto &p : dirfrags) {
882 p.second->state_set(CDir::STATE_STICKY);
883 p.second->get(CDir::PIN_STICKY);
7c673cae
FG
884 }
885 }
886 stickydir_ref++;
887}
888
889void CInode::put_stickydirs()
890{
11fdf7f2 891 ceph_assert(stickydir_ref > 0);
7c673cae
FG
892 stickydir_ref--;
893 if (stickydir_ref == 0) {
894 put(PIN_STICKYDIRS);
94b18763
FG
895 for (const auto &p : dirfrags) {
896 p.second->state_clear(CDir::STATE_STICKY);
897 p.second->put(CDir::PIN_STICKY);
7c673cae
FG
898 }
899 }
900}
901
902
903
904
905
906// pins
907
908void CInode::first_get()
909{
910 // pin my dentry?
911 if (parent)
912 parent->get(CDentry::PIN_INODEPIN);
913}
914
915void CInode::last_put()
916{
917 // unpin my dentry?
918 if (parent)
919 parent->put(CDentry::PIN_INODEPIN);
920}
921
922void CInode::_put()
923{
924 if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
925 mdcache->maybe_eval_stray(this, true);
926}
927
928void CInode::add_remote_parent(CDentry *p)
929{
930 if (remote_parents.empty())
931 get(PIN_REMOTEPARENT);
932 remote_parents.insert(p);
933}
934void CInode::remove_remote_parent(CDentry *p)
935{
936 remote_parents.erase(p);
937 if (remote_parents.empty())
938 put(PIN_REMOTEPARENT);
939}
940
941
942
943
944CDir *CInode::get_parent_dir()
945{
946 if (parent)
947 return parent->dir;
948 return NULL;
949}
950CDir *CInode::get_projected_parent_dir()
951{
952 CDentry *p = get_projected_parent_dn();
953 if (p)
954 return p->dir;
955 return NULL;
956}
957CInode *CInode::get_parent_inode()
958{
959 if (parent)
960 return parent->dir->inode;
961 return NULL;
962}
963
11fdf7f2 964bool CInode::is_ancestor_of(const CInode *other) const
7c673cae
FG
965{
966 while (other) {
967 if (other == this)
968 return true;
11fdf7f2
TL
969 const CDentry *pdn = other->get_oldest_parent_dn();
970 if (!pdn) {
971 ceph_assert(other->is_base());
7c673cae 972 break;
11fdf7f2
TL
973 }
974 other = pdn->get_dir()->get_inode();
975 }
976 return false;
977}
978
979bool CInode::is_projected_ancestor_of(const CInode *other) const
980{
981 while (other) {
982 if (other == this)
983 return true;
984 const CDentry *pdn = other->get_projected_parent_dn();
985 if (!pdn) {
986 ceph_assert(other->is_base());
987 break;
988 }
989 other = pdn->get_dir()->get_inode();
7c673cae
FG
990 }
991 return false;
992}
993
994/*
995 * Because a non-directory inode may have multiple links, the use_parent
996 * argument allows selecting which parent to use for path construction. This
997 * argument is only meaningful for the final component (i.e. the first of the
998 * nested calls) because directories cannot have multiple hard links. If
999 * use_parent is NULL and projected is true, the primary parent's projected
1000 * inode is used all the way up the path chain. Otherwise the primary parent
1001 * stable inode is used.
1002 */
1003void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const
1004{
1005 if (!use_parent) {
1006 use_parent = projected ? get_projected_parent_dn() : parent;
1007 }
1008
1009 if (use_parent) {
1010 use_parent->make_path_string(s, projected);
1011 } else if (is_root()) {
1012 s = "";
1013 } else if (is_mdsdir()) {
1014 char t[40];
1015 uint64_t eino(ino());
1016 eino -= MDS_INO_MDSDIR_OFFSET;
1017 snprintf(t, sizeof(t), "~mds%" PRId64, eino);
1018 s = t;
1019 } else {
1020 char n[40];
1021 uint64_t eino(ino());
1022 snprintf(n, sizeof(n), "#%" PRIx64, eino);
1023 s += n;
1024 }
1025}
1026
1027void CInode::make_path(filepath& fp, bool projected) const
1028{
1029 const CDentry *use_parent = projected ? get_projected_parent_dn() : parent;
1030 if (use_parent) {
11fdf7f2 1031 ceph_assert(!is_base());
7c673cae
FG
1032 use_parent->make_path(fp, projected);
1033 } else {
1034 fp = filepath(ino());
1035 }
1036}
1037
1038void CInode::name_stray_dentry(string& dname)
1039{
1040 char s[20];
f67539c2 1041 snprintf(s, sizeof(s), "%llx", (unsigned long long)ino().val);
7c673cae
FG
1042 dname = s;
1043}
1044
1045version_t CInode::pre_dirty()
1046{
1047 version_t pv;
1048 CDentry* _cdentry = get_projected_parent_dn();
1049 if (_cdentry) {
1050 pv = _cdentry->pre_dirty(get_projected_version());
f67539c2 1051 dout(10) << "pre_dirty " << pv << " (current v " << get_inode()->version << ")" << dendl;
7c673cae 1052 } else {
11fdf7f2 1053 ceph_assert(is_base());
7c673cae
FG
1054 pv = get_projected_version() + 1;
1055 }
94b18763 1056 // force update backtrace for old format inode (see mempool_inode::decode)
f67539c2
TL
1057 if (get_inode()->backtrace_version == 0 && !projected_nodes.empty()) {
1058 auto pi = _get_projected_inode();
1059 if (pi->backtrace_version == 0)
1060 pi->update_backtrace(pv);
7c673cae
FG
1061 }
1062 return pv;
1063}
1064
1065void CInode::_mark_dirty(LogSegment *ls)
1066{
1067 if (!state_test(STATE_DIRTY)) {
1068 state_set(STATE_DIRTY);
1069 get(PIN_DIRTY);
11fdf7f2 1070 ceph_assert(ls);
7c673cae
FG
1071 }
1072
1073 // move myself to this segment's dirty list
1074 if (ls)
1075 ls->dirty_inodes.push_back(&item_dirty);
1076}
1077
f67539c2 1078void CInode::mark_dirty(LogSegment *ls) {
7c673cae 1079
11fdf7f2 1080 dout(10) << __func__ << " " << *this << dendl;
7c673cae
FG
1081
1082 /*
1083 NOTE: I may already be dirty, but this fn _still_ needs to be called so that
1084 the directory is (perhaps newly) dirtied, and so that parent_dir_version is
1085 updated below.
1086 */
1087
1088 // only auth can get dirty. "dirty" async data in replicas is relative to
1089 // filelock state, not the dirty flag.
11fdf7f2 1090 ceph_assert(is_auth());
7c673cae
FG
1091
1092 // touch my private version
7c673cae
FG
1093 _mark_dirty(ls);
1094
1095 // mark dentry too
1096 if (parent)
f67539c2 1097 parent->mark_dirty(get_version(), ls);
7c673cae
FG
1098}
1099
1100
1101void CInode::mark_clean()
1102{
11fdf7f2 1103 dout(10) << __func__ << " " << *this << dendl;
7c673cae
FG
1104 if (state_test(STATE_DIRTY)) {
1105 state_clear(STATE_DIRTY);
1106 put(PIN_DIRTY);
1107
1108 // remove myself from ls dirty list
1109 item_dirty.remove_myself();
1110 }
1111}
1112
1113
1114// --------------
1115// per-inode storage
1116// (currently for root inode only)
1117
1118struct C_IO_Inode_Stored : public CInodeIOContext {
1119 version_t version;
1120 Context *fin;
1121 C_IO_Inode_Stored(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1122 void finish(int r) override {
1123 in->_stored(r, version, fin);
1124 }
91327a77
AA
1125 void print(ostream& out) const override {
1126 out << "inode_store(" << in->ino() << ")";
1127 }
7c673cae
FG
1128};
1129
11fdf7f2 1130object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix)
7c673cae
FG
1131{
1132 char n[60];
11fdf7f2
TL
1133 snprintf(n, sizeof(n), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)fg);
1134 ceph_assert(strlen(n) + suffix.size() < sizeof n);
1135 strncat(n, suffix.data(), suffix.size());
7c673cae
FG
1136 return object_t(n);
1137}
1138
11fdf7f2 1139void CInode::store(MDSContext *fin)
7c673cae 1140{
11fdf7f2
TL
1141 dout(10) << __func__ << " " << get_version() << dendl;
1142 ceph_assert(is_base());
7c673cae
FG
1143
1144 if (snaprealm)
1145 purge_stale_snap_data(snaprealm->get_snaps());
1146
1147 // encode
1148 bufferlist bl;
1149 string magic = CEPH_FS_ONDISK_MAGIC;
11fdf7f2
TL
1150 using ceph::encode;
1151 encode(magic, bl);
7c673cae
FG
1152 encode_store(bl, mdcache->mds->mdsmap->get_up_features());
1153
1154 // write it.
1155 SnapContext snapc;
1156 ObjectOperation m;
1157 m.write_full(bl);
1158
1159 object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode");
b3b6e05e 1160 object_locator_t oloc(mdcache->mds->get_metadata_pool());
7c673cae
FG
1161
1162 Context *newfin =
1163 new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin),
1164 mdcache->mds->finisher);
1165 mdcache->mds->objecter->mutate(oid, oloc, m, snapc,
1166 ceph::real_clock::now(), 0,
1167 newfin);
1168}
1169
1170void CInode::_stored(int r, version_t v, Context *fin)
1171{
1172 if (r < 0) {
1173 dout(1) << "store error " << r << " v " << v << " on " << *this << dendl;
d2e6a577
FG
1174 mdcache->mds->clog->error() << "failed to store inode " << ino()
1175 << " object: " << cpp_strerror(r);
7c673cae
FG
1176 mdcache->mds->handle_write_error(r);
1177 fin->complete(r);
1178 return;
1179 }
1180
11fdf7f2 1181 dout(10) << __func__ << " " << v << " on " << *this << dendl;
7c673cae
FG
1182 if (v == get_projected_version())
1183 mark_clean();
1184
1185 fin->complete(0);
1186}
1187
11fdf7f2 1188void CInode::flush(MDSContext *fin)
7c673cae 1189{
11fdf7f2
TL
1190 dout(10) << __func__ << " " << *this << dendl;
1191 ceph_assert(is_auth() && can_auth_pin());
7c673cae
FG
1192
1193 MDSGatherBuilder gather(g_ceph_context);
1194
1195 if (is_dirty_parent()) {
1196 store_backtrace(gather.new_sub());
1197 }
1198 if (is_dirty()) {
1199 if (is_base()) {
1200 store(gather.new_sub());
1201 } else {
1202 parent->dir->commit(0, gather.new_sub());
1203 }
1204 }
1205
1206 if (gather.has_subs()) {
1207 gather.set_finisher(fin);
1208 gather.activate();
1209 } else {
1210 fin->complete(0);
1211 }
1212}
1213
1214struct C_IO_Inode_Fetched : public CInodeIOContext {
1215 bufferlist bl, bl2;
1216 Context *fin;
1217 C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {}
1218 void finish(int r) override {
f67539c2 1219 // Ignore 'r', because we fetch from two places, so r is usually CEPHFS_ENOENT
7c673cae
FG
1220 in->_fetched(bl, bl2, fin);
1221 }
91327a77
AA
1222 void print(ostream& out) const override {
1223 out << "inode_fetch(" << in->ino() << ")";
1224 }
7c673cae
FG
1225};
1226
11fdf7f2 1227void CInode::fetch(MDSContext *fin)
7c673cae 1228{
11fdf7f2 1229 dout(10) << __func__ << dendl;
7c673cae
FG
1230
1231 C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin);
1232 C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, mdcache->mds->finisher));
1233
1234 object_t oid = CInode::get_object_name(ino(), frag_t(), "");
b3b6e05e 1235 object_locator_t oloc(mdcache->mds->get_metadata_pool());
7c673cae
FG
1236
1237 // Old on-disk format: inode stored in xattr of a dirfrag
1238 ObjectOperation rd;
1239 rd.getxattr("inode", &c->bl, NULL);
1240 mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub());
1241
1242 // Current on-disk format: inode stored in a .inode object
1243 object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode");
1244 mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub());
1245
1246 gather.activate();
1247}
1248
1249void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
1250{
11fdf7f2
TL
1251 dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl;
1252 bufferlist::const_iterator p;
7c673cae 1253 if (bl2.length()) {
11fdf7f2 1254 p = bl2.cbegin();
7c673cae 1255 } else if (bl.length()) {
11fdf7f2 1256 p = bl.cbegin();
7c673cae 1257 } else {
d2e6a577 1258 derr << "No data while reading inode " << ino() << dendl;
f67539c2 1259 fin->complete(-CEPHFS_ENOENT);
7c673cae
FG
1260 return;
1261 }
1262
11fdf7f2 1263 using ceph::decode;
7c673cae
FG
1264 // Attempt decode
1265 try {
1266 string magic;
11fdf7f2 1267 decode(magic, p);
7c673cae
FG
1268 dout(10) << " magic is '" << magic << "' (expecting '"
1269 << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
1270 if (magic != CEPH_FS_ONDISK_MAGIC) {
1271 dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
1272 << "'" << dendl;
f67539c2 1273 fin->complete(-CEPHFS_EINVAL);
7c673cae
FG
1274 } else {
1275 decode_store(p);
1276 dout(10) << "_fetched " << *this << dendl;
1277 fin->complete(0);
1278 }
1279 } catch (buffer::error &err) {
f67539c2
TL
1280 derr << "Corrupt inode " << ino() << ": " << err.what() << dendl;
1281 fin->complete(-CEPHFS_EINVAL);
7c673cae
FG
1282 return;
1283 }
1284}
1285
1286void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt)
1287{
f67539c2 1288 bt.ino = ino();
7c673cae
FG
1289 bt.ancestors.clear();
1290 bt.pool = pool;
1291
1292 CInode *in = this;
1293 CDentry *pdn = get_parent_dn();
1294 while (pdn) {
1295 CInode *diri = pdn->get_dir()->get_inode();
f67539c2 1296 bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), in->get_inode()->version));
7c673cae
FG
1297 in = diri;
1298 pdn = in->get_parent_dn();
1299 }
f67539c2
TL
1300 bt.old_pools.reserve(get_inode()->old_pools.size());
1301 for (auto &p : get_inode()->old_pools) {
7c673cae 1302 // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
94b18763 1303 if (p != pool)
f67539c2 1304 bt.old_pools.push_back(p);
7c673cae
FG
1305 }
1306}
1307
1308struct C_IO_Inode_StoredBacktrace : public CInodeIOContext {
1309 version_t version;
1310 Context *fin;
1311 C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1312 void finish(int r) override {
1313 in->_stored_backtrace(r, version, fin);
1314 }
91327a77
AA
1315 void print(ostream& out) const override {
1316 out << "backtrace_store(" << in->ino() << ")";
1317 }
7c673cae
FG
1318};
1319
f67539c2
TL
1320
1321void CInode::_commit_ops(int r, C_GatherBuilder &gather_bld,
1322 std::vector<CInodeCommitOperation> &ops_vec,
1323 inode_backtrace_t &bt)
1324{
1325 dout(10) << __func__ << dendl;
1326
1327 if (r < 0) {
1328 mdcache->mds->handle_write_error_with_lock(r);
1329 return;
1330 }
1331
1332 SnapContext snapc;
1333 object_t oid = get_object_name(ino(), frag_t(), "");
1334
1335 for (auto &op : ops_vec) {
1336 ObjectOperation obj_op;
1337 object_locator_t oloc(op.get_pool());
1338 op.update(obj_op, bt);
1339 mdcache->mds->objecter->mutate(oid, oloc, obj_op, snapc,
1340 ceph::real_clock::now(),
1341 0, gather_bld.new_sub());
1342 }
1343}
1344
1345void CInode::_store_backtrace(std::vector<CInodeCommitOperation> &ops_vec,
1346 inode_backtrace_t &bt, int op_prio)
7c673cae 1347{
11fdf7f2
TL
1348 dout(10) << __func__ << " on " << *this << dendl;
1349 ceph_assert(is_dirty_parent());
7c673cae
FG
1350
1351 if (op_prio < 0)
1352 op_prio = CEPH_MSG_PRIO_DEFAULT;
1353
1354 auth_pin(this);
1355
1356 const int64_t pool = get_backtrace_pool();
7c673cae 1357 build_backtrace(pool, bt);
7c673cae 1358
20effc67
TL
1359 std::string_view slink = "";
1360 if (is_symlink() && mdcache->get_symlink_recovery()) {
1361 slink = symlink;
1362 }
1363
f67539c2 1364 ops_vec.emplace_back(op_prio, pool, get_inode()->layout,
20effc67 1365 mdcache->mds->mdsmap->get_up_features(), slink);
7c673cae 1366
f67539c2 1367 if (!state_test(STATE_DIRTYPOOL) || get_inode()->old_pools.empty()) {
7c673cae 1368 dout(20) << __func__ << ": no dirtypool or no old pools" << dendl;
7c673cae
FG
1369 return;
1370 }
1371
7c673cae
FG
1372 // In the case where DIRTYPOOL is set, we update all old pools backtraces
1373 // such that anyone reading them will see the new pool ID in
1374 // inode_backtrace_t::pool and go read everything else from there.
f67539c2 1375 for (const auto &p : get_inode()->old_pools) {
94b18763 1376 if (p == pool)
7c673cae
FG
1377 continue;
1378
94b18763 1379 dout(20) << __func__ << ": updating old pool " << p << dendl;
7c673cae 1380
f67539c2 1381 ops_vec.emplace_back(op_prio, p);
7c673cae 1382 }
f67539c2
TL
1383}
1384
1385void CInode::store_backtrace(MDSContext *fin, int op_prio)
1386{
1387 std::vector<CInodeCommitOperation> ops_vec;
1388 inode_backtrace_t bt;
1389 auto version = get_inode()->backtrace_version;
1390
1391 _store_backtrace(ops_vec, bt, op_prio);
1392
1393 C_GatherBuilder gather(g_ceph_context,
1394 new C_OnFinisher(
1395 new C_IO_Inode_StoredBacktrace(this, version, fin),
1396 mdcache->mds->finisher));
1397 _commit_ops(0, gather, ops_vec, bt);
1398 ceph_assert(gather.has_subs());
7c673cae
FG
1399 gather.activate();
1400}
1401
f67539c2
TL
1402void CInode::store_backtrace(CInodeCommitOperations &op, int op_prio)
1403{
1404 op.version = get_inode()->backtrace_version;
1405 op.in = this;
1406
1407 _store_backtrace(op.ops_vec, op.bt, op_prio);
1408}
1409
7c673cae
FG
1410void CInode::_stored_backtrace(int r, version_t v, Context *fin)
1411{
f67539c2 1412 if (r == -CEPHFS_ENOENT) {
7c673cae
FG
1413 const int64_t pool = get_backtrace_pool();
1414 bool exists = mdcache->mds->objecter->with_osdmap(
1415 [pool](const OSDMap &osd_map) {
1416 return osd_map.have_pg_pool(pool);
1417 });
1418
f67539c2 1419 // This CEPHFS_ENOENT is because the pool doesn't exist (the user deleted it
7c673cae
FG
1420 // out from under us), so the backtrace can never be written, so pretend
1421 // to succeed so that the user can proceed to e.g. delete the file.
1422 if (!exists) {
f67539c2 1423 dout(4) << __func__ << " got CEPHFS_ENOENT: a data pool was deleted "
7c673cae
FG
1424 "beneath us!" << dendl;
1425 r = 0;
1426 }
1427 }
1428
1429 if (r < 0) {
1430 dout(1) << "store backtrace error " << r << " v " << v << dendl;
1431 mdcache->mds->clog->error() << "failed to store backtrace on ino "
1432 << ino() << " object"
1433 << ", pool " << get_backtrace_pool()
1434 << ", errno " << r;
1435 mdcache->mds->handle_write_error(r);
1436 if (fin)
1437 fin->complete(r);
1438 return;
1439 }
1440
11fdf7f2 1441 dout(10) << __func__ << " v " << v << dendl;
7c673cae
FG
1442
1443 auth_unpin(this);
f67539c2 1444 if (v == get_inode()->backtrace_version)
7c673cae
FG
1445 clear_dirty_parent();
1446 if (fin)
1447 fin->complete(0);
1448}
1449
1450void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
1451{
f67539c2 1452 mdcache->fetch_backtrace(ino(), get_backtrace_pool(), *backtrace, fin);
7c673cae
FG
1453}
1454
28e407b8 1455void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool)
7c673cae
FG
1456{
1457 if (!state_test(STATE_DIRTYPARENT)) {
11fdf7f2 1458 dout(10) << __func__ << dendl;
7c673cae
FG
1459 state_set(STATE_DIRTYPARENT);
1460 get(PIN_DIRTYPARENT);
11fdf7f2 1461 ceph_assert(ls);
7c673cae
FG
1462 }
1463 if (dirty_pool)
1464 state_set(STATE_DIRTYPOOL);
1465 if (ls)
1466 ls->dirty_parent_inodes.push_back(&item_dirty_parent);
1467}
1468
1469void CInode::clear_dirty_parent()
1470{
1471 if (state_test(STATE_DIRTYPARENT)) {
11fdf7f2 1472 dout(10) << __func__ << dendl;
7c673cae
FG
1473 state_clear(STATE_DIRTYPARENT);
1474 state_clear(STATE_DIRTYPOOL);
1475 put(PIN_DIRTYPARENT);
1476 item_dirty_parent.remove_myself();
1477 }
1478}
1479
1480void CInode::verify_diri_backtrace(bufferlist &bl, int err)
1481{
1482 if (is_base() || is_dirty_parent() || !is_auth())
1483 return;
1484
11fdf7f2 1485 dout(10) << __func__ << dendl;
7c673cae
FG
1486
1487 if (err == 0) {
1488 inode_backtrace_t backtrace;
11fdf7f2
TL
1489 using ceph::decode;
1490 decode(backtrace, bl);
7c673cae
FG
1491 CDentry *pdn = get_parent_dn();
1492 if (backtrace.ancestors.empty() ||
94b18763 1493 backtrace.ancestors[0].dname != pdn->get_name() ||
7c673cae 1494 backtrace.ancestors[0].dirino != pdn->get_dir()->ino())
f67539c2 1495 err = -CEPHFS_EINVAL;
7c673cae
FG
1496 }
1497
1498 if (err) {
1499 MDSRank *mds = mdcache->mds;
d2e6a577 1500 mds->clog->error() << "bad backtrace on directory inode " << ino();
11fdf7f2 1501 ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1));
7c673cae 1502
28e407b8 1503 mark_dirty_parent(mds->mdlog->get_current_segment(), false);
7c673cae
FG
1504 mds->mdlog->flush();
1505 }
1506}
1507
1508// ------------------
1509// parent dir
1510
1511
f67539c2
TL
1512void InodeStoreBase::encode_xattrs(bufferlist &bl) const {
1513 using ceph::encode;
1514 if (xattrs)
1515 encode(*xattrs, bl);
1516 else
1517 encode((__u32)0, bl);
1518}
1519
1520void InodeStoreBase::decode_xattrs(bufferlist::const_iterator &p) {
1521 using ceph::decode;
1522 mempool_xattr_map tmp;
1523 decode_noshare(tmp, p);
1524 if (tmp.empty()) {
1525 reset_xattrs(xattr_map_ptr());
1526 } else {
1527 reset_xattrs(allocate_xattr_map(std::move(tmp)));
1528 }
1529}
1530
1531void InodeStoreBase::encode_old_inodes(bufferlist &bl, uint64_t features) const {
1532 using ceph::encode;
1533 if (old_inodes)
1534 encode(*old_inodes, bl, features);
1535 else
1536 encode((__u32)0, bl);
1537}
1538
1539void InodeStoreBase::decode_old_inodes(bufferlist::const_iterator &p) {
1540 using ceph::decode;
1541 mempool_old_inode_map tmp;
1542 decode(tmp, p);
1543 if (tmp.empty()) {
1544 reset_old_inodes(old_inode_map_ptr());
1545 } else {
1546 reset_old_inodes(allocate_old_inode_map(std::move(tmp)));
1547 }
1548}
1549
7c673cae
FG
1550void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features,
1551 const bufferlist *snap_blob) const
1552{
11fdf7f2 1553 using ceph::encode;
f67539c2
TL
1554 encode(*inode, bl, features);
1555 if (inode->is_symlink())
11fdf7f2
TL
1556 encode(symlink, bl);
1557 encode(dirfragtree, bl);
f67539c2
TL
1558 encode_xattrs(bl);
1559
7c673cae 1560 if (snap_blob)
11fdf7f2 1561 encode(*snap_blob, bl);
7c673cae 1562 else
11fdf7f2 1563 encode(bufferlist(), bl);
f67539c2 1564 encode_old_inodes(bl, features);
11fdf7f2
TL
1565 encode(oldest_snap, bl);
1566 encode(damage_flags, bl);
7c673cae
FG
1567}
1568
1569void InodeStoreBase::encode(bufferlist &bl, uint64_t features,
1570 const bufferlist *snap_blob) const
1571{
1572 ENCODE_START(6, 4, bl);
1573 encode_bare(bl, features, snap_blob);
1574 ENCODE_FINISH(bl);
1575}
1576
1577void CInode::encode_store(bufferlist& bl, uint64_t features)
1578{
1579 bufferlist snap_blob;
1580 encode_snap_blob(snap_blob);
1581 InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(),
1582 &snap_blob);
1583}
1584
11fdf7f2 1585void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl,
7c673cae
FG
1586 bufferlist& snap_blob, __u8 struct_v)
1587{
11fdf7f2 1588 using ceph::decode;
f67539c2
TL
1589
1590 auto _inode = allocate_inode();
1591 decode(*_inode, bl);
1592
1593 if (_inode->is_symlink()) {
94b18763 1594 std::string tmp;
11fdf7f2
TL
1595 decode(tmp, bl);
1596 symlink = std::string_view(tmp);
94b18763 1597 }
11fdf7f2 1598 decode(dirfragtree, bl);
f67539c2 1599 decode_xattrs(bl);
11fdf7f2 1600 decode(snap_blob, bl);
7c673cae 1601
f67539c2
TL
1602 decode_old_inodes(bl);
1603 if (struct_v == 2 && _inode->is_dir()) {
7c673cae 1604 bool default_layout_exists;
11fdf7f2 1605 decode(default_layout_exists, bl);
7c673cae 1606 if (default_layout_exists) {
11fdf7f2 1607 decode(struct_v, bl); // this was a default_file_layout
f67539c2 1608 decode(_inode->layout, bl); // but we only care about the layout portion
7c673cae
FG
1609 }
1610 }
1611
1612 if (struct_v >= 5) {
1613 // InodeStore is embedded in dentries without proper versioning, so
1614 // we consume up to the end of the buffer
1615 if (!bl.end()) {
11fdf7f2 1616 decode(oldest_snap, bl);
7c673cae
FG
1617 }
1618
1619 if (!bl.end()) {
11fdf7f2 1620 decode(damage_flags, bl);
7c673cae
FG
1621 }
1622 }
f67539c2
TL
1623
1624 reset_inode(std::move(_inode));
7c673cae
FG
1625}
1626
1627
11fdf7f2 1628void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob)
7c673cae
FG
1629{
1630 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
1631 decode_bare(bl, snap_blob, struct_v);
1632 DECODE_FINISH(bl);
1633}
1634
11fdf7f2 1635void CInode::decode_store(bufferlist::const_iterator& bl)
7c673cae
FG
1636{
1637 bufferlist snap_blob;
1638 InodeStoreBase::decode(bl, snap_blob);
1639 decode_snap_blob(snap_blob);
1640}
1641
1642// ------------------
1643// locking
1644
9f95a23c
TL
1645SimpleLock* CInode::get_lock(int type)
1646{
1647 switch (type) {
1648 case CEPH_LOCK_IVERSION: return &versionlock;
1649 case CEPH_LOCK_IFILE: return &filelock;
1650 case CEPH_LOCK_IAUTH: return &authlock;
1651 case CEPH_LOCK_ILINK: return &linklock;
1652 case CEPH_LOCK_IDFT: return &dirfragtreelock;
1653 case CEPH_LOCK_IXATTR: return &xattrlock;
1654 case CEPH_LOCK_ISNAP: return &snaplock;
1655 case CEPH_LOCK_INEST: return &nestlock;
1656 case CEPH_LOCK_IFLOCK: return &flocklock;
1657 case CEPH_LOCK_IPOLICY: return &policylock;
1658 }
1659 return 0;
1660}
1661
7c673cae
FG
1662void CInode::set_object_info(MDSCacheObjectInfo &info)
1663{
1664 info.ino = ino();
1665 info.snapid = last;
1666}
1667
9f95a23c 1668void CInode::encode_lock_iauth(bufferlist& bl)
7c673cae 1669{
9f95a23c 1670 ENCODE_START(1, 1, bl);
f67539c2
TL
1671 encode(get_inode()->version, bl);
1672 encode(get_inode()->ctime, bl);
1673 encode(get_inode()->mode, bl);
1674 encode(get_inode()->uid, bl);
1675 encode(get_inode()->gid, bl);
9f95a23c
TL
1676 ENCODE_FINISH(bl);
1677}
7c673cae 1678
9f95a23c
TL
1679void CInode::decode_lock_iauth(bufferlist::const_iterator& p)
1680{
f67539c2
TL
1681 ceph_assert(!is_auth());
1682 auto _inode = allocate_inode(*get_inode());
9f95a23c 1683 DECODE_START(1, p);
f67539c2 1684 decode(_inode->version, p);
9f95a23c
TL
1685 utime_t tm;
1686 decode(tm, p);
f67539c2
TL
1687 if (_inode->ctime < tm) _inode->ctime = tm;
1688 decode(_inode->mode, p);
1689 decode(_inode->uid, p);
1690 decode(_inode->gid, p);
9f95a23c 1691 DECODE_FINISH(p);
f67539c2 1692 reset_inode(std::move(_inode));
9f95a23c
TL
1693}
1694
1695void CInode::encode_lock_ilink(bufferlist& bl)
1696{
1697 ENCODE_START(1, 1, bl);
f67539c2
TL
1698 encode(get_inode()->version, bl);
1699 encode(get_inode()->ctime, bl);
1700 encode(get_inode()->nlink, bl);
9f95a23c
TL
1701 ENCODE_FINISH(bl);
1702}
1703
1704void CInode::decode_lock_ilink(bufferlist::const_iterator& p)
1705{
f67539c2
TL
1706 ceph_assert(!is_auth());
1707 auto _inode = allocate_inode(*get_inode());
9f95a23c 1708 DECODE_START(1, p);
f67539c2 1709 decode(_inode->version, p);
9f95a23c
TL
1710 utime_t tm;
1711 decode(tm, p);
f67539c2
TL
1712 if (_inode->ctime < tm) _inode->ctime = tm;
1713 decode(_inode->nlink, p);
9f95a23c 1714 DECODE_FINISH(p);
f67539c2 1715 reset_inode(std::move(_inode));
9f95a23c
TL
1716}
1717
1718void CInode::encode_lock_idft(bufferlist& bl)
1719{
1720 ENCODE_START(1, 1, bl);
1721 if (is_auth()) {
f67539c2 1722 encode(get_inode()->version, bl);
9f95a23c
TL
1723 } else {
1724 // treat flushing as dirty when rejoining cache
1725 bool dirty = dirfragtreelock.is_dirty_or_flushing();
1726 encode(dirty, bl);
1727 }
1728 {
1729 // encode the raw tree
1730 encode(dirfragtree, bl);
1731
1732 // also specify which frags are mine
1733 set<frag_t> myfrags;
1734 auto&& dfls = get_dirfrags();
1735 for (const auto& dir : dfls) {
1736 if (dir->is_auth()) {
1737 frag_t fg = dir->get_frag();
1738 myfrags.insert(fg);
1739 }
1740 }
1741 encode(myfrags, bl);
1742 }
1743 ENCODE_FINISH(bl);
1744}
1745
1746void CInode::decode_lock_idft(bufferlist::const_iterator& p)
1747{
f67539c2
TL
1748 inode_ptr _inode;
1749
9f95a23c
TL
1750 DECODE_START(1, p);
1751 if (is_auth()) {
1752 bool replica_dirty;
1753 decode(replica_dirty, p);
1754 if (replica_dirty) {
1755 dout(10) << __func__ << " setting dftlock dirty flag" << dendl;
1756 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1757 }
1758 } else {
f67539c2
TL
1759 _inode = allocate_inode(*get_inode());
1760 decode(_inode->version, p);
9f95a23c
TL
1761 }
1762 {
1763 fragtree_t temp;
1764 decode(temp, p);
1765 set<frag_t> authfrags;
1766 decode(authfrags, p);
7c673cae 1767 if (is_auth()) {
9f95a23c
TL
1768 // auth. believe replica's auth frags only.
1769 for (auto fg : authfrags) {
1770 if (!dirfragtree.is_leaf(fg)) {
1771 dout(10) << " forcing frag " << fg << " to leaf (split|merge)" << dendl;
1772 dirfragtree.force_to_leaf(g_ceph_context, fg);
1773 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1774 }
1775 }
7c673cae 1776 } else {
9f95a23c
TL
1777 // replica. take the new tree, BUT make sure any open
1778 // dirfrags remain leaves (they may have split _after_ this
1779 // dft was scattered, or we may still be be waiting on the
1780 // notify from the auth)
1781 dirfragtree.swap(temp);
1782 for (const auto &p : dirfrags) {
1783 if (!dirfragtree.is_leaf(p.first)) {
1784 dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl;
1785 dirfragtree.force_to_leaf(g_ceph_context, p.first);
1786 }
1787 if (p.second->is_auth())
1788 p.second->state_clear(CDir::STATE_DIRTYDFT);
1789 }
7c673cae 1790 }
9f95a23c
TL
1791 if (g_conf()->mds_debug_frag)
1792 verify_dirfrags();
1793 }
1794 DECODE_FINISH(p);
f67539c2
TL
1795
1796 if (_inode)
1797 reset_inode(std::move(_inode));
9f95a23c
TL
1798}
1799
1800void CInode::encode_lock_ifile(bufferlist& bl)
1801{
1802 ENCODE_START(1, 1, bl);
1803 if (is_auth()) {
f67539c2
TL
1804 encode(get_inode()->version, bl);
1805 encode(get_inode()->ctime, bl);
1806 encode(get_inode()->mtime, bl);
1807 encode(get_inode()->atime, bl);
1808 encode(get_inode()->time_warp_seq, bl);
9f95a23c 1809 if (!is_dir()) {
f67539c2
TL
1810 encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features());
1811 encode(get_inode()->size, bl);
1812 encode(get_inode()->truncate_seq, bl);
1813 encode(get_inode()->truncate_size, bl);
1814 encode(get_inode()->client_ranges, bl);
1815 encode(get_inode()->inline_data, bl);
9f95a23c
TL
1816 }
1817 } else {
1818 // treat flushing as dirty when rejoining cache
1819 bool dirty = filelock.is_dirty_or_flushing();
1820 encode(dirty, bl);
1821 }
f67539c2
TL
1822 dout(15) << __func__ << " inode.dirstat is " << get_inode()->dirstat << dendl;
1823 encode(get_inode()->dirstat, bl); // only meaningful if i am auth.
9f95a23c
TL
1824 bufferlist tmp;
1825 __u32 n = 0;
1826 for (const auto &p : dirfrags) {
1827 frag_t fg = p.first;
1828 CDir *dir = p.second;
1829 if (is_auth() || dir->is_auth()) {
f67539c2 1830 const auto& pf = dir->get_projected_fnode();
9f95a23c
TL
1831 dout(15) << fg << " " << *dir << dendl;
1832 dout(20) << fg << " fragstat " << pf->fragstat << dendl;
1833 dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
1834 encode(fg, tmp);
1835 encode(dir->first, tmp);
1836 encode(pf->fragstat, tmp);
1837 encode(pf->accounted_fragstat, tmp);
1838 n++;
7c673cae 1839 }
9f95a23c
TL
1840 }
1841 encode(n, bl);
1842 bl.claim_append(tmp);
1843 ENCODE_FINISH(bl);
1844}
1845
1846void CInode::decode_lock_ifile(bufferlist::const_iterator& p)
1847{
f67539c2
TL
1848 inode_ptr _inode;
1849
9f95a23c
TL
1850 DECODE_START(1, p);
1851 if (!is_auth()) {
f67539c2
TL
1852 _inode = allocate_inode(*get_inode());
1853
1854 decode(_inode->version, p);
9f95a23c
TL
1855 utime_t tm;
1856 decode(tm, p);
f67539c2
TL
1857 if (_inode->ctime < tm) _inode->ctime = tm;
1858 decode(_inode->mtime, p);
1859 decode(_inode->atime, p);
1860 decode(_inode->time_warp_seq, p);
9f95a23c 1861 if (!is_dir()) {
f67539c2
TL
1862 decode(_inode->layout, p);
1863 decode(_inode->size, p);
1864 decode(_inode->truncate_seq, p);
1865 decode(_inode->truncate_size, p);
1866 decode(_inode->client_ranges, p);
1867 decode(_inode->inline_data, p);
9f95a23c
TL
1868 }
1869 } else {
1870 bool replica_dirty;
1871 decode(replica_dirty, p);
1872 if (replica_dirty) {
1873 dout(10) << __func__ << " setting filelock dirty flag" << dendl;
1874 filelock.mark_dirty(); // ok bc we're auth and caller will handle
1875 }
1876 }
1877
1878 frag_info_t dirstat;
1879 decode(dirstat, p);
1880 if (!is_auth()) {
1881 dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl;
f67539c2 1882 _inode->dirstat = dirstat; // take inode summation if replica
9f95a23c
TL
1883 }
1884 __u32 n;
1885 decode(n, p);
1886 dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
1887 while (n--) {
1888 frag_t fg;
1889 snapid_t fgfirst;
1890 frag_info_t fragstat;
1891 frag_info_t accounted_fragstat;
1892 decode(fg, p);
1893 decode(fgfirst, p);
1894 decode(fragstat, p);
1895 decode(accounted_fragstat, p);
1896 dout(10) << fg << " [" << fgfirst << ",head] " << dendl;
1897 dout(10) << fg << " fragstat " << fragstat << dendl;
1898 dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl;
1899
1900 CDir *dir = get_dirfrag(fg);
7c673cae 1901 if (is_auth()) {
9f95a23c
TL
1902 ceph_assert(dir); // i am auth; i had better have this dir open
1903 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1904 << " on " << *dir << dendl;
1905 dir->first = fgfirst;
f67539c2
TL
1906 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
1907 _fnode->fragstat = fragstat;
1908 _fnode->accounted_fragstat = accounted_fragstat;
1909 dir->reset_fnode(std::move(_fnode));
9f95a23c
TL
1910 if (!(fragstat == accounted_fragstat)) {
1911 dout(10) << fg << " setting filelock updated flag" << dendl;
1912 filelock.mark_dirty(); // ok bc we're auth and caller will handle
7c673cae
FG
1913 }
1914 } else {
9f95a23c
TL
1915 if (dir && dir->is_auth()) {
1916 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1917 << " on " << *dir << dendl;
1918 dir->first = fgfirst;
f67539c2 1919 const auto& pf = dir->get_projected_fnode();
9f95a23c 1920 finish_scatter_update(&filelock, dir,
f67539c2 1921 _inode->dirstat.version, pf->accounted_fragstat.version);
9f95a23c 1922 }
7c673cae 1923 }
9f95a23c
TL
1924 }
1925 DECODE_FINISH(p);
f67539c2
TL
1926
1927 if (_inode)
1928 reset_inode(std::move(_inode));
9f95a23c 1929}
7c673cae 1930
9f95a23c
TL
1931void CInode::encode_lock_inest(bufferlist& bl)
1932{
1933 ENCODE_START(1, 1, bl);
1934 if (is_auth()) {
f67539c2 1935 encode(get_inode()->version, bl);
9f95a23c
TL
1936 } else {
1937 // treat flushing as dirty when rejoining cache
1938 bool dirty = nestlock.is_dirty_or_flushing();
1939 encode(dirty, bl);
1940 }
f67539c2
TL
1941 dout(15) << __func__ << " inode.rstat is " << get_inode()->rstat << dendl;
1942 encode(get_inode()->rstat, bl); // only meaningful if i am auth.
9f95a23c
TL
1943 bufferlist tmp;
1944 __u32 n = 0;
1945 for (const auto &p : dirfrags) {
1946 frag_t fg = p.first;
1947 CDir *dir = p.second;
1948 if (is_auth() || dir->is_auth()) {
f67539c2 1949 const auto& pf = dir->get_projected_fnode();
9f95a23c
TL
1950 dout(10) << __func__ << " " << fg << " dir " << *dir << dendl;
1951 dout(10) << __func__ << " " << fg << " rstat " << pf->rstat << dendl;
1952 dout(10) << __func__ << " " << fg << " accounted_rstat " << pf->rstat << dendl;
1953 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
1954 encode(fg, tmp);
1955 encode(dir->first, tmp);
1956 encode(pf->rstat, tmp);
1957 encode(pf->accounted_rstat, tmp);
1958 encode(dir->dirty_old_rstat, tmp);
1959 n++;
7c673cae 1960 }
9f95a23c
TL
1961 }
1962 encode(n, bl);
1963 bl.claim_append(tmp);
1964 ENCODE_FINISH(bl);
1965}
7c673cae 1966
9f95a23c
TL
1967void CInode::decode_lock_inest(bufferlist::const_iterator& p)
1968{
f67539c2
TL
1969 inode_ptr _inode;
1970
9f95a23c
TL
1971 DECODE_START(1, p);
1972 if (is_auth()) {
1973 bool replica_dirty;
1974 decode(replica_dirty, p);
1975 if (replica_dirty) {
1976 dout(10) << __func__ << " setting nestlock dirty flag" << dendl;
1977 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
1978 }
1979 } else {
f67539c2
TL
1980 _inode = allocate_inode(*get_inode());
1981 decode(_inode->version, p);
9f95a23c
TL
1982 }
1983 nest_info_t rstat;
1984 decode(rstat, p);
1985 if (!is_auth()) {
1986 dout(10) << __func__ << " taking inode rstat " << rstat << " for " << *this << dendl;
f67539c2 1987 _inode->rstat = rstat; // take inode summation if replica
9f95a23c
TL
1988 }
1989 __u32 n;
1990 decode(n, p);
1991 while (n--) {
1992 frag_t fg;
1993 snapid_t fgfirst;
1994 nest_info_t rstat;
1995 nest_info_t accounted_rstat;
1996 decltype(CDir::dirty_old_rstat) dirty_old_rstat;
1997 decode(fg, p);
1998 decode(fgfirst, p);
1999 decode(rstat, p);
2000 decode(accounted_rstat, p);
2001 decode(dirty_old_rstat, p);
2002 dout(10) << __func__ << " " << fg << " [" << fgfirst << ",head]" << dendl;
2003 dout(10) << __func__ << " " << fg << " rstat " << rstat << dendl;
2004 dout(10) << __func__ << " " << fg << " accounted_rstat " << accounted_rstat << dendl;
2005 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dirty_old_rstat << dendl;
2006 CDir *dir = get_dirfrag(fg);
7c673cae 2007 if (is_auth()) {
9f95a23c
TL
2008 ceph_assert(dir); // i am auth; i had better have this dir open
2009 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
2010 << " on " << *dir << dendl;
2011 dir->first = fgfirst;
f67539c2
TL
2012 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
2013 _fnode->rstat = rstat;
2014 _fnode->accounted_rstat = accounted_rstat;
2015 dir->reset_fnode(std::move(_fnode));
9f95a23c
TL
2016 dir->dirty_old_rstat.swap(dirty_old_rstat);
2017 if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) {
2018 dout(10) << fg << " setting nestlock updated flag" << dendl;
2019 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
2020 }
7c673cae 2021 } else {
9f95a23c
TL
2022 if (dir && dir->is_auth()) {
2023 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
2024 << " on " << *dir << dendl;
2025 dir->first = fgfirst;
f67539c2 2026 const auto& pf = dir->get_projected_fnode();
9f95a23c 2027 finish_scatter_update(&nestlock, dir,
f67539c2 2028 _inode->rstat.version, pf->accounted_rstat.version);
7c673cae 2029 }
7c673cae 2030 }
9f95a23c
TL
2031 }
2032 DECODE_FINISH(p);
f67539c2
TL
2033
2034 if (_inode)
2035 reset_inode(std::move(_inode));
9f95a23c
TL
2036}
2037
2038void CInode::encode_lock_ixattr(bufferlist& bl)
2039{
2040 ENCODE_START(1, 1, bl);
f67539c2
TL
2041 encode(get_inode()->version, bl);
2042 encode(get_inode()->ctime, bl);
2043 encode_xattrs(bl);
9f95a23c
TL
2044 ENCODE_FINISH(bl);
2045}
2046
2047void CInode::decode_lock_ixattr(bufferlist::const_iterator& p)
2048{
f67539c2
TL
2049 ceph_assert(!is_auth());
2050 auto _inode = allocate_inode(*get_inode());
9f95a23c 2051 DECODE_START(1, p);
f67539c2 2052 decode(_inode->version, p);
9f95a23c
TL
2053 utime_t tm;
2054 decode(tm, p);
f67539c2
TL
2055 if (_inode->ctime < tm)
2056 _inode->ctime = tm;
2057 decode_xattrs(p);
9f95a23c 2058 DECODE_FINISH(p);
f67539c2 2059 reset_inode(std::move(_inode));
9f95a23c
TL
2060}
2061
2062void CInode::encode_lock_isnap(bufferlist& bl)
2063{
2064 ENCODE_START(1, 1, bl);
f67539c2
TL
2065 encode(get_inode()->version, bl);
2066 encode(get_inode()->ctime, bl);
9f95a23c
TL
2067 encode_snap(bl);
2068 ENCODE_FINISH(bl);
2069}
2070
2071void CInode::decode_lock_isnap(bufferlist::const_iterator& p)
2072{
f67539c2
TL
2073 ceph_assert(!is_auth());
2074 auto _inode = allocate_inode(*get_inode());
9f95a23c 2075 DECODE_START(1, p);
f67539c2 2076 decode(_inode->version, p);
9f95a23c
TL
2077 utime_t tm;
2078 decode(tm, p);
f67539c2 2079 if (_inode->ctime < tm) _inode->ctime = tm;
9f95a23c
TL
2080 decode_snap(p);
2081 DECODE_FINISH(p);
f67539c2 2082 reset_inode(std::move(_inode));
9f95a23c
TL
2083}
2084
2085void CInode::encode_lock_iflock(bufferlist& bl)
2086{
2087 ENCODE_START(1, 1, bl);
f67539c2 2088 encode(get_inode()->version, bl);
9f95a23c
TL
2089 _encode_file_locks(bl);
2090 ENCODE_FINISH(bl);
2091}
2092
2093void CInode::decode_lock_iflock(bufferlist::const_iterator& p)
2094{
f67539c2
TL
2095 ceph_assert(!is_auth());
2096 auto _inode = allocate_inode(*get_inode());
9f95a23c 2097 DECODE_START(1, p);
f67539c2 2098 decode(_inode->version, p);
9f95a23c
TL
2099 _decode_file_locks(p);
2100 DECODE_FINISH(p);
f67539c2 2101 reset_inode(std::move(_inode));
9f95a23c
TL
2102}
2103
2104void CInode::encode_lock_ipolicy(bufferlist& bl)
2105{
f6b5b4d7 2106 ENCODE_START(2, 1, bl);
f67539c2
TL
2107 if (is_dir()) {
2108 encode(get_inode()->version, bl);
2109 encode(get_inode()->ctime, bl);
2110 encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features());
2111 encode(get_inode()->quota, bl);
2112 encode(get_inode()->export_pin, bl);
2113 encode(get_inode()->export_ephemeral_distributed_pin, bl);
2114 encode(get_inode()->export_ephemeral_random_pin, bl);
9f95a23c
TL
2115 }
2116 ENCODE_FINISH(bl);
2117}
2118
2119void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p)
2120{
f67539c2
TL
2121 ceph_assert(!is_auth());
2122 auto _inode = allocate_inode(*get_inode());
2123 DECODE_START(1, p);
2124 if (is_dir()) {
2125 decode(_inode->version, p);
9f95a23c
TL
2126 utime_t tm;
2127 decode(tm, p);
f67539c2
TL
2128 if (_inode->ctime < tm)
2129 _inode->ctime = tm;
2130 decode(_inode->layout, p);
2131 decode(_inode->quota, p);
2132 decode(_inode->export_pin, p);
f6b5b4d7 2133 if (struct_v >= 2) {
f67539c2
TL
2134 decode(_inode->export_ephemeral_distributed_pin, p);
2135 decode(_inode->export_ephemeral_random_pin, p);
f6b5b4d7 2136 }
9f95a23c
TL
2137 }
2138 DECODE_FINISH(p);
f67539c2
TL
2139
2140 bool pin_updated = (get_inode()->export_pin != _inode->export_pin) ||
2141 (get_inode()->export_ephemeral_distributed_pin !=
2142 _inode->export_ephemeral_distributed_pin);
2143 reset_inode(std::move(_inode));
2144 maybe_export_pin(pin_updated);
9f95a23c
TL
2145}
2146
2147void CInode::encode_lock_state(int type, bufferlist& bl)
2148{
2149 ENCODE_START(1, 1, bl);
2150 encode(first, bl);
2151 if (!is_base())
2152 encode(parent->first, bl);
2153
2154 switch (type) {
2155 case CEPH_LOCK_IAUTH:
2156 encode_lock_iauth(bl);
2157 break;
2158
2159 case CEPH_LOCK_ILINK:
2160 encode_lock_ilink(bl);
2161 break;
2162
2163 case CEPH_LOCK_IDFT:
2164 encode_lock_idft(bl);
2165 break;
2166
2167 case CEPH_LOCK_IFILE:
2168 encode_lock_ifile(bl);
2169 break;
2170
2171 case CEPH_LOCK_INEST:
2172 encode_lock_inest(bl);
7c673cae
FG
2173 break;
2174
2175 case CEPH_LOCK_IXATTR:
9f95a23c 2176 encode_lock_ixattr(bl);
7c673cae
FG
2177 break;
2178
2179 case CEPH_LOCK_ISNAP:
9f95a23c 2180 encode_lock_isnap(bl);
7c673cae
FG
2181 break;
2182
2183 case CEPH_LOCK_IFLOCK:
9f95a23c 2184 encode_lock_iflock(bl);
7c673cae
FG
2185 break;
2186
2187 case CEPH_LOCK_IPOLICY:
9f95a23c 2188 encode_lock_ipolicy(bl);
7c673cae
FG
2189 break;
2190
2191 default:
2192 ceph_abort();
2193 }
9f95a23c 2194 ENCODE_FINISH(bl);
7c673cae
FG
2195}
2196
7c673cae
FG
2197/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2198
11fdf7f2 2199void CInode::decode_lock_state(int type, const bufferlist& bl)
7c673cae 2200{
11fdf7f2 2201 auto p = bl.cbegin();
9f95a23c
TL
2202
2203 DECODE_START(1, p);
7c673cae
FG
2204 utime_t tm;
2205
2206 snapid_t newfirst;
11fdf7f2
TL
2207 using ceph::decode;
2208 decode(newfirst, p);
7c673cae 2209 if (!is_auth() && newfirst != first) {
11fdf7f2
TL
2210 dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl;
2211 first = newfirst;
2212 }
2213 if (!is_base()) {
2214 decode(newfirst, p);
2215 if (!parent->is_auth() && newfirst != parent->first) {
2216 dout(10) << __func__ << " parent first " << first << " -> " << newfirst << dendl;
7c673cae
FG
2217 parent->first = newfirst;
2218 }
7c673cae
FG
2219 }
2220
2221 switch (type) {
2222 case CEPH_LOCK_IAUTH:
9f95a23c 2223 decode_lock_iauth(p);
7c673cae
FG
2224 break;
2225
2226 case CEPH_LOCK_ILINK:
9f95a23c 2227 decode_lock_ilink(p);
7c673cae
FG
2228 break;
2229
2230 case CEPH_LOCK_IDFT:
9f95a23c 2231 decode_lock_idft(p);
7c673cae
FG
2232 break;
2233
2234 case CEPH_LOCK_IFILE:
9f95a23c 2235 decode_lock_ifile(p);
7c673cae
FG
2236 break;
2237
2238 case CEPH_LOCK_INEST:
9f95a23c 2239 decode_lock_inest(p);
7c673cae
FG
2240 break;
2241
2242 case CEPH_LOCK_IXATTR:
9f95a23c 2243 decode_lock_ixattr(p);
7c673cae
FG
2244 break;
2245
2246 case CEPH_LOCK_ISNAP:
9f95a23c 2247 decode_lock_isnap(p);
7c673cae
FG
2248 break;
2249
2250 case CEPH_LOCK_IFLOCK:
9f95a23c 2251 decode_lock_iflock(p);
7c673cae
FG
2252 break;
2253
2254 case CEPH_LOCK_IPOLICY:
9f95a23c 2255 decode_lock_ipolicy(p);
7c673cae
FG
2256 break;
2257
2258 default:
2259 ceph_abort();
2260 }
9f95a23c 2261 DECODE_FINISH(p);
7c673cae
FG
2262}
2263
2264
2265bool CInode::is_dirty_scattered()
2266{
2267 return
2268 filelock.is_dirty_or_flushing() ||
2269 nestlock.is_dirty_or_flushing() ||
2270 dirfragtreelock.is_dirty_or_flushing();
2271}
2272
// Drop the dirty state from all three scatterlocks.
void CInode::clear_scatter_dirty()
{
  for (ScatterLock *lock : {&filelock, &nestlock, &dirfragtreelock}) {
    lock->remove_dirty();
  }
}
2279
2280void CInode::clear_dirty_scattered(int type)
2281{
11fdf7f2
TL
2282 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2283 ceph_assert(is_dir());
7c673cae
FG
2284 switch (type) {
2285 case CEPH_LOCK_IFILE:
2286 item_dirty_dirfrag_dir.remove_myself();
2287 break;
2288
2289 case CEPH_LOCK_INEST:
2290 item_dirty_dirfrag_nest.remove_myself();
2291 break;
2292
2293 case CEPH_LOCK_IDFT:
2294 item_dirty_dirfrag_dirfragtree.remove_myself();
2295 break;
2296
2297 default:
2298 ceph_abort();
2299 }
2300}
2301
2302
2303/*
2304 * when we initially scatter a lock, we need to check if any of the dirfrags
2305 * have out of date accounted_rstat/fragstat. if so, mark the lock stale.
2306 */
2307/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2308void CInode::start_scatter(ScatterLock *lock)
2309{
11fdf7f2
TL
2310 dout(10) << __func__ << " " << *lock << " on " << *this << dendl;
2311 ceph_assert(is_auth());
f67539c2 2312 const auto& pi = get_projected_inode();
7c673cae 2313
94b18763
FG
2314 for (const auto &p : dirfrags) {
2315 frag_t fg = p.first;
2316 CDir *dir = p.second;
f67539c2 2317 const auto& pf = dir->get_projected_fnode();
7c673cae
FG
2318 dout(20) << fg << " " << *dir << dendl;
2319
2320 if (!dir->is_auth())
2321 continue;
2322
2323 switch (lock->get_type()) {
2324 case CEPH_LOCK_IFILE:
2325 finish_scatter_update(lock, dir, pi->dirstat.version, pf->accounted_fragstat.version);
2326 break;
2327
2328 case CEPH_LOCK_INEST:
2329 finish_scatter_update(lock, dir, pi->rstat.version, pf->accounted_rstat.version);
2330 break;
2331
2332 case CEPH_LOCK_IDFT:
2333 dir->state_clear(CDir::STATE_DIRTYDFT);
2334 break;
2335 }
2336 }
2337}
2338
2339
2340class C_Inode_FragUpdate : public MDSLogContextBase {
2341protected:
2342 CInode *in;
2343 CDir *dir;
2344 MutationRef mut;
2345 MDSRank *get_mds() override {return in->mdcache->mds;}
2346 void finish(int r) override {
2347 in->_finish_frag_update(dir, mut);
2348 }
2349
2350public:
2351 C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {}
2352};
2353
// If the dirfrag's accounted stat version differs from the inode's, journal
// an update that sets the frag's stat version to inode_version and marks it
// accounted.  Nothing is done when the dirfrag is frozen or not loaded
// (version 0).  The dirfrag must be auth.
void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
                                   version_t inode_version, version_t dir_accounted_version)
{
  frag_t fg = dir->get_frag();
  ceph_assert(dir->is_auth());

  if (dir->is_frozen()) {
    dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl;
    return;
  }
  if (dir->get_version() == 0) {
    dout(10) << __func__ << " " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl;
    return;
  }
  if (dir_accounted_version == inode_version) {
    dout(10) << __func__ << " " << fg << " accounted " << *lock
             << " scatter stat unchanged at v" << dir_accounted_version << dendl;
    return;
  }

  dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;

  MDLog *mdlog = mdcache->mds->mdlog;
  MutationRef mut(new MutationImpl());
  mut->ls = mdlog->get_current_segment();

  auto pf = dir->project_fnode(mut);

  std::string_view ename;
  switch (lock->get_type()) {
  case CEPH_LOCK_IFILE:
    pf->fragstat.version = inode_version;
    pf->accounted_fragstat = pf->fragstat;
    ename = "lock ifile accounted scatter stat update";
    break;

  case CEPH_LOCK_INEST:
    pf->rstat.version = inode_version;
    pf->accounted_rstat = pf->rstat;
    ename = "lock inest accounted scatter stat update";

    if (!is_auth() && lock->get_state() == LOCK_MIX) {
      dout(10) << __func__ << " try to assimilate dirty rstat on "
               << *dir << dendl;
      dir->assimilate_dirty_rstat_inodes(mut);
    }
    break;

  default:
    ceph_abort();
  }

  EUpdate *le = new EUpdate(mdlog, ename);
  mdlog->start_entry(le);
  le->metablob.add_dir_context(dir);
  le->metablob.add_dir(dir, true);

  ceph_assert(!dir->is_frozen());
  mut->auth_pin(dir);

  if (lock->get_type() == CEPH_LOCK_INEST &&
      !is_auth() && lock->get_state() == LOCK_MIX) {
    dout(10) << __func__ << " finish assimilating dirty rstat on "
             << *dir << dendl;
    dir->assimilate_dirty_rstat_inodes_finish(&le->metablob);

    // assimilation left unaccounted rstat behind: flag the nestlock as updated
    if (!(pf->rstat == pf->accounted_rstat)) {
      if (!mut->is_wrlocked(&nestlock)) {
        mdcache->mds->locker->wrlock_force(&nestlock, mut);
      }

      mdcache->mds->locker->mark_updated_scatterlock(&nestlock);
      mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest);
    }
  }

  pf->version = dir->pre_dirty();

  mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut));
}
2430
2431void CInode::_finish_frag_update(CDir *dir, MutationRef& mut)
2432{
11fdf7f2 2433 dout(10) << __func__ << " on " << *dir << dendl;
7c673cae 2434 mut->apply();
c07f9fc5 2435 mdcache->mds->locker->drop_locks(mut.get());
7c673cae
FG
2436 mut->cleanup();
2437}
2438
2439
2440/*
2441 * when we gather a lock, we need to assimilate dirfrag changes into the inode
2442 * state. it's possible we can't update the dirfrag accounted_rstat/fragstat
2443 * because the frag is auth and frozen, or that the replica couldn't for the same
2444 * reason. hopefully it will get updated the next time the lock cycles.
2445 *
2446 * we have two dimensions of behavior:
2447 * - we may be (auth and !frozen), and able to update, or not.
2448 * - the frag may be stale, or not.
2449 *
2450 * if the frag is non-stale, we want to assimilate the diff into the
2451 * inode, regardless of whether it's auth or updateable.
2452 *
2453 * if we update the frag, we want to set accounted_fragstat = frag,
2454 * both if we took the diff or it was stale and we are making it
2455 * un-stale.
2456 */
2457/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
// Fold per-dirfrag stats into the projected inode after a scatterlock gather.
//  - IFILE: fragstat deltas go into pi->dirstat
//  - INEST: rstat deltas are projected onto the inode via the MDCache
//  - IDFT:  nothing to do
// Dirfrags that are loaded, auth and unfrozen also get a projected fnode with
// their accounted stats brought up to the new inode stat version.
// Must be called on the auth inode; aborts on any other lock type.
void CInode::finish_scatter_gather_update(int type, MutationRef& mut)
{
  LogChannelRef clog = mdcache->mds->clog;

  dout(10) << __func__ << " " << type << " on " << *this << dendl;
  ceph_assert(is_auth());

  switch (type) {
  case CEPH_LOCK_IFILE:
    {
      fragtree_t tmpdft = dirfragtree;
      frag_info_t frag_sum;           // sum of all dirfrag fragstats
      bool dirstat_valid = true;

      // adjust summation
      auto pi = _get_projected_inode();

      bool touched_mtime = false, touched_chattr = false;
      dout(20) << "  orig dirstat " << pi->dirstat << dendl;
      pi->dirstat.version++;
      for (const auto &entry : dirfrags) {
        frag_t fg = entry.first;
        CDir *dir = entry.second;
        dout(20) << fg << " " << *dir << dendl;

        // an unloaded dirfrag (version 0) can neither be updated nor summed
        const bool loaded = dir->get_version() != 0;
        if (!loaded)
          dirstat_valid = false;
        const bool update = loaded && dir->is_auth() && !dir->is_frozen();

        CDir::fnode_const_ptr pf;
        if (update) {
          mut->auth_pin(dir);
          pf = dir->project_fnode(mut);
        } else {
          pf = dir->get_projected_fnode();
        }

        if (pf->accounted_fragstat.version == pi->dirstat.version - 1) {
          dout(20) << fg << "           fragstat " << pf->fragstat << dendl;
          dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
          pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
        } else {
          dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl;
        }

        if (pf->fragstat.nfiles < 0 ||
            pf->fragstat.nsubdirs < 0) {
          clog->error() << "bad/negative dir size on "
                        << dir->dirfrag() << " " << pf->fragstat;
          ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);

          // clamp the negative counters to zero
          auto fixable = const_cast<fnode_t*>(pf.get());
          if (pf->fragstat.nfiles < 0)
            fixable->fragstat.nfiles = 0;
          if (pf->fragstat.nsubdirs < 0)
            fixable->fragstat.nsubdirs = 0;
        }

        if (update) {
          auto writable = const_cast<fnode_t*>(pf.get());
          writable->accounted_fragstat = writable->fragstat;
          writable->fragstat.version = writable->accounted_fragstat.version = pi->dirstat.version;
          writable->version = dir->pre_dirty();
          dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl;
        }

        tmpdft.force_to_leaf(g_ceph_context, fg);
        frag_sum.add(pf->fragstat);
      }
      if (touched_mtime)
        pi->mtime = pi->ctime = pi->dirstat.mtime;
      if (touched_chattr)
        pi->change_attr = pi->dirstat.change_attr;
      dout(20) << " final dirstat " << pi->dirstat << dendl;

      if (dirstat_valid && !frag_sum.same_sums(pi->dirstat)) {
        // the comparison only means something if every leaf frag is open
        frag_vec_t leaves;
        tmpdft.get_leaves_under(frag_t(), leaves);
        for (const auto& leaf : leaves) {
          if (!dirfrags.count(leaf)) {
            dirstat_valid = false;
            break;
          }
        }
        if (dirstat_valid) {
          if (state_test(CInode::STATE_REPAIRSTATS)) {
            dout(20) << " dirstat mismatch, fixing" << dendl;
          } else {
            clog->error() << "unmatched fragstat on " << ino() << ", inode has "
                          << pi->dirstat << ", dirfrags have " << frag_sum;
            ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter);
          }
          // trust the dirfrags for now
          version_t keep_version = pi->dirstat.version;
          if (pi->dirstat.mtime > frag_sum.mtime)
            frag_sum.mtime = pi->dirstat.mtime;
          if (pi->dirstat.change_attr > frag_sum.change_attr)
            frag_sum.change_attr = pi->dirstat.change_attr;
          pi->dirstat = frag_sum;
          pi->dirstat.version = keep_version;
        }
      }

      if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0) {
        std::string path;
        make_path_string(path);
        clog->error() << "Inconsistent statistics detected: fragstat on inode "
                      << ino() << " (" << path << "), inode has " << pi->dirstat;
        ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);

        if (pi->dirstat.nfiles < 0)
          pi->dirstat.nfiles = 0;
        if (pi->dirstat.nsubdirs < 0)
          pi->dirstat.nsubdirs = 0;
      }
    }
    break;

  case CEPH_LOCK_INEST:
    {
      // adjust summation
      fragtree_t tmpdft = dirfragtree;
      nest_info_t frag_sum;           // expected inode rstat built from dirfrags
      bool rstat_valid = true;

      frag_sum.rsubdirs = 1;
      if (const sr_t *srnode = get_projected_srnode(); srnode)
        frag_sum.rsnaps = srnode->snaps.size();

      auto pi = _get_projected_inode();
      dout(20) << "  orig rstat " << pi->rstat << dendl;
      pi->rstat.version++;
      for (const auto &entry : dirfrags) {
        frag_t fg = entry.first;
        CDir *dir = entry.second;
        dout(20) << fg << " " << *dir << dendl;

        // an unloaded dirfrag (version 0) can neither be updated nor summed
        const bool loaded = dir->get_version() != 0;
        if (!loaded)
          rstat_valid = false;
        const bool update = loaded && dir->is_auth() && !dir->is_frozen();

        CDir::fnode_const_ptr pf;
        if (update) {
          mut->auth_pin(dir);
          pf = dir->project_fnode(mut);
        } else {
          pf = dir->get_projected_fnode();
        }

        if (pf->accounted_rstat.version == pi->rstat.version-1) {
          // only pull this frag's dirty rstat inodes into the frag if
          // the frag is non-stale and updateable.  if it's stale,
          // that info will just get thrown out!
          if (update)
            dir->assimilate_dirty_rstat_inodes(mut);

          dout(20) << fg << "           rstat " << pf->rstat << dendl;
          dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
          dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
          mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
                                               dir->first, CEPH_NOSNAP, this, true);
          for (auto &old : dir->dirty_old_rstat) {
            mdcache->project_rstat_frag_to_inode(old.second.rstat, old.second.accounted_rstat,
                                                 old.second.first, old.first, this, true);
          }
          if (update)  // dir contents not valid if frozen or non-auth
            dir->check_rstats();
        } else {
          dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl;
        }
        if (update) {
          auto writable = const_cast<fnode_t*>(pf.get());
          writable->accounted_rstat = pf->rstat;
          writable->rstat.version = writable->accounted_rstat.version = pi->rstat.version;
          writable->version = dir->pre_dirty();
          dir->dirty_old_rstat.clear();
          dir->check_rstats();
          dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl;
        }

        tmpdft.force_to_leaf(g_ceph_context, fg);
        frag_sum.add(pf->rstat);
      }
      dout(20) << " final rstat " << pi->rstat << dendl;

      if (rstat_valid && !frag_sum.same_sums(pi->rstat)) {
        // the comparison only means something if every leaf frag is open
        frag_vec_t leaves;
        tmpdft.get_leaves_under(frag_t(), leaves);
        for (const auto& leaf : leaves) {
          if (!dirfrags.count(leaf)) {
            rstat_valid = false;
            break;
          }
        }
        if (rstat_valid) {
          if (state_test(CInode::STATE_REPAIRSTATS)) {
            dout(20) << " rstat mismatch, fixing" << dendl;
          } else {
            clog->error() << "inconsistent rstat on inode " << ino()
                          << ", inode has " << pi->rstat
                          << ", directory fragments have " << frag_sum;
            ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter);
          }
          // trust the dirfrag for now
          version_t keep_version = pi->rstat.version;
          if (pi->rstat.rctime > frag_sum.rctime)
            frag_sum.rctime = pi->rstat.rctime;
          pi->rstat = frag_sum;
          pi->rstat.version = keep_version;
        }
      }

      mdcache->broadcast_quota_to_client(this);
    }
    break;

  case CEPH_LOCK_IDFT:
    break;

  default:
    ceph_abort();
  }
}
2692
f67539c2 2693void CInode::finish_scatter_gather_update_accounted(int type, EMetaBlob *metablob)
7c673cae 2694{
11fdf7f2
TL
2695 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2696 ceph_assert(is_auth());
7c673cae 2697
94b18763
FG
2698 for (const auto &p : dirfrags) {
2699 CDir *dir = p.second;
7c673cae
FG
2700 if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen())
2701 continue;
2702
2703 if (type == CEPH_LOCK_IDFT)
2704 continue; // nothing to do.
2705
f67539c2
TL
2706 if (type == CEPH_LOCK_INEST)
2707 dir->assimilate_dirty_rstat_inodes_finish(metablob);
2708
7c673cae 2709 dout(10) << " journaling updated frag accounted_ on " << *dir << dendl;
11fdf7f2 2710 ceph_assert(dir->is_projected());
7c673cae 2711 metablob->add_dir(dir, true);
7c673cae
FG
2712 }
2713}
2714
2715// waiting
2716
2717bool CInode::is_frozen() const
2718{
2719 if (is_frozen_inode()) return true;
2720 if (parent && parent->dir->is_frozen()) return true;
2721 return false;
2722}
2723
2724bool CInode::is_frozen_dir() const
2725{
2726 if (parent && parent->dir->is_frozen_dir()) return true;
2727 return false;
2728}
2729
2730bool CInode::is_freezing() const
2731{
2732 if (is_freezing_inode()) return true;
2733 if (parent && parent->dir->is_freezing()) return true;
2734 return false;
2735}
2736
11fdf7f2 2737void CInode::add_dir_waiter(frag_t fg, MDSContext *c)
7c673cae
FG
2738{
2739 if (waiting_on_dir.empty())
2740 get(PIN_DIRWAITER);
2741 waiting_on_dir[fg].push_back(c);
11fdf7f2 2742 dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl;
7c673cae
FG
2743}
2744
11fdf7f2 2745void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls)
7c673cae
FG
2746{
2747 if (waiting_on_dir.empty())
2748 return;
2749
94b18763
FG
2750 auto it = waiting_on_dir.find(fg);
2751 if (it != waiting_on_dir.end()) {
2752 dout(10) << __func__ << " frag " << fg << " on " << *this << dendl;
11fdf7f2
TL
2753 auto& waiting = it->second;
2754 ls.insert(ls.end(), waiting.begin(), waiting.end());
94b18763 2755 waiting_on_dir.erase(it);
7c673cae
FG
2756
2757 if (waiting_on_dir.empty())
2758 put(PIN_DIRWAITER);
2759 }
2760}
2761
// Register a waiter.  If the condition being waited for is not caused by this
// inode itself (it is not ambiguous-auth / not freezing or frozen), the
// waiter is passed to the parent dentry's dir instead.
void CInode::add_waiter(uint64_t tag, MDSContext *c)
{
  dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c
           << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH)
           << " !frozen " << !is_frozen_inode()
           << " !freezing " << !is_freezing_inode()
           << dendl;

  // wait on the directory?
  // make sure it's not the inode that is explicitly ambiguous|freezing|frozen
  const bool singleauth_elsewhere =
    (tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH);
  const bool unfreeze_elsewhere =
    (tag & WAIT_UNFREEZE) &&
    !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin();

  if (singleauth_elsewhere || unfreeze_elsewhere) {
    dout(15) << "passing waiter up tree" << dendl;
    parent->dir->add_waiter(tag, c);
    return;
  }

  dout(15) << "taking waiter here" << dendl;
  MDSCacheObject::add_waiter(tag, c);
}
2781
// Collect waiters matching 'mask' into 'ls'.  With WAIT_DIR set, all
// per-dirfrag waiters are drained as well and PIN_DIRWAITER is released.
void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls)
{
  const bool drain_dir_waiters = (mask & WAIT_DIR) && !waiting_on_dir.empty();
  if (drain_dir_waiters) {
    // take all dentry waiters
    while (!waiting_on_dir.empty()) {
      auto head = waiting_on_dir.begin();
      dout(10) << __func__ << " dirfrag " << head->first << " on " << *this << dendl;
      auto& waiters = head->second;
      ls.insert(ls.end(), waiters.begin(), waiters.end());
      waiting_on_dir.erase(head);
    }
    put(PIN_DIRWAITER);
  }

  // waiting
  MDSCacheObject::take_waiting(mask, ls);
}
2799
9f95a23c
TL
2800void CInode::maybe_finish_freeze_inode()
2801{
2802 CDir *dir = get_parent_dir();
2803 if (auth_pins > auth_pin_freeze_allowance || dir->frozen_inode_suppressed)
2804 return;
2805
2806 dout(10) << "maybe_finish_freeze_inode - frozen" << dendl;
2807 ceph_assert(auth_pins == auth_pin_freeze_allowance);
2808 get(PIN_FROZEN);
2809 put(PIN_FREEZING);
2810 state_clear(STATE_FREEZING);
2811 state_set(STATE_FROZEN);
2812
2813 item_freezing_inode.remove_myself();
2814 dir->num_frozen_inodes++;
2815
2816 finish_waiting(WAIT_FROZEN);
2817}
2818
7c673cae
FG
2819bool CInode::freeze_inode(int auth_pin_allowance)
2820{
9f95a23c
TL
2821 CDir *dir = get_parent_dir();
2822 ceph_assert(dir);
2823
11fdf7f2
TL
2824 ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins
2825 ceph_assert(auth_pins >= auth_pin_allowance);
9f95a23c
TL
2826 if (auth_pins == auth_pin_allowance && !dir->frozen_inode_suppressed) {
2827 dout(10) << "freeze_inode - frozen" << dendl;
2828 if (!state_test(STATE_FROZEN)) {
2829 get(PIN_FROZEN);
2830 state_set(STATE_FROZEN);
2831 dir->num_frozen_inodes++;
2832 }
2833 return true;
7c673cae
FG
2834 }
2835
9f95a23c
TL
2836 dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl;
2837 auth_pin_freeze_allowance = auth_pin_allowance;
2838 dir->freezing_inodes.push_back(&item_freezing_inode);
2839
2840 get(PIN_FREEZING);
2841 state_set(STATE_FREEZING);
2842
2843 if (!dir->lock_caches_with_auth_pins.empty())
2844 mdcache->mds->locker->invalidate_lock_caches(dir);
2845
2846 const static int lock_types[] = {
2847 CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT,
2848 CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0
2849 };
2850 for (int i = 0; lock_types[i]; ++i) {
2851 auto lock = get_lock(lock_types[i]);
2852 if (lock->is_cached())
2853 mdcache->mds->locker->invalidate_lock_caches(lock);
7c673cae 2854 }
9f95a23c
TL
2855 // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
2856 // and finish freezing the inode
2857 return state_test(STATE_FROZEN);
7c673cae
FG
2858}
2859
11fdf7f2 2860void CInode::unfreeze_inode(MDSContext::vec& finished)
7c673cae 2861{
11fdf7f2 2862 dout(10) << __func__ << dendl;
7c673cae
FG
2863 if (state_test(STATE_FREEZING)) {
2864 state_clear(STATE_FREEZING);
2865 put(PIN_FREEZING);
9f95a23c 2866 item_freezing_inode.remove_myself();
7c673cae
FG
2867 } else if (state_test(STATE_FROZEN)) {
2868 state_clear(STATE_FROZEN);
2869 put(PIN_FROZEN);
9f95a23c 2870 get_parent_dir()->num_frozen_inodes--;
7c673cae
FG
2871 } else
2872 ceph_abort();
2873 take_waiting(WAIT_UNFREEZE, finished);
2874}
2875
2876void CInode::unfreeze_inode()
2877{
11fdf7f2 2878 MDSContext::vec finished;
7c673cae
FG
2879 unfreeze_inode(finished);
2880 mdcache->mds->queue_waiters(finished);
2881}
2882
2883void CInode::freeze_auth_pin()
2884{
11fdf7f2 2885 ceph_assert(state_test(CInode::STATE_FROZEN));
7c673cae 2886 state_set(CInode::STATE_FROZENAUTHPIN);
9f95a23c 2887 get_parent_dir()->num_frozen_inodes++;
7c673cae
FG
2888}
2889
2890void CInode::unfreeze_auth_pin()
2891{
11fdf7f2 2892 ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN));
7c673cae 2893 state_clear(CInode::STATE_FROZENAUTHPIN);
9f95a23c 2894 get_parent_dir()->num_frozen_inodes--;
7c673cae 2895 if (!state_test(STATE_FREEZING|STATE_FROZEN)) {
11fdf7f2 2896 MDSContext::vec finished;
7c673cae
FG
2897 take_waiting(WAIT_UNFREEZE, finished);
2898 mdcache->mds->queue_waiters(finished);
2899 }
2900}
2901
11fdf7f2 2902void CInode::clear_ambiguous_auth(MDSContext::vec& finished)
7c673cae 2903{
11fdf7f2 2904 ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH));
7c673cae
FG
2905 state_clear(CInode::STATE_AMBIGUOUSAUTH);
2906 take_waiting(CInode::WAIT_SINGLEAUTH, finished);
2907}
2908
2909void CInode::clear_ambiguous_auth()
2910{
11fdf7f2 2911 MDSContext::vec finished;
7c673cae
FG
2912 clear_ambiguous_auth(finished);
2913 mdcache->mds->queue_waiters(finished);
2914}
2915
2916// auth_pins
91327a77
AA
2917bool CInode::can_auth_pin(int *err_ret) const {
2918 int err;
2919 if (!is_auth()) {
2920 err = ERR_NOT_AUTH;
2921 } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
2922 err = ERR_EXPORTING_INODE;
2923 } else {
2924 if (parent)
2925 return parent->can_auth_pin(err_ret);
2926 err = 0;
2927 }
2928 if (err && err_ret)
2929 *err_ret = err;
2930 return !err;
7c673cae
FG
2931}
2932
2933void CInode::auth_pin(void *by)
2934{
2935 if (auth_pins == 0)
2936 get(PIN_AUTHPIN);
2937 auth_pins++;
2938
2939#ifdef MDS_AUTHPIN_SET
2940 auth_pin_set.insert(by);
2941#endif
2942
11fdf7f2 2943 dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;
7c673cae
FG
2944
2945 if (parent)
11fdf7f2 2946 parent->adjust_nested_auth_pins(1, this);
7c673cae
FG
2947}
2948
2949void CInode::auth_unpin(void *by)
2950{
2951 auth_pins--;
2952
2953#ifdef MDS_AUTHPIN_SET
11fdf7f2
TL
2954 {
2955 auto it = auth_pin_set.find(by);
2956 ceph_assert(it != auth_pin_set.end());
2957 auth_pin_set.erase(it);
2958 }
7c673cae
FG
2959#endif
2960
2961 if (auth_pins == 0)
2962 put(PIN_AUTHPIN);
2963
11fdf7f2 2964 dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;
7c673cae 2965
11fdf7f2 2966 ceph_assert(auth_pins >= 0);
7c673cae
FG
2967
2968 if (parent)
11fdf7f2 2969 parent->adjust_nested_auth_pins(-1, by);
7c673cae 2970
9f95a23c
TL
2971 if (is_freezing_inode())
2972 maybe_finish_freeze_inode();
7c673cae
FG
2973}
2974
7c673cae
FG
2975// authority
2976
2977mds_authority_t CInode::authority() const
2978{
2979 if (inode_auth.first >= 0)
2980 return inode_auth;
2981
2982 if (parent)
2983 return parent->dir->authority();
2984
2985 // new items that are not yet linked in (in the committed plane) belong
2986 // to their first parent.
2987 if (!projected_parent.empty())
2988 return projected_parent.front()->dir->authority();
2989
2990 return CDIR_AUTH_UNDEF;
2991}
2992
2993
2994// SNAP
2995
2996snapid_t CInode::get_oldest_snap()
2997{
2998 snapid_t t = first;
f67539c2
TL
2999 if (is_any_old_inodes())
3000 t = get_old_inodes()->begin()->second.first;
11fdf7f2 3001 return std::min(t, oldest_snap);
7c673cae
FG
3002}
3003
f67539c2 3004const CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head)
7c673cae 3005{
11fdf7f2 3006 ceph_assert(follows >= first);
7c673cae 3007
f67539c2
TL
3008 const auto& pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
3009 const auto& px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();
3010
3011 auto _old_inodes = allocate_old_inode_map();
3012 if (old_inodes)
3013 *_old_inodes = *old_inodes;
7c673cae 3014
f67539c2 3015 mempool_old_inode &old = (*_old_inodes)[follows];
7c673cae
FG
3016 old.first = first;
3017 old.inode = *pi;
f67539c2
TL
3018 if (px) {
3019 dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl;
3020 old.xattrs = *px;
3021 }
7c673cae
FG
3022
3023 if (first < oldest_snap)
3024 oldest_snap = first;
7c673cae
FG
3025
3026 old.inode.trim_client_ranges(follows);
3027
11fdf7f2 3028 if (g_conf()->mds_snap_rstat &&
7c673cae
FG
3029 !(old.inode.rstat == old.inode.accounted_rstat))
3030 dirty_old_rstats.insert(follows);
3031
3032 first = follows+1;
3033
11fdf7f2 3034 dout(10) << __func__ << " " << (cow_head ? "head" : "previous_head" )
7c673cae
FG
3035 << " to [" << old.first << "," << follows << "] on "
3036 << *this << dendl;
3037
f67539c2 3038 reset_old_inodes(std::move(_old_inodes));
7c673cae
FG
3039 return old;
3040}
3041
7c673cae
FG
3042void CInode::pre_cow_old_inode()
3043{
11fdf7f2 3044 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
7c673cae
FG
3045 if (first <= follows)
3046 cow_old_inode(follows, true);
3047}
3048
11fdf7f2
TL
3049bool CInode::has_snap_data(snapid_t snapid)
3050{
3051 bool found = snapid >= first && snapid <= last;
f67539c2
TL
3052 if (!found && is_any_old_inodes()) {
3053 auto p = old_inodes->lower_bound(snapid);
3054 if (p != old_inodes->end()) {
11fdf7f2 3055 if (p->second.first > snapid) {
f67539c2 3056 if (p != old_inodes->begin())
11fdf7f2
TL
3057 --p;
3058 }
3059 if (p->second.first <= snapid && snapid <= p->first) {
3060 found = true;
3061 }
3062 }
3063 }
3064 return found;
3065}
3066
7c673cae
FG
3067void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
3068{
11fdf7f2 3069 dout(10) << __func__ << " " << snaps << dendl;
7c673cae 3070
f67539c2
TL
3071 if (!get_old_inodes())
3072 return;
3073
3074 std::vector<snapid_t> to_remove;
3075 for (auto p : *get_old_inodes()) {
3076 const snapid_t &id = p.first;
3077 const auto &s = snaps.lower_bound(p.second.first);
94b18763 3078 if (s == snaps.end() || *s > id) {
f67539c2
TL
3079 dout(10) << " purging old_inode [" << p.second.first << "," << id << "]" << dendl;
3080 to_remove.push_back(id);
94b18763 3081 }
7c673cae 3082 }
f67539c2
TL
3083
3084 if (to_remove.size() == get_old_inodes()->size()) {
3085 reset_old_inodes(old_inode_map_ptr());
3086 } else if (!to_remove.empty()) {
3087 auto _old_inodes = allocate_old_inode_map(*get_old_inodes());
3088 for (auto id : to_remove)
3089 _old_inodes->erase(id);
3090 reset_old_inodes(std::move(_old_inodes));
3091 }
7c673cae
FG
3092}
3093
3094/*
3095 * pick/create an old_inode
3096 */
f67539c2 3097snapid_t CInode::pick_old_inode(snapid_t snap) const
7c673cae 3098{
f67539c2
TL
3099 if (is_any_old_inodes()) {
3100 auto it = old_inodes->lower_bound(snap); // p is first key >= to snap
3101 if (it != old_inodes->end() && it->second.first <= snap) {
3102 dout(10) << __func__ << " snap " << snap << " -> [" << it->second.first << "," << it->first << "]" << dendl;
3103 return it->first;
3104 }
7c673cae 3105 }
11fdf7f2 3106 dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl;
f67539c2 3107 return 0;
7c673cae
FG
3108}
3109
3110void CInode::open_snaprealm(bool nosplit)
3111{
3112 if (!snaprealm) {
3113 SnapRealm *parent = find_snaprealm();
3114 snaprealm = new SnapRealm(mdcache, this);
3115 if (parent) {
11fdf7f2 3116 dout(10) << __func__ << " " << snaprealm
7c673cae
FG
3117 << " parent is " << parent
3118 << dendl;
3119 dout(30) << " siblings are " << parent->open_children << dendl;
3120 snaprealm->parent = parent;
3121 if (!nosplit)
3122 parent->split_at(snaprealm);
3123 parent->open_children.insert(snaprealm);
3124 }
3125 }
3126}
3127void CInode::close_snaprealm(bool nojoin)
3128{
3129 if (snaprealm) {
11fdf7f2 3130 dout(15) << __func__ << " " << *snaprealm << dendl;
7c673cae
FG
3131 if (snaprealm->parent) {
3132 snaprealm->parent->open_children.erase(snaprealm);
3133 //if (!nojoin)
3134 //snaprealm->parent->join(snaprealm);
3135 }
3136 delete snaprealm;
3137 snaprealm = 0;
3138 }
3139}
3140
3141SnapRealm *CInode::find_snaprealm() const
3142{
3143 const CInode *cur = this;
3144 while (!cur->snaprealm) {
11fdf7f2
TL
3145 const CDentry *pdn = cur->get_oldest_parent_dn();
3146 if (!pdn)
7c673cae 3147 break;
11fdf7f2 3148 cur = pdn->get_dir()->get_inode();
7c673cae
FG
3149 }
3150 return cur->snaprealm;
3151}
3152
3153void CInode::encode_snap_blob(bufferlist &snapbl)
3154{
3155 if (snaprealm) {
11fdf7f2
TL
3156 using ceph::encode;
3157 encode(snaprealm->srnode, snapbl);
3158 dout(20) << __func__ << " " << *snaprealm << dendl;
7c673cae
FG
3159 }
3160}
11fdf7f2 3161void CInode::decode_snap_blob(const bufferlist& snapbl)
7c673cae 3162{
11fdf7f2 3163 using ceph::decode;
7c673cae
FG
3164 if (snapbl.length()) {
3165 open_snaprealm();
11fdf7f2
TL
3166 auto old_flags = snaprealm->srnode.flags;
3167 auto p = snapbl.cbegin();
3168 decode(snaprealm->srnode, p);
f67539c2 3169 if (!is_base()) {
11fdf7f2 3170 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
11fdf7f2
TL
3171 snaprealm->adjust_parent();
3172 }
7c673cae 3173 }
11fdf7f2 3174 dout(20) << __func__ << " " << *snaprealm << dendl;
92f5a8d4
TL
3175 } else if (snaprealm &&
3176 !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
11fdf7f2
TL
3177 ceph_assert(mdcache->mds->is_any_replay());
3178 snaprealm->merge_to(NULL);
7c673cae
FG
3179 }
3180}
3181
3182void CInode::encode_snap(bufferlist& bl)
3183{
9f95a23c 3184 ENCODE_START(1, 1, bl);
7c673cae
FG
3185 bufferlist snapbl;
3186 encode_snap_blob(snapbl);
11fdf7f2
TL
3187 encode(snapbl, bl);
3188 encode(oldest_snap, bl);
9f95a23c 3189 ENCODE_FINISH(bl);
11fdf7f2 3190}
7c673cae 3191
11fdf7f2 3192void CInode::decode_snap(bufferlist::const_iterator& p)
7c673cae 3193{
9f95a23c 3194 DECODE_START(1, p);
7c673cae 3195 bufferlist snapbl;
11fdf7f2
TL
3196 decode(snapbl, p);
3197 decode(oldest_snap, p);
7c673cae 3198 decode_snap_blob(snapbl);
9f95a23c 3199 DECODE_FINISH(p);
7c673cae
FG
3200}
3201
3202// =============================================
3203
3204client_t CInode::calc_ideal_loner()
3205{
3206 if (mdcache->is_readonly())
3207 return -1;
11fdf7f2 3208 if (!get_mds_caps_wanted().empty())
7c673cae
FG
3209 return -1;
3210
3211 int n = 0;
3212 client_t loner = -1;
11fdf7f2
TL
3213 for (const auto &p : client_caps) {
3214 if (!p.second.is_stale() &&
9f95a23c
TL
3215 (is_dir() ?
3216 !has_subtree_or_exporting_dirfrag() :
3217 (p.second.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_RD)))) {
7c673cae
FG
3218 if (n)
3219 return -1;
3220 n++;
11fdf7f2 3221 loner = p.first;
7c673cae 3222 }
11fdf7f2 3223 }
7c673cae
FG
3224 return loner;
3225}
3226
b32b8144 3227bool CInode::choose_ideal_loner()
7c673cae
FG
3228{
3229 want_loner_cap = calc_ideal_loner();
b32b8144
FG
3230 int changed = false;
3231 if (loner_cap >= 0 && loner_cap != want_loner_cap) {
3232 if (!try_drop_loner())
3233 return false;
3234 changed = true;
3235 }
3236
3237 if (want_loner_cap >= 0) {
3238 if (loner_cap < 0) {
3239 set_loner_cap(want_loner_cap);
3240 changed = true;
3241 } else
11fdf7f2 3242 ceph_assert(loner_cap == want_loner_cap);
b32b8144
FG
3243 }
3244 return changed;
7c673cae
FG
3245}
3246
3247bool CInode::try_set_loner()
3248{
11fdf7f2 3249 ceph_assert(want_loner_cap >= 0);
7c673cae
FG
3250 if (loner_cap >= 0 && loner_cap != want_loner_cap)
3251 return false;
3252 set_loner_cap(want_loner_cap);
3253 return true;
3254}
3255
3256void CInode::set_loner_cap(client_t l)
3257{
3258 loner_cap = l;
3259 authlock.set_excl_client(loner_cap);
3260 filelock.set_excl_client(loner_cap);
3261 linklock.set_excl_client(loner_cap);
3262 xattrlock.set_excl_client(loner_cap);
3263}
3264
3265bool CInode::try_drop_loner()
3266{
3267 if (loner_cap < 0)
3268 return true;
3269
3270 int other_allowed = get_caps_allowed_by_type(CAP_ANY);
3271 Capability *cap = get_client_cap(loner_cap);
3272 if (!cap ||
3273 (cap->issued() & ~other_allowed) == 0) {
3274 set_loner_cap(-1);
3275 return true;
3276 }
3277 return false;
3278}
3279
3280
3281// choose new lock state during recovery, based on issued caps
3282void CInode::choose_lock_state(SimpleLock *lock, int allissued)
3283{
3284 int shift = lock->get_cap_shift();
3285 int issued = (allissued >> shift) & lock->get_cap_mask();
3286 if (is_auth()) {
3287 if (lock->is_xlocked()) {
3288 // do nothing here
3289 } else if (lock->get_state() != LOCK_MIX) {
3290 if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER))
3291 lock->set_state(LOCK_EXCL);
f6b5b4d7
TL
3292 else if (issued & CEPH_CAP_GWR) {
3293 if (issued & (CEPH_CAP_GCACHE | CEPH_CAP_GSHARED))
3294 lock->set_state(LOCK_EXCL);
3295 else
3296 lock->set_state(LOCK_MIX);
3297 } else if (lock->is_dirty()) {
7c673cae
FG
3298 if (is_replicated())
3299 lock->set_state(LOCK_MIX);
3300 else
3301 lock->set_state(LOCK_LOCK);
3302 } else
3303 lock->set_state(LOCK_SYNC);
3304 }
3305 } else {
3306 // our states have already been chosen during rejoin.
3307 if (lock->is_xlocked())
11fdf7f2 3308 ceph_assert(lock->get_state() == LOCK_LOCK);
7c673cae
FG
3309 }
3310}
3311
3312void CInode::choose_lock_states(int dirty_caps)
3313{
3314 int issued = get_caps_issued() | dirty_caps;
b32b8144
FG
3315 if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR)))
3316 choose_ideal_loner();
7c673cae
FG
3317 choose_lock_state(&filelock, issued);
3318 choose_lock_state(&nestlock, issued);
3319 choose_lock_state(&dirfragtreelock, issued);
3320 choose_lock_state(&authlock, issued);
3321 choose_lock_state(&xattrlock, issued);
3322 choose_lock_state(&linklock, issued);
3323}
3324
9f95a23c
TL
3325int CInode::count_nonstale_caps()
3326{
3327 int n = 0;
3328 for (const auto &p : client_caps) {
3329 if (!p.second.is_stale())
3330 n++;
3331 }
3332 return n;
3333}
3334
3335bool CInode::multiple_nonstale_caps()
3336{
3337 int n = 0;
3338 for (const auto &p : client_caps) {
3339 if (!p.second.is_stale()) {
3340 if (n)
3341 return true;
3342 n++;
3343 }
3344 }
3345 return false;
3346}
3347
11fdf7f2
TL
3348void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m)
3349{
3350 bool old_empty = mds_caps_wanted.empty();
3351 mds_caps_wanted.swap(m);
3352 if (old_empty != (bool)mds_caps_wanted.empty()) {
3353 if (old_empty)
f91f0fd5 3354 adjust_num_caps_notable(1);
11fdf7f2 3355 else
f91f0fd5 3356 adjust_num_caps_notable(-1);
11fdf7f2
TL
3357 }
3358}
3359
3360void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted)
3361{
3362 bool old_empty = mds_caps_wanted.empty();
3363 if (wanted) {
3364 mds_caps_wanted[mds] = wanted;
3365 if (old_empty)
f91f0fd5 3366 adjust_num_caps_notable(1);
11fdf7f2
TL
3367 } else if (!old_empty) {
3368 mds_caps_wanted.erase(mds);
3369 if (mds_caps_wanted.empty())
f91f0fd5 3370 adjust_num_caps_notable(-1);
11fdf7f2
TL
3371 }
3372}
3373
9f95a23c
TL
3374Capability *CInode::add_client_cap(client_t client, Session *session,
3375 SnapRealm *conrealm, bool new_inode)
7c673cae 3376{
11fdf7f2 3377 ceph_assert(last == CEPH_NOSNAP);
7c673cae
FG
3378 if (client_caps.empty()) {
3379 get(PIN_CAPS);
3380 if (conrealm)
3381 containing_realm = conrealm;
3382 else
3383 containing_realm = find_snaprealm();
3384 containing_realm->inodes_with_caps.push_back(&item_caps);
11fdf7f2 3385 dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl;
7c673cae 3386
7c673cae 3387 mdcache->num_inodes_with_caps++;
11fdf7f2
TL
3388 if (parent)
3389 parent->dir->adjust_num_inodes_with_caps(1);
3390 }
3391
9f95a23c 3392 uint64_t cap_id = new_inode ? 1 : ++mdcache->last_cap_id;
11fdf7f2
TL
3393 auto ret = client_caps.emplace(std::piecewise_construct, std::forward_as_tuple(client),
3394 std::forward_as_tuple(this, session, cap_id));
3395 ceph_assert(ret.second == true);
3396 Capability *cap = &ret.first->second;
7c673cae 3397
7c673cae 3398 cap->client_follows = first-1;
7c673cae 3399 containing_realm->add_cap(client, cap);
11fdf7f2 3400
7c673cae
FG
3401 return cap;
3402}
3403
3404void CInode::remove_client_cap(client_t client)
3405{
11fdf7f2
TL
3406 auto it = client_caps.find(client);
3407 ceph_assert(it != client_caps.end());
3408 Capability *cap = &it->second;
7c673cae
FG
3409
3410 cap->item_session_caps.remove_myself();
3411 cap->item_revoking_caps.remove_myself();
3412 cap->item_client_revoking_caps.remove_myself();
3413 containing_realm->remove_cap(client, cap);
3414
3415 if (client == loner_cap)
3416 loner_cap = -1;
3417
f91f0fd5
TL
3418 if (cap->is_wanted_notable())
3419 adjust_num_caps_notable(-1);
11fdf7f2
TL
3420
3421 client_caps.erase(it);
7c673cae 3422 if (client_caps.empty()) {
11fdf7f2 3423 dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl;
7c673cae
FG
3424 put(PIN_CAPS);
3425 item_caps.remove_myself();
3426 containing_realm = NULL;
7c673cae 3427 mdcache->num_inodes_with_caps--;
11fdf7f2
TL
3428 if (parent)
3429 parent->dir->adjust_num_inodes_with_caps(-1);
7c673cae
FG
3430 }
3431
3432 //clean up advisory locks
3433 bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false;
3434 bool flock_removed = flock_locks ? flock_locks->remove_all_from(client) : false;
3435 if (fcntl_removed || flock_removed) {
11fdf7f2 3436 MDSContext::vec waiters;
7c673cae
FG
3437 take_waiting(CInode::WAIT_FLOCK, waiters);
3438 mdcache->mds->queue_waiters(waiters);
3439 }
3440}
3441
3442void CInode::move_to_realm(SnapRealm *realm)
3443{
11fdf7f2 3444 dout(10) << __func__ << " joining realm " << *realm
7c673cae 3445 << ", leaving realm " << *containing_realm << dendl;
11fdf7f2
TL
3446 for (auto& p : client_caps) {
3447 containing_realm->remove_cap(p.first, &p.second);
3448 realm->add_cap(p.first, &p.second);
7c673cae
FG
3449 }
3450 item_caps.remove_myself();
3451 realm->inodes_with_caps.push_back(&item_caps);
3452 containing_realm = realm;
3453}
3454
3455Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session)
3456{
3457 Capability *cap = get_client_cap(client);
3458 if (cap) {
3459 // FIXME?
3460 cap->merge(icr.capinfo.wanted, icr.capinfo.issued);
3461 } else {
3462 cap = add_client_cap(client, session);
3463 cap->set_cap_id(icr.capinfo.cap_id);
3464 cap->set_wanted(icr.capinfo.wanted);
3465 cap->issue_norevoke(icr.capinfo.issued);
3466 cap->reset_seq();
3467 }
3468 cap->set_last_issue_stamp(ceph_clock_now());
3469 return cap;
3470}
3471
3472void CInode::clear_client_caps_after_export()
3473{
3474 while (!client_caps.empty())
3475 remove_client_cap(client_caps.begin()->first);
3476 loner_cap = -1;
3477 want_loner_cap = -1;
11fdf7f2
TL
3478 if (!get_mds_caps_wanted().empty()) {
3479 mempool::mds_co::compact_map<int32_t,int32_t> empty;
3480 set_mds_caps_wanted(empty);
3481 }
7c673cae
FG
3482}
3483
3484void CInode::export_client_caps(map<client_t,Capability::Export>& cl)
3485{
11fdf7f2
TL
3486 for (const auto &p : client_caps) {
3487 cl[p.first] = p.second.make_export();
7c673cae
FG
3488 }
3489}
3490
3491 // caps allowed
3492int CInode::get_caps_liked() const
3493{
3494 if (is_dir())
3495 return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER
3496 else
3497 return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO;
3498}
3499
3500int CInode::get_caps_allowed_ever() const
3501{
3502 int allowed;
3503 if (is_dir())
3504 allowed = CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED;
3505 else
3506 allowed = CEPH_CAP_ANY;
3507 return allowed &
3508 (CEPH_CAP_PIN |
3509 (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) |
3510 (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) |
3511 (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) |
3512 (linklock.gcaps_allowed_ever() << linklock.get_cap_shift()));
3513}
3514
3515int CInode::get_caps_allowed_by_type(int type) const
3516{
3517 return
3518 CEPH_CAP_PIN |
3519 (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) |
3520 (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) |
3521 (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) |
3522 (linklock.gcaps_allowed(type) << linklock.get_cap_shift());
3523}
3524
3525int CInode::get_caps_careful() const
3526{
3527 return
3528 (filelock.gcaps_careful() << filelock.get_cap_shift()) |
3529 (authlock.gcaps_careful() << authlock.get_cap_shift()) |
3530 (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) |
3531 (linklock.gcaps_careful() << linklock.get_cap_shift());
3532}
3533
3534int CInode::get_xlocker_mask(client_t client) const
3535{
3536 return
3537 (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) |
3538 (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) |
3539 (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) |
3540 (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift());
3541}
3542
11fdf7f2 3543int CInode::get_caps_allowed_for_client(Session *session, Capability *cap,
f67539c2 3544 const mempool_inode *file_i) const
7c673cae 3545{
11fdf7f2 3546 client_t client = session->get_client();
7c673cae
FG
3547 int allowed;
3548 if (client == get_loner()) {
3549 // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
3550 allowed =
3551 get_caps_allowed_by_type(CAP_LONER) |
3552 (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client));
3553 } else {
3554 allowed = get_caps_allowed_by_type(CAP_ANY);
3555 }
3556
9f95a23c
TL
3557 if (is_dir()) {
3558 allowed &= ~CEPH_CAP_ANY_DIR_OPS;
3559 if (cap && (allowed & CEPH_CAP_FILE_EXCL))
3560 allowed |= cap->get_lock_cache_allowed();
3561 } else {
11fdf7f2
TL
3562 if (file_i->inline_data.version == CEPH_INLINE_NONE &&
3563 file_i->layout.pool_ns.empty()) {
3564 // noop
3565 } else if (cap) {
3566 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3567 cap->is_noinline()) ||
3568 (!file_i->layout.pool_ns.empty() &&
3569 cap->is_nopoolns()))
3570 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3571 } else {
3572 auto& conn = session->get_connection();
3573 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3574 !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) ||
3575 (!file_i->layout.pool_ns.empty() &&
3576 !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)))
3577 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3578 }
7c673cae
FG
3579 }
3580 return allowed;
3581}
3582
3583// caps issued, wanted
3584int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker,
3585 int shift, int mask)
3586{
3587 int c = 0;
3588 int loner = 0, other = 0, xlocker = 0;
3589 if (!is_auth()) {
3590 loner_cap = -1;
3591 }
3592
11fdf7f2
TL
3593 for (const auto &p : client_caps) {
3594 int i = p.second.issued();
7c673cae 3595 c |= i;
11fdf7f2 3596 if (p.first == loner_cap)
7c673cae
FG
3597 loner |= i;
3598 else
3599 other |= i;
11fdf7f2 3600 xlocker |= get_xlocker_mask(p.first) & i;
7c673cae
FG
3601 }
3602 if (ploner) *ploner = (loner >> shift) & mask;
3603 if (pother) *pother = (other >> shift) & mask;
3604 if (pxlocker) *pxlocker = (xlocker >> shift) & mask;
3605 return (c >> shift) & mask;
3606}
3607
3608bool CInode::is_any_caps_wanted() const
3609{
11fdf7f2
TL
3610 for (const auto &p : client_caps) {
3611 if (p.second.wanted())
7c673cae 3612 return true;
11fdf7f2 3613 }
7c673cae
FG
3614 return false;
3615}
3616
3617int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const
3618{
3619 int w = 0;
3620 int loner = 0, other = 0;
11fdf7f2
TL
3621 for (const auto &p : client_caps) {
3622 if (!p.second.is_stale()) {
3623 int t = p.second.wanted();
7c673cae 3624 w |= t;
11fdf7f2 3625 if (p.first == loner_cap)
7c673cae
FG
3626 loner |= t;
3627 else
3628 other |= t;
3629 }
3630 //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
3631 }
3632 if (is_auth())
94b18763
FG
3633 for (const auto &p : mds_caps_wanted) {
3634 w |= p.second;
3635 other |= p.second;
7c673cae
FG
3636 //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
3637 }
3638 if (ploner) *ploner = (loner >> shift) & mask;
3639 if (pother) *pother = (other >> shift) & mask;
3640 return (w >> shift) & mask;
3641}
3642
3643bool CInode::issued_caps_need_gather(SimpleLock *lock)
3644{
3645 int loner_issued, other_issued, xlocker_issued;
3646 get_caps_issued(&loner_issued, &other_issued, &xlocker_issued,
3647 lock->get_cap_shift(), lock->get_cap_mask());
3648 if ((loner_issued & ~lock->gcaps_allowed(CAP_LONER)) ||
3649 (other_issued & ~lock->gcaps_allowed(CAP_ANY)) ||
3650 (xlocker_issued & ~lock->gcaps_allowed(CAP_XLOCKER)))
3651 return true;
3652 return false;
3653}
3654
f91f0fd5
TL
3655void CInode::adjust_num_caps_notable(int d)
3656{
3657 if (!is_clientwriteable()) {
3658 if (!num_caps_notable && d > 0)
3659 mdcache->open_file_table.add_inode(this);
3660 else if (num_caps_notable > 0 && num_caps_notable == -d)
3661 mdcache->open_file_table.remove_inode(this);
3662 }
3663
3664 num_caps_notable +=d;
3665 ceph_assert(num_caps_notable >= 0);
3666}
3667
3668void CInode::mark_clientwriteable()
3669{
3670 if (last != CEPH_NOSNAP)
3671 return;
3672 if (!state_test(STATE_CLIENTWRITEABLE)) {
3673 if (num_caps_notable == 0)
3674 mdcache->open_file_table.add_inode(this);
3675 state_set(STATE_CLIENTWRITEABLE);
3676 }
3677}
3678
3679void CInode::clear_clientwriteable()
3680{
3681 if (state_test(STATE_CLIENTWRITEABLE)) {
3682 if (num_caps_notable == 0)
3683 mdcache->open_file_table.remove_inode(this);
3684 state_clear(STATE_CLIENTWRITEABLE);
3685 }
3686}
7c673cae
FG
3687
3688// =============================================
3689
3690int CInode::encode_inodestat(bufferlist& bl, Session *session,
3691 SnapRealm *dir_realm,
3692 snapid_t snapid,
3693 unsigned max_bytes,
3694 int getattr_caps)
3695{
11fdf7f2
TL
3696 client_t client = session->get_client();
3697 ceph_assert(snapid);
7c673cae
FG
3698
3699 bool valid = true;
3700
3701 // pick a version!
f67539c2
TL
3702 const mempool_inode *oi = get_inode().get();
3703 const mempool_inode *pi = get_projected_inode().get();
7c673cae 3704
f67539c2 3705 const mempool_xattr_map *pxattrs = nullptr;
7c673cae
FG
3706
3707 if (snapid != CEPH_NOSNAP) {
3708
3709 // for now at least, old_inodes is only defined/valid on the auth
3710 if (!is_auth())
3711 valid = false;
3712
f67539c2
TL
3713 if (is_any_old_inodes()) {
3714 auto it = old_inodes->lower_bound(snapid);
3715 if (it != old_inodes->end()) {
94b18763 3716 if (it->second.first > snapid) {
f67539c2 3717 if (it != old_inodes->begin())
94b18763 3718 --it;
7c673cae 3719 }
94b18763
FG
3720 if (it->second.first <= snapid && snapid <= it->first) {
3721 dout(15) << __func__ << " snapid " << snapid
3722 << " to old_inode [" << it->second.first << "," << it->first << "]"
3723 << " " << it->second.inode.rstat
7c673cae 3724 << dendl;
f67539c2
TL
3725 pi = oi = &it->second.inode;
3726 pxattrs = &it->second.xattrs;
7c673cae
FG
3727 } else {
3728 // snapshoted remote dentry can result this
11fdf7f2 3729 dout(0) << __func__ << " old_inode for snapid " << snapid
7c673cae
FG
3730 << " not found" << dendl;
3731 }
3732 }
3733 } else if (snapid < first || snapid > last) {
3734 // snapshoted remote dentry can result this
11fdf7f2 3735 dout(0) << __func__ << " [" << first << "," << last << "]"
7c673cae
FG
3736 << " not match snapid " << snapid << dendl;
3737 }
3738 }
3739
81eedcae 3740 utime_t snap_btime;
f67539c2 3741 std::map<std::string, std::string> snap_metadata;
7c673cae 3742 SnapRealm *realm = find_snaprealm();
81eedcae
TL
3743 if (snapid != CEPH_NOSNAP && realm) {
3744 // add snapshot timestamp vxattr
3745 map<snapid_t,const SnapInfo*> infomap;
3746 realm->get_snap_info(infomap,
3747 snapid, // min
3748 snapid); // max
3749 if (!infomap.empty()) {
3750 ceph_assert(infomap.size() == 1);
3751 const SnapInfo *si = infomap.begin()->second;
3752 snap_btime = si->stamp;
f67539c2 3753 snap_metadata = si->metadata;
81eedcae
TL
3754 }
3755 }
3756
7c673cae
FG
3757
3758 bool no_caps = !valid ||
3759 session->is_stale() ||
3760 (dir_realm && realm != dir_realm) ||
3761 is_frozen() ||
3762 state_test(CInode::STATE_EXPORTINGCAPS);
3763 if (no_caps)
11fdf7f2 3764 dout(20) << __func__ << " no caps"
7c673cae
FG
3765 << (!valid?", !valid":"")
3766 << (session->is_stale()?", session stale ":"")
3767 << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"")
3768 << (is_frozen()?", frozen inode":"")
3769 << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"")
3770 << dendl;
3771
3772
3773 // "fake" a version that is old (stable) version, +1 if projected.
3774 version_t version = (oi->version * 2) + is_projected();
3775
3776 Capability *cap = get_client_cap(client);
3777 bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client;
3778 //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
3779 bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client;
3780 bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client;
3781 bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client;
3782
3783 bool plocal = versionlock.get_last_wrlock_client() == client;
3784 bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client;
3785
f67539c2 3786 const mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;
7c673cae
FG
3787
3788 dout(20) << " pfile " << pfile << " pauth " << pauth
3789 << " plink " << plink << " pxattr " << pxattr
3790 << " plocal " << plocal
3791 << " ctime " << any_i->ctime
3792 << " valid=" << valid << dendl;
3793
3794 // file
f67539c2 3795 const mempool_inode *file_i = pfile ? pi:oi;
7c673cae
FG
3796 file_layout_t layout;
3797 if (is_dir()) {
3798 layout = (ppolicy ? pi : oi)->layout;
3799 } else {
3800 layout = file_i->layout;
3801 }
3802
3803 // max_size is min of projected, actual
3804 uint64_t max_size =
f91f0fd5
TL
3805 std::min(oi->get_client_range(client),
3806 pi->get_client_range(client));
7c673cae
FG
3807
3808 // inline data
3809 version_t inline_version = 0;
3810 bufferlist inline_data;
3811 if (file_i->inline_data.version == CEPH_INLINE_NONE) {
3812 inline_version = CEPH_INLINE_NONE;
3813 } else if ((!cap && !no_caps) ||
3814 (cap && cap->client_inline_version < file_i->inline_data.version) ||
3815 (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data
3816 inline_version = file_i->inline_data.version;
3817 if (file_i->inline_data.length() > 0)
f67539c2 3818 file_i->inline_data.get_data(inline_data);
7c673cae
FG
3819 }
3820
3821 // nest (do same as file... :/)
3822 if (cap) {
3823 cap->last_rbytes = file_i->rstat.rbytes;
3824 cap->last_rsize = file_i->rstat.rsize();
3825 }
3826
3827 // auth
f67539c2 3828 const mempool_inode *auth_i = pauth ? pi:oi;
7c673cae
FG
3829
3830 // link
f67539c2 3831 const mempool_inode *link_i = plink ? pi:oi;
7c673cae
FG
3832
3833 // xattr
f67539c2 3834 const mempool_inode *xattr_i = pxattr ? pi:oi;
7c673cae 3835
11fdf7f2 3836 using ceph::encode;
7c673cae 3837 // xattr
7c673cae
FG
3838 version_t xattr_version;
3839 if ((!cap && !no_caps) ||
3840 (cap && cap->client_xattr_version < xattr_i->xattr_version) ||
3841 (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs
3842 if (!pxattrs)
f67539c2 3843 pxattrs = pxattr ? get_projected_xattrs().get() : get_xattrs().get();
7c673cae
FG
3844 xattr_version = xattr_i->xattr_version;
3845 } else {
3846 xattr_version = 0;
3847 }
3848
3849 // do we have room?
3850 if (max_bytes) {
11fdf7f2
TL
3851 unsigned bytes =
3852 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) +
3853 sizeof(struct ceph_file_layout) +
3854 sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq
3855 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
3856 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime
3857 sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree
3858 sizeof(__u32) + symlink.length() + // symlink
3859 sizeof(struct ceph_dir_layout); // dir_layout
3860
3861 if (xattr_version) {
3862 bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries
3863 if (pxattrs) {
3864 for (const auto &p : *pxattrs)
3865 bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length();
3866 }
3867 } else {
3868 bytes += sizeof(__u32); // xattr buffer len
3869 }
3870 bytes +=
3871 sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data
3872 1 + 1 + 8 + 8 + 4 + // quota
3873 4 + layout.pool_ns.size() + // pool ns
3874 sizeof(struct ceph_timespec) + 8; // btime + change_attr
3875
7c673cae 3876 if (bytes > max_bytes)
f67539c2 3877 return -CEPHFS_ENOSPC;
7c673cae
FG
3878 }
3879
3880
3881 // encode caps
3882 struct ceph_mds_reply_cap ecap;
3883 if (snapid != CEPH_NOSNAP) {
3884 /*
3885 * snapped inodes (files or dirs) only get read-only caps. always
3886 * issue everything possible, since it is read only.
3887 *
3888 * if a snapped inode has caps, limit issued caps based on the
3889 * lock state.
3890 *
3891 * if it is a live inode, limit issued caps based on the lock
3892 * state.
3893 *
3894 * do NOT adjust cap issued state, because the client always
3895 * tracks caps per-snap and the mds does either per-interval or
3896 * multiversion.
3897 */
3898 ecap.caps = valid ? get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE;
3899 if (last == CEPH_NOSNAP || is_any_caps())
11fdf7f2 3900 ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i);
7c673cae
FG
3901 ecap.seq = 0;
3902 ecap.mseq = 0;
3903 ecap.realm = 0;
3904 } else {
3905 if (!no_caps && !cap) {
3906 // add a new cap
3907 cap = add_client_cap(client, session, realm);
b32b8144
FG
3908 if (is_auth())
3909 choose_ideal_loner();
7c673cae
FG
3910 }
3911
3912 int issue = 0;
3913 if (!no_caps && cap) {
3914 int likes = get_caps_liked();
11fdf7f2 3915 int allowed = get_caps_allowed_for_client(session, cap, file_i);
7c673cae 3916 issue = (cap->wanted() | likes) & allowed;
494da23a 3917 cap->issue_norevoke(issue, true);
7c673cae
FG
3918 issue = cap->pending();
3919 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3920 << " seq " << cap->get_last_seq() << dendl;
3921 } else if (cap && cap->is_new() && !dir_realm) {
3922 // alway issue new caps to client, otherwise the caps get lost
11fdf7f2 3923 ceph_assert(cap->is_stale());
494da23a
TL
3924 ceph_assert(!cap->pending());
3925 issue = CEPH_CAP_PIN;
3926 cap->issue_norevoke(issue, true);
7c673cae
FG
3927 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3928 << " seq " << cap->get_last_seq()
494da23a 3929 << "(stale&new caps)" << dendl;
7c673cae
FG
3930 }
3931
3932 if (issue) {
3933 cap->set_last_issue();
3934 cap->set_last_issue_stamp(ceph_clock_now());
7c673cae
FG
3935 ecap.caps = issue;
3936 ecap.wanted = cap->wanted();
3937 ecap.cap_id = cap->get_cap_id();
3938 ecap.seq = cap->get_last_seq();
3939 ecap.mseq = cap->get_mseq();
3940 ecap.realm = realm->inode->ino();
3941 } else {
3942 ecap.cap_id = 0;
3943 ecap.caps = 0;
3944 ecap.seq = 0;
3945 ecap.mseq = 0;
3946 ecap.realm = 0;
3947 ecap.wanted = 0;
3948 }
3949 }
3950 ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0;
3951 dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps)
3952 << " seq " << ecap.seq << " mseq " << ecap.mseq
11fdf7f2 3953 << " xattrv " << xattr_version << dendl;
7c673cae
FG
3954
3955 if (inline_data.length() && cap) {
3956 if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) {
3957 dout(10) << "including inline version " << inline_version << dendl;
3958 cap->client_inline_version = inline_version;
3959 } else {
3960 dout(10) << "dropping inline version " << inline_version << dendl;
3961 inline_version = 0;
3962 inline_data.clear();
3963 }
3964 }
3965
3966 // include those xattrs?
11fdf7f2 3967 if (xattr_version && cap) {
7c673cae 3968 if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) {
11fdf7f2
TL
3969 dout(10) << "including xattrs version " << xattr_version << dendl;
3970 cap->client_xattr_version = xattr_version;
7c673cae 3971 } else {
11fdf7f2 3972 dout(10) << "dropping xattrs version " << xattr_version << dendl;
7c673cae
FG
3973 xattr_version = 0;
3974 }
3975 }
3976
11fdf7f2
TL
3977 // The end result of encode_xattrs() is equivalent to:
3978 // {
3979 // bufferlist xbl;
3980 // if (xattr_version) {
3981 // if (pxattrs)
3982 // encode(*pxattrs, bl);
3983 // else
3984 // encode((__u32)0, bl);
3985 // }
3986 // encode(xbl, bl);
3987 // }
3988 //
3989 // But encoding xattrs into the 'xbl' requires a memory allocation.
3990 // The 'bl' should have enough pre-allocated memory in most cases.
3991 // Encoding xattrs directly into it can avoid the extra allocation.
3992 auto encode_xattrs = [xattr_version, pxattrs, &bl]() {
3993 using ceph::encode;
3994 if (xattr_version) {
3995 ceph_le32 xbl_len;
3996 auto filler = bl.append_hole(sizeof(xbl_len));
3997 const auto starting_bl_len = bl.length();
3998 if (pxattrs)
3999 encode(*pxattrs, bl);
4000 else
4001 encode((__u32)0, bl);
4002 xbl_len = bl.length() - starting_bl_len;
4003 filler.copy_in(sizeof(xbl_len), (char *)&xbl_len);
4004 } else {
4005 encode((__u32)0, bl);
4006 }
4007 };
4008
7c673cae
FG
4009 /*
4010 * note: encoding matches MClientReply::InodeStat
4011 */
11fdf7f2 4012 if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
f67539c2 4013 ENCODE_START(6, 1, bl);
11fdf7f2
TL
4014 encode(oi->ino, bl);
4015 encode(snapid, bl);
4016 encode(oi->rdev, bl);
4017 encode(version, bl);
4018 encode(xattr_version, bl);
4019 encode(ecap, bl);
4020 {
4021 ceph_file_layout legacy_layout;
4022 layout.to_legacy(&legacy_layout);
4023 encode(legacy_layout, bl);
4024 }
4025 encode(any_i->ctime, bl);
4026 encode(file_i->mtime, bl);
4027 encode(file_i->atime, bl);
4028 encode(file_i->time_warp_seq, bl);
4029 encode(file_i->size, bl);
4030 encode(max_size, bl);
4031 encode(file_i->truncate_size, bl);
4032 encode(file_i->truncate_seq, bl);
4033 encode(auth_i->mode, bl);
4034 encode((uint32_t)auth_i->uid, bl);
4035 encode((uint32_t)auth_i->gid, bl);
4036 encode(link_i->nlink, bl);
4037 encode(file_i->dirstat.nfiles, bl);
4038 encode(file_i->dirstat.nsubdirs, bl);
4039 encode(file_i->rstat.rbytes, bl);
4040 encode(file_i->rstat.rfiles, bl);
4041 encode(file_i->rstat.rsubdirs, bl);
4042 encode(file_i->rstat.rctime, bl);
4043 dirfragtree.encode(bl);
4044 encode(symlink, bl);
4045 encode(file_i->dir_layout, bl);
4046 encode_xattrs();
4047 encode(inline_version, bl);
4048 encode(inline_data, bl);
f67539c2 4049 const mempool_inode *policy_i = ppolicy ? pi : oi;
11fdf7f2
TL
4050 encode(policy_i->quota, bl);
4051 encode(layout.pool_ns, bl);
4052 encode(any_i->btime, bl);
4053 encode(any_i->change_attr, bl);
4054 encode(file_i->export_pin, bl);
81eedcae 4055 encode(snap_btime, bl);
f67539c2
TL
4056 encode(file_i->rstat.rsnaps, bl);
4057 encode(snap_metadata, bl);
4058 encode(file_i->fscrypt, bl);
11fdf7f2
TL
4059 ENCODE_FINISH(bl);
4060 }
4061 else {
4062 ceph_assert(session->get_connection());
4063
4064 encode(oi->ino, bl);
4065 encode(snapid, bl);
4066 encode(oi->rdev, bl);
4067 encode(version, bl);
4068 encode(xattr_version, bl);
4069 encode(ecap, bl);
4070 {
4071 ceph_file_layout legacy_layout;
4072 layout.to_legacy(&legacy_layout);
4073 encode(legacy_layout, bl);
4074 }
4075 encode(any_i->ctime, bl);
4076 encode(file_i->mtime, bl);
4077 encode(file_i->atime, bl);
4078 encode(file_i->time_warp_seq, bl);
4079 encode(file_i->size, bl);
4080 encode(max_size, bl);
4081 encode(file_i->truncate_size, bl);
4082 encode(file_i->truncate_seq, bl);
4083 encode(auth_i->mode, bl);
4084 encode((uint32_t)auth_i->uid, bl);
4085 encode((uint32_t)auth_i->gid, bl);
4086 encode(link_i->nlink, bl);
4087 encode(file_i->dirstat.nfiles, bl);
4088 encode(file_i->dirstat.nsubdirs, bl);
4089 encode(file_i->rstat.rbytes, bl);
4090 encode(file_i->rstat.rfiles, bl);
4091 encode(file_i->rstat.rsubdirs, bl);
4092 encode(file_i->rstat.rctime, bl);
4093 dirfragtree.encode(bl);
4094 encode(symlink, bl);
4095 auto& conn = session->get_connection();
4096 if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) {
4097 encode(file_i->dir_layout, bl);
4098 }
4099 encode_xattrs();
4100 if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4101 encode(inline_version, bl);
4102 encode(inline_data, bl);
4103 }
4104 if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) {
f67539c2 4105 const mempool_inode *policy_i = ppolicy ? pi : oi;
11fdf7f2
TL
4106 encode(policy_i->quota, bl);
4107 }
4108 if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
4109 encode(layout.pool_ns, bl);
4110 }
4111 if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) {
4112 encode(any_i->btime, bl);
4113 encode(any_i->change_attr, bl);
4114 }
7c673cae
FG
4115 }
4116
4117 return valid;
4118}
4119
9f95a23c 4120void CInode::encode_cap_message(const ref_t<MClientCaps> &m, Capability *cap)
7c673cae 4121{
11fdf7f2 4122 ceph_assert(cap);
7c673cae
FG
4123
4124 client_t client = cap->get_client();
4125
4126 bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL);
4127 bool pauth = authlock.is_xlocked_by_client(client);
4128 bool plink = linklock.is_xlocked_by_client(client);
4129 bool pxattr = xattrlock.is_xlocked_by_client(client);
4130
f67539c2
TL
4131 const mempool_inode *oi = get_inode().get();
4132 const mempool_inode *pi = get_projected_inode().get();
4133 const mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi;
7c673cae 4134
11fdf7f2 4135 dout(20) << __func__ << " pfile " << pfile
7c673cae
FG
4136 << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
4137 << " ctime " << i->ctime << dendl;
4138
4139 i = pfile ? pi:oi;
4140 m->set_layout(i->layout);
4141 m->size = i->size;
4142 m->truncate_seq = i->truncate_seq;
4143 m->truncate_size = i->truncate_size;
4144 m->mtime = i->mtime;
4145 m->atime = i->atime;
4146 m->ctime = i->ctime;
20effc67 4147 m->btime = i->btime;
7c673cae
FG
4148 m->change_attr = i->change_attr;
4149 m->time_warp_seq = i->time_warp_seq;
28e407b8
AA
4150 m->nfiles = i->dirstat.nfiles;
4151 m->nsubdirs = i->dirstat.nsubdirs;
7c673cae
FG
4152
4153 if (cap->client_inline_version < i->inline_data.version) {
4154 m->inline_version = cap->client_inline_version = i->inline_data.version;
4155 if (i->inline_data.length() > 0)
f67539c2 4156 i->inline_data.get_data(m->inline_data);
7c673cae
FG
4157 } else {
4158 m->inline_version = 0;
4159 }
4160
4161 // max_size is min of projected, actual.
f91f0fd5
TL
4162 uint64_t oldms = oi->get_client_range(client);
4163 uint64_t newms = pi->get_client_range(client);
11fdf7f2 4164 m->max_size = std::min(oldms, newms);
7c673cae
FG
4165
4166 i = pauth ? pi:oi;
4167 m->head.mode = i->mode;
4168 m->head.uid = i->uid;
4169 m->head.gid = i->gid;
4170
4171 i = plink ? pi:oi;
4172 m->head.nlink = i->nlink;
4173
11fdf7f2 4174 using ceph::encode;
7c673cae 4175 i = pxattr ? pi:oi;
f67539c2 4176 const auto& ix = pxattr ? get_projected_xattrs() : get_xattrs();
7c673cae
FG
4177 if ((cap->pending() & CEPH_CAP_XATTR_SHARED) &&
4178 i->xattr_version > cap->client_xattr_version) {
4179 dout(10) << " including xattrs v " << i->xattr_version << dendl;
f67539c2
TL
4180 if (ix)
4181 encode(*ix, m->xattrbl);
4182 else
4183 encode((__u32)0, m->xattrbl);
7c673cae
FG
4184 m->head.xattr_version = i->xattr_version;
4185 cap->client_xattr_version = i->xattr_version;
4186 }
4187}
4188
4189
4190
4191void CInode::_encode_base(bufferlist& bl, uint64_t features)
4192{
9f95a23c 4193 ENCODE_START(1, 1, bl);
11fdf7f2 4194 encode(first, bl);
f67539c2 4195 encode(*get_inode(), bl, features);
11fdf7f2
TL
4196 encode(symlink, bl);
4197 encode(dirfragtree, bl);
f67539c2
TL
4198 encode_xattrs(bl);
4199 encode_old_inodes(bl, features);
11fdf7f2 4200 encode(damage_flags, bl);
7c673cae 4201 encode_snap(bl);
9f95a23c 4202 ENCODE_FINISH(bl);
7c673cae 4203}
11fdf7f2 4204void CInode::_decode_base(bufferlist::const_iterator& p)
7c673cae 4205{
9f95a23c 4206 DECODE_START(1, p);
11fdf7f2 4207 decode(first, p);
f67539c2
TL
4208 {
4209 auto _inode = allocate_inode();
4210 decode(*_inode, p);
4211 reset_inode(std::move(_inode));
4212 }
94b18763
FG
4213 {
4214 std::string tmp;
11fdf7f2
TL
4215 decode(tmp, p);
4216 symlink = std::string_view(tmp);
94b18763 4217 }
11fdf7f2 4218 decode(dirfragtree, p);
f67539c2
TL
4219 decode_xattrs(p);
4220 decode_old_inodes(p);
11fdf7f2 4221 decode(damage_flags, p);
7c673cae 4222 decode_snap(p);
9f95a23c 4223 DECODE_FINISH(p);
7c673cae
FG
4224}
4225
4226void CInode::_encode_locks_full(bufferlist& bl)
4227{
11fdf7f2
TL
4228 using ceph::encode;
4229 encode(authlock, bl);
4230 encode(linklock, bl);
4231 encode(dirfragtreelock, bl);
4232 encode(filelock, bl);
4233 encode(xattrlock, bl);
4234 encode(snaplock, bl);
4235 encode(nestlock, bl);
4236 encode(flocklock, bl);
4237 encode(policylock, bl);
4238
4239 encode(loner_cap, bl);
4240}
4241void CInode::_decode_locks_full(bufferlist::const_iterator& p)
4242{
4243 using ceph::decode;
4244 decode(authlock, p);
4245 decode(linklock, p);
4246 decode(dirfragtreelock, p);
4247 decode(filelock, p);
4248 decode(xattrlock, p);
4249 decode(snaplock, p);
4250 decode(nestlock, p);
4251 decode(flocklock, p);
4252 decode(policylock, p);
4253
4254 decode(loner_cap, p);
7c673cae
FG
4255 set_loner_cap(loner_cap);
4256 want_loner_cap = loner_cap; // for now, we'll eval() shortly.
4257}
4258
b32b8144 4259void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
7c673cae 4260{
9f95a23c 4261 ENCODE_START(1, 1, bl);
7c673cae
FG
4262 authlock.encode_state_for_replica(bl);
4263 linklock.encode_state_for_replica(bl);
4264 dirfragtreelock.encode_state_for_replica(bl);
4265 filelock.encode_state_for_replica(bl);
4266 nestlock.encode_state_for_replica(bl);
4267 xattrlock.encode_state_for_replica(bl);
4268 snaplock.encode_state_for_replica(bl);
4269 flocklock.encode_state_for_replica(bl);
4270 policylock.encode_state_for_replica(bl);
11fdf7f2 4271 encode(need_recover, bl);
9f95a23c 4272 ENCODE_FINISH(bl);
7c673cae 4273}
b32b8144 4274
7c673cae
FG
4275void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
4276{
4277 authlock.encode_state_for_replica(bl);
4278 linklock.encode_state_for_replica(bl);
4279 dirfragtreelock.encode_state_for_rejoin(bl, rep);
4280 filelock.encode_state_for_rejoin(bl, rep);
4281 nestlock.encode_state_for_rejoin(bl, rep);
4282 xattrlock.encode_state_for_replica(bl);
4283 snaplock.encode_state_for_replica(bl);
4284 flocklock.encode_state_for_replica(bl);
4285 policylock.encode_state_for_replica(bl);
4286}
b32b8144 4287
9f95a23c 4288void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new)
7c673cae 4289{
9f95a23c 4290 DECODE_START(1, p);
7c673cae
FG
4291 authlock.decode_state(p, is_new);
4292 linklock.decode_state(p, is_new);
4293 dirfragtreelock.decode_state(p, is_new);
4294 filelock.decode_state(p, is_new);
4295 nestlock.decode_state(p, is_new);
4296 xattrlock.decode_state(p, is_new);
4297 snaplock.decode_state(p, is_new);
4298 flocklock.decode_state(p, is_new);
4299 policylock.decode_state(p, is_new);
b32b8144
FG
4300
4301 bool need_recover;
11fdf7f2 4302 decode(need_recover, p);
b32b8144
FG
4303 if (need_recover && is_new) {
4304 // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
4305 // and change the object when replaying unsafe requests.
4306 authlock.mark_need_recover();
4307 linklock.mark_need_recover();
4308 dirfragtreelock.mark_need_recover();
4309 filelock.mark_need_recover();
4310 nestlock.mark_need_recover();
4311 xattrlock.mark_need_recover();
4312 snaplock.mark_need_recover();
4313 flocklock.mark_need_recover();
4314 policylock.mark_need_recover();
4315 }
9f95a23c 4316 DECODE_FINISH(p);
7c673cae 4317}
11fdf7f2 4318void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
b32b8144
FG
4319 list<SimpleLock*>& eval_locks, bool survivor)
4320{
4321 authlock.decode_state_rejoin(p, waiters, survivor);
4322 linklock.decode_state_rejoin(p, waiters, survivor);
4323 dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
4324 filelock.decode_state_rejoin(p, waiters, survivor);
4325 nestlock.decode_state_rejoin(p, waiters, survivor);
4326 xattrlock.decode_state_rejoin(p, waiters, survivor);
4327 snaplock.decode_state_rejoin(p, waiters, survivor);
4328 flocklock.decode_state_rejoin(p, waiters, survivor);
4329 policylock.decode_state_rejoin(p, waiters, survivor);
7c673cae
FG
4330
4331 if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
4332 eval_locks.push_back(&dirfragtreelock);
4333 if (!filelock.is_stable() && !filelock.is_wrlocked())
4334 eval_locks.push_back(&filelock);
4335 if (!nestlock.is_stable() && !nestlock.is_wrlocked())
4336 eval_locks.push_back(&nestlock);
4337}
4338
4339
4340// IMPORT/EXPORT
4341
4342void CInode::encode_export(bufferlist& bl)
4343{
4344 ENCODE_START(5, 4, bl);
4345 _encode_base(bl, mdcache->mds->mdsmap->get_up_features());
4346
11fdf7f2 4347 encode(state, bl);
7c673cae 4348
11fdf7f2 4349 encode(pop, bl);
7c673cae 4350
11fdf7f2 4351 encode(get_replicas(), bl);
7c673cae
FG
4352
4353 // include scatterlock info for any bounding CDirs
4354 bufferlist bounding;
f67539c2 4355 if (get_inode()->is_dir())
94b18763
FG
4356 for (const auto &p : dirfrags) {
4357 CDir *dir = p.second;
7c673cae 4358 if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
11fdf7f2 4359 encode(p.first, bounding);
f67539c2
TL
4360 encode(dir->get_fnode()->fragstat, bounding);
4361 encode(dir->get_fnode()->accounted_fragstat, bounding);
4362 encode(dir->get_fnode()->rstat, bounding);
4363 encode(dir->get_fnode()->accounted_rstat, bounding);
7c673cae
FG
4364 dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
4365 }
4366 }
11fdf7f2 4367 encode(bounding, bl);
7c673cae
FG
4368
4369 _encode_locks_full(bl);
4370
4371 _encode_file_locks(bl);
4372
4373 ENCODE_FINISH(bl);
4374
4375 get(PIN_TEMPEXPORTING);
4376}
4377
11fdf7f2 4378void CInode::finish_export()
7c673cae
FG
4379{
4380 state &= MASK_STATE_EXPORT_KEPT;
4381
11fdf7f2 4382 pop.zero();
7c673cae
FG
4383
4384 // just in case!
4385 //dirlock.clear_updated();
4386
4387 loner_cap = -1;
4388
4389 put(PIN_TEMPEXPORTING);
4390}
4391
11fdf7f2 4392void CInode::decode_import(bufferlist::const_iterator& p,
7c673cae
FG
4393 LogSegment *ls)
4394{
4395 DECODE_START(5, p);
4396
4397 _decode_base(p);
4398
f6b5b4d7
TL
4399 {
4400 unsigned s;
4401 decode(s, p);
4402 s &= MASK_STATE_EXPORTED;
4403
f67539c2
TL
4404 set_ephemeral_pin((s & STATE_DISTEPHEMERALPIN),
4405 (s & STATE_RANDEPHEMERALPIN));
f6b5b4d7
TL
4406 state_set(STATE_AUTH | s);
4407 }
7c673cae
FG
4408
4409 if (is_dirty()) {
4410 get(PIN_DIRTY);
4411 _mark_dirty(ls);
4412 }
4413 if (is_dirty_parent()) {
4414 get(PIN_DIRTYPARENT);
28e407b8 4415 mark_dirty_parent(ls);
7c673cae
FG
4416 }
4417
11fdf7f2 4418 decode(pop, p);
7c673cae 4419
11fdf7f2 4420 decode(get_replicas(), p);
181888fb 4421 if (is_replicated())
7c673cae
FG
4422 get(PIN_REPLICATED);
4423 replica_nonce = 0;
4424
4425 // decode fragstat info on bounding cdirs
4426 bufferlist bounding;
11fdf7f2
TL
4427 decode(bounding, p);
4428 auto q = bounding.cbegin();
7c673cae
FG
4429 while (!q.end()) {
4430 frag_t fg;
11fdf7f2 4431 decode(fg, q);
7c673cae 4432 CDir *dir = get_dirfrag(fg);
11fdf7f2 4433 ceph_assert(dir); // we should have all bounds open
7c673cae
FG
4434
4435 // Only take the remote's fragstat/rstat if we are non-auth for
4436 // this dirfrag AND the lock is NOT in a scattered (MIX) state.
4437 // We know lock is stable, and MIX is the only state in which
4438 // the inode auth (who sent us this data) may not have the best
4439 // info.
4440
4441 // HMM: Are there cases where dir->is_auth() is an insufficient
4442 // check because the dirfrag is under migration? That implies
4443 // it is frozen (and in a SYNC or LOCK state). FIXME.
4444
f67539c2 4445 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
7c673cae
FG
4446 if (dir->is_auth() ||
4447 filelock.get_state() == LOCK_MIX) {
4448 dout(10) << " skipped fragstat info for " << *dir << dendl;
4449 frag_info_t f;
11fdf7f2
TL
4450 decode(f, q);
4451 decode(f, q);
7c673cae 4452 } else {
f67539c2
TL
4453 decode(_fnode->fragstat, q);
4454 decode(_fnode->accounted_fragstat, q);
7c673cae
FG
4455 dout(10) << " took fragstat info for " << *dir << dendl;
4456 }
4457 if (dir->is_auth() ||
4458 nestlock.get_state() == LOCK_MIX) {
4459 dout(10) << " skipped rstat info for " << *dir << dendl;
4460 nest_info_t n;
11fdf7f2
TL
4461 decode(n, q);
4462 decode(n, q);
7c673cae 4463 } else {
f67539c2
TL
4464 decode(_fnode->rstat, q);
4465 decode(_fnode->accounted_rstat, q);
7c673cae
FG
4466 dout(10) << " took rstat info for " << *dir << dendl;
4467 }
f67539c2 4468 dir->reset_fnode(std::move(_fnode));
7c673cae
FG
4469 }
4470
4471 _decode_locks_full(p);
4472
4473 _decode_file_locks(p);
4474
4475 DECODE_FINISH(p);
4476}
4477
4478
4479void InodeStoreBase::dump(Formatter *f) const
4480{
f67539c2 4481 inode->dump(f);
7c673cae 4482 f->dump_string("symlink", symlink);
9f95a23c
TL
4483
4484 f->open_array_section("xattrs");
f67539c2
TL
4485 if (xattrs) {
4486 for (const auto& [key, val] : *xattrs) {
4487 f->open_object_section("xattr");
4488 f->dump_string("key", key);
4489 std::string v(val.c_str(), val.length());
4490 f->dump_string("val", v);
4491 f->close_section();
4492 }
9f95a23c
TL
4493 }
4494 f->close_section();
4495 f->open_object_section("dirfragtree");
4496 dirfragtree.dump(f);
4497 f->close_section(); // dirfragtree
4498
7c673cae 4499 f->open_array_section("old_inodes");
f67539c2
TL
4500 if (old_inodes) {
4501 for (const auto &p : *old_inodes) {
4502 f->open_object_section("old_inode");
4503 // The key is the last snapid, the first is in the mempool_old_inode
4504 f->dump_int("last", p.first);
4505 p.second.dump(f);
4506 f->close_section(); // old_inode
4507 }
7c673cae
FG
4508 }
4509 f->close_section(); // old_inodes
4510
9f95a23c
TL
4511 f->dump_unsigned("oldest_snap", oldest_snap);
4512 f->dump_unsigned("damage_flags", damage_flags);
7c673cae
FG
4513}
4514
f67539c2
TL
4515template <>
4516void decode_json_obj(mempool::mds_co::string& t, JSONObj *obj){
4517
4518 t = mempool::mds_co::string(std::string_view(obj->get_data()));
4519}
4520
4521void InodeStoreBase::decode_json(JSONObj *obj)
4522{
4523 {
4524 auto _inode = allocate_inode();
4525 _inode->decode_json(obj);
4526 reset_inode(std::move(_inode));
4527 }
4528
4529 JSONDecoder::decode_json("symlink", symlink, obj, true);
4530 // JSONDecoder::decode_json("dirfragtree", dirfragtree, obj, true); // cann't decode it now
4531 //
4532 //
4533 {
4534 mempool_xattr_map tmp;
4535 JSONDecoder::decode_json("xattrs", tmp, xattrs_cb, obj, true);
4536 if (tmp.empty())
4537 reset_xattrs(xattr_map_ptr());
4538 else
4539 reset_xattrs(allocate_xattr_map(std::move(tmp)));
4540 }
4541 // JSONDecoder::decode_json("old_inodes", old_inodes, InodeStoreBase::old_indoes_cb, obj, true); // cann't decode old_inodes now
4542 JSONDecoder::decode_json("oldest_snap", oldest_snap.val, obj, true);
4543 JSONDecoder::decode_json("damage_flags", damage_flags, obj, true);
4544 //sr_t srnode;
4545 //JSONDecoder::decode_json("snap_blob", srnode, obj, true); // cann't decode it now
4546 //snap_blob = srnode;
4547}
4548
4549void InodeStoreBase::xattrs_cb(InodeStoreBase::mempool_xattr_map& c, JSONObj *obj){
4550
4551 string k;
4552 JSONDecoder::decode_json("key", k, obj, true);
4553 string v;
4554 JSONDecoder::decode_json("val", v, obj, true);
4555 c[k.c_str()] = buffer::copy(v.c_str(), v.size());
4556}
4557
4558void InodeStoreBase::old_indoes_cb(InodeStoreBase::mempool_old_inode_map& c, JSONObj *obj){
4559
4560 snapid_t s;
4561 JSONDecoder::decode_json("last", s.val, obj, true);
4562 InodeStoreBase::mempool_old_inode i;
4563 // i.decode_json(obj); // cann't decode now, simon
4564 c[s] = i;
4565}
7c673cae 4566
9f95a23c 4567void InodeStore::generate_test_instances(std::list<InodeStore*> &ls)
7c673cae
FG
4568{
4569 InodeStore *populated = new InodeStore;
f67539c2 4570 populated->get_inode()->ino = 0xdeadbeef;
7c673cae
FG
4571 populated->symlink = "rhubarb";
4572 ls.push_back(populated);
4573}
4574
9f95a23c 4575void InodeStoreBare::generate_test_instances(std::list<InodeStoreBare*> &ls)
11fdf7f2
TL
4576{
4577 InodeStoreBare *populated = new InodeStoreBare;
f67539c2 4578 populated->get_inode()->ino = 0xdeadbeef;
11fdf7f2
TL
4579 populated->symlink = "rhubarb";
4580 ls.push_back(populated);
4581}
4582
7c673cae 4583void CInode::validate_disk_state(CInode::validated_data *results,
11fdf7f2 4584 MDSContext *fin)
7c673cae
FG
4585{
4586 class ValidationContinuation : public MDSContinuation {
4587 public:
11fdf7f2 4588 MDSContext *fin;
7c673cae
FG
4589 CInode *in;
4590 CInode::validated_data *results;
4591 bufferlist bl;
4592 CInode *shadow_in;
4593
4594 enum {
4595 START = 0,
4596 BACKTRACE,
4597 INODE,
11fdf7f2
TL
4598 DIRFRAGS,
4599 SNAPREALM,
7c673cae
FG
4600 };
4601
4602 ValidationContinuation(CInode *i,
4603 CInode::validated_data *data_r,
11fdf7f2 4604 MDSContext *fin_) :
7c673cae
FG
4605 MDSContinuation(i->mdcache->mds->server),
4606 fin(fin_),
4607 in(i),
4608 results(data_r),
4609 shadow_in(NULL) {
4610 set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start));
4611 set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace));
4612 set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk));
4613 set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags));
4614 }
4615
4616 ~ValidationContinuation() override {
b32b8144
FG
4617 if (shadow_in) {
4618 delete shadow_in;
4619 in->mdcache->num_shadow_inodes--;
4620 }
7c673cae
FG
4621 }
4622
4623 /**
4624 * Fetch backtrace and set tag if tag is non-empty
4625 */
11fdf7f2
TL
4626 void fetch_backtrace_and_tag(CInode *in,
4627 std::string_view tag, bool is_internal,
7c673cae
FG
4628 Context *fin, int *bt_r, bufferlist *bt)
4629 {
4630 const int64_t pool = in->get_backtrace_pool();
4631 object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");
4632
4633 ObjectOperation fetch;
4634 fetch.getxattr("parent", bt, bt_r);
4635 in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
4636 NULL, 0, fin);
f67539c2
TL
4637 if (in->mdcache->mds->logger) {
4638 in->mdcache->mds->logger->inc(l_mds_openino_backtrace_fetch);
4639 in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_fetch);
4640 }
4641
11fdf7f2
TL
4642 using ceph::encode;
4643 if (!is_internal) {
4644 ObjectOperation scrub_tag;
7c673cae 4645 bufferlist tag_bl;
11fdf7f2 4646 encode(tag, tag_bl);
7c673cae
FG
4647 scrub_tag.setxattr("scrub_tag", tag_bl);
4648 SnapContext snapc;
4649 in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
4650 ceph::real_clock::now(),
4651 0, NULL);
f67539c2
TL
4652 if (in->mdcache->mds->logger)
4653 in->mdcache->mds->logger->inc(l_mds_scrub_set_tag);
7c673cae
FG
4654 }
4655 }
4656
4657 bool _start(int rval) {
f67539c2
TL
4658 ceph_assert(in->can_auth_pin());
4659 in->auth_pin(this);
4660
7c673cae 4661 if (in->is_dirty()) {
f67539c2
TL
4662 MDCache *mdcache = in->mdcache; // For the benefit of dout
4663 auto ino = [this]() { return in->ino(); }; // For the benefit of dout
11fdf7f2 4664 dout(20) << "validating a dirty CInode; results will be inconclusive"
f67539c2 4665 << dendl;
7c673cae 4666 }
11fdf7f2 4667
7c673cae 4668 C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
11fdf7f2
TL
4669 in->mdcache->mds->finisher);
4670
4671 std::string_view tag = in->scrub_infop->header->get_tag();
4672 bool is_internal = in->scrub_infop->header->is_internal_tag();
4673 // Rather than using the usual CInode::fetch_backtrace,
4674 // use a special variant that optionally writes a tag in the same
4675 // operation.
4676 fetch_backtrace_and_tag(in, tag, is_internal, conf, &results->backtrace.ondisk_read_retval, &bl);
7c673cae
FG
4677 return false;
4678 }
4679
4680 bool _backtrace(int rval) {
4681 // set up basic result reporting and make sure we got the data
4682 results->performed_validation = true; // at least, some of it!
4683 results->backtrace.checked = true;
4684
4685 const int64_t pool = in->get_backtrace_pool();
4686 inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
4687 in->build_backtrace(pool, memory_backtrace);
4688 bool equivalent, divergent;
4689 int memory_newer;
4690
4691 MDCache *mdcache = in->mdcache; // For the benefit of dout
f67539c2 4692 auto ino = [this]() { return in->ino(); }; // For the benefit of dout
7c673cae
FG
4693
4694 // Ignore rval because it's the result of a FAILOK operation
4695 // from fetch_backtrace_and_tag: the real result is in
4696 // backtrace.ondisk_read_retval
4697 dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
4698 if (results->backtrace.ondisk_read_retval != 0) {
4699 results->backtrace.error_str << "failed to read off disk; see retval";
e306af50
TL
4700 // we probably have a new unwritten file!
4701 // so skip the backtrace scrub for this entry and say that all's well
f67539c2
TL
4702 if (in->is_dirty_parent()) {
4703 dout(20) << "forcing backtrace as passed since inode is dirty parent" << dendl;
e306af50 4704 results->backtrace.passed = true;
f67539c2 4705 }
e306af50 4706 goto next;
7c673cae
FG
4707 }
4708
4709 // extract the backtrace, and compare it to a newly-constructed one
4710 try {
11fdf7f2
TL
4711 auto p = bl.cbegin();
4712 using ceph::decode;
4713 decode(results->backtrace.ondisk_value, p);
7c673cae
FG
4714 dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
4715 } catch (buffer::error&) {
4716 if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
4717 // Cases where something has clearly gone wrong with the overall
4718 // fetch op, though we didn't get a nonzero rc from the getxattr
4719 // operation. e.g. object missing.
4720 results->backtrace.ondisk_read_retval = rval;
4721 }
4722 results->backtrace.error_str << "failed to decode on-disk backtrace ("
4723 << bl.length() << " bytes)!";
e306af50
TL
4724 // we probably have a new unwritten file!
4725 // so skip the backtrace scrub for this entry and say that all's well
f67539c2
TL
4726 if (in->is_dirty_parent()) {
4727 dout(20) << "decode failed; forcing backtrace as passed since "
4728 "inode is dirty parent" << dendl;
e306af50 4729 results->backtrace.passed = true;
f67539c2 4730 }
e306af50 4731
7c673cae
FG
4732 goto next;
4733 }
4734
4735 memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
4736 &equivalent, &divergent);
4737
4738 if (divergent || memory_newer < 0) {
e306af50
TL
4739 // we're divergent, or on-disk version is newer
4740 results->backtrace.error_str << "On-disk backtrace is divergent or newer";
f67539c2
TL
4741 /* if the backtraces are divergent and the link count is 0, then
4742 * most likely its a stray entry that's being purged and things are
4743 * well and there's no reason for alarm
4744 */
4745 if (divergent && (in->is_dirty_parent() || in->get_inode()->nlink == 0)) {
e306af50 4746 results->backtrace.passed = true;
f67539c2
TL
4747 dout(20) << "divergent backtraces are acceptable when dn "
4748 "is being purged or has been renamed or moved to a "
4749 "different directory " << *in << dendl;
4750 }
7c673cae
FG
4751 } else {
4752 results->backtrace.passed = true;
4753 }
4754next:
4755
4756 if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
4757 std::string path;
4758 in->make_path_string(path);
d2e6a577
FG
4759 in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
4760 << "(" << path << "), rewriting it";
28e407b8 4761 in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
7c673cae 4762 false);
b32b8144
FG
4763 // Flag that we repaired this BT so that it won't go into damagetable
4764 results->backtrace.repaired = true;
f67539c2
TL
4765 if (in->mdcache->mds->logger)
4766 in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired);
7c673cae
FG
4767 }
4768
4769 // If the inode's number was free in the InoTable, fix that
4770 // (#15619)
4771 {
4772 InoTable *inotable = mdcache->mds->inotable;
4773
f67539c2 4774 dout(10) << "scrub: inotable ino = " << in->ino() << dendl;
7c673cae 4775 dout(10) << "scrub: inotable free says "
f67539c2 4776 << inotable->is_marked_free(in->ino()) << dendl;
7c673cae 4777
f67539c2 4778 if (inotable->is_marked_free(in->ino())) {
7c673cae 4779 LogChannelRef clog = in->mdcache->mds->clog;
f67539c2 4780 clog->error() << "scrub: inode wrongly marked free: " << in->ino();
7c673cae
FG
4781
4782 if (in->scrub_infop->header->get_repair()) {
f67539c2 4783 bool repaired = inotable->repair(in->ino());
7c673cae 4784 if (repaired) {
f67539c2 4785 clog->error() << "inode table repaired for inode: " << in->ino();
7c673cae
FG
4786
4787 inotable->save();
f67539c2
TL
4788 if (in->mdcache->mds->logger)
4789 in->mdcache->mds->logger->inc(l_mds_scrub_inotable_repaired);
7c673cae
FG
4790 } else {
4791 clog->error() << "Cannot repair inotable while other operations"
4792 " are in progress";
4793 }
4794 }
4795 }
4796 }
4797
7c673cae 4798
11fdf7f2 4799 if (in->is_dir()) {
f67539c2
TL
4800 if (in->mdcache->mds->logger)
4801 in->mdcache->mds->logger->inc(l_mds_scrub_dir_inodes);
11fdf7f2
TL
4802 return validate_directory_data();
4803 } else {
f67539c2
TL
4804 if (in->mdcache->mds->logger)
4805 in->mdcache->mds->logger->inc(l_mds_scrub_file_inodes);
11fdf7f2 4806 // TODO: validate on-disk inode for normal files
f67539c2 4807 return true;
11fdf7f2 4808 }
7c673cae
FG
4809 }
4810
4811 bool validate_directory_data() {
11fdf7f2 4812 ceph_assert(in->is_dir());
7c673cae
FG
4813
4814 if (in->is_base()) {
b32b8144
FG
4815 if (!shadow_in) {
4816 shadow_in = new CInode(in->mdcache);
f67539c2 4817 in->mdcache->create_unlinked_system_inode(shadow_in, in->ino(), in->get_inode()->mode);
b32b8144
FG
4818 in->mdcache->num_shadow_inodes++;
4819 }
7c673cae 4820 shadow_in->fetch(get_internal_callback(INODE));
f67539c2
TL
4821 if (in->mdcache->mds->logger)
4822 in->mdcache->mds->logger->inc(l_mds_scrub_dir_base_inodes);
7c673cae
FG
4823 return false;
4824 } else {
11fdf7f2 4825 // TODO: validate on-disk inode for non-base directories
f67539c2
TL
4826 if (in->mdcache->mds->logger)
4827 in->mdcache->mds->logger->inc(l_mds_scrub_dirfrag_rstats);
7c673cae 4828 results->inode.passed = true;
11fdf7f2 4829 return check_dirfrag_rstats();
7c673cae
FG
4830 }
4831 }
4832
4833 bool _inode_disk(int rval) {
f67539c2
TL
4834 const auto& si = shadow_in->get_inode();
4835 const auto& i = in->get_inode();
4836
7c673cae
FG
4837 results->inode.checked = true;
4838 results->inode.ondisk_read_retval = rval;
f67539c2
TL
4839 results->inode.ondisk_value = *si;
4840 results->inode.memory_value = *i;
7c673cae 4841
f67539c2 4842 if (si->version > i->version) {
7c673cae 4843 // uh, what?
11fdf7f2 4844 results->inode.error_str << "On-disk inode is newer than in-memory one; ";
7c673cae
FG
4845 goto next;
4846 } else {
4847 bool divergent = false;
f67539c2 4848 int r = i->compare(*si, &divergent);
7c673cae
FG
4849 results->inode.passed = !divergent && r >= 0;
4850 if (!results->inode.passed) {
4851 results->inode.error_str <<
11fdf7f2 4852 "On-disk inode is divergent or newer than in-memory one; ";
7c673cae
FG
4853 goto next;
4854 }
4855 }
4856next:
4857 return check_dirfrag_rstats();
4858 }
4859
4860 bool check_dirfrag_rstats() {
f67539c2
TL
4861 if (in->has_subtree_root_dirfrag()) {
4862 in->mdcache->rdlock_dirfrags_stats(in, get_internal_callback(DIRFRAGS));
4863 return false;
7c673cae 4864 } else {
f67539c2 4865 return immediate(DIRFRAGS, 0);
7c673cae
FG
4866 }
4867 }
4868
4869 bool _dirfrags(int rval) {
7c673cae
FG
4870 // basic reporting setup
4871 results->raw_stats.checked = true;
4872 results->raw_stats.ondisk_read_retval = rval;
4873
f67539c2
TL
4874 results->raw_stats.memory_value.dirstat = in->get_inode()->dirstat;
4875 results->raw_stats.memory_value.rstat = in->get_inode()->rstat;
7c673cae
FG
4876 frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
4877 nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;
4878
4879 if (rval != 0) {
4880 results->raw_stats.error_str << "Failed to read dirfrags off disk";
4881 goto next;
4882 }
4883
4884 // check each dirfrag...
94b18763
FG
4885 for (const auto &p : in->dirfrags) {
4886 CDir *dir = p.second;
11fdf7f2 4887 ceph_assert(dir->get_version() > 0);
f67539c2
TL
4888 nest_info.add(dir->get_fnode()->accounted_rstat);
4889 dir_info.add(dir->get_fnode()->accounted_fragstat);
7c673cae
FG
4890 }
4891 nest_info.rsubdirs++; // it gets one to account for self
11fdf7f2
TL
4892 if (const sr_t *srnode = in->get_projected_srnode(); srnode)
4893 nest_info.rsnaps += srnode->snaps.size();
4894
7c673cae 4895 // ...and that their sum matches our inode settings
f67539c2
TL
4896 if (!dir_info.same_sums(in->get_inode()->dirstat) ||
4897 !nest_info.same_sums(in->get_inode()->rstat)) {
11fdf7f2 4898 if (in->scrub_infop->header->get_repair()) {
7c673cae
FG
4899 results->raw_stats.error_str
4900 << "freshly-calculated rstats don't match existing ones (will be fixed)";
4901 in->mdcache->repair_inode_stats(in);
b32b8144 4902 results->raw_stats.repaired = true;
7c673cae
FG
4903 } else {
4904 results->raw_stats.error_str
4905 << "freshly-calculated rstats don't match existing ones";
4906 }
f67539c2
TL
4907 if (in->is_dirty()) {
4908 MDCache *mdcache = in->mdcache; // for dout()
4909 auto ino = [this]() { return in->ino(); }; // for dout()
4910 dout(20) << "raw stats most likely wont match since inode is dirty; "
4911 "please rerun scrub when system is stable; "
4912 "assuming passed for now;" << dendl;
4913 results->raw_stats.passed = true;
4914 }
7c673cae
FG
4915 goto next;
4916 }
7c673cae
FG
4917
4918 results->raw_stats.passed = true;
f67539c2
TL
4919 {
4920 MDCache *mdcache = in->mdcache; // for dout()
4921 auto ino = [this]() { return in->ino(); }; // for dout()
4922 dout(20) << "raw stats check passed on " << *in << dendl;
11fdf7f2 4923 }
11fdf7f2 4924
f67539c2 4925next:
7c673cae
FG
4926 return true;
4927 }
4928
4929 void _done() override {
4930 if ((!results->raw_stats.checked || results->raw_stats.passed) &&
4931 (!results->backtrace.checked || results->backtrace.passed) &&
4932 (!results->inode.checked || results->inode.passed))
11fdf7f2
TL
4933 results->passed_validation = true;
4934
4935 // Flag that we did some repair work so that our repair operation
4936 // can be flushed at end of scrub
4937 if (results->backtrace.repaired ||
4938 results->inode.repaired ||
4939 results->raw_stats.repaired)
4940 in->scrub_infop->header->set_repaired();
4941 if (fin)
4942 fin->complete(get_rval());
f67539c2
TL
4943
4944 in->auth_unpin(this);
7c673cae
FG
4945 }
4946 };
4947
4948
4949 dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
4950 ValidationContinuation *vc = new ValidationContinuation(this,
4951 results,
4952 fin);
4953 vc->begin();
4954}
4955
4956void CInode::validated_data::dump(Formatter *f) const
4957{
4958 f->open_object_section("results");
4959 {
4960 f->dump_bool("performed_validation", performed_validation);
4961 f->dump_bool("passed_validation", passed_validation);
4962 f->open_object_section("backtrace");
4963 {
4964 f->dump_bool("checked", backtrace.checked);
4965 f->dump_bool("passed", backtrace.passed);
4966 f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
4967 f->dump_stream("ondisk_value") << backtrace.ondisk_value;
4968 f->dump_stream("memoryvalue") << backtrace.memory_value;
4969 f->dump_string("error_str", backtrace.error_str.str());
4970 }
4971 f->close_section(); // backtrace
4972 f->open_object_section("raw_stats");
4973 {
4974 f->dump_bool("checked", raw_stats.checked);
4975 f->dump_bool("passed", raw_stats.passed);
4976 f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
4977 f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
4978 f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
f67539c2 4979 f->dump_stream("memory_value.dirstat") << raw_stats.memory_value.dirstat;
7c673cae
FG
4980 f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
4981 f->dump_string("error_str", raw_stats.error_str.str());
4982 }
4983 f->close_section(); // raw_stats
4984 // dump failure return code
4985 int rc = 0;
4986 if (backtrace.checked && backtrace.ondisk_read_retval)
4987 rc = backtrace.ondisk_read_retval;
4988 if (inode.checked && inode.ondisk_read_retval)
4989 rc = inode.ondisk_read_retval;
4990 if (raw_stats.checked && raw_stats.ondisk_read_retval)
4991 rc = raw_stats.ondisk_read_retval;
4992 f->dump_int("return_code", rc);
4993 }
4994 f->close_section(); // results
4995}
4996
b32b8144
FG
4997bool CInode::validated_data::all_damage_repaired() const
4998{
4999 bool unrepaired =
5000 (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired)
5001 ||
5002 (backtrace.checked && !backtrace.passed && !backtrace.repaired)
5003 ||
5004 (inode.checked && !inode.passed && !inode.repaired);
5005
5006 return !unrepaired;
5007}
5008
11fdf7f2
TL
5009void CInode::dump(Formatter *f, int flags) const
5010{
5011 if (flags & DUMP_PATH) {
5012 std::string path;
5013 make_path_string(path, true);
5014 if (path.empty())
5015 path = "/";
5016 f->dump_string("path", path);
5017 }
5018
5019 if (flags & DUMP_INODE_STORE_BASE)
5020 InodeStoreBase::dump(f);
5021
5022 if (flags & DUMP_MDS_CACHE_OBJECT)
5023 MDSCacheObject::dump(f);
5024
5025 if (flags & DUMP_LOCKS) {
5026 f->open_object_section("versionlock");
5027 versionlock.dump(f);
5028 f->close_section();
5029
5030 f->open_object_section("authlock");
5031 authlock.dump(f);
5032 f->close_section();
5033
5034 f->open_object_section("linklock");
5035 linklock.dump(f);
5036 f->close_section();
5037
5038 f->open_object_section("dirfragtreelock");
5039 dirfragtreelock.dump(f);
5040 f->close_section();
5041
5042 f->open_object_section("filelock");
5043 filelock.dump(f);
5044 f->close_section();
5045
5046 f->open_object_section("xattrlock");
5047 xattrlock.dump(f);
5048 f->close_section();
5049
5050 f->open_object_section("snaplock");
5051 snaplock.dump(f);
5052 f->close_section();
5053
5054 f->open_object_section("nestlock");
5055 nestlock.dump(f);
5056 f->close_section();
5057
5058 f->open_object_section("flocklock");
5059 flocklock.dump(f);
5060 f->close_section();
5061
5062 f->open_object_section("policylock");
5063 policylock.dump(f);
5064 f->close_section();
5065 }
5066
5067 if (flags & DUMP_STATE) {
5068 f->open_array_section("states");
5069 MDSCacheObject::dump_states(f);
5070 if (state_test(STATE_EXPORTING))
5071 f->dump_string("state", "exporting");
5072 if (state_test(STATE_OPENINGDIR))
5073 f->dump_string("state", "openingdir");
5074 if (state_test(STATE_FREEZING))
5075 f->dump_string("state", "freezing");
5076 if (state_test(STATE_FROZEN))
5077 f->dump_string("state", "frozen");
5078 if (state_test(STATE_AMBIGUOUSAUTH))
5079 f->dump_string("state", "ambiguousauth");
5080 if (state_test(STATE_EXPORTINGCAPS))
5081 f->dump_string("state", "exportingcaps");
5082 if (state_test(STATE_NEEDSRECOVER))
5083 f->dump_string("state", "needsrecover");
5084 if (state_test(STATE_PURGING))
5085 f->dump_string("state", "purging");
5086 if (state_test(STATE_DIRTYPARENT))
5087 f->dump_string("state", "dirtyparent");
5088 if (state_test(STATE_DIRTYRSTAT))
5089 f->dump_string("state", "dirtyrstat");
5090 if (state_test(STATE_STRAYPINNED))
5091 f->dump_string("state", "straypinned");
5092 if (state_test(STATE_FROZENAUTHPIN))
5093 f->dump_string("state", "frozenauthpin");
5094 if (state_test(STATE_DIRTYPOOL))
5095 f->dump_string("state", "dirtypool");
5096 if (state_test(STATE_ORPHAN))
5097 f->dump_string("state", "orphan");
5098 if (state_test(STATE_MISSINGOBJS))
5099 f->dump_string("state", "missingobjs");
7c673cae
FG
5100 f->close_section();
5101 }
7c673cae 5102
11fdf7f2
TL
5103 if (flags & DUMP_CAPS) {
5104 f->open_array_section("client_caps");
5105 for (const auto &p : client_caps) {
5106 auto &client = p.first;
5107 auto cap = &p.second;
5108 f->open_object_section("client_cap");
5109 f->dump_int("client_id", client.v);
5110 f->dump_string("pending", ccap_string(cap->pending()));
5111 f->dump_string("issued", ccap_string(cap->issued()));
5112 f->dump_string("wanted", ccap_string(cap->wanted()));
5113 f->dump_int("last_sent", cap->get_last_seq());
5114 f->close_section();
5115 }
5116 f->close_section();
5117
5118 f->dump_int("loner", loner_cap.v);
5119 f->dump_int("want_loner", want_loner_cap.v);
5120
5121 f->open_array_section("mds_caps_wanted");
5122 for (const auto &p : mds_caps_wanted) {
5123 f->open_object_section("mds_cap_wanted");
5124 f->dump_int("rank", p.first);
5125 f->dump_string("cap", ccap_string(p.second));
5126 f->close_section();
5127 }
5128 f->close_section();
5129 }
7c673cae 5130
11fdf7f2
TL
5131 if (flags & DUMP_DIRFRAGS) {
5132 f->open_array_section("dirfrags");
9f95a23c 5133 auto&& dfs = get_dirfrags();
11fdf7f2
TL
5134 for(const auto &dir: dfs) {
5135 f->open_object_section("dir");
5136 dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
5137 dir->check_rstats();
5138 f->close_section();
5139 }
7c673cae
FG
5140 f->close_section();
5141 }
7c673cae
FG
5142}
5143
5144/****** Scrub Stuff *****/
5145void CInode::scrub_info_create() const
5146{
5147 dout(25) << __func__ << dendl;
11fdf7f2 5148 ceph_assert(!scrub_infop);
7c673cae
FG
5149
5150 // break out of const-land to set up implicit initial state
5151 CInode *me = const_cast<CInode*>(this);
f67539c2 5152 const auto& pi = me->get_projected_inode();
7c673cae 5153
f67539c2
TL
5154 std::unique_ptr<scrub_info_t> si(new scrub_info_t());
5155 si->last_scrub_stamp = pi->last_scrub_stamp;
5156 si->last_scrub_version = pi->last_scrub_version;
7c673cae 5157
f67539c2 5158 me->scrub_infop.swap(si);
7c673cae
FG
5159}
5160
5161void CInode::scrub_maybe_delete_info()
5162{
5163 if (scrub_infop &&
5164 !scrub_infop->scrub_in_progress &&
5165 !scrub_infop->last_scrub_dirty) {
f67539c2 5166 scrub_infop.reset();
7c673cae
FG
5167 }
5168}
5169
f67539c2 5170void CInode::scrub_initialize(ScrubHeaderRef& header)
7c673cae
FG
5171{
5172 dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
7c673cae 5173
f67539c2 5174 scrub_info();
7c673cae 5175 scrub_infop->scrub_in_progress = true;
f67539c2 5176 scrub_infop->queued_frags.clear();
7c673cae 5177 scrub_infop->header = header;
f67539c2 5178 header->inc_num_pending();
7c673cae
FG
5179 // right now we don't handle remote inodes
5180}
5181
f67539c2 5182void CInode::scrub_aborted() {
11fdf7f2
TL
5183 dout(20) << __func__ << dendl;
5184 ceph_assert(scrub_is_in_progress());
5185
f67539c2
TL
5186 scrub_infop->scrub_in_progress = false;
5187 scrub_infop->header->dec_num_pending();
5188 scrub_maybe_delete_info();
11fdf7f2
TL
5189}
5190
f67539c2 5191void CInode::scrub_finished() {
7c673cae 5192 dout(20) << __func__ << dendl;
11fdf7f2 5193 ceph_assert(scrub_is_in_progress());
7c673cae 5194
f67539c2
TL
5195 scrub_infop->last_scrub_version = get_version();
5196 scrub_infop->last_scrub_stamp = ceph_clock_now();
7c673cae
FG
5197 scrub_infop->last_scrub_dirty = true;
5198 scrub_infop->scrub_in_progress = false;
f67539c2 5199 scrub_infop->header->dec_num_pending();
7c673cae
FG
5200}
5201
5202int64_t CInode::get_backtrace_pool() const
5203{
5204 if (is_dir()) {
b3b6e05e 5205 return mdcache->mds->get_metadata_pool();
7c673cae
FG
5206 } else {
5207 // Files are required to have an explicit layout that specifies
5208 // a pool
f67539c2
TL
5209 ceph_assert(get_inode()->layout.pool_id != -1);
5210 return get_inode()->layout.pool_id;
7c673cae
FG
5211 }
5212}
5213
f67539c2 5214void CInode::queue_export_pin(mds_rank_t export_pin)
31f18b77 5215{
31f18b77
FG
5216 if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
5217 return;
5218
f67539c2
TL
5219 mds_rank_t target;
5220 if (export_pin >= 0)
5221 target = export_pin;
5222 else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
5223 target = mdcache->hash_into_rank_bucket(ino());
5224 else
5225 target = MDS_RANK_NONE;
5226
5227 unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
31f18b77 5228 bool queue = false;
f6b5b4d7
TL
5229 for (auto& p : dirfrags) {
5230 CDir *dir = p.second;
31f18b77
FG
5231 if (!dir->is_auth())
5232 continue;
f67539c2
TL
5233
5234 if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
5235 if (dir->get_frag().bits() < min_frag_bits) {
5236 // needs split
5237 queue = true;
5238 break;
5239 }
5240 target = mdcache->hash_into_rank_bucket(ino(), dir->get_frag());
5241 }
5242
f6b5b4d7 5243 if (target != MDS_RANK_NONE) {
31f18b77
FG
5244 if (dir->is_subtree_root()) {
5245 // set auxsubtree bit or export it
5246 if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
f6b5b4d7 5247 target != dir->get_dir_auth().first)
31f18b77
FG
5248 queue = true;
5249 } else {
5250 // create aux subtree or export it
5251 queue = true;
7c673cae 5252 }
31f18b77
FG
5253 } else {
5254 // clear aux subtrees ?
5255 queue = dir->state_test(CDir::STATE_AUXSUBTREE);
5256 }
f67539c2
TL
5257
5258 if (queue)
31f18b77 5259 break;
f67539c2
TL
5260 }
5261 if (queue) {
5262 state_set(CInode::STATE_QUEUEDEXPORTPIN);
5263 mdcache->export_pin_queue.insert(this);
7c673cae
FG
5264 }
5265}
5266
f6b5b4d7
TL
5267void CInode::maybe_export_pin(bool update)
5268{
5269 if (!g_conf()->mds_bal_export_pin)
5270 return;
5271 if (!is_dir() || !is_normal())
5272 return;
5273
5274 dout(15) << __func__ << " update=" << update << " " << *this << dendl;
5275
f67539c2
TL
5276 mds_rank_t export_pin = get_export_pin(false);
5277 if (export_pin == MDS_RANK_NONE && !update)
f6b5b4d7 5278 return;
f6b5b4d7 5279
f67539c2 5280 check_pin_policy(export_pin);
f6b5b4d7
TL
5281 queue_export_pin(export_pin);
5282}
5283
f67539c2 5284void CInode::set_ephemeral_pin(bool dist, bool rand)
f6b5b4d7 5285{
f67539c2
TL
5286 unsigned state = 0;
5287 if (dist)
5288 state |= STATE_DISTEPHEMERALPIN;
5289 if (rand)
5290 state |= STATE_RANDEPHEMERALPIN;
5291 if (!state)
f6b5b4d7 5292 return;
f6b5b4d7 5293
f67539c2
TL
5294 if (state_test(state) != state) {
5295 dout(10) << "set ephemeral (" << (dist ? "dist" : "")
5296 << (rand ? " rand" : "") << ") pin on " << *this << dendl;
5297 if (!is_ephemerally_pinned()) {
5298 auto p = mdcache->export_ephemeral_pins.insert(this);
5299 ceph_assert(p.second);
f6b5b4d7 5300 }
f67539c2 5301 state_set(state);
f6b5b4d7
TL
5302 }
5303}
5304
f67539c2 5305void CInode::clear_ephemeral_pin(bool dist, bool rand)
f6b5b4d7 5306{
f67539c2
TL
5307 unsigned state = 0;
5308 if (dist)
5309 state |= STATE_DISTEPHEMERALPIN;
5310 if (rand)
5311 state |= STATE_RANDEPHEMERALPIN;
5312
5313 if (state_test(state)) {
5314 dout(10) << "clear ephemeral (" << (dist ? "dist" : "")
5315 << (rand ? " rand" : "") << ") pin on " << *this << dendl;
5316 state_clear(state);
5317 if (!is_ephemerally_pinned()) {
5318 auto count = mdcache->export_ephemeral_pins.erase(this);
f6b5b4d7 5319 ceph_assert(count == 1);
f6b5b4d7
TL
5320 }
5321 }
5322}
5323
f67539c2 5324void CInode::maybe_ephemeral_rand(double threshold)
f6b5b4d7
TL
5325{
5326 if (!mdcache->get_export_ephemeral_random_config()) {
5327 dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl;
f67539c2 5328 clear_ephemeral_pin(false, true);
f6b5b4d7
TL
5329 return;
5330 } else if (!is_dir() || !is_normal()) {
5331 dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl;
f67539c2 5332 clear_ephemeral_pin(false, true);
f6b5b4d7 5333 return;
f67539c2 5334 } else if (get_inode()->nlink == 0) {
f6b5b4d7 5335 dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl;
f67539c2 5336 clear_ephemeral_pin(false, true);
f6b5b4d7
TL
5337 return;
5338 } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) {
5339 dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl;
f67539c2 5340 queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
f6b5b4d7
TL
5341 return;
5342 }
5343
f91f0fd5
TL
5344 /* not precomputed? */
5345 if (threshold < 0.0) {
5346 threshold = get_ephemeral_rand();
5347 }
5348 if (threshold <= 0.0) {
5349 return;
5350 }
f6b5b4d7
TL
5351 double n = ceph::util::generate_random_number(0.0, 1.0);
5352
5353 dout(15) << __func__ << " rand " << n << " <?= " << threshold
5354 << " " << *this << dendl;
5355
5356 if (n <= threshold) {
5357 dout(10) << __func__ << " randomly export pinning " << *this << dendl;
f67539c2
TL
5358 set_ephemeral_pin(false, true);
5359 queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
f6b5b4d7
TL
5360 }
5361}
5362
5363void CInode::setxattr_ephemeral_rand(double probability)
5364{
5365 ceph_assert(is_dir());
f67539c2 5366 _get_projected_inode()->export_ephemeral_random_pin = probability;
f6b5b4d7
TL
5367}
5368
5369void CInode::setxattr_ephemeral_dist(bool val)
5370{
5371 ceph_assert(is_dir());
f67539c2 5372 _get_projected_inode()->export_ephemeral_distributed_pin = val;
f6b5b4d7
TL
5373}
5374
7c673cae
FG
5375void CInode::set_export_pin(mds_rank_t rank)
5376{
11fdf7f2 5377 ceph_assert(is_dir());
f67539c2
TL
5378 _get_projected_inode()->export_pin = rank;
5379 maybe_export_pin(true);
7c673cae
FG
5380}
5381
f67539c2 5382mds_rank_t CInode::get_export_pin(bool inherit) const
f6b5b4d7 5383{
f67539c2
TL
5384 if (!g_conf()->mds_bal_export_pin)
5385 return MDS_RANK_NONE;
f6b5b4d7 5386
7c673cae
FG
5387 /* An inode that is export pinned may not necessarily be a subtree root, we
5388 * need to traverse the parents. A base or system inode cannot be pinned.
5389 * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5390 * have a parent yet.
5391 */
f67539c2 5392 mds_rank_t r_target = MDS_RANK_NONE;
b32b8144 5393 const CInode *in = this;
f67539c2 5394 const CDir *dir = nullptr;
f6b5b4d7
TL
5395 while (true) {
5396 if (in->is_system())
5397 break;
5398 const CDentry *pdn = in->get_parent_dn();
5399 if (!pdn)
5400 break;
f67539c2 5401 if (in->get_inode()->nlink == 0) {
f6b5b4d7 5402 // ignore export pin for unlinked directory
f67539c2
TL
5403 break;
5404 }
5405
5406 if (in->get_inode()->export_pin >= 0) {
5407 return in->get_inode()->export_pin;
5408 } else if (in->get_inode()->export_ephemeral_distributed_pin &&
5409 mdcache->get_export_ephemeral_distributed_config()) {
5410 if (in != this)
5411 return mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
5412 return MDS_RANK_EPHEMERAL_DIST;
5413 } else if (r_target != MDS_RANK_NONE && in->get_inode()->export_ephemeral_random_pin > 0.0) {
5414 return r_target;
5415 } else if (r_target == MDS_RANK_NONE && in->is_ephemeral_rand() &&
5416 mdcache->get_export_ephemeral_random_config()) {
f6b5b4d7 5417 /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
f67539c2
TL
5418 if (!inherit)
5419 return MDS_RANK_EPHEMERAL_RAND;
5420 if (in == this)
5421 r_target = MDS_RANK_EPHEMERAL_RAND;
5422 else
5423 r_target = mdcache->hash_into_rank_bucket(in->ino());
f6b5b4d7
TL
5424 }
5425
f67539c2 5426 if (!inherit)
f6b5b4d7 5427 break;
f67539c2
TL
5428 dir = pdn->get_dir();
5429 in = dir->inode;
f6b5b4d7
TL
5430 }
5431 return MDS_RANK_NONE;
5432}
5433
f67539c2
TL
5434void CInode::check_pin_policy(mds_rank_t export_pin)
5435{
5436 if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
5437 set_ephemeral_pin(true, false);
5438 clear_ephemeral_pin(false, true);
5439 } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) {
5440 set_ephemeral_pin(false, true);
5441 clear_ephemeral_pin(true, false);
5442 } else if (is_ephemerally_pinned()) {
5443 // export_pin >= 0 || export_pin == MDS_RANK_NONE
5444 clear_ephemeral_pin(true, true);
5445 if (export_pin != get_inode()->export_pin) // inherited export_pin
5446 queue_export_pin(MDS_RANK_NONE);
5447 }
5448}
5449
5450double CInode::get_ephemeral_rand() const
f6b5b4d7
TL
5451{
5452 /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5453 * have a parent yet.
5454 */
5455 const CInode *in = this;
5456 double max = mdcache->export_ephemeral_random_max;
b32b8144
FG
5457 while (true) {
5458 if (in->is_system())
5459 break;
f64942e4 5460 const CDentry *pdn = in->get_parent_dn();
b32b8144
FG
5461 if (!pdn)
5462 break;
b32b8144 5463 // ignore export pin for unlinked directory
f67539c2 5464 if (in->get_inode()->nlink == 0)
b32b8144 5465 break;
f6b5b4d7 5466
f67539c2
TL
5467 if (in->get_inode()->export_ephemeral_random_pin > 0.0)
5468 return std::min(in->get_inode()->export_ephemeral_random_pin, max);
f6b5b4d7
TL
5469
5470 /* An export_pin overrides only if no closer parent (incl. this one) has a
5471 * random pin set.
5472 */
f67539c2
TL
5473 if (in->get_inode()->export_pin >= 0 ||
5474 in->get_inode()->export_ephemeral_distributed_pin)
f6b5b4d7 5475 return 0.0;
b32b8144 5476
b32b8144 5477 in = pdn->get_dir()->inode;
7c673cae 5478 }
f6b5b4d7 5479 return 0.0;
7c673cae
FG
5480}
5481
9f95a23c
TL
5482void CInode::get_nested_dirfrags(std::vector<CDir*>& v) const
5483{
5484 for (const auto &p : dirfrags) {
5485 const auto& dir = p.second;
5486 if (!dir->is_subtree_root())
5487 v.push_back(dir);
5488 }
5489}
5490
5491void CInode::get_subtree_dirfrags(std::vector<CDir*>& v) const
5492{
5493 for (const auto &p : dirfrags) {
5494 const auto& dir = p.second;
5495 if (dir->is_subtree_root())
5496 v.push_back(dir);
5497 }
5498}
5499
181888fb 5500MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);