]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/CInode.cc
import ceph quincy 17.2.4
[ceph.git] / ceph / src / mds / CInode.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "include/int_types.h"
16#include "common/errno.h"
17
18#include <string>
7c673cae
FG
19
20#include "CInode.h"
21#include "CDir.h"
22#include "CDentry.h"
23
24#include "MDSRank.h"
25#include "MDCache.h"
26#include "MDLog.h"
27#include "Locker.h"
28#include "Mutation.h"
29
30#include "events/EUpdate.h"
31
32#include "osdc/Objecter.h"
33
34#include "snap.h"
35
36#include "LogSegment.h"
37
38#include "common/Clock.h"
39
7c673cae
FG
40#include "common/config.h"
41#include "global/global_context.h"
11fdf7f2 42#include "include/ceph_assert.h"
7c673cae
FG
43
44#include "mds/MDSContinuation.h"
45#include "mds/InoTable.h"
11fdf7f2 46#include "cephfs_features.h"
f67539c2 47#include "osdc/Objecter.h"
7c673cae
FG
48
49#define dout_context g_ceph_context
50#define dout_subsys ceph_subsys_mds
51#undef dout_prefix
f67539c2
TL
52#define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "
53
20effc67
TL
54using namespace std;
55
f67539c2
TL
56void CInodeCommitOperation::update(ObjectOperation &op, inode_backtrace_t &bt) {
57 using ceph::encode;
58
59 op.priority = priority;
60 op.create(false);
61
62 bufferlist parent_bl;
63 encode(bt, parent_bl);
64 op.setxattr("parent", parent_bl);
7c673cae 65
20effc67
TL
66 // for the old pool there is no need to update the layout and symlink
67 if (!update_layout_symlink)
f67539c2
TL
68 return;
69
70 bufferlist layout_bl;
71 encode(_layout, layout_bl, _features);
72 op.setxattr("layout", layout_bl);
20effc67
TL
73
74 if (!_symlink.empty()) {
75 bufferlist symlink_bl;
76 encode(_symlink, symlink_bl);
77 op.setxattr("symlink", symlink_bl);
78 }
f67539c2 79}
7c673cae
FG
80
81class CInodeIOContext : public MDSIOContextBase
82{
83protected:
84 CInode *in;
85 MDSRank *get_mds() override {return in->mdcache->mds;}
86public:
87 explicit CInodeIOContext(CInode *in_) : in(in_) {
11fdf7f2 88 ceph_assert(in != NULL);
7c673cae
FG
89 }
90};
91
11fdf7f2 92sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1;
7c673cae
FG
93
94LockType CInode::versionlock_type(CEPH_LOCK_IVERSION);
95LockType CInode::authlock_type(CEPH_LOCK_IAUTH);
96LockType CInode::linklock_type(CEPH_LOCK_ILINK);
97LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT);
98LockType CInode::filelock_type(CEPH_LOCK_IFILE);
99LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR);
100LockType CInode::snaplock_type(CEPH_LOCK_ISNAP);
101LockType CInode::nestlock_type(CEPH_LOCK_INEST);
102LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK);
103LockType CInode::policylock_type(CEPH_LOCK_IPOLICY);
104
9f95a23c
TL
105std::string_view CInode::pin_name(int p) const
106{
107 switch (p) {
108 case PIN_DIRFRAG: return "dirfrag";
109 case PIN_CAPS: return "caps";
110 case PIN_IMPORTING: return "importing";
111 case PIN_OPENINGDIR: return "openingdir";
112 case PIN_REMOTEPARENT: return "remoteparent";
113 case PIN_BATCHOPENJOURNAL: return "batchopenjournal";
114 case PIN_SCATTERED: return "scattered";
115 case PIN_STICKYDIRS: return "stickydirs";
116 //case PIN_PURGING: return "purging";
117 case PIN_FREEZING: return "freezing";
118 case PIN_FROZEN: return "frozen";
119 case PIN_IMPORTINGCAPS: return "importingcaps";
120 case PIN_EXPORTINGCAPS: return "exportingcaps";
121 case PIN_PASTSNAPPARENT: return "pastsnapparent";
122 case PIN_OPENINGSNAPPARENTS: return "openingsnapparents";
123 case PIN_TRUNCATING: return "truncating";
124 case PIN_STRAY: return "stray";
125 case PIN_NEEDSNAPFLUSH: return "needsnapflush";
126 case PIN_DIRTYRSTAT: return "dirtyrstat";
127 case PIN_DIRTYPARENT: return "dirtyparent";
128 case PIN_DIRWAITER: return "dirwaiter";
9f95a23c
TL
129 default: return generic_pin_name(p);
130 }
131}
132
7c673cae
FG
133//int cinode_pins[CINODE_NUM_PINS]; // counts
134ostream& CInode::print_db_line_prefix(ostream& out)
135{
f67539c2 136 return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") ";
7c673cae
FG
137}
138
139/*
140 * write caps and lock ids
141 */
142struct cinode_lock_info_t cinode_lock_info[] = {
143 { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR },
144 { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL },
145 { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL },
146 { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL },
147};
148int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]);
149
7c673cae
FG
150ostream& operator<<(ostream& out, const CInode& in)
151{
152 string path;
153 in.make_path_string(path, true);
154
f67539c2 155 out << "[inode " << in.ino();
7c673cae
FG
156 out << " ["
157 << (in.is_multiversion() ? "...":"")
158 << in.first << "," << in.last << "]";
159 out << " " << path << (in.is_dir() ? "/":"");
160
161 if (in.is_auth()) {
162 out << " auth";
163 if (in.is_replicated())
164 out << in.get_replicas();
165 } else {
166 mds_authority_t a = in.authority();
167 out << " rep@" << a.first;
168 if (a.second != CDIR_AUTH_UNKNOWN)
169 out << "," << a.second;
170 out << "." << in.get_replica_nonce();
171 }
172
173 if (in.is_symlink())
174 out << " symlink='" << in.symlink << "'";
175 if (in.is_dir() && !in.dirfragtree.empty())
176 out << " " << in.dirfragtree;
177
178 out << " v" << in.get_version();
179 if (in.get_projected_version() > in.get_version())
180 out << " pv" << in.get_projected_version();
181
11fdf7f2
TL
182 if (in.get_num_auth_pins()) {
183 out << " ap=" << in.get_num_auth_pins();
7c673cae 184#ifdef MDS_AUTHPIN_SET
11fdf7f2 185 in.print_authpin_set(out);
7c673cae
FG
186#endif
187 }
188
189 if (in.snaprealm)
190 out << " snaprealm=" << in.snaprealm;
191
192 if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
f67539c2
TL
193 if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " NEEDSRECOVER";
194 if (in.state_test(CInode::STATE_RECOVERING)) out << " RECOVERING";
195 if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " DIRTYPARENT";
196 if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " MISSINGOBJS";
197 if (in.is_ephemeral_dist()) out << " DISTEPHEMERALPIN";
198 if (in.is_ephemeral_rand()) out << " RANDEPHEMERALPIN";
7c673cae
FG
199 if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
200 if (in.is_frozen_inode()) out << " FROZEN";
201 if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";
202
f67539c2 203 const auto& pi = in.get_projected_inode();
7c673cae
FG
204 if (pi->is_truncating())
205 out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")";
206
f67539c2
TL
207 if (in.is_dir()) {
208 out << " " << in.get_inode()->dirstat;
11fdf7f2 209 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
7c673cae
FG
210 out << "->" << pi->dirstat;
211 }
212 } else {
f67539c2
TL
213 out << " s=" << in.get_inode()->size;
214 if (in.get_inode()->nlink != 1)
215 out << " nl=" << in.get_inode()->nlink;
7c673cae
FG
216 }
217
218 // rstat
f67539c2
TL
219 out << " " << in.get_inode()->rstat;
220 if (!(in.get_inode()->rstat == in.get_inode()->accounted_rstat))
221 out << "/" << in.get_inode()->accounted_rstat;
11fdf7f2 222 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
7c673cae
FG
223 out << "->" << pi->rstat;
224 if (!(pi->rstat == pi->accounted_rstat))
225 out << "/" << pi->accounted_rstat;
226 }
227
f67539c2
TL
228 if (in.is_any_old_inodes()) {
229 out << " old_inodes=" << in.get_old_inodes()->size();
230 }
231
7c673cae
FG
232 if (!in.client_need_snapflush.empty())
233 out << " need_snapflush=" << in.client_need_snapflush;
234
7c673cae
FG
235 // locks
236 if (!in.authlock.is_sync_and_unlocked())
237 out << " " << in.authlock;
238 if (!in.linklock.is_sync_and_unlocked())
239 out << " " << in.linklock;
f67539c2 240 if (in.get_inode()->is_dir()) {
7c673cae
FG
241 if (!in.dirfragtreelock.is_sync_and_unlocked())
242 out << " " << in.dirfragtreelock;
243 if (!in.snaplock.is_sync_and_unlocked())
244 out << " " << in.snaplock;
245 if (!in.nestlock.is_sync_and_unlocked())
246 out << " " << in.nestlock;
247 if (!in.policylock.is_sync_and_unlocked())
248 out << " " << in.policylock;
249 } else {
250 if (!in.flocklock.is_sync_and_unlocked())
251 out << " " << in.flocklock;
252 }
253 if (!in.filelock.is_sync_and_unlocked())
254 out << " " << in.filelock;
255 if (!in.xattrlock.is_sync_and_unlocked())
256 out << " " << in.xattrlock;
257 if (!in.versionlock.is_sync_and_unlocked())
258 out << " " << in.versionlock;
259
260 // hack: spit out crap on which clients have caps
f67539c2
TL
261 if (in.get_inode()->client_ranges.size())
262 out << " cr=" << in.get_inode()->client_ranges;
7c673cae
FG
263
264 if (!in.get_client_caps().empty()) {
265 out << " caps={";
11fdf7f2
TL
266 bool first = true;
267 for (const auto &p : in.get_client_caps()) {
268 if (!first) out << ",";
269 out << p.first << "="
270 << ccap_string(p.second.pending());
271 if (p.second.issued() != p.second.pending())
272 out << "/" << ccap_string(p.second.issued());
273 out << "/" << ccap_string(p.second.wanted())
274 << "@" << p.second.get_last_seq();
275 first = false;
7c673cae
FG
276 }
277 out << "}";
278 if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) {
279 out << ",l=" << in.get_loner();
280 if (in.get_loner() != in.get_wanted_loner())
281 out << "(" << in.get_wanted_loner() << ")";
282 }
283 }
284 if (!in.get_mds_caps_wanted().empty()) {
285 out << " mcw={";
94b18763
FG
286 bool first = true;
287 for (const auto &p : in.get_mds_caps_wanted()) {
288 if (!first)
7c673cae 289 out << ',';
94b18763
FG
290 out << p.first << '=' << ccap_string(p.second);
291 first = false;
7c673cae
FG
292 }
293 out << '}';
294 }
295
296 if (in.get_num_ref()) {
297 out << " |";
298 in.print_pin_set(out);
299 }
300
f67539c2
TL
301 if (in.get_inode()->export_pin != MDS_RANK_NONE) {
302 out << " export_pin=" << in.get_inode()->export_pin;
7c673cae 303 }
f6b5b4d7
TL
304 if (in.state_test(CInode::STATE_DISTEPHEMERALPIN)) {
305 out << " distepin";
306 }
307 if (in.state_test(CInode::STATE_RANDEPHEMERALPIN)) {
308 out << " randepin";
309 }
7c673cae
FG
310
311 out << " " << &in;
312 out << "]";
313 return out;
314}
315
f67539c2
TL
316CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l) :
317 mdcache(c), first(f), last(l),
11fdf7f2
TL
318 item_dirty(this),
319 item_caps(this),
320 item_open_file(this),
321 item_dirty_parent(this),
322 item_dirty_dirfrag_dir(this),
323 item_dirty_dirfrag_nest(this),
324 item_dirty_dirfrag_dirfragtree(this),
325 pop(c->decayrate),
326 versionlock(this, &versionlock_type),
327 authlock(this, &authlock_type),
328 linklock(this, &linklock_type),
329 dirfragtreelock(this, &dirfragtreelock_type),
330 filelock(this, &filelock_type),
331 xattrlock(this, &xattrlock_type),
332 snaplock(this, &snaplock_type),
333 nestlock(this, &nestlock_type),
334 flocklock(this, &flocklock_type),
335 policylock(this, &policylock_type)
336{
f67539c2
TL
337 if (auth)
338 state_set(STATE_AUTH);
11fdf7f2 339}
7c673cae
FG
340
341void CInode::print(ostream& out)
342{
343 out << *this;
344}
345
7c673cae
FG
346void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
347{
11fdf7f2 348 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
7c673cae
FG
349
350 if (client_need_snapflush.empty()) {
351 get(CInode::PIN_NEEDSNAPFLUSH);
352
353 // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
354 // long periods waiting for clients to flush their snaps.
f67539c2 355 auth_pin(this); // pin head get_inode()->..
7c673cae
FG
356 }
357
94b18763 358 auto &clients = client_need_snapflush[snapid];
7c673cae
FG
359 if (clients.empty())
360 snapin->auth_pin(this); // ...and pin snapped/old inode!
361
362 clients.insert(client);
363}
364
365void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
366{
94b18763
FG
367 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
368 auto it = client_need_snapflush.find(snapid);
369 if (it == client_need_snapflush.end()) {
7c673cae
FG
370 dout(10) << " snapid not found" << dendl;
371 return;
372 }
94b18763
FG
373 size_t n = it->second.erase(client);
374 if (n == 0) {
7c673cae
FG
375 dout(10) << " client not found" << dendl;
376 return;
377 }
94b18763
FG
378 if (it->second.empty()) {
379 client_need_snapflush.erase(it);
7c673cae
FG
380 snapin->auth_unpin(this);
381
382 if (client_need_snapflush.empty()) {
383 put(CInode::PIN_NEEDSNAPFLUSH);
384 auth_unpin(this);
385 }
386 }
387}
388
494da23a 389pair<bool,bool> CInode::split_need_snapflush(CInode *cowin, CInode *in)
7c673cae 390{
11fdf7f2 391 dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
494da23a
TL
392 bool cowin_need_flush = false;
393 bool orig_need_flush = false;
394 auto it = client_need_snapflush.lower_bound(cowin->first);
395 while (it != client_need_snapflush.end() && it->first < in->first) {
11fdf7f2 396 ceph_assert(!it->second.empty());
94b18763 397 if (cowin->last >= it->first) {
7c673cae 398 cowin->auth_pin(this);
494da23a 399 cowin_need_flush = true;
94b18763
FG
400 ++it;
401 } else {
402 it = client_need_snapflush.erase(it);
403 }
7c673cae
FG
404 in->auth_unpin(this);
405 }
494da23a
TL
406
407 if (it != client_need_snapflush.end() && it->first <= in->last)
408 orig_need_flush = true;
409
410 return make_pair(cowin_need_flush, orig_need_flush);
7c673cae
FG
411}
412
413void CInode::mark_dirty_rstat()
414{
415 if (!state_test(STATE_DIRTYRSTAT)) {
11fdf7f2 416 dout(10) << __func__ << dendl;
7c673cae
FG
417 state_set(STATE_DIRTYRSTAT);
418 get(PIN_DIRTYRSTAT);
224ce89b
WB
419 CDentry *pdn = get_projected_parent_dn();
420 if (pdn->is_auth()) {
421 CDir *pdir = pdn->dir;
422 pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item);
423 mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock);
424 } else {
425 // under cross-MDS rename.
426 // DIRTYRSTAT flag will get cleared when rename finishes
11fdf7f2 427 ceph_assert(state_test(STATE_AMBIGUOUSAUTH));
224ce89b 428 }
7c673cae
FG
429 }
430}
431void CInode::clear_dirty_rstat()
432{
433 if (state_test(STATE_DIRTYRSTAT)) {
11fdf7f2 434 dout(10) << __func__ << dendl;
7c673cae
FG
435 state_clear(STATE_DIRTYRSTAT);
436 put(PIN_DIRTYRSTAT);
437 dirty_rstat_item.remove_myself();
438 }
439}
440
f67539c2
TL
441CInode::projected_inode CInode::project_inode(const MutationRef& mut,
442 bool xattr, bool snap)
94b18763 443{
f67539c2
TL
444 if (mut && mut->is_projected(this)) {
445 ceph_assert(!xattr && !snap);
446 auto _inode = std::const_pointer_cast<mempool_inode>(projected_nodes.back().inode);
447 return projected_inode(std::move(_inode), xattr_map_ptr());
448 }
449
450 auto pi = allocate_inode(*get_projected_inode());
7c673cae
FG
451
452 if (scrub_infop && scrub_infop->last_scrub_dirty) {
f67539c2
TL
453 pi->last_scrub_stamp = scrub_infop->last_scrub_stamp;
454 pi->last_scrub_version = scrub_infop->last_scrub_version;
7c673cae
FG
455 scrub_infop->last_scrub_dirty = false;
456 scrub_maybe_delete_info();
457 }
94b18763 458
f67539c2
TL
459 const auto& ox = get_projected_xattrs();
460 xattr_map_ptr px;
94b18763 461 if (xattr) {
f67539c2
TL
462 px = allocate_xattr_map();
463 if (ox)
464 *px = *ox;
94b18763
FG
465 }
466
f67539c2 467 sr_t* ps = projected_inode::UNDEF_SRNODE;
94b18763 468 if (snap) {
f67539c2
TL
469 ps = prepare_new_srnode(0);
470 ++num_projected_srnodes;
94b18763
FG
471 }
472
f67539c2
TL
473 projected_nodes.emplace_back(pi, xattr ? px : ox , ps);
474 if (mut)
475 mut->add_projected_node(this);
476 dout(15) << __func__ << " " << pi->ino << dendl;
477 return projected_inode(std::move(pi), std::move(px), ps);
7c673cae
FG
478}
479
f67539c2 480void CInode::pop_and_dirty_projected_inode(LogSegment *ls, const MutationRef& mut)
7c673cae 481{
11fdf7f2 482 ceph_assert(!projected_nodes.empty());
f67539c2
TL
483 auto front = std::move(projected_nodes.front());
484 dout(15) << __func__ << " v" << front.inode->version << dendl;
f6b5b4d7 485
f67539c2
TL
486 projected_nodes.pop_front();
487 if (mut)
488 mut->remove_projected_node(this);
7c673cae 489
f67539c2
TL
490 bool pool_updated = get_inode()->layout.pool_id != front.inode->layout.pool_id;
491 bool pin_updated = (get_inode()->export_pin != front.inode->export_pin) ||
492 (get_inode()->export_ephemeral_distributed_pin !=
493 front.inode->export_ephemeral_distributed_pin);
7c673cae 494
f67539c2
TL
495 reset_inode(std::move(front.inode));
496 if (front.xattrs != get_xattrs())
497 reset_xattrs(std::move(front.xattrs));
7c673cae 498
f67539c2 499 if (front.snapnode != projected_inode::UNDEF_SRNODE) {
7c673cae 500 --num_projected_srnodes;
f67539c2 501 pop_projected_snaprealm(front.snapnode, false);
7c673cae
FG
502 }
503
f67539c2
TL
504 mark_dirty(ls);
505 if (get_inode()->is_backtrace_updated())
506 mark_dirty_parent(ls, pool_updated);
7c673cae 507
f67539c2
TL
508 if (pin_updated)
509 maybe_export_pin(true);
9f95a23c
TL
510}
511
11fdf7f2
TL
512sr_t *CInode::prepare_new_srnode(snapid_t snapid)
513{
514 const sr_t *cur_srnode = get_projected_srnode();
515 sr_t *new_srnode;
516
517 if (cur_srnode) {
518 new_srnode = new sr_t(*cur_srnode);
11fdf7f2
TL
519 } else {
520 if (snapid == 0)
521 snapid = mdcache->get_global_snaprealm()->get_newest_seq();
522 new_srnode = new sr_t();
523 new_srnode->seq = snapid;
524 new_srnode->created = snapid;
525 new_srnode->current_parent_since = get_oldest_snap();
526 }
527 return new_srnode;
528}
529
9f95a23c
TL
530const sr_t *CInode::get_projected_srnode() const {
531 if (num_projected_srnodes > 0) {
532 for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
533 if (it->snapnode != projected_inode::UNDEF_SRNODE)
534 return it->snapnode;
535 }
536 if (snaprealm)
537 return &snaprealm->srnode;
538 else
539 return NULL;
540}
541
11fdf7f2
TL
542void CInode::project_snaprealm(sr_t *new_srnode)
543{
544 dout(10) << __func__ << " " << new_srnode << dendl;
545 ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE);
546 projected_nodes.back().snapnode = new_srnode;
547 ++num_projected_srnodes;
548}
549
550void CInode::mark_snaprealm_global(sr_t *new_srnode)
551{
552 ceph_assert(!is_dir());
553 // 'last_destroyed' is no longer used, use it to store origin 'current_parent_since'
554 new_srnode->last_destroyed = new_srnode->current_parent_since;
555 new_srnode->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
556 new_srnode->mark_parent_global();
557}
558
559void CInode::clear_snaprealm_global(sr_t *new_srnode)
560{
561 // restore 'current_parent_since'
562 new_srnode->current_parent_since = new_srnode->last_destroyed;
563 new_srnode->last_destroyed = 0;
564 new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq();
565 new_srnode->clear_parent_global();
566}
567
568bool CInode::is_projected_snaprealm_global() const
569{
570 const sr_t *srnode = get_projected_srnode();
571 if (srnode && srnode->is_parent_global())
572 return true;
573 return false;
574}
575
576void CInode::project_snaprealm_past_parent(SnapRealm *newparent)
577{
578 sr_t *new_snap = project_snaprealm();
579 record_snaprealm_past_parent(new_snap, newparent);
580}
581
582
7c673cae
FG
583/* if newparent != parent, add parent to past_parents
584 if parent DNE, we need to find what the parent actually is and fill that in */
11fdf7f2 585void CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent)
7c673cae 586{
11fdf7f2 587 ceph_assert(!new_snap->is_parent_global());
7c673cae
FG
588 SnapRealm *oldparent;
589 if (!snaprealm) {
590 oldparent = find_snaprealm();
11fdf7f2 591 } else {
7c673cae 592 oldparent = snaprealm->parent;
11fdf7f2 593 }
7c673cae
FG
594
595 if (newparent != oldparent) {
596 snapid_t oldparentseq = oldparent->get_newest_seq();
11fdf7f2
TL
597 if (oldparentseq + 1 > new_snap->current_parent_since) {
598 // copy old parent's snaps
599 const set<snapid_t>& snaps = oldparent->get_snaps();
600 auto p = snaps.lower_bound(new_snap->current_parent_since);
601 if (p != snaps.end())
602 new_snap->past_parent_snaps.insert(p, snaps.end());
603 if (oldparentseq > new_snap->seq)
604 new_snap->seq = oldparentseq;
7c673cae 605 }
11fdf7f2 606 new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae
FG
607 }
608}
609
adb31ebb 610void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *oldparent,
11fdf7f2 611 CDentry *dn, bool primary_dn)
7c673cae 612{
11fdf7f2 613 ceph_assert(new_snap->is_parent_global());
adb31ebb
TL
614
615 if (!oldparent)
616 oldparent = dn->get_dir()->inode->find_snaprealm();
11fdf7f2
TL
617 auto& snaps = oldparent->get_snaps();
618
619 if (!primary_dn) {
620 auto p = snaps.lower_bound(dn->first);
621 if (p != snaps.end())
622 new_snap->past_parent_snaps.insert(p, snaps.end());
adb31ebb 623 } else {
11fdf7f2
TL
624 // 'last_destroyed' is used as 'current_parent_since'
625 auto p = snaps.lower_bound(new_snap->last_destroyed);
626 if (p != snaps.end())
627 new_snap->past_parent_snaps.insert(p, snaps.end());
628 new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
629 }
630}
7c673cae 631
11fdf7f2
TL
632void CInode::early_pop_projected_snaprealm()
633{
634 ceph_assert(!projected_nodes.empty());
635 if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) {
636 pop_projected_snaprealm(projected_nodes.front().snapnode, true);
637 projected_nodes.front().snapnode = projected_inode::UNDEF_SRNODE;
638 --num_projected_srnodes;
7c673cae 639 }
11fdf7f2 640}
7c673cae 641
11fdf7f2
TL
642void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early)
643{
644 if (next_snaprealm) {
645 dout(10) << __func__ << (early ? " (early) " : " ")
646 << next_snaprealm << " seq " << next_snaprealm->seq << dendl;
f67539c2 647 if (!snaprealm)
11fdf7f2 648 open_snaprealm();
11fdf7f2 649
11fdf7f2
TL
650 auto old_flags = snaprealm->srnode.flags;
651 snaprealm->srnode = *next_snaprealm;
652 delete next_snaprealm;
7c673cae 653
11fdf7f2 654 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
11fdf7f2
TL
655 snaprealm->adjust_parent();
656 }
7c673cae 657
11fdf7f2
TL
658 if (snaprealm->parent)
659 dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl;
660 } else {
661 dout(10) << __func__ << (early ? " (early) null" : " null") << dendl;
662 ceph_assert(snaprealm);
663 snaprealm->merge_to(NULL);
664 }
7c673cae
FG
665}
666
667
668// ====== CInode =======
669
670// dirfrags
671
f67539c2
TL
672InodeStoreBase::inode_const_ptr InodeStoreBase::empty_inode = InodeStoreBase::allocate_inode();
673
11fdf7f2 674__u32 InodeStoreBase::hash_dentry_name(std::string_view dn)
7c673cae 675{
f67539c2 676 int which = inode->dir_layout.dl_dir_hash;
7c673cae
FG
677 if (!which)
678 which = CEPH_STR_HASH_LINUX;
11fdf7f2 679 ceph_assert(ceph_str_hash_valid(which));
7c673cae
FG
680 return ceph_str_hash(which, dn.data(), dn.length());
681}
682
11fdf7f2 683frag_t InodeStoreBase::pick_dirfrag(std::string_view dn)
7c673cae
FG
684{
685 if (dirfragtree.empty())
686 return frag_t(); // avoid the string hash if we can.
687
688 __u32 h = hash_dentry_name(dn);
689 return dirfragtree[h];
690}
691
9f95a23c 692std::pair<bool, std::vector<CDir*>> CInode::get_dirfrags_under(frag_t fg)
7c673cae 693{
9f95a23c
TL
694 std::pair<bool, std::vector<CDir*>> result;
695 auto& all = result.first;
696 auto& dirs = result.second;
697 all = false;
698
699 if (auto it = dirfrags.find(fg); it != dirfrags.end()){
700 all = true;
701 dirs.push_back(it->second);
702 return result;
7c673cae 703 }
9f95a23c
TL
704
705 int total = 0;
706 for(auto &[_fg, _dir] : dirfrags){
707 // frag_t.bits() can indicate the depth of the partition in the directory tree
708 // e.g.
709 // 01* : bit = 2, on the second floor
710 // *
711 // 0* 1*
712 // 00* 01* 10* 11* -- > level 2, bit = 2
713 // so fragA.bits > fragB.bits means fragA is deeper than fragB
714
715 if (fg.bits() >= _fg.bits()) {
716 if (_fg.contains(fg)) {
717 all = true;
718 return result;
719 }
720 } else {
721 if (fg.contains(_fg)) {
722 dirs.push_back(_dir);
723 // we can calculate how many sub slices a slice can be divided into
724 // frag_t(*) can be divided into two frags belonging to the first layer(0* 1*)
725 // or 2^2 frags belonging to the second layer(00* 01* 10* 11*)
726 // or (1 << (24 - frag_t(*).bits)) frags belonging to the 24th level
727 total += 1 << (24 - _fg.bits());
11fdf7f2 728 }
7c673cae 729 }
94b18763 730 }
7c673cae 731
9f95a23c
TL
732 // we convert all the frags into the frags of 24th layer to calculate whether all the frags are included in the memory cache
733 all = ((1<<(24-fg.bits())) == total);
734 return result;
7c673cae
FG
735}
736
737void CInode::verify_dirfrags()
738{
739 bool bad = false;
94b18763
FG
740 for (const auto &p : dirfrags) {
741 if (!dirfragtree.is_leaf(p.first)) {
742 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
743 << ": " << *p.second << dendl;
7c673cae
FG
744 bad = true;
745 }
746 }
11fdf7f2 747 ceph_assert(!bad);
7c673cae
FG
748}
749
750void CInode::force_dirfrags()
751{
752 bool bad = false;
94b18763
FG
753 for (auto &p : dirfrags) {
754 if (!dirfragtree.is_leaf(p.first)) {
755 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
756 << ": " << *p.second << dendl;
7c673cae
FG
757 bad = true;
758 }
759 }
760
761 if (bad) {
11fdf7f2 762 frag_vec_t leaves;
7c673cae 763 dirfragtree.get_leaves(leaves);
11fdf7f2
TL
764 for (const auto& leaf : leaves) {
765 mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true);
766 }
7c673cae
FG
767 }
768
769 verify_dirfrags();
770}
771
772CDir *CInode::get_approx_dirfrag(frag_t fg)
773{
774 CDir *dir = get_dirfrag(fg);
775 if (dir) return dir;
776
777 // find a child?
9f95a23c
TL
778 auto&& p = get_dirfrags_under(fg);
779 if (!p.second.empty())
780 return p.second.front();
7c673cae
FG
781
782 // try parents?
783 while (fg.bits() > 0) {
784 fg = fg.parent();
785 dir = get_dirfrag(fg);
786 if (dir) return dir;
787 }
788 return NULL;
789}
790
7c673cae
FG
791CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg)
792{
11fdf7f2 793 ceph_assert(is_dir());
7c673cae
FG
794
795 // have it?
796 CDir *dir = get_dirfrag(fg);
797 if (!dir) {
798 // create it.
11fdf7f2 799 ceph_assert(is_auth() || mdcache->mds->is_any_replay());
7c673cae
FG
800 dir = new CDir(this, fg, mdcache, is_auth());
801 add_dirfrag(dir);
802 }
803 return dir;
804}
805
806CDir *CInode::add_dirfrag(CDir *dir)
807{
11fdf7f2
TL
808 auto em = dirfrags.emplace(std::piecewise_construct, std::forward_as_tuple(dir->dirfrag().frag), std::forward_as_tuple(dir));
809 ceph_assert(em.second);
7c673cae
FG
810
811 if (stickydir_ref > 0) {
812 dir->state_set(CDir::STATE_STICKY);
813 dir->get(CDir::PIN_STICKY);
814 }
815
f67539c2 816 maybe_export_pin();
7c673cae
FG
817
818 return dir;
819}
820
821void CInode::close_dirfrag(frag_t fg)
822{
11fdf7f2
TL
823 dout(14) << __func__ << " " << fg << dendl;
824 ceph_assert(dirfrags.count(fg));
7c673cae
FG
825
826 CDir *dir = dirfrags[fg];
827 dir->remove_null_dentries();
828
829 // clear dirty flag
830 if (dir->is_dirty())
831 dir->mark_clean();
832
833 if (stickydir_ref > 0) {
834 dir->state_clear(CDir::STATE_STICKY);
835 dir->put(CDir::PIN_STICKY);
836 }
1adf2230
AA
837
838 if (dir->is_subtree_root())
839 num_subtree_roots--;
7c673cae
FG
840
841 // dump any remaining dentries, for debugging purposes
94b18763
FG
842 for (const auto &p : dir->items)
843 dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl;
7c673cae 844
11fdf7f2 845 ceph_assert(dir->get_num_ref() == 0);
7c673cae
FG
846 delete dir;
847 dirfrags.erase(fg);
848}
849
850void CInode::close_dirfrags()
851{
852 while (!dirfrags.empty())
853 close_dirfrag(dirfrags.begin()->first);
854}
855
856bool CInode::has_subtree_root_dirfrag(int auth)
857{
1adf2230
AA
858 if (num_subtree_roots > 0) {
859 if (auth == -1)
7c673cae 860 return true;
1adf2230
AA
861 for (const auto &p : dirfrags) {
862 if (p.second->is_subtree_root() &&
863 p.second->dir_auth.first == auth)
864 return true;
865 }
94b18763 866 }
7c673cae
FG
867 return false;
868}
869
870bool CInode::has_subtree_or_exporting_dirfrag()
871{
1adf2230
AA
872 if (num_subtree_roots > 0 || num_exporting_dirs > 0)
873 return true;
7c673cae
FG
874 return false;
875}
876
877void CInode::get_stickydirs()
878{
879 if (stickydir_ref == 0) {
880 get(PIN_STICKYDIRS);
94b18763
FG
881 for (const auto &p : dirfrags) {
882 p.second->state_set(CDir::STATE_STICKY);
883 p.second->get(CDir::PIN_STICKY);
7c673cae
FG
884 }
885 }
886 stickydir_ref++;
887}
888
889void CInode::put_stickydirs()
890{
11fdf7f2 891 ceph_assert(stickydir_ref > 0);
7c673cae
FG
892 stickydir_ref--;
893 if (stickydir_ref == 0) {
894 put(PIN_STICKYDIRS);
94b18763
FG
895 for (const auto &p : dirfrags) {
896 p.second->state_clear(CDir::STATE_STICKY);
897 p.second->put(CDir::PIN_STICKY);
7c673cae
FG
898 }
899 }
900}
901
902
903
904
905
906// pins
907
908void CInode::first_get()
909{
910 // pin my dentry?
911 if (parent)
912 parent->get(CDentry::PIN_INODEPIN);
913}
914
915void CInode::last_put()
916{
917 // unpin my dentry?
918 if (parent)
919 parent->put(CDentry::PIN_INODEPIN);
920}
921
922void CInode::_put()
923{
924 if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
925 mdcache->maybe_eval_stray(this, true);
926}
927
928void CInode::add_remote_parent(CDentry *p)
929{
930 if (remote_parents.empty())
931 get(PIN_REMOTEPARENT);
932 remote_parents.insert(p);
933}
934void CInode::remove_remote_parent(CDentry *p)
935{
936 remote_parents.erase(p);
937 if (remote_parents.empty())
938 put(PIN_REMOTEPARENT);
939}
940
941
942
943
944CDir *CInode::get_parent_dir()
945{
946 if (parent)
947 return parent->dir;
948 return NULL;
949}
950CDir *CInode::get_projected_parent_dir()
951{
952 CDentry *p = get_projected_parent_dn();
953 if (p)
954 return p->dir;
955 return NULL;
956}
957CInode *CInode::get_parent_inode()
958{
959 if (parent)
960 return parent->dir->inode;
961 return NULL;
962}
963
11fdf7f2 964bool CInode::is_ancestor_of(const CInode *other) const
7c673cae
FG
965{
966 while (other) {
967 if (other == this)
968 return true;
11fdf7f2
TL
969 const CDentry *pdn = other->get_oldest_parent_dn();
970 if (!pdn) {
971 ceph_assert(other->is_base());
7c673cae 972 break;
11fdf7f2
TL
973 }
974 other = pdn->get_dir()->get_inode();
975 }
976 return false;
977}
978
979bool CInode::is_projected_ancestor_of(const CInode *other) const
980{
981 while (other) {
982 if (other == this)
983 return true;
984 const CDentry *pdn = other->get_projected_parent_dn();
985 if (!pdn) {
986 ceph_assert(other->is_base());
987 break;
988 }
989 other = pdn->get_dir()->get_inode();
7c673cae
FG
990 }
991 return false;
992}
993
994/*
995 * Because a non-directory inode may have multiple links, the use_parent
996 * argument allows selecting which parent to use for path construction. This
997 * argument is only meaningful for the final component (i.e. the first of the
998 * nested calls) because directories cannot have multiple hard links. If
999 * use_parent is NULL and projected is true, the primary parent's projected
1000 * inode is used all the way up the path chain. Otherwise the primary parent
1001 * stable inode is used.
1002 */
1003void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const
1004{
1005 if (!use_parent) {
1006 use_parent = projected ? get_projected_parent_dn() : parent;
1007 }
1008
1009 if (use_parent) {
1010 use_parent->make_path_string(s, projected);
1011 } else if (is_root()) {
1012 s = "";
1013 } else if (is_mdsdir()) {
1014 char t[40];
1015 uint64_t eino(ino());
1016 eino -= MDS_INO_MDSDIR_OFFSET;
1017 snprintf(t, sizeof(t), "~mds%" PRId64, eino);
1018 s = t;
1019 } else {
1020 char n[40];
1021 uint64_t eino(ino());
1022 snprintf(n, sizeof(n), "#%" PRIx64, eino);
1023 s += n;
1024 }
1025}
1026
1027void CInode::make_path(filepath& fp, bool projected) const
1028{
1029 const CDentry *use_parent = projected ? get_projected_parent_dn() : parent;
1030 if (use_parent) {
11fdf7f2 1031 ceph_assert(!is_base());
7c673cae
FG
1032 use_parent->make_path(fp, projected);
1033 } else {
1034 fp = filepath(ino());
1035 }
1036}
1037
1038void CInode::name_stray_dentry(string& dname)
1039{
1040 char s[20];
f67539c2 1041 snprintf(s, sizeof(s), "%llx", (unsigned long long)ino().val);
7c673cae
FG
1042 dname = s;
1043}
1044
1045version_t CInode::pre_dirty()
1046{
1047 version_t pv;
1048 CDentry* _cdentry = get_projected_parent_dn();
1049 if (_cdentry) {
1050 pv = _cdentry->pre_dirty(get_projected_version());
f67539c2 1051 dout(10) << "pre_dirty " << pv << " (current v " << get_inode()->version << ")" << dendl;
7c673cae 1052 } else {
11fdf7f2 1053 ceph_assert(is_base());
7c673cae
FG
1054 pv = get_projected_version() + 1;
1055 }
94b18763 1056 // force update backtrace for old format inode (see mempool_inode::decode)
f67539c2
TL
1057 if (get_inode()->backtrace_version == 0 && !projected_nodes.empty()) {
1058 auto pi = _get_projected_inode();
1059 if (pi->backtrace_version == 0)
1060 pi->update_backtrace(pv);
7c673cae
FG
1061 }
1062 return pv;
1063}
1064
1065void CInode::_mark_dirty(LogSegment *ls)
1066{
1067 if (!state_test(STATE_DIRTY)) {
1068 state_set(STATE_DIRTY);
1069 get(PIN_DIRTY);
11fdf7f2 1070 ceph_assert(ls);
7c673cae
FG
1071 }
1072
1073 // move myself to this segment's dirty list
1074 if (ls)
1075 ls->dirty_inodes.push_back(&item_dirty);
1076}
1077
f67539c2 1078void CInode::mark_dirty(LogSegment *ls) {
7c673cae 1079
11fdf7f2 1080 dout(10) << __func__ << " " << *this << dendl;
7c673cae
FG
1081
1082 /*
1083 NOTE: I may already be dirty, but this fn _still_ needs to be called so that
1084 the directory is (perhaps newly) dirtied, and so that parent_dir_version is
1085 updated below.
1086 */
1087
1088 // only auth can get dirty. "dirty" async data in replicas is relative to
1089 // filelock state, not the dirty flag.
11fdf7f2 1090 ceph_assert(is_auth());
7c673cae
FG
1091
1092 // touch my private version
7c673cae
FG
1093 _mark_dirty(ls);
1094
1095 // mark dentry too
1096 if (parent)
f67539c2 1097 parent->mark_dirty(get_version(), ls);
7c673cae
FG
1098}
1099
1100
1101void CInode::mark_clean()
1102{
11fdf7f2 1103 dout(10) << __func__ << " " << *this << dendl;
7c673cae
FG
1104 if (state_test(STATE_DIRTY)) {
1105 state_clear(STATE_DIRTY);
1106 put(PIN_DIRTY);
1107
1108 // remove myself from ls dirty list
1109 item_dirty.remove_myself();
1110 }
1111}
1112
1113
1114// --------------
1115// per-inode storage
1116// (currently for root inode only)
1117
1118struct C_IO_Inode_Stored : public CInodeIOContext {
1119 version_t version;
1120 Context *fin;
1121 C_IO_Inode_Stored(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1122 void finish(int r) override {
1123 in->_stored(r, version, fin);
1124 }
91327a77
AA
1125 void print(ostream& out) const override {
1126 out << "inode_store(" << in->ino() << ")";
1127 }
7c673cae
FG
1128};
1129
11fdf7f2 1130object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix)
7c673cae
FG
1131{
1132 char n[60];
11fdf7f2
TL
1133 snprintf(n, sizeof(n), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)fg);
1134 ceph_assert(strlen(n) + suffix.size() < sizeof n);
1135 strncat(n, suffix.data(), suffix.size());
7c673cae
FG
1136 return object_t(n);
1137}
1138
11fdf7f2 1139void CInode::store(MDSContext *fin)
7c673cae 1140{
11fdf7f2
TL
1141 dout(10) << __func__ << " " << get_version() << dendl;
1142 ceph_assert(is_base());
7c673cae
FG
1143
1144 if (snaprealm)
1145 purge_stale_snap_data(snaprealm->get_snaps());
1146
1147 // encode
1148 bufferlist bl;
1149 string magic = CEPH_FS_ONDISK_MAGIC;
11fdf7f2
TL
1150 using ceph::encode;
1151 encode(magic, bl);
7c673cae
FG
1152 encode_store(bl, mdcache->mds->mdsmap->get_up_features());
1153
1154 // write it.
1155 SnapContext snapc;
1156 ObjectOperation m;
1157 m.write_full(bl);
1158
1159 object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode");
b3b6e05e 1160 object_locator_t oloc(mdcache->mds->get_metadata_pool());
7c673cae
FG
1161
1162 Context *newfin =
1163 new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin),
1164 mdcache->mds->finisher);
1165 mdcache->mds->objecter->mutate(oid, oloc, m, snapc,
1166 ceph::real_clock::now(), 0,
1167 newfin);
1168}
1169
1170void CInode::_stored(int r, version_t v, Context *fin)
1171{
1172 if (r < 0) {
1173 dout(1) << "store error " << r << " v " << v << " on " << *this << dendl;
d2e6a577
FG
1174 mdcache->mds->clog->error() << "failed to store inode " << ino()
1175 << " object: " << cpp_strerror(r);
7c673cae
FG
1176 mdcache->mds->handle_write_error(r);
1177 fin->complete(r);
1178 return;
1179 }
1180
11fdf7f2 1181 dout(10) << __func__ << " " << v << " on " << *this << dendl;
7c673cae
FG
1182 if (v == get_projected_version())
1183 mark_clean();
1184
1185 fin->complete(0);
1186}
1187
11fdf7f2 1188void CInode::flush(MDSContext *fin)
7c673cae 1189{
11fdf7f2
TL
1190 dout(10) << __func__ << " " << *this << dendl;
1191 ceph_assert(is_auth() && can_auth_pin());
7c673cae
FG
1192
1193 MDSGatherBuilder gather(g_ceph_context);
1194
1195 if (is_dirty_parent()) {
1196 store_backtrace(gather.new_sub());
1197 }
1198 if (is_dirty()) {
1199 if (is_base()) {
1200 store(gather.new_sub());
1201 } else {
1202 parent->dir->commit(0, gather.new_sub());
1203 }
1204 }
1205
1206 if (gather.has_subs()) {
1207 gather.set_finisher(fin);
1208 gather.activate();
1209 } else {
1210 fin->complete(0);
1211 }
1212}
1213
1214struct C_IO_Inode_Fetched : public CInodeIOContext {
1215 bufferlist bl, bl2;
1216 Context *fin;
1217 C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {}
1218 void finish(int r) override {
f67539c2 1219 // Ignore 'r', because we fetch from two places, so r is usually CEPHFS_ENOENT
7c673cae
FG
1220 in->_fetched(bl, bl2, fin);
1221 }
91327a77
AA
1222 void print(ostream& out) const override {
1223 out << "inode_fetch(" << in->ino() << ")";
1224 }
7c673cae
FG
1225};
1226
11fdf7f2 1227void CInode::fetch(MDSContext *fin)
7c673cae 1228{
11fdf7f2 1229 dout(10) << __func__ << dendl;
7c673cae
FG
1230
1231 C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin);
1232 C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, mdcache->mds->finisher));
1233
1234 object_t oid = CInode::get_object_name(ino(), frag_t(), "");
b3b6e05e 1235 object_locator_t oloc(mdcache->mds->get_metadata_pool());
7c673cae
FG
1236
1237 // Old on-disk format: inode stored in xattr of a dirfrag
1238 ObjectOperation rd;
1239 rd.getxattr("inode", &c->bl, NULL);
1240 mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub());
1241
1242 // Current on-disk format: inode stored in a .inode object
1243 object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode");
1244 mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub());
1245
1246 gather.activate();
1247}
1248
1249void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
1250{
11fdf7f2
TL
1251 dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl;
1252 bufferlist::const_iterator p;
7c673cae 1253 if (bl2.length()) {
11fdf7f2 1254 p = bl2.cbegin();
7c673cae 1255 } else if (bl.length()) {
11fdf7f2 1256 p = bl.cbegin();
7c673cae 1257 } else {
d2e6a577 1258 derr << "No data while reading inode " << ino() << dendl;
f67539c2 1259 fin->complete(-CEPHFS_ENOENT);
7c673cae
FG
1260 return;
1261 }
1262
11fdf7f2 1263 using ceph::decode;
7c673cae
FG
1264 // Attempt decode
1265 try {
1266 string magic;
11fdf7f2 1267 decode(magic, p);
7c673cae
FG
1268 dout(10) << " magic is '" << magic << "' (expecting '"
1269 << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
1270 if (magic != CEPH_FS_ONDISK_MAGIC) {
1271 dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
1272 << "'" << dendl;
f67539c2 1273 fin->complete(-CEPHFS_EINVAL);
7c673cae
FG
1274 } else {
1275 decode_store(p);
1276 dout(10) << "_fetched " << *this << dendl;
1277 fin->complete(0);
1278 }
1279 } catch (buffer::error &err) {
f67539c2
TL
1280 derr << "Corrupt inode " << ino() << ": " << err.what() << dendl;
1281 fin->complete(-CEPHFS_EINVAL);
7c673cae
FG
1282 return;
1283 }
1284}
1285
1286void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt)
1287{
f67539c2 1288 bt.ino = ino();
7c673cae
FG
1289 bt.ancestors.clear();
1290 bt.pool = pool;
1291
1292 CInode *in = this;
1293 CDentry *pdn = get_parent_dn();
1294 while (pdn) {
1295 CInode *diri = pdn->get_dir()->get_inode();
f67539c2 1296 bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), in->get_inode()->version));
7c673cae
FG
1297 in = diri;
1298 pdn = in->get_parent_dn();
1299 }
f67539c2
TL
1300 bt.old_pools.reserve(get_inode()->old_pools.size());
1301 for (auto &p : get_inode()->old_pools) {
7c673cae 1302 // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
94b18763 1303 if (p != pool)
f67539c2 1304 bt.old_pools.push_back(p);
7c673cae
FG
1305 }
1306}
1307
1308struct C_IO_Inode_StoredBacktrace : public CInodeIOContext {
1309 version_t version;
1310 Context *fin;
1311 C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1312 void finish(int r) override {
1313 in->_stored_backtrace(r, version, fin);
1314 }
91327a77
AA
1315 void print(ostream& out) const override {
1316 out << "backtrace_store(" << in->ino() << ")";
1317 }
7c673cae
FG
1318};
1319
f67539c2
TL
1320
1321void CInode::_commit_ops(int r, C_GatherBuilder &gather_bld,
1322 std::vector<CInodeCommitOperation> &ops_vec,
1323 inode_backtrace_t &bt)
1324{
1325 dout(10) << __func__ << dendl;
1326
1327 if (r < 0) {
1328 mdcache->mds->handle_write_error_with_lock(r);
1329 return;
1330 }
1331
1332 SnapContext snapc;
1333 object_t oid = get_object_name(ino(), frag_t(), "");
1334
1335 for (auto &op : ops_vec) {
1336 ObjectOperation obj_op;
1337 object_locator_t oloc(op.get_pool());
1338 op.update(obj_op, bt);
1339 mdcache->mds->objecter->mutate(oid, oloc, obj_op, snapc,
1340 ceph::real_clock::now(),
1341 0, gather_bld.new_sub());
1342 }
1343}
1344
1345void CInode::_store_backtrace(std::vector<CInodeCommitOperation> &ops_vec,
1346 inode_backtrace_t &bt, int op_prio)
7c673cae 1347{
11fdf7f2
TL
1348 dout(10) << __func__ << " on " << *this << dendl;
1349 ceph_assert(is_dirty_parent());
7c673cae
FG
1350
1351 if (op_prio < 0)
1352 op_prio = CEPH_MSG_PRIO_DEFAULT;
1353
1354 auth_pin(this);
1355
1356 const int64_t pool = get_backtrace_pool();
7c673cae 1357 build_backtrace(pool, bt);
7c673cae 1358
20effc67
TL
1359 std::string_view slink = "";
1360 if (is_symlink() && mdcache->get_symlink_recovery()) {
1361 slink = symlink;
1362 }
1363
f67539c2 1364 ops_vec.emplace_back(op_prio, pool, get_inode()->layout,
20effc67 1365 mdcache->mds->mdsmap->get_up_features(), slink);
7c673cae 1366
f67539c2 1367 if (!state_test(STATE_DIRTYPOOL) || get_inode()->old_pools.empty()) {
7c673cae 1368 dout(20) << __func__ << ": no dirtypool or no old pools" << dendl;
7c673cae
FG
1369 return;
1370 }
1371
7c673cae
FG
1372 // In the case where DIRTYPOOL is set, we update all old pools backtraces
1373 // such that anyone reading them will see the new pool ID in
1374 // inode_backtrace_t::pool and go read everything else from there.
f67539c2 1375 for (const auto &p : get_inode()->old_pools) {
94b18763 1376 if (p == pool)
7c673cae
FG
1377 continue;
1378
94b18763 1379 dout(20) << __func__ << ": updating old pool " << p << dendl;
7c673cae 1380
f67539c2 1381 ops_vec.emplace_back(op_prio, p);
7c673cae 1382 }
f67539c2
TL
1383}
1384
1385void CInode::store_backtrace(MDSContext *fin, int op_prio)
1386{
1387 std::vector<CInodeCommitOperation> ops_vec;
1388 inode_backtrace_t bt;
1389 auto version = get_inode()->backtrace_version;
1390
1391 _store_backtrace(ops_vec, bt, op_prio);
1392
1393 C_GatherBuilder gather(g_ceph_context,
1394 new C_OnFinisher(
1395 new C_IO_Inode_StoredBacktrace(this, version, fin),
1396 mdcache->mds->finisher));
1397 _commit_ops(0, gather, ops_vec, bt);
1398 ceph_assert(gather.has_subs());
7c673cae
FG
1399 gather.activate();
1400}
1401
f67539c2
TL
1402void CInode::store_backtrace(CInodeCommitOperations &op, int op_prio)
1403{
1404 op.version = get_inode()->backtrace_version;
1405 op.in = this;
1406
1407 _store_backtrace(op.ops_vec, op.bt, op_prio);
1408}
1409
7c673cae
FG
1410void CInode::_stored_backtrace(int r, version_t v, Context *fin)
1411{
f67539c2 1412 if (r == -CEPHFS_ENOENT) {
7c673cae
FG
1413 const int64_t pool = get_backtrace_pool();
1414 bool exists = mdcache->mds->objecter->with_osdmap(
1415 [pool](const OSDMap &osd_map) {
1416 return osd_map.have_pg_pool(pool);
1417 });
1418
f67539c2 1419 // This CEPHFS_ENOENT is because the pool doesn't exist (the user deleted it
7c673cae
FG
1420 // out from under us), so the backtrace can never be written, so pretend
1421 // to succeed so that the user can proceed to e.g. delete the file.
1422 if (!exists) {
f67539c2 1423 dout(4) << __func__ << " got CEPHFS_ENOENT: a data pool was deleted "
7c673cae
FG
1424 "beneath us!" << dendl;
1425 r = 0;
1426 }
1427 }
1428
1429 if (r < 0) {
1430 dout(1) << "store backtrace error " << r << " v " << v << dendl;
1431 mdcache->mds->clog->error() << "failed to store backtrace on ino "
1432 << ino() << " object"
1433 << ", pool " << get_backtrace_pool()
1434 << ", errno " << r;
1435 mdcache->mds->handle_write_error(r);
1436 if (fin)
1437 fin->complete(r);
1438 return;
1439 }
1440
11fdf7f2 1441 dout(10) << __func__ << " v " << v << dendl;
7c673cae
FG
1442
1443 auth_unpin(this);
f67539c2 1444 if (v == get_inode()->backtrace_version)
7c673cae
FG
1445 clear_dirty_parent();
1446 if (fin)
1447 fin->complete(0);
1448}
1449
1450void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
1451{
f67539c2 1452 mdcache->fetch_backtrace(ino(), get_backtrace_pool(), *backtrace, fin);
7c673cae
FG
1453}
1454
28e407b8 1455void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool)
7c673cae
FG
1456{
1457 if (!state_test(STATE_DIRTYPARENT)) {
11fdf7f2 1458 dout(10) << __func__ << dendl;
7c673cae
FG
1459 state_set(STATE_DIRTYPARENT);
1460 get(PIN_DIRTYPARENT);
11fdf7f2 1461 ceph_assert(ls);
7c673cae
FG
1462 }
1463 if (dirty_pool)
1464 state_set(STATE_DIRTYPOOL);
1465 if (ls)
1466 ls->dirty_parent_inodes.push_back(&item_dirty_parent);
1467}
1468
1469void CInode::clear_dirty_parent()
1470{
1471 if (state_test(STATE_DIRTYPARENT)) {
11fdf7f2 1472 dout(10) << __func__ << dendl;
7c673cae
FG
1473 state_clear(STATE_DIRTYPARENT);
1474 state_clear(STATE_DIRTYPOOL);
1475 put(PIN_DIRTYPARENT);
1476 item_dirty_parent.remove_myself();
1477 }
1478}
1479
1480void CInode::verify_diri_backtrace(bufferlist &bl, int err)
1481{
1482 if (is_base() || is_dirty_parent() || !is_auth())
1483 return;
1484
11fdf7f2 1485 dout(10) << __func__ << dendl;
7c673cae
FG
1486
1487 if (err == 0) {
1488 inode_backtrace_t backtrace;
11fdf7f2
TL
1489 using ceph::decode;
1490 decode(backtrace, bl);
7c673cae
FG
1491 CDentry *pdn = get_parent_dn();
1492 if (backtrace.ancestors.empty() ||
94b18763 1493 backtrace.ancestors[0].dname != pdn->get_name() ||
7c673cae 1494 backtrace.ancestors[0].dirino != pdn->get_dir()->ino())
f67539c2 1495 err = -CEPHFS_EINVAL;
7c673cae
FG
1496 }
1497
1498 if (err) {
1499 MDSRank *mds = mdcache->mds;
d2e6a577 1500 mds->clog->error() << "bad backtrace on directory inode " << ino();
11fdf7f2 1501 ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1));
7c673cae 1502
28e407b8 1503 mark_dirty_parent(mds->mdlog->get_current_segment(), false);
7c673cae
FG
1504 mds->mdlog->flush();
1505 }
1506}
1507
1508// ------------------
1509// parent dir
1510
1511
f67539c2
TL
1512void InodeStoreBase::encode_xattrs(bufferlist &bl) const {
1513 using ceph::encode;
1514 if (xattrs)
1515 encode(*xattrs, bl);
1516 else
1517 encode((__u32)0, bl);
1518}
1519
1520void InodeStoreBase::decode_xattrs(bufferlist::const_iterator &p) {
1521 using ceph::decode;
1522 mempool_xattr_map tmp;
1523 decode_noshare(tmp, p);
1524 if (tmp.empty()) {
1525 reset_xattrs(xattr_map_ptr());
1526 } else {
1527 reset_xattrs(allocate_xattr_map(std::move(tmp)));
1528 }
1529}
1530
1531void InodeStoreBase::encode_old_inodes(bufferlist &bl, uint64_t features) const {
1532 using ceph::encode;
1533 if (old_inodes)
1534 encode(*old_inodes, bl, features);
1535 else
1536 encode((__u32)0, bl);
1537}
1538
1539void InodeStoreBase::decode_old_inodes(bufferlist::const_iterator &p) {
1540 using ceph::decode;
1541 mempool_old_inode_map tmp;
1542 decode(tmp, p);
1543 if (tmp.empty()) {
1544 reset_old_inodes(old_inode_map_ptr());
1545 } else {
1546 reset_old_inodes(allocate_old_inode_map(std::move(tmp)));
1547 }
1548}
1549
7c673cae
FG
1550void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features,
1551 const bufferlist *snap_blob) const
1552{
11fdf7f2 1553 using ceph::encode;
f67539c2
TL
1554 encode(*inode, bl, features);
1555 if (inode->is_symlink())
11fdf7f2
TL
1556 encode(symlink, bl);
1557 encode(dirfragtree, bl);
f67539c2
TL
1558 encode_xattrs(bl);
1559
7c673cae 1560 if (snap_blob)
11fdf7f2 1561 encode(*snap_blob, bl);
7c673cae 1562 else
11fdf7f2 1563 encode(bufferlist(), bl);
f67539c2 1564 encode_old_inodes(bl, features);
11fdf7f2
TL
1565 encode(oldest_snap, bl);
1566 encode(damage_flags, bl);
7c673cae
FG
1567}
1568
1569void InodeStoreBase::encode(bufferlist &bl, uint64_t features,
1570 const bufferlist *snap_blob) const
1571{
1572 ENCODE_START(6, 4, bl);
1573 encode_bare(bl, features, snap_blob);
1574 ENCODE_FINISH(bl);
1575}
1576
1577void CInode::encode_store(bufferlist& bl, uint64_t features)
1578{
1579 bufferlist snap_blob;
1580 encode_snap_blob(snap_blob);
1581 InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(),
1582 &snap_blob);
1583}
1584
11fdf7f2 1585void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl,
7c673cae
FG
1586 bufferlist& snap_blob, __u8 struct_v)
1587{
11fdf7f2 1588 using ceph::decode;
f67539c2
TL
1589
1590 auto _inode = allocate_inode();
1591 decode(*_inode, bl);
1592
1593 if (_inode->is_symlink()) {
94b18763 1594 std::string tmp;
11fdf7f2
TL
1595 decode(tmp, bl);
1596 symlink = std::string_view(tmp);
94b18763 1597 }
11fdf7f2 1598 decode(dirfragtree, bl);
f67539c2 1599 decode_xattrs(bl);
11fdf7f2 1600 decode(snap_blob, bl);
7c673cae 1601
f67539c2
TL
1602 decode_old_inodes(bl);
1603 if (struct_v == 2 && _inode->is_dir()) {
7c673cae 1604 bool default_layout_exists;
11fdf7f2 1605 decode(default_layout_exists, bl);
7c673cae 1606 if (default_layout_exists) {
11fdf7f2 1607 decode(struct_v, bl); // this was a default_file_layout
f67539c2 1608 decode(_inode->layout, bl); // but we only care about the layout portion
7c673cae
FG
1609 }
1610 }
1611
1612 if (struct_v >= 5) {
1613 // InodeStore is embedded in dentries without proper versioning, so
1614 // we consume up to the end of the buffer
1615 if (!bl.end()) {
11fdf7f2 1616 decode(oldest_snap, bl);
7c673cae
FG
1617 }
1618
1619 if (!bl.end()) {
11fdf7f2 1620 decode(damage_flags, bl);
7c673cae
FG
1621 }
1622 }
f67539c2
TL
1623
1624 reset_inode(std::move(_inode));
7c673cae
FG
1625}
1626
1627
11fdf7f2 1628void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob)
7c673cae
FG
1629{
1630 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
1631 decode_bare(bl, snap_blob, struct_v);
1632 DECODE_FINISH(bl);
1633}
1634
11fdf7f2 1635void CInode::decode_store(bufferlist::const_iterator& bl)
7c673cae
FG
1636{
1637 bufferlist snap_blob;
1638 InodeStoreBase::decode(bl, snap_blob);
1639 decode_snap_blob(snap_blob);
1640}
1641
1642// ------------------
1643// locking
1644
9f95a23c
TL
1645SimpleLock* CInode::get_lock(int type)
1646{
1647 switch (type) {
1648 case CEPH_LOCK_IVERSION: return &versionlock;
1649 case CEPH_LOCK_IFILE: return &filelock;
1650 case CEPH_LOCK_IAUTH: return &authlock;
1651 case CEPH_LOCK_ILINK: return &linklock;
1652 case CEPH_LOCK_IDFT: return &dirfragtreelock;
1653 case CEPH_LOCK_IXATTR: return &xattrlock;
1654 case CEPH_LOCK_ISNAP: return &snaplock;
1655 case CEPH_LOCK_INEST: return &nestlock;
1656 case CEPH_LOCK_IFLOCK: return &flocklock;
1657 case CEPH_LOCK_IPOLICY: return &policylock;
1658 }
1659 return 0;
1660}
1661
7c673cae
FG
1662void CInode::set_object_info(MDSCacheObjectInfo &info)
1663{
1664 info.ino = ino();
1665 info.snapid = last;
1666}
1667
9f95a23c 1668void CInode::encode_lock_iauth(bufferlist& bl)
7c673cae 1669{
9f95a23c 1670 ENCODE_START(1, 1, bl);
f67539c2
TL
1671 encode(get_inode()->version, bl);
1672 encode(get_inode()->ctime, bl);
1673 encode(get_inode()->mode, bl);
1674 encode(get_inode()->uid, bl);
1675 encode(get_inode()->gid, bl);
9f95a23c
TL
1676 ENCODE_FINISH(bl);
1677}
7c673cae 1678
9f95a23c
TL
1679void CInode::decode_lock_iauth(bufferlist::const_iterator& p)
1680{
f67539c2
TL
1681 ceph_assert(!is_auth());
1682 auto _inode = allocate_inode(*get_inode());
9f95a23c 1683 DECODE_START(1, p);
f67539c2 1684 decode(_inode->version, p);
9f95a23c
TL
1685 utime_t tm;
1686 decode(tm, p);
f67539c2
TL
1687 if (_inode->ctime < tm) _inode->ctime = tm;
1688 decode(_inode->mode, p);
1689 decode(_inode->uid, p);
1690 decode(_inode->gid, p);
9f95a23c 1691 DECODE_FINISH(p);
f67539c2 1692 reset_inode(std::move(_inode));
9f95a23c
TL
1693}
1694
1695void CInode::encode_lock_ilink(bufferlist& bl)
1696{
1697 ENCODE_START(1, 1, bl);
f67539c2
TL
1698 encode(get_inode()->version, bl);
1699 encode(get_inode()->ctime, bl);
1700 encode(get_inode()->nlink, bl);
9f95a23c
TL
1701 ENCODE_FINISH(bl);
1702}
1703
1704void CInode::decode_lock_ilink(bufferlist::const_iterator& p)
1705{
f67539c2
TL
1706 ceph_assert(!is_auth());
1707 auto _inode = allocate_inode(*get_inode());
9f95a23c 1708 DECODE_START(1, p);
f67539c2 1709 decode(_inode->version, p);
9f95a23c
TL
1710 utime_t tm;
1711 decode(tm, p);
f67539c2
TL
1712 if (_inode->ctime < tm) _inode->ctime = tm;
1713 decode(_inode->nlink, p);
9f95a23c 1714 DECODE_FINISH(p);
f67539c2 1715 reset_inode(std::move(_inode));
9f95a23c
TL
1716}
1717
1718void CInode::encode_lock_idft(bufferlist& bl)
1719{
1720 ENCODE_START(1, 1, bl);
1721 if (is_auth()) {
f67539c2 1722 encode(get_inode()->version, bl);
9f95a23c
TL
1723 } else {
1724 // treat flushing as dirty when rejoining cache
1725 bool dirty = dirfragtreelock.is_dirty_or_flushing();
1726 encode(dirty, bl);
1727 }
1728 {
1729 // encode the raw tree
1730 encode(dirfragtree, bl);
1731
1732 // also specify which frags are mine
1733 set<frag_t> myfrags;
1734 auto&& dfls = get_dirfrags();
1735 for (const auto& dir : dfls) {
1736 if (dir->is_auth()) {
1737 frag_t fg = dir->get_frag();
1738 myfrags.insert(fg);
1739 }
1740 }
1741 encode(myfrags, bl);
1742 }
1743 ENCODE_FINISH(bl);
1744}
1745
1746void CInode::decode_lock_idft(bufferlist::const_iterator& p)
1747{
f67539c2
TL
1748 inode_ptr _inode;
1749
9f95a23c
TL
1750 DECODE_START(1, p);
1751 if (is_auth()) {
1752 bool replica_dirty;
1753 decode(replica_dirty, p);
1754 if (replica_dirty) {
1755 dout(10) << __func__ << " setting dftlock dirty flag" << dendl;
1756 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1757 }
1758 } else {
f67539c2
TL
1759 _inode = allocate_inode(*get_inode());
1760 decode(_inode->version, p);
9f95a23c
TL
1761 }
1762 {
1763 fragtree_t temp;
1764 decode(temp, p);
1765 set<frag_t> authfrags;
1766 decode(authfrags, p);
7c673cae 1767 if (is_auth()) {
9f95a23c
TL
1768 // auth. believe replica's auth frags only.
1769 for (auto fg : authfrags) {
1770 if (!dirfragtree.is_leaf(fg)) {
1771 dout(10) << " forcing frag " << fg << " to leaf (split|merge)" << dendl;
1772 dirfragtree.force_to_leaf(g_ceph_context, fg);
1773 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1774 }
1775 }
7c673cae 1776 } else {
9f95a23c
TL
1777 // replica. take the new tree, BUT make sure any open
1778 // dirfrags remain leaves (they may have split _after_ this
1779 // dft was scattered, or we may still be be waiting on the
1780 // notify from the auth)
1781 dirfragtree.swap(temp);
1782 for (const auto &p : dirfrags) {
1783 if (!dirfragtree.is_leaf(p.first)) {
1784 dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl;
1785 dirfragtree.force_to_leaf(g_ceph_context, p.first);
1786 }
1787 if (p.second->is_auth())
1788 p.second->state_clear(CDir::STATE_DIRTYDFT);
1789 }
7c673cae 1790 }
9f95a23c
TL
1791 if (g_conf()->mds_debug_frag)
1792 verify_dirfrags();
1793 }
1794 DECODE_FINISH(p);
f67539c2
TL
1795
1796 if (_inode)
1797 reset_inode(std::move(_inode));
9f95a23c
TL
1798}
1799
1800void CInode::encode_lock_ifile(bufferlist& bl)
1801{
1802 ENCODE_START(1, 1, bl);
1803 if (is_auth()) {
f67539c2
TL
1804 encode(get_inode()->version, bl);
1805 encode(get_inode()->ctime, bl);
1806 encode(get_inode()->mtime, bl);
1807 encode(get_inode()->atime, bl);
1808 encode(get_inode()->time_warp_seq, bl);
9f95a23c 1809 if (!is_dir()) {
f67539c2
TL
1810 encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features());
1811 encode(get_inode()->size, bl);
1812 encode(get_inode()->truncate_seq, bl);
1813 encode(get_inode()->truncate_size, bl);
1814 encode(get_inode()->client_ranges, bl);
1815 encode(get_inode()->inline_data, bl);
9f95a23c
TL
1816 }
1817 } else {
1818 // treat flushing as dirty when rejoining cache
1819 bool dirty = filelock.is_dirty_or_flushing();
1820 encode(dirty, bl);
1821 }
f67539c2
TL
1822 dout(15) << __func__ << " inode.dirstat is " << get_inode()->dirstat << dendl;
1823 encode(get_inode()->dirstat, bl); // only meaningful if i am auth.
9f95a23c
TL
1824 bufferlist tmp;
1825 __u32 n = 0;
1826 for (const auto &p : dirfrags) {
1827 frag_t fg = p.first;
1828 CDir *dir = p.second;
1829 if (is_auth() || dir->is_auth()) {
f67539c2 1830 const auto& pf = dir->get_projected_fnode();
9f95a23c
TL
1831 dout(15) << fg << " " << *dir << dendl;
1832 dout(20) << fg << " fragstat " << pf->fragstat << dendl;
1833 dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
1834 encode(fg, tmp);
1835 encode(dir->first, tmp);
1836 encode(pf->fragstat, tmp);
1837 encode(pf->accounted_fragstat, tmp);
1838 n++;
7c673cae 1839 }
9f95a23c
TL
1840 }
1841 encode(n, bl);
1842 bl.claim_append(tmp);
1843 ENCODE_FINISH(bl);
1844}
1845
1846void CInode::decode_lock_ifile(bufferlist::const_iterator& p)
1847{
f67539c2
TL
1848 inode_ptr _inode;
1849
9f95a23c
TL
1850 DECODE_START(1, p);
1851 if (!is_auth()) {
f67539c2
TL
1852 _inode = allocate_inode(*get_inode());
1853
1854 decode(_inode->version, p);
9f95a23c
TL
1855 utime_t tm;
1856 decode(tm, p);
f67539c2
TL
1857 if (_inode->ctime < tm) _inode->ctime = tm;
1858 decode(_inode->mtime, p);
1859 decode(_inode->atime, p);
1860 decode(_inode->time_warp_seq, p);
9f95a23c 1861 if (!is_dir()) {
f67539c2
TL
1862 decode(_inode->layout, p);
1863 decode(_inode->size, p);
1864 decode(_inode->truncate_seq, p);
1865 decode(_inode->truncate_size, p);
1866 decode(_inode->client_ranges, p);
1867 decode(_inode->inline_data, p);
9f95a23c
TL
1868 }
1869 } else {
1870 bool replica_dirty;
1871 decode(replica_dirty, p);
1872 if (replica_dirty) {
1873 dout(10) << __func__ << " setting filelock dirty flag" << dendl;
1874 filelock.mark_dirty(); // ok bc we're auth and caller will handle
1875 }
1876 }
1877
1878 frag_info_t dirstat;
1879 decode(dirstat, p);
1880 if (!is_auth()) {
1881 dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl;
f67539c2 1882 _inode->dirstat = dirstat; // take inode summation if replica
9f95a23c
TL
1883 }
1884 __u32 n;
1885 decode(n, p);
1886 dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
1887 while (n--) {
1888 frag_t fg;
1889 snapid_t fgfirst;
1890 frag_info_t fragstat;
1891 frag_info_t accounted_fragstat;
1892 decode(fg, p);
1893 decode(fgfirst, p);
1894 decode(fragstat, p);
1895 decode(accounted_fragstat, p);
1896 dout(10) << fg << " [" << fgfirst << ",head] " << dendl;
1897 dout(10) << fg << " fragstat " << fragstat << dendl;
1898 dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl;
1899
1900 CDir *dir = get_dirfrag(fg);
7c673cae 1901 if (is_auth()) {
9f95a23c
TL
1902 ceph_assert(dir); // i am auth; i had better have this dir open
1903 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1904 << " on " << *dir << dendl;
1905 dir->first = fgfirst;
f67539c2
TL
1906 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
1907 _fnode->fragstat = fragstat;
1908 _fnode->accounted_fragstat = accounted_fragstat;
1909 dir->reset_fnode(std::move(_fnode));
9f95a23c
TL
1910 if (!(fragstat == accounted_fragstat)) {
1911 dout(10) << fg << " setting filelock updated flag" << dendl;
1912 filelock.mark_dirty(); // ok bc we're auth and caller will handle
7c673cae
FG
1913 }
1914 } else {
9f95a23c
TL
1915 if (dir && dir->is_auth()) {
1916 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1917 << " on " << *dir << dendl;
1918 dir->first = fgfirst;
f67539c2 1919 const auto& pf = dir->get_projected_fnode();
9f95a23c 1920 finish_scatter_update(&filelock, dir,
f67539c2 1921 _inode->dirstat.version, pf->accounted_fragstat.version);
9f95a23c 1922 }
7c673cae 1923 }
9f95a23c
TL
1924 }
1925 DECODE_FINISH(p);
f67539c2
TL
1926
1927 if (_inode)
1928 reset_inode(std::move(_inode));
9f95a23c 1929}
7c673cae 1930
9f95a23c
TL
1931void CInode::encode_lock_inest(bufferlist& bl)
1932{
1933 ENCODE_START(1, 1, bl);
1934 if (is_auth()) {
f67539c2 1935 encode(get_inode()->version, bl);
9f95a23c
TL
1936 } else {
1937 // treat flushing as dirty when rejoining cache
1938 bool dirty = nestlock.is_dirty_or_flushing();
1939 encode(dirty, bl);
1940 }
f67539c2
TL
1941 dout(15) << __func__ << " inode.rstat is " << get_inode()->rstat << dendl;
1942 encode(get_inode()->rstat, bl); // only meaningful if i am auth.
9f95a23c
TL
1943 bufferlist tmp;
1944 __u32 n = 0;
1945 for (const auto &p : dirfrags) {
1946 frag_t fg = p.first;
1947 CDir *dir = p.second;
1948 if (is_auth() || dir->is_auth()) {
f67539c2 1949 const auto& pf = dir->get_projected_fnode();
9f95a23c
TL
1950 dout(10) << __func__ << " " << fg << " dir " << *dir << dendl;
1951 dout(10) << __func__ << " " << fg << " rstat " << pf->rstat << dendl;
1952 dout(10) << __func__ << " " << fg << " accounted_rstat " << pf->rstat << dendl;
1953 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
1954 encode(fg, tmp);
1955 encode(dir->first, tmp);
1956 encode(pf->rstat, tmp);
1957 encode(pf->accounted_rstat, tmp);
1958 encode(dir->dirty_old_rstat, tmp);
1959 n++;
7c673cae 1960 }
9f95a23c
TL
1961 }
1962 encode(n, bl);
1963 bl.claim_append(tmp);
1964 ENCODE_FINISH(bl);
1965}
7c673cae 1966
9f95a23c
TL
1967void CInode::decode_lock_inest(bufferlist::const_iterator& p)
1968{
f67539c2
TL
1969 inode_ptr _inode;
1970
9f95a23c
TL
1971 DECODE_START(1, p);
1972 if (is_auth()) {
1973 bool replica_dirty;
1974 decode(replica_dirty, p);
1975 if (replica_dirty) {
1976 dout(10) << __func__ << " setting nestlock dirty flag" << dendl;
1977 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
1978 }
1979 } else {
f67539c2
TL
1980 _inode = allocate_inode(*get_inode());
1981 decode(_inode->version, p);
9f95a23c
TL
1982 }
1983 nest_info_t rstat;
1984 decode(rstat, p);
1985 if (!is_auth()) {
1986 dout(10) << __func__ << " taking inode rstat " << rstat << " for " << *this << dendl;
f67539c2 1987 _inode->rstat = rstat; // take inode summation if replica
9f95a23c
TL
1988 }
1989 __u32 n;
1990 decode(n, p);
1991 while (n--) {
1992 frag_t fg;
1993 snapid_t fgfirst;
1994 nest_info_t rstat;
1995 nest_info_t accounted_rstat;
1996 decltype(CDir::dirty_old_rstat) dirty_old_rstat;
1997 decode(fg, p);
1998 decode(fgfirst, p);
1999 decode(rstat, p);
2000 decode(accounted_rstat, p);
2001 decode(dirty_old_rstat, p);
2002 dout(10) << __func__ << " " << fg << " [" << fgfirst << ",head]" << dendl;
2003 dout(10) << __func__ << " " << fg << " rstat " << rstat << dendl;
2004 dout(10) << __func__ << " " << fg << " accounted_rstat " << accounted_rstat << dendl;
2005 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dirty_old_rstat << dendl;
2006 CDir *dir = get_dirfrag(fg);
7c673cae 2007 if (is_auth()) {
9f95a23c
TL
2008 ceph_assert(dir); // i am auth; i had better have this dir open
2009 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
2010 << " on " << *dir << dendl;
2011 dir->first = fgfirst;
f67539c2
TL
2012 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
2013 _fnode->rstat = rstat;
2014 _fnode->accounted_rstat = accounted_rstat;
2015 dir->reset_fnode(std::move(_fnode));
9f95a23c
TL
2016 dir->dirty_old_rstat.swap(dirty_old_rstat);
2017 if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) {
2018 dout(10) << fg << " setting nestlock updated flag" << dendl;
2019 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
2020 }
7c673cae 2021 } else {
9f95a23c
TL
2022 if (dir && dir->is_auth()) {
2023 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
2024 << " on " << *dir << dendl;
2025 dir->first = fgfirst;
f67539c2 2026 const auto& pf = dir->get_projected_fnode();
9f95a23c 2027 finish_scatter_update(&nestlock, dir,
f67539c2 2028 _inode->rstat.version, pf->accounted_rstat.version);
7c673cae 2029 }
7c673cae 2030 }
9f95a23c
TL
2031 }
2032 DECODE_FINISH(p);
f67539c2
TL
2033
2034 if (_inode)
2035 reset_inode(std::move(_inode));
9f95a23c
TL
2036}
2037
2038void CInode::encode_lock_ixattr(bufferlist& bl)
2039{
2a845540 2040 ENCODE_START(2, 1, bl);
f67539c2
TL
2041 encode(get_inode()->version, bl);
2042 encode(get_inode()->ctime, bl);
2043 encode_xattrs(bl);
2a845540 2044 encode(get_inode()->xattr_version, bl);
9f95a23c
TL
2045 ENCODE_FINISH(bl);
2046}
2047
2048void CInode::decode_lock_ixattr(bufferlist::const_iterator& p)
2049{
f67539c2
TL
2050 ceph_assert(!is_auth());
2051 auto _inode = allocate_inode(*get_inode());
2a845540 2052 DECODE_START(2, p);
f67539c2 2053 decode(_inode->version, p);
9f95a23c
TL
2054 utime_t tm;
2055 decode(tm, p);
f67539c2
TL
2056 if (_inode->ctime < tm)
2057 _inode->ctime = tm;
2058 decode_xattrs(p);
2a845540
TL
2059 if (struct_v >= 2) {
2060 decode(_inode->xattr_version, p);
2061 }
9f95a23c 2062 DECODE_FINISH(p);
f67539c2 2063 reset_inode(std::move(_inode));
9f95a23c
TL
2064}
2065
2066void CInode::encode_lock_isnap(bufferlist& bl)
2067{
2068 ENCODE_START(1, 1, bl);
f67539c2
TL
2069 encode(get_inode()->version, bl);
2070 encode(get_inode()->ctime, bl);
9f95a23c
TL
2071 encode_snap(bl);
2072 ENCODE_FINISH(bl);
2073}
2074
2075void CInode::decode_lock_isnap(bufferlist::const_iterator& p)
2076{
f67539c2
TL
2077 ceph_assert(!is_auth());
2078 auto _inode = allocate_inode(*get_inode());
9f95a23c 2079 DECODE_START(1, p);
f67539c2 2080 decode(_inode->version, p);
9f95a23c
TL
2081 utime_t tm;
2082 decode(tm, p);
f67539c2 2083 if (_inode->ctime < tm) _inode->ctime = tm;
9f95a23c
TL
2084 decode_snap(p);
2085 DECODE_FINISH(p);
f67539c2 2086 reset_inode(std::move(_inode));
9f95a23c
TL
2087}
2088
2089void CInode::encode_lock_iflock(bufferlist& bl)
2090{
2091 ENCODE_START(1, 1, bl);
f67539c2 2092 encode(get_inode()->version, bl);
9f95a23c
TL
2093 _encode_file_locks(bl);
2094 ENCODE_FINISH(bl);
2095}
2096
2097void CInode::decode_lock_iflock(bufferlist::const_iterator& p)
2098{
f67539c2
TL
2099 ceph_assert(!is_auth());
2100 auto _inode = allocate_inode(*get_inode());
9f95a23c 2101 DECODE_START(1, p);
f67539c2 2102 decode(_inode->version, p);
9f95a23c
TL
2103 _decode_file_locks(p);
2104 DECODE_FINISH(p);
f67539c2 2105 reset_inode(std::move(_inode));
9f95a23c
TL
2106}
2107
2108void CInode::encode_lock_ipolicy(bufferlist& bl)
2109{
f6b5b4d7 2110 ENCODE_START(2, 1, bl);
f67539c2
TL
2111 if (is_dir()) {
2112 encode(get_inode()->version, bl);
2113 encode(get_inode()->ctime, bl);
2114 encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features());
2115 encode(get_inode()->quota, bl);
2116 encode(get_inode()->export_pin, bl);
2117 encode(get_inode()->export_ephemeral_distributed_pin, bl);
2118 encode(get_inode()->export_ephemeral_random_pin, bl);
9f95a23c
TL
2119 }
2120 ENCODE_FINISH(bl);
2121}
2122
2123void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p)
2124{
f67539c2
TL
2125 ceph_assert(!is_auth());
2126 auto _inode = allocate_inode(*get_inode());
2127 DECODE_START(1, p);
2128 if (is_dir()) {
2129 decode(_inode->version, p);
9f95a23c
TL
2130 utime_t tm;
2131 decode(tm, p);
f67539c2
TL
2132 if (_inode->ctime < tm)
2133 _inode->ctime = tm;
2134 decode(_inode->layout, p);
2135 decode(_inode->quota, p);
2136 decode(_inode->export_pin, p);
f6b5b4d7 2137 if (struct_v >= 2) {
f67539c2
TL
2138 decode(_inode->export_ephemeral_distributed_pin, p);
2139 decode(_inode->export_ephemeral_random_pin, p);
f6b5b4d7 2140 }
9f95a23c
TL
2141 }
2142 DECODE_FINISH(p);
f67539c2
TL
2143
2144 bool pin_updated = (get_inode()->export_pin != _inode->export_pin) ||
2145 (get_inode()->export_ephemeral_distributed_pin !=
2146 _inode->export_ephemeral_distributed_pin);
2147 reset_inode(std::move(_inode));
2148 maybe_export_pin(pin_updated);
9f95a23c
TL
2149}
2150
2151void CInode::encode_lock_state(int type, bufferlist& bl)
2152{
2153 ENCODE_START(1, 1, bl);
2154 encode(first, bl);
2155 if (!is_base())
2156 encode(parent->first, bl);
2157
2158 switch (type) {
2159 case CEPH_LOCK_IAUTH:
2160 encode_lock_iauth(bl);
2161 break;
2162
2163 case CEPH_LOCK_ILINK:
2164 encode_lock_ilink(bl);
2165 break;
2166
2167 case CEPH_LOCK_IDFT:
2168 encode_lock_idft(bl);
2169 break;
2170
2171 case CEPH_LOCK_IFILE:
2172 encode_lock_ifile(bl);
2173 break;
2174
2175 case CEPH_LOCK_INEST:
2176 encode_lock_inest(bl);
7c673cae
FG
2177 break;
2178
2179 case CEPH_LOCK_IXATTR:
9f95a23c 2180 encode_lock_ixattr(bl);
7c673cae
FG
2181 break;
2182
2183 case CEPH_LOCK_ISNAP:
9f95a23c 2184 encode_lock_isnap(bl);
7c673cae
FG
2185 break;
2186
2187 case CEPH_LOCK_IFLOCK:
9f95a23c 2188 encode_lock_iflock(bl);
7c673cae
FG
2189 break;
2190
2191 case CEPH_LOCK_IPOLICY:
9f95a23c 2192 encode_lock_ipolicy(bl);
7c673cae
FG
2193 break;
2194
2195 default:
2196 ceph_abort();
2197 }
9f95a23c 2198 ENCODE_FINISH(bl);
7c673cae
FG
2199}
2200
7c673cae
FG
2201/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2202
11fdf7f2 2203void CInode::decode_lock_state(int type, const bufferlist& bl)
7c673cae 2204{
11fdf7f2 2205 auto p = bl.cbegin();
9f95a23c
TL
2206
2207 DECODE_START(1, p);
7c673cae
FG
2208 utime_t tm;
2209
2210 snapid_t newfirst;
11fdf7f2
TL
2211 using ceph::decode;
2212 decode(newfirst, p);
7c673cae 2213 if (!is_auth() && newfirst != first) {
11fdf7f2
TL
2214 dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl;
2215 first = newfirst;
2216 }
2217 if (!is_base()) {
2218 decode(newfirst, p);
2219 if (!parent->is_auth() && newfirst != parent->first) {
2220 dout(10) << __func__ << " parent first " << first << " -> " << newfirst << dendl;
7c673cae
FG
2221 parent->first = newfirst;
2222 }
7c673cae
FG
2223 }
2224
2225 switch (type) {
2226 case CEPH_LOCK_IAUTH:
9f95a23c 2227 decode_lock_iauth(p);
7c673cae
FG
2228 break;
2229
2230 case CEPH_LOCK_ILINK:
9f95a23c 2231 decode_lock_ilink(p);
7c673cae
FG
2232 break;
2233
2234 case CEPH_LOCK_IDFT:
9f95a23c 2235 decode_lock_idft(p);
7c673cae
FG
2236 break;
2237
2238 case CEPH_LOCK_IFILE:
9f95a23c 2239 decode_lock_ifile(p);
7c673cae
FG
2240 break;
2241
2242 case CEPH_LOCK_INEST:
9f95a23c 2243 decode_lock_inest(p);
7c673cae
FG
2244 break;
2245
2246 case CEPH_LOCK_IXATTR:
9f95a23c 2247 decode_lock_ixattr(p);
7c673cae
FG
2248 break;
2249
2250 case CEPH_LOCK_ISNAP:
9f95a23c 2251 decode_lock_isnap(p);
7c673cae
FG
2252 break;
2253
2254 case CEPH_LOCK_IFLOCK:
9f95a23c 2255 decode_lock_iflock(p);
7c673cae
FG
2256 break;
2257
2258 case CEPH_LOCK_IPOLICY:
9f95a23c 2259 decode_lock_ipolicy(p);
7c673cae
FG
2260 break;
2261
2262 default:
2263 ceph_abort();
2264 }
9f95a23c 2265 DECODE_FINISH(p);
7c673cae
FG
2266}
2267
2268
2269bool CInode::is_dirty_scattered()
2270{
2271 return
2272 filelock.is_dirty_or_flushing() ||
2273 nestlock.is_dirty_or_flushing() ||
2274 dirfragtreelock.is_dirty_or_flushing();
2275}
2276
2277void CInode::clear_scatter_dirty()
2278{
2279 filelock.remove_dirty();
2280 nestlock.remove_dirty();
2281 dirfragtreelock.remove_dirty();
2282}
2283
2284void CInode::clear_dirty_scattered(int type)
2285{
11fdf7f2
TL
2286 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2287 ceph_assert(is_dir());
7c673cae
FG
2288 switch (type) {
2289 case CEPH_LOCK_IFILE:
2290 item_dirty_dirfrag_dir.remove_myself();
2291 break;
2292
2293 case CEPH_LOCK_INEST:
2294 item_dirty_dirfrag_nest.remove_myself();
2295 break;
2296
2297 case CEPH_LOCK_IDFT:
2298 item_dirty_dirfrag_dirfragtree.remove_myself();
2299 break;
2300
2301 default:
2302 ceph_abort();
2303 }
2304}
2305
2306
2307/*
2308 * when we initially scatter a lock, we need to check if any of the dirfrags
2309 * have out of date accounted_rstat/fragstat. if so, mark the lock stale.
2310 */
2311/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2312void CInode::start_scatter(ScatterLock *lock)
2313{
11fdf7f2
TL
2314 dout(10) << __func__ << " " << *lock << " on " << *this << dendl;
2315 ceph_assert(is_auth());
f67539c2 2316 const auto& pi = get_projected_inode();
7c673cae 2317
94b18763
FG
2318 for (const auto &p : dirfrags) {
2319 frag_t fg = p.first;
2320 CDir *dir = p.second;
f67539c2 2321 const auto& pf = dir->get_projected_fnode();
7c673cae
FG
2322 dout(20) << fg << " " << *dir << dendl;
2323
2324 if (!dir->is_auth())
2325 continue;
2326
2327 switch (lock->get_type()) {
2328 case CEPH_LOCK_IFILE:
2329 finish_scatter_update(lock, dir, pi->dirstat.version, pf->accounted_fragstat.version);
2330 break;
2331
2332 case CEPH_LOCK_INEST:
2333 finish_scatter_update(lock, dir, pi->rstat.version, pf->accounted_rstat.version);
2334 break;
2335
2336 case CEPH_LOCK_IDFT:
2337 dir->state_clear(CDir::STATE_DIRTYDFT);
2338 break;
2339 }
2340 }
2341}
2342
2343
2344class C_Inode_FragUpdate : public MDSLogContextBase {
2345protected:
2346 CInode *in;
2347 CDir *dir;
2348 MutationRef mut;
2349 MDSRank *get_mds() override {return in->mdcache->mds;}
2350 void finish(int r) override {
2351 in->_finish_frag_update(dir, mut);
2352 }
2353
2354public:
2355 C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {}
2356};
2357
2358void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
2359 version_t inode_version, version_t dir_accounted_version)
2360{
2361 frag_t fg = dir->get_frag();
11fdf7f2 2362 ceph_assert(dir->is_auth());
7c673cae
FG
2363
2364 if (dir->is_frozen()) {
11fdf7f2 2365 dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl;
7c673cae 2366 } else if (dir->get_version() == 0) {
11fdf7f2 2367 dout(10) << __func__ << " " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl;
7c673cae
FG
2368 } else {
2369 if (dir_accounted_version != inode_version) {
11fdf7f2 2370 dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;
7c673cae
FG
2371
2372 MDLog *mdlog = mdcache->mds->mdlog;
2373 MutationRef mut(new MutationImpl());
2374 mut->ls = mdlog->get_current_segment();
2375
f67539c2 2376 auto pf = dir->project_fnode(mut);
7c673cae 2377
9f95a23c 2378 std::string_view ename;
7c673cae
FG
2379 switch (lock->get_type()) {
2380 case CEPH_LOCK_IFILE:
f67539c2 2381 pf->fragstat.version = inode_version;
7c673cae
FG
2382 pf->accounted_fragstat = pf->fragstat;
2383 ename = "lock ifile accounted scatter stat update";
2384 break;
2385 case CEPH_LOCK_INEST:
f67539c2 2386 pf->rstat.version = inode_version;
7c673cae
FG
2387 pf->accounted_rstat = pf->rstat;
2388 ename = "lock inest accounted scatter stat update";
c07f9fc5
FG
2389
2390 if (!is_auth() && lock->get_state() == LOCK_MIX) {
11fdf7f2 2391 dout(10) << __func__ << " try to assimilate dirty rstat on "
c07f9fc5 2392 << *dir << dendl;
f67539c2 2393 dir->assimilate_dirty_rstat_inodes(mut);
c07f9fc5
FG
2394 }
2395
7c673cae
FG
2396 break;
2397 default:
2398 ceph_abort();
2399 }
2400
7c673cae
FG
2401 EUpdate *le = new EUpdate(mdlog, ename);
2402 mdlog->start_entry(le);
2403 le->metablob.add_dir_context(dir);
2404 le->metablob.add_dir(dir, true);
2405
11fdf7f2 2406 ceph_assert(!dir->is_frozen());
7c673cae 2407 mut->auth_pin(dir);
c07f9fc5
FG
2408
2409 if (lock->get_type() == CEPH_LOCK_INEST &&
2410 !is_auth() && lock->get_state() == LOCK_MIX) {
11fdf7f2 2411 dout(10) << __func__ << " finish assimilating dirty rstat on "
c07f9fc5 2412 << *dir << dendl;
f67539c2 2413 dir->assimilate_dirty_rstat_inodes_finish(&le->metablob);
c07f9fc5
FG
2414
2415 if (!(pf->rstat == pf->accounted_rstat)) {
11fdf7f2 2416 if (!mut->is_wrlocked(&nestlock)) {
c07f9fc5
FG
2417 mdcache->mds->locker->wrlock_force(&nestlock, mut);
2418 }
2419
2420 mdcache->mds->locker->mark_updated_scatterlock(&nestlock);
2421 mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest);
2422 }
2423 }
f67539c2
TL
2424
2425 pf->version = dir->pre_dirty();
7c673cae
FG
2426
2427 mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut));
2428 } else {
11fdf7f2 2429 dout(10) << __func__ << " " << fg << " accounted " << *lock
7c673cae
FG
2430 << " scatter stat unchanged at v" << dir_accounted_version << dendl;
2431 }
2432 }
2433}
2434
2435void CInode::_finish_frag_update(CDir *dir, MutationRef& mut)
2436{
11fdf7f2 2437 dout(10) << __func__ << " on " << *dir << dendl;
7c673cae 2438 mut->apply();
c07f9fc5 2439 mdcache->mds->locker->drop_locks(mut.get());
7c673cae
FG
2440 mut->cleanup();
2441}
2442
2443
2444/*
2445 * when we gather a lock, we need to assimilate dirfrag changes into the inode
2446 * state. it's possible we can't update the dirfrag accounted_rstat/fragstat
2447 * because the frag is auth and frozen, or that the replica couldn't for the same
2448 * reason. hopefully it will get updated the next time the lock cycles.
2449 *
2450 * we have two dimensions of behavior:
2451 * - we may be (auth and !frozen), and able to update, or not.
2452 * - the frag may be stale, or not.
2453 *
2454 * if the frag is non-stale, we want to assimilate the diff into the
2455 * inode, regardless of whether it's auth or updateable.
2456 *
2457 * if we update the frag, we want to set accounted_fragstat = frag,
2458 * both if we took the diff or it was stale and we are making it
2459 * un-stale.
2460 */
2461/* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
f67539c2 2462void CInode::finish_scatter_gather_update(int type, MutationRef& mut)
7c673cae
FG
2463{
2464 LogChannelRef clog = mdcache->mds->clog;
2465
11fdf7f2
TL
2466 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2467 ceph_assert(is_auth());
7c673cae
FG
2468
2469 switch (type) {
2470 case CEPH_LOCK_IFILE:
2471 {
2472 fragtree_t tmpdft = dirfragtree;
2473 struct frag_info_t dirstat;
2474 bool dirstat_valid = true;
2475
2476 // adjust summation
11fdf7f2 2477 ceph_assert(is_auth());
f67539c2 2478 auto pi = _get_projected_inode();
7c673cae
FG
2479
2480 bool touched_mtime = false, touched_chattr = false;
2481 dout(20) << " orig dirstat " << pi->dirstat << dendl;
2482 pi->dirstat.version++;
94b18763
FG
2483 for (const auto &p : dirfrags) {
2484 frag_t fg = p.first;
2485 CDir *dir = p.second;
7c673cae
FG
2486 dout(20) << fg << " " << *dir << dendl;
2487
2488 bool update;
2489 if (dir->get_version() != 0) {
2490 update = dir->is_auth() && !dir->is_frozen();
2491 } else {
2492 update = false;
2493 dirstat_valid = false;
2494 }
2495
f67539c2
TL
2496 CDir::fnode_const_ptr pf;
2497 if (update) {
2498 mut->auth_pin(dir);
2499 pf = dir->project_fnode(mut);
2500 } else {
2501 pf = dir->get_projected_fnode();
2502 }
7c673cae
FG
2503
2504 if (pf->accounted_fragstat.version == pi->dirstat.version - 1) {
2505 dout(20) << fg << " fragstat " << pf->fragstat << dendl;
2506 dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
2507 pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2508 } else {
2509 dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl;
2510 }
2511
2512 if (pf->fragstat.nfiles < 0 ||
2513 pf->fragstat.nsubdirs < 0) {
2514 clog->error() << "bad/negative dir size on "
f67539c2 2515 << dir->dirfrag() << " " << pf->fragstat;
11fdf7f2 2516 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
f67539c2
TL
2517
2518 auto _pf = const_cast<fnode_t*>(pf.get());
7c673cae 2519 if (pf->fragstat.nfiles < 0)
f67539c2 2520 _pf->fragstat.nfiles = 0;
7c673cae 2521 if (pf->fragstat.nsubdirs < 0)
f67539c2 2522 _pf->fragstat.nsubdirs = 0;
7c673cae
FG
2523 }
2524
2525 if (update) {
f67539c2
TL
2526 auto _pf = const_cast<fnode_t*>(pf.get());
2527 _pf->accounted_fragstat = _pf->fragstat;
2528 _pf->fragstat.version = _pf->accounted_fragstat.version = pi->dirstat.version;
2529 _pf->version = dir->pre_dirty();
7c673cae
FG
2530 dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl;
2531 }
2532
2533 tmpdft.force_to_leaf(g_ceph_context, fg);
2534 dirstat.add(pf->fragstat);
2535 }
2536 if (touched_mtime)
2537 pi->mtime = pi->ctime = pi->dirstat.mtime;
2538 if (touched_chattr)
2539 pi->change_attr = pi->dirstat.change_attr;
2540 dout(20) << " final dirstat " << pi->dirstat << dendl;
2541
2542 if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) {
11fdf7f2
TL
2543 frag_vec_t leaves;
2544 tmpdft.get_leaves_under(frag_t(), leaves);
2545 for (const auto& leaf : leaves) {
2546 if (!dirfrags.count(leaf)) {
7c673cae
FG
2547 dirstat_valid = false;
2548 break;
2549 }
11fdf7f2 2550 }
7c673cae
FG
2551 if (dirstat_valid) {
2552 if (state_test(CInode::STATE_REPAIRSTATS)) {
2553 dout(20) << " dirstat mismatch, fixing" << dendl;
2554 } else {
2555 clog->error() << "unmatched fragstat on " << ino() << ", inode has "
2556 << pi->dirstat << ", dirfrags have " << dirstat;
11fdf7f2 2557 ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter);
7c673cae
FG
2558 }
2559 // trust the dirfrags for now
2560 version_t v = pi->dirstat.version;
2561 if (pi->dirstat.mtime > dirstat.mtime)
2562 dirstat.mtime = pi->dirstat.mtime;
2563 if (pi->dirstat.change_attr > dirstat.change_attr)
2564 dirstat.change_attr = pi->dirstat.change_attr;
2565 pi->dirstat = dirstat;
2566 pi->dirstat.version = v;
2567 }
2568 }
2569
f67539c2 2570 if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0) {
d2e6a577
FG
2571 std::string path;
2572 make_path_string(path);
2573 clog->error() << "Inconsistent statistics detected: fragstat on inode "
2574 << ino() << " (" << path << "), inode has " << pi->dirstat;
11fdf7f2 2575 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
7c673cae
FG
2576
2577 if (pi->dirstat.nfiles < 0)
2578 pi->dirstat.nfiles = 0;
2579 if (pi->dirstat.nsubdirs < 0)
2580 pi->dirstat.nsubdirs = 0;
2581 }
2582 }
2583 break;
2584
2585 case CEPH_LOCK_INEST:
2586 {
11fdf7f2
TL
2587 // adjust summation
2588 ceph_assert(is_auth());
2589
7c673cae
FG
2590 fragtree_t tmpdft = dirfragtree;
2591 nest_info_t rstat;
7c673cae
FG
2592 bool rstat_valid = true;
2593
11fdf7f2
TL
2594 rstat.rsubdirs = 1;
2595 if (const sr_t *srnode = get_projected_srnode(); srnode)
2596 rstat.rsnaps = srnode->snaps.size();
2597
f67539c2 2598 auto pi = _get_projected_inode();
7c673cae
FG
2599 dout(20) << " orig rstat " << pi->rstat << dendl;
2600 pi->rstat.version++;
94b18763
FG
2601 for (const auto &p : dirfrags) {
2602 frag_t fg = p.first;
2603 CDir *dir = p.second;
7c673cae
FG
2604 dout(20) << fg << " " << *dir << dendl;
2605
2606 bool update;
2607 if (dir->get_version() != 0) {
2608 update = dir->is_auth() && !dir->is_frozen();
2609 } else {
2610 update = false;
2611 rstat_valid = false;
2612 }
2613
f67539c2
TL
2614 CDir::fnode_const_ptr pf;
2615 if (update) {
2616 mut->auth_pin(dir);
2617 pf = dir->project_fnode(mut);
2618 } else {
2619 pf = dir->get_projected_fnode();
2620 }
7c673cae
FG
2621
2622 if (pf->accounted_rstat.version == pi->rstat.version-1) {
2623 // only pull this frag's dirty rstat inodes into the frag if
2624 // the frag is non-stale and updateable. if it's stale,
2625 // that info will just get thrown out!
2626 if (update)
f67539c2 2627 dir->assimilate_dirty_rstat_inodes(mut);
7c673cae
FG
2628
2629 dout(20) << fg << " rstat " << pf->rstat << dendl;
2630 dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
2631 dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
2632 mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
2633 dir->first, CEPH_NOSNAP, this, true);
94b18763
FG
2634 for (auto &p : dir->dirty_old_rstat) {
2635 mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat,
2636 p.second.first, p.first, this, true);
2637 }
7c673cae
FG
2638 if (update) // dir contents not valid if frozen or non-auth
2639 dir->check_rstats();
2640 } else {
2641 dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl;
2642 }
2643 if (update) {
f67539c2
TL
2644 auto _pf = const_cast<fnode_t*>(pf.get());
2645 _pf->accounted_rstat = pf->rstat;
2646 _pf->rstat.version = _pf->accounted_rstat.version = pi->rstat.version;
2647 _pf->version = dir->pre_dirty();
7c673cae 2648 dir->dirty_old_rstat.clear();
7c673cae
FG
2649 dir->check_rstats();
2650 dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl;
2651 }
2652
2653 tmpdft.force_to_leaf(g_ceph_context, fg);
2654 rstat.add(pf->rstat);
2655 }
2656 dout(20) << " final rstat " << pi->rstat << dendl;
2657
2658 if (rstat_valid && !rstat.same_sums(pi->rstat)) {
11fdf7f2
TL
2659 frag_vec_t leaves;
2660 tmpdft.get_leaves_under(frag_t(), leaves);
2661 for (const auto& leaf : leaves) {
2662 if (!dirfrags.count(leaf)) {
7c673cae
FG
2663 rstat_valid = false;
2664 break;
2665 }
11fdf7f2 2666 }
7c673cae
FG
2667 if (rstat_valid) {
2668 if (state_test(CInode::STATE_REPAIRSTATS)) {
2669 dout(20) << " rstat mismatch, fixing" << dendl;
2670 } else {
d2e6a577
FG
2671 clog->error() << "inconsistent rstat on inode " << ino()
2672 << ", inode has " << pi->rstat
2673 << ", directory fragments have " << rstat;
11fdf7f2 2674 ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter);
7c673cae
FG
2675 }
2676 // trust the dirfrag for now
2677 version_t v = pi->rstat.version;
2678 if (pi->rstat.rctime > rstat.rctime)
2679 rstat.rctime = pi->rstat.rctime;
2680 pi->rstat = rstat;
2681 pi->rstat.version = v;
2682 }
2683 }
2684
2685 mdcache->broadcast_quota_to_client(this);
2686 }
2687 break;
2688
2689 case CEPH_LOCK_IDFT:
2690 break;
2691
2692 default:
2693 ceph_abort();
2694 }
2695}
2696
f67539c2 2697void CInode::finish_scatter_gather_update_accounted(int type, EMetaBlob *metablob)
7c673cae 2698{
11fdf7f2
TL
2699 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2700 ceph_assert(is_auth());
7c673cae 2701
94b18763
FG
2702 for (const auto &p : dirfrags) {
2703 CDir *dir = p.second;
7c673cae
FG
2704 if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen())
2705 continue;
2706
2707 if (type == CEPH_LOCK_IDFT)
2708 continue; // nothing to do.
2709
f67539c2
TL
2710 if (type == CEPH_LOCK_INEST)
2711 dir->assimilate_dirty_rstat_inodes_finish(metablob);
2712
7c673cae 2713 dout(10) << " journaling updated frag accounted_ on " << *dir << dendl;
11fdf7f2 2714 ceph_assert(dir->is_projected());
7c673cae 2715 metablob->add_dir(dir, true);
7c673cae
FG
2716 }
2717}
2718
2719// waiting
2720
2721bool CInode::is_frozen() const
2722{
2723 if (is_frozen_inode()) return true;
2724 if (parent && parent->dir->is_frozen()) return true;
2725 return false;
2726}
2727
2728bool CInode::is_frozen_dir() const
2729{
2730 if (parent && parent->dir->is_frozen_dir()) return true;
2731 return false;
2732}
2733
2734bool CInode::is_freezing() const
2735{
2736 if (is_freezing_inode()) return true;
2737 if (parent && parent->dir->is_freezing()) return true;
2738 return false;
2739}
2740
11fdf7f2 2741void CInode::add_dir_waiter(frag_t fg, MDSContext *c)
7c673cae
FG
2742{
2743 if (waiting_on_dir.empty())
2744 get(PIN_DIRWAITER);
2745 waiting_on_dir[fg].push_back(c);
11fdf7f2 2746 dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl;
7c673cae
FG
2747}
2748
11fdf7f2 2749void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls)
7c673cae
FG
2750{
2751 if (waiting_on_dir.empty())
2752 return;
2753
94b18763
FG
2754 auto it = waiting_on_dir.find(fg);
2755 if (it != waiting_on_dir.end()) {
2756 dout(10) << __func__ << " frag " << fg << " on " << *this << dendl;
11fdf7f2
TL
2757 auto& waiting = it->second;
2758 ls.insert(ls.end(), waiting.begin(), waiting.end());
94b18763 2759 waiting_on_dir.erase(it);
7c673cae
FG
2760
2761 if (waiting_on_dir.empty())
2762 put(PIN_DIRWAITER);
2763 }
2764}
2765
11fdf7f2 2766void CInode::add_waiter(uint64_t tag, MDSContext *c)
7c673cae 2767{
11fdf7f2 2768 dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c
7c673cae
FG
2769 << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH)
2770 << " !frozen " << !is_frozen_inode()
2771 << " !freezing " << !is_freezing_inode()
2772 << dendl;
2773 // wait on the directory?
2774 // make sure its not the inode that is explicitly ambiguous|freezing|frozen
2775 if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) ||
2776 ((tag & WAIT_UNFREEZE) &&
2777 !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
2778 dout(15) << "passing waiter up tree" << dendl;
2779 parent->dir->add_waiter(tag, c);
2780 return;
2781 }
2782 dout(15) << "taking waiter here" << dendl;
2783 MDSCacheObject::add_waiter(tag, c);
2784}
2785
11fdf7f2 2786void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls)
7c673cae
FG
2787{
2788 if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) {
2789 // take all dentry waiters
2790 while (!waiting_on_dir.empty()) {
94b18763
FG
2791 auto it = waiting_on_dir.begin();
2792 dout(10) << __func__ << " dirfrag " << it->first << " on " << *this << dendl;
11fdf7f2
TL
2793 auto& waiting = it->second;
2794 ls.insert(ls.end(), waiting.begin(), waiting.end());
94b18763 2795 waiting_on_dir.erase(it);
7c673cae
FG
2796 }
2797 put(PIN_DIRWAITER);
2798 }
2799
2800 // waiting
2801 MDSCacheObject::take_waiting(mask, ls);
2802}
2803
9f95a23c
TL
2804void CInode::maybe_finish_freeze_inode()
2805{
2806 CDir *dir = get_parent_dir();
2807 if (auth_pins > auth_pin_freeze_allowance || dir->frozen_inode_suppressed)
2808 return;
2809
2810 dout(10) << "maybe_finish_freeze_inode - frozen" << dendl;
2811 ceph_assert(auth_pins == auth_pin_freeze_allowance);
2812 get(PIN_FROZEN);
2813 put(PIN_FREEZING);
2814 state_clear(STATE_FREEZING);
2815 state_set(STATE_FROZEN);
2816
2817 item_freezing_inode.remove_myself();
2818 dir->num_frozen_inodes++;
2819
2820 finish_waiting(WAIT_FROZEN);
2821}
2822
7c673cae
FG
2823bool CInode::freeze_inode(int auth_pin_allowance)
2824{
9f95a23c
TL
2825 CDir *dir = get_parent_dir();
2826 ceph_assert(dir);
2827
11fdf7f2
TL
2828 ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins
2829 ceph_assert(auth_pins >= auth_pin_allowance);
9f95a23c
TL
2830 if (auth_pins == auth_pin_allowance && !dir->frozen_inode_suppressed) {
2831 dout(10) << "freeze_inode - frozen" << dendl;
2832 if (!state_test(STATE_FROZEN)) {
2833 get(PIN_FROZEN);
2834 state_set(STATE_FROZEN);
2835 dir->num_frozen_inodes++;
2836 }
2837 return true;
7c673cae
FG
2838 }
2839
9f95a23c
TL
2840 dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl;
2841 auth_pin_freeze_allowance = auth_pin_allowance;
2842 dir->freezing_inodes.push_back(&item_freezing_inode);
2843
2844 get(PIN_FREEZING);
2845 state_set(STATE_FREEZING);
2846
2847 if (!dir->lock_caches_with_auth_pins.empty())
2848 mdcache->mds->locker->invalidate_lock_caches(dir);
2849
2850 const static int lock_types[] = {
2851 CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT,
2852 CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0
2853 };
2854 for (int i = 0; lock_types[i]; ++i) {
2855 auto lock = get_lock(lock_types[i]);
2856 if (lock->is_cached())
2857 mdcache->mds->locker->invalidate_lock_caches(lock);
7c673cae 2858 }
9f95a23c
TL
2859 // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
2860 // and finish freezing the inode
2861 return state_test(STATE_FROZEN);
7c673cae
FG
2862}
2863
11fdf7f2 2864void CInode::unfreeze_inode(MDSContext::vec& finished)
7c673cae 2865{
11fdf7f2 2866 dout(10) << __func__ << dendl;
7c673cae
FG
2867 if (state_test(STATE_FREEZING)) {
2868 state_clear(STATE_FREEZING);
2869 put(PIN_FREEZING);
9f95a23c 2870 item_freezing_inode.remove_myself();
7c673cae
FG
2871 } else if (state_test(STATE_FROZEN)) {
2872 state_clear(STATE_FROZEN);
2873 put(PIN_FROZEN);
9f95a23c 2874 get_parent_dir()->num_frozen_inodes--;
7c673cae
FG
2875 } else
2876 ceph_abort();
2877 take_waiting(WAIT_UNFREEZE, finished);
2878}
2879
2880void CInode::unfreeze_inode()
2881{
11fdf7f2 2882 MDSContext::vec finished;
7c673cae
FG
2883 unfreeze_inode(finished);
2884 mdcache->mds->queue_waiters(finished);
2885}
2886
2887void CInode::freeze_auth_pin()
2888{
11fdf7f2 2889 ceph_assert(state_test(CInode::STATE_FROZEN));
7c673cae 2890 state_set(CInode::STATE_FROZENAUTHPIN);
9f95a23c 2891 get_parent_dir()->num_frozen_inodes++;
7c673cae
FG
2892}
2893
2894void CInode::unfreeze_auth_pin()
2895{
11fdf7f2 2896 ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN));
7c673cae 2897 state_clear(CInode::STATE_FROZENAUTHPIN);
9f95a23c 2898 get_parent_dir()->num_frozen_inodes--;
7c673cae 2899 if (!state_test(STATE_FREEZING|STATE_FROZEN)) {
11fdf7f2 2900 MDSContext::vec finished;
7c673cae
FG
2901 take_waiting(WAIT_UNFREEZE, finished);
2902 mdcache->mds->queue_waiters(finished);
2903 }
2904}
2905
11fdf7f2 2906void CInode::clear_ambiguous_auth(MDSContext::vec& finished)
7c673cae 2907{
11fdf7f2 2908 ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH));
7c673cae
FG
2909 state_clear(CInode::STATE_AMBIGUOUSAUTH);
2910 take_waiting(CInode::WAIT_SINGLEAUTH, finished);
2911}
2912
2913void CInode::clear_ambiguous_auth()
2914{
11fdf7f2 2915 MDSContext::vec finished;
7c673cae
FG
2916 clear_ambiguous_auth(finished);
2917 mdcache->mds->queue_waiters(finished);
2918}
2919
2920// auth_pins
91327a77
AA
2921bool CInode::can_auth_pin(int *err_ret) const {
2922 int err;
2923 if (!is_auth()) {
2924 err = ERR_NOT_AUTH;
2925 } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
2926 err = ERR_EXPORTING_INODE;
2927 } else {
2928 if (parent)
2929 return parent->can_auth_pin(err_ret);
2930 err = 0;
2931 }
2932 if (err && err_ret)
2933 *err_ret = err;
2934 return !err;
7c673cae
FG
2935}
2936
2937void CInode::auth_pin(void *by)
2938{
2939 if (auth_pins == 0)
2940 get(PIN_AUTHPIN);
2941 auth_pins++;
2942
2943#ifdef MDS_AUTHPIN_SET
2944 auth_pin_set.insert(by);
2945#endif
2946
11fdf7f2 2947 dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;
7c673cae
FG
2948
2949 if (parent)
11fdf7f2 2950 parent->adjust_nested_auth_pins(1, this);
7c673cae
FG
2951}
2952
2953void CInode::auth_unpin(void *by)
2954{
2955 auth_pins--;
2956
2957#ifdef MDS_AUTHPIN_SET
11fdf7f2
TL
2958 {
2959 auto it = auth_pin_set.find(by);
2960 ceph_assert(it != auth_pin_set.end());
2961 auth_pin_set.erase(it);
2962 }
7c673cae
FG
2963#endif
2964
2965 if (auth_pins == 0)
2966 put(PIN_AUTHPIN);
2967
11fdf7f2 2968 dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;
7c673cae 2969
11fdf7f2 2970 ceph_assert(auth_pins >= 0);
7c673cae
FG
2971
2972 if (parent)
11fdf7f2 2973 parent->adjust_nested_auth_pins(-1, by);
7c673cae 2974
9f95a23c
TL
2975 if (is_freezing_inode())
2976 maybe_finish_freeze_inode();
7c673cae
FG
2977}
2978
7c673cae
FG
2979// authority
2980
2981mds_authority_t CInode::authority() const
2982{
2983 if (inode_auth.first >= 0)
2984 return inode_auth;
2985
2986 if (parent)
2987 return parent->dir->authority();
2988
2989 // new items that are not yet linked in (in the committed plane) belong
2990 // to their first parent.
2991 if (!projected_parent.empty())
2992 return projected_parent.front()->dir->authority();
2993
2994 return CDIR_AUTH_UNDEF;
2995}
2996
2997
2998// SNAP
2999
3000snapid_t CInode::get_oldest_snap()
3001{
3002 snapid_t t = first;
f67539c2
TL
3003 if (is_any_old_inodes())
3004 t = get_old_inodes()->begin()->second.first;
11fdf7f2 3005 return std::min(t, oldest_snap);
7c673cae
FG
3006}
3007
f67539c2 3008const CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head)
7c673cae 3009{
11fdf7f2 3010 ceph_assert(follows >= first);
7c673cae 3011
f67539c2
TL
3012 const auto& pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
3013 const auto& px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();
3014
3015 auto _old_inodes = allocate_old_inode_map();
3016 if (old_inodes)
3017 *_old_inodes = *old_inodes;
7c673cae 3018
f67539c2 3019 mempool_old_inode &old = (*_old_inodes)[follows];
7c673cae
FG
3020 old.first = first;
3021 old.inode = *pi;
f67539c2
TL
3022 if (px) {
3023 dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl;
3024 old.xattrs = *px;
3025 }
7c673cae
FG
3026
3027 if (first < oldest_snap)
3028 oldest_snap = first;
7c673cae
FG
3029
3030 old.inode.trim_client_ranges(follows);
3031
11fdf7f2 3032 if (g_conf()->mds_snap_rstat &&
7c673cae
FG
3033 !(old.inode.rstat == old.inode.accounted_rstat))
3034 dirty_old_rstats.insert(follows);
3035
3036 first = follows+1;
3037
11fdf7f2 3038 dout(10) << __func__ << " " << (cow_head ? "head" : "previous_head" )
7c673cae
FG
3039 << " to [" << old.first << "," << follows << "] on "
3040 << *this << dendl;
3041
f67539c2 3042 reset_old_inodes(std::move(_old_inodes));
7c673cae
FG
3043 return old;
3044}
3045
7c673cae
FG
3046void CInode::pre_cow_old_inode()
3047{
11fdf7f2 3048 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
7c673cae
FG
3049 if (first <= follows)
3050 cow_old_inode(follows, true);
3051}
3052
11fdf7f2
TL
3053bool CInode::has_snap_data(snapid_t snapid)
3054{
3055 bool found = snapid >= first && snapid <= last;
f67539c2
TL
3056 if (!found && is_any_old_inodes()) {
3057 auto p = old_inodes->lower_bound(snapid);
3058 if (p != old_inodes->end()) {
11fdf7f2 3059 if (p->second.first > snapid) {
f67539c2 3060 if (p != old_inodes->begin())
11fdf7f2
TL
3061 --p;
3062 }
3063 if (p->second.first <= snapid && snapid <= p->first) {
3064 found = true;
3065 }
3066 }
3067 }
3068 return found;
3069}
3070
7c673cae
FG
3071void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
3072{
11fdf7f2 3073 dout(10) << __func__ << " " << snaps << dendl;
7c673cae 3074
f67539c2
TL
3075 if (!get_old_inodes())
3076 return;
3077
3078 std::vector<snapid_t> to_remove;
3079 for (auto p : *get_old_inodes()) {
3080 const snapid_t &id = p.first;
3081 const auto &s = snaps.lower_bound(p.second.first);
94b18763 3082 if (s == snaps.end() || *s > id) {
f67539c2
TL
3083 dout(10) << " purging old_inode [" << p.second.first << "," << id << "]" << dendl;
3084 to_remove.push_back(id);
94b18763 3085 }
7c673cae 3086 }
f67539c2
TL
3087
3088 if (to_remove.size() == get_old_inodes()->size()) {
3089 reset_old_inodes(old_inode_map_ptr());
3090 } else if (!to_remove.empty()) {
3091 auto _old_inodes = allocate_old_inode_map(*get_old_inodes());
3092 for (auto id : to_remove)
3093 _old_inodes->erase(id);
3094 reset_old_inodes(std::move(_old_inodes));
3095 }
7c673cae
FG
3096}
3097
3098/*
3099 * pick/create an old_inode
3100 */
f67539c2 3101snapid_t CInode::pick_old_inode(snapid_t snap) const
7c673cae 3102{
f67539c2
TL
3103 if (is_any_old_inodes()) {
3104 auto it = old_inodes->lower_bound(snap); // p is first key >= to snap
3105 if (it != old_inodes->end() && it->second.first <= snap) {
3106 dout(10) << __func__ << " snap " << snap << " -> [" << it->second.first << "," << it->first << "]" << dendl;
3107 return it->first;
3108 }
7c673cae 3109 }
11fdf7f2 3110 dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl;
f67539c2 3111 return 0;
7c673cae
FG
3112}
3113
3114void CInode::open_snaprealm(bool nosplit)
3115{
3116 if (!snaprealm) {
3117 SnapRealm *parent = find_snaprealm();
3118 snaprealm = new SnapRealm(mdcache, this);
3119 if (parent) {
11fdf7f2 3120 dout(10) << __func__ << " " << snaprealm
7c673cae
FG
3121 << " parent is " << parent
3122 << dendl;
3123 dout(30) << " siblings are " << parent->open_children << dendl;
3124 snaprealm->parent = parent;
3125 if (!nosplit)
3126 parent->split_at(snaprealm);
3127 parent->open_children.insert(snaprealm);
3128 }
3129 }
3130}
3131void CInode::close_snaprealm(bool nojoin)
3132{
3133 if (snaprealm) {
11fdf7f2 3134 dout(15) << __func__ << " " << *snaprealm << dendl;
7c673cae
FG
3135 if (snaprealm->parent) {
3136 snaprealm->parent->open_children.erase(snaprealm);
3137 //if (!nojoin)
3138 //snaprealm->parent->join(snaprealm);
3139 }
3140 delete snaprealm;
3141 snaprealm = 0;
3142 }
3143}
3144
3145SnapRealm *CInode::find_snaprealm() const
3146{
3147 const CInode *cur = this;
3148 while (!cur->snaprealm) {
11fdf7f2
TL
3149 const CDentry *pdn = cur->get_oldest_parent_dn();
3150 if (!pdn)
7c673cae 3151 break;
11fdf7f2 3152 cur = pdn->get_dir()->get_inode();
7c673cae
FG
3153 }
3154 return cur->snaprealm;
3155}
3156
3157void CInode::encode_snap_blob(bufferlist &snapbl)
3158{
3159 if (snaprealm) {
11fdf7f2
TL
3160 using ceph::encode;
3161 encode(snaprealm->srnode, snapbl);
3162 dout(20) << __func__ << " " << *snaprealm << dendl;
7c673cae
FG
3163 }
3164}
11fdf7f2 3165void CInode::decode_snap_blob(const bufferlist& snapbl)
7c673cae 3166{
11fdf7f2 3167 using ceph::decode;
7c673cae
FG
3168 if (snapbl.length()) {
3169 open_snaprealm();
11fdf7f2
TL
3170 auto old_flags = snaprealm->srnode.flags;
3171 auto p = snapbl.cbegin();
3172 decode(snaprealm->srnode, p);
f67539c2 3173 if (!is_base()) {
11fdf7f2 3174 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
11fdf7f2
TL
3175 snaprealm->adjust_parent();
3176 }
7c673cae 3177 }
11fdf7f2 3178 dout(20) << __func__ << " " << *snaprealm << dendl;
92f5a8d4
TL
3179 } else if (snaprealm &&
3180 !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
11fdf7f2
TL
3181 ceph_assert(mdcache->mds->is_any_replay());
3182 snaprealm->merge_to(NULL);
7c673cae
FG
3183 }
3184}
3185
3186void CInode::encode_snap(bufferlist& bl)
3187{
9f95a23c 3188 ENCODE_START(1, 1, bl);
7c673cae
FG
3189 bufferlist snapbl;
3190 encode_snap_blob(snapbl);
11fdf7f2
TL
3191 encode(snapbl, bl);
3192 encode(oldest_snap, bl);
9f95a23c 3193 ENCODE_FINISH(bl);
11fdf7f2 3194}
7c673cae 3195
11fdf7f2 3196void CInode::decode_snap(bufferlist::const_iterator& p)
7c673cae 3197{
9f95a23c 3198 DECODE_START(1, p);
7c673cae 3199 bufferlist snapbl;
11fdf7f2
TL
3200 decode(snapbl, p);
3201 decode(oldest_snap, p);
7c673cae 3202 decode_snap_blob(snapbl);
9f95a23c 3203 DECODE_FINISH(p);
7c673cae
FG
3204}
3205
3206// =============================================
3207
3208client_t CInode::calc_ideal_loner()
3209{
3210 if (mdcache->is_readonly())
3211 return -1;
11fdf7f2 3212 if (!get_mds_caps_wanted().empty())
7c673cae
FG
3213 return -1;
3214
3215 int n = 0;
3216 client_t loner = -1;
11fdf7f2
TL
3217 for (const auto &p : client_caps) {
3218 if (!p.second.is_stale() &&
9f95a23c
TL
3219 (is_dir() ?
3220 !has_subtree_or_exporting_dirfrag() :
3221 (p.second.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_RD)))) {
7c673cae
FG
3222 if (n)
3223 return -1;
3224 n++;
11fdf7f2 3225 loner = p.first;
7c673cae 3226 }
11fdf7f2 3227 }
7c673cae
FG
3228 return loner;
3229}
3230
b32b8144 3231bool CInode::choose_ideal_loner()
7c673cae
FG
3232{
3233 want_loner_cap = calc_ideal_loner();
b32b8144
FG
3234 int changed = false;
3235 if (loner_cap >= 0 && loner_cap != want_loner_cap) {
3236 if (!try_drop_loner())
3237 return false;
3238 changed = true;
3239 }
3240
3241 if (want_loner_cap >= 0) {
3242 if (loner_cap < 0) {
3243 set_loner_cap(want_loner_cap);
3244 changed = true;
3245 } else
11fdf7f2 3246 ceph_assert(loner_cap == want_loner_cap);
b32b8144
FG
3247 }
3248 return changed;
7c673cae
FG
3249}
3250
3251bool CInode::try_set_loner()
3252{
11fdf7f2 3253 ceph_assert(want_loner_cap >= 0);
7c673cae
FG
3254 if (loner_cap >= 0 && loner_cap != want_loner_cap)
3255 return false;
3256 set_loner_cap(want_loner_cap);
3257 return true;
3258}
3259
3260void CInode::set_loner_cap(client_t l)
3261{
3262 loner_cap = l;
3263 authlock.set_excl_client(loner_cap);
3264 filelock.set_excl_client(loner_cap);
3265 linklock.set_excl_client(loner_cap);
3266 xattrlock.set_excl_client(loner_cap);
3267}
3268
3269bool CInode::try_drop_loner()
3270{
3271 if (loner_cap < 0)
3272 return true;
3273
3274 int other_allowed = get_caps_allowed_by_type(CAP_ANY);
3275 Capability *cap = get_client_cap(loner_cap);
3276 if (!cap ||
3277 (cap->issued() & ~other_allowed) == 0) {
3278 set_loner_cap(-1);
3279 return true;
3280 }
3281 return false;
3282}
3283
3284
3285// choose new lock state during recovery, based on issued caps
3286void CInode::choose_lock_state(SimpleLock *lock, int allissued)
3287{
3288 int shift = lock->get_cap_shift();
3289 int issued = (allissued >> shift) & lock->get_cap_mask();
3290 if (is_auth()) {
3291 if (lock->is_xlocked()) {
3292 // do nothing here
3293 } else if (lock->get_state() != LOCK_MIX) {
3294 if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER))
3295 lock->set_state(LOCK_EXCL);
f6b5b4d7
TL
3296 else if (issued & CEPH_CAP_GWR) {
3297 if (issued & (CEPH_CAP_GCACHE | CEPH_CAP_GSHARED))
3298 lock->set_state(LOCK_EXCL);
3299 else
3300 lock->set_state(LOCK_MIX);
3301 } else if (lock->is_dirty()) {
7c673cae
FG
3302 if (is_replicated())
3303 lock->set_state(LOCK_MIX);
3304 else
3305 lock->set_state(LOCK_LOCK);
3306 } else
3307 lock->set_state(LOCK_SYNC);
3308 }
3309 } else {
3310 // our states have already been chosen during rejoin.
3311 if (lock->is_xlocked())
11fdf7f2 3312 ceph_assert(lock->get_state() == LOCK_LOCK);
7c673cae
FG
3313 }
3314}
3315
3316void CInode::choose_lock_states(int dirty_caps)
3317{
3318 int issued = get_caps_issued() | dirty_caps;
b32b8144
FG
3319 if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR)))
3320 choose_ideal_loner();
7c673cae
FG
3321 choose_lock_state(&filelock, issued);
3322 choose_lock_state(&nestlock, issued);
3323 choose_lock_state(&dirfragtreelock, issued);
3324 choose_lock_state(&authlock, issued);
3325 choose_lock_state(&xattrlock, issued);
3326 choose_lock_state(&linklock, issued);
3327}
3328
9f95a23c
TL
3329int CInode::count_nonstale_caps()
3330{
3331 int n = 0;
3332 for (const auto &p : client_caps) {
3333 if (!p.second.is_stale())
3334 n++;
3335 }
3336 return n;
3337}
3338
3339bool CInode::multiple_nonstale_caps()
3340{
3341 int n = 0;
3342 for (const auto &p : client_caps) {
3343 if (!p.second.is_stale()) {
3344 if (n)
3345 return true;
3346 n++;
3347 }
3348 }
3349 return false;
3350}
3351
11fdf7f2
TL
3352void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m)
3353{
3354 bool old_empty = mds_caps_wanted.empty();
3355 mds_caps_wanted.swap(m);
3356 if (old_empty != (bool)mds_caps_wanted.empty()) {
3357 if (old_empty)
f91f0fd5 3358 adjust_num_caps_notable(1);
11fdf7f2 3359 else
f91f0fd5 3360 adjust_num_caps_notable(-1);
11fdf7f2
TL
3361 }
3362}
3363
3364void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted)
3365{
3366 bool old_empty = mds_caps_wanted.empty();
3367 if (wanted) {
3368 mds_caps_wanted[mds] = wanted;
3369 if (old_empty)
f91f0fd5 3370 adjust_num_caps_notable(1);
11fdf7f2
TL
3371 } else if (!old_empty) {
3372 mds_caps_wanted.erase(mds);
3373 if (mds_caps_wanted.empty())
f91f0fd5 3374 adjust_num_caps_notable(-1);
11fdf7f2
TL
3375 }
3376}
3377
9f95a23c
TL
3378Capability *CInode::add_client_cap(client_t client, Session *session,
3379 SnapRealm *conrealm, bool new_inode)
7c673cae 3380{
11fdf7f2 3381 ceph_assert(last == CEPH_NOSNAP);
7c673cae
FG
3382 if (client_caps.empty()) {
3383 get(PIN_CAPS);
3384 if (conrealm)
3385 containing_realm = conrealm;
3386 else
3387 containing_realm = find_snaprealm();
3388 containing_realm->inodes_with_caps.push_back(&item_caps);
11fdf7f2 3389 dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl;
7c673cae 3390
7c673cae 3391 mdcache->num_inodes_with_caps++;
11fdf7f2
TL
3392 if (parent)
3393 parent->dir->adjust_num_inodes_with_caps(1);
3394 }
3395
9f95a23c 3396 uint64_t cap_id = new_inode ? 1 : ++mdcache->last_cap_id;
11fdf7f2
TL
3397 auto ret = client_caps.emplace(std::piecewise_construct, std::forward_as_tuple(client),
3398 std::forward_as_tuple(this, session, cap_id));
3399 ceph_assert(ret.second == true);
3400 Capability *cap = &ret.first->second;
7c673cae 3401
7c673cae 3402 cap->client_follows = first-1;
7c673cae 3403 containing_realm->add_cap(client, cap);
11fdf7f2 3404
7c673cae
FG
3405 return cap;
3406}
3407
3408void CInode::remove_client_cap(client_t client)
3409{
11fdf7f2
TL
3410 auto it = client_caps.find(client);
3411 ceph_assert(it != client_caps.end());
3412 Capability *cap = &it->second;
7c673cae
FG
3413
3414 cap->item_session_caps.remove_myself();
3415 cap->item_revoking_caps.remove_myself();
3416 cap->item_client_revoking_caps.remove_myself();
3417 containing_realm->remove_cap(client, cap);
3418
3419 if (client == loner_cap)
3420 loner_cap = -1;
3421
f91f0fd5
TL
3422 if (cap->is_wanted_notable())
3423 adjust_num_caps_notable(-1);
11fdf7f2
TL
3424
3425 client_caps.erase(it);
7c673cae 3426 if (client_caps.empty()) {
11fdf7f2 3427 dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl;
7c673cae
FG
3428 put(PIN_CAPS);
3429 item_caps.remove_myself();
3430 containing_realm = NULL;
7c673cae 3431 mdcache->num_inodes_with_caps--;
11fdf7f2
TL
3432 if (parent)
3433 parent->dir->adjust_num_inodes_with_caps(-1);
7c673cae
FG
3434 }
3435
3436 //clean up advisory locks
3437 bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false;
3438 bool flock_removed = flock_locks ? flock_locks->remove_all_from(client) : false;
3439 if (fcntl_removed || flock_removed) {
11fdf7f2 3440 MDSContext::vec waiters;
7c673cae
FG
3441 take_waiting(CInode::WAIT_FLOCK, waiters);
3442 mdcache->mds->queue_waiters(waiters);
3443 }
3444}
3445
3446void CInode::move_to_realm(SnapRealm *realm)
3447{
11fdf7f2 3448 dout(10) << __func__ << " joining realm " << *realm
7c673cae 3449 << ", leaving realm " << *containing_realm << dendl;
11fdf7f2
TL
3450 for (auto& p : client_caps) {
3451 containing_realm->remove_cap(p.first, &p.second);
3452 realm->add_cap(p.first, &p.second);
7c673cae
FG
3453 }
3454 item_caps.remove_myself();
3455 realm->inodes_with_caps.push_back(&item_caps);
3456 containing_realm = realm;
3457}
3458
3459Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session)
3460{
3461 Capability *cap = get_client_cap(client);
3462 if (cap) {
3463 // FIXME?
3464 cap->merge(icr.capinfo.wanted, icr.capinfo.issued);
3465 } else {
3466 cap = add_client_cap(client, session);
3467 cap->set_cap_id(icr.capinfo.cap_id);
3468 cap->set_wanted(icr.capinfo.wanted);
3469 cap->issue_norevoke(icr.capinfo.issued);
3470 cap->reset_seq();
3471 }
3472 cap->set_last_issue_stamp(ceph_clock_now());
3473 return cap;
3474}
3475
3476void CInode::clear_client_caps_after_export()
3477{
3478 while (!client_caps.empty())
3479 remove_client_cap(client_caps.begin()->first);
3480 loner_cap = -1;
3481 want_loner_cap = -1;
11fdf7f2
TL
3482 if (!get_mds_caps_wanted().empty()) {
3483 mempool::mds_co::compact_map<int32_t,int32_t> empty;
3484 set_mds_caps_wanted(empty);
3485 }
7c673cae
FG
3486}
3487
3488void CInode::export_client_caps(map<client_t,Capability::Export>& cl)
3489{
11fdf7f2
TL
3490 for (const auto &p : client_caps) {
3491 cl[p.first] = p.second.make_export();
7c673cae
FG
3492 }
3493}
3494
3495 // caps allowed
3496int CInode::get_caps_liked() const
3497{
3498 if (is_dir())
3499 return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER
3500 else
3501 return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO;
3502}
3503
3504int CInode::get_caps_allowed_ever() const
3505{
3506 int allowed;
3507 if (is_dir())
3508 allowed = CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED;
3509 else
3510 allowed = CEPH_CAP_ANY;
3511 return allowed &
3512 (CEPH_CAP_PIN |
3513 (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) |
3514 (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) |
3515 (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) |
3516 (linklock.gcaps_allowed_ever() << linklock.get_cap_shift()));
3517}
3518
3519int CInode::get_caps_allowed_by_type(int type) const
3520{
3521 return
3522 CEPH_CAP_PIN |
3523 (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) |
3524 (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) |
3525 (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) |
3526 (linklock.gcaps_allowed(type) << linklock.get_cap_shift());
3527}
3528
3529int CInode::get_caps_careful() const
3530{
3531 return
3532 (filelock.gcaps_careful() << filelock.get_cap_shift()) |
3533 (authlock.gcaps_careful() << authlock.get_cap_shift()) |
3534 (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) |
3535 (linklock.gcaps_careful() << linklock.get_cap_shift());
3536}
3537
3538int CInode::get_xlocker_mask(client_t client) const
3539{
3540 return
3541 (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) |
3542 (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) |
3543 (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) |
3544 (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift());
3545}
3546
11fdf7f2 3547int CInode::get_caps_allowed_for_client(Session *session, Capability *cap,
f67539c2 3548 const mempool_inode *file_i) const
7c673cae 3549{
11fdf7f2 3550 client_t client = session->get_client();
7c673cae
FG
3551 int allowed;
3552 if (client == get_loner()) {
3553 // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
3554 allowed =
3555 get_caps_allowed_by_type(CAP_LONER) |
3556 (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client));
3557 } else {
3558 allowed = get_caps_allowed_by_type(CAP_ANY);
3559 }
3560
9f95a23c
TL
3561 if (is_dir()) {
3562 allowed &= ~CEPH_CAP_ANY_DIR_OPS;
3563 if (cap && (allowed & CEPH_CAP_FILE_EXCL))
3564 allowed |= cap->get_lock_cache_allowed();
3565 } else {
11fdf7f2
TL
3566 if (file_i->inline_data.version == CEPH_INLINE_NONE &&
3567 file_i->layout.pool_ns.empty()) {
3568 // noop
3569 } else if (cap) {
3570 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3571 cap->is_noinline()) ||
3572 (!file_i->layout.pool_ns.empty() &&
3573 cap->is_nopoolns()))
3574 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3575 } else {
3576 auto& conn = session->get_connection();
3577 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3578 !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) ||
3579 (!file_i->layout.pool_ns.empty() &&
3580 !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)))
3581 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3582 }
7c673cae
FG
3583 }
3584 return allowed;
3585}
3586
3587// caps issued, wanted
3588int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker,
3589 int shift, int mask)
3590{
3591 int c = 0;
3592 int loner = 0, other = 0, xlocker = 0;
3593 if (!is_auth()) {
3594 loner_cap = -1;
3595 }
3596
11fdf7f2
TL
3597 for (const auto &p : client_caps) {
3598 int i = p.second.issued();
7c673cae 3599 c |= i;
11fdf7f2 3600 if (p.first == loner_cap)
7c673cae
FG
3601 loner |= i;
3602 else
3603 other |= i;
11fdf7f2 3604 xlocker |= get_xlocker_mask(p.first) & i;
7c673cae
FG
3605 }
3606 if (ploner) *ploner = (loner >> shift) & mask;
3607 if (pother) *pother = (other >> shift) & mask;
3608 if (pxlocker) *pxlocker = (xlocker >> shift) & mask;
3609 return (c >> shift) & mask;
3610}
3611
3612bool CInode::is_any_caps_wanted() const
3613{
11fdf7f2
TL
3614 for (const auto &p : client_caps) {
3615 if (p.second.wanted())
7c673cae 3616 return true;
11fdf7f2 3617 }
7c673cae
FG
3618 return false;
3619}
3620
3621int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const
3622{
3623 int w = 0;
3624 int loner = 0, other = 0;
11fdf7f2
TL
3625 for (const auto &p : client_caps) {
3626 if (!p.second.is_stale()) {
3627 int t = p.second.wanted();
7c673cae 3628 w |= t;
11fdf7f2 3629 if (p.first == loner_cap)
7c673cae
FG
3630 loner |= t;
3631 else
3632 other |= t;
3633 }
3634 //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
3635 }
3636 if (is_auth())
94b18763
FG
3637 for (const auto &p : mds_caps_wanted) {
3638 w |= p.second;
3639 other |= p.second;
7c673cae
FG
3640 //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
3641 }
3642 if (ploner) *ploner = (loner >> shift) & mask;
3643 if (pother) *pother = (other >> shift) & mask;
3644 return (w >> shift) & mask;
3645}
3646
3647bool CInode::issued_caps_need_gather(SimpleLock *lock)
3648{
3649 int loner_issued, other_issued, xlocker_issued;
3650 get_caps_issued(&loner_issued, &other_issued, &xlocker_issued,
3651 lock->get_cap_shift(), lock->get_cap_mask());
3652 if ((loner_issued & ~lock->gcaps_allowed(CAP_LONER)) ||
3653 (other_issued & ~lock->gcaps_allowed(CAP_ANY)) ||
3654 (xlocker_issued & ~lock->gcaps_allowed(CAP_XLOCKER)))
3655 return true;
3656 return false;
3657}
3658
f91f0fd5
TL
3659void CInode::adjust_num_caps_notable(int d)
3660{
3661 if (!is_clientwriteable()) {
3662 if (!num_caps_notable && d > 0)
3663 mdcache->open_file_table.add_inode(this);
3664 else if (num_caps_notable > 0 && num_caps_notable == -d)
3665 mdcache->open_file_table.remove_inode(this);
3666 }
3667
3668 num_caps_notable +=d;
3669 ceph_assert(num_caps_notable >= 0);
3670}
3671
3672void CInode::mark_clientwriteable()
3673{
3674 if (last != CEPH_NOSNAP)
3675 return;
3676 if (!state_test(STATE_CLIENTWRITEABLE)) {
3677 if (num_caps_notable == 0)
3678 mdcache->open_file_table.add_inode(this);
3679 state_set(STATE_CLIENTWRITEABLE);
3680 }
3681}
3682
3683void CInode::clear_clientwriteable()
3684{
3685 if (state_test(STATE_CLIENTWRITEABLE)) {
3686 if (num_caps_notable == 0)
3687 mdcache->open_file_table.remove_inode(this);
3688 state_clear(STATE_CLIENTWRITEABLE);
3689 }
3690}
7c673cae
FG
3691
3692// =============================================
3693
3694int CInode::encode_inodestat(bufferlist& bl, Session *session,
3695 SnapRealm *dir_realm,
3696 snapid_t snapid,
3697 unsigned max_bytes,
3698 int getattr_caps)
3699{
11fdf7f2
TL
3700 client_t client = session->get_client();
3701 ceph_assert(snapid);
7c673cae
FG
3702
3703 bool valid = true;
3704
3705 // pick a version!
f67539c2
TL
3706 const mempool_inode *oi = get_inode().get();
3707 const mempool_inode *pi = get_projected_inode().get();
7c673cae 3708
f67539c2 3709 const mempool_xattr_map *pxattrs = nullptr;
7c673cae
FG
3710
3711 if (snapid != CEPH_NOSNAP) {
3712
3713 // for now at least, old_inodes is only defined/valid on the auth
3714 if (!is_auth())
3715 valid = false;
3716
f67539c2
TL
3717 if (is_any_old_inodes()) {
3718 auto it = old_inodes->lower_bound(snapid);
3719 if (it != old_inodes->end()) {
94b18763 3720 if (it->second.first > snapid) {
f67539c2 3721 if (it != old_inodes->begin())
94b18763 3722 --it;
7c673cae 3723 }
94b18763
FG
3724 if (it->second.first <= snapid && snapid <= it->first) {
3725 dout(15) << __func__ << " snapid " << snapid
3726 << " to old_inode [" << it->second.first << "," << it->first << "]"
3727 << " " << it->second.inode.rstat
7c673cae 3728 << dendl;
f67539c2
TL
3729 pi = oi = &it->second.inode;
3730 pxattrs = &it->second.xattrs;
7c673cae
FG
3731 } else {
3732 // snapshoted remote dentry can result this
11fdf7f2 3733 dout(0) << __func__ << " old_inode for snapid " << snapid
7c673cae
FG
3734 << " not found" << dendl;
3735 }
3736 }
3737 } else if (snapid < first || snapid > last) {
3738 // snapshoted remote dentry can result this
11fdf7f2 3739 dout(0) << __func__ << " [" << first << "," << last << "]"
7c673cae
FG
3740 << " not match snapid " << snapid << dendl;
3741 }
3742 }
3743
81eedcae 3744 utime_t snap_btime;
f67539c2 3745 std::map<std::string, std::string> snap_metadata;
7c673cae 3746 SnapRealm *realm = find_snaprealm();
81eedcae
TL
3747 if (snapid != CEPH_NOSNAP && realm) {
3748 // add snapshot timestamp vxattr
3749 map<snapid_t,const SnapInfo*> infomap;
3750 realm->get_snap_info(infomap,
3751 snapid, // min
3752 snapid); // max
3753 if (!infomap.empty()) {
3754 ceph_assert(infomap.size() == 1);
3755 const SnapInfo *si = infomap.begin()->second;
3756 snap_btime = si->stamp;
f67539c2 3757 snap_metadata = si->metadata;
81eedcae
TL
3758 }
3759 }
3760
7c673cae
FG
3761
3762 bool no_caps = !valid ||
3763 session->is_stale() ||
3764 (dir_realm && realm != dir_realm) ||
3765 is_frozen() ||
3766 state_test(CInode::STATE_EXPORTINGCAPS);
3767 if (no_caps)
11fdf7f2 3768 dout(20) << __func__ << " no caps"
7c673cae
FG
3769 << (!valid?", !valid":"")
3770 << (session->is_stale()?", session stale ":"")
3771 << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"")
3772 << (is_frozen()?", frozen inode":"")
3773 << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"")
3774 << dendl;
3775
3776
3777 // "fake" a version that is old (stable) version, +1 if projected.
3778 version_t version = (oi->version * 2) + is_projected();
3779
3780 Capability *cap = get_client_cap(client);
3781 bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client;
3782 //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
3783 bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client;
3784 bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client;
3785 bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client;
3786
3787 bool plocal = versionlock.get_last_wrlock_client() == client;
3788 bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client;
3789
f67539c2 3790 const mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;
7c673cae
FG
3791
3792 dout(20) << " pfile " << pfile << " pauth " << pauth
3793 << " plink " << plink << " pxattr " << pxattr
3794 << " plocal " << plocal
3795 << " ctime " << any_i->ctime
3796 << " valid=" << valid << dendl;
3797
3798 // file
f67539c2 3799 const mempool_inode *file_i = pfile ? pi:oi;
7c673cae
FG
3800 file_layout_t layout;
3801 if (is_dir()) {
3802 layout = (ppolicy ? pi : oi)->layout;
3803 } else {
3804 layout = file_i->layout;
3805 }
3806
3807 // max_size is min of projected, actual
3808 uint64_t max_size =
f91f0fd5
TL
3809 std::min(oi->get_client_range(client),
3810 pi->get_client_range(client));
7c673cae
FG
3811
3812 // inline data
3813 version_t inline_version = 0;
3814 bufferlist inline_data;
3815 if (file_i->inline_data.version == CEPH_INLINE_NONE) {
3816 inline_version = CEPH_INLINE_NONE;
3817 } else if ((!cap && !no_caps) ||
3818 (cap && cap->client_inline_version < file_i->inline_data.version) ||
3819 (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data
3820 inline_version = file_i->inline_data.version;
3821 if (file_i->inline_data.length() > 0)
f67539c2 3822 file_i->inline_data.get_data(inline_data);
7c673cae
FG
3823 }
3824
3825 // nest (do same as file... :/)
3826 if (cap) {
3827 cap->last_rbytes = file_i->rstat.rbytes;
3828 cap->last_rsize = file_i->rstat.rsize();
3829 }
3830
3831 // auth
f67539c2 3832 const mempool_inode *auth_i = pauth ? pi:oi;
7c673cae
FG
3833
3834 // link
f67539c2 3835 const mempool_inode *link_i = plink ? pi:oi;
7c673cae
FG
3836
3837 // xattr
f67539c2 3838 const mempool_inode *xattr_i = pxattr ? pi:oi;
7c673cae 3839
11fdf7f2 3840 using ceph::encode;
7c673cae 3841 // xattr
7c673cae
FG
3842 version_t xattr_version;
3843 if ((!cap && !no_caps) ||
3844 (cap && cap->client_xattr_version < xattr_i->xattr_version) ||
3845 (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs
3846 if (!pxattrs)
f67539c2 3847 pxattrs = pxattr ? get_projected_xattrs().get() : get_xattrs().get();
7c673cae
FG
3848 xattr_version = xattr_i->xattr_version;
3849 } else {
3850 xattr_version = 0;
3851 }
3852
3853 // do we have room?
3854 if (max_bytes) {
11fdf7f2
TL
3855 unsigned bytes =
3856 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) +
3857 sizeof(struct ceph_file_layout) +
3858 sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq
3859 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
3860 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime
3861 sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree
3862 sizeof(__u32) + symlink.length() + // symlink
3863 sizeof(struct ceph_dir_layout); // dir_layout
3864
3865 if (xattr_version) {
3866 bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries
3867 if (pxattrs) {
3868 for (const auto &p : *pxattrs)
3869 bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length();
3870 }
3871 } else {
3872 bytes += sizeof(__u32); // xattr buffer len
3873 }
3874 bytes +=
3875 sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data
3876 1 + 1 + 8 + 8 + 4 + // quota
3877 4 + layout.pool_ns.size() + // pool ns
3878 sizeof(struct ceph_timespec) + 8; // btime + change_attr
3879
7c673cae 3880 if (bytes > max_bytes)
f67539c2 3881 return -CEPHFS_ENOSPC;
7c673cae
FG
3882 }
3883
3884
3885 // encode caps
3886 struct ceph_mds_reply_cap ecap;
3887 if (snapid != CEPH_NOSNAP) {
3888 /*
3889 * snapped inodes (files or dirs) only get read-only caps. always
3890 * issue everything possible, since it is read only.
3891 *
3892 * if a snapped inode has caps, limit issued caps based on the
3893 * lock state.
3894 *
3895 * if it is a live inode, limit issued caps based on the lock
3896 * state.
3897 *
3898 * do NOT adjust cap issued state, because the client always
3899 * tracks caps per-snap and the mds does either per-interval or
3900 * multiversion.
3901 */
3902 ecap.caps = valid ? get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE;
3903 if (last == CEPH_NOSNAP || is_any_caps())
11fdf7f2 3904 ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i);
7c673cae
FG
3905 ecap.seq = 0;
3906 ecap.mseq = 0;
3907 ecap.realm = 0;
3908 } else {
3909 if (!no_caps && !cap) {
3910 // add a new cap
3911 cap = add_client_cap(client, session, realm);
b32b8144
FG
3912 if (is_auth())
3913 choose_ideal_loner();
7c673cae
FG
3914 }
3915
3916 int issue = 0;
3917 if (!no_caps && cap) {
3918 int likes = get_caps_liked();
11fdf7f2 3919 int allowed = get_caps_allowed_for_client(session, cap, file_i);
7c673cae 3920 issue = (cap->wanted() | likes) & allowed;
494da23a 3921 cap->issue_norevoke(issue, true);
7c673cae
FG
3922 issue = cap->pending();
3923 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3924 << " seq " << cap->get_last_seq() << dendl;
3925 } else if (cap && cap->is_new() && !dir_realm) {
3926 // alway issue new caps to client, otherwise the caps get lost
11fdf7f2 3927 ceph_assert(cap->is_stale());
494da23a
TL
3928 ceph_assert(!cap->pending());
3929 issue = CEPH_CAP_PIN;
3930 cap->issue_norevoke(issue, true);
7c673cae
FG
3931 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3932 << " seq " << cap->get_last_seq()
494da23a 3933 << "(stale&new caps)" << dendl;
7c673cae
FG
3934 }
3935
3936 if (issue) {
3937 cap->set_last_issue();
3938 cap->set_last_issue_stamp(ceph_clock_now());
7c673cae
FG
3939 ecap.caps = issue;
3940 ecap.wanted = cap->wanted();
3941 ecap.cap_id = cap->get_cap_id();
3942 ecap.seq = cap->get_last_seq();
3943 ecap.mseq = cap->get_mseq();
3944 ecap.realm = realm->inode->ino();
3945 } else {
3946 ecap.cap_id = 0;
3947 ecap.caps = 0;
3948 ecap.seq = 0;
3949 ecap.mseq = 0;
3950 ecap.realm = 0;
3951 ecap.wanted = 0;
3952 }
3953 }
3954 ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0;
3955 dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps)
3956 << " seq " << ecap.seq << " mseq " << ecap.mseq
11fdf7f2 3957 << " xattrv " << xattr_version << dendl;
7c673cae
FG
3958
3959 if (inline_data.length() && cap) {
3960 if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) {
3961 dout(10) << "including inline version " << inline_version << dendl;
3962 cap->client_inline_version = inline_version;
3963 } else {
3964 dout(10) << "dropping inline version " << inline_version << dendl;
3965 inline_version = 0;
3966 inline_data.clear();
3967 }
3968 }
3969
3970 // include those xattrs?
11fdf7f2 3971 if (xattr_version && cap) {
7c673cae 3972 if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) {
11fdf7f2
TL
3973 dout(10) << "including xattrs version " << xattr_version << dendl;
3974 cap->client_xattr_version = xattr_version;
7c673cae 3975 } else {
11fdf7f2 3976 dout(10) << "dropping xattrs version " << xattr_version << dendl;
7c673cae
FG
3977 xattr_version = 0;
3978 }
3979 }
3980
11fdf7f2
TL
3981 // The end result of encode_xattrs() is equivalent to:
3982 // {
3983 // bufferlist xbl;
3984 // if (xattr_version) {
3985 // if (pxattrs)
3986 // encode(*pxattrs, bl);
3987 // else
3988 // encode((__u32)0, bl);
3989 // }
3990 // encode(xbl, bl);
3991 // }
3992 //
3993 // But encoding xattrs into the 'xbl' requires a memory allocation.
3994 // The 'bl' should have enough pre-allocated memory in most cases.
3995 // Encoding xattrs directly into it can avoid the extra allocation.
3996 auto encode_xattrs = [xattr_version, pxattrs, &bl]() {
3997 using ceph::encode;
3998 if (xattr_version) {
3999 ceph_le32 xbl_len;
4000 auto filler = bl.append_hole(sizeof(xbl_len));
4001 const auto starting_bl_len = bl.length();
4002 if (pxattrs)
4003 encode(*pxattrs, bl);
4004 else
4005 encode((__u32)0, bl);
4006 xbl_len = bl.length() - starting_bl_len;
4007 filler.copy_in(sizeof(xbl_len), (char *)&xbl_len);
4008 } else {
4009 encode((__u32)0, bl);
4010 }
4011 };
4012
7c673cae
FG
4013 /*
4014 * note: encoding matches MClientReply::InodeStat
4015 */
11fdf7f2 4016 if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
f67539c2 4017 ENCODE_START(6, 1, bl);
11fdf7f2
TL
4018 encode(oi->ino, bl);
4019 encode(snapid, bl);
4020 encode(oi->rdev, bl);
4021 encode(version, bl);
4022 encode(xattr_version, bl);
4023 encode(ecap, bl);
4024 {
4025 ceph_file_layout legacy_layout;
4026 layout.to_legacy(&legacy_layout);
4027 encode(legacy_layout, bl);
4028 }
4029 encode(any_i->ctime, bl);
4030 encode(file_i->mtime, bl);
4031 encode(file_i->atime, bl);
4032 encode(file_i->time_warp_seq, bl);
4033 encode(file_i->size, bl);
4034 encode(max_size, bl);
4035 encode(file_i->truncate_size, bl);
4036 encode(file_i->truncate_seq, bl);
4037 encode(auth_i->mode, bl);
4038 encode((uint32_t)auth_i->uid, bl);
4039 encode((uint32_t)auth_i->gid, bl);
4040 encode(link_i->nlink, bl);
4041 encode(file_i->dirstat.nfiles, bl);
4042 encode(file_i->dirstat.nsubdirs, bl);
4043 encode(file_i->rstat.rbytes, bl);
4044 encode(file_i->rstat.rfiles, bl);
4045 encode(file_i->rstat.rsubdirs, bl);
4046 encode(file_i->rstat.rctime, bl);
4047 dirfragtree.encode(bl);
4048 encode(symlink, bl);
4049 encode(file_i->dir_layout, bl);
4050 encode_xattrs();
4051 encode(inline_version, bl);
4052 encode(inline_data, bl);
f67539c2 4053 const mempool_inode *policy_i = ppolicy ? pi : oi;
11fdf7f2
TL
4054 encode(policy_i->quota, bl);
4055 encode(layout.pool_ns, bl);
4056 encode(any_i->btime, bl);
4057 encode(any_i->change_attr, bl);
4058 encode(file_i->export_pin, bl);
81eedcae 4059 encode(snap_btime, bl);
f67539c2
TL
4060 encode(file_i->rstat.rsnaps, bl);
4061 encode(snap_metadata, bl);
4062 encode(file_i->fscrypt, bl);
11fdf7f2
TL
4063 ENCODE_FINISH(bl);
4064 }
4065 else {
4066 ceph_assert(session->get_connection());
4067
4068 encode(oi->ino, bl);
4069 encode(snapid, bl);
4070 encode(oi->rdev, bl);
4071 encode(version, bl);
4072 encode(xattr_version, bl);
4073 encode(ecap, bl);
4074 {
4075 ceph_file_layout legacy_layout;
4076 layout.to_legacy(&legacy_layout);
4077 encode(legacy_layout, bl);
4078 }
4079 encode(any_i->ctime, bl);
4080 encode(file_i->mtime, bl);
4081 encode(file_i->atime, bl);
4082 encode(file_i->time_warp_seq, bl);
4083 encode(file_i->size, bl);
4084 encode(max_size, bl);
4085 encode(file_i->truncate_size, bl);
4086 encode(file_i->truncate_seq, bl);
4087 encode(auth_i->mode, bl);
4088 encode((uint32_t)auth_i->uid, bl);
4089 encode((uint32_t)auth_i->gid, bl);
4090 encode(link_i->nlink, bl);
4091 encode(file_i->dirstat.nfiles, bl);
4092 encode(file_i->dirstat.nsubdirs, bl);
4093 encode(file_i->rstat.rbytes, bl);
4094 encode(file_i->rstat.rfiles, bl);
4095 encode(file_i->rstat.rsubdirs, bl);
4096 encode(file_i->rstat.rctime, bl);
4097 dirfragtree.encode(bl);
4098 encode(symlink, bl);
4099 auto& conn = session->get_connection();
4100 if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) {
4101 encode(file_i->dir_layout, bl);
4102 }
4103 encode_xattrs();
4104 if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4105 encode(inline_version, bl);
4106 encode(inline_data, bl);
4107 }
4108 if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) {
f67539c2 4109 const mempool_inode *policy_i = ppolicy ? pi : oi;
11fdf7f2
TL
4110 encode(policy_i->quota, bl);
4111 }
4112 if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
4113 encode(layout.pool_ns, bl);
4114 }
4115 if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) {
4116 encode(any_i->btime, bl);
4117 encode(any_i->change_attr, bl);
4118 }
7c673cae
FG
4119 }
4120
4121 return valid;
4122}
4123
9f95a23c 4124void CInode::encode_cap_message(const ref_t<MClientCaps> &m, Capability *cap)
7c673cae 4125{
11fdf7f2 4126 ceph_assert(cap);
7c673cae
FG
4127
4128 client_t client = cap->get_client();
4129
4130 bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL);
4131 bool pauth = authlock.is_xlocked_by_client(client);
4132 bool plink = linklock.is_xlocked_by_client(client);
4133 bool pxattr = xattrlock.is_xlocked_by_client(client);
4134
f67539c2
TL
4135 const mempool_inode *oi = get_inode().get();
4136 const mempool_inode *pi = get_projected_inode().get();
4137 const mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi;
7c673cae 4138
11fdf7f2 4139 dout(20) << __func__ << " pfile " << pfile
7c673cae
FG
4140 << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
4141 << " ctime " << i->ctime << dendl;
4142
4143 i = pfile ? pi:oi;
4144 m->set_layout(i->layout);
4145 m->size = i->size;
4146 m->truncate_seq = i->truncate_seq;
4147 m->truncate_size = i->truncate_size;
4148 m->mtime = i->mtime;
4149 m->atime = i->atime;
4150 m->ctime = i->ctime;
20effc67 4151 m->btime = i->btime;
7c673cae
FG
4152 m->change_attr = i->change_attr;
4153 m->time_warp_seq = i->time_warp_seq;
28e407b8
AA
4154 m->nfiles = i->dirstat.nfiles;
4155 m->nsubdirs = i->dirstat.nsubdirs;
7c673cae
FG
4156
4157 if (cap->client_inline_version < i->inline_data.version) {
4158 m->inline_version = cap->client_inline_version = i->inline_data.version;
4159 if (i->inline_data.length() > 0)
f67539c2 4160 i->inline_data.get_data(m->inline_data);
7c673cae
FG
4161 } else {
4162 m->inline_version = 0;
4163 }
4164
4165 // max_size is min of projected, actual.
f91f0fd5
TL
4166 uint64_t oldms = oi->get_client_range(client);
4167 uint64_t newms = pi->get_client_range(client);
11fdf7f2 4168 m->max_size = std::min(oldms, newms);
7c673cae
FG
4169
4170 i = pauth ? pi:oi;
4171 m->head.mode = i->mode;
4172 m->head.uid = i->uid;
4173 m->head.gid = i->gid;
4174
4175 i = plink ? pi:oi;
4176 m->head.nlink = i->nlink;
4177
11fdf7f2 4178 using ceph::encode;
7c673cae 4179 i = pxattr ? pi:oi;
f67539c2 4180 const auto& ix = pxattr ? get_projected_xattrs() : get_xattrs();
7c673cae
FG
4181 if ((cap->pending() & CEPH_CAP_XATTR_SHARED) &&
4182 i->xattr_version > cap->client_xattr_version) {
4183 dout(10) << " including xattrs v " << i->xattr_version << dendl;
f67539c2
TL
4184 if (ix)
4185 encode(*ix, m->xattrbl);
4186 else
4187 encode((__u32)0, m->xattrbl);
7c673cae
FG
4188 m->head.xattr_version = i->xattr_version;
4189 cap->client_xattr_version = i->xattr_version;
4190 }
4191}
4192
4193
4194
4195void CInode::_encode_base(bufferlist& bl, uint64_t features)
4196{
9f95a23c 4197 ENCODE_START(1, 1, bl);
11fdf7f2 4198 encode(first, bl);
f67539c2 4199 encode(*get_inode(), bl, features);
11fdf7f2
TL
4200 encode(symlink, bl);
4201 encode(dirfragtree, bl);
f67539c2
TL
4202 encode_xattrs(bl);
4203 encode_old_inodes(bl, features);
11fdf7f2 4204 encode(damage_flags, bl);
7c673cae 4205 encode_snap(bl);
9f95a23c 4206 ENCODE_FINISH(bl);
7c673cae 4207}
11fdf7f2 4208void CInode::_decode_base(bufferlist::const_iterator& p)
7c673cae 4209{
9f95a23c 4210 DECODE_START(1, p);
11fdf7f2 4211 decode(first, p);
f67539c2
TL
4212 {
4213 auto _inode = allocate_inode();
4214 decode(*_inode, p);
4215 reset_inode(std::move(_inode));
4216 }
94b18763
FG
4217 {
4218 std::string tmp;
11fdf7f2
TL
4219 decode(tmp, p);
4220 symlink = std::string_view(tmp);
94b18763 4221 }
11fdf7f2 4222 decode(dirfragtree, p);
f67539c2
TL
4223 decode_xattrs(p);
4224 decode_old_inodes(p);
11fdf7f2 4225 decode(damage_flags, p);
7c673cae 4226 decode_snap(p);
9f95a23c 4227 DECODE_FINISH(p);
7c673cae
FG
4228}
4229
4230void CInode::_encode_locks_full(bufferlist& bl)
4231{
11fdf7f2
TL
4232 using ceph::encode;
4233 encode(authlock, bl);
4234 encode(linklock, bl);
4235 encode(dirfragtreelock, bl);
4236 encode(filelock, bl);
4237 encode(xattrlock, bl);
4238 encode(snaplock, bl);
4239 encode(nestlock, bl);
4240 encode(flocklock, bl);
4241 encode(policylock, bl);
4242
4243 encode(loner_cap, bl);
4244}
4245void CInode::_decode_locks_full(bufferlist::const_iterator& p)
4246{
4247 using ceph::decode;
4248 decode(authlock, p);
4249 decode(linklock, p);
4250 decode(dirfragtreelock, p);
4251 decode(filelock, p);
4252 decode(xattrlock, p);
4253 decode(snaplock, p);
4254 decode(nestlock, p);
4255 decode(flocklock, p);
4256 decode(policylock, p);
4257
4258 decode(loner_cap, p);
7c673cae
FG
4259 set_loner_cap(loner_cap);
4260 want_loner_cap = loner_cap; // for now, we'll eval() shortly.
4261}
4262
b32b8144 4263void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
7c673cae 4264{
9f95a23c 4265 ENCODE_START(1, 1, bl);
7c673cae
FG
4266 authlock.encode_state_for_replica(bl);
4267 linklock.encode_state_for_replica(bl);
4268 dirfragtreelock.encode_state_for_replica(bl);
4269 filelock.encode_state_for_replica(bl);
4270 nestlock.encode_state_for_replica(bl);
4271 xattrlock.encode_state_for_replica(bl);
4272 snaplock.encode_state_for_replica(bl);
4273 flocklock.encode_state_for_replica(bl);
4274 policylock.encode_state_for_replica(bl);
11fdf7f2 4275 encode(need_recover, bl);
9f95a23c 4276 ENCODE_FINISH(bl);
7c673cae 4277}
b32b8144 4278
7c673cae
FG
4279void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
4280{
4281 authlock.encode_state_for_replica(bl);
4282 linklock.encode_state_for_replica(bl);
4283 dirfragtreelock.encode_state_for_rejoin(bl, rep);
4284 filelock.encode_state_for_rejoin(bl, rep);
4285 nestlock.encode_state_for_rejoin(bl, rep);
4286 xattrlock.encode_state_for_replica(bl);
4287 snaplock.encode_state_for_replica(bl);
4288 flocklock.encode_state_for_replica(bl);
4289 policylock.encode_state_for_replica(bl);
4290}
b32b8144 4291
9f95a23c 4292void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new)
7c673cae 4293{
9f95a23c 4294 DECODE_START(1, p);
7c673cae
FG
4295 authlock.decode_state(p, is_new);
4296 linklock.decode_state(p, is_new);
4297 dirfragtreelock.decode_state(p, is_new);
4298 filelock.decode_state(p, is_new);
4299 nestlock.decode_state(p, is_new);
4300 xattrlock.decode_state(p, is_new);
4301 snaplock.decode_state(p, is_new);
4302 flocklock.decode_state(p, is_new);
4303 policylock.decode_state(p, is_new);
b32b8144
FG
4304
4305 bool need_recover;
11fdf7f2 4306 decode(need_recover, p);
b32b8144
FG
4307 if (need_recover && is_new) {
4308 // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
4309 // and change the object when replaying unsafe requests.
4310 authlock.mark_need_recover();
4311 linklock.mark_need_recover();
4312 dirfragtreelock.mark_need_recover();
4313 filelock.mark_need_recover();
4314 nestlock.mark_need_recover();
4315 xattrlock.mark_need_recover();
4316 snaplock.mark_need_recover();
4317 flocklock.mark_need_recover();
4318 policylock.mark_need_recover();
4319 }
9f95a23c 4320 DECODE_FINISH(p);
7c673cae 4321}
11fdf7f2 4322void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
b32b8144
FG
4323 list<SimpleLock*>& eval_locks, bool survivor)
4324{
4325 authlock.decode_state_rejoin(p, waiters, survivor);
4326 linklock.decode_state_rejoin(p, waiters, survivor);
4327 dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
4328 filelock.decode_state_rejoin(p, waiters, survivor);
4329 nestlock.decode_state_rejoin(p, waiters, survivor);
4330 xattrlock.decode_state_rejoin(p, waiters, survivor);
4331 snaplock.decode_state_rejoin(p, waiters, survivor);
4332 flocklock.decode_state_rejoin(p, waiters, survivor);
4333 policylock.decode_state_rejoin(p, waiters, survivor);
7c673cae
FG
4334
4335 if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
4336 eval_locks.push_back(&dirfragtreelock);
4337 if (!filelock.is_stable() && !filelock.is_wrlocked())
4338 eval_locks.push_back(&filelock);
4339 if (!nestlock.is_stable() && !nestlock.is_wrlocked())
4340 eval_locks.push_back(&nestlock);
4341}
4342
4343
4344// IMPORT/EXPORT
4345
4346void CInode::encode_export(bufferlist& bl)
4347{
4348 ENCODE_START(5, 4, bl);
4349 _encode_base(bl, mdcache->mds->mdsmap->get_up_features());
4350
11fdf7f2 4351 encode(state, bl);
7c673cae 4352
11fdf7f2 4353 encode(pop, bl);
7c673cae 4354
11fdf7f2 4355 encode(get_replicas(), bl);
7c673cae
FG
4356
4357 // include scatterlock info for any bounding CDirs
4358 bufferlist bounding;
f67539c2 4359 if (get_inode()->is_dir())
94b18763
FG
4360 for (const auto &p : dirfrags) {
4361 CDir *dir = p.second;
7c673cae 4362 if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
11fdf7f2 4363 encode(p.first, bounding);
f67539c2
TL
4364 encode(dir->get_fnode()->fragstat, bounding);
4365 encode(dir->get_fnode()->accounted_fragstat, bounding);
4366 encode(dir->get_fnode()->rstat, bounding);
4367 encode(dir->get_fnode()->accounted_rstat, bounding);
7c673cae
FG
4368 dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
4369 }
4370 }
11fdf7f2 4371 encode(bounding, bl);
7c673cae
FG
4372
4373 _encode_locks_full(bl);
4374
4375 _encode_file_locks(bl);
4376
4377 ENCODE_FINISH(bl);
4378
4379 get(PIN_TEMPEXPORTING);
4380}
4381
11fdf7f2 4382void CInode::finish_export()
7c673cae
FG
4383{
4384 state &= MASK_STATE_EXPORT_KEPT;
4385
11fdf7f2 4386 pop.zero();
7c673cae
FG
4387
4388 // just in case!
4389 //dirlock.clear_updated();
4390
4391 loner_cap = -1;
4392
4393 put(PIN_TEMPEXPORTING);
4394}
4395
11fdf7f2 4396void CInode::decode_import(bufferlist::const_iterator& p,
7c673cae
FG
4397 LogSegment *ls)
4398{
4399 DECODE_START(5, p);
4400
4401 _decode_base(p);
4402
f6b5b4d7
TL
4403 {
4404 unsigned s;
4405 decode(s, p);
4406 s &= MASK_STATE_EXPORTED;
4407
f67539c2
TL
4408 set_ephemeral_pin((s & STATE_DISTEPHEMERALPIN),
4409 (s & STATE_RANDEPHEMERALPIN));
f6b5b4d7
TL
4410 state_set(STATE_AUTH | s);
4411 }
7c673cae
FG
4412
4413 if (is_dirty()) {
4414 get(PIN_DIRTY);
4415 _mark_dirty(ls);
4416 }
4417 if (is_dirty_parent()) {
4418 get(PIN_DIRTYPARENT);
28e407b8 4419 mark_dirty_parent(ls);
7c673cae
FG
4420 }
4421
11fdf7f2 4422 decode(pop, p);
7c673cae 4423
11fdf7f2 4424 decode(get_replicas(), p);
181888fb 4425 if (is_replicated())
7c673cae
FG
4426 get(PIN_REPLICATED);
4427 replica_nonce = 0;
4428
4429 // decode fragstat info on bounding cdirs
4430 bufferlist bounding;
11fdf7f2
TL
4431 decode(bounding, p);
4432 auto q = bounding.cbegin();
7c673cae
FG
4433 while (!q.end()) {
4434 frag_t fg;
11fdf7f2 4435 decode(fg, q);
7c673cae 4436 CDir *dir = get_dirfrag(fg);
11fdf7f2 4437 ceph_assert(dir); // we should have all bounds open
7c673cae
FG
4438
4439 // Only take the remote's fragstat/rstat if we are non-auth for
4440 // this dirfrag AND the lock is NOT in a scattered (MIX) state.
4441 // We know lock is stable, and MIX is the only state in which
4442 // the inode auth (who sent us this data) may not have the best
4443 // info.
4444
4445 // HMM: Are there cases where dir->is_auth() is an insufficient
4446 // check because the dirfrag is under migration? That implies
4447 // it is frozen (and in a SYNC or LOCK state). FIXME.
4448
f67539c2 4449 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
7c673cae
FG
4450 if (dir->is_auth() ||
4451 filelock.get_state() == LOCK_MIX) {
4452 dout(10) << " skipped fragstat info for " << *dir << dendl;
4453 frag_info_t f;
11fdf7f2
TL
4454 decode(f, q);
4455 decode(f, q);
7c673cae 4456 } else {
f67539c2
TL
4457 decode(_fnode->fragstat, q);
4458 decode(_fnode->accounted_fragstat, q);
7c673cae
FG
4459 dout(10) << " took fragstat info for " << *dir << dendl;
4460 }
4461 if (dir->is_auth() ||
4462 nestlock.get_state() == LOCK_MIX) {
4463 dout(10) << " skipped rstat info for " << *dir << dendl;
4464 nest_info_t n;
11fdf7f2
TL
4465 decode(n, q);
4466 decode(n, q);
7c673cae 4467 } else {
f67539c2
TL
4468 decode(_fnode->rstat, q);
4469 decode(_fnode->accounted_rstat, q);
7c673cae
FG
4470 dout(10) << " took rstat info for " << *dir << dendl;
4471 }
f67539c2 4472 dir->reset_fnode(std::move(_fnode));
7c673cae
FG
4473 }
4474
4475 _decode_locks_full(p);
4476
4477 _decode_file_locks(p);
4478
4479 DECODE_FINISH(p);
4480}
4481
4482
4483void InodeStoreBase::dump(Formatter *f) const
4484{
f67539c2 4485 inode->dump(f);
7c673cae 4486 f->dump_string("symlink", symlink);
9f95a23c
TL
4487
4488 f->open_array_section("xattrs");
f67539c2
TL
4489 if (xattrs) {
4490 for (const auto& [key, val] : *xattrs) {
4491 f->open_object_section("xattr");
4492 f->dump_string("key", key);
4493 std::string v(val.c_str(), val.length());
4494 f->dump_string("val", v);
4495 f->close_section();
4496 }
9f95a23c
TL
4497 }
4498 f->close_section();
4499 f->open_object_section("dirfragtree");
4500 dirfragtree.dump(f);
4501 f->close_section(); // dirfragtree
4502
7c673cae 4503 f->open_array_section("old_inodes");
f67539c2
TL
4504 if (old_inodes) {
4505 for (const auto &p : *old_inodes) {
4506 f->open_object_section("old_inode");
4507 // The key is the last snapid, the first is in the mempool_old_inode
4508 f->dump_int("last", p.first);
4509 p.second.dump(f);
4510 f->close_section(); // old_inode
4511 }
7c673cae
FG
4512 }
4513 f->close_section(); // old_inodes
4514
9f95a23c
TL
4515 f->dump_unsigned("oldest_snap", oldest_snap);
4516 f->dump_unsigned("damage_flags", damage_flags);
7c673cae
FG
4517}
4518
f67539c2
TL
4519template <>
4520void decode_json_obj(mempool::mds_co::string& t, JSONObj *obj){
4521
4522 t = mempool::mds_co::string(std::string_view(obj->get_data()));
4523}
4524
4525void InodeStoreBase::decode_json(JSONObj *obj)
4526{
4527 {
4528 auto _inode = allocate_inode();
4529 _inode->decode_json(obj);
4530 reset_inode(std::move(_inode));
4531 }
4532
4533 JSONDecoder::decode_json("symlink", symlink, obj, true);
4534 // JSONDecoder::decode_json("dirfragtree", dirfragtree, obj, true); // cann't decode it now
4535 //
4536 //
4537 {
4538 mempool_xattr_map tmp;
4539 JSONDecoder::decode_json("xattrs", tmp, xattrs_cb, obj, true);
4540 if (tmp.empty())
4541 reset_xattrs(xattr_map_ptr());
4542 else
4543 reset_xattrs(allocate_xattr_map(std::move(tmp)));
4544 }
4545 // JSONDecoder::decode_json("old_inodes", old_inodes, InodeStoreBase::old_indoes_cb, obj, true); // cann't decode old_inodes now
4546 JSONDecoder::decode_json("oldest_snap", oldest_snap.val, obj, true);
4547 JSONDecoder::decode_json("damage_flags", damage_flags, obj, true);
4548 //sr_t srnode;
4549 //JSONDecoder::decode_json("snap_blob", srnode, obj, true); // cann't decode it now
4550 //snap_blob = srnode;
4551}
4552
4553void InodeStoreBase::xattrs_cb(InodeStoreBase::mempool_xattr_map& c, JSONObj *obj){
4554
4555 string k;
4556 JSONDecoder::decode_json("key", k, obj, true);
4557 string v;
4558 JSONDecoder::decode_json("val", v, obj, true);
4559 c[k.c_str()] = buffer::copy(v.c_str(), v.size());
4560}
4561
4562void InodeStoreBase::old_indoes_cb(InodeStoreBase::mempool_old_inode_map& c, JSONObj *obj){
4563
4564 snapid_t s;
4565 JSONDecoder::decode_json("last", s.val, obj, true);
4566 InodeStoreBase::mempool_old_inode i;
4567 // i.decode_json(obj); // cann't decode now, simon
4568 c[s] = i;
4569}
7c673cae 4570
9f95a23c 4571void InodeStore::generate_test_instances(std::list<InodeStore*> &ls)
7c673cae
FG
4572{
4573 InodeStore *populated = new InodeStore;
f67539c2 4574 populated->get_inode()->ino = 0xdeadbeef;
7c673cae
FG
4575 populated->symlink = "rhubarb";
4576 ls.push_back(populated);
4577}
4578
9f95a23c 4579void InodeStoreBare::generate_test_instances(std::list<InodeStoreBare*> &ls)
11fdf7f2
TL
4580{
4581 InodeStoreBare *populated = new InodeStoreBare;
f67539c2 4582 populated->get_inode()->ino = 0xdeadbeef;
11fdf7f2
TL
4583 populated->symlink = "rhubarb";
4584 ls.push_back(populated);
4585}
4586
7c673cae 4587void CInode::validate_disk_state(CInode::validated_data *results,
11fdf7f2 4588 MDSContext *fin)
7c673cae
FG
4589{
4590 class ValidationContinuation : public MDSContinuation {
4591 public:
11fdf7f2 4592 MDSContext *fin;
7c673cae
FG
4593 CInode *in;
4594 CInode::validated_data *results;
4595 bufferlist bl;
4596 CInode *shadow_in;
4597
4598 enum {
4599 START = 0,
4600 BACKTRACE,
4601 INODE,
11fdf7f2
TL
4602 DIRFRAGS,
4603 SNAPREALM,
7c673cae
FG
4604 };
4605
4606 ValidationContinuation(CInode *i,
4607 CInode::validated_data *data_r,
11fdf7f2 4608 MDSContext *fin_) :
7c673cae
FG
4609 MDSContinuation(i->mdcache->mds->server),
4610 fin(fin_),
4611 in(i),
4612 results(data_r),
4613 shadow_in(NULL) {
4614 set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start));
4615 set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace));
4616 set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk));
4617 set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags));
4618 }
4619
4620 ~ValidationContinuation() override {
b32b8144
FG
4621 if (shadow_in) {
4622 delete shadow_in;
4623 in->mdcache->num_shadow_inodes--;
4624 }
7c673cae
FG
4625 }
4626
4627 /**
4628 * Fetch backtrace and set tag if tag is non-empty
4629 */
11fdf7f2
TL
4630 void fetch_backtrace_and_tag(CInode *in,
4631 std::string_view tag, bool is_internal,
7c673cae
FG
4632 Context *fin, int *bt_r, bufferlist *bt)
4633 {
4634 const int64_t pool = in->get_backtrace_pool();
4635 object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");
4636
4637 ObjectOperation fetch;
4638 fetch.getxattr("parent", bt, bt_r);
4639 in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
4640 NULL, 0, fin);
f67539c2
TL
4641 if (in->mdcache->mds->logger) {
4642 in->mdcache->mds->logger->inc(l_mds_openino_backtrace_fetch);
4643 in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_fetch);
4644 }
4645
11fdf7f2
TL
4646 using ceph::encode;
4647 if (!is_internal) {
4648 ObjectOperation scrub_tag;
7c673cae 4649 bufferlist tag_bl;
11fdf7f2 4650 encode(tag, tag_bl);
7c673cae
FG
4651 scrub_tag.setxattr("scrub_tag", tag_bl);
4652 SnapContext snapc;
4653 in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
4654 ceph::real_clock::now(),
4655 0, NULL);
f67539c2
TL
4656 if (in->mdcache->mds->logger)
4657 in->mdcache->mds->logger->inc(l_mds_scrub_set_tag);
7c673cae
FG
4658 }
4659 }
4660
4661 bool _start(int rval) {
f67539c2
TL
4662 ceph_assert(in->can_auth_pin());
4663 in->auth_pin(this);
4664
7c673cae 4665 if (in->is_dirty()) {
f67539c2
TL
4666 MDCache *mdcache = in->mdcache; // For the benefit of dout
4667 auto ino = [this]() { return in->ino(); }; // For the benefit of dout
11fdf7f2 4668 dout(20) << "validating a dirty CInode; results will be inconclusive"
f67539c2 4669 << dendl;
7c673cae 4670 }
11fdf7f2 4671
7c673cae 4672 C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
11fdf7f2
TL
4673 in->mdcache->mds->finisher);
4674
4675 std::string_view tag = in->scrub_infop->header->get_tag();
4676 bool is_internal = in->scrub_infop->header->is_internal_tag();
4677 // Rather than using the usual CInode::fetch_backtrace,
4678 // use a special variant that optionally writes a tag in the same
4679 // operation.
4680 fetch_backtrace_and_tag(in, tag, is_internal, conf, &results->backtrace.ondisk_read_retval, &bl);
7c673cae
FG
4681 return false;
4682 }
4683
4684 bool _backtrace(int rval) {
4685 // set up basic result reporting and make sure we got the data
4686 results->performed_validation = true; // at least, some of it!
4687 results->backtrace.checked = true;
4688
4689 const int64_t pool = in->get_backtrace_pool();
4690 inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
4691 in->build_backtrace(pool, memory_backtrace);
4692 bool equivalent, divergent;
4693 int memory_newer;
4694
4695 MDCache *mdcache = in->mdcache; // For the benefit of dout
f67539c2 4696 auto ino = [this]() { return in->ino(); }; // For the benefit of dout
7c673cae
FG
4697
4698 // Ignore rval because it's the result of a FAILOK operation
4699 // from fetch_backtrace_and_tag: the real result is in
4700 // backtrace.ondisk_read_retval
4701 dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
4702 if (results->backtrace.ondisk_read_retval != 0) {
4703 results->backtrace.error_str << "failed to read off disk; see retval";
e306af50
TL
4704 // we probably have a new unwritten file!
4705 // so skip the backtrace scrub for this entry and say that all's well
f67539c2
TL
4706 if (in->is_dirty_parent()) {
4707 dout(20) << "forcing backtrace as passed since inode is dirty parent" << dendl;
e306af50 4708 results->backtrace.passed = true;
f67539c2 4709 }
e306af50 4710 goto next;
7c673cae
FG
4711 }
4712
4713 // extract the backtrace, and compare it to a newly-constructed one
4714 try {
11fdf7f2
TL
4715 auto p = bl.cbegin();
4716 using ceph::decode;
4717 decode(results->backtrace.ondisk_value, p);
7c673cae
FG
4718 dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
4719 } catch (buffer::error&) {
4720 if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
4721 // Cases where something has clearly gone wrong with the overall
4722 // fetch op, though we didn't get a nonzero rc from the getxattr
4723 // operation. e.g. object missing.
4724 results->backtrace.ondisk_read_retval = rval;
4725 }
4726 results->backtrace.error_str << "failed to decode on-disk backtrace ("
4727 << bl.length() << " bytes)!";
e306af50
TL
4728 // we probably have a new unwritten file!
4729 // so skip the backtrace scrub for this entry and say that all's well
f67539c2
TL
4730 if (in->is_dirty_parent()) {
4731 dout(20) << "decode failed; forcing backtrace as passed since "
4732 "inode is dirty parent" << dendl;
e306af50 4733 results->backtrace.passed = true;
f67539c2 4734 }
e306af50 4735
7c673cae
FG
4736 goto next;
4737 }
4738
4739 memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
4740 &equivalent, &divergent);
4741
4742 if (divergent || memory_newer < 0) {
e306af50
TL
4743 // we're divergent, or on-disk version is newer
4744 results->backtrace.error_str << "On-disk backtrace is divergent or newer";
f67539c2
TL
4745 /* if the backtraces are divergent and the link count is 0, then
4746 * most likely its a stray entry that's being purged and things are
4747 * well and there's no reason for alarm
4748 */
4749 if (divergent && (in->is_dirty_parent() || in->get_inode()->nlink == 0)) {
e306af50 4750 results->backtrace.passed = true;
f67539c2
TL
4751 dout(20) << "divergent backtraces are acceptable when dn "
4752 "is being purged or has been renamed or moved to a "
4753 "different directory " << *in << dendl;
4754 }
7c673cae
FG
4755 } else {
4756 results->backtrace.passed = true;
4757 }
4758next:
4759
4760 if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
4761 std::string path;
4762 in->make_path_string(path);
d2e6a577
FG
4763 in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
4764 << "(" << path << "), rewriting it";
28e407b8 4765 in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
7c673cae 4766 false);
b32b8144
FG
4767 // Flag that we repaired this BT so that it won't go into damagetable
4768 results->backtrace.repaired = true;
f67539c2
TL
4769 if (in->mdcache->mds->logger)
4770 in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired);
7c673cae
FG
4771 }
4772
4773 // If the inode's number was free in the InoTable, fix that
4774 // (#15619)
4775 {
4776 InoTable *inotable = mdcache->mds->inotable;
4777
f67539c2 4778 dout(10) << "scrub: inotable ino = " << in->ino() << dendl;
7c673cae 4779 dout(10) << "scrub: inotable free says "
f67539c2 4780 << inotable->is_marked_free(in->ino()) << dendl;
7c673cae 4781
f67539c2 4782 if (inotable->is_marked_free(in->ino())) {
7c673cae 4783 LogChannelRef clog = in->mdcache->mds->clog;
f67539c2 4784 clog->error() << "scrub: inode wrongly marked free: " << in->ino();
7c673cae
FG
4785
4786 if (in->scrub_infop->header->get_repair()) {
f67539c2 4787 bool repaired = inotable->repair(in->ino());
7c673cae 4788 if (repaired) {
f67539c2 4789 clog->error() << "inode table repaired for inode: " << in->ino();
7c673cae
FG
4790
4791 inotable->save();
f67539c2
TL
4792 if (in->mdcache->mds->logger)
4793 in->mdcache->mds->logger->inc(l_mds_scrub_inotable_repaired);
7c673cae
FG
4794 } else {
4795 clog->error() << "Cannot repair inotable while other operations"
4796 " are in progress";
4797 }
4798 }
4799 }
4800 }
4801
7c673cae 4802
11fdf7f2 4803 if (in->is_dir()) {
f67539c2
TL
4804 if (in->mdcache->mds->logger)
4805 in->mdcache->mds->logger->inc(l_mds_scrub_dir_inodes);
11fdf7f2
TL
4806 return validate_directory_data();
4807 } else {
f67539c2
TL
4808 if (in->mdcache->mds->logger)
4809 in->mdcache->mds->logger->inc(l_mds_scrub_file_inodes);
11fdf7f2 4810 // TODO: validate on-disk inode for normal files
f67539c2 4811 return true;
11fdf7f2 4812 }
7c673cae
FG
4813 }
4814
4815 bool validate_directory_data() {
11fdf7f2 4816 ceph_assert(in->is_dir());
7c673cae
FG
4817
4818 if (in->is_base()) {
b32b8144
FG
4819 if (!shadow_in) {
4820 shadow_in = new CInode(in->mdcache);
f67539c2 4821 in->mdcache->create_unlinked_system_inode(shadow_in, in->ino(), in->get_inode()->mode);
b32b8144
FG
4822 in->mdcache->num_shadow_inodes++;
4823 }
7c673cae 4824 shadow_in->fetch(get_internal_callback(INODE));
f67539c2
TL
4825 if (in->mdcache->mds->logger)
4826 in->mdcache->mds->logger->inc(l_mds_scrub_dir_base_inodes);
7c673cae
FG
4827 return false;
4828 } else {
11fdf7f2 4829 // TODO: validate on-disk inode for non-base directories
f67539c2
TL
4830 if (in->mdcache->mds->logger)
4831 in->mdcache->mds->logger->inc(l_mds_scrub_dirfrag_rstats);
7c673cae 4832 results->inode.passed = true;
11fdf7f2 4833 return check_dirfrag_rstats();
7c673cae
FG
4834 }
4835 }
4836
4837 bool _inode_disk(int rval) {
f67539c2
TL
4838 const auto& si = shadow_in->get_inode();
4839 const auto& i = in->get_inode();
4840
7c673cae
FG
4841 results->inode.checked = true;
4842 results->inode.ondisk_read_retval = rval;
f67539c2
TL
4843 results->inode.ondisk_value = *si;
4844 results->inode.memory_value = *i;
7c673cae 4845
f67539c2 4846 if (si->version > i->version) {
7c673cae 4847 // uh, what?
11fdf7f2 4848 results->inode.error_str << "On-disk inode is newer than in-memory one; ";
7c673cae
FG
4849 goto next;
4850 } else {
4851 bool divergent = false;
f67539c2 4852 int r = i->compare(*si, &divergent);
7c673cae
FG
4853 results->inode.passed = !divergent && r >= 0;
4854 if (!results->inode.passed) {
4855 results->inode.error_str <<
11fdf7f2 4856 "On-disk inode is divergent or newer than in-memory one; ";
7c673cae
FG
4857 goto next;
4858 }
4859 }
4860next:
4861 return check_dirfrag_rstats();
4862 }
4863
4864 bool check_dirfrag_rstats() {
f67539c2
TL
4865 if (in->has_subtree_root_dirfrag()) {
4866 in->mdcache->rdlock_dirfrags_stats(in, get_internal_callback(DIRFRAGS));
4867 return false;
7c673cae 4868 } else {
f67539c2 4869 return immediate(DIRFRAGS, 0);
7c673cae
FG
4870 }
4871 }
4872
4873 bool _dirfrags(int rval) {
7c673cae
FG
4874 // basic reporting setup
4875 results->raw_stats.checked = true;
4876 results->raw_stats.ondisk_read_retval = rval;
4877
f67539c2
TL
4878 results->raw_stats.memory_value.dirstat = in->get_inode()->dirstat;
4879 results->raw_stats.memory_value.rstat = in->get_inode()->rstat;
7c673cae
FG
4880 frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
4881 nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;
4882
4883 if (rval != 0) {
4884 results->raw_stats.error_str << "Failed to read dirfrags off disk";
4885 goto next;
4886 }
4887
4888 // check each dirfrag...
94b18763
FG
4889 for (const auto &p : in->dirfrags) {
4890 CDir *dir = p.second;
11fdf7f2 4891 ceph_assert(dir->get_version() > 0);
f67539c2
TL
4892 nest_info.add(dir->get_fnode()->accounted_rstat);
4893 dir_info.add(dir->get_fnode()->accounted_fragstat);
7c673cae
FG
4894 }
4895 nest_info.rsubdirs++; // it gets one to account for self
11fdf7f2
TL
4896 if (const sr_t *srnode = in->get_projected_srnode(); srnode)
4897 nest_info.rsnaps += srnode->snaps.size();
4898
7c673cae 4899 // ...and that their sum matches our inode settings
f67539c2
TL
4900 if (!dir_info.same_sums(in->get_inode()->dirstat) ||
4901 !nest_info.same_sums(in->get_inode()->rstat)) {
11fdf7f2 4902 if (in->scrub_infop->header->get_repair()) {
7c673cae
FG
4903 results->raw_stats.error_str
4904 << "freshly-calculated rstats don't match existing ones (will be fixed)";
4905 in->mdcache->repair_inode_stats(in);
b32b8144 4906 results->raw_stats.repaired = true;
7c673cae
FG
4907 } else {
4908 results->raw_stats.error_str
4909 << "freshly-calculated rstats don't match existing ones";
4910 }
f67539c2
TL
4911 if (in->is_dirty()) {
4912 MDCache *mdcache = in->mdcache; // for dout()
4913 auto ino = [this]() { return in->ino(); }; // for dout()
4914 dout(20) << "raw stats most likely wont match since inode is dirty; "
4915 "please rerun scrub when system is stable; "
4916 "assuming passed for now;" << dendl;
4917 results->raw_stats.passed = true;
4918 }
7c673cae
FG
4919 goto next;
4920 }
7c673cae
FG
4921
4922 results->raw_stats.passed = true;
f67539c2
TL
4923 {
4924 MDCache *mdcache = in->mdcache; // for dout()
4925 auto ino = [this]() { return in->ino(); }; // for dout()
4926 dout(20) << "raw stats check passed on " << *in << dendl;
11fdf7f2 4927 }
11fdf7f2 4928
f67539c2 4929next:
7c673cae
FG
4930 return true;
4931 }
4932
4933 void _done() override {
4934 if ((!results->raw_stats.checked || results->raw_stats.passed) &&
4935 (!results->backtrace.checked || results->backtrace.passed) &&
4936 (!results->inode.checked || results->inode.passed))
11fdf7f2
TL
4937 results->passed_validation = true;
4938
4939 // Flag that we did some repair work so that our repair operation
4940 // can be flushed at end of scrub
4941 if (results->backtrace.repaired ||
4942 results->inode.repaired ||
4943 results->raw_stats.repaired)
4944 in->scrub_infop->header->set_repaired();
4945 if (fin)
4946 fin->complete(get_rval());
f67539c2
TL
4947
4948 in->auth_unpin(this);
7c673cae
FG
4949 }
4950 };
4951
4952
4953 dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
4954 ValidationContinuation *vc = new ValidationContinuation(this,
4955 results,
4956 fin);
4957 vc->begin();
4958}
4959
4960void CInode::validated_data::dump(Formatter *f) const
4961{
4962 f->open_object_section("results");
4963 {
4964 f->dump_bool("performed_validation", performed_validation);
4965 f->dump_bool("passed_validation", passed_validation);
4966 f->open_object_section("backtrace");
4967 {
4968 f->dump_bool("checked", backtrace.checked);
4969 f->dump_bool("passed", backtrace.passed);
4970 f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
4971 f->dump_stream("ondisk_value") << backtrace.ondisk_value;
4972 f->dump_stream("memoryvalue") << backtrace.memory_value;
4973 f->dump_string("error_str", backtrace.error_str.str());
4974 }
4975 f->close_section(); // backtrace
4976 f->open_object_section("raw_stats");
4977 {
4978 f->dump_bool("checked", raw_stats.checked);
4979 f->dump_bool("passed", raw_stats.passed);
4980 f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
4981 f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
4982 f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
f67539c2 4983 f->dump_stream("memory_value.dirstat") << raw_stats.memory_value.dirstat;
7c673cae
FG
4984 f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
4985 f->dump_string("error_str", raw_stats.error_str.str());
4986 }
4987 f->close_section(); // raw_stats
4988 // dump failure return code
4989 int rc = 0;
4990 if (backtrace.checked && backtrace.ondisk_read_retval)
4991 rc = backtrace.ondisk_read_retval;
4992 if (inode.checked && inode.ondisk_read_retval)
4993 rc = inode.ondisk_read_retval;
4994 if (raw_stats.checked && raw_stats.ondisk_read_retval)
4995 rc = raw_stats.ondisk_read_retval;
4996 f->dump_int("return_code", rc);
4997 }
4998 f->close_section(); // results
4999}
5000
b32b8144
FG
5001bool CInode::validated_data::all_damage_repaired() const
5002{
5003 bool unrepaired =
5004 (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired)
5005 ||
5006 (backtrace.checked && !backtrace.passed && !backtrace.repaired)
5007 ||
5008 (inode.checked && !inode.passed && !inode.repaired);
5009
5010 return !unrepaired;
5011}
5012
11fdf7f2
TL
5013void CInode::dump(Formatter *f, int flags) const
5014{
5015 if (flags & DUMP_PATH) {
5016 std::string path;
5017 make_path_string(path, true);
5018 if (path.empty())
5019 path = "/";
5020 f->dump_string("path", path);
5021 }
5022
5023 if (flags & DUMP_INODE_STORE_BASE)
5024 InodeStoreBase::dump(f);
5025
5026 if (flags & DUMP_MDS_CACHE_OBJECT)
5027 MDSCacheObject::dump(f);
5028
5029 if (flags & DUMP_LOCKS) {
5030 f->open_object_section("versionlock");
5031 versionlock.dump(f);
5032 f->close_section();
5033
5034 f->open_object_section("authlock");
5035 authlock.dump(f);
5036 f->close_section();
5037
5038 f->open_object_section("linklock");
5039 linklock.dump(f);
5040 f->close_section();
5041
5042 f->open_object_section("dirfragtreelock");
5043 dirfragtreelock.dump(f);
5044 f->close_section();
5045
5046 f->open_object_section("filelock");
5047 filelock.dump(f);
5048 f->close_section();
5049
5050 f->open_object_section("xattrlock");
5051 xattrlock.dump(f);
5052 f->close_section();
5053
5054 f->open_object_section("snaplock");
5055 snaplock.dump(f);
5056 f->close_section();
5057
5058 f->open_object_section("nestlock");
5059 nestlock.dump(f);
5060 f->close_section();
5061
5062 f->open_object_section("flocklock");
5063 flocklock.dump(f);
5064 f->close_section();
5065
5066 f->open_object_section("policylock");
5067 policylock.dump(f);
5068 f->close_section();
5069 }
5070
5071 if (flags & DUMP_STATE) {
5072 f->open_array_section("states");
5073 MDSCacheObject::dump_states(f);
5074 if (state_test(STATE_EXPORTING))
5075 f->dump_string("state", "exporting");
5076 if (state_test(STATE_OPENINGDIR))
5077 f->dump_string("state", "openingdir");
5078 if (state_test(STATE_FREEZING))
5079 f->dump_string("state", "freezing");
5080 if (state_test(STATE_FROZEN))
5081 f->dump_string("state", "frozen");
5082 if (state_test(STATE_AMBIGUOUSAUTH))
5083 f->dump_string("state", "ambiguousauth");
5084 if (state_test(STATE_EXPORTINGCAPS))
5085 f->dump_string("state", "exportingcaps");
5086 if (state_test(STATE_NEEDSRECOVER))
5087 f->dump_string("state", "needsrecover");
5088 if (state_test(STATE_PURGING))
5089 f->dump_string("state", "purging");
5090 if (state_test(STATE_DIRTYPARENT))
5091 f->dump_string("state", "dirtyparent");
5092 if (state_test(STATE_DIRTYRSTAT))
5093 f->dump_string("state", "dirtyrstat");
5094 if (state_test(STATE_STRAYPINNED))
5095 f->dump_string("state", "straypinned");
5096 if (state_test(STATE_FROZENAUTHPIN))
5097 f->dump_string("state", "frozenauthpin");
5098 if (state_test(STATE_DIRTYPOOL))
5099 f->dump_string("state", "dirtypool");
5100 if (state_test(STATE_ORPHAN))
5101 f->dump_string("state", "orphan");
5102 if (state_test(STATE_MISSINGOBJS))
5103 f->dump_string("state", "missingobjs");
7c673cae
FG
5104 f->close_section();
5105 }
7c673cae 5106
11fdf7f2
TL
5107 if (flags & DUMP_CAPS) {
5108 f->open_array_section("client_caps");
5109 for (const auto &p : client_caps) {
5110 auto &client = p.first;
5111 auto cap = &p.second;
5112 f->open_object_section("client_cap");
5113 f->dump_int("client_id", client.v);
5114 f->dump_string("pending", ccap_string(cap->pending()));
5115 f->dump_string("issued", ccap_string(cap->issued()));
5116 f->dump_string("wanted", ccap_string(cap->wanted()));
5117 f->dump_int("last_sent", cap->get_last_seq());
5118 f->close_section();
5119 }
5120 f->close_section();
5121
5122 f->dump_int("loner", loner_cap.v);
5123 f->dump_int("want_loner", want_loner_cap.v);
5124
5125 f->open_array_section("mds_caps_wanted");
5126 for (const auto &p : mds_caps_wanted) {
5127 f->open_object_section("mds_cap_wanted");
5128 f->dump_int("rank", p.first);
5129 f->dump_string("cap", ccap_string(p.second));
5130 f->close_section();
5131 }
5132 f->close_section();
5133 }
7c673cae 5134
11fdf7f2
TL
5135 if (flags & DUMP_DIRFRAGS) {
5136 f->open_array_section("dirfrags");
9f95a23c 5137 auto&& dfs = get_dirfrags();
11fdf7f2
TL
5138 for(const auto &dir: dfs) {
5139 f->open_object_section("dir");
5140 dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
5141 dir->check_rstats();
5142 f->close_section();
5143 }
7c673cae
FG
5144 f->close_section();
5145 }
7c673cae
FG
5146}
5147
5148/****** Scrub Stuff *****/
5149void CInode::scrub_info_create() const
5150{
5151 dout(25) << __func__ << dendl;
11fdf7f2 5152 ceph_assert(!scrub_infop);
7c673cae
FG
5153
5154 // break out of const-land to set up implicit initial state
5155 CInode *me = const_cast<CInode*>(this);
f67539c2 5156 const auto& pi = me->get_projected_inode();
7c673cae 5157
f67539c2
TL
5158 std::unique_ptr<scrub_info_t> si(new scrub_info_t());
5159 si->last_scrub_stamp = pi->last_scrub_stamp;
5160 si->last_scrub_version = pi->last_scrub_version;
7c673cae 5161
f67539c2 5162 me->scrub_infop.swap(si);
7c673cae
FG
5163}
5164
5165void CInode::scrub_maybe_delete_info()
5166{
5167 if (scrub_infop &&
5168 !scrub_infop->scrub_in_progress &&
5169 !scrub_infop->last_scrub_dirty) {
f67539c2 5170 scrub_infop.reset();
7c673cae
FG
5171 }
5172}
5173
f67539c2 5174void CInode::scrub_initialize(ScrubHeaderRef& header)
7c673cae
FG
5175{
5176 dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
7c673cae 5177
f67539c2 5178 scrub_info();
7c673cae 5179 scrub_infop->scrub_in_progress = true;
f67539c2 5180 scrub_infop->queued_frags.clear();
7c673cae 5181 scrub_infop->header = header;
f67539c2 5182 header->inc_num_pending();
7c673cae
FG
5183 // right now we don't handle remote inodes
5184}
5185
f67539c2 5186void CInode::scrub_aborted() {
11fdf7f2
TL
5187 dout(20) << __func__ << dendl;
5188 ceph_assert(scrub_is_in_progress());
5189
f67539c2
TL
5190 scrub_infop->scrub_in_progress = false;
5191 scrub_infop->header->dec_num_pending();
5192 scrub_maybe_delete_info();
11fdf7f2
TL
5193}
5194
f67539c2 5195void CInode::scrub_finished() {
7c673cae 5196 dout(20) << __func__ << dendl;
11fdf7f2 5197 ceph_assert(scrub_is_in_progress());
7c673cae 5198
f67539c2
TL
5199 scrub_infop->last_scrub_version = get_version();
5200 scrub_infop->last_scrub_stamp = ceph_clock_now();
7c673cae
FG
5201 scrub_infop->last_scrub_dirty = true;
5202 scrub_infop->scrub_in_progress = false;
f67539c2 5203 scrub_infop->header->dec_num_pending();
7c673cae
FG
5204}
5205
5206int64_t CInode::get_backtrace_pool() const
5207{
5208 if (is_dir()) {
b3b6e05e 5209 return mdcache->mds->get_metadata_pool();
7c673cae
FG
5210 } else {
5211 // Files are required to have an explicit layout that specifies
5212 // a pool
f67539c2
TL
5213 ceph_assert(get_inode()->layout.pool_id != -1);
5214 return get_inode()->layout.pool_id;
7c673cae
FG
5215 }
5216}
5217
f67539c2 5218void CInode::queue_export_pin(mds_rank_t export_pin)
31f18b77 5219{
31f18b77
FG
5220 if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
5221 return;
5222
f67539c2
TL
5223 mds_rank_t target;
5224 if (export_pin >= 0)
5225 target = export_pin;
5226 else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
5227 target = mdcache->hash_into_rank_bucket(ino());
5228 else
5229 target = MDS_RANK_NONE;
5230
5231 unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
31f18b77 5232 bool queue = false;
f6b5b4d7
TL
5233 for (auto& p : dirfrags) {
5234 CDir *dir = p.second;
31f18b77
FG
5235 if (!dir->is_auth())
5236 continue;
f67539c2
TL
5237
5238 if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
5239 if (dir->get_frag().bits() < min_frag_bits) {
5240 // needs split
5241 queue = true;
5242 break;
5243 }
5244 target = mdcache->hash_into_rank_bucket(ino(), dir->get_frag());
5245 }
5246
f6b5b4d7 5247 if (target != MDS_RANK_NONE) {
31f18b77
FG
5248 if (dir->is_subtree_root()) {
5249 // set auxsubtree bit or export it
5250 if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
f6b5b4d7 5251 target != dir->get_dir_auth().first)
31f18b77
FG
5252 queue = true;
5253 } else {
5254 // create aux subtree or export it
5255 queue = true;
7c673cae 5256 }
31f18b77
FG
5257 } else {
5258 // clear aux subtrees ?
5259 queue = dir->state_test(CDir::STATE_AUXSUBTREE);
5260 }
f67539c2
TL
5261
5262 if (queue)
31f18b77 5263 break;
f67539c2
TL
5264 }
5265 if (queue) {
5266 state_set(CInode::STATE_QUEUEDEXPORTPIN);
5267 mdcache->export_pin_queue.insert(this);
7c673cae
FG
5268 }
5269}
5270
f6b5b4d7
TL
5271void CInode::maybe_export_pin(bool update)
5272{
5273 if (!g_conf()->mds_bal_export_pin)
5274 return;
5275 if (!is_dir() || !is_normal())
5276 return;
5277
5278 dout(15) << __func__ << " update=" << update << " " << *this << dendl;
5279
f67539c2
TL
5280 mds_rank_t export_pin = get_export_pin(false);
5281 if (export_pin == MDS_RANK_NONE && !update)
f6b5b4d7 5282 return;
f6b5b4d7 5283
f67539c2 5284 check_pin_policy(export_pin);
f6b5b4d7
TL
5285 queue_export_pin(export_pin);
5286}
5287
f67539c2 5288void CInode::set_ephemeral_pin(bool dist, bool rand)
f6b5b4d7 5289{
f67539c2
TL
5290 unsigned state = 0;
5291 if (dist)
5292 state |= STATE_DISTEPHEMERALPIN;
5293 if (rand)
5294 state |= STATE_RANDEPHEMERALPIN;
5295 if (!state)
f6b5b4d7 5296 return;
f6b5b4d7 5297
f67539c2
TL
5298 if (state_test(state) != state) {
5299 dout(10) << "set ephemeral (" << (dist ? "dist" : "")
5300 << (rand ? " rand" : "") << ") pin on " << *this << dendl;
5301 if (!is_ephemerally_pinned()) {
5302 auto p = mdcache->export_ephemeral_pins.insert(this);
5303 ceph_assert(p.second);
f6b5b4d7 5304 }
f67539c2 5305 state_set(state);
f6b5b4d7
TL
5306 }
5307}
5308
f67539c2 5309void CInode::clear_ephemeral_pin(bool dist, bool rand)
f6b5b4d7 5310{
f67539c2
TL
5311 unsigned state = 0;
5312 if (dist)
5313 state |= STATE_DISTEPHEMERALPIN;
5314 if (rand)
5315 state |= STATE_RANDEPHEMERALPIN;
5316
5317 if (state_test(state)) {
5318 dout(10) << "clear ephemeral (" << (dist ? "dist" : "")
5319 << (rand ? " rand" : "") << ") pin on " << *this << dendl;
5320 state_clear(state);
5321 if (!is_ephemerally_pinned()) {
5322 auto count = mdcache->export_ephemeral_pins.erase(this);
f6b5b4d7 5323 ceph_assert(count == 1);
f6b5b4d7
TL
5324 }
5325 }
5326}
5327
f67539c2 5328void CInode::maybe_ephemeral_rand(double threshold)
f6b5b4d7
TL
5329{
5330 if (!mdcache->get_export_ephemeral_random_config()) {
5331 dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl;
f67539c2 5332 clear_ephemeral_pin(false, true);
f6b5b4d7
TL
5333 return;
5334 } else if (!is_dir() || !is_normal()) {
5335 dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl;
f67539c2 5336 clear_ephemeral_pin(false, true);
f6b5b4d7 5337 return;
f67539c2 5338 } else if (get_inode()->nlink == 0) {
f6b5b4d7 5339 dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl;
f67539c2 5340 clear_ephemeral_pin(false, true);
f6b5b4d7
TL
5341 return;
5342 } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) {
5343 dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl;
f67539c2 5344 queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
f6b5b4d7
TL
5345 return;
5346 }
5347
f91f0fd5
TL
5348 /* not precomputed? */
5349 if (threshold < 0.0) {
5350 threshold = get_ephemeral_rand();
5351 }
5352 if (threshold <= 0.0) {
5353 return;
5354 }
f6b5b4d7
TL
5355 double n = ceph::util::generate_random_number(0.0, 1.0);
5356
5357 dout(15) << __func__ << " rand " << n << " <?= " << threshold
5358 << " " << *this << dendl;
5359
5360 if (n <= threshold) {
5361 dout(10) << __func__ << " randomly export pinning " << *this << dendl;
f67539c2
TL
5362 set_ephemeral_pin(false, true);
5363 queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
f6b5b4d7
TL
5364 }
5365}
5366
5367void CInode::setxattr_ephemeral_rand(double probability)
5368{
5369 ceph_assert(is_dir());
f67539c2 5370 _get_projected_inode()->export_ephemeral_random_pin = probability;
f6b5b4d7
TL
5371}
5372
5373void CInode::setxattr_ephemeral_dist(bool val)
5374{
5375 ceph_assert(is_dir());
f67539c2 5376 _get_projected_inode()->export_ephemeral_distributed_pin = val;
f6b5b4d7
TL
5377}
5378
7c673cae
FG
5379void CInode::set_export_pin(mds_rank_t rank)
5380{
11fdf7f2 5381 ceph_assert(is_dir());
f67539c2
TL
5382 _get_projected_inode()->export_pin = rank;
5383 maybe_export_pin(true);
7c673cae
FG
5384}
5385
f67539c2 5386mds_rank_t CInode::get_export_pin(bool inherit) const
f6b5b4d7 5387{
f67539c2
TL
5388 if (!g_conf()->mds_bal_export_pin)
5389 return MDS_RANK_NONE;
f6b5b4d7 5390
7c673cae
FG
5391 /* An inode that is export pinned may not necessarily be a subtree root, we
5392 * need to traverse the parents. A base or system inode cannot be pinned.
5393 * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5394 * have a parent yet.
5395 */
f67539c2 5396 mds_rank_t r_target = MDS_RANK_NONE;
b32b8144 5397 const CInode *in = this;
f67539c2 5398 const CDir *dir = nullptr;
f6b5b4d7
TL
5399 while (true) {
5400 if (in->is_system())
5401 break;
5402 const CDentry *pdn = in->get_parent_dn();
5403 if (!pdn)
5404 break;
f67539c2 5405 if (in->get_inode()->nlink == 0) {
f6b5b4d7 5406 // ignore export pin for unlinked directory
f67539c2
TL
5407 break;
5408 }
5409
5410 if (in->get_inode()->export_pin >= 0) {
5411 return in->get_inode()->export_pin;
5412 } else if (in->get_inode()->export_ephemeral_distributed_pin &&
5413 mdcache->get_export_ephemeral_distributed_config()) {
5414 if (in != this)
5415 return mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
5416 return MDS_RANK_EPHEMERAL_DIST;
5417 } else if (r_target != MDS_RANK_NONE && in->get_inode()->export_ephemeral_random_pin > 0.0) {
5418 return r_target;
5419 } else if (r_target == MDS_RANK_NONE && in->is_ephemeral_rand() &&
5420 mdcache->get_export_ephemeral_random_config()) {
f6b5b4d7 5421 /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
f67539c2
TL
5422 if (!inherit)
5423 return MDS_RANK_EPHEMERAL_RAND;
5424 if (in == this)
5425 r_target = MDS_RANK_EPHEMERAL_RAND;
5426 else
5427 r_target = mdcache->hash_into_rank_bucket(in->ino());
f6b5b4d7
TL
5428 }
5429
f67539c2 5430 if (!inherit)
f6b5b4d7 5431 break;
f67539c2
TL
5432 dir = pdn->get_dir();
5433 in = dir->inode;
f6b5b4d7
TL
5434 }
5435 return MDS_RANK_NONE;
5436}
5437
f67539c2
TL
5438void CInode::check_pin_policy(mds_rank_t export_pin)
5439{
5440 if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
5441 set_ephemeral_pin(true, false);
5442 clear_ephemeral_pin(false, true);
5443 } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) {
5444 set_ephemeral_pin(false, true);
5445 clear_ephemeral_pin(true, false);
5446 } else if (is_ephemerally_pinned()) {
5447 // export_pin >= 0 || export_pin == MDS_RANK_NONE
5448 clear_ephemeral_pin(true, true);
5449 if (export_pin != get_inode()->export_pin) // inherited export_pin
5450 queue_export_pin(MDS_RANK_NONE);
5451 }
5452}
5453
5454double CInode::get_ephemeral_rand() const
f6b5b4d7
TL
5455{
5456 /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5457 * have a parent yet.
5458 */
5459 const CInode *in = this;
5460 double max = mdcache->export_ephemeral_random_max;
b32b8144
FG
5461 while (true) {
5462 if (in->is_system())
5463 break;
f64942e4 5464 const CDentry *pdn = in->get_parent_dn();
b32b8144
FG
5465 if (!pdn)
5466 break;
b32b8144 5467 // ignore export pin for unlinked directory
f67539c2 5468 if (in->get_inode()->nlink == 0)
b32b8144 5469 break;
f6b5b4d7 5470
f67539c2
TL
5471 if (in->get_inode()->export_ephemeral_random_pin > 0.0)
5472 return std::min(in->get_inode()->export_ephemeral_random_pin, max);
f6b5b4d7
TL
5473
5474 /* An export_pin overrides only if no closer parent (incl. this one) has a
5475 * random pin set.
5476 */
f67539c2
TL
5477 if (in->get_inode()->export_pin >= 0 ||
5478 in->get_inode()->export_ephemeral_distributed_pin)
f6b5b4d7 5479 return 0.0;
b32b8144 5480
b32b8144 5481 in = pdn->get_dir()->inode;
7c673cae 5482 }
f6b5b4d7 5483 return 0.0;
7c673cae
FG
5484}
5485
9f95a23c
TL
5486void CInode::get_nested_dirfrags(std::vector<CDir*>& v) const
5487{
5488 for (const auto &p : dirfrags) {
5489 const auto& dir = p.second;
5490 if (!dir->is_subtree_root())
5491 v.push_back(dir);
5492 }
5493}
5494
5495void CInode::get_subtree_dirfrags(std::vector<CDir*>& v) const
5496{
5497 for (const auto &p : dirfrags) {
5498 const auto& dir = p.second;
5499 if (dir->is_subtree_root())
5500 v.push_back(dir);
5501 }
5502}
5503
// Invokes the project's MEMPOOL_DEFINE_OBJECT_FACTORY macro for CInode with
// the identifiers co_inode and mds_co.
// NOTE(review): presumably this defines the mempool-backed allocation factory
// that places CInode objects in the mds_co pool -- confirm against the macro
// definition in the mempool header.
181888fb 5504MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);