ceph/src/mds/CInode.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "include/int_types.h"
16 #include "common/errno.h"
17
18 #include <string>
19
20 #include "CInode.h"
21 #include "CDir.h"
22 #include "CDentry.h"
23
24 #include "MDSRank.h"
25 #include "MDCache.h"
26 #include "MDLog.h"
27 #include "Locker.h"
28 #include "Mutation.h"
29
30 #include "events/EUpdate.h"
31
32 #include "osdc/Objecter.h"
33
34 #include "snap.h"
35
36 #include "LogSegment.h"
37
38 #include "common/Clock.h"
39
40 #include "common/config.h"
41 #include "global/global_context.h"
42 #include "include/ceph_assert.h"
43
44 #include "mds/MDSContinuation.h"
45 #include "mds/InoTable.h"
46 #include "cephfs_features.h"
47 #include "osdc/Objecter.h"
48
49 #define dout_context g_ceph_context
50 #define dout_subsys ceph_subsys_mds
51 #undef dout_prefix
52 #define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "
53
54 using namespace std;
55
56 void CInodeCommitOperation::update(ObjectOperation &op, inode_backtrace_t &bt) {
57 using ceph::encode;
58
59 op.priority = priority;
60 op.create(false);
61
62 bufferlist parent_bl;
63 encode(bt, parent_bl);
64 op.setxattr("parent", parent_bl);
65
66 // for the old pool there is no need to update the layout and symlink
67 if (!update_layout_symlink)
68 return;
69
70 bufferlist layout_bl;
71 encode(_layout, layout_bl, _features);
72 op.setxattr("layout", layout_bl);
73
74 if (!_symlink.empty()) {
75 bufferlist symlink_bl;
76 encode(_symlink, symlink_bl);
77 op.setxattr("symlink", symlink_bl);
78 }
79 }
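// For orientation, a hedged sketch of what this op leaves behind (pool and
// object names below are illustrative, not taken from this file): the encoded
// inode_backtrace_t lands in the "parent" xattr of the inode's first object,
// alongside "layout" and, for symlinks, "symlink". One way to eyeball it from
// the outside, assuming a data pool named cephfs_data:
//
//   rados -p cephfs_data getxattr 10000000001.00000000 parent > /tmp/parent_bt
//   ceph-dencoder type inode_backtrace_t import /tmp/parent_bt decode dump_json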
80
81 class CInodeIOContext : public MDSIOContextBase
82 {
83 protected:
84 CInode *in;
85 MDSRank *get_mds() override {return in->mdcache->mds;}
86 public:
87 explicit CInodeIOContext(CInode *in_) : in(in_) {
88 ceph_assert(in != NULL);
89 }
90 };
91
92 sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1;
93
94 LockType CInode::versionlock_type(CEPH_LOCK_IVERSION);
95 LockType CInode::authlock_type(CEPH_LOCK_IAUTH);
96 LockType CInode::linklock_type(CEPH_LOCK_ILINK);
97 LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT);
98 LockType CInode::filelock_type(CEPH_LOCK_IFILE);
99 LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR);
100 LockType CInode::snaplock_type(CEPH_LOCK_ISNAP);
101 LockType CInode::nestlock_type(CEPH_LOCK_INEST);
102 LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK);
103 LockType CInode::policylock_type(CEPH_LOCK_IPOLICY);
104
105 std::string_view CInode::pin_name(int p) const
106 {
107 switch (p) {
108 case PIN_DIRFRAG: return "dirfrag";
109 case PIN_CAPS: return "caps";
110 case PIN_IMPORTING: return "importing";
111 case PIN_OPENINGDIR: return "openingdir";
112 case PIN_REMOTEPARENT: return "remoteparent";
113 case PIN_BATCHOPENJOURNAL: return "batchopenjournal";
114 case PIN_SCATTERED: return "scattered";
115 case PIN_STICKYDIRS: return "stickydirs";
116 //case PIN_PURGING: return "purging";
117 case PIN_FREEZING: return "freezing";
118 case PIN_FROZEN: return "frozen";
119 case PIN_IMPORTINGCAPS: return "importingcaps";
120 case PIN_EXPORTINGCAPS: return "exportingcaps";
121 case PIN_PASTSNAPPARENT: return "pastsnapparent";
122 case PIN_OPENINGSNAPPARENTS: return "openingsnapparents";
123 case PIN_TRUNCATING: return "truncating";
124 case PIN_STRAY: return "stray";
125 case PIN_NEEDSNAPFLUSH: return "needsnapflush";
126 case PIN_DIRTYRSTAT: return "dirtyrstat";
127 case PIN_DIRTYPARENT: return "dirtyparent";
128 case PIN_DIRWAITER: return "dirwaiter";
129 default: return generic_pin_name(p);
130 }
131 }
132
133 //int cinode_pins[CINODE_NUM_PINS]; // counts
134 ostream& CInode::print_db_line_prefix(ostream& out) const
135 {
136 return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") ";
137 }
138
139 /*
140 * write caps and lock ids
141 */
142 struct cinode_lock_info_t cinode_lock_info[] = {
143 { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR },
144 { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL },
145 { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL },
146 { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL },
147 };
148 int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]);
149
150 ostream& operator<<(ostream& out, const CInode& in)
151 {
152 string path;
153 in.make_path_string(path, true);
154
155 out << "[inode " << in.ino();
156 out << " ["
157 << (in.is_multiversion() ? "...":"")
158 << in.first << "," << in.last << "]";
159 out << " " << path << (in.is_dir() ? "/":"");
160
161 if (in.is_auth()) {
162 out << " auth";
163 if (in.is_replicated())
164 out << in.get_replicas();
165 } else {
166 mds_authority_t a = in.authority();
167 out << " rep@" << a.first;
168 if (a.second != CDIR_AUTH_UNKNOWN)
169 out << "," << a.second;
170 out << "." << in.get_replica_nonce();
171 }
172
173 if (in.is_symlink())
174 out << " symlink='" << in.symlink << "'";
175 if (in.is_dir() && !in.dirfragtree.empty())
176 out << " " << in.dirfragtree;
177
178 out << " v" << in.get_version();
179 if (in.get_projected_version() > in.get_version())
180 out << " pv" << in.get_projected_version();
181
182 if (in.get_num_auth_pins()) {
183 out << " ap=" << in.get_num_auth_pins();
184 #ifdef MDS_AUTHPIN_SET
185 in.print_authpin_set(out);
186 #endif
187 }
188
189 if (in.snaprealm)
190 out << " snaprealm=" << in.snaprealm;
191
192 if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
193 if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " NEEDSRECOVER";
194 if (in.state_test(CInode::STATE_RECOVERING)) out << " RECOVERING";
195 if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " DIRTYPARENT";
196 if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " MISSINGOBJS";
197 if (in.is_ephemeral_dist()) out << " DISTEPHEMERALPIN";
198 if (in.is_ephemeral_rand()) out << " RANDEPHEMERALPIN";
199 if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
200 if (in.is_frozen_inode()) out << " FROZEN";
201 if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";
202
203 const auto& pi = in.get_projected_inode();
204 if (pi->is_truncating())
205 out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")";
206
207 if (in.is_dir()) {
208 out << " " << in.get_inode()->dirstat;
209 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
210 out << "->" << pi->dirstat;
211 }
212 } else {
213 out << " s=" << in.get_inode()->size;
214 if (in.get_inode()->nlink != 1)
215 out << " nl=" << in.get_inode()->nlink;
216 }
217
218 // rstat
219 out << " " << in.get_inode()->rstat;
220 if (!(in.get_inode()->rstat == in.get_inode()->accounted_rstat))
221 out << "/" << in.get_inode()->accounted_rstat;
222 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
223 out << "->" << pi->rstat;
224 if (!(pi->rstat == pi->accounted_rstat))
225 out << "/" << pi->accounted_rstat;
226 }
227
228 if (in.is_any_old_inodes()) {
229 out << " old_inodes=" << in.get_old_inodes()->size();
230 }
231
232 if (!in.client_need_snapflush.empty())
233 out << " need_snapflush=" << in.client_need_snapflush;
234
235 // locks
236 if (!in.authlock.is_sync_and_unlocked())
237 out << " " << in.authlock;
238 if (!in.linklock.is_sync_and_unlocked())
239 out << " " << in.linklock;
240 if (in.get_inode()->is_dir()) {
241 if (!in.dirfragtreelock.is_sync_and_unlocked())
242 out << " " << in.dirfragtreelock;
243 if (!in.snaplock.is_sync_and_unlocked())
244 out << " " << in.snaplock;
245 if (!in.nestlock.is_sync_and_unlocked())
246 out << " " << in.nestlock;
247 if (!in.policylock.is_sync_and_unlocked())
248 out << " " << in.policylock;
249 } else {
250 if (!in.flocklock.is_sync_and_unlocked())
251 out << " " << in.flocklock;
252 }
253 if (!in.filelock.is_sync_and_unlocked())
254 out << " " << in.filelock;
255 if (!in.xattrlock.is_sync_and_unlocked())
256 out << " " << in.xattrlock;
257 if (!in.versionlock.is_sync_and_unlocked())
258 out << " " << in.versionlock;
259
260 // hack: spit out crap on which clients have caps
261 if (in.get_inode()->client_ranges.size())
262 out << " cr=" << in.get_inode()->client_ranges;
263
264 if (!in.get_client_caps().empty()) {
265 out << " caps={";
266 bool first = true;
267 for (const auto &p : in.get_client_caps()) {
268 if (!first) out << ",";
269 out << p.first << "="
270 << ccap_string(p.second.pending());
271 if (p.second.issued() != p.second.pending())
272 out << "/" << ccap_string(p.second.issued());
273 out << "/" << ccap_string(p.second.wanted())
274 << "@" << p.second.get_last_seq();
275 first = false;
276 }
277 out << "}";
278 if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) {
279 out << ",l=" << in.get_loner();
280 if (in.get_loner() != in.get_wanted_loner())
281 out << "(" << in.get_wanted_loner() << ")";
282 }
283 }
284 if (!in.get_mds_caps_wanted().empty()) {
285 out << " mcw={";
286 bool first = true;
287 for (const auto &p : in.get_mds_caps_wanted()) {
288 if (!first)
289 out << ',';
290 out << p.first << '=' << ccap_string(p.second);
291 first = false;
292 }
293 out << '}';
294 }
295
296 if (in.get_num_ref()) {
297 out << " |";
298 in.print_pin_set(out);
299 }
300
301 if (in.get_inode()->export_pin != MDS_RANK_NONE) {
302 out << " export_pin=" << in.get_inode()->export_pin;
303 }
304 if (in.state_test(CInode::STATE_DISTEPHEMERALPIN)) {
305 out << " distepin";
306 }
307 if (in.state_test(CInode::STATE_RANDEPHEMERALPIN)) {
308 out << " randepin";
309 }
310
311 out << " " << &in;
312 out << "]";
313 return out;
314 }
315
316 CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l) :
317 mdcache(c), first(f), last(l),
318 item_dirty(this),
319 item_caps(this),
320 item_open_file(this),
321 item_dirty_parent(this),
322 item_dirty_dirfrag_dir(this),
323 item_dirty_dirfrag_nest(this),
324 item_dirty_dirfrag_dirfragtree(this),
325 pop(c->decayrate),
326 versionlock(this, &versionlock_type),
327 authlock(this, &authlock_type),
328 linklock(this, &linklock_type),
329 dirfragtreelock(this, &dirfragtreelock_type),
330 filelock(this, &filelock_type),
331 xattrlock(this, &xattrlock_type),
332 snaplock(this, &snaplock_type),
333 nestlock(this, &nestlock_type),
334 flocklock(this, &flocklock_type),
335 policylock(this, &policylock_type)
336 {
337 if (auth)
338 state_set(STATE_AUTH);
339 }
340
341 void CInode::print(ostream& out) const
342 {
343 out << *this;
344 }
345
346 void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
347 {
348 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
349
350 if (client_need_snapflush.empty()) {
351 get(CInode::PIN_NEEDSNAPFLUSH);
352
353 // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
354 // long periods waiting for clients to flush their snaps.
355 auth_pin(this); // pin head inode...
356 }
357
358 auto &clients = client_need_snapflush[snapid];
359 if (clients.empty())
360 snapin->auth_pin(this); // ...and pin snapped/old inode!
361
362 clients.insert(client);
363 }
364
365 void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
366 {
367 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
368 auto it = client_need_snapflush.find(snapid);
369 if (it == client_need_snapflush.end()) {
370 dout(10) << " snapid not found" << dendl;
371 return;
372 }
373 size_t n = it->second.erase(client);
374 if (n == 0) {
375 dout(10) << " client not found" << dendl;
376 return;
377 }
378 if (it->second.empty()) {
379 client_need_snapflush.erase(it);
380 snapin->auth_unpin(this);
381
382 if (client_need_snapflush.empty()) {
383 put(CInode::PIN_NEEDSNAPFLUSH);
384 auth_unpin(this);
385 }
386 }
387 }
388
389 pair<bool,bool> CInode::split_need_snapflush(CInode *cowin, CInode *in)
390 {
391 dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
392 bool cowin_need_flush = false;
393 bool orig_need_flush = false;
394 auto it = client_need_snapflush.lower_bound(cowin->first);
395 while (it != client_need_snapflush.end() && it->first < in->first) {
396 ceph_assert(!it->second.empty());
397 if (cowin->last >= it->first) {
398 cowin->auth_pin(this);
399 cowin_need_flush = true;
400 ++it;
401 } else {
402 it = client_need_snapflush.erase(it);
403 }
404 in->auth_unpin(this);
405 }
406
407 if (it != client_need_snapflush.end() && it->first <= in->last)
408 orig_need_flush = true;
409
410 return make_pair(cowin_need_flush, orig_need_flush);
411 }
412
413 void CInode::mark_dirty_rstat()
414 {
415 if (!state_test(STATE_DIRTYRSTAT)) {
416 dout(10) << __func__ << dendl;
417 state_set(STATE_DIRTYRSTAT);
418 get(PIN_DIRTYRSTAT);
419 CDentry *pdn = get_projected_parent_dn();
420 if (pdn->is_auth()) {
421 CDir *pdir = pdn->dir;
422 pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item);
423 mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock);
424 } else {
425 // under cross-MDS rename.
426 // DIRTYRSTAT flag will get cleared when rename finishes
427 ceph_assert(state_test(STATE_AMBIGUOUSAUTH));
428 }
429 }
430 }
431 void CInode::clear_dirty_rstat()
432 {
433 if (state_test(STATE_DIRTYRSTAT)) {
434 dout(10) << __func__ << dendl;
435 state_clear(STATE_DIRTYRSTAT);
436 put(PIN_DIRTYRSTAT);
437 dirty_rstat_item.remove_myself();
438 }
439 }
440
441 CInode::projected_inode CInode::project_inode(const MutationRef& mut,
442 bool xattr, bool snap)
443 {
444 if (mut && mut->is_projected(this)) {
445 ceph_assert(!xattr && !snap);
446 auto _inode = std::const_pointer_cast<mempool_inode>(projected_nodes.back().inode);
447 return projected_inode(std::move(_inode), xattr_map_ptr());
448 }
449
450 auto pi = allocate_inode(*get_projected_inode());
451
452 if (scrub_infop && scrub_infop->last_scrub_dirty) {
453 pi->last_scrub_stamp = scrub_infop->last_scrub_stamp;
454 pi->last_scrub_version = scrub_infop->last_scrub_version;
455 scrub_infop->last_scrub_dirty = false;
456 scrub_maybe_delete_info();
457 }
458
459 const auto& ox = get_projected_xattrs();
460 xattr_map_ptr px;
461 if (xattr) {
462 px = allocate_xattr_map();
463 if (ox)
464 *px = *ox;
465 }
466
467 sr_t* ps = projected_inode::UNDEF_SRNODE;
468 if (snap) {
469 ps = prepare_new_srnode(0);
470 ++num_projected_srnodes;
471 }
472
473 projected_nodes.emplace_back(pi, xattr ? px : ox, ps);
474 if (mut)
475 mut->add_projected_node(this);
476 dout(15) << __func__ << " " << pi->ino << dendl;
477 return projected_inode(std::move(pi), std::move(px), ps);
478 }
479
480 void CInode::pop_and_dirty_projected_inode(LogSegment *ls, const MutationRef& mut)
481 {
482 ceph_assert(!projected_nodes.empty());
483 auto front = std::move(projected_nodes.front());
484 dout(15) << __func__ << " v" << front.inode->version << dendl;
485
486 projected_nodes.pop_front();
487 if (mut)
488 mut->remove_projected_node(this);
489
490 bool pool_updated = get_inode()->layout.pool_id != front.inode->layout.pool_id;
491 bool pin_updated = (get_inode()->export_pin != front.inode->export_pin) ||
492 (get_inode()->export_ephemeral_distributed_pin !=
493 front.inode->export_ephemeral_distributed_pin);
494
495 reset_inode(std::move(front.inode));
496 if (front.xattrs != get_xattrs())
497 reset_xattrs(std::move(front.xattrs));
498
499 if (front.snapnode != projected_inode::UNDEF_SRNODE) {
500 --num_projected_srnodes;
501 pop_projected_snaprealm(front.snapnode, false);
502 }
503
504 mark_dirty(ls);
505 if (get_inode()->is_backtrace_updated())
506 mark_dirty_parent(ls, pool_updated);
507
508 if (pin_updated)
509 maybe_export_pin(true);
510 }
511
512 sr_t *CInode::prepare_new_srnode(snapid_t snapid)
513 {
514 const sr_t *cur_srnode = get_projected_srnode();
515 sr_t *new_srnode;
516
517 if (cur_srnode) {
518 new_srnode = new sr_t(*cur_srnode);
519 } else {
520 if (snapid == 0)
521 snapid = mdcache->get_global_snaprealm()->get_newest_seq();
522 new_srnode = new sr_t();
523 new_srnode->seq = snapid;
524 new_srnode->created = snapid;
525 new_srnode->current_parent_since = get_oldest_snap();
526 SnapRealm *sr = find_snaprealm();
527 dout(20) << __func__ << ": inheriting change_attr from " << *sr
528 << dendl;
529 new_srnode->change_attr = sr->srnode.change_attr;
530 }
531 return new_srnode;
532 }
533
534 const sr_t *CInode::get_projected_srnode() const {
535 if (num_projected_srnodes > 0) {
536 for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
537 if (it->snapnode != projected_inode::UNDEF_SRNODE)
538 return it->snapnode;
539 }
540 if (snaprealm)
541 return &snaprealm->srnode;
542 else
543 return NULL;
544 }
545
546 void CInode::project_snaprealm(sr_t *new_srnode)
547 {
548 dout(10) << __func__ << " " << new_srnode << dendl;
549 ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE);
550 projected_nodes.back().snapnode = new_srnode;
551 ++num_projected_srnodes;
552 }
553
554 void CInode::mark_snaprealm_global(sr_t *new_srnode)
555 {
556 ceph_assert(!is_dir());
557 // 'last_destroyed' is no longer used, so use it to store the original 'current_parent_since'
558 new_srnode->last_destroyed = new_srnode->current_parent_since;
559 new_srnode->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
560 new_srnode->mark_parent_global();
561 }
562
563 void CInode::clear_snaprealm_global(sr_t *new_srnode)
564 {
565 // restore 'current_parent_since'
566 new_srnode->current_parent_since = new_srnode->last_destroyed;
567 new_srnode->last_destroyed = 0;
568 new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq();
569 new_srnode->clear_parent_global();
570 }
571
572 bool CInode::is_projected_snaprealm_global() const
573 {
574 const sr_t *srnode = get_projected_srnode();
575 if (srnode && srnode->is_parent_global())
576 return true;
577 return false;
578 }
579
580 void CInode::project_snaprealm_past_parent(SnapRealm *newparent)
581 {
582 sr_t *new_snap = project_snaprealm();
583 record_snaprealm_past_parent(new_snap, newparent);
584 }
585
586
587 /* if newparent != parent, add parent to past_parents
588 if parent DNE, we need to find what the parent actually is and fill that in */
589 void CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent)
590 {
591 ceph_assert(!new_snap->is_parent_global());
592 SnapRealm *oldparent;
593 if (!snaprealm) {
594 oldparent = find_snaprealm();
595 } else {
596 oldparent = snaprealm->parent;
597 }
598
599 if (newparent != oldparent) {
600 snapid_t oldparentseq = oldparent->get_newest_seq();
601 if (oldparentseq + 1 > new_snap->current_parent_since) {
602 // copy old parent's snaps
603 const set<snapid_t>& snaps = oldparent->get_snaps();
604 auto p = snaps.lower_bound(new_snap->current_parent_since);
605 if (p != snaps.end())
606 new_snap->past_parent_snaps.insert(p, snaps.end());
607 if (oldparentseq > new_snap->seq)
608 new_snap->seq = oldparentseq;
609 }
610 new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
611 }
612 }
613
614 void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *oldparent,
615 CDentry *dn, bool primary_dn)
616 {
617 ceph_assert(new_snap->is_parent_global());
618
619 if (!oldparent)
620 oldparent = dn->get_dir()->inode->find_snaprealm();
621 auto& snaps = oldparent->get_snaps();
622
623 if (!primary_dn) {
624 auto p = snaps.lower_bound(dn->first);
625 if (p != snaps.end())
626 new_snap->past_parent_snaps.insert(p, snaps.end());
627 } else {
628 // 'last_destroyed' is used as 'current_parent_since'
629 auto p = snaps.lower_bound(new_snap->last_destroyed);
630 if (p != snaps.end())
631 new_snap->past_parent_snaps.insert(p, snaps.end());
632 new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
633 }
634 }
635
636 void CInode::early_pop_projected_snaprealm()
637 {
638 ceph_assert(!projected_nodes.empty());
639 if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) {
640 pop_projected_snaprealm(projected_nodes.front().snapnode, true);
641 projected_nodes.front().snapnode = projected_inode::UNDEF_SRNODE;
642 --num_projected_srnodes;
643 }
644 }
645
646 void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early)
647 {
648 if (next_snaprealm) {
649 dout(10) << __func__ << (early ? " (early) " : " ")
650 << next_snaprealm << " seq " << next_snaprealm->seq << dendl;
651 if (!snaprealm)
652 open_snaprealm();
653
654 auto old_flags = snaprealm->srnode.flags;
655 snaprealm->srnode = *next_snaprealm;
656 delete next_snaprealm;
657
658 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
659 snaprealm->adjust_parent();
660 }
661
662 if (snaprealm->parent)
663 dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl;
664 } else {
665 dout(10) << __func__ << (early ? " (early) null" : " null") << dendl;
666 ceph_assert(snaprealm);
667 snaprealm->merge_to(NULL);
668 }
669 }
670
671
672 // ====== CInode =======
673
674 // dirfrags
675
676 InodeStoreBase::inode_const_ptr InodeStoreBase::empty_inode = InodeStoreBase::allocate_inode();
677
678 __u32 InodeStoreBase::hash_dentry_name(std::string_view dn)
679 {
680 int which = inode->dir_layout.dl_dir_hash;
681 if (!which)
682 which = CEPH_STR_HASH_LINUX;
683 ceph_assert(ceph_str_hash_valid(which));
684 return ceph_str_hash(which, dn.data(), dn.length());
685 }
686
687 frag_t InodeStoreBase::pick_dirfrag(std::string_view dn)
688 {
689 if (dirfragtree.empty())
690 return frag_t(); // avoid the string hash if we can.
691
692 __u32 h = hash_dentry_name(dn);
693 return dirfragtree[h];
694 }
695
696 std::pair<bool, std::vector<CDir*>> CInode::get_dirfrags_under(frag_t fg)
697 {
698 std::pair<bool, std::vector<CDir*>> result;
699 auto& all = result.first;
700 auto& dirs = result.second;
701 all = false;
702
703 if (auto it = dirfrags.find(fg); it != dirfrags.end()){
704 all = true;
705 dirs.push_back(it->second);
706 return result;
707 }
708
709 int total = 0;
710 for(auto &[_fg, _dir] : dirfrags){
711 // frag_t::bits() indicates how deep a fragment sits in the directory's split tree
712 // e.g.
713 // 01* : bits = 2, i.e. at the second level
714 // *
715 // 0* 1*
716 // 00* 01* 10* 11* --> level 2, bits = 2
717 // so fragA.bits > fragB.bits means fragA is deeper than fragB
718
719 if (fg.bits() >= _fg.bits()) {
720 if (_fg.contains(fg)) {
721 all = true;
722 return result;
723 }
724 } else {
725 if (fg.contains(_fg)) {
726 dirs.push_back(_dir);
727 // we can calculate how many sub-slices a given slice divides into:
728 // frag_t(*) splits into two frags at the first level (0* 1*),
729 // or 2^2 frags at the second level (00* 01* 10* 11*),
730 // or (1 << (24 - frag_t(*).bits)) frags at the 24th level
731 total += 1 << (24 - _fg.bits());
732 }
733 }
734 }
735
736 // convert everything to level-24 frag counts to check whether the cached frags fully cover fg
737 all = ((1<<(24-fg.bits())) == total);
738 return result;
739 }
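// Worked example of the accounting above (frag values are illustrative):
// asking for fg = 10* (bits = 2) while the cache holds 100* and 101*
// (bits = 3) takes the else branch for both, so each contributes
// 1 << (24 - 3) = 2097152 level-24 slots. Their sum, 4194304, equals
// 1 << (24 - 2), so 'all' comes back true; with only 100* cached the sum
// falls short and 'all' is false even though 'dirs' still holds that one
// matching fragment.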
740
741 void CInode::verify_dirfrags()
742 {
743 bool bad = false;
744 for (const auto &p : dirfrags) {
745 if (!dirfragtree.is_leaf(p.first)) {
746 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
747 << ": " << *p.second << dendl;
748 bad = true;
749 }
750 }
751 ceph_assert(!bad);
752 }
753
754 void CInode::force_dirfrags()
755 {
756 bool bad = false;
757 for (auto &p : dirfrags) {
758 if (!dirfragtree.is_leaf(p.first)) {
759 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
760 << ": " << *p.second << dendl;
761 bad = true;
762 }
763 }
764
765 if (bad) {
766 frag_vec_t leaves;
767 dirfragtree.get_leaves(leaves);
768 for (const auto& leaf : leaves) {
769 mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true);
770 }
771 }
772
773 verify_dirfrags();
774 }
775
776 CDir *CInode::get_approx_dirfrag(frag_t fg)
777 {
778 CDir *dir = get_dirfrag(fg);
779 if (dir) return dir;
780
781 // find a child?
782 auto&& p = get_dirfrags_under(fg);
783 if (!p.second.empty())
784 return p.second.front();
785
786 // try parents?
787 while (fg.bits() > 0) {
788 fg = fg.parent();
789 dir = get_dirfrag(fg);
790 if (dir) return dir;
791 }
792 return NULL;
793 }
794
795 CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg)
796 {
797 ceph_assert(is_dir());
798
799 // have it?
800 CDir *dir = get_dirfrag(fg);
801 if (!dir) {
802 // create it.
803 ceph_assert(is_auth() || mdcache->mds->is_any_replay());
804 dir = new CDir(this, fg, mdcache, is_auth());
805 add_dirfrag(dir);
806 }
807 return dir;
808 }
809
810 CDir *CInode::add_dirfrag(CDir *dir)
811 {
812 auto em = dirfrags.emplace(std::piecewise_construct, std::forward_as_tuple(dir->dirfrag().frag), std::forward_as_tuple(dir));
813 ceph_assert(em.second);
814
815 if (stickydir_ref > 0) {
816 dir->state_set(CDir::STATE_STICKY);
817 dir->get(CDir::PIN_STICKY);
818 }
819
820 maybe_export_pin();
821
822 return dir;
823 }
824
825 void CInode::close_dirfrag(frag_t fg)
826 {
827 dout(14) << __func__ << " " << fg << dendl;
828 ceph_assert(dirfrags.count(fg));
829
830 CDir *dir = dirfrags[fg];
831 dir->remove_null_dentries();
832
833 // clear dirty flag
834 if (dir->is_dirty())
835 dir->mark_clean();
836
837 if (stickydir_ref > 0) {
838 dir->state_clear(CDir::STATE_STICKY);
839 dir->put(CDir::PIN_STICKY);
840 }
841
842 if (dir->is_subtree_root())
843 num_subtree_roots--;
844
845 // dump any remaining dentries, for debugging purposes
846 for (const auto &p : dir->items)
847 dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl;
848
849 ceph_assert(dir->get_num_ref() == 0);
850 delete dir;
851 dirfrags.erase(fg);
852 }
853
854 void CInode::close_dirfrags()
855 {
856 while (!dirfrags.empty())
857 close_dirfrag(dirfrags.begin()->first);
858 }
859
860 bool CInode::has_subtree_root_dirfrag(int auth)
861 {
862 if (num_subtree_roots > 0) {
863 if (auth == -1)
864 return true;
865 for (const auto &p : dirfrags) {
866 if (p.second->is_subtree_root() &&
867 p.second->dir_auth.first == auth)
868 return true;
869 }
870 }
871 return false;
872 }
873
874 bool CInode::has_subtree_or_exporting_dirfrag()
875 {
876 if (num_subtree_roots > 0 || num_exporting_dirs > 0)
877 return true;
878 return false;
879 }
880
881 void CInode::get_stickydirs()
882 {
883 if (stickydir_ref == 0) {
884 get(PIN_STICKYDIRS);
885 for (const auto &p : dirfrags) {
886 p.second->state_set(CDir::STATE_STICKY);
887 p.second->get(CDir::PIN_STICKY);
888 }
889 }
890 stickydir_ref++;
891 }
892
893 void CInode::put_stickydirs()
894 {
895 ceph_assert(stickydir_ref > 0);
896 stickydir_ref--;
897 if (stickydir_ref == 0) {
898 put(PIN_STICKYDIRS);
899 for (const auto &p : dirfrags) {
900 p.second->state_clear(CDir::STATE_STICKY);
901 p.second->put(CDir::PIN_STICKY);
902 }
903 }
904 }
905
906
907
908
909
910 // pins
911
912 void CInode::first_get()
913 {
914 // pin my dentry?
915 if (parent)
916 parent->get(CDentry::PIN_INODEPIN);
917 }
918
919 void CInode::last_put()
920 {
921 // unpin my dentry?
922 if (parent)
923 parent->put(CDentry::PIN_INODEPIN);
924 }
925
926 void CInode::_put()
927 {
928 if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
929 mdcache->maybe_eval_stray(this, true);
930 }
931
932 void CInode::add_remote_parent(CDentry *p)
933 {
934 if (remote_parents.empty())
935 get(PIN_REMOTEPARENT);
936 remote_parents.insert(p);
937 }
938 void CInode::remove_remote_parent(CDentry *p)
939 {
940 remote_parents.erase(p);
941 if (remote_parents.empty())
942 put(PIN_REMOTEPARENT);
943 }
944
945
946
947
948 CDir *CInode::get_parent_dir()
949 {
950 if (parent)
951 return parent->dir;
952 return NULL;
953 }
954 CDir *CInode::get_projected_parent_dir()
955 {
956 CDentry *p = get_projected_parent_dn();
957 if (p)
958 return p->dir;
959 return NULL;
960 }
961 CInode *CInode::get_parent_inode()
962 {
963 if (parent)
964 return parent->dir->inode;
965 return NULL;
966 }
967
968 bool CInode::is_ancestor_of(const CInode *other) const
969 {
970 while (other) {
971 if (other == this)
972 return true;
973 const CDentry *pdn = other->get_oldest_parent_dn();
974 if (!pdn) {
975 ceph_assert(other->is_base());
976 break;
977 }
978 other = pdn->get_dir()->get_inode();
979 }
980 return false;
981 }
982
983 bool CInode::is_projected_ancestor_of(const CInode *other) const
984 {
985 while (other) {
986 if (other == this)
987 return true;
988 const CDentry *pdn = other->get_projected_parent_dn();
989 if (!pdn) {
990 ceph_assert(other->is_base());
991 break;
992 }
993 other = pdn->get_dir()->get_inode();
994 }
995 return false;
996 }
997
998 /*
999 * Because a non-directory inode may have multiple links, the use_parent
1000 * argument allows selecting which parent to use for path construction. This
1001 * argument is only meaningful for the final component (i.e. the first of the
1002 * nested calls) because directories cannot have multiple hard links. If
1003 * use_parent is NULL and projected is true, the primary parent's projected
1004 * inode is used all the way up the path chain. Otherwise the primary parent
1005 * stable inode is used.
1006 */
1007 void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const
1008 {
1009 if (!use_parent) {
1010 use_parent = projected ? get_projected_parent_dn() : parent;
1011 }
1012
1013 if (use_parent) {
1014 use_parent->make_path_string(s, projected);
1015 } else if (is_root()) {
1016 s = "";
1017 } else if (is_mdsdir()) {
1018 char t[40];
1019 uint64_t eino(ino());
1020 eino -= MDS_INO_MDSDIR_OFFSET;
1021 snprintf(t, sizeof(t), "~mds%" PRId64, eino);
1022 s = t;
1023 } else {
1024 char n[40];
1025 uint64_t eino(ino());
1026 snprintf(n, sizeof(n), "#%" PRIx64, eino);
1027 s += n;
1028 }
1029 }
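// A few example outputs (inode numbers are made up): a regular file linked
// under the root comes out as "/dir1/dir2/file"; the per-rank mdsdir of rank 0
// prints as "~mds0"; and an inode with no parent dentry in cache falls through
// to the hex form, e.g. "#100000003e8".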
1030
1031 void CInode::make_path(filepath& fp, bool projected) const
1032 {
1033 const CDentry *use_parent = projected ? get_projected_parent_dn() : parent;
1034 if (use_parent) {
1035 ceph_assert(!is_base());
1036 use_parent->make_path(fp, projected);
1037 } else {
1038 fp = filepath(ino());
1039 }
1040 }
1041
1042 void CInode::name_stray_dentry(string& dname)
1043 {
1044 char s[20];
1045 snprintf(s, sizeof(s), "%llx", (unsigned long long)ino().val);
1046 dname = s;
1047 }
1048
1049 version_t CInode::pre_dirty()
1050 {
1051 version_t pv;
1052 CDentry* _cdentry = get_projected_parent_dn();
1053 if (_cdentry) {
1054 pv = _cdentry->pre_dirty(get_projected_version());
1055 dout(10) << "pre_dirty " << pv << " (current v " << get_inode()->version << ")" << dendl;
1056 } else {
1057 ceph_assert(is_base());
1058 pv = get_projected_version() + 1;
1059 }
1060 // force update backtrace for old format inode (see mempool_inode::decode)
1061 if (get_inode()->backtrace_version == 0 && !projected_nodes.empty()) {
1062 auto pi = _get_projected_inode();
1063 if (pi->backtrace_version == 0)
1064 pi->update_backtrace(pv);
1065 }
1066 return pv;
1067 }
1068
1069 void CInode::_mark_dirty(LogSegment *ls)
1070 {
1071 if (!state_test(STATE_DIRTY)) {
1072 state_set(STATE_DIRTY);
1073 get(PIN_DIRTY);
1074 ceph_assert(ls);
1075 }
1076
1077 // move myself to this segment's dirty list
1078 if (ls)
1079 ls->dirty_inodes.push_back(&item_dirty);
1080 }
1081
1082 void CInode::mark_dirty(LogSegment *ls) {
1083
1084 dout(10) << __func__ << " " << *this << dendl;
1085
1086 /*
1087 NOTE: I may already be dirty, but this fn _still_ needs to be called so that
1088 the directory is (perhaps newly) dirtied, and so that parent_dir_version is
1089 updated below.
1090 */
1091
1092 // only auth can get dirty. "dirty" async data in replicas is relative to
1093 // filelock state, not the dirty flag.
1094 ceph_assert(is_auth());
1095
1096 // touch my private version
1097 _mark_dirty(ls);
1098
1099 // mark dentry too
1100 if (parent)
1101 parent->mark_dirty(get_version(), ls);
1102 }
1103
1104
1105 void CInode::mark_clean()
1106 {
1107 dout(10) << __func__ << " " << *this << dendl;
1108 if (state_test(STATE_DIRTY)) {
1109 state_clear(STATE_DIRTY);
1110 put(PIN_DIRTY);
1111
1112 // remove myself from ls dirty list
1113 item_dirty.remove_myself();
1114 }
1115 }
1116
1117
1118 // --------------
1119 // per-inode storage
1120 // (currently for root inode only)
1121
1122 struct C_IO_Inode_Stored : public CInodeIOContext {
1123 version_t version;
1124 Context *fin;
1125 C_IO_Inode_Stored(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1126 void finish(int r) override {
1127 in->_stored(r, version, fin);
1128 }
1129 void print(ostream& out) const override {
1130 out << "inode_store(" << in->ino() << ")";
1131 }
1132 };
1133
1134 object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix)
1135 {
1136 char n[60];
1137 snprintf(n, sizeof(n), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)fg);
1138 ceph_assert(strlen(n) + suffix.size() < sizeof n);
1139 strncat(n, suffix.data(), suffix.size());
1140 return object_t(n);
1141 }
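// Example of the names this produces: the root inode (ino 1, empty frag,
// suffix ".inode") is stored in object "1.00000000.inode", and a dirfrag
// object for, say, ino 0x10000000000 with the default frag would be
// "10000000000.00000000" with no suffix.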
1142
1143 void CInode::store(MDSContext *fin)
1144 {
1145 dout(10) << __func__ << " " << get_version() << dendl;
1146 ceph_assert(is_base());
1147
1148 if (snaprealm)
1149 purge_stale_snap_data(snaprealm->get_snaps());
1150
1151 // encode
1152 bufferlist bl;
1153 string magic = CEPH_FS_ONDISK_MAGIC;
1154 using ceph::encode;
1155 encode(magic, bl);
1156 encode_store(bl, mdcache->mds->mdsmap->get_up_features());
1157
1158 // write it.
1159 SnapContext snapc;
1160 ObjectOperation m;
1161 m.write_full(bl);
1162
1163 object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode");
1164 object_locator_t oloc(mdcache->mds->get_metadata_pool());
1165
1166 Context *newfin =
1167 new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin),
1168 mdcache->mds->finisher);
1169 mdcache->mds->objecter->mutate(oid, oloc, m, snapc,
1170 ceph::real_clock::now(), 0,
1171 newfin);
1172 }
1173
1174 void CInode::_stored(int r, version_t v, Context *fin)
1175 {
1176 if (r < 0) {
1177 dout(1) << "store error " << r << " v " << v << " on " << *this << dendl;
1178 mdcache->mds->clog->error() << "failed to store inode " << ino()
1179 << " object: " << cpp_strerror(r);
1180 mdcache->mds->handle_write_error(r);
1181 fin->complete(r);
1182 return;
1183 }
1184
1185 dout(10) << __func__ << " " << v << " on " << *this << dendl;
1186 if (v == get_projected_version())
1187 mark_clean();
1188
1189 fin->complete(0);
1190 }
1191
1192 void CInode::flush(MDSContext *fin)
1193 {
1194 dout(10) << __func__ << " " << *this << dendl;
1195 ceph_assert(is_auth() && can_auth_pin());
1196
1197 MDSGatherBuilder gather(g_ceph_context);
1198
1199 if (is_dirty_parent()) {
1200 store_backtrace(gather.new_sub());
1201 }
1202 if (is_dirty()) {
1203 if (is_base()) {
1204 store(gather.new_sub());
1205 } else {
1206 parent->dir->commit(0, gather.new_sub());
1207 }
1208 }
1209
1210 if (gather.has_subs()) {
1211 gather.set_finisher(fin);
1212 gather.activate();
1213 } else {
1214 fin->complete(0);
1215 }
1216 }
1217
1218 struct C_IO_Inode_Fetched : public CInodeIOContext {
1219 bufferlist bl, bl2;
1220 Context *fin;
1221 C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {}
1222 void finish(int r) override {
1223 // Ignore 'r', because we fetch from two places, so r is usually CEPHFS_ENOENT
1224 in->_fetched(bl, bl2, fin);
1225 }
1226 void print(ostream& out) const override {
1227 out << "inode_fetch(" << in->ino() << ")";
1228 }
1229 };
1230
1231 void CInode::fetch(MDSContext *fin)
1232 {
1233 dout(10) << __func__ << dendl;
1234
1235 C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin);
1236 C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, mdcache->mds->finisher));
1237
1238 object_t oid = CInode::get_object_name(ino(), frag_t(), "");
1239 object_locator_t oloc(mdcache->mds->get_metadata_pool());
1240
1241 // Old on-disk format: inode stored in xattr of a dirfrag
1242 ObjectOperation rd;
1243 rd.getxattr("inode", &c->bl, NULL);
1244 mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub());
1245
1246 // Current on-disk format: inode stored in a .inode object
1247 object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode");
1248 mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub());
1249
1250 gather.activate();
1251 }
1252
1253 void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
1254 {
1255 dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl;
1256 bufferlist::const_iterator p;
1257 if (bl2.length()) {
1258 p = bl2.cbegin();
1259 } else if (bl.length()) {
1260 p = bl.cbegin();
1261 } else {
1262 derr << "No data while reading inode " << ino() << dendl;
1263 fin->complete(-CEPHFS_ENOENT);
1264 return;
1265 }
1266
1267 using ceph::decode;
1268 // Attempt decode
1269 try {
1270 string magic;
1271 decode(magic, p);
1272 dout(10) << " magic is '" << magic << "' (expecting '"
1273 << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
1274 if (magic != CEPH_FS_ONDISK_MAGIC) {
1275 dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
1276 << "'" << dendl;
1277 fin->complete(-CEPHFS_EINVAL);
1278 } else {
1279 decode_store(p);
1280 dout(10) << "_fetched " << *this << dendl;
1281 fin->complete(0);
1282 }
1283 } catch (buffer::error &err) {
1284 derr << "Corrupt inode " << ino() << ": " << err.what() << dendl;
1285 fin->complete(-CEPHFS_EINVAL);
1286 return;
1287 }
1288 }
1289
1290 void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt)
1291 {
1292 bt.ino = ino();
1293 bt.ancestors.clear();
1294 bt.pool = pool;
1295
1296 CInode *in = this;
1297 CDentry *pdn = get_parent_dn();
1298 while (pdn) {
1299 CInode *diri = pdn->get_dir()->get_inode();
1300 bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), in->get_inode()->version));
1301 in = diri;
1302 pdn = in->get_parent_dn();
1303 }
1304 bt.old_pools.reserve(get_inode()->old_pools.size());
1305 for (auto &p : get_inode()->old_pools) {
1306 // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
1307 if (p != pool)
1308 bt.old_pools.push_back(p);
1309 }
1310 }
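// Shape of the result, with made-up inode numbers: for a file at
// /dir1/dir2/file the ancestors vector is ordered child-first, i.e.
//   ancestors[0] = { dirino: <dir2 ino>, dname: "file", version: ... }
//   ancestors[1] = { dirino: <dir1 ino>, dname: "dir2", version: ... }
//   ancestors[2] = { dirino: 1,          dname: "dir1", version: ... }
// while bt.pool records the pool the backtrace is being written to (for a
// regular file, its current data pool).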
1311
1312 struct C_IO_Inode_StoredBacktrace : public CInodeIOContext {
1313 version_t version;
1314 Context *fin;
1315 C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1316 void finish(int r) override {
1317 in->_stored_backtrace(r, version, fin);
1318 }
1319 void print(ostream& out) const override {
1320 out << "backtrace_store(" << in->ino() << ")";
1321 }
1322 };
1323
1324
1325 void CInode::_commit_ops(int r, C_GatherBuilder &gather_bld,
1326 std::vector<CInodeCommitOperation> &ops_vec,
1327 inode_backtrace_t &bt)
1328 {
1329 dout(10) << __func__ << dendl;
1330
1331 if (r < 0) {
1332 mdcache->mds->handle_write_error_with_lock(r);
1333 return;
1334 }
1335
1336 SnapContext snapc;
1337 object_t oid = get_object_name(ino(), frag_t(), "");
1338
1339 for (auto &op : ops_vec) {
1340 ObjectOperation obj_op;
1341 object_locator_t oloc(op.get_pool());
1342 op.update(obj_op, bt);
1343 mdcache->mds->objecter->mutate(oid, oloc, obj_op, snapc,
1344 ceph::real_clock::now(),
1345 0, gather_bld.new_sub());
1346 }
1347 }
1348
1349 void CInode::_store_backtrace(std::vector<CInodeCommitOperation> &ops_vec,
1350 inode_backtrace_t &bt, int op_prio)
1351 {
1352 dout(10) << __func__ << " on " << *this << dendl;
1353 ceph_assert(is_dirty_parent());
1354
1355 if (op_prio < 0)
1356 op_prio = CEPH_MSG_PRIO_DEFAULT;
1357
1358 auth_pin(this);
1359
1360 const int64_t pool = get_backtrace_pool();
1361 build_backtrace(pool, bt);
1362
1363 std::string_view slink = "";
1364 if (is_symlink() && mdcache->get_symlink_recovery()) {
1365 slink = symlink;
1366 }
1367
1368 ops_vec.emplace_back(op_prio, pool, get_inode()->layout,
1369 mdcache->mds->mdsmap->get_up_features(), slink);
1370
1371 if (!state_test(STATE_DIRTYPOOL) || get_inode()->old_pools.empty()) {
1372 dout(20) << __func__ << ": no dirtypool or no old pools" << dendl;
1373 return;
1374 }
1375
1376 // In the case where DIRTYPOOL is set, we update all old pools backtraces
1377 // such that anyone reading them will see the new pool ID in
1378 // inode_backtrace_t::pool and go read everything else from there.
1379 for (const auto &p : get_inode()->old_pools) {
1380 if (p == pool)
1381 continue;
1382
1383 dout(20) << __func__ << ": updating old pool " << p << dendl;
1384
1385 ops_vec.emplace_back(op_prio, p);
1386 }
1387 }
1388
1389 void CInode::store_backtrace(MDSContext *fin, int op_prio)
1390 {
1391 std::vector<CInodeCommitOperation> ops_vec;
1392 inode_backtrace_t bt;
1393 auto version = get_inode()->backtrace_version;
1394
1395 _store_backtrace(ops_vec, bt, op_prio);
1396
1397 C_GatherBuilder gather(g_ceph_context,
1398 new C_OnFinisher(
1399 new C_IO_Inode_StoredBacktrace(this, version, fin),
1400 mdcache->mds->finisher));
1401 _commit_ops(0, gather, ops_vec, bt);
1402 ceph_assert(gather.has_subs());
1403 gather.activate();
1404 }
1405
1406 void CInode::store_backtrace(CInodeCommitOperations &op, int op_prio)
1407 {
1408 op.version = get_inode()->backtrace_version;
1409 op.in = this;
1410
1411 _store_backtrace(op.ops_vec, op.bt, op_prio);
1412 }
1413
1414 void CInode::_stored_backtrace(int r, version_t v, Context *fin)
1415 {
1416 if (r == -CEPHFS_ENOENT) {
1417 const int64_t pool = get_backtrace_pool();
1418 bool exists = mdcache->mds->objecter->with_osdmap(
1419 [pool](const OSDMap &osd_map) {
1420 return osd_map.have_pg_pool(pool);
1421 });
1422
1423 // This CEPHFS_ENOENT is because the pool doesn't exist (the user deleted it
1424 // out from under us), so the backtrace can never be written, so pretend
1425 // to succeed so that the user can proceed to e.g. delete the file.
1426 if (!exists) {
1427 dout(4) << __func__ << " got CEPHFS_ENOENT: a data pool was deleted "
1428 "beneath us!" << dendl;
1429 r = 0;
1430 }
1431 }
1432
1433 if (r < 0) {
1434 dout(1) << "store backtrace error " << r << " v " << v << dendl;
1435 mdcache->mds->clog->error() << "failed to store backtrace on ino "
1436 << ino() << " object"
1437 << ", pool " << get_backtrace_pool()
1438 << ", errno " << r;
1439 mdcache->mds->handle_write_error(r);
1440 if (fin)
1441 fin->complete(r);
1442 return;
1443 }
1444
1445 dout(10) << __func__ << " v " << v << dendl;
1446
1447 auth_unpin(this);
1448 if (v == get_inode()->backtrace_version)
1449 clear_dirty_parent();
1450 if (fin)
1451 fin->complete(0);
1452 }
1453
1454 void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
1455 {
1456 mdcache->fetch_backtrace(ino(), get_backtrace_pool(), *backtrace, fin);
1457 }
1458
1459 void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool)
1460 {
1461 if (!state_test(STATE_DIRTYPARENT)) {
1462 dout(10) << __func__ << dendl;
1463 state_set(STATE_DIRTYPARENT);
1464 get(PIN_DIRTYPARENT);
1465 ceph_assert(ls);
1466 }
1467 if (dirty_pool)
1468 state_set(STATE_DIRTYPOOL);
1469 if (ls)
1470 ls->dirty_parent_inodes.push_back(&item_dirty_parent);
1471 }
1472
1473 void CInode::clear_dirty_parent()
1474 {
1475 if (state_test(STATE_DIRTYPARENT)) {
1476 dout(10) << __func__ << dendl;
1477 state_clear(STATE_DIRTYPARENT);
1478 state_clear(STATE_DIRTYPOOL);
1479 put(PIN_DIRTYPARENT);
1480 item_dirty_parent.remove_myself();
1481 }
1482 }
1483
1484 void CInode::verify_diri_backtrace(bufferlist &bl, int err)
1485 {
1486 if (is_base() || is_dirty_parent() || !is_auth())
1487 return;
1488
1489 dout(10) << __func__ << dendl;
1490
1491 if (err == 0) {
1492 inode_backtrace_t backtrace;
1493 using ceph::decode;
1494 decode(backtrace, bl);
1495 CDentry *pdn = get_parent_dn();
1496 if (backtrace.ancestors.empty() ||
1497 backtrace.ancestors[0].dname != pdn->get_name() ||
1498 backtrace.ancestors[0].dirino != pdn->get_dir()->ino())
1499 err = -CEPHFS_EINVAL;
1500 }
1501
1502 if (err) {
1503 MDSRank *mds = mdcache->mds;
1504 mds->clog->error() << "bad backtrace on directory inode " << ino();
1505 ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1));
1506
1507 mark_dirty_parent(mds->mdlog->get_current_segment(), false);
1508 mds->mdlog->flush();
1509 }
1510 }
1511
1512 // ------------------
1513 // parent dir
1514
1515
1516 void InodeStoreBase::encode_xattrs(bufferlist &bl) const {
1517 using ceph::encode;
1518 if (xattrs)
1519 encode(*xattrs, bl);
1520 else
1521 encode((__u32)0, bl);
1522 }
1523
1524 void InodeStoreBase::decode_xattrs(bufferlist::const_iterator &p) {
1525 using ceph::decode;
1526 mempool_xattr_map tmp;
1527 decode_noshare(tmp, p);
1528 if (tmp.empty()) {
1529 reset_xattrs(xattr_map_ptr());
1530 } else {
1531 reset_xattrs(allocate_xattr_map(std::move(tmp)));
1532 }
1533 }
1534
1535 void InodeStoreBase::encode_old_inodes(bufferlist &bl, uint64_t features) const {
1536 using ceph::encode;
1537 if (old_inodes)
1538 encode(*old_inodes, bl, features);
1539 else
1540 encode((__u32)0, bl);
1541 }
1542
1543 void InodeStoreBase::decode_old_inodes(bufferlist::const_iterator &p) {
1544 using ceph::decode;
1545 mempool_old_inode_map tmp;
1546 decode(tmp, p);
1547 if (tmp.empty()) {
1548 reset_old_inodes(old_inode_map_ptr());
1549 } else {
1550 reset_old_inodes(allocate_old_inode_map(std::move(tmp)));
1551 }
1552 }
1553
1554 void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features,
1555 const bufferlist *snap_blob) const
1556 {
1557 using ceph::encode;
1558 encode(*inode, bl, features);
1559 if (inode->is_symlink())
1560 encode(symlink, bl);
1561 encode(dirfragtree, bl);
1562 encode_xattrs(bl);
1563
1564 if (snap_blob)
1565 encode(*snap_blob, bl);
1566 else
1567 encode(bufferlist(), bl);
1568 encode_old_inodes(bl, features);
1569 encode(oldest_snap, bl);
1570 encode(damage_flags, bl);
1571 }
1572
1573 void InodeStoreBase::encode(bufferlist &bl, uint64_t features,
1574 const bufferlist *snap_blob) const
1575 {
1576 ENCODE_START(6, 4, bl);
1577 encode_bare(bl, features, snap_blob);
1578 ENCODE_FINISH(bl);
1579 }
1580
1581 void CInode::encode_store(bufferlist& bl, uint64_t features)
1582 {
1583 bufferlist snap_blob;
1584 encode_snap_blob(snap_blob);
1585 InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(),
1586 &snap_blob);
1587 }
1588
1589 void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl,
1590 bufferlist& snap_blob, __u8 struct_v)
1591 {
1592 using ceph::decode;
1593
1594 auto _inode = allocate_inode();
1595 decode(*_inode, bl);
1596
1597 if (_inode->is_symlink()) {
1598 std::string tmp;
1599 decode(tmp, bl);
1600 symlink = std::string_view(tmp);
1601 }
1602 decode(dirfragtree, bl);
1603 decode_xattrs(bl);
1604 decode(snap_blob, bl);
1605
1606 decode_old_inodes(bl);
1607 if (struct_v == 2 && _inode->is_dir()) {
1608 bool default_layout_exists;
1609 decode(default_layout_exists, bl);
1610 if (default_layout_exists) {
1611 decode(struct_v, bl); // this was a default_file_layout
1612 decode(_inode->layout, bl); // but we only care about the layout portion
1613 }
1614 }
1615
1616 if (struct_v >= 5) {
1617 // InodeStore is embedded in dentries without proper versioning, so
1618 // we consume up to the end of the buffer
1619 if (!bl.end()) {
1620 decode(oldest_snap, bl);
1621 }
1622
1623 if (!bl.end()) {
1624 decode(damage_flags, bl);
1625 }
1626 }
1627
1628 reset_inode(std::move(_inode));
1629 }
1630
1631
1632 void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob)
1633 {
1634 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
1635 decode_bare(bl, snap_blob, struct_v);
1636 DECODE_FINISH(bl);
1637 }
1638
1639 void CInode::decode_store(bufferlist::const_iterator& bl)
1640 {
1641 bufferlist snap_blob;
1642 InodeStoreBase::decode(bl, snap_blob);
1643 decode_snap_blob(snap_blob);
1644 }
1645
1646 // ------------------
1647 // locking
1648
1649 SimpleLock* CInode::get_lock(int type)
1650 {
1651 switch (type) {
1652 case CEPH_LOCK_IVERSION: return &versionlock;
1653 case CEPH_LOCK_IFILE: return &filelock;
1654 case CEPH_LOCK_IAUTH: return &authlock;
1655 case CEPH_LOCK_ILINK: return &linklock;
1656 case CEPH_LOCK_IDFT: return &dirfragtreelock;
1657 case CEPH_LOCK_IXATTR: return &xattrlock;
1658 case CEPH_LOCK_ISNAP: return &snaplock;
1659 case CEPH_LOCK_INEST: return &nestlock;
1660 case CEPH_LOCK_IFLOCK: return &flocklock;
1661 case CEPH_LOCK_IPOLICY: return &policylock;
1662 }
1663 return 0;
1664 }
1665
1666 void CInode::set_object_info(MDSCacheObjectInfo &info)
1667 {
1668 info.ino = ino();
1669 info.snapid = last;
1670 }
1671
1672 void CInode::encode_lock_iauth(bufferlist& bl)
1673 {
1674 ENCODE_START(2, 1, bl);
1675 encode(get_inode()->version, bl);
1676 encode(get_inode()->ctime, bl);
1677 encode(get_inode()->mode, bl);
1678 encode(get_inode()->uid, bl);
1679 encode(get_inode()->gid, bl);
1680 encode(get_inode()->fscrypt_auth, bl);
1681 ENCODE_FINISH(bl);
1682 }
1683
1684 void CInode::decode_lock_iauth(bufferlist::const_iterator& p)
1685 {
1686 ceph_assert(!is_auth());
1687 auto _inode = allocate_inode(*get_inode());
1688 DECODE_START(2, p);
1689 decode(_inode->version, p);
1690 utime_t tm;
1691 decode(tm, p);
1692 if (_inode->ctime < tm) _inode->ctime = tm;
1693 decode(_inode->mode, p);
1694 decode(_inode->uid, p);
1695 decode(_inode->gid, p);
1696 if (struct_v >= 2)
1697 decode(_inode->fscrypt_auth, p);
1698 DECODE_FINISH(p);
1699 reset_inode(std::move(_inode));
1700 }
1701
1702 void CInode::encode_lock_ilink(bufferlist& bl)
1703 {
1704 ENCODE_START(1, 1, bl);
1705 encode(get_inode()->version, bl);
1706 encode(get_inode()->ctime, bl);
1707 encode(get_inode()->nlink, bl);
1708 ENCODE_FINISH(bl);
1709 }
1710
1711 void CInode::decode_lock_ilink(bufferlist::const_iterator& p)
1712 {
1713 ceph_assert(!is_auth());
1714 auto _inode = allocate_inode(*get_inode());
1715 DECODE_START(1, p);
1716 decode(_inode->version, p);
1717 utime_t tm;
1718 decode(tm, p);
1719 if (_inode->ctime < tm) _inode->ctime = tm;
1720 decode(_inode->nlink, p);
1721 DECODE_FINISH(p);
1722 reset_inode(std::move(_inode));
1723 }
1724
1725 void CInode::encode_lock_idft(bufferlist& bl)
1726 {
1727 ENCODE_START(1, 1, bl);
1728 if (is_auth()) {
1729 encode(get_inode()->version, bl);
1730 } else {
1731 // treat flushing as dirty when rejoining cache
1732 bool dirty = dirfragtreelock.is_dirty_or_flushing();
1733 encode(dirty, bl);
1734 }
1735 {
1736 // encode the raw tree
1737 encode(dirfragtree, bl);
1738
1739 // also specify which frags are mine
1740 set<frag_t> myfrags;
1741 auto&& dfls = get_dirfrags();
1742 for (const auto& dir : dfls) {
1743 if (dir->is_auth()) {
1744 frag_t fg = dir->get_frag();
1745 myfrags.insert(fg);
1746 }
1747 }
1748 encode(myfrags, bl);
1749 }
1750 ENCODE_FINISH(bl);
1751 }
1752
1753 void CInode::decode_lock_idft(bufferlist::const_iterator& p)
1754 {
1755 inode_ptr _inode;
1756
1757 DECODE_START(1, p);
1758 if (is_auth()) {
1759 bool replica_dirty;
1760 decode(replica_dirty, p);
1761 if (replica_dirty) {
1762 dout(10) << __func__ << " setting dftlock dirty flag" << dendl;
1763 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1764 }
1765 } else {
1766 _inode = allocate_inode(*get_inode());
1767 decode(_inode->version, p);
1768 }
1769 {
1770 fragtree_t temp;
1771 decode(temp, p);
1772 set<frag_t> authfrags;
1773 decode(authfrags, p);
1774 if (is_auth()) {
1775 // auth. believe replica's auth frags only.
1776 for (auto fg : authfrags) {
1777 if (!dirfragtree.is_leaf(fg)) {
1778 dout(10) << " forcing frag " << fg << " to leaf (split|merge)" << dendl;
1779 dirfragtree.force_to_leaf(g_ceph_context, fg);
1780 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1781 }
1782 }
1783 } else {
1784 // replica. take the new tree, BUT make sure any open
1785 // dirfrags remain leaves (they may have split _after_ this
1786 // dft was scattered, or we may still be waiting on the
1787 // notify from the auth)
1788 dirfragtree.swap(temp);
1789 for (const auto &p : dirfrags) {
1790 if (!dirfragtree.is_leaf(p.first)) {
1791 dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl;
1792 dirfragtree.force_to_leaf(g_ceph_context, p.first);
1793 }
1794 if (p.second->is_auth())
1795 p.second->state_clear(CDir::STATE_DIRTYDFT);
1796 }
1797 }
1798 if (g_conf()->mds_debug_frag)
1799 verify_dirfrags();
1800 }
1801 DECODE_FINISH(p);
1802
1803 if (_inode)
1804 reset_inode(std::move(_inode));
1805 }
1806
1807 void CInode::encode_lock_ifile(bufferlist& bl)
1808 {
1809 ENCODE_START(2, 1, bl);
1810 if (is_auth()) {
1811 encode(get_inode()->version, bl);
1812 encode(get_inode()->ctime, bl);
1813 encode(get_inode()->mtime, bl);
1814 encode(get_inode()->atime, bl);
1815 encode(get_inode()->time_warp_seq, bl);
1816 if (!is_dir()) {
1817 encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features());
1818 encode(get_inode()->size, bl);
1819 encode(get_inode()->truncate_seq, bl);
1820 encode(get_inode()->truncate_size, bl);
1821 encode(get_inode()->client_ranges, bl);
1822 encode(get_inode()->inline_data, bl);
1823 }
1824 } else {
1825 // treat flushing as dirty when rejoining cache
1826 bool dirty = filelock.is_dirty_or_flushing();
1827 encode(dirty, bl);
1828 }
1829 dout(15) << __func__ << " inode.dirstat is " << get_inode()->dirstat << dendl;
1830 encode(get_inode()->dirstat, bl); // only meaningful if i am auth.
1831 bufferlist tmp;
1832 __u32 n = 0;
1833 for (const auto &p : dirfrags) {
1834 frag_t fg = p.first;
1835 CDir *dir = p.second;
1836 if (is_auth() || dir->is_auth()) {
1837 const auto& pf = dir->get_projected_fnode();
1838 dout(15) << fg << " " << *dir << dendl;
1839 dout(20) << fg << " fragstat " << pf->fragstat << dendl;
1840 dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
1841 encode(fg, tmp);
1842 encode(dir->first, tmp);
1843 encode(pf->fragstat, tmp);
1844 encode(pf->accounted_fragstat, tmp);
1845 n++;
1846 }
1847 }
1848 encode(n, bl);
1849 bl.claim_append(tmp);
1850 if (is_auth())
1851 encode(get_inode()->fscrypt_file, bl);
1852 ENCODE_FINISH(bl);
1853 }
1854
1855 void CInode::decode_lock_ifile(bufferlist::const_iterator& p)
1856 {
1857 inode_ptr _inode;
1858
1859 DECODE_START(2, p);
1860 if (!is_auth()) {
1861 _inode = allocate_inode(*get_inode());
1862
1863 decode(_inode->version, p);
1864 utime_t tm;
1865 decode(tm, p);
1866 if (_inode->ctime < tm) _inode->ctime = tm;
1867 decode(_inode->mtime, p);
1868 decode(_inode->atime, p);
1869 decode(_inode->time_warp_seq, p);
1870 if (!is_dir()) {
1871 decode(_inode->layout, p);
1872 decode(_inode->size, p);
1873 decode(_inode->truncate_seq, p);
1874 decode(_inode->truncate_size, p);
1875 decode(_inode->client_ranges, p);
1876 decode(_inode->inline_data, p);
1877 }
1878 } else {
1879 bool replica_dirty;
1880 decode(replica_dirty, p);
1881 if (replica_dirty) {
1882 dout(10) << __func__ << " setting filelock dirty flag" << dendl;
1883 filelock.mark_dirty(); // ok bc we're auth and caller will handle
1884 }
1885 }
1886
1887 frag_info_t dirstat;
1888 decode(dirstat, p);
1889 if (!is_auth()) {
1890 dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl;
1891 _inode->dirstat = dirstat; // take inode summation if replica
1892 }
1893 __u32 n;
1894 decode(n, p);
1895 dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
1896 while (n--) {
1897 frag_t fg;
1898 snapid_t fgfirst;
1899 frag_info_t fragstat;
1900 frag_info_t accounted_fragstat;
1901 decode(fg, p);
1902 decode(fgfirst, p);
1903 decode(fragstat, p);
1904 decode(accounted_fragstat, p);
1905 dout(10) << fg << " [" << fgfirst << ",head] " << dendl;
1906 dout(10) << fg << " fragstat " << fragstat << dendl;
1907 dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl;
1908
1909 CDir *dir = get_dirfrag(fg);
1910 if (is_auth()) {
1911 ceph_assert(dir); // i am auth; i had better have this dir open
1912 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1913 << " on " << *dir << dendl;
1914 dir->first = fgfirst;
1915 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
1916 _fnode->fragstat = fragstat;
1917 _fnode->accounted_fragstat = accounted_fragstat;
1918 dir->reset_fnode(std::move(_fnode));
1919 if (!(fragstat == accounted_fragstat)) {
1920 dout(10) << fg << " setting filelock updated flag" << dendl;
1921 filelock.mark_dirty(); // ok bc we're auth and caller will handle
1922 }
1923 } else {
1924 if (dir && dir->is_auth()) {
1925 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1926 << " on " << *dir << dendl;
1927 dir->first = fgfirst;
1928 const auto& pf = dir->get_projected_fnode();
1929 finish_scatter_update(&filelock, dir,
1930 _inode->dirstat.version, pf->accounted_fragstat.version);
1931 }
1932 }
1933 }
1934 if (!is_auth() && struct_v >= 2)
1935 decode(_inode->fscrypt_file, p);
1936 DECODE_FINISH(p);
1937
1938 if (_inode)
1939 reset_inode(std::move(_inode));
1940 }
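// Note the deliberate asymmetry between encode_lock_ifile() and
// decode_lock_ifile(): the auth encodes full file state (timestamps, size,
// layout, ...) plus per-dirfrag fragstats, while a replica encodes only a
// "dirty" flag; on decode the roles are mirrored, so a replica installs the
// auth's values via reset_inode() and the auth merely learns whether any
// replica had the filelock dirty or flushing.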
1941
1942 void CInode::encode_lock_inest(bufferlist& bl)
1943 {
1944 ENCODE_START(1, 1, bl);
1945 if (is_auth()) {
1946 encode(get_inode()->version, bl);
1947 } else {
1948 // treat flushing as dirty when rejoining cache
1949 bool dirty = nestlock.is_dirty_or_flushing();
1950 encode(dirty, bl);
1951 }
1952 dout(15) << __func__ << " inode.rstat is " << get_inode()->rstat << dendl;
1953 encode(get_inode()->rstat, bl); // only meaningful if i am auth.
1954 bufferlist tmp;
1955 __u32 n = 0;
1956 for (const auto &p : dirfrags) {
1957 frag_t fg = p.first;
1958 CDir *dir = p.second;
1959 if (is_auth() || dir->is_auth()) {
1960 const auto& pf = dir->get_projected_fnode();
1961 dout(10) << __func__ << " " << fg << " dir " << *dir << dendl;
1962 dout(10) << __func__ << " " << fg << " rstat " << pf->rstat << dendl;
1963 dout(10) << __func__ << " " << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
1964 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
1965 encode(fg, tmp);
1966 encode(dir->first, tmp);
1967 encode(pf->rstat, tmp);
1968 encode(pf->accounted_rstat, tmp);
1969 encode(dir->dirty_old_rstat, tmp);
1970 n++;
1971 }
1972 }
1973 encode(n, bl);
1974 bl.claim_append(tmp);
1975 ENCODE_FINISH(bl);
1976 }
1977
1978 void CInode::decode_lock_inest(bufferlist::const_iterator& p)
1979 {
1980 inode_ptr _inode;
1981
1982 DECODE_START(1, p);
1983 if (is_auth()) {
1984 bool replica_dirty;
1985 decode(replica_dirty, p);
1986 if (replica_dirty) {
1987 dout(10) << __func__ << " setting nestlock dirty flag" << dendl;
1988 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
1989 }
1990 } else {
1991 _inode = allocate_inode(*get_inode());
1992 decode(_inode->version, p);
1993 }
1994 nest_info_t rstat;
1995 decode(rstat, p);
1996 if (!is_auth()) {
1997 dout(10) << __func__ << " taking inode rstat " << rstat << " for " << *this << dendl;
1998 _inode->rstat = rstat; // take inode summation if replica
1999 }
2000 __u32 n;
2001 decode(n, p);
2002 while (n--) {
2003 frag_t fg;
2004 snapid_t fgfirst;
2005 nest_info_t rstat;
2006 nest_info_t accounted_rstat;
2007 decltype(CDir::dirty_old_rstat) dirty_old_rstat;
2008 decode(fg, p);
2009 decode(fgfirst, p);
2010 decode(rstat, p);
2011 decode(accounted_rstat, p);
2012 decode(dirty_old_rstat, p);
2013 dout(10) << __func__ << " " << fg << " [" << fgfirst << ",head]" << dendl;
2014 dout(10) << __func__ << " " << fg << " rstat " << rstat << dendl;
2015 dout(10) << __func__ << " " << fg << " accounted_rstat " << accounted_rstat << dendl;
2016 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dirty_old_rstat << dendl;
2017 CDir *dir = get_dirfrag(fg);
2018 if (is_auth()) {
2019 ceph_assert(dir); // i am auth; i had better have this dir open
2020 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
2021 << " on " << *dir << dendl;
2022 dir->first = fgfirst;
2023 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
2024 _fnode->rstat = rstat;
2025 _fnode->accounted_rstat = accounted_rstat;
2026 dir->reset_fnode(std::move(_fnode));
2027 dir->dirty_old_rstat.swap(dirty_old_rstat);
2028 if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) {
2029 dout(10) << fg << " setting nestlock updated flag" << dendl;
2030 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
2031 }
2032 } else {
2033 if (dir && dir->is_auth()) {
2034 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
2035 << " on " << *dir << dendl;
2036 dir->first = fgfirst;
2037 const auto& pf = dir->get_projected_fnode();
2038 finish_scatter_update(&nestlock, dir,
2039 _inode->rstat.version, pf->accounted_rstat.version);
2040 }
2041 }
2042 }
2043 DECODE_FINISH(p);
2044
2045 if (_inode)
2046 reset_inode(std::move(_inode));
2047 }
2048
2049 void CInode::encode_lock_ixattr(bufferlist& bl)
2050 {
2051 ENCODE_START(2, 1, bl);
2052 encode(get_inode()->version, bl);
2053 encode(get_inode()->ctime, bl);
2054 encode_xattrs(bl);
2055 encode(get_inode()->xattr_version, bl);
2056 ENCODE_FINISH(bl);
2057 }
2058
2059 void CInode::decode_lock_ixattr(bufferlist::const_iterator& p)
2060 {
2061 ceph_assert(!is_auth());
2062 auto _inode = allocate_inode(*get_inode());
2063 DECODE_START(2, p);
2064 decode(_inode->version, p);
2065 utime_t tm;
2066 decode(tm, p);
2067 if (_inode->ctime < tm)
2068 _inode->ctime = tm;
2069 decode_xattrs(p);
2070 if (struct_v >= 2) {
2071 decode(_inode->xattr_version, p);
2072 }
2073 DECODE_FINISH(p);
2074 reset_inode(std::move(_inode));
2075 }
2076
2077 void CInode::encode_lock_isnap(bufferlist& bl)
2078 {
2079 ENCODE_START(1, 1, bl);
2080 encode(get_inode()->version, bl);
2081 encode(get_inode()->ctime, bl);
2082 encode_snap(bl);
2083 ENCODE_FINISH(bl);
2084 }
2085
2086 void CInode::decode_lock_isnap(bufferlist::const_iterator& p)
2087 {
2088 ceph_assert(!is_auth());
2089 auto _inode = allocate_inode(*get_inode());
2090 DECODE_START(1, p);
2091 decode(_inode->version, p);
2092 utime_t tm;
2093 decode(tm, p);
2094 if (_inode->ctime < tm) _inode->ctime = tm;
2095 decode_snap(p);
2096 DECODE_FINISH(p);
2097 reset_inode(std::move(_inode));
2098 }
2099
2100 void CInode::encode_lock_iflock(bufferlist& bl)
2101 {
2102 ENCODE_START(1, 1, bl);
2103 encode(get_inode()->version, bl);
2104 _encode_file_locks(bl);
2105 ENCODE_FINISH(bl);
2106 }
2107
2108 void CInode::decode_lock_iflock(bufferlist::const_iterator& p)
2109 {
2110 ceph_assert(!is_auth());
2111 auto _inode = allocate_inode(*get_inode());
2112 DECODE_START(1, p);
2113 decode(_inode->version, p);
2114 _decode_file_locks(p);
2115 DECODE_FINISH(p);
2116 reset_inode(std::move(_inode));
2117 }
2118
2119 void CInode::encode_lock_ipolicy(bufferlist& bl)
2120 {
2121 ENCODE_START(2, 1, bl);
2122 if (is_dir()) {
2123 encode(get_inode()->version, bl);
2124 encode(get_inode()->ctime, bl);
2125 encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features());
2126 encode(get_inode()->quota, bl);
2127 encode(get_inode()->export_pin, bl);
2128 encode(get_inode()->export_ephemeral_distributed_pin, bl);
2129 encode(get_inode()->export_ephemeral_random_pin, bl);
2130 }
2131 ENCODE_FINISH(bl);
2132 }
2133
2134 void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p)
2135 {
2136 ceph_assert(!is_auth());
2137 auto _inode = allocate_inode(*get_inode());
2138 DECODE_START(1, p);
2139 if (is_dir()) {
2140 decode(_inode->version, p);
2141 utime_t tm;
2142 decode(tm, p);
2143 if (_inode->ctime < tm)
2144 _inode->ctime = tm;
2145 decode(_inode->layout, p);
2146 decode(_inode->quota, p);
2147 decode(_inode->export_pin, p);
2148 if (struct_v >= 2) {
2149 decode(_inode->export_ephemeral_distributed_pin, p);
2150 decode(_inode->export_ephemeral_random_pin, p);
2151 }
2152 }
2153 DECODE_FINISH(p);
2154
2155 bool pin_updated = (get_inode()->export_pin != _inode->export_pin) ||
2156 (get_inode()->export_ephemeral_distributed_pin !=
2157 _inode->export_ephemeral_distributed_pin);
2158 reset_inode(std::move(_inode));
2159 maybe_export_pin(pin_updated);
2160 }
2161
2162 void CInode::encode_lock_state(int type, bufferlist& bl)
2163 {
2164 ENCODE_START(1, 1, bl);
2165 encode(first, bl);
2166 if (!is_base())
2167 encode(parent->first, bl);
2168
2169 switch (type) {
2170 case CEPH_LOCK_IAUTH:
2171 encode_lock_iauth(bl);
2172 break;
2173
2174 case CEPH_LOCK_ILINK:
2175 encode_lock_ilink(bl);
2176 break;
2177
2178 case CEPH_LOCK_IDFT:
2179 encode_lock_idft(bl);
2180 break;
2181
2182 case CEPH_LOCK_IFILE:
2183 encode_lock_ifile(bl);
2184 break;
2185
2186 case CEPH_LOCK_INEST:
2187 encode_lock_inest(bl);
2188 break;
2189
2190 case CEPH_LOCK_IXATTR:
2191 encode_lock_ixattr(bl);
2192 break;
2193
2194 case CEPH_LOCK_ISNAP:
2195 encode_lock_isnap(bl);
2196 break;
2197
2198 case CEPH_LOCK_IFLOCK:
2199 encode_lock_iflock(bl);
2200 break;
2201
2202 case CEPH_LOCK_IPOLICY:
2203 encode_lock_ipolicy(bl);
2204 break;
2205
2206 default:
2207 ceph_abort();
2208 }
2209 ENCODE_FINISH(bl);
2210 }
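// Every lock-state payload shares a small common header (the inode's
// 'first' snapid and, for non-base inodes, the parent dentry's 'first'),
// followed by the per-lock-type body chosen by the switch above;
// decode_lock_state() below consumes the same framing in the same order.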
2211
2212 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2213
2214 void CInode::decode_lock_state(int type, const bufferlist& bl)
2215 {
2216 auto p = bl.cbegin();
2217
2218 DECODE_START(1, p);
2219
2220 snapid_t newfirst;
2221 using ceph::decode;
2222 decode(newfirst, p);
2223 if (!is_auth() && newfirst != first) {
2224 dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl;
2225 first = newfirst;
2226 }
2227 if (!is_base()) {
2228 decode(newfirst, p);
2229 if (!parent->is_auth() && newfirst != parent->first) {
2230 dout(10) << __func__ << " parent first " << parent->first << " -> " << newfirst << dendl;
2231 parent->first = newfirst;
2232 }
2233 }
2234
2235 switch (type) {
2236 case CEPH_LOCK_IAUTH:
2237 decode_lock_iauth(p);
2238 break;
2239
2240 case CEPH_LOCK_ILINK:
2241 decode_lock_ilink(p);
2242 break;
2243
2244 case CEPH_LOCK_IDFT:
2245 decode_lock_idft(p);
2246 break;
2247
2248 case CEPH_LOCK_IFILE:
2249 decode_lock_ifile(p);
2250 break;
2251
2252 case CEPH_LOCK_INEST:
2253 decode_lock_inest(p);
2254 break;
2255
2256 case CEPH_LOCK_IXATTR:
2257 decode_lock_ixattr(p);
2258 break;
2259
2260 case CEPH_LOCK_ISNAP:
2261 decode_lock_isnap(p);
2262 break;
2263
2264 case CEPH_LOCK_IFLOCK:
2265 decode_lock_iflock(p);
2266 break;
2267
2268 case CEPH_LOCK_IPOLICY:
2269 decode_lock_ipolicy(p);
2270 break;
2271
2272 default:
2273 ceph_abort();
2274 }
2275 DECODE_FINISH(p);
2276 }
2277
2278
2279 bool CInode::is_dirty_scattered()
2280 {
2281 return
2282 filelock.is_dirty_or_flushing() ||
2283 nestlock.is_dirty_or_flushing() ||
2284 dirfragtreelock.is_dirty_or_flushing();
2285 }
2286
2287 void CInode::clear_scatter_dirty()
2288 {
2289 filelock.remove_dirty();
2290 nestlock.remove_dirty();
2291 dirfragtreelock.remove_dirty();
2292 }
2293
2294 void CInode::clear_dirty_scattered(int type)
2295 {
2296 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2297 ceph_assert(is_dir());
2298 switch (type) {
2299 case CEPH_LOCK_IFILE:
2300 item_dirty_dirfrag_dir.remove_myself();
2301 break;
2302
2303 case CEPH_LOCK_INEST:
2304 item_dirty_dirfrag_nest.remove_myself();
2305 break;
2306
2307 case CEPH_LOCK_IDFT:
2308 item_dirty_dirfrag_dirfragtree.remove_myself();
2309 break;
2310
2311 default:
2312 ceph_abort();
2313 }
2314 }
2315
2316
2317 /*
2318 * when we initially scatter a lock, we need to check if any of the dirfrags
2319 * have out of date accounted_rstat/fragstat. if so, mark the lock stale.
2320 */
2321 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2322 void CInode::start_scatter(ScatterLock *lock)
2323 {
2324 dout(10) << __func__ << " " << *lock << " on " << *this << dendl;
2325 ceph_assert(is_auth());
2326 const auto& pi = get_projected_inode();
2327
2328 for (const auto &p : dirfrags) {
2329 frag_t fg = p.first;
2330 CDir *dir = p.second;
2331 const auto& pf = dir->get_projected_fnode();
2332 dout(20) << fg << " " << *dir << dendl;
2333
2334 if (!dir->is_auth())
2335 continue;
2336
2337 switch (lock->get_type()) {
2338 case CEPH_LOCK_IFILE:
2339 finish_scatter_update(lock, dir, pi->dirstat.version, pf->accounted_fragstat.version);
2340 break;
2341
2342 case CEPH_LOCK_INEST:
2343 finish_scatter_update(lock, dir, pi->rstat.version, pf->accounted_rstat.version);
2344 break;
2345
2346 case CEPH_LOCK_IDFT:
2347 dir->state_clear(CDir::STATE_DIRTYDFT);
2348 break;
2349 }
2350 }
2351 }
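// Illustration (hypothetical versions): with pi->dirstat.version == 7, a
// dirfrag whose accounted_fragstat.version is already 7 is up to date and
// finish_scatter_update() leaves it alone; a frag still at 6 gets a small
// journaled EUpdate stamping its accounted stats with version 7, unless it
// is frozen or not yet loaded, in which case it is left for a later lock
// cycle.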
2352
2353
2354 class C_Inode_FragUpdate : public MDSLogContextBase {
2355 protected:
2356 CInode *in;
2357 CDir *dir;
2358 MutationRef mut;
2359 MDSRank *get_mds() override {return in->mdcache->mds;}
2360 void finish(int r) override {
2361 in->_finish_frag_update(dir, mut);
2362 }
2363
2364 public:
2365 C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {}
2366 };
2367
2368 void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
2369 version_t inode_version, version_t dir_accounted_version)
2370 {
2371 frag_t fg = dir->get_frag();
2372 ceph_assert(dir->is_auth());
2373
2374 if (dir->is_frozen()) {
2375 dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl;
2376 } else if (dir->get_version() == 0) {
2377 dout(10) << __func__ << " " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl;
2378 } else {
2379 if (dir_accounted_version != inode_version) {
2380 dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;
2381
2382 MDLog *mdlog = mdcache->mds->mdlog;
2383 MutationRef mut(new MutationImpl());
2384 mut->ls = mdlog->get_current_segment();
2385
2386 auto pf = dir->project_fnode(mut);
2387
2388 std::string_view ename;
2389 switch (lock->get_type()) {
2390 case CEPH_LOCK_IFILE:
2391 pf->fragstat.version = inode_version;
2392 pf->accounted_fragstat = pf->fragstat;
2393 ename = "lock ifile accounted scatter stat update";
2394 break;
2395 case CEPH_LOCK_INEST:
2396 pf->rstat.version = inode_version;
2397 pf->accounted_rstat = pf->rstat;
2398 ename = "lock inest accounted scatter stat update";
2399
2400 if (!is_auth() && lock->get_state() == LOCK_MIX) {
2401 dout(10) << __func__ << " try to assimilate dirty rstat on "
2402 << *dir << dendl;
2403 dir->assimilate_dirty_rstat_inodes(mut);
2404 }
2405
2406 break;
2407 default:
2408 ceph_abort();
2409 }
2410
2411 EUpdate *le = new EUpdate(mdlog, ename);
2412 mdlog->start_entry(le);
2413 le->metablob.add_dir_context(dir);
2414 le->metablob.add_dir(dir, true);
2415
2416 ceph_assert(!dir->is_frozen());
2417 mut->auth_pin(dir);
2418
2419 if (lock->get_type() == CEPH_LOCK_INEST &&
2420 !is_auth() && lock->get_state() == LOCK_MIX) {
2421 dout(10) << __func__ << " finish assimilating dirty rstat on "
2422 << *dir << dendl;
2423 dir->assimilate_dirty_rstat_inodes_finish(&le->metablob);
2424
2425 if (!(pf->rstat == pf->accounted_rstat)) {
2426 if (!mut->is_wrlocked(&nestlock)) {
2427 mdcache->mds->locker->wrlock_force(&nestlock, mut);
2428 }
2429
2430 mdcache->mds->locker->mark_updated_scatterlock(&nestlock);
2431 mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest);
2432 }
2433 }
2434
2435 pf->version = dir->pre_dirty();
2436
2437 mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut));
2438 } else {
2439 dout(10) << __func__ << " " << fg << " accounted " << *lock
2440 << " scatter stat unchanged at v" << dir_accounted_version << dendl;
2441 }
2442 }
2443 }
2444
2445 void CInode::_finish_frag_update(CDir *dir, MutationRef& mut)
2446 {
2447 dout(10) << __func__ << " on " << *dir << dendl;
2448 mut->apply();
2449 mdcache->mds->locker->drop_locks(mut.get());
2450 mut->cleanup();
2451 }
2452
2453
2454 /*
2455 * when we gather a lock, we need to assimilate dirfrag changes into the inode
2456 * state. it's possible we can't update the dirfrag accounted_rstat/fragstat
2457 * because the frag is auth and frozen, or the replica couldn't update it for the
2458 * same reason. hopefully it will get updated the next time the lock cycles.
2459 *
2460 * we have two dimensions of behavior:
2461 * - we may be (auth and !frozen), and able to update, or not.
2462 * - the frag may be stale, or not.
2463 *
2464 * if the frag is non-stale, we want to assimilate the diff into the
2465 * inode, regardless of whether it's auth or updateable.
2466 *
2467 * if we update the frag, we want to set accounted_fragstat = frag,
2468 * both if we took the diff or it was stale and we are making it
2469 * un-stale.
2470 */
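/*
 * worked example (illustrative numbers): if dirstat.version was 6, the
 * gather below bumps it to 7 and compares each frag's
 * accounted_fragstat.version against 6 (version - 1). matching frags
 * contribute their fragstat/accounted_fragstat delta to the inode; stale
 * frags are skipped; frags we can update (auth, loaded, not frozen) also
 * get their accounted stats stamped with version 7.
 */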
2471 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2472 void CInode::finish_scatter_gather_update(int type, MutationRef& mut)
2473 {
2474 LogChannelRef clog = mdcache->mds->clog;
2475
2476 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2477 ceph_assert(is_auth());
2478
2479 switch (type) {
2480 case CEPH_LOCK_IFILE:
2481 {
2482 fragtree_t tmpdft = dirfragtree;
2483 struct frag_info_t dirstat;
2484 bool dirstat_valid = true;
2485
2486 // adjust summation
2487 ceph_assert(is_auth());
2488 auto pi = _get_projected_inode();
2489
2490 bool touched_mtime = false, touched_chattr = false;
2491 dout(20) << " orig dirstat " << pi->dirstat << dendl;
2492 pi->dirstat.version++;
2493 for (const auto &p : dirfrags) {
2494 frag_t fg = p.first;
2495 CDir *dir = p.second;
2496 dout(20) << fg << " " << *dir << dendl;
2497
2498 bool update;
2499 if (dir->get_version() != 0) {
2500 update = dir->is_auth() && !dir->is_frozen();
2501 } else {
2502 update = false;
2503 dirstat_valid = false;
2504 }
2505
2506 CDir::fnode_const_ptr pf;
2507 if (update) {
2508 mut->auth_pin(dir);
2509 pf = dir->project_fnode(mut);
2510 } else {
2511 pf = dir->get_projected_fnode();
2512 }
2513
2514 if (pf->accounted_fragstat.version == pi->dirstat.version - 1) {
2515 dout(20) << fg << " fragstat " << pf->fragstat << dendl;
2516 dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
2517 pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2518 } else {
2519 dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl;
2520 }
2521
2522 if (pf->fragstat.nfiles < 0 ||
2523 pf->fragstat.nsubdirs < 0) {
2524 clog->error() << "bad/negative dir size on "
2525 << dir->dirfrag() << " " << pf->fragstat;
2526 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
2527
2528 auto _pf = const_cast<fnode_t*>(pf.get());
2529 if (pf->fragstat.nfiles < 0)
2530 _pf->fragstat.nfiles = 0;
2531 if (pf->fragstat.nsubdirs < 0)
2532 _pf->fragstat.nsubdirs = 0;
2533 }
2534
2535 if (update) {
2536 auto _pf = const_cast<fnode_t*>(pf.get());
2537 _pf->accounted_fragstat = _pf->fragstat;
2538 _pf->fragstat.version = _pf->accounted_fragstat.version = pi->dirstat.version;
2539 _pf->version = dir->pre_dirty();
2540 dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl;
2541 }
2542
2543 tmpdft.force_to_leaf(g_ceph_context, fg);
2544 dirstat.add(pf->fragstat);
2545 }
2546 if (touched_mtime)
2547 pi->mtime = pi->ctime = pi->dirstat.mtime;
2548 if (touched_chattr)
2549 pi->change_attr++;
2550
2551 dout(20) << " final dirstat " << pi->dirstat << dendl;
2552
2553 if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) {
2554 frag_vec_t leaves;
2555 tmpdft.get_leaves_under(frag_t(), leaves);
2556 for (const auto& leaf : leaves) {
2557 if (!dirfrags.count(leaf)) {
2558 dirstat_valid = false;
2559 break;
2560 }
2561 }
2562 if (dirstat_valid) {
2563 if (state_test(CInode::STATE_REPAIRSTATS)) {
2564 dout(20) << " dirstat mismatch, fixing" << dendl;
2565 } else {
2566 clog->error() << "unmatched fragstat on " << ino() << ", inode has "
2567 << pi->dirstat << ", dirfrags have " << dirstat;
2568 ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter);
2569 }
2570 // trust the dirfrags for now
2571 version_t v = pi->dirstat.version;
2572 if (pi->dirstat.mtime > dirstat.mtime)
2573 dirstat.mtime = pi->dirstat.mtime;
2574 if (pi->dirstat.change_attr > dirstat.change_attr)
2575 dirstat.change_attr = pi->dirstat.change_attr;
2576 pi->dirstat = dirstat;
2577 pi->dirstat.version = v;
2578 }
2579 }
2580
2581 if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0) {
2582 std::string path;
2583 make_path_string(path);
2584 clog->error() << "Inconsistent statistics detected: fragstat on inode "
2585 << ino() << " (" << path << "), inode has " << pi->dirstat;
2586 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
2587
2588 if (pi->dirstat.nfiles < 0)
2589 pi->dirstat.nfiles = 0;
2590 if (pi->dirstat.nsubdirs < 0)
2591 pi->dirstat.nsubdirs = 0;
2592 }
2593 }
2594 break;
2595
2596 case CEPH_LOCK_INEST:
2597 {
2598 // adjust summation
2599 ceph_assert(is_auth());
2600
2601 fragtree_t tmpdft = dirfragtree;
2602 nest_info_t rstat;
2603 bool rstat_valid = true;
2604
2605 rstat.rsubdirs = 1;
2606 if (const sr_t *srnode = get_projected_srnode(); srnode)
2607 rstat.rsnaps = srnode->snaps.size();
2608
2609 auto pi = _get_projected_inode();
2610 dout(20) << " orig rstat " << pi->rstat << dendl;
2611 pi->rstat.version++;
2612 for (const auto &p : dirfrags) {
2613 frag_t fg = p.first;
2614 CDir *dir = p.second;
2615 dout(20) << fg << " " << *dir << dendl;
2616
2617 bool update;
2618 if (dir->get_version() != 0) {
2619 update = dir->is_auth() && !dir->is_frozen();
2620 } else {
2621 update = false;
2622 rstat_valid = false;
2623 }
2624
2625 CDir::fnode_const_ptr pf;
2626 if (update) {
2627 mut->auth_pin(dir);
2628 pf = dir->project_fnode(mut);
2629 } else {
2630 pf = dir->get_projected_fnode();
2631 }
2632
2633 if (pf->accounted_rstat.version == pi->rstat.version-1) {
2634 // only pull this frag's dirty rstat inodes into the frag if
2635 // the frag is non-stale and updateable. if it's stale,
2636 // that info will just get thrown out!
2637 if (update)
2638 dir->assimilate_dirty_rstat_inodes(mut);
2639
2640 dout(20) << fg << " rstat " << pf->rstat << dendl;
2641 dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
2642 dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
2643 mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
2644 dir->first, CEPH_NOSNAP, this, true);
2645 for (auto &p : dir->dirty_old_rstat) {
2646 mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat,
2647 p.second.first, p.first, this, true);
2648 }
2649 if (update) // dir contents not valid if frozen or non-auth
2650 dir->check_rstats();
2651 } else {
2652 dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl;
2653 }
2654 if (update) {
2655 auto _pf = const_cast<fnode_t*>(pf.get());
2656 _pf->accounted_rstat = pf->rstat;
2657 _pf->rstat.version = _pf->accounted_rstat.version = pi->rstat.version;
2658 _pf->version = dir->pre_dirty();
2659 dir->dirty_old_rstat.clear();
2660 dir->check_rstats();
2661 dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl;
2662 }
2663
2664 tmpdft.force_to_leaf(g_ceph_context, fg);
2665 rstat.add(pf->rstat);
2666 }
2667 dout(20) << " final rstat " << pi->rstat << dendl;
2668
2669 if (rstat_valid && !rstat.same_sums(pi->rstat)) {
2670 frag_vec_t leaves;
2671 tmpdft.get_leaves_under(frag_t(), leaves);
2672 for (const auto& leaf : leaves) {
2673 if (!dirfrags.count(leaf)) {
2674 rstat_valid = false;
2675 break;
2676 }
2677 }
2678 if (rstat_valid) {
2679 if (state_test(CInode::STATE_REPAIRSTATS)) {
2680 dout(20) << " rstat mismatch, fixing" << dendl;
2681 } else {
2682 clog->error() << "inconsistent rstat on inode " << ino()
2683 << ", inode has " << pi->rstat
2684 << ", directory fragments have " << rstat;
2685 ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter);
2686 }
2687 // trust the dirfrag for now
2688 version_t v = pi->rstat.version;
2689 if (pi->rstat.rctime > rstat.rctime)
2690 rstat.rctime = pi->rstat.rctime;
2691 pi->rstat = rstat;
2692 pi->rstat.version = v;
2693 }
2694 }
2695
2696 mdcache->broadcast_quota_to_client(this);
2697 }
2698 break;
2699
2700 case CEPH_LOCK_IDFT:
2701 break;
2702
2703 default:
2704 ceph_abort();
2705 }
2706 }
2707
2708 void CInode::finish_scatter_gather_update_accounted(int type, EMetaBlob *metablob)
2709 {
2710 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2711 ceph_assert(is_auth());
2712
2713 for (const auto &p : dirfrags) {
2714 CDir *dir = p.second;
2715 if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen())
2716 continue;
2717
2718 if (type == CEPH_LOCK_IDFT)
2719 continue; // nothing to do.
2720
2721 if (type == CEPH_LOCK_INEST)
2722 dir->assimilate_dirty_rstat_inodes_finish(metablob);
2723
2724 dout(10) << " journaling updated frag accounted_ on " << *dir << dendl;
2725 ceph_assert(dir->is_projected());
2726 metablob->add_dir(dir, true);
2727 }
2728 }
2729
2730 // waiting
2731
2732 bool CInode::is_frozen() const
2733 {
2734 if (is_frozen_inode()) return true;
2735 if (parent && parent->dir->is_frozen()) return true;
2736 return false;
2737 }
2738
2739 bool CInode::is_frozen_dir() const
2740 {
2741 if (parent && parent->dir->is_frozen_dir()) return true;
2742 return false;
2743 }
2744
2745 bool CInode::is_freezing() const
2746 {
2747 if (is_freezing_inode()) return true;
2748 if (parent && parent->dir->is_freezing()) return true;
2749 return false;
2750 }
2751
2752 void CInode::add_dir_waiter(frag_t fg, MDSContext *c)
2753 {
2754 if (waiting_on_dir.empty())
2755 get(PIN_DIRWAITER);
2756 waiting_on_dir[fg].push_back(c);
2757 dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl;
2758 }
2759
2760 void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls)
2761 {
2762 if (waiting_on_dir.empty())
2763 return;
2764
2765 auto it = waiting_on_dir.find(fg);
2766 if (it != waiting_on_dir.end()) {
2767 dout(10) << __func__ << " frag " << fg << " on " << *this << dendl;
2768 auto& waiting = it->second;
2769 ls.insert(ls.end(), waiting.begin(), waiting.end());
2770 waiting_on_dir.erase(it);
2771
2772 if (waiting_on_dir.empty())
2773 put(PIN_DIRWAITER);
2774 }
2775 }
2776
2777 void CInode::add_waiter(uint64_t tag, MDSContext *c)
2778 {
2779 dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c
2780 << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH)
2781 << " !frozen " << !is_frozen_inode()
2782 << " !freezing " << !is_freezing_inode()
2783 << dendl;
2784 // wait on the directory?
2785 // make sure it's not the inode that is explicitly ambiguous|freezing|frozen
2786 if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) ||
2787 ((tag & WAIT_UNFREEZE) &&
2788 !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
2789 dout(15) << "passing waiter up tree" << dendl;
2790 parent->dir->add_waiter(tag, c);
2791 return;
2792 }
2793 dout(15) << "taking waiter here" << dendl;
2794 MDSCacheObject::add_waiter(tag, c);
2795 }
2796
2797 void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls)
2798 {
2799 if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) {
2800 // take all dentry waiters
2801 while (!waiting_on_dir.empty()) {
2802 auto it = waiting_on_dir.begin();
2803 dout(10) << __func__ << " dirfrag " << it->first << " on " << *this << dendl;
2804 auto& waiting = it->second;
2805 ls.insert(ls.end(), waiting.begin(), waiting.end());
2806 waiting_on_dir.erase(it);
2807 }
2808 put(PIN_DIRWAITER);
2809 }
2810
2811 // waiting
2812 MDSCacheObject::take_waiting(mask, ls);
2813 }
2814
2815 void CInode::maybe_finish_freeze_inode()
2816 {
2817 CDir *dir = get_parent_dir();
2818 if (auth_pins > auth_pin_freeze_allowance || dir->frozen_inode_suppressed)
2819 return;
2820
2821 dout(10) << "maybe_finish_freeze_inode - frozen" << dendl;
2822 ceph_assert(auth_pins == auth_pin_freeze_allowance);
2823 get(PIN_FROZEN);
2824 put(PIN_FREEZING);
2825 state_clear(STATE_FREEZING);
2826 state_set(STATE_FROZEN);
2827
2828 item_freezing_inode.remove_myself();
2829 dir->num_frozen_inodes++;
2830
2831 finish_waiting(WAIT_FROZEN);
2832 }
2833
2834 bool CInode::freeze_inode(int auth_pin_allowance)
2835 {
2836 CDir *dir = get_parent_dir();
2837 ceph_assert(dir);
2838
2839 ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins
2840 ceph_assert(auth_pins >= auth_pin_allowance);
2841 if (auth_pins == auth_pin_allowance && !dir->frozen_inode_suppressed) {
2842 dout(10) << "freeze_inode - frozen" << dendl;
2843 if (!state_test(STATE_FROZEN)) {
2844 get(PIN_FROZEN);
2845 state_set(STATE_FROZEN);
2846 dir->num_frozen_inodes++;
2847 }
2848 return true;
2849 }
2850
2851 dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl;
2852 auth_pin_freeze_allowance = auth_pin_allowance;
2853 dir->freezing_inodes.push_back(&item_freezing_inode);
2854
2855 get(PIN_FREEZING);
2856 state_set(STATE_FREEZING);
2857
2858 if (!dir->lock_caches_with_auth_pins.empty())
2859 mdcache->mds->locker->invalidate_lock_caches(dir);
2860
2861 const static int lock_types[] = {
2862 CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT,
2863 CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0
2864 };
2865 for (int i = 0; lock_types[i]; ++i) {
2866 auto lock = get_lock(lock_types[i]);
2867 if (lock->is_cached())
2868 mdcache->mds->locker->invalidate_lock_caches(lock);
2869 }
2870 // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
2871 // and finish freezing the inode
2872 return state_test(STATE_FROZEN);
2873 }
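// Freeze protocol in brief: freeze_inode(allowance) succeeds immediately
// when the caller's own 'allowance' pins are the only auth_pins left (and
// nothing suppresses freezing); otherwise the inode is parked on the parent
// dir's freezing_inodes list in STATE_FREEZING, and
// maybe_finish_freeze_inode() (called from auth_unpin()) flips it to
// STATE_FROZEN once the pin count drops to the allowance.
// unfreeze_inode() undoes either state and wakes WAIT_UNFREEZE waiters.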
2874
2875 void CInode::unfreeze_inode(MDSContext::vec& finished)
2876 {
2877 dout(10) << __func__ << dendl;
2878 if (state_test(STATE_FREEZING)) {
2879 state_clear(STATE_FREEZING);
2880 put(PIN_FREEZING);
2881 item_freezing_inode.remove_myself();
2882 } else if (state_test(STATE_FROZEN)) {
2883 state_clear(STATE_FROZEN);
2884 put(PIN_FROZEN);
2885 get_parent_dir()->num_frozen_inodes--;
2886 } else
2887 ceph_abort();
2888 take_waiting(WAIT_UNFREEZE, finished);
2889 }
2890
2891 void CInode::unfreeze_inode()
2892 {
2893 MDSContext::vec finished;
2894 unfreeze_inode(finished);
2895 mdcache->mds->queue_waiters(finished);
2896 }
2897
2898 void CInode::freeze_auth_pin()
2899 {
2900 ceph_assert(state_test(CInode::STATE_FROZEN));
2901 state_set(CInode::STATE_FROZENAUTHPIN);
2902 get_parent_dir()->num_frozen_inodes++;
2903 }
2904
2905 void CInode::unfreeze_auth_pin()
2906 {
2907 ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN));
2908 state_clear(CInode::STATE_FROZENAUTHPIN);
2909 get_parent_dir()->num_frozen_inodes--;
2910 if (!state_test(STATE_FREEZING|STATE_FROZEN)) {
2911 MDSContext::vec finished;
2912 take_waiting(WAIT_UNFREEZE, finished);
2913 mdcache->mds->queue_waiters(finished);
2914 }
2915 }
2916
2917 void CInode::clear_ambiguous_auth(MDSContext::vec& finished)
2918 {
2919 ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH));
2920 state_clear(CInode::STATE_AMBIGUOUSAUTH);
2921 take_waiting(CInode::WAIT_SINGLEAUTH, finished);
2922 }
2923
2924 void CInode::clear_ambiguous_auth()
2925 {
2926 MDSContext::vec finished;
2927 clear_ambiguous_auth(finished);
2928 mdcache->mds->queue_waiters(finished);
2929 }
2930
2931 // auth_pins
2932 bool CInode::can_auth_pin(int *err_ret) const {
2933 int err;
2934 if (!is_auth()) {
2935 err = ERR_NOT_AUTH;
2936 } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
2937 err = ERR_EXPORTING_INODE;
2938 } else {
2939 if (parent)
2940 return parent->can_auth_pin(err_ret);
2941 err = 0;
2942 }
2943 if (err && err_ret)
2944 *err_ret = err;
2945 return !err;
2946 }
2947
2948 void CInode::auth_pin(void *by)
2949 {
2950 if (auth_pins == 0)
2951 get(PIN_AUTHPIN);
2952 auth_pins++;
2953
2954 #ifdef MDS_AUTHPIN_SET
2955 auth_pin_set.insert(by);
2956 #endif
2957
2958 dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;
2959
2960 if (parent)
2961 parent->adjust_nested_auth_pins(1, this);
2962 }
2963
2964 void CInode::auth_unpin(void *by)
2965 {
2966 auth_pins--;
2967
2968 #ifdef MDS_AUTHPIN_SET
2969 {
2970 auto it = auth_pin_set.find(by);
2971 ceph_assert(it != auth_pin_set.end());
2972 auth_pin_set.erase(it);
2973 }
2974 #endif
2975
2976 if (auth_pins == 0)
2977 put(PIN_AUTHPIN);
2978
2979 dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;
2980
2981 ceph_assert(auth_pins >= 0);
2982
2983 if (parent)
2984 parent->adjust_nested_auth_pins(-1, by);
2985
2986 if (is_freezing_inode())
2987 maybe_finish_freeze_inode();
2988 }
2989
2990 // authority
2991
2992 mds_authority_t CInode::authority() const
2993 {
2994 if (inode_auth.first >= 0)
2995 return inode_auth;
2996
2997 if (parent)
2998 return parent->dir->authority();
2999
3000 // new items that are not yet linked in (in the committed plane) belong
3001 // to their first parent.
3002 if (!projected_parent.empty())
3003 return projected_parent.front()->dir->authority();
3004
3005 return CDIR_AUTH_UNDEF;
3006 }
3007
3008
3009 // SNAP
3010
3011 snapid_t CInode::get_oldest_snap()
3012 {
3013 snapid_t t = first;
3014 if (is_any_old_inodes())
3015 t = get_old_inodes()->begin()->second.first;
3016 return std::min(t, oldest_snap);
3017 }
3018
3019 const CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head)
3020 {
3021 ceph_assert(follows >= first);
3022
3023 const auto& pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
3024 const auto& px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();
3025
3026 auto _old_inodes = allocate_old_inode_map();
3027 if (old_inodes)
3028 *_old_inodes = *old_inodes;
3029
3030 mempool_old_inode &old = (*_old_inodes)[follows];
3031 old.first = first;
3032 old.inode = *pi;
3033 if (px) {
3034 dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl;
3035 old.xattrs = *px;
3036 }
3037
3038 if (first < oldest_snap)
3039 oldest_snap = first;
3040
3041 old.inode.trim_client_ranges(follows);
3042
3043 if (g_conf()->mds_snap_rstat &&
3044 !(old.inode.rstat == old.inode.accounted_rstat))
3045 dirty_old_rstats.insert(follows);
3046
3047 first = follows+1;
3048
3049 dout(10) << __func__ << " " << (cow_head ? "head" : "previous_head" )
3050 << " to [" << old.first << "," << follows << "] on "
3051 << *this << dendl;
3052
3053 reset_old_inodes(std::move(_old_inodes));
3054 return old;
3055 }
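// Example (hypothetical snapids): an inode with first == 5 that is written
// after snapshot 8 gets cow_old_inode(8); the pre-change inode and xattrs
// are stashed in old_inodes[8] covering [5,8], and the live inode's 'first'
// becomes 9 so later snapshots see the new values.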
3056
3057 void CInode::pre_cow_old_inode()
3058 {
3059 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
3060 dout(20) << __func__ << " follows " << follows << " on " << *this << dendl;
3061 if (first <= follows)
3062 cow_old_inode(follows, true);
3063 }
3064
3065 bool CInode::has_snap_data(snapid_t snapid)
3066 {
3067 bool found = snapid >= first && snapid <= last;
3068 if (!found && is_any_old_inodes()) {
3069 auto p = old_inodes->lower_bound(snapid);
3070 if (p != old_inodes->end()) {
3071 if (p->second.first > snapid) {
3072 if (p != old_inodes->begin())
3073 --p;
3074 }
3075 if (p->second.first <= snapid && snapid <= p->first) {
3076 found = true;
3077 }
3078 }
3079 }
3080 return found;
3081 }
3082
3083 void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
3084 {
3085 dout(10) << __func__ << " " << snaps << dendl;
3086
3087 if (!get_old_inodes())
3088 return;
3089
3090 std::vector<snapid_t> to_remove;
3091 for (auto p : *get_old_inodes()) {
3092 const snapid_t &id = p.first;
3093 const auto &s = snaps.lower_bound(p.second.first);
3094 if (s == snaps.end() || *s > id) {
3095 dout(10) << " purging old_inode [" << p.second.first << "," << id << "]" << dendl;
3096 to_remove.push_back(id);
3097 }
3098 }
3099
3100 if (to_remove.size() == get_old_inodes()->size()) {
3101 reset_old_inodes(old_inode_map_ptr());
3102 } else if (!to_remove.empty()) {
3103 auto _old_inodes = allocate_old_inode_map(*get_old_inodes());
3104 for (auto id : to_remove)
3105 _old_inodes->erase(id);
3106 reset_old_inodes(std::move(_old_inodes));
3107 }
3108 }
3109
3110 /*
3111 * pick/create an old_inode
3112 */
3113 snapid_t CInode::pick_old_inode(snapid_t snap) const
3114 {
3115 if (is_any_old_inodes()) {
3116 auto it = old_inodes->lower_bound(snap); // it is the first key >= snap
3117 if (it != old_inodes->end() && it->second.first <= snap) {
3118 dout(10) << __func__ << " snap " << snap << " -> [" << it->second.first << "," << it->first << "]" << dendl;
3119 return it->first;
3120 }
3121 }
3122 dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl;
3123 return 0;
3124 }
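// old_inodes is keyed by the last snapid each entry covers, so
// lower_bound(snap) lands on the first entry whose range could still
// contain 'snap'; the it->second.first <= snap check then rejects a range
// that only starts after the requested snapid.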
3125
3126 void CInode::open_snaprealm(bool nosplit)
3127 {
3128 if (!snaprealm) {
3129 SnapRealm *parent = find_snaprealm();
3130 snaprealm = new SnapRealm(mdcache, this);
3131 if (parent) {
3132 dout(10) << __func__ << " " << snaprealm
3133 << " parent is " << parent
3134 << dendl;
3135 dout(30) << " siblings are " << parent->open_children << dendl;
3136 snaprealm->parent = parent;
3137 if (!nosplit)
3138 parent->split_at(snaprealm);
3139 parent->open_children.insert(snaprealm);
3140 }
3141 }
3142 }
3143 void CInode::close_snaprealm(bool nojoin)
3144 {
3145 if (snaprealm) {
3146 dout(15) << __func__ << " " << *snaprealm << dendl;
3147 if (snaprealm->parent) {
3148 snaprealm->parent->open_children.erase(snaprealm);
3149 //if (!nojoin)
3150 //snaprealm->parent->join(snaprealm);
3151 }
3152 delete snaprealm;
3153 snaprealm = 0;
3154 }
3155 }
3156
3157 SnapRealm *CInode::find_snaprealm() const
3158 {
3159 const CInode *cur = this;
3160 while (!cur->snaprealm) {
3161 const CDentry *pdn = cur->get_oldest_parent_dn();
3162 if (!pdn)
3163 break;
3164 cur = pdn->get_dir()->get_inode();
3165 }
3166 return cur->snaprealm;
3167 }
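// find_snaprealm() walks up the chain of oldest parent dentries until it
// reaches an inode that has an open snaprealm (normally at least the root
// does), i.e. it returns the nearest enclosing realm for this inode.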
3168
3169 void CInode::encode_snap_blob(bufferlist &snapbl)
3170 {
3171 if (snaprealm) {
3172 using ceph::encode;
3173 encode(snaprealm->srnode, snapbl);
3174 dout(20) << __func__ << " " << *snaprealm << dendl;
3175 }
3176 }
3177 void CInode::decode_snap_blob(const bufferlist& snapbl)
3178 {
3179 using ceph::decode;
3180 if (snapbl.length()) {
3181 open_snaprealm();
3182 auto old_flags = snaprealm->srnode.flags;
3183 auto p = snapbl.cbegin();
3184 decode(snaprealm->srnode, p);
3185 if (!is_base()) {
3186 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
3187 snaprealm->adjust_parent();
3188 }
3189 }
3190 dout(20) << __func__ << " " << *snaprealm << dendl;
3191 } else if (snaprealm &&
3192 !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
3193 ceph_assert(mdcache->mds->is_any_replay());
3194 snaprealm->merge_to(NULL);
3195 }
3196 }
3197
3198 void CInode::encode_snap(bufferlist& bl)
3199 {
3200 ENCODE_START(1, 1, bl);
3201 bufferlist snapbl;
3202 encode_snap_blob(snapbl);
3203 encode(snapbl, bl);
3204 encode(oldest_snap, bl);
3205 ENCODE_FINISH(bl);
3206 }
3207
3208 void CInode::decode_snap(bufferlist::const_iterator& p)
3209 {
3210 DECODE_START(1, p);
3211 bufferlist snapbl;
3212 decode(snapbl, p);
3213 decode(oldest_snap, p);
3214 decode_snap_blob(snapbl);
3215 DECODE_FINISH(p);
3216 }
3217
3218 // =============================================
3219
3220 client_t CInode::calc_ideal_loner()
3221 {
3222 if (mdcache->is_readonly())
3223 return -1;
3224 if (!get_mds_caps_wanted().empty())
3225 return -1;
3226
3227 int n = 0;
3228 client_t loner = -1;
3229 for (const auto &p : client_caps) {
3230 if (!p.second.is_stale() &&
3231 (is_dir() ?
3232 !has_subtree_or_exporting_dirfrag() :
3233 (p.second.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_RD)))) {
3234 if (n)
3235 return -1;
3236 n++;
3237 loner = p.first;
3238 }
3239 }
3240 return loner;
3241 }
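// For a regular file the ideal loner is the single non-stale client wanting
// write or file-read caps; for a directory any single non-stale client
// qualifies as long as the inode has no subtree-root or exporting dirfrag.
// A second qualifying client, or caps wanted by peer MDSs, means no loner
// (-1) and the inode falls back to shared cap handling.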
3242
3243 bool CInode::choose_ideal_loner()
3244 {
3245 want_loner_cap = calc_ideal_loner();
3246 int changed = false;
3247 if (loner_cap >= 0 && loner_cap != want_loner_cap) {
3248 if (!try_drop_loner())
3249 return false;
3250 changed = true;
3251 }
3252
3253 if (want_loner_cap >= 0) {
3254 if (loner_cap < 0) {
3255 set_loner_cap(want_loner_cap);
3256 changed = true;
3257 } else
3258 ceph_assert(loner_cap == want_loner_cap);
3259 }
3260 return changed;
3261 }
3262
3263 bool CInode::try_set_loner()
3264 {
3265 ceph_assert(want_loner_cap >= 0);
3266 if (loner_cap >= 0 && loner_cap != want_loner_cap)
3267 return false;
3268 set_loner_cap(want_loner_cap);
3269 return true;
3270 }
3271
3272 void CInode::set_loner_cap(client_t l)
3273 {
3274 loner_cap = l;
3275 authlock.set_excl_client(loner_cap);
3276 filelock.set_excl_client(loner_cap);
3277 linklock.set_excl_client(loner_cap);
3278 xattrlock.set_excl_client(loner_cap);
3279 }
3280
3281 bool CInode::try_drop_loner()
3282 {
3283 if (loner_cap < 0)
3284 return true;
3285
3286 int other_allowed = get_caps_allowed_by_type(CAP_ANY);
3287 Capability *cap = get_client_cap(loner_cap);
3288 if (!cap ||
3289 (cap->issued() & ~other_allowed) == 0) {
3290 set_loner_cap(-1);
3291 return true;
3292 }
3293 return false;
3294 }
3295
3296
3297 // choose new lock state during recovery, based on issued caps
3298 void CInode::choose_lock_state(SimpleLock *lock, int allissued)
3299 {
3300 int shift = lock->get_cap_shift();
3301 int issued = (allissued >> shift) & lock->get_cap_mask();
3302 if (is_auth()) {
3303 if (lock->is_xlocked()) {
3304 // do nothing here
3305 } else if (lock->get_state() != LOCK_MIX) {
3306 if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER))
3307 lock->set_state(LOCK_EXCL);
3308 else if (issued & CEPH_CAP_GWR) {
3309 if (issued & (CEPH_CAP_GCACHE | CEPH_CAP_GSHARED))
3310 lock->set_state(LOCK_EXCL);
3311 else
3312 lock->set_state(LOCK_MIX);
3313 } else if (lock->is_dirty()) {
3314 if (is_replicated())
3315 lock->set_state(LOCK_MIX);
3316 else
3317 lock->set_state(LOCK_LOCK);
3318 } else
3319 lock->set_state(LOCK_SYNC);
3320 }
3321 } else {
3322 // our states have already been chosen during rejoin.
3323 if (lock->is_xlocked())
3324 ceph_assert(lock->get_state() == LOCK_LOCK);
3325 }
3326 }
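// Rough mapping used during recovery: EXCL/BUFFER caps put the lock in
// LOCK_EXCL, plain WR caps put it in LOCK_MIX (or EXCL if CACHE/SHARED are
// also out), a dirty lock with nothing notable issued goes to MIX when
// replicated and LOCK otherwise, and everything else settles in LOCK_SYNC;
// replicas keep whatever states they chose during rejoin.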
3327
3328 void CInode::choose_lock_states(int dirty_caps)
3329 {
3330 int issued = get_caps_issued() | dirty_caps;
3331 if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR)))
3332 choose_ideal_loner();
3333 choose_lock_state(&filelock, issued);
3334 choose_lock_state(&nestlock, issued);
3335 choose_lock_state(&dirfragtreelock, issued);
3336 choose_lock_state(&authlock, issued);
3337 choose_lock_state(&xattrlock, issued);
3338 choose_lock_state(&linklock, issued);
3339 }
3340
3341 int CInode::count_nonstale_caps()
3342 {
3343 int n = 0;
3344 for (const auto &p : client_caps) {
3345 if (!p.second.is_stale())
3346 n++;
3347 }
3348 return n;
3349 }
3350
3351 bool CInode::multiple_nonstale_caps()
3352 {
3353 int n = 0;
3354 for (const auto &p : client_caps) {
3355 if (!p.second.is_stale()) {
3356 if (n)
3357 return true;
3358 n++;
3359 }
3360 }
3361 return false;
3362 }
3363
3364 void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m)
3365 {
3366 bool old_empty = mds_caps_wanted.empty();
3367 mds_caps_wanted.swap(m);
3368 if (old_empty != (bool)mds_caps_wanted.empty()) {
3369 if (old_empty)
3370 adjust_num_caps_notable(1);
3371 else
3372 adjust_num_caps_notable(-1);
3373 }
3374 }
3375
3376 void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted)
3377 {
3378 bool old_empty = mds_caps_wanted.empty();
3379 if (wanted) {
3380 mds_caps_wanted[mds] = wanted;
3381 if (old_empty)
3382 adjust_num_caps_notable(1);
3383 } else if (!old_empty) {
3384 mds_caps_wanted.erase(mds);
3385 if (mds_caps_wanted.empty())
3386 adjust_num_caps_notable(-1);
3387 }
3388 }
3389
3390 Capability *CInode::add_client_cap(client_t client, Session *session,
3391 SnapRealm *conrealm, bool new_inode)
3392 {
3393 ceph_assert(last == CEPH_NOSNAP);
3394 if (client_caps.empty()) {
3395 get(PIN_CAPS);
3396 if (conrealm)
3397 containing_realm = conrealm;
3398 else
3399 containing_realm = find_snaprealm();
3400 containing_realm->inodes_with_caps.push_back(&item_caps);
3401 dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl;
3402
3403 mdcache->num_inodes_with_caps++;
3404 if (parent)
3405 parent->dir->adjust_num_inodes_with_caps(1);
3406 }
3407
3408 uint64_t cap_id = new_inode ? 1 : ++mdcache->last_cap_id;
3409 auto ret = client_caps.emplace(std::piecewise_construct, std::forward_as_tuple(client),
3410 std::forward_as_tuple(this, session, cap_id));
3411 ceph_assert(ret.second == true);
3412 Capability *cap = &ret.first->second;
3413
3414 cap->client_follows = first-1;
3415 containing_realm->add_cap(client, cap);
3416
3417 return cap;
3418 }
3419
3420 void CInode::remove_client_cap(client_t client)
3421 {
3422 auto it = client_caps.find(client);
3423 ceph_assert(it != client_caps.end());
3424 Capability *cap = &it->second;
3425
3426 cap->item_session_caps.remove_myself();
3427 cap->item_revoking_caps.remove_myself();
3428 cap->item_client_revoking_caps.remove_myself();
3429 containing_realm->remove_cap(client, cap);
3430
3431 if (client == loner_cap)
3432 loner_cap = -1;
3433
3434 if (cap->is_wanted_notable())
3435 adjust_num_caps_notable(-1);
3436
3437 client_caps.erase(it);
3438 if (client_caps.empty()) {
3439 dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl;
3440 put(PIN_CAPS);
3441 item_caps.remove_myself();
3442 containing_realm = NULL;
3443 mdcache->num_inodes_with_caps--;
3444 if (parent)
3445 parent->dir->adjust_num_inodes_with_caps(-1);
3446 }
3447
3448 //clean up advisory locks
3449 bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false;
3450 bool flock_removed = flock_locks ? flock_locks->remove_all_from(client) : false;
3451 if (fcntl_removed || flock_removed) {
3452 MDSContext::vec waiters;
3453 take_waiting(CInode::WAIT_FLOCK, waiters);
3454 mdcache->mds->queue_waiters(waiters);
3455 }
3456 }
3457
3458 void CInode::move_to_realm(SnapRealm *realm)
3459 {
3460 dout(10) << __func__ << " joining realm " << *realm
3461 << ", leaving realm " << *containing_realm << dendl;
3462 for (auto& p : client_caps) {
3463 containing_realm->remove_cap(p.first, &p.second);
3464 realm->add_cap(p.first, &p.second);
3465 }
3466 item_caps.remove_myself();
3467 realm->inodes_with_caps.push_back(&item_caps);
3468 containing_realm = realm;
3469 }
3470
3471 Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session)
3472 {
3473 Capability *cap = get_client_cap(client);
3474 if (cap) {
3475 // FIXME?
3476 cap->merge(icr.capinfo.wanted, icr.capinfo.issued);
3477 } else {
3478 cap = add_client_cap(client, session);
3479 cap->set_cap_id(icr.capinfo.cap_id);
3480 cap->set_wanted(icr.capinfo.wanted);
3481 cap->issue_norevoke(icr.capinfo.issued);
3482 cap->reset_seq();
3483 }
3484 cap->set_last_issue_stamp(ceph_clock_now());
3485 return cap;
3486 }
3487
3488 void CInode::clear_client_caps_after_export()
3489 {
3490 while (!client_caps.empty())
3491 remove_client_cap(client_caps.begin()->first);
3492 loner_cap = -1;
3493 want_loner_cap = -1;
3494 if (!get_mds_caps_wanted().empty()) {
3495 mempool::mds_co::compact_map<int32_t,int32_t> empty;
3496 set_mds_caps_wanted(empty);
3497 }
3498 }
3499
3500 void CInode::export_client_caps(map<client_t,Capability::Export>& cl)
3501 {
3502 for (const auto &p : client_caps) {
3503 cl[p.first] = p.second.make_export();
3504 }
3505 }
3506
3507 // caps allowed
3508 int CInode::get_caps_liked() const
3509 {
3510 if (is_dir())
3511 return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER
3512 else
3513 return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO;
3514 }
3515
3516 int CInode::get_caps_allowed_ever() const
3517 {
3518 int allowed;
3519 if (is_dir())
3520 allowed = CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED;
3521 else
3522 allowed = CEPH_CAP_ANY;
3523 return allowed &
3524 (CEPH_CAP_PIN |
3525 (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) |
3526 (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) |
3527 (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) |
3528 (linklock.gcaps_allowed_ever() << linklock.get_cap_shift()));
3529 }
3530
3531 int CInode::get_caps_allowed_by_type(int type) const
3532 {
3533 return
3534 CEPH_CAP_PIN |
3535 (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) |
3536 (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) |
3537 (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) |
3538 (linklock.gcaps_allowed(type) << linklock.get_cap_shift());
3539 }
3540
3541 int CInode::get_caps_careful() const
3542 {
3543 return
3544 (filelock.gcaps_careful() << filelock.get_cap_shift()) |
3545 (authlock.gcaps_careful() << authlock.get_cap_shift()) |
3546 (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) |
3547 (linklock.gcaps_careful() << linklock.get_cap_shift());
3548 }
3549
3550 int CInode::get_xlocker_mask(client_t client) const
3551 {
3552 return
3553 (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) |
3554 (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) |
3555 (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) |
3556 (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift());
3557 }
3558
3559 int CInode::get_caps_allowed_for_client(Session *session, Capability *cap,
3560 const mempool_inode *file_i) const
3561 {
3562 client_t client = session->get_client();
3563 int allowed;
3564 if (client == get_loner()) {
3565 // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
3566 allowed =
3567 get_caps_allowed_by_type(CAP_LONER) |
3568 (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client));
3569 } else {
3570 allowed = get_caps_allowed_by_type(CAP_ANY);
3571 }
3572
3573 if (is_dir()) {
3574 allowed &= ~CEPH_CAP_ANY_DIR_OPS;
3575 if (cap && (allowed & CEPH_CAP_FILE_EXCL))
3576 allowed |= cap->get_lock_cache_allowed();
3577 } else {
3578 if (file_i->inline_data.version == CEPH_INLINE_NONE &&
3579 file_i->layout.pool_ns.empty()) {
3580 // noop
3581 } else if (cap) {
3582 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3583 cap->is_noinline()) ||
3584 (!file_i->layout.pool_ns.empty() &&
3585 cap->is_nopoolns()))
3586 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3587 } else {
3588 auto& conn = session->get_connection();
3589 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3590 !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) ||
3591 (!file_i->layout.pool_ns.empty() &&
3592 !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)))
3593 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3594 }
3595 }
3596 return allowed;
3597 }
3598
3599 // caps issued, wanted
3600 int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker,
3601 int shift, int mask)
3602 {
3603 int c = 0;
3604 int loner = 0, other = 0, xlocker = 0;
3605 if (!is_auth()) {
3606 loner_cap = -1;
3607 }
3608
3609 for (const auto &p : client_caps) {
3610 int i = p.second.issued();
3611 c |= i;
3612 if (p.first == loner_cap)
3613 loner |= i;
3614 else
3615 other |= i;
3616 xlocker |= get_xlocker_mask(p.first) & i;
3617 }
3618 if (ploner) *ploner = (loner >> shift) & mask;
3619 if (pother) *pother = (other >> shift) & mask;
3620 if (pxlocker) *pxlocker = (xlocker >> shift) & mask;
3621 return (c >> shift) & mask;
3622 }
3623
3624 bool CInode::is_any_caps_wanted() const
3625 {
3626 for (const auto &p : client_caps) {
3627 if (p.second.wanted())
3628 return true;
3629 }
3630 return false;
3631 }
3632
3633 int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const
3634 {
3635 int w = 0;
3636 int loner = 0, other = 0;
3637 for (const auto &p : client_caps) {
3638 if (!p.second.is_stale()) {
3639 int t = p.second.wanted();
3640 w |= t;
3641 if (p.first == loner_cap)
3642 loner |= t;
3643 else
3644 other |= t;
3645 }
3646 //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
3647 }
3648 if (is_auth())
3649 for (const auto &p : mds_caps_wanted) {
3650 w |= p.second;
3651 other |= p.second;
3652 //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
3653 }
3654 if (ploner) *ploner = (loner >> shift) & mask;
3655 if (pother) *pother = (other >> shift) & mask;
3656 return (w >> shift) & mask;
3657 }
3658
3659 bool CInode::issued_caps_need_gather(SimpleLock *lock)
3660 {
3661 int loner_issued, other_issued, xlocker_issued;
3662 get_caps_issued(&loner_issued, &other_issued, &xlocker_issued,
3663 lock->get_cap_shift(), lock->get_cap_mask());
3664 if ((loner_issued & ~lock->gcaps_allowed(CAP_LONER)) ||
3665 (other_issued & ~lock->gcaps_allowed(CAP_ANY)) ||
3666 (xlocker_issued & ~lock->gcaps_allowed(CAP_XLOCKER)))
3667 return true;
3668 return false;
3669 }
3670
3671 void CInode::adjust_num_caps_notable(int d)
3672 {
3673 if (!is_clientwriteable()) {
3674 if (!num_caps_notable && d > 0)
3675 mdcache->open_file_table.add_inode(this);
3676 else if (num_caps_notable > 0 && num_caps_notable == -d)
3677 mdcache->open_file_table.remove_inode(this);
3678 }
3679
3680 num_caps_notable += d;
3681 ceph_assert(num_caps_notable >= 0);
3682 }
3683
3684 void CInode::mark_clientwriteable()
3685 {
3686 if (last != CEPH_NOSNAP)
3687 return;
3688 if (!state_test(STATE_CLIENTWRITEABLE)) {
3689 if (num_caps_notable == 0)
3690 mdcache->open_file_table.add_inode(this);
3691 state_set(STATE_CLIENTWRITEABLE);
3692 }
3693 }
3694
3695 void CInode::clear_clientwriteable()
3696 {
3697 if (state_test(STATE_CLIENTWRITEABLE)) {
3698 if (num_caps_notable == 0)
3699 mdcache->open_file_table.remove_inode(this);
3700 state_clear(STATE_CLIENTWRITEABLE);
3701 }
3702 }
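// num_caps_notable and STATE_CLIENTWRITEABLE jointly control membership in
// the open file table: each setter only adds or removes the inode when the
// other reason for tracking it is absent, so the table sees exactly one
// add/remove per transition.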
3703
3704 // =============================================
3705
3706 int CInode::encode_inodestat(bufferlist& bl, Session *session,
3707 SnapRealm *dir_realm,
3708 snapid_t snapid,
3709 unsigned max_bytes,
3710 int getattr_caps)
3711 {
3712 client_t client = session->get_client();
3713 ceph_assert(snapid);
3714
3715 bool valid = true;
3716
3717 // pick a version!
3718 const mempool_inode *oi = get_inode().get();
3719 const mempool_inode *pi = get_projected_inode().get();
3720
3721 const mempool_xattr_map *pxattrs = nullptr;
3722
3723 if (snapid != CEPH_NOSNAP) {
3724
3725 // for now at least, old_inodes is only defined/valid on the auth
3726 if (!is_auth())
3727 valid = false;
3728
3729 if (is_any_old_inodes()) {
3730 auto it = old_inodes->lower_bound(snapid);
3731 if (it != old_inodes->end()) {
3732 if (it->second.first > snapid) {
3733 if (it != old_inodes->begin())
3734 --it;
3735 }
3736 if (it->second.first <= snapid && snapid <= it->first) {
3737 dout(15) << __func__ << " snapid " << snapid
3738 << " to old_inode [" << it->second.first << "," << it->first << "]"
3739 << " " << it->second.inode.rstat
3740 << dendl;
3741 pi = oi = &it->second.inode;
3742 pxattrs = &it->second.xattrs;
3743 } else {
3744 // snapshotted remote dentry can result in this
3745 dout(0) << __func__ << " old_inode for snapid " << snapid
3746 << " not found" << dendl;
3747 }
3748 }
3749 } else if (snapid < first || snapid > last) {
3750 // snapshotted remote dentry can result in this
3751 dout(0) << __func__ << " [" << first << "," << last << "]"
3752 << " not match snapid " << snapid << dendl;
3753 }
3754 }
3755
3756 utime_t snap_btime;
3757 std::map<std::string, std::string> snap_metadata;
3758 SnapRealm *realm = find_snaprealm();
3759 if (snapid != CEPH_NOSNAP && realm) {
3760 // add snapshot timestamp vxattr
3761 map<snapid_t,const SnapInfo*> infomap;
3762 realm->get_snap_info(infomap,
3763 snapid, // min
3764 snapid); // max
3765 if (!infomap.empty()) {
3766 ceph_assert(infomap.size() == 1);
3767 const SnapInfo *si = infomap.begin()->second;
3768 snap_btime = si->stamp;
3769 snap_metadata = si->metadata;
3770 }
3771 }
3772
3773
3774 bool no_caps = !valid ||
3775 session->is_stale() ||
3776 (dir_realm && realm != dir_realm) ||
3777 is_frozen() ||
3778 state_test(CInode::STATE_EXPORTINGCAPS);
3779 if (no_caps)
3780 dout(20) << __func__ << " no caps"
3781 << (!valid?", !valid":"")
3782 << (session->is_stale()?", session stale ":"")
3783 << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"")
3784 << (is_frozen()?", frozen inode":"")
3785 << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"")
3786 << dendl;
3787
3788
3789 // "fake" a version that is odd (stable) version, +1 if projected.
3790 version_t version = (oi->version * 2) + is_projected();
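// e.g. a stable inode at version 7 reports 14, a projected one reports 15.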
3791
3792 Capability *cap = get_client_cap(client);
3793 bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client;
3794 //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
3795 bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client;
3796 bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client;
3797 bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client;
3798
3799 bool plocal = versionlock.get_last_wrlock_client() == client;
3800 bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client;
3801
3802 const mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;
3803
3804 dout(20) << " pfile " << pfile << " pauth " << pauth
3805 << " plink " << plink << " pxattr " << pxattr
3806 << " plocal " << plocal
3807 << " mtime " << any_i->mtime
3808 << " ctime " << any_i->ctime
3809 << " change_attr " << any_i->change_attr
3810 << " valid=" << valid << dendl;
3811
3812 // file
3813 const mempool_inode *file_i = pfile ? pi:oi;
3814 file_layout_t layout;
3815 if (is_dir()) {
3816 layout = (ppolicy ? pi : oi)->layout;
3817 } else {
3818 layout = file_i->layout;
3819 }
3820
3821 // max_size is min of projected, actual
3822 uint64_t max_size =
3823 std::min(oi->get_client_range(client),
3824 pi->get_client_range(client));
3825
3826 // inline data
3827 version_t inline_version = 0;
3828 bufferlist inline_data;
3829 if (file_i->inline_data.version == CEPH_INLINE_NONE) {
3830 inline_version = CEPH_INLINE_NONE;
3831 } else if ((!cap && !no_caps) ||
3832 (cap && cap->client_inline_version < file_i->inline_data.version) ||
3833 (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data
3834 inline_version = file_i->inline_data.version;
3835 if (file_i->inline_data.length() > 0)
3836 file_i->inline_data.get_data(inline_data);
3837 }
3838
3839 // nest (do same as file... :/)
3840 if (cap) {
3841 cap->last_rbytes = file_i->rstat.rbytes;
3842 cap->last_rsize = file_i->rstat.rsize();
3843 }
3844
3845 // auth
3846 const mempool_inode *auth_i = pauth ? pi:oi;
3847
3848 // link
3849 const mempool_inode *link_i = plink ? pi:oi;
3850
3851 // xattr
3852 const mempool_inode *xattr_i = pxattr ? pi:oi;
3853
3854 using ceph::encode;
3855 // xattr
3856 version_t xattr_version;
3857 if ((!cap && !no_caps) ||
3858 (cap && cap->client_xattr_version < xattr_i->xattr_version) ||
3859 (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs
3860 if (!pxattrs)
3861 pxattrs = pxattr ? get_projected_xattrs().get() : get_xattrs().get();
3862 xattr_version = xattr_i->xattr_version;
3863 } else {
3864 xattr_version = 0;
3865 }
3866
3867 // do we have room?
3868 if (max_bytes) {
3869 unsigned bytes =
3870 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) +
3871 sizeof(struct ceph_file_layout) +
3872 sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq
3873 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
3874 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime
3875 sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree
3876 sizeof(__u32) + symlink.length() + // symlink
3877 sizeof(struct ceph_dir_layout); // dir_layout
3878
3879 if (xattr_version) {
3880 bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries
3881 if (pxattrs) {
3882 for (const auto &p : *pxattrs)
3883 bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length();
3884 }
3885 } else {
3886 bytes += sizeof(__u32); // xattr buffer len
3887 }
3888 bytes +=
3889 sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data
3890 1 + 1 + 8 + 8 + 4 + // quota
3891 4 + layout.pool_ns.size() + // pool ns
3892 sizeof(struct ceph_timespec) + 8; // btime + change_attr
3893
3894 if (bytes > max_bytes)
3895 return -CEPHFS_ENOSPC;
3896 }
3897
3898
3899 // encode caps
3900 struct ceph_mds_reply_cap ecap;
3901 if (snapid != CEPH_NOSNAP) {
3902 /*
3903 * snapped inodes (files or dirs) only get read-only caps. always
3904 * issue everything possible, since it is read only.
3905 *
3906 * if a snapped inode has caps, limit issued caps based on the
3907 * lock state.
3908 *
3909 * if it is a live inode, limit issued caps based on the lock
3910 * state.
3911 *
3912 * do NOT adjust cap issued state, because the client always
3913 * tracks caps per-snap and the mds does either per-interval or
3914 * multiversion.
3915 */
3916 ecap.caps = valid ? get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE;
3917 if (last == CEPH_NOSNAP || is_any_caps())
3918 ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i);
3919 ecap.seq = 0;
3920 ecap.mseq = 0;
3921 ecap.realm = 0;
3922 } else {
3923 if (!no_caps && !cap) {
3924 // add a new cap
3925 cap = add_client_cap(client, session, realm);
3926 if (is_auth())
3927 choose_ideal_loner();
3928 }
3929
3930 int issue = 0;
3931 if (!no_caps && cap) {
3932 int likes = get_caps_liked();
3933 int allowed = get_caps_allowed_for_client(session, cap, file_i);
3934 issue = (cap->wanted() | likes) & allowed;
3935 cap->issue_norevoke(issue, true);
3936 issue = cap->pending();
3937 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3938 << " seq " << cap->get_last_seq() << dendl;
3939 } else if (cap && cap->is_new() && !dir_realm) {
3940 // always issue new caps to the client, otherwise the caps get lost
3941 ceph_assert(cap->is_stale());
3942 ceph_assert(!cap->pending());
3943 issue = CEPH_CAP_PIN;
3944 cap->issue_norevoke(issue, true);
3945 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3946 << " seq " << cap->get_last_seq()
3947 << "(stale&new caps)" << dendl;
3948 }
3949
3950 if (issue) {
3951 cap->set_last_issue();
3952 cap->set_last_issue_stamp(ceph_clock_now());
3953 ecap.caps = issue;
3954 ecap.wanted = cap->wanted();
3955 ecap.cap_id = cap->get_cap_id();
3956 ecap.seq = cap->get_last_seq();
3957 ecap.mseq = cap->get_mseq();
3958 ecap.realm = realm->inode->ino();
3959 } else {
3960 ecap.cap_id = 0;
3961 ecap.caps = 0;
3962 ecap.seq = 0;
3963 ecap.mseq = 0;
3964 ecap.realm = 0;
3965 ecap.wanted = 0;
3966 }
3967 }
3968 ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0;
3969 dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps)
3970 << " seq " << ecap.seq << " mseq " << ecap.mseq
3971 << " xattrv " << xattr_version << dendl;
3972
3973 if (inline_data.length() && cap) {
3974 if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) {
3975 dout(10) << "including inline version " << inline_version << dendl;
3976 cap->client_inline_version = inline_version;
3977 } else {
3978 dout(10) << "dropping inline version " << inline_version << dendl;
3979 inline_version = 0;
3980 inline_data.clear();
3981 }
3982 }
3983
3984 // include those xattrs?
3985 if (xattr_version && cap) {
3986 if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) {
3987 dout(10) << "including xattrs version " << xattr_version << dendl;
3988 cap->client_xattr_version = xattr_version;
3989 } else {
3990 dout(10) << "dropping xattrs version " << xattr_version << dendl;
3991 xattr_version = 0;
3992 }
3993 }
3994
3995 // The end result of encode_xattrs() is equivalent to:
3996 // {
3997 // bufferlist xbl;
3998 // if (xattr_version) {
3999 // if (pxattrs)
4000 // encode(*pxattrs, bl);
4001 // else
4002 // encode((__u32)0, bl);
4003 // }
4004 // encode(xbl, bl);
4005 // }
4006 //
4007 // But encoding xattrs into the 'xbl' requires a memory allocation.
4008 // The 'bl' should have enough pre-allocated memory in most cases.
4009 // Encoding xattrs directly into it can avoid the extra allocation.
4010 auto encode_xattrs = [xattr_version, pxattrs, &bl]() {
4011 using ceph::encode;
4012 if (xattr_version) {
4013 ceph_le32 xbl_len;
4014 auto filler = bl.append_hole(sizeof(xbl_len));
4015 const auto starting_bl_len = bl.length();
4016 if (pxattrs)
4017 encode(*pxattrs, bl);
4018 else
4019 encode((__u32)0, bl);
4020 xbl_len = bl.length() - starting_bl_len;
4021 filler.copy_in(sizeof(xbl_len), (char *)&xbl_len);
4022 } else {
4023 encode((__u32)0, bl);
4024 }
4025 };
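// encode_xattrs() is used by both reply encodings below, in place of first
// encoding the xattr map into a temporary bufferlist.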
4026
4027 /*
4028 * note: encoding matches MClientReply::InodeStat
4029 */
4030 if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
4031 ENCODE_START(7, 1, bl);
4032 encode(oi->ino, bl);
4033 encode(snapid, bl);
4034 encode(oi->rdev, bl);
4035 encode(version, bl);
4036 encode(xattr_version, bl);
4037 encode(ecap, bl);
4038 {
4039 ceph_file_layout legacy_layout;
4040 layout.to_legacy(&legacy_layout);
4041 encode(legacy_layout, bl);
4042 }
4043 encode(any_i->ctime, bl);
4044 encode(file_i->mtime, bl);
4045 encode(file_i->atime, bl);
4046 encode(file_i->time_warp_seq, bl);
4047 encode(file_i->size, bl);
4048 encode(max_size, bl);
4049 encode(file_i->truncate_size, bl);
4050 encode(file_i->truncate_seq, bl);
4051 encode(auth_i->mode, bl);
4052 encode((uint32_t)auth_i->uid, bl);
4053 encode((uint32_t)auth_i->gid, bl);
4054 encode(link_i->nlink, bl);
4055 encode(file_i->dirstat.nfiles, bl);
4056 encode(file_i->dirstat.nsubdirs, bl);
4057 encode(file_i->rstat.rbytes, bl);
4058 encode(file_i->rstat.rfiles, bl);
4059 encode(file_i->rstat.rsubdirs, bl);
4060 encode(file_i->rstat.rctime, bl);
4061 dirfragtree.encode(bl);
4062 encode(symlink, bl);
4063 encode(file_i->dir_layout, bl);
4064 encode_xattrs();
4065 encode(inline_version, bl);
4066 encode(inline_data, bl);
4067 const mempool_inode *policy_i = ppolicy ? pi : oi;
4068 encode(policy_i->quota, bl);
4069 encode(layout.pool_ns, bl);
4070 encode(any_i->btime, bl);
4071 encode(any_i->change_attr, bl);
4072 encode(file_i->export_pin, bl);
4073 encode(snap_btime, bl);
4074 encode(file_i->rstat.rsnaps, bl);
4075 encode(snap_metadata, bl);
4076 encode(!file_i->fscrypt_auth.empty(), bl);
4077 encode(file_i->fscrypt_auth, bl);
4078 encode(file_i->fscrypt_file, bl);
4079 ENCODE_FINISH(bl);
4080 }
4081 else {
4082 ceph_assert(session->get_connection());
4083
4084 encode(oi->ino, bl);
4085 encode(snapid, bl);
4086 encode(oi->rdev, bl);
4087 encode(version, bl);
4088 encode(xattr_version, bl);
4089 encode(ecap, bl);
4090 {
4091 ceph_file_layout legacy_layout;
4092 layout.to_legacy(&legacy_layout);
4093 encode(legacy_layout, bl);
4094 }
4095 encode(any_i->ctime, bl);
4096 encode(file_i->mtime, bl);
4097 encode(file_i->atime, bl);
4098 encode(file_i->time_warp_seq, bl);
4099 encode(file_i->size, bl);
4100 encode(max_size, bl);
4101 encode(file_i->truncate_size, bl);
4102 encode(file_i->truncate_seq, bl);
4103 encode(auth_i->mode, bl);
4104 encode((uint32_t)auth_i->uid, bl);
4105 encode((uint32_t)auth_i->gid, bl);
4106 encode(link_i->nlink, bl);
4107 encode(file_i->dirstat.nfiles, bl);
4108 encode(file_i->dirstat.nsubdirs, bl);
4109 encode(file_i->rstat.rbytes, bl);
4110 encode(file_i->rstat.rfiles, bl);
4111 encode(file_i->rstat.rsubdirs, bl);
4112 encode(file_i->rstat.rctime, bl);
4113 dirfragtree.encode(bl);
4114 encode(symlink, bl);
4115 auto& conn = session->get_connection();
4116 if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) {
4117 encode(file_i->dir_layout, bl);
4118 }
4119 encode_xattrs();
4120 if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4121 encode(inline_version, bl);
4122 encode(inline_data, bl);
4123 }
4124 if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) {
4125 const mempool_inode *policy_i = ppolicy ? pi : oi;
4126 encode(policy_i->quota, bl);
4127 }
4128 if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
4129 encode(layout.pool_ns, bl);
4130 }
4131 if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) {
4132 encode(any_i->btime, bl);
4133 encode(any_i->change_attr, bl);
4134 }
4135 }
4136
4137 return valid;
4138 }
4139
4140 void CInode::encode_cap_message(const ref_t<MClientCaps> &m, Capability *cap)
4141 {
4142 ceph_assert(cap);
4143
4144 client_t client = cap->get_client();
4145
4146 bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL);
4147 bool pauth = authlock.is_xlocked_by_client(client);
4148 bool plink = linklock.is_xlocked_by_client(client);
4149 bool pxattr = xattrlock.is_xlocked_by_client(client);
4150
4151 const mempool_inode *oi = get_inode().get();
4152 const mempool_inode *pi = get_projected_inode().get();
4153 const mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi;
4154
4155 dout(20) << __func__ << " pfile " << pfile
4156 << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
4157 << " mtime " << i->mtime << " ctime " << i->ctime << " change_attr " << i->change_attr << dendl;
4158
4159 i = pfile ? pi:oi;
4160 m->set_layout(i->layout);
4161 m->size = i->size;
4162 m->truncate_seq = i->truncate_seq;
4163 m->truncate_size = i->truncate_size;
4164 m->fscrypt_file = i->fscrypt_file;
4165 m->fscrypt_auth = i->fscrypt_auth;
4166 m->mtime = i->mtime;
4167 m->atime = i->atime;
4168 m->ctime = i->ctime;
4169 m->btime = i->btime;
4170 m->change_attr = i->change_attr;
4171 m->time_warp_seq = i->time_warp_seq;
4172 m->nfiles = i->dirstat.nfiles;
4173 m->nsubdirs = i->dirstat.nsubdirs;
4174
4175 if (cap->client_inline_version < i->inline_data.version) {
4176 m->inline_version = cap->client_inline_version = i->inline_data.version;
4177 if (i->inline_data.length() > 0)
4178 i->inline_data.get_data(m->inline_data);
4179 } else {
4180 m->inline_version = 0;
4181 }
4182
4183 // max_size is min of projected, actual.
4184 uint64_t oldms = oi->get_client_range(client);
4185 uint64_t newms = pi->get_client_range(client);
4186 m->max_size = std::min(oldms, newms);
4187
4188 i = pauth ? pi:oi;
4189 m->head.mode = i->mode;
4190 m->head.uid = i->uid;
4191 m->head.gid = i->gid;
4192
4193 i = plink ? pi:oi;
4194 m->head.nlink = i->nlink;
4195
4196 using ceph::encode;
4197 i = pxattr ? pi:oi;
4198 const auto& ix = pxattr ? get_projected_xattrs() : get_xattrs();
4199 if ((cap->pending() & CEPH_CAP_XATTR_SHARED) &&
4200 i->xattr_version > cap->client_xattr_version) {
4201 dout(10) << " including xattrs v " << i->xattr_version << dendl;
4202 if (ix)
4203 encode(*ix, m->xattrbl);
4204 else
4205 encode((__u32)0, m->xattrbl);
4206 m->head.xattr_version = i->xattr_version;
4207 cap->client_xattr_version = i->xattr_version;
4208 }
4209 }
4210
4211
4212
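// Base inode metadata shared by replication and subtree migration: the inode
// itself, symlink target, dirfragtree, xattrs, old (snapshotted) inodes,
// damage flags and snap data.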
4213 void CInode::_encode_base(bufferlist& bl, uint64_t features)
4214 {
4215 ENCODE_START(1, 1, bl);
4216 encode(first, bl);
4217 encode(*get_inode(), bl, features);
4218 encode(symlink, bl);
4219 encode(dirfragtree, bl);
4220 encode_xattrs(bl);
4221 encode_old_inodes(bl, features);
4222 encode(damage_flags, bl);
4223 encode_snap(bl);
4224 ENCODE_FINISH(bl);
4225 }
4226 void CInode::_decode_base(bufferlist::const_iterator& p)
4227 {
4228 DECODE_START(1, p);
4229 decode(first, p);
4230 {
4231 auto _inode = allocate_inode();
4232 decode(*_inode, p);
4233 reset_inode(std::move(_inode));
4234 }
4235 {
4236 std::string tmp;
4237 decode(tmp, p);
4238 symlink = std::string_view(tmp);
4239 }
4240 decode(dirfragtree, p);
4241 decode_xattrs(p);
4242 decode_old_inodes(p);
4243 decode(damage_flags, p);
4244 decode_snap(p);
4245 DECODE_FINISH(p);
4246 }
4247
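// Full lock state (including the loner cap) travels with the inode when
// authority moves; see encode_export()/decode_import() below.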
4248 void CInode::_encode_locks_full(bufferlist& bl)
4249 {
4250 using ceph::encode;
4251 encode(authlock, bl);
4252 encode(linklock, bl);
4253 encode(dirfragtreelock, bl);
4254 encode(filelock, bl);
4255 encode(xattrlock, bl);
4256 encode(snaplock, bl);
4257 encode(nestlock, bl);
4258 encode(flocklock, bl);
4259 encode(policylock, bl);
4260
4261 encode(loner_cap, bl);
4262 }
4263 void CInode::_decode_locks_full(bufferlist::const_iterator& p)
4264 {
4265 using ceph::decode;
4266 decode(authlock, p);
4267 decode(linklock, p);
4268 decode(dirfragtreelock, p);
4269 decode(filelock, p);
4270 decode(xattrlock, p);
4271 decode(snaplock, p);
4272 decode(nestlock, p);
4273 decode(flocklock, p);
4274 decode(policylock, p);
4275
4276 decode(loner_cap, p);
4277 set_loner_cap(loner_cap);
4278 want_loner_cap = loner_cap; // for now, we'll eval() shortly.
4279 }
4280
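// The replica/rejoin variants below carry only per-lock state; in the rejoin
// case the dirfragtree/file/nest scatterlocks use a rejoin-aware encoding so
// their scattered state can be reconciled with the recovering peer.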
4281 void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
4282 {
4283 ENCODE_START(1, 1, bl);
4284 authlock.encode_state_for_replica(bl);
4285 linklock.encode_state_for_replica(bl);
4286 dirfragtreelock.encode_state_for_replica(bl);
4287 filelock.encode_state_for_replica(bl);
4288 nestlock.encode_state_for_replica(bl);
4289 xattrlock.encode_state_for_replica(bl);
4290 snaplock.encode_state_for_replica(bl);
4291 flocklock.encode_state_for_replica(bl);
4292 policylock.encode_state_for_replica(bl);
4293 encode(need_recover, bl);
4294 ENCODE_FINISH(bl);
4295 }
4296
4297 void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
4298 {
4299 authlock.encode_state_for_replica(bl);
4300 linklock.encode_state_for_replica(bl);
4301 dirfragtreelock.encode_state_for_rejoin(bl, rep);
4302 filelock.encode_state_for_rejoin(bl, rep);
4303 nestlock.encode_state_for_rejoin(bl, rep);
4304 xattrlock.encode_state_for_replica(bl);
4305 snaplock.encode_state_for_replica(bl);
4306 flocklock.encode_state_for_replica(bl);
4307 policylock.encode_state_for_replica(bl);
4308 }
4309
4310 void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new)
4311 {
4312 DECODE_START(1, p);
4313 authlock.decode_state(p, is_new);
4314 linklock.decode_state(p, is_new);
4315 dirfragtreelock.decode_state(p, is_new);
4316 filelock.decode_state(p, is_new);
4317 nestlock.decode_state(p, is_new);
4318 xattrlock.decode_state(p, is_new);
4319 snaplock.decode_state(p, is_new);
4320 flocklock.decode_state(p, is_new);
4321 policylock.decode_state(p, is_new);
4322
4323 bool need_recover;
4324 decode(need_recover, p);
4325 if (need_recover && is_new) {
4326 // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
4327 // and change the object when replaying unsafe requests.
4328 authlock.mark_need_recover();
4329 linklock.mark_need_recover();
4330 dirfragtreelock.mark_need_recover();
4331 filelock.mark_need_recover();
4332 nestlock.mark_need_recover();
4333 xattrlock.mark_need_recover();
4334 snaplock.mark_need_recover();
4335 flocklock.mark_need_recover();
4336 policylock.mark_need_recover();
4337 }
4338 DECODE_FINISH(p);
4339 }
4340 void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
4341 list<SimpleLock*>& eval_locks, bool survivor)
4342 {
4343 authlock.decode_state_rejoin(p, waiters, survivor);
4344 linklock.decode_state_rejoin(p, waiters, survivor);
4345 dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
4346 filelock.decode_state_rejoin(p, waiters, survivor);
4347 nestlock.decode_state_rejoin(p, waiters, survivor);
4348 xattrlock.decode_state_rejoin(p, waiters, survivor);
4349 snaplock.decode_state_rejoin(p, waiters, survivor);
4350 flocklock.decode_state_rejoin(p, waiters, survivor);
4351 policylock.decode_state_rejoin(p, waiters, survivor);
4352
4353 if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
4354 eval_locks.push_back(&dirfragtreelock);
4355 if (!filelock.is_stable() && !filelock.is_wrlocked())
4356 eval_locks.push_back(&filelock);
4357 if (!nestlock.is_stable() && !nestlock.is_wrlocked())
4358 eval_locks.push_back(&nestlock);
4359 }
4360
4361
4362 // IMPORT/EXPORT
4363
4364 void CInode::encode_export(bufferlist& bl)
4365 {
4366 ENCODE_START(5, 4, bl);
4367 _encode_base(bl, mdcache->mds->mdsmap->get_up_features());
4368
4369 encode(state, bl);
4370
4371 encode(pop, bl);
4372
4373 encode(get_replicas(), bl);
4374
4375 // include scatterlock info for any bounding CDirs
4376 bufferlist bounding;
4377 if (get_inode()->is_dir())
4378 for (const auto &p : dirfrags) {
4379 CDir *dir = p.second;
4380 if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
4381 encode(p.first, bounding);
4382 encode(dir->get_fnode()->fragstat, bounding);
4383 encode(dir->get_fnode()->accounted_fragstat, bounding);
4384 encode(dir->get_fnode()->rstat, bounding);
4385 encode(dir->get_fnode()->accounted_rstat, bounding);
4386 dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
4387 }
4388 }
4389 encode(bounding, bl);
4390
4391 _encode_locks_full(bl);
4392
4393 _encode_file_locks(bl);
4394
4395 ENCODE_FINISH(bl);
4396
4397 get(PIN_TEMPEXPORTING);
4398 }
4399
4400 void CInode::finish_export()
4401 {
4402 state &= MASK_STATE_EXPORT_KEPT;
4403
4404 pop.zero();
4405
4406 // just in case!
4407 //dirlock.clear_updated();
4408
4409 loner_cap = -1;
4410
4411 put(PIN_TEMPEXPORTING);
4412 }
4413
4414 void CInode::decode_import(bufferlist::const_iterator& p,
4415 LogSegment *ls)
4416 {
4417 DECODE_START(5, p);
4418
4419 _decode_base(p);
4420
4421 {
4422 unsigned s;
4423 decode(s, p);
4424 s &= MASK_STATE_EXPORTED;
4425
4426 set_ephemeral_pin((s & STATE_DISTEPHEMERALPIN),
4427 (s & STATE_RANDEPHEMERALPIN));
4428 state_set(STATE_AUTH | s);
4429 }
4430
4431 if (is_dirty()) {
4432 get(PIN_DIRTY);
4433 _mark_dirty(ls);
4434 }
4435 if (is_dirty_parent()) {
4436 get(PIN_DIRTYPARENT);
4437 mark_dirty_parent(ls);
4438 }
4439
4440 decode(pop, p);
4441
4442 decode(get_replicas(), p);
4443 if (is_replicated())
4444 get(PIN_REPLICATED);
4445 replica_nonce = 0;
4446
4447 // decode fragstat info on bounding cdirs
4448 bufferlist bounding;
4449 decode(bounding, p);
4450 auto q = bounding.cbegin();
4451 while (!q.end()) {
4452 frag_t fg;
4453 decode(fg, q);
4454 CDir *dir = get_dirfrag(fg);
4455 ceph_assert(dir); // we should have all bounds open
4456
4457 // Only take the remote's fragstat/rstat if we are non-auth for
4458 // this dirfrag AND the lock is NOT in a scattered (MIX) state.
4459 // We know the lock is stable, and MIX is the only state in which
4460 // the inode auth (who sent us this data) may not have the best
4461 // info.
4462
4463 // HMM: Are there cases where dir->is_auth() is an insufficient
4464 // check because the dirfrag is under migration? That implies
4465 // it is frozen (and in a SYNC or LOCK state). FIXME.
4466
4467 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
4468 if (dir->is_auth() ||
4469 filelock.get_state() == LOCK_MIX) {
4470 dout(10) << " skipped fragstat info for " << *dir << dendl;
4471 frag_info_t f;
4472 decode(f, q);
4473 decode(f, q);
4474 } else {
4475 decode(_fnode->fragstat, q);
4476 decode(_fnode->accounted_fragstat, q);
4477 dout(10) << " took fragstat info for " << *dir << dendl;
4478 }
4479 if (dir->is_auth() ||
4480 nestlock.get_state() == LOCK_MIX) {
4481 dout(10) << " skipped rstat info for " << *dir << dendl;
4482 nest_info_t n;
4483 decode(n, q);
4484 decode(n, q);
4485 } else {
4486 decode(_fnode->rstat, q);
4487 decode(_fnode->accounted_rstat, q);
4488 dout(10) << " took rstat info for " << *dir << dendl;
4489 }
4490 dir->reset_fnode(std::move(_fnode));
4491 }
4492
4493 _decode_locks_full(p);
4494
4495 _decode_file_locks(p);
4496
4497 DECODE_FINISH(p);
4498 }
4499
4500
4501 void InodeStoreBase::dump(Formatter *f) const
4502 {
4503 inode->dump(f);
4504 f->dump_string("symlink", symlink);
4505
4506 f->open_array_section("xattrs");
4507 if (xattrs) {
4508 for (const auto& [key, val] : *xattrs) {
4509 f->open_object_section("xattr");
4510 f->dump_string("key", key);
4511 std::string v(val.c_str(), val.length());
4512 f->dump_string("val", v);
4513 f->close_section();
4514 }
4515 }
4516 f->close_section();
4517 f->open_object_section("dirfragtree");
4518 dirfragtree.dump(f);
4519 f->close_section(); // dirfragtree
4520
4521 f->open_array_section("old_inodes");
4522 if (old_inodes) {
4523 for (const auto &p : *old_inodes) {
4524 f->open_object_section("old_inode");
4525 // The key is the last snapid; the first snapid is stored in the mempool_old_inode
4526 f->dump_int("last", p.first);
4527 p.second.dump(f);
4528 f->close_section(); // old_inode
4529 }
4530 }
4531 f->close_section(); // old_inodes
4532
4533 f->dump_unsigned("oldest_snap", oldest_snap);
4534 f->dump_unsigned("damage_flags", damage_flags);
4535 }
4536
4537 template <>
4538 void decode_json_obj(mempool::mds_co::string& t, JSONObj *obj){
4539
4540 t = mempool::mds_co::string(std::string_view(obj->get_data()));
4541 }
4542
4543 void InodeStoreBase::decode_json(JSONObj *obj)
4544 {
4545 {
4546 auto _inode = allocate_inode();
4547 _inode->decode_json(obj);
4548 reset_inode(std::move(_inode));
4549 }
4550
4551 JSONDecoder::decode_json("symlink", symlink, obj, true);
4552 // JSONDecoder::decode_json("dirfragtree", dirfragtree, obj, true); // can't decode it now
4553 //
4554 //
4555 {
4556 mempool_xattr_map tmp;
4557 JSONDecoder::decode_json("xattrs", tmp, xattrs_cb, obj, true);
4558 if (tmp.empty())
4559 reset_xattrs(xattr_map_ptr());
4560 else
4561 reset_xattrs(allocate_xattr_map(std::move(tmp)));
4562 }
4563 // JSONDecoder::decode_json("old_inodes", old_inodes, InodeStoreBase::old_indoes_cb, obj, true); // can't decode old_inodes now
4564 JSONDecoder::decode_json("oldest_snap", oldest_snap.val, obj, true);
4565 JSONDecoder::decode_json("damage_flags", damage_flags, obj, true);
4566 //sr_t srnode;
4567 //JSONDecoder::decode_json("snap_blob", srnode, obj, true); // can't decode it now
4568 //snap_blob = srnode;
4569 }
4570
4571 void InodeStoreBase::xattrs_cb(InodeStoreBase::mempool_xattr_map& c, JSONObj *obj){
4572
4573 string k;
4574 JSONDecoder::decode_json("key", k, obj, true);
4575 string v;
4576 JSONDecoder::decode_json("val", v, obj, true);
4577 c[k.c_str()] = buffer::copy(v.c_str(), v.size());
4578 }
4579
4580 void InodeStoreBase::old_indoes_cb(InodeStoreBase::mempool_old_inode_map& c, JSONObj *obj){
4581
4582 snapid_t s;
4583 JSONDecoder::decode_json("last", s.val, obj, true);
4584 InodeStoreBase::mempool_old_inode i;
4585 // i.decode_json(obj); // can't decode now, simon
4586 c[s] = i;
4587 }
4588
4589 void InodeStore::generate_test_instances(std::list<InodeStore*> &ls)
4590 {
4591 InodeStore *populated = new InodeStore;
4592 populated->get_inode()->ino = 0xdeadbeef;
4593 populated->symlink = "rhubarb";
4594 ls.push_back(populated);
4595 }
4596
4597 void InodeStoreBare::generate_test_instances(std::list<InodeStoreBare*> &ls)
4598 {
4599 InodeStoreBare *populated = new InodeStoreBare;
4600 populated->get_inode()->ino = 0xdeadbeef;
4601 populated->symlink = "rhubarb";
4602 ls.push_back(populated);
4603 }
4604
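// Scrub entry point: drives a ValidationContinuation that checks this inode's
// backtrace, its on-disk copy (for base directories) and its dirfrag stats,
// filling in *results and completing fin when finished.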
4605 void CInode::validate_disk_state(CInode::validated_data *results,
4606 MDSContext *fin)
4607 {
4608 class ValidationContinuation : public MDSContinuation {
4609 public:
4610 MDSContext *fin;
4611 CInode *in;
4612 CInode::validated_data *results;
4613 bufferlist bl;
4614 CInode *shadow_in;
4615
4616 enum {
4617 START = 0,
4618 BACKTRACE,
4619 INODE,
4620 DIRFRAGS,
4621 SNAPREALM,
4622 };
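// Stage flow: START fetches the on-disk backtrace (tagging the object for
// non-internal scrubs), BACKTRACE compares it with a freshly built in-memory
// backtrace, INODE (base directories only) compares the on-disk inode loaded
// into a shadow CInode, and DIRFRAGS cross-checks dirstat/rstat sums.
// (SNAPREALM is declared but has no callback registered here.)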
4623
4624 ValidationContinuation(CInode *i,
4625 CInode::validated_data *data_r,
4626 MDSContext *fin_) :
4627 MDSContinuation(i->mdcache->mds->server),
4628 fin(fin_),
4629 in(i),
4630 results(data_r),
4631 shadow_in(NULL) {
4632 set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start));
4633 set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace));
4634 set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk));
4635 set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags));
4636 }
4637
4638 ~ValidationContinuation() override {
4639 if (shadow_in) {
4640 delete shadow_in;
4641 in->mdcache->num_shadow_inodes--;
4642 }
4643 }
4644
4645 /**
4646 * Fetch backtrace and set tag if tag is non-empty
4647 */
4648 void fetch_backtrace_and_tag(CInode *in,
4649 std::string_view tag, bool is_internal,
4650 Context *fin, int *bt_r, bufferlist *bt)
4651 {
4652 const int64_t pool = in->get_backtrace_pool();
4653 object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");
4654
4655 ObjectOperation fetch;
4656 fetch.getxattr("parent", bt, bt_r);
4657 in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
4658 NULL, 0, fin);
4659 if (in->mdcache->mds->logger) {
4660 in->mdcache->mds->logger->inc(l_mds_openino_backtrace_fetch);
4661 in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_fetch);
4662 }
4663
4664 using ceph::encode;
4665 if (!is_internal) {
4666 ObjectOperation scrub_tag;
4667 bufferlist tag_bl;
4668 encode(tag, tag_bl);
4669 scrub_tag.setxattr("scrub_tag", tag_bl);
4670 SnapContext snapc;
4671 in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
4672 ceph::real_clock::now(),
4673 0, NULL);
4674 if (in->mdcache->mds->logger)
4675 in->mdcache->mds->logger->inc(l_mds_scrub_set_tag);
4676 }
4677 }
4678
4679 bool _start(int rval) {
4680 ceph_assert(in->can_auth_pin());
4681 in->auth_pin(this);
4682
4683 if (in->is_dirty()) {
4684 MDCache *mdcache = in->mdcache; // For the benefit of dout
4685 auto ino = [this]() { return in->ino(); }; // For the benefit of dout
4686 dout(20) << "validating a dirty CInode; results will be inconclusive"
4687 << dendl;
4688 }
4689
4690 C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
4691 in->mdcache->mds->finisher);
4692
4693 std::string_view tag = in->scrub_infop->header->get_tag();
4694 bool is_internal = in->scrub_infop->header->is_internal_tag();
4695 // Rather than using the usual CInode::fetch_backtrace,
4696 // use a special variant that optionally writes a tag in the same
4697 // operation.
4698 fetch_backtrace_and_tag(in, tag, is_internal, conf, &results->backtrace.ondisk_read_retval, &bl);
4699 return false;
4700 }
4701
4702 bool _backtrace(int rval) {
4703 // set up basic result reporting and make sure we got the data
4704 results->performed_validation = true; // at least, some of it!
4705 results->backtrace.checked = true;
4706
4707 const int64_t pool = in->get_backtrace_pool();
4708 inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
4709 in->build_backtrace(pool, memory_backtrace);
4710 bool equivalent, divergent;
4711 int memory_newer;
4712
4713 MDCache *mdcache = in->mdcache; // For the benefit of dout
4714 auto ino = [this]() { return in->ino(); }; // For the benefit of dout
4715
4716 // Ignore rval because it's the result of a FAILOK operation
4717 // from fetch_backtrace_and_tag: the real result is in
4718 // backtrace.ondisk_read_retval
4719 dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
4720 if (results->backtrace.ondisk_read_retval != 0) {
4721 results->backtrace.error_str << "failed to read off disk; see retval";
4722 // we probably have a new unwritten file!
4723 // so skip the backtrace scrub for this entry and say that all's well
4724 if (in->is_mdsdir()){
4725 dout(20) << "forcing backtrace as passed since mdsdir actually doesn't have backtrace" << dendl;
4726 results->backtrace.passed = true;
4727 }
4728 if (in->is_dirty_parent()) {
4729 dout(20) << "forcing backtrace as passed since inode is dirty parent" << dendl;
4730 results->backtrace.passed = true;
4731 }
4732 goto next;
4733 }
4734
4735 // extract the backtrace, and compare it to a newly-constructed one
4736 try {
4737 auto p = bl.cbegin();
4738 using ceph::decode;
4739 decode(results->backtrace.ondisk_value, p);
4740 dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
4741 } catch (buffer::error&) {
4742 if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
4743 // Cases where something has clearly gone wrong with the overall
4744 // fetch op, though we didn't get a nonzero rc from the getxattr
4745 // operation. e.g. object missing.
4746 results->backtrace.ondisk_read_retval = rval;
4747 }
4748 results->backtrace.error_str << "failed to decode on-disk backtrace ("
4749 << bl.length() << " bytes)!";
4750 // we probably have a new unwritten file!
4751 // so skip the backtrace scrub for this entry and say that all's well
4752 if (in->is_dirty_parent()) {
4753 dout(20) << "decode failed; forcing backtrace as passed since "
4754 "inode is dirty parent" << dendl;
4755 results->backtrace.passed = true;
4756 }
4757
4758 goto next;
4759 }
4760
4761 memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
4762 &equivalent, &divergent);
4763
4764 if (divergent || memory_newer < 0) {
4765 // we're divergent, or on-disk version is newer
4766 results->backtrace.error_str << "On-disk backtrace is divergent or newer";
4767 /* if the backtraces are divergent and the link count is 0, then
4768 * most likely it's a stray entry that's being purged and things are
4769 * well and there's no reason for alarm
4770 */
4771 if (divergent && (in->is_dirty_parent() || in->get_inode()->nlink == 0)) {
4772 results->backtrace.passed = true;
4773 dout(20) << "divergent backtraces are acceptable when dn "
4774 "is being purged or has been renamed or moved to a "
4775 "different directory " << *in << dendl;
4776 }
4777 } else {
4778 results->backtrace.passed = true;
4779 }
4780 next:
4781
4782 if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
4783 std::string path;
4784 in->make_path_string(path);
4785 in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
4786 << "(" << path << "), rewriting it";
4787 in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
4788 false);
4789 // Flag that we repaired this BT so that it won't go into damagetable
4790 results->backtrace.repaired = true;
4791 if (in->mdcache->mds->logger)
4792 in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired);
4793 }
4794
4795 // If the inode's number was free in the InoTable, fix that
4796 // (#15619)
4797 {
4798 InoTable *inotable = mdcache->mds->inotable;
4799
4800 dout(10) << "scrub: inotable ino = " << in->ino() << dendl;
4801 dout(10) << "scrub: inotable free says "
4802 << inotable->is_marked_free(in->ino()) << dendl;
4803
4804 if (inotable->is_marked_free(in->ino())) {
4805 LogChannelRef clog = in->mdcache->mds->clog;
4806 clog->error() << "scrub: inode wrongly marked free: " << in->ino();
4807
4808 if (in->scrub_infop->header->get_repair()) {
4809 bool repaired = inotable->repair(in->ino());
4810 if (repaired) {
4811 clog->error() << "inode table repaired for inode: " << in->ino();
4812
4813 inotable->save();
4814 if (in->mdcache->mds->logger)
4815 in->mdcache->mds->logger->inc(l_mds_scrub_inotable_repaired);
4816 } else {
4817 clog->error() << "Cannot repair inotable while other operations"
4818 " are in progress";
4819 }
4820 }
4821 }
4822 }
4823
4824
4825 if (in->is_dir()) {
4826 if (in->mdcache->mds->logger)
4827 in->mdcache->mds->logger->inc(l_mds_scrub_dir_inodes);
4828 return validate_directory_data();
4829 } else {
4830 if (in->mdcache->mds->logger)
4831 in->mdcache->mds->logger->inc(l_mds_scrub_file_inodes);
4832 // TODO: validate on-disk inode for normal files
4833 return true;
4834 }
4835 }
4836
4837 bool validate_directory_data() {
4838 ceph_assert(in->is_dir());
4839
4840 if (in->is_base()) {
4841 if (!shadow_in) {
4842 shadow_in = new CInode(in->mdcache);
4843 in->mdcache->create_unlinked_system_inode(shadow_in, in->ino(), in->get_inode()->mode);
4844 in->mdcache->num_shadow_inodes++;
4845 }
4846 shadow_in->fetch(get_internal_callback(INODE));
4847 if (in->mdcache->mds->logger)
4848 in->mdcache->mds->logger->inc(l_mds_scrub_dir_base_inodes);
4849 return false;
4850 } else {
4851 // TODO: validate on-disk inode for non-base directories
4852 if (in->mdcache->mds->logger)
4853 in->mdcache->mds->logger->inc(l_mds_scrub_dirfrag_rstats);
4854 results->inode.passed = true;
4855 return check_dirfrag_rstats();
4856 }
4857 }
4858
4859 bool _inode_disk(int rval) {
4860 const auto& si = shadow_in->get_inode();
4861 const auto& i = in->get_inode();
4862
4863 results->inode.checked = true;
4864 results->inode.ondisk_read_retval = rval;
4865 results->inode.ondisk_value = *si;
4866 results->inode.memory_value = *i;
4867
4868 if (si->version > i->version) {
4869 // uh, what?
4870 results->inode.error_str << "On-disk inode is newer than in-memory one; ";
4871 goto next;
4872 } else {
4873 bool divergent = false;
4874 int r = i->compare(*si, &divergent);
4875 results->inode.passed = !divergent && r >= 0;
4876 if (!results->inode.passed) {
4877 results->inode.error_str <<
4878 "On-disk inode is divergent or newer than in-memory one; ";
4879 goto next;
4880 }
4881 }
4882 next:
4883 return check_dirfrag_rstats();
4884 }
4885
4886 bool check_dirfrag_rstats() {
4887 if (in->has_subtree_root_dirfrag()) {
4888 in->mdcache->rdlock_dirfrags_stats(in, get_internal_callback(DIRFRAGS));
4889 return false;
4890 } else {
4891 return immediate(DIRFRAGS, 0);
4892 }
4893 }
4894
4895 bool _dirfrags(int rval) {
4896 // basic reporting setup
4897 results->raw_stats.checked = true;
4898 results->raw_stats.ondisk_read_retval = rval;
4899
4900 results->raw_stats.memory_value.dirstat = in->get_inode()->dirstat;
4901 results->raw_stats.memory_value.rstat = in->get_inode()->rstat;
4902 frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
4903 nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;
4904
4905 if (rval != 0) {
4906 results->raw_stats.error_str << "Failed to read dirfrags off disk";
4907 goto next;
4908 }
4909
4910 // check each dirfrag...
4911 for (const auto &p : in->dirfrags) {
4912 CDir *dir = p.second;
4913 ceph_assert(dir->get_version() > 0);
4914 nest_info.add(dir->get_fnode()->accounted_rstat);
4915 dir_info.add(dir->get_fnode()->accounted_fragstat);
4916 }
4917 nest_info.rsubdirs++; // it gets one to account for self
4918 if (const sr_t *srnode = in->get_projected_srnode(); srnode)
4919 nest_info.rsnaps += srnode->snaps.size();
4920
4921 // ...and that their sum matches our inode settings
4922 if (!dir_info.same_sums(in->get_inode()->dirstat) ||
4923 !nest_info.same_sums(in->get_inode()->rstat)) {
4924 if (in->scrub_infop->header->get_repair()) {
4925 results->raw_stats.error_str
4926 << "freshly-calculated rstats don't match existing ones (will be fixed)";
4927 in->mdcache->repair_inode_stats(in);
4928 results->raw_stats.repaired = true;
4929 } else {
4930 results->raw_stats.error_str
4931 << "freshly-calculated rstats don't match existing ones";
4932 }
4933 if (in->is_dirty()) {
4934 MDCache *mdcache = in->mdcache; // for dout()
4935 auto ino = [this]() { return in->ino(); }; // for dout()
4936 dout(20) << "raw stats most likely won't match since the inode is dirty; "
4937 "please rerun scrub when the system is stable; "
4938 "assuming passed for now;" << dendl;
4939 results->raw_stats.passed = true;
4940 }
4941 goto next;
4942 }
4943
4944 results->raw_stats.passed = true;
4945 {
4946 MDCache *mdcache = in->mdcache; // for dout()
4947 auto ino = [this]() { return in->ino(); }; // for dout()
4948 dout(20) << "raw stats check passed on " << *in << dendl;
4949 }
4950
4951 next:
4952 return true;
4953 }
4954
4955 void _done() override {
4956 if ((!results->raw_stats.checked || results->raw_stats.passed) &&
4957 (!results->backtrace.checked || results->backtrace.passed) &&
4958 (!results->inode.checked || results->inode.passed))
4959 results->passed_validation = true;
4960
4961 // Flag that we did some repair work so that our repair operation
4962 // can be flushed at end of scrub
4963 if (results->backtrace.repaired ||
4964 results->inode.repaired ||
4965 results->raw_stats.repaired)
4966 in->scrub_infop->header->set_repaired();
4967 if (fin)
4968 fin->complete(get_rval());
4969
4970 in->auth_unpin(this);
4971 }
4972 };
4973
4974
4975 dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
4976 ValidationContinuation *vc = new ValidationContinuation(this,
4977 results,
4978 fin);
4979 vc->begin();
4980 }
4981
4982 void CInode::validated_data::dump(Formatter *f) const
4983 {
4984 f->open_object_section("results");
4985 {
4986 f->dump_bool("performed_validation", performed_validation);
4987 f->dump_bool("passed_validation", passed_validation);
4988 f->open_object_section("backtrace");
4989 {
4990 f->dump_bool("checked", backtrace.checked);
4991 f->dump_bool("passed", backtrace.passed);
4992 f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
4993 f->dump_stream("ondisk_value") << backtrace.ondisk_value;
4994 f->dump_stream("memoryvalue") << backtrace.memory_value;
4995 f->dump_string("error_str", backtrace.error_str.str());
4996 }
4997 f->close_section(); // backtrace
4998 f->open_object_section("raw_stats");
4999 {
5000 f->dump_bool("checked", raw_stats.checked);
5001 f->dump_bool("passed", raw_stats.passed);
5002 f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
5003 f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
5004 f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
5005 f->dump_stream("memory_value.dirstat") << raw_stats.memory_value.dirstat;
5006 f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
5007 f->dump_string("error_str", raw_stats.error_str.str());
5008 }
5009 f->close_section(); // raw_stats
5010 // dump failure return code
5011 int rc = 0;
5012 if (backtrace.checked && backtrace.ondisk_read_retval)
5013 rc = backtrace.ondisk_read_retval;
5014 if (inode.checked && inode.ondisk_read_retval)
5015 rc = inode.ondisk_read_retval;
5016 if (raw_stats.checked && raw_stats.ondisk_read_retval)
5017 rc = raw_stats.ondisk_read_retval;
5018 f->dump_int("return_code", rc);
5019 }
5020 f->close_section(); // results
5021 }
5022
5023 bool CInode::validated_data::all_damage_repaired() const
5024 {
5025 bool unrepaired =
5026 (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired)
5027 ||
5028 (backtrace.checked && !backtrace.passed && !backtrace.repaired)
5029 ||
5030 (inode.checked && !inode.passed && !inode.repaired);
5031
5032 return !unrepaired;
5033 }
5034
5035 void CInode::dump(Formatter *f, int flags) const
5036 {
5037 if (flags & DUMP_PATH) {
5038 std::string path;
5039 make_path_string(path, true);
5040 if (path.empty())
5041 path = "/";
5042 f->dump_string("path", path);
5043 }
5044
5045 if (flags & DUMP_INODE_STORE_BASE)
5046 InodeStoreBase::dump(f);
5047
5048 if (flags & DUMP_MDS_CACHE_OBJECT)
5049 MDSCacheObject::dump(f);
5050
5051 if (flags & DUMP_LOCKS) {
5052 f->open_object_section("versionlock");
5053 versionlock.dump(f);
5054 f->close_section();
5055
5056 f->open_object_section("authlock");
5057 authlock.dump(f);
5058 f->close_section();
5059
5060 f->open_object_section("linklock");
5061 linklock.dump(f);
5062 f->close_section();
5063
5064 f->open_object_section("dirfragtreelock");
5065 dirfragtreelock.dump(f);
5066 f->close_section();
5067
5068 f->open_object_section("filelock");
5069 filelock.dump(f);
5070 f->close_section();
5071
5072 f->open_object_section("xattrlock");
5073 xattrlock.dump(f);
5074 f->close_section();
5075
5076 f->open_object_section("snaplock");
5077 snaplock.dump(f);
5078 f->close_section();
5079
5080 f->open_object_section("nestlock");
5081 nestlock.dump(f);
5082 f->close_section();
5083
5084 f->open_object_section("flocklock");
5085 flocklock.dump(f);
5086 f->close_section();
5087
5088 f->open_object_section("policylock");
5089 policylock.dump(f);
5090 f->close_section();
5091 }
5092
5093 if (flags & DUMP_STATE) {
5094 f->open_array_section("states");
5095 MDSCacheObject::dump_states(f);
5096 if (state_test(STATE_EXPORTING))
5097 f->dump_string("state", "exporting");
5098 if (state_test(STATE_OPENINGDIR))
5099 f->dump_string("state", "openingdir");
5100 if (state_test(STATE_FREEZING))
5101 f->dump_string("state", "freezing");
5102 if (state_test(STATE_FROZEN))
5103 f->dump_string("state", "frozen");
5104 if (state_test(STATE_AMBIGUOUSAUTH))
5105 f->dump_string("state", "ambiguousauth");
5106 if (state_test(STATE_EXPORTINGCAPS))
5107 f->dump_string("state", "exportingcaps");
5108 if (state_test(STATE_NEEDSRECOVER))
5109 f->dump_string("state", "needsrecover");
5110 if (state_test(STATE_PURGING))
5111 f->dump_string("state", "purging");
5112 if (state_test(STATE_DIRTYPARENT))
5113 f->dump_string("state", "dirtyparent");
5114 if (state_test(STATE_DIRTYRSTAT))
5115 f->dump_string("state", "dirtyrstat");
5116 if (state_test(STATE_STRAYPINNED))
5117 f->dump_string("state", "straypinned");
5118 if (state_test(STATE_FROZENAUTHPIN))
5119 f->dump_string("state", "frozenauthpin");
5120 if (state_test(STATE_DIRTYPOOL))
5121 f->dump_string("state", "dirtypool");
5122 if (state_test(STATE_ORPHAN))
5123 f->dump_string("state", "orphan");
5124 if (state_test(STATE_MISSINGOBJS))
5125 f->dump_string("state", "missingobjs");
5126 f->close_section();
5127 }
5128
5129 if (flags & DUMP_CAPS) {
5130 f->open_array_section("client_caps");
5131 for (const auto &p : client_caps) {
5132 auto &client = p.first;
5133 auto cap = &p.second;
5134 f->open_object_section("client_cap");
5135 f->dump_int("client_id", client.v);
5136 f->dump_string("pending", ccap_string(cap->pending()));
5137 f->dump_string("issued", ccap_string(cap->issued()));
5138 f->dump_string("wanted", ccap_string(cap->wanted()));
5139 f->dump_int("last_sent", cap->get_last_seq());
5140 f->close_section();
5141 }
5142 f->close_section();
5143
5144 f->dump_int("loner", loner_cap.v);
5145 f->dump_int("want_loner", want_loner_cap.v);
5146
5147 f->open_array_section("mds_caps_wanted");
5148 for (const auto &p : mds_caps_wanted) {
5149 f->open_object_section("mds_cap_wanted");
5150 f->dump_int("rank", p.first);
5151 f->dump_string("cap", ccap_string(p.second));
5152 f->close_section();
5153 }
5154 f->close_section();
5155 }
5156
5157 if (flags & DUMP_DIRFRAGS) {
5158 f->open_array_section("dirfrags");
5159 auto&& dfs = get_dirfrags();
5160 for(const auto &dir: dfs) {
5161 f->open_object_section("dir");
5162 dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
5163 dir->check_rstats();
5164 f->close_section();
5165 }
5166 f->close_section();
5167 }
5168 }
5169
5170 /****** Scrub Stuff *****/
5171 void CInode::scrub_info_create() const
5172 {
5173 dout(25) << __func__ << dendl;
5174 ceph_assert(!scrub_infop);
5175
5176 // break out of const-land to set up implicit initial state
5177 CInode *me = const_cast<CInode*>(this);
5178 const auto& pi = me->get_projected_inode();
5179
5180 std::unique_ptr<scrub_info_t> si(new scrub_info_t());
5181 si->last_scrub_stamp = pi->last_scrub_stamp;
5182 si->last_scrub_version = pi->last_scrub_version;
5183
5184 me->scrub_infop.swap(si);
5185 }
5186
5187 void CInode::scrub_maybe_delete_info()
5188 {
5189 if (scrub_infop &&
5190 !scrub_infop->scrub_in_progress &&
5191 !scrub_infop->last_scrub_dirty) {
5192 scrub_infop.reset();
5193 }
5194 }
5195
5196 void CInode::scrub_initialize(ScrubHeaderRef& header)
5197 {
5198 dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
5199
5200 scrub_info();
5201 scrub_infop->scrub_in_progress = true;
5202 scrub_infop->queued_frags.clear();
5203 scrub_infop->header = header;
5204 header->inc_num_pending();
5205 // right now we don't handle remote inodes
5206 }
5207
5208 void CInode::scrub_aborted() {
5209 dout(20) << __func__ << dendl;
5210 ceph_assert(scrub_is_in_progress());
5211
5212 scrub_infop->scrub_in_progress = false;
5213 scrub_infop->header->dec_num_pending();
5214 scrub_maybe_delete_info();
5215 }
5216
5217 void CInode::scrub_finished() {
5218 dout(20) << __func__ << dendl;
5219 ceph_assert(scrub_is_in_progress());
5220
5221 scrub_infop->last_scrub_version = get_version();
5222 scrub_infop->last_scrub_stamp = ceph_clock_now();
5223 scrub_infop->last_scrub_dirty = true;
5224 scrub_infop->scrub_in_progress = false;
5225 scrub_infop->header->dec_num_pending();
5226 }
5227
5228 int64_t CInode::get_backtrace_pool() const
5229 {
5230 if (is_dir()) {
5231 return mdcache->mds->get_metadata_pool();
5232 } else {
5233 // Files are required to have an explicit layout that specifies
5234 // a pool
5235 ceph_assert(get_inode()->layout.pool_id != -1);
5236 return get_inode()->layout.pool_id;
5237 }
5238 }
5239
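// Work out which rank, if any, this inode's dirfrags should be pinned to
// under the current policy and, when a subtree change is needed, queue the
// inode on mdcache->export_pin_queue.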
5240 void CInode::queue_export_pin(mds_rank_t export_pin)
5241 {
5242 if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
5243 return;
5244
5245 mds_rank_t target;
5246 if (export_pin >= 0)
5247 target = export_pin;
5248 else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
5249 target = mdcache->hash_into_rank_bucket(ino());
5250 else
5251 target = MDS_RANK_NONE;
5252
5253 unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
5254 bool queue = false;
5255 for (auto& p : dirfrags) {
5256 CDir *dir = p.second;
5257 if (!dir->is_auth())
5258 continue;
5259
5260 if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
5261 if (dir->get_frag().bits() < min_frag_bits) {
5262 // needs split
5263 queue = true;
5264 break;
5265 }
5266 target = mdcache->hash_into_rank_bucket(ino(), dir->get_frag());
5267 }
5268
5269 if (target != MDS_RANK_NONE) {
5270 if (dir->is_subtree_root()) {
5271 // set auxsubtree bit or export it
5272 if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
5273 target != dir->get_dir_auth().first)
5274 queue = true;
5275 } else {
5276 // create aux subtree or export it
5277 queue = true;
5278 }
5279 } else {
5280 // clear aux subtrees ?
5281 queue = dir->state_test(CDir::STATE_AUXSUBTREE);
5282 }
5283
5284 if (queue)
5285 break;
5286 }
5287 if (queue) {
5288 state_set(CInode::STATE_QUEUEDEXPORTPIN);
5289 mdcache->export_pin_queue.insert(this);
5290 }
5291 }
5292
5293 void CInode::maybe_export_pin(bool update)
5294 {
5295 if (!g_conf()->mds_bal_export_pin)
5296 return;
5297 if (!is_dir() || !is_normal())
5298 return;
5299
5300 dout(15) << __func__ << " update=" << update << " " << *this << dendl;
5301
5302 mds_rank_t export_pin = get_export_pin(false);
5303 if (export_pin == MDS_RANK_NONE && !update)
5304 return;
5305
5306 check_pin_policy(export_pin);
5307 queue_export_pin(export_pin);
5308 }
5309
5310 void CInode::set_ephemeral_pin(bool dist, bool rand)
5311 {
5312 unsigned state = 0;
5313 if (dist)
5314 state |= STATE_DISTEPHEMERALPIN;
5315 if (rand)
5316 state |= STATE_RANDEPHEMERALPIN;
5317 if (!state)
5318 return;
5319
5320 if (state_test(state) != state) {
5321 dout(10) << "set ephemeral (" << (dist ? "dist" : "")
5322 << (rand ? " rand" : "") << ") pin on " << *this << dendl;
5323 if (!is_ephemerally_pinned()) {
5324 auto p = mdcache->export_ephemeral_pins.insert(this);
5325 ceph_assert(p.second);
5326 }
5327 state_set(state);
5328 }
5329 }
5330
5331 void CInode::clear_ephemeral_pin(bool dist, bool rand)
5332 {
5333 unsigned state = 0;
5334 if (dist)
5335 state |= STATE_DISTEPHEMERALPIN;
5336 if (rand)
5337 state |= STATE_RANDEPHEMERALPIN;
5338
5339 if (state_test(state)) {
5340 dout(10) << "clear ephemeral (" << (dist ? "dist" : "")
5341 << (rand ? " rand" : "") << ") pin on " << *this << dendl;
5342 state_clear(state);
5343 if (!is_ephemerally_pinned()) {
5344 auto count = mdcache->export_ephemeral_pins.erase(this);
5345 ceph_assert(count == 1);
5346 }
5347 }
5348 }
5349
5350 void CInode::maybe_ephemeral_rand(double threshold)
5351 {
5352 if (!mdcache->get_export_ephemeral_random_config()) {
5353 dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl;
5354 clear_ephemeral_pin(false, true);
5355 return;
5356 } else if (!is_dir() || !is_normal()) {
5357 dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl;
5358 clear_ephemeral_pin(false, true);
5359 return;
5360 } else if (get_inode()->nlink == 0) {
5361 dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl;
5362 clear_ephemeral_pin(false, true);
5363 return;
5364 } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) {
5365 dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl;
5366 queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
5367 return;
5368 }
5369
5370 /* not precomputed? */
5371 if (threshold < 0.0) {
5372 threshold = get_ephemeral_rand();
5373 }
5374 if (threshold <= 0.0) {
5375 return;
5376 }
5377 double n = ceph::util::generate_random_number(0.0, 1.0);
5378
5379 dout(15) << __func__ << " rand " << n << " <?= " << threshold
5380 << " " << *this << dendl;
5381
5382 if (n <= threshold) {
5383 dout(10) << __func__ << " randomly export pinning " << *this << dendl;
5384 set_ephemeral_pin(false, true);
5385 queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
5386 }
5387 }
5388
5389 void CInode::setxattr_ephemeral_rand(double probability)
5390 {
5391 ceph_assert(is_dir());
5392 _get_projected_inode()->export_ephemeral_random_pin = probability;
5393 }
5394
5395 void CInode::setxattr_ephemeral_dist(bool val)
5396 {
5397 ceph_assert(is_dir());
5398 _get_projected_inode()->export_ephemeral_distributed_pin = val;
5399 }
5400
5401 void CInode::set_export_pin(mds_rank_t rank)
5402 {
5403 ceph_assert(is_dir());
5404 _get_projected_inode()->export_pin = rank;
5405 maybe_export_pin(true);
5406 }
5407
5408 mds_rank_t CInode::get_export_pin(bool inherit) const
5409 {
5410 if (!g_conf()->mds_bal_export_pin)
5411 return MDS_RANK_NONE;
5412
5413 /* An inode that is export pinned may not necessarily be a subtree root; we
5414 * need to traverse the parents. A base or system inode cannot be pinned.
5415 * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5416 * have a parent yet.
5417 */
5418 mds_rank_t r_target = MDS_RANK_NONE;
5419 const CInode *in = this;
5420 const CDir *dir = nullptr;
5421 while (true) {
5422 if (in->is_system())
5423 break;
5424 const CDentry *pdn = in->get_parent_dn();
5425 if (!pdn)
5426 break;
5427 if (in->get_inode()->nlink == 0) {
5428 // ignore export pin for unlinked directory
5429 break;
5430 }
5431
5432 if (in->get_inode()->export_pin >= 0) {
5433 return in->get_inode()->export_pin;
5434 } else if (in->get_inode()->export_ephemeral_distributed_pin &&
5435 mdcache->get_export_ephemeral_distributed_config()) {
5436 if (in != this)
5437 return mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
5438 return MDS_RANK_EPHEMERAL_DIST;
5439 } else if (r_target != MDS_RANK_NONE && in->get_inode()->export_ephemeral_random_pin > 0.0) {
5440 return r_target;
5441 } else if (r_target == MDS_RANK_NONE && in->is_ephemeral_rand() &&
5442 mdcache->get_export_ephemeral_random_config()) {
5443 /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
5444 if (!inherit)
5445 return MDS_RANK_EPHEMERAL_RAND;
5446 if (in == this)
5447 r_target = MDS_RANK_EPHEMERAL_RAND;
5448 else
5449 r_target = mdcache->hash_into_rank_bucket(in->ino());
5450 }
5451
5452 if (!inherit)
5453 break;
5454 dir = pdn->get_dir();
5455 in = dir->inode;
5456 }
5457 return MDS_RANK_NONE;
5458 }
5459
5460 void CInode::check_pin_policy(mds_rank_t export_pin)
5461 {
5462 if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
5463 set_ephemeral_pin(true, false);
5464 clear_ephemeral_pin(false, true);
5465 } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) {
5466 set_ephemeral_pin(false, true);
5467 clear_ephemeral_pin(true, false);
5468 } else if (is_ephemerally_pinned()) {
5469 // export_pin >= 0 || export_pin == MDS_RANK_NONE
5470 clear_ephemeral_pin(true, true);
5471 if (export_pin != get_inode()->export_pin) // inherited export_pin
5472 queue_export_pin(MDS_RANK_NONE);
5473 }
5474 }
5475
5476 double CInode::get_ephemeral_rand() const
5477 {
5478 /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5479 * have a parent yet.
5480 */
5481 const CInode *in = this;
5482 double max = mdcache->export_ephemeral_random_max;
5483 while (true) {
5484 if (in->is_system())
5485 break;
5486 const CDentry *pdn = in->get_parent_dn();
5487 if (!pdn)
5488 break;
5489 // ignore export pin for unlinked directory
5490 if (in->get_inode()->nlink == 0)
5491 break;
5492
5493 if (in->get_inode()->export_ephemeral_random_pin > 0.0)
5494 return std::min(in->get_inode()->export_ephemeral_random_pin, max);
5495
5496 /* An export_pin overrides only if no closer parent (incl. this one) has a
5497 * random pin set.
5498 */
5499 if (in->get_inode()->export_pin >= 0 ||
5500 in->get_inode()->export_ephemeral_distributed_pin)
5501 return 0.0;
5502
5503 in = pdn->get_dir()->inode;
5504 }
5505 return 0.0;
5506 }
5507
5508 void CInode::get_nested_dirfrags(std::vector<CDir*>& v) const
5509 {
5510 for (const auto &p : dirfrags) {
5511 const auto& dir = p.second;
5512 if (!dir->is_subtree_root())
5513 v.push_back(dir);
5514 }
5515 }
5516
5517 void CInode::get_subtree_dirfrags(std::vector<CDir*>& v) const
5518 {
5519 for (const auto &p : dirfrags) {
5520 const auto& dir = p.second;
5521 if (dir->is_subtree_root())
5522 v.push_back(dir);
5523 }
5524 }
5525
5526 MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);