ceph.git: ceph/src/mds/CInode.cc (import ceph pacific 16.2.5)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "include/int_types.h"
16 #include "common/errno.h"
17
18 #include <string>
19
20 #include "CInode.h"
21 #include "CDir.h"
22 #include "CDentry.h"
23
24 #include "MDSRank.h"
25 #include "MDCache.h"
26 #include "MDLog.h"
27 #include "Locker.h"
28 #include "Mutation.h"
29
30 #include "events/EUpdate.h"
31
32 #include "osdc/Objecter.h"
33
34 #include "snap.h"
35
36 #include "LogSegment.h"
37
38 #include "common/Clock.h"
39
40 #include "common/config.h"
41 #include "global/global_context.h"
42 #include "include/ceph_assert.h"
43
44 #include "mds/MDSContinuation.h"
45 #include "mds/InoTable.h"
46 #include "cephfs_features.h"
47 #include "osdc/Objecter.h"
48
49 #define dout_context g_ceph_context
50 #define dout_subsys ceph_subsys_mds
51 #undef dout_prefix
52 #define dout_prefix *_dout << "mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") "
53
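// CInodeCommitOperation::update() fills in the rados op for one backtrace
// commit: the encoded inode_backtrace_t goes into the object's "parent"
// xattr and, when the object lives in the inode's current data pool, the
// file layout is refreshed in the "layout" xattr as well (old pools skip the
// layout update, see _store_backtrace() below).  For inspection, the raw
// "parent" xattr can be fetched with `rados getxattr` and decoded as an
// inode_backtrace_t (e.g. with ceph-dencoder); the exact invocation is a
// usage sketch, not part of this file.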
54 void CInodeCommitOperation::update(ObjectOperation &op, inode_backtrace_t &bt) {
55 using ceph::encode;
56
57 op.priority = priority;
58 op.create(false);
59
60 bufferlist parent_bl;
61 encode(bt, parent_bl);
62 op.setxattr("parent", parent_bl);
63
64 // for the old pool there is no need to update the layout
65 if (!update_layout)
66 return;
67
68 bufferlist layout_bl;
69 encode(_layout, layout_bl, _features);
70 op.setxattr("layout", layout_bl);
71 }
72
73 class CInodeIOContext : public MDSIOContextBase
74 {
75 protected:
76 CInode *in;
77 MDSRank *get_mds() override {return in->mdcache->mds;}
78 public:
79 explicit CInodeIOContext(CInode *in_) : in(in_) {
80 ceph_assert(in != NULL);
81 }
82 };
83
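// Sentinel for projected_inode::snapnode: UNDEF_SRNODE ((sr_t*)-1) marks a
// projected node that carries no snaprealm change at all, whereas a plain
// nullptr means the projected change removes/merges the snaprealm (see
// pop_projected_snaprealm()).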
84 sr_t* const CInode::projected_inode::UNDEF_SRNODE = (sr_t*)(unsigned long)-1;
85
86 LockType CInode::versionlock_type(CEPH_LOCK_IVERSION);
87 LockType CInode::authlock_type(CEPH_LOCK_IAUTH);
88 LockType CInode::linklock_type(CEPH_LOCK_ILINK);
89 LockType CInode::dirfragtreelock_type(CEPH_LOCK_IDFT);
90 LockType CInode::filelock_type(CEPH_LOCK_IFILE);
91 LockType CInode::xattrlock_type(CEPH_LOCK_IXATTR);
92 LockType CInode::snaplock_type(CEPH_LOCK_ISNAP);
93 LockType CInode::nestlock_type(CEPH_LOCK_INEST);
94 LockType CInode::flocklock_type(CEPH_LOCK_IFLOCK);
95 LockType CInode::policylock_type(CEPH_LOCK_IPOLICY);
96
97 std::string_view CInode::pin_name(int p) const
98 {
99 switch (p) {
100 case PIN_DIRFRAG: return "dirfrag";
101 case PIN_CAPS: return "caps";
102 case PIN_IMPORTING: return "importing";
103 case PIN_OPENINGDIR: return "openingdir";
104 case PIN_REMOTEPARENT: return "remoteparent";
105 case PIN_BATCHOPENJOURNAL: return "batchopenjournal";
106 case PIN_SCATTERED: return "scattered";
107 case PIN_STICKYDIRS: return "stickydirs";
108 //case PIN_PURGING: return "purging";
109 case PIN_FREEZING: return "freezing";
110 case PIN_FROZEN: return "frozen";
111 case PIN_IMPORTINGCAPS: return "importingcaps";
112 case PIN_EXPORTINGCAPS: return "exportingcaps";
113 case PIN_PASTSNAPPARENT: return "pastsnapparent";
114 case PIN_OPENINGSNAPPARENTS: return "openingsnapparents";
115 case PIN_TRUNCATING: return "truncating";
116 case PIN_STRAY: return "stray";
117 case PIN_NEEDSNAPFLUSH: return "needsnapflush";
118 case PIN_DIRTYRSTAT: return "dirtyrstat";
119 case PIN_DIRTYPARENT: return "dirtyparent";
120 case PIN_DIRWAITER: return "dirwaiter";
121 default: return generic_pin_name(p);
122 }
123 }
124
125 //int cinode_pins[CINODE_NUM_PINS]; // counts
126 ostream& CInode::print_db_line_prefix(ostream& out)
127 {
128 return out << ceph_clock_now() << " mds." << mdcache->mds->get_nodeid() << ".cache.ino(" << ino() << ") ";
129 }
130
131 /*
132 * write caps and lock ids
133 */
134 struct cinode_lock_info_t cinode_lock_info[] = {
135 { CEPH_LOCK_IFILE, CEPH_CAP_ANY_FILE_WR },
136 { CEPH_LOCK_IAUTH, CEPH_CAP_AUTH_EXCL },
137 { CEPH_LOCK_ILINK, CEPH_CAP_LINK_EXCL },
138 { CEPH_LOCK_IXATTR, CEPH_CAP_XATTR_EXCL },
139 };
140 int num_cinode_locks = sizeof(cinode_lock_info) / sizeof(cinode_lock_info[0]);
141
142 ostream& operator<<(ostream& out, const CInode& in)
143 {
144 string path;
145 in.make_path_string(path, true);
146
147 out << "[inode " << in.ino();
148 out << " ["
149 << (in.is_multiversion() ? "...":"")
150 << in.first << "," << in.last << "]";
151 out << " " << path << (in.is_dir() ? "/":"");
152
153 if (in.is_auth()) {
154 out << " auth";
155 if (in.is_replicated())
156 out << in.get_replicas();
157 } else {
158 mds_authority_t a = in.authority();
159 out << " rep@" << a.first;
160 if (a.second != CDIR_AUTH_UNKNOWN)
161 out << "," << a.second;
162 out << "." << in.get_replica_nonce();
163 }
164
165 if (in.is_symlink())
166 out << " symlink='" << in.symlink << "'";
167 if (in.is_dir() && !in.dirfragtree.empty())
168 out << " " << in.dirfragtree;
169
170 out << " v" << in.get_version();
171 if (in.get_projected_version() > in.get_version())
172 out << " pv" << in.get_projected_version();
173
174 if (in.get_num_auth_pins()) {
175 out << " ap=" << in.get_num_auth_pins();
176 #ifdef MDS_AUTHPIN_SET
177 in.print_authpin_set(out);
178 #endif
179 }
180
181 if (in.snaprealm)
182 out << " snaprealm=" << in.snaprealm;
183
184 if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH";
185 if (in.state_test(CInode::STATE_NEEDSRECOVER)) out << " NEEDSRECOVER";
186 if (in.state_test(CInode::STATE_RECOVERING)) out << " RECOVERING";
187 if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " DIRTYPARENT";
188 if (in.state_test(CInode::STATE_MISSINGOBJS)) out << " MISSINGOBJS";
189 if (in.is_ephemeral_dist()) out << " DISTEPHEMERALPIN";
190 if (in.is_ephemeral_rand()) out << " RANDEPHEMERALPIN";
191 if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance;
192 if (in.is_frozen_inode()) out << " FROZEN";
193 if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN";
194
195 const auto& pi = in.get_projected_inode();
196 if (pi->is_truncating())
197 out << " truncating(" << pi->truncate_from << " to " << pi->truncate_size << ")";
198
199 if (in.is_dir()) {
200 out << " " << in.get_inode()->dirstat;
201 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
202 out << "->" << pi->dirstat;
203 }
204 } else {
205 out << " s=" << in.get_inode()->size;
206 if (in.get_inode()->nlink != 1)
207 out << " nl=" << in.get_inode()->nlink;
208 }
209
210 // rstat
211 out << " " << in.get_inode()->rstat;
212 if (!(in.get_inode()->rstat == in.get_inode()->accounted_rstat))
213 out << "/" << in.get_inode()->accounted_rstat;
214 if (g_conf()->mds_debug_scatterstat && in.is_projected()) {
215 out << "->" << pi->rstat;
216 if (!(pi->rstat == pi->accounted_rstat))
217 out << "/" << pi->accounted_rstat;
218 }
219
220 if (in.is_any_old_inodes()) {
221 out << " old_inodes=" << in.get_old_inodes()->size();
222 }
223
224 if (!in.client_need_snapflush.empty())
225 out << " need_snapflush=" << in.client_need_snapflush;
226
227 // locks
228 if (!in.authlock.is_sync_and_unlocked())
229 out << " " << in.authlock;
230 if (!in.linklock.is_sync_and_unlocked())
231 out << " " << in.linklock;
232 if (in.get_inode()->is_dir()) {
233 if (!in.dirfragtreelock.is_sync_and_unlocked())
234 out << " " << in.dirfragtreelock;
235 if (!in.snaplock.is_sync_and_unlocked())
236 out << " " << in.snaplock;
237 if (!in.nestlock.is_sync_and_unlocked())
238 out << " " << in.nestlock;
239 if (!in.policylock.is_sync_and_unlocked())
240 out << " " << in.policylock;
241 } else {
242 if (!in.flocklock.is_sync_and_unlocked())
243 out << " " << in.flocklock;
244 }
245 if (!in.filelock.is_sync_and_unlocked())
246 out << " " << in.filelock;
247 if (!in.xattrlock.is_sync_and_unlocked())
248 out << " " << in.xattrlock;
249 if (!in.versionlock.is_sync_and_unlocked())
250 out << " " << in.versionlock;
251
252 // hack: spit out crap on which clients have caps
253 if (in.get_inode()->client_ranges.size())
254 out << " cr=" << in.get_inode()->client_ranges;
255
256 if (!in.get_client_caps().empty()) {
257 out << " caps={";
258 bool first = true;
259 for (const auto &p : in.get_client_caps()) {
260 if (!first) out << ",";
261 out << p.first << "="
262 << ccap_string(p.second.pending());
263 if (p.second.issued() != p.second.pending())
264 out << "/" << ccap_string(p.second.issued());
265 out << "/" << ccap_string(p.second.wanted())
266 << "@" << p.second.get_last_seq();
267 first = false;
268 }
269 out << "}";
270 if (in.get_loner() >= 0 || in.get_wanted_loner() >= 0) {
271 out << ",l=" << in.get_loner();
272 if (in.get_loner() != in.get_wanted_loner())
273 out << "(" << in.get_wanted_loner() << ")";
274 }
275 }
276 if (!in.get_mds_caps_wanted().empty()) {
277 out << " mcw={";
278 bool first = true;
279 for (const auto &p : in.get_mds_caps_wanted()) {
280 if (!first)
281 out << ',';
282 out << p.first << '=' << ccap_string(p.second);
283 first = false;
284 }
285 out << '}';
286 }
287
288 if (in.get_num_ref()) {
289 out << " |";
290 in.print_pin_set(out);
291 }
292
293 if (in.get_inode()->export_pin != MDS_RANK_NONE) {
294 out << " export_pin=" << in.get_inode()->export_pin;
295 }
296 if (in.state_test(CInode::STATE_DISTEPHEMERALPIN)) {
297 out << " distepin";
298 }
299 if (in.state_test(CInode::STATE_RANDEPHEMERALPIN)) {
300 out << " randepin";
301 }
302
303 out << " " << &in;
304 out << "]";
305 return out;
306 }
307
308 CInode::CInode(MDCache *c, bool auth, snapid_t f, snapid_t l) :
309 mdcache(c), first(f), last(l),
310 item_dirty(this),
311 item_caps(this),
312 item_open_file(this),
313 item_dirty_parent(this),
314 item_dirty_dirfrag_dir(this),
315 item_dirty_dirfrag_nest(this),
316 item_dirty_dirfrag_dirfragtree(this),
317 pop(c->decayrate),
318 versionlock(this, &versionlock_type),
319 authlock(this, &authlock_type),
320 linklock(this, &linklock_type),
321 dirfragtreelock(this, &dirfragtreelock_type),
322 filelock(this, &filelock_type),
323 xattrlock(this, &xattrlock_type),
324 snaplock(this, &snaplock_type),
325 nestlock(this, &nestlock_type),
326 flocklock(this, &flocklock_type),
327 policylock(this, &policylock_type)
328 {
329 if (auth)
330 state_set(STATE_AUTH);
331 }
332
333 void CInode::print(ostream& out)
334 {
335 out << *this;
336 }
337
338 void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
339 {
340 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
341
342 if (client_need_snapflush.empty()) {
343 get(CInode::PIN_NEEDSNAPFLUSH);
344
345 // FIXME: this is non-optimal, as we'll block freezes/migrations for potentially
346 // long periods waiting for clients to flush their snaps.
347 auth_pin(this); // pin head inode...
348 }
349
350 auto &clients = client_need_snapflush[snapid];
351 if (clients.empty())
352 snapin->auth_pin(this); // ...and pin snapped/old inode!
353
354 clients.insert(client);
355 }
356
357 void CInode::remove_need_snapflush(CInode *snapin, snapid_t snapid, client_t client)
358 {
359 dout(10) << __func__ << " client." << client << " snapid " << snapid << " on " << snapin << dendl;
360 auto it = client_need_snapflush.find(snapid);
361 if (it == client_need_snapflush.end()) {
362 dout(10) << " snapid not found" << dendl;
363 return;
364 }
365 size_t n = it->second.erase(client);
366 if (n == 0) {
367 dout(10) << " client not found" << dendl;
368 return;
369 }
370 if (it->second.empty()) {
371 client_need_snapflush.erase(it);
372 snapin->auth_unpin(this);
373
374 if (client_need_snapflush.empty()) {
375 put(CInode::PIN_NEEDSNAPFLUSH);
376 auth_unpin(this);
377 }
378 }
379 }
380
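// Called after a head inode has been cloned for a snapshot: 'cowin' is the
// new old-inode covering [cowin->first, cowin->last] and 'in' is the
// surviving head.  Pending snapflush entries with a snapid below in->first
// are split off: if cowin still covers the snapid its auth pin moves to
// cowin, otherwise the entry is dropped; either way the pin held on 'in' for
// that entry is released.  Returns whether cowin and the original inode,
// respectively, still need a snapflush.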
381 pair<bool,bool> CInode::split_need_snapflush(CInode *cowin, CInode *in)
382 {
383 dout(10) << __func__ << " [" << cowin->first << "," << cowin->last << "] for " << *cowin << dendl;
384 bool cowin_need_flush = false;
385 bool orig_need_flush = false;
386 auto it = client_need_snapflush.lower_bound(cowin->first);
387 while (it != client_need_snapflush.end() && it->first < in->first) {
388 ceph_assert(!it->second.empty());
389 if (cowin->last >= it->first) {
390 cowin->auth_pin(this);
391 cowin_need_flush = true;
392 ++it;
393 } else {
394 it = client_need_snapflush.erase(it);
395 }
396 in->auth_unpin(this);
397 }
398
399 if (it != client_need_snapflush.end() && it->first <= in->last)
400 orig_need_flush = true;
401
402 return make_pair(cowin_need_flush, orig_need_flush);
403 }
404
405 void CInode::mark_dirty_rstat()
406 {
407 if (!state_test(STATE_DIRTYRSTAT)) {
408 dout(10) << __func__ << dendl;
409 state_set(STATE_DIRTYRSTAT);
410 get(PIN_DIRTYRSTAT);
411 CDentry *pdn = get_projected_parent_dn();
412 if (pdn->is_auth()) {
413 CDir *pdir = pdn->dir;
414 pdir->dirty_rstat_inodes.push_back(&dirty_rstat_item);
415 mdcache->mds->locker->mark_updated_scatterlock(&pdir->inode->nestlock);
416 } else {
417 // under cross-MDS rename.
418 // DIRTYRSTAT flag will get cleared when rename finishes
419 ceph_assert(state_test(STATE_AMBIGUOUSAUTH));
420 }
421 }
422 }
423 void CInode::clear_dirty_rstat()
424 {
425 if (state_test(STATE_DIRTYRSTAT)) {
426 dout(10) << __func__ << dendl;
427 state_clear(STATE_DIRTYRSTAT);
428 put(PIN_DIRTYRSTAT);
429 dirty_rstat_item.remove_myself();
430 }
431 }
432
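// project_inode() takes a copy-on-write duplicate of the current inode (and
// optionally its xattr map and snaprealm node) for a pending update; the
// copy is journaled and applied later by pop_and_dirty_projected_inode()
// once the log entry is safe.  Illustrative call pattern (sketch, not taken
// from this file):
//   auto pi = in->project_inode(mut);
//   pi.inode->mtime = ceph_clock_now();
//   // ...journal an EUpdate, then pop_and_dirty_projected_inode() on commit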
433 CInode::projected_inode CInode::project_inode(const MutationRef& mut,
434 bool xattr, bool snap)
435 {
436 if (mut && mut->is_projected(this)) {
437 ceph_assert(!xattr && !snap);
438 auto _inode = std::const_pointer_cast<mempool_inode>(projected_nodes.back().inode);
439 return projected_inode(std::move(_inode), xattr_map_ptr());
440 }
441
442 auto pi = allocate_inode(*get_projected_inode());
443
444 if (scrub_infop && scrub_infop->last_scrub_dirty) {
445 pi->last_scrub_stamp = scrub_infop->last_scrub_stamp;
446 pi->last_scrub_version = scrub_infop->last_scrub_version;
447 scrub_infop->last_scrub_dirty = false;
448 scrub_maybe_delete_info();
449 }
450
451 const auto& ox = get_projected_xattrs();
452 xattr_map_ptr px;
453 if (xattr) {
454 px = allocate_xattr_map();
455 if (ox)
456 *px = *ox;
457 }
458
459 sr_t* ps = projected_inode::UNDEF_SRNODE;
460 if (snap) {
461 ps = prepare_new_srnode(0);
462 ++num_projected_srnodes;
463 }
464
465 projected_nodes.emplace_back(pi, xattr ? px : ox, ps);
466 if (mut)
467 mut->add_projected_node(this);
468 dout(15) << __func__ << " " << pi->ino << dendl;
469 return projected_inode(std::move(pi), std::move(px), ps);
470 }
471
472 void CInode::pop_and_dirty_projected_inode(LogSegment *ls, const MutationRef& mut)
473 {
474 ceph_assert(!projected_nodes.empty());
475 auto front = std::move(projected_nodes.front());
476 dout(15) << __func__ << " v" << front.inode->version << dendl;
477
478 projected_nodes.pop_front();
479 if (mut)
480 mut->remove_projected_node(this);
481
482 bool pool_updated = get_inode()->layout.pool_id != front.inode->layout.pool_id;
483 bool pin_updated = (get_inode()->export_pin != front.inode->export_pin) ||
484 (get_inode()->export_ephemeral_distributed_pin !=
485 front.inode->export_ephemeral_distributed_pin);
486
487 reset_inode(std::move(front.inode));
488 if (front.xattrs != get_xattrs())
489 reset_xattrs(std::move(front.xattrs));
490
491 if (front.snapnode != projected_inode::UNDEF_SRNODE) {
492 --num_projected_srnodes;
493 pop_projected_snaprealm(front.snapnode, false);
494 }
495
496 mark_dirty(ls);
497 if (get_inode()->is_backtrace_updated())
498 mark_dirty_parent(ls, pool_updated);
499
500 if (pin_updated)
501 maybe_export_pin(true);
502 }
503
504 sr_t *CInode::prepare_new_srnode(snapid_t snapid)
505 {
506 const sr_t *cur_srnode = get_projected_srnode();
507 sr_t *new_srnode;
508
509 if (cur_srnode) {
510 new_srnode = new sr_t(*cur_srnode);
511 } else {
512 if (snapid == 0)
513 snapid = mdcache->get_global_snaprealm()->get_newest_seq();
514 new_srnode = new sr_t();
515 new_srnode->seq = snapid;
516 new_srnode->created = snapid;
517 new_srnode->current_parent_since = get_oldest_snap();
518 }
519 return new_srnode;
520 }
521
522 const sr_t *CInode::get_projected_srnode() const {
523 if (num_projected_srnodes > 0) {
524 for (auto it = projected_nodes.rbegin(); it != projected_nodes.rend(); ++it)
525 if (it->snapnode != projected_inode::UNDEF_SRNODE)
526 return it->snapnode;
527 }
528 if (snaprealm)
529 return &snaprealm->srnode;
530 else
531 return NULL;
532 }
533
534 void CInode::project_snaprealm(sr_t *new_srnode)
535 {
536 dout(10) << __func__ << " " << new_srnode << dendl;
537 ceph_assert(projected_nodes.back().snapnode == projected_inode::UNDEF_SRNODE);
538 projected_nodes.back().snapnode = new_srnode;
539 ++num_projected_srnodes;
540 }
541
542 void CInode::mark_snaprealm_global(sr_t *new_srnode)
543 {
544 ceph_assert(!is_dir());
545 // 'last_destroyed' is no longer used; use it to store the original 'current_parent_since'
546 new_srnode->last_destroyed = new_srnode->current_parent_since;
547 new_srnode->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
548 new_srnode->mark_parent_global();
549 }
550
551 void CInode::clear_snaprealm_global(sr_t *new_srnode)
552 {
553 // restore 'current_parent_since'
554 new_srnode->current_parent_since = new_srnode->last_destroyed;
555 new_srnode->last_destroyed = 0;
556 new_srnode->seq = mdcache->get_global_snaprealm()->get_newest_seq();
557 new_srnode->clear_parent_global();
558 }
559
560 bool CInode::is_projected_snaprealm_global() const
561 {
562 const sr_t *srnode = get_projected_srnode();
563 if (srnode && srnode->is_parent_global())
564 return true;
565 return false;
566 }
567
568 void CInode::project_snaprealm_past_parent(SnapRealm *newparent)
569 {
570 sr_t *new_snap = project_snaprealm();
571 record_snaprealm_past_parent(new_snap, newparent);
572 }
573
574
575 /* if newparent != parent, add parent to past_parents
576 if parent DNE, we need to find what the parent actually is and fill that in */
577 void CInode::record_snaprealm_past_parent(sr_t *new_snap, SnapRealm *newparent)
578 {
579 ceph_assert(!new_snap->is_parent_global());
580 SnapRealm *oldparent;
581 if (!snaprealm) {
582 oldparent = find_snaprealm();
583 } else {
584 oldparent = snaprealm->parent;
585 }
586
587 if (newparent != oldparent) {
588 snapid_t oldparentseq = oldparent->get_newest_seq();
589 if (oldparentseq + 1 > new_snap->current_parent_since) {
590 // copy old parent's snaps
591 const set<snapid_t>& snaps = oldparent->get_snaps();
592 auto p = snaps.lower_bound(new_snap->current_parent_since);
593 if (p != snaps.end())
594 new_snap->past_parent_snaps.insert(p, snaps.end());
595 if (oldparentseq > new_snap->seq)
596 new_snap->seq = oldparentseq;
597 }
598 new_snap->current_parent_since = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
599 }
600 }
601
602 void CInode::record_snaprealm_parent_dentry(sr_t *new_snap, SnapRealm *oldparent,
603 CDentry *dn, bool primary_dn)
604 {
605 ceph_assert(new_snap->is_parent_global());
606
607 if (!oldparent)
608 oldparent = dn->get_dir()->inode->find_snaprealm();
609 auto& snaps = oldparent->get_snaps();
610
611 if (!primary_dn) {
612 auto p = snaps.lower_bound(dn->first);
613 if (p != snaps.end())
614 new_snap->past_parent_snaps.insert(p, snaps.end());
615 } else {
616 // 'last_destroyed' is used as 'current_parent_since'
617 auto p = snaps.lower_bound(new_snap->last_destroyed);
618 if (p != snaps.end())
619 new_snap->past_parent_snaps.insert(p, snaps.end());
620 new_snap->last_destroyed = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
621 }
622 }
623
624 void CInode::early_pop_projected_snaprealm()
625 {
626 ceph_assert(!projected_nodes.empty());
627 if (projected_nodes.front().snapnode != projected_inode::UNDEF_SRNODE) {
628 pop_projected_snaprealm(projected_nodes.front().snapnode, true);
629 projected_nodes.front().snapnode = projected_inode::UNDEF_SRNODE;
630 --num_projected_srnodes;
631 }
632 }
633
634 void CInode::pop_projected_snaprealm(sr_t *next_snaprealm, bool early)
635 {
636 if (next_snaprealm) {
637 dout(10) << __func__ << (early ? " (early) " : " ")
638 << next_snaprealm << " seq " << next_snaprealm->seq << dendl;
639 if (!snaprealm)
640 open_snaprealm();
641
642 auto old_flags = snaprealm->srnode.flags;
643 snaprealm->srnode = *next_snaprealm;
644 delete next_snaprealm;
645
646 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
647 snaprealm->adjust_parent();
648 }
649
650 if (snaprealm->parent)
651 dout(10) << " realm " << *snaprealm << " parent " << *snaprealm->parent << dendl;
652 } else {
653 dout(10) << __func__ << (early ? " (early) null" : " null") << dendl;
654 ceph_assert(snaprealm);
655 snaprealm->merge_to(NULL);
656 }
657 }
658
659
660 // ====== CInode =======
661
662 // dirfrags
663
664 InodeStoreBase::inode_const_ptr InodeStoreBase::empty_inode = InodeStoreBase::allocate_inode();
665
666 __u32 InodeStoreBase::hash_dentry_name(std::string_view dn)
667 {
668 int which = inode->dir_layout.dl_dir_hash;
669 if (!which)
670 which = CEPH_STR_HASH_LINUX;
671 ceph_assert(ceph_str_hash_valid(which));
672 return ceph_str_hash(which, dn.data(), dn.length());
673 }
674
675 frag_t InodeStoreBase::pick_dirfrag(std::string_view dn)
676 {
677 if (dirfragtree.empty())
678 return frag_t(); // avoid the string hash if we can.
679
680 __u32 h = hash_dentry_name(dn);
681 return dirfragtree[h];
682 }
683
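// Returns {all, dirs}: the open dirfrags contained in 'fg', plus whether
// those dirfrags (or an open ancestor frag) cover fg completely.
// Worked example (illustrative, not from the source): if the open dirfrags
// are 00* and 01* (bits() == 2) and fg is 0* (bits() == 1), both are
// collected; each contributes 1 << (24 - 2) level-24 slots, their sum equals
// 1 << (24 - 1), so 'all' is true.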
684 std::pair<bool, std::vector<CDir*>> CInode::get_dirfrags_under(frag_t fg)
685 {
686 std::pair<bool, std::vector<CDir*>> result;
687 auto& all = result.first;
688 auto& dirs = result.second;
689 all = false;
690
691 if (auto it = dirfrags.find(fg); it != dirfrags.end()){
692 all = true;
693 dirs.push_back(it->second);
694 return result;
695 }
696
697 int total = 0;
698 for (auto &[_fg, _dir] : dirfrags) {
699 // frag_t::bits() indicates how deep a frag sits in the fragment tree,
700 // e.g. 01* has bits() == 2, so it lives at the second level:
701 // *                      level 0, bits() == 0
702 // 0*  1*                 level 1, bits() == 1
703 // 00* 01* 10* 11*        level 2, bits() == 2
704 // so fragA.bits() > fragB.bits() means fragA is deeper than fragB
705 // (a deeper frag covers a smaller slice of the hash space)
706
707 if (fg.bits() >= _fg.bits()) {
708 if (_fg.contains(fg)) {
709 all = true;
710 return result;
711 }
712 } else {
713 if (fg.contains(_fg)) {
714 dirs.push_back(_dir);
715 // we can compute how many sub-frags a frag splits into:
716 // frag_t(*) splits into 2 frags at level 1 (0*, 1*),
717 // into 2^2 frags at level 2 (00*, 01*, 10*, 11*),
718 // and in general into (1 << (24 - bits())) frags at level 24
719 total += 1 << (24 - _fg.bits());
720 }
721 }
722 }
723
724 // convert every matching frag into its count of level-24 frags so we can check whether the open dirfrags in the cache cover all of fg
725 all = ((1<<(24-fg.bits())) == total);
726 return result;
727 }
728
729 void CInode::verify_dirfrags()
730 {
731 bool bad = false;
732 for (const auto &p : dirfrags) {
733 if (!dirfragtree.is_leaf(p.first)) {
734 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
735 << ": " << *p.second << dendl;
736 bad = true;
737 }
738 }
739 ceph_assert(!bad);
740 }
741
742 void CInode::force_dirfrags()
743 {
744 bool bad = false;
745 for (auto &p : dirfrags) {
746 if (!dirfragtree.is_leaf(p.first)) {
747 dout(0) << "have open dirfrag " << p.first << " but not leaf in " << dirfragtree
748 << ": " << *p.second << dendl;
749 bad = true;
750 }
751 }
752
753 if (bad) {
754 frag_vec_t leaves;
755 dirfragtree.get_leaves(leaves);
756 for (const auto& leaf : leaves) {
757 mdcache->get_force_dirfrag(dirfrag_t(ino(), leaf), true);
758 }
759 }
760
761 verify_dirfrags();
762 }
763
764 CDir *CInode::get_approx_dirfrag(frag_t fg)
765 {
766 CDir *dir = get_dirfrag(fg);
767 if (dir) return dir;
768
769 // find a child?
770 auto&& p = get_dirfrags_under(fg);
771 if (!p.second.empty())
772 return p.second.front();
773
774 // try parents?
775 while (fg.bits() > 0) {
776 fg = fg.parent();
777 dir = get_dirfrag(fg);
778 if (dir) return dir;
779 }
780 return NULL;
781 }
782
783 CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg)
784 {
785 ceph_assert(is_dir());
786
787 // have it?
788 CDir *dir = get_dirfrag(fg);
789 if (!dir) {
790 // create it.
791 ceph_assert(is_auth() || mdcache->mds->is_any_replay());
792 dir = new CDir(this, fg, mdcache, is_auth());
793 add_dirfrag(dir);
794 }
795 return dir;
796 }
797
798 CDir *CInode::add_dirfrag(CDir *dir)
799 {
800 auto em = dirfrags.emplace(std::piecewise_construct, std::forward_as_tuple(dir->dirfrag().frag), std::forward_as_tuple(dir));
801 ceph_assert(em.second);
802
803 if (stickydir_ref > 0) {
804 dir->state_set(CDir::STATE_STICKY);
805 dir->get(CDir::PIN_STICKY);
806 }
807
808 maybe_export_pin();
809
810 return dir;
811 }
812
813 void CInode::close_dirfrag(frag_t fg)
814 {
815 dout(14) << __func__ << " " << fg << dendl;
816 ceph_assert(dirfrags.count(fg));
817
818 CDir *dir = dirfrags[fg];
819 dir->remove_null_dentries();
820
821 // clear dirty flag
822 if (dir->is_dirty())
823 dir->mark_clean();
824
825 if (stickydir_ref > 0) {
826 dir->state_clear(CDir::STATE_STICKY);
827 dir->put(CDir::PIN_STICKY);
828 }
829
830 if (dir->is_subtree_root())
831 num_subtree_roots--;
832
833 // dump any remaining dentries, for debugging purposes
834 for (const auto &p : dir->items)
835 dout(14) << __func__ << " LEFTOVER dn " << *p.second << dendl;
836
837 ceph_assert(dir->get_num_ref() == 0);
838 delete dir;
839 dirfrags.erase(fg);
840 }
841
842 void CInode::close_dirfrags()
843 {
844 while (!dirfrags.empty())
845 close_dirfrag(dirfrags.begin()->first);
846 }
847
848 bool CInode::has_subtree_root_dirfrag(int auth)
849 {
850 if (num_subtree_roots > 0) {
851 if (auth == -1)
852 return true;
853 for (const auto &p : dirfrags) {
854 if (p.second->is_subtree_root() &&
855 p.second->dir_auth.first == auth)
856 return true;
857 }
858 }
859 return false;
860 }
861
862 bool CInode::has_subtree_or_exporting_dirfrag()
863 {
864 if (num_subtree_roots > 0 || num_exporting_dirs > 0)
865 return true;
866 return false;
867 }
868
869 void CInode::get_stickydirs()
870 {
871 if (stickydir_ref == 0) {
872 get(PIN_STICKYDIRS);
873 for (const auto &p : dirfrags) {
874 p.second->state_set(CDir::STATE_STICKY);
875 p.second->get(CDir::PIN_STICKY);
876 }
877 }
878 stickydir_ref++;
879 }
880
881 void CInode::put_stickydirs()
882 {
883 ceph_assert(stickydir_ref > 0);
884 stickydir_ref--;
885 if (stickydir_ref == 0) {
886 put(PIN_STICKYDIRS);
887 for (const auto &p : dirfrags) {
888 p.second->state_clear(CDir::STATE_STICKY);
889 p.second->put(CDir::PIN_STICKY);
890 }
891 }
892 }
893
894
895
896
897
898 // pins
899
900 void CInode::first_get()
901 {
902 // pin my dentry?
903 if (parent)
904 parent->get(CDentry::PIN_INODEPIN);
905 }
906
907 void CInode::last_put()
908 {
909 // unpin my dentry?
910 if (parent)
911 parent->put(CDentry::PIN_INODEPIN);
912 }
913
914 void CInode::_put()
915 {
916 if (get_num_ref() == (int)is_dirty() + (int)is_dirty_parent())
917 mdcache->maybe_eval_stray(this, true);
918 }
919
920 void CInode::add_remote_parent(CDentry *p)
921 {
922 if (remote_parents.empty())
923 get(PIN_REMOTEPARENT);
924 remote_parents.insert(p);
925 }
926 void CInode::remove_remote_parent(CDentry *p)
927 {
928 remote_parents.erase(p);
929 if (remote_parents.empty())
930 put(PIN_REMOTEPARENT);
931 }
932
933
934
935
936 CDir *CInode::get_parent_dir()
937 {
938 if (parent)
939 return parent->dir;
940 return NULL;
941 }
942 CDir *CInode::get_projected_parent_dir()
943 {
944 CDentry *p = get_projected_parent_dn();
945 if (p)
946 return p->dir;
947 return NULL;
948 }
949 CInode *CInode::get_parent_inode()
950 {
951 if (parent)
952 return parent->dir->inode;
953 return NULL;
954 }
955
956 bool CInode::is_ancestor_of(const CInode *other) const
957 {
958 while (other) {
959 if (other == this)
960 return true;
961 const CDentry *pdn = other->get_oldest_parent_dn();
962 if (!pdn) {
963 ceph_assert(other->is_base());
964 break;
965 }
966 other = pdn->get_dir()->get_inode();
967 }
968 return false;
969 }
970
971 bool CInode::is_projected_ancestor_of(const CInode *other) const
972 {
973 while (other) {
974 if (other == this)
975 return true;
976 const CDentry *pdn = other->get_projected_parent_dn();
977 if (!pdn) {
978 ceph_assert(other->is_base());
979 break;
980 }
981 other = pdn->get_dir()->get_inode();
982 }
983 return false;
984 }
985
986 /*
987 * Because a non-directory inode may have multiple links, the use_parent
988 * argument allows selecting which parent to use for path construction. This
989 * argument is only meaningful for the final component (i.e. the first of the
990 * nested calls) because directories cannot have multiple hard links. If
991 * use_parent is NULL and projected is true, the primary parent's projected
992 * inode is used all the way up the path chain. Otherwise the primary parent
993 * stable inode is used.
994 */
995 void CInode::make_path_string(string& s, bool projected, const CDentry *use_parent) const
996 {
997 if (!use_parent) {
998 use_parent = projected ? get_projected_parent_dn() : parent;
999 }
1000
1001 if (use_parent) {
1002 use_parent->make_path_string(s, projected);
1003 } else if (is_root()) {
1004 s = "";
1005 } else if (is_mdsdir()) {
1006 char t[40];
1007 uint64_t eino(ino());
1008 eino -= MDS_INO_MDSDIR_OFFSET;
1009 snprintf(t, sizeof(t), "~mds%" PRId64, eino);
1010 s = t;
1011 } else {
1012 char n[40];
1013 uint64_t eino(ino());
1014 snprintf(n, sizeof(n), "#%" PRIx64, eino);
1015 s += n;
1016 }
1017 }
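// Example outputs (illustrative): the root inode yields "", an MDS dir inode
// yields "~mds<rank>", an inode with no parent dentry in cache yields
// "#<ino-in-hex>", and anything else is built from its (projected) parent
// dentry chain, e.g. "/dir/file".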
1018
1019 void CInode::make_path(filepath& fp, bool projected) const
1020 {
1021 const CDentry *use_parent = projected ? get_projected_parent_dn() : parent;
1022 if (use_parent) {
1023 ceph_assert(!is_base());
1024 use_parent->make_path(fp, projected);
1025 } else {
1026 fp = filepath(ino());
1027 }
1028 }
1029
1030 void CInode::name_stray_dentry(string& dname)
1031 {
1032 char s[20];
1033 snprintf(s, sizeof(s), "%llx", (unsigned long long)ino().val);
1034 dname = s;
1035 }
1036
1037 version_t CInode::pre_dirty()
1038 {
1039 version_t pv;
1040 CDentry* _cdentry = get_projected_parent_dn();
1041 if (_cdentry) {
1042 pv = _cdentry->pre_dirty(get_projected_version());
1043 dout(10) << "pre_dirty " << pv << " (current v " << get_inode()->version << ")" << dendl;
1044 } else {
1045 ceph_assert(is_base());
1046 pv = get_projected_version() + 1;
1047 }
1048 // force update backtrace for old format inode (see mempool_inode::decode)
1049 if (get_inode()->backtrace_version == 0 && !projected_nodes.empty()) {
1050 auto pi = _get_projected_inode();
1051 if (pi->backtrace_version == 0)
1052 pi->update_backtrace(pv);
1053 }
1054 return pv;
1055 }
1056
1057 void CInode::_mark_dirty(LogSegment *ls)
1058 {
1059 if (!state_test(STATE_DIRTY)) {
1060 state_set(STATE_DIRTY);
1061 get(PIN_DIRTY);
1062 ceph_assert(ls);
1063 }
1064
1065 // move myself to this segment's dirty list
1066 if (ls)
1067 ls->dirty_inodes.push_back(&item_dirty);
1068 }
1069
1070 void CInode::mark_dirty(LogSegment *ls) {
1071
1072 dout(10) << __func__ << " " << *this << dendl;
1073
1074 /*
1075 NOTE: I may already be dirty, but this fn _still_ needs to be called so that
1076 the directory is (perhaps newly) dirtied, and so that parent_dir_version is
1077 updated below.
1078 */
1079
1080 // only auth can get dirty. "dirty" async data in replicas is relative to
1081 // filelock state, not the dirty flag.
1082 ceph_assert(is_auth());
1083
1084 // touch my private version
1085 _mark_dirty(ls);
1086
1087 // mark dentry too
1088 if (parent)
1089 parent->mark_dirty(get_version(), ls);
1090 }
1091
1092
1093 void CInode::mark_clean()
1094 {
1095 dout(10) << __func__ << " " << *this << dendl;
1096 if (state_test(STATE_DIRTY)) {
1097 state_clear(STATE_DIRTY);
1098 put(PIN_DIRTY);
1099
1100 // remove myself from ls dirty list
1101 item_dirty.remove_myself();
1102 }
1103 }
1104
1105
1106 // --------------
1107 // per-inode storage
1108 // (currently for root inode only)
1109
1110 struct C_IO_Inode_Stored : public CInodeIOContext {
1111 version_t version;
1112 Context *fin;
1113 C_IO_Inode_Stored(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1114 void finish(int r) override {
1115 in->_stored(r, version, fin);
1116 }
1117 void print(ostream& out) const override {
1118 out << "inode_store(" << in->ino() << ")";
1119 }
1120 };
1121
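// Illustrative example (not from the source): ino 0x1 (the root inode) with
// frag_t() and the ".inode" suffix yields the object name "1.00000000.inode";
// dirfrag objects use the same "<ino-hex>.<frag-hex>" pattern without a
// suffix.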
1122 object_t InodeStoreBase::get_object_name(inodeno_t ino, frag_t fg, std::string_view suffix)
1123 {
1124 char n[60];
1125 snprintf(n, sizeof(n), "%llx.%08llx", (long long unsigned)ino, (long long unsigned)fg);
1126 ceph_assert(strlen(n) + suffix.size() < sizeof n);
1127 strncat(n, suffix.data(), suffix.size());
1128 return object_t(n);
1129 }
1130
1131 void CInode::store(MDSContext *fin)
1132 {
1133 dout(10) << __func__ << " " << get_version() << dendl;
1134 ceph_assert(is_base());
1135
1136 if (snaprealm)
1137 purge_stale_snap_data(snaprealm->get_snaps());
1138
1139 // encode
1140 bufferlist bl;
1141 string magic = CEPH_FS_ONDISK_MAGIC;
1142 using ceph::encode;
1143 encode(magic, bl);
1144 encode_store(bl, mdcache->mds->mdsmap->get_up_features());
1145
1146 // write it.
1147 SnapContext snapc;
1148 ObjectOperation m;
1149 m.write_full(bl);
1150
1151 object_t oid = CInode::get_object_name(ino(), frag_t(), ".inode");
1152 object_locator_t oloc(mdcache->mds->get_metadata_pool());
1153
1154 Context *newfin =
1155 new C_OnFinisher(new C_IO_Inode_Stored(this, get_version(), fin),
1156 mdcache->mds->finisher);
1157 mdcache->mds->objecter->mutate(oid, oloc, m, snapc,
1158 ceph::real_clock::now(), 0,
1159 newfin);
1160 }
1161
1162 void CInode::_stored(int r, version_t v, Context *fin)
1163 {
1164 if (r < 0) {
1165 dout(1) << "store error " << r << " v " << v << " on " << *this << dendl;
1166 mdcache->mds->clog->error() << "failed to store inode " << ino()
1167 << " object: " << cpp_strerror(r);
1168 mdcache->mds->handle_write_error(r);
1169 fin->complete(r);
1170 return;
1171 }
1172
1173 dout(10) << __func__ << " " << v << " on " << *this << dendl;
1174 if (v == get_projected_version())
1175 mark_clean();
1176
1177 fin->complete(0);
1178 }
1179
1180 void CInode::flush(MDSContext *fin)
1181 {
1182 dout(10) << __func__ << " " << *this << dendl;
1183 ceph_assert(is_auth() && can_auth_pin());
1184
1185 MDSGatherBuilder gather(g_ceph_context);
1186
1187 if (is_dirty_parent()) {
1188 store_backtrace(gather.new_sub());
1189 }
1190 if (is_dirty()) {
1191 if (is_base()) {
1192 store(gather.new_sub());
1193 } else {
1194 parent->dir->commit(0, gather.new_sub());
1195 }
1196 }
1197
1198 if (gather.has_subs()) {
1199 gather.set_finisher(fin);
1200 gather.activate();
1201 } else {
1202 fin->complete(0);
1203 }
1204 }
1205
1206 struct C_IO_Inode_Fetched : public CInodeIOContext {
1207 bufferlist bl, bl2;
1208 Context *fin;
1209 C_IO_Inode_Fetched(CInode *i, Context *f) : CInodeIOContext(i), fin(f) {}
1210 void finish(int r) override {
1211 // Ignore 'r', because we fetch from two places, so r is usually CEPHFS_ENOENT
1212 in->_fetched(bl, bl2, fin);
1213 }
1214 void print(ostream& out) const override {
1215 out << "inode_fetch(" << in->ino() << ")";
1216 }
1217 };
1218
1219 void CInode::fetch(MDSContext *fin)
1220 {
1221 dout(10) << __func__ << dendl;
1222
1223 C_IO_Inode_Fetched *c = new C_IO_Inode_Fetched(this, fin);
1224 C_GatherBuilder gather(g_ceph_context, new C_OnFinisher(c, mdcache->mds->finisher));
1225
1226 object_t oid = CInode::get_object_name(ino(), frag_t(), "");
1227 object_locator_t oloc(mdcache->mds->get_metadata_pool());
1228
1229 // Old on-disk format: inode stored in xattr of a dirfrag
1230 ObjectOperation rd;
1231 rd.getxattr("inode", &c->bl, NULL);
1232 mdcache->mds->objecter->read(oid, oloc, rd, CEPH_NOSNAP, (bufferlist*)NULL, 0, gather.new_sub());
1233
1234 // Current on-disk format: inode stored in a .inode object
1235 object_t oid2 = CInode::get_object_name(ino(), frag_t(), ".inode");
1236 mdcache->mds->objecter->read(oid2, oloc, 0, 0, CEPH_NOSNAP, &c->bl2, 0, gather.new_sub());
1237
1238 gather.activate();
1239 }
1240
1241 void CInode::_fetched(bufferlist& bl, bufferlist& bl2, Context *fin)
1242 {
1243 dout(10) << __func__ << " got " << bl.length() << " and " << bl2.length() << dendl;
1244 bufferlist::const_iterator p;
1245 if (bl2.length()) {
1246 p = bl2.cbegin();
1247 } else if (bl.length()) {
1248 p = bl.cbegin();
1249 } else {
1250 derr << "No data while reading inode " << ino() << dendl;
1251 fin->complete(-CEPHFS_ENOENT);
1252 return;
1253 }
1254
1255 using ceph::decode;
1256 // Attempt decode
1257 try {
1258 string magic;
1259 decode(magic, p);
1260 dout(10) << " magic is '" << magic << "' (expecting '"
1261 << CEPH_FS_ONDISK_MAGIC << "')" << dendl;
1262 if (magic != CEPH_FS_ONDISK_MAGIC) {
1263 dout(0) << "on disk magic '" << magic << "' != my magic '" << CEPH_FS_ONDISK_MAGIC
1264 << "'" << dendl;
1265 fin->complete(-CEPHFS_EINVAL);
1266 } else {
1267 decode_store(p);
1268 dout(10) << "_fetched " << *this << dendl;
1269 fin->complete(0);
1270 }
1271 } catch (buffer::error &err) {
1272 derr << "Corrupt inode " << ino() << ": " << err.what() << dendl;
1273 fin->complete(-CEPHFS_EINVAL);
1274 return;
1275 }
1276 }
1277
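// Illustrative example: for a file at /dir1/dir2/file the backtrace records
// its ancestors child-to-root, roughly
//   [(ino(dir2), "file", v), (ino(dir1), "dir2", v), (1, "dir1", v)]
// together with the current data pool and any old pools, so scrub and
// disaster-recovery tools can rebuild the path from the raw object alone.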
1278 void CInode::build_backtrace(int64_t pool, inode_backtrace_t& bt)
1279 {
1280 bt.ino = ino();
1281 bt.ancestors.clear();
1282 bt.pool = pool;
1283
1284 CInode *in = this;
1285 CDentry *pdn = get_parent_dn();
1286 while (pdn) {
1287 CInode *diri = pdn->get_dir()->get_inode();
1288 bt.ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(), in->get_inode()->version));
1289 in = diri;
1290 pdn = in->get_parent_dn();
1291 }
1292 bt.old_pools.reserve(get_inode()->old_pools.size());
1293 for (auto &p : get_inode()->old_pools) {
1294 // don't add our own pool id to old_pools to avoid looping (e.g. setlayout 0, 1, 0)
1295 if (p != pool)
1296 bt.old_pools.push_back(p);
1297 }
1298 }
1299
1300 struct C_IO_Inode_StoredBacktrace : public CInodeIOContext {
1301 version_t version;
1302 Context *fin;
1303 C_IO_Inode_StoredBacktrace(CInode *i, version_t v, Context *f) : CInodeIOContext(i), version(v), fin(f) {}
1304 void finish(int r) override {
1305 in->_stored_backtrace(r, version, fin);
1306 }
1307 void print(ostream& out) const override {
1308 out << "backtrace_store(" << in->ino() << ")";
1309 }
1310 };
1311
1312
1313 void CInode::_commit_ops(int r, C_GatherBuilder &gather_bld,
1314 std::vector<CInodeCommitOperation> &ops_vec,
1315 inode_backtrace_t &bt)
1316 {
1317 dout(10) << __func__ << dendl;
1318
1319 if (r < 0) {
1320 mdcache->mds->handle_write_error_with_lock(r);
1321 return;
1322 }
1323
1324 SnapContext snapc;
1325 object_t oid = get_object_name(ino(), frag_t(), "");
1326
1327 for (auto &op : ops_vec) {
1328 ObjectOperation obj_op;
1329 object_locator_t oloc(op.get_pool());
1330 op.update(obj_op, bt);
1331 mdcache->mds->objecter->mutate(oid, oloc, obj_op, snapc,
1332 ceph::real_clock::now(),
1333 0, gather_bld.new_sub());
1334 }
1335 }
1336
1337 void CInode::_store_backtrace(std::vector<CInodeCommitOperation> &ops_vec,
1338 inode_backtrace_t &bt, int op_prio)
1339 {
1340 dout(10) << __func__ << " on " << *this << dendl;
1341 ceph_assert(is_dirty_parent());
1342
1343 if (op_prio < 0)
1344 op_prio = CEPH_MSG_PRIO_DEFAULT;
1345
1346 auth_pin(this);
1347
1348 const int64_t pool = get_backtrace_pool();
1349 build_backtrace(pool, bt);
1350
1351 ops_vec.emplace_back(op_prio, pool, get_inode()->layout,
1352 mdcache->mds->mdsmap->get_up_features());
1353
1354 if (!state_test(STATE_DIRTYPOOL) || get_inode()->old_pools.empty()) {
1355 dout(20) << __func__ << ": no dirtypool or no old pools" << dendl;
1356 return;
1357 }
1358
1359 // In the case where DIRTYPOOL is set, we update all old pools backtraces
1360 // such that anyone reading them will see the new pool ID in
1361 // inode_backtrace_t::pool and go read everything else from there.
1362 for (const auto &p : get_inode()->old_pools) {
1363 if (p == pool)
1364 continue;
1365
1366 dout(20) << __func__ << ": updating old pool " << p << dendl;
1367
1368 ops_vec.emplace_back(op_prio, p);
1369 }
1370 }
1371
1372 void CInode::store_backtrace(MDSContext *fin, int op_prio)
1373 {
1374 std::vector<CInodeCommitOperation> ops_vec;
1375 inode_backtrace_t bt;
1376 auto version = get_inode()->backtrace_version;
1377
1378 _store_backtrace(ops_vec, bt, op_prio);
1379
1380 C_GatherBuilder gather(g_ceph_context,
1381 new C_OnFinisher(
1382 new C_IO_Inode_StoredBacktrace(this, version, fin),
1383 mdcache->mds->finisher));
1384 _commit_ops(0, gather, ops_vec, bt);
1385 ceph_assert(gather.has_subs());
1386 gather.activate();
1387 }
1388
1389 void CInode::store_backtrace(CInodeCommitOperations &op, int op_prio)
1390 {
1391 op.version = get_inode()->backtrace_version;
1392 op.in = this;
1393
1394 _store_backtrace(op.ops_vec, op.bt, op_prio);
1395 }
1396
1397 void CInode::_stored_backtrace(int r, version_t v, Context *fin)
1398 {
1399 if (r == -CEPHFS_ENOENT) {
1400 const int64_t pool = get_backtrace_pool();
1401 bool exists = mdcache->mds->objecter->with_osdmap(
1402 [pool](const OSDMap &osd_map) {
1403 return osd_map.have_pg_pool(pool);
1404 });
1405
1406 // This CEPHFS_ENOENT is because the pool doesn't exist (the user deleted it
1407 // out from under us), so the backtrace can never be written, so pretend
1408 // to succeed so that the user can proceed to e.g. delete the file.
1409 if (!exists) {
1410 dout(4) << __func__ << " got CEPHFS_ENOENT: a data pool was deleted "
1411 "beneath us!" << dendl;
1412 r = 0;
1413 }
1414 }
1415
1416 if (r < 0) {
1417 dout(1) << "store backtrace error " << r << " v " << v << dendl;
1418 mdcache->mds->clog->error() << "failed to store backtrace on ino "
1419 << ino() << " object"
1420 << ", pool " << get_backtrace_pool()
1421 << ", errno " << r;
1422 mdcache->mds->handle_write_error(r);
1423 if (fin)
1424 fin->complete(r);
1425 return;
1426 }
1427
1428 dout(10) << __func__ << " v " << v << dendl;
1429
1430 auth_unpin(this);
1431 if (v == get_inode()->backtrace_version)
1432 clear_dirty_parent();
1433 if (fin)
1434 fin->complete(0);
1435 }
1436
1437 void CInode::fetch_backtrace(Context *fin, bufferlist *backtrace)
1438 {
1439 mdcache->fetch_backtrace(ino(), get_backtrace_pool(), *backtrace, fin);
1440 }
1441
1442 void CInode::mark_dirty_parent(LogSegment *ls, bool dirty_pool)
1443 {
1444 if (!state_test(STATE_DIRTYPARENT)) {
1445 dout(10) << __func__ << dendl;
1446 state_set(STATE_DIRTYPARENT);
1447 get(PIN_DIRTYPARENT);
1448 ceph_assert(ls);
1449 }
1450 if (dirty_pool)
1451 state_set(STATE_DIRTYPOOL);
1452 if (ls)
1453 ls->dirty_parent_inodes.push_back(&item_dirty_parent);
1454 }
1455
1456 void CInode::clear_dirty_parent()
1457 {
1458 if (state_test(STATE_DIRTYPARENT)) {
1459 dout(10) << __func__ << dendl;
1460 state_clear(STATE_DIRTYPARENT);
1461 state_clear(STATE_DIRTYPOOL);
1462 put(PIN_DIRTYPARENT);
1463 item_dirty_parent.remove_myself();
1464 }
1465 }
1466
1467 void CInode::verify_diri_backtrace(bufferlist &bl, int err)
1468 {
1469 if (is_base() || is_dirty_parent() || !is_auth())
1470 return;
1471
1472 dout(10) << __func__ << dendl;
1473
1474 if (err == 0) {
1475 inode_backtrace_t backtrace;
1476 using ceph::decode;
1477 decode(backtrace, bl);
1478 CDentry *pdn = get_parent_dn();
1479 if (backtrace.ancestors.empty() ||
1480 backtrace.ancestors[0].dname != pdn->get_name() ||
1481 backtrace.ancestors[0].dirino != pdn->get_dir()->ino())
1482 err = -CEPHFS_EINVAL;
1483 }
1484
1485 if (err) {
1486 MDSRank *mds = mdcache->mds;
1487 mds->clog->error() << "bad backtrace on directory inode " << ino();
1488 ceph_assert(!"bad backtrace" == (g_conf()->mds_verify_backtrace > 1));
1489
1490 mark_dirty_parent(mds->mdlog->get_current_segment(), false);
1491 mds->mdlog->flush();
1492 }
1493 }
1494
1495 // ------------------
1496 // parent dir
1497
1498
1499 void InodeStoreBase::encode_xattrs(bufferlist &bl) const {
1500 using ceph::encode;
1501 if (xattrs)
1502 encode(*xattrs, bl);
1503 else
1504 encode((__u32)0, bl);
1505 }
1506
1507 void InodeStoreBase::decode_xattrs(bufferlist::const_iterator &p) {
1508 using ceph::decode;
1509 mempool_xattr_map tmp;
1510 decode_noshare(tmp, p);
1511 if (tmp.empty()) {
1512 reset_xattrs(xattr_map_ptr());
1513 } else {
1514 reset_xattrs(allocate_xattr_map(std::move(tmp)));
1515 }
1516 }
1517
1518 void InodeStoreBase::encode_old_inodes(bufferlist &bl, uint64_t features) const {
1519 using ceph::encode;
1520 if (old_inodes)
1521 encode(*old_inodes, bl, features);
1522 else
1523 encode((__u32)0, bl);
1524 }
1525
1526 void InodeStoreBase::decode_old_inodes(bufferlist::const_iterator &p) {
1527 using ceph::decode;
1528 mempool_old_inode_map tmp;
1529 decode(tmp, p);
1530 if (tmp.empty()) {
1531 reset_old_inodes(old_inode_map_ptr());
1532 } else {
1533 reset_old_inodes(allocate_old_inode_map(std::move(tmp)));
1534 }
1535 }
1536
1537 void InodeStoreBase::encode_bare(bufferlist &bl, uint64_t features,
1538 const bufferlist *snap_blob) const
1539 {
1540 using ceph::encode;
1541 encode(*inode, bl, features);
1542 if (inode->is_symlink())
1543 encode(symlink, bl);
1544 encode(dirfragtree, bl);
1545 encode_xattrs(bl);
1546
1547 if (snap_blob)
1548 encode(*snap_blob, bl);
1549 else
1550 encode(bufferlist(), bl);
1551 encode_old_inodes(bl, features);
1552 encode(oldest_snap, bl);
1553 encode(damage_flags, bl);
1554 }
1555
1556 void InodeStoreBase::encode(bufferlist &bl, uint64_t features,
1557 const bufferlist *snap_blob) const
1558 {
1559 ENCODE_START(6, 4, bl);
1560 encode_bare(bl, features, snap_blob);
1561 ENCODE_FINISH(bl);
1562 }
1563
1564 void CInode::encode_store(bufferlist& bl, uint64_t features)
1565 {
1566 bufferlist snap_blob;
1567 encode_snap_blob(snap_blob);
1568 InodeStoreBase::encode(bl, mdcache->mds->mdsmap->get_up_features(),
1569 &snap_blob);
1570 }
1571
1572 void InodeStoreBase::decode_bare(bufferlist::const_iterator &bl,
1573 bufferlist& snap_blob, __u8 struct_v)
1574 {
1575 using ceph::decode;
1576
1577 auto _inode = allocate_inode();
1578 decode(*_inode, bl);
1579
1580 if (_inode->is_symlink()) {
1581 std::string tmp;
1582 decode(tmp, bl);
1583 symlink = std::string_view(tmp);
1584 }
1585 decode(dirfragtree, bl);
1586 decode_xattrs(bl);
1587 decode(snap_blob, bl);
1588
1589 decode_old_inodes(bl);
1590 if (struct_v == 2 && _inode->is_dir()) {
1591 bool default_layout_exists;
1592 decode(default_layout_exists, bl);
1593 if (default_layout_exists) {
1594 decode(struct_v, bl); // this was a default_file_layout
1595 decode(_inode->layout, bl); // but we only care about the layout portion
1596 }
1597 }
1598
1599 if (struct_v >= 5) {
1600 // InodeStore is embedded in dentries without proper versioning, so
1601 // we consume up to the end of the buffer
1602 if (!bl.end()) {
1603 decode(oldest_snap, bl);
1604 }
1605
1606 if (!bl.end()) {
1607 decode(damage_flags, bl);
1608 }
1609 }
1610
1611 reset_inode(std::move(_inode));
1612 }
1613
1614
1615 void InodeStoreBase::decode(bufferlist::const_iterator &bl, bufferlist& snap_blob)
1616 {
1617 DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
1618 decode_bare(bl, snap_blob, struct_v);
1619 DECODE_FINISH(bl);
1620 }
1621
1622 void CInode::decode_store(bufferlist::const_iterator& bl)
1623 {
1624 bufferlist snap_blob;
1625 InodeStoreBase::decode(bl, snap_blob);
1626 decode_snap_blob(snap_blob);
1627 }
1628
1629 // ------------------
1630 // locking
1631
1632 SimpleLock* CInode::get_lock(int type)
1633 {
1634 switch (type) {
1635 case CEPH_LOCK_IVERSION: return &versionlock;
1636 case CEPH_LOCK_IFILE: return &filelock;
1637 case CEPH_LOCK_IAUTH: return &authlock;
1638 case CEPH_LOCK_ILINK: return &linklock;
1639 case CEPH_LOCK_IDFT: return &dirfragtreelock;
1640 case CEPH_LOCK_IXATTR: return &xattrlock;
1641 case CEPH_LOCK_ISNAP: return &snaplock;
1642 case CEPH_LOCK_INEST: return &nestlock;
1643 case CEPH_LOCK_IFLOCK: return &flocklock;
1644 case CEPH_LOCK_IPOLICY: return &policylock;
1645 }
1646 return 0;
1647 }
1648
1649 void CInode::set_object_info(MDSCacheObjectInfo &info)
1650 {
1651 info.ino = ino();
1652 info.snapid = last;
1653 }
1654
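// The encode_lock_* / decode_lock_* helpers below serialize only the inode
// fields guarded by the corresponding lock (iauth: mode/uid/gid, ilink:
// nlink, ifile: size/times plus per-dirfrag fragstats, inest: rstats, ...),
// so that just that slice of state can be exchanged between the auth MDS and
// replicas instead of the whole inode.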
1655 void CInode::encode_lock_iauth(bufferlist& bl)
1656 {
1657 ENCODE_START(1, 1, bl);
1658 encode(get_inode()->version, bl);
1659 encode(get_inode()->ctime, bl);
1660 encode(get_inode()->mode, bl);
1661 encode(get_inode()->uid, bl);
1662 encode(get_inode()->gid, bl);
1663 ENCODE_FINISH(bl);
1664 }
1665
1666 void CInode::decode_lock_iauth(bufferlist::const_iterator& p)
1667 {
1668 ceph_assert(!is_auth());
1669 auto _inode = allocate_inode(*get_inode());
1670 DECODE_START(1, p);
1671 decode(_inode->version, p);
1672 utime_t tm;
1673 decode(tm, p);
1674 if (_inode->ctime < tm) _inode->ctime = tm;
1675 decode(_inode->mode, p);
1676 decode(_inode->uid, p);
1677 decode(_inode->gid, p);
1678 DECODE_FINISH(p);
1679 reset_inode(std::move(_inode));
1680 }
1681
1682 void CInode::encode_lock_ilink(bufferlist& bl)
1683 {
1684 ENCODE_START(1, 1, bl);
1685 encode(get_inode()->version, bl);
1686 encode(get_inode()->ctime, bl);
1687 encode(get_inode()->nlink, bl);
1688 ENCODE_FINISH(bl);
1689 }
1690
1691 void CInode::decode_lock_ilink(bufferlist::const_iterator& p)
1692 {
1693 ceph_assert(!is_auth());
1694 auto _inode = allocate_inode(*get_inode());
1695 DECODE_START(1, p);
1696 decode(_inode->version, p);
1697 utime_t tm;
1698 decode(tm, p);
1699 if (_inode->ctime < tm) _inode->ctime = tm;
1700 decode(_inode->nlink, p);
1701 DECODE_FINISH(p);
1702 reset_inode(std::move(_inode));
1703 }
1704
1705 void CInode::encode_lock_idft(bufferlist& bl)
1706 {
1707 ENCODE_START(1, 1, bl);
1708 if (is_auth()) {
1709 encode(get_inode()->version, bl);
1710 } else {
1711 // treat flushing as dirty when rejoining cache
1712 bool dirty = dirfragtreelock.is_dirty_or_flushing();
1713 encode(dirty, bl);
1714 }
1715 {
1716 // encode the raw tree
1717 encode(dirfragtree, bl);
1718
1719 // also specify which frags are mine
1720 set<frag_t> myfrags;
1721 auto&& dfls = get_dirfrags();
1722 for (const auto& dir : dfls) {
1723 if (dir->is_auth()) {
1724 frag_t fg = dir->get_frag();
1725 myfrags.insert(fg);
1726 }
1727 }
1728 encode(myfrags, bl);
1729 }
1730 ENCODE_FINISH(bl);
1731 }
1732
1733 void CInode::decode_lock_idft(bufferlist::const_iterator& p)
1734 {
1735 inode_ptr _inode;
1736
1737 DECODE_START(1, p);
1738 if (is_auth()) {
1739 bool replica_dirty;
1740 decode(replica_dirty, p);
1741 if (replica_dirty) {
1742 dout(10) << __func__ << " setting dftlock dirty flag" << dendl;
1743 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1744 }
1745 } else {
1746 _inode = allocate_inode(*get_inode());
1747 decode(_inode->version, p);
1748 }
1749 {
1750 fragtree_t temp;
1751 decode(temp, p);
1752 set<frag_t> authfrags;
1753 decode(authfrags, p);
1754 if (is_auth()) {
1755 // auth. believe replica's auth frags only.
1756 for (auto fg : authfrags) {
1757 if (!dirfragtree.is_leaf(fg)) {
1758 dout(10) << " forcing frag " << fg << " to leaf (split|merge)" << dendl;
1759 dirfragtree.force_to_leaf(g_ceph_context, fg);
1760 dirfragtreelock.mark_dirty(); // ok bc we're auth and caller will handle
1761 }
1762 }
1763 } else {
1764 // replica. take the new tree, BUT make sure any open
1765 // dirfrags remain leaves (they may have split _after_ this
1766 // dft was scattered, or we may still be waiting on the
1767 // notify from the auth)
1768 dirfragtree.swap(temp);
1769 for (const auto &p : dirfrags) {
1770 if (!dirfragtree.is_leaf(p.first)) {
1771 dout(10) << " forcing open dirfrag " << p.first << " to leaf (racing with split|merge)" << dendl;
1772 dirfragtree.force_to_leaf(g_ceph_context, p.first);
1773 }
1774 if (p.second->is_auth())
1775 p.second->state_clear(CDir::STATE_DIRTYDFT);
1776 }
1777 }
1778 if (g_conf()->mds_debug_frag)
1779 verify_dirfrags();
1780 }
1781 DECODE_FINISH(p);
1782
1783 if (_inode)
1784 reset_inode(std::move(_inode));
1785 }
1786
1787 void CInode::encode_lock_ifile(bufferlist& bl)
1788 {
1789 ENCODE_START(1, 1, bl);
1790 if (is_auth()) {
1791 encode(get_inode()->version, bl);
1792 encode(get_inode()->ctime, bl);
1793 encode(get_inode()->mtime, bl);
1794 encode(get_inode()->atime, bl);
1795 encode(get_inode()->time_warp_seq, bl);
1796 if (!is_dir()) {
1797 encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features());
1798 encode(get_inode()->size, bl);
1799 encode(get_inode()->truncate_seq, bl);
1800 encode(get_inode()->truncate_size, bl);
1801 encode(get_inode()->client_ranges, bl);
1802 encode(get_inode()->inline_data, bl);
1803 }
1804 } else {
1805 // treat flushing as dirty when rejoining cache
1806 bool dirty = filelock.is_dirty_or_flushing();
1807 encode(dirty, bl);
1808 }
1809 dout(15) << __func__ << " inode.dirstat is " << get_inode()->dirstat << dendl;
1810 encode(get_inode()->dirstat, bl); // only meaningful if i am auth.
1811 bufferlist tmp;
1812 __u32 n = 0;
1813 for (const auto &p : dirfrags) {
1814 frag_t fg = p.first;
1815 CDir *dir = p.second;
1816 if (is_auth() || dir->is_auth()) {
1817 const auto& pf = dir->get_projected_fnode();
1818 dout(15) << fg << " " << *dir << dendl;
1819 dout(20) << fg << " fragstat " << pf->fragstat << dendl;
1820 dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
1821 encode(fg, tmp);
1822 encode(dir->first, tmp);
1823 encode(pf->fragstat, tmp);
1824 encode(pf->accounted_fragstat, tmp);
1825 n++;
1826 }
1827 }
1828 encode(n, bl);
1829 bl.claim_append(tmp);
1830 ENCODE_FINISH(bl);
1831 }
1832
1833 void CInode::decode_lock_ifile(bufferlist::const_iterator& p)
1834 {
1835 inode_ptr _inode;
1836
1837 DECODE_START(1, p);
1838 if (!is_auth()) {
1839 _inode = allocate_inode(*get_inode());
1840
1841 decode(_inode->version, p);
1842 utime_t tm;
1843 decode(tm, p);
1844 if (_inode->ctime < tm) _inode->ctime = tm;
1845 decode(_inode->mtime, p);
1846 decode(_inode->atime, p);
1847 decode(_inode->time_warp_seq, p);
1848 if (!is_dir()) {
1849 decode(_inode->layout, p);
1850 decode(_inode->size, p);
1851 decode(_inode->truncate_seq, p);
1852 decode(_inode->truncate_size, p);
1853 decode(_inode->client_ranges, p);
1854 decode(_inode->inline_data, p);
1855 }
1856 } else {
1857 bool replica_dirty;
1858 decode(replica_dirty, p);
1859 if (replica_dirty) {
1860 dout(10) << __func__ << " setting filelock dirty flag" << dendl;
1861 filelock.mark_dirty(); // ok bc we're auth and caller will handle
1862 }
1863 }
1864
1865 frag_info_t dirstat;
1866 decode(dirstat, p);
1867 if (!is_auth()) {
1868 dout(10) << " taking inode dirstat " << dirstat << " for " << *this << dendl;
1869 _inode->dirstat = dirstat; // take inode summation if replica
1870 }
1871 __u32 n;
1872 decode(n, p);
1873 dout(10) << " ...got " << n << " fragstats on " << *this << dendl;
1874 while (n--) {
1875 frag_t fg;
1876 snapid_t fgfirst;
1877 frag_info_t fragstat;
1878 frag_info_t accounted_fragstat;
1879 decode(fg, p);
1880 decode(fgfirst, p);
1881 decode(fragstat, p);
1882 decode(accounted_fragstat, p);
1883 dout(10) << fg << " [" << fgfirst << ",head] " << dendl;
1884 dout(10) << fg << " fragstat " << fragstat << dendl;
1885 dout(20) << fg << " accounted_fragstat " << accounted_fragstat << dendl;
1886
1887 CDir *dir = get_dirfrag(fg);
1888 if (is_auth()) {
1889 ceph_assert(dir); // i am auth; i had better have this dir open
1890 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1891 << " on " << *dir << dendl;
1892 dir->first = fgfirst;
1893 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
1894 _fnode->fragstat = fragstat;
1895 _fnode->accounted_fragstat = accounted_fragstat;
1896 dir->reset_fnode(std::move(_fnode));
1897 if (!(fragstat == accounted_fragstat)) {
1898 dout(10) << fg << " setting filelock updated flag" << dendl;
1899 filelock.mark_dirty(); // ok bc we're auth and caller will handle
1900 }
1901 } else {
1902 if (dir && dir->is_auth()) {
1903 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1904 << " on " << *dir << dendl;
1905 dir->first = fgfirst;
1906 const auto& pf = dir->get_projected_fnode();
1907 finish_scatter_update(&filelock, dir,
1908 _inode->dirstat.version, pf->accounted_fragstat.version);
1909 }
1910 }
1911 }
1912 DECODE_FINISH(p);
1913
1914 if (_inode)
1915 reset_inode(std::move(_inode));
1916 }
1917
1918 void CInode::encode_lock_inest(bufferlist& bl)
1919 {
1920 ENCODE_START(1, 1, bl);
1921 if (is_auth()) {
1922 encode(get_inode()->version, bl);
1923 } else {
1924 // treat flushing as dirty when rejoining cache
1925 bool dirty = nestlock.is_dirty_or_flushing();
1926 encode(dirty, bl);
1927 }
1928 dout(15) << __func__ << " inode.rstat is " << get_inode()->rstat << dendl;
1929 encode(get_inode()->rstat, bl); // only meaningful if i am auth.
1930 bufferlist tmp;
1931 __u32 n = 0;
1932 for (const auto &p : dirfrags) {
1933 frag_t fg = p.first;
1934 CDir *dir = p.second;
1935 if (is_auth() || dir->is_auth()) {
1936 const auto& pf = dir->get_projected_fnode();
1937 dout(10) << __func__ << " " << fg << " dir " << *dir << dendl;
1938 dout(10) << __func__ << " " << fg << " rstat " << pf->rstat << dendl;
1939 dout(10) << __func__ << " " << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
1940 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
1941 encode(fg, tmp);
1942 encode(dir->first, tmp);
1943 encode(pf->rstat, tmp);
1944 encode(pf->accounted_rstat, tmp);
1945 encode(dir->dirty_old_rstat, tmp);
1946 n++;
1947 }
1948 }
1949 encode(n, bl);
1950 bl.claim_append(tmp);
1951 ENCODE_FINISH(bl);
1952 }
1953
1954 void CInode::decode_lock_inest(bufferlist::const_iterator& p)
1955 {
1956 inode_ptr _inode;
1957
1958 DECODE_START(1, p);
1959 if (is_auth()) {
1960 bool replica_dirty;
1961 decode(replica_dirty, p);
1962 if (replica_dirty) {
1963 dout(10) << __func__ << " setting nestlock dirty flag" << dendl;
1964 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
1965 }
1966 } else {
1967 _inode = allocate_inode(*get_inode());
1968 decode(_inode->version, p);
1969 }
1970 nest_info_t rstat;
1971 decode(rstat, p);
1972 if (!is_auth()) {
1973 dout(10) << __func__ << " taking inode rstat " << rstat << " for " << *this << dendl;
1974 _inode->rstat = rstat; // take inode summation if replica
1975 }
1976 __u32 n;
1977 decode(n, p);
1978 while (n--) {
1979 frag_t fg;
1980 snapid_t fgfirst;
1981 nest_info_t rstat;
1982 nest_info_t accounted_rstat;
1983 decltype(CDir::dirty_old_rstat) dirty_old_rstat;
1984 decode(fg, p);
1985 decode(fgfirst, p);
1986 decode(rstat, p);
1987 decode(accounted_rstat, p);
1988 decode(dirty_old_rstat, p);
1989 dout(10) << __func__ << " " << fg << " [" << fgfirst << ",head]" << dendl;
1990 dout(10) << __func__ << " " << fg << " rstat " << rstat << dendl;
1991 dout(10) << __func__ << " " << fg << " accounted_rstat " << accounted_rstat << dendl;
1992 dout(10) << __func__ << " " << fg << " dirty_old_rstat " << dirty_old_rstat << dendl;
1993 CDir *dir = get_dirfrag(fg);
1994 if (is_auth()) {
1995 ceph_assert(dir); // i am auth; i had better have this dir open
1996 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
1997 << " on " << *dir << dendl;
1998 dir->first = fgfirst;
1999 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
2000 _fnode->rstat = rstat;
2001 _fnode->accounted_rstat = accounted_rstat;
2002 dir->reset_fnode(std::move(_fnode));
2003 dir->dirty_old_rstat.swap(dirty_old_rstat);
2004 if (!(rstat == accounted_rstat) || !dir->dirty_old_rstat.empty()) {
2005 dout(10) << fg << " setting nestlock updated flag" << dendl;
2006 nestlock.mark_dirty(); // ok bc we're auth and caller will handle
2007 }
2008 } else {
2009 if (dir && dir->is_auth()) {
2010 dout(10) << fg << " first " << dir->first << " -> " << fgfirst
2011 << " on " << *dir << dendl;
2012 dir->first = fgfirst;
2013 const auto& pf = dir->get_projected_fnode();
2014 finish_scatter_update(&nestlock, dir,
2015 _inode->rstat.version, pf->accounted_rstat.version);
2016 }
2017 }
2018 }
2019 DECODE_FINISH(p);
2020
2021 if (_inode)
2022 reset_inode(std::move(_inode));
2023 }
2024
2025 void CInode::encode_lock_ixattr(bufferlist& bl)
2026 {
2027 ENCODE_START(1, 1, bl);
2028 encode(get_inode()->version, bl);
2029 encode(get_inode()->ctime, bl);
2030 encode_xattrs(bl);
2031 ENCODE_FINISH(bl);
2032 }
2033
2034 void CInode::decode_lock_ixattr(bufferlist::const_iterator& p)
2035 {
2036 ceph_assert(!is_auth());
2037 auto _inode = allocate_inode(*get_inode());
2038 DECODE_START(1, p);
2039 decode(_inode->version, p);
2040 utime_t tm;
2041 decode(tm, p);
2042 if (_inode->ctime < tm)
2043 _inode->ctime = tm;
2044 decode_xattrs(p);
2045 DECODE_FINISH(p);
2046 reset_inode(std::move(_inode));
2047 }
2048
2049 void CInode::encode_lock_isnap(bufferlist& bl)
2050 {
2051 ENCODE_START(1, 1, bl);
2052 encode(get_inode()->version, bl);
2053 encode(get_inode()->ctime, bl);
2054 encode_snap(bl);
2055 ENCODE_FINISH(bl);
2056 }
2057
2058 void CInode::decode_lock_isnap(bufferlist::const_iterator& p)
2059 {
2060 ceph_assert(!is_auth());
2061 auto _inode = allocate_inode(*get_inode());
2062 DECODE_START(1, p);
2063 decode(_inode->version, p);
2064 utime_t tm;
2065 decode(tm, p);
2066 if (_inode->ctime < tm) _inode->ctime = tm;
2067 decode_snap(p);
2068 DECODE_FINISH(p);
2069 reset_inode(std::move(_inode));
2070 }
2071
2072 void CInode::encode_lock_iflock(bufferlist& bl)
2073 {
2074 ENCODE_START(1, 1, bl);
2075 encode(get_inode()->version, bl);
2076 _encode_file_locks(bl);
2077 ENCODE_FINISH(bl);
2078 }
2079
2080 void CInode::decode_lock_iflock(bufferlist::const_iterator& p)
2081 {
2082 ceph_assert(!is_auth());
2083 auto _inode = allocate_inode(*get_inode());
2084 DECODE_START(1, p);
2085 decode(_inode->version, p);
2086 _decode_file_locks(p);
2087 DECODE_FINISH(p);
2088 reset_inode(std::move(_inode));
2089 }
2090
2091 void CInode::encode_lock_ipolicy(bufferlist& bl)
2092 {
2093 ENCODE_START(2, 1, bl);
2094 if (is_dir()) {
2095 encode(get_inode()->version, bl);
2096 encode(get_inode()->ctime, bl);
2097 encode(get_inode()->layout, bl, mdcache->mds->mdsmap->get_up_features());
2098 encode(get_inode()->quota, bl);
2099 encode(get_inode()->export_pin, bl);
2100 encode(get_inode()->export_ephemeral_distributed_pin, bl);
2101 encode(get_inode()->export_ephemeral_random_pin, bl);
2102 }
2103 ENCODE_FINISH(bl);
2104 }
2105
2106 void CInode::decode_lock_ipolicy(bufferlist::const_iterator& p)
2107 {
2108 ceph_assert(!is_auth());
2109 auto _inode = allocate_inode(*get_inode());
2110 DECODE_START(1, p);
2111 if (is_dir()) {
2112 decode(_inode->version, p);
2113 utime_t tm;
2114 decode(tm, p);
2115 if (_inode->ctime < tm)
2116 _inode->ctime = tm;
2117 decode(_inode->layout, p);
2118 decode(_inode->quota, p);
2119 decode(_inode->export_pin, p);
2120 if (struct_v >= 2) {
2121 decode(_inode->export_ephemeral_distributed_pin, p);
2122 decode(_inode->export_ephemeral_random_pin, p);
2123 }
2124 }
2125 DECODE_FINISH(p);
2126
2127 bool pin_updated = (get_inode()->export_pin != _inode->export_pin) ||
2128 (get_inode()->export_ephemeral_distributed_pin !=
2129 _inode->export_ephemeral_distributed_pin);
2130 reset_inode(std::move(_inode));
2131 maybe_export_pin(pin_updated);
2132 }
2133
2134 void CInode::encode_lock_state(int type, bufferlist& bl)
2135 {
2136 ENCODE_START(1, 1, bl);
2137 encode(first, bl);
2138 if (!is_base())
2139 encode(parent->first, bl);
2140
2141 switch (type) {
2142 case CEPH_LOCK_IAUTH:
2143 encode_lock_iauth(bl);
2144 break;
2145
2146 case CEPH_LOCK_ILINK:
2147 encode_lock_ilink(bl);
2148 break;
2149
2150 case CEPH_LOCK_IDFT:
2151 encode_lock_idft(bl);
2152 break;
2153
2154 case CEPH_LOCK_IFILE:
2155 encode_lock_ifile(bl);
2156 break;
2157
2158 case CEPH_LOCK_INEST:
2159 encode_lock_inest(bl);
2160 break;
2161
2162 case CEPH_LOCK_IXATTR:
2163 encode_lock_ixattr(bl);
2164 break;
2165
2166 case CEPH_LOCK_ISNAP:
2167 encode_lock_isnap(bl);
2168 break;
2169
2170 case CEPH_LOCK_IFLOCK:
2171 encode_lock_iflock(bl);
2172 break;
2173
2174 case CEPH_LOCK_IPOLICY:
2175 encode_lock_ipolicy(bl);
2176 break;
2177
2178 default:
2179 ceph_abort();
2180 }
2181 ENCODE_FINISH(bl);
2182 }
2183
2184 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2185
2186 void CInode::decode_lock_state(int type, const bufferlist& bl)
2187 {
2188 auto p = bl.cbegin();
2189
2190 DECODE_START(1, p);
2191 utime_t tm;
2192
2193 snapid_t newfirst;
2194 using ceph::decode;
2195 decode(newfirst, p);
2196 if (!is_auth() && newfirst != first) {
2197 dout(10) << __func__ << " first " << first << " -> " << newfirst << dendl;
2198 first = newfirst;
2199 }
2200 if (!is_base()) {
2201 decode(newfirst, p);
2202 if (!parent->is_auth() && newfirst != parent->first) {
2203 dout(10) << __func__ << " parent first " << parent->first << " -> " << newfirst << dendl;
2204 parent->first = newfirst;
2205 }
2206 }
2207
2208 switch (type) {
2209 case CEPH_LOCK_IAUTH:
2210 decode_lock_iauth(p);
2211 break;
2212
2213 case CEPH_LOCK_ILINK:
2214 decode_lock_ilink(p);
2215 break;
2216
2217 case CEPH_LOCK_IDFT:
2218 decode_lock_idft(p);
2219 break;
2220
2221 case CEPH_LOCK_IFILE:
2222 decode_lock_ifile(p);
2223 break;
2224
2225 case CEPH_LOCK_INEST:
2226 decode_lock_inest(p);
2227 break;
2228
2229 case CEPH_LOCK_IXATTR:
2230 decode_lock_ixattr(p);
2231 break;
2232
2233 case CEPH_LOCK_ISNAP:
2234 decode_lock_isnap(p);
2235 break;
2236
2237 case CEPH_LOCK_IFLOCK:
2238 decode_lock_iflock(p);
2239 break;
2240
2241 case CEPH_LOCK_IPOLICY:
2242 decode_lock_ipolicy(p);
2243 break;
2244
2245 default:
2246 ceph_abort();
2247 }
2248 DECODE_FINISH(p);
2249 }
2250
2251
2252 bool CInode::is_dirty_scattered()
2253 {
2254 return
2255 filelock.is_dirty_or_flushing() ||
2256 nestlock.is_dirty_or_flushing() ||
2257 dirfragtreelock.is_dirty_or_flushing();
2258 }
2259
2260 void CInode::clear_scatter_dirty()
2261 {
2262 filelock.remove_dirty();
2263 nestlock.remove_dirty();
2264 dirfragtreelock.remove_dirty();
2265 }
2266
2267 void CInode::clear_dirty_scattered(int type)
2268 {
2269 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2270 ceph_assert(is_dir());
2271 switch (type) {
2272 case CEPH_LOCK_IFILE:
2273 item_dirty_dirfrag_dir.remove_myself();
2274 break;
2275
2276 case CEPH_LOCK_INEST:
2277 item_dirty_dirfrag_nest.remove_myself();
2278 break;
2279
2280 case CEPH_LOCK_IDFT:
2281 item_dirty_dirfrag_dirfragtree.remove_myself();
2282 break;
2283
2284 default:
2285 ceph_abort();
2286 }
2287 }
2288
2289
2290 /*
2291 * when we initially scatter a lock, we need to check if any of the dirfrags
2292 * have out of date accounted_rstat/fragstat. if so, mark the lock stale.
2293 */
2294 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
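// concretely: for each auth dirfrag we compare the inode's dirstat/rstat
// version against the frag's accounted_* version and, if they differ,
// journal a catch-up update via finish_scatter_update() below.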
2295 void CInode::start_scatter(ScatterLock *lock)
2296 {
2297 dout(10) << __func__ << " " << *lock << " on " << *this << dendl;
2298 ceph_assert(is_auth());
2299 const auto& pi = get_projected_inode();
2300
2301 for (const auto &p : dirfrags) {
2302 frag_t fg = p.first;
2303 CDir *dir = p.second;
2304 const auto& pf = dir->get_projected_fnode();
2305 dout(20) << fg << " " << *dir << dendl;
2306
2307 if (!dir->is_auth())
2308 continue;
2309
2310 switch (lock->get_type()) {
2311 case CEPH_LOCK_IFILE:
2312 finish_scatter_update(lock, dir, pi->dirstat.version, pf->accounted_fragstat.version);
2313 break;
2314
2315 case CEPH_LOCK_INEST:
2316 finish_scatter_update(lock, dir, pi->rstat.version, pf->accounted_rstat.version);
2317 break;
2318
2319 case CEPH_LOCK_IDFT:
2320 dir->state_clear(CDir::STATE_DIRTYDFT);
2321 break;
2322 }
2323 }
2324 }
2325
2326
2327 class C_Inode_FragUpdate : public MDSLogContextBase {
2328 protected:
2329 CInode *in;
2330 CDir *dir;
2331 MutationRef mut;
2332 MDSRank *get_mds() override {return in->mdcache->mds;}
2333 void finish(int r) override {
2334 in->_finish_frag_update(dir, mut);
2335 }
2336
2337 public:
2338 C_Inode_FragUpdate(CInode *i, CDir *d, MutationRef& m) : in(i), dir(d), mut(m) {}
2339 };
2340
2341 void CInode::finish_scatter_update(ScatterLock *lock, CDir *dir,
2342 version_t inode_version, version_t dir_accounted_version)
2343 {
2344 frag_t fg = dir->get_frag();
2345 ceph_assert(dir->is_auth());
2346
2347 if (dir->is_frozen()) {
2348 dout(10) << __func__ << " " << fg << " frozen, marking " << *lock << " stale " << *dir << dendl;
2349 } else if (dir->get_version() == 0) {
2350 dout(10) << __func__ << " " << fg << " not loaded, marking " << *lock << " stale " << *dir << dendl;
2351 } else {
2352 if (dir_accounted_version != inode_version) {
2353 dout(10) << __func__ << " " << fg << " journaling accounted scatterstat update v" << inode_version << dendl;
2354
2355 MDLog *mdlog = mdcache->mds->mdlog;
2356 MutationRef mut(new MutationImpl());
2357 mut->ls = mdlog->get_current_segment();
2358
2359 auto pf = dir->project_fnode(mut);
2360
2361 std::string_view ename;
2362 switch (lock->get_type()) {
2363 case CEPH_LOCK_IFILE:
2364 pf->fragstat.version = inode_version;
2365 pf->accounted_fragstat = pf->fragstat;
2366 ename = "lock ifile accounted scatter stat update";
2367 break;
2368 case CEPH_LOCK_INEST:
2369 pf->rstat.version = inode_version;
2370 pf->accounted_rstat = pf->rstat;
2371 ename = "lock inest accounted scatter stat update";
2372
2373 if (!is_auth() && lock->get_state() == LOCK_MIX) {
2374 dout(10) << __func__ << " try to assimilate dirty rstat on "
2375 << *dir << dendl;
2376 dir->assimilate_dirty_rstat_inodes(mut);
2377 }
2378
2379 break;
2380 default:
2381 ceph_abort();
2382 }
2383
2384 EUpdate *le = new EUpdate(mdlog, ename);
2385 mdlog->start_entry(le);
2386 le->metablob.add_dir_context(dir);
2387 le->metablob.add_dir(dir, true);
2388
2389 ceph_assert(!dir->is_frozen());
2390 mut->auth_pin(dir);
2391
2392 if (lock->get_type() == CEPH_LOCK_INEST &&
2393 !is_auth() && lock->get_state() == LOCK_MIX) {
2394 dout(10) << __func__ << " finish assimilating dirty rstat on "
2395 << *dir << dendl;
2396 dir->assimilate_dirty_rstat_inodes_finish(&le->metablob);
2397
2398 if (!(pf->rstat == pf->accounted_rstat)) {
2399 if (!mut->is_wrlocked(&nestlock)) {
2400 mdcache->mds->locker->wrlock_force(&nestlock, mut);
2401 }
2402
2403 mdcache->mds->locker->mark_updated_scatterlock(&nestlock);
2404 mut->ls->dirty_dirfrag_nest.push_back(&item_dirty_dirfrag_nest);
2405 }
2406 }
2407
2408 pf->version = dir->pre_dirty();
2409
2410 mdlog->submit_entry(le, new C_Inode_FragUpdate(this, dir, mut));
2411 } else {
2412 dout(10) << __func__ << " " << fg << " accounted " << *lock
2413 << " scatter stat unchanged at v" << dir_accounted_version << dendl;
2414 }
2415 }
2416 }
2417
2418 void CInode::_finish_frag_update(CDir *dir, MutationRef& mut)
2419 {
2420 dout(10) << __func__ << " on " << *dir << dendl;
2421 mut->apply();
2422 mdcache->mds->locker->drop_locks(mut.get());
2423 mut->cleanup();
2424 }
2425
2426
2427 /*
2428 * when we gather a lock, we need to assimilate dirfrag changes into the inode
2429 * state. it's possible we can't update the dirfrag accounted_rstat/fragstat
2430 * because the frag is auth but frozen, or the replica couldn't update it for the
2431 * same reason. hopefully it will get updated the next time the lock cycles.
2432 *
2433 * we have two dimensions of behavior:
2434 * - we may be (auth and !frozen), and able to update, or not.
2435 * - the frag may be stale, or not.
2436 *
2437 * if the frag is non-stale, we want to assimilate the diff into the
2438 * inode, regardless of whether it's auth or updateable.
2439 *
2440 * if we update the frag, we want to set accounted_fragstat = frag,
2441 * whether we took the diff or the frag was stale and we are making it
2442 * un-stale.
2443 */
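/*
 * a rough sketch of the assimilation below (IFILE case; INEST is analogous
 * with rstat/accounted_rstat): for each dirfrag whose accounted version is
 * current,
 *   inode.dirstat += frag.fragstat - frag.accounted_fragstat;
 * and, if the frag is auth and unfrozen (updateable),
 *   frag.accounted_fragstat = frag.fragstat;
 *   frag.fragstat.version = frag.accounted_fragstat.version = inode.dirstat.version;
 */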
2444 /* for more info on scatterlocks, see comments by Locker::scatter_writebehind */
2445 void CInode::finish_scatter_gather_update(int type, MutationRef& mut)
2446 {
2447 LogChannelRef clog = mdcache->mds->clog;
2448
2449 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2450 ceph_assert(is_auth());
2451
2452 switch (type) {
2453 case CEPH_LOCK_IFILE:
2454 {
2455 fragtree_t tmpdft = dirfragtree;
2456 struct frag_info_t dirstat;
2457 bool dirstat_valid = true;
2458
2459 // adjust summation
2460 ceph_assert(is_auth());
2461 auto pi = _get_projected_inode();
2462
2463 bool touched_mtime = false, touched_chattr = false;
2464 dout(20) << " orig dirstat " << pi->dirstat << dendl;
2465 pi->dirstat.version++;
2466 for (const auto &p : dirfrags) {
2467 frag_t fg = p.first;
2468 CDir *dir = p.second;
2469 dout(20) << fg << " " << *dir << dendl;
2470
2471 bool update;
2472 if (dir->get_version() != 0) {
2473 update = dir->is_auth() && !dir->is_frozen();
2474 } else {
2475 update = false;
2476 dirstat_valid = false;
2477 }
2478
2479 CDir::fnode_const_ptr pf;
2480 if (update) {
2481 mut->auth_pin(dir);
2482 pf = dir->project_fnode(mut);
2483 } else {
2484 pf = dir->get_projected_fnode();
2485 }
2486
2487 if (pf->accounted_fragstat.version == pi->dirstat.version - 1) {
2488 dout(20) << fg << " fragstat " << pf->fragstat << dendl;
2489 dout(20) << fg << " accounted_fragstat " << pf->accounted_fragstat << dendl;
2490 pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
2491 } else {
2492 dout(20) << fg << " skipping STALE accounted_fragstat " << pf->accounted_fragstat << dendl;
2493 }
2494
2495 if (pf->fragstat.nfiles < 0 ||
2496 pf->fragstat.nsubdirs < 0) {
2497 clog->error() << "bad/negative dir size on "
2498 << dir->dirfrag() << " " << pf->fragstat;
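 // !"..." is always false, so this asserts that mds_verify_scatter is
 // disabled; with the option enabled we abort here and the message
 // string is included in the assert output.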
2499 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
2500
2501 auto _pf = const_cast<fnode_t*>(pf.get());
2502 if (pf->fragstat.nfiles < 0)
2503 _pf->fragstat.nfiles = 0;
2504 if (pf->fragstat.nsubdirs < 0)
2505 _pf->fragstat.nsubdirs = 0;
2506 }
2507
2508 if (update) {
2509 auto _pf = const_cast<fnode_t*>(pf.get());
2510 _pf->accounted_fragstat = _pf->fragstat;
2511 _pf->fragstat.version = _pf->accounted_fragstat.version = pi->dirstat.version;
2512 _pf->version = dir->pre_dirty();
2513 dout(10) << fg << " updated accounted_fragstat " << pf->fragstat << " on " << *dir << dendl;
2514 }
2515
2516 tmpdft.force_to_leaf(g_ceph_context, fg);
2517 dirstat.add(pf->fragstat);
2518 }
2519 if (touched_mtime)
2520 pi->mtime = pi->ctime = pi->dirstat.mtime;
2521 if (touched_chattr)
2522 pi->change_attr = pi->dirstat.change_attr;
2523 dout(20) << " final dirstat " << pi->dirstat << dendl;
2524
2525 if (dirstat_valid && !dirstat.same_sums(pi->dirstat)) {
2526 frag_vec_t leaves;
2527 tmpdft.get_leaves_under(frag_t(), leaves);
2528 for (const auto& leaf : leaves) {
2529 if (!dirfrags.count(leaf)) {
2530 dirstat_valid = false;
2531 break;
2532 }
2533 }
2534 if (dirstat_valid) {
2535 if (state_test(CInode::STATE_REPAIRSTATS)) {
2536 dout(20) << " dirstat mismatch, fixing" << dendl;
2537 } else {
2538 clog->error() << "unmatched fragstat on " << ino() << ", inode has "
2539 << pi->dirstat << ", dirfrags have " << dirstat;
2540 ceph_assert(!"unmatched fragstat" == g_conf()->mds_verify_scatter);
2541 }
2542 // trust the dirfrags for now
2543 version_t v = pi->dirstat.version;
2544 if (pi->dirstat.mtime > dirstat.mtime)
2545 dirstat.mtime = pi->dirstat.mtime;
2546 if (pi->dirstat.change_attr > dirstat.change_attr)
2547 dirstat.change_attr = pi->dirstat.change_attr;
2548 pi->dirstat = dirstat;
2549 pi->dirstat.version = v;
2550 }
2551 }
2552
2553 if (pi->dirstat.nfiles < 0 || pi->dirstat.nsubdirs < 0) {
2554 std::string path;
2555 make_path_string(path);
2556 clog->error() << "Inconsistent statistics detected: fragstat on inode "
2557 << ino() << " (" << path << "), inode has " << pi->dirstat;
2558 ceph_assert(!"bad/negative fragstat" == g_conf()->mds_verify_scatter);
2559
2560 if (pi->dirstat.nfiles < 0)
2561 pi->dirstat.nfiles = 0;
2562 if (pi->dirstat.nsubdirs < 0)
2563 pi->dirstat.nsubdirs = 0;
2564 }
2565 }
2566 break;
2567
2568 case CEPH_LOCK_INEST:
2569 {
2570 // adjust summation
2571 ceph_assert(is_auth());
2572
2573 fragtree_t tmpdft = dirfragtree;
2574 nest_info_t rstat;
2575 bool rstat_valid = true;
2576
2577 rstat.rsubdirs = 1;
2578 if (const sr_t *srnode = get_projected_srnode(); srnode)
2579 rstat.rsnaps = srnode->snaps.size();
2580
2581 auto pi = _get_projected_inode();
2582 dout(20) << " orig rstat " << pi->rstat << dendl;
2583 pi->rstat.version++;
2584 for (const auto &p : dirfrags) {
2585 frag_t fg = p.first;
2586 CDir *dir = p.second;
2587 dout(20) << fg << " " << *dir << dendl;
2588
2589 bool update;
2590 if (dir->get_version() != 0) {
2591 update = dir->is_auth() && !dir->is_frozen();
2592 } else {
2593 update = false;
2594 rstat_valid = false;
2595 }
2596
2597 CDir::fnode_const_ptr pf;
2598 if (update) {
2599 mut->auth_pin(dir);
2600 pf = dir->project_fnode(mut);
2601 } else {
2602 pf = dir->get_projected_fnode();
2603 }
2604
2605 if (pf->accounted_rstat.version == pi->rstat.version-1) {
2606 // only pull this frag's dirty rstat inodes into the frag if
2607 // the frag is non-stale and updateable. if it's stale,
2608 // that info will just get thrown out!
2609 if (update)
2610 dir->assimilate_dirty_rstat_inodes(mut);
2611
2612 dout(20) << fg << " rstat " << pf->rstat << dendl;
2613 dout(20) << fg << " accounted_rstat " << pf->accounted_rstat << dendl;
2614 dout(20) << fg << " dirty_old_rstat " << dir->dirty_old_rstat << dendl;
2615 mdcache->project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat,
2616 dir->first, CEPH_NOSNAP, this, true);
2617 for (auto &p : dir->dirty_old_rstat) {
2618 mdcache->project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat,
2619 p.second.first, p.first, this, true);
2620 }
2621 if (update) // dir contents not valid if frozen or non-auth
2622 dir->check_rstats();
2623 } else {
2624 dout(20) << fg << " skipping STALE accounted_rstat " << pf->accounted_rstat << dendl;
2625 }
2626 if (update) {
2627 auto _pf = const_cast<fnode_t*>(pf.get());
2628 _pf->accounted_rstat = pf->rstat;
2629 _pf->rstat.version = _pf->accounted_rstat.version = pi->rstat.version;
2630 _pf->version = dir->pre_dirty();
2631 dir->dirty_old_rstat.clear();
2632 dir->check_rstats();
2633 dout(10) << fg << " updated accounted_rstat " << pf->rstat << " on " << *dir << dendl;
2634 }
2635
2636 tmpdft.force_to_leaf(g_ceph_context, fg);
2637 rstat.add(pf->rstat);
2638 }
2639 dout(20) << " final rstat " << pi->rstat << dendl;
2640
2641 if (rstat_valid && !rstat.same_sums(pi->rstat)) {
2642 frag_vec_t leaves;
2643 tmpdft.get_leaves_under(frag_t(), leaves);
2644 for (const auto& leaf : leaves) {
2645 if (!dirfrags.count(leaf)) {
2646 rstat_valid = false;
2647 break;
2648 }
2649 }
2650 if (rstat_valid) {
2651 if (state_test(CInode::STATE_REPAIRSTATS)) {
2652 dout(20) << " rstat mismatch, fixing" << dendl;
2653 } else {
2654 clog->error() << "inconsistent rstat on inode " << ino()
2655 << ", inode has " << pi->rstat
2656 << ", directory fragments have " << rstat;
2657 ceph_assert(!"unmatched rstat" == g_conf()->mds_verify_scatter);
2658 }
2659 // trust the dirfrag for now
2660 version_t v = pi->rstat.version;
2661 if (pi->rstat.rctime > rstat.rctime)
2662 rstat.rctime = pi->rstat.rctime;
2663 pi->rstat = rstat;
2664 pi->rstat.version = v;
2665 }
2666 }
2667
2668 mdcache->broadcast_quota_to_client(this);
2669 }
2670 break;
2671
2672 case CEPH_LOCK_IDFT:
2673 break;
2674
2675 default:
2676 ceph_abort();
2677 }
2678 }
2679
2680 void CInode::finish_scatter_gather_update_accounted(int type, EMetaBlob *metablob)
2681 {
2682 dout(10) << __func__ << " " << type << " on " << *this << dendl;
2683 ceph_assert(is_auth());
2684
2685 for (const auto &p : dirfrags) {
2686 CDir *dir = p.second;
2687 if (!dir->is_auth() || dir->get_version() == 0 || dir->is_frozen())
2688 continue;
2689
2690 if (type == CEPH_LOCK_IDFT)
2691 continue; // nothing to do.
2692
2693 if (type == CEPH_LOCK_INEST)
2694 dir->assimilate_dirty_rstat_inodes_finish(metablob);
2695
2696 dout(10) << " journaling updated frag accounted_ on " << *dir << dendl;
2697 ceph_assert(dir->is_projected());
2698 metablob->add_dir(dir, true);
2699 }
2700 }
2701
2702 // waiting
2703
2704 bool CInode::is_frozen() const
2705 {
2706 if (is_frozen_inode()) return true;
2707 if (parent && parent->dir->is_frozen()) return true;
2708 return false;
2709 }
2710
2711 bool CInode::is_frozen_dir() const
2712 {
2713 if (parent && parent->dir->is_frozen_dir()) return true;
2714 return false;
2715 }
2716
2717 bool CInode::is_freezing() const
2718 {
2719 if (is_freezing_inode()) return true;
2720 if (parent && parent->dir->is_freezing()) return true;
2721 return false;
2722 }
2723
2724 void CInode::add_dir_waiter(frag_t fg, MDSContext *c)
2725 {
2726 if (waiting_on_dir.empty())
2727 get(PIN_DIRWAITER);
2728 waiting_on_dir[fg].push_back(c);
2729 dout(10) << __func__ << " frag " << fg << " " << c << " on " << *this << dendl;
2730 }
2731
2732 void CInode::take_dir_waiting(frag_t fg, MDSContext::vec& ls)
2733 {
2734 if (waiting_on_dir.empty())
2735 return;
2736
2737 auto it = waiting_on_dir.find(fg);
2738 if (it != waiting_on_dir.end()) {
2739 dout(10) << __func__ << " frag " << fg << " on " << *this << dendl;
2740 auto& waiting = it->second;
2741 ls.insert(ls.end(), waiting.begin(), waiting.end());
2742 waiting_on_dir.erase(it);
2743
2744 if (waiting_on_dir.empty())
2745 put(PIN_DIRWAITER);
2746 }
2747 }
2748
2749 void CInode::add_waiter(uint64_t tag, MDSContext *c)
2750 {
2751 dout(10) << __func__ << " tag " << std::hex << tag << std::dec << " " << c
2752 << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH)
2753 << " !frozen " << !is_frozen_inode()
2754 << " !freezing " << !is_freezing_inode()
2755 << dendl;
2756 // wait on the directory?
2757 // make sure it's not the inode that is explicitly ambiguous|freezing|frozen
2758 if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) ||
2759 ((tag & WAIT_UNFREEZE) &&
2760 !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) {
2761 dout(15) << "passing waiter up tree" << dendl;
2762 parent->dir->add_waiter(tag, c);
2763 return;
2764 }
2765 dout(15) << "taking waiter here" << dendl;
2766 MDSCacheObject::add_waiter(tag, c);
2767 }
2768
2769 void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls)
2770 {
2771 if ((mask & WAIT_DIR) && !waiting_on_dir.empty()) {
2772 // take all dentry waiters
2773 while (!waiting_on_dir.empty()) {
2774 auto it = waiting_on_dir.begin();
2775 dout(10) << __func__ << " dirfrag " << it->first << " on " << *this << dendl;
2776 auto& waiting = it->second;
2777 ls.insert(ls.end(), waiting.begin(), waiting.end());
2778 waiting_on_dir.erase(it);
2779 }
2780 put(PIN_DIRWAITER);
2781 }
2782
2783 // waiting
2784 MDSCacheObject::take_waiting(mask, ls);
2785 }
2786
2787 void CInode::maybe_finish_freeze_inode()
2788 {
2789 CDir *dir = get_parent_dir();
2790 if (auth_pins > auth_pin_freeze_allowance || dir->frozen_inode_suppressed)
2791 return;
2792
2793 dout(10) << "maybe_finish_freeze_inode - frozen" << dendl;
2794 ceph_assert(auth_pins == auth_pin_freeze_allowance);
2795 get(PIN_FROZEN);
2796 put(PIN_FREEZING);
2797 state_clear(STATE_FREEZING);
2798 state_set(STATE_FROZEN);
2799
2800 item_freezing_inode.remove_myself();
2801 dir->num_frozen_inodes++;
2802
2803 finish_waiting(WAIT_FROZEN);
2804 }
2805
2806 bool CInode::freeze_inode(int auth_pin_allowance)
2807 {
2808 CDir *dir = get_parent_dir();
2809 ceph_assert(dir);
2810
2811 ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins
2812 ceph_assert(auth_pins >= auth_pin_allowance);
2813 if (auth_pins == auth_pin_allowance && !dir->frozen_inode_suppressed) {
2814 dout(10) << "freeze_inode - frozen" << dendl;
2815 if (!state_test(STATE_FROZEN)) {
2816 get(PIN_FROZEN);
2817 state_set(STATE_FROZEN);
2818 dir->num_frozen_inodes++;
2819 }
2820 return true;
2821 }
2822
2823 dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl;
2824 auth_pin_freeze_allowance = auth_pin_allowance;
2825 dir->freezing_inodes.push_back(&item_freezing_inode);
2826
2827 get(PIN_FREEZING);
2828 state_set(STATE_FREEZING);
2829
2830 if (!dir->lock_caches_with_auth_pins.empty())
2831 mdcache->mds->locker->invalidate_lock_caches(dir);
2832
2833 const static int lock_types[] = {
2834 CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT,
2835 CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0
2836 };
2837 for (int i = 0; lock_types[i]; ++i) {
2838 auto lock = get_lock(lock_types[i]);
2839 if (lock->is_cached())
2840 mdcache->mds->locker->invalidate_lock_caches(lock);
2841 }
2842 // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed
2843 // and finish freezing the inode
2844 return state_test(STATE_FROZEN);
2845 }
2846
2847 void CInode::unfreeze_inode(MDSContext::vec& finished)
2848 {
2849 dout(10) << __func__ << dendl;
2850 if (state_test(STATE_FREEZING)) {
2851 state_clear(STATE_FREEZING);
2852 put(PIN_FREEZING);
2853 item_freezing_inode.remove_myself();
2854 } else if (state_test(STATE_FROZEN)) {
2855 state_clear(STATE_FROZEN);
2856 put(PIN_FROZEN);
2857 get_parent_dir()->num_frozen_inodes--;
2858 } else
2859 ceph_abort();
2860 take_waiting(WAIT_UNFREEZE, finished);
2861 }
2862
2863 void CInode::unfreeze_inode()
2864 {
2865 MDSContext::vec finished;
2866 unfreeze_inode(finished);
2867 mdcache->mds->queue_waiters(finished);
2868 }
2869
2870 void CInode::freeze_auth_pin()
2871 {
2872 ceph_assert(state_test(CInode::STATE_FROZEN));
2873 state_set(CInode::STATE_FROZENAUTHPIN);
2874 get_parent_dir()->num_frozen_inodes++;
2875 }
2876
2877 void CInode::unfreeze_auth_pin()
2878 {
2879 ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN));
2880 state_clear(CInode::STATE_FROZENAUTHPIN);
2881 get_parent_dir()->num_frozen_inodes--;
2882 if (!state_test(STATE_FREEZING|STATE_FROZEN)) {
2883 MDSContext::vec finished;
2884 take_waiting(WAIT_UNFREEZE, finished);
2885 mdcache->mds->queue_waiters(finished);
2886 }
2887 }
2888
2889 void CInode::clear_ambiguous_auth(MDSContext::vec& finished)
2890 {
2891 ceph_assert(state_test(CInode::STATE_AMBIGUOUSAUTH));
2892 state_clear(CInode::STATE_AMBIGUOUSAUTH);
2893 take_waiting(CInode::WAIT_SINGLEAUTH, finished);
2894 }
2895
2896 void CInode::clear_ambiguous_auth()
2897 {
2898 MDSContext::vec finished;
2899 clear_ambiguous_auth(finished);
2900 mdcache->mds->queue_waiters(finished);
2901 }
2902
2903 // auth_pins
2904 bool CInode::can_auth_pin(int *err_ret) const {
2905 int err;
2906 if (!is_auth()) {
2907 err = ERR_NOT_AUTH;
2908 } else if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) {
2909 err = ERR_EXPORTING_INODE;
2910 } else {
2911 if (parent)
2912 return parent->can_auth_pin(err_ret);
2913 err = 0;
2914 }
2915 if (err && err_ret)
2916 *err_ret = err;
2917 return !err;
2918 }
2919
2920 void CInode::auth_pin(void *by)
2921 {
2922 if (auth_pins == 0)
2923 get(PIN_AUTHPIN);
2924 auth_pins++;
2925
2926 #ifdef MDS_AUTHPIN_SET
2927 auth_pin_set.insert(by);
2928 #endif
2929
2930 dout(10) << "auth_pin by " << by << " on " << *this << " now " << auth_pins << dendl;
2931
2932 if (parent)
2933 parent->adjust_nested_auth_pins(1, this);
2934 }
2935
2936 void CInode::auth_unpin(void *by)
2937 {
2938 auth_pins--;
2939
2940 #ifdef MDS_AUTHPIN_SET
2941 {
2942 auto it = auth_pin_set.find(by);
2943 ceph_assert(it != auth_pin_set.end());
2944 auth_pin_set.erase(it);
2945 }
2946 #endif
2947
2948 if (auth_pins == 0)
2949 put(PIN_AUTHPIN);
2950
2951 dout(10) << "auth_unpin by " << by << " on " << *this << " now " << auth_pins << dendl;
2952
2953 ceph_assert(auth_pins >= 0);
2954
2955 if (parent)
2956 parent->adjust_nested_auth_pins(-1, by);
2957
2958 if (is_freezing_inode())
2959 maybe_finish_freeze_inode();
2960 }
2961
2962 // authority
2963
2964 mds_authority_t CInode::authority() const
2965 {
2966 if (inode_auth.first >= 0)
2967 return inode_auth;
2968
2969 if (parent)
2970 return parent->dir->authority();
2971
2972 // new items that are not yet linked in (in the committed plane) belong
2973 // to their first parent.
2974 if (!projected_parent.empty())
2975 return projected_parent.front()->dir->authority();
2976
2977 return CDIR_AUTH_UNDEF;
2978 }
2979
2980
2981 // SNAP
2982
2983 snapid_t CInode::get_oldest_snap()
2984 {
2985 snapid_t t = first;
2986 if (is_any_old_inodes())
2987 t = get_old_inodes()->begin()->second.first;
2988 return std::min(t, oldest_snap);
2989 }
2990
2991 const CInode::mempool_old_inode& CInode::cow_old_inode(snapid_t follows, bool cow_head)
2992 {
2993 ceph_assert(follows >= first);
2994
2995 const auto& pi = cow_head ? get_projected_inode() : get_previous_projected_inode();
2996 const auto& px = cow_head ? get_projected_xattrs() : get_previous_projected_xattrs();
2997
2998 auto _old_inodes = allocate_old_inode_map();
2999 if (old_inodes)
3000 *_old_inodes = *old_inodes;
3001
3002 mempool_old_inode &old = (*_old_inodes)[follows];
3003 old.first = first;
3004 old.inode = *pi;
3005 if (px) {
3006 dout(10) << " " << px->size() << " xattrs cowed, " << *px << dendl;
3007 old.xattrs = *px;
3008 }
3009
3010 if (first < oldest_snap)
3011 oldest_snap = first;
3012
3013 old.inode.trim_client_ranges(follows);
3014
3015 if (g_conf()->mds_snap_rstat &&
3016 !(old.inode.rstat == old.inode.accounted_rstat))
3017 dirty_old_rstats.insert(follows);
3018
3019 first = follows+1;
3020
3021 dout(10) << __func__ << " " << (cow_head ? "head" : "previous_head" )
3022 << " to [" << old.first << "," << follows << "] on "
3023 << *this << dendl;
3024
3025 reset_old_inodes(std::move(_old_inodes));
3026 return old;
3027 }
3028
3029 void CInode::pre_cow_old_inode()
3030 {
3031 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
3032 if (first <= follows)
3033 cow_old_inode(follows, true);
3034 }
3035
3036 bool CInode::has_snap_data(snapid_t snapid)
3037 {
3038 bool found = snapid >= first && snapid <= last;
3039 if (!found && is_any_old_inodes()) {
3040 auto p = old_inodes->lower_bound(snapid);
3041 if (p != old_inodes->end()) {
3042 if (p->second.first > snapid) {
3043 if (p != old_inodes->begin())
3044 --p;
3045 }
3046 if (p->second.first <= snapid && snapid <= p->first) {
3047 found = true;
3048 }
3049 }
3050 }
3051 return found;
3052 }
3053
3054 void CInode::purge_stale_snap_data(const set<snapid_t>& snaps)
3055 {
3056 dout(10) << __func__ << " " << snaps << dendl;
3057
3058 if (!get_old_inodes())
3059 return;
3060
3061 std::vector<snapid_t> to_remove;
3062 for (auto p : *get_old_inodes()) {
3063 const snapid_t &id = p.first;
3064 const auto &s = snaps.lower_bound(p.second.first);
3065 if (s == snaps.end() || *s > id) {
3066 dout(10) << " purging old_inode [" << p.second.first << "," << id << "]" << dendl;
3067 to_remove.push_back(id);
3068 }
3069 }
3070
3071 if (to_remove.size() == get_old_inodes()->size()) {
3072 reset_old_inodes(old_inode_map_ptr());
3073 } else if (!to_remove.empty()) {
3074 auto _old_inodes = allocate_old_inode_map(*get_old_inodes());
3075 for (auto id : to_remove)
3076 _old_inodes->erase(id);
3077 reset_old_inodes(std::move(_old_inodes));
3078 }
3079 }
3080
3081 /*
3082 * pick/create an old_inode
3083 */
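// old_inodes is keyed by the last snapid each entry covers, so lower_bound(snap)
// finds the candidate range [entry.second.first, key] that may contain snap.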
3084 snapid_t CInode::pick_old_inode(snapid_t snap) const
3085 {
3086 if (is_any_old_inodes()) {
3087 auto it = old_inodes->lower_bound(snap); // it is the first key >= snap
3088 if (it != old_inodes->end() && it->second.first <= snap) {
3089 dout(10) << __func__ << " snap " << snap << " -> [" << it->second.first << "," << it->first << "]" << dendl;
3090 return it->first;
3091 }
3092 }
3093 dout(10) << __func__ << " snap " << snap << " -> nothing" << dendl;
3094 return 0;
3095 }
3096
3097 void CInode::open_snaprealm(bool nosplit)
3098 {
3099 if (!snaprealm) {
3100 SnapRealm *parent = find_snaprealm();
3101 snaprealm = new SnapRealm(mdcache, this);
3102 if (parent) {
3103 dout(10) << __func__ << " " << snaprealm
3104 << " parent is " << parent
3105 << dendl;
3106 dout(30) << " siblings are " << parent->open_children << dendl;
3107 snaprealm->parent = parent;
3108 if (!nosplit)
3109 parent->split_at(snaprealm);
3110 parent->open_children.insert(snaprealm);
3111 }
3112 }
3113 }
3114 void CInode::close_snaprealm(bool nojoin)
3115 {
3116 if (snaprealm) {
3117 dout(15) << __func__ << " " << *snaprealm << dendl;
3118 if (snaprealm->parent) {
3119 snaprealm->parent->open_children.erase(snaprealm);
3120 //if (!nojoin)
3121 //snaprealm->parent->join(snaprealm);
3122 }
3123 delete snaprealm;
3124 snaprealm = 0;
3125 }
3126 }
3127
3128 SnapRealm *CInode::find_snaprealm() const
3129 {
3130 const CInode *cur = this;
3131 while (!cur->snaprealm) {
3132 const CDentry *pdn = cur->get_oldest_parent_dn();
3133 if (!pdn)
3134 break;
3135 cur = pdn->get_dir()->get_inode();
3136 }
3137 return cur->snaprealm;
3138 }
3139
3140 void CInode::encode_snap_blob(bufferlist &snapbl)
3141 {
3142 if (snaprealm) {
3143 using ceph::encode;
3144 encode(snaprealm->srnode, snapbl);
3145 dout(20) << __func__ << " " << *snaprealm << dendl;
3146 }
3147 }
3148 void CInode::decode_snap_blob(const bufferlist& snapbl)
3149 {
3150 using ceph::decode;
3151 if (snapbl.length()) {
3152 open_snaprealm();
3153 auto old_flags = snaprealm->srnode.flags;
3154 auto p = snapbl.cbegin();
3155 decode(snaprealm->srnode, p);
3156 if (!is_base()) {
3157 if ((snaprealm->srnode.flags ^ old_flags) & sr_t::PARENT_GLOBAL) {
3158 snaprealm->adjust_parent();
3159 }
3160 }
3161 dout(20) << __func__ << " " << *snaprealm << dendl;
3162 } else if (snaprealm &&
3163 !is_root() && !is_mdsdir()) { // see https://tracker.ceph.com/issues/42675
3164 ceph_assert(mdcache->mds->is_any_replay());
3165 snaprealm->merge_to(NULL);
3166 }
3167 }
3168
3169 void CInode::encode_snap(bufferlist& bl)
3170 {
3171 ENCODE_START(1, 1, bl);
3172 bufferlist snapbl;
3173 encode_snap_blob(snapbl);
3174 encode(snapbl, bl);
3175 encode(oldest_snap, bl);
3176 ENCODE_FINISH(bl);
3177 }
3178
3179 void CInode::decode_snap(bufferlist::const_iterator& p)
3180 {
3181 DECODE_START(1, p);
3182 bufferlist snapbl;
3183 decode(snapbl, p);
3184 decode(oldest_snap, p);
3185 decode_snap_blob(snapbl);
3186 DECODE_FINISH(p);
3187 }
3188
3189 // =============================================
3190
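// a single "loner" client may exist only when exactly one non-stale client
// cap wants write/read caps (or, for a dir, when there is no subtree or
// exporting dirfrag); otherwise we return -1 and no loner is chosen.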
3191 client_t CInode::calc_ideal_loner()
3192 {
3193 if (mdcache->is_readonly())
3194 return -1;
3195 if (!get_mds_caps_wanted().empty())
3196 return -1;
3197
3198 int n = 0;
3199 client_t loner = -1;
3200 for (const auto &p : client_caps) {
3201 if (!p.second.is_stale() &&
3202 (is_dir() ?
3203 !has_subtree_or_exporting_dirfrag() :
3204 (p.second.wanted() & (CEPH_CAP_ANY_WR|CEPH_CAP_FILE_RD)))) {
3205 if (n)
3206 return -1;
3207 n++;
3208 loner = p.first;
3209 }
3210 }
3211 return loner;
3212 }
3213
3214 bool CInode::choose_ideal_loner()
3215 {
3216 want_loner_cap = calc_ideal_loner();
3217 int changed = false;
3218 if (loner_cap >= 0 && loner_cap != want_loner_cap) {
3219 if (!try_drop_loner())
3220 return false;
3221 changed = true;
3222 }
3223
3224 if (want_loner_cap >= 0) {
3225 if (loner_cap < 0) {
3226 set_loner_cap(want_loner_cap);
3227 changed = true;
3228 } else
3229 ceph_assert(loner_cap == want_loner_cap);
3230 }
3231 return changed;
3232 }
3233
3234 bool CInode::try_set_loner()
3235 {
3236 ceph_assert(want_loner_cap >= 0);
3237 if (loner_cap >= 0 && loner_cap != want_loner_cap)
3238 return false;
3239 set_loner_cap(want_loner_cap);
3240 return true;
3241 }
3242
3243 void CInode::set_loner_cap(client_t l)
3244 {
3245 loner_cap = l;
3246 authlock.set_excl_client(loner_cap);
3247 filelock.set_excl_client(loner_cap);
3248 linklock.set_excl_client(loner_cap);
3249 xattrlock.set_excl_client(loner_cap);
3250 }
3251
3252 bool CInode::try_drop_loner()
3253 {
3254 if (loner_cap < 0)
3255 return true;
3256
3257 int other_allowed = get_caps_allowed_by_type(CAP_ANY);
3258 Capability *cap = get_client_cap(loner_cap);
3259 if (!cap ||
3260 (cap->issued() & ~other_allowed) == 0) {
3261 set_loner_cap(-1);
3262 return true;
3263 }
3264 return false;
3265 }
3266
3267
3268 // choose new lock state during recovery, based on issued caps
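// e.g. for the filelock: an issued Fx or Fb forces LOCK_EXCL; a bare Fw
// (no Fc/Fs) maps to LOCK_MIX; otherwise we fall back to SYNC, or to
// LOCK/MIX if the scatterlock is dirty.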
3269 void CInode::choose_lock_state(SimpleLock *lock, int allissued)
3270 {
3271 int shift = lock->get_cap_shift();
3272 int issued = (allissued >> shift) & lock->get_cap_mask();
3273 if (is_auth()) {
3274 if (lock->is_xlocked()) {
3275 // do nothing here
3276 } else if (lock->get_state() != LOCK_MIX) {
3277 if (issued & (CEPH_CAP_GEXCL | CEPH_CAP_GBUFFER))
3278 lock->set_state(LOCK_EXCL);
3279 else if (issued & CEPH_CAP_GWR) {
3280 if (issued & (CEPH_CAP_GCACHE | CEPH_CAP_GSHARED))
3281 lock->set_state(LOCK_EXCL);
3282 else
3283 lock->set_state(LOCK_MIX);
3284 } else if (lock->is_dirty()) {
3285 if (is_replicated())
3286 lock->set_state(LOCK_MIX);
3287 else
3288 lock->set_state(LOCK_LOCK);
3289 } else
3290 lock->set_state(LOCK_SYNC);
3291 }
3292 } else {
3293 // our states have already been chosen during rejoin.
3294 if (lock->is_xlocked())
3295 ceph_assert(lock->get_state() == LOCK_LOCK);
3296 }
3297 }
3298
3299 void CInode::choose_lock_states(int dirty_caps)
3300 {
3301 int issued = get_caps_issued() | dirty_caps;
3302 if (is_auth() && (issued & (CEPH_CAP_ANY_EXCL|CEPH_CAP_ANY_WR)))
3303 choose_ideal_loner();
3304 choose_lock_state(&filelock, issued);
3305 choose_lock_state(&nestlock, issued);
3306 choose_lock_state(&dirfragtreelock, issued);
3307 choose_lock_state(&authlock, issued);
3308 choose_lock_state(&xattrlock, issued);
3309 choose_lock_state(&linklock, issued);
3310 }
3311
3312 int CInode::count_nonstale_caps()
3313 {
3314 int n = 0;
3315 for (const auto &p : client_caps) {
3316 if (!p.second.is_stale())
3317 n++;
3318 }
3319 return n;
3320 }
3321
3322 bool CInode::multiple_nonstale_caps()
3323 {
3324 int n = 0;
3325 for (const auto &p : client_caps) {
3326 if (!p.second.is_stale()) {
3327 if (n)
3328 return true;
3329 n++;
3330 }
3331 }
3332 return false;
3333 }
3334
3335 void CInode::set_mds_caps_wanted(mempool::mds_co::compact_map<int32_t,int32_t>& m)
3336 {
3337 bool old_empty = mds_caps_wanted.empty();
3338 mds_caps_wanted.swap(m);
3339 if (old_empty != (bool)mds_caps_wanted.empty()) {
3340 if (old_empty)
3341 adjust_num_caps_notable(1);
3342 else
3343 adjust_num_caps_notable(-1);
3344 }
3345 }
3346
3347 void CInode::set_mds_caps_wanted(mds_rank_t mds, int32_t wanted)
3348 {
3349 bool old_empty = mds_caps_wanted.empty();
3350 if (wanted) {
3351 mds_caps_wanted[mds] = wanted;
3352 if (old_empty)
3353 adjust_num_caps_notable(1);
3354 } else if (!old_empty) {
3355 mds_caps_wanted.erase(mds);
3356 if (mds_caps_wanted.empty())
3357 adjust_num_caps_notable(-1);
3358 }
3359 }
3360
3361 Capability *CInode::add_client_cap(client_t client, Session *session,
3362 SnapRealm *conrealm, bool new_inode)
3363 {
3364 ceph_assert(last == CEPH_NOSNAP);
3365 if (client_caps.empty()) {
3366 get(PIN_CAPS);
3367 if (conrealm)
3368 containing_realm = conrealm;
3369 else
3370 containing_realm = find_snaprealm();
3371 containing_realm->inodes_with_caps.push_back(&item_caps);
3372 dout(10) << __func__ << " first cap, joining realm " << *containing_realm << dendl;
3373
3374 mdcache->num_inodes_with_caps++;
3375 if (parent)
3376 parent->dir->adjust_num_inodes_with_caps(1);
3377 }
3378
3379 uint64_t cap_id = new_inode ? 1 : ++mdcache->last_cap_id;
3380 auto ret = client_caps.emplace(std::piecewise_construct, std::forward_as_tuple(client),
3381 std::forward_as_tuple(this, session, cap_id));
3382 ceph_assert(ret.second == true);
3383 Capability *cap = &ret.first->second;
3384
3385 cap->client_follows = first-1;
3386 containing_realm->add_cap(client, cap);
3387
3388 return cap;
3389 }
3390
3391 void CInode::remove_client_cap(client_t client)
3392 {
3393 auto it = client_caps.find(client);
3394 ceph_assert(it != client_caps.end());
3395 Capability *cap = &it->second;
3396
3397 cap->item_session_caps.remove_myself();
3398 cap->item_revoking_caps.remove_myself();
3399 cap->item_client_revoking_caps.remove_myself();
3400 containing_realm->remove_cap(client, cap);
3401
3402 if (client == loner_cap)
3403 loner_cap = -1;
3404
3405 if (cap->is_wanted_notable())
3406 adjust_num_caps_notable(-1);
3407
3408 client_caps.erase(it);
3409 if (client_caps.empty()) {
3410 dout(10) << __func__ << " last cap, leaving realm " << *containing_realm << dendl;
3411 put(PIN_CAPS);
3412 item_caps.remove_myself();
3413 containing_realm = NULL;
3414 mdcache->num_inodes_with_caps--;
3415 if (parent)
3416 parent->dir->adjust_num_inodes_with_caps(-1);
3417 }
3418
3419 //clean up advisory locks
3420 bool fcntl_removed = fcntl_locks ? fcntl_locks->remove_all_from(client) : false;
3421 bool flock_removed = flock_locks ? flock_locks->remove_all_from(client) : false;
3422 if (fcntl_removed || flock_removed) {
3423 MDSContext::vec waiters;
3424 take_waiting(CInode::WAIT_FLOCK, waiters);
3425 mdcache->mds->queue_waiters(waiters);
3426 }
3427 }
3428
3429 void CInode::move_to_realm(SnapRealm *realm)
3430 {
3431 dout(10) << __func__ << " joining realm " << *realm
3432 << ", leaving realm " << *containing_realm << dendl;
3433 for (auto& p : client_caps) {
3434 containing_realm->remove_cap(p.first, &p.second);
3435 realm->add_cap(p.first, &p.second);
3436 }
3437 item_caps.remove_myself();
3438 realm->inodes_with_caps.push_back(&item_caps);
3439 containing_realm = realm;
3440 }
3441
3442 Capability *CInode::reconnect_cap(client_t client, const cap_reconnect_t& icr, Session *session)
3443 {
3444 Capability *cap = get_client_cap(client);
3445 if (cap) {
3446 // FIXME?
3447 cap->merge(icr.capinfo.wanted, icr.capinfo.issued);
3448 } else {
3449 cap = add_client_cap(client, session);
3450 cap->set_cap_id(icr.capinfo.cap_id);
3451 cap->set_wanted(icr.capinfo.wanted);
3452 cap->issue_norevoke(icr.capinfo.issued);
3453 cap->reset_seq();
3454 }
3455 cap->set_last_issue_stamp(ceph_clock_now());
3456 return cap;
3457 }
3458
3459 void CInode::clear_client_caps_after_export()
3460 {
3461 while (!client_caps.empty())
3462 remove_client_cap(client_caps.begin()->first);
3463 loner_cap = -1;
3464 want_loner_cap = -1;
3465 if (!get_mds_caps_wanted().empty()) {
3466 mempool::mds_co::compact_map<int32_t,int32_t> empty;
3467 set_mds_caps_wanted(empty);
3468 }
3469 }
3470
3471 void CInode::export_client_caps(map<client_t,Capability::Export>& cl)
3472 {
3473 for (const auto &p : client_caps) {
3474 cl[p.first] = p.second.make_export();
3475 }
3476 }
3477
3478 // caps allowed
3479 int CInode::get_caps_liked() const
3480 {
3481 if (is_dir())
3482 return CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED; // but not, say, FILE_RD|WR|WRBUFFER
3483 else
3484 return CEPH_CAP_ANY & ~CEPH_CAP_FILE_LAZYIO;
3485 }
3486
3487 int CInode::get_caps_allowed_ever() const
3488 {
3489 int allowed;
3490 if (is_dir())
3491 allowed = CEPH_CAP_PIN | CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_SHARED;
3492 else
3493 allowed = CEPH_CAP_ANY;
3494 return allowed &
3495 (CEPH_CAP_PIN |
3496 (filelock.gcaps_allowed_ever() << filelock.get_cap_shift()) |
3497 (authlock.gcaps_allowed_ever() << authlock.get_cap_shift()) |
3498 (xattrlock.gcaps_allowed_ever() << xattrlock.get_cap_shift()) |
3499 (linklock.gcaps_allowed_ever() << linklock.get_cap_shift()));
3500 }
3501
3502 int CInode::get_caps_allowed_by_type(int type) const
3503 {
3504 return
3505 CEPH_CAP_PIN |
3506 (filelock.gcaps_allowed(type) << filelock.get_cap_shift()) |
3507 (authlock.gcaps_allowed(type) << authlock.get_cap_shift()) |
3508 (xattrlock.gcaps_allowed(type) << xattrlock.get_cap_shift()) |
3509 (linklock.gcaps_allowed(type) << linklock.get_cap_shift());
3510 }
3511
3512 int CInode::get_caps_careful() const
3513 {
3514 return
3515 (filelock.gcaps_careful() << filelock.get_cap_shift()) |
3516 (authlock.gcaps_careful() << authlock.get_cap_shift()) |
3517 (xattrlock.gcaps_careful() << xattrlock.get_cap_shift()) |
3518 (linklock.gcaps_careful() << linklock.get_cap_shift());
3519 }
3520
3521 int CInode::get_xlocker_mask(client_t client) const
3522 {
3523 return
3524 (filelock.gcaps_xlocker_mask(client) << filelock.get_cap_shift()) |
3525 (authlock.gcaps_xlocker_mask(client) << authlock.get_cap_shift()) |
3526 (xattrlock.gcaps_xlocker_mask(client) << xattrlock.get_cap_shift()) |
3527 (linklock.gcaps_xlocker_mask(client) << linklock.get_cap_shift());
3528 }
3529
3530 int CInode::get_caps_allowed_for_client(Session *session, Capability *cap,
3531 const mempool_inode *file_i) const
3532 {
3533 client_t client = session->get_client();
3534 int allowed;
3535 if (client == get_loner()) {
3536 // as the loner, we get the loner_caps AND any xlocker_caps for things we have xlocked
3537 allowed =
3538 get_caps_allowed_by_type(CAP_LONER) |
3539 (get_caps_allowed_by_type(CAP_XLOCKER) & get_xlocker_mask(client));
3540 } else {
3541 allowed = get_caps_allowed_by_type(CAP_ANY);
3542 }
3543
3544 if (is_dir()) {
3545 allowed &= ~CEPH_CAP_ANY_DIR_OPS;
3546 if (cap && (allowed & CEPH_CAP_FILE_EXCL))
3547 allowed |= cap->get_lock_cache_allowed();
3548 } else {
3549 if (file_i->inline_data.version == CEPH_INLINE_NONE &&
3550 file_i->layout.pool_ns.empty()) {
3551 // noop
3552 } else if (cap) {
3553 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3554 cap->is_noinline()) ||
3555 (!file_i->layout.pool_ns.empty() &&
3556 cap->is_nopoolns()))
3557 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3558 } else {
3559 auto& conn = session->get_connection();
3560 if ((file_i->inline_data.version != CEPH_INLINE_NONE &&
3561 !conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) ||
3562 (!file_i->layout.pool_ns.empty() &&
3563 !conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)))
3564 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
3565 }
3566 }
3567 return allowed;
3568 }
3569
3570 // caps issued, wanted
3571 int CInode::get_caps_issued(int *ploner, int *pother, int *pxlocker,
3572 int shift, int mask)
3573 {
3574 int c = 0;
3575 int loner = 0, other = 0, xlocker = 0;
3576 if (!is_auth()) {
3577 loner_cap = -1;
3578 }
3579
3580 for (const auto &p : client_caps) {
3581 int i = p.second.issued();
3582 c |= i;
3583 if (p.first == loner_cap)
3584 loner |= i;
3585 else
3586 other |= i;
3587 xlocker |= get_xlocker_mask(p.first) & i;
3588 }
3589 if (ploner) *ploner = (loner >> shift) & mask;
3590 if (pother) *pother = (other >> shift) & mask;
3591 if (pxlocker) *pxlocker = (xlocker >> shift) & mask;
3592 return (c >> shift) & mask;
3593 }
3594
3595 bool CInode::is_any_caps_wanted() const
3596 {
3597 for (const auto &p : client_caps) {
3598 if (p.second.wanted())
3599 return true;
3600 }
3601 return false;
3602 }
3603
3604 int CInode::get_caps_wanted(int *ploner, int *pother, int shift, int mask) const
3605 {
3606 int w = 0;
3607 int loner = 0, other = 0;
3608 for (const auto &p : client_caps) {
3609 if (!p.second.is_stale()) {
3610 int t = p.second.wanted();
3611 w |= t;
3612 if (p.first == loner_cap)
3613 loner |= t;
3614 else
3615 other |= t;
3616 }
3617 //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
3618 }
3619 if (is_auth())
3620 for (const auto &p : mds_caps_wanted) {
3621 w |= p.second;
3622 other |= p.second;
3623 //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
3624 }
3625 if (ploner) *ploner = (loner >> shift) & mask;
3626 if (pother) *pother = (other >> shift) & mask;
3627 return (w >> shift) & mask;
3628 }
3629
3630 bool CInode::issued_caps_need_gather(SimpleLock *lock)
3631 {
3632 int loner_issued, other_issued, xlocker_issued;
3633 get_caps_issued(&loner_issued, &other_issued, &xlocker_issued,
3634 lock->get_cap_shift(), lock->get_cap_mask());
3635 if ((loner_issued & ~lock->gcaps_allowed(CAP_LONER)) ||
3636 (other_issued & ~lock->gcaps_allowed(CAP_ANY)) ||
3637 (xlocker_issued & ~lock->gcaps_allowed(CAP_XLOCKER)))
3638 return true;
3639 return false;
3640 }
3641
3642 void CInode::adjust_num_caps_notable(int d)
3643 {
3644 if (!is_clientwriteable()) {
3645 if (!num_caps_notable && d > 0)
3646 mdcache->open_file_table.add_inode(this);
3647 else if (num_caps_notable > 0 && num_caps_notable == -d)
3648 mdcache->open_file_table.remove_inode(this);
3649 }
3650
3651 num_caps_notable += d;
3652 ceph_assert(num_caps_notable >= 0);
3653 }
3654
3655 void CInode::mark_clientwriteable()
3656 {
3657 if (last != CEPH_NOSNAP)
3658 return;
3659 if (!state_test(STATE_CLIENTWRITEABLE)) {
3660 if (num_caps_notable == 0)
3661 mdcache->open_file_table.add_inode(this);
3662 state_set(STATE_CLIENTWRITEABLE);
3663 }
3664 }
3665
3666 void CInode::clear_clientwriteable()
3667 {
3668 if (state_test(STATE_CLIENTWRITEABLE)) {
3669 if (num_caps_notable == 0)
3670 mdcache->open_file_table.remove_inode(this);
3671 state_clear(STATE_CLIENTWRITEABLE);
3672 }
3673 }
3674
3675 // =============================================
3676
3677 int CInode::encode_inodestat(bufferlist& bl, Session *session,
3678 SnapRealm *dir_realm,
3679 snapid_t snapid,
3680 unsigned max_bytes,
3681 int getattr_caps)
3682 {
3683 client_t client = session->get_client();
3684 ceph_assert(snapid);
3685
3686 bool valid = true;
3687
3688 // pick a version!
3689 const mempool_inode *oi = get_inode().get();
3690 const mempool_inode *pi = get_projected_inode().get();
3691
3692 const mempool_xattr_map *pxattrs = nullptr;
3693
3694 if (snapid != CEPH_NOSNAP) {
3695
3696 // for now at least, old_inodes is only defined/valid on the auth
3697 if (!is_auth())
3698 valid = false;
3699
3700 if (is_any_old_inodes()) {
3701 auto it = old_inodes->lower_bound(snapid);
3702 if (it != old_inodes->end()) {
3703 if (it->second.first > snapid) {
3704 if (it != old_inodes->begin())
3705 --it;
3706 }
3707 if (it->second.first <= snapid && snapid <= it->first) {
3708 dout(15) << __func__ << " snapid " << snapid
3709 << " to old_inode [" << it->second.first << "," << it->first << "]"
3710 << " " << it->second.inode.rstat
3711 << dendl;
3712 pi = oi = &it->second.inode;
3713 pxattrs = &it->second.xattrs;
3714 } else {
3715 // a snapshotted remote dentry can result in this
3716 dout(0) << __func__ << " old_inode for snapid " << snapid
3717 << " not found" << dendl;
3718 }
3719 }
3720 } else if (snapid < first || snapid > last) {
3721 // a snapshotted remote dentry can result in this
3722 dout(0) << __func__ << " [" << first << "," << last << "]"
3723 << " not match snapid " << snapid << dendl;
3724 }
3725 }
3726
3727 utime_t snap_btime;
3728 std::map<std::string, std::string> snap_metadata;
3729 SnapRealm *realm = find_snaprealm();
3730 if (snapid != CEPH_NOSNAP && realm) {
3731 // add snapshot timestamp vxattr
3732 map<snapid_t,const SnapInfo*> infomap;
3733 realm->get_snap_info(infomap,
3734 snapid, // min
3735 snapid); // max
3736 if (!infomap.empty()) {
3737 ceph_assert(infomap.size() == 1);
3738 const SnapInfo *si = infomap.begin()->second;
3739 snap_btime = si->stamp;
3740 snap_metadata = si->metadata;
3741 }
3742 }
3743
3744
3745 bool no_caps = !valid ||
3746 session->is_stale() ||
3747 (dir_realm && realm != dir_realm) ||
3748 is_frozen() ||
3749 state_test(CInode::STATE_EXPORTINGCAPS);
3750 if (no_caps)
3751 dout(20) << __func__ << " no caps"
3752 << (!valid?", !valid":"")
3753 << (session->is_stale()?", session stale ":"")
3754 << ((dir_realm && realm != dir_realm)?", snaprealm differs ":"")
3755 << (is_frozen()?", frozen inode":"")
3756 << (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"")
3757 << dendl;
3758
3759
3760 // "fake" a version that is old (stable) version, +1 if projected.
3761 version_t version = (oi->version * 2) + is_projected();
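// e.g. a stable version of 5 encodes as 10, or 11 if a projected update is pending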
3762
3763 Capability *cap = get_client_cap(client);
3764 bool pfile = filelock.is_xlocked_by_client(client) || get_loner() == client;
3765 //(cap && (cap->issued() & CEPH_CAP_FILE_EXCL));
3766 bool pauth = authlock.is_xlocked_by_client(client) || get_loner() == client;
3767 bool plink = linklock.is_xlocked_by_client(client) || get_loner() == client;
3768 bool pxattr = xattrlock.is_xlocked_by_client(client) || get_loner() == client;
3769
3770 bool plocal = versionlock.get_last_wrlock_client() == client;
3771 bool ppolicy = policylock.is_xlocked_by_client(client) || get_loner()==client;
3772
3773 const mempool_inode *any_i = (pfile|pauth|plink|pxattr|plocal) ? pi : oi;
3774
3775 dout(20) << " pfile " << pfile << " pauth " << pauth
3776 << " plink " << plink << " pxattr " << pxattr
3777 << " plocal " << plocal
3778 << " ctime " << any_i->ctime
3779 << " valid=" << valid << dendl;
3780
3781 // file
3782 const mempool_inode *file_i = pfile ? pi:oi;
3783 file_layout_t layout;
3784 if (is_dir()) {
3785 layout = (ppolicy ? pi : oi)->layout;
3786 } else {
3787 layout = file_i->layout;
3788 }
3789
3790 // max_size is min of projected, actual
3791 uint64_t max_size =
3792 std::min(oi->get_client_range(client),
3793 pi->get_client_range(client));
3794
3795 // inline data
3796 version_t inline_version = 0;
3797 bufferlist inline_data;
3798 if (file_i->inline_data.version == CEPH_INLINE_NONE) {
3799 inline_version = CEPH_INLINE_NONE;
3800 } else if ((!cap && !no_caps) ||
3801 (cap && cap->client_inline_version < file_i->inline_data.version) ||
3802 (getattr_caps & CEPH_CAP_FILE_RD)) { // client requests inline data
3803 inline_version = file_i->inline_data.version;
3804 if (file_i->inline_data.length() > 0)
3805 file_i->inline_data.get_data(inline_data);
3806 }
3807
3808 // nest (do same as file... :/)
3809 if (cap) {
3810 cap->last_rbytes = file_i->rstat.rbytes;
3811 cap->last_rsize = file_i->rstat.rsize();
3812 }
3813
3814 // auth
3815 const mempool_inode *auth_i = pauth ? pi:oi;
3816
3817 // link
3818 const mempool_inode *link_i = plink ? pi:oi;
3819
3820 // xattr
3821 const mempool_inode *xattr_i = pxattr ? pi:oi;
3822
3823 using ceph::encode;
3824 // xattr
3825 version_t xattr_version;
3826 if ((!cap && !no_caps) ||
3827 (cap && cap->client_xattr_version < xattr_i->xattr_version) ||
3828 (getattr_caps & CEPH_CAP_XATTR_SHARED)) { // client requests xattrs
3829 if (!pxattrs)
3830 pxattrs = pxattr ? get_projected_xattrs().get() : get_xattrs().get();
3831 xattr_version = xattr_i->xattr_version;
3832 } else {
3833 xattr_version = 0;
3834 }
3835
3836 // do we have room?
3837 if (max_bytes) {
3838 unsigned bytes =
3839 8 + 8 + 4 + 8 + 8 + sizeof(ceph_mds_reply_cap) +
3840 sizeof(struct ceph_file_layout) +
3841 sizeof(struct ceph_timespec) * 3 + 4 + // ctime ~ time_warp_seq
3842 8 + 8 + 8 + 4 + 4 + 4 + 4 + 4 + // size ~ nlink
3843 8 + 8 + 8 + 8 + 8 + sizeof(struct ceph_timespec) + // dirstat.nfiles ~ rstat.rctime
3844 sizeof(__u32) + sizeof(__u32) * 2 * dirfragtree._splits.size() + // dirfragtree
3845 sizeof(__u32) + symlink.length() + // symlink
3846 sizeof(struct ceph_dir_layout); // dir_layout
3847
3848 if (xattr_version) {
3849 bytes += sizeof(__u32) + sizeof(__u32); // xattr buffer len + number entries
3850 if (pxattrs) {
3851 for (const auto &p : *pxattrs)
3852 bytes += sizeof(__u32) * 2 + p.first.length() + p.second.length();
3853 }
3854 } else {
3855 bytes += sizeof(__u32); // xattr buffer len
3856 }
3857 bytes +=
3858 sizeof(version_t) + sizeof(__u32) + inline_data.length() + // inline data
3859 1 + 1 + 8 + 8 + 4 + // quota
3860 4 + layout.pool_ns.size() + // pool ns
3861 sizeof(struct ceph_timespec) + 8; // btime + change_attr
3862
3863 if (bytes > max_bytes)
3864 return -CEPHFS_ENOSPC;
3865 }
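// The byte count above is only an estimate of the encoded InodeStat size
// (the fixed-width head fields plus the variable xattr, inline-data and
// pool-namespace parts). Callers that pass max_bytes (e.g. size-bounded
// readdir replies) use the -CEPHFS_ENOSPC return to stop packing entries.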
3866
3867
3868 // encode caps
3869 struct ceph_mds_reply_cap ecap;
3870 if (snapid != CEPH_NOSNAP) {
3871 /*
3872 * snapped inodes (files or dirs) only get read-only caps. always
3873 * issue everything possible, since it is read only.
3874 *
3875 * if a snapped inode has caps, limit issued caps based on the
3876 * lock state.
3877 *
3878 * if it is a live inode, limit issued caps based on the lock
3879 * state.
3880 *
3881 * do NOT adjust cap issued state, because the client always
3882 * tracks caps per-snap and the mds does either per-interval or
3883 * multiversion.
3884 */
3885 ecap.caps = valid ? get_caps_allowed_by_type(CAP_ANY) : CEPH_STAT_CAP_INODE;
3886 if (last == CEPH_NOSNAP || is_any_caps())
3887 ecap.caps = ecap.caps & get_caps_allowed_for_client(session, nullptr, file_i);
3888 ecap.seq = 0;
3889 ecap.mseq = 0;
3890 ecap.realm = 0;
3891 } else {
3892 if (!no_caps && !cap) {
3893 // add a new cap
3894 cap = add_client_cap(client, session, realm);
3895 if (is_auth())
3896 choose_ideal_loner();
3897 }
3898
3899 int issue = 0;
3900 if (!no_caps && cap) {
3901 int likes = get_caps_liked();
3902 int allowed = get_caps_allowed_for_client(session, cap, file_i);
3903 issue = (cap->wanted() | likes) & allowed;
3904 cap->issue_norevoke(issue, true);
3905 issue = cap->pending();
3906 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3907 << " seq " << cap->get_last_seq() << dendl;
3908 } else if (cap && cap->is_new() && !dir_realm) {
3909 // always issue new caps to the client, otherwise the caps get lost
3910 ceph_assert(cap->is_stale());
3911 ceph_assert(!cap->pending());
3912 issue = CEPH_CAP_PIN;
3913 cap->issue_norevoke(issue, true);
3914 dout(10) << "encode_inodestat issuing " << ccap_string(issue)
3915 << " seq " << cap->get_last_seq()
3916 << "(stale&new caps)" << dendl;
3917 }
3918
3919 if (issue) {
3920 cap->set_last_issue();
3921 cap->set_last_issue_stamp(ceph_clock_now());
3922 ecap.caps = issue;
3923 ecap.wanted = cap->wanted();
3924 ecap.cap_id = cap->get_cap_id();
3925 ecap.seq = cap->get_last_seq();
3926 ecap.mseq = cap->get_mseq();
3927 ecap.realm = realm->inode->ino();
3928 } else {
3929 ecap.cap_id = 0;
3930 ecap.caps = 0;
3931 ecap.seq = 0;
3932 ecap.mseq = 0;
3933 ecap.realm = 0;
3934 ecap.wanted = 0;
3935 }
3936 }
3937 ecap.flags = is_auth() ? CEPH_CAP_FLAG_AUTH : 0;
3938 dout(10) << "encode_inodestat caps " << ccap_string(ecap.caps)
3939 << " seq " << ecap.seq << " mseq " << ecap.mseq
3940 << " xattrv " << xattr_version << dendl;
3941
3942 if (inline_data.length() && cap) {
3943 if ((cap->pending() | getattr_caps) & CEPH_CAP_FILE_SHARED) {
3944 dout(10) << "including inline version " << inline_version << dendl;
3945 cap->client_inline_version = inline_version;
3946 } else {
3947 dout(10) << "dropping inline version " << inline_version << dendl;
3948 inline_version = 0;
3949 inline_data.clear();
3950 }
3951 }
3952
3953 // include those xattrs?
3954 if (xattr_version && cap) {
3955 if ((cap->pending() | getattr_caps) & CEPH_CAP_XATTR_SHARED) {
3956 dout(10) << "including xattrs version " << xattr_version << dendl;
3957 cap->client_xattr_version = xattr_version;
3958 } else {
3959 dout(10) << "dropping xattrs version " << xattr_version << dendl;
3960 xattr_version = 0;
3961 }
3962 }
3963
3964 // The end result of encode_xattrs() is equivalent to:
3965 // {
3966 // bufferlist xbl;
3967 // if (xattr_version) {
3968 // if (pxattrs)
3969 // encode(*pxattrs, bl);
3970 // else
3971 // encode((__u32)0, bl);
3972 // }
3973 // encode(xbl, bl);
3974 // }
3975 //
3976 // But encoding xattrs into the 'xbl' requires a memory allocation.
3977 // The 'bl' should have enough pre-allocated memory in most cases.
3978 // Encoding xattrs directly into it can avoid the extra allocation.
3979 auto encode_xattrs = [xattr_version, pxattrs, &bl]() {
3980 using ceph::encode;
3981 if (xattr_version) {
3982 ceph_le32 xbl_len;
3983 auto filler = bl.append_hole(sizeof(xbl_len));
3984 const auto starting_bl_len = bl.length();
3985 if (pxattrs)
3986 encode(*pxattrs, bl);
3987 else
3988 encode((__u32)0, bl);
3989 xbl_len = bl.length() - starting_bl_len;
3990 filler.copy_in(sizeof(xbl_len), (char *)&xbl_len);
3991 } else {
3992 encode((__u32)0, bl);
3993 }
3994 };
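// append_hole() reserves room for the 32-bit length word up front; after
// the xattr map has been encoded in place, copy_in() back-fills that word
// with the number of bytes actually written, so the wire format matches
// what encoding via a temporary bufferlist would have produced.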
3995
3996 /*
3997 * note: encoding matches MClientReply::InodeStat
3998 */
3999 if (session->info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
4000 ENCODE_START(6, 1, bl);
4001 encode(oi->ino, bl);
4002 encode(snapid, bl);
4003 encode(oi->rdev, bl);
4004 encode(version, bl);
4005 encode(xattr_version, bl);
4006 encode(ecap, bl);
4007 {
4008 ceph_file_layout legacy_layout;
4009 layout.to_legacy(&legacy_layout);
4010 encode(legacy_layout, bl);
4011 }
4012 encode(any_i->ctime, bl);
4013 encode(file_i->mtime, bl);
4014 encode(file_i->atime, bl);
4015 encode(file_i->time_warp_seq, bl);
4016 encode(file_i->size, bl);
4017 encode(max_size, bl);
4018 encode(file_i->truncate_size, bl);
4019 encode(file_i->truncate_seq, bl);
4020 encode(auth_i->mode, bl);
4021 encode((uint32_t)auth_i->uid, bl);
4022 encode((uint32_t)auth_i->gid, bl);
4023 encode(link_i->nlink, bl);
4024 encode(file_i->dirstat.nfiles, bl);
4025 encode(file_i->dirstat.nsubdirs, bl);
4026 encode(file_i->rstat.rbytes, bl);
4027 encode(file_i->rstat.rfiles, bl);
4028 encode(file_i->rstat.rsubdirs, bl);
4029 encode(file_i->rstat.rctime, bl);
4030 dirfragtree.encode(bl);
4031 encode(symlink, bl);
4032 encode(file_i->dir_layout, bl);
4033 encode_xattrs();
4034 encode(inline_version, bl);
4035 encode(inline_data, bl);
4036 const mempool_inode *policy_i = ppolicy ? pi : oi;
4037 encode(policy_i->quota, bl);
4038 encode(layout.pool_ns, bl);
4039 encode(any_i->btime, bl);
4040 encode(any_i->change_attr, bl);
4041 encode(file_i->export_pin, bl);
4042 encode(snap_btime, bl);
4043 encode(file_i->rstat.rsnaps, bl);
4044 encode(snap_metadata, bl);
4045 encode(file_i->fscrypt, bl);
4046 ENCODE_FINISH(bl);
4047 }
4048 else {
4049 ceph_assert(session->get_connection());
4050
4051 encode(oi->ino, bl);
4052 encode(snapid, bl);
4053 encode(oi->rdev, bl);
4054 encode(version, bl);
4055 encode(xattr_version, bl);
4056 encode(ecap, bl);
4057 {
4058 ceph_file_layout legacy_layout;
4059 layout.to_legacy(&legacy_layout);
4060 encode(legacy_layout, bl);
4061 }
4062 encode(any_i->ctime, bl);
4063 encode(file_i->mtime, bl);
4064 encode(file_i->atime, bl);
4065 encode(file_i->time_warp_seq, bl);
4066 encode(file_i->size, bl);
4067 encode(max_size, bl);
4068 encode(file_i->truncate_size, bl);
4069 encode(file_i->truncate_seq, bl);
4070 encode(auth_i->mode, bl);
4071 encode((uint32_t)auth_i->uid, bl);
4072 encode((uint32_t)auth_i->gid, bl);
4073 encode(link_i->nlink, bl);
4074 encode(file_i->dirstat.nfiles, bl);
4075 encode(file_i->dirstat.nsubdirs, bl);
4076 encode(file_i->rstat.rbytes, bl);
4077 encode(file_i->rstat.rfiles, bl);
4078 encode(file_i->rstat.rsubdirs, bl);
4079 encode(file_i->rstat.rctime, bl);
4080 dirfragtree.encode(bl);
4081 encode(symlink, bl);
4082 auto& conn = session->get_connection();
4083 if (conn->has_feature(CEPH_FEATURE_DIRLAYOUTHASH)) {
4084 encode(file_i->dir_layout, bl);
4085 }
4086 encode_xattrs();
4087 if (conn->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4088 encode(inline_version, bl);
4089 encode(inline_data, bl);
4090 }
4091 if (conn->has_feature(CEPH_FEATURE_MDS_QUOTA)) {
4092 const mempool_inode *policy_i = ppolicy ? pi : oi;
4093 encode(policy_i->quota, bl);
4094 }
4095 if (conn->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2)) {
4096 encode(layout.pool_ns, bl);
4097 }
4098 if (conn->has_feature(CEPH_FEATURE_FS_BTIME)) {
4099 encode(any_i->btime, bl);
4100 encode(any_i->change_attr, bl);
4101 }
4102 }
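// Both branches emit the same logical InodeStat. New fields are appended
// only inside the versioned ENCODE_START(6, 1, ...) branch above; the
// legacy branch must stay field-for-field compatible with clients lacking
// CEPHFS_FEATURE_REPLY_ENCODING, so optional trailers are gated on
// individual connection feature bits instead.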
4103
4104 return valid;
4105 }
4106
4107 void CInode::encode_cap_message(const ref_t<MClientCaps> &m, Capability *cap)
4108 {
4109 ceph_assert(cap);
4110
4111 client_t client = cap->get_client();
4112
4113 bool pfile = filelock.is_xlocked_by_client(client) || (cap->issued() & CEPH_CAP_FILE_EXCL);
4114 bool pauth = authlock.is_xlocked_by_client(client);
4115 bool plink = linklock.is_xlocked_by_client(client);
4116 bool pxattr = xattrlock.is_xlocked_by_client(client);
4117
4118 const mempool_inode *oi = get_inode().get();
4119 const mempool_inode *pi = get_projected_inode().get();
4120 const mempool_inode *i = (pfile|pauth|plink|pxattr) ? pi : oi;
4121
4122 dout(20) << __func__ << " pfile " << pfile
4123 << " pauth " << pauth << " plink " << plink << " pxattr " << pxattr
4124 << " ctime " << i->ctime << dendl;
4125
4126 i = pfile ? pi:oi;
4127 m->set_layout(i->layout);
4128 m->size = i->size;
4129 m->truncate_seq = i->truncate_seq;
4130 m->truncate_size = i->truncate_size;
4131 m->mtime = i->mtime;
4132 m->atime = i->atime;
4133 m->ctime = i->ctime;
4134 m->change_attr = i->change_attr;
4135 m->time_warp_seq = i->time_warp_seq;
4136 m->nfiles = i->dirstat.nfiles;
4137 m->nsubdirs = i->dirstat.nsubdirs;
4138
4139 if (cap->client_inline_version < i->inline_data.version) {
4140 m->inline_version = cap->client_inline_version = i->inline_data.version;
4141 if (i->inline_data.length() > 0)
4142 i->inline_data.get_data(m->inline_data);
4143 } else {
4144 m->inline_version = 0;
4145 }
4146
4147 // max_size is min of projected, actual.
4148 uint64_t oldms = oi->get_client_range(client);
4149 uint64_t newms = pi->get_client_range(client);
4150 m->max_size = std::min(oldms, newms);
4151
4152 i = pauth ? pi:oi;
4153 m->head.mode = i->mode;
4154 m->head.uid = i->uid;
4155 m->head.gid = i->gid;
4156
4157 i = plink ? pi:oi;
4158 m->head.nlink = i->nlink;
4159
4160 using ceph::encode;
4161 i = pxattr ? pi:oi;
4162 const auto& ix = pxattr ? get_projected_xattrs() : get_xattrs();
4163 if ((cap->pending() & CEPH_CAP_XATTR_SHARED) &&
4164 i->xattr_version > cap->client_xattr_version) {
4165 dout(10) << " including xattrs v " << i->xattr_version << dendl;
4166 if (ix)
4167 encode(*ix, m->xattrbl);
4168 else
4169 encode((__u32)0, m->xattrbl);
4170 m->head.xattr_version = i->xattr_version;
4171 cap->client_xattr_version = i->xattr_version;
4172 }
4173 }
4174
4175
4176
4177 void CInode::_encode_base(bufferlist& bl, uint64_t features)
4178 {
4179 ENCODE_START(1, 1, bl);
4180 encode(first, bl);
4181 encode(*get_inode(), bl, features);
4182 encode(symlink, bl);
4183 encode(dirfragtree, bl);
4184 encode_xattrs(bl);
4185 encode_old_inodes(bl, features);
4186 encode(damage_flags, bl);
4187 encode_snap(bl);
4188 ENCODE_FINISH(bl);
4189 }
4190 void CInode::_decode_base(bufferlist::const_iterator& p)
4191 {
4192 DECODE_START(1, p);
4193 decode(first, p);
4194 {
4195 auto _inode = allocate_inode();
4196 decode(*_inode, p);
4197 reset_inode(std::move(_inode));
4198 }
4199 {
4200 std::string tmp;
4201 decode(tmp, p);
4202 symlink = std::string_view(tmp);
4203 }
4204 decode(dirfragtree, p);
4205 decode_xattrs(p);
4206 decode_old_inodes(p);
4207 decode(damage_flags, p);
4208 decode_snap(p);
4209 DECODE_FINISH(p);
4210 }
4211
4212 void CInode::_encode_locks_full(bufferlist& bl)
4213 {
4214 using ceph::encode;
4215 encode(authlock, bl);
4216 encode(linklock, bl);
4217 encode(dirfragtreelock, bl);
4218 encode(filelock, bl);
4219 encode(xattrlock, bl);
4220 encode(snaplock, bl);
4221 encode(nestlock, bl);
4222 encode(flocklock, bl);
4223 encode(policylock, bl);
4224
4225 encode(loner_cap, bl);
4226 }
4227 void CInode::_decode_locks_full(bufferlist::const_iterator& p)
4228 {
4229 using ceph::decode;
4230 decode(authlock, p);
4231 decode(linklock, p);
4232 decode(dirfragtreelock, p);
4233 decode(filelock, p);
4234 decode(xattrlock, p);
4235 decode(snaplock, p);
4236 decode(nestlock, p);
4237 decode(flocklock, p);
4238 decode(policylock, p);
4239
4240 decode(loner_cap, p);
4241 set_loner_cap(loner_cap);
4242 want_loner_cap = loner_cap; // for now, we'll eval() shortly.
4243 }
4244
4245 void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
4246 {
4247 ENCODE_START(1, 1, bl);
4248 authlock.encode_state_for_replica(bl);
4249 linklock.encode_state_for_replica(bl);
4250 dirfragtreelock.encode_state_for_replica(bl);
4251 filelock.encode_state_for_replica(bl);
4252 nestlock.encode_state_for_replica(bl);
4253 xattrlock.encode_state_for_replica(bl);
4254 snaplock.encode_state_for_replica(bl);
4255 flocklock.encode_state_for_replica(bl);
4256 policylock.encode_state_for_replica(bl);
4257 encode(need_recover, bl);
4258 ENCODE_FINISH(bl);
4259 }
4260
4261 void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
4262 {
4263 authlock.encode_state_for_replica(bl);
4264 linklock.encode_state_for_replica(bl);
4265 dirfragtreelock.encode_state_for_rejoin(bl, rep);
4266 filelock.encode_state_for_rejoin(bl, rep);
4267 nestlock.encode_state_for_rejoin(bl, rep);
4268 xattrlock.encode_state_for_replica(bl);
4269 snaplock.encode_state_for_replica(bl);
4270 flocklock.encode_state_for_replica(bl);
4271 policylock.encode_state_for_replica(bl);
4272 }
4273
4274 void CInode::_decode_locks_state_for_replica(bufferlist::const_iterator& p, bool is_new)
4275 {
4276 DECODE_START(1, p);
4277 authlock.decode_state(p, is_new);
4278 linklock.decode_state(p, is_new);
4279 dirfragtreelock.decode_state(p, is_new);
4280 filelock.decode_state(p, is_new);
4281 nestlock.decode_state(p, is_new);
4282 xattrlock.decode_state(p, is_new);
4283 snaplock.decode_state(p, is_new);
4284 flocklock.decode_state(p, is_new);
4285 policylock.decode_state(p, is_new);
4286
4287 bool need_recover;
4288 decode(need_recover, p);
4289 if (need_recover && is_new) {
4290 // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
4291 // and change the object when replaying unsafe requests.
4292 authlock.mark_need_recover();
4293 linklock.mark_need_recover();
4294 dirfragtreelock.mark_need_recover();
4295 filelock.mark_need_recover();
4296 nestlock.mark_need_recover();
4297 xattrlock.mark_need_recover();
4298 snaplock.mark_need_recover();
4299 flocklock.mark_need_recover();
4300 policylock.mark_need_recover();
4301 }
4302 DECODE_FINISH(p);
4303 }
4304 void CInode::_decode_locks_rejoin(bufferlist::const_iterator& p, MDSContext::vec& waiters,
4305 list<SimpleLock*>& eval_locks, bool survivor)
4306 {
4307 authlock.decode_state_rejoin(p, waiters, survivor);
4308 linklock.decode_state_rejoin(p, waiters, survivor);
4309 dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
4310 filelock.decode_state_rejoin(p, waiters, survivor);
4311 nestlock.decode_state_rejoin(p, waiters, survivor);
4312 xattrlock.decode_state_rejoin(p, waiters, survivor);
4313 snaplock.decode_state_rejoin(p, waiters, survivor);
4314 flocklock.decode_state_rejoin(p, waiters, survivor);
4315 policylock.decode_state_rejoin(p, waiters, survivor);
4316
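// dirfragtreelock, filelock and nestlock are the inode's scatterlocks;
// rejoin can leave them in an unstable intermediate state, so they are
// the ones queued for re-evaluation below.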
4317 if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
4318 eval_locks.push_back(&dirfragtreelock);
4319 if (!filelock.is_stable() && !filelock.is_wrlocked())
4320 eval_locks.push_back(&filelock);
4321 if (!nestlock.is_stable() && !nestlock.is_wrlocked())
4322 eval_locks.push_back(&nestlock);
4323 }
4324
4325
4326 // IMPORT/EXPORT
4327
4328 void CInode::encode_export(bufferlist& bl)
4329 {
4330 ENCODE_START(5, 4, bl);
4331 _encode_base(bl, mdcache->mds->mdsmap->get_up_features());
4332
4333 encode(state, bl);
4334
4335 encode(pop, bl);
4336
4337 encode(get_replicas(), bl);
4338
4339 // include scatterlock info for any bounding CDirs
4340 bufferlist bounding;
4341 if (get_inode()->is_dir())
4342 for (const auto &p : dirfrags) {
4343 CDir *dir = p.second;
4344 if (dir->state_test(CDir::STATE_EXPORTBOUND)) {
4345 encode(p.first, bounding);
4346 encode(dir->get_fnode()->fragstat, bounding);
4347 encode(dir->get_fnode()->accounted_fragstat, bounding);
4348 encode(dir->get_fnode()->rstat, bounding);
4349 encode(dir->get_fnode()->accounted_rstat, bounding);
4350 dout(10) << " encoded fragstat/rstat info for " << *dir << dendl;
4351 }
4352 }
4353 encode(bounding, bl);
4354
4355 _encode_locks_full(bl);
4356
4357 _encode_file_locks(bl);
4358
4359 ENCODE_FINISH(bl);
4360
4361 get(PIN_TEMPEXPORTING);
4362 }
4363
4364 void CInode::finish_export()
4365 {
4366 state &= MASK_STATE_EXPORT_KEPT;
4367
4368 pop.zero();
4369
4370 // just in case!
4371 //dirlock.clear_updated();
4372
4373 loner_cap = -1;
4374
4375 put(PIN_TEMPEXPORTING);
4376 }
4377
4378 void CInode::decode_import(bufferlist::const_iterator& p,
4379 LogSegment *ls)
4380 {
4381 DECODE_START(5, p);
4382
4383 _decode_base(p);
4384
4385 {
4386 unsigned s;
4387 decode(s, p);
4388 s &= MASK_STATE_EXPORTED;
4389
4390 set_ephemeral_pin((s & STATE_DISTEPHEMERALPIN),
4391 (s & STATE_RANDEPHEMERALPIN));
4392 state_set(STATE_AUTH | s);
4393 }
4394
4395 if (is_dirty()) {
4396 get(PIN_DIRTY);
4397 _mark_dirty(ls);
4398 }
4399 if (is_dirty_parent()) {
4400 get(PIN_DIRTYPARENT);
4401 mark_dirty_parent(ls);
4402 }
4403
4404 decode(pop, p);
4405
4406 decode(get_replicas(), p);
4407 if (is_replicated())
4408 get(PIN_REPLICATED);
4409 replica_nonce = 0;
4410
4411 // decode fragstat info on bounding cdirs
4412 bufferlist bounding;
4413 decode(bounding, p);
4414 auto q = bounding.cbegin();
4415 while (!q.end()) {
4416 frag_t fg;
4417 decode(fg, q);
4418 CDir *dir = get_dirfrag(fg);
4419 ceph_assert(dir); // we should have all bounds open
4420
4421 // Only take the remote's fragstat/rstat if we are non-auth for
4422 // this dirfrag AND the lock is NOT in a scattered (MIX) state.
4423 // We know lock is stable, and MIX is the only state in which
4424 // the inode auth (who sent us this data) may not have the best
4425 // info.
4426
4427 // HMM: Are there cases where dir->is_auth() is an insufficient
4428 // check because the dirfrag is under migration? That implies
4429 // it is frozen (and in a SYNC or LOCK state). FIXME.
4430
4431 auto _fnode = CDir::allocate_fnode(*dir->get_fnode());
4432 if (dir->is_auth() ||
4433 filelock.get_state() == LOCK_MIX) {
4434 dout(10) << " skipped fragstat info for " << *dir << dendl;
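// still decode into a throwaway value (twice: fragstat and
// accounted_fragstat) so the iterator stays aligned with what
// encode_export() wrote, even though the values are discarded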
4435 frag_info_t f;
4436 decode(f, q);
4437 decode(f, q);
4438 } else {
4439 decode(_fnode->fragstat, q);
4440 decode(_fnode->accounted_fragstat, q);
4441 dout(10) << " took fragstat info for " << *dir << dendl;
4442 }
4443 if (dir->is_auth() ||
4444 nestlock.get_state() == LOCK_MIX) {
4445 dout(10) << " skipped rstat info for " << *dir << dendl;
4446 nest_info_t n;
4447 decode(n, q);
4448 decode(n, q);
4449 } else {
4450 decode(_fnode->rstat, q);
4451 decode(_fnode->accounted_rstat, q);
4452 dout(10) << " took rstat info for " << *dir << dendl;
4453 }
4454 dir->reset_fnode(std::move(_fnode));
4455 }
4456
4457 _decode_locks_full(p);
4458
4459 _decode_file_locks(p);
4460
4461 DECODE_FINISH(p);
4462 }
4463
4464
4465 void InodeStoreBase::dump(Formatter *f) const
4466 {
4467 inode->dump(f);
4468 f->dump_string("symlink", symlink);
4469
4470 f->open_array_section("xattrs");
4471 if (xattrs) {
4472 for (const auto& [key, val] : *xattrs) {
4473 f->open_object_section("xattr");
4474 f->dump_string("key", key);
4475 std::string v(val.c_str(), val.length());
4476 f->dump_string("val", v);
4477 f->close_section();
4478 }
4479 }
4480 f->close_section();
4481 f->open_object_section("dirfragtree");
4482 dirfragtree.dump(f);
4483 f->close_section(); // dirfragtree
4484
4485 f->open_array_section("old_inodes");
4486 if (old_inodes) {
4487 for (const auto &p : *old_inodes) {
4488 f->open_object_section("old_inode");
4489 // The key is the last snapid, the first is in the mempool_old_inode
4490 f->dump_int("last", p.first);
4491 p.second.dump(f);
4492 f->close_section(); // old_inode
4493 }
4494 }
4495 f->close_section(); // old_inodes
4496
4497 f->dump_unsigned("oldest_snap", oldest_snap);
4498 f->dump_unsigned("damage_flags", damage_flags);
4499 }
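// Roughly, the dumped structure looks like (illustrative only):
//   { <inode fields...>, "symlink": "...",
//     "xattrs": [ {"key": "...", "val": "..."}, ... ],
//     "dirfragtree": {...},
//     "old_inodes": [ {"last": <snapid>, ...}, ... ],
//     "oldest_snap": N, "damage_flags": N }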
4500
4501 template <>
4502 void decode_json_obj(mempool::mds_co::string& t, JSONObj *obj){
4503
4504 t = mempool::mds_co::string(std::string_view(obj->get_data()));
4505 }
4506
4507 void InodeStoreBase::decode_json(JSONObj *obj)
4508 {
4509 {
4510 auto _inode = allocate_inode();
4511 _inode->decode_json(obj);
4512 reset_inode(std::move(_inode));
4513 }
4514
4515 JSONDecoder::decode_json("symlink", symlink, obj, true);
4516 // JSONDecoder::decode_json("dirfragtree", dirfragtree, obj, true); // can't decode it now
4517 //
4518 //
4519 {
4520 mempool_xattr_map tmp;
4521 JSONDecoder::decode_json("xattrs", tmp, xattrs_cb, obj, true);
4522 if (tmp.empty())
4523 reset_xattrs(xattr_map_ptr());
4524 else
4525 reset_xattrs(allocate_xattr_map(std::move(tmp)));
4526 }
4527 // JSONDecoder::decode_json("old_inodes", old_inodes, InodeStoreBase::old_indoes_cb, obj, true); // can't decode old_inodes now
4528 JSONDecoder::decode_json("oldest_snap", oldest_snap.val, obj, true);
4529 JSONDecoder::decode_json("damage_flags", damage_flags, obj, true);
4530 //sr_t srnode;
4531 //JSONDecoder::decode_json("snap_blob", srnode, obj, true); // can't decode it now
4532 //snap_blob = srnode;
4533 }
4534
4535 void InodeStoreBase::xattrs_cb(InodeStoreBase::mempool_xattr_map& c, JSONObj *obj){
4536
4537 string k;
4538 JSONDecoder::decode_json("key", k, obj, true);
4539 string v;
4540 JSONDecoder::decode_json("val", v, obj, true);
4541 c[k.c_str()] = buffer::copy(v.c_str(), v.size());
4542 }
4543
4544 void InodeStoreBase::old_indoes_cb(InodeStoreBase::mempool_old_inode_map& c, JSONObj *obj){
4545
4546 snapid_t s;
4547 JSONDecoder::decode_json("last", s.val, obj, true);
4548 InodeStoreBase::mempool_old_inode i;
4549 // i.decode_json(obj); // can't decode now, simon
4550 c[s] = i;
4551 }
4552
4553 void InodeStore::generate_test_instances(std::list<InodeStore*> &ls)
4554 {
4555 InodeStore *populated = new InodeStore;
4556 populated->get_inode()->ino = 0xdeadbeef;
4557 populated->symlink = "rhubarb";
4558 ls.push_back(populated);
4559 }
4560
4561 void InodeStoreBare::generate_test_instances(std::list<InodeStoreBare*> &ls)
4562 {
4563 InodeStoreBare *populated = new InodeStoreBare;
4564 populated->get_inode()->ino = 0xdeadbeef;
4565 populated->symlink = "rhubarb";
4566 ls.push_back(populated);
4567 }
4568
4569 void CInode::validate_disk_state(CInode::validated_data *results,
4570 MDSContext *fin)
4571 {
4572 class ValidationContinuation : public MDSContinuation {
4573 public:
4574 MDSContext *fin;
4575 CInode *in;
4576 CInode::validated_data *results;
4577 bufferlist bl;
4578 CInode *shadow_in;
4579
4580 enum {
4581 START = 0,
4582 BACKTRACE,
4583 INODE,
4584 DIRFRAGS,
4585 SNAPREALM,
4586 };
4587
4588 ValidationContinuation(CInode *i,
4589 CInode::validated_data *data_r,
4590 MDSContext *fin_) :
4591 MDSContinuation(i->mdcache->mds->server),
4592 fin(fin_),
4593 in(i),
4594 results(data_r),
4595 shadow_in(NULL) {
4596 set_callback(START, static_cast<Continuation::stagePtr>(&ValidationContinuation::_start));
4597 set_callback(BACKTRACE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_backtrace));
4598 set_callback(INODE, static_cast<Continuation::stagePtr>(&ValidationContinuation::_inode_disk));
4599 set_callback(DIRFRAGS, static_cast<Continuation::stagePtr>(&ValidationContinuation::_dirfrags));
4600 }
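// Stage flow: START fetches the backtrace (and, for non-internal scrubs,
// writes the scrub tag), BACKTRACE validates it, INODE compares the
// on-disk base inode (base directories only), and DIRFRAGS cross-checks
// dirstat/rstat sums. SNAPREALM is declared above but has no callback
// registered here.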
4601
4602 ~ValidationContinuation() override {
4603 if (shadow_in) {
4604 delete shadow_in;
4605 in->mdcache->num_shadow_inodes--;
4606 }
4607 }
4608
4609 /**
4610 * Fetch the backtrace and, for non-internal scrubs, also write the scrub_tag xattr
4611 */
4612 void fetch_backtrace_and_tag(CInode *in,
4613 std::string_view tag, bool is_internal,
4614 Context *fin, int *bt_r, bufferlist *bt)
4615 {
4616 const int64_t pool = in->get_backtrace_pool();
4617 object_t oid = CInode::get_object_name(in->ino(), frag_t(), "");
4618
4619 ObjectOperation fetch;
4620 fetch.getxattr("parent", bt, bt_r);
4621 in->mdcache->mds->objecter->read(oid, object_locator_t(pool), fetch, CEPH_NOSNAP,
4622 NULL, 0, fin);
4623 if (in->mdcache->mds->logger) {
4624 in->mdcache->mds->logger->inc(l_mds_openino_backtrace_fetch);
4625 in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_fetch);
4626 }
4627
4628 using ceph::encode;
4629 if (!is_internal) {
4630 ObjectOperation scrub_tag;
4631 bufferlist tag_bl;
4632 encode(tag, tag_bl);
4633 scrub_tag.setxattr("scrub_tag", tag_bl);
4634 SnapContext snapc;
4635 in->mdcache->mds->objecter->mutate(oid, object_locator_t(pool), scrub_tag, snapc,
4636 ceph::real_clock::now(),
4637 0, NULL);
4638 if (in->mdcache->mds->logger)
4639 in->mdcache->mds->logger->inc(l_mds_scrub_set_tag);
4640 }
4641 }
4642
4643 bool _start(int rval) {
4644 ceph_assert(in->can_auth_pin());
4645 in->auth_pin(this);
4646
4647 if (in->is_dirty()) {
4648 MDCache *mdcache = in->mdcache; // For the benefit of dout
4649 auto ino = [this]() { return in->ino(); }; // For the benefit of dout
4650 dout(20) << "validating a dirty CInode; results will be inconclusive"
4651 << dendl;
4652 }
4653
4654 C_OnFinisher *conf = new C_OnFinisher(get_io_callback(BACKTRACE),
4655 in->mdcache->mds->finisher);
4656
4657 std::string_view tag = in->scrub_infop->header->get_tag();
4658 bool is_internal = in->scrub_infop->header->is_internal_tag();
4659 // Rather than using the usual CInode::fetch_backtrace,
4660 // use a special variant that optionally writes a tag in the same
4661 // operation.
4662 fetch_backtrace_and_tag(in, tag, is_internal, conf, &results->backtrace.ondisk_read_retval, &bl);
4663 return false;
4664 }
4665
4666 bool _backtrace(int rval) {
4667 // set up basic result reporting and make sure we got the data
4668 results->performed_validation = true; // at least, some of it!
4669 results->backtrace.checked = true;
4670
4671 const int64_t pool = in->get_backtrace_pool();
4672 inode_backtrace_t& memory_backtrace = results->backtrace.memory_value;
4673 in->build_backtrace(pool, memory_backtrace);
4674 bool equivalent, divergent;
4675 int memory_newer;
4676
4677 MDCache *mdcache = in->mdcache; // For the benefit of dout
4678 auto ino = [this]() { return in->ino(); }; // For the benefit of dout
4679
4680 // Ignore rval because it's the result of a FAILOK operation
4681 // from fetch_backtrace_and_tag: the real result is in
4682 // backtrace.ondisk_read_retval
4683 dout(20) << "ondisk_read_retval: " << results->backtrace.ondisk_read_retval << dendl;
4684 if (results->backtrace.ondisk_read_retval != 0) {
4685 results->backtrace.error_str << "failed to read off disk; see retval";
4686 // we probably have a new unwritten file!
4687 // so skip the backtrace scrub for this entry and say that all's well
4688 if (in->is_dirty_parent()) {
4689 dout(20) << "forcing backtrace as passed since inode is dirty parent" << dendl;
4690 results->backtrace.passed = true;
4691 }
4692 goto next;
4693 }
4694
4695 // extract the backtrace, and compare it to a newly-constructed one
4696 try {
4697 auto p = bl.cbegin();
4698 using ceph::decode;
4699 decode(results->backtrace.ondisk_value, p);
4700 dout(10) << "decoded " << bl.length() << " bytes of backtrace successfully" << dendl;
4701 } catch (buffer::error&) {
4702 if (results->backtrace.ondisk_read_retval == 0 && rval != 0) {
4703 // Cases where something has clearly gone wrong with the overall
4704 // fetch op, though we didn't get a nonzero rc from the getxattr
4705 // operation. e.g. object missing.
4706 results->backtrace.ondisk_read_retval = rval;
4707 }
4708 results->backtrace.error_str << "failed to decode on-disk backtrace ("
4709 << bl.length() << " bytes)!";
4710 // we probably have a new unwritten file!
4711 // so skip the backtrace scrub for this entry and say that all's well
4712 if (in->is_dirty_parent()) {
4713 dout(20) << "decode failed; forcing backtrace as passed since "
4714 "inode is dirty parent" << dendl;
4715 results->backtrace.passed = true;
4716 }
4717
4718 goto next;
4719 }
4720
4721 memory_newer = memory_backtrace.compare(results->backtrace.ondisk_value,
4722 &equivalent, &divergent);
4723
4724 if (divergent || memory_newer < 0) {
4725 // we're divergent, or on-disk version is newer
4726 results->backtrace.error_str << "On-disk backtrace is divergent or newer";
4727 /* if the backtraces are divergent and the link count is 0, then
4728 * most likely it's a stray entry that's being purged and things are
4729 * well and there's no reason for alarm
4730 */
4731 if (divergent && (in->is_dirty_parent() || in->get_inode()->nlink == 0)) {
4732 results->backtrace.passed = true;
4733 dout(20) << "divergent backtraces are acceptable when dn "
4734 "is being purged or has been renamed or moved to a "
4735 "different directory " << *in << dendl;
4736 }
4737 } else {
4738 results->backtrace.passed = true;
4739 }
4740 next:
4741
4742 if (!results->backtrace.passed && in->scrub_infop->header->get_repair()) {
4743 std::string path;
4744 in->make_path_string(path);
4745 in->mdcache->mds->clog->warn() << "bad backtrace on inode " << in->ino()
4746 << "(" << path << "), rewriting it";
4747 in->mark_dirty_parent(in->mdcache->mds->mdlog->get_current_segment(),
4748 false);
4749 // Flag that we repaired this BT so that it won't go into damagetable
4750 results->backtrace.repaired = true;
4751 if (in->mdcache->mds->logger)
4752 in->mdcache->mds->logger->inc(l_mds_scrub_backtrace_repaired);
4753 }
4754
4755 // If the inode's number was free in the InoTable, fix that
4756 // (#15619)
4757 {
4758 InoTable *inotable = mdcache->mds->inotable;
4759
4760 dout(10) << "scrub: inotable ino = " << in->ino() << dendl;
4761 dout(10) << "scrub: inotable free says "
4762 << inotable->is_marked_free(in->ino()) << dendl;
4763
4764 if (inotable->is_marked_free(in->ino())) {
4765 LogChannelRef clog = in->mdcache->mds->clog;
4766 clog->error() << "scrub: inode wrongly marked free: " << in->ino();
4767
4768 if (in->scrub_infop->header->get_repair()) {
4769 bool repaired = inotable->repair(in->ino());
4770 if (repaired) {
4771 clog->error() << "inode table repaired for inode: " << in->ino();
4772
4773 inotable->save();
4774 if (in->mdcache->mds->logger)
4775 in->mdcache->mds->logger->inc(l_mds_scrub_inotable_repaired);
4776 } else {
4777 clog->error() << "Cannot repair inotable while other operations"
4778 " are in progress";
4779 }
4780 }
4781 }
4782 }
4783
4784
4785 if (in->is_dir()) {
4786 if (in->mdcache->mds->logger)
4787 in->mdcache->mds->logger->inc(l_mds_scrub_dir_inodes);
4788 return validate_directory_data();
4789 } else {
4790 if (in->mdcache->mds->logger)
4791 in->mdcache->mds->logger->inc(l_mds_scrub_file_inodes);
4792 // TODO: validate on-disk inode for normal files
4793 return true;
4794 }
4795 }
4796
4797 bool validate_directory_data() {
4798 ceph_assert(in->is_dir());
4799
4800 if (in->is_base()) {
4801 if (!shadow_in) {
4802 shadow_in = new CInode(in->mdcache);
4803 in->mdcache->create_unlinked_system_inode(shadow_in, in->ino(), in->get_inode()->mode);
4804 in->mdcache->num_shadow_inodes++;
4805 }
4806 shadow_in->fetch(get_internal_callback(INODE));
4807 if (in->mdcache->mds->logger)
4808 in->mdcache->mds->logger->inc(l_mds_scrub_dir_base_inodes);
4809 return false;
4810 } else {
4811 // TODO: validate on-disk inode for non-base directories
4812 if (in->mdcache->mds->logger)
4813 in->mdcache->mds->logger->inc(l_mds_scrub_dirfrag_rstats);
4814 results->inode.passed = true;
4815 return check_dirfrag_rstats();
4816 }
4817 }
4818
4819 bool _inode_disk(int rval) {
4820 const auto& si = shadow_in->get_inode();
4821 const auto& i = in->get_inode();
4822
4823 results->inode.checked = true;
4824 results->inode.ondisk_read_retval = rval;
4825 results->inode.ondisk_value = *si;
4826 results->inode.memory_value = *i;
4827
4828 if (si->version > i->version) {
4829 // uh, what?
4830 results->inode.error_str << "On-disk inode is newer than in-memory one; ";
4831 goto next;
4832 } else {
4833 bool divergent = false;
4834 int r = i->compare(*si, &divergent);
4835 results->inode.passed = !divergent && r >= 0;
4836 if (!results->inode.passed) {
4837 results->inode.error_str <<
4838 "On-disk inode is divergent or newer than in-memory one; ";
4839 goto next;
4840 }
4841 }
4842 next:
4843 return check_dirfrag_rstats();
4844 }
4845
4846 bool check_dirfrag_rstats() {
4847 if (in->has_subtree_root_dirfrag()) {
4848 in->mdcache->rdlock_dirfrags_stats(in, get_internal_callback(DIRFRAGS));
4849 return false;
4850 } else {
4851 return immediate(DIRFRAGS, 0);
4852 }
4853 }
4854
4855 bool _dirfrags(int rval) {
4856 // basic reporting setup
4857 results->raw_stats.checked = true;
4858 results->raw_stats.ondisk_read_retval = rval;
4859
4860 results->raw_stats.memory_value.dirstat = in->get_inode()->dirstat;
4861 results->raw_stats.memory_value.rstat = in->get_inode()->rstat;
4862 frag_info_t& dir_info = results->raw_stats.ondisk_value.dirstat;
4863 nest_info_t& nest_info = results->raw_stats.ondisk_value.rstat;
4864
4865 if (rval != 0) {
4866 results->raw_stats.error_str << "Failed to read dirfrags off disk";
4867 goto next;
4868 }
4869
4870 // check each dirfrag...
4871 for (const auto &p : in->dirfrags) {
4872 CDir *dir = p.second;
4873 ceph_assert(dir->get_version() > 0);
4874 nest_info.add(dir->get_fnode()->accounted_rstat);
4875 dir_info.add(dir->get_fnode()->accounted_fragstat);
4876 }
4877 nest_info.rsubdirs++; // it gets one to account for self
4878 if (const sr_t *srnode = in->get_projected_srnode(); srnode)
4879 nest_info.rsnaps += srnode->snaps.size();
4880
4881 // ...and that their sum matches our inode settings
4882 if (!dir_info.same_sums(in->get_inode()->dirstat) ||
4883 !nest_info.same_sums(in->get_inode()->rstat)) {
4884 if (in->scrub_infop->header->get_repair()) {
4885 results->raw_stats.error_str
4886 << "freshly-calculated rstats don't match existing ones (will be fixed)";
4887 in->mdcache->repair_inode_stats(in);
4888 results->raw_stats.repaired = true;
4889 } else {
4890 results->raw_stats.error_str
4891 << "freshly-calculated rstats don't match existing ones";
4892 }
4893 if (in->is_dirty()) {
4894 MDCache *mdcache = in->mdcache; // for dout()
4895 auto ino = [this]() { return in->ino(); }; // for dout()
4896 dout(20) << "raw stats most likely wont match since inode is dirty; "
4897 "please rerun scrub when system is stable; "
4898 "assuming passed for now;" << dendl;
4899 results->raw_stats.passed = true;
4900 }
4901 goto next;
4902 }
4903
4904 results->raw_stats.passed = true;
4905 {
4906 MDCache *mdcache = in->mdcache; // for dout()
4907 auto ino = [this]() { return in->ino(); }; // for dout()
4908 dout(20) << "raw stats check passed on " << *in << dendl;
4909 }
4910
4911 next:
4912 return true;
4913 }
4914
4915 void _done() override {
4916 if ((!results->raw_stats.checked || results->raw_stats.passed) &&
4917 (!results->backtrace.checked || results->backtrace.passed) &&
4918 (!results->inode.checked || results->inode.passed))
4919 results->passed_validation = true;
4920
4921 // Flag that we did some repair work so that our repair operation
4922 // can be flushed at end of scrub
4923 if (results->backtrace.repaired ||
4924 results->inode.repaired ||
4925 results->raw_stats.repaired)
4926 in->scrub_infop->header->set_repaired();
4927 if (fin)
4928 fin->complete(get_rval());
4929
4930 in->auth_unpin(this);
4931 }
4932 };
4933
4934
4935 dout(10) << "scrub starting validate_disk_state on " << *this << dendl;
4936 ValidationContinuation *vc = new ValidationContinuation(this,
4937 results,
4938 fin);
4939 vc->begin();
4940 }
4941
4942 void CInode::validated_data::dump(Formatter *f) const
4943 {
4944 f->open_object_section("results");
4945 {
4946 f->dump_bool("performed_validation", performed_validation);
4947 f->dump_bool("passed_validation", passed_validation);
4948 f->open_object_section("backtrace");
4949 {
4950 f->dump_bool("checked", backtrace.checked);
4951 f->dump_bool("passed", backtrace.passed);
4952 f->dump_int("read_ret_val", backtrace.ondisk_read_retval);
4953 f->dump_stream("ondisk_value") << backtrace.ondisk_value;
4954 f->dump_stream("memoryvalue") << backtrace.memory_value;
4955 f->dump_string("error_str", backtrace.error_str.str());
4956 }
4957 f->close_section(); // backtrace
4958 f->open_object_section("raw_stats");
4959 {
4960 f->dump_bool("checked", raw_stats.checked);
4961 f->dump_bool("passed", raw_stats.passed);
4962 f->dump_int("read_ret_val", raw_stats.ondisk_read_retval);
4963 f->dump_stream("ondisk_value.dirstat") << raw_stats.ondisk_value.dirstat;
4964 f->dump_stream("ondisk_value.rstat") << raw_stats.ondisk_value.rstat;
4965 f->dump_stream("memory_value.dirstat") << raw_stats.memory_value.dirstat;
4966 f->dump_stream("memory_value.rstat") << raw_stats.memory_value.rstat;
4967 f->dump_string("error_str", raw_stats.error_str.str());
4968 }
4969 f->close_section(); // raw_stats
4970 // dump failure return code
4971 int rc = 0;
4972 if (backtrace.checked && backtrace.ondisk_read_retval)
4973 rc = backtrace.ondisk_read_retval;
4974 if (inode.checked && inode.ondisk_read_retval)
4975 rc = inode.ondisk_read_retval;
4976 if (raw_stats.checked && raw_stats.ondisk_read_retval)
4977 rc = raw_stats.ondisk_read_retval;
4978 f->dump_int("return_code", rc);
4979 }
4980 f->close_section(); // results
4981 }
4982
4983 bool CInode::validated_data::all_damage_repaired() const
4984 {
4985 bool unrepaired =
4986 (raw_stats.checked && !raw_stats.passed && !raw_stats.repaired)
4987 ||
4988 (backtrace.checked && !backtrace.passed && !backtrace.repaired)
4989 ||
4990 (inode.checked && !inode.passed && !inode.repaired);
4991
4992 return !unrepaired;
4993 }
4994
4995 void CInode::dump(Formatter *f, int flags) const
4996 {
4997 if (flags & DUMP_PATH) {
4998 std::string path;
4999 make_path_string(path, true);
5000 if (path.empty())
5001 path = "/";
5002 f->dump_string("path", path);
5003 }
5004
5005 if (flags & DUMP_INODE_STORE_BASE)
5006 InodeStoreBase::dump(f);
5007
5008 if (flags & DUMP_MDS_CACHE_OBJECT)
5009 MDSCacheObject::dump(f);
5010
5011 if (flags & DUMP_LOCKS) {
5012 f->open_object_section("versionlock");
5013 versionlock.dump(f);
5014 f->close_section();
5015
5016 f->open_object_section("authlock");
5017 authlock.dump(f);
5018 f->close_section();
5019
5020 f->open_object_section("linklock");
5021 linklock.dump(f);
5022 f->close_section();
5023
5024 f->open_object_section("dirfragtreelock");
5025 dirfragtreelock.dump(f);
5026 f->close_section();
5027
5028 f->open_object_section("filelock");
5029 filelock.dump(f);
5030 f->close_section();
5031
5032 f->open_object_section("xattrlock");
5033 xattrlock.dump(f);
5034 f->close_section();
5035
5036 f->open_object_section("snaplock");
5037 snaplock.dump(f);
5038 f->close_section();
5039
5040 f->open_object_section("nestlock");
5041 nestlock.dump(f);
5042 f->close_section();
5043
5044 f->open_object_section("flocklock");
5045 flocklock.dump(f);
5046 f->close_section();
5047
5048 f->open_object_section("policylock");
5049 policylock.dump(f);
5050 f->close_section();
5051 }
5052
5053 if (flags & DUMP_STATE) {
5054 f->open_array_section("states");
5055 MDSCacheObject::dump_states(f);
5056 if (state_test(STATE_EXPORTING))
5057 f->dump_string("state", "exporting");
5058 if (state_test(STATE_OPENINGDIR))
5059 f->dump_string("state", "openingdir");
5060 if (state_test(STATE_FREEZING))
5061 f->dump_string("state", "freezing");
5062 if (state_test(STATE_FROZEN))
5063 f->dump_string("state", "frozen");
5064 if (state_test(STATE_AMBIGUOUSAUTH))
5065 f->dump_string("state", "ambiguousauth");
5066 if (state_test(STATE_EXPORTINGCAPS))
5067 f->dump_string("state", "exportingcaps");
5068 if (state_test(STATE_NEEDSRECOVER))
5069 f->dump_string("state", "needsrecover");
5070 if (state_test(STATE_PURGING))
5071 f->dump_string("state", "purging");
5072 if (state_test(STATE_DIRTYPARENT))
5073 f->dump_string("state", "dirtyparent");
5074 if (state_test(STATE_DIRTYRSTAT))
5075 f->dump_string("state", "dirtyrstat");
5076 if (state_test(STATE_STRAYPINNED))
5077 f->dump_string("state", "straypinned");
5078 if (state_test(STATE_FROZENAUTHPIN))
5079 f->dump_string("state", "frozenauthpin");
5080 if (state_test(STATE_DIRTYPOOL))
5081 f->dump_string("state", "dirtypool");
5082 if (state_test(STATE_ORPHAN))
5083 f->dump_string("state", "orphan");
5084 if (state_test(STATE_MISSINGOBJS))
5085 f->dump_string("state", "missingobjs");
5086 f->close_section();
5087 }
5088
5089 if (flags & DUMP_CAPS) {
5090 f->open_array_section("client_caps");
5091 for (const auto &p : client_caps) {
5092 auto &client = p.first;
5093 auto cap = &p.second;
5094 f->open_object_section("client_cap");
5095 f->dump_int("client_id", client.v);
5096 f->dump_string("pending", ccap_string(cap->pending()));
5097 f->dump_string("issued", ccap_string(cap->issued()));
5098 f->dump_string("wanted", ccap_string(cap->wanted()));
5099 f->dump_int("last_sent", cap->get_last_seq());
5100 f->close_section();
5101 }
5102 f->close_section();
5103
5104 f->dump_int("loner", loner_cap.v);
5105 f->dump_int("want_loner", want_loner_cap.v);
5106
5107 f->open_array_section("mds_caps_wanted");
5108 for (const auto &p : mds_caps_wanted) {
5109 f->open_object_section("mds_cap_wanted");
5110 f->dump_int("rank", p.first);
5111 f->dump_string("cap", ccap_string(p.second));
5112 f->close_section();
5113 }
5114 f->close_section();
5115 }
5116
5117 if (flags & DUMP_DIRFRAGS) {
5118 f->open_array_section("dirfrags");
5119 auto&& dfs = get_dirfrags();
5120 for(const auto &dir: dfs) {
5121 f->open_object_section("dir");
5122 dir->dump(f, CDir::DUMP_DEFAULT | CDir::DUMP_ITEMS);
5123 dir->check_rstats();
5124 f->close_section();
5125 }
5126 f->close_section();
5127 }
5128 }
5129
5130 /****** Scrub Stuff *****/
5131 void CInode::scrub_info_create() const
5132 {
5133 dout(25) << __func__ << dendl;
5134 ceph_assert(!scrub_infop);
5135
5136 // break out of const-land to set up implicit initial state
5137 CInode *me = const_cast<CInode*>(this);
5138 const auto& pi = me->get_projected_inode();
5139
5140 std::unique_ptr<scrub_info_t> si(new scrub_info_t());
5141 si->last_scrub_stamp = pi->last_scrub_stamp;
5142 si->last_scrub_version = pi->last_scrub_version;
5143
5144 me->scrub_infop.swap(si);
5145 }
5146
5147 void CInode::scrub_maybe_delete_info()
5148 {
5149 if (scrub_infop &&
5150 !scrub_infop->scrub_in_progress &&
5151 !scrub_infop->last_scrub_dirty) {
5152 scrub_infop.reset();
5153 }
5154 }
5155
5156 void CInode::scrub_initialize(ScrubHeaderRef& header)
5157 {
5158 dout(20) << __func__ << " with scrub_version " << get_version() << dendl;
5159
5160 scrub_info();
5161 scrub_infop->scrub_in_progress = true;
5162 scrub_infop->queued_frags.clear();
5163 scrub_infop->header = header;
5164 header->inc_num_pending();
5165 // right now we don't handle remote inodes
5166 }
5167
5168 void CInode::scrub_aborted() {
5169 dout(20) << __func__ << dendl;
5170 ceph_assert(scrub_is_in_progress());
5171
5172 scrub_infop->scrub_in_progress = false;
5173 scrub_infop->header->dec_num_pending();
5174 scrub_maybe_delete_info();
5175 }
5176
5177 void CInode::scrub_finished() {
5178 dout(20) << __func__ << dendl;
5179 ceph_assert(scrub_is_in_progress());
5180
5181 scrub_infop->last_scrub_version = get_version();
5182 scrub_infop->last_scrub_stamp = ceph_clock_now();
5183 scrub_infop->last_scrub_dirty = true;
5184 scrub_infop->scrub_in_progress = false;
5185 scrub_infop->header->dec_num_pending();
5186 }
5187
5188 int64_t CInode::get_backtrace_pool() const
5189 {
5190 if (is_dir()) {
5191 return mdcache->mds->get_metadata_pool();
5192 } else {
5193 // Files are required to have an explicit layout that specifies
5194 // a pool
5195 ceph_assert(get_inode()->layout.pool_id != -1);
5196 return get_inode()->layout.pool_id;
5197 }
5198 }
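// e.g. a regular file whose layout places its data in pool 3 also keeps
// its backtrace ("parent" xattr) in pool 3, while directories always use
// the metadata pool.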
5199
5200 void CInode::queue_export_pin(mds_rank_t export_pin)
5201 {
5202 if (state_test(CInode::STATE_QUEUEDEXPORTPIN))
5203 return;
5204
5205 mds_rank_t target;
5206 if (export_pin >= 0)
5207 target = export_pin;
5208 else if (export_pin == MDS_RANK_EPHEMERAL_RAND)
5209 target = mdcache->hash_into_rank_bucket(ino());
5210 else
5211 target = MDS_RANK_NONE;
5212
5213 unsigned min_frag_bits = mdcache->get_ephemeral_dist_frag_bits();
5214 bool queue = false;
5215 for (auto& p : dirfrags) {
5216 CDir *dir = p.second;
5217 if (!dir->is_auth())
5218 continue;
5219
5220 if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
5221 if (dir->get_frag().bits() < min_frag_bits) {
5222 // needs split
5223 queue = true;
5224 break;
5225 }
5226 target = mdcache->hash_into_rank_bucket(ino(), dir->get_frag());
5227 }
5228
5229 if (target != MDS_RANK_NONE) {
5230 if (dir->is_subtree_root()) {
5231 // set auxsubtree bit or export it
5232 if (!dir->state_test(CDir::STATE_AUXSUBTREE) ||
5233 target != dir->get_dir_auth().first)
5234 queue = true;
5235 } else {
5236 // create aux subtree or export it
5237 queue = true;
5238 }
5239 } else {
5240 // clear aux subtrees ?
5241 queue = dir->state_test(CDir::STATE_AUXSUBTREE);
5242 }
5243
5244 if (queue)
5245 break;
5246 }
5247 if (queue) {
5248 state_set(CInode::STATE_QUEUEDEXPORTPIN);
5249 mdcache->export_pin_queue.insert(this);
5250 }
5251 }
5252
5253 void CInode::maybe_export_pin(bool update)
5254 {
5255 if (!g_conf()->mds_bal_export_pin)
5256 return;
5257 if (!is_dir() || !is_normal())
5258 return;
5259
5260 dout(15) << __func__ << " update=" << update << " " << *this << dendl;
5261
5262 mds_rank_t export_pin = get_export_pin(false);
5263 if (export_pin == MDS_RANK_NONE && !update)
5264 return;
5265
5266 check_pin_policy(export_pin);
5267 queue_export_pin(export_pin);
5268 }
5269
5270 void CInode::set_ephemeral_pin(bool dist, bool rand)
5271 {
5272 unsigned state = 0;
5273 if (dist)
5274 state |= STATE_DISTEPHEMERALPIN;
5275 if (rand)
5276 state |= STATE_RANDEPHEMERALPIN;
5277 if (!state)
5278 return;
5279
5280 if (state_test(state) != state) {
5281 dout(10) << "set ephemeral (" << (dist ? "dist" : "")
5282 << (rand ? " rand" : "") << ") pin on " << *this << dendl;
5283 if (!is_ephemerally_pinned()) {
5284 auto p = mdcache->export_ephemeral_pins.insert(this);
5285 ceph_assert(p.second);
5286 }
5287 state_set(state);
5288 }
5289 }
5290
5291 void CInode::clear_ephemeral_pin(bool dist, bool rand)
5292 {
5293 unsigned state = 0;
5294 if (dist)
5295 state |= STATE_DISTEPHEMERALPIN;
5296 if (rand)
5297 state |= STATE_RANDEPHEMERALPIN;
5298
5299 if (state_test(state)) {
5300 dout(10) << "clear ephemeral (" << (dist ? "dist" : "")
5301 << (rand ? " rand" : "") << ") pin on " << *this << dendl;
5302 state_clear(state);
5303 if (!is_ephemerally_pinned()) {
5304 auto count = mdcache->export_ephemeral_pins.erase(this);
5305 ceph_assert(count == 1);
5306 }
5307 }
5308 }
5309
5310 void CInode::maybe_ephemeral_rand(double threshold)
5311 {
5312 if (!mdcache->get_export_ephemeral_random_config()) {
5313 dout(15) << __func__ << " config false: cannot ephemeral random pin " << *this << dendl;
5314 clear_ephemeral_pin(false, true);
5315 return;
5316 } else if (!is_dir() || !is_normal()) {
5317 dout(15) << __func__ << " !dir or !normal: cannot ephemeral random pin " << *this << dendl;
5318 clear_ephemeral_pin(false, true);
5319 return;
5320 } else if (get_inode()->nlink == 0) {
5321 dout(15) << __func__ << " unlinked directory: cannot ephemeral random pin " << *this << dendl;
5322 clear_ephemeral_pin(false, true);
5323 return;
5324 } else if (state_test(CInode::STATE_RANDEPHEMERALPIN)) {
5325 dout(10) << __func__ << " already ephemeral random pinned: requeueing " << *this << dendl;
5326 queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
5327 return;
5328 }
5329
5330 /* not precomputed? */
5331 if (threshold < 0.0) {
5332 threshold = get_ephemeral_rand();
5333 }
5334 if (threshold <= 0.0) {
5335 return;
5336 }
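// A single uniform draw against the threshold: e.g. with an effective
// export_ephemeral_random_pin of 0.01, roughly 1% of the directories
// evaluated here end up randomly pinned.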
5337 double n = ceph::util::generate_random_number(0.0, 1.0);
5338
5339 dout(15) << __func__ << " rand " << n << " <?= " << threshold
5340 << " " << *this << dendl;
5341
5342 if (n <= threshold) {
5343 dout(10) << __func__ << " randomly export pinning " << *this << dendl;
5344 set_ephemeral_pin(false, true);
5345 queue_export_pin(MDS_RANK_EPHEMERAL_RAND);
5346 }
5347 }
5348
5349 void CInode::setxattr_ephemeral_rand(double probability)
5350 {
5351 ceph_assert(is_dir());
5352 _get_projected_inode()->export_ephemeral_random_pin = probability;
5353 }
5354
5355 void CInode::setxattr_ephemeral_dist(bool val)
5356 {
5357 ceph_assert(is_dir());
5358 _get_projected_inode()->export_ephemeral_distributed_pin = val;
5359 }
5360
5361 void CInode::set_export_pin(mds_rank_t rank)
5362 {
5363 ceph_assert(is_dir());
5364 _get_projected_inode()->export_pin = rank;
5365 maybe_export_pin(true);
5366 }
5367
5368 mds_rank_t CInode::get_export_pin(bool inherit) const
5369 {
5370 if (!g_conf()->mds_bal_export_pin)
5371 return MDS_RANK_NONE;
5372
5373 /* An inode that is export pinned may not necessarily be a subtree root, we
5374 * need to traverse the parents. A base or system inode cannot be pinned.
5375 * N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5376 * have a parent yet.
5377 */
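/* Precedence while walking up the ancestry: an explicit export_pin wins,
 * then a distributed ephemeral pin (if enabled), then an inherited random
 * ephemeral pin; otherwise MDS_RANK_NONE.
 */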
5378 mds_rank_t r_target = MDS_RANK_NONE;
5379 const CInode *in = this;
5380 const CDir *dir = nullptr;
5381 while (true) {
5382 if (in->is_system())
5383 break;
5384 const CDentry *pdn = in->get_parent_dn();
5385 if (!pdn)
5386 break;
5387 if (in->get_inode()->nlink == 0) {
5388 // ignore export pin for unlinked directory
5389 break;
5390 }
5391
5392 if (in->get_inode()->export_pin >= 0) {
5393 return in->get_inode()->export_pin;
5394 } else if (in->get_inode()->export_ephemeral_distributed_pin &&
5395 mdcache->get_export_ephemeral_distributed_config()) {
5396 if (in != this)
5397 return mdcache->hash_into_rank_bucket(in->ino(), dir->get_frag());
5398 return MDS_RANK_EPHEMERAL_DIST;
5399 } else if (r_target != MDS_RANK_NONE && in->get_inode()->export_ephemeral_random_pin > 0.0) {
5400 return r_target;
5401 } else if (r_target == MDS_RANK_NONE && in->is_ephemeral_rand() &&
5402 mdcache->get_export_ephemeral_random_config()) {
5403 /* If a parent overrides a grandparent ephemeral pin policy with an export pin, we use that export pin instead. */
5404 if (!inherit)
5405 return MDS_RANK_EPHEMERAL_RAND;
5406 if (in == this)
5407 r_target = MDS_RANK_EPHEMERAL_RAND;
5408 else
5409 r_target = mdcache->hash_into_rank_bucket(in->ino());
5410 }
5411
5412 if (!inherit)
5413 break;
5414 dir = pdn->get_dir();
5415 in = dir->inode;
5416 }
5417 return MDS_RANK_NONE;
5418 }
5419
5420 void CInode::check_pin_policy(mds_rank_t export_pin)
5421 {
5422 if (export_pin == MDS_RANK_EPHEMERAL_DIST) {
5423 set_ephemeral_pin(true, false);
5424 clear_ephemeral_pin(false, true);
5425 } else if (export_pin == MDS_RANK_EPHEMERAL_RAND) {
5426 set_ephemeral_pin(false, true);
5427 clear_ephemeral_pin(true, false);
5428 } else if (is_ephemerally_pinned()) {
5429 // export_pin >= 0 || export_pin == MDS_RANK_NONE
5430 clear_ephemeral_pin(true, true);
5431 if (export_pin != get_inode()->export_pin) // inherited export_pin
5432 queue_export_pin(MDS_RANK_NONE);
5433 }
5434 }
5435
5436 double CInode::get_ephemeral_rand() const
5437 {
5438 /* N.B. inodes not yet linked into a dir (i.e. anonymous inodes) will not
5439 * have a parent yet.
5440 */
5441 const CInode *in = this;
5442 double max = mdcache->export_ephemeral_random_max;
5443 while (true) {
5444 if (in->is_system())
5445 break;
5446 const CDentry *pdn = in->get_parent_dn();
5447 if (!pdn)
5448 break;
5449 // ignore export pin for unlinked directory
5450 if (in->get_inode()->nlink == 0)
5451 break;
5452
5453 if (in->get_inode()->export_ephemeral_random_pin > 0.0)
5454 return std::min(in->get_inode()->export_ephemeral_random_pin, max);
5455
5456 /* An export_pin overrides only if no closer parent (incl. this one) has a
5457 * random pin set.
5458 */
5459 if (in->get_inode()->export_pin >= 0 ||
5460 in->get_inode()->export_ephemeral_distributed_pin)
5461 return 0.0;
5462
5463 in = pdn->get_dir()->inode;
5464 }
5465 return 0.0;
5466 }
5467
5468 void CInode::get_nested_dirfrags(std::vector<CDir*>& v) const
5469 {
5470 for (const auto &p : dirfrags) {
5471 const auto& dir = p.second;
5472 if (!dir->is_subtree_root())
5473 v.push_back(dir);
5474 }
5475 }
5476
5477 void CInode::get_subtree_dirfrags(std::vector<CDir*>& v) const
5478 {
5479 for (const auto &p : dirfrags) {
5480 const auto& dir = p.second;
5481 if (dir->is_subtree_root())
5482 v.push_back(dir);
5483 }
5484 }
5485
5486 MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co);