1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
9 #include "MetaSession.h"
10 #include "ClientSnapRealm.h"
11 #include "Delegation.h"
13 #include "mds/flock.h"
17 delay_cap_item
.remove_myself();
18 dirty_cap_item
.remove_myself();
19 snaprealm_item
.remove_myself();
22 snapdir_parent
->flags
&= ~I_SNAPDIR_OPEN
;
23 snapdir_parent
.reset();
26 if (!oset
.objects
.empty()) {
27 lsubdout(client
->cct
, client
, 0) << __func__
<< ": leftover objects on inode 0x"
28 << std::hex
<< ino
<< std::dec
<< dendl
;
29 ceph_assert(oset
.objects
.empty());
32 if (!delegations
.empty()) {
33 lsubdout(client
->cct
, client
, 0) << __func__
<< ": leftover delegations on inode 0x"
34 << std::hex
<< ino
<< std::dec
<< dendl
;
35 ceph_assert(delegations
.empty());
39 ostream
& operator<<(ostream
&out
, const Inode
&in
)
41 out
<< in
.vino() << "("
42 << "faked_ino=" << in
.faked_ino
44 << " ll_ref=" << in
.ll_ref
45 << " cap_refs=" << in
.cap_refs
46 << " open=" << in
.open_by_mode
47 << " mode=" << oct
<< in
.mode
<< dec
48 << " size=" << in
.size
<< "/" << in
.max_size
49 << " nlink=" << in
.nlink
50 << " btime=" << in
.btime
51 << " mtime=" << in
.mtime
52 << " ctime=" << in
.ctime
53 << " caps=" << ccap_string(in
.caps_issued());
54 if (!in
.caps
.empty()) {
57 for (const auto &pair
: in
.caps
) {
60 out
<< pair
.first
<< '=' << ccap_string(pair
.second
.issued
);
66 out
<< " dirty_caps=" << ccap_string(in
.dirty_caps
);
68 out
<< " flushing_caps=" << ccap_string(in
.flushing_caps
);
70 if (in
.flags
& I_COMPLETE
)
74 out
<< " " << in
.oset
;
76 if (!in
.dentries
.empty())
77 out
<< " parents=" << in
.dentries
;
79 if (in
.is_dir() && in
.has_dir_layout())
80 out
<< " has_dir_layout";
82 if (in
.quota
.is_enable())
83 out
<< " " << in
.quota
;
85 out
<< ' ' << &in
<< ")";
90 void Inode::make_long_path(filepath
& p
)
92 if (!dentries
.empty()) {
93 Dentry
*dn
= get_first_parent();
94 ceph_assert(dn
->dir
&& dn
->dir
->parent_inode
);
95 dn
->dir
->parent_inode
->make_long_path(p
);
96 p
.push_dentry(dn
->name
);
97 } else if (snapdir_parent
) {
98 make_nosnap_relative_path(p
);
103 void Inode::make_short_path(filepath
& p
)
105 if (!dentries
.empty()) {
106 Dentry
*dn
= get_first_parent();
107 ceph_assert(dn
->dir
&& dn
->dir
->parent_inode
);
108 p
= filepath(dn
->name
, dn
->dir
->parent_inode
->ino
);
109 } else if (snapdir_parent
) {
110 make_nosnap_relative_path(p
);
116 * make a filepath suitable for an mds request:
117 * - if we are non-snapped/live, the ino is sufficient, e.g. #1234
118 * - if we are snapped, make filepath relative to first non-snapped parent.
120 void Inode::make_nosnap_relative_path(filepath
& p
)
122 if (snapid
== CEPH_NOSNAP
) {
124 } else if (snapdir_parent
) {
125 snapdir_parent
->make_nosnap_relative_path(p
);
127 p
.push_dentry(empty
);
128 } else if (!dentries
.empty()) {
129 Dentry
*dn
= get_first_parent();
130 ceph_assert(dn
->dir
&& dn
->dir
->parent_inode
);
131 dn
->dir
->parent_inode
->make_nosnap_relative_path(p
);
132 p
.push_dentry(dn
->name
);
138 void Inode::get_open_ref(int mode
)
140 client
->inc_opened_files();
141 if (open_by_mode
.count(mode
) == 0)
142 client
->inc_opened_inodes();
143 open_by_mode
[mode
]++;
144 break_deleg(!(mode
& CEPH_FILE_MODE_WR
));
147 bool Inode::put_open_ref(int mode
)
149 //cout << "open_by_mode[" << mode << "] " << open_by_mode[mode] << " -> " << (open_by_mode[mode]-1) << std::endl;
150 auto& ref
= open_by_mode
.at(mode
);
151 ceph_assert(ref
> 0);
152 client
->dec_opened_files();
154 client
->dec_opened_inodes();
160 void Inode::get_cap_ref(int cap
)
167 //cout << "inode " << *this << " get " << cap_string(c) << " " << (cap_refs[c]-1) << " -> " << cap_refs[c] << std::endl;
174 int Inode::put_cap_ref(int cap
)
181 if (cap_refs
[c
] <= 0) {
182 lderr(client
->cct
) << "put_cap_ref " << ccap_string(c
) << " went negative on " << *this << dendl
;
183 ceph_assert(cap_refs
[c
] > 0);
185 if (--cap_refs
[c
] == 0)
187 //cout << "inode " << *this << " put " << cap_string(c) << " " << (cap_refs[c]+1) << " -> " << cap_refs[c] << std::endl;
195 bool Inode::is_any_caps()
197 return !caps
.empty() || snap_caps
;
200 bool Inode::cap_is_valid(const Cap
&cap
) const
202 /*cout << "cap_gen " << cap->session-> cap_gen << std::endl
203 << "session gen " << cap->gen << std::endl
204 << "cap expire " << cap->session->cap_ttl << std::endl
205 << "cur time " << ceph_clock_now(cct) << std::endl;*/
206 if ((cap
.session
->cap_gen
<= cap
.gen
)
207 && (ceph_clock_now() < cap
.session
->cap_ttl
)) {
213 int Inode::caps_issued(int *implemented
) const
217 for (const auto &pair
: caps
) {
218 const Cap
&cap
= pair
.second
;
219 if (cap_is_valid(cap
)) {
221 i
|= cap
.implemented
;
224 // exclude caps issued by a non-auth MDS that are being revoked by
225 // the auth MDS. The non-auth MDS should be revoking/exporting
226 // these caps, but the message is delayed.
228 c
&= ~auth_cap
->implemented
| auth_cap
->issued
;
235 void Inode::try_touch_cap(mds_rank_t mds
)
237 auto it
= caps
.find(mds
);
238 if (it
!= caps
.end()) {
244 * caps_issued_mask - check whether we have all of the caps in the mask
245 * @mask: mask to check against
246 * @allow_impl: whether the caller can also use caps that are implemented but not issued
248 * This is the bog standard "check whether we have the required caps" operation.
249 * Typically, we only check against the capset that is currently "issued".
250 * In other words, we ignore caps that have been revoked but not yet released.
251 * Also account capability hit/miss stats.
253 * Some callers (particularly those doing attribute retrieval) can also make
254 * use of the full set of "implemented" caps to satisfy requests from the cache.
257 * Those callers should refrain from taking new references to implemented caps.
260 bool Inode::caps_issued_mask(unsigned mask
, bool allow_impl
)
265 if ((c
& mask
) == mask
)
269 cap_is_valid(*auth_cap
) &&
270 (auth_cap
->issued
& mask
) == mask
) {
276 for (auto &pair
: caps
) {
277 Cap
&cap
= pair
.second
;
278 if (cap_is_valid(cap
)) {
279 if ((cap
.issued
& mask
) == mask
) {
285 i
|= cap
.implemented
;
292 if ((c
& mask
) == mask
) {
293 // bah.. touch them all
294 for (auto &pair
: caps
) {
305 int Inode::caps_used()
308 for (map
<int,int>::iterator p
= cap_refs
.begin();
316 int Inode::caps_file_wanted()
319 for (map
<int,int>::iterator p
= open_by_mode
.begin();
320 p
!= open_by_mode
.end();
323 want
|= ceph_caps_for_mode(p
->first
);
327 int Inode::caps_wanted()
329 int want
= caps_file_wanted() | caps_used();
330 if (want
& CEPH_CAP_FILE_BUFFER
)
331 want
|= CEPH_CAP_FILE_EXCL
;
335 int Inode::caps_mds_wanted()
338 for (const auto &pair
: caps
) {
339 want
|= pair
.second
.wanted
;
344 int Inode::caps_dirty()
346 return dirty_caps
| flushing_caps
;
349 const UserPerm
* Inode::get_best_perms()
351 const UserPerm
*perms
= NULL
;
352 for (const auto &pair
: caps
) {
353 const UserPerm
& iperm
= pair
.second
.latest_perms
;
354 if (!perms
) { // we don't have any, take what's present
356 } else if (iperm
.uid() == uid
) {
357 if (iperm
.gid() == gid
) { // we have the best possible, return
360 if (perms
->uid() != uid
) { // take uid > gid every time
363 } else if (perms
->uid() != uid
&& iperm
.gid() == gid
) {
364 perms
= &iperm
; // a matching gid is better than nothing
370 bool Inode::have_valid_size()
372 // RD+RDCACHE or WR+WRBUFFER => valid size
373 if (caps_issued() & (CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_EXCL
))
378 // Open the Dir for an inode. If it's not open, allocate it (and pin the dentry in memory).
379 Dir
*Inode::open_dir()
383 lsubdout(client
->cct
, client
, 15) << "open_dir " << dir
<< " on " << this << dendl
;
384 ceph_assert(dentries
.size() < 2); // dirs can't be hard-linked
385 if (!dentries
.empty())
386 get_first_parent()->get(); // pin dentry
392 bool Inode::check_mode(const UserPerm
& perms
, unsigned want
)
394 if (uid
== perms
.uid()) {
395 // if uid is owner, owner entry determines access
397 } else if (perms
.gid_in_groups(gid
)) {
398 // if a gid or sgid matches the owning group, group entry determines access
402 return (mode
& want
) == want
;
407 lsubdout(client
->cct
, client
, 15) << "inode.get on " << this << " " << ino
<< '.' << snapid
408 << " now " << _ref
<< dendl
;
411 //private method to put a reference; see Client::put_inode()
412 int Inode::_put(int n
) {
414 lsubdout(client
->cct
, client
, 15) << "inode.put on " << this << " " << ino
<< '.' << snapid
415 << " now " << _ref
<< dendl
;
416 ceph_assert(_ref
>= 0);
421 void Inode::dump(Formatter
*f
) const
423 f
->dump_stream("ino") << ino
;
424 f
->dump_stream("snapid") << snapid
;
426 f
->dump_unsigned("rdev", rdev
);
427 f
->dump_stream("ctime") << ctime
;
428 f
->dump_stream("btime") << btime
;
429 f
->dump_stream("mode") << '0' << std::oct
<< mode
<< std::dec
;
430 f
->dump_unsigned("uid", uid
);
431 f
->dump_unsigned("gid", gid
);
432 f
->dump_int("nlink", nlink
);
434 f
->dump_unsigned("size", size
);
435 f
->dump_unsigned("max_size", max_size
);
436 f
->dump_unsigned("truncate_seq", truncate_seq
);
437 f
->dump_unsigned("truncate_size", truncate_size
);
438 f
->dump_stream("mtime") << mtime
;
439 f
->dump_stream("atime") << atime
;
440 f
->dump_unsigned("time_warp_seq", time_warp_seq
);
441 f
->dump_unsigned("change_attr", change_attr
);
443 f
->dump_object("layout", layout
);
445 f
->open_object_section("dir_layout");
446 ::dump(dir_layout
, f
);
449 f
->dump_bool("complete", flags
& I_COMPLETE
);
450 f
->dump_bool("ordered", flags
& I_DIR_ORDERED
);
452 /* FIXME when wip-mds-encoding is merged ***
453 f->open_object_section("dir_stat");
457 f->open_object_section("rstat");
463 f
->dump_unsigned("version", version
);
464 f
->dump_unsigned("xattr_version", xattr_version
);
465 f
->dump_unsigned("flags", flags
);
468 f
->dump_int("dir_hashed", (int)dir_hashed
);
469 f
->dump_int("dir_replicated", (int)dir_replicated
);
470 if (dir_replicated
) {
471 f
->open_array_section("dirfrags");
472 for (const auto &frag
: frag_repmap
) {
473 f
->open_object_section("frags");
474 CachedStackStringStream css
;
475 *css
<< std::hex
<< frag
.first
.value() << "/" << std::dec
<< frag
.first
.bits();
476 f
->dump_string("frag", css
->strv());
478 f
->open_array_section("repmap");
479 for (const auto &mds
: frag
.second
) {
480 f
->dump_int("mds", mds
);
490 f
->open_array_section("caps");
491 for (const auto &pair
: caps
) {
492 f
->open_object_section("cap");
493 f
->dump_int("mds", pair
.first
);
494 if (&pair
.second
== auth_cap
)
495 f
->dump_int("auth", 1);
501 f
->dump_int("auth_cap", auth_cap
->session
->mds_num
);
503 f
->dump_stream("dirty_caps") << ccap_string(dirty_caps
);
505 f
->dump_stream("flushings_caps") << ccap_string(flushing_caps
);
506 f
->open_object_section("flushing_cap_tid");
507 for (map
<ceph_tid_t
, int>::const_iterator p
= flushing_cap_tids
.begin();
508 p
!= flushing_cap_tids
.end();
510 string
n(ccap_string(p
->second
));
511 f
->dump_unsigned(n
.c_str(), p
->first
);
515 f
->dump_int("shared_gen", shared_gen
);
516 f
->dump_int("cache_gen", cache_gen
);
518 f
->dump_int("snap_caps", snap_caps
);
519 f
->dump_int("snap_cap_refs", snap_cap_refs
);
522 f
->dump_stream("hold_caps_until") << hold_caps_until
;
525 f
->open_object_section("snaprealm");
529 if (!cap_snaps
.empty()) {
530 for (const auto &p
: cap_snaps
) {
531 f
->open_object_section("cap_snap");
532 f
->dump_stream("follows") << p
.first
;
539 if (!open_by_mode
.empty()) {
540 f
->open_array_section("open_by_mode");
541 for (map
<int,int>::const_iterator p
= open_by_mode
.begin(); p
!= open_by_mode
.end(); ++p
) {
542 f
->open_object_section("ref");
543 f
->dump_int("mode", p
->first
);
544 f
->dump_int("refs", p
->second
);
549 if (!cap_refs
.empty()) {
550 f
->open_array_section("cap_refs");
551 for (map
<int,int>::const_iterator p
= cap_refs
.begin(); p
!= cap_refs
.end(); ++p
) {
552 f
->open_object_section("cap_ref");
553 f
->dump_stream("cap") << ccap_string(p
->first
);
554 f
->dump_int("refs", p
->second
);
560 f
->dump_unsigned("reported_size", reported_size
);
561 if (wanted_max_size
!= max_size
)
562 f
->dump_unsigned("wanted_max_size", wanted_max_size
);
563 if (requested_max_size
!= max_size
)
564 f
->dump_unsigned("requested_max_size", requested_max_size
);
566 f
->dump_int("ref", _ref
);
567 f
->dump_int("ll_ref", ll_ref
);
569 if (!dentries
.empty()) {
570 f
->open_array_section("parents");
571 for (const auto &&dn
: dentries
) {
572 f
->open_object_section("dentry");
573 f
->dump_stream("dir_ino") << dn
->dir
->parent_inode
->ino
;
574 f
->dump_string("name", dn
->name
);
581 void Cap::dump(Formatter
*f
) const
583 f
->dump_int("mds", session
->mds_num
);
584 f
->dump_stream("ino") << inode
.ino
;
585 f
->dump_unsigned("cap_id", cap_id
);
586 f
->dump_stream("issued") << ccap_string(issued
);
587 if (implemented
!= issued
)
588 f
->dump_stream("implemented") << ccap_string(implemented
);
589 f
->dump_stream("wanted") << ccap_string(wanted
);
590 f
->dump_unsigned("seq", seq
);
591 f
->dump_unsigned("issue_seq", issue_seq
);
592 f
->dump_unsigned("mseq", mseq
);
593 f
->dump_unsigned("gen", gen
);
596 void CapSnap::dump(Formatter
*f
) const
598 f
->dump_stream("ino") << in
->ino
;
599 f
->dump_stream("issued") << ccap_string(issued
);
600 f
->dump_stream("dirty") << ccap_string(dirty
);
601 f
->dump_unsigned("size", size
);
602 f
->dump_stream("ctime") << ctime
;
603 f
->dump_stream("mtime") << mtime
;
604 f
->dump_stream("atime") << atime
;
605 f
->dump_int("time_warp_seq", time_warp_seq
);
606 f
->dump_stream("mode") << '0' << std::oct
<< mode
<< std::dec
;
607 f
->dump_unsigned("uid", uid
);
608 f
->dump_unsigned("gid", gid
);
609 if (!xattrs
.empty()) {
610 f
->open_object_section("xattr_lens");
611 for (map
<string
,bufferptr
>::const_iterator p
= xattrs
.begin(); p
!= xattrs
.end(); ++p
)
612 f
->dump_int(p
->first
.c_str(), p
->second
.length());
615 f
->dump_unsigned("xattr_version", xattr_version
);
616 f
->dump_int("writing", (int)writing
);
617 f
->dump_int("dirty_data", (int)dirty_data
);
618 f
->dump_unsigned("flush_tid", flush_tid
);
621 void Inode::set_async_err(int r
)
623 for (const auto &fh
: fhs
) {
628 bool Inode::has_recalled_deleg()
630 if (delegations
.empty())
633 // Either all delegations are recalled or none are. Just check the first.
634 Delegation
& deleg
= delegations
.front();
635 return deleg
.is_recalled();
638 void Inode::recall_deleg(bool skip_read
)
640 if (delegations
.empty())
644 for (list
<Delegation
>::iterator d
= delegations
.begin();
645 d
!= delegations
.end(); ++d
) {
647 Delegation
& deleg
= *d
;
648 deleg
.recall(skip_read
);
652 bool Inode::delegations_broken(bool skip_read
)
654 if (delegations
.empty()) {
655 lsubdout(client
->cct
, client
, 10) <<
656 __func__
<< ": delegations empty on " << *this << dendl
;
661 Delegation
& deleg
= delegations
.front();
662 lsubdout(client
->cct
, client
, 10) <<
663 __func__
<< ": read delegs only on " << *this << dendl
;
664 if (deleg
.get_type() == CEPH_FILE_MODE_RD
) {
668 lsubdout(client
->cct
, client
, 10) <<
669 __func__
<< ": not broken" << *this << dendl
;
673 void Inode::break_deleg(bool skip_read
)
675 lsubdout(client
->cct
, client
, 10) <<
676 __func__
<< ": breaking delegs on " << *this << dendl
;
678 recall_deleg(skip_read
);
680 while (!delegations_broken(skip_read
))
681 client
->wait_on_list(waitfor_deleg
);
685 * set_deleg: request a delegation on an open Fh
686 * @fh: filehandle on which to acquire it
687 * @type: delegation request type
688 * @cb: delegation recall callback function
689 * @priv: private pointer to be passed to callback
691 * Attempt to acquire a delegation on an open file handle. If there are no
692 * conflicts and we have the right caps, allocate a new delegation, fill it
693 * out and return 0. Return an error if we can't get one for any reason.
695 int Inode::set_deleg(Fh
*fh
, unsigned type
, ceph_deleg_cb_t cb
, void *priv
)
697 lsubdout(client
->cct
, client
, 10) <<
698 __func__
<< ": inode " << *this << dendl
;
701 * 0 deleg timeout means that they haven't been explicitly enabled. Don't
702 * allow it, with an unusual error to make it clear.
704 if (!client
->get_deleg_timeout())
705 return -CEPHFS_ETIME
;
707 // Just say no if we have any recalled delegs still outstanding
708 if (has_recalled_deleg()) {
709 lsubdout(client
->cct
, client
, 10) << __func__
<<
710 ": has_recalled_deleg" << dendl
;
711 return -CEPHFS_EAGAIN
;
714 // check vs. currently open files on this inode
716 case CEPH_DELEGATION_RD
:
717 if (open_count_for_write()) {
718 lsubdout(client
->cct
, client
, 10) << __func__
<<
719 ": open for write" << dendl
;
720 return -CEPHFS_EAGAIN
;
723 case CEPH_DELEGATION_WR
:
724 if (open_count() > 1) {
725 lsubdout(client
->cct
, client
, 10) << __func__
<< ": open" << dendl
;
726 return -CEPHFS_EAGAIN
;
730 return -CEPHFS_EINVAL
;
734 * A delegation is essentially a long-held container for cap references that
735 * we delegate to the client until recalled. The caps required depend on the
736 * type of delegation (read vs. rw). This is entirely an opportunistic thing.
737 * If we don't have the necessary caps for the delegation, then we just don't grant one.
740 * In principle we could request the caps from the MDS, but a delegation is
741 * usually requested just after an open. If we don't have the necessary caps
742 * already, then it's likely that there is some sort of conflicting access.
744 * In the future, we may need to add a way to have this request caps more
745 * aggressively -- for instance, to handle WANT_DELEGATION for NFSv4.1+.
747 int need
= ceph_deleg_caps_for_type(type
);
748 if (!caps_issued_mask(need
)) {
749 lsubdout(client
->cct
, client
, 10) << __func__
<< ": cap mismatch, have="
750 << ccap_string(caps_issued()) << " need=" << ccap_string(need
) << dendl
;
751 return -CEPHFS_EAGAIN
;
754 for (list
<Delegation
>::iterator d
= delegations
.begin();
755 d
!= delegations
.end(); ++d
) {
756 Delegation
& deleg
= *d
;
757 if (deleg
.get_fh() == fh
) {
758 deleg
.reinit(type
, cb
, priv
);
763 delegations
.emplace_back(fh
, type
, cb
, priv
);
768 * unset_deleg - remove a delegation that was previously set
769 * @fh: file handle to clear delegation of
771 * Unlink delegation from the Inode (if there is one), put caps and free it.
773 void Inode::unset_deleg(Fh
*fh
)
775 for (list
<Delegation
>::iterator d
= delegations
.begin();
776 d
!= delegations
.end(); ++d
) {
777 Delegation
& deleg
= *d
;
778 if (deleg
.get_fh() == fh
) {
779 delegations
.erase(d
);
780 client
->signal_cond_list(waitfor_deleg
);
787 * mark_caps_dirty - mark some caps dirty
788 * @caps: the dirty caps
790 * Note that if there were no dirty or flushing caps before, we need to pin this inode.
791 * It will be unpinned by handle_cap_flush_ack when there are no dirty or flushing caps left.
793 void Inode::mark_caps_dirty(int caps
)
795 lsubdout(client
->cct
, client
, 10) << __func__
<< " " << *this << " " << ccap_string(dirty_caps
) << " -> "
796 << ccap_string(dirty_caps
| caps
) << dendl
;
797 if (caps
&& !caps_dirty())
800 client
->get_dirty_list().push_back(&dirty_cap_item
);
804 * mark_caps_clean - only clear dirty_caps; the caller should start flushing the dirty caps.
806 void Inode::mark_caps_clean()
808 lsubdout(client
->cct
, client
, 10) << __func__
<< " " << *this << dendl
;
810 dirty_cap_item
.remove_myself();