]> git.proxmox.com Git - ceph.git/blob - ceph/src/client/Inode.cc
update sources to v12.2.3
[ceph.git] / ceph / src / client / Inode.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "Client.h"
5 #include "Inode.h"
6 #include "Dentry.h"
7 #include "Dir.h"
8 #include "Fh.h"
9 #include "MetaSession.h"
10 #include "ClientSnapRealm.h"
11 #include "Delegation.h"
12
13 #include "mds/flock.h"
14
15 Inode::~Inode()
16 {
17 cap_item.remove_myself();
18 snaprealm_item.remove_myself();
19
20 if (snapdir_parent) {
21 snapdir_parent->flags &= ~I_SNAPDIR_OPEN;
22 snapdir_parent.reset();
23 }
24
25 if (!oset.objects.empty()) {
26 lsubdout(client->cct, client, 0) << __func__ << ": leftover objects on inode 0x"
27 << std::hex << ino << std::dec << dendl;
28 assert(oset.objects.empty());
29 }
30
31 if (!delegations.empty()) {
32 lsubdout(client->cct, client, 0) << __func__ << ": leftover delegations on inode 0x"
33 << std::hex << ino << std::dec << dendl;
34 assert(delegations.empty());
35 }
36
37 delete fcntl_locks;
38 delete flock_locks;
39 }
40
41 ostream& operator<<(ostream &out, const Inode &in)
42 {
43 out << in.vino() << "("
44 << "faked_ino=" << in.faked_ino
45 << " ref=" << in._ref
46 << " ll_ref=" << in.ll_ref
47 << " cap_refs=" << in.cap_refs
48 << " open=" << in.open_by_mode
49 << " mode=" << oct << in.mode << dec
50 << " size=" << in.size << "/" << in.max_size
51 << " mtime=" << in.mtime
52 << " caps=" << ccap_string(in.caps_issued());
53 if (!in.caps.empty()) {
54 out << "(";
55 for (auto p = in.caps.begin(); p != in.caps.end(); ++p) {
56 if (p != in.caps.begin())
57 out << ',';
58 out << p->first << '=' << ccap_string(p->second->issued);
59 }
60 out << ")";
61 }
62 if (in.dirty_caps)
63 out << " dirty_caps=" << ccap_string(in.dirty_caps);
64 if (in.flushing_caps)
65 out << " flushing_caps=" << ccap_string(in.flushing_caps);
66
67 if (in.flags & I_COMPLETE)
68 out << " COMPLETE";
69
70 if (in.is_file())
71 out << " " << in.oset;
72
73 if (!in.dn_set.empty())
74 out << " parents=" << in.dn_set;
75
76 if (in.is_dir() && in.has_dir_layout())
77 out << " has_dir_layout";
78
79 if (in.quota.is_enable())
80 out << " " << in.quota;
81
82 out << ' ' << &in << ")";
83 return out;
84 }
85
86
87 void Inode::make_long_path(filepath& p)
88 {
89 if (!dn_set.empty()) {
90 assert((*dn_set.begin())->dir && (*dn_set.begin())->dir->parent_inode);
91 (*dn_set.begin())->dir->parent_inode->make_long_path(p);
92 p.push_dentry((*dn_set.begin())->name);
93 } else if (snapdir_parent) {
94 snapdir_parent->make_nosnap_relative_path(p);
95 string empty;
96 p.push_dentry(empty);
97 } else
98 p = filepath(ino);
99 }
100
101 /*
102 * make a filepath suitable for an mds request:
103 * - if we are non-snapped/live, the ino is sufficient, e.g. #1234
104 * - if we are snapped, make filepath relative to first non-snapped parent.
105 */
106 void Inode::make_nosnap_relative_path(filepath& p)
107 {
108 if (snapid == CEPH_NOSNAP) {
109 p = filepath(ino);
110 } else if (snapdir_parent) {
111 snapdir_parent->make_nosnap_relative_path(p);
112 string empty;
113 p.push_dentry(empty);
114 } else if (!dn_set.empty()) {
115 assert((*dn_set.begin())->dir && (*dn_set.begin())->dir->parent_inode);
116 (*dn_set.begin())->dir->parent_inode->make_nosnap_relative_path(p);
117 p.push_dentry((*dn_set.begin())->name);
118 } else {
119 p = filepath(ino);
120 }
121 }
122
123 void Inode::get_open_ref(int mode)
124 {
125 open_by_mode[mode]++;
126 break_deleg(!(mode & CEPH_FILE_MODE_WR));
127 }
128
129 bool Inode::put_open_ref(int mode)
130 {
131 //cout << "open_by_mode[" << mode << "] " << open_by_mode[mode] << " -> " << (open_by_mode[mode]-1) << std::endl;
132 if (--open_by_mode[mode] == 0)
133 return true;
134 return false;
135 }
136
137 void Inode::get_cap_ref(int cap)
138 {
139 int n = 0;
140 while (cap) {
141 if (cap & 1) {
142 int c = 1 << n;
143 cap_refs[c]++;
144 //cout << "inode " << *this << " get " << cap_string(c) << " " << (cap_refs[c]-1) << " -> " << cap_refs[c] << std::endl;
145 }
146 cap >>= 1;
147 n++;
148 }
149 }
150
151 int Inode::put_cap_ref(int cap)
152 {
153 int last = 0;
154 int n = 0;
155 while (cap) {
156 if (cap & 1) {
157 int c = 1 << n;
158 if (cap_refs[c] <= 0) {
159 lderr(client->cct) << "put_cap_ref " << ccap_string(c) << " went negative on " << *this << dendl;
160 assert(cap_refs[c] > 0);
161 }
162 if (--cap_refs[c] == 0)
163 last |= c;
164 //cout << "inode " << *this << " put " << cap_string(c) << " " << (cap_refs[c]+1) << " -> " << cap_refs[c] << std::endl;
165 }
166 cap >>= 1;
167 n++;
168 }
169 return last;
170 }
171
172 bool Inode::is_any_caps()
173 {
174 return !caps.empty() || snap_caps;
175 }
176
177 bool Inode::cap_is_valid(Cap* cap) const
178 {
179 /*cout << "cap_gen " << cap->session-> cap_gen << std::endl
180 << "session gen " << cap->gen << std::endl
181 << "cap expire " << cap->session->cap_ttl << std::endl
182 << "cur time " << ceph_clock_now(cct) << std::endl;*/
183 if ((cap->session->cap_gen <= cap->gen)
184 && (ceph_clock_now() < cap->session->cap_ttl)) {
185 return true;
186 }
187 return false;
188 }
189
190 int Inode::caps_issued(int *implemented) const
191 {
192 int c = snap_caps;
193 int i = 0;
194 for (map<mds_rank_t,Cap*>::const_iterator it = caps.begin();
195 it != caps.end();
196 ++it)
197 if (cap_is_valid(it->second)) {
198 c |= it->second->issued;
199 i |= it->second->implemented;
200 }
201 if (implemented)
202 *implemented = i;
203 return c;
204 }
205
206 void Inode::touch_cap(Cap *cap)
207 {
208 // move to back of LRU
209 cap->session->caps.push_back(&cap->cap_item);
210 }
211
212 void Inode::try_touch_cap(mds_rank_t mds)
213 {
214 if (caps.count(mds))
215 touch_cap(caps[mds]);
216 }
217
218 bool Inode::caps_issued_mask(unsigned mask)
219 {
220 int c = snap_caps;
221 if ((c & mask) == mask)
222 return true;
223 // prefer auth cap
224 if (auth_cap &&
225 cap_is_valid(auth_cap) &&
226 (auth_cap->issued & mask) == mask) {
227 touch_cap(auth_cap);
228 return true;
229 }
230 // try any cap
231 for (map<mds_rank_t,Cap*>::iterator it = caps.begin();
232 it != caps.end();
233 ++it) {
234 if (cap_is_valid(it->second)) {
235 if ((it->second->issued & mask) == mask) {
236 touch_cap(it->second);
237 return true;
238 }
239 c |= it->second->issued;
240 }
241 }
242 if ((c & mask) == mask) {
243 // bah.. touch them all
244 for (map<mds_rank_t,Cap*>::iterator it = caps.begin();
245 it != caps.end();
246 ++it)
247 touch_cap(it->second);
248 return true;
249 }
250 return false;
251 }
252
253 int Inode::caps_used()
254 {
255 int w = 0;
256 for (map<int,int>::iterator p = cap_refs.begin();
257 p != cap_refs.end();
258 ++p)
259 if (p->second)
260 w |= p->first;
261 return w;
262 }
263
264 int Inode::caps_file_wanted()
265 {
266 int want = 0;
267 for (map<int,int>::iterator p = open_by_mode.begin();
268 p != open_by_mode.end();
269 ++p)
270 if (p->second)
271 want |= ceph_caps_for_mode(p->first);
272 return want;
273 }
274
275 int Inode::caps_wanted()
276 {
277 int want = caps_file_wanted() | caps_used();
278 if (want & CEPH_CAP_FILE_BUFFER)
279 want |= CEPH_CAP_FILE_EXCL;
280 return want;
281 }
282
283 int Inode::caps_mds_wanted()
284 {
285 int want = 0;
286 for (auto it = caps.begin(); it != caps.end(); ++it)
287 want |= it->second->wanted;
288 return want;
289 }
290
291 int Inode::caps_dirty()
292 {
293 return dirty_caps | flushing_caps;
294 }
295
296 const UserPerm* Inode::get_best_perms()
297 {
298 const UserPerm *perms = NULL;
299 for (const auto ci : caps) {
300 const UserPerm& iperm = ci.second->latest_perms;
301 if (!perms) { // we don't have any, take what's present
302 perms = &iperm;
303 } else if (iperm.uid() == uid) {
304 if (iperm.gid() == gid) { // we have the best possible, return
305 return &iperm;
306 }
307 if (perms->uid() != uid) { // take uid > gid every time
308 perms = &iperm;
309 }
310 } else if (perms->uid() != uid && iperm.gid() == gid) {
311 perms = &iperm; // a matching gid is better than nothing
312 }
313 }
314 return perms;
315 }
316
317 bool Inode::have_valid_size()
318 {
319 // RD+RDCACHE or WR+WRBUFFER => valid size
320 if (caps_issued() & (CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL))
321 return true;
322 return false;
323 }
324
325 // open Dir for an inode. if it's not open, allocated it (and pin dentry in memory).
326 Dir *Inode::open_dir()
327 {
328 if (!dir) {
329 dir = new Dir(this);
330 lsubdout(client->cct, client, 15) << "open_dir " << dir << " on " << this << dendl;
331 assert(dn_set.size() < 2); // dirs can't be hard-linked
332 if (!dn_set.empty())
333 (*dn_set.begin())->get(); // pin dentry
334 get(); // pin inode
335 }
336 return dir;
337 }
338
339 bool Inode::check_mode(const UserPerm& perms, unsigned want)
340 {
341 if (uid == perms.uid()) {
342 // if uid is owner, owner entry determines access
343 want = want << 6;
344 } else if (perms.gid_in_groups(gid)) {
345 // if a gid or sgid matches the owning group, group entry determines access
346 want = want << 3;
347 }
348
349 return (mode & want) == want;
350 }
351
352 void Inode::get() {
353 _ref++;
354 lsubdout(client->cct, client, 15) << "inode.get on " << this << " " << ino << '.' << snapid
355 << " now " << _ref << dendl;
356 }
357
358 //private method to put a reference; see Client::put_inode()
359 int Inode::_put(int n) {
360 _ref -= n;
361 lsubdout(client->cct, client, 15) << "inode.put on " << this << " " << ino << '.' << snapid
362 << " now " << _ref << dendl;
363 assert(_ref >= 0);
364 return _ref;
365 }
366
367
368 void Inode::dump(Formatter *f) const
369 {
370 f->dump_stream("ino") << ino;
371 f->dump_stream("snapid") << snapid;
372 if (rdev)
373 f->dump_unsigned("rdev", rdev);
374 f->dump_stream("ctime") << ctime;
375 f->dump_stream("btime") << btime;
376 f->dump_stream("mode") << '0' << std::oct << mode << std::dec;
377 f->dump_unsigned("uid", uid);
378 f->dump_unsigned("gid", gid);
379 f->dump_int("nlink", nlink);
380
381 f->dump_unsigned("size", size);
382 f->dump_unsigned("max_size", max_size);
383 f->dump_unsigned("truncate_seq", truncate_seq);
384 f->dump_unsigned("truncate_size", truncate_size);
385 f->dump_stream("mtime") << mtime;
386 f->dump_stream("atime") << atime;
387 f->dump_unsigned("time_warp_seq", time_warp_seq);
388 f->dump_unsigned("change_attr", change_attr);
389
390 f->dump_object("layout", layout);
391 if (is_dir()) {
392 f->open_object_section("dir_layout");
393 ::dump(dir_layout, f);
394 f->close_section();
395
396 f->dump_bool("complete", flags & I_COMPLETE);
397 f->dump_bool("ordered", flags & I_DIR_ORDERED);
398
399 /* FIXME when wip-mds-encoding is merged ***
400 f->open_object_section("dir_stat");
401 dirstat.dump(f);
402 f->close_section();
403
404 f->open_object_section("rstat");
405 rstat.dump(f);
406 f->close_section();
407 */
408 }
409
410 f->dump_unsigned("version", version);
411 f->dump_unsigned("xattr_version", xattr_version);
412 f->dump_unsigned("flags", flags);
413
414 if (is_dir()) {
415 if (!dir_contacts.empty()) {
416 f->open_object_section("dir_contants");
417 for (set<int>::iterator p = dir_contacts.begin(); p != dir_contacts.end(); ++p)
418 f->dump_int("mds", *p);
419 f->close_section();
420 }
421 f->dump_int("dir_hashed", (int)dir_hashed);
422 f->dump_int("dir_replicated", (int)dir_replicated);
423 }
424
425 f->open_array_section("caps");
426 for (map<mds_rank_t,Cap*>::const_iterator p = caps.begin(); p != caps.end(); ++p) {
427 f->open_object_section("cap");
428 f->dump_int("mds", p->first);
429 if (p->second == auth_cap)
430 f->dump_int("auth", 1);
431 p->second->dump(f);
432 f->close_section();
433 }
434 f->close_section();
435 if (auth_cap)
436 f->dump_int("auth_cap", auth_cap->session->mds_num);
437
438 f->dump_stream("dirty_caps") << ccap_string(dirty_caps);
439 if (flushing_caps) {
440 f->dump_stream("flushings_caps") << ccap_string(flushing_caps);
441 f->open_object_section("flushing_cap_tid");
442 for (map<ceph_tid_t, int>::const_iterator p = flushing_cap_tids.begin();
443 p != flushing_cap_tids.end();
444 ++p) {
445 string n(ccap_string(p->second));
446 f->dump_unsigned(n.c_str(), p->first);
447 }
448 f->close_section();
449 }
450 f->dump_int("shared_gen", shared_gen);
451 f->dump_int("cache_gen", cache_gen);
452 if (snap_caps) {
453 f->dump_int("snap_caps", snap_caps);
454 f->dump_int("snap_cap_refs", snap_cap_refs);
455 }
456
457 f->dump_stream("hold_caps_until") << hold_caps_until;
458
459 if (snaprealm) {
460 f->open_object_section("snaprealm");
461 snaprealm->dump(f);
462 f->close_section();
463 }
464 if (!cap_snaps.empty()) {
465 for (const auto &p : cap_snaps) {
466 f->open_object_section("cap_snap");
467 f->dump_stream("follows") << p.first;
468 p.second.dump(f);
469 f->close_section();
470 }
471 }
472
473 // open
474 if (!open_by_mode.empty()) {
475 f->open_array_section("open_by_mode");
476 for (map<int,int>::const_iterator p = open_by_mode.begin(); p != open_by_mode.end(); ++p) {
477 f->open_object_section("ref");
478 f->dump_int("mode", p->first);
479 f->dump_int("refs", p->second);
480 f->close_section();
481 }
482 f->close_section();
483 }
484 if (!cap_refs.empty()) {
485 f->open_array_section("cap_refs");
486 for (map<int,int>::const_iterator p = cap_refs.begin(); p != cap_refs.end(); ++p) {
487 f->open_object_section("cap_ref");
488 f->dump_stream("cap") << ccap_string(p->first);
489 f->dump_int("refs", p->second);
490 f->close_section();
491 }
492 f->close_section();
493 }
494
495 f->dump_unsigned("reported_size", reported_size);
496 if (wanted_max_size != max_size)
497 f->dump_unsigned("wanted_max_size", wanted_max_size);
498 if (requested_max_size != max_size)
499 f->dump_unsigned("requested_max_size", requested_max_size);
500
501 f->dump_int("ref", _ref);
502 f->dump_int("ll_ref", ll_ref);
503
504 if (!dn_set.empty()) {
505 f->open_array_section("parents");
506 for (set<Dentry*>::const_iterator p = dn_set.begin(); p != dn_set.end(); ++p) {
507 f->open_object_section("dentry");
508 f->dump_stream("dir_ino") << (*p)->dir->parent_inode->ino;
509 f->dump_string("name", (*p)->name);
510 f->close_section();
511 }
512 f->close_section();
513 }
514 }
515
516 void Cap::dump(Formatter *f) const
517 {
518 f->dump_int("mds", session->mds_num);
519 f->dump_stream("ino") << inode->ino;
520 f->dump_unsigned("cap_id", cap_id);
521 f->dump_stream("issued") << ccap_string(issued);
522 if (implemented != issued)
523 f->dump_stream("implemented") << ccap_string(implemented);
524 f->dump_stream("wanted") << ccap_string(wanted);
525 f->dump_unsigned("seq", seq);
526 f->dump_unsigned("issue_seq", issue_seq);
527 f->dump_unsigned("mseq", mseq);
528 f->dump_unsigned("gen", gen);
529 }
530
531 void CapSnap::dump(Formatter *f) const
532 {
533 f->dump_stream("ino") << in->ino;
534 f->dump_stream("issued") << ccap_string(issued);
535 f->dump_stream("dirty") << ccap_string(dirty);
536 f->dump_unsigned("size", size);
537 f->dump_stream("ctime") << ctime;
538 f->dump_stream("mtime") << mtime;
539 f->dump_stream("atime") << atime;
540 f->dump_int("time_warp_seq", time_warp_seq);
541 f->dump_stream("mode") << '0' << std::oct << mode << std::dec;
542 f->dump_unsigned("uid", uid);
543 f->dump_unsigned("gid", gid);
544 if (!xattrs.empty()) {
545 f->open_object_section("xattr_lens");
546 for (map<string,bufferptr>::const_iterator p = xattrs.begin(); p != xattrs.end(); ++p)
547 f->dump_int(p->first.c_str(), p->second.length());
548 f->close_section();
549 }
550 f->dump_unsigned("xattr_version", xattr_version);
551 f->dump_int("writing", (int)writing);
552 f->dump_int("dirty_data", (int)dirty_data);
553 f->dump_unsigned("flush_tid", flush_tid);
554 }
555
556 void Inode::set_async_err(int r)
557 {
558 for (const auto &fh : fhs) {
559 fh->async_err = r;
560 }
561 }
562
563 bool Inode::has_recalled_deleg()
564 {
565 if (delegations.empty())
566 return false;
567
568 // Either all delegations are recalled or none are. Just check the first.
569 Delegation& deleg = delegations.front();
570 return deleg.is_recalled();
571 }
572
573 void Inode::recall_deleg(bool skip_read)
574 {
575 if (delegations.empty())
576 return;
577
578 // Issue any recalls
579 for (list<Delegation>::iterator d = delegations.begin();
580 d != delegations.end(); ++d) {
581
582 Delegation& deleg = *d;
583 deleg.recall(skip_read);
584 }
585 }
586
587 bool Inode::delegations_broken(bool skip_read)
588 {
589 if (delegations.empty()) {
590 lsubdout(client->cct, client, 10) <<
591 __func__ << ": delegations empty on " << *this << dendl;
592 return true;
593 }
594
595 if (skip_read) {
596 Delegation& deleg = delegations.front();
597 lsubdout(client->cct, client, 10) <<
598 __func__ << ": read delegs only on " << *this << dendl;
599 if (deleg.get_type() == CEPH_FILE_MODE_RD) {
600 return true;
601 }
602 }
603 lsubdout(client->cct, client, 10) <<
604 __func__ << ": not broken" << *this << dendl;
605 return false;
606 }
607
608 void Inode::break_deleg(bool skip_read)
609 {
610 lsubdout(client->cct, client, 10) <<
611 __func__ << ": breaking delegs on " << *this << dendl;
612
613 recall_deleg(skip_read);
614
615 while (!delegations_broken(skip_read))
616 client->wait_on_list(waitfor_deleg);
617 }
618
619 /**
620 * set_deleg: request a delegation on an open Fh
621 * @fh: filehandle on which to acquire it
622 * @type: delegation request type
623 * @cb: delegation recall callback function
624 * @priv: private pointer to be passed to callback
625 *
626 * Attempt to acquire a delegation on an open file handle. If there are no
627 * conflicts and we have the right caps, allocate a new delegation, fill it
628 * out and return 0. Return an error if we can't get one for any reason.
629 */
630 int Inode::set_deleg(Fh *fh, unsigned type, ceph_deleg_cb_t cb, void *priv)
631 {
632 lsubdout(client->cct, client, 10) <<
633 __func__ << ": inode " << *this << dendl;
634
635 /*
636 * 0 deleg timeout means that they haven't been explicitly enabled. Don't
637 * allow it, with an unusual error to make it clear.
638 */
639 if (!client->get_deleg_timeout())
640 return -ETIME;
641
642 // Just say no if we have any recalled delegs still outstanding
643 if (has_recalled_deleg()) {
644 lsubdout(client->cct, client, 10) << __func__ <<
645 ": has_recalled_deleg" << dendl;
646 return -EAGAIN;
647 }
648
649 // check vs. currently open files on this inode
650 switch (type) {
651 case CEPH_DELEGATION_RD:
652 if (open_count_for_write()) {
653 lsubdout(client->cct, client, 10) << __func__ <<
654 ": open for write" << dendl;
655 return -EAGAIN;
656 }
657 break;
658 case CEPH_DELEGATION_WR:
659 if (open_count() > 1) {
660 lsubdout(client->cct, client, 10) << __func__ << ": open" << dendl;
661 return -EAGAIN;
662 }
663 break;
664 default:
665 return -EINVAL;
666 }
667
668 /*
669 * A delegation is essentially a long-held container for cap references that
670 * we delegate to the client until recalled. The caps required depend on the
671 * type of delegation (read vs. rw). This is entirely an opportunistic thing.
672 * If we don't have the necessary caps for the delegation, then we just don't
673 * grant one.
674 *
675 * In principle we could request the caps from the MDS, but a delegation is
676 * usually requested just after an open. If we don't have the necessary caps
677 * already, then it's likely that there is some sort of conflicting access.
678 *
679 * In the future, we may need to add a way to have this request caps more
680 * aggressively -- for instance, to handle WANT_DELEGATION for NFSv4.1+.
681 */
682 int need = ceph_deleg_caps_for_type(type);
683 if (!caps_issued_mask(need)) {
684 lsubdout(client->cct, client, 10) << __func__ << ": cap mismatch, have="
685 << ccap_string(caps_issued()) << " need=" << ccap_string(need) << dendl;
686 return -EAGAIN;
687 }
688
689 for (list<Delegation>::iterator d = delegations.begin();
690 d != delegations.end(); ++d) {
691 Delegation& deleg = *d;
692 if (deleg.get_fh() == fh) {
693 deleg.reinit(type, cb, priv);
694 return 0;
695 }
696 }
697
698 delegations.emplace_back(fh, type, cb, priv);
699 return 0;
700 }
701
702 /**
703 * unset_deleg - remove a delegation that was previously set
704 * @fh: file handle to clear delegation of
705 *
706 * Unlink delegation from the Inode (if there is one), put caps and free it.
707 */
708 void Inode::unset_deleg(Fh *fh)
709 {
710 for (list<Delegation>::iterator d = delegations.begin();
711 d != delegations.end(); ++d) {
712 Delegation& deleg = *d;
713 if (deleg.get_fh() == fh) {
714 delegations.erase(d);
715 client->signal_cond_list(waitfor_deleg);
716 break;
717 }
718 }
719 }