1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <sys/types.h>
23 #include <sys/param.h>
26 #include <sys/utsname.h>
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
36 #include <sys/xattr.h>
39 #if defined(__linux__)
40 #include <linux/falloc.h>
43 #include <sys/statvfs.h>
45 #include "common/config.h"
46 #include "common/version.h"
48 #include "mon/MonClient.h"
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
72 #include "common/Cond.h"
73 #include "common/Mutex.h"
74 #include "common/perf_counters.h"
75 #include "common/admin_socket.h"
76 #include "common/errno.h"
77 #include "include/str_list.h"
79 #define dout_subsys ceph_subsys_client
81 #include "include/lru.h"
82 #include "include/compat.h"
83 #include "include/stringify.h"
88 #include "Delegation.h"
90 #include "ClientSnapRealm.h"
92 #include "MetaSession.h"
93 #include "MetaRequest.h"
94 #include "ObjecterWriteback.h"
95 #include "posix_acl.h"
97 #include "include/ceph_assert.h"
98 #include "include/stat.h"
100 #include "include/cephfs/ceph_statx.h"
102 #if HAVE_GETGROUPLIST
109 #define dout_prefix *_dout << "client." << whoami << " "
111 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
113 // FreeBSD fails to define this
117 // Darwin fails to define this
126 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
128 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
130 Client
*client
= static_cast<Client
*>(p
);
131 client
->flush_set_callback(oset
);
137 Client::CommandHook::CommandHook(Client
*client
) :
142 bool Client::CommandHook::call(std::string_view command
,
143 const cmdmap_t
& cmdmap
,
144 std::string_view format
, bufferlist
& out
)
146 std::unique_ptr
<Formatter
> f(Formatter::create(format
));
147 f
->open_object_section("result");
148 m_client
->client_lock
.Lock();
149 if (command
== "mds_requests")
150 m_client
->dump_mds_requests(f
.get());
151 else if (command
== "mds_sessions")
152 m_client
->dump_mds_sessions(f
.get());
153 else if (command
== "dump_cache")
154 m_client
->dump_cache(f
.get());
155 else if (command
== "kick_stale_sessions")
156 m_client
->_kick_stale_sessions();
157 else if (command
== "status")
158 m_client
->dump_status(f
.get());
160 ceph_abort_msg("bad command registered");
161 m_client
->client_lock
.Unlock();
170 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
171 : inode(in
), offset(0), next_offset(2),
172 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
176 void Client::_reset_faked_inos()
179 free_faked_inos
.clear();
180 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
181 last_used_faked_ino
= 0;
182 last_used_faked_root
= 0;
183 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
186 void Client::_assign_faked_ino(Inode
*in
)
188 if (0 == last_used_faked_ino
)
189 last_used_faked_ino
= last_used_faked_ino
+ 2048; // start(1024)~2048 reserved for _assign_faked_root
190 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
191 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
192 last_used_faked_ino
= 2048;
193 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
195 ceph_assert(it
!= free_faked_inos
.end());
196 if (last_used_faked_ino
< it
.get_start()) {
197 ceph_assert(it
.get_len() > 0);
198 last_used_faked_ino
= it
.get_start();
200 ++last_used_faked_ino
;
201 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
203 in
->faked_ino
= last_used_faked_ino
;
204 free_faked_inos
.erase(in
->faked_ino
);
205 faked_ino_map
[in
->faked_ino
] = in
->vino();
209 * In the faked mode, if you export multiple subdirectories,
210 * you will see that the inode numbers of the exported subdirectories
211 * are the same. so we distinguish the mount point by reserving
212 * the "fake ids" between "1024~2048" and combining the last
213 * 10bits(0x3ff) of the "root inodes".
215 void Client::_assign_faked_root(Inode
*in
)
217 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
218 if (it
== free_faked_inos
.end() && last_used_faked_root
> 0) {
219 last_used_faked_root
= 0;
220 it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
222 assert(it
!= free_faked_inos
.end());
223 vinodeno_t inode_info
= in
->vino();
224 uint64_t inode_num
= (uint64_t)inode_info
.ino
;
225 ldout(cct
, 10) << "inode_num " << inode_num
<< "inode_num & 0x3ff=" << (inode_num
& 0x3ff)<< dendl
;
226 last_used_faked_root
= it
.get_start() + (inode_num
& 0x3ff); // 0x3ff mask and get_start will not exceed 2048
227 assert(it
.get_start() + it
.get_len() > last_used_faked_root
);
229 in
->faked_ino
= last_used_faked_root
;
230 free_faked_inos
.erase(in
->faked_ino
);
231 faked_ino_map
[in
->faked_ino
] = in
->vino();
234 void Client::_release_faked_ino(Inode
*in
)
236 free_faked_inos
.insert(in
->faked_ino
);
237 faked_ino_map
.erase(in
->faked_ino
);
240 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
245 else if (faked_ino_map
.count(ino
))
246 vino
= faked_ino_map
[ino
];
248 vino
= vinodeno_t(0, CEPH_NOSNAP
);
249 ldout(cct
, 10) << __func__
<< " " << ino
<< " -> " << vino
<< dendl
;
253 vinodeno_t
Client::map_faked_ino(ino_t ino
)
255 std::lock_guard
lock(client_lock
);
256 return _map_faked_ino(ino
);
261 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
262 : Dispatcher(m
->cct
),
263 timer(m
->cct
, client_lock
),
264 client_lock("Client::client_lock"),
268 whoami(mc
->get_global_id()),
269 async_ino_invalidator(m
->cct
),
270 async_dentry_invalidator(m
->cct
),
271 interrupt_finisher(m
->cct
),
272 remount_finisher(m
->cct
),
273 objecter_finisher(m
->cct
),
274 m_command_hook(this),
279 user_id
= cct
->_conf
->client_mount_uid
;
280 group_id
= cct
->_conf
->client_mount_gid
;
282 if (cct
->_conf
->client_acl_type
== "posix_acl")
283 acl_type
= POSIX_ACL
;
285 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
288 free_fd_set
.insert(10, 1<<30);
290 mdsmap
.reset(new MDSMap
);
293 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
295 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
296 client_flush_set_callback
, // all commit callback
298 cct
->_conf
->client_oc_size
,
299 cct
->_conf
->client_oc_max_objects
,
300 cct
->_conf
->client_oc_max_dirty
,
301 cct
->_conf
->client_oc_target_dirty
,
302 cct
->_conf
->client_oc_max_dirty_age
,
304 objecter_finisher
.start();
305 filer
.reset(new Filer(objecter
, &objecter_finisher
));
306 objecter
->enable_blacklist_events();
312 ceph_assert(!client_lock
.is_locked());
314 // It is necessary to hold client_lock, because any inode destruction
315 // may call into ObjectCacher, which asserts that it's lock (which is
316 // client_lock) is held.
319 client_lock
.Unlock();
322 void Client::tear_down_cache()
325 for (ceph::unordered_map
<int, Fh
*>::iterator it
= fd_map
.begin();
329 ldout(cct
, 1) << __func__
<< " forcing close of fh " << it
->first
<< " ino " << fh
->inode
->ino
<< dendl
;
334 while (!opened_dirs
.empty()) {
335 dir_result_t
*dirp
= *opened_dirs
.begin();
336 ldout(cct
, 1) << __func__
<< " forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
345 ceph_assert(lru
.lru_get_size() == 0);
348 ceph_assert(inode_map
.size() <= 1 + root_parents
.size());
349 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
353 while (!root_parents
.empty())
354 root_parents
.erase(root_parents
.begin());
359 ceph_assert(inode_map
.empty());
362 inodeno_t
Client::get_root_ino()
364 std::lock_guard
l(client_lock
);
365 if (use_faked_inos())
366 return root
->faked_ino
;
371 Inode
*Client::get_root()
373 std::lock_guard
l(client_lock
);
381 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
384 in
->make_long_path(path
);
385 ldout(cct
, 1) << "dump_inode: "
386 << (disconnected
? "DISCONNECTED ":"")
387 << "inode " << in
->ino
389 << " ref " << in
->get_num_ref()
393 f
->open_object_section("inode");
394 f
->dump_stream("path") << path
;
396 f
->dump_int("disconnected", 1);
403 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
404 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
405 it
!= in
->dir
->dentries
.end();
407 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
409 f
->open_object_section("dentry");
413 if (it
->second
->inode
)
414 dump_inode(f
, it
->second
->inode
.get(), did
, false);
419 void Client::dump_cache(Formatter
*f
)
423 ldout(cct
, 1) << __func__
<< dendl
;
426 f
->open_array_section("cache");
429 dump_inode(f
, root
, did
, true);
431 // make a second pass to catch anything disconnected
432 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
433 it
!= inode_map
.end();
435 if (did
.count(it
->second
))
437 dump_inode(f
, it
->second
, did
, true);
444 void Client::dump_status(Formatter
*f
)
446 ceph_assert(client_lock
.is_locked_by_me());
448 ldout(cct
, 1) << __func__
<< dendl
;
450 const epoch_t osd_epoch
451 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
454 f
->open_object_section("metadata");
455 for (const auto& kv
: metadata
)
456 f
->dump_string(kv
.first
.c_str(), kv
.second
);
459 f
->dump_int("dentry_count", lru
.lru_get_size());
460 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
461 f
->dump_int("id", get_nodeid().v
);
462 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
463 f
->dump_object("inst", inst
);
464 f
->dump_object("addr", inst
.addr
);
465 f
->dump_stream("inst_str") << inst
.name
<< " " << inst
.addr
.get_legacy_str();
466 f
->dump_string("addr_str", inst
.addr
.get_legacy_str());
467 f
->dump_int("inode_count", inode_map
.size());
468 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
469 f
->dump_int("osd_epoch", osd_epoch
);
470 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
471 f
->dump_bool("blacklisted", blacklisted
);
478 objectcacher
->start();
481 ceph_assert(!initialized
);
483 messenger
->add_dispatcher_tail(this);
484 client_lock
.Unlock();
490 void Client::_finish_init()
494 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
495 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
496 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
497 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
498 plb
.add_time_avg(l_c_read
, "rdlat", "Latency of a file data read operation");
499 plb
.add_time_avg(l_c_fsync
, "fsync", "Latency of a file sync operation");
500 logger
.reset(plb
.create_perf_counters());
501 cct
->get_perfcounters_collection()->add(logger
.get());
503 client_lock
.Unlock();
505 cct
->_conf
.add_observer(this);
507 AdminSocket
* admin_socket
= cct
->get_admin_socket();
508 int ret
= admin_socket
->register_command("mds_requests",
511 "show in-progress mds requests");
513 lderr(cct
) << "error registering admin socket command: "
514 << cpp_strerror(-ret
) << dendl
;
516 ret
= admin_socket
->register_command("mds_sessions",
519 "show mds session state");
521 lderr(cct
) << "error registering admin socket command: "
522 << cpp_strerror(-ret
) << dendl
;
524 ret
= admin_socket
->register_command("dump_cache",
527 "show in-memory metadata cache contents");
529 lderr(cct
) << "error registering admin socket command: "
530 << cpp_strerror(-ret
) << dendl
;
532 ret
= admin_socket
->register_command("kick_stale_sessions",
533 "kick_stale_sessions",
535 "kick sessions that were remote reset");
537 lderr(cct
) << "error registering admin socket command: "
538 << cpp_strerror(-ret
) << dendl
;
540 ret
= admin_socket
->register_command("status",
543 "show overall client status");
545 lderr(cct
) << "error registering admin socket command: "
546 << cpp_strerror(-ret
) << dendl
;
551 client_lock
.Unlock();
554 void Client::shutdown()
556 ldout(cct
, 1) << __func__
<< dendl
;
558 // If we were not mounted, but were being used for sending
559 // MDS commands, we may have sessions that need closing.
562 client_lock
.Unlock();
564 cct
->_conf
.remove_observer(this);
566 cct
->get_admin_socket()->unregister_commands(&m_command_hook
);
568 if (ino_invalidate_cb
) {
569 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
570 async_ino_invalidator
.wait_for_empty();
571 async_ino_invalidator
.stop();
574 if (dentry_invalidate_cb
) {
575 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
576 async_dentry_invalidator
.wait_for_empty();
577 async_dentry_invalidator
.stop();
580 if (switch_interrupt_cb
) {
581 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
582 interrupt_finisher
.wait_for_empty();
583 interrupt_finisher
.stop();
587 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
588 remount_finisher
.wait_for_empty();
589 remount_finisher
.stop();
592 objectcacher
->stop(); // outside of client_lock! this does a join.
595 ceph_assert(initialized
);
598 client_lock
.Unlock();
600 objecter_finisher
.wait_for_empty();
601 objecter_finisher
.stop();
604 cct
->get_perfcounters_collection()->remove(logger
.get());
610 // ===================
611 // metadata cache stuff
613 void Client::trim_cache(bool trim_kernel_dcache
)
615 uint64_t max
= cct
->_conf
->client_cache_size
;
616 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
618 while (lru
.lru_get_size() != last
) {
619 last
= lru
.lru_get_size();
621 if (!unmounting
&& lru
.lru_get_size() <= max
) break;
624 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
631 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
632 _invalidate_kernel_dcache();
635 if (lru
.lru_get_size() == 0 && root
&& root
->get_num_ref() == 0 && inode_map
.size() == 1 + root_parents
.size()) {
636 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
640 while (!root_parents
.empty())
641 root_parents
.erase(root_parents
.begin());
647 void Client::trim_cache_for_reconnect(MetaSession
*s
)
649 mds_rank_t mds
= s
->mds_num
;
650 ldout(cct
, 20) << __func__
<< " mds." << mds
<< dendl
;
653 list
<Dentry
*> skipped
;
654 while (lru
.lru_get_size() > 0) {
655 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
659 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
660 dn
->dir
->parent_inode
->caps
.count(mds
)) {
664 skipped
.push_back(dn
);
667 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
668 lru
.lru_insert_mid(*p
);
670 ldout(cct
, 20) << __func__
<< " mds." << mds
671 << " trimmed " << trimmed
<< " dentries" << dendl
;
673 if (s
->caps
.size() > 0)
674 _invalidate_kernel_dcache();
677 void Client::trim_dentry(Dentry
*dn
)
679 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
681 << std::hex
<< dn
->dir
->parent_inode
->ino
<< std::dec
684 Inode
*diri
= dn
->dir
->parent_inode
;
685 diri
->dir_release_count
++;
686 clear_dir_complete_and_ordered(diri
, true);
688 unlink(dn
, false, false); // drop dir, drop dentry
692 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
693 uint64_t truncate_seq
, uint64_t truncate_size
)
695 uint64_t prior_size
= in
->size
;
697 if (truncate_seq
> in
->truncate_seq
||
698 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
699 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
701 in
->reported_size
= size
;
702 if (truncate_seq
!= in
->truncate_seq
) {
703 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
704 << truncate_seq
<< dendl
;
705 in
->truncate_seq
= truncate_seq
;
706 in
->oset
.truncate_seq
= truncate_seq
;
708 // truncate cached file data
709 if (prior_size
> size
) {
710 _invalidate_inode_cache(in
, truncate_size
, prior_size
- truncate_size
);
714 // truncate inline data
715 if (in
->inline_version
< CEPH_INLINE_NONE
) {
716 uint32_t len
= in
->inline_data
.length();
718 in
->inline_data
.splice(size
, len
- size
);
721 if (truncate_seq
>= in
->truncate_seq
&&
722 in
->truncate_size
!= truncate_size
) {
724 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
725 << truncate_size
<< dendl
;
726 in
->truncate_size
= truncate_size
;
727 in
->oset
.truncate_size
= truncate_size
;
729 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
734 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
735 utime_t ctime
, utime_t mtime
, utime_t atime
)
737 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
738 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
740 if (time_warp_seq
> in
->time_warp_seq
)
741 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
742 << " is higher than local time_warp_seq "
743 << in
->time_warp_seq
<< dendl
;
746 // be careful with size, mtime, atime
747 if (issued
& (CEPH_CAP_FILE_EXCL
|
749 CEPH_CAP_FILE_BUFFER
|
751 CEPH_CAP_XATTR_EXCL
)) {
752 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
753 if (ctime
> in
->ctime
)
755 if (time_warp_seq
> in
->time_warp_seq
) {
756 //the mds updated times, so take those!
759 in
->time_warp_seq
= time_warp_seq
;
760 } else if (time_warp_seq
== in
->time_warp_seq
) {
762 if (mtime
> in
->mtime
)
764 if (atime
> in
->atime
)
766 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
767 //ignore mds values as we have a higher seq
770 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
771 if (time_warp_seq
>= in
->time_warp_seq
) {
775 in
->time_warp_seq
= time_warp_seq
;
779 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
780 << time_warp_seq
<< " is lower than local time_warp_seq "
786 void Client::_fragmap_remove_non_leaves(Inode
*in
)
788 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
789 if (!in
->dirfragtree
.is_leaf(p
->first
))
790 in
->fragmap
.erase(p
++);
795 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
797 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
798 if (p
->second
== mds
)
799 in
->fragmap
.erase(p
++);
804 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
805 MetaSession
*session
,
806 const UserPerm
& request_perms
)
809 bool was_new
= false;
810 if (inode_map
.count(st
->vino
)) {
811 in
= inode_map
[st
->vino
];
812 ldout(cct
, 12) << __func__
<< " had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
814 in
= new Inode(this, st
->vino
, &st
->layout
);
815 inode_map
[st
->vino
] = in
;
817 if (use_faked_inos())
818 _assign_faked_ino(in
);
822 if (use_faked_inos())
823 _assign_faked_root(root
);
826 } else if (!mounted
) {
827 root_parents
[root_ancestor
] = in
;
832 in
->ino
= st
->vino
.ino
;
833 in
->snapid
= st
->vino
.snapid
;
834 in
->mode
= st
->mode
& S_IFMT
;
839 if (in
->is_symlink())
840 in
->symlink
= st
->symlink
;
842 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
843 bool new_version
= false;
844 if (in
->version
== 0 ||
845 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
846 (in
->version
& ~1) < st
->version
))
850 in
->caps_issued(&issued
);
851 issued
|= in
->caps_dirty();
852 int new_issued
= ~issued
& (int)st
->cap
.caps
;
854 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
855 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
859 in
->btime
= st
->btime
;
860 in
->snap_btime
= st
->snap_btime
;
863 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
864 !(issued
& CEPH_CAP_LINK_EXCL
)) {
865 in
->nlink
= st
->nlink
;
868 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
869 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
870 st
->ctime
, st
->mtime
, st
->atime
);
874 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
875 in
->layout
= st
->layout
;
876 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
880 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
881 in
->dirstat
= st
->dirstat
;
883 // dir_layout/rstat/quota are not tracked by capability, update them only if
884 // the inode stat is from auth mds
885 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
886 in
->dir_layout
= st
->dir_layout
;
887 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
888 in
->rstat
= st
->rstat
;
889 in
->quota
= st
->quota
;
890 in
->dir_pin
= st
->dir_pin
;
892 // move me if/when version reflects fragtree changes.
893 if (in
->dirfragtree
!= st
->dirfragtree
) {
894 in
->dirfragtree
= st
->dirfragtree
;
895 _fragmap_remove_non_leaves(in
);
899 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
900 st
->xattrbl
.length() &&
901 st
->xattr_version
> in
->xattr_version
) {
902 auto p
= st
->xattrbl
.cbegin();
903 decode(in
->xattrs
, p
);
904 in
->xattr_version
= st
->xattr_version
;
907 if (st
->inline_version
> in
->inline_version
) {
908 in
->inline_data
= st
->inline_data
;
909 in
->inline_version
= st
->inline_version
;
912 /* always take a newer change attr */
913 if (st
->change_attr
> in
->change_attr
)
914 in
->change_attr
= st
->change_attr
;
916 if (st
->version
> in
->version
)
917 in
->version
= st
->version
;
920 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
923 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
925 if (in
->snapid
== CEPH_NOSNAP
) {
926 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.wanted
,
927 st
->cap
.seq
, st
->cap
.mseq
, inodeno_t(st
->cap
.realm
),
928 st
->cap
.flags
, request_perms
);
929 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
930 in
->max_size
= st
->max_size
;
931 in
->rstat
= st
->rstat
;
934 // setting I_COMPLETE needs to happen after adding the cap
936 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
937 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
938 in
->dirstat
.nfiles
== 0 &&
939 in
->dirstat
.nsubdirs
== 0) {
940 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
941 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
943 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
944 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
945 in
->dir
->readdir_cache
.clear();
946 for (const auto& p
: in
->dir
->dentries
) {
947 unlink(p
.second
, true, true); // keep dir, keep dentry
949 if (in
->dir
->dentries
.empty())
954 in
->snap_caps
|= st
->cap
.caps
;
962 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
964 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
965 Inode
*in
, utime_t from
, MetaSession
*session
,
969 if (dir
->dentries
.count(dname
))
970 dn
= dir
->dentries
[dname
];
972 ldout(cct
, 12) << __func__
<< " '" << dname
<< "' vino " << in
->vino()
973 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
976 if (dn
&& dn
->inode
) {
977 if (dn
->inode
->vino() == in
->vino()) {
979 ldout(cct
, 12) << " had dentry " << dname
980 << " with correct vino " << dn
->inode
->vino()
983 ldout(cct
, 12) << " had dentry " << dname
984 << " with WRONG vino " << dn
->inode
->vino()
986 unlink(dn
, true, true); // keep dir, keep dentry
990 if (!dn
|| !dn
->inode
) {
991 InodeRef
tmp_ref(in
);
993 if (old_dentry
->dir
!= dir
) {
994 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
995 old_diri
->dir_ordered_count
++;
996 clear_dir_complete_and_ordered(old_diri
, false);
998 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
1000 Inode
*diri
= dir
->parent_inode
;
1001 diri
->dir_ordered_count
++;
1002 clear_dir_complete_and_ordered(diri
, false);
1003 dn
= link(dir
, dname
, in
, dn
);
1006 update_dentry_lease(dn
, dlease
, from
, session
);
1010 void Client::update_dentry_lease(Dentry
*dn
, LeaseStat
*dlease
, utime_t from
, MetaSession
*session
)
1012 utime_t dttl
= from
;
1013 dttl
+= (float)dlease
->duration_ms
/ 1000.0;
1017 if (dlease
->mask
& CEPH_LOCK_DN
) {
1018 if (dttl
> dn
->lease_ttl
) {
1019 ldout(cct
, 10) << "got dentry lease on " << dn
->name
1020 << " dur " << dlease
->duration_ms
<< "ms ttl " << dttl
<< dendl
;
1021 dn
->lease_ttl
= dttl
;
1022 dn
->lease_mds
= session
->mds_num
;
1023 dn
->lease_seq
= dlease
->seq
;
1024 dn
->lease_gen
= session
->cap_gen
;
1027 dn
->cap_shared_gen
= dn
->dir
->parent_inode
->shared_gen
;
1032 * update MDS location cache for a single inode
1034 void Client::update_dir_dist(Inode
*in
, DirStat
*dst
)
1037 ldout(cct
, 20) << "got dirfrag map for " << in
->ino
<< " frag " << dst
->frag
<< " to mds " << dst
->auth
<< dendl
;
1038 if (dst
->auth
>= 0) {
1039 in
->fragmap
[dst
->frag
] = dst
->auth
;
1041 in
->fragmap
.erase(dst
->frag
);
1043 if (!in
->dirfragtree
.is_leaf(dst
->frag
)) {
1044 in
->dirfragtree
.force_to_leaf(cct
, dst
->frag
);
1045 _fragmap_remove_non_leaves(in
);
1049 in
->dir_replicated
= !dst
->dist
.empty(); // FIXME that's just one frag!
1053 if (!st->dirfrag_dist.empty()) { // FIXME
1054 set<int> dist = st->dirfrag_dist.begin()->second;
1055 if (dist.empty() && !in->dir_contacts.empty())
1056 ldout(cct, 9) << "lost dist spec for " << in->ino
1057 << " " << dist << dendl;
1058 if (!dist.empty() && in->dir_contacts.empty())
1059 ldout(cct, 9) << "got dist spec for " << in->ino
1060 << " " << dist << dendl;
1061 in->dir_contacts = dist;
1066 void Client::clear_dir_complete_and_ordered(Inode
*diri
, bool complete
)
1068 if (diri
->flags
& I_COMPLETE
) {
1070 ldout(cct
, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
1071 diri
->flags
&= ~(I_COMPLETE
| I_DIR_ORDERED
);
1073 if (diri
->flags
& I_DIR_ORDERED
) {
1074 ldout(cct
, 10) << " clearing I_DIR_ORDERED on " << *diri
<< dendl
;
1075 diri
->flags
&= ~I_DIR_ORDERED
;
1079 diri
->dir
->readdir_cache
.clear();
1084 * insert results from readdir or lssnap into the metadata cache.
1086 void Client::insert_readdir_results(MetaRequest
*request
, MetaSession
*session
, Inode
*diri
) {
1088 auto& reply
= request
->reply
;
1089 ConnectionRef con
= request
->reply
->get_connection();
1091 if(session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1092 features
= (uint64_t)-1;
1095 features
= con
->get_features();
1098 dir_result_t
*dirp
= request
->dirp
;
1101 // the extra buffer list is only set for readdir and lssnap replies
1102 auto p
= reply
->get_extra_bl().cbegin();
1105 if (request
->head
.op
== CEPH_MDS_OP_LSSNAP
) {
1107 diri
= open_snapdir(diri
);
1110 // only open dir if we're actually adding stuff to it!
1111 Dir
*dir
= diri
->open_dir();
1115 DirStat
dst(p
, features
);
1121 bool end
= ((unsigned)flags
& CEPH_READDIR_FRAG_END
);
1122 bool hash_order
= ((unsigned)flags
& CEPH_READDIR_HASH_ORDER
);
1124 frag_t fg
= (unsigned)request
->head
.args
.readdir
.frag
;
1125 unsigned readdir_offset
= dirp
->next_offset
;
1126 string readdir_start
= dirp
->last_name
;
1127 ceph_assert(!readdir_start
.empty() || readdir_offset
== 2);
1129 unsigned last_hash
= 0;
1131 if (!readdir_start
.empty()) {
1132 last_hash
= ceph_frag_value(diri
->hash_dentry_name(readdir_start
));
1133 } else if (flags
& CEPH_READDIR_OFFSET_HASH
) {
1134 /* mds understands offset_hash */
1135 last_hash
= (unsigned)request
->head
.args
.readdir
.offset_hash
;
1139 if (fg
!= dst
.frag
) {
1140 ldout(cct
, 10) << "insert_trace got new frag " << fg
<< " -> " << dst
.frag
<< dendl
;
1144 readdir_start
.clear();
1145 dirp
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
, false);
1149 ldout(cct
, 10) << __func__
<< " " << numdn
<< " readdir items, end=" << end
1150 << ", hash_order=" << hash_order
1151 << ", readdir_start " << readdir_start
1152 << ", last_hash " << last_hash
1153 << ", next_offset " << readdir_offset
<< dendl
;
1155 if (diri
->snapid
!= CEPH_SNAPDIR
&&
1156 fg
.is_leftmost() && readdir_offset
== 2 &&
1157 !(hash_order
&& last_hash
)) {
1158 dirp
->release_count
= diri
->dir_release_count
;
1159 dirp
->ordered_count
= diri
->dir_ordered_count
;
1160 dirp
->start_shared_gen
= diri
->shared_gen
;
1161 dirp
->cache_index
= 0;
1164 dirp
->buffer_frag
= fg
;
1166 _readdir_drop_dirp_buffer(dirp
);
1167 dirp
->buffer
.reserve(numdn
);
1171 for (unsigned i
=0; i
<numdn
; i
++) {
1173 dlease
.decode(p
, features
);
1174 InodeStat
ist(p
, features
);
1176 ldout(cct
, 15) << "" << i
<< ": '" << dname
<< "'" << dendl
;
1178 Inode
*in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1181 if (diri
->dir
->dentries
.count(dname
)) {
1182 Dentry
*olddn
= diri
->dir
->dentries
[dname
];
1183 if (olddn
->inode
!= in
) {
1184 // replace incorrect dentry
1185 unlink(olddn
, true, true); // keep dir, dentry
1186 dn
= link(dir
, dname
, in
, olddn
);
1187 ceph_assert(dn
== olddn
);
1195 dn
= link(dir
, dname
, in
, NULL
);
1198 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1200 unsigned hash
= ceph_frag_value(diri
->hash_dentry_name(dname
));
1201 if (hash
!= last_hash
)
1204 dn
->offset
= dir_result_t::make_fpos(hash
, readdir_offset
++, true);
1206 dn
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
++, false);
1208 // add to readdir cache
1209 if (dirp
->release_count
== diri
->dir_release_count
&&
1210 dirp
->ordered_count
== diri
->dir_ordered_count
&&
1211 dirp
->start_shared_gen
== diri
->shared_gen
) {
1212 if (dirp
->cache_index
== dir
->readdir_cache
.size()) {
1214 ceph_assert(!dirp
->inode
->is_complete_and_ordered());
1215 dir
->readdir_cache
.reserve(dirp
->cache_index
+ numdn
);
1217 dir
->readdir_cache
.push_back(dn
);
1218 } else if (dirp
->cache_index
< dir
->readdir_cache
.size()) {
1219 if (dirp
->inode
->is_complete_and_ordered())
1220 ceph_assert(dir
->readdir_cache
[dirp
->cache_index
] == dn
);
1222 dir
->readdir_cache
[dirp
->cache_index
] = dn
;
1224 ceph_abort_msg("unexpected readdir buffer idx");
1226 dirp
->cache_index
++;
1228 // add to cached result list
1229 dirp
->buffer
.push_back(dir_result_t::dentry(dn
->offset
, dname
, in
));
1230 ldout(cct
, 15) << __func__
<< " " << hex
<< dn
->offset
<< dec
<< ": '" << dname
<< "' -> " << in
->ino
<< dendl
;
1234 dirp
->last_name
= dname
;
1236 dirp
->next_offset
= 2;
1238 dirp
->next_offset
= readdir_offset
;
1240 if (dir
->is_empty())
1247 * insert a trace from a MDS reply into the cache.
1249 Inode
* Client::insert_trace(MetaRequest
*request
, MetaSession
*session
)
1251 auto& reply
= request
->reply
;
1252 int op
= request
->get_op();
1254 ldout(cct
, 10) << "insert_trace from " << request
->sent_stamp
<< " mds." << session
->mds_num
1255 << " is_target=" << (int)reply
->head
.is_target
1256 << " is_dentry=" << (int)reply
->head
.is_dentry
1259 auto p
= reply
->get_trace_bl().cbegin();
1260 if (request
->got_unsafe
) {
1261 ldout(cct
, 10) << "insert_trace -- already got unsafe; ignoring" << dendl
;
1262 ceph_assert(p
.end());
1267 ldout(cct
, 10) << "insert_trace -- no trace" << dendl
;
1269 Dentry
*d
= request
->dentry();
1271 Inode
*diri
= d
->dir
->parent_inode
;
1272 diri
->dir_release_count
++;
1273 clear_dir_complete_and_ordered(diri
, true);
1276 if (d
&& reply
->get_result() == 0) {
1277 if (op
== CEPH_MDS_OP_RENAME
) {
1279 Dentry
*od
= request
->old_dentry();
1280 ldout(cct
, 10) << " unlinking rename src dn " << od
<< " for traceless reply" << dendl
;
1282 unlink(od
, true, true); // keep dir, dentry
1283 } else if (op
== CEPH_MDS_OP_RMDIR
||
1284 op
== CEPH_MDS_OP_UNLINK
) {
1286 ldout(cct
, 10) << " unlinking unlink/rmdir dn " << d
<< " for traceless reply" << dendl
;
1287 unlink(d
, true, true); // keep dir, dentry
1293 ConnectionRef con
= request
->reply
->get_connection();
1295 if (session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1296 features
= (uint64_t)-1;
1299 features
= con
->get_features();
1301 ldout(cct
, 10) << " features 0x" << hex
<< features
<< dec
<< dendl
;
1304 SnapRealm
*realm
= NULL
;
1305 if (reply
->snapbl
.length())
1306 update_snap_trace(reply
->snapbl
, &realm
);
1308 ldout(cct
, 10) << " hrm "
1309 << " is_target=" << (int)reply
->head
.is_target
1310 << " is_dentry=" << (int)reply
->head
.is_dentry
1319 if (reply
->head
.is_dentry
) {
1320 dirst
.decode(p
, features
);
1321 dst
.decode(p
, features
);
1323 dlease
.decode(p
, features
);
1327 if (reply
->head
.is_target
) {
1328 ist
.decode(p
, features
);
1329 if (cct
->_conf
->client_debug_getattr_caps
) {
1330 unsigned wanted
= 0;
1331 if (op
== CEPH_MDS_OP_GETATTR
|| op
== CEPH_MDS_OP_LOOKUP
)
1332 wanted
= request
->head
.args
.getattr
.mask
;
1333 else if (op
== CEPH_MDS_OP_OPEN
|| op
== CEPH_MDS_OP_CREATE
)
1334 wanted
= request
->head
.args
.open
.mask
;
1336 if ((wanted
& CEPH_CAP_XATTR_SHARED
) &&
1337 !(ist
.xattr_version
> 0 && ist
.xattrbl
.length() > 0))
1338 ceph_abort_msg("MDS reply does not contain xattrs");
1341 in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1346 if (reply
->head
.is_dentry
) {
1347 diri
= add_update_inode(&dirst
, request
->sent_stamp
, session
,
1349 update_dir_dist(diri
, &dst
); // dir stat info is attached to ..
1352 Dir
*dir
= diri
->open_dir();
1353 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
,
1354 (op
== CEPH_MDS_OP_RENAME
) ? request
->old_dentry() : NULL
);
1357 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1358 dn
= diri
->dir
->dentries
[dname
];
1360 diri
->dir_ordered_count
++;
1361 clear_dir_complete_and_ordered(diri
, false);
1362 unlink(dn
, true, true); // keep dir, dentry
1365 if (dlease
.duration_ms
> 0) {
1367 Dir
*dir
= diri
->open_dir();
1368 dn
= link(dir
, dname
, NULL
, NULL
);
1370 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1373 } else if (op
== CEPH_MDS_OP_LOOKUPSNAP
||
1374 op
== CEPH_MDS_OP_MKSNAP
) {
1375 ldout(cct
, 10) << " faking snap lookup weirdness" << dendl
;
1376 // fake it for snap lookup
1377 vinodeno_t vino
= ist
.vino
;
1378 vino
.snapid
= CEPH_SNAPDIR
;
1379 ceph_assert(inode_map
.count(vino
));
1380 diri
= inode_map
[vino
];
1382 string dname
= request
->path
.last_dentry();
1385 dlease
.duration_ms
= 0;
1388 Dir
*dir
= diri
->open_dir();
1389 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
);
1391 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1392 Dentry
*dn
= diri
->dir
->dentries
[dname
];
1394 unlink(dn
, true, true); // keep dir, dentry
1400 if (op
== CEPH_MDS_OP_READDIR
||
1401 op
== CEPH_MDS_OP_LSSNAP
) {
1402 insert_readdir_results(request
, session
, in
);
1403 } else if (op
== CEPH_MDS_OP_LOOKUPNAME
) {
1404 // hack: return parent inode instead
1408 if (request
->dentry() == NULL
&& in
!= request
->inode()) {
1409 // pin the target inode if its parent dentry is not pinned
1410 request
->set_other_inode(in
);
1415 put_snap_realm(realm
);
1417 request
->target
= in
;
1423 mds_rank_t
Client::choose_target_mds(MetaRequest
*req
, Inode
** phash_diri
)
1425 mds_rank_t mds
= MDS_RANK_NONE
;
1427 bool is_hash
= false;
1432 if (req
->resend_mds
>= 0) {
1433 mds
= req
->resend_mds
;
1434 req
->resend_mds
= -1;
1435 ldout(cct
, 10) << __func__
<< " resend_mds specified as mds." << mds
<< dendl
;
1439 if (cct
->_conf
->client_use_random_mds
)
1445 ldout(cct
, 20) << __func__
<< " starting with req->inode " << *in
<< dendl
;
1446 if (req
->path
.depth()) {
1447 hash
= in
->hash_dentry_name(req
->path
[0]);
1448 ldout(cct
, 20) << __func__
<< " inode dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1449 << " on " << req
->path
[0]
1450 << " => " << hash
<< dendl
;
1455 in
= de
->inode
.get();
1456 ldout(cct
, 20) << __func__
<< " starting with req->dentry inode " << *in
<< dendl
;
1458 in
= de
->dir
->parent_inode
;
1459 hash
= in
->hash_dentry_name(de
->name
);
1460 ldout(cct
, 20) << __func__
<< " dentry dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1461 << " on " << de
->name
1462 << " => " << hash
<< dendl
;
1467 if (in
->snapid
!= CEPH_NOSNAP
) {
1468 ldout(cct
, 10) << __func__
<< " " << *in
<< " is snapped, using nonsnap parent" << dendl
;
1469 while (in
->snapid
!= CEPH_NOSNAP
) {
1470 if (in
->snapid
== CEPH_SNAPDIR
)
1471 in
= in
->snapdir_parent
.get();
1472 else if (!in
->dentries
.empty())
1473 /* In most cases there will only be one dentry, so getting it
1474 * will be the correct action. If there are multiple hard links,
1475 * I think the MDS should be able to redirect as needed*/
1476 in
= in
->get_first_parent()->dir
->parent_inode
;
1478 ldout(cct
, 10) << "got unlinked inode, can't look at parent" << dendl
;
1485 ldout(cct
, 20) << __func__
<< " " << *in
<< " is_hash=" << is_hash
1486 << " hash=" << hash
<< dendl
;
1488 if (is_hash
&& S_ISDIR(in
->mode
) && !in
->fragmap
.empty()) {
1489 frag_t fg
= in
->dirfragtree
[hash
];
1490 if (in
->fragmap
.count(fg
)) {
1491 mds
= in
->fragmap
[fg
];
1494 } else if (in
->auth_cap
) {
1495 mds
= in
->auth_cap
->session
->mds_num
;
1498 ldout(cct
, 10) << __func__
<< " from dirfragtree hash" << dendl
;
1503 if (in
->auth_cap
&& req
->auth_is_best()) {
1504 mds
= in
->auth_cap
->session
->mds_num
;
1505 } else if (!in
->caps
.empty()) {
1506 mds
= in
->caps
.begin()->second
.session
->mds_num
;
1510 ldout(cct
, 10) << __func__
<< " from caps on inode " << *in
<< dendl
;
1517 mds
= _get_random_up_mds();
1518 ldout(cct
, 10) << "did not get mds through better means, so chose random mds " << mds
<< dendl
;
1522 ldout(cct
, 20) << "mds is " << mds
<< dendl
;
1527 void Client::connect_mds_targets(mds_rank_t mds
)
1529 ldout(cct
, 10) << __func__
<< " for mds." << mds
<< dendl
;
1530 ceph_assert(mds_sessions
.count(mds
));
1531 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1532 for (set
<mds_rank_t
>::const_iterator q
= info
.export_targets
.begin();
1533 q
!= info
.export_targets
.end();
1535 if (mds_sessions
.count(*q
) == 0 &&
1536 mdsmap
->is_clientreplay_or_active_or_stopping(*q
)) {
1537 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1538 << " export target mds." << *q
<< dendl
;
1539 _open_mds_session(*q
);
1544 void Client::dump_mds_sessions(Formatter
*f
)
1546 f
->dump_int("id", get_nodeid().v
);
1547 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
1548 f
->dump_object("inst", inst
);
1549 f
->dump_stream("inst_str") << inst
;
1550 f
->dump_stream("addr_str") << inst
.addr
;
1551 f
->open_array_section("sessions");
1552 for (const auto &p
: mds_sessions
) {
1553 f
->open_object_section("session");
1558 f
->dump_int("mdsmap_epoch", mdsmap
->get_epoch());
1560 void Client::dump_mds_requests(Formatter
*f
)
1562 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
1563 p
!= mds_requests
.end();
1565 f
->open_object_section("request");
1571 int Client::verify_reply_trace(int r
,
1572 MetaRequest
*request
, const MConstRef
<MClientReply
>& reply
,
1573 InodeRef
*ptarget
, bool *pcreated
,
1574 const UserPerm
& perms
)
1576 // check whether this request actually did the create, and set created flag
1577 bufferlist extra_bl
;
1578 inodeno_t created_ino
;
1579 bool got_created_ino
= false;
1580 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
;
1582 extra_bl
= reply
->get_extra_bl();
1583 if (extra_bl
.length() >= 8) {
1584 // if the extra bufferlist has a buffer, we assume its the created inode
1585 // and that this request to create succeeded in actually creating
1586 // the inode (won the race with other create requests)
1587 decode(created_ino
, extra_bl
);
1588 got_created_ino
= true;
1589 ldout(cct
, 10) << "make_request created ino " << created_ino
<< dendl
;
1593 *pcreated
= got_created_ino
;
1595 if (request
->target
) {
1596 *ptarget
= request
->target
;
1597 ldout(cct
, 20) << "make_request target is " << *ptarget
->get() << dendl
;
1599 if (got_created_ino
&& (p
= inode_map
.find(vinodeno_t(created_ino
, CEPH_NOSNAP
))) != inode_map
.end()) {
1600 (*ptarget
) = p
->second
;
1601 ldout(cct
, 20) << "make_request created, target is " << *ptarget
->get() << dendl
;
1603 // we got a traceless reply, and need to look up what we just
1604 // created. for now, do this by name. someday, do this by the
1605 // ino... which we know! FIXME.
1607 Dentry
*d
= request
->dentry();
1610 ldout(cct
, 10) << "make_request got traceless reply, looking up #"
1611 << d
->dir
->parent_inode
->ino
<< "/" << d
->name
1612 << " got_ino " << got_created_ino
1613 << " ino " << created_ino
1615 r
= _do_lookup(d
->dir
->parent_inode
, d
->name
, request
->regetattr_mask
,
1618 // if the dentry is not linked, just do our best. see #5021.
1619 ceph_abort_msg("how did this happen? i want logs!");
1622 Inode
*in
= request
->inode();
1623 ldout(cct
, 10) << "make_request got traceless reply, forcing getattr on #"
1624 << in
->ino
<< dendl
;
1625 r
= _getattr(in
, request
->regetattr_mask
, perms
, true);
1629 // verify ino returned in reply and trace_dist are the same
1630 if (got_created_ino
&&
1631 created_ino
.val
!= target
->ino
.val
) {
1632 ldout(cct
, 5) << "create got ino " << created_ino
<< " but then failed on lookup; EINTR?" << dendl
;
1636 ptarget
->swap(target
);
1648 * Blocking helper to make an MDS request.
1650 * If the ptarget flag is set, behavior changes slightly: the caller
1651 * expects to get a pointer to the inode we are creating or operating
1652 * on. As a result, we will follow up any traceless mutation reply
1653 * with a getattr or lookup to transparently handle a traceless reply
1654 * from the MDS (as when the MDS restarts and the client has to replay
1657 * @param request the MetaRequest to execute
1658 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1659 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1660 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1661 * @param use_mds [optional] prefer a specific mds (-1 for default)
1662 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1664 int Client::make_request(MetaRequest
*request
,
1665 const UserPerm
& perms
,
1666 InodeRef
*ptarget
, bool *pcreated
,
1672 // assign a unique tid
1673 ceph_tid_t tid
= ++last_tid
;
1674 request
->set_tid(tid
);
1677 request
->op_stamp
= ceph_clock_now();
1680 mds_requests
[tid
] = request
->get();
1681 if (oldest_tid
== 0 && request
->get_op() != CEPH_MDS_OP_SETFILELOCK
)
1684 request
->set_caller_perms(perms
);
1686 if (cct
->_conf
->client_inject_fixed_oldest_tid
) {
1687 ldout(cct
, 20) << __func__
<< " injecting fixed oldest_client_tid(1)" << dendl
;
1688 request
->set_oldest_client_tid(1);
1690 request
->set_oldest_client_tid(oldest_tid
);
1695 request
->resend_mds
= use_mds
;
1698 if (request
->aborted())
1702 request
->abort(-EBLACKLISTED
);
1708 request
->caller_cond
= &caller_cond
;
1711 Inode
*hash_diri
= NULL
;
1712 mds_rank_t mds
= choose_target_mds(request
, &hash_diri
);
1713 int mds_state
= (mds
== MDS_RANK_NONE
) ? MDSMap::STATE_NULL
: mdsmap
->get_state(mds
);
1714 if (mds_state
!= MDSMap::STATE_ACTIVE
&& mds_state
!= MDSMap::STATE_STOPPING
) {
1715 if (mds_state
== MDSMap::STATE_NULL
&& mds
>= mdsmap
->get_max_mds()) {
1717 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, remove it from fragmap" << dendl
;
1718 _fragmap_remove_stopped_mds(hash_diri
, mds
);
1720 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, trying a random mds" << dendl
;
1721 request
->resend_mds
= _get_random_up_mds();
1724 ldout(cct
, 10) << " target mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
1725 wait_on_list(waiting_for_mdsmap
);
1731 MetaSession
*session
= NULL
;
1732 if (!have_open_session(mds
)) {
1733 session
= _get_or_open_mds_session(mds
);
1736 if (session
->state
== MetaSession::STATE_OPENING
) {
1737 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
1738 wait_on_context_list(session
->waiting_for_open
);
1739 // Abort requests on REJECT from MDS
1740 if (rejected_by_mds
.count(mds
)) {
1741 request
->abort(-EPERM
);
1747 if (!have_open_session(mds
))
1750 session
= &mds_sessions
.at(mds
);
1754 send_request(request
, session
);
1757 ldout(cct
, 20) << "awaiting reply|forward|kick on " << &caller_cond
<< dendl
;
1758 request
->kick
= false;
1759 while (!request
->reply
&& // reply
1760 request
->resend_mds
< 0 && // forward
1762 caller_cond
.Wait(client_lock
);
1763 request
->caller_cond
= NULL
;
1765 // did we get a reply?
1770 if (!request
->reply
) {
1771 ceph_assert(request
->aborted());
1772 ceph_assert(!request
->got_unsafe
);
1773 r
= request
->get_abort_code();
1774 request
->item
.remove_myself();
1775 unregister_request(request
);
1776 put_request(request
);
1781 auto reply
= std::move(request
->reply
);
1782 r
= reply
->get_result();
1784 request
->success
= true;
1786 // kick dispatcher (we've got it!)
1787 ceph_assert(request
->dispatch_cond
);
1788 request
->dispatch_cond
->Signal();
1789 ldout(cct
, 20) << "sendrecv kickback on tid " << tid
<< " " << request
->dispatch_cond
<< dendl
;
1790 request
->dispatch_cond
= 0;
1792 if (r
>= 0 && ptarget
)
1793 r
= verify_reply_trace(r
, request
, reply
, ptarget
, pcreated
, perms
);
1796 *pdirbl
= reply
->get_extra_bl();
1799 utime_t lat
= ceph_clock_now();
1800 lat
-= request
->sent_stamp
;
1801 ldout(cct
, 20) << "lat " << lat
<< dendl
;
1802 logger
->tinc(l_c_lat
, lat
);
1803 logger
->tinc(l_c_reply
, lat
);
1805 put_request(request
);
1809 void Client::unregister_request(MetaRequest
*req
)
1811 mds_requests
.erase(req
->tid
);
1812 if (req
->tid
== oldest_tid
) {
1813 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
1815 if (p
== mds_requests
.end()) {
1819 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
1820 oldest_tid
= p
->first
;
1829 void Client::put_request(MetaRequest
*request
)
1831 if (request
->_put()) {
1833 if (request
->success
)
1834 op
= request
->get_op();
1836 request
->take_other_inode(&other_in
);
1840 (op
== CEPH_MDS_OP_RMDIR
||
1841 op
== CEPH_MDS_OP_RENAME
||
1842 op
== CEPH_MDS_OP_RMSNAP
)) {
1843 _try_to_trim_inode(other_in
.get(), false);
1848 int Client::encode_inode_release(Inode
*in
, MetaRequest
*req
,
1849 mds_rank_t mds
, int drop
,
1850 int unless
, int force
)
1852 ldout(cct
, 20) << __func__
<< " enter(in:" << *in
<< ", req:" << req
1853 << " mds:" << mds
<< ", drop:" << drop
<< ", unless:" << unless
1854 << ", have:" << ", force:" << force
<< ")" << dendl
;
1856 auto it
= in
->caps
.find(mds
);
1857 if (it
!= in
->caps
.end()) {
1858 Cap
&cap
= it
->second
;
1859 drop
&= ~(in
->dirty_caps
| get_caps_used(in
));
1860 if ((drop
& cap
.issued
) &&
1861 !(unless
& cap
.issued
)) {
1862 ldout(cct
, 25) << "Dropping caps. Initial " << ccap_string(cap
.issued
) << dendl
;
1863 cap
.issued
&= ~drop
;
1864 cap
.implemented
&= ~drop
;
1866 ldout(cct
, 25) << "Now have: " << ccap_string(cap
.issued
) << dendl
;
1871 ceph_mds_request_release rel
;
1873 rel
.cap_id
= cap
.cap_id
;
1875 rel
.issue_seq
= cap
.issue_seq
;
1876 rel
.mseq
= cap
.mseq
;
1877 rel
.caps
= cap
.implemented
;
1878 rel
.wanted
= cap
.wanted
;
1881 req
->cap_releases
.push_back(MClientRequest::Release(rel
,""));
1884 ldout(cct
, 25) << __func__
<< " exit(in:" << *in
<< ") released:"
1885 << released
<< dendl
;
1889 void Client::encode_dentry_release(Dentry
*dn
, MetaRequest
*req
,
1890 mds_rank_t mds
, int drop
, int unless
)
1892 ldout(cct
, 20) << __func__
<< " enter(dn:"
1893 << dn
<< ")" << dendl
;
1896 released
= encode_inode_release(dn
->dir
->parent_inode
, req
,
1897 mds
, drop
, unless
, 1);
1898 if (released
&& dn
->lease_mds
== mds
) {
1899 ldout(cct
, 25) << "preemptively releasing dn to mds" << dendl
;
1900 auto& rel
= req
->cap_releases
.back();
1901 rel
.item
.dname_len
= dn
->name
.length();
1902 rel
.item
.dname_seq
= dn
->lease_seq
;
1903 rel
.dname
= dn
->name
;
1905 ldout(cct
, 25) << __func__
<< " exit(dn:"
1906 << dn
<< ")" << dendl
;
1911 * This requires the MClientRequest *request member to be set.
1912 * It will error out horribly without one.
1913 * Additionally, if you set any *drop member, you'd better have
1914 * set the corresponding dentry!
1916 void Client::encode_cap_releases(MetaRequest
*req
, mds_rank_t mds
)
1918 ldout(cct
, 20) << __func__
<< " enter (req: "
1919 << req
<< ", mds: " << mds
<< ")" << dendl
;
1920 if (req
->inode_drop
&& req
->inode())
1921 encode_inode_release(req
->inode(), req
,
1922 mds
, req
->inode_drop
,
1925 if (req
->old_inode_drop
&& req
->old_inode())
1926 encode_inode_release(req
->old_inode(), req
,
1927 mds
, req
->old_inode_drop
,
1928 req
->old_inode_unless
);
1929 if (req
->other_inode_drop
&& req
->other_inode())
1930 encode_inode_release(req
->other_inode(), req
,
1931 mds
, req
->other_inode_drop
,
1932 req
->other_inode_unless
);
1934 if (req
->dentry_drop
&& req
->dentry())
1935 encode_dentry_release(req
->dentry(), req
,
1936 mds
, req
->dentry_drop
,
1937 req
->dentry_unless
);
1939 if (req
->old_dentry_drop
&& req
->old_dentry())
1940 encode_dentry_release(req
->old_dentry(), req
,
1941 mds
, req
->old_dentry_drop
,
1942 req
->old_dentry_unless
);
1943 ldout(cct
, 25) << __func__
<< " exit (req: "
1944 << req
<< ", mds " << mds
<<dendl
;
1947 bool Client::have_open_session(mds_rank_t mds
)
1949 const auto &it
= mds_sessions
.find(mds
);
1950 return it
!= mds_sessions
.end() &&
1951 (it
->second
.state
== MetaSession::STATE_OPEN
||
1952 it
->second
.state
== MetaSession::STATE_STALE
);
1955 MetaSession
*Client::_get_mds_session(mds_rank_t mds
, Connection
*con
)
1957 const auto &it
= mds_sessions
.find(mds
);
1958 if (it
== mds_sessions
.end() || it
->second
.con
!= con
) {
1965 MetaSession
*Client::_get_or_open_mds_session(mds_rank_t mds
)
1967 auto it
= mds_sessions
.find(mds
);
1968 return it
== mds_sessions
.end() ? _open_mds_session(mds
) : &it
->second
;
1972 * Populate a map of strings with client-identifying metadata,
1973 * such as the hostname. Call this once at initialization.
1975 void Client::populate_metadata(const std::string
&mount_root
)
1981 metadata
["hostname"] = u
.nodename
;
1982 ldout(cct
, 20) << __func__
<< " read hostname '" << u
.nodename
<< "'" << dendl
;
1984 ldout(cct
, 1) << __func__
<< " failed to read hostname (" << cpp_strerror(r
) << ")" << dendl
;
1987 metadata
["pid"] = stringify(getpid());
1989 // Ceph entity id (the '0' in "client.0")
1990 metadata
["entity_id"] = cct
->_conf
->name
.get_id();
1992 // Our mount position
1993 if (!mount_root
.empty()) {
1994 metadata
["root"] = mount_root
;
1998 metadata
["ceph_version"] = pretty_version_to_str();
1999 metadata
["ceph_sha1"] = git_version_to_str();
2001 // Apply any metadata from the user's configured overrides
2002 std::vector
<std::string
> tokens
;
2003 get_str_vec(cct
->_conf
->client_metadata
, ",", tokens
);
2004 for (const auto &i
: tokens
) {
2005 auto eqpos
= i
.find("=");
2006 // Throw out anything that isn't of the form "<str>=<str>"
2007 if (eqpos
== 0 || eqpos
== std::string::npos
|| eqpos
== i
.size()) {
2008 lderr(cct
) << "Invalid metadata keyval pair: '" << i
<< "'" << dendl
;
2011 metadata
[i
.substr(0, eqpos
)] = i
.substr(eqpos
+ 1);
2016 * Optionally add or override client metadata fields.
2018 void Client::update_metadata(std::string
const &k
, std::string
const &v
)
2020 std::lock_guard
l(client_lock
);
2021 ceph_assert(initialized
);
2023 auto it
= metadata
.find(k
);
2024 if (it
!= metadata
.end()) {
2025 ldout(cct
, 1) << __func__
<< " warning, overriding metadata field '" << k
2026 << "' from '" << it
->second
<< "' to '" << v
<< "'" << dendl
;
2032 MetaSession
*Client::_open_mds_session(mds_rank_t mds
)
2034 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
2035 auto addrs
= mdsmap
->get_addrs(mds
);
2036 auto em
= mds_sessions
.emplace(std::piecewise_construct
,
2037 std::forward_as_tuple(mds
),
2038 std::forward_as_tuple(mds
, messenger
->connect_to_mds(addrs
), addrs
));
2039 ceph_assert(em
.second
); /* not already present */
2040 MetaSession
*session
= &em
.first
->second
;
2042 // Maybe skip sending a request to open if this MDS daemon
2043 // has previously sent us a REJECT.
2044 if (rejected_by_mds
.count(mds
)) {
2045 if (rejected_by_mds
[mds
] == session
->addrs
) {
2046 ldout(cct
, 4) << __func__
<< " mds." << mds
<< " skipping "
2047 "because we were rejected" << dendl
;
2050 ldout(cct
, 4) << __func__
<< " mds." << mds
<< " old inst "
2051 "rejected us, trying with new inst" << dendl
;
2052 rejected_by_mds
.erase(mds
);
2056 auto m
= MClientSession::create(CEPH_SESSION_REQUEST_OPEN
);
2057 m
->metadata
= metadata
;
2058 m
->supported_features
= feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED
);
2059 session
->con
->send_message2(std::move(m
));
2063 void Client::_close_mds_session(MetaSession
*s
)
2065 ldout(cct
, 2) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2066 s
->state
= MetaSession::STATE_CLOSING
;
2067 s
->con
->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2070 void Client::_closed_mds_session(MetaSession
*s
)
2072 ldout(cct
, 5) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2073 s
->state
= MetaSession::STATE_CLOSED
;
2074 s
->con
->mark_down();
2075 signal_context_list(s
->waiting_for_open
);
2076 mount_cond
.Signal();
2077 remove_session_caps(s
);
2078 kick_requests_closed(s
);
2079 mds_sessions
.erase(s
->mds_num
);
2082 void Client::handle_client_session(const MConstRef
<MClientSession
>& m
)
2084 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2085 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
2087 MetaSession
*session
= _get_mds_session(from
, m
->get_connection().get());
2089 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2093 switch (m
->get_op()) {
2094 case CEPH_SESSION_OPEN
:
2096 feature_bitset_t
missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED
);
2097 missing_features
-= m
->supported_features
;
2098 if (!missing_features
.empty()) {
2099 lderr(cct
) << "mds." << from
<< " lacks required features '"
2100 << missing_features
<< "', closing session " << dendl
;
2101 rejected_by_mds
[session
->mds_num
] = session
->addrs
;
2102 _close_mds_session(session
);
2103 _closed_mds_session(session
);
2106 session
->mds_features
= std::move(m
->supported_features
);
2108 renew_caps(session
);
2109 session
->state
= MetaSession::STATE_OPEN
;
2111 mount_cond
.Signal();
2113 connect_mds_targets(from
);
2114 signal_context_list(session
->waiting_for_open
);
2118 case CEPH_SESSION_CLOSE
:
2119 _closed_mds_session(session
);
2122 case CEPH_SESSION_RENEWCAPS
:
2123 if (session
->cap_renew_seq
== m
->get_seq()) {
2124 bool was_stale
= ceph_clock_now() >= session
->cap_ttl
;
2126 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2128 wake_up_session_caps(session
, false);
2132 case CEPH_SESSION_STALE
:
2133 // invalidate session caps/leases
2135 session
->cap_ttl
= ceph_clock_now();
2136 session
->cap_ttl
-= 1;
2137 renew_caps(session
);
2140 case CEPH_SESSION_RECALL_STATE
:
2141 trim_caps(session
, m
->get_max_caps());
2144 case CEPH_SESSION_FLUSHMSG
:
2145 /* flush cap release */
2146 if (auto& m
= session
->release
; m
) {
2147 session
->con
->send_message2(std::move(m
));
2149 session
->con
->send_message2(MClientSession::create(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2152 case CEPH_SESSION_FORCE_RO
:
2153 force_session_readonly(session
);
2156 case CEPH_SESSION_REJECT
:
2158 std::string_view error_str
;
2159 auto it
= m
->metadata
.find("error_string");
2160 if (it
!= m
->metadata
.end())
2161 error_str
= it
->second
;
2163 error_str
= "unknown error";
2164 lderr(cct
) << "mds." << from
<< " rejected us (" << error_str
<< ")" << dendl
;
2166 rejected_by_mds
[session
->mds_num
] = session
->addrs
;
2167 _closed_mds_session(session
);
2176 bool Client::_any_stale_sessions() const
2178 ceph_assert(client_lock
.is_locked_by_me());
2180 for (const auto &p
: mds_sessions
) {
2181 if (p
.second
.state
== MetaSession::STATE_STALE
) {
2189 void Client::_kick_stale_sessions()
2191 ldout(cct
, 1) << __func__
<< dendl
;
2193 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
2194 MetaSession
&s
= it
->second
;
2196 if (s
.state
== MetaSession::STATE_STALE
)
2197 _closed_mds_session(&s
);
2201 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2202 bool drop_cap_releases
)
2205 mds_rank_t mds
= session
->mds_num
;
2206 ldout(cct
, 10) << __func__
<< " rebuilding request " << request
->get_tid()
2207 << " for mds." << mds
<< dendl
;
2208 auto r
= build_client_request(request
);
2209 if (request
->dentry()) {
2210 r
->set_dentry_wanted();
2212 if (request
->got_unsafe
) {
2213 r
->set_replayed_op();
2214 if (request
->target
)
2215 r
->head
.ino
= request
->target
->ino
;
2217 encode_cap_releases(request
, mds
);
2218 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2219 request
->cap_releases
.clear();
2221 r
->releases
.swap(request
->cap_releases
);
2223 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2224 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2225 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2226 r
->set_osdmap_epoch(o
.get_epoch());
2230 if (request
->mds
== -1) {
2231 request
->sent_stamp
= ceph_clock_now();
2232 ldout(cct
, 20) << __func__
<< " set sent_stamp to " << request
->sent_stamp
<< dendl
;
2236 Inode
*in
= request
->inode();
2238 auto it
= in
->caps
.find(mds
);
2239 if (it
!= in
->caps
.end()) {
2240 request
->sent_on_mseq
= it
->second
.mseq
;
2244 session
->requests
.push_back(&request
->item
);
2246 ldout(cct
, 10) << __func__
<< " " << *r
<< " to mds." << mds
<< dendl
;
2247 session
->con
->send_message2(std::move(r
));
2250 MClientRequest::ref
Client::build_client_request(MetaRequest
*request
)
2252 auto req
= MClientRequest::create(request
->get_op());
2253 req
->set_tid(request
->tid
);
2254 req
->set_stamp(request
->op_stamp
);
2255 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2257 // if the filepath's haven't been set, set them!
2258 if (request
->path
.empty()) {
2259 Inode
*in
= request
->inode();
2260 Dentry
*de
= request
->dentry();
2262 in
->make_nosnap_relative_path(request
->path
);
2265 de
->inode
->make_nosnap_relative_path(request
->path
);
2267 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2268 request
->path
.push_dentry(de
->name
);
2270 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2271 << " No path, inode, or appropriately-endowed dentry given!"
2273 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2274 << " No path, inode, or dentry given!"
2277 req
->set_filepath(request
->get_filepath());
2278 req
->set_filepath2(request
->get_filepath2());
2279 req
->set_data(request
->data
);
2280 req
->set_retry_attempt(request
->retry_attempt
++);
2281 req
->head
.num_fwd
= request
->num_fwd
;
2283 int gid_count
= request
->perms
.get_gids(&_gids
);
2284 req
->set_gid_list(gid_count
, _gids
);
2290 void Client::handle_client_request_forward(const MConstRef
<MClientRequestForward
>& fwd
)
2292 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2293 MetaSession
*session
= _get_mds_session(mds
, fwd
->get_connection().get());
2297 ceph_tid_t tid
= fwd
->get_tid();
2299 if (mds_requests
.count(tid
) == 0) {
2300 ldout(cct
, 10) << __func__
<< " no pending request on tid " << tid
<< dendl
;
2304 MetaRequest
*request
= mds_requests
[tid
];
2305 ceph_assert(request
);
2307 // reset retry counter
2308 request
->retry_attempt
= 0;
2310 // request not forwarded, or dest mds has no session.
2312 ldout(cct
, 10) << __func__
<< " tid " << tid
2313 << " fwd " << fwd
->get_num_fwd()
2314 << " to mds." << fwd
->get_dest_mds()
2315 << ", resending to " << fwd
->get_dest_mds()
2319 request
->item
.remove_myself();
2320 request
->num_fwd
= fwd
->get_num_fwd();
2321 request
->resend_mds
= fwd
->get_dest_mds();
2322 request
->caller_cond
->Signal();
2325 bool Client::is_dir_operation(MetaRequest
*req
)
2327 int op
= req
->get_op();
2328 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2329 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2330 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2331 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
2336 void Client::handle_client_reply(const MConstRef
<MClientReply
>& reply
)
2338 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2339 MetaSession
*session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2344 ceph_tid_t tid
= reply
->get_tid();
2345 bool is_safe
= reply
->is_safe();
2347 if (mds_requests
.count(tid
) == 0) {
2348 lderr(cct
) << __func__
<< " no pending request on tid " << tid
2349 << " safe is:" << is_safe
<< dendl
;
2352 MetaRequest
*request
= mds_requests
.at(tid
);
2354 ldout(cct
, 20) << __func__
<< " got a reply. Safe:" << is_safe
2355 << " tid " << tid
<< dendl
;
2357 if (request
->got_unsafe
&& !is_safe
) {
2358 //duplicate response
2359 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2360 << mds_num
<< " safe:" << is_safe
<< dendl
;
2364 if (-ESTALE
== reply
->get_result()) { // see if we can get to proper MDS
2365 ldout(cct
, 20) << "got ESTALE on tid " << request
->tid
2366 << " from mds." << request
->mds
<< dendl
;
2367 request
->send_to_auth
= true;
2368 request
->resend_mds
= choose_target_mds(request
);
2369 Inode
*in
= request
->inode();
2370 std::map
<mds_rank_t
, Cap
>::const_iterator it
;
2371 if (request
->resend_mds
>= 0 &&
2372 request
->resend_mds
== request
->mds
&&
2374 (it
= in
->caps
.find(request
->resend_mds
)) != in
->caps
.end() ||
2375 request
->sent_on_mseq
== it
->second
.mseq
)) {
2376 ldout(cct
, 20) << "have to return ESTALE" << dendl
;
2378 request
->caller_cond
->Signal();
2383 ceph_assert(!request
->reply
);
2384 request
->reply
= reply
;
2385 insert_trace(request
, session
);
2387 // Handle unsafe reply
2389 request
->got_unsafe
= true;
2390 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2391 if (is_dir_operation(request
)) {
2392 Inode
*dir
= request
->inode();
2394 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2396 if (request
->target
) {
2397 InodeRef
&in
= request
->target
;
2398 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2402 // Only signal the caller once (on the first reply):
2403 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2404 if (!is_safe
|| !request
->got_unsafe
) {
2406 request
->dispatch_cond
= &cond
;
2409 ldout(cct
, 20) << __func__
<< " signalling caller " << (void*)request
->caller_cond
<< dendl
;
2410 request
->caller_cond
->Signal();
2412 // wake for kick back
2413 while (request
->dispatch_cond
) {
2414 ldout(cct
, 20) << __func__
<< " awaiting kickback on tid " << tid
<< " " << &cond
<< dendl
;
2415 cond
.Wait(client_lock
);
2420 // the filesystem change is committed to disk
2421 // we're done, clean up
2422 if (request
->got_unsafe
) {
2423 request
->unsafe_item
.remove_myself();
2424 request
->unsafe_dir_item
.remove_myself();
2425 request
->unsafe_target_item
.remove_myself();
2426 signal_cond_list(request
->waitfor_safe
);
2428 request
->item
.remove_myself();
2429 unregister_request(request
);
2432 mount_cond
.Signal();
2435 void Client::_handle_full_flag(int64_t pool
)
2437 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2438 << "on " << pool
<< dendl
;
2439 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2440 // to do this rather than blocking, because otherwise when we fill up we
2441 // potentially lock caps forever on files with dirty pages, and we need
2442 // to be able to release those caps to the MDS so that it can delete files
2443 // and free up space.
2444 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-ENOSPC
, pool
);
2446 // For all inodes with layouts in this pool and a pending flush write op
2447 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2448 // from ObjectCacher so that it doesn't re-issue the write in response to
2449 // the ENOSPC error.
2450 // Fortunately since we're cancelling everything in a given pool, we don't
2451 // need to know which ops belong to which ObjectSet, we can just blow all
2452 // the un-flushed cached data away and mark any dirty inodes' async_err
2453 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2454 // affecting this pool, and all the objectsets we're purging were also
2456 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2457 i
!= inode_map
.end(); ++i
)
2459 Inode
*inode
= i
->second
;
2460 if (inode
->oset
.dirty_or_tx
2461 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2462 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2463 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2464 objectcacher
->purge_set(&inode
->oset
);
2465 inode
->set_async_err(-ENOSPC
);
2469 if (cancelled_epoch
!= (epoch_t
)-1) {
2470 set_cap_epoch_barrier(cancelled_epoch
);
2474 void Client::handle_osd_map(const MConstRef
<MOSDMap
>& m
)
2476 std::set
<entity_addr_t
> new_blacklists
;
2477 objecter
->consume_blacklist_events(&new_blacklists
);
2479 const auto myaddrs
= messenger
->get_myaddrs();
2480 bool new_blacklist
= false;
2481 bool prenautilus
= objecter
->with_osdmap(
2482 [&](const OSDMap
& o
) {
2483 return o
.require_osd_release
< CEPH_RELEASE_NAUTILUS
;
2486 for (auto a
: myaddrs
.v
) {
2487 // blacklist entries are always TYPE_ANY for nautilus+
2488 a
.set_type(entity_addr_t::TYPE_ANY
);
2489 if (new_blacklists
.count(a
)) {
2490 new_blacklist
= true;
2494 // ...except pre-nautilus, they were TYPE_LEGACY
2495 a
.set_type(entity_addr_t::TYPE_LEGACY
);
2496 if (new_blacklists
.count(a
)) {
2497 new_blacklist
= true;
2503 if (new_blacklist
) {
2504 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2505 return o
.get_epoch();
2507 lderr(cct
) << "I was blacklisted at osd epoch " << epoch
<< dendl
;
2510 _abort_mds_sessions(-EBLACKLISTED
);
2512 // Since we know all our OSD ops will fail, cancel them all preemtively,
2513 // so that on an unhealthy cluster we can umount promptly even if e.g.
2514 // some PGs were inaccessible.
2515 objecter
->op_cancel_writes(-EBLACKLISTED
);
2517 } else if (blacklisted
) {
2518 // Handle case where we were blacklisted but no longer are
2519 blacklisted
= objecter
->with_osdmap([myaddrs
](const OSDMap
&o
){
2520 return o
.is_blacklisted(myaddrs
);});
2523 // Always subscribe to next osdmap for blacklisted client
2524 // until this client is not blacklisted.
2526 objecter
->maybe_request_map();
2529 if (objecter
->osdmap_full_flag()) {
2530 _handle_full_flag(-1);
2532 // Accumulate local list of full pools so that I can drop
2533 // the objecter lock before re-entering objecter in
2535 std::vector
<int64_t> full_pools
;
2537 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2538 for (const auto& kv
: o
.get_pools()) {
2539 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2540 full_pools
.push_back(kv
.first
);
2545 for (auto p
: full_pools
)
2546 _handle_full_flag(p
);
2548 // Subscribe to subsequent maps to watch for the full flag going
2549 // away. For the global full flag objecter does this for us, but
2550 // it pays no attention to the per-pool full flag so in this branch
2551 // we do it ourselves.
2552 if (!full_pools
.empty()) {
2553 objecter
->maybe_request_map();
2559 // ------------------------
2560 // incoming messages
2563 bool Client::ms_dispatch2(const MessageRef
&m
)
2565 std::lock_guard
l(client_lock
);
2567 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2571 switch (m
->get_type()) {
2572 // mounting and mds sessions
2573 case CEPH_MSG_MDS_MAP
:
2574 handle_mds_map(MMDSMap::msgref_cast(m
));
2576 case CEPH_MSG_FS_MAP
:
2577 handle_fs_map(MFSMap::msgref_cast(m
));
2579 case CEPH_MSG_FS_MAP_USER
:
2580 handle_fs_map_user(MFSMapUser::msgref_cast(m
));
2582 case CEPH_MSG_CLIENT_SESSION
:
2583 handle_client_session(MClientSession::msgref_cast(m
));
2586 case CEPH_MSG_OSD_MAP
:
2587 handle_osd_map(MOSDMap::msgref_cast(m
));
2591 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2592 handle_client_request_forward(MClientRequestForward::msgref_cast(m
));
2594 case CEPH_MSG_CLIENT_REPLY
:
2595 handle_client_reply(MClientReply::msgref_cast(m
));
2599 case CEPH_MSG_CLIENT_RECLAIM_REPLY
:
2600 handle_client_reclaim_reply(MClientReclaimReply::msgref_cast(m
));
2603 case CEPH_MSG_CLIENT_SNAP
:
2604 handle_snap(MClientSnap::msgref_cast(m
));
2606 case CEPH_MSG_CLIENT_CAPS
:
2607 handle_caps(MClientCaps::msgref_cast(m
));
2609 case CEPH_MSG_CLIENT_LEASE
:
2610 handle_lease(MClientLease::msgref_cast(m
));
2612 case MSG_COMMAND_REPLY
:
2613 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2614 handle_command_reply(MCommandReply::msgref_cast(m
));
2619 case CEPH_MSG_CLIENT_QUOTA
:
2620 handle_quota(MClientQuota::msgref_cast(m
));
2629 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2630 << "+" << inode_map
.size() << dendl
;
2631 long unsigned size
= lru
.lru_get_size() + inode_map
.size();
2633 if (size
< lru
.lru_get_size() + inode_map
.size()) {
2634 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2635 mount_cond
.Signal();
2637 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2638 << "+" << inode_map
.size() << dendl
;
2645 void Client::handle_fs_map(const MConstRef
<MFSMap
>& m
)
2647 fsmap
.reset(new FSMap(m
->get_fsmap()));
2649 signal_cond_list(waiting_for_fsmap
);
2651 monclient
->sub_got("fsmap", fsmap
->get_epoch());
2654 void Client::handle_fs_map_user(const MConstRef
<MFSMapUser
>& m
)
2656 fsmap_user
.reset(new FSMapUser
);
2657 *fsmap_user
= m
->get_fsmap();
2659 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
2660 signal_cond_list(waiting_for_fsmap
);
2663 void Client::handle_mds_map(const MConstRef
<MMDSMap
>& m
)
2665 mds_gid_t old_inc
, new_inc
;
2666 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
2667 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch()
2668 << " is identical to or older than our "
2669 << mdsmap
->get_epoch() << dendl
;
2673 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch() << dendl
;
2675 std::unique_ptr
<MDSMap
> oldmap(new MDSMap
);
2676 oldmap
.swap(mdsmap
);
2678 mdsmap
->decode(m
->get_encoded());
2680 // Cancel any commands for missing or laggy GIDs
2681 std::list
<ceph_tid_t
> cancel_ops
;
2682 auto &commands
= command_table
.get_commands();
2683 for (const auto &i
: commands
) {
2684 auto &op
= i
.second
;
2685 const mds_gid_t op_mds_gid
= op
.mds_gid
;
2686 if (mdsmap
->is_dne_gid(op_mds_gid
) || mdsmap
->is_laggy_gid(op_mds_gid
)) {
2687 ldout(cct
, 1) << __func__
<< ": cancelling command op " << i
.first
<< dendl
;
2688 cancel_ops
.push_back(i
.first
);
2690 std::ostringstream ss
;
2691 ss
<< "MDS " << op_mds_gid
<< " went away";
2692 *(op
.outs
) = ss
.str();
2694 op
.con
->mark_down();
2696 op
.on_finish
->complete(-ETIMEDOUT
);
2701 for (std::list
<ceph_tid_t
>::iterator i
= cancel_ops
.begin();
2702 i
!= cancel_ops
.end(); ++i
) {
2703 command_table
.erase(*i
);
2707 for (auto p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ) {
2708 mds_rank_t mds
= p
->first
;
2709 MetaSession
*session
= &p
->second
;
2712 int oldstate
= oldmap
->get_state(mds
);
2713 int newstate
= mdsmap
->get_state(mds
);
2714 if (!mdsmap
->is_up(mds
)) {
2715 session
->con
->mark_down();
2716 } else if (mdsmap
->get_addrs(mds
) != session
->addrs
) {
2717 old_inc
= oldmap
->get_incarnation(mds
);
2718 new_inc
= mdsmap
->get_incarnation(mds
);
2719 if (old_inc
!= new_inc
) {
2720 ldout(cct
, 1) << "mds incarnation changed from "
2721 << old_inc
<< " to " << new_inc
<< dendl
;
2722 oldstate
= MDSMap::STATE_NULL
;
2724 session
->con
->mark_down();
2725 session
->addrs
= mdsmap
->get_addrs(mds
);
2726 // When new MDS starts to take over, notify kernel to trim unused entries
2727 // in its dcache/icache. Hopefully, the kernel will release some unused
2728 // inodes before the new MDS enters reconnect state.
2729 trim_cache_for_reconnect(session
);
2730 } else if (oldstate
== newstate
)
2731 continue; // no change
2733 session
->mds_state
= newstate
;
2734 if (newstate
== MDSMap::STATE_RECONNECT
) {
2735 session
->con
= messenger
->connect_to_mds(session
->addrs
);
2736 send_reconnect(session
);
2737 } else if (newstate
> MDSMap::STATE_RECONNECT
) {
2738 if (oldstate
< MDSMap::STATE_RECONNECT
) {
2739 ldout(cct
, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl
;
2740 _closed_mds_session(session
);
2743 if (newstate
>= MDSMap::STATE_ACTIVE
) {
2744 if (oldstate
< MDSMap::STATE_ACTIVE
) {
2745 // kick new requests
2746 kick_requests(session
);
2747 kick_flushing_caps(session
);
2748 signal_context_list(session
->waiting_for_open
);
2749 wake_up_session_caps(session
, true);
2751 connect_mds_targets(mds
);
2753 } else if (newstate
== MDSMap::STATE_NULL
&&
2754 mds
>= mdsmap
->get_max_mds()) {
2755 _closed_mds_session(session
);
2759 // kick any waiting threads
2760 signal_cond_list(waiting_for_mdsmap
);
2762 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
2765 void Client::send_reconnect(MetaSession
*session
)
2767 mds_rank_t mds
= session
->mds_num
;
2768 ldout(cct
, 10) << __func__
<< " to mds." << mds
<< dendl
;
2770 // trim unused caps to reduce MDS's cache rejoin time
2771 trim_cache_for_reconnect(session
);
2773 session
->readonly
= false;
2775 session
->release
.reset();
2777 // reset my cap seq number
2779 //connect to the mds' offload targets
2780 connect_mds_targets(mds
);
2781 //make sure unsafe requests get saved
2782 resend_unsafe_requests(session
);
2784 early_kick_flushing_caps(session
);
2786 auto m
= MClientReconnect::create();
2787 bool allow_multi
= session
->mds_features
.test(CEPHFS_FEATURE_MULTI_RECONNECT
);
2789 // i have an open session.
2790 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
2791 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
2792 p
!= inode_map
.end();
2794 Inode
*in
= p
->second
;
2795 auto it
= in
->caps
.find(mds
);
2796 if (it
!= in
->caps
.end()) {
2798 m
->get_approx_size() >= (std::numeric_limits
<int>::max() >> 1)) {
2800 session
->con
->send_message2(std::move(m
));
2802 m
= MClientReconnect::create();
2805 Cap
&cap
= it
->second
;
2806 ldout(cct
, 10) << " caps on " << p
->first
2807 << " " << ccap_string(cap
.issued
)
2808 << " wants " << ccap_string(in
->caps_wanted())
2811 in
->make_long_path(path
);
2812 ldout(cct
, 10) << " path " << path
<< dendl
;
2815 _encode_filelocks(in
, flockbl
);
2817 cap
.seq
= 0; // reset seq.
2818 cap
.issue_seq
= 0; // reset seq.
2819 cap
.mseq
= 0; // reset seq.
2820 // cap gen should catch up with session cap_gen
2821 if (cap
.gen
< session
->cap_gen
) {
2822 cap
.gen
= session
->cap_gen
;
2823 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
2825 cap
.issued
= cap
.implemented
;
2827 snapid_t snap_follows
= 0;
2828 if (!in
->cap_snaps
.empty())
2829 snap_follows
= in
->cap_snaps
.begin()->first
;
2831 m
->add_cap(p
->first
.ino
,
2833 path
.get_ino(), path
.get_path(), // ino
2834 in
->caps_wanted(), // wanted
2835 cap
.issued
, // issued
2840 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
2841 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
2842 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
2843 did_snaprealm
.insert(in
->snaprealm
->ino
);
2849 m
->set_encoding_version(0); // use connection features to choose encoding
2850 session
->con
->send_message2(std::move(m
));
2852 mount_cond
.Signal();
2854 if (session
->reclaim_state
== MetaSession::RECLAIMING
)
2855 signal_cond_list(waiting_for_reclaim
);
2859 void Client::kick_requests(MetaSession
*session
)
2861 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2862 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2863 p
!= mds_requests
.end();
2865 MetaRequest
*req
= p
->second
;
2866 if (req
->got_unsafe
)
2868 if (req
->aborted()) {
2869 if (req
->caller_cond
) {
2871 req
->caller_cond
->Signal();
2875 if (req
->retry_attempt
> 0)
2876 continue; // new requests only
2877 if (req
->mds
== session
->mds_num
) {
2878 send_request(p
->second
, session
);
2883 void Client::resend_unsafe_requests(MetaSession
*session
)
2885 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
2888 send_request(*iter
, session
);
2890 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2891 // process completed requests in clientreplay stage.
2892 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2893 p
!= mds_requests
.end();
2895 MetaRequest
*req
= p
->second
;
2896 if (req
->got_unsafe
)
2900 if (req
->retry_attempt
== 0)
2901 continue; // old requests only
2902 if (req
->mds
== session
->mds_num
)
2903 send_request(req
, session
, true);
2907 void Client::wait_unsafe_requests()
2909 list
<MetaRequest
*> last_unsafe_reqs
;
2910 for (const auto &p
: mds_sessions
) {
2911 const MetaSession
&s
= p
.second
;
2912 if (!s
.unsafe_requests
.empty()) {
2913 MetaRequest
*req
= s
.unsafe_requests
.back();
2915 last_unsafe_reqs
.push_back(req
);
2919 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
2920 p
!= last_unsafe_reqs
.end();
2922 MetaRequest
*req
= *p
;
2923 if (req
->unsafe_item
.is_on_list())
2924 wait_on_list(req
->waitfor_safe
);
2929 void Client::kick_requests_closed(MetaSession
*session
)
2931 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2932 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2933 p
!= mds_requests
.end(); ) {
2934 MetaRequest
*req
= p
->second
;
2936 if (req
->mds
== session
->mds_num
) {
2937 if (req
->caller_cond
) {
2939 req
->caller_cond
->Signal();
2941 req
->item
.remove_myself();
2942 if (req
->got_unsafe
) {
2943 lderr(cct
) << __func__
<< " removing unsafe request " << req
->get_tid() << dendl
;
2944 req
->unsafe_item
.remove_myself();
2945 if (is_dir_operation(req
)) {
2946 Inode
*dir
= req
->inode();
2948 dir
->set_async_err(-EIO
);
2949 lderr(cct
) << "kick_requests_closed drop req of inode(dir) : "
2950 << dir
->ino
<< " " << req
->get_tid() << dendl
;
2951 req
->unsafe_dir_item
.remove_myself();
2954 InodeRef
&in
= req
->target
;
2955 in
->set_async_err(-EIO
);
2956 lderr(cct
) << "kick_requests_closed drop req of inode : "
2957 << in
->ino
<< " " << req
->get_tid() << dendl
;
2958 req
->unsafe_target_item
.remove_myself();
2960 signal_cond_list(req
->waitfor_safe
);
2961 unregister_request(req
);
2965 ceph_assert(session
->requests
.empty());
2966 ceph_assert(session
->unsafe_requests
.empty());
2976 void Client::got_mds_push(MetaSession
*s
)
2979 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
2980 if (s
->state
== MetaSession::STATE_CLOSING
) {
2981 s
->con
->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2985 void Client::handle_lease(const MConstRef
<MClientLease
>& m
)
2987 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
2989 ceph_assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
2991 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
2992 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
2997 got_mds_push(session
);
2999 ceph_seq_t seq
= m
->get_seq();
3002 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
3003 if (inode_map
.count(vino
) == 0) {
3004 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
3007 in
= inode_map
[vino
];
3009 if (m
->get_mask() & CEPH_LOCK_DN
) {
3010 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
3011 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
3014 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
3015 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
3021 auto reply
= MClientLease::create(CEPH_MDS_LEASE_RELEASE
, seq
, m
->get_mask(), m
->get_ino(), m
->get_first(), m
->get_last(), m
->dname
);
3022 m
->get_connection()->send_message2(std::move(reply
));
3026 void Client::put_inode(Inode
*in
, int n
)
3028 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3029 int left
= in
->_put(n
);
3032 remove_all_caps(in
);
3034 ldout(cct
, 10) << __func__
<< " deleting " << *in
<< dendl
;
3035 bool unclean
= objectcacher
->release_set(&in
->oset
);
3036 ceph_assert(!unclean
);
3037 inode_map
.erase(in
->vino());
3038 if (use_faked_inos())
3039 _release_faked_ino(in
);
3044 while (!root_parents
.empty())
3045 root_parents
.erase(root_parents
.begin());
3052 void Client::close_dir(Dir
*dir
)
3054 Inode
*in
= dir
->parent_inode
;
3055 ldout(cct
, 15) << __func__
<< " dir " << dir
<< " on " << in
<< dendl
;
3056 ceph_assert(dir
->is_empty());
3057 ceph_assert(in
->dir
== dir
);
3058 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
3059 if (!in
->dentries
.empty())
3060 in
->get_first_parent()->put(); // unpin dentry
3064 put_inode(in
); // unpin inode
3068 * Don't call this with in==NULL, use get_or_create for that
3069 * leave dn set to default NULL unless you're trying to add
3070 * a new inode to a pre-created Dentry
3072 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
3075 // create a new Dentry
3076 dn
= new Dentry(dir
, name
);
3078 lru
.lru_insert_mid(dn
); // mid or top?
3080 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3081 << " dn " << dn
<< " (new dn)" << dendl
;
3083 ceph_assert(!dn
->inode
);
3084 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3085 << " dn " << dn
<< " (old dn)" << dendl
;
3088 if (in
) { // link to inode
3090 // only one parent for directories!
3091 if (in
->is_dir() && !in
->dentries
.empty()) {
3092 tmp_ref
= in
; // prevent unlink below from freeing the inode.
3093 Dentry
*olddn
= in
->get_first_parent();
3094 ceph_assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3095 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3096 old_diri
->dir_release_count
++;
3097 clear_dir_complete_and_ordered(old_diri
, true);
3098 unlink(olddn
, true, true); // keep dir, dentry
3102 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3108 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
3110 InodeRef
in(dn
->inode
);
3111 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3112 << " inode " << dn
->inode
<< dendl
;
3114 // unlink from inode
3117 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3123 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3133 if (dir
->is_empty() && !keepdir
)
3139 * For asynchronous flushes, check for errors from the IO and
3140 * update the inode if necessary
3142 class C_Client_FlushComplete
: public Context
{
3147 C_Client_FlushComplete(Client
*c
, Inode
*in
) : client(c
), inode(in
) { }
3148 void finish(int r
) override
{
3149 ceph_assert(client
->client_lock
.is_locked_by_me());
3151 client_t
const whoami
= client
->whoami
; // For the benefit of ldout prefix
3152 ldout(client
->cct
, 1) << "I/O error from flush on inode " << inode
3153 << " 0x" << std::hex
<< inode
->ino
<< std::dec
3154 << ": " << r
<< "(" << cpp_strerror(r
) << ")" << dendl
;
3155 inode
->set_async_err(r
);
3165 void Client::get_cap_ref(Inode
*in
, int cap
)
3167 if ((cap
& CEPH_CAP_FILE_BUFFER
) &&
3168 in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] == 0) {
3169 ldout(cct
, 5) << __func__
<< " got first FILE_BUFFER ref on " << *in
<< dendl
;
3172 if ((cap
& CEPH_CAP_FILE_CACHE
) &&
3173 in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3174 ldout(cct
, 5) << __func__
<< " got first FILE_CACHE ref on " << *in
<< dendl
;
3177 in
->get_cap_ref(cap
);
3180 void Client::put_cap_ref(Inode
*in
, int cap
)
3182 int last
= in
->put_cap_ref(cap
);
3185 int drop
= last
& ~in
->caps_issued();
3186 if (in
->snapid
== CEPH_NOSNAP
) {
3187 if ((last
& CEPH_CAP_FILE_WR
) &&
3188 !in
->cap_snaps
.empty() &&
3189 in
->cap_snaps
.rbegin()->second
.writing
) {
3190 ldout(cct
, 10) << __func__
<< " finishing pending cap_snap on " << *in
<< dendl
;
3191 in
->cap_snaps
.rbegin()->second
.writing
= 0;
3192 finish_cap_snap(in
, in
->cap_snaps
.rbegin()->second
, get_caps_used(in
));
3193 signal_cond_list(in
->waitfor_caps
); // wake up blocked sync writers
3195 if (last
& CEPH_CAP_FILE_BUFFER
) {
3196 for (auto &p
: in
->cap_snaps
)
3197 p
.second
.dirty_data
= 0;
3198 signal_cond_list(in
->waitfor_commit
);
3199 ldout(cct
, 5) << __func__
<< " dropped last FILE_BUFFER ref on " << *in
<< dendl
;
3203 if (last
& CEPH_CAP_FILE_CACHE
) {
3204 ldout(cct
, 5) << __func__
<< " dropped last FILE_CACHE ref on " << *in
<< dendl
;
3210 put_inode(in
, put_nref
);
3214 int Client::get_caps(Inode
*in
, int need
, int want
, int *phave
, loff_t endoff
)
3216 int r
= check_pool_perm(in
, need
);
3221 int file_wanted
= in
->caps_file_wanted();
3222 if ((file_wanted
& need
) != need
) {
3223 ldout(cct
, 10) << "get_caps " << *in
<< " need " << ccap_string(need
)
3224 << " file_wanted " << ccap_string(file_wanted
) << ", EBADF "
3230 int have
= in
->caps_issued(&implemented
);
3232 bool waitfor_caps
= false;
3233 bool waitfor_commit
= false;
3235 if (have
& need
& CEPH_CAP_FILE_WR
) {
3237 (endoff
>= (loff_t
)in
->max_size
||
3238 endoff
> (loff_t
)(in
->size
<< 1)) &&
3239 endoff
> (loff_t
)in
->wanted_max_size
) {
3240 ldout(cct
, 10) << "wanted_max_size " << in
->wanted_max_size
<< " -> " << endoff
<< dendl
;
3241 in
->wanted_max_size
= endoff
;
3245 if (endoff
>= 0 && endoff
> (loff_t
)in
->max_size
) {
3246 ldout(cct
, 10) << "waiting on max_size, endoff " << endoff
<< " max_size " << in
->max_size
<< " on " << *in
<< dendl
;
3247 waitfor_caps
= true;
3249 if (!in
->cap_snaps
.empty()) {
3250 if (in
->cap_snaps
.rbegin()->second
.writing
) {
3251 ldout(cct
, 10) << "waiting on cap_snap write to complete" << dendl
;
3252 waitfor_caps
= true;
3254 for (auto &p
: in
->cap_snaps
) {
3255 if (p
.second
.dirty_data
) {
3256 waitfor_commit
= true;
3260 if (waitfor_commit
) {
3261 _flush(in
, new C_Client_FlushComplete(this, in
));
3262 ldout(cct
, 10) << "waiting for WRBUFFER to get dropped" << dendl
;
3267 if (!waitfor_caps
&& !waitfor_commit
) {
3268 if ((have
& need
) == need
) {
3269 int revoking
= implemented
& ~have
;
3270 ldout(cct
, 10) << "get_caps " << *in
<< " have " << ccap_string(have
)
3271 << " need " << ccap_string(need
) << " want " << ccap_string(want
)
3272 << " revoking " << ccap_string(revoking
)
3274 if ((revoking
& want
) == 0) {
3275 *phave
= need
| (have
& want
);
3276 in
->get_cap_ref(need
);
3280 ldout(cct
, 10) << "waiting for caps " << *in
<< " need " << ccap_string(need
) << " want " << ccap_string(want
) << dendl
;
3281 waitfor_caps
= true;
3284 if ((need
& CEPH_CAP_FILE_WR
) && in
->auth_cap
&&
3285 in
->auth_cap
->session
->readonly
)
3288 if (in
->flags
& I_CAP_DROPPED
) {
3289 int mds_wanted
= in
->caps_mds_wanted();
3290 if ((mds_wanted
& need
) != need
) {
3291 int ret
= _renew_caps(in
);
3296 if (!(file_wanted
& ~mds_wanted
))
3297 in
->flags
&= ~I_CAP_DROPPED
;
3301 wait_on_list(in
->waitfor_caps
);
3302 else if (waitfor_commit
)
3303 wait_on_list(in
->waitfor_commit
);
3307 int Client::get_caps_used(Inode
*in
)
3309 unsigned used
= in
->caps_used();
3310 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3311 !objectcacher
->set_is_empty(&in
->oset
))
3312 used
|= CEPH_CAP_FILE_CACHE
;
3316 void Client::cap_delay_requeue(Inode
*in
)
3318 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3319 in
->hold_caps_until
= ceph_clock_now();
3320 in
->hold_caps_until
+= cct
->_conf
->client_caps_release_delay
;
3321 delayed_list
.push_back(&in
->delay_cap_item
);
3324 void Client::send_cap(Inode
*in
, MetaSession
*session
, Cap
*cap
,
3325 int flags
, int used
, int want
, int retain
,
3326 int flush
, ceph_tid_t flush_tid
)
3328 int held
= cap
->issued
| cap
->implemented
;
3329 int revoking
= cap
->implemented
& ~cap
->issued
;
3330 retain
&= ~revoking
;
3331 int dropping
= cap
->issued
& ~retain
;
3332 int op
= CEPH_CAP_OP_UPDATE
;
3334 ldout(cct
, 10) << __func__
<< " " << *in
3335 << " mds." << session
->mds_num
<< " seq " << cap
->seq
3336 << " used " << ccap_string(used
)
3337 << " want " << ccap_string(want
)
3338 << " flush " << ccap_string(flush
)
3339 << " retain " << ccap_string(retain
)
3340 << " held "<< ccap_string(held
)
3341 << " revoking " << ccap_string(revoking
)
3342 << " dropping " << ccap_string(dropping
)
3345 if (cct
->_conf
->client_inject_release_failure
&& revoking
) {
3346 const int would_have_issued
= cap
->issued
& retain
;
3347 const int would_have_implemented
= cap
->implemented
& (cap
->issued
| used
);
3349 // - tell the server we think issued is whatever they issued plus whatever we implemented
3350 // - leave what we have implemented in place
3351 ldout(cct
, 20) << __func__
<< " injecting failure to release caps" << dendl
;
3352 cap
->issued
= cap
->issued
| cap
->implemented
;
3354 // Make an exception for revoking xattr caps: we are injecting
3355 // failure to release other caps, but allow xattr because client
3356 // will block on xattr ops if it can't release these to MDS (#9800)
3357 const int xattr_mask
= CEPH_CAP_XATTR_SHARED
| CEPH_CAP_XATTR_EXCL
;
3358 cap
->issued
^= xattr_mask
& revoking
;
3359 cap
->implemented
^= xattr_mask
& revoking
;
3361 ldout(cct
, 20) << __func__
<< " issued " << ccap_string(cap
->issued
) << " vs " << ccap_string(would_have_issued
) << dendl
;
3362 ldout(cct
, 20) << __func__
<< " implemented " << ccap_string(cap
->implemented
) << " vs " << ccap_string(would_have_implemented
) << dendl
;
3365 cap
->issued
&= retain
;
3366 cap
->implemented
&= cap
->issued
| used
;
3369 snapid_t follows
= 0;
3372 follows
= in
->snaprealm
->get_snap_context().seq
;
3374 auto m
= MClientCaps::create(op
,
3377 cap
->cap_id
, cap
->seq
,
3383 m
->caller_uid
= in
->cap_dirtier_uid
;
3384 m
->caller_gid
= in
->cap_dirtier_gid
;
3386 m
->head
.issue_seq
= cap
->issue_seq
;
3387 m
->set_tid(flush_tid
);
3389 m
->head
.uid
= in
->uid
;
3390 m
->head
.gid
= in
->gid
;
3391 m
->head
.mode
= in
->mode
;
3393 m
->head
.nlink
= in
->nlink
;
3395 if (flush
& CEPH_CAP_XATTR_EXCL
) {
3396 encode(in
->xattrs
, m
->xattrbl
);
3397 m
->head
.xattr_version
= in
->xattr_version
;
3401 m
->max_size
= in
->max_size
;
3402 m
->truncate_seq
= in
->truncate_seq
;
3403 m
->truncate_size
= in
->truncate_size
;
3404 m
->mtime
= in
->mtime
;
3405 m
->atime
= in
->atime
;
3406 m
->ctime
= in
->ctime
;
3407 m
->btime
= in
->btime
;
3408 m
->time_warp_seq
= in
->time_warp_seq
;
3409 m
->change_attr
= in
->change_attr
;
3411 if (!(flags
& MClientCaps::FLAG_PENDING_CAPSNAP
) &&
3412 !in
->cap_snaps
.empty() &&
3413 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3414 flags
|= MClientCaps::FLAG_PENDING_CAPSNAP
;
3417 if (flush
& CEPH_CAP_FILE_WR
) {
3418 m
->inline_version
= in
->inline_version
;
3419 m
->inline_data
= in
->inline_data
;
3422 in
->reported_size
= in
->size
;
3423 m
->set_snap_follows(follows
);
3425 if (cap
== in
->auth_cap
) {
3426 m
->set_max_size(in
->wanted_max_size
);
3427 in
->requested_max_size
= in
->wanted_max_size
;
3428 ldout(cct
, 15) << "auth cap, setting max_size = " << in
->requested_max_size
<< dendl
;
3431 if (!session
->flushing_caps_tids
.empty())
3432 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3434 session
->con
->send_message2(std::move(m
));
3437 static bool is_max_size_approaching(Inode
*in
)
3439 /* mds will adjust max size according to the reported size */
3440 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3442 if (in
->size
>= in
->max_size
)
3444 /* half of previous max_size increment has been used */
3445 if (in
->max_size
> in
->reported_size
&&
3446 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3451 static int adjust_caps_used_for_lazyio(int used
, int issued
, int implemented
)
3453 if (!(used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
)))
3455 if (!(implemented
& CEPH_CAP_FILE_LAZYIO
))
3458 if (issued
& CEPH_CAP_FILE_LAZYIO
) {
3459 if (!(issued
& CEPH_CAP_FILE_CACHE
)) {
3460 used
&= ~CEPH_CAP_FILE_CACHE
;
3461 used
|= CEPH_CAP_FILE_LAZYIO
;
3463 if (!(issued
& CEPH_CAP_FILE_BUFFER
)) {
3464 used
&= ~CEPH_CAP_FILE_BUFFER
;
3465 used
|= CEPH_CAP_FILE_LAZYIO
;
3468 if (!(implemented
& CEPH_CAP_FILE_CACHE
)) {
3469 used
&= ~CEPH_CAP_FILE_CACHE
;
3470 used
|= CEPH_CAP_FILE_LAZYIO
;
3472 if (!(implemented
& CEPH_CAP_FILE_BUFFER
)) {
3473 used
&= ~CEPH_CAP_FILE_BUFFER
;
3474 used
|= CEPH_CAP_FILE_LAZYIO
;
3483 * Examine currently used and wanted versus held caps. Release, flush or ack
3484 * revoked caps to the MDS as appropriate.
3486 * @param in the inode to check
3487 * @param flags flags to apply to cap check
3489 void Client::check_caps(Inode
*in
, unsigned flags
)
3491 unsigned wanted
= in
->caps_wanted();
3492 unsigned used
= get_caps_used(in
);
3496 int issued
= in
->caps_issued(&implemented
);
3497 int revoking
= implemented
& ~issued
;
3499 int orig_used
= used
;
3500 used
= adjust_caps_used_for_lazyio(used
, issued
, implemented
);
3502 int retain
= wanted
| used
| CEPH_CAP_PIN
;
3503 if (!unmounting
&& in
->nlink
> 0) {
3505 retain
|= CEPH_CAP_ANY
;
3506 } else if (in
->is_dir() &&
3507 (issued
& CEPH_CAP_FILE_SHARED
) &&
3508 (in
->flags
& I_COMPLETE
)) {
3509 // we do this here because we don't want to drop to Fs (and then
3510 // drop the Fs if we do a create!) if that alone makes us send lookups
3511 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3512 wanted
= CEPH_CAP_ANY_SHARED
| CEPH_CAP_FILE_EXCL
;
3515 retain
|= CEPH_CAP_ANY_SHARED
;
3516 // keep RD only if we didn't have the file open RW,
3517 // because then the mds would revoke it anyway to
3518 // journal max_size=0.
3519 if (in
->max_size
== 0)
3520 retain
|= CEPH_CAP_ANY_RD
;
3524 ldout(cct
, 10) << __func__
<< " on " << *in
3525 << " wanted " << ccap_string(wanted
)
3526 << " used " << ccap_string(used
)
3527 << " issued " << ccap_string(issued
)
3528 << " revoking " << ccap_string(revoking
)
3529 << " flags=" << flags
3532 if (in
->snapid
!= CEPH_NOSNAP
)
3533 return; //snap caps last forever, can't write
3535 if (in
->caps
.empty())
3536 return; // guard if at end of func
3538 if (!(orig_used
& CEPH_CAP_FILE_BUFFER
) &&
3539 (revoking
& used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
3541 used
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
3545 for (auto &p
: in
->caps
) {
3546 mds_rank_t mds
= p
.first
;
3547 Cap
&cap
= p
.second
;
3549 MetaSession
*session
= &mds_sessions
.at(mds
);
3552 if (in
->auth_cap
&& &cap
!= in
->auth_cap
)
3553 cap_used
&= ~in
->auth_cap
->issued
;
3555 revoking
= cap
.implemented
& ~cap
.issued
;
3557 ldout(cct
, 10) << " cap mds." << mds
3558 << " issued " << ccap_string(cap
.issued
)
3559 << " implemented " << ccap_string(cap
.implemented
)
3560 << " revoking " << ccap_string(revoking
) << dendl
;
3562 if (in
->wanted_max_size
> in
->max_size
&&
3563 in
->wanted_max_size
> in
->requested_max_size
&&
3564 &cap
== in
->auth_cap
)
3567 /* approaching file_max? */
3568 if ((cap
.issued
& CEPH_CAP_FILE_WR
) &&
3569 &cap
== in
->auth_cap
&&
3570 is_max_size_approaching(in
)) {
3571 ldout(cct
, 10) << "size " << in
->size
<< " approaching max_size " << in
->max_size
3572 << ", reported " << in
->reported_size
<< dendl
;
3576 /* completed revocation? */
3577 if (revoking
&& (revoking
& cap_used
) == 0) {
3578 ldout(cct
, 10) << "completed revocation of " << ccap_string(cap
.implemented
& ~cap
.issued
) << dendl
;
3582 /* want more caps from mds? */
3583 if (wanted
& ~(cap
.wanted
| cap
.issued
))
3586 if (!revoking
&& unmounting
&& (cap_used
== 0))
3589 if ((cap
.issued
& ~retain
) == 0 && // and we don't have anything we wouldn't like
3590 !in
->dirty_caps
) // and we have no dirty caps
3593 if (!(flags
& CHECK_CAPS_NODELAY
)) {
3594 ldout(cct
, 10) << "delaying cap release" << dendl
;
3595 cap_delay_requeue(in
);
3600 if (&cap
== in
->auth_cap
) {
3601 if (in
->flags
& I_KICK_FLUSH
) {
3602 ldout(cct
, 20) << " reflushing caps (check_caps) on " << *in
3603 << " to mds." << mds
<< dendl
;
3604 kick_flushing_caps(in
, session
);
3606 if (!in
->cap_snaps
.empty() &&
3607 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3612 ceph_tid_t flush_tid
;
3613 if (in
->auth_cap
== &cap
&& in
->dirty_caps
) {
3614 flushing
= mark_caps_flushing(in
, &flush_tid
);
3620 int msg_flags
= (flags
& CHECK_CAPS_SYNCHRONOUS
) ? MClientCaps::FLAG_SYNC
: 0;
3621 send_cap(in
, session
, &cap
, msg_flags
, cap_used
, wanted
, retain
,
3622 flushing
, flush_tid
);
3627 void Client::queue_cap_snap(Inode
*in
, SnapContext
& old_snapc
)
3629 int used
= get_caps_used(in
);
3630 int dirty
= in
->caps_dirty();
3631 ldout(cct
, 10) << __func__
<< " " << *in
<< " snapc " << old_snapc
<< " used " << ccap_string(used
) << dendl
;
3633 if (in
->cap_snaps
.size() &&
3634 in
->cap_snaps
.rbegin()->second
.writing
) {
3635 ldout(cct
, 10) << __func__
<< " already have pending cap_snap on " << *in
<< dendl
;
3637 } else if (in
->caps_dirty() ||
3638 (used
& CEPH_CAP_FILE_WR
) ||
3639 (dirty
& CEPH_CAP_ANY_WR
)) {
3640 const auto &capsnapem
= in
->cap_snaps
.emplace(std::piecewise_construct
, std::make_tuple(old_snapc
.seq
), std::make_tuple(in
));
3641 ceph_assert(capsnapem
.second
); /* element inserted */
3642 CapSnap
&capsnap
= capsnapem
.first
->second
;
3643 capsnap
.context
= old_snapc
;
3644 capsnap
.issued
= in
->caps_issued();
3645 capsnap
.dirty
= in
->caps_dirty();
3647 capsnap
.dirty_data
= (used
& CEPH_CAP_FILE_BUFFER
);
3649 capsnap
.uid
= in
->uid
;
3650 capsnap
.gid
= in
->gid
;
3651 capsnap
.mode
= in
->mode
;
3652 capsnap
.btime
= in
->btime
;
3653 capsnap
.xattrs
= in
->xattrs
;
3654 capsnap
.xattr_version
= in
->xattr_version
;
3655 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3656 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3658 if (used
& CEPH_CAP_FILE_WR
) {
3659 ldout(cct
, 10) << __func__
<< " WR used on " << *in
<< dendl
;
3660 capsnap
.writing
= 1;
3662 finish_cap_snap(in
, capsnap
, used
);
3665 ldout(cct
, 10) << __func__
<< " not dirty|writing on " << *in
<< dendl
;
3669 void Client::finish_cap_snap(Inode
*in
, CapSnap
&capsnap
, int used
)
3671 ldout(cct
, 10) << __func__
<< " " << *in
<< " capsnap " << (void *)&capsnap
<< " used " << ccap_string(used
) << dendl
;
3672 capsnap
.size
= in
->size
;
3673 capsnap
.mtime
= in
->mtime
;
3674 capsnap
.atime
= in
->atime
;
3675 capsnap
.ctime
= in
->ctime
;
3676 capsnap
.time_warp_seq
= in
->time_warp_seq
;
3677 capsnap
.change_attr
= in
->change_attr
;
3678 capsnap
.dirty
|= in
->caps_dirty();
3680 /* Only reset it if it wasn't set before */
3681 if (capsnap
.cap_dirtier_uid
== -1) {
3682 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3683 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3686 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3687 capsnap
.inline_data
= in
->inline_data
;
3688 capsnap
.inline_version
= in
->inline_version
;
3691 if (used
& CEPH_CAP_FILE_BUFFER
) {
3692 ldout(cct
, 10) << __func__
<< " " << *in
<< " cap_snap " << &capsnap
<< " used " << used
3693 << " WRBUFFER, delaying" << dendl
;
3695 capsnap
.dirty_data
= 0;
3700 void Client::_flushed_cap_snap(Inode
*in
, snapid_t seq
)
3702 ldout(cct
, 10) << __func__
<< " seq " << seq
<< " on " << *in
<< dendl
;
3703 in
->cap_snaps
.at(seq
).dirty_data
= 0;
3707 void Client::send_flush_snap(Inode
*in
, MetaSession
*session
,
3708 snapid_t follows
, CapSnap
& capsnap
)
3710 auto m
= MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP
,
3711 in
->ino
, in
->snaprealm
->ino
, 0,
3712 in
->auth_cap
->mseq
, cap_epoch_barrier
);
3713 m
->caller_uid
= capsnap
.cap_dirtier_uid
;
3714 m
->caller_gid
= capsnap
.cap_dirtier_gid
;
3716 m
->set_client_tid(capsnap
.flush_tid
);
3717 m
->head
.snap_follows
= follows
;
3719 m
->head
.caps
= capsnap
.issued
;
3720 m
->head
.dirty
= capsnap
.dirty
;
3722 m
->head
.uid
= capsnap
.uid
;
3723 m
->head
.gid
= capsnap
.gid
;
3724 m
->head
.mode
= capsnap
.mode
;
3725 m
->btime
= capsnap
.btime
;
3727 m
->size
= capsnap
.size
;
3729 m
->head
.xattr_version
= capsnap
.xattr_version
;
3730 encode(capsnap
.xattrs
, m
->xattrbl
);
3732 m
->ctime
= capsnap
.ctime
;
3733 m
->btime
= capsnap
.btime
;
3734 m
->mtime
= capsnap
.mtime
;
3735 m
->atime
= capsnap
.atime
;
3736 m
->time_warp_seq
= capsnap
.time_warp_seq
;
3737 m
->change_attr
= capsnap
.change_attr
;
3739 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3740 m
->inline_version
= in
->inline_version
;
3741 m
->inline_data
= in
->inline_data
;
3744 ceph_assert(!session
->flushing_caps_tids
.empty());
3745 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3747 session
->con
->send_message2(std::move(m
));
3750 void Client::flush_snaps(Inode
*in
)
3752 ldout(cct
, 10) << "flush_snaps on " << *in
<< dendl
;
3753 ceph_assert(in
->cap_snaps
.size());
3756 ceph_assert(in
->auth_cap
);
3757 MetaSession
*session
= in
->auth_cap
->session
;
3759 for (auto &p
: in
->cap_snaps
) {
3760 CapSnap
&capsnap
= p
.second
;
3761 // only do new flush
3762 if (capsnap
.flush_tid
> 0)
3765 ldout(cct
, 10) << "flush_snaps mds." << session
->mds_num
3766 << " follows " << p
.first
3767 << " size " << capsnap
.size
3768 << " mtime " << capsnap
.mtime
3769 << " dirty_data=" << capsnap
.dirty_data
3770 << " writing=" << capsnap
.writing
3771 << " on " << *in
<< dendl
;
3772 if (capsnap
.dirty_data
|| capsnap
.writing
)
3775 capsnap
.flush_tid
= ++last_flush_tid
;
3776 session
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
3777 in
->flushing_cap_tids
[capsnap
.flush_tid
] = 0;
3778 if (!in
->flushing_cap_item
.is_on_list())
3779 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
3781 send_flush_snap(in
, session
, p
.first
, capsnap
);
3785 void Client::wait_on_list(list
<Cond
*>& ls
)
3788 ls
.push_back(&cond
);
3789 cond
.Wait(client_lock
);
3793 void Client::signal_cond_list(list
<Cond
*>& ls
)
3795 for (list
<Cond
*>::iterator it
= ls
.begin(); it
!= ls
.end(); ++it
)
3799 void Client::wait_on_context_list(list
<Context
*>& ls
)
3804 ls
.push_back(new C_Cond(&cond
, &done
, &r
));
3806 cond
.Wait(client_lock
);
3809 void Client::signal_context_list(list
<Context
*>& ls
)
3811 while (!ls
.empty()) {
3812 ls
.front()->complete(0);
3817 void Client::wake_up_session_caps(MetaSession
*s
, bool reconnect
)
3819 for (const auto &cap
: s
->caps
) {
3820 auto &in
= cap
->inode
;
3822 in
.requested_max_size
= 0;
3823 in
.wanted_max_size
= 0;
3825 if (cap
->gen
< s
->cap_gen
) {
3826 // mds did not re-issue stale cap.
3827 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
3828 // make sure mds knows what we want.
3829 if (in
.caps_file_wanted() & ~cap
->wanted
)
3830 in
.flags
|= I_CAP_DROPPED
;
3833 signal_cond_list(in
.waitfor_caps
);
3838 // flush dirty data (from objectcache)
3840 class C_Client_CacheInvalidate
: public Context
{
3844 int64_t offset
, length
;
3846 C_Client_CacheInvalidate(Client
*c
, Inode
*in
, int64_t off
, int64_t len
) :
3847 client(c
), offset(off
), length(len
) {
3848 if (client
->use_faked_inos())
3849 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
3853 void finish(int r
) override
{
3854 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3855 ceph_assert(!client
->client_lock
.is_locked_by_me());
3856 client
->_async_invalidate(ino
, offset
, length
);
3860 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
3864 ldout(cct
, 10) << __func__
<< " " << ino
<< " " << off
<< "~" << len
<< dendl
;
3865 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
3868 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
3870 if (ino_invalidate_cb
)
3871 // we queue the invalidate, which calls the callback and decrements the ref
3872 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
3875 void Client::_invalidate_inode_cache(Inode
*in
)
3877 ldout(cct
, 10) << __func__
<< " " << *in
<< dendl
;
3879 // invalidate our userspace inode cache
3880 if (cct
->_conf
->client_oc
) {
3881 objectcacher
->release_set(&in
->oset
);
3882 if (!objectcacher
->set_is_empty(&in
->oset
))
3883 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
3886 _schedule_invalidate_callback(in
, 0, 0);
3889 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
3891 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
3893 // invalidate our userspace inode cache
3894 if (cct
->_conf
->client_oc
) {
3895 vector
<ObjectExtent
> ls
;
3896 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
3897 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
3900 _schedule_invalidate_callback(in
, off
, len
);
3903 bool Client::_release(Inode
*in
)
3905 ldout(cct
, 20) << "_release " << *in
<< dendl
;
3906 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3907 _invalidate_inode_cache(in
);
3913 bool Client::_flush(Inode
*in
, Context
*onfinish
)
3915 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
3917 if (!in
->oset
.dirty_or_tx
) {
3918 ldout(cct
, 10) << " nothing to flush" << dendl
;
3919 onfinish
->complete(0);
3923 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
3924 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
3925 objectcacher
->purge_set(&in
->oset
);
3927 onfinish
->complete(-ENOSPC
);
3932 return objectcacher
->flush_set(&in
->oset
, onfinish
);
3935 void Client::_flush_range(Inode
*in
, int64_t offset
, uint64_t size
)
3937 ceph_assert(client_lock
.is_locked());
3938 if (!in
->oset
.dirty_or_tx
) {
3939 ldout(cct
, 10) << " nothing to flush" << dendl
;
3943 C_SaferCond
onflush("Client::_flush_range flock");
3944 bool ret
= objectcacher
->file_flush(&in
->oset
, &in
->layout
, in
->snaprealm
->get_snap_context(),
3945 offset
, size
, &onflush
);
3948 client_lock
.Unlock();
3954 void Client::flush_set_callback(ObjectCacher::ObjectSet
*oset
)
3956 // std::lock_guard l(client_lock);
3957 ceph_assert(client_lock
.is_locked()); // will be called via dispatch() -> objecter -> ...
3958 Inode
*in
= static_cast<Inode
*>(oset
->parent
);
3963 void Client::_flushed(Inode
*in
)
3965 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
3967 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
3972 // checks common to add_update_cap, handle_cap_grant
3973 void Client::check_cap_issue(Inode
*in
, unsigned issued
)
3975 unsigned had
= in
->caps_issued();
3977 if ((issued
& CEPH_CAP_FILE_CACHE
) &&
3978 !(had
& CEPH_CAP_FILE_CACHE
))
3981 if ((issued
& CEPH_CAP_FILE_SHARED
) &&
3982 !(had
& CEPH_CAP_FILE_SHARED
)) {
3986 clear_dir_complete_and_ordered(in
, true);
3990 void Client::add_update_cap(Inode
*in
, MetaSession
*mds_session
, uint64_t cap_id
,
3991 unsigned issued
, unsigned wanted
, unsigned seq
, unsigned mseq
,
3992 inodeno_t realm
, int flags
, const UserPerm
& cap_perms
)
3994 if (!in
->is_any_caps()) {
3995 ceph_assert(in
->snaprealm
== 0);
3996 in
->snaprealm
= get_snap_realm(realm
);
3997 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
3998 ldout(cct
, 15) << __func__
<< " first one, opened snaprealm " << in
->snaprealm
<< dendl
;
4000 ceph_assert(in
->snaprealm
);
4001 if ((flags
& CEPH_CAP_FLAG_AUTH
) &&
4002 realm
!= inodeno_t(-1) && in
->snaprealm
->ino
!= realm
) {
4003 in
->snaprealm_item
.remove_myself();
4004 auto oldrealm
= in
->snaprealm
;
4005 in
->snaprealm
= get_snap_realm(realm
);
4006 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4007 put_snap_realm(oldrealm
);
4011 mds_rank_t mds
= mds_session
->mds_num
;
4012 const auto &capem
= in
->caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
), std::forward_as_tuple(*in
, mds_session
));
4013 Cap
&cap
= capem
.first
->second
;
4014 if (!capem
.second
) {
4015 if (cap
.gen
< mds_session
->cap_gen
)
4016 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
4019 * auth mds of the inode changed. we received the cap export
4020 * message, but still haven't received the cap import message.
4021 * handle_cap_export() updated the new auth MDS' cap.
4023 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4024 * a message that was send before the cap import message. So
4025 * don't remove caps.
4027 if (ceph_seq_cmp(seq
, cap
.seq
) <= 0) {
4028 ceph_assert(&cap
== in
->auth_cap
);
4029 ceph_assert(cap
.cap_id
== cap_id
);
4032 issued
|= cap
.issued
;
4033 flags
|= CEPH_CAP_FLAG_AUTH
;
4037 check_cap_issue(in
, issued
);
4039 if (flags
& CEPH_CAP_FLAG_AUTH
) {
4040 if (in
->auth_cap
!= &cap
&&
4041 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
4042 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
4043 ldout(cct
, 10) << __func__
<< " changing auth cap: "
4044 << "add myself to new auth MDS' flushing caps list" << dendl
;
4045 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
4047 in
->auth_cap
= &cap
;
4051 unsigned old_caps
= cap
.issued
;
4052 cap
.cap_id
= cap_id
;
4053 cap
.issued
= issued
;
4054 cap
.implemented
|= issued
;
4055 if (ceph_seq_cmp(mseq
, cap
.mseq
) > 0)
4056 cap
.wanted
= wanted
;
4058 cap
.wanted
|= wanted
;
4060 cap
.issue_seq
= seq
;
4062 cap
.gen
= mds_session
->cap_gen
;
4063 cap
.latest_perms
= cap_perms
;
4064 ldout(cct
, 10) << __func__
<< " issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
.issued
)
4065 << " from mds." << mds
4069 if ((issued
& ~old_caps
) && in
->auth_cap
== &cap
) {
4070 // non-auth MDS is revoking the newly grant caps ?
4071 for (auto &p
: in
->caps
) {
4072 if (&p
.second
== &cap
)
4074 if (p
.second
.implemented
& ~p
.second
.issued
& issued
) {
4075 check_caps(in
, CHECK_CAPS_NODELAY
);
4081 if (issued
& ~old_caps
)
4082 signal_cond_list(in
->waitfor_caps
);
4085 void Client::remove_cap(Cap
*cap
, bool queue_release
)
4087 auto &in
= cap
->inode
;
4088 MetaSession
*session
= cap
->session
;
4089 mds_rank_t mds
= cap
->session
->mds_num
;
4091 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " on " << in
<< dendl
;
4093 if (queue_release
) {
4094 session
->enqueue_cap_release(
4102 if (in
.auth_cap
== cap
) {
4103 if (in
.flushing_cap_item
.is_on_list()) {
4104 ldout(cct
, 10) << " removing myself from flushing_cap list" << dendl
;
4105 in
.flushing_cap_item
.remove_myself();
4109 size_t n
= in
.caps
.erase(mds
);
4110 ceph_assert(n
== 1);
4113 if (!in
.is_any_caps()) {
4114 ldout(cct
, 15) << __func__
<< " last one, closing snaprealm " << in
.snaprealm
<< dendl
;
4115 in
.snaprealm_item
.remove_myself();
4116 put_snap_realm(in
.snaprealm
);
4121 void Client::remove_all_caps(Inode
*in
)
4123 while (!in
->caps
.empty())
4124 remove_cap(&in
->caps
.begin()->second
, true);
4127 void Client::remove_session_caps(MetaSession
*s
)
4129 ldout(cct
, 10) << __func__
<< " mds." << s
->mds_num
<< dendl
;
4131 while (s
->caps
.size()) {
4132 Cap
*cap
= *s
->caps
.begin();
4133 InodeRef
in(&cap
->inode
);
4134 bool dirty_caps
= false;
4135 if (in
->auth_cap
== cap
) {
4136 dirty_caps
= in
->dirty_caps
| in
->flushing_caps
;
4137 in
->wanted_max_size
= 0;
4138 in
->requested_max_size
= 0;
4140 if (cap
->wanted
| cap
->issued
)
4141 in
->flags
|= I_CAP_DROPPED
;
4142 remove_cap(cap
, false);
4143 in
->cap_snaps
.clear();
4145 lderr(cct
) << __func__
<< " still has dirty|flushing caps on " << *in
<< dendl
;
4146 if (in
->flushing_caps
) {
4147 num_flushing_caps
--;
4148 in
->flushing_cap_tids
.clear();
4150 in
->flushing_caps
= 0;
4151 in
->mark_caps_clean();
4152 put_inode(in
.get());
4154 signal_cond_list(in
->waitfor_caps
);
4156 s
->flushing_caps_tids
.clear();
4160 int Client::_do_remount(bool retry_on_error
)
4162 uint64_t max_retries
= g_conf().get_val
<uint64_t>("mds_max_retries_on_remount_failure");
4165 int r
= remount_cb(callback_handle
);
4167 retries_on_invalidate
= 0;
4170 client_t whoami
= get_nodeid();
4173 "failed to remount (to trim kernel dentries): "
4174 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4177 "failed to remount (to trim kernel dentries): "
4178 "return code = " << r
<< dendl
;
4181 (cct
->_conf
.get_val
<bool>("client_die_on_failed_remount") ||
4182 cct
->_conf
.get_val
<bool>("client_die_on_failed_dentry_invalidate")) &&
4183 !(retry_on_error
&& (++retries_on_invalidate
< max_retries
));
4184 if (should_abort
&& !unmounting
) {
4185 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4192 class C_Client_Remount
: public Context
{
4196 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4197 void finish(int r
) override
{
4198 ceph_assert(r
== 0);
4199 client
->_do_remount(true);
4203 void Client::_invalidate_kernel_dcache()
4207 if (can_invalidate_dentries
) {
4208 if (dentry_invalidate_cb
&& root
->dir
) {
4209 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4210 p
!= root
->dir
->dentries
.end();
4212 if (p
->second
->inode
)
4213 _schedule_invalidate_dentry_callback(p
->second
, false);
4216 } else if (remount_cb
) {
4218 // when remounting a file system, linux kernel trims all unused dentries in the fs
4219 remount_finisher
.queue(new C_Client_Remount(this));
4223 void Client::_trim_negative_child_dentries(InodeRef
& in
)
4229 if (dir
&& dir
->dentries
.size() == dir
->num_null_dentries
) {
4230 for (auto p
= dir
->dentries
.begin(); p
!= dir
->dentries
.end(); ) {
4231 Dentry
*dn
= p
->second
;
4233 ceph_assert(!dn
->inode
);
4234 if (dn
->lru_is_expireable())
4235 unlink(dn
, true, false); // keep dir, drop dentry
4237 if (dir
->dentries
.empty()) {
4242 if (in
->flags
& I_SNAPDIR_OPEN
) {
4243 InodeRef snapdir
= open_snapdir(in
.get());
4244 _trim_negative_child_dentries(snapdir
);
4248 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4250 mds_rank_t mds
= s
->mds_num
;
4251 size_t caps_size
= s
->caps
.size();
4252 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4253 << " caps " << caps_size
<< dendl
;
4255 uint64_t trimmed
= 0;
4256 auto p
= s
->caps
.begin();
4257 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4258 * looking at from getting deleted during traversal. */
4259 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4261 InodeRef
in(&cap
->inode
);
4263 // Increment p early because it will be invalidated if cap
4264 // is deleted inside remove_cap
4267 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4268 int mine
= cap
->issued
| cap
->implemented
;
4269 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4270 // disposable non-auth cap
4271 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4272 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4273 cap
= (remove_cap(cap
, true), nullptr);
4277 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4278 _trim_negative_child_dentries(in
);
4280 auto q
= in
->dentries
.begin();
4281 while (q
!= in
->dentries
.end()) {
4284 if (dn
->lru_is_expireable()) {
4285 if (can_invalidate_dentries
&&
4286 dn
->dir
->parent_inode
->ino
== MDS_INO_ROOT
) {
4287 // Only issue one of these per DN for inodes in root: handle
4288 // others more efficiently by calling for root-child DNs at
4289 // the end of this function.
4290 _schedule_invalidate_dentry_callback(dn
, true);
4292 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4295 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4299 if (all
&& in
->ino
!= MDS_INO_ROOT
) {
4300 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4305 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4306 for (const auto &dn
: to_trim
) {
4311 caps_size
= s
->caps
.size();
4312 if (caps_size
> (size_t)max
)
4313 _invalidate_kernel_dcache();
4316 void Client::force_session_readonly(MetaSession
*s
)
4319 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4320 auto &in
= (*p
)->inode
;
4321 if (in
.caps_wanted() & CEPH_CAP_FILE_WR
)
4322 signal_cond_list(in
.waitfor_caps
);
4326 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4328 MetaSession
*session
= in
->auth_cap
->session
;
4330 int flushing
= in
->dirty_caps
;
4331 ceph_assert(flushing
);
4333 ceph_tid_t flush_tid
= ++last_flush_tid
;
4334 in
->flushing_cap_tids
[flush_tid
] = flushing
;
4336 if (!in
->flushing_caps
) {
4337 ldout(cct
, 10) << __func__
<< " " << ccap_string(flushing
) << " " << *in
<< dendl
;
4338 num_flushing_caps
++;
4340 ldout(cct
, 10) << __func__
<< " (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4343 in
->flushing_caps
|= flushing
;
4344 in
->mark_caps_clean();
4346 if (!in
->flushing_cap_item
.is_on_list())
4347 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4348 session
->flushing_caps_tids
.insert(flush_tid
);
4354 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4356 for (auto &p
: in
->cap_snaps
) {
4357 CapSnap
&capsnap
= p
.second
;
4358 if (capsnap
.flush_tid
> 0) {
4359 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4360 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4363 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4364 it
!= in
->flushing_cap_tids
.end();
4366 old_s
->flushing_caps_tids
.erase(it
->first
);
4367 new_s
->flushing_caps_tids
.insert(it
->first
);
4369 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4373 * Flush all caps back to the MDS. Because the callers generally wait on the
4374 * result of this function (syncfs and umount cases), we set
4375 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4377 void Client::flush_caps_sync()
4379 ldout(cct
, 10) << __func__
<< dendl
;
4380 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
4382 unsigned flags
= CHECK_CAPS_NODELAY
;
4386 delayed_list
.pop_front();
4387 if (p
.end() && dirty_list
.empty())
4388 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4389 check_caps(in
, flags
);
4393 p
= dirty_list
.begin();
4395 unsigned flags
= CHECK_CAPS_NODELAY
;
4400 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4401 check_caps(in
, flags
);
4405 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4407 while (in
->flushing_caps
) {
4408 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4409 ceph_assert(it
!= in
->flushing_cap_tids
.end());
4410 if (it
->first
> want
)
4412 ldout(cct
, 10) << __func__
<< " on " << *in
<< " flushing "
4413 << ccap_string(it
->second
) << " want " << want
4414 << " last " << it
->first
<< dendl
;
4415 wait_on_list(in
->waitfor_caps
);
4419 void Client::wait_sync_caps(ceph_tid_t want
)
4422 ldout(cct
, 10) << __func__
<< " want " << want
<< " (last is " << last_flush_tid
<< ", "
4423 << num_flushing_caps
<< " total flushing)" << dendl
;
4424 for (auto &p
: mds_sessions
) {
4425 MetaSession
*s
= &p
.second
;
4426 if (s
->flushing_caps_tids
.empty())
4428 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4429 if (oldest_tid
<= want
) {
4430 ldout(cct
, 10) << " waiting on mds." << p
.first
<< " tid " << oldest_tid
4431 << " (want " << want
<< ")" << dendl
;
4432 sync_cond
.Wait(client_lock
);
4438 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4440 in
->flags
&= ~I_KICK_FLUSH
;
4442 Cap
*cap
= in
->auth_cap
;
4443 ceph_assert(cap
->session
== session
);
4445 ceph_tid_t last_snap_flush
= 0;
4446 for (auto p
= in
->flushing_cap_tids
.rbegin();
4447 p
!= in
->flushing_cap_tids
.rend();
4450 last_snap_flush
= p
->first
;
4455 int wanted
= in
->caps_wanted();
4456 int used
= get_caps_used(in
) | in
->caps_dirty();
4457 auto it
= in
->cap_snaps
.begin();
4458 for (auto& p
: in
->flushing_cap_tids
) {
4460 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4461 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
4464 ceph_assert(it
!= in
->cap_snaps
.end());
4465 ceph_assert(it
->second
.flush_tid
== p
.first
);
4466 send_flush_snap(in
, session
, it
->first
, it
->second
);
4472 void Client::kick_flushing_caps(MetaSession
*session
)
4474 mds_rank_t mds
= session
->mds_num
;
4475 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4477 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4479 if (in
->flags
& I_KICK_FLUSH
) {
4480 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4481 kick_flushing_caps(in
, session
);
4486 void Client::early_kick_flushing_caps(MetaSession
*session
)
4488 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4490 Cap
*cap
= in
->auth_cap
;
4493 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4494 // stage. This guarantees that MDS processes the cap flush message before issuing
4495 // the flushing caps to other client.
4496 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
4497 in
->flags
|= I_KICK_FLUSH
;
4501 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4502 << " to mds." << session
->mds_num
<< dendl
;
4503 // send_reconnect() also will reset these sequence numbers. make sure
4504 // sequence numbers in cap flush message match later reconnect message.
4508 cap
->issued
= cap
->implemented
;
4510 kick_flushing_caps(in
, session
);
4514 void SnapRealm::build_snap_context()
4516 set
<snapid_t
> snaps
;
4517 snapid_t max_seq
= seq
;
4519 // start with prior_parents?
4520 for (unsigned i
=0; i
<prior_parent_snaps
.size(); i
++)
4521 snaps
.insert(prior_parent_snaps
[i
]);
4523 // current parent's snaps
4525 const SnapContext
& psnapc
= pparent
->get_snap_context();
4526 for (unsigned i
=0; i
<psnapc
.snaps
.size(); i
++)
4527 if (psnapc
.snaps
[i
] >= parent_since
)
4528 snaps
.insert(psnapc
.snaps
[i
]);
4529 if (psnapc
.seq
> max_seq
)
4530 max_seq
= psnapc
.seq
;
4534 for (unsigned i
=0; i
<my_snaps
.size(); i
++)
4535 snaps
.insert(my_snaps
[i
]);
4538 cached_snap_context
.seq
= max_seq
;
4539 cached_snap_context
.snaps
.resize(0);
4540 cached_snap_context
.snaps
.reserve(snaps
.size());
4541 for (set
<snapid_t
>::reverse_iterator p
= snaps
.rbegin(); p
!= snaps
.rend(); ++p
)
4542 cached_snap_context
.snaps
.push_back(*p
);
4545 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
4550 while (!q
.empty()) {
4554 ldout(cct
, 10) << __func__
<< " " << *realm
<< dendl
;
4555 realm
->invalidate_cache();
4557 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4558 p
!= realm
->pchildren
.end();
4564 SnapRealm
*Client::get_snap_realm(inodeno_t r
)
4566 SnapRealm
*realm
= snap_realms
[r
];
4568 snap_realms
[r
] = realm
= new SnapRealm(r
);
4569 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4574 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4576 if (snap_realms
.count(r
) == 0) {
4577 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
4580 SnapRealm
*realm
= snap_realms
[r
];
4581 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4586 void Client::put_snap_realm(SnapRealm
*realm
)
4588 ldout(cct
, 20) << __func__
<< " " << realm
->ino
<< " " << realm
4589 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
4590 if (--realm
->nref
== 0) {
4591 snap_realms
.erase(realm
->ino
);
4592 if (realm
->pparent
) {
4593 realm
->pparent
->pchildren
.erase(realm
);
4594 put_snap_realm(realm
->pparent
);
4600 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
4602 if (realm
->parent
!= parent
) {
4603 ldout(cct
, 10) << __func__
<< " " << *realm
4604 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
4605 realm
->parent
= parent
;
4606 if (realm
->pparent
) {
4607 realm
->pparent
->pchildren
.erase(realm
);
4608 put_snap_realm(realm
->pparent
);
4610 realm
->pparent
= get_snap_realm(parent
);
4611 realm
->pparent
->pchildren
.insert(realm
);
4617 static bool has_new_snaps(const SnapContext
& old_snapc
,
4618 const SnapContext
& new_snapc
)
4620 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
4624 void Client::update_snap_trace(const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
4626 SnapRealm
*first_realm
= NULL
;
4627 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
4629 map
<SnapRealm
*, SnapContext
> dirty_realms
;
4631 auto p
= bl
.cbegin();
4635 SnapRealm
*realm
= get_snap_realm(info
.ino());
4637 bool invalidate
= false;
4639 if (info
.seq() > realm
->seq
) {
4640 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
4644 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4645 // flush me + children
4648 while (!q
.empty()) {
4649 SnapRealm
*realm
= q
.front();
4652 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4653 p
!= realm
->pchildren
.end();
4657 if (dirty_realms
.count(realm
) == 0) {
4659 dirty_realms
[realm
] = realm
->get_snap_context();
4665 realm
->seq
= info
.seq();
4666 realm
->created
= info
.created();
4667 realm
->parent_since
= info
.parent_since();
4668 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
4669 realm
->my_snaps
= info
.my_snaps
;
4673 // _always_ verify parent
4674 if (adjust_realm_parent(realm
, info
.parent()))
4678 invalidate_snaprealm_and_children(realm
);
4679 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
4680 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
4682 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
4683 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
4687 first_realm
= realm
;
4689 put_snap_realm(realm
);
4692 for (map
<SnapRealm
*, SnapContext
>::iterator q
= dirty_realms
.begin();
4693 q
!= dirty_realms
.end();
4695 SnapRealm
*realm
= q
->first
;
4696 // if there are new snaps ?
4697 if (has_new_snaps(q
->second
, realm
->get_snap_context())) {
4698 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
4699 xlist
<Inode
*>::iterator r
= realm
->inodes_with_caps
.begin();
4703 queue_cap_snap(in
, q
->second
);
4706 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
4708 put_snap_realm(realm
);
4712 *realm_ret
= first_realm
;
4714 put_snap_realm(first_realm
);
4717 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
4719 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
4720 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4721 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4726 got_mds_push(session
);
4728 map
<Inode
*, SnapContext
> to_move
;
4729 SnapRealm
*realm
= 0;
4731 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
4732 ceph_assert(m
->head
.split
);
4734 auto p
= m
->bl
.cbegin();
4736 ceph_assert(info
.ino() == m
->head
.split
);
4738 // flush, then move, ino's.
4739 realm
= get_snap_realm(info
.ino());
4740 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
4741 for (auto& ino
: m
->split_inos
) {
4742 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
4743 if (inode_map
.count(vino
)) {
4744 Inode
*in
= inode_map
[vino
];
4745 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
4747 if (in
->snaprealm
->created
> info
.created()) {
4748 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
4749 << *in
->snaprealm
<< dendl
;
4752 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
4755 in
->snaprealm_item
.remove_myself();
4756 to_move
[in
] = in
->snaprealm
->get_snap_context();
4757 put_snap_realm(in
->snaprealm
);
4761 // move child snaprealms, too
4762 for (auto& child_realm
: m
->split_realms
) {
4763 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
4764 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
4767 adjust_realm_parent(child
, realm
->ino
);
4768 put_snap_realm(child
);
4772 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
4775 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
4776 Inode
*in
= p
->first
;
4777 in
->snaprealm
= realm
;
4778 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4780 // queue for snap writeback
4781 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
4782 queue_cap_snap(in
, p
->second
);
4784 put_snap_realm(realm
);
4788 void Client::handle_quota(const MConstRef
<MClientQuota
>& m
)
4790 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4791 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4796 got_mds_push(session
);
4798 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << mds
<< dendl
;
4800 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
4801 if (inode_map
.count(vino
)) {
4803 in
= inode_map
[vino
];
4806 in
->quota
= m
->quota
;
4807 in
->rstat
= m
->rstat
;
4812 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
4814 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4815 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4820 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
4821 // Pause RADOS operations until we see the required epoch
4822 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
4825 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
4826 // Record the barrier so that we will transmit it to MDS when releasing
4827 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
4830 got_mds_push(session
);
4833 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
4834 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
4837 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
4838 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
4839 session
->enqueue_cap_release(
4846 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
4849 // in case the mds is waiting on e.g. a revocation
4850 flush_cap_releases();
4854 switch (m
->get_op()) {
4855 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
, in
, m
);
4856 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
, in
, m
);
4857 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
, in
, m
);
4860 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
4861 Cap
&cap
= in
->caps
.at(mds
);
4863 switch (m
->get_op()) {
4864 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
, in
, m
);
4865 case CEPH_CAP_OP_IMPORT
:
4866 case CEPH_CAP_OP_REVOKE
:
4867 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
, in
, &cap
, m
);
4868 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
, in
, &cap
, m
);
4871 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
4876 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4878 mds_rank_t mds
= session
->mds_num
;
4880 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4881 << " IMPORT from mds." << mds
<< dendl
;
4883 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
4886 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
4888 cap_perms
= cap
->latest_perms
;
4892 SnapRealm
*realm
= NULL
;
4893 update_snap_trace(m
->snapbl
, &realm
);
4895 add_update_cap(in
, session
, m
->get_cap_id(),
4896 m
->get_caps(), m
->get_wanted(), m
->get_seq(), m
->get_mseq(),
4897 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
4899 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
4900 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
4904 put_snap_realm(realm
);
4906 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
4907 // reflush any/all caps (if we are now the auth_cap)
4908 kick_flushing_caps(in
, session
);
4912 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4914 mds_rank_t mds
= session
->mds_num
;
4916 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4917 << " EXPORT from mds." << mds
<< dendl
;
4919 auto it
= in
->caps
.find(mds
);
4920 if (it
!= in
->caps
.end()) {
4921 Cap
&cap
= it
->second
;
4922 if (cap
.cap_id
== m
->get_cap_id()) {
4923 if (m
->peer
.cap_id
) {
4924 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
4925 MetaSession
*tsession
= _get_or_open_mds_session(peer_mds
);
4926 auto it
= in
->caps
.find(peer_mds
);
4927 if (it
!= in
->caps
.end()) {
4928 Cap
&tcap
= it
->second
;
4929 if (tcap
.cap_id
== m
->peer
.cap_id
&&
4930 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
4931 tcap
.cap_id
= m
->peer
.cap_id
;
4932 tcap
.seq
= m
->peer
.seq
- 1;
4933 tcap
.issue_seq
= tcap
.seq
;
4934 tcap
.issued
|= cap
.issued
;
4935 tcap
.implemented
|= cap
.issued
;
4936 if (&cap
== in
->auth_cap
)
4937 in
->auth_cap
= &tcap
;
4938 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
4939 adjust_session_flushing_caps(in
, session
, tsession
);
4942 add_update_cap(in
, tsession
, m
->peer
.cap_id
, cap
.issued
, 0,
4943 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
4944 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
4948 if (cap
.wanted
| cap
.issued
)
4949 in
->flags
|= I_CAP_DROPPED
;
4952 remove_cap(&cap
, false);
4957 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4959 mds_rank_t mds
= session
->mds_num
;
4960 ceph_assert(in
->caps
.count(mds
));
4962 ldout(cct
, 10) << __func__
<< " on ino " << *in
4963 << " size " << in
->size
<< " -> " << m
->get_size()
4967 in
->caps_issued(&issued
);
4968 issued
|= in
->caps_dirty();
4969 update_inode_file_size(in
, issued
, m
->get_size(),
4970 m
->get_truncate_seq(), m
->get_truncate_size());
4973 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
4975 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
4976 int dirty
= m
->get_dirty();
4980 auto it
= in
->flushing_cap_tids
.begin();
4981 if (it
->first
< flush_ack_tid
) {
4982 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
4983 << " got unexpected flush ack tid " << flush_ack_tid
4984 << " expected is " << it
->first
<< dendl
;
4986 for (; it
!= in
->flushing_cap_tids
.end(); ) {
4992 if (it
->first
== flush_ack_tid
)
4993 cleaned
= it
->second
;
4994 if (it
->first
<= flush_ack_tid
) {
4995 session
->flushing_caps_tids
.erase(it
->first
);
4996 in
->flushing_cap_tids
.erase(it
++);
5000 cleaned
&= ~it
->second
;
5006 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
5007 << " cleaned " << ccap_string(cleaned
) << " on " << *in
5008 << " with " << ccap_string(dirty
) << dendl
;
5011 signal_cond_list(in
->waitfor_caps
);
5012 if (session
->flushing_caps_tids
.empty() ||
5013 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5018 in
->cap_dirtier_uid
= -1;
5019 in
->cap_dirtier_gid
= -1;
5023 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5025 if (in
->flushing_caps
) {
5026 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5027 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5028 in
->flushing_caps
&= ~cleaned
;
5029 if (in
->flushing_caps
== 0) {
5030 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5031 num_flushing_caps
--;
5032 if (in
->flushing_cap_tids
.empty())
5033 in
->flushing_cap_item
.remove_myself();
5035 if (!in
->caps_dirty())
5042 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5044 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5045 mds_rank_t mds
= session
->mds_num
;
5046 ceph_assert(in
->caps
.count(mds
));
5047 snapid_t follows
= m
->get_snap_follows();
5049 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5050 auto& capsnap
= it
->second
;
5051 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5052 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
5054 InodeRef
tmp_ref(in
);
5055 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5056 << " on " << *in
<< dendl
;
5057 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5058 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5059 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5060 in
->flushing_cap_item
.remove_myself();
5061 in
->cap_snaps
.erase(it
);
5063 signal_cond_list(in
->waitfor_caps
);
5064 if (session
->flushing_caps_tids
.empty() ||
5065 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5069 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5070 << " on " << *in
<< dendl
;
5071 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5075 class C_Client_DentryInvalidate
: public Context
{
5082 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5083 client(c
), name(dn
->name
) {
5084 if (client
->use_faked_inos()) {
5085 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
5087 ino
.ino
= dn
->inode
->faked_ino
;
5089 dirino
= dn
->dir
->parent_inode
->vino();
5091 ino
= dn
->inode
->vino();
5094 ino
.ino
= inodeno_t();
5096 void finish(int r
) override
{
5097 // _async_dentry_invalidate is responsible for its own locking
5098 ceph_assert(!client
->client_lock
.is_locked_by_me());
5099 client
->_async_dentry_invalidate(dirino
, ino
, name
);
5103 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
5107 ldout(cct
, 10) << __func__
<< " '" << name
<< "' ino " << ino
5108 << " in dir " << dirino
<< dendl
;
5109 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
);
5112 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
5114 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
5115 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
5118 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5120 int ref
= in
->get_num_ref();
5121 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5123 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5124 for (auto p
= in
->dir
->dentries
.begin();
5125 p
!= in
->dir
->dentries
.end(); ) {
5126 Dentry
*dn
= p
->second
;
5128 /* rmsnap removes whole subtree, need trim inodes recursively.
5129 * we don't need to invalidate dentries recursively. because
5130 * invalidating a directory dentry effectively invalidate
5132 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5133 _try_to_trim_inode(dn
->inode
.get(), false);
5135 if (dn
->lru_is_expireable())
5136 unlink(dn
, true, false); // keep dir, drop dentry
5138 if (in
->dir
->dentries
.empty()) {
5144 if (ref
> 0 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5145 InodeRef snapdir
= open_snapdir(in
);
5146 _try_to_trim_inode(snapdir
.get(), false);
5151 auto q
= in
->dentries
.begin();
5152 while (q
!= in
->dentries
.end()) {
5155 if( in
->ll_ref
> 0 && sched_inval
) {
5156 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5157 // so in->dentries doesn't always reflect the state of kernel's dcache.
5158 _schedule_invalidate_dentry_callback(dn
, true);
5160 unlink(dn
, true, true);
5165 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5167 mds_rank_t mds
= session
->mds_num
;
5168 int used
= get_caps_used(in
);
5169 int wanted
= in
->caps_wanted();
5171 const unsigned new_caps
= m
->get_caps();
5172 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5173 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5174 << " mds." << mds
<< " seq " << m
->get_seq()
5175 << " caps now " << ccap_string(new_caps
)
5176 << " was " << ccap_string(cap
->issued
)
5177 << (was_stale
? "" : " (stale)") << dendl
;
5180 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5181 cap
->seq
= m
->get_seq();
5182 cap
->gen
= session
->cap_gen
;
5184 check_cap_issue(in
, new_caps
);
5188 in
->caps_issued(&issued
);
5189 issued
|= in
->caps_dirty();
5191 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5192 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5193 in
->mode
= m
->head
.mode
;
5194 in
->uid
= m
->head
.uid
;
5195 in
->gid
= m
->head
.gid
;
5196 in
->btime
= m
->btime
;
5198 bool deleted_inode
= false;
5199 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5200 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5201 in
->nlink
= m
->head
.nlink
;
5202 if (in
->nlink
== 0 &&
5203 (new_caps
& (CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
)))
5204 deleted_inode
= true;
5206 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5207 m
->xattrbl
.length() &&
5208 m
->head
.xattr_version
> in
->xattr_version
) {
5209 auto p
= m
->xattrbl
.cbegin();
5210 decode(in
->xattrs
, p
);
5211 in
->xattr_version
= m
->head
.xattr_version
;
5214 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5215 in
->dirstat
.nfiles
= m
->get_nfiles();
5216 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5219 if (new_caps
& CEPH_CAP_ANY_RD
) {
5220 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5221 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5224 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5225 in
->layout
= m
->get_layout();
5226 update_inode_file_size(in
, issued
, m
->get_size(),
5227 m
->get_truncate_seq(), m
->get_truncate_size());
5230 if (m
->inline_version
> in
->inline_version
) {
5231 in
->inline_data
= m
->inline_data
;
5232 in
->inline_version
= m
->inline_version
;
5235 /* always take a newer change attr */
5236 if (m
->get_change_attr() > in
->change_attr
)
5237 in
->change_attr
= m
->get_change_attr();
5240 if (cap
== in
->auth_cap
&&
5241 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5242 (m
->get_max_size() != in
->max_size
)) {
5243 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5244 in
->max_size
= m
->get_max_size();
5245 if (in
->max_size
> in
->wanted_max_size
) {
5246 in
->wanted_max_size
= 0;
5247 in
->requested_max_size
= 0;
5252 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5253 (wanted
& ~(cap
->wanted
| new_caps
))) {
5254 // If mds is importing cap, prior cap messages that update 'wanted'
5255 // may get dropped by mds (migrate seq mismatch).
5257 // We don't send cap message to update 'wanted' if what we want are
5258 // already issued. If mds revokes caps, cap message that releases caps
5259 // also tells mds what we want. But if caps got revoked by mds forcedly
5260 // (session stale). We may haven't told mds what we want.
5266 auto revoked
= cap
->issued
& ~new_caps
;
5268 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5269 cap
->issued
= new_caps
;
5270 cap
->implemented
|= new_caps
;
5272 // recall delegations if we're losing caps necessary for them
5273 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5274 in
->recall_deleg(false);
5275 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5276 in
->recall_deleg(true);
5278 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5279 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5280 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5281 // waitin' for flush
5282 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5286 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5289 } else if (cap
->issued
== new_caps
) {
5290 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5292 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5293 cap
->issued
= new_caps
;
5294 cap
->implemented
|= new_caps
;
5296 if (cap
== in
->auth_cap
) {
5297 // non-auth MDS is revoking the newly grant caps ?
5298 for (const auto &p
: in
->caps
) {
5299 if (&p
.second
== cap
)
5301 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5314 signal_cond_list(in
->waitfor_caps
);
5316 // may drop inode's last ref
5318 _try_to_trim_inode(in
, true);
5321 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5323 if (perms
.uid() == 0)
5326 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5327 int ret
= _posix_acl_permission(in
, perms
, want
);
5332 // check permissions before doing anything else
5333 if (!in
->check_mode(perms
, want
))
5338 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5339 const UserPerm
& perms
)
5341 int r
= _getattr_for_perm(in
, perms
);
5346 if (strncmp(name
, "system.", 7) == 0) {
5347 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5350 r
= inode_permission(in
, perms
, want
);
5353 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5357 ostream
& operator<<(ostream
&out
, const UserPerm
& perm
) {
5358 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5362 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5363 const UserPerm
& perms
)
5365 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5366 int r
= _getattr_for_perm(in
, perms
);
5370 if (mask
& CEPH_SETATTR_SIZE
) {
5371 r
= inode_permission(in
, perms
, MAY_WRITE
);
5377 if (mask
& CEPH_SETATTR_UID
) {
5378 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5381 if (mask
& CEPH_SETATTR_GID
) {
5382 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5383 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5387 if (mask
& CEPH_SETATTR_MODE
) {
5388 if (perms
.uid() != 0 && perms
.uid() != in
->uid
)
5391 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5392 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5393 stx
->stx_mode
&= ~S_ISGID
;
5396 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5397 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5398 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5399 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5400 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5401 check_mask
|= CEPH_SETATTR_MTIME
;
5402 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5403 check_mask
|= CEPH_SETATTR_ATIME
;
5404 if (check_mask
& mask
) {
5407 r
= inode_permission(in
, perms
, MAY_WRITE
);
5415 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5419 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5421 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5424 if ((flags
& O_ACCMODE
) == O_WRONLY
)
5426 else if ((flags
& O_ACCMODE
) == O_RDWR
)
5427 want
= MAY_READ
| MAY_WRITE
;
5428 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
5430 if (flags
& O_TRUNC
)
5434 switch (in
->mode
& S_IFMT
) {
5439 if (want
& MAY_WRITE
) {
5446 r
= _getattr_for_perm(in
, perms
);
5450 r
= inode_permission(in
, perms
, want
);
5452 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5456 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
5458 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5459 int r
= _getattr_for_perm(dir
, perms
);
5463 r
= inode_permission(dir
, perms
, MAY_EXEC
);
5465 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5469 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
5471 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5472 int r
= _getattr_for_perm(dir
, perms
);
5476 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5478 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5482 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
5484 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
5485 int r
= _getattr_for_perm(dir
, perms
);
5489 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5493 /* 'name == NULL' means rmsnap */
5494 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
5496 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
5499 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
5503 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5507 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
5509 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5510 int r
= _getattr_for_perm(in
, perms
);
5514 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
5520 if (!S_ISREG(in
->mode
))
5523 if (in
->mode
& S_ISUID
)
5526 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
5529 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
5531 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5535 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
5537 int mask
= CEPH_STAT_CAP_MODE
;
5539 if (acl_type
!= NO_ACL
) {
5540 mask
|= CEPH_STAT_CAP_XATTR
;
5541 force
= in
->xattr_version
== 0;
5543 return _getattr(in
, mask
, perms
, force
);
5546 vinodeno_t
Client::_get_vino(Inode
*in
)
5548 /* The caller must hold the client lock */
5549 return vinodeno_t(in
->ino
, in
->snapid
);
5553 * Resolve an MDS spec to a list of MDS daemon GIDs.
5555 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5556 * It may be '*' in which case it matches all GIDs.
5558 * If no error is returned, the `targets` vector will be populated with at least
5561 int Client::resolve_mds(
5562 const std::string
&mds_spec
,
5563 std::vector
<mds_gid_t
> *targets
)
5566 ceph_assert(targets
!= nullptr);
5569 std::stringstream ss
;
5570 int role_r
= fsmap
->parse_role(mds_spec
, &role
, ss
);
5572 // We got a role, resolve it to a GID
5573 ldout(cct
, 10) << __func__
<< ": resolved '" << mds_spec
<< "' to role '"
5574 << role
<< "'" << dendl
;
5576 fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
).global_id
);
5580 std::string strtol_err
;
5581 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
5582 if (strtol_err
.empty()) {
5583 // It is a possible GID
5584 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
5585 if (fsmap
->gid_exists(mds_gid
)) {
5586 ldout(cct
, 10) << __func__
<< ": validated GID " << mds_gid
<< dendl
;
5587 targets
->push_back(mds_gid
);
5589 lderr(cct
) << __func__
<< ": GID " << mds_gid
<< " not in MDS map"
5593 } else if (mds_spec
== "*") {
5594 // It is a wildcard: use all MDSs
5595 const auto mds_info
= fsmap
->get_mds_info();
5597 if (mds_info
.empty()) {
5598 lderr(cct
) << __func__
<< ": * passed but no MDS daemons found" << dendl
;
5602 for (const auto i
: mds_info
) {
5603 targets
->push_back(i
.first
);
5606 // It did not parse as an integer, it is not a wildcard, it must be a name
5607 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
5609 lderr(cct
) << "MDS ID '" << mds_spec
<< "' not found" << dendl
;
5611 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5615 ldout(cct
, 10) << __func__
<< ": resolved ID '" << mds_spec
5616 << "' to GID " << mds_gid
<< dendl
;
5617 targets
->push_back(mds_gid
);
5626 * Authenticate with mon and establish global ID
5628 int Client::authenticate()
5630 ceph_assert(client_lock
.is_locked_by_me());
5632 if (monclient
->is_authenticated()) {
5636 client_lock
.Unlock();
5637 int r
= monclient
->authenticate(cct
->_conf
->client_mount_timeout
);
5643 whoami
= monclient
->get_global_id();
5644 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
5649 int Client::fetch_fsmap(bool user
)
5652 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5653 // rather than MDSMap because no one MDSMap contains all the daemons, and
5654 // a `tell` can address any daemon.
5655 version_t fsmap_latest
;
5658 monclient
->get_version("fsmap", &fsmap_latest
, NULL
, &cond
);
5659 client_lock
.Unlock();
5662 } while (r
== -EAGAIN
);
5665 lderr(cct
) << "Failed to learn FSMap version: " << cpp_strerror(r
) << dendl
;
5669 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
5672 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
5673 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5674 monclient
->renew_subs();
5675 wait_on_list(waiting_for_fsmap
);
5677 ceph_assert(fsmap_user
);
5678 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
5680 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
5681 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5682 monclient
->renew_subs();
5683 wait_on_list(waiting_for_fsmap
);
5686 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
5688 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
5689 << fsmap_latest
<< dendl
;
5695 * @mds_spec one of ID, rank, GID, "*"
5698 int Client::mds_command(
5699 const std::string
&mds_spec
,
5700 const vector
<string
>& cmd
,
5701 const bufferlist
& inbl
,
5706 std::lock_guard
lock(client_lock
);
5717 r
= fetch_fsmap(false);
5722 // Look up MDS target(s) of the command
5723 std::vector
<mds_gid_t
> targets
;
5724 r
= resolve_mds(mds_spec
, &targets
);
5729 // If daemons are laggy, we won't send them commands. If all
5730 // are laggy then we fail.
5731 std::vector
<mds_gid_t
> non_laggy
;
5732 for (const auto gid
: targets
) {
5733 const auto info
= fsmap
->get_info_gid(gid
);
5734 if (!info
.laggy()) {
5735 non_laggy
.push_back(gid
);
5738 if (non_laggy
.size() == 0) {
5739 *outs
= "All targeted MDS daemons are laggy";
5743 if (metadata
.empty()) {
5744 // We are called on an unmounted client, so metadata
5745 // won't be initialized yet.
5746 populate_metadata("");
5749 // Send commands to targets
5750 C_GatherBuilder
gather(cct
, onfinish
);
5751 for (const auto target_gid
: non_laggy
) {
5752 const auto info
= fsmap
->get_info_gid(target_gid
);
5754 // Open a connection to the target MDS
5755 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
5757 // Generate MDSCommandOp state
5758 auto &op
= command_table
.start_command();
5760 op
.on_finish
= gather
.new_sub();
5765 op
.mds_gid
= target_gid
;
5768 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
5769 << " tid=" << op
.tid
<< cmd
<< dendl
;
5771 // Construct and send MCommand
5772 auto m
= op
.get_message(monclient
->get_fsid());
5773 conn
->send_message2(std::move(m
));
5780 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
5782 ceph_tid_t
const tid
= m
->get_tid();
5784 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
5786 if (!command_table
.exists(tid
)) {
5787 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
5791 auto &op
= command_table
.get_command(tid
);
5793 *op
.outbl
= m
->get_data();
5800 op
.on_finish
->complete(m
->r
);
5803 command_table
.erase(tid
);
5806 // -------------------
5809 int Client::subscribe_mdsmap(const std::string
&fs_name
)
5811 int r
= authenticate();
5813 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
5817 std::string resolved_fs_name
;
5818 if (fs_name
.empty()) {
5819 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
5821 resolved_fs_name
= fs_name
;
5824 std::string want
= "mdsmap";
5825 if (!resolved_fs_name
.empty()) {
5826 r
= fetch_fsmap(true);
5829 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
5830 if (fscid
== FS_CLUSTER_ID_NONE
) {
5834 std::ostringstream oss
;
5835 oss
<< want
<< "." << fscid
;
5838 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
5840 monclient
->sub_want(want
, 0, 0);
5841 monclient
->renew_subs();
5846 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
5847 bool require_mds
, const std::string
&fs_name
)
5849 std::lock_guard
lock(client_lock
);
5852 ldout(cct
, 5) << "already mounted" << dendl
;
5858 int r
= subscribe_mdsmap(fs_name
);
5860 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
5864 tick(); // start tick
5868 auto availability
= mdsmap
->is_cluster_available();
5869 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
5871 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
5872 return CEPH_FUSE_NO_MDS_UP
;
5873 } else if (availability
== MDSMap::AVAILABLE
) {
5874 // Continue to mount
5876 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
5877 // Else, wait. MDSMonitor will update the map to bring
5878 // us to a conclusion eventually.
5879 wait_on_list(waiting_for_mdsmap
);
5881 // Unexpected value!
5887 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
5889 filepath
fp(CEPH_INO_ROOT
);
5890 if (!mount_root
.empty()) {
5891 fp
= filepath(mount_root
.c_str());
5894 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
5895 req
->set_filepath(fp
);
5896 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
5897 int res
= make_request(req
, perms
);
5899 if (res
== -EACCES
&& root
) {
5900 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
5918 if (!cct
->_conf
->client_trace
.empty()) {
5919 traceout
.open(cct
->_conf
->client_trace
.c_str());
5920 if (traceout
.is_open()) {
5921 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
5923 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
5928 ldout(cct, 3) << "op: // client trace data structs" << dendl;
5929 ldout(cct, 3) << "op: struct stat st;" << dendl;
5930 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
5931 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
5932 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
5933 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
5934 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
5935 ldout(cct, 3) << "op: int fd;" << dendl;
5942 void Client::_close_sessions()
5944 while (!mds_sessions
.empty()) {
5945 // send session closes!
5946 for (auto &p
: mds_sessions
) {
5947 if (p
.second
.state
!= MetaSession::STATE_CLOSING
) {
5948 _close_mds_session(&p
.second
);
5952 // wait for sessions to close
5953 ldout(cct
, 2) << "waiting for " << mds_sessions
.size() << " mds sessions to close" << dendl
;
5954 mount_cond
.Wait(client_lock
);
5958 void Client::flush_mdlog_sync()
5960 if (mds_requests
.empty())
5962 for (auto &p
: mds_sessions
) {
5963 flush_mdlog(&p
.second
);
5967 void Client::flush_mdlog(MetaSession
*session
)
5969 // Only send this to Luminous or newer MDS daemons, older daemons
5970 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5971 const uint64_t features
= session
->con
->get_features();
5972 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
5973 auto m
= MClientSession::create(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
5974 session
->con
->send_message2(std::move(m
));
5979 void Client::_abort_mds_sessions(int err
)
5981 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
5982 auto req
= p
->second
;
5984 // unsafe requests will be removed during close session below.
5985 if (req
->got_unsafe
)
5989 if (req
->caller_cond
) {
5991 req
->caller_cond
->Signal();
5995 // Process aborts on any requests that were on this waitlist.
5996 // Any requests that were on a waiting_for_open session waitlist
5997 // will get kicked during close session below.
5998 signal_cond_list(waiting_for_mdsmap
);
6000 // Force-close all sessions
6001 while(!mds_sessions
.empty()) {
6002 auto& session
= mds_sessions
.begin()->second
;
6003 _closed_mds_session(&session
);
6007 void Client::_unmount(bool abort
)
6012 if (abort
|| blacklisted
) {
6013 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blacklisted)") << dendl
;
6015 ldout(cct
, 2) << "unmounting" << dendl
;
6022 // Abort all mds sessions
6023 _abort_mds_sessions(-ENOTCONN
);
6025 objecter
->op_cancel_writes(-ENOTCONN
);
6027 // flush the mdlog for pending requests, if any
6031 while (!mds_requests
.empty()) {
6032 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests" << dendl
;
6033 mount_cond
.Wait(client_lock
);
6037 timer
.cancel_event(tick_event
);
6042 // clean up any unclosed files
6043 while (!fd_map
.empty()) {
6044 Fh
*fh
= fd_map
.begin()->second
;
6045 fd_map
.erase(fd_map
.begin());
6046 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6050 while (!ll_unclosed_fh_set
.empty()) {
6051 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6053 ll_unclosed_fh_set
.erase(fh
);
6054 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6058 while (!opened_dirs
.empty()) {
6059 dir_result_t
*dirp
= *opened_dirs
.begin();
6060 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6066 while (unsafe_sync_write
> 0) {
6067 ldout(cct
, 0) << unsafe_sync_write
<< " unsafe_sync_writes, waiting" << dendl
;
6068 mount_cond
.Wait(client_lock
);
6071 if (cct
->_conf
->client_oc
) {
6072 // flush/release all buffered data
6073 std::list
<InodeRef
> anchor
;
6074 for (auto& p
: inode_map
) {
6075 Inode
*in
= p
.second
;
6077 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6081 // prevent inode from getting freed
6082 anchor
.emplace_back(in
);
6084 if (abort
|| blacklisted
) {
6085 objectcacher
->purge_set(&in
->oset
);
6086 } else if (!in
->caps
.empty()) {
6088 _flush(in
, new C_Client_FlushComplete(this, in
));
6093 if (abort
|| blacklisted
) {
6094 for (auto p
= dirty_list
.begin(); !p
.end(); ) {
6097 if (in
->dirty_caps
) {
6098 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6099 in
->mark_caps_clean();
6105 wait_sync_caps(last_flush_tid
);
6111 while (lru
.lru_get_size() > 0 ||
6112 !inode_map
.empty()) {
6113 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6114 << "+" << inode_map
.size() << " items"
6115 << ", waiting (for caps to release?)"
6117 utime_t until
= ceph_clock_now() + utime_t(5, 0);
6118 int r
= mount_cond
.WaitUntil(client_lock
, until
);
6119 if (r
== ETIMEDOUT
) {
6123 ceph_assert(lru
.lru_get_size() == 0);
6124 ceph_assert(inode_map
.empty());
6127 if (!cct
->_conf
->client_trace
.empty()) {
6128 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6136 ldout(cct
, 2) << "unmounted." << dendl
;
6139 void Client::unmount()
6141 std::lock_guard
lock(client_lock
);
6145 void Client::abort_conn()
6147 std::lock_guard
lock(client_lock
);
6151 void Client::flush_cap_releases()
6153 // send any cap releases
6154 for (auto &p
: mds_sessions
) {
6155 auto &session
= p
.second
;
6156 if (session
.release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
6158 if (cct
->_conf
->client_inject_release_failure
) {
6159 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
6161 session
.con
->send_message2(std::move(session
.release
));
6163 session
.release
.reset();
6170 if (cct
->_conf
->client_debug_inject_tick_delay
> 0) {
6171 sleep(cct
->_conf
->client_debug_inject_tick_delay
);
6172 ceph_assert(0 == cct
->_conf
.set_val("client_debug_inject_tick_delay", "0"));
6173 cct
->_conf
.apply_changes(nullptr);
6176 ldout(cct
, 21) << "tick" << dendl
;
6177 tick_event
= timer
.add_event_after(
6178 cct
->_conf
->client_tick_interval
,
6179 new FunctionContext([this](int) {
6180 // Called back via Timer, which takes client_lock for us
6181 ceph_assert(client_lock
.is_locked_by_me());
6184 utime_t now
= ceph_clock_now();
6186 if (!mounted
&& !mds_requests
.empty()) {
6187 MetaRequest
*req
= mds_requests
.begin()->second
;
6188 if (req
->op_stamp
+ cct
->_conf
->client_mount_timeout
< now
) {
6189 req
->abort(-ETIMEDOUT
);
6190 if (req
->caller_cond
) {
6192 req
->caller_cond
->Signal();
6194 signal_cond_list(waiting_for_mdsmap
);
6195 for (auto &p
: mds_sessions
) {
6196 signal_context_list(p
.second
.waiting_for_open
);
6201 if (mdsmap
->get_epoch()) {
6203 utime_t el
= now
- last_cap_renew
;
6204 if (el
> mdsmap
->get_session_timeout() / 3.0)
6207 flush_cap_releases();
6211 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6215 if (in
->hold_caps_until
> now
)
6217 delayed_list
.pop_front();
6218 check_caps(in
, CHECK_CAPS_NODELAY
);
6224 void Client::renew_caps()
6226 ldout(cct
, 10) << "renew_caps()" << dendl
;
6227 last_cap_renew
= ceph_clock_now();
6229 for (auto &p
: mds_sessions
) {
6230 ldout(cct
, 15) << "renew_caps requesting from mds." << p
.first
<< dendl
;
6231 if (mdsmap
->get_state(p
.first
) >= MDSMap::STATE_REJOIN
)
6232 renew_caps(&p
.second
);
6236 void Client::renew_caps(MetaSession
*session
)
6238 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
6239 session
->last_cap_renew_request
= ceph_clock_now();
6240 uint64_t seq
= ++session
->cap_renew_seq
;
6241 session
->con
->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
6245 // ===============================================================
6246 // high level (POSIXy) interface
6248 int Client::_do_lookup(Inode
*dir
, const string
& name
, int mask
,
6249 InodeRef
*target
, const UserPerm
& perms
)
6251 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_LOOKUPSNAP
: CEPH_MDS_OP_LOOKUP
;
6252 MetaRequest
*req
= new MetaRequest(op
);
6254 dir
->make_nosnap_relative_path(path
);
6255 path
.push_dentry(name
);
6256 req
->set_filepath(path
);
6257 req
->set_inode(dir
);
6258 if (cct
->_conf
->client_debug_getattr_caps
&& op
== CEPH_MDS_OP_LOOKUP
)
6259 mask
|= DEBUG_GETATTR_CAPS
;
6260 req
->head
.args
.getattr
.mask
= mask
;
6262 ldout(cct
, 10) << __func__
<< " on " << path
<< dendl
;
6264 int r
= make_request(req
, perms
, target
);
6265 ldout(cct
, 10) << __func__
<< " res is " << r
<< dendl
;
6269 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
6270 const UserPerm
& perms
)
6275 if (dname
== "..") {
6276 if (dir
->dentries
.empty()) {
6277 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
6278 filepath
path(dir
->ino
);
6279 req
->set_filepath(path
);
6282 int r
= make_request(req
, perms
, &tmptarget
, NULL
, rand() % mdsmap
->get_num_in_mds());
6285 Inode
*tempino
= tmptarget
.get();
6288 ldout(cct
, 8) << __func__
<< " found target " << (*target
)->ino
<< dendl
;
6294 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
6303 if (!dir
->is_dir()) {
6308 if (dname
.length() > NAME_MAX
) {
6313 if (dname
== cct
->_conf
->client_snapdir
&&
6314 dir
->snapid
== CEPH_NOSNAP
) {
6315 *target
= open_snapdir(dir
);
6320 dir
->dir
->dentries
.count(dname
)) {
6321 dn
= dir
->dir
->dentries
[dname
];
6323 ldout(cct
, 20) << __func__
<< " have dn " << dname
<< " mds." << dn
->lease_mds
<< " ttl " << dn
->lease_ttl
6324 << " seq " << dn
->lease_seq
6327 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
6328 // is dn lease valid?
6329 utime_t now
= ceph_clock_now();
6330 if (dn
->lease_mds
>= 0 &&
6331 dn
->lease_ttl
> now
&&
6332 mds_sessions
.count(dn
->lease_mds
)) {
6333 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6334 if (s
.cap_ttl
> now
&&
6335 s
.cap_gen
== dn
->lease_gen
) {
6336 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6337 // make trim_caps() behave.
6338 dir
->try_touch_cap(dn
->lease_mds
);
6341 ldout(cct
, 20) << " bad lease, cap_ttl " << s
.cap_ttl
<< ", cap_gen " << s
.cap_gen
6342 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
6345 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
6346 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
6347 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
6349 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
6350 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for "
6351 << *dir
<< " dn '" << dname
<< "'" << dendl
;
6356 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
6359 // can we conclude ENOENT locally?
6360 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
6361 (dir
->flags
& I_COMPLETE
)) {
6362 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
6367 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
6372 *target
= dn
->inode
;
6380 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
6382 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
6386 int Client::get_or_create(Inode
*dir
, const char* name
,
6387 Dentry
**pdn
, bool expect_null
)
6390 ldout(cct
, 20) << __func__
<< " " << *dir
<< " name " << name
<< dendl
;
6392 if (dir
->dir
->dentries
.count(name
)) {
6393 Dentry
*dn
= dir
->dir
->dentries
[name
];
6395 // is dn lease valid?
6396 utime_t now
= ceph_clock_now();
6398 dn
->lease_mds
>= 0 &&
6399 dn
->lease_ttl
> now
&&
6400 mds_sessions
.count(dn
->lease_mds
)) {
6401 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6402 if (s
.cap_ttl
> now
&&
6403 s
.cap_gen
== dn
->lease_gen
) {
6410 // otherwise link up a new one
6411 *pdn
= link(dir
->dir
, name
, NULL
, NULL
);
6418 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
6419 const UserPerm
& perms
, bool followsym
, int mask
)
6421 filepath path
= origpath
;
6423 if (origpath
.absolute())
6429 ldout(cct
, 10) << __func__
<< " " << path
<< dendl
;
6434 while (i
< path
.depth() && cur
) {
6436 const string
&dname
= path
[i
];
6437 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
6438 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
6440 if (cct
->_conf
->client_permissions
) {
6441 int r
= may_lookup(cur
.get(), perms
);
6444 caps
= CEPH_CAP_AUTH_SHARED
;
6447 /* Get extra requested caps on the last component */
6448 if (i
== (path
.depth() - 1))
6450 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
);
6453 // only follow trailing symlink if followsym. always follow
6454 // 'directory' symlinks.
6455 if (next
&& next
->is_symlink()) {
6457 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
6458 if (symlinks
> MAXSYMLINKS
) {
6462 if (i
< path
.depth() - 1) {
6464 // replace consumed components of path with symlink dir target
6465 filepath
resolved(next
->symlink
.c_str());
6466 resolved
.append(path
.postfixpath(i
+ 1));
6469 if (next
->symlink
[0] == '/') {
6473 } else if (followsym
) {
6474 if (next
->symlink
[0] == '/') {
6475 path
= next
->symlink
.c_str();
6480 filepath
more(next
->symlink
.c_str());
6481 // we need to remove the symlink component from off of the path
6482 // before adding the target that the symlink points to. remain
6483 // at the same position in the path.
6503 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
)
6505 std::lock_guard
lock(client_lock
);
6506 tout(cct
) << "link" << std::endl
;
6507 tout(cct
) << relexisting
<< std::endl
;
6508 tout(cct
) << relpath
<< std::endl
;
6513 filepath
existing(relexisting
);
6516 int r
= path_walk(existing
, &in
, perm
, true);
6519 if (std::string(relpath
) == "/") {
6523 filepath
path(relpath
);
6524 string name
= path
.last_dentry();
6527 r
= path_walk(path
, &dir
, perm
, true);
6530 if (cct
->_conf
->client_permissions
) {
6531 if (S_ISDIR(in
->mode
)) {
6535 r
= may_hardlink(in
.get(), perm
);
6538 r
= may_create(dir
.get(), perm
);
6542 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
);
6546 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
6548 std::lock_guard
lock(client_lock
);
6549 tout(cct
) << __func__
<< std::endl
;
6550 tout(cct
) << relpath
<< std::endl
;
6555 if (std::string(relpath
) == "/")
6558 filepath
path(relpath
);
6559 string name
= path
.last_dentry();
6562 int r
= path_walk(path
, &dir
, perm
);
6565 if (cct
->_conf
->client_permissions
) {
6566 r
= may_delete(dir
.get(), name
.c_str(), perm
);
6570 return _unlink(dir
.get(), name
.c_str(), perm
);
6573 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
)
6575 std::lock_guard
lock(client_lock
);
6576 tout(cct
) << __func__
<< std::endl
;
6577 tout(cct
) << relfrom
<< std::endl
;
6578 tout(cct
) << relto
<< std::endl
;
6583 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
6586 filepath
from(relfrom
);
6588 string fromname
= from
.last_dentry();
6590 string toname
= to
.last_dentry();
6593 InodeRef fromdir
, todir
;
6594 int r
= path_walk(from
, &fromdir
, perm
);
6597 r
= path_walk(to
, &todir
, perm
);
6601 if (cct
->_conf
->client_permissions
) {
6602 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
6605 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
6606 if (r
< 0 && r
!= -ENOENT
)
6609 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
);
6616 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
)
6618 std::lock_guard
lock(client_lock
);
6619 tout(cct
) << __func__
<< std::endl
;
6620 tout(cct
) << relpath
<< std::endl
;
6621 tout(cct
) << mode
<< std::endl
;
6622 ldout(cct
, 10) << __func__
<< ": " << relpath
<< dendl
;
6627 if (std::string(relpath
) == "/")
6630 filepath
path(relpath
);
6631 string name
= path
.last_dentry();
6634 int r
= path_walk(path
, &dir
, perm
);
6637 if (cct
->_conf
->client_permissions
) {
6638 r
= may_create(dir
.get(), perm
);
6642 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
);
6645 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
6647 std::lock_guard
lock(client_lock
);
6648 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
6649 tout(cct
) << __func__
<< std::endl
;
6650 tout(cct
) << relpath
<< std::endl
;
6651 tout(cct
) << mode
<< std::endl
;
6656 //get through existing parts of path
6657 filepath
path(relpath
);
6659 int r
= 0, caps
= 0;
6662 for (i
=0; i
<path
.depth(); ++i
) {
6663 if (cct
->_conf
->client_permissions
) {
6664 r
= may_lookup(cur
.get(), perms
);
6667 caps
= CEPH_CAP_AUTH_SHARED
;
6669 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
6674 if (r
!=-ENOENT
) return r
;
6675 ldout(cct
, 20) << __func__
<< " got through " << i
<< " directories on path " << relpath
<< dendl
;
6676 //make new directory at each level
6677 for (; i
<path
.depth(); ++i
) {
6678 if (cct
->_conf
->client_permissions
) {
6679 r
= may_create(cur
.get(), perms
);
6684 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
6686 //check proper creation/existence
6687 if(-EEXIST
== r
&& i
< path
.depth() - 1) {
6688 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
6692 //move to new dir and continue
6694 ldout(cct
, 20) << __func__
<< ": successfully created directory "
6695 << filepath(cur
->ino
).get_path() << dendl
;
6700 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
6702 std::lock_guard
lock(client_lock
);
6703 tout(cct
) << __func__
<< std::endl
;
6704 tout(cct
) << relpath
<< std::endl
;
6709 if (std::string(relpath
) == "/")
6712 filepath
path(relpath
);
6713 string name
= path
.last_dentry();
6716 int r
= path_walk(path
, &dir
, perms
);
6719 if (cct
->_conf
->client_permissions
) {
6720 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
6724 return _rmdir(dir
.get(), name
.c_str(), perms
);
6727 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
6729 std::lock_guard
lock(client_lock
);
6730 tout(cct
) << __func__
<< std::endl
;
6731 tout(cct
) << relpath
<< std::endl
;
6732 tout(cct
) << mode
<< std::endl
;
6733 tout(cct
) << rdev
<< std::endl
;
6738 if (std::string(relpath
) == "/")
6741 filepath
path(relpath
);
6742 string name
= path
.last_dentry();
6745 int r
= path_walk(path
, &dir
, perms
);
6748 if (cct
->_conf
->client_permissions
) {
6749 int r
= may_create(dir
.get(), perms
);
6753 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
6758 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
)
6760 std::lock_guard
lock(client_lock
);
6761 tout(cct
) << __func__
<< std::endl
;
6762 tout(cct
) << target
<< std::endl
;
6763 tout(cct
) << relpath
<< std::endl
;
6768 if (std::string(relpath
) == "/")
6771 filepath
path(relpath
);
6772 string name
= path
.last_dentry();
6775 int r
= path_walk(path
, &dir
, perms
);
6778 if (cct
->_conf
->client_permissions
) {
6779 int r
= may_create(dir
.get(), perms
);
6783 return _symlink(dir
.get(), name
.c_str(), target
, perms
);
6786 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
6788 std::lock_guard
lock(client_lock
);
6789 tout(cct
) << __func__
<< std::endl
;
6790 tout(cct
) << relpath
<< std::endl
;
6795 filepath
path(relpath
);
6797 int r
= path_walk(path
, &in
, perms
, false);
6801 return _readlink(in
.get(), buf
, size
);
6804 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
6806 if (!in
->is_symlink())
6809 // copy into buf (at most size bytes)
6810 int r
= in
->symlink
.length();
6813 memcpy(buf
, in
->symlink
.c_str(), r
);
6820 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
6822 bool yes
= in
->caps_issued_mask(mask
, true);
6824 ldout(cct
, 10) << __func__
<< " mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
6828 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6830 in
->make_nosnap_relative_path(path
);
6831 req
->set_filepath(path
);
6833 req
->head
.args
.getattr
.mask
= mask
;
6835 int res
= make_request(req
, perms
);
6836 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
6840 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
6841 const UserPerm
& perms
, InodeRef
*inp
)
6843 int issued
= in
->caps_issued();
6845 ldout(cct
, 10) << __func__
<< " mask " << mask
<< " issued " <<
6846 ccap_string(issued
) << dendl
;
6848 if (in
->snapid
!= CEPH_NOSNAP
) {
6851 if ((mask
& CEPH_SETATTR_SIZE
) &&
6852 (unsigned long)stx
->stx_size
> in
->size
&&
6853 is_quota_bytes_exceeded(in
, (unsigned long)stx
->stx_size
- in
->size
,
6858 // make the change locally?
6859 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
6860 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
6861 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
6862 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
6863 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
6866 * This works because we implicitly flush the caps as part of the
6867 * request, so the cap update check will happen with the writeback
6868 * cap context, and then the setattr check will happen with the
6871 * In reality this pattern is likely pretty rare (different users
6872 * setattr'ing the same file). If that turns out not to be the
6873 * case later, we can build a more complex pipelined cap writeback
6877 mask
|= CEPH_SETATTR_CTIME
;
6882 // caller just needs us to bump the ctime
6883 in
->ctime
= ceph_clock_now();
6884 in
->cap_dirtier_uid
= perms
.uid();
6885 in
->cap_dirtier_gid
= perms
.gid();
6886 if (issued
& CEPH_CAP_AUTH_EXCL
)
6887 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6888 else if (issued
& CEPH_CAP_FILE_EXCL
)
6889 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
6890 else if (issued
& CEPH_CAP_XATTR_EXCL
)
6891 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
6893 mask
|= CEPH_SETATTR_CTIME
;
6896 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
6897 bool kill_sguid
= mask
& (CEPH_SETATTR_SIZE
|CEPH_SETATTR_KILL_SGUID
);
6899 mask
&= ~CEPH_SETATTR_KILL_SGUID
;
6901 if (mask
& CEPH_SETATTR_UID
) {
6902 in
->ctime
= ceph_clock_now();
6903 in
->cap_dirtier_uid
= perms
.uid();
6904 in
->cap_dirtier_gid
= perms
.gid();
6905 in
->uid
= stx
->stx_uid
;
6906 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6907 mask
&= ~CEPH_SETATTR_UID
;
6909 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
6911 if (mask
& CEPH_SETATTR_GID
) {
6912 in
->ctime
= ceph_clock_now();
6913 in
->cap_dirtier_uid
= perms
.uid();
6914 in
->cap_dirtier_gid
= perms
.gid();
6915 in
->gid
= stx
->stx_gid
;
6916 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6917 mask
&= ~CEPH_SETATTR_GID
;
6919 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
6922 if (mask
& CEPH_SETATTR_MODE
) {
6923 in
->ctime
= ceph_clock_now();
6924 in
->cap_dirtier_uid
= perms
.uid();
6925 in
->cap_dirtier_gid
= perms
.gid();
6926 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
6927 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6928 mask
&= ~CEPH_SETATTR_MODE
;
6929 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
6930 } else if (kill_sguid
&& S_ISREG(in
->mode
) && (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
6931 /* Must squash the any setuid/setgid bits with an ownership change */
6932 in
->mode
&= ~(S_ISUID
|S_ISGID
);
6933 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6936 if (mask
& CEPH_SETATTR_BTIME
) {
6937 in
->ctime
= ceph_clock_now();
6938 in
->cap_dirtier_uid
= perms
.uid();
6939 in
->cap_dirtier_gid
= perms
.gid();
6940 in
->btime
= utime_t(stx
->stx_btime
);
6941 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6942 mask
&= ~CEPH_SETATTR_BTIME
;
6943 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
6945 } else if (mask
& CEPH_SETATTR_SIZE
) {
6946 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6947 mask
|= CEPH_SETATTR_KILL_SGUID
;
6950 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
6951 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
)) {
6952 if (mask
& CEPH_SETATTR_MTIME
)
6953 in
->mtime
= utime_t(stx
->stx_mtime
);
6954 if (mask
& CEPH_SETATTR_ATIME
)
6955 in
->atime
= utime_t(stx
->stx_atime
);
6956 in
->ctime
= ceph_clock_now();
6957 in
->cap_dirtier_uid
= perms
.uid();
6958 in
->cap_dirtier_gid
= perms
.gid();
6959 in
->time_warp_seq
++;
6960 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
6961 mask
&= ~(CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
);
6970 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
6974 in
->make_nosnap_relative_path(path
);
6975 req
->set_filepath(path
);
6978 if (mask
& CEPH_SETATTR_KILL_SGUID
) {
6979 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6981 if (mask
& CEPH_SETATTR_MODE
) {
6982 req
->head
.args
.setattr
.mode
= stx
->stx_mode
;
6983 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6984 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
6986 if (mask
& CEPH_SETATTR_UID
) {
6987 req
->head
.args
.setattr
.uid
= stx
->stx_uid
;
6988 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6989 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
6991 if (mask
& CEPH_SETATTR_GID
) {
6992 req
->head
.args
.setattr
.gid
= stx
->stx_gid
;
6993 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6994 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
6996 if (mask
& CEPH_SETATTR_BTIME
) {
6997 req
->head
.args
.setattr
.btime
= utime_t(stx
->stx_btime
);
6998 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7000 if (mask
& CEPH_SETATTR_MTIME
) {
7001 req
->head
.args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
7002 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7005 if (mask
& CEPH_SETATTR_ATIME
) {
7006 req
->head
.args
.setattr
.atime
= utime_t(stx
->stx_atime
);
7007 req
->inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
7010 if (mask
& CEPH_SETATTR_SIZE
) {
7011 if ((unsigned long)stx
->stx_size
< mdsmap
->get_max_filesize()) {
7012 req
->head
.args
.setattr
.size
= stx
->stx_size
;
7013 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
7016 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
7019 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7022 req
->head
.args
.setattr
.mask
= mask
;
7024 req
->regetattr_mask
= mask
;
7026 int res
= make_request(req
, perms
, inp
);
7027 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
7031 /* Note that we only care about attrs that setattr cares about */
7032 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
7034 stx
->stx_size
= st
->st_size
;
7035 stx
->stx_mode
= st
->st_mode
;
7036 stx
->stx_uid
= st
->st_uid
;
7037 stx
->stx_gid
= st
->st_gid
;
7039 stx
->stx_mtime
= st
->st_mtimespec
;
7040 stx
->stx_atime
= st
->st_atimespec
;
7042 stx
->stx_mtime
= st
->st_mtim
;
7043 stx
->stx_atime
= st
->st_atim
;
7047 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7048 const UserPerm
& perms
, InodeRef
*inp
)
7050 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
7053 if (mask
& CEPH_SETATTR_MODE
)
7054 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
7058 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
7059 const UserPerm
& perms
)
7061 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
7062 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
7063 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
7064 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
7065 if (cct
->_conf
->client_permissions
) {
7066 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
7070 return __setattrx(in
.get(), stx
, mask
, perms
);
7073 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
7074 const UserPerm
& perms
)
7076 struct ceph_statx stx
;
7078 stat_to_statx(attr
, &stx
);
7079 mask
&= ~CEPH_SETATTR_BTIME
;
7081 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
7082 mask
&= ~CEPH_SETATTR_UID
;
7084 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
7085 mask
&= ~CEPH_SETATTR_GID
;
7088 return _setattrx(in
, &stx
, mask
, perms
);
7091 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
7092 const UserPerm
& perms
)
7094 std::lock_guard
lock(client_lock
);
7095 tout(cct
) << __func__
<< std::endl
;
7096 tout(cct
) << relpath
<< std::endl
;
7097 tout(cct
) << mask
<< std::endl
;
7102 filepath
path(relpath
);
7104 int r
= path_walk(path
, &in
, perms
);
7107 return _setattr(in
, attr
, mask
, perms
);
7110 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
7111 const UserPerm
& perms
, int flags
)
7113 std::lock_guard
lock(client_lock
);
7114 tout(cct
) << __func__
<< std::endl
;
7115 tout(cct
) << relpath
<< std::endl
;
7116 tout(cct
) << mask
<< std::endl
;
7121 filepath
path(relpath
);
7123 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
7126 return _setattrx(in
, stx
, mask
, perms
);
7129 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
7131 std::lock_guard
lock(client_lock
);
7132 tout(cct
) << __func__
<< std::endl
;
7133 tout(cct
) << fd
<< std::endl
;
7134 tout(cct
) << mask
<< std::endl
;
7139 Fh
*f
= get_filehandle(fd
);
7142 #if defined(__linux__) && defined(O_PATH)
7143 if (f
->flags
& O_PATH
)
7146 return _setattr(f
->inode
, attr
, mask
, perms
);
7149 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
7151 std::lock_guard
lock(client_lock
);
7152 tout(cct
) << __func__
<< std::endl
;
7153 tout(cct
) << fd
<< std::endl
;
7154 tout(cct
) << mask
<< std::endl
;
7159 Fh
*f
= get_filehandle(fd
);
7162 #if defined(__linux__) && defined(O_PATH)
7163 if (f
->flags
& O_PATH
)
7166 return _setattrx(f
->inode
, stx
, mask
, perms
);
7169 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
7170 frag_info_t
*dirstat
, int mask
)
7172 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7173 std::lock_guard
lock(client_lock
);
7174 tout(cct
) << "stat" << std::endl
;
7175 tout(cct
) << relpath
<< std::endl
;
7180 filepath
path(relpath
);
7182 int r
= path_walk(path
, &in
, perms
, true, mask
);
7185 r
= _getattr(in
, mask
, perms
);
7187 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7190 fill_stat(in
, stbuf
, dirstat
);
7191 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7195 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
7199 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7200 if (flags
& AT_NO_ATTR_SYNC
)
7203 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7204 mask
|= CEPH_CAP_PIN
;
7205 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7206 mask
|= CEPH_CAP_AUTH_SHARED
;
7207 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7208 mask
|= CEPH_CAP_LINK_SHARED
;
7209 if (want
& (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
7210 mask
|= CEPH_CAP_FILE_SHARED
;
7211 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
7212 mask
|= CEPH_CAP_XATTR_SHARED
;
7217 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
7218 const UserPerm
& perms
,
7219 unsigned int want
, unsigned int flags
)
7221 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " want " << want
<< ")" << dendl
;
7222 std::lock_guard
lock(client_lock
);
7223 tout(cct
) << "statx" << std::endl
;
7224 tout(cct
) << relpath
<< std::endl
;
7229 filepath
path(relpath
);
7232 unsigned mask
= statx_to_mask(flags
, want
);
7234 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
7238 r
= _getattr(in
, mask
, perms
);
7240 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7244 fill_statx(in
, mask
, stx
);
7245 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << stx
->stx_mask
<< ")" << dendl
;
7249 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
7250 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
7252 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7253 std::lock_guard
lock(client_lock
);
7254 tout(cct
) << __func__
<< std::endl
;
7255 tout(cct
) << relpath
<< std::endl
;
7260 filepath
path(relpath
);
7262 // don't follow symlinks
7263 int r
= path_walk(path
, &in
, perms
, false, mask
);
7266 r
= _getattr(in
, mask
, perms
);
7268 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7271 fill_stat(in
, stbuf
, dirstat
);
7272 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7276 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
7278 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7279 << " mode 0" << oct
<< in
->mode
<< dec
7280 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7281 memset(st
, 0, sizeof(struct stat
));
7282 if (use_faked_inos())
7283 st
->st_ino
= in
->faked_ino
;
7285 st
->st_ino
= in
->ino
;
7286 st
->st_dev
= in
->snapid
;
7287 st
->st_mode
= in
->mode
;
7288 st
->st_rdev
= in
->rdev
;
7290 switch (in
->nlink
) {
7292 st
->st_nlink
= 0; /* dir is unlinked */
7295 st
->st_nlink
= 1 /* parent dentry */
7297 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7303 st
->st_nlink
= in
->nlink
;
7305 st
->st_uid
= in
->uid
;
7306 st
->st_gid
= in
->gid
;
7307 if (in
->ctime
> in
->mtime
) {
7308 stat_set_ctime_sec(st
, in
->ctime
.sec());
7309 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
7311 stat_set_ctime_sec(st
, in
->mtime
.sec());
7312 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
7314 stat_set_atime_sec(st
, in
->atime
.sec());
7315 stat_set_atime_nsec(st
, in
->atime
.nsec());
7316 stat_set_mtime_sec(st
, in
->mtime
.sec());
7317 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
7319 if (cct
->_conf
->client_dirsize_rbytes
)
7320 st
->st_size
= in
->rstat
.rbytes
;
7322 st
->st_size
= in
->dirstat
.size();
7325 st
->st_size
= in
->size
;
7326 st
->st_blocks
= (in
->size
+ 511) >> 9;
7328 st
->st_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7331 *dirstat
= in
->dirstat
;
7335 return in
->caps_issued();
7338 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
7340 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7341 << " mode 0" << oct
<< in
->mode
<< dec
7342 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7343 memset(stx
, 0, sizeof(struct ceph_statx
));
7346 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7347 * so that all bits are set.
7352 /* These are always considered to be available */
7353 stx
->stx_dev
= in
->snapid
;
7354 stx
->stx_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7356 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7357 stx
->stx_mode
= S_IFMT
& in
->mode
;
7358 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (ino_t
)in
->ino
;
7359 stx
->stx_rdev
= in
->rdev
;
7360 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
7362 if (mask
& CEPH_CAP_AUTH_SHARED
) {
7363 stx
->stx_uid
= in
->uid
;
7364 stx
->stx_gid
= in
->gid
;
7365 stx
->stx_mode
= in
->mode
;
7366 in
->btime
.to_timespec(&stx
->stx_btime
);
7367 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
7370 if (mask
& CEPH_CAP_LINK_SHARED
) {
7372 switch (in
->nlink
) {
7374 stx
->stx_nlink
= 0; /* dir is unlinked */
7377 stx
->stx_nlink
= 1 /* parent dentry */
7379 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7385 stx
->stx_nlink
= in
->nlink
;
7387 stx
->stx_mask
|= CEPH_STATX_NLINK
;
7390 if (mask
& CEPH_CAP_FILE_SHARED
) {
7392 in
->atime
.to_timespec(&stx
->stx_atime
);
7393 in
->mtime
.to_timespec(&stx
->stx_mtime
);
7396 if (cct
->_conf
->client_dirsize_rbytes
)
7397 stx
->stx_size
= in
->rstat
.rbytes
;
7399 stx
->stx_size
= in
->dirstat
.size();
7400 stx
->stx_blocks
= 1;
7402 stx
->stx_size
= in
->size
;
7403 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
7405 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
7406 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
7409 /* Change time and change_attr both require all shared caps to view */
7410 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
7411 stx
->stx_version
= in
->change_attr
;
7412 if (in
->ctime
> in
->mtime
)
7413 in
->ctime
.to_timespec(&stx
->stx_ctime
);
7415 in
->mtime
.to_timespec(&stx
->stx_ctime
);
7416 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
7421 void Client::touch_dn(Dentry
*dn
)
7426 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7428 std::lock_guard
lock(client_lock
);
7429 tout(cct
) << __func__
<< std::endl
;
7430 tout(cct
) << relpath
<< std::endl
;
7431 tout(cct
) << mode
<< std::endl
;
7436 filepath
path(relpath
);
7438 int r
= path_walk(path
, &in
, perms
);
7442 attr
.st_mode
= mode
;
7443 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7446 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
7448 std::lock_guard
lock(client_lock
);
7449 tout(cct
) << __func__
<< std::endl
;
7450 tout(cct
) << fd
<< std::endl
;
7451 tout(cct
) << mode
<< std::endl
;
7456 Fh
*f
= get_filehandle(fd
);
7459 #if defined(__linux__) && defined(O_PATH)
7460 if (f
->flags
& O_PATH
)
7464 attr
.st_mode
= mode
;
7465 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
7468 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7470 std::lock_guard
lock(client_lock
);
7471 tout(cct
) << __func__
<< std::endl
;
7472 tout(cct
) << relpath
<< std::endl
;
7473 tout(cct
) << mode
<< std::endl
;
7478 filepath
path(relpath
);
7480 // don't follow symlinks
7481 int r
= path_walk(path
, &in
, perms
, false);
7485 attr
.st_mode
= mode
;
7486 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7489 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7490 const UserPerm
& perms
)
7492 std::lock_guard
lock(client_lock
);
7493 tout(cct
) << __func__
<< std::endl
;
7494 tout(cct
) << relpath
<< std::endl
;
7495 tout(cct
) << new_uid
<< std::endl
;
7496 tout(cct
) << new_gid
<< std::endl
;
7501 filepath
path(relpath
);
7503 int r
= path_walk(path
, &in
, perms
);
7507 attr
.st_uid
= new_uid
;
7508 attr
.st_gid
= new_gid
;
7509 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
7512 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
7514 std::lock_guard
lock(client_lock
);
7515 tout(cct
) << __func__
<< std::endl
;
7516 tout(cct
) << fd
<< std::endl
;
7517 tout(cct
) << new_uid
<< std::endl
;
7518 tout(cct
) << new_gid
<< std::endl
;
7523 Fh
*f
= get_filehandle(fd
);
7526 #if defined(__linux__) && defined(O_PATH)
7527 if (f
->flags
& O_PATH
)
7531 attr
.st_uid
= new_uid
;
7532 attr
.st_gid
= new_gid
;
7534 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7535 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7536 return _setattr(f
->inode
, &attr
, mask
, perms
);
7539 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7540 const UserPerm
& perms
)
7542 std::lock_guard
lock(client_lock
);
7543 tout(cct
) << __func__
<< std::endl
;
7544 tout(cct
) << relpath
<< std::endl
;
7545 tout(cct
) << new_uid
<< std::endl
;
7546 tout(cct
) << new_gid
<< std::endl
;
7551 filepath
path(relpath
);
7553 // don't follow symlinks
7554 int r
= path_walk(path
, &in
, perms
, false);
7558 attr
.st_uid
= new_uid
;
7559 attr
.st_gid
= new_gid
;
7561 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7562 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7563 return _setattr(in
, &attr
, mask
, perms
);
7566 static void attr_set_atime_and_mtime(struct stat
*attr
,
7567 const utime_t
&atime
,
7568 const utime_t
&mtime
)
7570 stat_set_atime_sec(attr
, atime
.tv
.tv_sec
);
7571 stat_set_atime_nsec(attr
, atime
.tv
.tv_nsec
);
7572 stat_set_mtime_sec(attr
, mtime
.tv
.tv_sec
);
7573 stat_set_mtime_nsec(attr
, mtime
.tv
.tv_nsec
);
7576 // for [l]utime() invoke the timeval variant as the timespec
7577 // variant are not yet implemented. for futime[s](), invoke
7578 // the timespec variant.
7579 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
7580 const UserPerm
& perms
)
7582 struct timeval tv
[2];
7583 tv
[0].tv_sec
= buf
->actime
;
7585 tv
[1].tv_sec
= buf
->modtime
;
7588 return utimes(relpath
, tv
, perms
);
7591 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
7592 const UserPerm
& perms
)
7594 struct timeval tv
[2];
7595 tv
[0].tv_sec
= buf
->actime
;
7597 tv
[1].tv_sec
= buf
->modtime
;
7600 return lutimes(relpath
, tv
, perms
);
7603 int Client::futime(int fd
, struct utimbuf
*buf
, const UserPerm
& perms
)
7605 struct timespec ts
[2];
7606 ts
[0].tv_sec
= buf
->actime
;
7608 ts
[1].tv_sec
= buf
->modtime
;
7611 return futimens(fd
, ts
, perms
);
7614 int Client::utimes(const char *relpath
, struct timeval times
[2],
7615 const UserPerm
& perms
)
7617 std::lock_guard
lock(client_lock
);
7618 tout(cct
) << __func__
<< std::endl
;
7619 tout(cct
) << relpath
<< std::endl
;
7620 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7622 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7628 filepath
path(relpath
);
7630 int r
= path_walk(path
, &in
, perms
);
7634 utime_t
atime(times
[0]);
7635 utime_t
mtime(times
[1]);
7637 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7638 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7641 int Client::lutimes(const char *relpath
, struct timeval times
[2],
7642 const UserPerm
& perms
)
7644 std::lock_guard
lock(client_lock
);
7645 tout(cct
) << __func__
<< std::endl
;
7646 tout(cct
) << relpath
<< std::endl
;
7647 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7649 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7655 filepath
path(relpath
);
7657 int r
= path_walk(path
, &in
, perms
, false);
7661 utime_t
atime(times
[0]);
7662 utime_t
mtime(times
[1]);
7664 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7665 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7668 int Client::futimes(int fd
, struct timeval times
[2], const UserPerm
& perms
)
7670 struct timespec ts
[2];
7671 ts
[0].tv_sec
= times
[0].tv_sec
;
7672 ts
[0].tv_nsec
= times
[0].tv_usec
* 1000;
7673 ts
[1].tv_sec
= times
[1].tv_sec
;
7674 ts
[1].tv_nsec
= times
[1].tv_usec
* 1000;
7676 return futimens(fd
, ts
, perms
);
7679 int Client::futimens(int fd
, struct timespec times
[2], const UserPerm
& perms
)
7681 std::lock_guard
lock(client_lock
);
7682 tout(cct
) << __func__
<< std::endl
;
7683 tout(cct
) << fd
<< std::endl
;
7684 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
7686 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
7692 Fh
*f
= get_filehandle(fd
);
7695 #if defined(__linux__) && defined(O_PATH)
7696 if (f
->flags
& O_PATH
)
7700 utime_t
atime(times
[0]);
7701 utime_t
mtime(times
[1]);
7703 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7704 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7707 int Client::flock(int fd
, int operation
, uint64_t owner
)
7709 std::lock_guard
lock(client_lock
);
7710 tout(cct
) << __func__
<< std::endl
;
7711 tout(cct
) << fd
<< std::endl
;
7712 tout(cct
) << operation
<< std::endl
;
7713 tout(cct
) << owner
<< std::endl
;
7718 Fh
*f
= get_filehandle(fd
);
7722 return _flock(f
, operation
, owner
);
7725 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7727 std::lock_guard
lock(client_lock
);
7728 tout(cct
) << __func__
<< std::endl
;
7729 tout(cct
) << relpath
<< std::endl
;
7734 filepath
path(relpath
);
7736 int r
= path_walk(path
, &in
, perms
, true);
7739 if (cct
->_conf
->client_permissions
) {
7740 int r
= may_open(in
.get(), O_RDONLY
, perms
);
7744 r
= _opendir(in
.get(), dirpp
, perms
);
/* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
7747 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
7751 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7755 *dirpp
= new dir_result_t(in
, perms
);
7756 opened_dirs
.insert(*dirpp
);
7757 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
7762 int Client::closedir(dir_result_t
*dir
)
7764 std::lock_guard
lock(client_lock
);
7765 tout(cct
) << __func__
<< std::endl
;
7766 tout(cct
) << (unsigned long)dir
<< std::endl
;
7768 ldout(cct
, 3) << __func__
<< "(" << dir
<< ") = 0" << dendl
;
7773 void Client::_closedir(dir_result_t
*dirp
)
7775 ldout(cct
, 10) << __func__
<< "(" << dirp
<< ")" << dendl
;
7777 ldout(cct
, 10) << __func__
<< " detaching inode " << dirp
->inode
<< dendl
;
7778 dirp
->inode
.reset();
7780 _readdir_drop_dirp_buffer(dirp
);
7781 opened_dirs
.erase(dirp
);
7785 void Client::rewinddir(dir_result_t
*dirp
)
7787 std::lock_guard
lock(client_lock
);
7788 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ")" << dendl
;
7793 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7794 _readdir_drop_dirp_buffer(d
);
7798 loff_t
Client::telldir(dir_result_t
*dirp
)
7800 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7801 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ") = " << d
->offset
<< dendl
;
7805 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
7807 std::lock_guard
lock(client_lock
);
7809 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ", " << offset
<< ")" << dendl
;
7814 if (offset
== dirp
->offset
)
7817 if (offset
> dirp
->offset
)
7818 dirp
->release_count
= 0; // bump if we do a forward seek
7820 dirp
->ordered_count
= 0; // disable filling readdir cache
7822 if (dirp
->hash_order()) {
7823 if (dirp
->offset
> offset
) {
7824 _readdir_drop_dirp_buffer(dirp
);
7829 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
7830 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
7831 _readdir_drop_dirp_buffer(dirp
);
7836 dirp
->offset
= offset
;
7841 // ino_t d_ino; /* inode number */
7842 // off_t d_off; /* offset to the next dirent */
7843 // unsigned short d_reclen; /* length of this record */
7844 // unsigned char d_type; /* type of file */
7845 // char d_name[256]; /* filename */
7847 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
7849 strncpy(de
->d_name
, name
, 255);
7850 de
->d_name
[255] = '\0';
7853 #if !defined(__APPLE__) && !defined(__FreeBSD__)
7854 de
->d_off
= next_off
;
7857 de
->d_type
= IFTODT(type
);
7858 ldout(cct
, 10) << __func__
<< " '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
7859 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
7863 void Client::_readdir_next_frag(dir_result_t
*dirp
)
7865 frag_t fg
= dirp
->buffer_frag
;
7867 if (fg
.is_rightmost()) {
7868 ldout(cct
, 10) << __func__
<< " advance from " << fg
<< " to END" << dendl
;
7875 ldout(cct
, 10) << __func__
<< " advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
7877 if (dirp
->hash_order()) {
7879 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
7880 if (dirp
->offset
< new_offset
) // don't decrease offset
7881 dirp
->offset
= new_offset
;
7883 dirp
->last_name
.clear();
7884 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
7885 _readdir_rechoose_frag(dirp
);
7889 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
7891 ceph_assert(dirp
->inode
);
7893 if (dirp
->hash_order())
7896 frag_t cur
= frag_t(dirp
->offset_high());
7897 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
7899 ldout(cct
, 10) << __func__
<< " frag " << cur
<< " maps to " << fg
<< dendl
;
7900 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
7901 dirp
->last_name
.clear();
7902 dirp
->next_offset
= 2;
7906 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
7908 ldout(cct
, 10) << __func__
<< " " << dirp
<< dendl
;
7909 dirp
->buffer
.clear();
7912 int Client::_readdir_get_frag(dir_result_t
*dirp
)
7915 ceph_assert(dirp
->inode
);
7917 // get the current frag.
7919 if (dirp
->hash_order())
7920 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
7922 fg
= frag_t(dirp
->offset_high());
7924 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
7925 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
7927 int op
= CEPH_MDS_OP_READDIR
;
7928 if (dirp
->inode
&& dirp
->inode
->snapid
== CEPH_SNAPDIR
)
7929 op
= CEPH_MDS_OP_LSSNAP
;
7931 InodeRef
& diri
= dirp
->inode
;
7933 MetaRequest
*req
= new MetaRequest(op
);
7935 diri
->make_nosnap_relative_path(path
);
7936 req
->set_filepath(path
);
7937 req
->set_inode(diri
.get());
7938 req
->head
.args
.readdir
.frag
= fg
;
7939 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
7940 if (dirp
->last_name
.length()) {
7941 req
->path2
.set_path(dirp
->last_name
);
7942 } else if (dirp
->hash_order()) {
7943 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
7948 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
7950 if (res
== -EAGAIN
) {
7951 ldout(cct
, 10) << __func__
<< " got EAGAIN, retrying" << dendl
;
7952 _readdir_rechoose_frag(dirp
);
7953 return _readdir_get_frag(dirp
);
7957 ldout(cct
, 10) << __func__
<< " " << dirp
<< " got frag " << dirp
->buffer_frag
7958 << " size " << dirp
->buffer
.size() << dendl
;
7960 ldout(cct
, 10) << __func__
<< " got error " << res
<< ", setting end flag" << dendl
;
7967 struct dentry_off_lt
{
7968 bool operator()(const Dentry
* dn
, int64_t off
) const {
7969 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
7973 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
7974 int caps
, bool getref
)
7976 ceph_assert(client_lock
.is_locked());
7977 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
7978 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
7980 Dir
*dir
= dirp
->inode
->dir
;
7983 ldout(cct
, 10) << " dir is empty" << dendl
;
7988 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
7989 dir
->readdir_cache
.end(),
7990 dirp
->offset
, dentry_off_lt());
7994 if (!dirp
->inode
->is_complete_and_ordered())
7996 if (pd
== dir
->readdir_cache
.end())
7999 if (dn
->inode
== NULL
) {
8000 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
8004 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
8005 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
8010 int r
= _getattr(dn
->inode
, caps
, dirp
->perms
);
8014 struct ceph_statx stx
;
8016 fill_statx(dn
->inode
, caps
, &stx
);
8018 uint64_t next_off
= dn
->offset
+ 1;
8019 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8021 if (pd
== dir
->readdir_cache
.end())
8022 next_off
= dir_result_t::END
;
8026 in
= dn
->inode
.get();
8030 dn_name
= dn
->name
; // fill in name while we have lock
8032 client_lock
.Unlock();
8033 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
8035 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
8036 << " = " << r
<< dendl
;
8041 dirp
->offset
= next_off
;
8043 dirp
->next_offset
= 2;
8045 dirp
->next_offset
= dirp
->offset_low();
8046 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
8047 dirp
->release_count
= 0; // last_name no longer match cache index
8052 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
8057 int Client::readdir_r_cb(dir_result_t
*d
, add_dirent_cb_t cb
, void *p
,
8058 unsigned want
, unsigned flags
, bool getref
)
8060 int caps
= statx_to_mask(flags
, want
);
8062 std::lock_guard
lock(client_lock
);
8067 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
8069 ldout(cct
, 10) << __func__
<< " " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
8070 << dec
<< " at_end=" << dirp
->at_end()
8071 << " hash_order=" << dirp
->hash_order() << dendl
;
8074 struct ceph_statx stx
;
8075 memset(&de
, 0, sizeof(de
));
8076 memset(&stx
, 0, sizeof(stx
));
8078 InodeRef
& diri
= dirp
->inode
;
8083 if (dirp
->offset
== 0) {
8084 ldout(cct
, 15) << " including ." << dendl
;
8085 ceph_assert(diri
->dentries
.size() < 2); // can't have multiple hard-links to a dir
8086 uint64_t next_off
= 1;
8089 r
= _getattr(diri
, caps
, dirp
->perms
);
8093 fill_statx(diri
, caps
, &stx
);
8094 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
8096 Inode
*inode
= NULL
;
8102 client_lock
.Unlock();
8103 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8108 dirp
->offset
= next_off
;
8112 if (dirp
->offset
== 1) {
8113 ldout(cct
, 15) << " including .." << dendl
;
8114 uint64_t next_off
= 2;
8116 if (diri
->dentries
.empty())
8119 in
= diri
->get_first_parent()->dir
->parent_inode
;
8122 r
= _getattr(in
, caps
, dirp
->perms
);
8126 fill_statx(in
, caps
, &stx
);
8127 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
8129 Inode
*inode
= NULL
;
8135 client_lock
.Unlock();
8136 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8141 dirp
->offset
= next_off
;
8146 // can we read from our cache?
8147 ldout(cct
, 10) << "offset " << hex
<< dirp
->offset
<< dec
8148 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
8149 << dirp
->inode
->is_complete_and_ordered()
8150 << " issued " << ccap_string(dirp
->inode
->caps_issued())
8152 if (dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
8153 dirp
->inode
->is_complete_and_ordered() &&
8154 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
8155 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
8164 bool check_caps
= true;
8165 if (!dirp
->is_cached()) {
8166 int r
= _readdir_get_frag(dirp
);
// _readdir_get_frag() may update dirp->offset if the replied dirfrag is
8170 // different than the requested one. (our dirfragtree was outdated)
8173 frag_t fg
= dirp
->buffer_frag
;
8175 ldout(cct
, 10) << "frag " << fg
<< " buffer size " << dirp
->buffer
.size()
8176 << " offset " << hex
<< dirp
->offset
<< dendl
;
8178 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
8179 dirp
->offset
, dir_result_t::dentry_off_lt());
8180 it
!= dirp
->buffer
.end();
8182 dir_result_t::dentry
&entry
= *it
;
8184 uint64_t next_off
= entry
.offset
+ 1;
8188 r
= _getattr(entry
.inode
, caps
, dirp
->perms
);
8193 fill_statx(entry
.inode
, caps
, &stx
);
8194 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8196 Inode
*inode
= NULL
;
8198 inode
= entry
.inode
.get();
8202 client_lock
.Unlock();
8203 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
8206 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
8207 << " = " << r
<< dendl
;
8211 dirp
->offset
= next_off
;
8216 if (dirp
->next_offset
> 2) {
8217 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
8218 _readdir_drop_dirp_buffer(dirp
);
8222 if (!fg
.is_rightmost()) {
8224 _readdir_next_frag(dirp
);
8228 if (diri
->shared_gen
== dirp
->start_shared_gen
&&
8229 diri
->dir_release_count
== dirp
->release_count
) {
8230 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
8231 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
8233 ceph_assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
8234 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
8236 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
8238 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
8239 diri
->flags
|= I_COMPLETE
;
8251 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
8253 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
8260 * 1 if we got a dirent
8261 * 0 for end of directory
8265 struct single_readdir
{
8267 struct ceph_statx
*stx
;
8272 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
8273 struct ceph_statx
*stx
, off_t off
,
8276 single_readdir
*c
= static_cast<single_readdir
*>(p
);
8279 return -1; // already filled this dirent
8289 struct dirent
*Client::readdir(dir_result_t
*d
)
8292 static struct dirent de
;
8299 // our callback fills the dirent and sets sr.full=true on first
8300 // call, and returns -1 the second time around.
8301 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
8303 errno
= -ret
; // this sucks.
8304 return (dirent
*) NULL
;
8309 return (dirent
*) NULL
;
8312 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
8313 struct ceph_statx
*stx
, unsigned want
,
8314 unsigned flags
, Inode
**out
)
8322 // our callback fills the dirent and sets sr.full=true on first
8323 // call, and returns -1 the second time around.
8324 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
8336 struct getdents_result
{
8343 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
8344 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8346 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
8352 dlen
= strlen(de
->d_name
) + 1;
8354 if (c
->pos
+ dlen
> c
->buflen
)
8355 return -1; // doesn't fit
8358 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
8360 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
8366 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
8371 gr
.fullent
= fullent
;
8374 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
8376 if (r
< 0) { // some error
8377 if (r
== -1) { // buffer ran out of space
8378 if (gr
.pos
) { // but we got some entries already!
8380 } // or we need a larger buffer
8382 } else { // actual error, return it
8391 struct getdir_result
{
8392 list
<string
> *contents
;
8396 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8398 getdir_result
*r
= static_cast<getdir_result
*>(p
);
8400 r
->contents
->push_back(de
->d_name
);
8405 int Client::getdir(const char *relpath
, list
<string
>& contents
,
8406 const UserPerm
& perms
)
8408 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
8410 std::lock_guard
lock(client_lock
);
8411 tout(cct
) << "getdir" << std::endl
;
8412 tout(cct
) << relpath
<< std::endl
;
8416 int r
= opendir(relpath
, &d
, perms
);
8421 gr
.contents
= &contents
;
8423 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
8433 /****** file i/o **********/
8434 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
8435 mode_t mode
, int stripe_unit
, int stripe_count
,
8436 int object_size
, const char *data_pool
)
8438 ldout(cct
, 3) << "open enter(" << relpath
<< ", " << ceph_flags_sys2wire(flags
) << "," << mode
<< ")" << dendl
;
8439 std::lock_guard
lock(client_lock
);
8440 tout(cct
) << "open" << std::endl
;
8441 tout(cct
) << relpath
<< std::endl
;
8442 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
8449 #if defined(__linux__) && defined(O_PATH)
8450 /* When the O_PATH is being specified, others flags than O_DIRECTORY
8451 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
8452 * in kernel (fs/open.c). */
8454 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
8457 filepath
path(relpath
);
8459 bool created
= false;
8460 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
8461 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
8462 int r
= path_walk(path
, &in
, perms
, followsym
, ceph_caps_for_mode(mode
));
8464 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
8467 #if defined(__linux__) && defined(O_PATH)
8468 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
8470 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
8474 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
8475 filepath dirpath
= path
;
8476 string dname
= dirpath
.last_dentry();
8477 dirpath
.pop_dentry();
8479 r
= path_walk(dirpath
, &dir
, perms
, true,
8480 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0);
8483 if (cct
->_conf
->client_permissions
) {
8484 r
= may_create(dir
.get(), perms
);
8488 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
8489 stripe_count
, object_size
, data_pool
, &created
, perms
);
8495 // posix says we can only check permissions of existing files
8496 if (cct
->_conf
->client_permissions
) {
8497 r
= may_open(in
.get(), flags
, perms
);
8504 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
8506 // allocate a integer file descriptor
8509 ceph_assert(fd_map
.count(r
) == 0);
8514 tout(cct
) << r
<< std::endl
;
8515 ldout(cct
, 3) << "open exit(" << path
<< ", " << ceph_flags_sys2wire(flags
) << ") = " << r
<< dendl
;
8519 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
, mode_t mode
)
8521 /* Use default file striping parameters */
8522 return open(relpath
, flags
, perms
, mode
, 0, 0, 0, NULL
);
8525 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
8526 const UserPerm
& perms
)
8528 std::lock_guard
lock(client_lock
);
8529 ldout(cct
, 3) << __func__
<< " enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
8534 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
8536 req
->set_filepath(path
);
8538 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
8540 sprintf(f
, "%u", h
);
8541 filepath
path2(dirino
);
8542 path2
.push_dentry(string(f
));
8543 req
->set_filepath2(path2
);
8545 int r
= make_request(req
, perms
, NULL
, NULL
,
8546 rand() % mdsmap
->get_num_in_mds());
8547 ldout(cct
, 3) << __func__
<< " exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
8553 * Load inode into local cache.
8555 * If inode pointer is non-NULL, and take a reference on
8556 * the resulting Inode object in one operation, so that caller
8557 * can safely assume inode will still be there after return.
8559 int Client::_lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8561 ldout(cct
, 8) << __func__
<< " enter(" << ino
<< ")" << dendl
;
8566 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPINO
);
8568 req
->set_filepath(path
);
8570 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8571 if (r
== 0 && inode
!= NULL
) {
8572 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
8573 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
8574 ceph_assert(p
!= inode_map
.end());
8578 ldout(cct
, 8) << __func__
<< " exit(" << ino
<< ") = " << r
<< dendl
;
8582 int Client::lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8584 std::lock_guard
lock(client_lock
);
8585 return _lookup_ino(ino
, perms
, inode
);
8589 * Find the parent inode of `ino` and insert it into
8590 * our cache. Conditionally also set `parent` to a referenced
8591 * Inode* if caller provides non-NULL value.
8593 int Client::_lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
8595 ldout(cct
, 8) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8597 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
8598 filepath
path(ino
->ino
);
8599 req
->set_filepath(path
);
8602 int r
= make_request(req
, perms
, &target
, NULL
, rand() % mdsmap
->get_num_in_mds());
8603 // Give caller a reference to the parent ino if they provided a pointer.
8604 if (parent
!= NULL
) {
8606 *parent
= target
.get();
8608 ldout(cct
, 8) << __func__
<< " found parent " << (*parent
)->ino
<< dendl
;
8613 ldout(cct
, 8) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8618 * Populate the parent dentry for `ino`, provided it is
8619 * a child of `parent`.
8621 int Client::_lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8623 ceph_assert(parent
->is_dir());
8624 ldout(cct
, 3) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8629 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
8630 req
->set_filepath2(filepath(parent
->ino
));
8631 req
->set_filepath(filepath(ino
->ino
));
8632 req
->set_inode(ino
);
8634 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8635 ldout(cct
, 3) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8639 int Client::lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8641 std::lock_guard
lock(client_lock
);
8642 return _lookup_name(ino
, parent
, perms
);
8645 Fh
*Client::_create_fh(Inode
*in
, int flags
, int cmode
, const UserPerm
& perms
)
8648 Fh
*f
= new Fh(in
, flags
, cmode
, perms
);
8650 ldout(cct
, 10) << __func__
<< " " << in
->ino
<< " mode " << cmode
<< dendl
;
8652 if (in
->snapid
!= CEPH_NOSNAP
) {
8653 in
->snap_cap_refs
++;
8654 ldout(cct
, 5) << "open success, fh is " << f
<< " combined IMMUTABLE SNAP caps "
8655 << ccap_string(in
->caps_issued()) << dendl
;
8658 const auto& conf
= cct
->_conf
;
8659 f
->readahead
.set_trigger_requests(1);
8660 f
->readahead
.set_min_readahead_size(conf
->client_readahead_min
);
8661 uint64_t max_readahead
= Readahead::NO_LIMIT
;
8662 if (conf
->client_readahead_max_bytes
) {
8663 max_readahead
= std::min(max_readahead
, (uint64_t)conf
->client_readahead_max_bytes
);
8665 if (conf
->client_readahead_max_periods
) {
8666 max_readahead
= std::min(max_readahead
, in
->layout
.get_period()*(uint64_t)conf
->client_readahead_max_periods
);
8668 f
->readahead
.set_max_readahead_size(max_readahead
);
8669 vector
<uint64_t> alignments
;
8670 alignments
.push_back(in
->layout
.get_period());
8671 alignments
.push_back(in
->layout
.stripe_unit
);
8672 f
->readahead
.set_alignments(alignments
);
8677 int Client::_release_fh(Fh
*f
)
8679 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8680 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8681 Inode
*in
= f
->inode
.get();
8682 ldout(cct
, 8) << __func__
<< " " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
8686 if (in
->snapid
== CEPH_NOSNAP
) {
8687 if (in
->put_open_ref(f
->mode
)) {
8688 _flush(in
, new C_Client_FlushComplete(this, in
));
8692 ceph_assert(in
->snap_cap_refs
> 0);
8693 in
->snap_cap_refs
--;
8696 _release_filelocks(f
);
8698 // Finally, read any async err (i.e. from flushes)
8699 int err
= f
->take_async_err();
8701 ldout(cct
, 1) << __func__
<< " " << f
<< " on inode " << *in
<< " caught async_err = "
8702 << cpp_strerror(err
) << dendl
;
8704 ldout(cct
, 10) << __func__
<< " " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
8712 void Client::_put_fh(Fh
*f
)
8714 int left
= f
->put();
8720 int Client::_open(Inode
*in
, int flags
, mode_t mode
, Fh
**fhp
,
8721 const UserPerm
& perms
)
8723 if (in
->snapid
!= CEPH_NOSNAP
&&
8724 (flags
& (O_WRONLY
| O_RDWR
| O_CREAT
| O_TRUNC
| O_APPEND
))) {
8728 // use normalized flags to generate cmode
8729 int cflags
= ceph_flags_sys2wire(flags
);
8730 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
8731 cflags
|= CEPH_O_LAZY
;
8733 int cmode
= ceph_flags_to_mode(cflags
);
8734 int want
= ceph_caps_for_mode(cmode
);
8737 in
->get_open_ref(cmode
); // make note of pending open, since it effects _wanted_ caps.
8739 if ((flags
& O_TRUNC
) == 0 && in
->caps_issued_mask(want
)) {
8741 check_caps(in
, CHECK_CAPS_NODELAY
);
8744 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8746 in
->make_nosnap_relative_path(path
);
8747 req
->set_filepath(path
);
8748 req
->head
.args
.open
.flags
= cflags
& ~CEPH_O_CREAT
;
8749 req
->head
.args
.open
.mode
= mode
;
8750 req
->head
.args
.open
.pool
= -1;
8751 if (cct
->_conf
->client_debug_getattr_caps
)
8752 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8754 req
->head
.args
.open
.mask
= 0;
8755 req
->head
.args
.open
.old_size
= in
->size
; // for O_TRUNC
8757 result
= make_request(req
, perms
);
8760 * NFS expects that delegations will be broken on a conflicting open,
8761 * not just when there is actual conflicting access to the file. SMB leases
8762 * and oplocks also have similar semantics.
8764 * Ensure that clients that have delegations enabled will wait on minimal
8765 * caps during open, just to ensure that other clients holding delegations
8766 * return theirs first.
8768 if (deleg_timeout
&& result
== 0) {
8771 if (cmode
& CEPH_FILE_MODE_WR
)
8772 need
|= CEPH_CAP_FILE_WR
;
8773 if (cmode
& CEPH_FILE_MODE_RD
)
8774 need
|= CEPH_CAP_FILE_RD
;
8776 result
= get_caps(in
, need
, want
, &have
, -1);
8778 ldout(cct
, 8) << "Unable to get caps after open of inode " << *in
<<
8779 " . Denying open: " <<
8780 cpp_strerror(result
) << dendl
;
8781 in
->put_open_ref(cmode
);
8783 put_cap_ref(in
, need
);
8791 *fhp
= _create_fh(in
, flags
, cmode
, perms
);
8793 in
->put_open_ref(cmode
);
8801 int Client::_renew_caps(Inode
*in
)
8803 int wanted
= in
->caps_file_wanted();
8804 if (in
->is_any_caps() &&
8805 ((wanted
& CEPH_CAP_ANY_WR
) == 0 || in
->auth_cap
)) {
8806 check_caps(in
, CHECK_CAPS_NODELAY
);
8811 if ((wanted
& CEPH_CAP_FILE_RD
) && (wanted
& CEPH_CAP_FILE_WR
))
8813 else if (wanted
& CEPH_CAP_FILE_RD
)
8815 else if (wanted
& CEPH_CAP_FILE_WR
)
8818 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8820 in
->make_nosnap_relative_path(path
);
8821 req
->set_filepath(path
);
8822 req
->head
.args
.open
.flags
= flags
;
8823 req
->head
.args
.open
.pool
= -1;
8824 if (cct
->_conf
->client_debug_getattr_caps
)
8825 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8827 req
->head
.args
.open
.mask
= 0;
8830 // duplicate in case Cap goes away; not sure if that race is a concern?
8831 const UserPerm
*pperm
= in
->get_best_perms();
8835 int ret
= make_request(req
, perms
);
8839 int Client::close(int fd
)
8841 ldout(cct
, 3) << "close enter(" << fd
<< ")" << dendl
;
8842 std::lock_guard
lock(client_lock
);
8843 tout(cct
) << "close" << std::endl
;
8844 tout(cct
) << fd
<< std::endl
;
8849 Fh
*fh
= get_filehandle(fd
);
8852 int err
= _release_fh(fh
);
8855 ldout(cct
, 3) << "close exit(" << fd
<< ")" << dendl
;
8863 loff_t
Client::lseek(int fd
, loff_t offset
, int whence
)
8865 std::lock_guard
lock(client_lock
);
8866 tout(cct
) << "lseek" << std::endl
;
8867 tout(cct
) << fd
<< std::endl
;
8868 tout(cct
) << offset
<< std::endl
;
8869 tout(cct
) << whence
<< std::endl
;
8874 Fh
*f
= get_filehandle(fd
);
8877 #if defined(__linux__) && defined(O_PATH)
8878 if (f
->flags
& O_PATH
)
8881 return _lseek(f
, offset
, whence
);
8884 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
8886 Inode
*in
= f
->inode
.get();
8900 r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
8903 pos
= in
->size
+ offset
;
8916 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
8921 void Client::lock_fh_pos(Fh
*f
)
8923 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
8925 if (f
->pos_locked
|| !f
->pos_waiters
.empty()) {
8927 f
->pos_waiters
.push_back(&cond
);
8928 ldout(cct
, 10) << __func__
<< " BLOCKING on " << f
<< dendl
;
8929 while (f
->pos_locked
|| f
->pos_waiters
.front() != &cond
)
8930 cond
.Wait(client_lock
);
8931 ldout(cct
, 10) << __func__
<< " UNBLOCKING on " << f
<< dendl
;
8932 ceph_assert(f
->pos_waiters
.front() == &cond
);
8933 f
->pos_waiters
.pop_front();
8936 f
->pos_locked
= true;
8939 void Client::unlock_fh_pos(Fh
*f
)
8941 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
8942 f
->pos_locked
= false;
8945 int Client::uninline_data(Inode
*in
, Context
*onfinish
)
8947 if (!in
->inline_data
.length()) {
8948 onfinish
->complete(0);
8953 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (long long unsigned)in
->ino
);
8954 object_t oid
= oid_buf
;
8956 ObjectOperation create_ops
;
8957 create_ops
.create(false);
8959 objecter
->mutate(oid
,
8960 OSDMap::file_to_object_locator(in
->layout
),
8962 in
->snaprealm
->get_snap_context(),
8963 ceph::real_clock::now(),
8967 bufferlist inline_version_bl
;
8968 encode(in
->inline_version
, inline_version_bl
);
8970 ObjectOperation uninline_ops
;
8971 uninline_ops
.cmpxattr("inline_version",
8972 CEPH_OSD_CMPXATTR_OP_GT
,
8973 CEPH_OSD_CMPXATTR_MODE_U64
,
8975 bufferlist inline_data
= in
->inline_data
;
8976 uninline_ops
.write(0, inline_data
, in
->truncate_size
, in
->truncate_seq
);
8977 uninline_ops
.setxattr("inline_version", stringify(in
->inline_version
));
8979 objecter
->mutate(oid
,
8980 OSDMap::file_to_object_locator(in
->layout
),
8982 in
->snaprealm
->get_snap_context(),
8983 ceph::real_clock::now(),
8992 // blocking osd interface
8994 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
8996 std::lock_guard
lock(client_lock
);
8997 tout(cct
) << "read" << std::endl
;
8998 tout(cct
) << fd
<< std::endl
;
8999 tout(cct
) << size
<< std::endl
;
9000 tout(cct
) << offset
<< std::endl
;
9005 Fh
*f
= get_filehandle(fd
);
9008 #if defined(__linux__) && defined(O_PATH)
9009 if (f
->flags
& O_PATH
)
9013 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9014 size
= std::min(size
, (loff_t
)INT_MAX
);
9015 int r
= _read(f
, offset
, size
, &bl
);
9016 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9018 bl
.copy(0, bl
.length(), buf
);
9024 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
9028 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
9031 int64_t Client::_read(Fh
*f
, int64_t offset
, uint64_t size
, bufferlist
*bl
)
9034 bool movepos
= false;
9035 std::unique_ptr
<C_SaferCond
> onuninline
;
9037 const auto& conf
= cct
->_conf
;
9038 Inode
*in
= f
->inode
.get();
9040 utime_t start
= ceph_clock_now();
9042 if ((f
->mode
& CEPH_FILE_MODE_RD
) == 0)
9044 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9051 loff_t start_pos
= offset
;
9053 if (in
->inline_version
== 0) {
9054 r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9058 ceph_assert(in
->inline_version
> 0);
9062 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9063 want
= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
;
9065 want
= CEPH_CAP_FILE_CACHE
;
9066 r
= get_caps(in
, CEPH_CAP_FILE_RD
, want
, &have
, -1);
9070 if (f
->flags
& O_DIRECT
)
9071 have
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
9073 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9074 if (!(have
& CEPH_CAP_FILE_CACHE
)) {
9075 onuninline
.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9076 uninline_data(in
, onuninline
.get());
9078 uint32_t len
= in
->inline_data
.length();
9079 uint64_t endoff
= offset
+ size
;
9080 if (endoff
> in
->size
)
9084 if (endoff
<= len
) {
9085 bl
->substr_of(in
->inline_data
, offset
, endoff
- offset
);
9087 bl
->substr_of(in
->inline_data
, offset
, len
- offset
);
9088 bl
->append_zero(endoff
- len
);
9090 r
= endoff
- offset
;
9091 } else if ((uint64_t)offset
< endoff
) {
9092 bl
->append_zero(endoff
- offset
);
9093 r
= endoff
- offset
;
9101 if (!conf
->client_debug_force_sync_read
&&
9103 (have
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
9105 if (f
->flags
& O_RSYNC
) {
9106 _flush_range(in
, offset
, size
);
9108 r
= _read_async(f
, offset
, size
, bl
);
9112 if (f
->flags
& O_DIRECT
)
9113 _flush_range(in
, offset
, size
);
9115 bool checkeof
= false;
9116 r
= _read_sync(f
, offset
, size
, bl
, &checkeof
);
9123 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9126 r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9131 if ((uint64_t)offset
< in
->size
)
9137 ceph_assert(r
>= 0);
9140 f
->pos
= start_pos
+ r
;
9143 lat
= ceph_clock_now();
9145 logger
->tinc(l_c_read
, lat
);
9151 client_lock
.Unlock();
9152 int ret
= onuninline
->wait();
9154 if (ret
>= 0 || ret
== -ECANCELED
) {
9155 in
->inline_data
.clear();
9156 in
->inline_version
= CEPH_INLINE_NONE
;
9157 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9163 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9171 Client::C_Readahead::C_Readahead(Client
*c
, Fh
*f
) :
9174 f
->readahead
.inc_pending();
9177 Client::C_Readahead::~C_Readahead() {
9178 f
->readahead
.dec_pending();
9182 void Client::C_Readahead::finish(int r
) {
9183 lgeneric_subdout(client
->cct
, client
, 20) << "client." << client
->get_nodeid() << " " << "C_Readahead on " << f
->inode
<< dendl
;
9184 client
->put_cap_ref(f
->inode
.get(), CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9187 int Client::_read_async(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
)
9189 const auto& conf
= cct
->_conf
;
9190 Inode
*in
= f
->inode
.get();
9192 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9194 // trim read based on file size?
9195 if (off
>= in
->size
)
9199 if (off
+ len
> in
->size
) {
9200 len
= in
->size
- off
;
9203 ldout(cct
, 10) << " min_bytes=" << f
->readahead
.get_min_readahead_size()
9204 << " max_bytes=" << f
->readahead
.get_max_readahead_size()
9205 << " max_periods=" << conf
->client_readahead_max_periods
<< dendl
;
9207 // read (and possibly block)
9209 C_SaferCond
onfinish("Client::_read_async flock");
9210 r
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9211 off
, len
, bl
, 0, &onfinish
);
9213 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9214 client_lock
.Unlock();
9215 r
= onfinish
.wait();
9217 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9220 if(f
->readahead
.get_min_readahead_size() > 0) {
9221 pair
<uint64_t, uint64_t> readahead_extent
= f
->readahead
.update(off
, len
, in
->size
);
9222 if (readahead_extent
.second
> 0) {
9223 ldout(cct
, 20) << "readahead " << readahead_extent
.first
<< "~" << readahead_extent
.second
9224 << " (caller wants " << off
<< "~" << len
<< ")" << dendl
;
9225 Context
*onfinish2
= new C_Readahead(this, f
);
9226 int r2
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9227 readahead_extent
.first
, readahead_extent
.second
,
9228 NULL
, 0, onfinish2
);
9230 ldout(cct
, 20) << "readahead initiated, c " << onfinish2
<< dendl
;
9231 get_cap_ref(in
, CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9233 ldout(cct
, 20) << "readahead was no-op, already cached" << dendl
;
9242 int Client::_read_sync(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
,
9245 Inode
*in
= f
->inode
.get();
9250 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9252 Mutex
flock("Client::_read_sync flock");
9255 C_SaferCond
onfinish("Client::_read_sync flock");
9259 filer
->read_trunc(in
->ino
, &in
->layout
, in
->snapid
,
9261 in
->truncate_size
, in
->truncate_seq
,
9263 client_lock
.Unlock();
9264 int r
= onfinish
.wait();
9267 // if we get ENOENT from OSD, assume 0 bytes returned
9278 bl
->claim_append(tbl
);
9281 if (r
>= 0 && r
< wanted
) {
9282 if (pos
< in
->size
) {
9283 // zero up to known EOF
9284 int64_t some
= in
->size
- pos
;
9287 auto z
= buffer::ptr_node::create(some
);
9289 bl
->push_back(std::move(z
));
9306 * we keep count of uncommitted sync writes on the inode, so that
9309 void Client::_sync_write_commit(Inode
*in
)
9311 ceph_assert(unsafe_sync_write
> 0);
9312 unsafe_sync_write
--;
9314 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9316 ldout(cct
, 15) << __func__
<< " unsafe_sync_write = " << unsafe_sync_write
<< dendl
;
9317 if (unsafe_sync_write
== 0 && unmounting
) {
9318 ldout(cct
, 10) << __func__
<< " -- no more unsafe writes, unmount can proceed" << dendl
;
9319 mount_cond
.Signal();
9323 int Client::write(int fd
, const char *buf
, loff_t size
, loff_t offset
)
9325 std::lock_guard
lock(client_lock
);
9326 tout(cct
) << "write" << std::endl
;
9327 tout(cct
) << fd
<< std::endl
;
9328 tout(cct
) << size
<< std::endl
;
9329 tout(cct
) << offset
<< std::endl
;
9334 Fh
*fh
= get_filehandle(fd
);
9337 #if defined(__linux__) && defined(O_PATH)
9338 if (fh
->flags
& O_PATH
)
9341 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9342 size
= std::min(size
, (loff_t
)INT_MAX
);
9343 int r
= _write(fh
, offset
, size
, buf
, NULL
, false);
9344 ldout(cct
, 3) << "write(" << fd
<< ", \"...\", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9348 int Client::pwritev(int fd
, const struct iovec
*iov
, int iovcnt
, int64_t offset
)
9352 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, true);
9355 int64_t Client::_preadv_pwritev_locked(Fh
*fh
, const struct iovec
*iov
,
9356 unsigned iovcnt
, int64_t offset
, bool write
,
9359 #if defined(__linux__) && defined(O_PATH)
9360 if (fh
->flags
& O_PATH
)
9363 loff_t totallen
= 0;
9364 for (unsigned i
= 0; i
< iovcnt
; i
++) {
9365 totallen
+= iov
[i
].iov_len
;
9369 * Some of the API functions take 64-bit size values, but only return
9370 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9371 * we don't do I/Os larger than the values we can return.
9374 totallen
= std::min(totallen
, (loff_t
)INT_MAX
);
9377 int64_t w
= _write(fh
, offset
, totallen
, NULL
, iov
, iovcnt
);
9378 ldout(cct
, 3) << "pwritev(" << fh
<< ", \"...\", " << totallen
<< ", " << offset
<< ") = " << w
<< dendl
;
9382 int64_t r
= _read(fh
, offset
, totallen
, &bl
);
9383 ldout(cct
, 3) << "preadv(" << fh
<< ", " << offset
<< ") = " << r
<< dendl
;
9388 for (unsigned j
= 0, resid
= r
; j
< iovcnt
&& resid
> 0; j
++) {
9390 * This piece of code aims to handle the case that bufferlist does not have enough data
9391 * to fill in the iov
9393 if (resid
< iov
[j
].iov_len
) {
9394 bl
.copy(bufoff
, resid
, (char *)iov
[j
].iov_base
);
9397 bl
.copy(bufoff
, iov
[j
].iov_len
, (char *)iov
[j
].iov_base
);
9399 resid
-= iov
[j
].iov_len
;
9400 bufoff
+= iov
[j
].iov_len
;
9406 int Client::_preadv_pwritev(int fd
, const struct iovec
*iov
, unsigned iovcnt
, int64_t offset
, bool write
)
9408 std::lock_guard
lock(client_lock
);
9409 tout(cct
) << fd
<< std::endl
;
9410 tout(cct
) << offset
<< std::endl
;
9415 Fh
*fh
= get_filehandle(fd
);
9418 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, offset
, write
, true);
9421 int64_t Client::_write(Fh
*f
, int64_t offset
, uint64_t size
, const char *buf
,
9422 const struct iovec
*iov
, int iovcnt
)
9426 if ((uint64_t)(offset
+size
) > mdsmap
->get_max_filesize()) //too large!
9429 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9430 Inode
*in
= f
->inode
.get();
9432 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
9436 ceph_assert(in
->snapid
== CEPH_NOSNAP
);
9438 // was Fh opened as writeable?
9439 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
9442 // use/adjust fd pos?
9446 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9447 * change out from under us.
9449 if (f
->flags
& O_APPEND
) {
9450 int r
= _lseek(f
, 0, SEEK_END
);
9462 uint64_t endoff
= offset
+ size
;
9463 if (endoff
> in
->size
&& is_quota_bytes_exceeded(in
, endoff
- in
->size
,
9468 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9470 ldout(cct
, 10) << "cur file size is " << in
->size
<< dendl
;
9473 utime_t start
= ceph_clock_now();
9475 if (in
->inline_version
== 0) {
9476 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9479 ceph_assert(in
->inline_version
> 0);
9482 // copy into fresh buffer (since our write may be resub, async)
9486 bl
.append(buf
, size
);
9488 for (int i
= 0; i
< iovcnt
; i
++) {
9489 if (iov
[i
].iov_len
> 0) {
9490 bl
.append((const char *)iov
[i
].iov_base
, iov
[i
].iov_len
);
9496 uint64_t totalwritten
;
9498 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9499 want
= CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
;
9501 want
= CEPH_CAP_FILE_BUFFER
;
9502 int r
= get_caps(in
, CEPH_CAP_FILE_WR
|CEPH_CAP_AUTH_SHARED
, want
, &have
, endoff
);
9506 /* clear the setuid/setgid bits, if any */
9507 if (unlikely(in
->mode
& (S_ISUID
|S_ISGID
)) && size
> 0) {
9508 struct ceph_statx stx
= { 0 };
9510 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9511 r
= __setattrx(in
, &stx
, CEPH_SETATTR_KILL_SGUID
, f
->actor_perms
);
9515 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9518 if (f
->flags
& O_DIRECT
)
9519 have
&= ~(CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
);
9521 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
9523 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
9525 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9526 if (endoff
> cct
->_conf
->client_max_inline_size
||
9527 endoff
> CEPH_INLINE_MAX_SIZE
||
9528 !(have
& CEPH_CAP_FILE_BUFFER
)) {
9529 onuninline
.reset(new C_SaferCond("Client::_write_uninline_data flock"));
9530 uninline_data(in
, onuninline
.get());
9532 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9534 uint32_t len
= in
->inline_data
.length();
9537 in
->inline_data
.copy(endoff
, len
- endoff
, bl
);
9540 in
->inline_data
.splice(offset
, len
- offset
);
9541 else if (offset
> len
)
9542 in
->inline_data
.append_zero(offset
- len
);
9544 in
->inline_data
.append(bl
);
9545 in
->inline_version
++;
9547 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9553 if (cct
->_conf
->client_oc
&&
9554 (have
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
))) {
9555 // do buffered write
9556 if (!in
->oset
.dirty_or_tx
)
9557 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
9559 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9561 // async, caching, non-blocking.
9562 r
= objectcacher
->file_write(&in
->oset
, &in
->layout
,
9563 in
->snaprealm
->get_snap_context(),
9564 offset
, size
, bl
, ceph::real_clock::now(),
9566 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9571 // flush cached write if O_SYNC is set on file fh
9572 // O_DSYNC == O_SYNC on linux < 2.6.33
9573 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9574 if ((f
->flags
& O_SYNC
) || (f
->flags
& O_DSYNC
)) {
9575 _flush_range(in
, offset
, size
);
9578 if (f
->flags
& O_DIRECT
)
9579 _flush_range(in
, offset
, size
);
9581 // simple, non-atomic sync write
9582 C_SaferCond
onfinish("Client::_write flock");
9583 unsafe_sync_write
++;
9584 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
); // released by onsafe callback
9586 filer
->write_trunc(in
->ino
, &in
->layout
, in
->snaprealm
->get_snap_context(),
9587 offset
, size
, bl
, ceph::real_clock::now(), 0,
9588 in
->truncate_size
, in
->truncate_seq
,
9590 client_lock
.Unlock();
9593 _sync_write_commit(in
);
9596 // if we get here, write was successful, update client metadata
9599 lat
= ceph_clock_now();
9601 logger
->tinc(l_c_wrlat
, lat
);
9608 totalwritten
= size
;
9609 r
= (int64_t)totalwritten
;
9612 if (totalwritten
+ offset
> in
->size
) {
9613 in
->size
= totalwritten
+ offset
;
9614 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9616 if (is_quota_bytes_approaching(in
, f
->actor_perms
)) {
9617 check_caps(in
, CHECK_CAPS_NODELAY
);
9618 } else if (is_max_size_approaching(in
)) {
9622 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", extending file size" << dendl
;
9624 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", leaving file size at " << in
->size
<< dendl
;
9628 in
->mtime
= in
->ctime
= ceph_clock_now();
9630 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9634 if (nullptr != onuninline
) {
9635 client_lock
.Unlock();
9636 int uninline_ret
= onuninline
->wait();
9639 if (uninline_ret
>= 0 || uninline_ret
== -ECANCELED
) {
9640 in
->inline_data
.clear();
9641 in
->inline_version
= CEPH_INLINE_NONE
;
9642 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9648 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
9652 int Client::_flush(Fh
*f
)
9654 Inode
*in
= f
->inode
.get();
9655 int err
= f
->take_async_err();
9657 ldout(cct
, 1) << __func__
<< ": " << f
<< " on inode " << *in
<< " caught async_err = "
9658 << cpp_strerror(err
) << dendl
;
9660 ldout(cct
, 10) << __func__
<< ": " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
9666 int Client::truncate(const char *relpath
, loff_t length
, const UserPerm
& perms
)
9668 struct ceph_statx stx
;
9669 stx
.stx_size
= length
;
9670 return setattrx(relpath
, &stx
, CEPH_SETATTR_SIZE
, perms
);
9673 int Client::ftruncate(int fd
, loff_t length
, const UserPerm
& perms
)
9675 std::lock_guard
lock(client_lock
);
9676 tout(cct
) << __func__
<< std::endl
;
9677 tout(cct
) << fd
<< std::endl
;
9678 tout(cct
) << length
<< std::endl
;
9683 Fh
*f
= get_filehandle(fd
);
9686 #if defined(__linux__) && defined(O_PATH)
9687 if (f
->flags
& O_PATH
)
9691 attr
.st_size
= length
;
9692 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_SIZE
, perms
);
9695 int Client::fsync(int fd
, bool syncdataonly
)
9697 std::lock_guard
lock(client_lock
);
9698 tout(cct
) << "fsync" << std::endl
;
9699 tout(cct
) << fd
<< std::endl
;
9700 tout(cct
) << syncdataonly
<< std::endl
;
9705 Fh
*f
= get_filehandle(fd
);
9708 #if defined(__linux__) && defined(O_PATH)
9709 if (f
->flags
& O_PATH
)
9712 int r
= _fsync(f
, syncdataonly
);
9714 // The IOs in this fsync were okay, but maybe something happened
9715 // in the background that we shoudl be reporting?
9716 r
= f
->take_async_err();
9717 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
9718 << ") = 0, async_err = " << r
<< dendl
;
9720 // Assume that an error we encountered during fsync, even reported
9721 // synchronously, would also have applied the error to the Fh, and we
9722 // should clear it here to avoid returning the same error again on next
9724 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
<< ") = "
9726 f
->take_async_err();
9731 int Client::_fsync(Inode
*in
, bool syncdataonly
)
9734 std::unique_ptr
<C_SaferCond
> object_cacher_completion
= nullptr;
9735 ceph_tid_t flush_tid
= 0;
9738 utime_t start
= ceph_clock_now();
9740 ldout(cct
, 8) << "_fsync on " << *in
<< " " << (syncdataonly
? "(dataonly)":"(data+metadata)") << dendl
;
9742 if (cct
->_conf
->client_oc
) {
9743 object_cacher_completion
.reset(new C_SaferCond("Client::_fsync::lock"));
9744 tmp_ref
= in
; // take a reference; C_SaferCond doesn't and _flush won't either
9745 _flush(in
, object_cacher_completion
.get());
9746 ldout(cct
, 15) << "using return-valued form of _fsync" << dendl
;
9749 if (!syncdataonly
&& in
->dirty_caps
) {
9750 check_caps(in
, CHECK_CAPS_NODELAY
|CHECK_CAPS_SYNCHRONOUS
);
9751 if (in
->flushing_caps
)
9752 flush_tid
= last_flush_tid
;
9753 } else ldout(cct
, 10) << "no metadata needs to commit" << dendl
;
9755 if (!syncdataonly
&& !in
->unsafe_ops
.empty()) {
9758 MetaRequest
*req
= in
->unsafe_ops
.back();
9759 ldout(cct
, 15) << "waiting on unsafe requests, last tid " << req
->get_tid() << dendl
;
9762 wait_on_list(req
->waitfor_safe
);
9766 if (nullptr != object_cacher_completion
) { // wait on a real reply instead of guessing
9767 client_lock
.Unlock();
9768 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
9769 r
= object_cacher_completion
->wait();
9771 ldout(cct
, 15) << "got " << r
<< " from flush writeback" << dendl
;
9773 // FIXME: this can starve
9774 while (in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] > 0) {
9775 ldout(cct
, 10) << "ino " << in
->ino
<< " has " << in
->cap_refs
[CEPH_CAP_FILE_BUFFER
]
9776 << " uncommitted, waiting" << dendl
;
9777 wait_on_list(in
->waitfor_commit
);
9783 wait_sync_caps(in
, flush_tid
);
9785 ldout(cct
, 10) << "ino " << in
->ino
<< " has no uncommitted writes" << dendl
;
9787 ldout(cct
, 8) << "ino " << in
->ino
<< " failed to commit to disk! "
9788 << cpp_strerror(-r
) << dendl
;
9791 lat
= ceph_clock_now();
9793 logger
->tinc(l_c_fsync
, lat
);
9798 int Client::_fsync(Fh
*f
, bool syncdataonly
)
9800 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
9801 return _fsync(f
->inode
.get(), syncdataonly
);
9804 int Client::fstat(int fd
, struct stat
*stbuf
, const UserPerm
& perms
, int mask
)
9806 std::lock_guard
lock(client_lock
);
9807 tout(cct
) << "fstat mask " << hex
<< mask
<< dec
<< std::endl
;
9808 tout(cct
) << fd
<< std::endl
;
9813 Fh
*f
= get_filehandle(fd
);
9816 int r
= _getattr(f
->inode
, mask
, perms
);
9819 fill_stat(f
->inode
, stbuf
, NULL
);
9820 ldout(cct
, 5) << "fstat(" << fd
<< ", " << stbuf
<< ") = " << r
<< dendl
;
9824 int Client::fstatx(int fd
, struct ceph_statx
*stx
, const UserPerm
& perms
,
9825 unsigned int want
, unsigned int flags
)
9827 std::lock_guard
lock(client_lock
);
9828 tout(cct
) << "fstatx flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
9829 tout(cct
) << fd
<< std::endl
;
9834 Fh
*f
= get_filehandle(fd
);
9838 unsigned mask
= statx_to_mask(flags
, want
);
9841 if (mask
&& !f
->inode
->caps_issued_mask(mask
, true)) {
9842 r
= _getattr(f
->inode
, mask
, perms
);
9844 ldout(cct
, 3) << "fstatx exit on error!" << dendl
;
9849 fill_statx(f
->inode
, mask
, stx
);
9850 ldout(cct
, 3) << "fstatx(" << fd
<< ", " << stx
<< ") = " << r
<< dendl
;
9854 // not written yet, but i want to link!
9856 int Client::chdir(const char *relpath
, std::string
&new_cwd
,
9857 const UserPerm
& perms
)
9859 std::lock_guard
lock(client_lock
);
9860 tout(cct
) << "chdir" << std::endl
;
9861 tout(cct
) << relpath
<< std::endl
;
9866 filepath
path(relpath
);
9868 int r
= path_walk(path
, &in
, perms
);
9873 ldout(cct
, 3) << "chdir(" << relpath
<< ") cwd now " << cwd
->ino
<< dendl
;
9875 _getcwd(new_cwd
, perms
);
9879 void Client::_getcwd(string
& dir
, const UserPerm
& perms
)
9882 ldout(cct
, 10) << __func__
<< " " << *cwd
<< dendl
;
9884 Inode
*in
= cwd
.get();
9885 while (in
!= root
) {
9886 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
9888 // A cwd or ancester is unlinked
9889 if (in
->dentries
.empty()) {
9893 Dentry
*dn
= in
->get_first_parent();
9898 ldout(cct
, 10) << __func__
<< " looking up parent for " << *in
<< dendl
;
9899 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
9900 filepath
path(in
->ino
);
9901 req
->set_filepath(path
);
9903 int res
= make_request(req
, perms
);
9912 path
.push_front_dentry(dn
->name
);
9913 in
= dn
->dir
->parent_inode
;
9916 dir
+= path
.get_path();
9919 void Client::getcwd(string
& dir
, const UserPerm
& perms
)
9921 std::lock_guard
l(client_lock
);
9923 _getcwd(dir
, perms
);
9926 int Client::statfs(const char *path
, struct statvfs
*stbuf
,
9927 const UserPerm
& perms
)
9929 std::lock_guard
l(client_lock
);
9930 tout(cct
) << __func__
<< std::endl
;
9931 unsigned long int total_files_on_fs
;
9939 const vector
<int64_t> &data_pools
= mdsmap
->get_data_pools();
9940 if (data_pools
.size() == 1) {
9941 objecter
->get_fs_stats(stats
, data_pools
[0], &cond
);
9943 objecter
->get_fs_stats(stats
, boost::optional
<int64_t>(), &cond
);
9946 client_lock
.Unlock();
9947 int rval
= cond
.wait();
9949 total_files_on_fs
= root
->rstat
.rfiles
+ root
->rstat
.rsubdirs
;
9953 ldout(cct
, 1) << "underlying call to statfs returned error: "
9954 << cpp_strerror(rval
)
9959 memset(stbuf
, 0, sizeof(*stbuf
));
9962 * we're going to set a block size of 4MB so we can represent larger
9963 * FSes without overflowing. Additionally convert the space
9964 * measurements from KB to bytes while making them in terms of
9965 * blocks. We use 4MB only because it is big enough, and because it
9966 * actually *is* the (ceph) default block size.
9968 const int CEPH_BLOCK_SHIFT
= 22;
9969 stbuf
->f_frsize
= 1 << CEPH_BLOCK_SHIFT
;
9970 stbuf
->f_bsize
= 1 << CEPH_BLOCK_SHIFT
;
9971 stbuf
->f_files
= total_files_on_fs
;
9973 stbuf
->f_favail
= -1;
9974 stbuf
->f_fsid
= -1; // ??
9975 stbuf
->f_flag
= 0; // ??
9976 stbuf
->f_namemax
= NAME_MAX
;
9978 // Usually quota_root will == root_ancestor, but if the mount root has no
9979 // quota but we can see a parent of it that does have a quota, we'll
9980 // respect that one instead.
9981 ceph_assert(root
!= nullptr);
9982 Inode
*quota_root
= root
->quota
.is_enable() ? root
: get_quota_root(root
, perms
);
9984 // get_quota_root should always give us something
9985 // because client quotas are always enabled
9986 ceph_assert(quota_root
!= nullptr);
9988 if (quota_root
&& cct
->_conf
->client_quota_df
&& quota_root
->quota
.max_bytes
) {
9990 // Skip the getattr if any sessions are stale, as we don't want to
9991 // block `df` if this client has e.g. been evicted, or if the MDS cluster
9993 if (!_any_stale_sessions()) {
9994 int r
= _getattr(quota_root
, 0, perms
, true);
9996 // Ignore return value: error getting latest inode metadata is not a good
9997 // reason to break "df".
9998 lderr(cct
) << "Error in getattr on quota root 0x"
9999 << std::hex
<< quota_root
->ino
<< std::dec
10000 << " statfs result may be outdated" << dendl
;
10004 // Special case: if there is a size quota set on the Inode acting
10005 // as the root for this client mount, then report the quota status
10006 // as the filesystem statistics.
10007 const fsblkcnt_t total
= quota_root
->quota
.max_bytes
>> CEPH_BLOCK_SHIFT
;
10008 const fsblkcnt_t used
= quota_root
->rstat
.rbytes
>> CEPH_BLOCK_SHIFT
;
10009 // It is possible for a quota to be exceeded: arithmetic here must
10010 // handle case where used > total.
10011 const fsblkcnt_t free
= total
> used
? total
- used
: 0;
10013 stbuf
->f_blocks
= total
;
10014 stbuf
->f_bfree
= free
;
10015 stbuf
->f_bavail
= free
;
10017 // General case: report the cluster statistics returned from RADOS. Because
10018 // multiple pools may be used without one filesystem namespace via
10019 // layouts, this is the most correct thing we can do.
10020 stbuf
->f_blocks
= stats
.kb
>> (CEPH_BLOCK_SHIFT
- 10);
10021 stbuf
->f_bfree
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10022 stbuf
->f_bavail
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10028 int Client::_do_filelock(Inode
*in
, Fh
*fh
, int lock_type
, int op
, int sleep
,
10029 struct flock
*fl
, uint64_t owner
, bool removing
)
10031 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
10032 << (lock_type
== CEPH_LOCK_FCNTL
? " fcntl" : " flock")
10033 << " type " << fl
->l_type
<< " owner " << owner
10034 << " " << fl
->l_start
<< "~" << fl
->l_len
<< dendl
;
10037 if (F_RDLCK
== fl
->l_type
)
10038 lock_cmd
= CEPH_LOCK_SHARED
;
10039 else if (F_WRLCK
== fl
->l_type
)
10040 lock_cmd
= CEPH_LOCK_EXCL
;
10041 else if (F_UNLCK
== fl
->l_type
)
10042 lock_cmd
= CEPH_LOCK_UNLOCK
;
10046 if (op
!= CEPH_MDS_OP_SETFILELOCK
|| lock_cmd
== CEPH_LOCK_UNLOCK
)
10050 * Set the most significant bit, so that MDS knows the 'owner'
10051 * is sufficient to identify the owner of lock. (old code uses
10052 * both 'owner' and 'pid')
10054 owner
|= (1ULL << 63);
10056 MetaRequest
*req
= new MetaRequest(op
);
10058 in
->make_nosnap_relative_path(path
);
10059 req
->set_filepath(path
);
10060 req
->set_inode(in
);
10062 req
->head
.args
.filelock_change
.rule
= lock_type
;
10063 req
->head
.args
.filelock_change
.type
= lock_cmd
;
10064 req
->head
.args
.filelock_change
.owner
= owner
;
10065 req
->head
.args
.filelock_change
.pid
= fl
->l_pid
;
10066 req
->head
.args
.filelock_change
.start
= fl
->l_start
;
10067 req
->head
.args
.filelock_change
.length
= fl
->l_len
;
10068 req
->head
.args
.filelock_change
.wait
= sleep
;
10073 if (sleep
&& switch_interrupt_cb
) {
10074 // enable interrupt
10075 switch_interrupt_cb(callback_handle
, req
->get());
10076 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10077 // disable interrupt
10078 switch_interrupt_cb(callback_handle
, NULL
);
10079 if (ret
== 0 && req
->aborted()) {
10080 // effect of this lock request has been revoked by the 'lock intr' request
10081 ret
= req
->get_abort_code();
10085 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10089 if (op
== CEPH_MDS_OP_GETFILELOCK
) {
10090 ceph_filelock filelock
;
10091 auto p
= bl
.cbegin();
10092 decode(filelock
, p
);
10094 if (CEPH_LOCK_SHARED
== filelock
.type
)
10095 fl
->l_type
= F_RDLCK
;
10096 else if (CEPH_LOCK_EXCL
== filelock
.type
)
10097 fl
->l_type
= F_WRLCK
;
10099 fl
->l_type
= F_UNLCK
;
10101 fl
->l_whence
= SEEK_SET
;
10102 fl
->l_start
= filelock
.start
;
10103 fl
->l_len
= filelock
.length
;
10104 fl
->l_pid
= filelock
.pid
;
10105 } else if (op
== CEPH_MDS_OP_SETFILELOCK
) {
10106 ceph_lock_state_t
*lock_state
;
10107 if (lock_type
== CEPH_LOCK_FCNTL
) {
10108 if (!in
->fcntl_locks
)
10109 in
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10110 lock_state
= in
->fcntl_locks
.get();
10111 } else if (lock_type
== CEPH_LOCK_FLOCK
) {
10112 if (!in
->flock_locks
)
10113 in
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10114 lock_state
= in
->flock_locks
.get();
10119 _update_lock_state(fl
, owner
, lock_state
);
10122 if (lock_type
== CEPH_LOCK_FCNTL
) {
10123 if (!fh
->fcntl_locks
)
10124 fh
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10125 lock_state
= fh
->fcntl_locks
.get();
10127 if (!fh
->flock_locks
)
10128 fh
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10129 lock_state
= fh
->flock_locks
.get();
10131 _update_lock_state(fl
, owner
, lock_state
);
10139 int Client::_interrupt_filelock(MetaRequest
*req
)
10141 // Set abort code, but do not kick. The abort code prevents the request
10142 // from being re-sent.
10143 req
->abort(-EINTR
);
10145 return 0; // haven't sent the request
10147 Inode
*in
= req
->inode();
10150 if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
10151 lock_type
= CEPH_LOCK_FLOCK_INTR
;
10152 else if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
10153 lock_type
= CEPH_LOCK_FCNTL_INTR
;
10159 MetaRequest
*intr_req
= new MetaRequest(CEPH_MDS_OP_SETFILELOCK
);
10161 in
->make_nosnap_relative_path(path
);
10162 intr_req
->set_filepath(path
);
10163 intr_req
->set_inode(in
);
10164 intr_req
->head
.args
.filelock_change
= req
->head
.args
.filelock_change
;
10165 intr_req
->head
.args
.filelock_change
.rule
= lock_type
;
10166 intr_req
->head
.args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
10168 UserPerm
perms(req
->get_uid(), req
->get_gid());
10169 return make_request(intr_req
, perms
, NULL
, NULL
, -1);
10172 void Client::_encode_filelocks(Inode
*in
, bufferlist
& bl
)
10174 if (!in
->fcntl_locks
&& !in
->flock_locks
)
10177 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
10178 encode(nr_fcntl_locks
, bl
);
10179 if (nr_fcntl_locks
) {
10180 auto &lock_state
= in
->fcntl_locks
;
10181 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10182 p
!= lock_state
->held_locks
.end();
10184 encode(p
->second
, bl
);
10187 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
10188 encode(nr_flock_locks
, bl
);
10189 if (nr_flock_locks
) {
10190 auto &lock_state
= in
->flock_locks
;
10191 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10192 p
!= lock_state
->held_locks
.end();
10194 encode(p
->second
, bl
);
10197 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< ", " << nr_fcntl_locks
10198 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
10201 void Client::_release_filelocks(Fh
*fh
)
10203 if (!fh
->fcntl_locks
&& !fh
->flock_locks
)
10206 Inode
*in
= fh
->inode
.get();
10207 ldout(cct
, 10) << __func__
<< " " << fh
<< " ino " << in
->ino
<< dendl
;
10209 list
<pair
<int, ceph_filelock
> > to_release
;
10211 if (fh
->fcntl_locks
) {
10212 auto &lock_state
= fh
->fcntl_locks
;
10213 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10214 p
!= lock_state
->held_locks
.end();
10216 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FCNTL
, p
->second
));
10217 lock_state
.reset();
10219 if (fh
->flock_locks
) {
10220 auto &lock_state
= fh
->flock_locks
;
10221 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10222 p
!= lock_state
->held_locks
.end();
10224 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FLOCK
, p
->second
));
10225 lock_state
.reset();
10228 if (to_release
.empty())
10231 // mds has already released filelocks if session was closed.
10232 if (in
->caps
.empty())
10236 memset(&fl
, 0, sizeof(fl
));
10237 fl
.l_whence
= SEEK_SET
;
10238 fl
.l_type
= F_UNLCK
;
10240 for (list
<pair
<int, ceph_filelock
> >::iterator p
= to_release
.begin();
10241 p
!= to_release
.end();
10243 fl
.l_start
= p
->second
.start
;
10244 fl
.l_len
= p
->second
.length
;
10245 fl
.l_pid
= p
->second
.pid
;
10246 _do_filelock(in
, fh
, p
->first
, CEPH_MDS_OP_SETFILELOCK
, 0, &fl
,
10247 p
->second
.owner
, true);
10251 void Client::_update_lock_state(struct flock
*fl
, uint64_t owner
,
10252 ceph_lock_state_t
*lock_state
)
10255 if (F_RDLCK
== fl
->l_type
)
10256 lock_cmd
= CEPH_LOCK_SHARED
;
10257 else if (F_WRLCK
== fl
->l_type
)
10258 lock_cmd
= CEPH_LOCK_EXCL
;
10260 lock_cmd
= CEPH_LOCK_UNLOCK
;;
10262 ceph_filelock filelock
;
10263 filelock
.start
= fl
->l_start
;
10264 filelock
.length
= fl
->l_len
;
10265 filelock
.client
= 0;
10266 // see comment in _do_filelock()
10267 filelock
.owner
= owner
| (1ULL << 63);
10268 filelock
.pid
= fl
->l_pid
;
10269 filelock
.type
= lock_cmd
;
10271 if (filelock
.type
== CEPH_LOCK_UNLOCK
) {
10272 list
<ceph_filelock
> activated_locks
;
10273 lock_state
->remove_lock(filelock
, activated_locks
);
10275 bool r
= lock_state
->add_lock(filelock
, false, false, NULL
);
10280 int Client::_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
10282 Inode
*in
= fh
->inode
.get();
10283 ldout(cct
, 10) << "_getlk " << fh
<< " ino " << in
->ino
<< dendl
;
10284 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_GETFILELOCK
, 0, fl
, owner
);
10288 int Client::_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
10290 Inode
*in
= fh
->inode
.get();
10291 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< dendl
;
10292 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_SETFILELOCK
, sleep
, fl
, owner
);
10293 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10297 int Client::_flock(Fh
*fh
, int cmd
, uint64_t owner
)
10299 Inode
*in
= fh
->inode
.get();
10300 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< dendl
;
10302 int sleep
= !(cmd
& LOCK_NB
);
10321 memset(&fl
, 0, sizeof(fl
));
10323 fl
.l_whence
= SEEK_SET
;
10325 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
, sleep
, &fl
, owner
);
10326 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10330 int Client::ll_statfs(Inode
*in
, struct statvfs
*stbuf
, const UserPerm
& perms
)
10332 /* Since the only thing this does is wrap a call to statfs, and
10333 statfs takes a lock, it doesn't seem we have a need to split it
10335 return statfs(0, stbuf
, perms
);
10338 void Client::ll_register_callbacks(struct client_callback_args
*args
)
10342 std::lock_guard
l(client_lock
);
10343 ldout(cct
, 10) << __func__
<< " cb " << args
->handle
10344 << " invalidate_ino_cb " << args
->ino_cb
10345 << " invalidate_dentry_cb " << args
->dentry_cb
10346 << " switch_interrupt_cb " << args
->switch_intr_cb
10347 << " remount_cb " << args
->remount_cb
10349 callback_handle
= args
->handle
;
10350 if (args
->ino_cb
) {
10351 ino_invalidate_cb
= args
->ino_cb
;
10352 async_ino_invalidator
.start();
10354 if (args
->dentry_cb
) {
10355 dentry_invalidate_cb
= args
->dentry_cb
;
10356 async_dentry_invalidator
.start();
10358 if (args
->switch_intr_cb
) {
10359 switch_interrupt_cb
= args
->switch_intr_cb
;
10360 interrupt_finisher
.start();
10362 if (args
->remount_cb
) {
10363 remount_cb
= args
->remount_cb
;
10364 remount_finisher
.start();
10366 umask_cb
= args
->umask_cb
;
10369 int Client::test_dentry_handling(bool can_invalidate
)
10373 can_invalidate_dentries
= can_invalidate
;
10375 if (can_invalidate_dentries
) {
10376 ceph_assert(dentry_invalidate_cb
);
10377 ldout(cct
, 1) << "using dentry_invalidate_cb" << dendl
;
10380 ceph_assert(remount_cb
);
10381 ldout(cct
, 1) << "using remount_cb" << dendl
;
10382 r
= _do_remount(false);
10388 int Client::_sync_fs()
10390 ldout(cct
, 10) << __func__
<< dendl
;
10393 std::unique_ptr
<C_SaferCond
> cond
= nullptr;
10394 if (cct
->_conf
->client_oc
) {
10395 cond
.reset(new C_SaferCond("Client::_sync_fs:lock"));
10396 objectcacher
->flush_all(cond
.get());
10401 ceph_tid_t flush_tid
= last_flush_tid
;
10403 // wait for unsafe mds requests
10404 wait_unsafe_requests();
10406 wait_sync_caps(flush_tid
);
10408 if (nullptr != cond
) {
10409 client_lock
.Unlock();
10410 ldout(cct
, 15) << __func__
<< " waiting on data to flush" << dendl
;
10412 ldout(cct
, 15) << __func__
<< " flush finished" << dendl
;
10413 client_lock
.Lock();
10419 int Client::sync_fs()
10421 std::lock_guard
l(client_lock
);
10429 int64_t Client::drop_caches()
10431 std::lock_guard
l(client_lock
);
10432 return objectcacher
->release_all();
10435 int Client::_lazyio(Fh
*fh
, int enable
)
10437 Inode
*in
= fh
->inode
.get();
10438 ldout(cct
, 20) << __func__
<< " " << *in
<< " " << !!enable
<< dendl
;
10440 if (!!(fh
->mode
& CEPH_FILE_MODE_LAZY
) == !!enable
)
10443 int orig_mode
= fh
->mode
;
10445 fh
->mode
|= CEPH_FILE_MODE_LAZY
;
10446 in
->get_open_ref(fh
->mode
);
10447 in
->put_open_ref(orig_mode
);
10448 check_caps(in
, CHECK_CAPS_NODELAY
);
10450 fh
->mode
&= ~CEPH_FILE_MODE_LAZY
;
10451 in
->get_open_ref(fh
->mode
);
10452 in
->put_open_ref(orig_mode
);
10459 int Client::lazyio(int fd
, int enable
)
10461 std::lock_guard
l(client_lock
);
10462 Fh
*f
= get_filehandle(fd
);
10466 return _lazyio(f
, enable
);
10469 int Client::ll_lazyio(Fh
*fh
, int enable
)
10471 std::lock_guard
lock(client_lock
);
10472 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << !!enable
<< dendl
;
10473 tout(cct
) << __func__
<< std::endl
;
10475 return _lazyio(fh
, enable
);
10478 int Client::lazyio_propogate(int fd
, loff_t offset
, size_t count
)
10480 std::lock_guard
l(client_lock
);
10481 ldout(cct
, 3) << "op: client->lazyio_propogate(" << fd
10482 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10484 Fh
*f
= get_filehandle(fd
);
10494 int Client::lazyio_synchronize(int fd
, loff_t offset
, size_t count
)
10496 std::lock_guard
l(client_lock
);
10497 ldout(cct
, 3) << "op: client->lazyio_synchronize(" << fd
10498 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10500 Fh
*f
= get_filehandle(fd
);
10503 Inode
*in
= f
->inode
.get();
10512 // =============================
10515 int Client::mksnap(const char *relpath
, const char *name
, const UserPerm
& perm
)
10517 std::lock_guard
l(client_lock
);
10522 filepath
path(relpath
);
10524 int r
= path_walk(path
, &in
, perm
);
10527 if (cct
->_conf
->client_permissions
) {
10528 r
= may_create(in
.get(), perm
);
10532 Inode
*snapdir
= open_snapdir(in
.get());
10533 return _mkdir(snapdir
, name
, 0, perm
);
10536 int Client::rmsnap(const char *relpath
, const char *name
, const UserPerm
& perms
)
10538 std::lock_guard
l(client_lock
);
10543 filepath
path(relpath
);
10545 int r
= path_walk(path
, &in
, perms
);
10548 if (cct
->_conf
->client_permissions
) {
10549 r
= may_delete(in
.get(), NULL
, perms
);
10553 Inode
*snapdir
= open_snapdir(in
.get());
10554 return _rmdir(snapdir
, name
, perms
);
10557 // =============================
10560 int Client::get_caps_issued(int fd
) {
10562 std::lock_guard
lock(client_lock
);
10567 Fh
*f
= get_filehandle(fd
);
10571 return f
->inode
->caps_issued();
10574 int Client::get_caps_issued(const char *path
, const UserPerm
& perms
)
10576 std::lock_guard
lock(client_lock
);
10583 int r
= path_walk(p
, &in
, perms
, true);
10586 return in
->caps_issued();
10589 // =========================================
10592 Inode
*Client::open_snapdir(Inode
*diri
)
10595 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
10596 if (!inode_map
.count(vino
)) {
10597 in
= new Inode(this, vino
, &diri
->layout
);
10599 in
->ino
= diri
->ino
;
10600 in
->snapid
= CEPH_SNAPDIR
;
10601 in
->mode
= diri
->mode
;
10602 in
->uid
= diri
->uid
;
10603 in
->gid
= diri
->gid
;
10605 in
->mtime
= diri
->mtime
;
10606 in
->ctime
= diri
->ctime
;
10607 in
->btime
= diri
->btime
;
10608 in
->size
= diri
->size
;
10609 in
->change_attr
= diri
->change_attr
;
10611 in
->dirfragtree
.clear();
10612 in
->snapdir_parent
= diri
;
10613 diri
->flags
|= I_SNAPDIR_OPEN
;
10614 inode_map
[vino
] = in
;
10615 if (use_faked_inos())
10616 _assign_faked_ino(in
);
10617 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
10619 in
= inode_map
[vino
];
10620 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
10625 int Client::ll_lookup(Inode
*parent
, const char *name
, struct stat
*attr
,
10626 Inode
**out
, const UserPerm
& perms
)
10628 std::lock_guard
lock(client_lock
);
10629 vinodeno_t vparent
= _get_vino(parent
);
10630 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
10631 tout(cct
) << __func__
<< std::endl
;
10632 tout(cct
) << name
<< std::endl
;
10638 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
10639 "fuse_default_permissions");
10640 if (!fuse_default_permissions
) {
10641 if (strcmp(name
, ".") && strcmp(name
, "..")) {
10642 r
= may_lookup(parent
, perms
);
10648 string
dname(name
);
10651 r
= _lookup(parent
, dname
, CEPH_STAT_CAP_INODE_ALL
, &in
, perms
);
10658 fill_stat(in
, attr
);
10662 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
10663 << " -> " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
10664 tout(cct
) << attr
->st_ino
<< std::endl
;
10669 int Client::ll_lookup_inode(
10670 struct inodeno_t ino
,
10671 const UserPerm
& perms
,
10674 ceph_assert(inode
!= NULL
);
10675 std::lock_guard
lock(client_lock
);
10676 ldout(cct
, 3) << "ll_lookup_inode " << ino
<< dendl
;
10681 // Num1: get inode and *inode
10682 int r
= _lookup_ino(ino
, perms
, inode
);
10686 ceph_assert(*inode
!= NULL
);
10688 if (!(*inode
)->dentries
.empty()) {
10689 ldout(cct
, 8) << __func__
<< " dentry already present" << dendl
;
10693 if ((*inode
)->is_root()) {
10694 ldout(cct
, 8) << "ino is root, no parent" << dendl
;
10698 // Num2: Request the parent inode, so that we can look up the name
10700 r
= _lookup_parent(*inode
, perms
, &parent
);
10702 _ll_forget(*inode
, 1);
10706 ceph_assert(parent
!= NULL
);
10708 // Num3: Finally, get the name (dentry) of the requested inode
10709 r
= _lookup_name(*inode
, parent
, perms
);
10711 // Unexpected error
10712 _ll_forget(parent
, 1);
10713 _ll_forget(*inode
, 1);
10717 _ll_forget(parent
, 1);
10721 int Client::ll_lookupx(Inode
*parent
, const char *name
, Inode
**out
,
10722 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
10723 const UserPerm
& perms
)
10725 std::lock_guard
lock(client_lock
);
10726 vinodeno_t vparent
= _get_vino(parent
);
10727 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
10728 tout(cct
) << "ll_lookupx" << std::endl
;
10729 tout(cct
) << name
<< std::endl
;
10735 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
10736 "fuse_default_permissions");
10737 if (!fuse_default_permissions
) {
10738 r
= may_lookup(parent
, perms
);
10743 string
dname(name
);
10746 unsigned mask
= statx_to_mask(flags
, want
);
10747 r
= _lookup(parent
, dname
, mask
, &in
, perms
);
10753 fill_statx(in
, mask
, stx
);
10757 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
10758 << " -> " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
10759 tout(cct
) << stx
->stx_ino
<< std::endl
;
10764 int Client::ll_walk(const char* name
, Inode
**out
, struct ceph_statx
*stx
,
10765 unsigned int want
, unsigned int flags
, const UserPerm
& perms
)
10767 std::lock_guard
lock(client_lock
);
10772 filepath
fp(name
, 0);
10775 unsigned mask
= statx_to_mask(flags
, want
);
10777 ldout(cct
, 3) << __func__
<< " " << name
<< dendl
;
10778 tout(cct
) << __func__
<< std::endl
;
10779 tout(cct
) << name
<< std::endl
;
10781 rc
= path_walk(fp
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
10783 /* zero out mask, just in case... */
10790 fill_statx(in
, mask
, stx
);
10797 void Client::_ll_get(Inode
*in
)
10799 if (in
->ll_ref
== 0) {
10801 if (in
->is_dir() && !in
->dentries
.empty()) {
10802 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
10803 in
->get_first_parent()->get(); // pin dentry
10805 if (in
->snapid
!= CEPH_NOSNAP
)
10806 ll_snap_ref
[in
->snapid
]++;
10809 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " -> " << in
->ll_ref
<< dendl
;
10812 int Client::_ll_put(Inode
*in
, uint64_t num
)
10815 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " " << num
<< " -> " << in
->ll_ref
<< dendl
;
10816 if (in
->ll_ref
== 0) {
10817 if (in
->is_dir() && !in
->dentries
.empty()) {
10818 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
10819 in
->get_first_parent()->put(); // unpin dentry
10821 if (in
->snapid
!= CEPH_NOSNAP
) {
10822 auto p
= ll_snap_ref
.find(in
->snapid
);
10823 ceph_assert(p
!= ll_snap_ref
.end());
10824 ceph_assert(p
->second
> 0);
10825 if (--p
->second
== 0)
10826 ll_snap_ref
.erase(p
);
10835 void Client::_ll_drop_pins()
10837 ldout(cct
, 10) << __func__
<< dendl
;
10838 std::set
<InodeRef
> to_be_put
; //this set will be deconstructed item by item when exit
10839 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
10840 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
10841 it
!= inode_map
.end();
10843 Inode
*in
= it
->second
;
10847 to_be_put
.insert(in
);
10848 _ll_put(in
, in
->ll_ref
);
10853 bool Client::_ll_forget(Inode
*in
, uint64_t count
)
10855 inodeno_t ino
= in
->ino
;
10857 ldout(cct
, 8) << __func__
<< " " << ino
<< " " << count
<< dendl
;
10858 tout(cct
) << __func__
<< std::endl
;
10859 tout(cct
) << ino
.val
<< std::endl
;
10860 tout(cct
) << count
<< std::endl
;
10862 // Ignore forget if we're no longer mounted
10866 if (ino
== 1) return true; // ignore forget on root.
10869 if (in
->ll_ref
< count
) {
10870 ldout(cct
, 1) << "WARNING: ll_forget on " << ino
<< " " << count
10871 << ", which only has ll_ref=" << in
->ll_ref
<< dendl
;
10872 _ll_put(in
, in
->ll_ref
);
10875 if (_ll_put(in
, count
) == 0)
10882 bool Client::ll_forget(Inode
*in
, uint64_t count
)
10884 std::lock_guard
lock(client_lock
);
10885 return _ll_forget(in
, count
);
10888 bool Client::ll_put(Inode
*in
)
10890 /* ll_forget already takes the lock */
10891 return ll_forget(in
, 1);
10894 int Client::ll_get_snap_ref(snapid_t snap
)
10896 std::lock_guard
lock(client_lock
);
10897 auto p
= ll_snap_ref
.find(snap
);
10898 if (p
!= ll_snap_ref
.end())
10903 snapid_t
Client::ll_get_snapid(Inode
*in
)
10905 std::lock_guard
lock(client_lock
);
10909 Inode
*Client::ll_get_inode(ino_t ino
)
10911 std::lock_guard
lock(client_lock
);
10916 vinodeno_t vino
= _map_faked_ino(ino
);
10917 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
10918 if (p
== inode_map
.end())
10920 Inode
*in
= p
->second
;
10925 Inode
*Client::ll_get_inode(vinodeno_t vino
)
10927 std::lock_guard
lock(client_lock
);
10932 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
10933 if (p
== inode_map
.end())
10935 Inode
*in
= p
->second
;
10940 int Client::_ll_getattr(Inode
*in
, int caps
, const UserPerm
& perms
)
10942 vinodeno_t vino
= _get_vino(in
);
10944 ldout(cct
, 8) << __func__
<< " " << vino
<< dendl
;
10945 tout(cct
) << __func__
<< std::endl
;
10946 tout(cct
) << vino
.ino
.val
<< std::endl
;
10948 if (vino
.snapid
< CEPH_NOSNAP
)
10951 return _getattr(in
, caps
, perms
);
10954 int Client::ll_getattr(Inode
*in
, struct stat
*attr
, const UserPerm
& perms
)
10956 std::lock_guard
lock(client_lock
);
10961 int res
= _ll_getattr(in
, CEPH_STAT_CAP_INODE_ALL
, perms
);
10964 fill_stat(in
, attr
);
10965 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
10969 int Client::ll_getattrx(Inode
*in
, struct ceph_statx
*stx
, unsigned int want
,
10970 unsigned int flags
, const UserPerm
& perms
)
10972 std::lock_guard
lock(client_lock
);
10978 unsigned mask
= statx_to_mask(flags
, want
);
10980 if (mask
&& !in
->caps_issued_mask(mask
, true))
10981 res
= _ll_getattr(in
, mask
, perms
);
10984 fill_statx(in
, mask
, stx
);
10985 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
10989 int Client::_ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
10990 const UserPerm
& perms
, InodeRef
*inp
)
10992 vinodeno_t vino
= _get_vino(in
);
10994 ldout(cct
, 8) << __func__
<< " " << vino
<< " mask " << hex
<< mask
<< dec
10996 tout(cct
) << __func__
<< std::endl
;
10997 tout(cct
) << vino
.ino
.val
<< std::endl
;
10998 tout(cct
) << stx
->stx_mode
<< std::endl
;
10999 tout(cct
) << stx
->stx_uid
<< std::endl
;
11000 tout(cct
) << stx
->stx_gid
<< std::endl
;
11001 tout(cct
) << stx
->stx_size
<< std::endl
;
11002 tout(cct
) << stx
->stx_mtime
<< std::endl
;
11003 tout(cct
) << stx
->stx_atime
<< std::endl
;
11004 tout(cct
) << stx
->stx_btime
<< std::endl
;
11005 tout(cct
) << mask
<< std::endl
;
11007 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
11008 "fuse_default_permissions");
11009 if (!fuse_default_permissions
) {
11010 int res
= may_setattr(in
, stx
, mask
, perms
);
11015 mask
&= ~(CEPH_SETATTR_MTIME_NOW
| CEPH_SETATTR_ATIME_NOW
);
11017 return __setattrx(in
, stx
, mask
, perms
, inp
);
11020 int Client::ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
11021 const UserPerm
& perms
)
11023 std::lock_guard
lock(client_lock
);
11028 InodeRef
target(in
);
11029 int res
= _ll_setattrx(in
, stx
, mask
, perms
, &target
);
11031 ceph_assert(in
== target
.get());
11032 fill_statx(in
, in
->caps_issued(), stx
);
11035 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11039 int Client::ll_setattr(Inode
*in
, struct stat
*attr
, int mask
,
11040 const UserPerm
& perms
)
11042 struct ceph_statx stx
;
11043 stat_to_statx(attr
, &stx
);
11045 std::lock_guard
lock(client_lock
);
11050 InodeRef
target(in
);
11051 int res
= _ll_setattrx(in
, &stx
, mask
, perms
, &target
);
11053 ceph_assert(in
== target
.get());
11054 fill_stat(in
, attr
);
11057 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11065 int Client::getxattr(const char *path
, const char *name
, void *value
, size_t size
,
11066 const UserPerm
& perms
)
11068 std::lock_guard
lock(client_lock
);
11074 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11077 return _getxattr(in
, name
, value
, size
, perms
);
11080 int Client::lgetxattr(const char *path
, const char *name
, void *value
, size_t size
,
11081 const UserPerm
& perms
)
11083 std::lock_guard
lock(client_lock
);
11089 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11092 return _getxattr(in
, name
, value
, size
, perms
);
11095 int Client::fgetxattr(int fd
, const char *name
, void *value
, size_t size
,
11096 const UserPerm
& perms
)
11098 std::lock_guard
lock(client_lock
);
11103 Fh
*f
= get_filehandle(fd
);
11106 return _getxattr(f
->inode
, name
, value
, size
, perms
);
11109 int Client::listxattr(const char *path
, char *list
, size_t size
,
11110 const UserPerm
& perms
)
11112 std::lock_guard
lock(client_lock
);
11118 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11121 return Client::_listxattr(in
.get(), list
, size
, perms
);
11124 int Client::llistxattr(const char *path
, char *list
, size_t size
,
11125 const UserPerm
& perms
)
11127 std::lock_guard
lock(client_lock
);
11133 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11136 return Client::_listxattr(in
.get(), list
, size
, perms
);
11139 int Client::flistxattr(int fd
, char *list
, size_t size
, const UserPerm
& perms
)
11141 std::lock_guard
lock(client_lock
);
11146 Fh
*f
= get_filehandle(fd
);
11149 return Client::_listxattr(f
->inode
.get(), list
, size
, perms
);
11152 int Client::removexattr(const char *path
, const char *name
,
11153 const UserPerm
& perms
)
11155 std::lock_guard
lock(client_lock
);
11161 int r
= Client::path_walk(path
, &in
, perms
, true);
11164 return _removexattr(in
, name
, perms
);
11167 int Client::lremovexattr(const char *path
, const char *name
,
11168 const UserPerm
& perms
)
11170 std::lock_guard
lock(client_lock
);
11176 int r
= Client::path_walk(path
, &in
, perms
, false);
11179 return _removexattr(in
, name
, perms
);
11182 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
11184 std::lock_guard
lock(client_lock
);
11189 Fh
*f
= get_filehandle(fd
);
11192 return _removexattr(f
->inode
, name
, perms
);
11195 int Client::setxattr(const char *path
, const char *name
, const void *value
,
11196 size_t size
, int flags
, const UserPerm
& perms
)
11198 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11200 std::lock_guard
lock(client_lock
);
11206 int r
= Client::path_walk(path
, &in
, perms
, true);
11209 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11212 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
11213 size_t size
, int flags
, const UserPerm
& perms
)
11215 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11217 std::lock_guard
lock(client_lock
);
11223 int r
= Client::path_walk(path
, &in
, perms
, false);
11226 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11229 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
11230 int flags
, const UserPerm
& perms
)
11232 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11234 std::lock_guard
lock(client_lock
);
11239 Fh
*f
= get_filehandle(fd
);
11242 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
11245 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
11246 const UserPerm
& perms
)
11250 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11254 // Do a force getattr to get the latest quota before returning
11255 // a value to userspace.
11257 if (vxattr
->flags
& VXATTR_RSTAT
) {
11258 flags
|= CEPH_STAT_RSTAT
;
11260 r
= _getattr(in
, flags
, perms
, true);
11262 // Error from getattr!
11266 // call pointer-to-member function
11268 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
11269 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
11275 if (r
> (int)size
) {
11277 } else if (r
> 0) {
11278 memcpy(value
, buf
, r
);
11284 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
11289 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11293 if (in
->xattrs
.count(n
)) {
11294 r
= in
->xattrs
[n
].length();
11295 if (r
> 0 && size
!= 0) {
11296 if (size
>= (unsigned)r
)
11297 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
11304 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
11308 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
11309 const UserPerm
& perms
)
11311 if (cct
->_conf
->client_permissions
) {
11312 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
11316 return _getxattr(in
.get(), name
, value
, size
, perms
);
11319 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
11320 size_t size
, const UserPerm
& perms
)
11322 std::lock_guard
lock(client_lock
);
11327 vinodeno_t vino
= _get_vino(in
);
11329 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11330 tout(cct
) << __func__
<< std::endl
;
11331 tout(cct
) << vino
.ino
.val
<< std::endl
;
11332 tout(cct
) << name
<< std::endl
;
11334 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
11335 "fuse_default_permissions");
11336 if (!fuse_default_permissions
) {
11337 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
11342 return _getxattr(in
, name
, value
, size
, perms
);
11345 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
11346 const UserPerm
& perms
)
11348 bool len_only
= (size
== 0);
11349 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11355 for (const auto& p
: in
->xattrs
) {
11356 size_t this_len
= p
.first
.length() + 1;
11361 if (this_len
> size
) {
11366 memcpy(name
, p
.first
.c_str(), this_len
);
11371 const VXattr
*vxattr
;
11372 for (vxattr
= _get_vxattrs(in
); vxattr
&& !vxattr
->name
.empty(); vxattr
++) {
11373 if (vxattr
->hidden
)
11375 // call pointer-to-member function
11376 if (vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))
11379 size_t this_len
= vxattr
->name
.length() + 1;
11384 if (this_len
> size
) {
11389 memcpy(name
, vxattr
->name
.c_str(), this_len
);
11394 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
11398 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
11399 const UserPerm
& perms
)
11401 std::lock_guard
lock(client_lock
);
11406 vinodeno_t vino
= _get_vino(in
);
11408 ldout(cct
, 3) << __func__
<< " " << vino
<< " size " << size
<< dendl
;
11409 tout(cct
) << __func__
<< std::endl
;
11410 tout(cct
) << vino
.ino
.val
<< std::endl
;
11411 tout(cct
) << size
<< std::endl
;
11413 return _listxattr(in
, names
, size
, perms
);
11416 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
11417 size_t size
, int flags
, const UserPerm
& perms
)
11420 int xattr_flags
= 0;
11422 xattr_flags
|= CEPH_XATTR_REMOVE
;
11423 if (flags
& XATTR_CREATE
)
11424 xattr_flags
|= CEPH_XATTR_CREATE
;
11425 if (flags
& XATTR_REPLACE
)
11426 xattr_flags
|= CEPH_XATTR_REPLACE
;
11428 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
11430 in
->make_nosnap_relative_path(path
);
11431 req
->set_filepath(path
);
11432 req
->set_string2(name
);
11433 req
->set_inode(in
);
11434 req
->head
.args
.setxattr
.flags
= xattr_flags
;
11437 assert (value
|| size
== 0);
11438 bl
.append((const char*)value
, size
);
11441 int res
= make_request(req
, perms
);
11444 ldout(cct
, 3) << __func__
<< "(" << in
->ino
<< ", \"" << name
<< "\") = " <<
11449 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
11450 size_t size
, int flags
, const UserPerm
& perms
)
11452 if (in
->snapid
!= CEPH_NOSNAP
) {
11456 bool posix_acl_xattr
= false;
11457 if (acl_type
== POSIX_ACL
)
11458 posix_acl_xattr
= !strncmp(name
, "system.", 7);
11460 if (strncmp(name
, "user.", 5) &&
11461 strncmp(name
, "security.", 9) &&
11462 strncmp(name
, "trusted.", 8) &&
11463 strncmp(name
, "ceph.", 5) &&
11465 return -EOPNOTSUPP
;
11467 bool check_realm
= false;
11469 if (posix_acl_xattr
) {
11470 if (!strcmp(name
, ACL_EA_ACCESS
)) {
11471 mode_t new_mode
= in
->mode
;
11473 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
11480 if (new_mode
!= in
->mode
) {
11481 struct ceph_statx stx
;
11482 stx
.stx_mode
= new_mode
;
11483 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, NULL
);
11488 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
11490 if (!S_ISDIR(in
->mode
))
11492 int ret
= posix_acl_check(value
, size
);
11501 return -EOPNOTSUPP
;
11504 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11506 if (vxattr
->readonly
)
11507 return -EOPNOTSUPP
;
11508 if (vxattr
->name
.compare(0, 10, "ceph.quota") == 0 && value
)
11509 check_realm
= true;
11513 int ret
= _do_setxattr(in
, name
, value
, size
, flags
, perms
);
11514 if (ret
>= 0 && check_realm
) {
11515 // check if snaprealm was created for quota inode
11516 if (in
->quota
.is_enable() &&
11517 !(in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
))
11524 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
11525 size_t size
, int flags
, const UserPerm
& perms
)
11527 if (cct
->_conf
->client_permissions
) {
11528 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11532 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
11535 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
11538 if (name
== "layout") {
11539 string::iterator begin
= value
.begin();
11540 string::iterator end
= value
.end();
11541 keys_and_values
<string::iterator
> p
; // create instance of parser
11542 std::map
<string
, string
> m
; // map to receive results
11543 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
11548 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
11549 if (q
->first
== "pool") {
11554 } else if (name
== "layout.pool") {
11558 if (tmp
.length()) {
11561 pool
= boost::lexical_cast
<unsigned>(tmp
);
11562 if (!osdmap
->have_pg_pool(pool
))
11564 } catch (boost::bad_lexical_cast
const&) {
11565 pool
= osdmap
->lookup_pg_pool_name(tmp
);
11575 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
11577 // For setting pool of layout, MetaRequest need osdmap epoch.
11578 // There is a race which create a new data pool but client and mds both don't have.
11579 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11580 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
11581 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
11582 string
rest(strstr(name
, "layout"));
11583 string
v((const char*)value
, size
);
11584 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
11585 return _setxattr_check_data_pool(rest
, v
, &o
);
11588 if (r
== -ENOENT
) {
11590 objecter
->wait_for_latest_osdmap(&ctx
);
11596 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
11597 size_t size
, int flags
, const UserPerm
& perms
)
11599 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11601 std::lock_guard
lock(client_lock
);
11606 vinodeno_t vino
= _get_vino(in
);
11608 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11609 tout(cct
) << __func__
<< std::endl
;
11610 tout(cct
) << vino
.ino
.val
<< std::endl
;
11611 tout(cct
) << name
<< std::endl
;
11613 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
11614 "fuse_default_permissions");
11615 if (!fuse_default_permissions
) {
11616 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11620 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11623 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11625 if (in
->snapid
!= CEPH_NOSNAP
) {
11629 // same xattrs supported by kernel client
11630 if (strncmp(name
, "user.", 5) &&
11631 strncmp(name
, "system.", 7) &&
11632 strncmp(name
, "security.", 9) &&
11633 strncmp(name
, "trusted.", 8) &&
11634 strncmp(name
, "ceph.", 5))
11635 return -EOPNOTSUPP
;
11637 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11638 if (vxattr
&& vxattr
->readonly
)
11639 return -EOPNOTSUPP
;
11641 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
11643 in
->make_nosnap_relative_path(path
);
11644 req
->set_filepath(path
);
11645 req
->set_filepath2(name
);
11646 req
->set_inode(in
);
11648 int res
= make_request(req
, perms
);
11651 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
11655 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
11657 if (cct
->_conf
->client_permissions
) {
11658 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11662 return _removexattr(in
.get(), name
, perms
);
11665 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11667 std::lock_guard
lock(client_lock
);
11672 vinodeno_t vino
= _get_vino(in
);
11674 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
11675 tout(cct
) << "ll_removexattr" << std::endl
;
11676 tout(cct
) << vino
.ino
.val
<< std::endl
;
11677 tout(cct
) << name
<< std::endl
;
11679 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
11680 "fuse_default_permissions");
11681 if (!fuse_default_permissions
) {
11682 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11687 return _removexattr(in
, name
, perms
);
11690 bool Client::_vxattrcb_quota_exists(Inode
*in
)
11692 return in
->quota
.is_enable() &&
11693 in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
;
11695 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
11697 return snprintf(val
, size
,
11698 "max_bytes=%lld max_files=%lld",
11699 (long long int)in
->quota
.max_bytes
,
11700 (long long int)in
->quota
.max_files
);
11702 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
11704 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
11706 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
11708 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
11711 bool Client::_vxattrcb_layout_exists(Inode
*in
)
11713 return in
->layout
!= file_layout_t();
11715 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
11717 int r
= snprintf(val
, size
,
11718 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11719 (unsigned long long)in
->layout
.stripe_unit
,
11720 (unsigned long long)in
->layout
.stripe_count
,
11721 (unsigned long long)in
->layout
.object_size
);
11722 objecter
->with_osdmap([&](const OSDMap
& o
) {
11723 if (o
.have_pg_pool(in
->layout
.pool_id
))
11724 r
+= snprintf(val
+ r
, size
- r
, "%s",
11725 o
.get_pool_name(in
->layout
.pool_id
).c_str());
11727 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
11728 (uint64_t)in
->layout
.pool_id
);
11730 if (in
->layout
.pool_ns
.length())
11731 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
11732 in
->layout
.pool_ns
.c_str());
11735 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
11737 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_unit
);
11739 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
11741 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_count
);
11743 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
11745 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.object_size
);
11747 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
11750 objecter
->with_osdmap([&](const OSDMap
& o
) {
11751 if (o
.have_pg_pool(in
->layout
.pool_id
))
11752 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
11753 in
->layout
.pool_id
).c_str());
11755 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
11759 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
11761 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
11763 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
11765 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
11767 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
11769 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nfiles
);
11771 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
11773 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nsubdirs
);
11775 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
11777 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
11779 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
11781 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rfiles
);
11783 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
11785 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsubdirs
);
11787 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
11789 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rbytes
);
11791 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
11793 return snprintf(val
, size
, "%ld.%09ld", (long)in
->rstat
.rctime
.sec(),
11794 (long)in
->rstat
.rctime
.nsec());
11796 bool Client::_vxattrcb_dir_pin_exists(Inode
*in
)
11798 return in
->dir_pin
!= -ENODATA
;
11800 size_t Client::_vxattrcb_dir_pin(Inode
*in
, char *val
, size_t size
)
11802 return snprintf(val
, size
, "%ld", (long)in
->dir_pin
);
11805 bool Client::_vxattrcb_snap_btime_exists(Inode
*in
)
11807 return !in
->snap_btime
.is_zero();
11810 size_t Client::_vxattrcb_snap_btime(Inode
*in
, char *val
, size_t size
)
11812 return snprintf(val
, size
, "%llu.%09lu",
11813 (long long unsigned)in
->snap_btime
.sec(),
11814 (long unsigned)in
->snap_btime
.nsec());
11817 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11818 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11820 #define XATTR_NAME_CEPH(_type, _name) \
11822 name: CEPH_XATTR_NAME(_type, _name), \
11823 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11829 #define XATTR_NAME_CEPH2(_type, _name, _flags) \
11831 name: CEPH_XATTR_NAME(_type, _name), \
11832 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11838 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11840 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11841 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11844 exists_cb: &Client::_vxattrcb_layout_exists, \
11847 #define XATTR_QUOTA_FIELD(_type, _name) \
11849 name: CEPH_XATTR_NAME(_type, _name), \
11850 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11853 exists_cb: &Client::_vxattrcb_quota_exists, \
11857 const Client::VXattr
Client::_dir_vxattrs
[] = {
11859 name
: "ceph.dir.layout",
11860 getxattr_cb
: &Client::_vxattrcb_layout
,
11863 exists_cb
: &Client::_vxattrcb_layout_exists
,
11866 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_unit
),
11867 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_count
),
11868 XATTR_LAYOUT_FIELD(dir
, layout
, object_size
),
11869 XATTR_LAYOUT_FIELD(dir
, layout
, pool
),
11870 XATTR_LAYOUT_FIELD(dir
, layout
, pool_namespace
),
11871 XATTR_NAME_CEPH(dir
, entries
),
11872 XATTR_NAME_CEPH(dir
, files
),
11873 XATTR_NAME_CEPH(dir
, subdirs
),
11874 XATTR_NAME_CEPH2(dir
, rentries
, VXATTR_RSTAT
),
11875 XATTR_NAME_CEPH2(dir
, rfiles
, VXATTR_RSTAT
),
11876 XATTR_NAME_CEPH2(dir
, rsubdirs
, VXATTR_RSTAT
),
11877 XATTR_NAME_CEPH2(dir
, rbytes
, VXATTR_RSTAT
),
11878 XATTR_NAME_CEPH2(dir
, rctime
, VXATTR_RSTAT
),
11880 name
: "ceph.quota",
11881 getxattr_cb
: &Client::_vxattrcb_quota
,
11884 exists_cb
: &Client::_vxattrcb_quota_exists
,
11887 XATTR_QUOTA_FIELD(quota
, max_bytes
),
11888 XATTR_QUOTA_FIELD(quota
, max_files
),
11890 name
: "ceph.dir.pin",
11891 getxattr_cb
: &Client::_vxattrcb_dir_pin
,
11894 exists_cb
: &Client::_vxattrcb_dir_pin_exists
,
11898 name
: "ceph.snap.btime",
11899 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
11902 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
11905 { name
: "" } /* Required table terminator */
11908 const Client::VXattr
Client::_file_vxattrs
[] = {
11910 name
: "ceph.file.layout",
11911 getxattr_cb
: &Client::_vxattrcb_layout
,
11914 exists_cb
: &Client::_vxattrcb_layout_exists
,
11917 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
11918 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
11919 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
11920 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
11921 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
11923 name
: "ceph.snap.btime",
11924 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
11927 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
11930 { name
: "" } /* Required table terminator */
11933 const Client::VXattr
*Client::_get_vxattrs(Inode
*in
)
11936 return _dir_vxattrs
;
11937 else if (in
->is_file())
11938 return _file_vxattrs
;
11942 const Client::VXattr
*Client::_match_vxattr(Inode
*in
, const char *name
)
11944 if (strncmp(name
, "ceph.", 5) == 0) {
11945 const VXattr
*vxattr
= _get_vxattrs(in
);
11947 while (!vxattr
->name
.empty()) {
11948 if (vxattr
->name
== name
)
11957 int Client::ll_readlink(Inode
*in
, char *buf
, size_t buflen
, const UserPerm
& perms
)
11959 std::lock_guard
lock(client_lock
);
11964 vinodeno_t vino
= _get_vino(in
);
11966 ldout(cct
, 3) << "ll_readlink " << vino
<< dendl
;
11967 tout(cct
) << "ll_readlink" << std::endl
;
11968 tout(cct
) << vino
.ino
.val
<< std::endl
;
11970 for (auto dn
: in
->dentries
) {
11974 int r
= _readlink(in
, buf
, buflen
); // FIXME: no permission checking!
11975 ldout(cct
, 3) << "ll_readlink " << vino
<< " = " << r
<< dendl
;
11979 int Client::_mknod(Inode
*dir
, const char *name
, mode_t mode
, dev_t rdev
,
11980 const UserPerm
& perms
, InodeRef
*inp
)
11982 ldout(cct
, 8) << "_mknod(" << dir
->ino
<< " " << name
<< ", 0" << oct
11983 << mode
<< dec
<< ", " << rdev
<< ", uid " << perms
.uid()
11984 << ", gid " << perms
.gid() << ")" << dendl
;
11986 if (strlen(name
) > NAME_MAX
)
11987 return -ENAMETOOLONG
;
11989 if (dir
->snapid
!= CEPH_NOSNAP
) {
11992 if (is_quota_files_exceeded(dir
, perms
)) {
11996 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_MKNOD
);
11999 dir
->make_nosnap_relative_path(path
);
12000 path
.push_dentry(name
);
12001 req
->set_filepath(path
);
12002 req
->set_inode(dir
);
12003 req
->head
.args
.mknod
.rdev
= rdev
;
12004 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12005 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12007 bufferlist xattrs_bl
;
12008 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12011 req
->head
.args
.mknod
.mode
= mode
;
12012 if (xattrs_bl
.length() > 0)
12013 req
->set_data(xattrs_bl
);
12016 res
= get_or_create(dir
, name
, &de
);
12019 req
->set_dentry(de
);
12021 res
= make_request(req
, perms
, inp
);
12025 ldout(cct
, 8) << "mknod(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12033 int Client::ll_mknod(Inode
*parent
, const char *name
, mode_t mode
,
12034 dev_t rdev
, struct stat
*attr
, Inode
**out
,
12035 const UserPerm
& perms
)
12037 std::lock_guard
lock(client_lock
);
12042 vinodeno_t vparent
= _get_vino(parent
);
12044 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
<< dendl
;
12045 tout(cct
) << "ll_mknod" << std::endl
;
12046 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12047 tout(cct
) << name
<< std::endl
;
12048 tout(cct
) << mode
<< std::endl
;
12049 tout(cct
) << rdev
<< std::endl
;
12051 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
12052 "fuse_default_permissions");
12053 if (!fuse_default_permissions
) {
12054 int r
= may_create(parent
, perms
);
12060 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12062 fill_stat(in
, attr
);
12065 tout(cct
) << attr
->st_ino
<< std::endl
;
12066 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
12067 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12072 int Client::ll_mknodx(Inode
*parent
, const char *name
, mode_t mode
,
12073 dev_t rdev
, Inode
**out
,
12074 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12075 const UserPerm
& perms
)
12077 unsigned caps
= statx_to_mask(flags
, want
);
12078 std::lock_guard
lock(client_lock
);
12083 vinodeno_t vparent
= _get_vino(parent
);
12085 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
<< dendl
;
12086 tout(cct
) << "ll_mknodx" << std::endl
;
12087 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12088 tout(cct
) << name
<< std::endl
;
12089 tout(cct
) << mode
<< std::endl
;
12090 tout(cct
) << rdev
<< std::endl
;
12092 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
12093 "fuse_default_permissions");
12094 if (!fuse_default_permissions
) {
12095 int r
= may_create(parent
, perms
);
12101 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12103 fill_statx(in
, caps
, stx
);
12106 tout(cct
) << stx
->stx_ino
<< std::endl
;
12107 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
12108 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12113 int Client::_create(Inode
*dir
, const char *name
, int flags
, mode_t mode
,
12114 InodeRef
*inp
, Fh
**fhp
, int stripe_unit
, int stripe_count
,
12115 int object_size
, const char *data_pool
, bool *created
,
12116 const UserPerm
& perms
)
12118 ldout(cct
, 8) << "_create(" << dir
->ino
<< " " << name
<< ", 0" << oct
<<
12119 mode
<< dec
<< ")" << dendl
;
12121 if (strlen(name
) > NAME_MAX
)
12122 return -ENAMETOOLONG
;
12123 if (dir
->snapid
!= CEPH_NOSNAP
) {
12126 if (is_quota_files_exceeded(dir
, perms
)) {
12130 // use normalized flags to generate cmode
12131 int cflags
= ceph_flags_sys2wire(flags
);
12132 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
12133 cflags
|= CEPH_O_LAZY
;
12135 int cmode
= ceph_flags_to_mode(cflags
);
12137 int64_t pool_id
= -1;
12138 if (data_pool
&& *data_pool
) {
12139 pool_id
= objecter
->with_osdmap(
12140 std::mem_fn(&OSDMap::lookup_pg_pool_name
), data_pool
);
12143 if (pool_id
> 0xffffffffll
)
12144 return -ERANGE
; // bummer!
12147 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_CREATE
);
12150 dir
->make_nosnap_relative_path(path
);
12151 path
.push_dentry(name
);
12152 req
->set_filepath(path
);
12153 req
->set_inode(dir
);
12154 req
->head
.args
.open
.flags
= cflags
| CEPH_O_CREAT
;
12156 req
->head
.args
.open
.stripe_unit
= stripe_unit
;
12157 req
->head
.args
.open
.stripe_count
= stripe_count
;
12158 req
->head
.args
.open
.object_size
= object_size
;
12159 if (cct
->_conf
->client_debug_getattr_caps
)
12160 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
12162 req
->head
.args
.open
.mask
= 0;
12163 req
->head
.args
.open
.pool
= pool_id
;
12164 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12165 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12168 bufferlist xattrs_bl
;
12169 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12172 req
->head
.args
.open
.mode
= mode
;
12173 if (xattrs_bl
.length() > 0)
12174 req
->set_data(xattrs_bl
);
12177 res
= get_or_create(dir
, name
, &de
);
12180 req
->set_dentry(de
);
12182 res
= make_request(req
, perms
, inp
, created
);
12187 /* If the caller passed a value in fhp, do the open */
12189 (*inp
)->get_open_ref(cmode
);
12190 *fhp
= _create_fh(inp
->get(), flags
, cmode
, perms
);
12196 ldout(cct
, 8) << "create(" << path
<< ", 0" << oct
<< mode
<< dec
12197 << " layout " << stripe_unit
12198 << ' ' << stripe_count
12199 << ' ' << object_size
12200 <<") = " << res
<< dendl
;
12209 int Client::_mkdir(Inode
*dir
, const char *name
, mode_t mode
, const UserPerm
& perm
,
12212 ldout(cct
, 8) << "_mkdir(" << dir
->ino
<< " " << name
<< ", 0" << oct
12213 << mode
<< dec
<< ", uid " << perm
.uid()
12214 << ", gid " << perm
.gid() << ")" << dendl
;
12216 if (strlen(name
) > NAME_MAX
)
12217 return -ENAMETOOLONG
;
12219 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12222 if (is_quota_files_exceeded(dir
, perm
)) {
12225 MetaRequest
*req
= new MetaRequest(dir
->snapid
== CEPH_SNAPDIR
?
12226 CEPH_MDS_OP_MKSNAP
: CEPH_MDS_OP_MKDIR
);
12229 dir
->make_nosnap_relative_path(path
);
12230 path
.push_dentry(name
);
12231 req
->set_filepath(path
);
12232 req
->set_inode(dir
);
12233 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12234 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12237 bufferlist xattrs_bl
;
12238 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perm
);
12241 req
->head
.args
.mkdir
.mode
= mode
;
12242 if (xattrs_bl
.length() > 0)
12243 req
->set_data(xattrs_bl
);
12246 res
= get_or_create(dir
, name
, &de
);
12249 req
->set_dentry(de
);
12251 ldout(cct
, 10) << "_mkdir: making request" << dendl
;
12252 res
= make_request(req
, perm
, inp
);
12253 ldout(cct
, 10) << "_mkdir result is " << res
<< dendl
;
12257 ldout(cct
, 8) << "_mkdir(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12265 int Client::ll_mkdir(Inode
*parent
, const char *name
, mode_t mode
,
12266 struct stat
*attr
, Inode
**out
, const UserPerm
& perm
)
12268 std::lock_guard
lock(client_lock
);
12273 vinodeno_t vparent
= _get_vino(parent
);
12275 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
<< dendl
;
12276 tout(cct
) << "ll_mkdir" << std::endl
;
12277 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12278 tout(cct
) << name
<< std::endl
;
12279 tout(cct
) << mode
<< std::endl
;
12281 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
12282 "fuse_default_permissions");
12283 if (!fuse_default_permissions
) {
12284 int r
= may_create(parent
, perm
);
12290 int r
= _mkdir(parent
, name
, mode
, perm
, &in
);
12292 fill_stat(in
, attr
);
12295 tout(cct
) << attr
->st_ino
<< std::endl
;
12296 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
12297 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12302 int Client::ll_mkdirx(Inode
*parent
, const char *name
, mode_t mode
, Inode
**out
,
12303 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12304 const UserPerm
& perms
)
12306 std::lock_guard
lock(client_lock
);
12311 vinodeno_t vparent
= _get_vino(parent
);
12313 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
<< dendl
;
12314 tout(cct
) << "ll_mkdirx" << std::endl
;
12315 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12316 tout(cct
) << name
<< std::endl
;
12317 tout(cct
) << mode
<< std::endl
;
12319 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
12320 "fuse_default_permissions");
12321 if (!fuse_default_permissions
) {
12322 int r
= may_create(parent
, perms
);
12328 int r
= _mkdir(parent
, name
, mode
, perms
, &in
);
12330 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12336 tout(cct
) << stx
->stx_ino
<< std::endl
;
12337 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
12338 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12343 int Client::_symlink(Inode
*dir
, const char *name
, const char *target
,
12344 const UserPerm
& perms
, InodeRef
*inp
)
12346 ldout(cct
, 8) << "_symlink(" << dir
->ino
<< " " << name
<< ", " << target
12347 << ", uid " << perms
.uid() << ", gid " << perms
.gid() << ")"
12350 if (strlen(name
) > NAME_MAX
)
12351 return -ENAMETOOLONG
;
12353 if (dir
->snapid
!= CEPH_NOSNAP
) {
12356 if (is_quota_files_exceeded(dir
, perms
)) {
12360 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SYMLINK
);
12363 dir
->make_nosnap_relative_path(path
);
12364 path
.push_dentry(name
);
12365 req
->set_filepath(path
);
12366 req
->set_inode(dir
);
12367 req
->set_string2(target
);
12368 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12369 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12372 int res
= get_or_create(dir
, name
, &de
);
12375 req
->set_dentry(de
);
12377 res
= make_request(req
, perms
, inp
);
12380 ldout(cct
, 8) << "_symlink(\"" << path
<< "\", \"" << target
<< "\") = " <<
12389 int Client::ll_symlink(Inode
*parent
, const char *name
, const char *value
,
12390 struct stat
*attr
, Inode
**out
, const UserPerm
& perms
)
12392 std::lock_guard
lock(client_lock
);
12397 vinodeno_t vparent
= _get_vino(parent
);
12399 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
<< " -> " << value
12401 tout(cct
) << "ll_symlink" << std::endl
;
12402 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12403 tout(cct
) << name
<< std::endl
;
12404 tout(cct
) << value
<< std::endl
;
12406 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
12407 "fuse_default_permissions");
12408 if (!fuse_default_permissions
) {
12409 int r
= may_create(parent
, perms
);
12415 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12417 fill_stat(in
, attr
);
12420 tout(cct
) << attr
->st_ino
<< std::endl
;
12421 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
12422 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12427 int Client::ll_symlinkx(Inode
*parent
, const char *name
, const char *value
,
12428 Inode
**out
, struct ceph_statx
*stx
, unsigned want
,
12429 unsigned flags
, const UserPerm
& perms
)
12431 std::lock_guard
lock(client_lock
);
12436 vinodeno_t vparent
= _get_vino(parent
);
12438 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
<< " -> " << value
12440 tout(cct
) << "ll_symlinkx" << std::endl
;
12441 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12442 tout(cct
) << name
<< std::endl
;
12443 tout(cct
) << value
<< std::endl
;
12445 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
12446 "fuse_default_permissions");
12447 if (!fuse_default_permissions
) {
12448 int r
= may_create(parent
, perms
);
12454 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12456 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12459 tout(cct
) << stx
->stx_ino
<< std::endl
;
12460 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
12461 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12466 int Client::_unlink(Inode
*dir
, const char *name
, const UserPerm
& perm
)
12468 ldout(cct
, 8) << "_unlink(" << dir
->ino
<< " " << name
12469 << " uid " << perm
.uid() << " gid " << perm
.gid()
12472 if (dir
->snapid
!= CEPH_NOSNAP
) {
12476 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_UNLINK
);
12479 dir
->make_nosnap_relative_path(path
);
12480 path
.push_dentry(name
);
12481 req
->set_filepath(path
);
12487 int res
= get_or_create(dir
, name
, &de
);
12490 req
->set_dentry(de
);
12491 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12492 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12494 res
= _lookup(dir
, name
, 0, &otherin
, perm
);
12498 in
= otherin
.get();
12499 req
->set_other_inode(in
);
12500 in
->break_all_delegs();
12501 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12503 req
->set_inode(dir
);
12505 res
= make_request(req
, perm
);
12508 ldout(cct
, 8) << "unlink(" << path
<< ") = " << res
<< dendl
;
12516 int Client::ll_unlink(Inode
*in
, const char *name
, const UserPerm
& perm
)
12518 std::lock_guard
lock(client_lock
);
12523 vinodeno_t vino
= _get_vino(in
);
12525 ldout(cct
, 3) << "ll_unlink " << vino
<< " " << name
<< dendl
;
12526 tout(cct
) << "ll_unlink" << std::endl
;
12527 tout(cct
) << vino
.ino
.val
<< std::endl
;
12528 tout(cct
) << name
<< std::endl
;
12530 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
12531 "fuse_default_permissions");
12532 if (!fuse_default_permissions
) {
12533 int r
= may_delete(in
, name
, perm
);
12537 return _unlink(in
, name
, perm
);
12540 int Client::_rmdir(Inode
*dir
, const char *name
, const UserPerm
& perms
)
12542 ldout(cct
, 8) << "_rmdir(" << dir
->ino
<< " " << name
<< " uid "
12543 << perms
.uid() << " gid " << perms
.gid() << ")" << dendl
;
12545 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12549 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_RMSNAP
: CEPH_MDS_OP_RMDIR
;
12550 MetaRequest
*req
= new MetaRequest(op
);
12552 dir
->make_nosnap_relative_path(path
);
12553 path
.push_dentry(name
);
12554 req
->set_filepath(path
);
12555 req
->set_inode(dir
);
12557 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12558 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12559 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12564 int res
= get_or_create(dir
, name
, &de
);
12567 if (op
== CEPH_MDS_OP_RMDIR
)
12568 req
->set_dentry(de
);
12572 res
= _lookup(dir
, name
, 0, &in
, perms
);
12576 if (op
== CEPH_MDS_OP_RMSNAP
) {
12577 unlink(de
, true, true);
12580 req
->set_other_inode(in
.get());
12582 res
= make_request(req
, perms
);
12585 ldout(cct
, 8) << "rmdir(" << path
<< ") = " << res
<< dendl
;
12593 int Client::ll_rmdir(Inode
*in
, const char *name
, const UserPerm
& perms
)
12595 std::lock_guard
lock(client_lock
);
12600 vinodeno_t vino
= _get_vino(in
);
12602 ldout(cct
, 3) << "ll_rmdir " << vino
<< " " << name
<< dendl
;
12603 tout(cct
) << "ll_rmdir" << std::endl
;
12604 tout(cct
) << vino
.ino
.val
<< std::endl
;
12605 tout(cct
) << name
<< std::endl
;
12607 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
12608 "fuse_default_permissions");
12609 if (!fuse_default_permissions
) {
12610 int r
= may_delete(in
, name
, perms
);
12615 return _rmdir(in
, name
, perms
);
12618 int Client::_rename(Inode
*fromdir
, const char *fromname
, Inode
*todir
, const char *toname
, const UserPerm
& perm
)
12620 ldout(cct
, 8) << "_rename(" << fromdir
->ino
<< " " << fromname
<< " to "
12621 << todir
->ino
<< " " << toname
12622 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")"
12625 if (fromdir
->snapid
!= todir
->snapid
)
12628 int op
= CEPH_MDS_OP_RENAME
;
12629 if (fromdir
->snapid
!= CEPH_NOSNAP
) {
12630 if (fromdir
== todir
&& fromdir
->snapid
== CEPH_SNAPDIR
)
12631 op
= CEPH_MDS_OP_RENAMESNAP
;
12635 if (fromdir
!= todir
) {
12636 Inode
*fromdir_root
=
12637 fromdir
->quota
.is_enable() ? fromdir
: get_quota_root(fromdir
, perm
);
12638 Inode
*todir_root
=
12639 todir
->quota
.is_enable() ? todir
: get_quota_root(todir
, perm
);
12640 if (fromdir_root
!= todir_root
) {
12646 MetaRequest
*req
= new MetaRequest(op
);
12649 fromdir
->make_nosnap_relative_path(from
);
12650 from
.push_dentry(fromname
);
12652 todir
->make_nosnap_relative_path(to
);
12653 to
.push_dentry(toname
);
12654 req
->set_filepath(to
);
12655 req
->set_filepath2(from
);
12658 int res
= get_or_create(fromdir
, fromname
, &oldde
);
12662 res
= get_or_create(todir
, toname
, &de
);
12666 if (op
== CEPH_MDS_OP_RENAME
) {
12667 req
->set_old_dentry(oldde
);
12668 req
->old_dentry_drop
= CEPH_CAP_FILE_SHARED
;
12669 req
->old_dentry_unless
= CEPH_CAP_FILE_EXCL
;
12671 req
->set_dentry(de
);
12672 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12673 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12675 InodeRef oldin
, otherin
;
12676 res
= _lookup(fromdir
, fromname
, 0, &oldin
, perm
);
12680 Inode
*oldinode
= oldin
.get();
12681 oldinode
->break_all_delegs();
12682 req
->set_old_inode(oldinode
);
12683 req
->old_inode_drop
= CEPH_CAP_LINK_SHARED
;
12685 res
= _lookup(todir
, toname
, 0, &otherin
, perm
);
12689 Inode
*in
= otherin
.get();
12690 req
->set_other_inode(in
);
12691 in
->break_all_delegs();
12693 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12701 req
->set_inode(todir
);
12703 // renamesnap reply contains no tracedn, so we need to invalidate
12705 unlink(oldde
, true, true);
12706 unlink(de
, true, true);
12708 req
->set_inode(todir
);
12711 res
= make_request(req
, perm
, &target
);
12712 ldout(cct
, 10) << "rename result is " << res
<< dendl
;
12714 // renamed item from our cache
12717 ldout(cct
, 8) << "_rename(" << from
<< ", " << to
<< ") = " << res
<< dendl
;
12725 int Client::ll_rename(Inode
*parent
, const char *name
, Inode
*newparent
,
12726 const char *newname
, const UserPerm
& perm
)
12728 std::lock_guard
lock(client_lock
);
12733 vinodeno_t vparent
= _get_vino(parent
);
12734 vinodeno_t vnewparent
= _get_vino(newparent
);
12736 ldout(cct
, 3) << "ll_rename " << vparent
<< " " << name
<< " to "
12737 << vnewparent
<< " " << newname
<< dendl
;
12738 tout(cct
) << "ll_rename" << std::endl
;
12739 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12740 tout(cct
) << name
<< std::endl
;
12741 tout(cct
) << vnewparent
.ino
.val
<< std::endl
;
12742 tout(cct
) << newname
<< std::endl
;
12744 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
12745 "fuse_default_permissions");
12746 if (!fuse_default_permissions
) {
12747 int r
= may_delete(parent
, name
, perm
);
12750 r
= may_delete(newparent
, newname
, perm
);
12751 if (r
< 0 && r
!= -ENOENT
)
12755 return _rename(parent
, name
, newparent
, newname
, perm
);
12758 int Client::_link(Inode
*in
, Inode
*dir
, const char *newname
, const UserPerm
& perm
, InodeRef
*inp
)
12760 ldout(cct
, 8) << "_link(" << in
->ino
<< " to " << dir
->ino
<< " " << newname
12761 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")" << dendl
;
12763 if (strlen(newname
) > NAME_MAX
)
12764 return -ENAMETOOLONG
;
12766 if (in
->snapid
!= CEPH_NOSNAP
|| dir
->snapid
!= CEPH_NOSNAP
) {
12769 if (is_quota_files_exceeded(dir
, perm
)) {
12773 in
->break_all_delegs();
12774 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LINK
);
12776 filepath
path(newname
, dir
->ino
);
12777 req
->set_filepath(path
);
12778 filepath
existing(in
->ino
);
12779 req
->set_filepath2(existing
);
12781 req
->set_inode(dir
);
12782 req
->inode_drop
= CEPH_CAP_FILE_SHARED
;
12783 req
->inode_unless
= CEPH_CAP_FILE_EXCL
;
12786 int res
= get_or_create(dir
, newname
, &de
);
12789 req
->set_dentry(de
);
12791 res
= make_request(req
, perm
, inp
);
12792 ldout(cct
, 10) << "link result is " << res
<< dendl
;
12795 ldout(cct
, 8) << "link(" << existing
<< ", " << path
<< ") = " << res
<< dendl
;
12803 int Client::ll_link(Inode
*in
, Inode
*newparent
, const char *newname
,
12804 const UserPerm
& perm
)
12806 std::lock_guard
lock(client_lock
);
12811 vinodeno_t vino
= _get_vino(in
);
12812 vinodeno_t vnewparent
= _get_vino(newparent
);
12814 ldout(cct
, 3) << "ll_link " << vino
<< " to " << vnewparent
<< " " <<
12816 tout(cct
) << "ll_link" << std::endl
;
12817 tout(cct
) << vino
.ino
.val
<< std::endl
;
12818 tout(cct
) << vnewparent
<< std::endl
;
12819 tout(cct
) << newname
<< std::endl
;
12823 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
12824 "fuse_default_permissions");
12825 if (!fuse_default_permissions
) {
12826 if (S_ISDIR(in
->mode
))
12829 int r
= may_hardlink(in
, perm
);
12833 r
= may_create(newparent
, perm
);
12838 return _link(in
, newparent
, newname
, perm
, &target
);
12841 int Client::ll_num_osds(void)
12843 std::lock_guard
lock(client_lock
);
12844 return objecter
->with_osdmap(std::mem_fn(&OSDMap::get_num_osds
));
12847 int Client::ll_osdaddr(int osd
, uint32_t *addr
)
12849 std::lock_guard
lock(client_lock
);
12852 bool exists
= objecter
->with_osdmap([&](const OSDMap
& o
) {
12853 if (!o
.exists(osd
))
12855 g
= o
.get_addrs(osd
).front();
12860 uint32_t nb_addr
= (g
.in4_addr()).sin_addr
.s_addr
;
12861 *addr
= ntohl(nb_addr
);
12865 uint32_t Client::ll_stripe_unit(Inode
*in
)
12867 std::lock_guard
lock(client_lock
);
12868 return in
->layout
.stripe_unit
;
12871 uint64_t Client::ll_snap_seq(Inode
*in
)
12873 std::lock_guard
lock(client_lock
);
12874 return in
->snaprealm
->seq
;
12877 int Client::ll_file_layout(Inode
*in
, file_layout_t
*layout
)
12879 std::lock_guard
lock(client_lock
);
12880 *layout
= in
->layout
;
12884 int Client::ll_file_layout(Fh
*fh
, file_layout_t
*layout
)
12886 return ll_file_layout(fh
->inode
.get(), layout
);
12889 /* Currently we cannot take advantage of redundancy in reads, since we
12890 would have to go through all possible placement groups (a
12891 potentially quite large number determined by a hash), and use CRUSH
12892 to calculate the appropriate set of OSDs for each placement group,
12893 then index into that. An array with one entry per OSD is much more
12894 tractable and works for demonstration purposes. */
12896 int Client::ll_get_stripe_osd(Inode
*in
, uint64_t blockno
,
12897 file_layout_t
* layout
)
12899 std::lock_guard
lock(client_lock
);
12901 inodeno_t ino
= in
->ino
;
12902 uint32_t object_size
= layout
->object_size
;
12903 uint32_t su
= layout
->stripe_unit
;
12904 uint32_t stripe_count
= layout
->stripe_count
;
12905 uint64_t stripes_per_object
= object_size
/ su
;
12906 uint64_t stripeno
= 0, stripepos
= 0;
12909 stripeno
= blockno
/ stripe_count
; // which horizontal stripe (Y)
12910 stripepos
= blockno
% stripe_count
; // which object in the object set (X)
12912 uint64_t objectsetno
= stripeno
/ stripes_per_object
; // which object set
12913 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
; // object id
12915 object_t oid
= file_object_t(ino
, objectno
);
12916 return objecter
->with_osdmap([&](const OSDMap
& o
) {
12917 ceph_object_layout olayout
=
12918 o
.file_to_object_layout(oid
, *layout
);
12919 pg_t pg
= (pg_t
)olayout
.ol_pgid
;
12922 o
.pg_to_acting_osds(pg
, &osds
, &primary
);
12927 /* Return the offset of the block, internal to the object */
12929 uint64_t Client::ll_get_internal_offset(Inode
*in
, uint64_t blockno
)
12931 std::lock_guard
lock(client_lock
);
12932 file_layout_t
*layout
=&(in
->layout
);
12933 uint32_t object_size
= layout
->object_size
;
12934 uint32_t su
= layout
->stripe_unit
;
12935 uint64_t stripes_per_object
= object_size
/ su
;
12937 return (blockno
% stripes_per_object
) * su
;
12940 int Client::ll_opendir(Inode
*in
, int flags
, dir_result_t
** dirpp
,
12941 const UserPerm
& perms
)
12943 std::lock_guard
lock(client_lock
);
12948 vinodeno_t vino
= _get_vino(in
);
12950 ldout(cct
, 3) << "ll_opendir " << vino
<< dendl
;
12951 tout(cct
) << "ll_opendir" << std::endl
;
12952 tout(cct
) << vino
.ino
.val
<< std::endl
;
12954 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
12955 "fuse_default_permissions");
12956 if (!fuse_default_permissions
) {
12957 int r
= may_open(in
, flags
, perms
);
12962 int r
= _opendir(in
, dirpp
, perms
);
12963 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
12965 ldout(cct
, 3) << "ll_opendir " << vino
<< " = " << r
<< " (" << *dirpp
<< ")"
12970 int Client::ll_releasedir(dir_result_t
*dirp
)
12972 std::lock_guard
lock(client_lock
);
12973 ldout(cct
, 3) << "ll_releasedir " << dirp
<< dendl
;
12974 tout(cct
) << "ll_releasedir" << std::endl
;
12975 tout(cct
) << (unsigned long)dirp
<< std::endl
;
12984 int Client::ll_fsyncdir(dir_result_t
*dirp
)
12986 std::lock_guard
lock(client_lock
);
12987 ldout(cct
, 3) << "ll_fsyncdir " << dirp
<< dendl
;
12988 tout(cct
) << "ll_fsyncdir" << std::endl
;
12989 tout(cct
) << (unsigned long)dirp
<< std::endl
;
12994 return _fsync(dirp
->inode
.get(), false);
12997 int Client::ll_open(Inode
*in
, int flags
, Fh
**fhp
, const UserPerm
& perms
)
12999 ceph_assert(!(flags
& O_CREAT
));
13001 std::lock_guard
lock(client_lock
);
13006 vinodeno_t vino
= _get_vino(in
);
13008 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) << dendl
;
13009 tout(cct
) << "ll_open" << std::endl
;
13010 tout(cct
) << vino
.ino
.val
<< std::endl
;
13011 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13014 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
13015 "fuse_default_permissions");
13016 if (!fuse_default_permissions
) {
13017 r
= may_open(in
, flags
, perms
);
13022 r
= _open(in
, flags
, 0, fhp
/* may be NULL */, perms
);
13025 Fh
*fhptr
= fhp
? *fhp
: NULL
;
13027 ll_unclosed_fh_set
.insert(fhptr
);
13029 tout(cct
) << (unsigned long)fhptr
<< std::endl
;
13030 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) <<
13031 " = " << r
<< " (" << fhptr
<< ")" << dendl
;
13035 int Client::_ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13036 int flags
, InodeRef
*in
, int caps
, Fh
**fhp
,
13037 const UserPerm
& perms
)
13041 vinodeno_t vparent
= _get_vino(parent
);
13043 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13044 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << ", uid " << perms
.uid()
13045 << ", gid " << perms
.gid() << dendl
;
13046 tout(cct
) << "ll_create" << std::endl
;
13047 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13048 tout(cct
) << name
<< std::endl
;
13049 tout(cct
) << mode
<< std::endl
;
13050 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13052 bool created
= false;
13053 int r
= _lookup(parent
, name
, caps
, in
, perms
);
13055 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
13058 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
13059 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
13060 "fuse_default_permissions");
13061 if (!fuse_default_permissions
) {
13062 r
= may_create(parent
, perms
);
13066 r
= _create(parent
, name
, flags
, mode
, in
, fhp
, 0, 0, 0, NULL
, &created
,
13077 ldout(cct
, 20) << "_ll_create created = " << created
<< dendl
;
13079 auto fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
13080 "fuse_default_permissions");
13081 if (!fuse_default_permissions
) {
13082 r
= may_open(in
->get(), flags
, perms
);
13085 int release_r
= _release_fh(*fhp
);
13086 ceph_assert(release_r
== 0); // during create, no async data ops should have happened
13091 if (*fhp
== NULL
) {
13092 r
= _open(in
->get(), flags
, mode
, fhp
, perms
);
13100 ll_unclosed_fh_set
.insert(*fhp
);
13105 Inode
*inode
= in
->get();
13106 if (use_faked_inos())
13107 ino
= inode
->faked_ino
;
13112 tout(cct
) << (unsigned long)*fhp
<< std::endl
;
13113 tout(cct
) << ino
<< std::endl
;
13114 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13115 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << " = " << r
<< " (" <<
13116 *fhp
<< " " << hex
<< ino
<< dec
<< ")" << dendl
;
13121 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13122 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
13123 const UserPerm
& perms
)
13125 std::lock_guard
lock(client_lock
);
13131 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
13136 // passing an Inode in outp requires an additional ref
13141 fill_stat(in
, attr
);
13149 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
13150 int oflags
, Inode
**outp
, Fh
**fhp
,
13151 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
13152 const UserPerm
& perms
)
13154 unsigned caps
= statx_to_mask(lflags
, want
);
13155 std::lock_guard
lock(client_lock
);
13161 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
13165 // passing an Inode in outp requires an additional ref
13170 fill_statx(in
, caps
, stx
);
13179 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
13181 std::lock_guard
lock(client_lock
);
13182 tout(cct
) << "ll_lseek" << std::endl
;
13183 tout(cct
) << offset
<< std::endl
;
13184 tout(cct
) << whence
<< std::endl
;
13189 return _lseek(fh
, offset
, whence
);
13192 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
13194 std::lock_guard
lock(client_lock
);
13195 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
13196 tout(cct
) << "ll_read" << std::endl
;
13197 tout(cct
) << (unsigned long)fh
<< std::endl
;
13198 tout(cct
) << off
<< std::endl
;
13199 tout(cct
) << len
<< std::endl
;
13204 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13205 len
= std::min(len
, (loff_t
)INT_MAX
);
13206 return _read(fh
, off
, len
, bl
);
13209 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
13213 file_layout_t
* layout
)
13215 std::lock_guard
lock(client_lock
);
13220 vinodeno_t vino
= _get_vino(in
);
13221 object_t oid
= file_object_t(vino
.ino
, blockid
);
13222 C_SaferCond onfinish
;
13225 objecter
->read(oid
,
13226 object_locator_t(layout
->pool_id
),
13231 CEPH_OSD_FLAG_READ
,
13234 client_lock
.Unlock();
13235 int r
= onfinish
.wait();
13236 client_lock
.Lock();
13239 bl
.copy(0, bl
.length(), buf
);
13246 /* It appears that the OSD doesn't return success unless the entire
13247 buffer was written, return the write length on success. */
13249 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
13250 char* buf
, uint64_t offset
,
13251 uint64_t length
, file_layout_t
* layout
,
13252 uint64_t snapseq
, uint32_t sync
)
13254 vinodeno_t vino
= ll_get_vino(in
);
13256 std::unique_ptr
<C_SaferCond
> onsafe
= nullptr;
13261 if (true || sync
) {
13262 /* if write is stable, the epilogue is waiting on
13264 onsafe
.reset(new C_SaferCond("Client::ll_write_block flock"));
13266 object_t oid
= file_object_t(vino
.ino
, blockid
);
13267 SnapContext fakesnap
;
13268 ceph::bufferlist bl
;
13270 bl
.push_back(buffer::copy(buf
, length
));
13273 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
13276 fakesnap
.seq
= snapseq
;
13278 /* lock just in time */
13279 client_lock
.Lock();
13281 client_lock
.Unlock();
13285 objecter
->write(oid
,
13286 object_locator_t(layout
->pool_id
),
13291 ceph::real_clock::now(),
13295 client_lock
.Unlock();
13296 if (nullptr != onsafe
) {
13297 r
= onsafe
->wait();
13307 int Client::ll_commit_blocks(Inode
*in
,
13311 std::lock_guard
lock(client_lock
);
13313 BarrierContext *bctx;
13314 vinodeno_t vino = _get_vino(in);
13315 uint64_t ino = vino.ino;
13317 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13318 << offset << " to " << length << dendl;
13324 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13325 if (p != barriers.end()) {
13326 barrier_interval civ(offset, offset + length);
13327 p->second->commit_barrier(civ);
13333 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
13335 std::lock_guard
lock(client_lock
);
13336 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
13337 "~" << len
<< dendl
;
13338 tout(cct
) << "ll_write" << std::endl
;
13339 tout(cct
) << (unsigned long)fh
<< std::endl
;
13340 tout(cct
) << off
<< std::endl
;
13341 tout(cct
) << len
<< std::endl
;
13346 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13347 len
= std::min(len
, (loff_t
)INT_MAX
);
13348 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
13349 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
13354 int64_t Client::ll_writev(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13356 std::lock_guard
lock(client_lock
);
13359 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, true, false);
13362 int64_t Client::ll_readv(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13364 std::lock_guard
lock(client_lock
);
13367 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, false, false);
13370 int Client::ll_flush(Fh
*fh
)
13372 std::lock_guard
lock(client_lock
);
13373 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13374 tout(cct
) << "ll_flush" << std::endl
;
13375 tout(cct
) << (unsigned long)fh
<< std::endl
;
13383 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
13385 std::lock_guard
lock(client_lock
);
13386 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13387 tout(cct
) << "ll_fsync" << std::endl
;
13388 tout(cct
) << (unsigned long)fh
<< std::endl
;
13393 int r
= _fsync(fh
, syncdataonly
);
13395 // If we're returning an error, clear it from the FH
13396 fh
->take_async_err();
13401 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
13403 std::lock_guard
lock(client_lock
);
13404 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
13405 tout(cct
) << "ll_sync_inode" << std::endl
;
13406 tout(cct
) << (unsigned long)in
<< std::endl
;
13411 return _fsync(in
, syncdataonly
);
13414 #ifdef FALLOC_FL_PUNCH_HOLE
13416 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13418 if (offset
< 0 || length
<= 0)
13421 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
13422 return -EOPNOTSUPP
;
13424 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
13425 return -EOPNOTSUPP
;
13427 Inode
*in
= fh
->inode
.get();
13429 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
13430 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
13434 if (in
->snapid
!= CEPH_NOSNAP
)
13437 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
13440 uint64_t size
= offset
+ length
;
13441 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
13443 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
)) {
13448 int r
= get_caps(in
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
13452 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
13453 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
13454 if (in
->inline_version
< CEPH_INLINE_NONE
&&
13455 (have
& CEPH_CAP_FILE_BUFFER
)) {
13457 int len
= in
->inline_data
.length();
13458 if (offset
< len
) {
13460 in
->inline_data
.copy(0, offset
, bl
);
13462 if (offset
+ size
> len
)
13463 size
= len
- offset
;
13465 bl
.append_zero(size
);
13466 if (offset
+ size
< len
)
13467 in
->inline_data
.copy(offset
+ size
, len
- offset
- size
, bl
);
13468 in
->inline_data
= bl
;
13469 in
->inline_version
++;
13471 in
->mtime
= in
->ctime
= ceph_clock_now();
13473 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13475 if (in
->inline_version
< CEPH_INLINE_NONE
) {
13476 onuninline
.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
13477 uninline_data(in
, onuninline
.get());
13480 C_SaferCond
onfinish("Client::_punch_hole flock");
13482 unsafe_sync_write
++;
13483 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
13485 _invalidate_inode_cache(in
, offset
, length
);
13486 filer
->zero(in
->ino
, &in
->layout
,
13487 in
->snaprealm
->get_snap_context(),
13489 ceph::real_clock::now(),
13490 0, true, &onfinish
);
13491 in
->mtime
= in
->ctime
= ceph_clock_now();
13493 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13495 client_lock
.Unlock();
13497 client_lock
.Lock();
13498 _sync_write_commit(in
);
13500 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
13501 uint64_t size
= offset
+ length
;
13502 if (size
> in
->size
) {
13504 in
->mtime
= in
->ctime
= ceph_clock_now();
13506 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13508 if (is_quota_bytes_approaching(in
, fh
->actor_perms
)) {
13509 check_caps(in
, CHECK_CAPS_NODELAY
);
13510 } else if (is_max_size_approaching(in
)) {
13516 if (nullptr != onuninline
) {
13517 client_lock
.Unlock();
13518 int ret
= onuninline
->wait();
13519 client_lock
.Lock();
13521 if (ret
>= 0 || ret
== -ECANCELED
) {
13522 in
->inline_data
.clear();
13523 in
->inline_version
= CEPH_INLINE_NONE
;
13524 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13530 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
13535 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13537 return -EOPNOTSUPP
;
13543 int Client::ll_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13545 std::lock_guard
lock(client_lock
);
13546 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13547 tout(cct
) << __func__
<< " " << mode
<< " " << offset
<< " " << length
<< std::endl
;
13548 tout(cct
) << (unsigned long)fh
<< std::endl
;
13553 return _fallocate(fh
, mode
, offset
, length
);
13556 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
13558 std::lock_guard
lock(client_lock
);
13559 tout(cct
) << __func__
<< " " << " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
13564 Fh
*fh
= get_filehandle(fd
);
13567 #if defined(__linux__) && defined(O_PATH)
13568 if (fh
->flags
& O_PATH
)
13571 return _fallocate(fh
, mode
, offset
, length
);
13574 int Client::ll_release(Fh
*fh
)
13576 std::lock_guard
lock(client_lock
);
13581 ldout(cct
, 3) << __func__
<< " (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
13583 tout(cct
) << __func__
<< " (fh)" << std::endl
;
13584 tout(cct
) << (unsigned long)fh
<< std::endl
;
13586 if (ll_unclosed_fh_set
.count(fh
))
13587 ll_unclosed_fh_set
.erase(fh
);
13588 return _release_fh(fh
);
13591 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
13593 std::lock_guard
lock(client_lock
);
13595 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
13596 tout(cct
) << "ll_getk (fh)" << (unsigned long)fh
<< std::endl
;
13601 return _getlk(fh
, fl
, owner
);
13604 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
13606 std::lock_guard
lock(client_lock
);
13608 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13609 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13614 return _setlk(fh
, fl
, owner
, sleep
);
13617 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
13619 std::lock_guard
lock(client_lock
);
13621 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13622 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13627 return _flock(fh
, cmd
, owner
);
13630 int Client::set_deleg_timeout(uint32_t timeout
)
13632 std::lock_guard
lock(client_lock
);
13635 * The whole point is to prevent blacklisting so we must time out the
13636 * delegation before the session autoclose timeout kicks in.
13638 if (timeout
>= mdsmap
->get_session_autoclose())
13641 deleg_timeout
= timeout
;
13645 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
13649 std::lock_guard
lock(client_lock
);
13654 Inode
*inode
= fh
->inode
.get();
13657 case CEPH_DELEGATION_NONE
:
13658 inode
->unset_deleg(fh
);
13663 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
13664 } catch (std::bad_alloc
&) {
13672 class C_Client_RequestInterrupt
: public Context
{
13677 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
13680 void finish(int r
) override
{
13681 std::lock_guard
l(client
->client_lock
);
13682 ceph_assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
13683 client
->_interrupt_filelock(req
);
13684 client
->put_request(req
);
13688 void Client::ll_interrupt(void *d
)
13690 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
13691 ldout(cct
, 3) << __func__
<< " tid " << req
->get_tid() << dendl
;
13692 tout(cct
) << __func__
<< " tid " << req
->get_tid() << std::endl
;
13693 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
13696 // =========================================
13699 // expose file layouts
13701 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
13702 const UserPerm
& perms
)
13704 std::lock_guard
lock(client_lock
);
13709 filepath
path(relpath
);
13711 int r
= path_walk(path
, &in
, perms
);
13717 ldout(cct
, 3) << __func__
<< "(" << relpath
<< ") = 0" << dendl
;
13721 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
13723 std::lock_guard
lock(client_lock
);
13728 Fh
*f
= get_filehandle(fd
);
13731 Inode
*in
= f
->inode
.get();
13735 ldout(cct
, 3) << __func__
<< "(" << fd
<< ") = 0" << dendl
;
13739 int64_t Client::get_default_pool_id()
13741 std::lock_guard
lock(client_lock
);
13746 /* first data pool is the default */
13747 return mdsmap
->get_first_data_pool();
13752 int64_t Client::get_pool_id(const char *pool_name
)
13754 std::lock_guard
lock(client_lock
);
13759 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
13763 string
Client::get_pool_name(int64_t pool
)
13765 std::lock_guard
lock(client_lock
);
13770 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13771 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
13775 int Client::get_pool_replication(int64_t pool
)
13777 std::lock_guard
lock(client_lock
);
13782 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13783 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -ENOENT
;
13787 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
13789 std::lock_guard
lock(client_lock
);
13794 Fh
*f
= get_filehandle(fd
);
13797 Inode
*in
= f
->inode
.get();
13799 vector
<ObjectExtent
> extents
;
13800 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
13801 ceph_assert(extents
.size() == 1);
13803 objecter
->with_osdmap([&](const OSDMap
& o
) {
13804 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
13805 o
.pg_to_acting_osds(pg
, osds
);
13812 * Return the remainder of the extent (stripe unit)
13814 * If length = 1 is passed to Striper::file_to_extents we get a single
13815 * extent back, but its length is one so we still need to compute the length
13816 * to the end of the stripe unit.
13818 * If length = su then we may get 1 or 2 objects back in the extents vector
13819 * which would have to be examined. Even then, the offsets are local to the
13820 * object, so matching up to the file offset is extra work.
13822 * It seems simpler to stick with length = 1 and manually compute the
13826 uint64_t su
= in
->layout
.stripe_unit
;
13827 *len
= su
- (off
% su
);
13833 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
13835 std::lock_guard
lock(client_lock
);
13842 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13843 return o
.crush
->get_full_location_ordered(id
, path
);
13847 int Client::get_file_stripe_address(int fd
, loff_t offset
,
13848 vector
<entity_addr_t
>& address
)
13850 std::lock_guard
lock(client_lock
);
13855 Fh
*f
= get_filehandle(fd
);
13858 Inode
*in
= f
->inode
.get();
13861 vector
<ObjectExtent
> extents
;
13862 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
13863 in
->truncate_size
, extents
);
13864 ceph_assert(extents
.size() == 1);
13866 // now we have the object and its 'layout'
13867 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13868 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
13870 o
.pg_to_acting_osds(pg
, osds
);
13873 for (unsigned i
= 0; i
< osds
.size(); i
++) {
13874 entity_addr_t addr
= o
.get_addrs(osds
[i
]).front();
13875 address
.push_back(addr
);
13881 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
13883 std::lock_guard
lock(client_lock
);
13888 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13889 if (!o
.exists(osd
))
13892 addr
= o
.get_addrs(osd
).front();
13897 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
13898 loff_t length
, loff_t offset
)
13900 std::lock_guard
lock(client_lock
);
13905 Fh
*f
= get_filehandle(fd
);
13908 Inode
*in
= f
->inode
.get();
13910 // map to a list of extents
13911 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
13913 ldout(cct
, 3) << __func__
<< "(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
13918 /* find an osd with the same ip. -ENXIO if none. */
13919 int Client::get_local_osd()
13921 std::lock_guard
lock(client_lock
);
13926 objecter
->with_osdmap([this](const OSDMap
& o
) {
13927 if (o
.get_epoch() != local_osd_epoch
) {
13928 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddrs().front());
13929 local_osd_epoch
= o
.get_epoch();
13940 // ===============================
13942 void Client::ms_handle_connect(Connection
*con
)
13944 ldout(cct
, 10) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
13947 bool Client::ms_handle_reset(Connection
*con
)
13949 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
13953 void Client::ms_handle_remote_reset(Connection
*con
)
13955 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
13956 std::lock_guard
l(client_lock
);
13957 switch (con
->get_peer_type()) {
13958 case CEPH_ENTITY_TYPE_MDS
:
13960 // kludge to figure out which mds this is; fixme with a Connection* state
13961 mds_rank_t mds
= MDS_RANK_NONE
;
13962 MetaSession
*s
= NULL
;
13963 for (auto &p
: mds_sessions
) {
13964 if (mdsmap
->get_addrs(p
.first
) == con
->get_peer_addrs()) {
13970 assert (s
!= NULL
);
13971 switch (s
->state
) {
13972 case MetaSession::STATE_CLOSING
:
13973 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
13974 _closed_mds_session(s
);
13977 case MetaSession::STATE_OPENING
:
13979 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
13980 list
<Context
*> waiters
;
13981 waiters
.swap(s
->waiting_for_open
);
13982 _closed_mds_session(s
);
13983 MetaSession
*news
= _get_or_open_mds_session(mds
);
13984 news
->waiting_for_open
.swap(waiters
);
13988 case MetaSession::STATE_OPEN
:
13990 objecter
->maybe_request_map(); /* to check if we are blacklisted */
13991 const auto& conf
= cct
->_conf
;
13992 if (conf
->client_reconnect_stale
) {
13993 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
13994 _closed_mds_session(s
);
13996 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
13997 s
->state
= MetaSession::STATE_STALE
;
14002 case MetaSession::STATE_NEW
:
14003 case MetaSession::STATE_CLOSED
:
14013 bool Client::ms_handle_refused(Connection
*con
)
14015 ldout(cct
, 1) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14019 bool Client::ms_get_authorizer(int dest_type
, AuthAuthorizer
**authorizer
)
14021 if (dest_type
== CEPH_ENTITY_TYPE_MON
)
14023 *authorizer
= monclient
->build_authorizer(dest_type
);
14027 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
14029 Inode
*quota_in
= root_ancestor
;
14030 SnapRealm
*realm
= in
->snaprealm
;
14032 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
14033 if (realm
->ino
!= in
->ino
) {
14034 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
14035 if (p
== inode_map
.end())
14038 if (p
->second
->quota
.is_enable()) {
14039 quota_in
= p
->second
;
14043 realm
= realm
->pparent
;
14045 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
14050 * Traverse quota ancestors of the Inode, return true
14051 * if any of them passes the passed function
14053 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
14054 std::function
<bool (const Inode
&in
)> test
)
14057 ceph_assert(in
!= NULL
);
14062 if (in
== root_ancestor
) {
14063 // We're done traversing, drop out
14066 // Continue up the tree
14067 in
= get_quota_root(in
, perms
);
14074 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
14076 return check_quota_condition(in
, perms
,
14077 [](const Inode
&in
) {
14078 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
14082 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
14083 const UserPerm
& perms
)
14085 return check_quota_condition(in
, perms
,
14086 [&new_bytes
](const Inode
&in
) {
14087 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
14088 > in
.quota
.max_bytes
;
14092 bool Client::is_quota_bytes_approaching(Inode
*in
, const UserPerm
& perms
)
14094 return check_quota_condition(in
, perms
,
14095 [](const Inode
&in
) {
14096 if (in
.quota
.max_bytes
) {
14097 if (in
.rstat
.rbytes
>= in
.quota
.max_bytes
) {
14101 ceph_assert(in
.size
>= in
.reported_size
);
14102 const uint64_t space
= in
.quota
.max_bytes
- in
.rstat
.rbytes
;
14103 const uint64_t size
= in
.size
- in
.reported_size
;
14104 return (space
>> 4) < size
;
14118 int Client::check_pool_perm(Inode
*in
, int need
)
14120 if (!cct
->_conf
->client_check_pool_perm
)
14123 int64_t pool_id
= in
->layout
.pool_id
;
14124 std::string pool_ns
= in
->layout
.pool_ns
;
14125 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
14128 auto it
= pool_perms
.find(perm_key
);
14129 if (it
== pool_perms
.end())
14131 if (it
->second
== POOL_CHECKING
) {
14132 // avoid concurrent checkings
14133 wait_on_list(waiting_for_pool_perm
);
14136 ceph_assert(have
& POOL_CHECKED
);
14142 if (in
->snapid
!= CEPH_NOSNAP
) {
14143 // pool permission check needs to write to the first object. But for snapshot,
14144 // head of the first object may have alread been deleted. To avoid creating
14145 // orphan object, skip the check for now.
14149 pool_perms
[perm_key
] = POOL_CHECKING
;
14152 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
14153 object_t oid
= oid_buf
;
14155 SnapContext nullsnapc
;
14157 C_SaferCond rd_cond
;
14158 ObjectOperation rd_op
;
14159 rd_op
.stat(NULL
, (ceph::real_time
*)nullptr, NULL
);
14161 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
14162 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
14164 C_SaferCond wr_cond
;
14165 ObjectOperation wr_op
;
14166 wr_op
.create(true);
14168 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
14169 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
14171 client_lock
.Unlock();
14172 int rd_ret
= rd_cond
.wait();
14173 int wr_ret
= wr_cond
.wait();
14174 client_lock
.Lock();
14176 bool errored
= false;
14178 if (rd_ret
== 0 || rd_ret
== -ENOENT
)
14180 else if (rd_ret
!= -EPERM
) {
14181 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14182 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14186 if (wr_ret
== 0 || wr_ret
== -EEXIST
)
14187 have
|= POOL_WRITE
;
14188 else if (wr_ret
!= -EPERM
) {
14189 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14190 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14195 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
14196 // Raise EIO because actual error code might be misleading for
14197 // userspace filesystem user.
14198 pool_perms
.erase(perm_key
);
14199 signal_cond_list(waiting_for_pool_perm
);
14203 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
14204 signal_cond_list(waiting_for_pool_perm
);
14207 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
14208 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14209 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
14212 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
14213 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14214 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
14221 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
14223 if (acl_type
== POSIX_ACL
) {
14224 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14225 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
14227 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
14233 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
14235 if (acl_type
== NO_ACL
)
14238 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
14242 if (acl_type
== POSIX_ACL
) {
14243 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14244 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
14245 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
14246 r
= posix_acl_access_chmod(acl
, mode
);
14249 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
14255 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
14259 int Client::_posix_acl_create(Inode
*dir
, mode_t
*mode
, bufferlist
& xattrs_bl
,
14260 const UserPerm
& perms
)
14262 if (acl_type
== NO_ACL
)
14265 if (S_ISLNK(*mode
))
14268 int r
= _getattr(dir
, CEPH_STAT_CAP_XATTR
, perms
, dir
->xattr_version
== 0);
14272 if (acl_type
== POSIX_ACL
) {
14273 if (dir
->xattrs
.count(ACL_EA_DEFAULT
)) {
14274 map
<string
, bufferptr
> xattrs
;
14276 const bufferptr
& default_acl
= dir
->xattrs
[ACL_EA_DEFAULT
];
14277 bufferptr
acl(default_acl
.c_str(), default_acl
.length());
14278 r
= posix_acl_inherit_mode(acl
, mode
);
14283 r
= posix_acl_equiv_mode(acl
.c_str(), acl
.length(), mode
);
14287 xattrs
[ACL_EA_ACCESS
] = acl
;
14290 if (S_ISDIR(*mode
))
14291 xattrs
[ACL_EA_DEFAULT
] = dir
->xattrs
[ACL_EA_DEFAULT
];
14295 encode(xattrs
, xattrs_bl
);
14298 *mode
&= ~umask_cb(callback_handle
);
14303 ldout(cct
, 10) << __func__
<< " dir ino " << dir
->ino
<< " result=" << r
<< dendl
;
14307 void Client::set_filer_flags(int flags
)
14309 std::lock_guard
l(client_lock
);
14310 ceph_assert(flags
== 0 ||
14311 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14312 objecter
->add_global_op_flags(flags
);
14315 void Client::clear_filer_flags(int flags
)
14317 std::lock_guard
l(client_lock
);
14318 ceph_assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14319 objecter
->clear_global_op_flag(flags
);
14322 // called before mount
14323 void Client::set_uuid(const std::string
& uuid
)
14325 std::lock_guard
l(client_lock
);
14326 assert(initialized
);
14327 assert(!uuid
.empty());
14329 metadata
["uuid"] = uuid
;
14333 // called before mount. 0 means infinite
14334 void Client::set_session_timeout(unsigned timeout
)
14336 std::lock_guard
l(client_lock
);
14337 assert(initialized
);
14339 metadata
["timeout"] = stringify(timeout
);
14342 // called before mount
14343 int Client::start_reclaim(const std::string
& uuid
, unsigned flags
,
14344 const std::string
& fs_name
)
14346 std::lock_guard
l(client_lock
);
14354 auto it
= metadata
.find("uuid");
14355 if (it
!= metadata
.end() && it
->second
== uuid
)
14359 int r
= subscribe_mdsmap(fs_name
);
14361 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
14365 if (metadata
.empty())
14366 populate_metadata("");
14368 while (mdsmap
->get_epoch() == 0)
14369 wait_on_list(waiting_for_mdsmap
);
14372 for (unsigned mds
= 0; mds
< mdsmap
->get_num_in_mds(); ) {
14373 if (!mdsmap
->is_up(mds
)) {
14374 ldout(cct
, 10) << "mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
14375 wait_on_list(waiting_for_mdsmap
);
14379 MetaSession
*session
;
14380 if (!have_open_session(mds
)) {
14381 session
= _get_or_open_mds_session(mds
);
14382 if (session
->state
!= MetaSession::STATE_OPENING
) {
14386 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
14387 wait_on_context_list(session
->waiting_for_open
);
14388 if (rejected_by_mds
.count(mds
))
14393 session
= &mds_sessions
.at(mds
);
14394 if (!session
->mds_features
.test(CEPHFS_FEATURE_RECLAIM_CLIENT
))
14395 return -EOPNOTSUPP
;
14397 if (session
->reclaim_state
== MetaSession::RECLAIM_NULL
||
14398 session
->reclaim_state
== MetaSession::RECLAIMING
) {
14399 session
->reclaim_state
= MetaSession::RECLAIMING
;
14400 auto m
= MClientReclaim::create(uuid
, flags
);
14401 session
->con
->send_message2(std::move(m
));
14402 wait_on_list(waiting_for_reclaim
);
14403 } else if (session
->reclaim_state
== MetaSession::RECLAIM_FAIL
) {
14404 return reclaim_errno
? : -ENOTRECOVERABLE
;
14410 // didn't find target session in any mds
14411 if (reclaim_target_addrs
.empty()) {
14412 if (flags
& CEPH_RECLAIM_RESET
)
14414 return -ENOTRECOVERABLE
;
14417 if (flags
& CEPH_RECLAIM_RESET
)
14420 // use blacklist to check if target session was killed
14421 // (config option mds_session_blacklist_on_evict needs to be true)
14423 if (!objecter
->wait_for_map(reclaim_osd_epoch
, &cond
)) {
14424 ldout(cct
, 10) << __func__
<< ": waiting for OSD epoch " << reclaim_osd_epoch
<< dendl
;
14425 client_lock
.Unlock();
14427 client_lock
.Lock();
14430 bool blacklisted
= objecter
->with_osdmap(
14431 [this](const OSDMap
&osd_map
) -> bool {
14432 return osd_map
.is_blacklisted(reclaim_target_addrs
);
14435 return -ENOTRECOVERABLE
;
14437 metadata
["reclaiming_uuid"] = uuid
;
14441 void Client::finish_reclaim()
14443 auto it
= metadata
.find("reclaiming_uuid");
14444 if (it
== metadata
.end()) {
14445 for (auto &p
: mds_sessions
)
14446 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
14450 for (auto &p
: mds_sessions
) {
14451 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
14452 auto m
= MClientReclaim::create("", MClientReclaim::FLAG_FINISH
);
14453 p
.second
.con
->send_message2(std::move(m
));
14456 metadata
["uuid"] = it
->second
;
14457 metadata
.erase(it
);
14460 void Client::handle_client_reclaim_reply(const MConstRef
<MClientReclaimReply
>& reply
)
14462 mds_rank_t from
= mds_rank_t(reply
->get_source().num());
14463 ldout(cct
, 10) << __func__
<< " " << *reply
<< " from mds." << from
<< dendl
;
14465 MetaSession
*session
= _get_mds_session(from
, reply
->get_connection().get());
14467 ldout(cct
, 10) << " discarding reclaim reply from sessionless mds." << from
<< dendl
;
14471 if (reply
->get_result() >= 0) {
14472 session
->reclaim_state
= MetaSession::RECLAIM_OK
;
14473 if (reply
->get_epoch() > reclaim_osd_epoch
)
14474 reclaim_osd_epoch
= reply
->get_epoch();
14475 if (!reply
->get_addrs().empty())
14476 reclaim_target_addrs
= reply
->get_addrs();
14478 session
->reclaim_state
= MetaSession::RECLAIM_FAIL
;
14479 reclaim_errno
= reply
->get_result();
14482 signal_cond_list(waiting_for_reclaim
);
14486 * This is included in cap release messages, to cause
14487 * the MDS to wait until this OSD map epoch. It is necessary
14488 * in corner cases where we cancel RADOS ops, so that
14489 * nobody else tries to do IO to the same objects in
14490 * the same epoch as the cancelled ops.
14492 void Client::set_cap_epoch_barrier(epoch_t e
)
14494 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
14495 cap_epoch_barrier
= e
;
14498 const char** Client::get_tracked_conf_keys() const
14500 static const char* keys
[] = {
14501 "client_cache_size",
14502 "client_cache_mid",
14504 "client_deleg_timeout",
14505 "client_deleg_break_on_open",
14511 void Client::handle_conf_change(const ConfigProxy
& conf
,
14512 const std::set
<std::string
> &changed
)
14514 std::lock_guard
lock(client_lock
);
14516 if (changed
.count("client_cache_mid")) {
14517 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
14519 if (changed
.count("client_acl_type")) {
14521 if (cct
->_conf
->client_acl_type
== "posix_acl")
14522 acl_type
= POSIX_ACL
;
14526 void intrusive_ptr_add_ref(Inode
*in
)
14531 void intrusive_ptr_release(Inode
*in
)
14533 in
->client
->put_inode(in
);
14536 mds_rank_t
Client::_get_random_up_mds() const
14538 ceph_assert(client_lock
.is_locked_by_me());
14540 std::set
<mds_rank_t
> up
;
14541 mdsmap
->get_up_mds_set(up
);
14544 return MDS_RANK_NONE
;
14545 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
14546 for (int n
= rand() % up
.size(); n
; n
--)
14552 StandaloneClient::StandaloneClient(Messenger
*m
, MonClient
*mc
)
14553 : Client(m
, mc
, new Objecter(m
->cct
, m
, mc
, NULL
, 0, 0))
14555 monclient
->set_messenger(m
);
14556 objecter
->set_client_incarnation(0);
14559 StandaloneClient::~StandaloneClient()
14562 objecter
= nullptr;
14565 int StandaloneClient::init()
14568 objectcacher
->start();
14571 client_lock
.Lock();
14572 ceph_assert(!is_initialized());
14574 messenger
->add_dispatcher_tail(objecter
);
14575 messenger
->add_dispatcher_tail(this);
14577 monclient
->set_want_keys(CEPH_ENTITY_TYPE_MDS
| CEPH_ENTITY_TYPE_OSD
);
14578 int r
= monclient
->init();
14580 // need to do cleanup because we're in an intermediate init state
14582 client_lock
.Unlock();
14583 objecter
->shutdown();
14584 objectcacher
->stop();
14585 monclient
->shutdown();
14590 client_lock
.Unlock();
14596 void StandaloneClient::shutdown()
14598 Client::shutdown();
14599 objecter
->shutdown();
14600 monclient
->shutdown();