1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
18 #include <sys/types.h>
23 #include <sys/param.h>
26 #include <sys/utsname.h>
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
36 #include <sys/xattr.h>
39 #if defined(__linux__)
40 #include <linux/falloc.h>
43 #include <sys/statvfs.h>
45 #include "common/config.h"
46 #include "common/version.h"
48 #include "mon/MonClient.h"
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
72 #include "common/Cond.h"
73 #include "common/perf_counters.h"
74 #include "common/admin_socket.h"
75 #include "common/errno.h"
76 #include "include/str_list.h"
78 #define dout_subsys ceph_subsys_client
80 #include "include/lru.h"
81 #include "include/compat.h"
82 #include "include/stringify.h"
87 #include "Delegation.h"
89 #include "ClientSnapRealm.h"
91 #include "MetaSession.h"
92 #include "MetaRequest.h"
93 #include "ObjecterWriteback.h"
94 #include "posix_acl.h"
96 #include "include/ceph_assert.h"
97 #include "include/stat.h"
99 #include "include/cephfs/ceph_ll_client.h"
101 #if HAVE_GETGROUPLIST
108 #define dout_prefix *_dout << "client." << whoami << " "
110 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
112 // FreeBSD fails to define this
116 // Darwin fails to define this
125 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
127 using namespace TOPNSPC::common
;
129 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
131 Client
*client
= static_cast<Client
*>(p
);
132 client
->flush_set_callback(oset
);
138 Client::CommandHook::CommandHook(Client
*client
) :
143 int Client::CommandHook::call(
144 std::string_view command
,
145 const cmdmap_t
& cmdmap
,
150 f
->open_object_section("result");
152 std::lock_guard l
{m_client
->client_lock
};
153 if (command
== "mds_requests")
154 m_client
->dump_mds_requests(f
);
155 else if (command
== "mds_sessions") {
156 bool cap_dump
= false;
157 cmd_getval(cmdmap
, "cap_dump", cap_dump
);
158 m_client
->dump_mds_sessions(f
, cap_dump
);
159 } else if (command
== "dump_cache")
160 m_client
->dump_cache(f
);
161 else if (command
== "kick_stale_sessions")
162 m_client
->_kick_stale_sessions();
163 else if (command
== "status")
164 m_client
->dump_status(f
);
166 ceph_abort_msg("bad command registered");
175 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
176 : inode(in
), offset(0), next_offset(2),
177 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
181 void Client::_reset_faked_inos()
184 free_faked_inos
.clear();
185 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
186 last_used_faked_ino
= 0;
187 last_used_faked_root
= 0;
188 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
191 void Client::_assign_faked_ino(Inode
*in
)
193 if (0 == last_used_faked_ino
)
194 last_used_faked_ino
= last_used_faked_ino
+ 2048; // start(1024)~2048 reserved for _assign_faked_root
195 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
196 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
197 last_used_faked_ino
= 2048;
198 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
200 ceph_assert(it
!= free_faked_inos
.end());
201 if (last_used_faked_ino
< it
.get_start()) {
202 ceph_assert(it
.get_len() > 0);
203 last_used_faked_ino
= it
.get_start();
205 ++last_used_faked_ino
;
206 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
208 in
->faked_ino
= last_used_faked_ino
;
209 free_faked_inos
.erase(in
->faked_ino
);
210 faked_ino_map
[in
->faked_ino
] = in
->vino();
/*
 * In the faked mode, if you export multiple subdirectories,
 * you will see that the inode numbers of the exported subdirectories
 * are the same. So we distinguish the mount point by reserving
 * the "fake ids" between "1024~2048" and combining the last
 * 10 bits (0x3ff) of the "root inodes".
 */
220 void Client::_assign_faked_root(Inode
*in
)
222 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
223 if (it
== free_faked_inos
.end() && last_used_faked_root
> 0) {
224 last_used_faked_root
= 0;
225 it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
227 assert(it
!= free_faked_inos
.end());
228 vinodeno_t inode_info
= in
->vino();
229 uint64_t inode_num
= (uint64_t)inode_info
.ino
;
230 ldout(cct
, 10) << "inode_num " << inode_num
<< "inode_num & 0x3ff=" << (inode_num
& 0x3ff)<< dendl
;
231 last_used_faked_root
= it
.get_start() + (inode_num
& 0x3ff); // 0x3ff mask and get_start will not exceed 2048
232 assert(it
.get_start() + it
.get_len() > last_used_faked_root
);
234 in
->faked_ino
= last_used_faked_root
;
235 free_faked_inos
.erase(in
->faked_ino
);
236 faked_ino_map
[in
->faked_ino
] = in
->vino();
239 void Client::_release_faked_ino(Inode
*in
)
241 free_faked_inos
.insert(in
->faked_ino
);
242 faked_ino_map
.erase(in
->faked_ino
);
245 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
250 else if (faked_ino_map
.count(ino
))
251 vino
= faked_ino_map
[ino
];
253 vino
= vinodeno_t(0, CEPH_NOSNAP
);
254 ldout(cct
, 10) << __func__
<< " " << ino
<< " -> " << vino
<< dendl
;
258 vinodeno_t
Client::map_faked_ino(ino_t ino
)
260 std::lock_guard
lock(client_lock
);
261 return _map_faked_ino(ino
);
266 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
267 : Dispatcher(m
->cct
),
268 timer(m
->cct
, client_lock
),
272 whoami(mc
->get_global_id()),
273 async_ino_invalidator(m
->cct
),
274 async_dentry_invalidator(m
->cct
),
275 interrupt_finisher(m
->cct
),
276 remount_finisher(m
->cct
),
277 async_ino_releasor(m
->cct
),
278 objecter_finisher(m
->cct
),
279 m_command_hook(this),
284 user_id
= cct
->_conf
->client_mount_uid
;
285 group_id
= cct
->_conf
->client_mount_gid
;
286 fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
287 "fuse_default_permissions");
289 if (cct
->_conf
->client_acl_type
== "posix_acl")
290 acl_type
= POSIX_ACL
;
292 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
295 free_fd_set
.insert(10, 1<<30);
297 mdsmap
.reset(new MDSMap
);
300 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
302 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
303 client_flush_set_callback
, // all commit callback
305 cct
->_conf
->client_oc_size
,
306 cct
->_conf
->client_oc_max_objects
,
307 cct
->_conf
->client_oc_max_dirty
,
308 cct
->_conf
->client_oc_target_dirty
,
309 cct
->_conf
->client_oc_max_dirty_age
,
316 ceph_assert(ceph_mutex_is_not_locked(client_lock
));
318 // It is necessary to hold client_lock, because any inode destruction
319 // may call into ObjectCacher, which asserts that it's lock (which is
320 // client_lock) is held.
321 std::lock_guard l
{client_lock
};
325 void Client::tear_down_cache()
328 for (ceph::unordered_map
<int, Fh
*>::iterator it
= fd_map
.begin();
332 ldout(cct
, 1) << __func__
<< " forcing close of fh " << it
->first
<< " ino " << fh
->inode
->ino
<< dendl
;
337 while (!opened_dirs
.empty()) {
338 dir_result_t
*dirp
= *opened_dirs
.begin();
339 ldout(cct
, 1) << __func__
<< " forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
348 ceph_assert(lru
.lru_get_size() == 0);
351 ceph_assert(inode_map
.size() <= 1 + root_parents
.size());
352 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
356 while (!root_parents
.empty())
357 root_parents
.erase(root_parents
.begin());
362 ceph_assert(inode_map
.empty());
365 inodeno_t
Client::get_root_ino()
367 std::lock_guard
l(client_lock
);
368 if (use_faked_inos())
369 return root
->faked_ino
;
374 Inode
*Client::get_root()
376 std::lock_guard
l(client_lock
);
384 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
387 in
->make_long_path(path
);
388 ldout(cct
, 1) << "dump_inode: "
389 << (disconnected
? "DISCONNECTED ":"")
390 << "inode " << in
->ino
392 << " ref " << in
->get_num_ref()
396 f
->open_object_section("inode");
397 f
->dump_stream("path") << path
;
399 f
->dump_int("disconnected", 1);
406 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
407 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
408 it
!= in
->dir
->dentries
.end();
410 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
412 f
->open_object_section("dentry");
416 if (it
->second
->inode
)
417 dump_inode(f
, it
->second
->inode
.get(), did
, false);
422 void Client::dump_cache(Formatter
*f
)
426 ldout(cct
, 1) << __func__
<< dendl
;
429 f
->open_array_section("cache");
432 dump_inode(f
, root
, did
, true);
434 // make a second pass to catch anything disconnected
435 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
436 it
!= inode_map
.end();
438 if (did
.count(it
->second
))
440 dump_inode(f
, it
->second
, did
, true);
447 void Client::dump_status(Formatter
*f
)
449 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
451 ldout(cct
, 1) << __func__
<< dendl
;
453 const epoch_t osd_epoch
454 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
457 f
->open_object_section("metadata");
458 for (const auto& kv
: metadata
)
459 f
->dump_string(kv
.first
.c_str(), kv
.second
);
462 f
->dump_int("dentry_count", lru
.lru_get_size());
463 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
464 f
->dump_int("id", get_nodeid().v
);
465 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
466 f
->dump_object("inst", inst
);
467 f
->dump_object("addr", inst
.addr
);
468 f
->dump_stream("inst_str") << inst
.name
<< " " << inst
.addr
.get_legacy_str();
469 f
->dump_string("addr_str", inst
.addr
.get_legacy_str());
470 f
->dump_int("inode_count", inode_map
.size());
471 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
472 f
->dump_int("osd_epoch", osd_epoch
);
473 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
474 f
->dump_bool("blacklisted", blacklisted
);
475 f
->dump_string("fs_name", mdsmap
->get_fs_name());
479 void Client::_pre_init()
483 objecter_finisher
.start();
484 filer
.reset(new Filer(objecter
, &objecter_finisher
));
485 objecter
->enable_blacklist_events();
487 objectcacher
->start();
494 std::lock_guard l
{client_lock
};
495 ceph_assert(!initialized
);
496 messenger
->add_dispatcher_tail(this);
502 void Client::_finish_init()
505 std::lock_guard l
{client_lock
};
507 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
508 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
509 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
510 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
511 plb
.add_time_avg(l_c_read
, "rdlat", "Latency of a file data read operation");
512 plb
.add_time_avg(l_c_fsync
, "fsync", "Latency of a file sync operation");
513 logger
.reset(plb
.create_perf_counters());
514 cct
->get_perfcounters_collection()->add(logger
.get());
517 cct
->_conf
.add_observer(this);
519 AdminSocket
* admin_socket
= cct
->get_admin_socket();
520 int ret
= admin_socket
->register_command("mds_requests",
522 "show in-progress mds requests");
524 lderr(cct
) << "error registering admin socket command: "
525 << cpp_strerror(-ret
) << dendl
;
527 ret
= admin_socket
->register_command("mds_sessions "
528 "name=cap_dump,type=CephBool,req=false",
530 "show mds session state");
532 lderr(cct
) << "error registering admin socket command: "
533 << cpp_strerror(-ret
) << dendl
;
535 ret
= admin_socket
->register_command("dump_cache",
537 "show in-memory metadata cache contents");
539 lderr(cct
) << "error registering admin socket command: "
540 << cpp_strerror(-ret
) << dendl
;
542 ret
= admin_socket
->register_command("kick_stale_sessions",
544 "kick sessions that were remote reset");
546 lderr(cct
) << "error registering admin socket command: "
547 << cpp_strerror(-ret
) << dendl
;
549 ret
= admin_socket
->register_command("status",
551 "show overall client status");
553 lderr(cct
) << "error registering admin socket command: "
554 << cpp_strerror(-ret
) << dendl
;
557 std::lock_guard l
{client_lock
};
561 void Client::shutdown()
563 ldout(cct
, 1) << __func__
<< dendl
;
565 // If we were not mounted, but were being used for sending
566 // MDS commands, we may have sessions that need closing.
568 std::lock_guard l
{client_lock
};
571 cct
->_conf
.remove_observer(this);
573 cct
->get_admin_socket()->unregister_commands(&m_command_hook
);
575 if (ino_invalidate_cb
) {
576 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
577 async_ino_invalidator
.wait_for_empty();
578 async_ino_invalidator
.stop();
581 if (dentry_invalidate_cb
) {
582 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
583 async_dentry_invalidator
.wait_for_empty();
584 async_dentry_invalidator
.stop();
587 if (switch_interrupt_cb
) {
588 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
589 interrupt_finisher
.wait_for_empty();
590 interrupt_finisher
.stop();
594 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
595 remount_finisher
.wait_for_empty();
596 remount_finisher
.stop();
599 if (ino_release_cb
) {
600 ldout(cct
, 10) << "shutdown stopping inode release finisher" << dendl
;
601 async_ino_releasor
.wait_for_empty();
602 async_ino_releasor
.stop();
605 objectcacher
->stop(); // outside of client_lock! this does a join.
607 std::lock_guard l
{client_lock
};
608 ceph_assert(initialized
);
612 objecter_finisher
.wait_for_empty();
613 objecter_finisher
.stop();
616 cct
->get_perfcounters_collection()->remove(logger
.get());
622 // ===================
623 // metadata cache stuff
625 void Client::trim_cache(bool trim_kernel_dcache
)
627 uint64_t max
= cct
->_conf
->client_cache_size
;
628 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
630 while (lru
.lru_get_size() != last
) {
631 last
= lru
.lru_get_size();
633 if (!unmounting
&& lru
.lru_get_size() <= max
) break;
636 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
643 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
644 _invalidate_kernel_dcache();
647 if (lru
.lru_get_size() == 0 && root
&& root
->get_num_ref() == 0 && inode_map
.size() == 1 + root_parents
.size()) {
648 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
652 while (!root_parents
.empty())
653 root_parents
.erase(root_parents
.begin());
659 void Client::trim_cache_for_reconnect(MetaSession
*s
)
661 mds_rank_t mds
= s
->mds_num
;
662 ldout(cct
, 20) << __func__
<< " mds." << mds
<< dendl
;
665 list
<Dentry
*> skipped
;
666 while (lru
.lru_get_size() > 0) {
667 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
671 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
672 dn
->dir
->parent_inode
->caps
.count(mds
)) {
676 skipped
.push_back(dn
);
679 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
680 lru
.lru_insert_mid(*p
);
682 ldout(cct
, 20) << __func__
<< " mds." << mds
683 << " trimmed " << trimmed
<< " dentries" << dendl
;
685 if (s
->caps
.size() > 0)
686 _invalidate_kernel_dcache();
689 void Client::trim_dentry(Dentry
*dn
)
691 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
693 << std::hex
<< dn
->dir
->parent_inode
->ino
<< std::dec
696 Inode
*diri
= dn
->dir
->parent_inode
;
697 clear_dir_complete_and_ordered(diri
, true);
699 unlink(dn
, false, false); // drop dir, drop dentry
703 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
704 uint64_t truncate_seq
, uint64_t truncate_size
)
706 uint64_t prior_size
= in
->size
;
708 if (truncate_seq
> in
->truncate_seq
||
709 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
710 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
712 in
->reported_size
= size
;
713 if (truncate_seq
!= in
->truncate_seq
) {
714 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
715 << truncate_seq
<< dendl
;
716 in
->truncate_seq
= truncate_seq
;
717 in
->oset
.truncate_seq
= truncate_seq
;
719 // truncate cached file data
720 if (prior_size
> size
) {
721 _invalidate_inode_cache(in
, truncate_size
, prior_size
- truncate_size
);
725 // truncate inline data
726 if (in
->inline_version
< CEPH_INLINE_NONE
) {
727 uint32_t len
= in
->inline_data
.length();
729 in
->inline_data
.splice(size
, len
- size
);
732 if (truncate_seq
>= in
->truncate_seq
&&
733 in
->truncate_size
!= truncate_size
) {
735 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
736 << truncate_size
<< dendl
;
737 in
->truncate_size
= truncate_size
;
738 in
->oset
.truncate_size
= truncate_size
;
740 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
745 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
746 utime_t ctime
, utime_t mtime
, utime_t atime
)
748 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
749 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
751 if (time_warp_seq
> in
->time_warp_seq
)
752 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
753 << " is higher than local time_warp_seq "
754 << in
->time_warp_seq
<< dendl
;
757 // be careful with size, mtime, atime
758 if (issued
& (CEPH_CAP_FILE_EXCL
|
760 CEPH_CAP_FILE_BUFFER
|
762 CEPH_CAP_XATTR_EXCL
)) {
763 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
764 if (ctime
> in
->ctime
)
766 if (time_warp_seq
> in
->time_warp_seq
) {
767 //the mds updated times, so take those!
770 in
->time_warp_seq
= time_warp_seq
;
771 } else if (time_warp_seq
== in
->time_warp_seq
) {
773 if (mtime
> in
->mtime
)
775 if (atime
> in
->atime
)
777 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
778 //ignore mds values as we have a higher seq
781 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
782 if (time_warp_seq
>= in
->time_warp_seq
) {
786 in
->time_warp_seq
= time_warp_seq
;
790 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
791 << time_warp_seq
<< " is lower than local time_warp_seq "
797 void Client::_fragmap_remove_non_leaves(Inode
*in
)
799 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
800 if (!in
->dirfragtree
.is_leaf(p
->first
))
801 in
->fragmap
.erase(p
++);
806 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
808 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
809 if (p
->second
== mds
)
810 in
->fragmap
.erase(p
++);
815 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
816 MetaSession
*session
,
817 const UserPerm
& request_perms
)
820 bool was_new
= false;
821 if (inode_map
.count(st
->vino
)) {
822 in
= inode_map
[st
->vino
];
823 ldout(cct
, 12) << __func__
<< " had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
825 in
= new Inode(this, st
->vino
, &st
->layout
);
826 inode_map
[st
->vino
] = in
;
828 if (use_faked_inos())
829 _assign_faked_ino(in
);
833 if (use_faked_inos())
834 _assign_faked_root(root
);
837 } else if (!mounted
) {
838 root_parents
[root_ancestor
] = in
;
843 in
->ino
= st
->vino
.ino
;
844 in
->snapid
= st
->vino
.snapid
;
845 in
->mode
= st
->mode
& S_IFMT
;
850 if (in
->is_symlink())
851 in
->symlink
= st
->symlink
;
853 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
854 bool new_version
= false;
855 if (in
->version
== 0 ||
856 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
857 (in
->version
& ~1) < st
->version
))
861 in
->caps_issued(&issued
);
862 issued
|= in
->caps_dirty();
863 int new_issued
= ~issued
& (int)st
->cap
.caps
;
865 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
866 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
870 in
->btime
= st
->btime
;
871 in
->snap_btime
= st
->snap_btime
;
874 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
875 !(issued
& CEPH_CAP_LINK_EXCL
)) {
876 in
->nlink
= st
->nlink
;
879 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
880 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
881 st
->ctime
, st
->mtime
, st
->atime
);
885 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
886 in
->layout
= st
->layout
;
887 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
891 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
892 in
->dirstat
= st
->dirstat
;
894 // dir_layout/rstat/quota are not tracked by capability, update them only if
895 // the inode stat is from auth mds
896 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
897 in
->dir_layout
= st
->dir_layout
;
898 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
899 in
->rstat
= st
->rstat
;
900 in
->quota
= st
->quota
;
901 in
->dir_pin
= st
->dir_pin
;
903 // move me if/when version reflects fragtree changes.
904 if (in
->dirfragtree
!= st
->dirfragtree
) {
905 in
->dirfragtree
= st
->dirfragtree
;
906 _fragmap_remove_non_leaves(in
);
910 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
911 st
->xattrbl
.length() &&
912 st
->xattr_version
> in
->xattr_version
) {
913 auto p
= st
->xattrbl
.cbegin();
914 decode(in
->xattrs
, p
);
915 in
->xattr_version
= st
->xattr_version
;
918 if (st
->inline_version
> in
->inline_version
) {
919 in
->inline_data
= st
->inline_data
;
920 in
->inline_version
= st
->inline_version
;
923 /* always take a newer change attr */
924 if (st
->change_attr
> in
->change_attr
)
925 in
->change_attr
= st
->change_attr
;
927 if (st
->version
> in
->version
)
928 in
->version
= st
->version
;
931 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
934 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
936 if (in
->snapid
== CEPH_NOSNAP
) {
937 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.wanted
,
938 st
->cap
.seq
, st
->cap
.mseq
, inodeno_t(st
->cap
.realm
),
939 st
->cap
.flags
, request_perms
);
940 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
941 in
->max_size
= st
->max_size
;
942 in
->rstat
= st
->rstat
;
945 // setting I_COMPLETE needs to happen after adding the cap
947 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
948 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
949 in
->dirstat
.nfiles
== 0 &&
950 in
->dirstat
.nsubdirs
== 0) {
951 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
952 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
954 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
955 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
956 in
->dir
->readdir_cache
.clear();
957 for (const auto& p
: in
->dir
->dentries
) {
958 unlink(p
.second
, true, true); // keep dir, keep dentry
960 if (in
->dir
->dentries
.empty())
965 in
->snap_caps
|= st
->cap
.caps
;
973 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
975 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
976 Inode
*in
, utime_t from
, MetaSession
*session
,
980 if (dir
->dentries
.count(dname
))
981 dn
= dir
->dentries
[dname
];
983 ldout(cct
, 12) << __func__
<< " '" << dname
<< "' vino " << in
->vino()
984 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
987 if (dn
&& dn
->inode
) {
988 if (dn
->inode
->vino() == in
->vino()) {
990 ldout(cct
, 12) << " had dentry " << dname
991 << " with correct vino " << dn
->inode
->vino()
994 ldout(cct
, 12) << " had dentry " << dname
995 << " with WRONG vino " << dn
->inode
->vino()
997 unlink(dn
, true, true); // keep dir, keep dentry
1001 if (!dn
|| !dn
->inode
) {
1002 InodeRef
tmp_ref(in
);
1004 if (old_dentry
->dir
!= dir
) {
1005 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
1006 clear_dir_complete_and_ordered(old_diri
, false);
1008 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
1010 Inode
*diri
= dir
->parent_inode
;
1011 clear_dir_complete_and_ordered(diri
, false);
1012 dn
= link(dir
, dname
, in
, dn
);
1015 update_dentry_lease(dn
, dlease
, from
, session
);
1019 void Client::update_dentry_lease(Dentry
*dn
, LeaseStat
*dlease
, utime_t from
, MetaSession
*session
)
1021 utime_t dttl
= from
;
1022 dttl
+= (float)dlease
->duration_ms
/ 1000.0;
1026 if (dlease
->mask
& CEPH_LEASE_VALID
) {
1027 if (dttl
> dn
->lease_ttl
) {
1028 ldout(cct
, 10) << "got dentry lease on " << dn
->name
1029 << " dur " << dlease
->duration_ms
<< "ms ttl " << dttl
<< dendl
;
1030 dn
->lease_ttl
= dttl
;
1031 dn
->lease_mds
= session
->mds_num
;
1032 dn
->lease_seq
= dlease
->seq
;
1033 dn
->lease_gen
= session
->cap_gen
;
1036 dn
->cap_shared_gen
= dn
->dir
->parent_inode
->shared_gen
;
1037 if (dlease
->mask
& CEPH_LEASE_PRIMARY_LINK
)
1043 * update MDS location cache for a single inode
1045 void Client::update_dir_dist(Inode
*in
, DirStat
*dst
)
1048 ldout(cct
, 20) << "got dirfrag map for " << in
->ino
<< " frag " << dst
->frag
<< " to mds " << dst
->auth
<< dendl
;
1049 if (dst
->auth
>= 0) {
1050 in
->fragmap
[dst
->frag
] = dst
->auth
;
1052 in
->fragmap
.erase(dst
->frag
);
1054 if (!in
->dirfragtree
.is_leaf(dst
->frag
)) {
1055 in
->dirfragtree
.force_to_leaf(cct
, dst
->frag
);
1056 _fragmap_remove_non_leaves(in
);
1060 in
->dir_replicated
= !dst
->dist
.empty(); // FIXME that's just one frag!
1063 void Client::clear_dir_complete_and_ordered(Inode
*diri
, bool complete
)
1066 diri
->dir_release_count
++;
1068 diri
->dir_ordered_count
++;
1069 if (diri
->flags
& I_COMPLETE
) {
1071 ldout(cct
, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
1072 diri
->flags
&= ~(I_COMPLETE
| I_DIR_ORDERED
);
1074 if (diri
->flags
& I_DIR_ORDERED
) {
1075 ldout(cct
, 10) << " clearing I_DIR_ORDERED on " << *diri
<< dendl
;
1076 diri
->flags
&= ~I_DIR_ORDERED
;
1080 diri
->dir
->readdir_cache
.clear();
1085 * insert results from readdir or lssnap into the metadata cache.
1087 void Client::insert_readdir_results(MetaRequest
*request
, MetaSession
*session
, Inode
*diri
) {
1089 auto& reply
= request
->reply
;
1090 ConnectionRef con
= request
->reply
->get_connection();
1092 if(session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1093 features
= (uint64_t)-1;
1096 features
= con
->get_features();
1099 dir_result_t
*dirp
= request
->dirp
;
1102 // the extra buffer list is only set for readdir and lssnap replies
1103 auto p
= reply
->get_extra_bl().cbegin();
1106 if (request
->head
.op
== CEPH_MDS_OP_LSSNAP
) {
1108 diri
= open_snapdir(diri
);
1111 // only open dir if we're actually adding stuff to it!
1112 Dir
*dir
= diri
->open_dir();
1116 DirStat
dst(p
, features
);
1122 bool end
= ((unsigned)flags
& CEPH_READDIR_FRAG_END
);
1123 bool hash_order
= ((unsigned)flags
& CEPH_READDIR_HASH_ORDER
);
1125 frag_t fg
= (unsigned)request
->head
.args
.readdir
.frag
;
1126 unsigned readdir_offset
= dirp
->next_offset
;
1127 string readdir_start
= dirp
->last_name
;
1128 ceph_assert(!readdir_start
.empty() || readdir_offset
== 2);
1130 unsigned last_hash
= 0;
1132 if (!readdir_start
.empty()) {
1133 last_hash
= ceph_frag_value(diri
->hash_dentry_name(readdir_start
));
1134 } else if (flags
& CEPH_READDIR_OFFSET_HASH
) {
1135 /* mds understands offset_hash */
1136 last_hash
= (unsigned)request
->head
.args
.readdir
.offset_hash
;
1140 if (fg
!= dst
.frag
) {
1141 ldout(cct
, 10) << "insert_trace got new frag " << fg
<< " -> " << dst
.frag
<< dendl
;
1145 readdir_start
.clear();
1146 dirp
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
, false);
1150 ldout(cct
, 10) << __func__
<< " " << numdn
<< " readdir items, end=" << end
1151 << ", hash_order=" << hash_order
1152 << ", readdir_start " << readdir_start
1153 << ", last_hash " << last_hash
1154 << ", next_offset " << readdir_offset
<< dendl
;
1156 if (diri
->snapid
!= CEPH_SNAPDIR
&&
1157 fg
.is_leftmost() && readdir_offset
== 2 &&
1158 !(hash_order
&& last_hash
)) {
1159 dirp
->release_count
= diri
->dir_release_count
;
1160 dirp
->ordered_count
= diri
->dir_ordered_count
;
1161 dirp
->start_shared_gen
= diri
->shared_gen
;
1162 dirp
->cache_index
= 0;
1165 dirp
->buffer_frag
= fg
;
1167 _readdir_drop_dirp_buffer(dirp
);
1168 dirp
->buffer
.reserve(numdn
);
1172 for (unsigned i
=0; i
<numdn
; i
++) {
1174 dlease
.decode(p
, features
);
1175 InodeStat
ist(p
, features
);
1177 ldout(cct
, 15) << "" << i
<< ": '" << dname
<< "'" << dendl
;
1179 Inode
*in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1182 if (diri
->dir
->dentries
.count(dname
)) {
1183 Dentry
*olddn
= diri
->dir
->dentries
[dname
];
1184 if (olddn
->inode
!= in
) {
1185 // replace incorrect dentry
1186 unlink(olddn
, true, true); // keep dir, dentry
1187 dn
= link(dir
, dname
, in
, olddn
);
1188 ceph_assert(dn
== olddn
);
1196 dn
= link(dir
, dname
, in
, NULL
);
1199 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1201 unsigned hash
= ceph_frag_value(diri
->hash_dentry_name(dname
));
1202 if (hash
!= last_hash
)
1205 dn
->offset
= dir_result_t::make_fpos(hash
, readdir_offset
++, true);
1207 dn
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
++, false);
1209 // add to readdir cache
1210 if (dirp
->release_count
== diri
->dir_release_count
&&
1211 dirp
->ordered_count
== diri
->dir_ordered_count
&&
1212 dirp
->start_shared_gen
== diri
->shared_gen
) {
1213 if (dirp
->cache_index
== dir
->readdir_cache
.size()) {
1215 ceph_assert(!dirp
->inode
->is_complete_and_ordered());
1216 dir
->readdir_cache
.reserve(dirp
->cache_index
+ numdn
);
1218 dir
->readdir_cache
.push_back(dn
);
1219 } else if (dirp
->cache_index
< dir
->readdir_cache
.size()) {
1220 if (dirp
->inode
->is_complete_and_ordered())
1221 ceph_assert(dir
->readdir_cache
[dirp
->cache_index
] == dn
);
1223 dir
->readdir_cache
[dirp
->cache_index
] = dn
;
1225 ceph_abort_msg("unexpected readdir buffer idx");
1227 dirp
->cache_index
++;
1229 // add to cached result list
1230 dirp
->buffer
.push_back(dir_result_t::dentry(dn
->offset
, dname
, in
));
1231 ldout(cct
, 15) << __func__
<< " " << hex
<< dn
->offset
<< dec
<< ": '" << dname
<< "' -> " << in
->ino
<< dendl
;
1235 dirp
->last_name
= dname
;
1237 dirp
->next_offset
= 2;
1239 dirp
->next_offset
= readdir_offset
;
1241 if (dir
->is_empty())
1248 * insert a trace from a MDS reply into the cache.
1250 Inode
* Client::insert_trace(MetaRequest
*request
, MetaSession
*session
)
1252 auto& reply
= request
->reply
;
1253 int op
= request
->get_op();
1255 ldout(cct
, 10) << "insert_trace from " << request
->sent_stamp
<< " mds." << session
->mds_num
1256 << " is_target=" << (int)reply
->head
.is_target
1257 << " is_dentry=" << (int)reply
->head
.is_dentry
1260 auto p
= reply
->get_trace_bl().cbegin();
1261 if (request
->got_unsafe
) {
1262 ldout(cct
, 10) << "insert_trace -- already got unsafe; ignoring" << dendl
;
1263 ceph_assert(p
.end());
1268 ldout(cct
, 10) << "insert_trace -- no trace" << dendl
;
1270 Dentry
*d
= request
->dentry();
1272 Inode
*diri
= d
->dir
->parent_inode
;
1273 clear_dir_complete_and_ordered(diri
, true);
1276 if (d
&& reply
->get_result() == 0) {
1277 if (op
== CEPH_MDS_OP_RENAME
) {
1279 Dentry
*od
= request
->old_dentry();
1280 ldout(cct
, 10) << " unlinking rename src dn " << od
<< " for traceless reply" << dendl
;
1282 unlink(od
, true, true); // keep dir, dentry
1283 } else if (op
== CEPH_MDS_OP_RMDIR
||
1284 op
== CEPH_MDS_OP_UNLINK
) {
1286 ldout(cct
, 10) << " unlinking unlink/rmdir dn " << d
<< " for traceless reply" << dendl
;
1287 unlink(d
, true, true); // keep dir, dentry
1293 ConnectionRef con
= request
->reply
->get_connection();
1295 if (session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1296 features
= (uint64_t)-1;
1299 features
= con
->get_features();
1301 ldout(cct
, 10) << " features 0x" << hex
<< features
<< dec
<< dendl
;
1304 SnapRealm
*realm
= NULL
;
1305 if (reply
->snapbl
.length())
1306 update_snap_trace(reply
->snapbl
, &realm
);
1308 ldout(cct
, 10) << " hrm "
1309 << " is_target=" << (int)reply
->head
.is_target
1310 << " is_dentry=" << (int)reply
->head
.is_dentry
1319 if (reply
->head
.is_dentry
) {
1320 dirst
.decode(p
, features
);
1321 dst
.decode(p
, features
);
1323 dlease
.decode(p
, features
);
1327 if (reply
->head
.is_target
) {
1328 ist
.decode(p
, features
);
1329 if (cct
->_conf
->client_debug_getattr_caps
) {
1330 unsigned wanted
= 0;
1331 if (op
== CEPH_MDS_OP_GETATTR
|| op
== CEPH_MDS_OP_LOOKUP
)
1332 wanted
= request
->head
.args
.getattr
.mask
;
1333 else if (op
== CEPH_MDS_OP_OPEN
|| op
== CEPH_MDS_OP_CREATE
)
1334 wanted
= request
->head
.args
.open
.mask
;
1336 if ((wanted
& CEPH_CAP_XATTR_SHARED
) &&
1337 !(ist
.xattr_version
> 0 && ist
.xattrbl
.length() > 0))
1338 ceph_abort_msg("MDS reply does not contain xattrs");
1341 in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1346 if (reply
->head
.is_dentry
) {
1347 diri
= add_update_inode(&dirst
, request
->sent_stamp
, session
,
1349 update_dir_dist(diri
, &dst
); // dir stat info is attached to ..
1352 Dir
*dir
= diri
->open_dir();
1353 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
,
1354 (op
== CEPH_MDS_OP_RENAME
) ? request
->old_dentry() : NULL
);
1357 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1358 dn
= diri
->dir
->dentries
[dname
];
1360 clear_dir_complete_and_ordered(diri
, false);
1361 unlink(dn
, true, true); // keep dir, dentry
1364 if (dlease
.duration_ms
> 0) {
1366 Dir
*dir
= diri
->open_dir();
1367 dn
= link(dir
, dname
, NULL
, NULL
);
1369 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1372 } else if (op
== CEPH_MDS_OP_LOOKUPSNAP
||
1373 op
== CEPH_MDS_OP_MKSNAP
) {
1374 ldout(cct
, 10) << " faking snap lookup weirdness" << dendl
;
1375 // fake it for snap lookup
1376 vinodeno_t vino
= ist
.vino
;
1377 vino
.snapid
= CEPH_SNAPDIR
;
1378 ceph_assert(inode_map
.count(vino
));
1379 diri
= inode_map
[vino
];
1381 string dname
= request
->path
.last_dentry();
1384 dlease
.duration_ms
= 0;
1387 Dir
*dir
= diri
->open_dir();
1388 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
);
1390 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1391 Dentry
*dn
= diri
->dir
->dentries
[dname
];
1393 unlink(dn
, true, true); // keep dir, dentry
1399 if (op
== CEPH_MDS_OP_READDIR
||
1400 op
== CEPH_MDS_OP_LSSNAP
) {
1401 insert_readdir_results(request
, session
, in
);
1402 } else if (op
== CEPH_MDS_OP_LOOKUPNAME
) {
1403 // hack: return parent inode instead
1407 if (request
->dentry() == NULL
&& in
!= request
->inode()) {
1408 // pin the target inode if its parent dentry is not pinned
1409 request
->set_other_inode(in
);
1414 put_snap_realm(realm
);
1416 request
->target
= in
;
1422 mds_rank_t
Client::choose_target_mds(MetaRequest
*req
, Inode
** phash_diri
)
1424 mds_rank_t mds
= MDS_RANK_NONE
;
1426 bool is_hash
= false;
1431 if (req
->resend_mds
>= 0) {
1432 mds
= req
->resend_mds
;
1433 req
->resend_mds
= -1;
1434 ldout(cct
, 10) << __func__
<< " resend_mds specified as mds." << mds
<< dendl
;
1438 if (cct
->_conf
->client_use_random_mds
)
1444 ldout(cct
, 20) << __func__
<< " starting with req->inode " << *in
<< dendl
;
1445 if (req
->path
.depth()) {
1446 hash
= in
->hash_dentry_name(req
->path
[0]);
1447 ldout(cct
, 20) << __func__
<< " inode dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1448 << " on " << req
->path
[0]
1449 << " => " << hash
<< dendl
;
1454 in
= de
->inode
.get();
1455 ldout(cct
, 20) << __func__
<< " starting with req->dentry inode " << *in
<< dendl
;
1457 in
= de
->dir
->parent_inode
;
1458 hash
= in
->hash_dentry_name(de
->name
);
1459 ldout(cct
, 20) << __func__
<< " dentry dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1460 << " on " << de
->name
1461 << " => " << hash
<< dendl
;
1466 if (in
->snapid
!= CEPH_NOSNAP
) {
1467 ldout(cct
, 10) << __func__
<< " " << *in
<< " is snapped, using nonsnap parent" << dendl
;
1468 while (in
->snapid
!= CEPH_NOSNAP
) {
1469 if (in
->snapid
== CEPH_SNAPDIR
)
1470 in
= in
->snapdir_parent
.get();
1471 else if (!in
->dentries
.empty())
1472 /* In most cases there will only be one dentry, so getting it
1473 * will be the correct action. If there are multiple hard links,
1474 * I think the MDS should be able to redirect as needed*/
1475 in
= in
->get_first_parent()->dir
->parent_inode
;
1477 ldout(cct
, 10) << "got unlinked inode, can't look at parent" << dendl
;
1484 ldout(cct
, 20) << __func__
<< " " << *in
<< " is_hash=" << is_hash
1485 << " hash=" << hash
<< dendl
;
1487 if (is_hash
&& S_ISDIR(in
->mode
) && !in
->fragmap
.empty()) {
1488 frag_t fg
= in
->dirfragtree
[hash
];
1489 if (in
->fragmap
.count(fg
)) {
1490 mds
= in
->fragmap
[fg
];
1493 } else if (in
->auth_cap
) {
1494 mds
= in
->auth_cap
->session
->mds_num
;
1497 ldout(cct
, 10) << __func__
<< " from dirfragtree hash" << dendl
;
1502 if (in
->auth_cap
&& req
->auth_is_best()) {
1503 mds
= in
->auth_cap
->session
->mds_num
;
1504 } else if (!in
->caps
.empty()) {
1505 mds
= in
->caps
.begin()->second
.session
->mds_num
;
1509 ldout(cct
, 10) << __func__
<< " from caps on inode " << *in
<< dendl
;
1516 mds
= _get_random_up_mds();
1517 ldout(cct
, 10) << "did not get mds through better means, so chose random mds " << mds
<< dendl
;
1521 ldout(cct
, 20) << "mds is " << mds
<< dendl
;
1526 void Client::connect_mds_targets(mds_rank_t mds
)
1528 ldout(cct
, 10) << __func__
<< " for mds." << mds
<< dendl
;
1529 ceph_assert(mds_sessions
.count(mds
));
1530 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1531 for (set
<mds_rank_t
>::const_iterator q
= info
.export_targets
.begin();
1532 q
!= info
.export_targets
.end();
1534 if (mds_sessions
.count(*q
) == 0 &&
1535 mdsmap
->is_clientreplay_or_active_or_stopping(*q
)) {
1536 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1537 << " export target mds." << *q
<< dendl
;
1538 _open_mds_session(*q
);
1543 void Client::dump_mds_sessions(Formatter
*f
, bool cap_dump
)
1545 f
->dump_int("id", get_nodeid().v
);
1546 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
1547 f
->dump_object("inst", inst
);
1548 f
->dump_stream("inst_str") << inst
;
1549 f
->dump_stream("addr_str") << inst
.addr
;
1550 f
->open_array_section("sessions");
1551 for (const auto &p
: mds_sessions
) {
1552 f
->open_object_section("session");
1553 p
.second
.dump(f
, cap_dump
);
1557 f
->dump_int("mdsmap_epoch", mdsmap
->get_epoch());
1559 void Client::dump_mds_requests(Formatter
*f
)
1561 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
1562 p
!= mds_requests
.end();
1564 f
->open_object_section("request");
1570 int Client::verify_reply_trace(int r
, MetaSession
*session
,
1571 MetaRequest
*request
, const MConstRef
<MClientReply
>& reply
,
1572 InodeRef
*ptarget
, bool *pcreated
,
1573 const UserPerm
& perms
)
1575 // check whether this request actually did the create, and set created flag
1576 bufferlist extra_bl
;
1577 inodeno_t created_ino
;
1578 bool got_created_ino
= false;
1579 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
;
1581 extra_bl
= reply
->get_extra_bl();
1582 if (extra_bl
.length() >= 8) {
1583 if (session
->mds_features
.test(CEPHFS_FEATURE_DELEG_INO
)) {
1584 struct openc_response_t ocres
;
1586 decode(ocres
, extra_bl
);
1587 created_ino
= ocres
.created_ino
;
1589 * The userland cephfs client doesn't have a way to do an async create
1590 * (yet), so just discard delegated_inos for now. Eventually we should
1591 * store them and use them in create calls, even if they are synchronous,
1592 * if only for testing purposes.
1594 ldout(cct
, 10) << "delegated_inos: " << ocres
.delegated_inos
<< dendl
;
1596 // u64 containing number of created ino
1597 decode(created_ino
, extra_bl
);
1599 ldout(cct
, 10) << "make_request created ino " << created_ino
<< dendl
;
1600 got_created_ino
= true;
1604 *pcreated
= got_created_ino
;
1606 if (request
->target
) {
1607 *ptarget
= request
->target
;
1608 ldout(cct
, 20) << "make_request target is " << *ptarget
->get() << dendl
;
1610 if (got_created_ino
&& (p
= inode_map
.find(vinodeno_t(created_ino
, CEPH_NOSNAP
))) != inode_map
.end()) {
1611 (*ptarget
) = p
->second
;
1612 ldout(cct
, 20) << "make_request created, target is " << *ptarget
->get() << dendl
;
1614 // we got a traceless reply, and need to look up what we just
1615 // created. for now, do this by name. someday, do this by the
1616 // ino... which we know! FIXME.
1618 Dentry
*d
= request
->dentry();
1621 ldout(cct
, 10) << "make_request got traceless reply, looking up #"
1622 << d
->dir
->parent_inode
->ino
<< "/" << d
->name
1623 << " got_ino " << got_created_ino
1624 << " ino " << created_ino
1626 r
= _do_lookup(d
->dir
->parent_inode
, d
->name
, request
->regetattr_mask
,
1629 // if the dentry is not linked, just do our best. see #5021.
1630 ceph_abort_msg("how did this happen? i want logs!");
1633 Inode
*in
= request
->inode();
1634 ldout(cct
, 10) << "make_request got traceless reply, forcing getattr on #"
1635 << in
->ino
<< dendl
;
1636 r
= _getattr(in
, request
->regetattr_mask
, perms
, true);
1640 // verify ino returned in reply and trace_dist are the same
1641 if (got_created_ino
&&
1642 created_ino
.val
!= target
->ino
.val
) {
1643 ldout(cct
, 5) << "create got ino " << created_ino
<< " but then failed on lookup; EINTR?" << dendl
;
1647 ptarget
->swap(target
);
1659 * Blocking helper to make an MDS request.
1661 * If the ptarget flag is set, behavior changes slightly: the caller
1662 * expects to get a pointer to the inode we are creating or operating
1663 * on. As a result, we will follow up any traceless mutation reply
1664 * with a getattr or lookup to transparently handle a traceless reply
1665 * from the MDS (as when the MDS restarts and the client has to replay
1668 * @param request the MetaRequest to execute
1669 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1670 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1671 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1672 * @param use_mds [optional] prefer a specific mds (-1 for default)
1673 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1675 int Client::make_request(MetaRequest
*request
,
1676 const UserPerm
& perms
,
1677 InodeRef
*ptarget
, bool *pcreated
,
1683 // assign a unique tid
1684 ceph_tid_t tid
= ++last_tid
;
1685 request
->set_tid(tid
);
1688 request
->op_stamp
= ceph_clock_now();
1691 mds_requests
[tid
] = request
->get();
1692 if (oldest_tid
== 0 && request
->get_op() != CEPH_MDS_OP_SETFILELOCK
)
1695 request
->set_caller_perms(perms
);
1697 if (cct
->_conf
->client_inject_fixed_oldest_tid
) {
1698 ldout(cct
, 20) << __func__
<< " injecting fixed oldest_client_tid(1)" << dendl
;
1699 request
->set_oldest_client_tid(1);
1701 request
->set_oldest_client_tid(oldest_tid
);
1706 request
->resend_mds
= use_mds
;
1708 MetaSession
*session
= NULL
;
1710 if (request
->aborted())
1714 request
->abort(-EBLACKLISTED
);
1719 ceph::condition_variable caller_cond
;
1720 request
->caller_cond
= &caller_cond
;
1723 Inode
*hash_diri
= NULL
;
1724 mds_rank_t mds
= choose_target_mds(request
, &hash_diri
);
1725 int mds_state
= (mds
== MDS_RANK_NONE
) ? MDSMap::STATE_NULL
: mdsmap
->get_state(mds
);
1726 if (mds_state
!= MDSMap::STATE_ACTIVE
&& mds_state
!= MDSMap::STATE_STOPPING
) {
1727 if (mds_state
== MDSMap::STATE_NULL
&& mds
>= mdsmap
->get_max_mds()) {
1729 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, remove it from fragmap" << dendl
;
1730 _fragmap_remove_stopped_mds(hash_diri
, mds
);
1732 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, trying a random mds" << dendl
;
1733 request
->resend_mds
= _get_random_up_mds();
1736 ldout(cct
, 10) << " target mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
1737 wait_on_list(waiting_for_mdsmap
);
1743 if (!have_open_session(mds
)) {
1744 session
= _get_or_open_mds_session(mds
);
1745 if (session
->state
== MetaSession::STATE_REJECTED
) {
1746 request
->abort(-EPERM
);
1750 if (session
->state
== MetaSession::STATE_OPENING
) {
1751 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
1752 wait_on_context_list(session
->waiting_for_open
);
1756 if (!have_open_session(mds
))
1759 session
= &mds_sessions
.at(mds
);
1763 send_request(request
, session
);
1766 ldout(cct
, 20) << "awaiting reply|forward|kick on " << &caller_cond
<< dendl
;
1767 request
->kick
= false;
1768 std::unique_lock l
{client_lock
, std::adopt_lock
};
1769 caller_cond
.wait(l
, [request
] {
1770 return (request
->reply
|| // reply
1771 request
->resend_mds
>= 0 || // forward
1775 request
->caller_cond
= nullptr;
1777 // did we get a reply?
1782 if (!request
->reply
) {
1783 ceph_assert(request
->aborted());
1784 ceph_assert(!request
->got_unsafe
);
1785 r
= request
->get_abort_code();
1786 request
->item
.remove_myself();
1787 unregister_request(request
);
1788 put_request(request
);
1793 auto reply
= std::move(request
->reply
);
1794 r
= reply
->get_result();
1796 request
->success
= true;
1798 // kick dispatcher (we've got it!)
1799 ceph_assert(request
->dispatch_cond
);
1800 request
->dispatch_cond
->notify_all();
1801 ldout(cct
, 20) << "sendrecv kickback on tid " << tid
<< " " << request
->dispatch_cond
<< dendl
;
1802 request
->dispatch_cond
= 0;
1804 if (r
>= 0 && ptarget
)
1805 r
= verify_reply_trace(r
, session
, request
, reply
, ptarget
, pcreated
, perms
);
1808 *pdirbl
= reply
->get_extra_bl();
1811 utime_t lat
= ceph_clock_now();
1812 lat
-= request
->sent_stamp
;
1813 ldout(cct
, 20) << "lat " << lat
<< dendl
;
1814 logger
->tinc(l_c_lat
, lat
);
1815 logger
->tinc(l_c_reply
, lat
);
1817 put_request(request
);
1821 void Client::unregister_request(MetaRequest
*req
)
1823 mds_requests
.erase(req
->tid
);
1824 if (req
->tid
== oldest_tid
) {
1825 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
1827 if (p
== mds_requests
.end()) {
1831 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
1832 oldest_tid
= p
->first
;
1841 void Client::put_request(MetaRequest
*request
)
1843 if (request
->_put()) {
1845 if (request
->success
)
1846 op
= request
->get_op();
1848 request
->take_other_inode(&other_in
);
1852 (op
== CEPH_MDS_OP_RMDIR
||
1853 op
== CEPH_MDS_OP_RENAME
||
1854 op
== CEPH_MDS_OP_RMSNAP
)) {
1855 _try_to_trim_inode(other_in
.get(), false);
1860 int Client::encode_inode_release(Inode
*in
, MetaRequest
*req
,
1861 mds_rank_t mds
, int drop
,
1862 int unless
, int force
)
1864 ldout(cct
, 20) << __func__
<< " enter(in:" << *in
<< ", req:" << req
1865 << " mds:" << mds
<< ", drop:" << drop
<< ", unless:" << unless
1866 << ", force:" << force
<< ")" << dendl
;
1868 auto it
= in
->caps
.find(mds
);
1869 if (it
!= in
->caps
.end()) {
1870 Cap
&cap
= it
->second
;
1871 drop
&= ~(in
->dirty_caps
| get_caps_used(in
));
1872 if ((drop
& cap
.issued
) &&
1873 !(unless
& cap
.issued
)) {
1874 ldout(cct
, 25) << "dropping caps " << ccap_string(drop
) << dendl
;
1875 cap
.issued
&= ~drop
;
1876 cap
.implemented
&= ~drop
;
1882 cap
.wanted
= in
->caps_wanted();
1883 if (&cap
== in
->auth_cap
&&
1884 !(cap
.wanted
& CEPH_CAP_ANY_FILE_WR
)) {
1885 in
->requested_max_size
= 0;
1886 ldout(cct
, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl
;
1888 ceph_mds_request_release rel
;
1890 rel
.cap_id
= cap
.cap_id
;
1892 rel
.issue_seq
= cap
.issue_seq
;
1893 rel
.mseq
= cap
.mseq
;
1894 rel
.caps
= cap
.implemented
;
1895 rel
.wanted
= cap
.wanted
;
1898 req
->cap_releases
.push_back(MClientRequest::Release(rel
,""));
1901 ldout(cct
, 25) << __func__
<< " exit(in:" << *in
<< ") released:"
1902 << released
<< dendl
;
1906 void Client::encode_dentry_release(Dentry
*dn
, MetaRequest
*req
,
1907 mds_rank_t mds
, int drop
, int unless
)
1909 ldout(cct
, 20) << __func__
<< " enter(dn:"
1910 << dn
<< ")" << dendl
;
1913 released
= encode_inode_release(dn
->dir
->parent_inode
, req
,
1914 mds
, drop
, unless
, 1);
1915 if (released
&& dn
->lease_mds
== mds
) {
1916 ldout(cct
, 25) << "preemptively releasing dn to mds" << dendl
;
1917 auto& rel
= req
->cap_releases
.back();
1918 rel
.item
.dname_len
= dn
->name
.length();
1919 rel
.item
.dname_seq
= dn
->lease_seq
;
1920 rel
.dname
= dn
->name
;
1923 ldout(cct
, 25) << __func__
<< " exit(dn:"
1924 << dn
<< ")" << dendl
;
1929 * This requires the MClientRequest *request member to be set.
1930 * It will error out horribly without one.
1931 * Additionally, if you set any *drop member, you'd better have
1932 * set the corresponding dentry!
1934 void Client::encode_cap_releases(MetaRequest
*req
, mds_rank_t mds
)
1936 ldout(cct
, 20) << __func__
<< " enter (req: "
1937 << req
<< ", mds: " << mds
<< ")" << dendl
;
1938 if (req
->inode_drop
&& req
->inode())
1939 encode_inode_release(req
->inode(), req
,
1940 mds
, req
->inode_drop
,
1943 if (req
->old_inode_drop
&& req
->old_inode())
1944 encode_inode_release(req
->old_inode(), req
,
1945 mds
, req
->old_inode_drop
,
1946 req
->old_inode_unless
);
1947 if (req
->other_inode_drop
&& req
->other_inode())
1948 encode_inode_release(req
->other_inode(), req
,
1949 mds
, req
->other_inode_drop
,
1950 req
->other_inode_unless
);
1952 if (req
->dentry_drop
&& req
->dentry())
1953 encode_dentry_release(req
->dentry(), req
,
1954 mds
, req
->dentry_drop
,
1955 req
->dentry_unless
);
1957 if (req
->old_dentry_drop
&& req
->old_dentry())
1958 encode_dentry_release(req
->old_dentry(), req
,
1959 mds
, req
->old_dentry_drop
,
1960 req
->old_dentry_unless
);
1961 ldout(cct
, 25) << __func__
<< " exit (req: "
1962 << req
<< ", mds " << mds
<<dendl
;
1965 bool Client::have_open_session(mds_rank_t mds
)
1967 const auto &it
= mds_sessions
.find(mds
);
1968 return it
!= mds_sessions
.end() &&
1969 (it
->second
.state
== MetaSession::STATE_OPEN
||
1970 it
->second
.state
== MetaSession::STATE_STALE
);
1973 MetaSession
*Client::_get_mds_session(mds_rank_t mds
, Connection
*con
)
1975 const auto &it
= mds_sessions
.find(mds
);
1976 if (it
== mds_sessions
.end() || it
->second
.con
!= con
) {
1983 MetaSession
*Client::_get_or_open_mds_session(mds_rank_t mds
)
1985 auto it
= mds_sessions
.find(mds
);
1986 return it
== mds_sessions
.end() ? _open_mds_session(mds
) : &it
->second
;
1990 * Populate a map of strings with client-identifying metadata,
1991 * such as the hostname. Call this once at initialization.
1993 void Client::populate_metadata(const std::string
&mount_root
)
1999 metadata
["hostname"] = u
.nodename
;
2000 ldout(cct
, 20) << __func__
<< " read hostname '" << u
.nodename
<< "'" << dendl
;
2002 ldout(cct
, 1) << __func__
<< " failed to read hostname (" << cpp_strerror(r
) << ")" << dendl
;
2005 metadata
["pid"] = stringify(getpid());
2007 // Ceph entity id (the '0' in "client.0")
2008 metadata
["entity_id"] = cct
->_conf
->name
.get_id();
2010 // Our mount position
2011 if (!mount_root
.empty()) {
2012 metadata
["root"] = mount_root
;
2016 metadata
["ceph_version"] = pretty_version_to_str();
2017 metadata
["ceph_sha1"] = git_version_to_str();
2019 // Apply any metadata from the user's configured overrides
2020 std::vector
<std::string
> tokens
;
2021 get_str_vec(cct
->_conf
->client_metadata
, ",", tokens
);
2022 for (const auto &i
: tokens
) {
2023 auto eqpos
= i
.find("=");
2024 // Throw out anything that isn't of the form "<str>=<str>"
2025 if (eqpos
== 0 || eqpos
== std::string::npos
|| eqpos
== i
.size()) {
2026 lderr(cct
) << "Invalid metadata keyval pair: '" << i
<< "'" << dendl
;
2029 metadata
[i
.substr(0, eqpos
)] = i
.substr(eqpos
+ 1);
2034 * Optionally add or override client metadata fields.
2036 void Client::update_metadata(std::string
const &k
, std::string
const &v
)
2038 std::lock_guard
l(client_lock
);
2039 ceph_assert(initialized
);
2041 auto it
= metadata
.find(k
);
2042 if (it
!= metadata
.end()) {
2043 ldout(cct
, 1) << __func__
<< " warning, overriding metadata field '" << k
2044 << "' from '" << it
->second
<< "' to '" << v
<< "'" << dendl
;
2050 MetaSession
*Client::_open_mds_session(mds_rank_t mds
)
2052 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
2053 auto addrs
= mdsmap
->get_addrs(mds
);
2054 auto em
= mds_sessions
.emplace(std::piecewise_construct
,
2055 std::forward_as_tuple(mds
),
2056 std::forward_as_tuple(mds
, messenger
->connect_to_mds(addrs
), addrs
));
2057 ceph_assert(em
.second
); /* not already present */
2058 MetaSession
*session
= &em
.first
->second
;
2060 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_OPEN
);
2061 m
->metadata
= metadata
;
2062 m
->supported_features
= feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED
);
2063 session
->con
->send_message2(std::move(m
));
2067 void Client::_close_mds_session(MetaSession
*s
)
2069 ldout(cct
, 2) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2070 s
->state
= MetaSession::STATE_CLOSING
;
2071 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2074 void Client::_closed_mds_session(MetaSession
*s
, int err
, bool rejected
)
2076 ldout(cct
, 5) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2077 if (rejected
&& s
->state
!= MetaSession::STATE_CLOSING
)
2078 s
->state
= MetaSession::STATE_REJECTED
;
2080 s
->state
= MetaSession::STATE_CLOSED
;
2081 s
->con
->mark_down();
2082 signal_context_list(s
->waiting_for_open
);
2083 mount_cond
.notify_all();
2084 remove_session_caps(s
, err
);
2085 kick_requests_closed(s
);
2086 mds_ranks_closing
.erase(s
->mds_num
);
2087 if (s
->state
== MetaSession::STATE_CLOSED
)
2088 mds_sessions
.erase(s
->mds_num
);
2091 void Client::handle_client_session(const MConstRef
<MClientSession
>& m
)
2093 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2094 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
2096 MetaSession
*session
= _get_mds_session(from
, m
->get_connection().get());
2098 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2102 switch (m
->get_op()) {
2103 case CEPH_SESSION_OPEN
:
2105 feature_bitset_t
missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED
);
2106 missing_features
-= m
->supported_features
;
2107 if (!missing_features
.empty()) {
2108 lderr(cct
) << "mds." << from
<< " lacks required features '"
2109 << missing_features
<< "', closing session " << dendl
;
2110 _close_mds_session(session
);
2111 _closed_mds_session(session
, -EPERM
, true);
2114 session
->mds_features
= std::move(m
->supported_features
);
2116 renew_caps(session
);
2117 session
->state
= MetaSession::STATE_OPEN
;
2119 mount_cond
.notify_all();
2121 connect_mds_targets(from
);
2122 signal_context_list(session
->waiting_for_open
);
2126 case CEPH_SESSION_CLOSE
:
2127 _closed_mds_session(session
);
2130 case CEPH_SESSION_RENEWCAPS
:
2131 if (session
->cap_renew_seq
== m
->get_seq()) {
2132 bool was_stale
= ceph_clock_now() >= session
->cap_ttl
;
2134 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2136 wake_up_session_caps(session
, false);
2140 case CEPH_SESSION_STALE
:
2141 // invalidate session caps/leases
2143 session
->cap_ttl
= ceph_clock_now();
2144 session
->cap_ttl
-= 1;
2145 renew_caps(session
);
2148 case CEPH_SESSION_RECALL_STATE
:
2149 trim_caps(session
, m
->get_max_caps());
2152 case CEPH_SESSION_FLUSHMSG
:
2153 /* flush cap release */
2154 if (auto& m
= session
->release
; m
) {
2155 session
->con
->send_message2(std::move(m
));
2157 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2160 case CEPH_SESSION_FORCE_RO
:
2161 force_session_readonly(session
);
2164 case CEPH_SESSION_REJECT
:
2166 std::string_view error_str
;
2167 auto it
= m
->metadata
.find("error_string");
2168 if (it
!= m
->metadata
.end())
2169 error_str
= it
->second
;
2171 error_str
= "unknown error";
2172 lderr(cct
) << "mds." << from
<< " rejected us (" << error_str
<< ")" << dendl
;
2174 _closed_mds_session(session
, -EPERM
, true);
2183 bool Client::_any_stale_sessions() const
2185 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
2187 for (const auto &p
: mds_sessions
) {
2188 if (p
.second
.state
== MetaSession::STATE_STALE
) {
2196 void Client::_kick_stale_sessions()
2198 ldout(cct
, 1) << __func__
<< dendl
;
2200 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
2201 MetaSession
&s
= it
->second
;
2202 if (s
.state
== MetaSession::STATE_REJECTED
) {
2203 mds_sessions
.erase(it
++);
2207 if (s
.state
== MetaSession::STATE_STALE
)
2208 _closed_mds_session(&s
);
2212 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2213 bool drop_cap_releases
)
2216 mds_rank_t mds
= session
->mds_num
;
2217 ldout(cct
, 10) << __func__
<< " rebuilding request " << request
->get_tid()
2218 << " for mds." << mds
<< dendl
;
2219 auto r
= build_client_request(request
);
2220 if (request
->dentry()) {
2221 r
->set_dentry_wanted();
2223 if (request
->got_unsafe
) {
2224 r
->set_replayed_op();
2225 if (request
->target
)
2226 r
->head
.ino
= request
->target
->ino
;
2228 encode_cap_releases(request
, mds
);
2229 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2230 request
->cap_releases
.clear();
2232 r
->releases
.swap(request
->cap_releases
);
2234 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2235 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2236 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2237 r
->set_osdmap_epoch(o
.get_epoch());
2241 if (request
->mds
== -1) {
2242 request
->sent_stamp
= ceph_clock_now();
2243 ldout(cct
, 20) << __func__
<< " set sent_stamp to " << request
->sent_stamp
<< dendl
;
2247 Inode
*in
= request
->inode();
2249 auto it
= in
->caps
.find(mds
);
2250 if (it
!= in
->caps
.end()) {
2251 request
->sent_on_mseq
= it
->second
.mseq
;
2255 session
->requests
.push_back(&request
->item
);
2257 ldout(cct
, 10) << __func__
<< " " << *r
<< " to mds." << mds
<< dendl
;
2258 session
->con
->send_message2(std::move(r
));
2261 ref_t
<MClientRequest
> Client::build_client_request(MetaRequest
*request
)
2263 auto req
= make_message
<MClientRequest
>(request
->get_op());
2264 req
->set_tid(request
->tid
);
2265 req
->set_stamp(request
->op_stamp
);
2266 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2268 // if the filepath's haven't been set, set them!
2269 if (request
->path
.empty()) {
2270 Inode
*in
= request
->inode();
2271 Dentry
*de
= request
->dentry();
2273 in
->make_nosnap_relative_path(request
->path
);
2276 de
->inode
->make_nosnap_relative_path(request
->path
);
2278 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2279 request
->path
.push_dentry(de
->name
);
2281 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2282 << " No path, inode, or appropriately-endowed dentry given!"
2284 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2285 << " No path, inode, or dentry given!"
2288 req
->set_filepath(request
->get_filepath());
2289 req
->set_filepath2(request
->get_filepath2());
2290 req
->set_data(request
->data
);
2291 req
->set_retry_attempt(request
->retry_attempt
++);
2292 req
->head
.num_fwd
= request
->num_fwd
;
2294 int gid_count
= request
->perms
.get_gids(&_gids
);
2295 req
->set_gid_list(gid_count
, _gids
);
2301 void Client::handle_client_request_forward(const MConstRef
<MClientRequestForward
>& fwd
)
2303 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2304 MetaSession
*session
= _get_mds_session(mds
, fwd
->get_connection().get());
2308 ceph_tid_t tid
= fwd
->get_tid();
2310 if (mds_requests
.count(tid
) == 0) {
2311 ldout(cct
, 10) << __func__
<< " no pending request on tid " << tid
<< dendl
;
2315 MetaRequest
*request
= mds_requests
[tid
];
2316 ceph_assert(request
);
2318 // reset retry counter
2319 request
->retry_attempt
= 0;
2321 // request not forwarded, or dest mds has no session.
2323 ldout(cct
, 10) << __func__
<< " tid " << tid
2324 << " fwd " << fwd
->get_num_fwd()
2325 << " to mds." << fwd
->get_dest_mds()
2326 << ", resending to " << fwd
->get_dest_mds()
2330 request
->item
.remove_myself();
2331 request
->num_fwd
= fwd
->get_num_fwd();
2332 request
->resend_mds
= fwd
->get_dest_mds();
2333 request
->caller_cond
->notify_all();
2336 bool Client::is_dir_operation(MetaRequest
*req
)
2338 int op
= req
->get_op();
2339 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2340 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2341 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2342 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
// Dispatch an MClientReply from an MDS to its pending MetaRequest
// (looked up by tid). Handles ESTALE retargeting to the auth MDS,
// records unsafe replies on the session/dir/target inode lists, wakes
// the caller thread exactly once, and unregisters the request when the
// safe reply arrives.
2347 void Client::handle_client_reply(const MConstRef
<MClientReply
>& reply
)
2349 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2350 MetaSession
*session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2355 ceph_tid_t tid
= reply
->get_tid();
2356 bool is_safe
= reply
->is_safe();
// No pending request with this tid: log and bail.
2358 if (mds_requests
.count(tid
) == 0) {
2359 lderr(cct
) << __func__
<< " no pending request on tid " << tid
2360 << " safe is:" << is_safe
<< dendl
;
2363 MetaRequest
*request
= mds_requests
.at(tid
);
2365 ldout(cct
, 20) << __func__
<< " got a reply. Safe:" << is_safe
2366 << " tid " << tid
<< dendl
;
// An unsafe reply after we already recorded one is a duplicate.
2368 if (request
->got_unsafe
&& !is_safe
) {
2369 //duplicate response
2370 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2371 << mds_num
<< " safe:" << is_safe
<< dendl
;
// ESTALE: retarget the request at the (possibly different) auth MDS
// and resend, unless resending cannot help.
2375 if (-ESTALE
== reply
->get_result()) { // see if we can get to proper MDS
2376 ldout(cct
, 20) << "got ESTALE on tid " << request
->tid
2377 << " from mds." << request
->mds
<< dendl
;
2378 request
->send_to_auth
= true;
2379 request
->resend_mds
= choose_target_mds(request
);
2380 Inode
*in
= request
->inode();
2381 std::map
<mds_rank_t
, Cap
>::const_iterator it
;
2382 if (request
->resend_mds
>= 0 &&
2383 request
->resend_mds
== request
->mds
&&
// NOTE(review): part of this condition is not visible here; as
// written, if find() returns caps.end() the '||' falls through to
// dereference it->second.mseq on an end() iterator. Upstream appears
// to guard with 'in == NULL ||' and compare '== in->caps.end()' —
// confirm against the full file.
2385 (it
= in
->caps
.find(request
->resend_mds
)) != in
->caps
.end() ||
2386 request
->sent_on_mseq
== it
->second
.mseq
)) {
2387 ldout(cct
, 20) << "have to return ESTALE" << dendl
;
2389 request
->caller_cond
->notify_all();
2394 ceph_assert(!request
->reply
);
2395 request
->reply
= reply
;
2396 insert_trace(request
, session
);
2398 // Handle unsafe reply
// Track the unsafe request on the session, on the parent directory
// (for dir operations) and on the target inode, so it can be resent
// or error-flagged if the MDS session is lost before the safe reply.
2400 request
->got_unsafe
= true;
2401 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2402 if (is_dir_operation(request
)) {
2403 Inode
*dir
= request
->inode();
2405 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2407 if (request
->target
) {
2408 InodeRef
&in
= request
->target
;
2409 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2413 // Only signal the caller once (on the first reply):
2414 // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2415 if (!is_safe
|| !request
->got_unsafe
) {
2416 ceph::condition_variable cond
;
2417 request
->dispatch_cond
= &cond
;
2420 ldout(cct
, 20) << __func__
<< " signalling caller " << (void*)request
->caller_cond
<< dendl
;
2421 request
->caller_cond
->notify_all();
2423 // wake for kick back
// Adopt the already-held client_lock into a unique_lock so the
// condition wait can release it while waiting for the caller's
// kick-back (caller clears dispatch_cond when done).
2424 std::unique_lock l
{client_lock
, std::adopt_lock
};
2425 cond
.wait(l
, [tid
, request
, &cond
, this] {
2426 if (request
->dispatch_cond
) {
2427 ldout(cct
, 20) << "handle_client_reply awaiting kickback on tid "
2428 << tid
<< " " << &cond
<< dendl
;
2430 return !request
->dispatch_cond
;
2436 // the filesystem change is committed to disk
2437 // we're done, clean up
2438 if (request
->got_unsafe
) {
2439 request
->unsafe_item
.remove_myself();
2440 request
->unsafe_dir_item
.remove_myself();
2441 request
->unsafe_target_item
.remove_myself();
2442 signal_cond_list(request
->waitfor_safe
);
2444 request
->item
.remove_myself();
2445 unregister_request(request
);
// Wake anyone (e.g. unmount) waiting for outstanding requests to drain.
2448 mount_cond
.notify_all();
// React to a FULL condition on one pool (or, when pool == -1, on the
// whole cluster): cancel outstanding writes with -ENOSPC, purge dirty
// cached data for affected inodes, and raise a cap epoch barrier.
2451 void Client::_handle_full_flag(int64_t pool
)
2453 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2454 << "on " << pool
<< dendl
;
2455 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2456 // to do this rather than blocking, because otherwise when we fill up we
2457 // potentially lock caps forever on files with dirty pages, and we need
2458 // to be able to release those caps to the MDS so that it can delete files
2459 // and free up space.
2460 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-ENOSPC
, pool
);
2462 // For all inodes with layouts in this pool and a pending flush write op
2463 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2464 // from ObjectCacher so that it doesn't re-issue the write in response to
2465 // the ENOSPC error.
2466 // Fortunately since we're cancelling everything in a given pool, we don't
2467 // need to know which ops belong to which ObjectSet, we can just blow all
2468 // the un-flushed cached data away and mark any dirty inodes' async_err
2469 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2470 // affecting this pool, and all the objectsets we're purging were also
2472 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2473 i
!= inode_map
.end(); ++i
)
2475 Inode
*inode
= i
->second
;
// Only inodes with dirty/in-flight cached data whose layout targets
// the full pool (pool == -1 matches every pool).
2476 if (inode
->oset
.dirty_or_tx
2477 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2478 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2479 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2480 objectcacher
->purge_set(&inode
->oset
);
2481 inode
->set_async_err(-ENOSPC
);
// op_cancel_writes returns the epoch at which cancellations happened;
// (epoch_t)-1 means nothing was cancelled, so no barrier is needed.
2485 if (cancelled_epoch
!= (epoch_t
)-1) {
2486 set_cap_epoch_barrier(cancelled_epoch
);
// Process a new OSD map: detect whether any of our addresses was
// blacklisted (checking both TYPE_ANY for nautilus+ and TYPE_LEGACY
// for pre-nautilus maps), abort MDS sessions and cancel writes if so,
// clear 'blacklisted' when the entry goes away, and run full-flag
// handling for the global flag and each per-pool FULL flag.
2490 void Client::handle_osd_map(const MConstRef
<MOSDMap
>& m
)
2492 std::set
<entity_addr_t
> new_blacklists
;
2493 objecter
->consume_blacklist_events(&new_blacklists
);
2495 const auto myaddrs
= messenger
->get_myaddrs();
2496 bool new_blacklist
= false;
2497 bool prenautilus
= objecter
->with_osdmap(
2498 [&](const OSDMap
& o
) {
2499 return o
.require_osd_release
< ceph_release_t::nautilus
;
2502 for (auto a
: myaddrs
.v
) {
2503 // blacklist entries are always TYPE_ANY for nautilus+
2504 a
.set_type(entity_addr_t::TYPE_ANY
);
2505 if (new_blacklists
.count(a
)) {
2506 new_blacklist
= true;
2510 // ...except pre-nautilus, they were TYPE_LEGACY
2511 a
.set_type(entity_addr_t::TYPE_LEGACY
);
2512 if (new_blacklists
.count(a
)) {
2513 new_blacklist
= true;
2519 if (new_blacklist
) {
2520 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2521 return o
.get_epoch();
2523 lderr(cct
) << "I was blacklisted at osd epoch " << epoch
<< dendl
;
2526 _abort_mds_sessions(-EBLACKLISTED
);
2528 // Since we know all our OSD ops will fail, cancel them all preemptively,
2529 // so that on an unhealthy cluster we can umount promptly even if e.g.
2530 // some PGs were inaccessible.
2531 objecter
->op_cancel_writes(-EBLACKLISTED
);
2533 } else if (blacklisted
) {
2534 // Handle case where we were blacklisted but no longer are
2535 blacklisted
= objecter
->with_osdmap([myaddrs
](const OSDMap
&o
){
2536 return o
.is_blacklisted(myaddrs
);});
2539 // Always subscribe to next osdmap for blacklisted client
2540 // until this client is not blacklisted.
2542 objecter
->maybe_request_map();
2545 if (objecter
->osdmap_full_flag()) {
// Cluster-wide full flag: pool id -1 means "all pools".
2546 _handle_full_flag(-1);
2548 // Accumulate local list of full pools so that I can drop
2549 // the objecter lock before re-entering objecter in
2551 std::vector
<int64_t> full_pools
;
2553 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2554 for (const auto& kv
: o
.get_pools()) {
2555 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2556 full_pools
.push_back(kv
.first
);
2561 for (auto p
: full_pools
)
2562 _handle_full_flag(p
);
2564 // Subscribe to subsequent maps to watch for the full flag going
2565 // away. For the global full flag objecter does this for us, but
2566 // it pays no attention to the per-pool full flag so in this branch
2567 // we do it ourselves.
2568 if (!full_pools
.empty()) {
2569 objecter
->maybe_request_map();
2575 // ------------------------
2576 // incoming messages
// Central messenger dispatch: takes client_lock, discards messages
// when inactive, and routes each message type to its handler. The
// trailing section runs on the unmount path, re-measuring the cache
// size and poking unmount() when a trim pass shrank it.
2579 bool Client::ms_dispatch2(const MessageRef
&m
)
2581 std::lock_guard
l(client_lock
);
2583 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2587 switch (m
->get_type()) {
2588 // mounting and mds sessions
2589 case CEPH_MSG_MDS_MAP
:
2590 handle_mds_map(ref_cast
<MMDSMap
>(m
));
2592 case CEPH_MSG_FS_MAP
:
2593 handle_fs_map(ref_cast
<MFSMap
>(m
));
2595 case CEPH_MSG_FS_MAP_USER
:
2596 handle_fs_map_user(ref_cast
<MFSMapUser
>(m
));
2598 case CEPH_MSG_CLIENT_SESSION
:
2599 handle_client_session(ref_cast
<MClientSession
>(m
));
2602 case CEPH_MSG_OSD_MAP
:
2603 handle_osd_map(ref_cast
<MOSDMap
>(m
));
2607 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2608 handle_client_request_forward(ref_cast
<MClientRequestForward
>(m
));
2610 case CEPH_MSG_CLIENT_REPLY
:
2611 handle_client_reply(ref_cast
<MClientReply
>(m
));
2615 case CEPH_MSG_CLIENT_RECLAIM_REPLY
:
2616 handle_client_reclaim_reply(ref_cast
<MClientReclaimReply
>(m
));
2619 case CEPH_MSG_CLIENT_SNAP
:
2620 handle_snap(ref_cast
<MClientSnap
>(m
));
2622 case CEPH_MSG_CLIENT_CAPS
:
2623 handle_caps(ref_cast
<MClientCaps
>(m
));
2625 case CEPH_MSG_CLIENT_LEASE
:
2626 handle_lease(ref_cast
<MClientLease
>(m
));
// Command replies are only handled here when they come from an MDS.
2628 case MSG_COMMAND_REPLY
:
2629 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2630 handle_command_reply(ref_cast
<MCommandReply
>(m
));
2635 case CEPH_MSG_CLIENT_QUOTA
:
2636 handle_quota(ref_cast
<MClientQuota
>(m
));
2645 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2646 << "+" << inode_map
.size() << dendl
;
2647 long unsigned size
= lru
.lru_get_size() + inode_map
.size();
// NOTE(review): a cache-trimming step (not visible in this extract)
// presumably runs between the two measurements — confirm in the full
// file; otherwise this comparison could never be true.
2649 if (size
< lru
.lru_get_size() + inode_map
.size()) {
2650 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2651 mount_cond
.notify_all();
2653 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2654 << "+" << inode_map
.size() << dendl
;
// Replace our cached FSMap with the one from the message, wake any
// threads waiting on it, and acknowledge the "fsmap" subscription.
2661 void Client::handle_fs_map(const MConstRef
<MFSMap
>& m
)
2663 fsmap
.reset(new FSMap(m
->get_fsmap()));
2665 signal_cond_list(waiting_for_fsmap
);
2667 monclient
->sub_got("fsmap", fsmap
->get_epoch());
// Replace our cached user-visible FSMap, acknowledge the
// "fsmap.user" subscription, and wake waiters.
2670 void Client::handle_fs_map_user(const MConstRef
<MFSMapUser
>& m
)
2672 fsmap_user
.reset(new FSMapUser
);
2673 *fsmap_user
= m
->get_fsmap();
2675 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
2676 signal_cond_list(waiting_for_fsmap
);
// Process a new MDSMap: ignore stale epochs, decode the new map
// (keeping the old one for state comparison), cancel commands aimed
// at vanished/laggy MDS GIDs, then walk every MDS session reacting to
// address and state changes (reconnect, close, kick requests/caps).
2679 void Client::handle_mds_map(const MConstRef
<MMDSMap
>& m
)
2681 mds_gid_t old_inc
, new_inc
;
// Epochs must strictly increase; otherwise the map is old news.
2682 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
2683 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch()
2684 << " is identical to or older than our "
2685 << mdsmap
->get_epoch() << dendl
;
2689 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch() << dendl
;
// Swap in a fresh map; 'oldmap' keeps the previous one so per-MDS
// state transitions can be computed below.
2691 std::unique_ptr
<MDSMap
> oldmap(new MDSMap
);
2692 oldmap
.swap(mdsmap
);
2694 mdsmap
->decode(m
->get_encoded());
2696 // Cancel any commands for missing or laggy GIDs
2697 std::list
<ceph_tid_t
> cancel_ops
;
2698 auto &commands
= command_table
.get_commands();
2699 for (const auto &i
: commands
) {
2700 auto &op
= i
.second
;
2701 const mds_gid_t op_mds_gid
= op
.mds_gid
;
2702 if (mdsmap
->is_dne_gid(op_mds_gid
) || mdsmap
->is_laggy_gid(op_mds_gid
)) {
2703 ldout(cct
, 1) << __func__
<< ": cancelling command op " << i
.first
<< dendl
;
2704 cancel_ops
.push_back(i
.first
);
2706 std::ostringstream ss
;
2707 ss
<< "MDS " << op_mds_gid
<< " went away";
2708 *(op
.outs
) = ss
.str();
2710 op
.con
->mark_down();
2712 op
.on_finish
->complete(-ETIMEDOUT
);
2717 for (std::list
<ceph_tid_t
>::iterator i
= cancel_ops
.begin();
2718 i
!= cancel_ops
.end(); ++i
) {
2719 command_table
.erase(*i
);
// Walk every open MDS session, reacting to what the new map says
// about that rank.
2723 for (auto p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ) {
2724 mds_rank_t mds
= p
->first
;
2725 MetaSession
*session
= &p
->second
;
2728 int oldstate
= oldmap
->get_state(mds
);
2729 int newstate
= mdsmap
->get_state(mds
);
2730 if (!mdsmap
->is_up(mds
)) {
2731 session
->con
->mark_down();
2732 } else if (mdsmap
->get_addrs(mds
) != session
->addrs
) {
2733 old_inc
= oldmap
->get_incarnation(mds
);
2734 new_inc
= mdsmap
->get_incarnation(mds
);
// A new incarnation means a brand-new daemon took over the rank;
// forget the old state so the transition logic below treats it as
// coming from STATE_NULL.
2735 if (old_inc
!= new_inc
) {
2736 ldout(cct
, 1) << "mds incarnation changed from "
2737 << old_inc
<< " to " << new_inc
<< dendl
;
2738 oldstate
= MDSMap::STATE_NULL
;
2740 session
->con
->mark_down();
2741 session
->addrs
= mdsmap
->get_addrs(mds
);
2742 // When new MDS starts to take over, notify kernel to trim unused entries
2743 // in its dcache/icache. Hopefully, the kernel will release some unused
2744 // inodes before the new MDS enters reconnect state.
2745 trim_cache_for_reconnect(session
);
2746 } else if (oldstate
== newstate
)
2747 continue; // no change
2749 session
->mds_state
= newstate
;
2750 if (newstate
== MDSMap::STATE_RECONNECT
) {
2751 session
->con
= messenger
->connect_to_mds(session
->addrs
);
2752 send_reconnect(session
);
2753 } else if (newstate
> MDSMap::STATE_RECONNECT
) {
// We jumped past RECONNECT without seeing it — the reconnect window
// is gone, so this session cannot be resumed.
2754 if (oldstate
< MDSMap::STATE_RECONNECT
) {
2755 ldout(cct
, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl
;
2756 _closed_mds_session(session
);
2759 if (newstate
>= MDSMap::STATE_ACTIVE
) {
2760 if (oldstate
< MDSMap::STATE_ACTIVE
) {
2761 // kick new requests
2762 kick_requests(session
);
2763 kick_flushing_caps(session
);
2764 signal_context_list(session
->waiting_for_open
);
2765 wake_up_session_caps(session
, true);
2767 connect_mds_targets(mds
);
2769 } else if (newstate
== MDSMap::STATE_NULL
&&
2770 mds
>= mdsmap
->get_max_mds()) {
2771 _closed_mds_session(session
);
2775 // kick any waiting threads
2776 signal_cond_list(waiting_for_mdsmap
);
2778 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
// Rebuild client state on an MDS that entered RECONNECT: trim the
// cache, resend unsafe/old requests, early-kick flushing caps, then
// send one (or, size permitting, several) MClientReconnect messages
// describing every cap we hold on that rank plus the snaprealms they
// belong to.
2781 void Client::send_reconnect(MetaSession
*session
)
2783 mds_rank_t mds
= session
->mds_num
;
2784 ldout(cct
, 10) << __func__
<< " to mds." << mds
<< dendl
;
2786 // trim unused caps to reduce MDS's cache rejoin time
2787 trim_cache_for_reconnect(session
);
2789 session
->readonly
= false;
2791 session
->release
.reset();
2793 // reset my cap seq number
2795 //connect to the mds' offload targets
2796 connect_mds_targets(mds
);
2797 //make sure unsafe requests get saved
2798 resend_unsafe_requests(session
);
2800 early_kick_flushing_caps(session
);
2802 auto m
= make_message
<MClientReconnect
>();
2803 bool allow_multi
= session
->mds_features
.test(CEPHFS_FEATURE_MULTI_RECONNECT
);
2805 // i have an open session.
2806 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
2807 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
2808 p
!= inode_map
.end();
2810 Inode
*in
= p
->second
;
2811 auto it
= in
->caps
.find(mds
);
2812 if (it
!= in
->caps
.end()) {
// When the message is getting close to the max message size, flush
// it and start a new one (guarded by allow_multi in the part of the
// condition not visible here — confirm in the full file).
2814 m
->get_approx_size() >=
2815 static_cast<size_t>((std::numeric_limits
<int>::max() >> 1))) {
2817 session
->con
->send_message2(std::move(m
));
2819 m
= make_message
<MClientReconnect
>();
2822 Cap
&cap
= it
->second
;
2823 ldout(cct
, 10) << " caps on " << p
->first
2824 << " " << ccap_string(cap
.issued
)
2825 << " wants " << ccap_string(in
->caps_wanted())
2828 in
->make_short_path(path
);
2829 ldout(cct
, 10) << " path " << path
<< dendl
;
2832 _encode_filelocks(in
, flockbl
);
// Sequence numbers restart from scratch on the new MDS incarnation.
2834 cap
.seq
= 0; // reset seq.
2835 cap
.issue_seq
= 0; // reset seq.
2836 cap
.mseq
= 0; // reset seq.
2837 // cap gen should catch up with session cap_gen
2838 if (cap
.gen
< session
->cap_gen
) {
2839 cap
.gen
= session
->cap_gen
;
// Stale cap generation: the cap was implicitly dropped, so only the
// PIN remains issued/implemented.
2840 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
2842 cap
.issued
= cap
.implemented
;
2844 snapid_t snap_follows
= 0;
2845 if (!in
->cap_snaps
.empty())
2846 snap_follows
= in
->cap_snaps
.begin()->first
;
2848 m
->add_cap(p
->first
.ino
,
2850 path
.get_ino(), path
.get_path(), // ino
2851 in
->caps_wanted(), // wanted
2852 cap
.issued
, // issued
// Describe each snaprealm only once per message batch.
2857 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
2858 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
2859 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
2860 did_snaprealm
.insert(in
->snaprealm
->ino
);
2866 m
->set_encoding_version(0); // use connection features to choose encoding
2867 session
->con
->send_message2(std::move(m
));
2869 mount_cond
.notify_all();
2871 if (session
->reclaim_state
== MetaSession::RECLAIMING
)
2872 signal_cond_list(waiting_for_reclaim
);
// (Re)send not-yet-retried requests targeted at this session's MDS
// after it became active; wake the callers of aborted requests
// instead of sending them.
2876 void Client::kick_requests(MetaSession
*session
)
2878 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2879 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2880 p
!= mds_requests
.end();
2882 MetaRequest
*req
= p
->second
;
// Unsafe requests are resent via resend_unsafe_requests(), not here.
2883 if (req
->got_unsafe
)
2885 if (req
->aborted()) {
2886 if (req
->caller_cond
) {
2888 req
->caller_cond
->notify_all();
2892 if (req
->retry_attempt
> 0)
2893 continue; // new requests only
2894 if (req
->mds
== session
->mds_num
) {
2895 send_request(p
->second
, session
);
// Resend every unsafe (replied but not yet durable) request on this
// session, then also resend already-retried requests targeted at it
// so the MDS can replay completed ones during clientreplay.
2900 void Client::resend_unsafe_requests(MetaSession
*session
)
2902 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
2905 send_request(*iter
, session
);
2907 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2908 // process completed requests in clientreplay stage.
2909 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2910 p
!= mds_requests
.end();
2912 MetaRequest
*req
= p
->second
;
// Unsafe requests were already handled by the loop above.
2913 if (req
->got_unsafe
)
2917 if (req
->retry_attempt
== 0)
2918 continue; // old requests only
2919 if (req
->mds
== session
->mds_num
)
2920 send_request(req
, session
, true);
// Block until all currently-unsafe requests are durable. Waiting on
// the *last* unsafe request of each session is sufficient because a
// session's unsafe list drains in order.
2924 void Client::wait_unsafe_requests()
2926 list
<MetaRequest
*> last_unsafe_reqs
;
2927 for (const auto &p
: mds_sessions
) {
2928 const MetaSession
&s
= p
.second
;
2929 if (!s
.unsafe_requests
.empty()) {
2930 MetaRequest
*req
= s
.unsafe_requests
.back();
2932 last_unsafe_reqs
.push_back(req
);
2936 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
2937 p
!= last_unsafe_reqs
.end();
2939 MetaRequest
*req
= *p
;
// Skip requests that already became safe while we were waiting.
2940 if (req
->unsafe_item
.is_on_list())
2941 wait_on_list(req
->waitfor_safe
);
// Drop all requests targeted at a session that is being closed for
// good: wake each caller, and for unsafe requests flag the involved
// directory/target inodes with -EIO before unregistering. The session
// must end with empty request lists.
2946 void Client::kick_requests_closed(MetaSession
*session
)
2948 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2949 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2950 p
!= mds_requests
.end(); ) {
2951 MetaRequest
*req
= p
->second
;
2953 if (req
->mds
== session
->mds_num
) {
2954 if (req
->caller_cond
) {
2956 req
->caller_cond
->notify_all();
2958 req
->item
.remove_myself();
2959 if (req
->got_unsafe
) {
2960 lderr(cct
) << __func__
<< " removing unsafe request " << req
->get_tid() << dendl
;
2961 req
->unsafe_item
.remove_myself();
// A dropped unsafe dir operation means the directory's update may be
// lost; surface that to the application as an async EIO.
2962 if (is_dir_operation(req
)) {
2963 Inode
*dir
= req
->inode();
2965 dir
->set_async_err(-EIO
);
2966 lderr(cct
) << "kick_requests_closed drop req of inode(dir) : "
2967 << dir
->ino
<< " " << req
->get_tid() << dendl
;
2968 req
->unsafe_dir_item
.remove_myself();
2971 InodeRef
&in
= req
->target
;
2972 in
->set_async_err(-EIO
);
2973 lderr(cct
) << "kick_requests_closed drop req of inode : "
2974 << in
->ino
<< " " << req
->get_tid() << dendl
;
2975 req
->unsafe_target_item
.remove_myself();
2977 signal_cond_list(req
->waitfor_safe
);
2978 unregister_request(req
);
2982 ceph_assert(session
->requests
.empty());
2983 ceph_assert(session
->unsafe_requests
.empty());
// Account for an unsolicited (pushed) message from an MDS session;
// if we are in the middle of closing that session, re-send the
// REQUEST_CLOSE so the close converges.
2993 void Client::got_mds_push(MetaSession
*s
)
2996 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
2997 if (s
->state
== MetaSession::STATE_CLOSING
) {
2998 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
// Handle a dentry-lease revocation from an MDS (the only lease action
// the client expects). Look up the inode/dentry the lease covers,
// then always answer with a CEPH_MDS_LEASE_RELEASE.
3002 void Client::handle_lease(const MConstRef
<MClientLease
>& m
)
3004 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
3006 ceph_assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
3008 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
3009 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
3014 got_mds_push(session
);
3016 ceph_seq_t seq
= m
->get_seq();
// Leases are only taken on head (non-snapshot) inodes.
3019 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
3020 if (inode_map
.count(vino
) == 0) {
3021 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
3024 in
= inode_map
[vino
];
3026 if (m
->get_mask() & CEPH_LEASE_VALID
) {
3027 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
3028 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
3031 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
3032 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
// Acknowledge the revocation regardless of whether we still held the
// dentry, echoing back the mask/ino/range/name from the request.
3038 auto reply
= make_message
<MClientLease
>(CEPH_MDS_LEASE_RELEASE
, seq
,
3039 m
->get_mask(), m
->get_ino(),
3040 m
->get_first(), m
->get_last(), m
->dname
);
3041 m
->get_connection()->send_message2(std::move(reply
));
// Drop n references on an inode; when the last reference goes away,
// tear it down: drop caps, release (clean) cached data, remove it
// from inode_map, free any faked ino, and clear root_parents.
3045 void Client::put_inode(Inode
*in
, int n
)
3047 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3048 int left
= in
->_put(n
);
3051 remove_all_caps(in
);
3053 ldout(cct
, 10) << __func__
<< " deleting " << *in
<< dendl
;
// By this point all dirty data must already be flushed, so releasing
// the object set must find nothing unclean.
3054 bool unclean
= objectcacher
->release_set(&in
->oset
);
3055 ceph_assert(!unclean
);
3056 inode_map
.erase(in
->vino());
3057 if (use_faked_inos())
3058 _release_faked_ino(in
);
3063 while (!root_parents
.empty())
3064 root_parents
.erase(root_parents
.begin());
// Tear down an (empty) Dir object and unpin the dentry/inode that
// were held open on its behalf.
3071 void Client::close_dir(Dir
*dir
)
3073 Inode
*in
= dir
->parent_inode
;
3074 ldout(cct
, 15) << __func__
<< " dir " << dir
<< " on " << in
<< dendl
;
3075 ceph_assert(dir
->is_empty());
3076 ceph_assert(in
->dir
== dir
);
3077 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
3078 if (!in
->dentries
.empty())
3079 in
->get_first_parent()->put(); // unpin dentry
3083 put_inode(in
); // unpin inode
3087 * Don't call this with in==NULL, use get_or_create for that
3088 * leave dn set to default NULL unless you're trying to add
3089 * a new inode to a pre-created Dentry
// Bind inode 'in' to a dentry named 'name' in 'dir', creating the
// Dentry when the caller did not supply one. Returns the dentry.
3091 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
3094 // create a new Dentry
3095 dn
= new Dentry(dir
, name
);
3097 lru
.lru_insert_mid(dn
); // mid or top?
3099 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3100 << " dn " << dn
<< " (new dn)" << dendl
;
3102 ceph_assert(!dn
->inode
);
3103 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3104 << " dn " << dn
<< " (old dn)" << dendl
;
3107 if (in
) { // link to inode
3109 // only one parent for directories!
// A directory inode may only be reached through one dentry; detach
// the previous parent first (holding a temporary ref so the unlink
// cannot free the inode out from under us).
3110 if (in
->is_dir() && !in
->dentries
.empty()) {
3111 tmp_ref
= in
; // prevent unlink below from freeing the inode.
3112 Dentry
*olddn
= in
->get_first_parent();
3113 ceph_assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3114 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3115 clear_dir_complete_and_ordered(old_diri
, true);
3116 unlink(olddn
, true, true); // keep dir, dentry
3120 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dentries
<< dendl
;
// Detach a dentry from its inode; optionally also remove the dentry
// itself (unless keepdentry) and close the containing Dir when it
// becomes empty (unless keepdir).
3126 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
// Hold a local ref so the inode stays alive through the log line and
// detach below.
3128 InodeRef
in(dn
->inode
);
3129 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3130 << " inode " << dn
->inode
<< dendl
;
3132 // unlink from inode
3135 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3141 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3151 if (dir
->is_empty() && !keepdir
)
3157 * For asynchronous flushes, check for errors from the IO and
3158 * update the inode if necessary
// Completion context for an async flush: on error r it logs the
// failure and records it in the inode's async_err so a later
// fsync/close can report it. Must run with client_lock held.
3160 class C_Client_FlushComplete
: public Context
{
3165 C_Client_FlushComplete(Client
*c
, Inode
*in
) : client(c
), inode(in
) { }
3166 void finish(int r
) override
{
3167 ceph_assert(ceph_mutex_is_locked_by_me(client
->client_lock
));
3169 client_t
const whoami
= client
->whoami
; // For the benefit of ldout prefix
3170 ldout(client
->cct
, 1) << "I/O error from flush on inode " << inode
3171 << " 0x" << std::hex
<< inode
->ino
<< std::dec
3172 << ": " << r
<< "(" << cpp_strerror(r
) << ")" << dendl
;
3173 inode
->set_async_err(r
);
// Take a reference on capability bits for an inode, logging when the
// first FILE_BUFFER or FILE_CACHE reference appears; the actual
// refcounting is delegated to Inode::get_cap_ref.
3183 void Client::get_cap_ref(Inode
*in
, int cap
)
3185 if ((cap
& CEPH_CAP_FILE_BUFFER
) &&
3186 in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] == 0) {
3187 ldout(cct
, 5) << __func__
<< " got first FILE_BUFFER ref on " << *in
<< dendl
;
3190 if ((cap
& CEPH_CAP_FILE_CACHE
) &&
3191 in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3192 ldout(cct
, 5) << __func__
<< " got first FILE_CACHE ref on " << *in
<< dendl
;
3195 in
->get_cap_ref(cap
);
// Drop a reference on capability bits. 'last' is the set of bits
// whose final reference this put released; on those, finish any
// pending cap_snap (last WR/BUFFER), clear snap dirty data and wake
// commit waiters (last BUFFER), and finally drop the matching inode
// references via put_inode.
3198 void Client::put_cap_ref(Inode
*in
, int cap
)
3200 int last
= in
->put_cap_ref(cap
);
3203 int drop
= last
& ~in
->caps_issued();
3204 if (in
->snapid
== CEPH_NOSNAP
) {
// Releasing the last write-side ref while the newest cap_snap is
// still marked writing means that snapshot's data is now complete.
3205 if ((last
& (CEPH_CAP_FILE_WR
| CEPH_CAP_FILE_BUFFER
)) &&
3206 !in
->cap_snaps
.empty() &&
3207 in
->cap_snaps
.rbegin()->second
.writing
) {
3208 ldout(cct
, 10) << __func__
<< " finishing pending cap_snap on " << *in
<< dendl
;
3209 in
->cap_snaps
.rbegin()->second
.writing
= 0;
3210 finish_cap_snap(in
, in
->cap_snaps
.rbegin()->second
, get_caps_used(in
));
3211 signal_cond_list(in
->waitfor_caps
); // wake up blocked sync writers
3213 if (last
& CEPH_CAP_FILE_BUFFER
) {
3214 for (auto &p
: in
->cap_snaps
)
3215 p
.second
.dirty_data
= 0;
3216 signal_cond_list(in
->waitfor_commit
);
3217 ldout(cct
, 5) << __func__
<< " dropped last FILE_BUFFER ref on " << *in
<< dendl
;
3221 if (last
& CEPH_CAP_FILE_CACHE
) {
3222 ldout(cct
, 5) << __func__
<< " dropped last FILE_CACHE ref on " << *in
<< dendl
;
// put_nref accumulates how many inode refs to drop; its computation
// is in lines not visible in this extract.
3228 put_inode(in
, put_nref
);
// Acquire the caps in 'need' (plus as much of 'want' as is not being
// revoked) for a file handle, writing the granted set to *phave and
// taking cap references. Blocks (via wait_on_list) while max_size
// needs growing, cap_snaps are flushing, or needed caps are missing;
// error paths (EBADF, stale fd generation, I_ERROR_FILELOCK,
// read-only session) return early in lines not fully visible here.
3232 int Client::get_caps(Fh
*fh
, int need
, int want
, int *phave
, loff_t endoff
)
3234 Inode
*in
= fh
->inode
.get();
3236 int r
= check_pool_perm(in
, need
);
// The fh's open mode must actually want the needed caps.
3241 int file_wanted
= in
->caps_file_wanted();
3242 if ((file_wanted
& need
) != need
) {
3243 ldout(cct
, 10) << "get_caps " << *in
<< " need " << ccap_string(need
)
3244 << " file_wanted " << ccap_string(file_wanted
) << ", EBADF "
// A writable fh from a previous fd generation is stale.
3249 if ((fh
->mode
& CEPH_FILE_MODE_WR
) && fh
->gen
!= fd_gen
)
3252 if ((in
->flags
& I_ERROR_FILELOCK
) && fh
->has_any_filelocks())
3256 int have
= in
->caps_issued(&implemented
);
3258 bool waitfor_caps
= false;
3259 bool waitfor_commit
= false;
3261 if (have
& need
& CEPH_CAP_FILE_WR
) {
// Writing past (or near) max_size: record the desired max_size and
// (in lines not visible here) ask the auth MDS to grow it.
3263 if ((endoff
>= (loff_t
)in
->max_size
||
3264 endoff
> (loff_t
)(in
->size
<< 1)) &&
3265 endoff
> (loff_t
)in
->wanted_max_size
) {
3266 ldout(cct
, 10) << "wanted_max_size " << in
->wanted_max_size
<< " -> " << endoff
<< dendl
;
3267 in
->wanted_max_size
= endoff
;
3269 if (in
->wanted_max_size
> in
->max_size
&&
3270 in
->wanted_max_size
> in
->requested_max_size
)
3274 if (endoff
>= 0 && endoff
> (loff_t
)in
->max_size
) {
3275 ldout(cct
, 10) << "waiting on max_size, endoff " << endoff
<< " max_size " << in
->max_size
<< " on " << *in
<< dendl
;
3276 waitfor_caps
= true;
// Pending snapshot state also blocks writes: either the newest
// cap_snap is still being written, or snap data awaits commit.
3278 if (!in
->cap_snaps
.empty()) {
3279 if (in
->cap_snaps
.rbegin()->second
.writing
) {
3280 ldout(cct
, 10) << "waiting on cap_snap write to complete" << dendl
;
3281 waitfor_caps
= true;
3283 for (auto &p
: in
->cap_snaps
) {
3284 if (p
.second
.dirty_data
) {
3285 waitfor_commit
= true;
3289 if (waitfor_commit
) {
3290 _flush(in
, new C_Client_FlushComplete(this, in
));
3291 ldout(cct
, 10) << "waiting for WRBUFFER to get dropped" << dendl
;
3296 if (!waitfor_caps
&& !waitfor_commit
) {
3297 if ((have
& need
) == need
) {
// Grant wanted bits only if they are not currently being revoked.
3298 int revoking
= implemented
& ~have
;
3299 ldout(cct
, 10) << "get_caps " << *in
<< " have " << ccap_string(have
)
3300 << " need " << ccap_string(need
) << " want " << ccap_string(want
)
3301 << " revoking " << ccap_string(revoking
)
3303 if ((revoking
& want
) == 0) {
3304 *phave
= need
| (have
& want
);
3305 in
->get_cap_ref(need
);
3309 ldout(cct
, 10) << "waiting for caps " << *in
<< " need " << ccap_string(need
) << " want " << ccap_string(want
) << dendl
;
3310 waitfor_caps
= true;
3313 if ((need
& CEPH_CAP_FILE_WR
) && in
->auth_cap
&&
3314 in
->auth_cap
->session
->readonly
)
// If our caps were dropped by the MDS (I_CAP_DROPPED), renew them
// before waiting, clearing the flag once the MDS wants what we want.
3317 if (in
->flags
& I_CAP_DROPPED
) {
3318 int mds_wanted
= in
->caps_mds_wanted();
3319 if ((mds_wanted
& need
) != need
) {
3320 int ret
= _renew_caps(in
);
3325 if (!(file_wanted
& ~mds_wanted
))
3326 in
->flags
&= ~I_CAP_DROPPED
;
3330 wait_on_list(in
->waitfor_caps
);
3331 else if (waitfor_commit
)
3332 wait_on_list(in
->waitfor_commit
);
// Return the caps currently in use on the inode, forcing FILE_CACHE
// on when the object cacher still holds data for it even though no
// explicit CACHE ref is held.
3336 int Client::get_caps_used(Inode
*in
)
3338 unsigned used
= in
->caps_used();
3339 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3340 !objectcacher
->set_is_empty(&in
->oset
))
3341 used
|= CEPH_CAP_FILE_CACHE
;
// Defer cap release for this inode: hold caps until now +
// client_caps_release_delay and queue the inode on delayed_list.
3345 void Client::cap_delay_requeue(Inode
*in
)
3347 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3348 in
->hold_caps_until
= ceph_clock_now();
3349 in
->hold_caps_until
+= cct
->_conf
->client_caps_release_delay
;
3350 delayed_list
.push_back(&in
->delay_cap_item
);
// Build and send a CEPH_CAP_OP_UPDATE MClientCaps message for one cap
// on one session: computes what to retain/drop given what is being
// revoked, optionally injects a release failure (test config), fills
// in inode attributes and flushed xattr/file state, and for the auth
// cap carries the max_size request.
3353 void Client::send_cap(Inode
*in
, MetaSession
*session
, Cap
*cap
,
3354 int flags
, int used
, int want
, int retain
,
3355 int flush
, ceph_tid_t flush_tid
)
3357 int held
= cap
->issued
| cap
->implemented
;
3358 int revoking
= cap
->implemented
& ~cap
->issued
;
// Never retain bits the MDS is revoking; whatever was issued but not
// retained is what this message drops.
3359 retain
&= ~revoking
;
3360 int dropping
= cap
->issued
& ~retain
;
3361 int op
= CEPH_CAP_OP_UPDATE
;
3363 ldout(cct
, 10) << __func__
<< " " << *in
3364 << " mds." << session
->mds_num
<< " seq " << cap
->seq
3365 << " used " << ccap_string(used
)
3366 << " want " << ccap_string(want
)
3367 << " flush " << ccap_string(flush
)
3368 << " retain " << ccap_string(retain
)
3369 << " held "<< ccap_string(held
)
3370 << " revoking " << ccap_string(revoking
)
3371 << " dropping " << ccap_string(dropping
)
// Test hook: pretend we failed to release caps so MDS-side recovery
// paths can be exercised.
3374 if (cct
->_conf
->client_inject_release_failure
&& revoking
) {
3375 const int would_have_issued
= cap
->issued
& retain
;
3376 const int would_have_implemented
= cap
->implemented
& (cap
->issued
| used
);
3378 // - tell the server we think issued is whatever they issued plus whatever we implemented
3379 // - leave what we have implemented in place
3380 ldout(cct
, 20) << __func__
<< " injecting failure to release caps" << dendl
;
3381 cap
->issued
= cap
->issued
| cap
->implemented
;
3383 // Make an exception for revoking xattr caps: we are injecting
3384 // failure to release other caps, but allow xattr because client
3385 // will block on xattr ops if it can't release these to MDS (#9800)
3386 const int xattr_mask
= CEPH_CAP_XATTR_SHARED
| CEPH_CAP_XATTR_EXCL
;
3387 cap
->issued
^= xattr_mask
& revoking
;
3388 cap
->implemented
^= xattr_mask
& revoking
;
3390 ldout(cct
, 20) << __func__
<< " issued " << ccap_string(cap
->issued
) << " vs " << ccap_string(would_have_issued
) << dendl
;
3391 ldout(cct
, 20) << __func__
<< " implemented " << ccap_string(cap
->implemented
) << " vs " << ccap_string(would_have_implemented
) << dendl
;
// Normal path: shrink issued/implemented to what we keep.
3394 cap
->issued
&= retain
;
3395 cap
->implemented
&= cap
->issued
| used
;
3398 snapid_t follows
= 0;
3401 follows
= in
->snaprealm
->get_snap_context().seq
;
3403 auto m
= make_message
<MClientCaps
>(op
,
3406 cap
->cap_id
, cap
->seq
,
3412 m
->caller_uid
= in
->cap_dirtier_uid
;
3413 m
->caller_gid
= in
->cap_dirtier_gid
;
3415 m
->head
.issue_seq
= cap
->issue_seq
;
3416 m
->set_tid(flush_tid
);
3418 m
->head
.uid
= in
->uid
;
3419 m
->head
.gid
= in
->gid
;
3420 m
->head
.mode
= in
->mode
;
3422 m
->head
.nlink
= in
->nlink
;
// Flushing xattr state carries the encoded xattr map and version.
3424 if (flush
& CEPH_CAP_XATTR_EXCL
) {
3425 encode(in
->xattrs
, m
->xattrbl
);
3426 m
->head
.xattr_version
= in
->xattr_version
;
3430 m
->max_size
= in
->max_size
;
3431 m
->truncate_seq
= in
->truncate_seq
;
3432 m
->truncate_size
= in
->truncate_size
;
3433 m
->mtime
= in
->mtime
;
3434 m
->atime
= in
->atime
;
3435 m
->ctime
= in
->ctime
;
3436 m
->btime
= in
->btime
;
3437 m
->time_warp_seq
= in
->time_warp_seq
;
3438 m
->change_attr
= in
->change_attr
;
// Tell the MDS a capsnap flush is still pending when the newest
// cap_snap has not been assigned a flush tid yet.
3440 if (!(flags
& MClientCaps::FLAG_PENDING_CAPSNAP
) &&
3441 !in
->cap_snaps
.empty() &&
3442 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3443 flags
|= MClientCaps::FLAG_PENDING_CAPSNAP
;
3446 if (flush
& CEPH_CAP_FILE_WR
) {
3447 m
->inline_version
= in
->inline_version
;
3448 m
->inline_data
= in
->inline_data
;
3451 in
->reported_size
= in
->size
;
3452 m
->set_snap_follows(follows
);
// Only the auth cap carries max_size negotiation: request growth to
// wanted_max_size when any write cap is wanted, otherwise reset.
3454 if (cap
== in
->auth_cap
) {
3455 if (want
& CEPH_CAP_ANY_FILE_WR
) {
3456 m
->set_max_size(in
->wanted_max_size
);
3457 in
->requested_max_size
= in
->wanted_max_size
;
3458 ldout(cct
, 15) << "auth cap, requesting max_size " << in
->requested_max_size
<< dendl
;
3460 in
->requested_max_size
= 0;
3461 ldout(cct
, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl
;
3465 if (!session
->flushing_caps_tids
.empty())
3466 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3468 session
->con
->send_message2(std::move(m
));
3471 static bool is_max_size_approaching(Inode
*in
)
3473 /* mds will adjust max size according to the reported size */
3474 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3476 if (in
->size
>= in
->max_size
)
3478 /* half of previous max_size increment has been used */
3479 if (in
->max_size
> in
->reported_size
&&
3480 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3485 static int adjust_caps_used_for_lazyio(int used
, int issued
, int implemented
)
3487 if (!(used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
)))
3489 if (!(implemented
& CEPH_CAP_FILE_LAZYIO
))
3492 if (issued
& CEPH_CAP_FILE_LAZYIO
) {
3493 if (!(issued
& CEPH_CAP_FILE_CACHE
)) {
3494 used
&= ~CEPH_CAP_FILE_CACHE
;
3495 used
|= CEPH_CAP_FILE_LAZYIO
;
3497 if (!(issued
& CEPH_CAP_FILE_BUFFER
)) {
3498 used
&= ~CEPH_CAP_FILE_BUFFER
;
3499 used
|= CEPH_CAP_FILE_LAZYIO
;
3502 if (!(implemented
& CEPH_CAP_FILE_CACHE
)) {
3503 used
&= ~CEPH_CAP_FILE_CACHE
;
3504 used
|= CEPH_CAP_FILE_LAZYIO
;
3506 if (!(implemented
& CEPH_CAP_FILE_BUFFER
)) {
3507 used
&= ~CEPH_CAP_FILE_BUFFER
;
3508 used
|= CEPH_CAP_FILE_LAZYIO
;
3517 * Examine currently used and wanted versus held caps. Release, flush or ack
3518 * revoked caps to the MDS as appropriate.
3520 * @param in the inode to check
3521 * @param flags flags to apply to cap check
3523 void Client::check_caps(Inode
*in
, unsigned flags
)
3525 unsigned wanted
= in
->caps_wanted();
3526 unsigned used
= get_caps_used(in
);
3530 int issued
= in
->caps_issued(&implemented
);
3531 int revoking
= implemented
& ~issued
;
3533 int orig_used
= used
;
3534 used
= adjust_caps_used_for_lazyio(used
, issued
, implemented
);
3536 int retain
= wanted
| used
| CEPH_CAP_PIN
;
3537 if (!unmounting
&& in
->nlink
> 0) {
3539 retain
|= CEPH_CAP_ANY
;
3540 } else if (in
->is_dir() &&
3541 (issued
& CEPH_CAP_FILE_SHARED
) &&
3542 (in
->flags
& I_COMPLETE
)) {
3543 // we do this here because we don't want to drop to Fs (and then
3544 // drop the Fs if we do a create!) if that alone makes us send lookups
3545 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3546 wanted
= CEPH_CAP_ANY_SHARED
| CEPH_CAP_FILE_EXCL
;
3549 retain
|= CEPH_CAP_ANY_SHARED
;
3550 // keep RD only if we didn't have the file open RW,
3551 // because then the mds would revoke it anyway to
3552 // journal max_size=0.
3553 if (in
->max_size
== 0)
3554 retain
|= CEPH_CAP_ANY_RD
;
3558 ldout(cct
, 10) << __func__
<< " on " << *in
3559 << " wanted " << ccap_string(wanted
)
3560 << " used " << ccap_string(used
)
3561 << " issued " << ccap_string(issued
)
3562 << " revoking " << ccap_string(revoking
)
3563 << " flags=" << flags
3566 if (in
->snapid
!= CEPH_NOSNAP
)
3567 return; //snap caps last forever, can't write
3569 if (in
->caps
.empty())
3570 return; // guard if at end of func
3572 if (!(orig_used
& CEPH_CAP_FILE_BUFFER
) &&
3573 (revoking
& used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
3575 used
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
3579 for (auto &p
: in
->caps
) {
3580 mds_rank_t mds
= p
.first
;
3581 Cap
&cap
= p
.second
;
3583 MetaSession
*session
= &mds_sessions
.at(mds
);
3586 if (in
->auth_cap
&& &cap
!= in
->auth_cap
)
3587 cap_used
&= ~in
->auth_cap
->issued
;
3589 revoking
= cap
.implemented
& ~cap
.issued
;
3591 ldout(cct
, 10) << " cap mds." << mds
3592 << " issued " << ccap_string(cap
.issued
)
3593 << " implemented " << ccap_string(cap
.implemented
)
3594 << " revoking " << ccap_string(revoking
) << dendl
;
3596 if (in
->wanted_max_size
> in
->max_size
&&
3597 in
->wanted_max_size
> in
->requested_max_size
&&
3598 &cap
== in
->auth_cap
)
3601 /* approaching file_max? */
3602 if ((cap
.issued
& CEPH_CAP_FILE_WR
) &&
3603 &cap
== in
->auth_cap
&&
3604 is_max_size_approaching(in
)) {
3605 ldout(cct
, 10) << "size " << in
->size
<< " approaching max_size " << in
->max_size
3606 << ", reported " << in
->reported_size
<< dendl
;
3610 /* completed revocation? */
3611 if (revoking
&& (revoking
& cap_used
) == 0) {
3612 ldout(cct
, 10) << "completed revocation of " << ccap_string(cap
.implemented
& ~cap
.issued
) << dendl
;
3616 /* want more caps from mds? */
3617 if (wanted
& ~(cap
.wanted
| cap
.issued
))
3620 if (!revoking
&& unmounting
&& (cap_used
== 0))
3623 if ((cap
.issued
& ~retain
) == 0 && // and we don't have anything we wouldn't like
3624 !in
->dirty_caps
) // and we have no dirty caps
3627 if (!(flags
& CHECK_CAPS_NODELAY
)) {
3628 ldout(cct
, 10) << "delaying cap release" << dendl
;
3629 cap_delay_requeue(in
);
3634 if (&cap
== in
->auth_cap
) {
3635 if (in
->flags
& I_KICK_FLUSH
) {
3636 ldout(cct
, 20) << " reflushing caps (check_caps) on " << *in
3637 << " to mds." << mds
<< dendl
;
3638 kick_flushing_caps(in
, session
);
3640 if (!in
->cap_snaps
.empty() &&
3641 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3647 ceph_tid_t flush_tid
;
3648 if (in
->auth_cap
== &cap
&& in
->dirty_caps
) {
3649 flushing
= mark_caps_flushing(in
, &flush_tid
);
3650 if (flags
& CHECK_CAPS_SYNCHRONOUS
)
3651 msg_flags
|= MClientCaps::FLAG_SYNC
;
3657 send_cap(in
, session
, &cap
, msg_flags
, cap_used
, wanted
, retain
,
3658 flushing
, flush_tid
);
3663 void Client::queue_cap_snap(Inode
*in
, SnapContext
& old_snapc
)
3665 int used
= get_caps_used(in
);
3666 int dirty
= in
->caps_dirty();
3667 ldout(cct
, 10) << __func__
<< " " << *in
<< " snapc " << old_snapc
<< " used " << ccap_string(used
) << dendl
;
3669 if (in
->cap_snaps
.size() &&
3670 in
->cap_snaps
.rbegin()->second
.writing
) {
3671 ldout(cct
, 10) << __func__
<< " already have pending cap_snap on " << *in
<< dendl
;
3673 } else if (in
->caps_dirty() ||
3674 (used
& CEPH_CAP_FILE_WR
) ||
3675 (dirty
& CEPH_CAP_ANY_WR
)) {
3676 const auto &capsnapem
= in
->cap_snaps
.emplace(std::piecewise_construct
, std::make_tuple(old_snapc
.seq
), std::make_tuple(in
));
3677 ceph_assert(capsnapem
.second
); /* element inserted */
3678 CapSnap
&capsnap
= capsnapem
.first
->second
;
3679 capsnap
.context
= old_snapc
;
3680 capsnap
.issued
= in
->caps_issued();
3681 capsnap
.dirty
= in
->caps_dirty();
3683 capsnap
.dirty_data
= (used
& CEPH_CAP_FILE_BUFFER
);
3685 capsnap
.uid
= in
->uid
;
3686 capsnap
.gid
= in
->gid
;
3687 capsnap
.mode
= in
->mode
;
3688 capsnap
.btime
= in
->btime
;
3689 capsnap
.xattrs
= in
->xattrs
;
3690 capsnap
.xattr_version
= in
->xattr_version
;
3691 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3692 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3694 if (used
& CEPH_CAP_FILE_WR
) {
3695 ldout(cct
, 10) << __func__
<< " WR used on " << *in
<< dendl
;
3696 capsnap
.writing
= 1;
3698 finish_cap_snap(in
, capsnap
, used
);
3701 ldout(cct
, 10) << __func__
<< " not dirty|writing on " << *in
<< dendl
;
3705 void Client::finish_cap_snap(Inode
*in
, CapSnap
&capsnap
, int used
)
3707 ldout(cct
, 10) << __func__
<< " " << *in
<< " capsnap " << (void *)&capsnap
<< " used " << ccap_string(used
) << dendl
;
3708 capsnap
.size
= in
->size
;
3709 capsnap
.mtime
= in
->mtime
;
3710 capsnap
.atime
= in
->atime
;
3711 capsnap
.ctime
= in
->ctime
;
3712 capsnap
.time_warp_seq
= in
->time_warp_seq
;
3713 capsnap
.change_attr
= in
->change_attr
;
3714 capsnap
.dirty
|= in
->caps_dirty();
3716 /* Only reset it if it wasn't set before */
3717 if (capsnap
.cap_dirtier_uid
== -1) {
3718 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3719 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3722 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3723 capsnap
.inline_data
= in
->inline_data
;
3724 capsnap
.inline_version
= in
->inline_version
;
3727 if (used
& CEPH_CAP_FILE_BUFFER
) {
3728 capsnap
.writing
= 1;
3729 ldout(cct
, 10) << __func__
<< " " << *in
<< " cap_snap " << &capsnap
<< " used " << used
3730 << " WRBUFFER, delaying" << dendl
;
3732 capsnap
.dirty_data
= 0;
3737 void Client::send_flush_snap(Inode
*in
, MetaSession
*session
,
3738 snapid_t follows
, CapSnap
& capsnap
)
3740 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_FLUSHSNAP
,
3741 in
->ino
, in
->snaprealm
->ino
, 0,
3742 in
->auth_cap
->mseq
, cap_epoch_barrier
);
3743 m
->caller_uid
= capsnap
.cap_dirtier_uid
;
3744 m
->caller_gid
= capsnap
.cap_dirtier_gid
;
3746 m
->set_client_tid(capsnap
.flush_tid
);
3747 m
->head
.snap_follows
= follows
;
3749 m
->head
.caps
= capsnap
.issued
;
3750 m
->head
.dirty
= capsnap
.dirty
;
3752 m
->head
.uid
= capsnap
.uid
;
3753 m
->head
.gid
= capsnap
.gid
;
3754 m
->head
.mode
= capsnap
.mode
;
3755 m
->btime
= capsnap
.btime
;
3757 m
->size
= capsnap
.size
;
3759 m
->head
.xattr_version
= capsnap
.xattr_version
;
3760 encode(capsnap
.xattrs
, m
->xattrbl
);
3762 m
->ctime
= capsnap
.ctime
;
3763 m
->btime
= capsnap
.btime
;
3764 m
->mtime
= capsnap
.mtime
;
3765 m
->atime
= capsnap
.atime
;
3766 m
->time_warp_seq
= capsnap
.time_warp_seq
;
3767 m
->change_attr
= capsnap
.change_attr
;
3769 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3770 m
->inline_version
= in
->inline_version
;
3771 m
->inline_data
= in
->inline_data
;
3774 ceph_assert(!session
->flushing_caps_tids
.empty());
3775 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3777 session
->con
->send_message2(std::move(m
));
3780 void Client::flush_snaps(Inode
*in
)
3782 ldout(cct
, 10) << "flush_snaps on " << *in
<< dendl
;
3783 ceph_assert(in
->cap_snaps
.size());
3786 ceph_assert(in
->auth_cap
);
3787 MetaSession
*session
= in
->auth_cap
->session
;
3789 for (auto &p
: in
->cap_snaps
) {
3790 CapSnap
&capsnap
= p
.second
;
3791 // only do new flush
3792 if (capsnap
.flush_tid
> 0)
3795 ldout(cct
, 10) << "flush_snaps mds." << session
->mds_num
3796 << " follows " << p
.first
3797 << " size " << capsnap
.size
3798 << " mtime " << capsnap
.mtime
3799 << " dirty_data=" << capsnap
.dirty_data
3800 << " writing=" << capsnap
.writing
3801 << " on " << *in
<< dendl
;
3802 if (capsnap
.dirty_data
|| capsnap
.writing
)
3805 capsnap
.flush_tid
= ++last_flush_tid
;
3806 session
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
3807 in
->flushing_cap_tids
[capsnap
.flush_tid
] = 0;
3808 if (!in
->flushing_cap_item
.is_on_list())
3809 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
3811 send_flush_snap(in
, session
, p
.first
, capsnap
);
3815 void Client::wait_on_list(list
<ceph::condition_variable
*>& ls
)
3817 ceph::condition_variable cond
;
3818 ls
.push_back(&cond
);
3819 std::unique_lock l
{client_lock
, std::adopt_lock
};
3825 void Client::signal_cond_list(list
<ceph::condition_variable
*>& ls
)
3827 for (auto cond
: ls
) {
3832 void Client::wait_on_context_list(list
<Context
*>& ls
)
3834 ceph::condition_variable cond
;
3837 ls
.push_back(new C_Cond(cond
, &done
, &r
));
3838 std::unique_lock l
{client_lock
, std::adopt_lock
};
3839 cond
.wait(l
, [&done
] { return done
;});
3843 void Client::signal_context_list(list
<Context
*>& ls
)
3845 while (!ls
.empty()) {
3846 ls
.front()->complete(0);
3851 void Client::wake_up_session_caps(MetaSession
*s
, bool reconnect
)
3853 for (const auto &cap
: s
->caps
) {
3854 auto &in
= cap
->inode
;
3856 in
.requested_max_size
= 0;
3857 in
.wanted_max_size
= 0;
3859 if (cap
->gen
< s
->cap_gen
) {
3860 // mds did not re-issue stale cap.
3861 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
3862 // make sure mds knows what we want.
3863 if (in
.caps_file_wanted() & ~cap
->wanted
)
3864 in
.flags
|= I_CAP_DROPPED
;
3867 signal_cond_list(in
.waitfor_caps
);
3872 // flush dirty data (from objectcache)
3874 class C_Client_CacheInvalidate
: public Context
{
3878 int64_t offset
, length
;
3880 C_Client_CacheInvalidate(Client
*c
, Inode
*in
, int64_t off
, int64_t len
) :
3881 client(c
), offset(off
), length(len
) {
3882 if (client
->use_faked_inos())
3883 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
3887 void finish(int r
) override
{
3888 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3889 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
3890 client
->_async_invalidate(ino
, offset
, length
);
3894 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
3898 ldout(cct
, 10) << __func__
<< " " << ino
<< " " << off
<< "~" << len
<< dendl
;
3899 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
3902 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
3904 if (ino_invalidate_cb
)
3905 // we queue the invalidate, which calls the callback and decrements the ref
3906 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
3909 void Client::_invalidate_inode_cache(Inode
*in
)
3911 ldout(cct
, 10) << __func__
<< " " << *in
<< dendl
;
3913 // invalidate our userspace inode cache
3914 if (cct
->_conf
->client_oc
) {
3915 objectcacher
->release_set(&in
->oset
);
3916 if (!objectcacher
->set_is_empty(&in
->oset
))
3917 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
3920 _schedule_invalidate_callback(in
, 0, 0);
3923 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
3925 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
3927 // invalidate our userspace inode cache
3928 if (cct
->_conf
->client_oc
) {
3929 vector
<ObjectExtent
> ls
;
3930 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
3931 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
3934 _schedule_invalidate_callback(in
, off
, len
);
3937 bool Client::_release(Inode
*in
)
3939 ldout(cct
, 20) << "_release " << *in
<< dendl
;
3940 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3941 _invalidate_inode_cache(in
);
3947 bool Client::_flush(Inode
*in
, Context
*onfinish
)
3949 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
3951 if (!in
->oset
.dirty_or_tx
) {
3952 ldout(cct
, 10) << " nothing to flush" << dendl
;
3953 onfinish
->complete(0);
3957 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
3958 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
3959 objectcacher
->purge_set(&in
->oset
);
3961 onfinish
->complete(-ENOSPC
);
3966 return objectcacher
->flush_set(&in
->oset
, onfinish
);
3969 void Client::_flush_range(Inode
*in
, int64_t offset
, uint64_t size
)
3971 ceph_assert(ceph_mutex_is_locked(client_lock
));
3972 if (!in
->oset
.dirty_or_tx
) {
3973 ldout(cct
, 10) << " nothing to flush" << dendl
;
3977 C_SaferCond
onflush("Client::_flush_range flock");
3978 bool ret
= objectcacher
->file_flush(&in
->oset
, &in
->layout
, in
->snaprealm
->get_snap_context(),
3979 offset
, size
, &onflush
);
3982 client_lock
.unlock();
3988 void Client::flush_set_callback(ObjectCacher::ObjectSet
*oset
)
3990 // std::lock_guard l(client_lock);
3991 ceph_assert(ceph_mutex_is_locked(client_lock
)); // will be called via dispatch() -> objecter -> ...
3992 Inode
*in
= static_cast<Inode
*>(oset
->parent
);
3997 void Client::_flushed(Inode
*in
)
3999 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
4001 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
4006 // checks common to add_update_cap, handle_cap_grant
4007 void Client::check_cap_issue(Inode
*in
, unsigned issued
)
4009 unsigned had
= in
->caps_issued();
4011 if ((issued
& CEPH_CAP_FILE_CACHE
) &&
4012 !(had
& CEPH_CAP_FILE_CACHE
))
4015 if ((issued
& CEPH_CAP_FILE_SHARED
) !=
4016 (had
& CEPH_CAP_FILE_SHARED
)) {
4017 if (issued
& CEPH_CAP_FILE_SHARED
)
4020 clear_dir_complete_and_ordered(in
, true);
4024 void Client::add_update_cap(Inode
*in
, MetaSession
*mds_session
, uint64_t cap_id
,
4025 unsigned issued
, unsigned wanted
, unsigned seq
, unsigned mseq
,
4026 inodeno_t realm
, int flags
, const UserPerm
& cap_perms
)
4028 if (!in
->is_any_caps()) {
4029 ceph_assert(in
->snaprealm
== 0);
4030 in
->snaprealm
= get_snap_realm(realm
);
4031 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4032 ldout(cct
, 15) << __func__
<< " first one, opened snaprealm " << in
->snaprealm
<< dendl
;
4034 ceph_assert(in
->snaprealm
);
4035 if ((flags
& CEPH_CAP_FLAG_AUTH
) &&
4036 realm
!= inodeno_t(-1) && in
->snaprealm
->ino
!= realm
) {
4037 in
->snaprealm_item
.remove_myself();
4038 auto oldrealm
= in
->snaprealm
;
4039 in
->snaprealm
= get_snap_realm(realm
);
4040 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4041 put_snap_realm(oldrealm
);
4045 mds_rank_t mds
= mds_session
->mds_num
;
4046 const auto &capem
= in
->caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
), std::forward_as_tuple(*in
, mds_session
));
4047 Cap
&cap
= capem
.first
->second
;
4048 if (!capem
.second
) {
4049 if (cap
.gen
< mds_session
->cap_gen
)
4050 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
4053 * auth mds of the inode changed. we received the cap export
4054 * message, but still haven't received the cap import message.
4055 * handle_cap_export() updated the new auth MDS' cap.
4057 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4058 * a message that was send before the cap import message. So
4059 * don't remove caps.
4061 if (ceph_seq_cmp(seq
, cap
.seq
) <= 0) {
4062 if (&cap
!= in
->auth_cap
)
4063 ldout(cct
, 0) << "WARNING: " << "inode " << *in
<< " caps on mds." << mds
<< " != auth_cap." << dendl
;
4065 ceph_assert(cap
.cap_id
== cap_id
);
4068 issued
|= cap
.issued
;
4069 flags
|= CEPH_CAP_FLAG_AUTH
;
4073 check_cap_issue(in
, issued
);
4075 if (flags
& CEPH_CAP_FLAG_AUTH
) {
4076 if (in
->auth_cap
!= &cap
&&
4077 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
4078 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
4079 ldout(cct
, 10) << __func__
<< " changing auth cap: "
4080 << "add myself to new auth MDS' flushing caps list" << dendl
;
4081 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
4083 in
->auth_cap
= &cap
;
4087 unsigned old_caps
= cap
.issued
;
4088 cap
.cap_id
= cap_id
;
4089 cap
.issued
= issued
;
4090 cap
.implemented
|= issued
;
4091 if (ceph_seq_cmp(mseq
, cap
.mseq
) > 0)
4092 cap
.wanted
= wanted
;
4094 cap
.wanted
|= wanted
;
4096 cap
.issue_seq
= seq
;
4098 cap
.gen
= mds_session
->cap_gen
;
4099 cap
.latest_perms
= cap_perms
;
4100 ldout(cct
, 10) << __func__
<< " issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
.issued
)
4101 << " from mds." << mds
4105 if ((issued
& ~old_caps
) && in
->auth_cap
== &cap
) {
4106 // non-auth MDS is revoking the newly grant caps ?
4107 for (auto &p
: in
->caps
) {
4108 if (&p
.second
== &cap
)
4110 if (p
.second
.implemented
& ~p
.second
.issued
& issued
) {
4111 check_caps(in
, CHECK_CAPS_NODELAY
);
4117 if (issued
& ~old_caps
)
4118 signal_cond_list(in
->waitfor_caps
);
4121 void Client::remove_cap(Cap
*cap
, bool queue_release
)
4123 auto &in
= cap
->inode
;
4124 MetaSession
*session
= cap
->session
;
4125 mds_rank_t mds
= cap
->session
->mds_num
;
4127 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " on " << in
<< dendl
;
4129 if (queue_release
) {
4130 session
->enqueue_cap_release(
4138 if (in
.auth_cap
== cap
) {
4139 if (in
.flushing_cap_item
.is_on_list()) {
4140 ldout(cct
, 10) << " removing myself from flushing_cap list" << dendl
;
4141 in
.flushing_cap_item
.remove_myself();
4145 size_t n
= in
.caps
.erase(mds
);
4146 ceph_assert(n
== 1);
4149 if (!in
.is_any_caps()) {
4150 ldout(cct
, 15) << __func__
<< " last one, closing snaprealm " << in
.snaprealm
<< dendl
;
4151 in
.snaprealm_item
.remove_myself();
4152 put_snap_realm(in
.snaprealm
);
4157 void Client::remove_all_caps(Inode
*in
)
4159 while (!in
->caps
.empty())
4160 remove_cap(&in
->caps
.begin()->second
, true);
4163 void Client::remove_session_caps(MetaSession
*s
, int err
)
4165 ldout(cct
, 10) << __func__
<< " mds." << s
->mds_num
<< dendl
;
4167 while (s
->caps
.size()) {
4168 Cap
*cap
= *s
->caps
.begin();
4169 InodeRef
in(&cap
->inode
);
4170 bool dirty_caps
= false;
4171 if (in
->auth_cap
== cap
) {
4172 dirty_caps
= in
->dirty_caps
| in
->flushing_caps
;
4173 in
->wanted_max_size
= 0;
4174 in
->requested_max_size
= 0;
4175 if (in
->has_any_filelocks())
4176 in
->flags
|= I_ERROR_FILELOCK
;
4178 auto caps
= cap
->implemented
;
4179 if (cap
->wanted
| cap
->issued
)
4180 in
->flags
|= I_CAP_DROPPED
;
4181 remove_cap(cap
, false);
4182 in
->cap_snaps
.clear();
4184 lderr(cct
) << __func__
<< " still has dirty|flushing caps on " << *in
<< dendl
;
4185 if (in
->flushing_caps
) {
4186 num_flushing_caps
--;
4187 in
->flushing_cap_tids
.clear();
4189 in
->flushing_caps
= 0;
4190 in
->mark_caps_clean();
4191 put_inode(in
.get());
4193 caps
&= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
;
4194 if (caps
&& !in
->caps_issued_mask(caps
, true)) {
4195 if (err
== -EBLACKLISTED
) {
4196 if (in
->oset
.dirty_or_tx
) {
4197 lderr(cct
) << __func__
<< " still has dirty data on " << *in
<< dendl
;
4198 in
->set_async_err(err
);
4200 objectcacher
->purge_set(&in
->oset
);
4202 objectcacher
->release_set(&in
->oset
);
4204 _schedule_invalidate_callback(in
.get(), 0, 0);
4207 signal_cond_list(in
->waitfor_caps
);
4209 s
->flushing_caps_tids
.clear();
4210 sync_cond
.notify_all();
4213 int Client::_do_remount(bool retry_on_error
)
4215 uint64_t max_retries
= cct
->_conf
.get_val
<uint64_t>("mds_max_retries_on_remount_failure");
4218 int r
= remount_cb(callback_handle
);
4220 retries_on_invalidate
= 0;
4223 client_t whoami
= get_nodeid();
4226 "failed to remount (to trim kernel dentries): "
4227 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4230 "failed to remount (to trim kernel dentries): "
4231 "return code = " << r
<< dendl
;
4234 (cct
->_conf
.get_val
<bool>("client_die_on_failed_remount") ||
4235 cct
->_conf
.get_val
<bool>("client_die_on_failed_dentry_invalidate")) &&
4236 !(retry_on_error
&& (++retries_on_invalidate
< max_retries
));
4237 if (should_abort
&& !unmounting
) {
4238 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4245 class C_Client_Remount
: public Context
{
4249 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4250 void finish(int r
) override
{
4251 ceph_assert(r
== 0);
4252 client
->_do_remount(true);
4256 void Client::_invalidate_kernel_dcache()
4260 if (can_invalidate_dentries
) {
4261 if (dentry_invalidate_cb
&& root
->dir
) {
4262 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4263 p
!= root
->dir
->dentries
.end();
4265 if (p
->second
->inode
)
4266 _schedule_invalidate_dentry_callback(p
->second
, false);
4269 } else if (remount_cb
) {
4271 // when remounting a file system, linux kernel trims all unused dentries in the fs
4272 remount_finisher
.queue(new C_Client_Remount(this));
4276 void Client::_trim_negative_child_dentries(InodeRef
& in
)
4282 if (dir
&& dir
->dentries
.size() == dir
->num_null_dentries
) {
4283 for (auto p
= dir
->dentries
.begin(); p
!= dir
->dentries
.end(); ) {
4284 Dentry
*dn
= p
->second
;
4286 ceph_assert(!dn
->inode
);
4287 if (dn
->lru_is_expireable())
4288 unlink(dn
, true, false); // keep dir, drop dentry
4290 if (dir
->dentries
.empty()) {
4295 if (in
->flags
& I_SNAPDIR_OPEN
) {
4296 InodeRef snapdir
= open_snapdir(in
.get());
4297 _trim_negative_child_dentries(snapdir
);
4301 class C_Client_CacheRelease
: public Context
{
4306 C_Client_CacheRelease(Client
*c
, Inode
*in
) :
4308 if (client
->use_faked_inos())
4309 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4313 void finish(int r
) override
{
4314 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4315 client
->_async_inode_release(ino
);
4319 void Client::_async_inode_release(vinodeno_t ino
)
4323 ldout(cct
, 10) << __func__
<< " " << ino
<< dendl
;
4324 ino_release_cb(callback_handle
, ino
);
4327 void Client::_schedule_ino_release_callback(Inode
*in
) {
4330 // we queue the invalidate, which calls the callback and decrements the ref
4331 async_ino_releasor
.queue(new C_Client_CacheRelease(this, in
));
4334 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4336 mds_rank_t mds
= s
->mds_num
;
4337 size_t caps_size
= s
->caps
.size();
4338 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4339 << " caps " << caps_size
<< dendl
;
4341 uint64_t trimmed
= 0;
4342 auto p
= s
->caps
.begin();
4343 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4344 * looking at from getting deleted during traversal. */
4345 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4347 InodeRef
in(&cap
->inode
);
4349 // Increment p early because it will be invalidated if cap
4350 // is deleted inside remove_cap
4353 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4354 int mine
= cap
->issued
| cap
->implemented
;
4355 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4356 // disposable non-auth cap
4357 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4358 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4359 cap
= (remove_cap(cap
, true), nullptr);
4363 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4364 _trim_negative_child_dentries(in
);
4366 auto q
= in
->dentries
.begin();
4367 while (q
!= in
->dentries
.end()) {
4370 if (dn
->lru_is_expireable()) {
4371 if (can_invalidate_dentries
&&
4372 dn
->dir
->parent_inode
->ino
== MDS_INO_ROOT
) {
4373 // Only issue one of these per DN for inodes in root: handle
4374 // others more efficiently by calling for root-child DNs at
4375 // the end of this function.
4376 _schedule_invalidate_dentry_callback(dn
, true);
4378 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4381 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4385 if (in
->ll_ref
== 1 && in
->ino
!= MDS_INO_ROOT
) {
4386 _schedule_ino_release_callback(in
.get());
4388 if (all
&& in
->ino
!= MDS_INO_ROOT
) {
4389 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4394 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4395 for (const auto &dn
: to_trim
) {
4400 caps_size
= s
->caps
.size();
4401 if (caps_size
> (size_t)max
)
4402 _invalidate_kernel_dcache();
4405 void Client::force_session_readonly(MetaSession
*s
)
4408 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4409 auto &in
= (*p
)->inode
;
4410 if (in
.caps_wanted() & CEPH_CAP_FILE_WR
)
4411 signal_cond_list(in
.waitfor_caps
);
4415 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4417 MetaSession
*session
= in
->auth_cap
->session
;
4419 int flushing
= in
->dirty_caps
;
4420 ceph_assert(flushing
);
4422 ceph_tid_t flush_tid
= ++last_flush_tid
;
4423 in
->flushing_cap_tids
[flush_tid
] = flushing
;
4425 if (!in
->flushing_caps
) {
4426 ldout(cct
, 10) << __func__
<< " " << ccap_string(flushing
) << " " << *in
<< dendl
;
4427 num_flushing_caps
++;
4429 ldout(cct
, 10) << __func__
<< " (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4432 in
->flushing_caps
|= flushing
;
4433 in
->mark_caps_clean();
4435 if (!in
->flushing_cap_item
.is_on_list())
4436 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4437 session
->flushing_caps_tids
.insert(flush_tid
);
4443 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4445 for (auto &p
: in
->cap_snaps
) {
4446 CapSnap
&capsnap
= p
.second
;
4447 if (capsnap
.flush_tid
> 0) {
4448 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4449 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4452 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4453 it
!= in
->flushing_cap_tids
.end();
4455 old_s
->flushing_caps_tids
.erase(it
->first
);
4456 new_s
->flushing_caps_tids
.insert(it
->first
);
4458 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4462 * Flush all caps back to the MDS. Because the callers generally wait on the
4463 * result of this function (syncfs and umount cases), we set
4464 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4466 void Client::flush_caps_sync()
4468 ldout(cct
, 10) << __func__
<< dendl
;
4469 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
4471 unsigned flags
= CHECK_CAPS_NODELAY
;
4475 delayed_list
.pop_front();
4476 if (p
.end() && dirty_list
.empty())
4477 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4478 check_caps(in
, flags
);
4482 p
= dirty_list
.begin();
4484 unsigned flags
= CHECK_CAPS_NODELAY
;
4489 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4490 check_caps(in
, flags
);
4494 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4496 while (in
->flushing_caps
) {
4497 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4498 ceph_assert(it
!= in
->flushing_cap_tids
.end());
4499 if (it
->first
> want
)
4501 ldout(cct
, 10) << __func__
<< " on " << *in
<< " flushing "
4502 << ccap_string(it
->second
) << " want " << want
4503 << " last " << it
->first
<< dendl
;
4504 wait_on_list(in
->waitfor_caps
);
4508 void Client::wait_sync_caps(ceph_tid_t want
)
4511 ldout(cct
, 10) << __func__
<< " want " << want
<< " (last is " << last_flush_tid
<< ", "
4512 << num_flushing_caps
<< " total flushing)" << dendl
;
4513 for (auto &p
: mds_sessions
) {
4514 MetaSession
*s
= &p
.second
;
4515 if (s
->flushing_caps_tids
.empty())
4517 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4518 if (oldest_tid
<= want
) {
4519 ldout(cct
, 10) << " waiting on mds." << p
.first
<< " tid " << oldest_tid
4520 << " (want " << want
<< ")" << dendl
;
4521 std::unique_lock l
{client_lock
, std::adopt_lock
};
4529 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4531 in
->flags
&= ~I_KICK_FLUSH
;
4533 Cap
*cap
= in
->auth_cap
;
4534 ceph_assert(cap
->session
== session
);
4536 ceph_tid_t last_snap_flush
= 0;
4537 for (auto p
= in
->flushing_cap_tids
.rbegin();
4538 p
!= in
->flushing_cap_tids
.rend();
4541 last_snap_flush
= p
->first
;
4546 int wanted
= in
->caps_wanted();
4547 int used
= get_caps_used(in
) | in
->caps_dirty();
4548 auto it
= in
->cap_snaps
.begin();
4549 for (auto& p
: in
->flushing_cap_tids
) {
4551 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4552 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
4555 ceph_assert(it
!= in
->cap_snaps
.end());
4556 ceph_assert(it
->second
.flush_tid
== p
.first
);
4557 send_flush_snap(in
, session
, it
->first
, it
->second
);
4563 void Client::kick_flushing_caps(MetaSession
*session
)
4565 mds_rank_t mds
= session
->mds_num
;
4566 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4568 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4570 if (in
->flags
& I_KICK_FLUSH
) {
4571 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4572 kick_flushing_caps(in
, session
);
4577 void Client::early_kick_flushing_caps(MetaSession
*session
)
4579 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4581 Cap
*cap
= in
->auth_cap
;
4584 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4585 // stage. This guarantees that MDS processes the cap flush message before issuing
4586 // the flushing caps to other client.
4587 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
4588 in
->flags
|= I_KICK_FLUSH
;
4592 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4593 << " to mds." << session
->mds_num
<< dendl
;
4594 // send_reconnect() also will reset these sequence numbers. make sure
4595 // sequence numbers in cap flush message match later reconnect message.
4599 cap
->issued
= cap
->implemented
;
4601 kick_flushing_caps(in
, session
);
4605 void SnapRealm::build_snap_context()
4607 set
<snapid_t
> snaps
;
4608 snapid_t max_seq
= seq
;
4610 // start with prior_parents?
4611 for (unsigned i
=0; i
<prior_parent_snaps
.size(); i
++)
4612 snaps
.insert(prior_parent_snaps
[i
]);
4614 // current parent's snaps
4616 const SnapContext
& psnapc
= pparent
->get_snap_context();
4617 for (unsigned i
=0; i
<psnapc
.snaps
.size(); i
++)
4618 if (psnapc
.snaps
[i
] >= parent_since
)
4619 snaps
.insert(psnapc
.snaps
[i
]);
4620 if (psnapc
.seq
> max_seq
)
4621 max_seq
= psnapc
.seq
;
4625 for (unsigned i
=0; i
<my_snaps
.size(); i
++)
4626 snaps
.insert(my_snaps
[i
]);
4629 cached_snap_context
.seq
= max_seq
;
4630 cached_snap_context
.snaps
.resize(0);
4631 cached_snap_context
.snaps
.reserve(snaps
.size());
4632 for (set
<snapid_t
>::reverse_iterator p
= snaps
.rbegin(); p
!= snaps
.rend(); ++p
)
4633 cached_snap_context
.snaps
.push_back(*p
);
4636 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
4641 while (!q
.empty()) {
4645 ldout(cct
, 10) << __func__
<< " " << *realm
<< dendl
;
4646 realm
->invalidate_cache();
4648 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4649 p
!= realm
->pchildren
.end();
4655 SnapRealm
*Client::get_snap_realm(inodeno_t r
)
4657 SnapRealm
*realm
= snap_realms
[r
];
4659 snap_realms
[r
] = realm
= new SnapRealm(r
);
4660 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4665 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4667 if (snap_realms
.count(r
) == 0) {
4668 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
4671 SnapRealm
*realm
= snap_realms
[r
];
4672 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4677 void Client::put_snap_realm(SnapRealm
*realm
)
4679 ldout(cct
, 20) << __func__
<< " " << realm
->ino
<< " " << realm
4680 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
4681 if (--realm
->nref
== 0) {
4682 snap_realms
.erase(realm
->ino
);
4683 if (realm
->pparent
) {
4684 realm
->pparent
->pchildren
.erase(realm
);
4685 put_snap_realm(realm
->pparent
);
4691 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
4693 if (realm
->parent
!= parent
) {
4694 ldout(cct
, 10) << __func__
<< " " << *realm
4695 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
4696 realm
->parent
= parent
;
4697 if (realm
->pparent
) {
4698 realm
->pparent
->pchildren
.erase(realm
);
4699 put_snap_realm(realm
->pparent
);
4701 realm
->pparent
= get_snap_realm(parent
);
4702 realm
->pparent
->pchildren
.insert(realm
);
4708 static bool has_new_snaps(const SnapContext
& old_snapc
,
4709 const SnapContext
& new_snapc
)
4711 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
4715 void Client::update_snap_trace(const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
4717 SnapRealm
*first_realm
= NULL
;
4718 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
4720 map
<SnapRealm
*, SnapContext
> dirty_realms
;
4722 auto p
= bl
.cbegin();
4726 SnapRealm
*realm
= get_snap_realm(info
.ino());
4728 bool invalidate
= false;
4730 if (info
.seq() > realm
->seq
) {
4731 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
4735 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4736 // flush me + children
4739 while (!q
.empty()) {
4740 SnapRealm
*realm
= q
.front();
4743 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4744 p
!= realm
->pchildren
.end();
4748 if (dirty_realms
.count(realm
) == 0) {
4750 dirty_realms
[realm
] = realm
->get_snap_context();
4756 realm
->seq
= info
.seq();
4757 realm
->created
= info
.created();
4758 realm
->parent_since
= info
.parent_since();
4759 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
4760 realm
->my_snaps
= info
.my_snaps
;
4764 // _always_ verify parent
4765 if (adjust_realm_parent(realm
, info
.parent()))
4769 invalidate_snaprealm_and_children(realm
);
4770 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
4771 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
4773 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
4774 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
4778 first_realm
= realm
;
4780 put_snap_realm(realm
);
4783 for (auto &[realm
, snapc
] : dirty_realms
) {
4784 // if there are new snaps ?
4785 if (has_new_snaps(snapc
, realm
->get_snap_context())) {
4786 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
4787 for (auto&& in
: realm
->inodes_with_caps
) {
4788 queue_cap_snap(in
, snapc
);
4791 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
4793 put_snap_realm(realm
);
4797 *realm_ret
= first_realm
;
4799 put_snap_realm(first_realm
);
4802 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
4804 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
4805 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4806 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4811 got_mds_push(session
);
4813 map
<Inode
*, SnapContext
> to_move
;
4814 SnapRealm
*realm
= 0;
4816 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
4817 ceph_assert(m
->head
.split
);
4819 auto p
= m
->bl
.cbegin();
4821 ceph_assert(info
.ino() == m
->head
.split
);
4823 // flush, then move, ino's.
4824 realm
= get_snap_realm(info
.ino());
4825 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
4826 for (auto& ino
: m
->split_inos
) {
4827 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
4828 if (inode_map
.count(vino
)) {
4829 Inode
*in
= inode_map
[vino
];
4830 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
4832 if (in
->snaprealm
->created
> info
.created()) {
4833 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
4834 << *in
->snaprealm
<< dendl
;
4837 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
4840 in
->snaprealm_item
.remove_myself();
4841 to_move
[in
] = in
->snaprealm
->get_snap_context();
4842 put_snap_realm(in
->snaprealm
);
4846 // move child snaprealms, too
4847 for (auto& child_realm
: m
->split_realms
) {
4848 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
4849 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
4852 adjust_realm_parent(child
, realm
->ino
);
4853 put_snap_realm(child
);
4857 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
4860 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
4861 Inode
*in
= p
->first
;
4862 in
->snaprealm
= realm
;
4863 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4865 // queue for snap writeback
4866 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
4867 queue_cap_snap(in
, p
->second
);
4869 put_snap_realm(realm
);
4873 void Client::handle_quota(const MConstRef
<MClientQuota
>& m
)
4875 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4876 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4881 got_mds_push(session
);
4883 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << mds
<< dendl
;
4885 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
4886 if (inode_map
.count(vino
)) {
4888 in
= inode_map
[vino
];
4891 in
->quota
= m
->quota
;
4892 in
->rstat
= m
->rstat
;
4897 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
4899 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4900 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4905 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
4906 // Pause RADOS operations until we see the required epoch
4907 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
4910 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
4911 // Record the barrier so that we will transmit it to MDS when releasing
4912 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
4915 got_mds_push(session
);
4918 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
4919 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
4922 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
4923 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
4924 session
->enqueue_cap_release(
4931 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
4934 // in case the mds is waiting on e.g. a revocation
4935 flush_cap_releases();
4939 switch (m
->get_op()) {
4940 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
, in
, m
);
4941 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
, in
, m
);
4942 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
, in
, m
);
4945 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
4946 Cap
&cap
= in
->caps
.at(mds
);
4948 switch (m
->get_op()) {
4949 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
, in
, m
);
4950 case CEPH_CAP_OP_IMPORT
:
4951 case CEPH_CAP_OP_REVOKE
:
4952 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
, in
, &cap
, m
);
4953 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
, in
, &cap
, m
);
4956 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
4961 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4963 mds_rank_t mds
= session
->mds_num
;
4965 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4966 << " IMPORT from mds." << mds
<< dendl
;
4968 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
4971 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
4973 cap_perms
= cap
->latest_perms
;
4977 SnapRealm
*realm
= NULL
;
4978 update_snap_trace(m
->snapbl
, &realm
);
4980 int issued
= m
->get_caps();
4981 int wanted
= m
->get_wanted();
4982 add_update_cap(in
, session
, m
->get_cap_id(),
4983 issued
, wanted
, m
->get_seq(), m
->get_mseq(),
4984 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
4986 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
4987 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
4991 put_snap_realm(realm
);
4993 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
4994 if (!(wanted
& CEPH_CAP_ANY_FILE_WR
) ||
4995 in
->requested_max_size
> m
->get_max_size()) {
4996 in
->requested_max_size
= 0;
4997 ldout(cct
, 15) << "reset requested_max_size after cap import" << dendl
;
4999 // reflush any/all caps (if we are now the auth_cap)
5000 kick_flushing_caps(in
, session
);
5004 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5006 mds_rank_t mds
= session
->mds_num
;
5008 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5009 << " EXPORT from mds." << mds
<< dendl
;
5011 auto it
= in
->caps
.find(mds
);
5012 if (it
!= in
->caps
.end()) {
5013 Cap
&cap
= it
->second
;
5014 if (cap
.cap_id
== m
->get_cap_id()) {
5015 if (m
->peer
.cap_id
) {
5016 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
5017 MetaSession
*tsession
= _get_or_open_mds_session(peer_mds
);
5018 auto it
= in
->caps
.find(peer_mds
);
5019 if (it
!= in
->caps
.end()) {
5020 Cap
&tcap
= it
->second
;
5021 if (tcap
.cap_id
== m
->peer
.cap_id
&&
5022 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
5023 tcap
.cap_id
= m
->peer
.cap_id
;
5024 tcap
.seq
= m
->peer
.seq
- 1;
5025 tcap
.issue_seq
= tcap
.seq
;
5026 tcap
.issued
|= cap
.issued
;
5027 tcap
.implemented
|= cap
.issued
;
5028 if (&cap
== in
->auth_cap
)
5029 in
->auth_cap
= &tcap
;
5030 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
5031 adjust_session_flushing_caps(in
, session
, tsession
);
5034 add_update_cap(in
, tsession
, m
->peer
.cap_id
, cap
.issued
, 0,
5035 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
5036 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
5040 if (cap
.wanted
| cap
.issued
)
5041 in
->flags
|= I_CAP_DROPPED
;
5044 remove_cap(&cap
, false);
5049 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5051 mds_rank_t mds
= session
->mds_num
;
5052 ceph_assert(in
->caps
.count(mds
));
5054 ldout(cct
, 10) << __func__
<< " on ino " << *in
5055 << " size " << in
->size
<< " -> " << m
->get_size()
5059 in
->caps_issued(&issued
);
5060 issued
|= in
->caps_dirty();
5061 update_inode_file_size(in
, issued
, m
->get_size(),
5062 m
->get_truncate_seq(), m
->get_truncate_size());
5065 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5067 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5068 int dirty
= m
->get_dirty();
5072 auto it
= in
->flushing_cap_tids
.begin();
5073 if (it
->first
< flush_ack_tid
) {
5074 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
5075 << " got unexpected flush ack tid " << flush_ack_tid
5076 << " expected is " << it
->first
<< dendl
;
5078 for (; it
!= in
->flushing_cap_tids
.end(); ) {
5084 if (it
->first
== flush_ack_tid
)
5085 cleaned
= it
->second
;
5086 if (it
->first
<= flush_ack_tid
) {
5087 session
->flushing_caps_tids
.erase(it
->first
);
5088 in
->flushing_cap_tids
.erase(it
++);
5092 cleaned
&= ~it
->second
;
5098 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
5099 << " cleaned " << ccap_string(cleaned
) << " on " << *in
5100 << " with " << ccap_string(dirty
) << dendl
;
5103 signal_cond_list(in
->waitfor_caps
);
5104 if (session
->flushing_caps_tids
.empty() ||
5105 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5106 sync_cond
.notify_all();
5110 in
->cap_dirtier_uid
= -1;
5111 in
->cap_dirtier_gid
= -1;
5115 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5117 if (in
->flushing_caps
) {
5118 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5119 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5120 in
->flushing_caps
&= ~cleaned
;
5121 if (in
->flushing_caps
== 0) {
5122 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5123 num_flushing_caps
--;
5124 if (in
->flushing_cap_tids
.empty())
5125 in
->flushing_cap_item
.remove_myself();
5127 if (!in
->caps_dirty())
5134 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5136 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5137 mds_rank_t mds
= session
->mds_num
;
5138 ceph_assert(in
->caps
.count(mds
));
5139 snapid_t follows
= m
->get_snap_follows();
5141 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5142 auto& capsnap
= it
->second
;
5143 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5144 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
5146 InodeRef
tmp_ref(in
);
5147 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5148 << " on " << *in
<< dendl
;
5149 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5150 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5151 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5152 in
->flushing_cap_item
.remove_myself();
5153 in
->cap_snaps
.erase(it
);
5155 signal_cond_list(in
->waitfor_caps
);
5156 if (session
->flushing_caps_tids
.empty() ||
5157 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5158 sync_cond
.notify_all();
5161 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5162 << " on " << *in
<< dendl
;
5163 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5167 class C_Client_DentryInvalidate
: public Context
{
5174 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5175 client(c
), name(dn
->name
) {
5176 if (client
->use_faked_inos()) {
5177 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
5179 ino
.ino
= dn
->inode
->faked_ino
;
5181 dirino
= dn
->dir
->parent_inode
->vino();
5183 ino
= dn
->inode
->vino();
5186 ino
.ino
= inodeno_t();
5188 void finish(int r
) override
{
5189 // _async_dentry_invalidate is responsible for its own locking
5190 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
5191 client
->_async_dentry_invalidate(dirino
, ino
, name
);
5195 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
5199 ldout(cct
, 10) << __func__
<< " '" << name
<< "' ino " << ino
5200 << " in dir " << dirino
<< dendl
;
5201 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
.c_str(), name
.length());
5204 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
5206 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
5207 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
5210 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5212 int ref
= in
->get_num_ref();
5213 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5215 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5216 for (auto p
= in
->dir
->dentries
.begin();
5217 p
!= in
->dir
->dentries
.end(); ) {
5218 Dentry
*dn
= p
->second
;
5220 /* rmsnap removes whole subtree, need trim inodes recursively.
5221 * we don't need to invalidate dentries recursively. because
5222 * invalidating a directory dentry effectively invalidate
5224 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5225 _try_to_trim_inode(dn
->inode
.get(), false);
5227 if (dn
->lru_is_expireable())
5228 unlink(dn
, true, false); // keep dir, drop dentry
5230 if (in
->dir
->dentries
.empty()) {
5236 if (ref
> 0 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5237 InodeRef snapdir
= open_snapdir(in
);
5238 _try_to_trim_inode(snapdir
.get(), false);
5243 auto q
= in
->dentries
.begin();
5244 while (q
!= in
->dentries
.end()) {
5247 if( in
->ll_ref
> 0 && sched_inval
) {
5248 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5249 // so in->dentries doesn't always reflect the state of kernel's dcache.
5250 _schedule_invalidate_dentry_callback(dn
, true);
5252 unlink(dn
, true, true);
5257 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5259 mds_rank_t mds
= session
->mds_num
;
5260 int used
= get_caps_used(in
);
5261 int wanted
= in
->caps_wanted();
5263 const unsigned new_caps
= m
->get_caps();
5264 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5265 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5266 << " mds." << mds
<< " seq " << m
->get_seq()
5267 << " caps now " << ccap_string(new_caps
)
5268 << " was " << ccap_string(cap
->issued
)
5269 << (was_stale
? " (stale)" : "") << dendl
;
5272 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5273 cap
->seq
= m
->get_seq();
5274 cap
->gen
= session
->cap_gen
;
5276 check_cap_issue(in
, new_caps
);
5280 in
->caps_issued(&issued
);
5281 issued
|= in
->caps_dirty();
5283 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5284 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5285 in
->mode
= m
->head
.mode
;
5286 in
->uid
= m
->head
.uid
;
5287 in
->gid
= m
->head
.gid
;
5288 in
->btime
= m
->btime
;
5290 bool deleted_inode
= false;
5291 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5292 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5293 in
->nlink
= m
->head
.nlink
;
5294 if (in
->nlink
== 0 &&
5295 (new_caps
& (CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
)))
5296 deleted_inode
= true;
5298 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5299 m
->xattrbl
.length() &&
5300 m
->head
.xattr_version
> in
->xattr_version
) {
5301 auto p
= m
->xattrbl
.cbegin();
5302 decode(in
->xattrs
, p
);
5303 in
->xattr_version
= m
->head
.xattr_version
;
5306 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5307 in
->dirstat
.nfiles
= m
->get_nfiles();
5308 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5311 if (new_caps
& CEPH_CAP_ANY_RD
) {
5312 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5313 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5316 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5317 in
->layout
= m
->get_layout();
5318 update_inode_file_size(in
, issued
, m
->get_size(),
5319 m
->get_truncate_seq(), m
->get_truncate_size());
5322 if (m
->inline_version
> in
->inline_version
) {
5323 in
->inline_data
= m
->inline_data
;
5324 in
->inline_version
= m
->inline_version
;
5327 /* always take a newer change attr */
5328 if (m
->get_change_attr() > in
->change_attr
)
5329 in
->change_attr
= m
->get_change_attr();
5332 if (cap
== in
->auth_cap
&&
5333 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5334 (m
->get_max_size() != in
->max_size
)) {
5335 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5336 in
->max_size
= m
->get_max_size();
5337 if (in
->max_size
> in
->wanted_max_size
) {
5338 in
->wanted_max_size
= 0;
5339 in
->requested_max_size
= 0;
5344 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5345 (wanted
& ~(cap
->wanted
| new_caps
))) {
5346 // If mds is importing cap, prior cap messages that update 'wanted'
5347 // may get dropped by mds (migrate seq mismatch).
5349 // We don't send cap message to update 'wanted' if what we want are
5350 // already issued. If mds revokes caps, cap message that releases caps
5351 // also tells mds what we want. But if caps got revoked by mds forcedly
5352 // (session stale). We may haven't told mds what we want.
5358 auto revoked
= cap
->issued
& ~new_caps
;
5360 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5361 cap
->issued
= new_caps
;
5362 cap
->implemented
|= new_caps
;
5364 // recall delegations if we're losing caps necessary for them
5365 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5366 in
->recall_deleg(false);
5367 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5368 in
->recall_deleg(true);
5370 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5371 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5372 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5373 // waitin' for flush
5374 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5378 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5381 } else if (cap
->issued
== new_caps
) {
5382 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5384 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5385 cap
->issued
= new_caps
;
5386 cap
->implemented
|= new_caps
;
5388 if (cap
== in
->auth_cap
) {
5389 // non-auth MDS is revoking the newly grant caps ?
5390 for (const auto &p
: in
->caps
) {
5391 if (&p
.second
== cap
)
5393 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5406 signal_cond_list(in
->waitfor_caps
);
5408 // may drop inode's last ref
5410 _try_to_trim_inode(in
, true);
5413 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5415 if (perms
.uid() == 0)
5418 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5419 int ret
= _posix_acl_permission(in
, perms
, want
);
5424 // check permissions before doing anything else
5425 if (!in
->check_mode(perms
, want
))
5430 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5431 const UserPerm
& perms
)
5433 int r
= _getattr_for_perm(in
, perms
);
5438 if (strncmp(name
, "system.", 7) == 0) {
5439 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5442 r
= inode_permission(in
, perms
, want
);
5445 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5449 ostream
& operator<<(ostream
&out
, const UserPerm
& perm
) {
5450 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5454 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5455 const UserPerm
& perms
)
5457 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5458 int r
= _getattr_for_perm(in
, perms
);
5462 if (mask
& CEPH_SETATTR_SIZE
) {
5463 r
= inode_permission(in
, perms
, MAY_WRITE
);
5469 if (mask
& CEPH_SETATTR_UID
) {
5470 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5473 if (mask
& CEPH_SETATTR_GID
) {
5474 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5475 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5479 if (mask
& CEPH_SETATTR_MODE
) {
5480 if (perms
.uid() != 0 && perms
.uid() != in
->uid
)
5483 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5484 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5485 stx
->stx_mode
&= ~S_ISGID
;
5488 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5489 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5490 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5491 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5492 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5493 check_mask
|= CEPH_SETATTR_MTIME
;
5494 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5495 check_mask
|= CEPH_SETATTR_ATIME
;
5496 if (check_mask
& mask
) {
5499 r
= inode_permission(in
, perms
, MAY_WRITE
);
5507 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5511 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5513 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5516 if ((flags
& O_ACCMODE
) == O_WRONLY
)
5518 else if ((flags
& O_ACCMODE
) == O_RDWR
)
5519 want
= MAY_READ
| MAY_WRITE
;
5520 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
5522 if (flags
& O_TRUNC
)
5526 switch (in
->mode
& S_IFMT
) {
5531 if (want
& MAY_WRITE
) {
5538 r
= _getattr_for_perm(in
, perms
);
5542 r
= inode_permission(in
, perms
, want
);
5544 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5548 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
5550 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5551 int r
= _getattr_for_perm(dir
, perms
);
5555 r
= inode_permission(dir
, perms
, MAY_EXEC
);
5557 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5561 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
5563 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5564 int r
= _getattr_for_perm(dir
, perms
);
5568 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5570 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5574 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
5576 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
5577 int r
= _getattr_for_perm(dir
, perms
);
5581 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5585 /* 'name == NULL' means rmsnap */
5586 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
5588 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
5591 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
5595 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5599 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
5601 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5602 int r
= _getattr_for_perm(in
, perms
);
5606 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
5612 if (!S_ISREG(in
->mode
))
5615 if (in
->mode
& S_ISUID
)
5618 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
5621 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
5623 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5627 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
5629 int mask
= CEPH_STAT_CAP_MODE
;
5631 if (acl_type
!= NO_ACL
) {
5632 mask
|= CEPH_STAT_CAP_XATTR
;
5633 force
= in
->xattr_version
== 0;
5635 return _getattr(in
, mask
, perms
, force
);
5638 vinodeno_t
Client::_get_vino(Inode
*in
)
5640 /* The caller must hold the client lock */
5641 return vinodeno_t(in
->ino
, in
->snapid
);
5645 * Resolve an MDS spec to a list of MDS daemon GIDs.
5647 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5648 * It may be '*' in which case it matches all GIDs.
5650 * If no error is returned, the `targets` vector will be populated with at least
5653 int Client::resolve_mds(
5654 const std::string
&mds_spec
,
5655 std::vector
<mds_gid_t
> *targets
)
5658 ceph_assert(targets
!= nullptr);
5661 std::stringstream ss
;
5662 int role_r
= fsmap
->parse_role(mds_spec
, &role
, ss
);
5664 // We got a role, resolve it to a GID
5665 ldout(cct
, 10) << __func__
<< ": resolved '" << mds_spec
<< "' to role '"
5666 << role
<< "'" << dendl
;
5668 fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
).global_id
);
5672 std::string strtol_err
;
5673 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
5674 if (strtol_err
.empty()) {
5675 // It is a possible GID
5676 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
5677 if (fsmap
->gid_exists(mds_gid
)) {
5678 ldout(cct
, 10) << __func__
<< ": validated GID " << mds_gid
<< dendl
;
5679 targets
->push_back(mds_gid
);
5681 lderr(cct
) << __func__
<< ": GID " << mds_gid
<< " not in MDS map"
5685 } else if (mds_spec
== "*") {
5686 // It is a wildcard: use all MDSs
5687 const auto mds_info
= fsmap
->get_mds_info();
5689 if (mds_info
.empty()) {
5690 lderr(cct
) << __func__
<< ": * passed but no MDS daemons found" << dendl
;
5694 for (const auto i
: mds_info
) {
5695 targets
->push_back(i
.first
);
5698 // It did not parse as an integer, it is not a wildcard, it must be a name
5699 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
5701 lderr(cct
) << "MDS ID '" << mds_spec
<< "' not found" << dendl
;
5703 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5707 ldout(cct
, 10) << __func__
<< ": resolved ID '" << mds_spec
5708 << "' to GID " << mds_gid
<< dendl
;
5709 targets
->push_back(mds_gid
);
5718 * Authenticate with mon and establish global ID
5720 int Client::authenticate()
5722 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
5724 if (monclient
->is_authenticated()) {
5728 client_lock
.unlock();
5729 int r
= monclient
->authenticate(cct
->_conf
->client_mount_timeout
);
5735 whoami
= monclient
->get_global_id();
5736 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
5741 int Client::fetch_fsmap(bool user
)
5744 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5745 // rather than MDSMap because no one MDSMap contains all the daemons, and
5746 // a `tell` can address any daemon.
5747 version_t fsmap_latest
;
5750 monclient
->get_version("fsmap", &fsmap_latest
, NULL
, &cond
);
5751 client_lock
.unlock();
5754 } while (r
== -EAGAIN
);
5757 lderr(cct
) << "Failed to learn FSMap version: " << cpp_strerror(r
) << dendl
;
5761 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
5764 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
5765 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5766 monclient
->renew_subs();
5767 wait_on_list(waiting_for_fsmap
);
5769 ceph_assert(fsmap_user
);
5770 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
5772 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
5773 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5774 monclient
->renew_subs();
5775 wait_on_list(waiting_for_fsmap
);
5778 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
5780 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
5781 << fsmap_latest
<< dendl
;
5787 * @mds_spec one of ID, rank, GID, "*"
5790 int Client::mds_command(
5791 const std::string
&mds_spec
,
5792 const vector
<string
>& cmd
,
5793 const bufferlist
& inbl
,
5798 std::lock_guard
lock(client_lock
);
5809 r
= fetch_fsmap(false);
5814 // Look up MDS target(s) of the command
5815 std::vector
<mds_gid_t
> targets
;
5816 r
= resolve_mds(mds_spec
, &targets
);
5821 // If daemons are laggy, we won't send them commands. If all
5822 // are laggy then we fail.
5823 std::vector
<mds_gid_t
> non_laggy
;
5824 for (const auto gid
: targets
) {
5825 const auto info
= fsmap
->get_info_gid(gid
);
5826 if (!info
.laggy()) {
5827 non_laggy
.push_back(gid
);
5830 if (non_laggy
.size() == 0) {
5831 *outs
= "All targeted MDS daemons are laggy";
5835 if (metadata
.empty()) {
5836 // We are called on an unmounted client, so metadata
5837 // won't be initialized yet.
5838 populate_metadata("");
5841 // Send commands to targets
5842 C_GatherBuilder
gather(cct
, onfinish
);
5843 for (const auto target_gid
: non_laggy
) {
5844 const auto info
= fsmap
->get_info_gid(target_gid
);
5846 // Open a connection to the target MDS
5847 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
5849 // Generate MDSCommandOp state
5850 auto &op
= command_table
.start_command();
5852 op
.on_finish
= gather
.new_sub();
5857 op
.mds_gid
= target_gid
;
5860 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
5861 << " tid=" << op
.tid
<< cmd
<< dendl
;
5863 // Construct and send MCommand
5864 auto m
= op
.get_message(monclient
->get_fsid());
5865 conn
->send_message2(std::move(m
));
5872 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
5874 ceph_tid_t
const tid
= m
->get_tid();
5876 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
5878 if (!command_table
.exists(tid
)) {
5879 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
5883 auto &op
= command_table
.get_command(tid
);
5885 *op
.outbl
= m
->get_data();
5892 op
.on_finish
->complete(m
->r
);
5895 command_table
.erase(tid
);
5898 // -------------------
5901 int Client::subscribe_mdsmap(const std::string
&fs_name
)
5903 int r
= authenticate();
5905 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
5909 std::string resolved_fs_name
;
5910 if (fs_name
.empty()) {
5911 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_fs");
5912 if (resolved_fs_name
.empty())
5913 // Try the backwards compatibility fs name option
5914 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
5916 resolved_fs_name
= fs_name
;
5919 std::string want
= "mdsmap";
5920 if (!resolved_fs_name
.empty()) {
5921 r
= fetch_fsmap(true);
5924 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
5925 if (fscid
== FS_CLUSTER_ID_NONE
) {
5929 std::ostringstream oss
;
5930 oss
<< want
<< "." << fscid
;
5933 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
5935 monclient
->sub_want(want
, 0, 0);
5936 monclient
->renew_subs();
5941 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
5942 bool require_mds
, const std::string
&fs_name
)
5944 std::lock_guard
lock(client_lock
);
5947 ldout(cct
, 5) << "already mounted" << dendl
;
5953 int r
= subscribe_mdsmap(fs_name
);
5955 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
5959 tick(); // start tick
5963 auto availability
= mdsmap
->is_cluster_available();
5964 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
5966 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
5967 return CEPH_FUSE_NO_MDS_UP
;
5968 } else if (availability
== MDSMap::AVAILABLE
) {
5969 // Continue to mount
5971 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
5972 // Else, wait. MDSMonitor will update the map to bring
5973 // us to a conclusion eventually.
5974 wait_on_list(waiting_for_mdsmap
);
5976 // Unexpected value!
5982 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
5984 filepath
fp(CEPH_INO_ROOT
);
5985 if (!mount_root
.empty()) {
5986 fp
= filepath(mount_root
.c_str());
5989 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
5990 req
->set_filepath(fp
);
5991 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
5992 int res
= make_request(req
, perms
);
5994 if (res
== -EACCES
&& root
) {
5995 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
6013 if (!cct
->_conf
->client_trace
.empty()) {
6014 traceout
.open(cct
->_conf
->client_trace
.c_str());
6015 if (traceout
.is_open()) {
6016 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6018 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6023 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6024 ldout(cct, 3) << "op: struct stat st;" << dendl;
6025 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6026 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6027 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6028 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6029 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6030 ldout(cct, 3) << "op: int fd;" << dendl;
6037 void Client::_close_sessions()
6039 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
6040 if (it
->second
.state
== MetaSession::STATE_REJECTED
)
6041 mds_sessions
.erase(it
++);
6046 while (!mds_sessions
.empty()) {
6047 // send session closes!
6048 for (auto &p
: mds_sessions
) {
6049 if (p
.second
.state
!= MetaSession::STATE_CLOSING
) {
6050 _close_mds_session(&p
.second
);
6051 mds_ranks_closing
.insert(p
.first
);
6055 // wait for sessions to close
6056 double timo
= cct
->_conf
.get_val
<std::chrono::seconds
>("client_shutdown_timeout").count();
6057 ldout(cct
, 2) << "waiting for " << mds_ranks_closing
.size() << " mds session(s) to close (timeout: "
6058 << timo
<< "s)" << dendl
;
6059 std::unique_lock l
{client_lock
, std::adopt_lock
};
6062 } else if (!mount_cond
.wait_for(l
, ceph::make_timespan(timo
), [this] { return mds_ranks_closing
.empty(); })) {
6063 ldout(cct
, 1) << mds_ranks_closing
.size() << " mds(s) did not respond to session close -- timing out." << dendl
;
6064 while (!mds_ranks_closing
.empty()) {
6065 auto session
= mds_sessions
.at(*mds_ranks_closing
.begin());
6066 // this prunes entry from mds_sessions and mds_ranks_closing
6067 _closed_mds_session(&session
, -ETIMEDOUT
);
6071 mds_ranks_closing
.clear();
6076 void Client::flush_mdlog_sync()
6078 if (mds_requests
.empty())
6080 for (auto &p
: mds_sessions
) {
6081 flush_mdlog(&p
.second
);
6085 void Client::flush_mdlog(MetaSession
*session
)
6087 // Only send this to Luminous or newer MDS daemons, older daemons
6088 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6089 const uint64_t features
= session
->con
->get_features();
6090 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
6091 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
6092 session
->con
->send_message2(std::move(m
));
6097 void Client::_abort_mds_sessions(int err
)
6099 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
6100 auto req
= p
->second
;
6102 // unsafe requests will be removed during close session below.
6103 if (req
->got_unsafe
)
6107 if (req
->caller_cond
) {
6109 req
->caller_cond
->notify_all();
6113 // Process aborts on any requests that were on this waitlist.
6114 // Any requests that were on a waiting_for_open session waitlist
6115 // will get kicked during close session below.
6116 signal_cond_list(waiting_for_mdsmap
);
6118 // Force-close all sessions
6119 while(!mds_sessions
.empty()) {
6120 auto& session
= mds_sessions
.begin()->second
;
6121 _closed_mds_session(&session
, err
);
6125 void Client::_unmount(bool abort
)
6127 std::unique_lock lock
{client_lock
, std::adopt_lock
};
6131 if (abort
|| blacklisted
) {
6132 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blacklisted)") << dendl
;
6134 ldout(cct
, 2) << "unmounting" << dendl
;
6141 // Abort all mds sessions
6142 _abort_mds_sessions(-ENOTCONN
);
6144 objecter
->op_cancel_writes(-ENOTCONN
);
6146 // flush the mdlog for pending requests, if any
6150 mount_cond
.wait(lock
, [this] {
6151 if (!mds_requests
.empty()) {
6152 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests"
6155 return mds_requests
.empty();
6158 timer
.cancel_event(tick_event
);
6163 // clean up any unclosed files
6164 while (!fd_map
.empty()) {
6165 Fh
*fh
= fd_map
.begin()->second
;
6166 fd_map
.erase(fd_map
.begin());
6167 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6171 while (!ll_unclosed_fh_set
.empty()) {
6172 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6174 ll_unclosed_fh_set
.erase(fh
);
6175 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6179 while (!opened_dirs
.empty()) {
6180 dir_result_t
*dirp
= *opened_dirs
.begin();
6181 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6187 mount_cond
.wait(lock
, [this] {
6188 if (unsafe_sync_write
> 0) {
6189 ldout(cct
, 0) << unsafe_sync_write
<< " unsafe_sync_writes, waiting"
6192 return unsafe_sync_write
<= 0;
6195 if (cct
->_conf
->client_oc
) {
6196 // flush/release all buffered data
6197 std::list
<InodeRef
> anchor
;
6198 for (auto& p
: inode_map
) {
6199 Inode
*in
= p
.second
;
6201 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6205 // prevent inode from getting freed
6206 anchor
.emplace_back(in
);
6208 if (abort
|| blacklisted
) {
6209 objectcacher
->purge_set(&in
->oset
);
6210 } else if (!in
->caps
.empty()) {
6212 _flush(in
, new C_Client_FlushComplete(this, in
));
6217 if (abort
|| blacklisted
) {
6218 for (auto p
= dirty_list
.begin(); !p
.end(); ) {
6221 if (in
->dirty_caps
) {
6222 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6223 in
->mark_caps_clean();
6229 wait_sync_caps(last_flush_tid
);
6235 while (lru
.lru_get_size() > 0 ||
6236 !inode_map
.empty()) {
6237 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6238 << "+" << inode_map
.size() << " items"
6239 << ", waiting (for caps to release?)"
6241 if (auto r
= mount_cond
.wait_for(lock
, ceph::make_timespan(5));
6242 r
== std::cv_status::timeout
) {
6246 ceph_assert(lru
.lru_get_size() == 0);
6247 ceph_assert(inode_map
.empty());
6250 if (!cct
->_conf
->client_trace
.empty()) {
6251 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6260 ldout(cct
, 2) << "unmounted." << dendl
;
6263 void Client::unmount()
6265 std::lock_guard
lock(client_lock
);
6269 void Client::abort_conn()
6271 std::lock_guard
lock(client_lock
);
6275 void Client::flush_cap_releases()
6277 // send any cap releases
6278 for (auto &p
: mds_sessions
) {
6279 auto &session
= p
.second
;
6280 if (session
.release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
6282 if (cct
->_conf
->client_inject_release_failure
) {
6283 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
6285 session
.con
->send_message2(std::move(session
.release
));
6287 session
.release
.reset();
6294 if (cct
->_conf
->client_debug_inject_tick_delay
> 0) {
6295 sleep(cct
->_conf
->client_debug_inject_tick_delay
);
6296 ceph_assert(0 == cct
->_conf
.set_val("client_debug_inject_tick_delay", "0"));
6297 cct
->_conf
.apply_changes(nullptr);
6300 ldout(cct
, 21) << "tick" << dendl
;
6301 tick_event
= timer
.add_event_after(
6302 cct
->_conf
->client_tick_interval
,
6303 new LambdaContext([this](int) {
6304 // Called back via Timer, which takes client_lock for us
6305 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6308 utime_t now
= ceph_clock_now();
6310 if (!mounted
&& !mds_requests
.empty()) {
6311 MetaRequest
*req
= mds_requests
.begin()->second
;
6312 if (req
->op_stamp
+ cct
->_conf
->client_mount_timeout
< now
) {
6313 req
->abort(-ETIMEDOUT
);
6314 if (req
->caller_cond
) {
6316 req
->caller_cond
->notify_all();
6318 signal_cond_list(waiting_for_mdsmap
);
6319 for (auto &p
: mds_sessions
) {
6320 signal_context_list(p
.second
.waiting_for_open
);
6325 if (mdsmap
->get_epoch()) {
6327 utime_t el
= now
- last_cap_renew
;
6328 if (el
> mdsmap
->get_session_timeout() / 3.0)
6331 flush_cap_releases();
6335 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6339 if (in
->hold_caps_until
> now
)
6341 delayed_list
.pop_front();
6342 check_caps(in
, CHECK_CAPS_NODELAY
);
6347 if (blacklisted
&& mounted
&&
6348 last_auto_reconnect
+ 30 * 60 < now
&&
6349 cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
6350 messenger
->client_reset();
6351 fd_gen
++; // invalidate open files
6352 blacklisted
= false;
6353 _kick_stale_sessions();
6354 last_auto_reconnect
= now
;
6358 void Client::renew_caps()
6360 ldout(cct
, 10) << "renew_caps()" << dendl
;
6361 last_cap_renew
= ceph_clock_now();
6363 for (auto &p
: mds_sessions
) {
6364 ldout(cct
, 15) << "renew_caps requesting from mds." << p
.first
<< dendl
;
6365 if (mdsmap
->get_state(p
.first
) >= MDSMap::STATE_REJOIN
)
6366 renew_caps(&p
.second
);
6370 void Client::renew_caps(MetaSession
*session
)
6372 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
6373 session
->last_cap_renew_request
= ceph_clock_now();
6374 uint64_t seq
= ++session
->cap_renew_seq
;
6375 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
6379 // ===============================================================
6380 // high level (POSIXy) interface
6382 int Client::_do_lookup(Inode
*dir
, const string
& name
, int mask
,
6383 InodeRef
*target
, const UserPerm
& perms
)
6385 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_LOOKUPSNAP
: CEPH_MDS_OP_LOOKUP
;
6386 MetaRequest
*req
= new MetaRequest(op
);
6388 dir
->make_nosnap_relative_path(path
);
6389 path
.push_dentry(name
);
6390 req
->set_filepath(path
);
6391 req
->set_inode(dir
);
6392 if (cct
->_conf
->client_debug_getattr_caps
&& op
== CEPH_MDS_OP_LOOKUP
)
6393 mask
|= DEBUG_GETATTR_CAPS
;
6394 req
->head
.args
.getattr
.mask
= mask
;
6396 ldout(cct
, 10) << __func__
<< " on " << path
<< dendl
;
6398 int r
= make_request(req
, perms
, target
);
6399 ldout(cct
, 10) << __func__
<< " res is " << r
<< dendl
;
6403 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
6404 const UserPerm
& perms
)
6408 // can only request shared caps
6409 mask
&= CEPH_CAP_ANY_SHARED
| CEPH_STAT_RSTAT
;
6411 if (dname
== "..") {
6412 if (dir
->dentries
.empty()) {
6413 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
6414 filepath
path(dir
->ino
);
6415 req
->set_filepath(path
);
6418 int r
= make_request(req
, perms
, &tmptarget
, NULL
, rand() % mdsmap
->get_num_in_mds());
6421 *target
= std::move(tmptarget
);
6422 ldout(cct
, 8) << __func__
<< " found target " << (*target
)->ino
<< dendl
;
6428 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
6437 if (!dir
->is_dir()) {
6442 if (dname
.length() > NAME_MAX
) {
6447 if (dname
== cct
->_conf
->client_snapdir
&&
6448 dir
->snapid
== CEPH_NOSNAP
) {
6449 *target
= open_snapdir(dir
);
6454 dir
->dir
->dentries
.count(dname
)) {
6455 dn
= dir
->dir
->dentries
[dname
];
6457 ldout(cct
, 20) << __func__
<< " have dn " << dname
<< " mds." << dn
->lease_mds
<< " ttl " << dn
->lease_ttl
6458 << " seq " << dn
->lease_seq
6461 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
6462 // is dn lease valid?
6463 utime_t now
= ceph_clock_now();
6464 if (dn
->lease_mds
>= 0 &&
6465 dn
->lease_ttl
> now
&&
6466 mds_sessions
.count(dn
->lease_mds
)) {
6467 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6468 if (s
.cap_ttl
> now
&&
6469 s
.cap_gen
== dn
->lease_gen
) {
6470 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6471 // make trim_caps() behave.
6472 dir
->try_touch_cap(dn
->lease_mds
);
6475 ldout(cct
, 20) << " bad lease, cap_ttl " << s
.cap_ttl
<< ", cap_gen " << s
.cap_gen
6476 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
6479 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
6480 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
6481 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
6483 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
6484 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for "
6485 << *dir
<< " dn '" << dname
<< "'" << dendl
;
6490 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
6493 // can we conclude ENOENT locally?
6494 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
6495 (dir
->flags
& I_COMPLETE
)) {
6496 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
6501 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
6506 *target
= dn
->inode
;
6514 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
6516 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
6520 int Client::get_or_create(Inode
*dir
, const char* name
,
6521 Dentry
**pdn
, bool expect_null
)
6524 ldout(cct
, 20) << __func__
<< " " << *dir
<< " name " << name
<< dendl
;
6526 if (dir
->dir
->dentries
.count(name
)) {
6527 Dentry
*dn
= dir
->dir
->dentries
[name
];
6529 // is dn lease valid?
6530 utime_t now
= ceph_clock_now();
6532 dn
->lease_mds
>= 0 &&
6533 dn
->lease_ttl
> now
&&
6534 mds_sessions
.count(dn
->lease_mds
)) {
6535 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6536 if (s
.cap_ttl
> now
&&
6537 s
.cap_gen
== dn
->lease_gen
) {
6544 // otherwise link up a new one
6545 *pdn
= link(dir
->dir
, name
, NULL
, NULL
);
6552 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
6553 const UserPerm
& perms
, bool followsym
, int mask
)
6555 filepath path
= origpath
;
6557 if (origpath
.absolute())
6563 ldout(cct
, 10) << __func__
<< " " << path
<< dendl
;
6568 while (i
< path
.depth() && cur
) {
6570 const string
&dname
= path
[i
];
6571 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
6572 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
6574 if (cct
->_conf
->client_permissions
) {
6575 int r
= may_lookup(cur
.get(), perms
);
6578 caps
= CEPH_CAP_AUTH_SHARED
;
6581 /* Get extra requested caps on the last component */
6582 if (i
== (path
.depth() - 1))
6584 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
);
6587 // only follow trailing symlink if followsym. always follow
6588 // 'directory' symlinks.
6589 if (next
&& next
->is_symlink()) {
6591 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
6592 if (symlinks
> MAXSYMLINKS
) {
6596 if (i
< path
.depth() - 1) {
6598 // replace consumed components of path with symlink dir target
6599 filepath
resolved(next
->symlink
.c_str());
6600 resolved
.append(path
.postfixpath(i
+ 1));
6603 if (next
->symlink
[0] == '/') {
6607 } else if (followsym
) {
6608 if (next
->symlink
[0] == '/') {
6609 path
= next
->symlink
.c_str();
6614 filepath
more(next
->symlink
.c_str());
6615 // we need to remove the symlink component from off of the path
6616 // before adding the target that the symlink points to. remain
6617 // at the same position in the path.
6637 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
)
6639 std::lock_guard
lock(client_lock
);
6640 tout(cct
) << "link" << std::endl
;
6641 tout(cct
) << relexisting
<< std::endl
;
6642 tout(cct
) << relpath
<< std::endl
;
6647 filepath
existing(relexisting
);
6650 int r
= path_walk(existing
, &in
, perm
, true);
6653 if (std::string(relpath
) == "/") {
6657 filepath
path(relpath
);
6658 string name
= path
.last_dentry();
6661 r
= path_walk(path
, &dir
, perm
, true);
6664 if (cct
->_conf
->client_permissions
) {
6665 if (S_ISDIR(in
->mode
)) {
6669 r
= may_hardlink(in
.get(), perm
);
6672 r
= may_create(dir
.get(), perm
);
6676 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
);
6680 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
6682 std::lock_guard
lock(client_lock
);
6683 tout(cct
) << __func__
<< std::endl
;
6684 tout(cct
) << relpath
<< std::endl
;
6689 if (std::string(relpath
) == "/")
6692 filepath
path(relpath
);
6693 string name
= path
.last_dentry();
6696 int r
= path_walk(path
, &dir
, perm
);
6699 if (cct
->_conf
->client_permissions
) {
6700 r
= may_delete(dir
.get(), name
.c_str(), perm
);
6704 return _unlink(dir
.get(), name
.c_str(), perm
);
6707 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
)
6709 std::lock_guard
lock(client_lock
);
6710 tout(cct
) << __func__
<< std::endl
;
6711 tout(cct
) << relfrom
<< std::endl
;
6712 tout(cct
) << relto
<< std::endl
;
6717 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
6720 filepath
from(relfrom
);
6722 string fromname
= from
.last_dentry();
6724 string toname
= to
.last_dentry();
6727 InodeRef fromdir
, todir
;
6728 int r
= path_walk(from
, &fromdir
, perm
);
6731 r
= path_walk(to
, &todir
, perm
);
6735 if (cct
->_conf
->client_permissions
) {
6736 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
6739 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
6740 if (r
< 0 && r
!= -ENOENT
)
6743 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
);
6750 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
)
6752 std::lock_guard
lock(client_lock
);
6753 tout(cct
) << __func__
<< std::endl
;
6754 tout(cct
) << relpath
<< std::endl
;
6755 tout(cct
) << mode
<< std::endl
;
6756 ldout(cct
, 10) << __func__
<< ": " << relpath
<< dendl
;
6761 if (std::string(relpath
) == "/")
6764 filepath
path(relpath
);
6765 string name
= path
.last_dentry();
6768 int r
= path_walk(path
, &dir
, perm
);
6771 if (cct
->_conf
->client_permissions
) {
6772 r
= may_create(dir
.get(), perm
);
6776 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
);
6779 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
6781 std::lock_guard
lock(client_lock
);
6782 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
6783 tout(cct
) << __func__
<< std::endl
;
6784 tout(cct
) << relpath
<< std::endl
;
6785 tout(cct
) << mode
<< std::endl
;
6790 //get through existing parts of path
6791 filepath
path(relpath
);
6793 int r
= 0, caps
= 0;
6796 for (i
=0; i
<path
.depth(); ++i
) {
6797 if (cct
->_conf
->client_permissions
) {
6798 r
= may_lookup(cur
.get(), perms
);
6801 caps
= CEPH_CAP_AUTH_SHARED
;
6803 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
6808 if (r
!=-ENOENT
) return r
;
6809 ldout(cct
, 20) << __func__
<< " got through " << i
<< " directories on path " << relpath
<< dendl
;
6810 //make new directory at each level
6811 for (; i
<path
.depth(); ++i
) {
6812 if (cct
->_conf
->client_permissions
) {
6813 r
= may_create(cur
.get(), perms
);
6818 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
6820 //check proper creation/existence
6821 if(-EEXIST
== r
&& i
< path
.depth() - 1) {
6822 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
6826 //move to new dir and continue
6828 ldout(cct
, 20) << __func__
<< ": successfully created directory "
6829 << filepath(cur
->ino
).get_path() << dendl
;
6834 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
6836 std::lock_guard
lock(client_lock
);
6837 tout(cct
) << __func__
<< std::endl
;
6838 tout(cct
) << relpath
<< std::endl
;
6843 if (std::string(relpath
) == "/")
6846 filepath
path(relpath
);
6847 string name
= path
.last_dentry();
6850 int r
= path_walk(path
, &dir
, perms
);
6853 if (cct
->_conf
->client_permissions
) {
6854 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
6858 return _rmdir(dir
.get(), name
.c_str(), perms
);
6861 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
6863 std::lock_guard
lock(client_lock
);
6864 tout(cct
) << __func__
<< std::endl
;
6865 tout(cct
) << relpath
<< std::endl
;
6866 tout(cct
) << mode
<< std::endl
;
6867 tout(cct
) << rdev
<< std::endl
;
6872 if (std::string(relpath
) == "/")
6875 filepath
path(relpath
);
6876 string name
= path
.last_dentry();
6879 int r
= path_walk(path
, &dir
, perms
);
6882 if (cct
->_conf
->client_permissions
) {
6883 int r
= may_create(dir
.get(), perms
);
6887 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
6892 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
)
6894 std::lock_guard
lock(client_lock
);
6895 tout(cct
) << __func__
<< std::endl
;
6896 tout(cct
) << target
<< std::endl
;
6897 tout(cct
) << relpath
<< std::endl
;
6902 if (std::string(relpath
) == "/")
6905 filepath
path(relpath
);
6906 string name
= path
.last_dentry();
6909 int r
= path_walk(path
, &dir
, perms
);
6912 if (cct
->_conf
->client_permissions
) {
6913 int r
= may_create(dir
.get(), perms
);
6917 return _symlink(dir
.get(), name
.c_str(), target
, perms
);
6920 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
6922 std::lock_guard
lock(client_lock
);
6923 tout(cct
) << __func__
<< std::endl
;
6924 tout(cct
) << relpath
<< std::endl
;
6929 filepath
path(relpath
);
6931 int r
= path_walk(path
, &in
, perms
, false);
6935 return _readlink(in
.get(), buf
, size
);
6938 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
6940 if (!in
->is_symlink())
6943 // copy into buf (at most size bytes)
6944 int r
= in
->symlink
.length();
6947 memcpy(buf
, in
->symlink
.c_str(), r
);
6954 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
6956 bool yes
= in
->caps_issued_mask(mask
, true);
6958 ldout(cct
, 10) << __func__
<< " mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
6962 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6964 in
->make_nosnap_relative_path(path
);
6965 req
->set_filepath(path
);
6967 req
->head
.args
.getattr
.mask
= mask
;
6969 int res
= make_request(req
, perms
);
6970 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
6974 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
6975 const UserPerm
& perms
, InodeRef
*inp
)
6977 int issued
= in
->caps_issued();
6979 ldout(cct
, 10) << __func__
<< " mask " << mask
<< " issued " <<
6980 ccap_string(issued
) << dendl
;
6982 if (in
->snapid
!= CEPH_NOSNAP
) {
6985 if ((mask
& CEPH_SETATTR_SIZE
) &&
6986 (unsigned long)stx
->stx_size
> in
->size
&&
6987 is_quota_bytes_exceeded(in
, (unsigned long)stx
->stx_size
- in
->size
,
6992 // make the change locally?
6993 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
6994 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
6995 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
6996 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
6997 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
7000 * This works because we implicitly flush the caps as part of the
7001 * request, so the cap update check will happen with the writeback
7002 * cap context, and then the setattr check will happen with the
7005 * In reality this pattern is likely pretty rare (different users
7006 * setattr'ing the same file). If that turns out not to be the
7007 * case later, we can build a more complex pipelined cap writeback
7011 mask
|= CEPH_SETATTR_CTIME
;
7016 // caller just needs us to bump the ctime
7017 in
->ctime
= ceph_clock_now();
7018 in
->cap_dirtier_uid
= perms
.uid();
7019 in
->cap_dirtier_gid
= perms
.gid();
7020 if (issued
& CEPH_CAP_AUTH_EXCL
)
7021 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7022 else if (issued
& CEPH_CAP_FILE_EXCL
)
7023 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7024 else if (issued
& CEPH_CAP_XATTR_EXCL
)
7025 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
7027 mask
|= CEPH_SETATTR_CTIME
;
7030 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7031 bool kill_sguid
= mask
& (CEPH_SETATTR_SIZE
|CEPH_SETATTR_KILL_SGUID
);
7033 mask
&= ~CEPH_SETATTR_KILL_SGUID
;
7035 if (mask
& CEPH_SETATTR_UID
) {
7036 in
->ctime
= ceph_clock_now();
7037 in
->cap_dirtier_uid
= perms
.uid();
7038 in
->cap_dirtier_gid
= perms
.gid();
7039 in
->uid
= stx
->stx_uid
;
7040 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7041 mask
&= ~CEPH_SETATTR_UID
;
7043 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7045 if (mask
& CEPH_SETATTR_GID
) {
7046 in
->ctime
= ceph_clock_now();
7047 in
->cap_dirtier_uid
= perms
.uid();
7048 in
->cap_dirtier_gid
= perms
.gid();
7049 in
->gid
= stx
->stx_gid
;
7050 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7051 mask
&= ~CEPH_SETATTR_GID
;
7053 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7056 if (mask
& CEPH_SETATTR_MODE
) {
7057 in
->ctime
= ceph_clock_now();
7058 in
->cap_dirtier_uid
= perms
.uid();
7059 in
->cap_dirtier_gid
= perms
.gid();
7060 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
7061 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7062 mask
&= ~CEPH_SETATTR_MODE
;
7063 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7064 } else if (kill_sguid
&& S_ISREG(in
->mode
) && (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
7065 /* Must squash the any setuid/setgid bits with an ownership change */
7066 in
->mode
&= ~(S_ISUID
|S_ISGID
);
7067 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7070 if (mask
& CEPH_SETATTR_BTIME
) {
7071 in
->ctime
= ceph_clock_now();
7072 in
->cap_dirtier_uid
= perms
.uid();
7073 in
->cap_dirtier_gid
= perms
.gid();
7074 in
->btime
= utime_t(stx
->stx_btime
);
7075 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7076 mask
&= ~CEPH_SETATTR_BTIME
;
7077 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
7079 } else if (mask
& CEPH_SETATTR_SIZE
) {
7080 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7081 mask
|= CEPH_SETATTR_KILL_SGUID
;
7084 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
7085 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
)) {
7086 if (mask
& CEPH_SETATTR_MTIME
)
7087 in
->mtime
= utime_t(stx
->stx_mtime
);
7088 if (mask
& CEPH_SETATTR_ATIME
)
7089 in
->atime
= utime_t(stx
->stx_atime
);
7090 in
->ctime
= ceph_clock_now();
7091 in
->cap_dirtier_uid
= perms
.uid();
7092 in
->cap_dirtier_gid
= perms
.gid();
7093 in
->time_warp_seq
++;
7094 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7095 mask
&= ~(CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
);
7104 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
7108 in
->make_nosnap_relative_path(path
);
7109 req
->set_filepath(path
);
7112 if (mask
& CEPH_SETATTR_KILL_SGUID
) {
7113 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7115 if (mask
& CEPH_SETATTR_MODE
) {
7116 req
->head
.args
.setattr
.mode
= stx
->stx_mode
;
7117 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7118 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7120 if (mask
& CEPH_SETATTR_UID
) {
7121 req
->head
.args
.setattr
.uid
= stx
->stx_uid
;
7122 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7123 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7125 if (mask
& CEPH_SETATTR_GID
) {
7126 req
->head
.args
.setattr
.gid
= stx
->stx_gid
;
7127 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7128 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7130 if (mask
& CEPH_SETATTR_BTIME
) {
7131 req
->head
.args
.setattr
.btime
= utime_t(stx
->stx_btime
);
7132 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7134 if (mask
& CEPH_SETATTR_MTIME
) {
7135 req
->head
.args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
7136 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7139 if (mask
& CEPH_SETATTR_ATIME
) {
7140 req
->head
.args
.setattr
.atime
= utime_t(stx
->stx_atime
);
7141 req
->inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
7144 if (mask
& CEPH_SETATTR_SIZE
) {
7145 if ((unsigned long)stx
->stx_size
< mdsmap
->get_max_filesize()) {
7146 req
->head
.args
.setattr
.size
= stx
->stx_size
;
7147 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
7150 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
7153 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7156 req
->head
.args
.setattr
.mask
= mask
;
7158 req
->regetattr_mask
= mask
;
7160 int res
= make_request(req
, perms
, inp
);
7161 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
7165 /* Note that we only care about attrs that setattr cares about */
7166 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
7168 stx
->stx_size
= st
->st_size
;
7169 stx
->stx_mode
= st
->st_mode
;
7170 stx
->stx_uid
= st
->st_uid
;
7171 stx
->stx_gid
= st
->st_gid
;
7173 stx
->stx_mtime
= st
->st_mtimespec
;
7174 stx
->stx_atime
= st
->st_atimespec
;
7176 stx
->stx_mtime
= st
->st_mtim
;
7177 stx
->stx_atime
= st
->st_atim
;
7181 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7182 const UserPerm
& perms
, InodeRef
*inp
)
7184 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
7187 if (mask
& CEPH_SETATTR_MODE
)
7188 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
7192 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
7193 const UserPerm
& perms
)
7195 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
7196 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
7197 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
7198 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
7199 if (cct
->_conf
->client_permissions
) {
7200 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
7204 return __setattrx(in
.get(), stx
, mask
, perms
);
7207 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
7208 const UserPerm
& perms
)
7210 struct ceph_statx stx
;
7212 stat_to_statx(attr
, &stx
);
7213 mask
&= ~CEPH_SETATTR_BTIME
;
7215 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
7216 mask
&= ~CEPH_SETATTR_UID
;
7218 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
7219 mask
&= ~CEPH_SETATTR_GID
;
7222 return _setattrx(in
, &stx
, mask
, perms
);
7225 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
7226 const UserPerm
& perms
)
7228 std::lock_guard
lock(client_lock
);
7229 tout(cct
) << __func__
<< std::endl
;
7230 tout(cct
) << relpath
<< std::endl
;
7231 tout(cct
) << mask
<< std::endl
;
7236 filepath
path(relpath
);
7238 int r
= path_walk(path
, &in
, perms
);
7241 return _setattr(in
, attr
, mask
, perms
);
7244 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
7245 const UserPerm
& perms
, int flags
)
7247 std::lock_guard
lock(client_lock
);
7248 tout(cct
) << __func__
<< std::endl
;
7249 tout(cct
) << relpath
<< std::endl
;
7250 tout(cct
) << mask
<< std::endl
;
7255 filepath
path(relpath
);
7257 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
7260 return _setattrx(in
, stx
, mask
, perms
);
7263 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
7265 std::lock_guard
lock(client_lock
);
7266 tout(cct
) << __func__
<< std::endl
;
7267 tout(cct
) << fd
<< std::endl
;
7268 tout(cct
) << mask
<< std::endl
;
7273 Fh
*f
= get_filehandle(fd
);
7276 #if defined(__linux__) && defined(O_PATH)
7277 if (f
->flags
& O_PATH
)
7280 return _setattr(f
->inode
, attr
, mask
, perms
);
7283 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
7285 std::lock_guard
lock(client_lock
);
7286 tout(cct
) << __func__
<< std::endl
;
7287 tout(cct
) << fd
<< std::endl
;
7288 tout(cct
) << mask
<< std::endl
;
7293 Fh
*f
= get_filehandle(fd
);
7296 #if defined(__linux__) && defined(O_PATH)
7297 if (f
->flags
& O_PATH
)
7300 return _setattrx(f
->inode
, stx
, mask
, perms
);
7303 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
7304 frag_info_t
*dirstat
, int mask
)
7306 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7307 std::lock_guard
lock(client_lock
);
7308 tout(cct
) << "stat" << std::endl
;
7309 tout(cct
) << relpath
<< std::endl
;
7314 filepath
path(relpath
);
7316 int r
= path_walk(path
, &in
, perms
, true, mask
);
7319 r
= _getattr(in
, mask
, perms
);
7321 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7324 fill_stat(in
, stbuf
, dirstat
);
7325 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7329 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
7333 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7334 if (flags
& AT_NO_ATTR_SYNC
)
7337 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7338 mask
|= CEPH_CAP_PIN
;
7339 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7340 mask
|= CEPH_CAP_AUTH_SHARED
;
7341 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7342 mask
|= CEPH_CAP_LINK_SHARED
;
7343 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
7344 mask
|= CEPH_CAP_FILE_SHARED
;
7345 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
7346 mask
|= CEPH_CAP_XATTR_SHARED
;
7351 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
7352 const UserPerm
& perms
,
7353 unsigned int want
, unsigned int flags
)
7355 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " want " << want
<< ")" << dendl
;
7356 std::lock_guard
lock(client_lock
);
7357 tout(cct
) << "statx" << std::endl
;
7358 tout(cct
) << relpath
<< std::endl
;
7363 filepath
path(relpath
);
7366 unsigned mask
= statx_to_mask(flags
, want
);
7368 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
7372 r
= _getattr(in
, mask
, perms
);
7374 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7378 fill_statx(in
, mask
, stx
);
7379 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << stx
->stx_mask
<< ")" << dendl
;
7383 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
7384 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
7386 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7387 std::lock_guard
lock(client_lock
);
7388 tout(cct
) << __func__
<< std::endl
;
7389 tout(cct
) << relpath
<< std::endl
;
7394 filepath
path(relpath
);
7396 // don't follow symlinks
7397 int r
= path_walk(path
, &in
, perms
, false, mask
);
7400 r
= _getattr(in
, mask
, perms
);
7402 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7405 fill_stat(in
, stbuf
, dirstat
);
7406 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7410 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
7412 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7413 << " mode 0" << oct
<< in
->mode
<< dec
7414 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7415 memset(st
, 0, sizeof(struct stat
));
7416 if (use_faked_inos())
7417 st
->st_ino
= in
->faked_ino
;
7419 st
->st_ino
= in
->ino
;
7420 st
->st_dev
= in
->snapid
;
7421 st
->st_mode
= in
->mode
;
7422 st
->st_rdev
= in
->rdev
;
7424 switch (in
->nlink
) {
7426 st
->st_nlink
= 0; /* dir is unlinked */
7429 st
->st_nlink
= 1 /* parent dentry */
7431 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7437 st
->st_nlink
= in
->nlink
;
7439 st
->st_uid
= in
->uid
;
7440 st
->st_gid
= in
->gid
;
7441 if (in
->ctime
> in
->mtime
) {
7442 stat_set_ctime_sec(st
, in
->ctime
.sec());
7443 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
7445 stat_set_ctime_sec(st
, in
->mtime
.sec());
7446 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
7448 stat_set_atime_sec(st
, in
->atime
.sec());
7449 stat_set_atime_nsec(st
, in
->atime
.nsec());
7450 stat_set_mtime_sec(st
, in
->mtime
.sec());
7451 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
7453 if (cct
->_conf
->client_dirsize_rbytes
)
7454 st
->st_size
= in
->rstat
.rbytes
;
7456 st
->st_size
= in
->dirstat
.size();
7459 st
->st_size
= in
->size
;
7460 st
->st_blocks
= (in
->size
+ 511) >> 9;
7462 st
->st_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7465 *dirstat
= in
->dirstat
;
7469 return in
->caps_issued();
7472 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
7474 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7475 << " mode 0" << oct
<< in
->mode
<< dec
7476 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7477 memset(stx
, 0, sizeof(struct ceph_statx
));
7480 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7481 * so that all bits are set.
7486 /* These are always considered to be available */
7487 stx
->stx_dev
= in
->snapid
;
7488 stx
->stx_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7490 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7491 stx
->stx_mode
= S_IFMT
& in
->mode
;
7492 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (ino_t
)in
->ino
;
7493 stx
->stx_rdev
= in
->rdev
;
7494 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
7496 if (mask
& CEPH_CAP_AUTH_SHARED
) {
7497 stx
->stx_uid
= in
->uid
;
7498 stx
->stx_gid
= in
->gid
;
7499 stx
->stx_mode
= in
->mode
;
7500 in
->btime
.to_timespec(&stx
->stx_btime
);
7501 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
7504 if (mask
& CEPH_CAP_LINK_SHARED
) {
7506 switch (in
->nlink
) {
7508 stx
->stx_nlink
= 0; /* dir is unlinked */
7511 stx
->stx_nlink
= 1 /* parent dentry */
7513 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7519 stx
->stx_nlink
= in
->nlink
;
7521 stx
->stx_mask
|= CEPH_STATX_NLINK
;
7524 if (mask
& CEPH_CAP_FILE_SHARED
) {
7526 in
->atime
.to_timespec(&stx
->stx_atime
);
7527 in
->mtime
.to_timespec(&stx
->stx_mtime
);
7530 if (cct
->_conf
->client_dirsize_rbytes
)
7531 stx
->stx_size
= in
->rstat
.rbytes
;
7533 stx
->stx_size
= in
->dirstat
.size();
7534 stx
->stx_blocks
= 1;
7536 stx
->stx_size
= in
->size
;
7537 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
7539 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
7540 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
7543 /* Change time and change_attr both require all shared caps to view */
7544 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
7545 stx
->stx_version
= in
->change_attr
;
7546 if (in
->ctime
> in
->mtime
)
7547 in
->ctime
.to_timespec(&stx
->stx_ctime
);
7549 in
->mtime
.to_timespec(&stx
->stx_ctime
);
7550 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
7555 void Client::touch_dn(Dentry
*dn
)
7560 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7562 std::lock_guard
lock(client_lock
);
7563 tout(cct
) << __func__
<< std::endl
;
7564 tout(cct
) << relpath
<< std::endl
;
7565 tout(cct
) << mode
<< std::endl
;
7570 filepath
path(relpath
);
7572 int r
= path_walk(path
, &in
, perms
);
7576 attr
.st_mode
= mode
;
7577 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7580 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
7582 std::lock_guard
lock(client_lock
);
7583 tout(cct
) << __func__
<< std::endl
;
7584 tout(cct
) << fd
<< std::endl
;
7585 tout(cct
) << mode
<< std::endl
;
7590 Fh
*f
= get_filehandle(fd
);
7593 #if defined(__linux__) && defined(O_PATH)
7594 if (f
->flags
& O_PATH
)
7598 attr
.st_mode
= mode
;
7599 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
7602 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7604 std::lock_guard
lock(client_lock
);
7605 tout(cct
) << __func__
<< std::endl
;
7606 tout(cct
) << relpath
<< std::endl
;
7607 tout(cct
) << mode
<< std::endl
;
7612 filepath
path(relpath
);
7614 // don't follow symlinks
7615 int r
= path_walk(path
, &in
, perms
, false);
7619 attr
.st_mode
= mode
;
7620 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7623 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7624 const UserPerm
& perms
)
7626 std::lock_guard
lock(client_lock
);
7627 tout(cct
) << __func__
<< std::endl
;
7628 tout(cct
) << relpath
<< std::endl
;
7629 tout(cct
) << new_uid
<< std::endl
;
7630 tout(cct
) << new_gid
<< std::endl
;
7635 filepath
path(relpath
);
7637 int r
= path_walk(path
, &in
, perms
);
7641 attr
.st_uid
= new_uid
;
7642 attr
.st_gid
= new_gid
;
7643 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
7646 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
7648 std::lock_guard
lock(client_lock
);
7649 tout(cct
) << __func__
<< std::endl
;
7650 tout(cct
) << fd
<< std::endl
;
7651 tout(cct
) << new_uid
<< std::endl
;
7652 tout(cct
) << new_gid
<< std::endl
;
7657 Fh
*f
= get_filehandle(fd
);
7660 #if defined(__linux__) && defined(O_PATH)
7661 if (f
->flags
& O_PATH
)
7665 attr
.st_uid
= new_uid
;
7666 attr
.st_gid
= new_gid
;
7668 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7669 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7670 return _setattr(f
->inode
, &attr
, mask
, perms
);
7673 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7674 const UserPerm
& perms
)
7676 std::lock_guard
lock(client_lock
);
7677 tout(cct
) << __func__
<< std::endl
;
7678 tout(cct
) << relpath
<< std::endl
;
7679 tout(cct
) << new_uid
<< std::endl
;
7680 tout(cct
) << new_gid
<< std::endl
;
7685 filepath
path(relpath
);
7687 // don't follow symlinks
7688 int r
= path_walk(path
, &in
, perms
, false);
7692 attr
.st_uid
= new_uid
;
7693 attr
.st_gid
= new_gid
;
7695 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7696 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7697 return _setattr(in
, &attr
, mask
, perms
);
7700 static void attr_set_atime_and_mtime(struct stat
*attr
,
7701 const utime_t
&atime
,
7702 const utime_t
&mtime
)
7704 stat_set_atime_sec(attr
, atime
.tv
.tv_sec
);
7705 stat_set_atime_nsec(attr
, atime
.tv
.tv_nsec
);
7706 stat_set_mtime_sec(attr
, mtime
.tv
.tv_sec
);
7707 stat_set_mtime_nsec(attr
, mtime
.tv
.tv_nsec
);
7710 // for [l]utime() invoke the timeval variant as the timespec
7711 // variant are not yet implemented. for futime[s](), invoke
7712 // the timespec variant.
7713 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
7714 const UserPerm
& perms
)
7716 struct timeval tv
[2];
7717 tv
[0].tv_sec
= buf
->actime
;
7719 tv
[1].tv_sec
= buf
->modtime
;
7722 return utimes(relpath
, tv
, perms
);
7725 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
7726 const UserPerm
& perms
)
7728 struct timeval tv
[2];
7729 tv
[0].tv_sec
= buf
->actime
;
7731 tv
[1].tv_sec
= buf
->modtime
;
7734 return lutimes(relpath
, tv
, perms
);
7737 int Client::futime(int fd
, struct utimbuf
*buf
, const UserPerm
& perms
)
7739 struct timespec ts
[2];
7740 ts
[0].tv_sec
= buf
->actime
;
7742 ts
[1].tv_sec
= buf
->modtime
;
7745 return futimens(fd
, ts
, perms
);
7748 int Client::utimes(const char *relpath
, struct timeval times
[2],
7749 const UserPerm
& perms
)
7751 std::lock_guard
lock(client_lock
);
7752 tout(cct
) << __func__
<< std::endl
;
7753 tout(cct
) << relpath
<< std::endl
;
7754 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7756 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7762 filepath
path(relpath
);
7764 int r
= path_walk(path
, &in
, perms
);
7768 utime_t
atime(times
[0]);
7769 utime_t
mtime(times
[1]);
7771 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7772 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7775 int Client::lutimes(const char *relpath
, struct timeval times
[2],
7776 const UserPerm
& perms
)
7778 std::lock_guard
lock(client_lock
);
7779 tout(cct
) << __func__
<< std::endl
;
7780 tout(cct
) << relpath
<< std::endl
;
7781 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7783 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7789 filepath
path(relpath
);
7791 int r
= path_walk(path
, &in
, perms
, false);
7795 utime_t
atime(times
[0]);
7796 utime_t
mtime(times
[1]);
7798 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7799 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7802 int Client::futimes(int fd
, struct timeval times
[2], const UserPerm
& perms
)
7804 struct timespec ts
[2];
7805 ts
[0].tv_sec
= times
[0].tv_sec
;
7806 ts
[0].tv_nsec
= times
[0].tv_usec
* 1000;
7807 ts
[1].tv_sec
= times
[1].tv_sec
;
7808 ts
[1].tv_nsec
= times
[1].tv_usec
* 1000;
7810 return futimens(fd
, ts
, perms
);
7813 int Client::futimens(int fd
, struct timespec times
[2], const UserPerm
& perms
)
7815 std::lock_guard
lock(client_lock
);
7816 tout(cct
) << __func__
<< std::endl
;
7817 tout(cct
) << fd
<< std::endl
;
7818 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
7820 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
7826 Fh
*f
= get_filehandle(fd
);
7829 #if defined(__linux__) && defined(O_PATH)
7830 if (f
->flags
& O_PATH
)
7834 utime_t
atime(times
[0]);
7835 utime_t
mtime(times
[1]);
7837 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7838 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7841 int Client::flock(int fd
, int operation
, uint64_t owner
)
7843 std::lock_guard
lock(client_lock
);
7844 tout(cct
) << __func__
<< std::endl
;
7845 tout(cct
) << fd
<< std::endl
;
7846 tout(cct
) << operation
<< std::endl
;
7847 tout(cct
) << owner
<< std::endl
;
7852 Fh
*f
= get_filehandle(fd
);
7856 return _flock(f
, operation
, owner
);
7859 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7861 std::lock_guard
lock(client_lock
);
7862 tout(cct
) << __func__
<< std::endl
;
7863 tout(cct
) << relpath
<< std::endl
;
7868 filepath
path(relpath
);
7870 int r
= path_walk(path
, &in
, perms
, true);
7873 if (cct
->_conf
->client_permissions
) {
7874 int r
= may_open(in
.get(), O_RDONLY
, perms
);
7878 r
= _opendir(in
.get(), dirpp
, perms
);
7879 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7881 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
7885 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7889 *dirpp
= new dir_result_t(in
, perms
);
7890 opened_dirs
.insert(*dirpp
);
7891 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
7896 int Client::closedir(dir_result_t
*dir
)
7898 std::lock_guard
lock(client_lock
);
7899 tout(cct
) << __func__
<< std::endl
;
7900 tout(cct
) << (unsigned long)dir
<< std::endl
;
7902 ldout(cct
, 3) << __func__
<< "(" << dir
<< ") = 0" << dendl
;
7907 void Client::_closedir(dir_result_t
*dirp
)
7909 ldout(cct
, 10) << __func__
<< "(" << dirp
<< ")" << dendl
;
7911 ldout(cct
, 10) << __func__
<< " detaching inode " << dirp
->inode
<< dendl
;
7912 dirp
->inode
.reset();
7914 _readdir_drop_dirp_buffer(dirp
);
7915 opened_dirs
.erase(dirp
);
7919 void Client::rewinddir(dir_result_t
*dirp
)
7921 std::lock_guard
lock(client_lock
);
7922 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ")" << dendl
;
7927 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7928 _readdir_drop_dirp_buffer(d
);
7932 loff_t
Client::telldir(dir_result_t
*dirp
)
7934 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7935 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ") = " << d
->offset
<< dendl
;
7939 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
7941 std::lock_guard
lock(client_lock
);
7943 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ", " << offset
<< ")" << dendl
;
7948 if (offset
== dirp
->offset
)
7951 if (offset
> dirp
->offset
)
7952 dirp
->release_count
= 0; // bump if we do a forward seek
7954 dirp
->ordered_count
= 0; // disable filling readdir cache
7956 if (dirp
->hash_order()) {
7957 if (dirp
->offset
> offset
) {
7958 _readdir_drop_dirp_buffer(dirp
);
7963 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
7964 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
7965 _readdir_drop_dirp_buffer(dirp
);
7970 dirp
->offset
= offset
;
7975 // ino_t d_ino; /* inode number */
7976 // off_t d_off; /* offset to the next dirent */
7977 // unsigned short d_reclen; /* length of this record */
7978 // unsigned char d_type; /* type of file */
7979 // char d_name[256]; /* filename */
7981 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
7983 strncpy(de
->d_name
, name
, 255);
7984 de
->d_name
[255] = '\0';
7987 #if !defined(__APPLE__) && !defined(__FreeBSD__)
7988 de
->d_off
= next_off
;
7991 de
->d_type
= IFTODT(type
);
7992 ldout(cct
, 10) << __func__
<< " '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
7993 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
7997 void Client::_readdir_next_frag(dir_result_t
*dirp
)
7999 frag_t fg
= dirp
->buffer_frag
;
8001 if (fg
.is_rightmost()) {
8002 ldout(cct
, 10) << __func__
<< " advance from " << fg
<< " to END" << dendl
;
8009 ldout(cct
, 10) << __func__
<< " advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
8011 if (dirp
->hash_order()) {
8013 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
8014 if (dirp
->offset
< new_offset
) // don't decrease offset
8015 dirp
->offset
= new_offset
;
8017 dirp
->last_name
.clear();
8018 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8019 _readdir_rechoose_frag(dirp
);
8023 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
8025 ceph_assert(dirp
->inode
);
8027 if (dirp
->hash_order())
8030 frag_t cur
= frag_t(dirp
->offset_high());
8031 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
8033 ldout(cct
, 10) << __func__
<< " frag " << cur
<< " maps to " << fg
<< dendl
;
8034 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8035 dirp
->last_name
.clear();
8036 dirp
->next_offset
= 2;
8040 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
8042 ldout(cct
, 10) << __func__
<< " " << dirp
<< dendl
;
8043 dirp
->buffer
.clear();
8046 int Client::_readdir_get_frag(dir_result_t
*dirp
)
8049 ceph_assert(dirp
->inode
);
8051 // get the current frag.
8053 if (dirp
->hash_order())
8054 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
8056 fg
= frag_t(dirp
->offset_high());
8058 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
8059 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
8061 int op
= CEPH_MDS_OP_READDIR
;
8062 if (dirp
->inode
&& dirp
->inode
->snapid
== CEPH_SNAPDIR
)
8063 op
= CEPH_MDS_OP_LSSNAP
;
8065 InodeRef
& diri
= dirp
->inode
;
8067 MetaRequest
*req
= new MetaRequest(op
);
8069 diri
->make_nosnap_relative_path(path
);
8070 req
->set_filepath(path
);
8071 req
->set_inode(diri
.get());
8072 req
->head
.args
.readdir
.frag
= fg
;
8073 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
8074 if (dirp
->last_name
.length()) {
8075 req
->path2
.set_path(dirp
->last_name
);
8076 } else if (dirp
->hash_order()) {
8077 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
8082 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
8084 if (res
== -EAGAIN
) {
8085 ldout(cct
, 10) << __func__
<< " got EAGAIN, retrying" << dendl
;
8086 _readdir_rechoose_frag(dirp
);
8087 return _readdir_get_frag(dirp
);
8091 ldout(cct
, 10) << __func__
<< " " << dirp
<< " got frag " << dirp
->buffer_frag
8092 << " size " << dirp
->buffer
.size() << dendl
;
8094 ldout(cct
, 10) << __func__
<< " got error " << res
<< ", setting end flag" << dendl
;
8101 struct dentry_off_lt
{
8102 bool operator()(const Dentry
* dn
, int64_t off
) const {
8103 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
8107 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
8108 int caps
, bool getref
)
8110 ceph_assert(ceph_mutex_is_locked(client_lock
));
8111 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
8112 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
8114 Dir
*dir
= dirp
->inode
->dir
;
8117 ldout(cct
, 10) << " dir is empty" << dendl
;
8122 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
8123 dir
->readdir_cache
.end(),
8124 dirp
->offset
, dentry_off_lt());
8129 if (!dirp
->inode
->is_complete_and_ordered())
8131 if (pd
== dir
->readdir_cache
.end())
8134 if (dn
->inode
== NULL
) {
8135 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
8139 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
8140 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
8145 int idx
= pd
- dir
->readdir_cache
.begin();
8146 if (dn
->inode
->is_dir()) {
8147 mask
|= CEPH_STAT_RSTAT
;
8149 int r
= _getattr(dn
->inode
, mask
, dirp
->perms
);
8153 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
8154 pd
= dir
->readdir_cache
.begin() + idx
;
8155 if (pd
>= dir
->readdir_cache
.end() || *pd
!= dn
)
8158 struct ceph_statx stx
;
8160 fill_statx(dn
->inode
, caps
, &stx
);
8162 uint64_t next_off
= dn
->offset
+ 1;
8163 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8165 if (pd
== dir
->readdir_cache
.end())
8166 next_off
= dir_result_t::END
;
8170 in
= dn
->inode
.get();
8174 dn_name
= dn
->name
; // fill in name while we have lock
8176 client_lock
.unlock();
8177 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
8179 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
8180 << " = " << r
<< dendl
;
8185 dirp
->offset
= next_off
;
8187 dirp
->next_offset
= 2;
8189 dirp
->next_offset
= dirp
->offset_low();
8190 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
8191 dirp
->release_count
= 0; // last_name no longer match cache index
8196 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
8201 int Client::readdir_r_cb(dir_result_t
*d
, add_dirent_cb_t cb
, void *p
,
8202 unsigned want
, unsigned flags
, bool getref
)
8204 int caps
= statx_to_mask(flags
, want
);
8206 std::lock_guard
lock(client_lock
);
8211 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
8213 ldout(cct
, 10) << __func__
<< " " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
8214 << dec
<< " at_end=" << dirp
->at_end()
8215 << " hash_order=" << dirp
->hash_order() << dendl
;
8218 struct ceph_statx stx
;
8219 memset(&de
, 0, sizeof(de
));
8220 memset(&stx
, 0, sizeof(stx
));
8222 InodeRef
& diri
= dirp
->inode
;
8227 if (dirp
->offset
== 0) {
8228 ldout(cct
, 15) << " including ." << dendl
;
8229 ceph_assert(diri
->dentries
.size() < 2); // can't have multiple hard-links to a dir
8230 uint64_t next_off
= 1;
8233 r
= _getattr(diri
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
8237 fill_statx(diri
, caps
, &stx
);
8238 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
8240 Inode
*inode
= NULL
;
8246 client_lock
.unlock();
8247 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8252 dirp
->offset
= next_off
;
8256 if (dirp
->offset
== 1) {
8257 ldout(cct
, 15) << " including .." << dendl
;
8258 uint64_t next_off
= 2;
8260 if (diri
->dentries
.empty())
8263 in
= diri
->get_first_parent()->dir
->parent_inode
;
8266 r
= _getattr(in
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
8270 fill_statx(in
, caps
, &stx
);
8271 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
8273 Inode
*inode
= NULL
;
8279 client_lock
.unlock();
8280 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8285 dirp
->offset
= next_off
;
8290 // can we read from our cache?
8291 ldout(cct
, 10) << "offset " << hex
<< dirp
->offset
<< dec
8292 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
8293 << dirp
->inode
->is_complete_and_ordered()
8294 << " issued " << ccap_string(dirp
->inode
->caps_issued())
8296 if (dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
8297 dirp
->inode
->is_complete_and_ordered() &&
8298 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
8299 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
8308 bool check_caps
= true;
8309 if (!dirp
->is_cached()) {
8310 int r
= _readdir_get_frag(dirp
);
8313 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
8314 // different than the requested one. (our dirfragtree was outdated)
8317 frag_t fg
= dirp
->buffer_frag
;
8319 ldout(cct
, 10) << "frag " << fg
<< " buffer size " << dirp
->buffer
.size()
8320 << " offset " << hex
<< dirp
->offset
<< dendl
;
8322 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
8323 dirp
->offset
, dir_result_t::dentry_off_lt());
8324 it
!= dirp
->buffer
.end();
8326 dir_result_t::dentry
&entry
= *it
;
8328 uint64_t next_off
= entry
.offset
+ 1;
8333 if(entry
.inode
->is_dir()){
8334 mask
|= CEPH_STAT_RSTAT
;
8336 r
= _getattr(entry
.inode
, mask
, dirp
->perms
);
8341 fill_statx(entry
.inode
, caps
, &stx
);
8342 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8344 Inode
*inode
= NULL
;
8346 inode
= entry
.inode
.get();
8350 client_lock
.unlock();
8351 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
8354 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
8355 << " = " << r
<< dendl
;
8359 dirp
->offset
= next_off
;
8364 if (dirp
->next_offset
> 2) {
8365 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
8366 _readdir_drop_dirp_buffer(dirp
);
8370 if (!fg
.is_rightmost()) {
8372 _readdir_next_frag(dirp
);
8376 if (diri
->shared_gen
== dirp
->start_shared_gen
&&
8377 diri
->dir_release_count
== dirp
->release_count
) {
8378 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
8379 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
8381 ceph_assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
8382 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
8384 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
8386 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
8387 diri
->flags
|= I_COMPLETE
;
8399 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
8401 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
8408 * 1 if we got a dirent
8409 * 0 for end of directory
8413 struct single_readdir
{
8415 struct ceph_statx
*stx
;
8420 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
8421 struct ceph_statx
*stx
, off_t off
,
8424 single_readdir
*c
= static_cast<single_readdir
*>(p
);
8427 return -1; // already filled this dirent
8437 struct dirent
*Client::readdir(dir_result_t
*d
)
8447 // our callback fills the dirent and sets sr.full=true on first
8448 // call, and returns -1 the second time around.
8449 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
8451 errno
= -ret
; // this sucks.
8452 return (dirent
*) NULL
;
8457 return (dirent
*) NULL
;
8460 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
8461 struct ceph_statx
*stx
, unsigned want
,
8462 unsigned flags
, Inode
**out
)
8470 // our callback fills the dirent and sets sr.full=true on first
8471 // call, and returns -1 the second time around.
8472 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
8484 struct getdents_result
{
8491 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
8492 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8494 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
8500 dlen
= strlen(de
->d_name
) + 1;
8502 if (c
->pos
+ dlen
> c
->buflen
)
8503 return -1; // doesn't fit
8506 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
8508 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
8514 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
8519 gr
.fullent
= fullent
;
8522 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
8524 if (r
< 0) { // some error
8525 if (r
== -1) { // buffer ran out of space
8526 if (gr
.pos
) { // but we got some entries already!
8528 } // or we need a larger buffer
8530 } else { // actual error, return it
8539 struct getdir_result
{
8540 list
<string
> *contents
;
8544 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8546 getdir_result
*r
= static_cast<getdir_result
*>(p
);
8548 r
->contents
->push_back(de
->d_name
);
8553 int Client::getdir(const char *relpath
, list
<string
>& contents
,
8554 const UserPerm
& perms
)
8556 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
8558 std::lock_guard
lock(client_lock
);
8559 tout(cct
) << "getdir" << std::endl
;
8560 tout(cct
) << relpath
<< std::endl
;
8564 int r
= opendir(relpath
, &d
, perms
);
8569 gr
.contents
= &contents
;
8571 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
8581 /****** file i/o **********/
8582 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
8583 mode_t mode
, int stripe_unit
, int stripe_count
,
8584 int object_size
, const char *data_pool
)
8586 int cflags
= ceph_flags_sys2wire(flags
);
8588 ldout(cct
, 3) << "open enter(" << relpath
<< ", " << cflags
<< "," << mode
<< ")" << dendl
;
8589 std::lock_guard
lock(client_lock
);
8590 tout(cct
) << "open" << std::endl
;
8591 tout(cct
) << relpath
<< std::endl
;
8592 tout(cct
) << cflags
<< std::endl
;
8599 #if defined(__linux__) && defined(O_PATH)
8600 /* When the O_PATH is being specified, others flags than O_DIRECTORY
8601 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
8602 * in kernel (fs/open.c). */
8604 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
8607 filepath
path(relpath
);
8609 bool created
= false;
8610 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
8611 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
8612 int mask
= ceph_caps_for_mode(ceph_flags_to_mode(cflags
));
8614 int r
= path_walk(path
, &in
, perms
, followsym
, mask
);
8616 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
8619 #if defined(__linux__) && defined(O_PATH)
8620 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
8622 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
8626 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
8627 filepath dirpath
= path
;
8628 string dname
= dirpath
.last_dentry();
8629 dirpath
.pop_dentry();
8631 r
= path_walk(dirpath
, &dir
, perms
, true,
8632 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0);
8635 if (cct
->_conf
->client_permissions
) {
8636 r
= may_create(dir
.get(), perms
);
8640 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
8641 stripe_count
, object_size
, data_pool
, &created
, perms
);
8647 // posix says we can only check permissions of existing files
8648 if (cct
->_conf
->client_permissions
) {
8649 r
= may_open(in
.get(), flags
, perms
);
8656 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
8658 // allocate a integer file descriptor
8661 ceph_assert(fd_map
.count(r
) == 0);
8666 tout(cct
) << r
<< std::endl
;
8667 ldout(cct
, 3) << "open exit(" << path
<< ", " << cflags
<< ") = " << r
<< dendl
;
8671 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
, mode_t mode
)
8673 /* Use default file striping parameters */
8674 return open(relpath
, flags
, perms
, mode
, 0, 0, 0, NULL
);
8677 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
8678 const UserPerm
& perms
)
8680 std::lock_guard
lock(client_lock
);
8681 ldout(cct
, 3) << __func__
<< " enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
8686 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
8688 req
->set_filepath(path
);
8690 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
8692 sprintf(f
, "%u", h
);
8693 filepath
path2(dirino
);
8694 path2
.push_dentry(string(f
));
8695 req
->set_filepath2(path2
);
8697 int r
= make_request(req
, perms
, NULL
, NULL
,
8698 rand() % mdsmap
->get_num_in_mds());
8699 ldout(cct
, 3) << __func__
<< " exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
8705 * Load inode into local cache.
8707 * If inode pointer is non-NULL, and take a reference on
8708 * the resulting Inode object in one operation, so that caller
8709 * can safely assume inode will still be there after return.
8711 int Client::_lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8713 ldout(cct
, 8) << __func__
<< " enter(" << ino
<< ")" << dendl
;
8718 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPINO
);
8720 req
->set_filepath(path
);
8722 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8723 if (r
== 0 && inode
!= NULL
) {
8724 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
8725 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
8726 ceph_assert(p
!= inode_map
.end());
8730 ldout(cct
, 8) << __func__
<< " exit(" << ino
<< ") = " << r
<< dendl
;
8734 int Client::lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8736 std::lock_guard
lock(client_lock
);
8737 return _lookup_ino(ino
, perms
, inode
);
8741 * Find the parent inode of `ino` and insert it into
8742 * our cache. Conditionally also set `parent` to a referenced
8743 * Inode* if caller provides non-NULL value.
8745 int Client::_lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
8747 ldout(cct
, 8) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8749 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
8750 filepath
path(ino
->ino
);
8751 req
->set_filepath(path
);
8754 int r
= make_request(req
, perms
, &target
, NULL
, rand() % mdsmap
->get_num_in_mds());
8755 // Give caller a reference to the parent ino if they provided a pointer.
8756 if (parent
!= NULL
) {
8758 *parent
= target
.get();
8760 ldout(cct
, 8) << __func__
<< " found parent " << (*parent
)->ino
<< dendl
;
8765 ldout(cct
, 8) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8770 * Populate the parent dentry for `ino`, provided it is
8771 * a child of `parent`.
8773 int Client::_lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8775 ceph_assert(parent
->is_dir());
8776 ldout(cct
, 3) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8781 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
8782 req
->set_filepath2(filepath(parent
->ino
));
8783 req
->set_filepath(filepath(ino
->ino
));
8784 req
->set_inode(ino
);
8786 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8787 ldout(cct
, 3) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8791 int Client::lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8793 std::lock_guard
lock(client_lock
);
8794 return _lookup_name(ino
, parent
, perms
);
8797 Fh
*Client::_create_fh(Inode
*in
, int flags
, int cmode
, const UserPerm
& perms
)
8800 Fh
*f
= new Fh(in
, flags
, cmode
, fd_gen
, perms
);
8802 ldout(cct
, 10) << __func__
<< " " << in
->ino
<< " mode " << cmode
<< dendl
;
8804 if (in
->snapid
!= CEPH_NOSNAP
) {
8805 in
->snap_cap_refs
++;
8806 ldout(cct
, 5) << "open success, fh is " << f
<< " combined IMMUTABLE SNAP caps "
8807 << ccap_string(in
->caps_issued()) << dendl
;
8810 const auto& conf
= cct
->_conf
;
8811 f
->readahead
.set_trigger_requests(1);
8812 f
->readahead
.set_min_readahead_size(conf
->client_readahead_min
);
8813 uint64_t max_readahead
= Readahead::NO_LIMIT
;
8814 if (conf
->client_readahead_max_bytes
) {
8815 max_readahead
= std::min(max_readahead
, (uint64_t)conf
->client_readahead_max_bytes
);
8817 if (conf
->client_readahead_max_periods
) {
8818 max_readahead
= std::min(max_readahead
, in
->layout
.get_period()*(uint64_t)conf
->client_readahead_max_periods
);
8820 f
->readahead
.set_max_readahead_size(max_readahead
);
8821 vector
<uint64_t> alignments
;
8822 alignments
.push_back(in
->layout
.get_period());
8823 alignments
.push_back(in
->layout
.stripe_unit
);
8824 f
->readahead
.set_alignments(alignments
);
8829 int Client::_release_fh(Fh
*f
)
8831 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8832 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8833 Inode
*in
= f
->inode
.get();
8834 ldout(cct
, 8) << __func__
<< " " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
8838 if (in
->snapid
== CEPH_NOSNAP
) {
8839 if (in
->put_open_ref(f
->mode
)) {
8840 _flush(in
, new C_Client_FlushComplete(this, in
));
8844 ceph_assert(in
->snap_cap_refs
> 0);
8845 in
->snap_cap_refs
--;
8848 _release_filelocks(f
);
8850 // Finally, read any async err (i.e. from flushes)
8851 int err
= f
->take_async_err();
8853 ldout(cct
, 1) << __func__
<< " " << f
<< " on inode " << *in
<< " caught async_err = "
8854 << cpp_strerror(err
) << dendl
;
8856 ldout(cct
, 10) << __func__
<< " " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
8864 void Client::_put_fh(Fh
*f
)
8866 int left
= f
->put();
8872 int Client::_open(Inode
*in
, int flags
, mode_t mode
, Fh
**fhp
,
8873 const UserPerm
& perms
)
8875 if (in
->snapid
!= CEPH_NOSNAP
&&
8876 (flags
& (O_WRONLY
| O_RDWR
| O_CREAT
| O_TRUNC
| O_APPEND
))) {
8880 // use normalized flags to generate cmode
8881 int cflags
= ceph_flags_sys2wire(flags
);
8882 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
8883 cflags
|= CEPH_O_LAZY
;
8885 int cmode
= ceph_flags_to_mode(cflags
);
8886 int want
= ceph_caps_for_mode(cmode
);
8889 in
->get_open_ref(cmode
); // make note of pending open, since it effects _wanted_ caps.
8891 if ((flags
& O_TRUNC
) == 0 && in
->caps_issued_mask(want
)) {
8893 check_caps(in
, CHECK_CAPS_NODELAY
);
8896 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8898 in
->make_nosnap_relative_path(path
);
8899 req
->set_filepath(path
);
8900 req
->head
.args
.open
.flags
= cflags
& ~CEPH_O_CREAT
;
8901 req
->head
.args
.open
.mode
= mode
;
8902 req
->head
.args
.open
.pool
= -1;
8903 if (cct
->_conf
->client_debug_getattr_caps
)
8904 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8906 req
->head
.args
.open
.mask
= 0;
8907 req
->head
.args
.open
.old_size
= in
->size
; // for O_TRUNC
8909 result
= make_request(req
, perms
);
8912 * NFS expects that delegations will be broken on a conflicting open,
8913 * not just when there is actual conflicting access to the file. SMB leases
8914 * and oplocks also have similar semantics.
8916 * Ensure that clients that have delegations enabled will wait on minimal
8917 * caps during open, just to ensure that other clients holding delegations
8918 * return theirs first.
8920 if (deleg_timeout
&& result
== 0) {
8923 if (cmode
& CEPH_FILE_MODE_WR
)
8924 need
|= CEPH_CAP_FILE_WR
;
8925 if (cmode
& CEPH_FILE_MODE_RD
)
8926 need
|= CEPH_CAP_FILE_RD
;
8928 Fh
fh(in
, flags
, cmode
, fd_gen
, perms
);
8929 result
= get_caps(&fh
, need
, want
, &have
, -1);
8931 ldout(cct
, 8) << "Unable to get caps after open of inode " << *in
<<
8932 " . Denying open: " <<
8933 cpp_strerror(result
) << dendl
;
8935 put_cap_ref(in
, need
);
8943 *fhp
= _create_fh(in
, flags
, cmode
, perms
);
8945 in
->put_open_ref(cmode
);
8953 int Client::_renew_caps(Inode
*in
)
8955 int wanted
= in
->caps_file_wanted();
8956 if (in
->is_any_caps() &&
8957 ((wanted
& CEPH_CAP_ANY_WR
) == 0 || in
->auth_cap
)) {
8958 check_caps(in
, CHECK_CAPS_NODELAY
);
8963 if ((wanted
& CEPH_CAP_FILE_RD
) && (wanted
& CEPH_CAP_FILE_WR
))
8965 else if (wanted
& CEPH_CAP_FILE_RD
)
8967 else if (wanted
& CEPH_CAP_FILE_WR
)
8970 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8972 in
->make_nosnap_relative_path(path
);
8973 req
->set_filepath(path
);
8974 req
->head
.args
.open
.flags
= flags
;
8975 req
->head
.args
.open
.pool
= -1;
8976 if (cct
->_conf
->client_debug_getattr_caps
)
8977 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8979 req
->head
.args
.open
.mask
= 0;
8982 // duplicate in case Cap goes away; not sure if that race is a concern?
8983 const UserPerm
*pperm
= in
->get_best_perms();
8987 int ret
= make_request(req
, perms
);
8991 int Client::close(int fd
)
8993 ldout(cct
, 3) << "close enter(" << fd
<< ")" << dendl
;
8994 std::lock_guard
lock(client_lock
);
8995 tout(cct
) << "close" << std::endl
;
8996 tout(cct
) << fd
<< std::endl
;
9001 Fh
*fh
= get_filehandle(fd
);
9004 int err
= _release_fh(fh
);
9007 ldout(cct
, 3) << "close exit(" << fd
<< ")" << dendl
;
9015 loff_t
Client::lseek(int fd
, loff_t offset
, int whence
)
9017 std::lock_guard
lock(client_lock
);
9018 tout(cct
) << "lseek" << std::endl
;
9019 tout(cct
) << fd
<< std::endl
;
9020 tout(cct
) << offset
<< std::endl
;
9021 tout(cct
) << whence
<< std::endl
;
9026 Fh
*f
= get_filehandle(fd
);
9029 #if defined(__linux__) && defined(O_PATH)
9030 if (f
->flags
& O_PATH
)
9033 return _lseek(f
, offset
, whence
);
9036 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
9038 Inode
*in
= f
->inode
.get();
9039 bool whence_check
= false;
9044 whence_check
= true;
9049 whence_check
= true;
9055 whence_check
= true;
9061 int r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9072 pos
= f
->pos
+ offset
;
9076 pos
= in
->size
+ offset
;
9081 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
9089 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
9096 ldout(cct
, 1) << __func__
<< ": invalid whence value " << whence
<< dendl
;
9106 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
9111 void Client::lock_fh_pos(Fh
*f
)
9113 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
9115 if (f
->pos_locked
|| !f
->pos_waiters
.empty()) {
9116 ceph::condition_variable cond
;
9117 f
->pos_waiters
.push_back(&cond
);
9118 ldout(cct
, 10) << __func__
<< " BLOCKING on " << f
<< dendl
;
9119 std::unique_lock l
{client_lock
, std::adopt_lock
};
9120 cond
.wait(l
, [f
, me
=&cond
] {
9121 return !f
->pos_locked
&& f
->pos_waiters
.front() == me
;
9124 ldout(cct
, 10) << __func__
<< " UNBLOCKING on " << f
<< dendl
;
9125 ceph_assert(f
->pos_waiters
.front() == &cond
);
9126 f
->pos_waiters
.pop_front();
9129 f
->pos_locked
= true;
9132 void Client::unlock_fh_pos(Fh
*f
)
9134 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
9136 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
9137 f
->pos_locked
= false;
9138 if (!f
->pos_waiters
.empty()) {
9139 // only wake up the oldest waiter
9140 auto cond
= f
->pos_waiters
.front();
9145 int Client::uninline_data(Inode
*in
, Context
*onfinish
)
9147 if (!in
->inline_data
.length()) {
9148 onfinish
->complete(0);
9153 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (long long unsigned)in
->ino
);
9154 object_t oid
= oid_buf
;
9156 ObjectOperation create_ops
;
9157 create_ops
.create(false);
9159 objecter
->mutate(oid
,
9160 OSDMap::file_to_object_locator(in
->layout
),
9162 in
->snaprealm
->get_snap_context(),
9163 ceph::real_clock::now(),
9167 bufferlist inline_version_bl
;
9168 encode(in
->inline_version
, inline_version_bl
);
9170 ObjectOperation uninline_ops
;
9171 uninline_ops
.cmpxattr("inline_version",
9172 CEPH_OSD_CMPXATTR_OP_GT
,
9173 CEPH_OSD_CMPXATTR_MODE_U64
,
9175 bufferlist inline_data
= in
->inline_data
;
9176 uninline_ops
.write(0, inline_data
, in
->truncate_size
, in
->truncate_seq
);
9177 uninline_ops
.setxattr("inline_version", stringify(in
->inline_version
));
9179 objecter
->mutate(oid
,
9180 OSDMap::file_to_object_locator(in
->layout
),
9182 in
->snaprealm
->get_snap_context(),
9183 ceph::real_clock::now(),
9192 // blocking osd interface
9194 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
9196 std::unique_lock
lock(client_lock
);
9197 tout(cct
) << "read" << std::endl
;
9198 tout(cct
) << fd
<< std::endl
;
9199 tout(cct
) << size
<< std::endl
;
9200 tout(cct
) << offset
<< std::endl
;
9205 Fh
*f
= get_filehandle(fd
);
9208 #if defined(__linux__) && defined(O_PATH)
9209 if (f
->flags
& O_PATH
)
9213 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9214 size
= std::min(size
, (loff_t
)INT_MAX
);
9215 int r
= _read(f
, offset
, size
, &bl
);
9216 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9219 bl
.begin().copy(bl
.length(), buf
);
9225 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
9229 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
9232 int64_t Client::_read(Fh
*f
, int64_t offset
, uint64_t size
, bufferlist
*bl
)
9235 bool movepos
= false;
9236 std::unique_ptr
<C_SaferCond
> onuninline
;
9238 const auto& conf
= cct
->_conf
;
9239 Inode
*in
= f
->inode
.get();
9241 utime_t start
= ceph_clock_now();
9243 if ((f
->mode
& CEPH_FILE_MODE_RD
) == 0)
9245 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9252 loff_t start_pos
= offset
;
9254 if (in
->inline_version
== 0) {
9255 auto r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9260 ceph_assert(in
->inline_version
> 0);
9264 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9265 want
= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
;
9267 want
= CEPH_CAP_FILE_CACHE
;
9269 auto r
= get_caps(f
, CEPH_CAP_FILE_RD
, want
, &have
, -1);
9275 if (f
->flags
& O_DIRECT
)
9276 have
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
9278 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9279 if (!(have
& CEPH_CAP_FILE_CACHE
)) {
9280 onuninline
.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9281 uninline_data(in
, onuninline
.get());
9283 uint32_t len
= in
->inline_data
.length();
9284 uint64_t endoff
= offset
+ size
;
9285 if (endoff
> in
->size
)
9289 if (endoff
<= len
) {
9290 bl
->substr_of(in
->inline_data
, offset
, endoff
- offset
);
9292 bl
->substr_of(in
->inline_data
, offset
, len
- offset
);
9293 bl
->append_zero(endoff
- len
);
9295 rc
= endoff
- offset
;
9296 } else if ((uint64_t)offset
< endoff
) {
9297 bl
->append_zero(endoff
- offset
);
9298 rc
= endoff
- offset
;
9306 if (!conf
->client_debug_force_sync_read
&&
9308 (have
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
9310 if (f
->flags
& O_RSYNC
) {
9311 _flush_range(in
, offset
, size
);
9313 rc
= _read_async(f
, offset
, size
, bl
);
9317 if (f
->flags
& O_DIRECT
)
9318 _flush_range(in
, offset
, size
);
9320 bool checkeof
= false;
9321 rc
= _read_sync(f
, offset
, size
, bl
, &checkeof
);
9328 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9332 auto r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9340 if ((uint64_t)offset
< in
->size
)
9346 ceph_assert(rc
>= 0);
9349 f
->pos
= start_pos
+ rc
;
9352 lat
= ceph_clock_now();
9354 logger
->tinc(l_c_read
, lat
);
9360 client_lock
.unlock();
9361 int ret
= onuninline
->wait();
9363 if (ret
>= 0 || ret
== -ECANCELED
) {
9364 in
->inline_data
.clear();
9365 in
->inline_version
= CEPH_INLINE_NONE
;
9366 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9372 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9380 Client::C_Readahead::C_Readahead(Client
*c
, Fh
*f
) :
9383 f
->readahead
.inc_pending();
9386 Client::C_Readahead::~C_Readahead() {
9387 f
->readahead
.dec_pending();
9391 void Client::C_Readahead::finish(int r
) {
9392 lgeneric_subdout(client
->cct
, client
, 20) << "client." << client
->get_nodeid() << " " << "C_Readahead on " << f
->inode
<< dendl
;
9393 client
->put_cap_ref(f
->inode
.get(), CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9396 int Client::_read_async(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
)
9398 const auto& conf
= cct
->_conf
;
9399 Inode
*in
= f
->inode
.get();
9401 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9403 // trim read based on file size?
9404 if (off
>= in
->size
)
9408 if (off
+ len
> in
->size
) {
9409 len
= in
->size
- off
;
9412 ldout(cct
, 10) << " min_bytes=" << f
->readahead
.get_min_readahead_size()
9413 << " max_bytes=" << f
->readahead
.get_max_readahead_size()
9414 << " max_periods=" << conf
->client_readahead_max_periods
<< dendl
;
9416 // read (and possibly block)
9418 C_SaferCond
onfinish("Client::_read_async flock");
9419 r
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9420 off
, len
, bl
, 0, &onfinish
);
9422 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9423 client_lock
.unlock();
9424 r
= onfinish
.wait();
9426 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9429 if(f
->readahead
.get_min_readahead_size() > 0) {
9430 pair
<uint64_t, uint64_t> readahead_extent
= f
->readahead
.update(off
, len
, in
->size
);
9431 if (readahead_extent
.second
> 0) {
9432 ldout(cct
, 20) << "readahead " << readahead_extent
.first
<< "~" << readahead_extent
.second
9433 << " (caller wants " << off
<< "~" << len
<< ")" << dendl
;
9434 Context
*onfinish2
= new C_Readahead(this, f
);
9435 int r2
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9436 readahead_extent
.first
, readahead_extent
.second
,
9437 NULL
, 0, onfinish2
);
9439 ldout(cct
, 20) << "readahead initiated, c " << onfinish2
<< dendl
;
9440 get_cap_ref(in
, CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9442 ldout(cct
, 20) << "readahead was no-op, already cached" << dendl
;
9451 int Client::_read_sync(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
,
9454 Inode
*in
= f
->inode
.get();
9459 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9462 C_SaferCond
onfinish("Client::_read_sync flock");
9466 filer
->read_trunc(in
->ino
, &in
->layout
, in
->snapid
,
9468 in
->truncate_size
, in
->truncate_seq
,
9470 client_lock
.unlock();
9471 int r
= onfinish
.wait();
9474 // if we get ENOENT from OSD, assume 0 bytes returned
9485 bl
->claim_append(tbl
);
9488 if (r
>= 0 && r
< wanted
) {
9489 if (pos
< in
->size
) {
9490 // zero up to known EOF
9491 int64_t some
= in
->size
- pos
;
9494 auto z
= buffer::ptr_node::create(some
);
9496 bl
->push_back(std::move(z
));
9513 * we keep count of uncommitted sync writes on the inode, so that
9516 void Client::_sync_write_commit(Inode
*in
)
9518 ceph_assert(unsafe_sync_write
> 0);
9519 unsafe_sync_write
--;
9521 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9523 ldout(cct
, 15) << __func__
<< " unsafe_sync_write = " << unsafe_sync_write
<< dendl
;
9524 if (unsafe_sync_write
== 0 && unmounting
) {
9525 ldout(cct
, 10) << __func__
<< " -- no more unsafe writes, unmount can proceed" << dendl
;
9526 mount_cond
.notify_all();
9530 int Client::write(int fd
, const char *buf
, loff_t size
, loff_t offset
)
9532 std::lock_guard
lock(client_lock
);
9533 tout(cct
) << "write" << std::endl
;
9534 tout(cct
) << fd
<< std::endl
;
9535 tout(cct
) << size
<< std::endl
;
9536 tout(cct
) << offset
<< std::endl
;
9541 Fh
*fh
= get_filehandle(fd
);
9544 #if defined(__linux__) && defined(O_PATH)
9545 if (fh
->flags
& O_PATH
)
9548 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9549 size
= std::min(size
, (loff_t
)INT_MAX
);
9550 int r
= _write(fh
, offset
, size
, buf
, NULL
, false);
9551 ldout(cct
, 3) << "write(" << fd
<< ", \"...\", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9555 int Client::pwritev(int fd
, const struct iovec
*iov
, int iovcnt
, int64_t offset
)
9559 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, true);
9562 int64_t Client::_preadv_pwritev_locked(Fh
*fh
, const struct iovec
*iov
,
9563 unsigned iovcnt
, int64_t offset
, bool write
,
9566 #if defined(__linux__) && defined(O_PATH)
9567 if (fh
->flags
& O_PATH
)
9570 loff_t totallen
= 0;
9571 for (unsigned i
= 0; i
< iovcnt
; i
++) {
9572 totallen
+= iov
[i
].iov_len
;
9576 * Some of the API functions take 64-bit size values, but only return
9577 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9578 * we don't do I/Os larger than the values we can return.
9581 totallen
= std::min(totallen
, (loff_t
)INT_MAX
);
9584 int64_t w
= _write(fh
, offset
, totallen
, NULL
, iov
, iovcnt
);
9585 ldout(cct
, 3) << "pwritev(" << fh
<< ", \"...\", " << totallen
<< ", " << offset
<< ") = " << w
<< dendl
;
9589 int64_t r
= _read(fh
, offset
, totallen
, &bl
);
9590 ldout(cct
, 3) << "preadv(" << fh
<< ", " << offset
<< ") = " << r
<< dendl
;
9594 auto iter
= bl
.cbegin();
9595 for (unsigned j
= 0, resid
= r
; j
< iovcnt
&& resid
> 0; j
++) {
9597 * This piece of code aims to handle the case that bufferlist does not have enough data
9598 * to fill in the iov
9600 const auto round_size
= std::min
<unsigned>(resid
, iov
[j
].iov_len
);
9601 iter
.copy(round_size
, reinterpret_cast<char*>(iov
[j
].iov_base
));
9602 resid
-= round_size
;
9603 /* iter is self-updating */
9609 int Client::_preadv_pwritev(int fd
, const struct iovec
*iov
, unsigned iovcnt
, int64_t offset
, bool write
)
9611 std::lock_guard
lock(client_lock
);
9612 tout(cct
) << fd
<< std::endl
;
9613 tout(cct
) << offset
<< std::endl
;
9618 Fh
*fh
= get_filehandle(fd
);
9621 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, offset
, write
, true);
9624 int64_t Client::_write(Fh
*f
, int64_t offset
, uint64_t size
, const char *buf
,
9625 const struct iovec
*iov
, int iovcnt
)
9629 if ((uint64_t)(offset
+size
) > mdsmap
->get_max_filesize()) //too large!
9632 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9633 Inode
*in
= f
->inode
.get();
9635 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
9639 ceph_assert(in
->snapid
== CEPH_NOSNAP
);
9641 // was Fh opened as writeable?
9642 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
9645 // use/adjust fd pos?
9649 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9650 * change out from under us.
9652 if (f
->flags
& O_APPEND
) {
9653 auto r
= _lseek(f
, 0, SEEK_END
);
9665 uint64_t endoff
= offset
+ size
;
9666 if (endoff
> in
->size
&& is_quota_bytes_exceeded(in
, endoff
- in
->size
,
9671 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9673 ldout(cct
, 10) << "cur file size is " << in
->size
<< dendl
;
9676 utime_t start
= ceph_clock_now();
9678 if (in
->inline_version
== 0) {
9679 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9682 ceph_assert(in
->inline_version
> 0);
9685 // copy into fresh buffer (since our write may be resub, async)
9689 bl
.append(buf
, size
);
9691 for (int i
= 0; i
< iovcnt
; i
++) {
9692 if (iov
[i
].iov_len
> 0) {
9693 bl
.append((const char *)iov
[i
].iov_base
, iov
[i
].iov_len
);
9699 uint64_t totalwritten
;
9701 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9702 want
= CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
;
9704 want
= CEPH_CAP_FILE_BUFFER
;
9705 int r
= get_caps(f
, CEPH_CAP_FILE_WR
|CEPH_CAP_AUTH_SHARED
, want
, &have
, endoff
);
9709 /* clear the setuid/setgid bits, if any */
9710 if (unlikely(in
->mode
& (S_ISUID
|S_ISGID
)) && size
> 0) {
9711 struct ceph_statx stx
= { 0 };
9713 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9714 r
= __setattrx(in
, &stx
, CEPH_SETATTR_KILL_SGUID
, f
->actor_perms
);
9718 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9721 if (f
->flags
& O_DIRECT
)
9722 have
&= ~(CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
);
9724 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
9726 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
9728 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9729 if (endoff
> cct
->_conf
->client_max_inline_size
||
9730 endoff
> CEPH_INLINE_MAX_SIZE
||
9731 !(have
& CEPH_CAP_FILE_BUFFER
)) {
9732 onuninline
.reset(new C_SaferCond("Client::_write_uninline_data flock"));
9733 uninline_data(in
, onuninline
.get());
9735 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9737 uint32_t len
= in
->inline_data
.length();
9740 in
->inline_data
.begin(endoff
).copy(len
- endoff
, bl
); // XXX
9743 in
->inline_data
.splice(offset
, len
- offset
);
9744 else if (offset
> len
)
9745 in
->inline_data
.append_zero(offset
- len
);
9747 in
->inline_data
.append(bl
);
9748 in
->inline_version
++;
9750 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9756 if (cct
->_conf
->client_oc
&&
9757 (have
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
))) {
9758 // do buffered write
9759 if (!in
->oset
.dirty_or_tx
)
9760 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
9762 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9764 // async, caching, non-blocking.
9765 r
= objectcacher
->file_write(&in
->oset
, &in
->layout
,
9766 in
->snaprealm
->get_snap_context(),
9767 offset
, size
, bl
, ceph::real_clock::now(),
9769 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9774 // flush cached write if O_SYNC is set on file fh
9775 // O_DSYNC == O_SYNC on linux < 2.6.33
9776 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9777 if ((f
->flags
& O_SYNC
) || (f
->flags
& O_DSYNC
)) {
9778 _flush_range(in
, offset
, size
);
9781 if (f
->flags
& O_DIRECT
)
9782 _flush_range(in
, offset
, size
);
9784 // simple, non-atomic sync write
9785 C_SaferCond
onfinish("Client::_write flock");
9786 unsafe_sync_write
++;
9787 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
); // released by onsafe callback
9789 filer
->write_trunc(in
->ino
, &in
->layout
, in
->snaprealm
->get_snap_context(),
9790 offset
, size
, bl
, ceph::real_clock::now(), 0,
9791 in
->truncate_size
, in
->truncate_seq
,
9793 client_lock
.unlock();
9794 r
= onfinish
.wait();
9796 _sync_write_commit(in
);
9801 // if we get here, write was successful, update client metadata
9804 lat
= ceph_clock_now();
9806 logger
->tinc(l_c_wrlat
, lat
);
9813 totalwritten
= size
;
9814 r
= (int64_t)totalwritten
;
9817 if (totalwritten
+ offset
> in
->size
) {
9818 in
->size
= totalwritten
+ offset
;
9819 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9821 if (is_quota_bytes_approaching(in
, f
->actor_perms
)) {
9822 check_caps(in
, CHECK_CAPS_NODELAY
);
9823 } else if (is_max_size_approaching(in
)) {
9827 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", extending file size" << dendl
;
9829 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", leaving file size at " << in
->size
<< dendl
;
9833 in
->mtime
= in
->ctime
= ceph_clock_now();
9835 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9839 if (nullptr != onuninline
) {
9840 client_lock
.unlock();
9841 int uninline_ret
= onuninline
->wait();
9844 if (uninline_ret
>= 0 || uninline_ret
== -ECANCELED
) {
9845 in
->inline_data
.clear();
9846 in
->inline_version
= CEPH_INLINE_NONE
;
9847 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9853 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
9857 int Client::_flush(Fh
*f
)
9859 Inode
*in
= f
->inode
.get();
9860 int err
= f
->take_async_err();
9862 ldout(cct
, 1) << __func__
<< ": " << f
<< " on inode " << *in
<< " caught async_err = "
9863 << cpp_strerror(err
) << dendl
;
9865 ldout(cct
, 10) << __func__
<< ": " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
9871 int Client::truncate(const char *relpath
, loff_t length
, const UserPerm
& perms
)
9873 struct ceph_statx stx
;
9874 stx
.stx_size
= length
;
9875 return setattrx(relpath
, &stx
, CEPH_SETATTR_SIZE
, perms
);
9878 int Client::ftruncate(int fd
, loff_t length
, const UserPerm
& perms
)
9880 std::lock_guard
lock(client_lock
);
9881 tout(cct
) << __func__
<< std::endl
;
9882 tout(cct
) << fd
<< std::endl
;
9883 tout(cct
) << length
<< std::endl
;
9888 Fh
*f
= get_filehandle(fd
);
9891 #if defined(__linux__) && defined(O_PATH)
9892 if (f
->flags
& O_PATH
)
9895 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
9898 attr
.st_size
= length
;
9899 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_SIZE
, perms
);
9902 int Client::fsync(int fd
, bool syncdataonly
)
9904 std::lock_guard
lock(client_lock
);
9905 tout(cct
) << "fsync" << std::endl
;
9906 tout(cct
) << fd
<< std::endl
;
9907 tout(cct
) << syncdataonly
<< std::endl
;
9912 Fh
*f
= get_filehandle(fd
);
9915 #if defined(__linux__) && defined(O_PATH)
9916 if (f
->flags
& O_PATH
)
9919 int r
= _fsync(f
, syncdataonly
);
9921 // The IOs in this fsync were okay, but maybe something happened
9922 // in the background that we shoudl be reporting?
9923 r
= f
->take_async_err();
9924 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
9925 << ") = 0, async_err = " << r
<< dendl
;
9927 // Assume that an error we encountered during fsync, even reported
9928 // synchronously, would also have applied the error to the Fh, and we
9929 // should clear it here to avoid returning the same error again on next
9931 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
<< ") = "
9933 f
->take_async_err();
9938 int Client::_fsync(Inode
*in
, bool syncdataonly
)
9941 std::unique_ptr
<C_SaferCond
> object_cacher_completion
= nullptr;
9942 ceph_tid_t flush_tid
= 0;
9945 utime_t start
= ceph_clock_now();
9947 ldout(cct
, 8) << "_fsync on " << *in
<< " " << (syncdataonly
? "(dataonly)":"(data+metadata)") << dendl
;
9949 if (cct
->_conf
->client_oc
) {
9950 object_cacher_completion
.reset(new C_SaferCond("Client::_fsync::lock"));
9951 tmp_ref
= in
; // take a reference; C_SaferCond doesn't and _flush won't either
9952 _flush(in
, object_cacher_completion
.get());
9953 ldout(cct
, 15) << "using return-valued form of _fsync" << dendl
;
9956 if (!syncdataonly
&& in
->dirty_caps
) {
9957 check_caps(in
, CHECK_CAPS_NODELAY
|CHECK_CAPS_SYNCHRONOUS
);
9958 if (in
->flushing_caps
)
9959 flush_tid
= last_flush_tid
;
9960 } else ldout(cct
, 10) << "no metadata needs to commit" << dendl
;
9962 if (!syncdataonly
&& !in
->unsafe_ops
.empty()) {
9965 MetaRequest
*req
= in
->unsafe_ops
.back();
9966 ldout(cct
, 15) << "waiting on unsafe requests, last tid " << req
->get_tid() << dendl
;
9969 wait_on_list(req
->waitfor_safe
);
9973 if (nullptr != object_cacher_completion
) { // wait on a real reply instead of guessing
9974 client_lock
.unlock();
9975 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
9976 r
= object_cacher_completion
->wait();
9978 ldout(cct
, 15) << "got " << r
<< " from flush writeback" << dendl
;
9980 // FIXME: this can starve
9981 while (in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] > 0) {
9982 ldout(cct
, 10) << "ino " << in
->ino
<< " has " << in
->cap_refs
[CEPH_CAP_FILE_BUFFER
]
9983 << " uncommitted, waiting" << dendl
;
9984 wait_on_list(in
->waitfor_commit
);
9990 wait_sync_caps(in
, flush_tid
);
9992 ldout(cct
, 10) << "ino " << in
->ino
<< " has no uncommitted writes" << dendl
;
9994 ldout(cct
, 8) << "ino " << in
->ino
<< " failed to commit to disk! "
9995 << cpp_strerror(-r
) << dendl
;
9998 lat
= ceph_clock_now();
10000 logger
->tinc(l_c_fsync
, lat
);
10005 int Client::_fsync(Fh
*f
, bool syncdataonly
)
10007 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
10008 return _fsync(f
->inode
.get(), syncdataonly
);
10011 int Client::fstat(int fd
, struct stat
*stbuf
, const UserPerm
& perms
, int mask
)
10013 std::lock_guard
lock(client_lock
);
10014 tout(cct
) << "fstat mask " << hex
<< mask
<< dec
<< std::endl
;
10015 tout(cct
) << fd
<< std::endl
;
10020 Fh
*f
= get_filehandle(fd
);
10023 int r
= _getattr(f
->inode
, mask
, perms
);
10026 fill_stat(f
->inode
, stbuf
, NULL
);
10027 ldout(cct
, 5) << "fstat(" << fd
<< ", " << stbuf
<< ") = " << r
<< dendl
;
10031 int Client::fstatx(int fd
, struct ceph_statx
*stx
, const UserPerm
& perms
,
10032 unsigned int want
, unsigned int flags
)
10034 std::lock_guard
lock(client_lock
);
10035 tout(cct
) << "fstatx flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
10036 tout(cct
) << fd
<< std::endl
;
10041 Fh
*f
= get_filehandle(fd
);
10045 unsigned mask
= statx_to_mask(flags
, want
);
10048 if (mask
&& !f
->inode
->caps_issued_mask(mask
, true)) {
10049 r
= _getattr(f
->inode
, mask
, perms
);
10051 ldout(cct
, 3) << "fstatx exit on error!" << dendl
;
10056 fill_statx(f
->inode
, mask
, stx
);
10057 ldout(cct
, 3) << "fstatx(" << fd
<< ", " << stx
<< ") = " << r
<< dendl
;
10061 // not written yet, but i want to link!
10063 int Client::chdir(const char *relpath
, std::string
&new_cwd
,
10064 const UserPerm
& perms
)
10066 std::lock_guard
lock(client_lock
);
10067 tout(cct
) << "chdir" << std::endl
;
10068 tout(cct
) << relpath
<< std::endl
;
10073 filepath
path(relpath
);
10075 int r
= path_walk(path
, &in
, perms
);
10079 if (!(in
.get()->is_dir()))
10084 ldout(cct
, 3) << "chdir(" << relpath
<< ") cwd now " << cwd
->ino
<< dendl
;
10086 _getcwd(new_cwd
, perms
);
10090 void Client::_getcwd(string
& dir
, const UserPerm
& perms
)
10093 ldout(cct
, 10) << __func__
<< " " << *cwd
<< dendl
;
10095 Inode
*in
= cwd
.get();
10096 while (in
!= root
) {
10097 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
10099 // A cwd or ancester is unlinked
10100 if (in
->dentries
.empty()) {
10104 Dentry
*dn
= in
->get_first_parent();
10109 ldout(cct
, 10) << __func__
<< " looking up parent for " << *in
<< dendl
;
10110 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
10111 filepath
path(in
->ino
);
10112 req
->set_filepath(path
);
10113 req
->set_inode(in
);
10114 int res
= make_request(req
, perms
);
10123 path
.push_front_dentry(dn
->name
);
10124 in
= dn
->dir
->parent_inode
;
10127 dir
+= path
.get_path();
10130 void Client::getcwd(string
& dir
, const UserPerm
& perms
)
10132 std::lock_guard
l(client_lock
);
10134 _getcwd(dir
, perms
);
10137 int Client::statfs(const char *path
, struct statvfs
*stbuf
,
10138 const UserPerm
& perms
)
10140 std::lock_guard
l(client_lock
);
10141 tout(cct
) << __func__
<< std::endl
;
10142 unsigned long int total_files_on_fs
;
10150 const vector
<int64_t> &data_pools
= mdsmap
->get_data_pools();
10151 if (data_pools
.size() == 1) {
10152 objecter
->get_fs_stats(stats
, data_pools
[0], &cond
);
10154 objecter
->get_fs_stats(stats
, boost::optional
<int64_t>(), &cond
);
10157 client_lock
.unlock();
10158 int rval
= cond
.wait();
10160 total_files_on_fs
= root
->rstat
.rfiles
+ root
->rstat
.rsubdirs
;
10161 client_lock
.lock();
10164 ldout(cct
, 1) << "underlying call to statfs returned error: "
10165 << cpp_strerror(rval
)
10170 memset(stbuf
, 0, sizeof(*stbuf
));
10173 * we're going to set a block size of 4MB so we can represent larger
10174 * FSes without overflowing. Additionally convert the space
10175 * measurements from KB to bytes while making them in terms of
10176 * blocks. We use 4MB only because it is big enough, and because it
10177 * actually *is* the (ceph) default block size.
10179 const int CEPH_BLOCK_SHIFT
= 22;
10180 stbuf
->f_frsize
= 1 << CEPH_BLOCK_SHIFT
;
10181 stbuf
->f_bsize
= 1 << CEPH_BLOCK_SHIFT
;
10182 stbuf
->f_files
= total_files_on_fs
;
10183 stbuf
->f_ffree
= 0;
10184 stbuf
->f_favail
= -1;
10185 stbuf
->f_fsid
= -1; // ??
10186 stbuf
->f_flag
= 0; // ??
10187 stbuf
->f_namemax
= NAME_MAX
;
10189 // Usually quota_root will == root_ancestor, but if the mount root has no
10190 // quota but we can see a parent of it that does have a quota, we'll
10191 // respect that one instead.
10192 ceph_assert(root
!= nullptr);
10193 Inode
*quota_root
= root
->quota
.is_enable() ? root
: get_quota_root(root
, perms
);
10195 // get_quota_root should always give us something
10196 // because client quotas are always enabled
10197 ceph_assert(quota_root
!= nullptr);
10199 if (quota_root
&& cct
->_conf
->client_quota_df
&& quota_root
->quota
.max_bytes
) {
10201 // Skip the getattr if any sessions are stale, as we don't want to
10202 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10204 if (!_any_stale_sessions()) {
10205 int r
= _getattr(quota_root
, 0, perms
, true);
10207 // Ignore return value: error getting latest inode metadata is not a good
10208 // reason to break "df".
10209 lderr(cct
) << "Error in getattr on quota root 0x"
10210 << std::hex
<< quota_root
->ino
<< std::dec
10211 << " statfs result may be outdated" << dendl
;
10215 // Special case: if there is a size quota set on the Inode acting
10216 // as the root for this client mount, then report the quota status
10217 // as the filesystem statistics.
10218 const fsblkcnt_t total
= quota_root
->quota
.max_bytes
>> CEPH_BLOCK_SHIFT
;
10219 const fsblkcnt_t used
= quota_root
->rstat
.rbytes
>> CEPH_BLOCK_SHIFT
;
10220 // It is possible for a quota to be exceeded: arithmetic here must
10221 // handle case where used > total.
10222 const fsblkcnt_t free
= total
> used
? total
- used
: 0;
10224 stbuf
->f_blocks
= total
;
10225 stbuf
->f_bfree
= free
;
10226 stbuf
->f_bavail
= free
;
10228 // General case: report the cluster statistics returned from RADOS. Because
10229 // multiple pools may be used without one filesystem namespace via
10230 // layouts, this is the most correct thing we can do.
10231 stbuf
->f_blocks
= stats
.kb
>> (CEPH_BLOCK_SHIFT
- 10);
10232 stbuf
->f_bfree
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10233 stbuf
->f_bavail
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10239 int Client::_do_filelock(Inode
*in
, Fh
*fh
, int lock_type
, int op
, int sleep
,
10240 struct flock
*fl
, uint64_t owner
, bool removing
)
10242 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
10243 << (lock_type
== CEPH_LOCK_FCNTL
? " fcntl" : " flock")
10244 << " type " << fl
->l_type
<< " owner " << owner
10245 << " " << fl
->l_start
<< "~" << fl
->l_len
<< dendl
;
10247 if (in
->flags
& I_ERROR_FILELOCK
)
10251 if (F_RDLCK
== fl
->l_type
)
10252 lock_cmd
= CEPH_LOCK_SHARED
;
10253 else if (F_WRLCK
== fl
->l_type
)
10254 lock_cmd
= CEPH_LOCK_EXCL
;
10255 else if (F_UNLCK
== fl
->l_type
)
10256 lock_cmd
= CEPH_LOCK_UNLOCK
;
10260 if (op
!= CEPH_MDS_OP_SETFILELOCK
|| lock_cmd
== CEPH_LOCK_UNLOCK
)
10264 * Set the most significant bit, so that MDS knows the 'owner'
10265 * is sufficient to identify the owner of lock. (old code uses
10266 * both 'owner' and 'pid')
10268 owner
|= (1ULL << 63);
10270 MetaRequest
*req
= new MetaRequest(op
);
10272 in
->make_nosnap_relative_path(path
);
10273 req
->set_filepath(path
);
10274 req
->set_inode(in
);
10276 req
->head
.args
.filelock_change
.rule
= lock_type
;
10277 req
->head
.args
.filelock_change
.type
= lock_cmd
;
10278 req
->head
.args
.filelock_change
.owner
= owner
;
10279 req
->head
.args
.filelock_change
.pid
= fl
->l_pid
;
10280 req
->head
.args
.filelock_change
.start
= fl
->l_start
;
10281 req
->head
.args
.filelock_change
.length
= fl
->l_len
;
10282 req
->head
.args
.filelock_change
.wait
= sleep
;
10287 if (sleep
&& switch_interrupt_cb
) {
10288 // enable interrupt
10289 switch_interrupt_cb(callback_handle
, req
->get());
10290 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10291 // disable interrupt
10292 switch_interrupt_cb(callback_handle
, NULL
);
10293 if (ret
== 0 && req
->aborted()) {
10294 // effect of this lock request has been revoked by the 'lock intr' request
10295 ret
= req
->get_abort_code();
10299 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10303 if (op
== CEPH_MDS_OP_GETFILELOCK
) {
10304 ceph_filelock filelock
;
10305 auto p
= bl
.cbegin();
10306 decode(filelock
, p
);
10308 if (CEPH_LOCK_SHARED
== filelock
.type
)
10309 fl
->l_type
= F_RDLCK
;
10310 else if (CEPH_LOCK_EXCL
== filelock
.type
)
10311 fl
->l_type
= F_WRLCK
;
10313 fl
->l_type
= F_UNLCK
;
10315 fl
->l_whence
= SEEK_SET
;
10316 fl
->l_start
= filelock
.start
;
10317 fl
->l_len
= filelock
.length
;
10318 fl
->l_pid
= filelock
.pid
;
10319 } else if (op
== CEPH_MDS_OP_SETFILELOCK
) {
10320 ceph_lock_state_t
*lock_state
;
10321 if (lock_type
== CEPH_LOCK_FCNTL
) {
10322 if (!in
->fcntl_locks
)
10323 in
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10324 lock_state
= in
->fcntl_locks
.get();
10325 } else if (lock_type
== CEPH_LOCK_FLOCK
) {
10326 if (!in
->flock_locks
)
10327 in
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10328 lock_state
= in
->flock_locks
.get();
10333 _update_lock_state(fl
, owner
, lock_state
);
10336 if (lock_type
== CEPH_LOCK_FCNTL
) {
10337 if (!fh
->fcntl_locks
)
10338 fh
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10339 lock_state
= fh
->fcntl_locks
.get();
10341 if (!fh
->flock_locks
)
10342 fh
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10343 lock_state
= fh
->flock_locks
.get();
10345 _update_lock_state(fl
, owner
, lock_state
);
10353 int Client::_interrupt_filelock(MetaRequest
*req
)
10355 // Set abort code, but do not kick. The abort code prevents the request
10356 // from being re-sent.
10357 req
->abort(-EINTR
);
10359 return 0; // haven't sent the request
10361 Inode
*in
= req
->inode();
10364 if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
10365 lock_type
= CEPH_LOCK_FLOCK_INTR
;
10366 else if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
10367 lock_type
= CEPH_LOCK_FCNTL_INTR
;
10373 MetaRequest
*intr_req
= new MetaRequest(CEPH_MDS_OP_SETFILELOCK
);
10375 in
->make_nosnap_relative_path(path
);
10376 intr_req
->set_filepath(path
);
10377 intr_req
->set_inode(in
);
10378 intr_req
->head
.args
.filelock_change
= req
->head
.args
.filelock_change
;
10379 intr_req
->head
.args
.filelock_change
.rule
= lock_type
;
10380 intr_req
->head
.args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
10382 UserPerm
perms(req
->get_uid(), req
->get_gid());
10383 return make_request(intr_req
, perms
, NULL
, NULL
, -1);
10386 void Client::_encode_filelocks(Inode
*in
, bufferlist
& bl
)
10388 if (!in
->fcntl_locks
&& !in
->flock_locks
)
10391 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
10392 encode(nr_fcntl_locks
, bl
);
10393 if (nr_fcntl_locks
) {
10394 auto &lock_state
= in
->fcntl_locks
;
10395 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10396 p
!= lock_state
->held_locks
.end();
10398 encode(p
->second
, bl
);
10401 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
10402 encode(nr_flock_locks
, bl
);
10403 if (nr_flock_locks
) {
10404 auto &lock_state
= in
->flock_locks
;
10405 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10406 p
!= lock_state
->held_locks
.end();
10408 encode(p
->second
, bl
);
10411 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< ", " << nr_fcntl_locks
10412 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
10415 void Client::_release_filelocks(Fh
*fh
)
10417 if (!fh
->fcntl_locks
&& !fh
->flock_locks
)
10420 Inode
*in
= fh
->inode
.get();
10421 ldout(cct
, 10) << __func__
<< " " << fh
<< " ino " << in
->ino
<< dendl
;
10423 list
<ceph_filelock
> activated_locks
;
10425 list
<pair
<int, ceph_filelock
> > to_release
;
10427 if (fh
->fcntl_locks
) {
10428 auto &lock_state
= fh
->fcntl_locks
;
10429 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
10431 if (in
->flags
& I_ERROR_FILELOCK
) {
10432 lock_state
->remove_lock(q
->second
, activated_locks
);
10434 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FCNTL
, q
->second
));
10437 lock_state
.reset();
10439 if (fh
->flock_locks
) {
10440 auto &lock_state
= fh
->flock_locks
;
10441 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
10443 if (in
->flags
& I_ERROR_FILELOCK
) {
10444 lock_state
->remove_lock(q
->second
, activated_locks
);
10446 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FLOCK
, q
->second
));
10449 lock_state
.reset();
10452 if ((in
->flags
& I_ERROR_FILELOCK
) && !in
->has_any_filelocks())
10453 in
->flags
&= ~I_ERROR_FILELOCK
;
10455 if (to_release
.empty())
10459 memset(&fl
, 0, sizeof(fl
));
10460 fl
.l_whence
= SEEK_SET
;
10461 fl
.l_type
= F_UNLCK
;
10463 for (list
<pair
<int, ceph_filelock
> >::iterator p
= to_release
.begin();
10464 p
!= to_release
.end();
10466 fl
.l_start
= p
->second
.start
;
10467 fl
.l_len
= p
->second
.length
;
10468 fl
.l_pid
= p
->second
.pid
;
10469 _do_filelock(in
, fh
, p
->first
, CEPH_MDS_OP_SETFILELOCK
, 0, &fl
,
10470 p
->second
.owner
, true);
10474 void Client::_update_lock_state(struct flock
*fl
, uint64_t owner
,
10475 ceph_lock_state_t
*lock_state
)
10478 if (F_RDLCK
== fl
->l_type
)
10479 lock_cmd
= CEPH_LOCK_SHARED
;
10480 else if (F_WRLCK
== fl
->l_type
)
10481 lock_cmd
= CEPH_LOCK_EXCL
;
10483 lock_cmd
= CEPH_LOCK_UNLOCK
;;
10485 ceph_filelock filelock
;
10486 filelock
.start
= fl
->l_start
;
10487 filelock
.length
= fl
->l_len
;
10488 filelock
.client
= 0;
10489 // see comment in _do_filelock()
10490 filelock
.owner
= owner
| (1ULL << 63);
10491 filelock
.pid
= fl
->l_pid
;
10492 filelock
.type
= lock_cmd
;
10494 if (filelock
.type
== CEPH_LOCK_UNLOCK
) {
10495 list
<ceph_filelock
> activated_locks
;
10496 lock_state
->remove_lock(filelock
, activated_locks
);
10498 bool r
= lock_state
->add_lock(filelock
, false, false, NULL
);
10503 int Client::_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
10505 Inode
*in
= fh
->inode
.get();
10506 ldout(cct
, 10) << "_getlk " << fh
<< " ino " << in
->ino
<< dendl
;
10507 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_GETFILELOCK
, 0, fl
, owner
);
10511 int Client::_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
10513 Inode
*in
= fh
->inode
.get();
10514 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< dendl
;
10515 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_SETFILELOCK
, sleep
, fl
, owner
);
10516 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10520 int Client::_flock(Fh
*fh
, int cmd
, uint64_t owner
)
10522 Inode
*in
= fh
->inode
.get();
10523 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< dendl
;
10525 int sleep
= !(cmd
& LOCK_NB
);
10544 memset(&fl
, 0, sizeof(fl
));
10546 fl
.l_whence
= SEEK_SET
;
10548 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
, sleep
, &fl
, owner
);
10549 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10553 int Client::ll_statfs(Inode
*in
, struct statvfs
*stbuf
, const UserPerm
& perms
)
10555 /* Since the only thing this does is wrap a call to statfs, and
10556 statfs takes a lock, it doesn't seem we have a need to split it
10558 return statfs(0, stbuf
, perms
);
10561 void Client::ll_register_callbacks(struct ceph_client_callback_args
*args
)
10565 std::lock_guard
l(client_lock
);
10566 ldout(cct
, 10) << __func__
<< " cb " << args
->handle
10567 << " invalidate_ino_cb " << args
->ino_cb
10568 << " invalidate_dentry_cb " << args
->dentry_cb
10569 << " switch_interrupt_cb " << args
->switch_intr_cb
10570 << " remount_cb " << args
->remount_cb
10572 callback_handle
= args
->handle
;
10573 if (args
->ino_cb
) {
10574 ino_invalidate_cb
= args
->ino_cb
;
10575 async_ino_invalidator
.start();
10577 if (args
->dentry_cb
) {
10578 dentry_invalidate_cb
= args
->dentry_cb
;
10579 async_dentry_invalidator
.start();
10581 if (args
->switch_intr_cb
) {
10582 switch_interrupt_cb
= args
->switch_intr_cb
;
10583 interrupt_finisher
.start();
10585 if (args
->remount_cb
) {
10586 remount_cb
= args
->remount_cb
;
10587 remount_finisher
.start();
10589 if (args
->ino_release_cb
) {
10590 ino_release_cb
= args
->ino_release_cb
;
10591 async_ino_releasor
.start();
10593 if (args
->umask_cb
)
10594 umask_cb
= args
->umask_cb
;
10597 int Client::test_dentry_handling(bool can_invalidate
)
10601 can_invalidate_dentries
= can_invalidate
;
10603 if (can_invalidate_dentries
) {
10604 ceph_assert(dentry_invalidate_cb
);
10605 ldout(cct
, 1) << "using dentry_invalidate_cb" << dendl
;
10608 ceph_assert(remount_cb
);
10609 ldout(cct
, 1) << "using remount_cb" << dendl
;
10610 r
= _do_remount(false);
10616 int Client::_sync_fs()
10618 ldout(cct
, 10) << __func__
<< dendl
;
10621 std::unique_ptr
<C_SaferCond
> cond
= nullptr;
10622 if (cct
->_conf
->client_oc
) {
10623 cond
.reset(new C_SaferCond("Client::_sync_fs:lock"));
10624 objectcacher
->flush_all(cond
.get());
10629 ceph_tid_t flush_tid
= last_flush_tid
;
10631 // wait for unsafe mds requests
10632 wait_unsafe_requests();
10634 wait_sync_caps(flush_tid
);
10636 if (nullptr != cond
) {
10637 client_lock
.unlock();
10638 ldout(cct
, 15) << __func__
<< " waiting on data to flush" << dendl
;
10640 ldout(cct
, 15) << __func__
<< " flush finished" << dendl
;
10641 client_lock
.lock();
10647 int Client::sync_fs()
10649 std::lock_guard
l(client_lock
);
10657 int64_t Client::drop_caches()
10659 std::lock_guard
l(client_lock
);
10660 return objectcacher
->release_all();
10663 int Client::_lazyio(Fh
*fh
, int enable
)
10665 Inode
*in
= fh
->inode
.get();
10666 ldout(cct
, 20) << __func__
<< " " << *in
<< " " << !!enable
<< dendl
;
10668 if (!!(fh
->mode
& CEPH_FILE_MODE_LAZY
) == !!enable
)
10671 int orig_mode
= fh
->mode
;
10673 fh
->mode
|= CEPH_FILE_MODE_LAZY
;
10674 in
->get_open_ref(fh
->mode
);
10675 in
->put_open_ref(orig_mode
);
10676 check_caps(in
, CHECK_CAPS_NODELAY
);
10678 fh
->mode
&= ~CEPH_FILE_MODE_LAZY
;
10679 in
->get_open_ref(fh
->mode
);
10680 in
->put_open_ref(orig_mode
);
10687 int Client::lazyio(int fd
, int enable
)
10689 std::lock_guard
l(client_lock
);
10690 Fh
*f
= get_filehandle(fd
);
10694 return _lazyio(f
, enable
);
10697 int Client::ll_lazyio(Fh
*fh
, int enable
)
10699 std::lock_guard
lock(client_lock
);
10700 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << !!enable
<< dendl
;
10701 tout(cct
) << __func__
<< std::endl
;
10703 return _lazyio(fh
, enable
);
10706 int Client::lazyio_propagate(int fd
, loff_t offset
, size_t count
)
10708 std::lock_guard
l(client_lock
);
10709 ldout(cct
, 3) << "op: client->lazyio_propagate(" << fd
10710 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10712 Fh
*f
= get_filehandle(fd
);
10722 int Client::lazyio_synchronize(int fd
, loff_t offset
, size_t count
)
10724 std::lock_guard
l(client_lock
);
10725 ldout(cct
, 3) << "op: client->lazyio_synchronize(" << fd
10726 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10728 Fh
*f
= get_filehandle(fd
);
10731 Inode
*in
= f
->inode
.get();
10734 if (_release(in
)) {
10735 int r
=_getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
10743 // =============================
10746 int Client::mksnap(const char *relpath
, const char *name
, const UserPerm
& perm
)
10748 std::lock_guard
l(client_lock
);
10753 filepath
path(relpath
);
10755 int r
= path_walk(path
, &in
, perm
);
10758 if (cct
->_conf
->client_permissions
) {
10759 r
= may_create(in
.get(), perm
);
10763 Inode
*snapdir
= open_snapdir(in
.get());
10764 return _mkdir(snapdir
, name
, 0, perm
);
10767 int Client::rmsnap(const char *relpath
, const char *name
, const UserPerm
& perms
)
10769 std::lock_guard
l(client_lock
);
10774 filepath
path(relpath
);
10776 int r
= path_walk(path
, &in
, perms
);
10779 if (cct
->_conf
->client_permissions
) {
10780 r
= may_delete(in
.get(), NULL
, perms
);
10784 Inode
*snapdir
= open_snapdir(in
.get());
10785 return _rmdir(snapdir
, name
, perms
);
10788 // =============================
10791 int Client::get_caps_issued(int fd
) {
10793 std::lock_guard
lock(client_lock
);
10798 Fh
*f
= get_filehandle(fd
);
10802 return f
->inode
->caps_issued();
10805 int Client::get_caps_issued(const char *path
, const UserPerm
& perms
)
10807 std::lock_guard
lock(client_lock
);
10814 int r
= path_walk(p
, &in
, perms
, true);
10817 return in
->caps_issued();
10820 // =========================================
10823 Inode
*Client::open_snapdir(Inode
*diri
)
10826 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
10827 if (!inode_map
.count(vino
)) {
10828 in
= new Inode(this, vino
, &diri
->layout
);
10830 in
->ino
= diri
->ino
;
10831 in
->snapid
= CEPH_SNAPDIR
;
10832 in
->mode
= diri
->mode
;
10833 in
->uid
= diri
->uid
;
10834 in
->gid
= diri
->gid
;
10836 in
->mtime
= diri
->mtime
;
10837 in
->ctime
= diri
->ctime
;
10838 in
->btime
= diri
->btime
;
10839 in
->atime
= diri
->atime
;
10840 in
->size
= diri
->size
;
10841 in
->change_attr
= diri
->change_attr
;
10843 in
->dirfragtree
.clear();
10844 in
->snapdir_parent
= diri
;
10845 diri
->flags
|= I_SNAPDIR_OPEN
;
10846 inode_map
[vino
] = in
;
10847 if (use_faked_inos())
10848 _assign_faked_ino(in
);
10849 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
10851 in
= inode_map
[vino
];
10852 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
10857 int Client::ll_lookup(Inode
*parent
, const char *name
, struct stat
*attr
,
10858 Inode
**out
, const UserPerm
& perms
)
10860 std::lock_guard
lock(client_lock
);
10861 vinodeno_t vparent
= _get_vino(parent
);
10862 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
10863 tout(cct
) << __func__
<< std::endl
;
10864 tout(cct
) << name
<< std::endl
;
10870 if (!fuse_default_permissions
) {
10871 if (strcmp(name
, ".") && strcmp(name
, "..")) {
10872 r
= may_lookup(parent
, perms
);
10878 string
dname(name
);
10881 r
= _lookup(parent
, dname
, CEPH_STAT_CAP_INODE_ALL
, &in
, perms
);
10888 fill_stat(in
, attr
);
10892 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
10893 << " -> " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
10894 tout(cct
) << attr
->st_ino
<< std::endl
;
10899 int Client::ll_lookup_inode(
10900 struct inodeno_t ino
,
10901 const UserPerm
& perms
,
10904 ceph_assert(inode
!= NULL
);
10905 std::lock_guard
lock(client_lock
);
10906 ldout(cct
, 3) << "ll_lookup_inode " << ino
<< dendl
;
10911 // Num1: get inode and *inode
10912 int r
= _lookup_ino(ino
, perms
, inode
);
10916 ceph_assert(*inode
!= NULL
);
10918 if (!(*inode
)->dentries
.empty()) {
10919 ldout(cct
, 8) << __func__
<< " dentry already present" << dendl
;
10923 if ((*inode
)->is_root()) {
10924 ldout(cct
, 8) << "ino is root, no parent" << dendl
;
10928 // Num2: Request the parent inode, so that we can look up the name
10930 r
= _lookup_parent(*inode
, perms
, &parent
);
10932 _ll_forget(*inode
, 1);
10936 ceph_assert(parent
!= NULL
);
10938 // Num3: Finally, get the name (dentry) of the requested inode
10939 r
= _lookup_name(*inode
, parent
, perms
);
10941 // Unexpected error
10942 _ll_forget(parent
, 1);
10943 _ll_forget(*inode
, 1);
10947 _ll_forget(parent
, 1);
10951 int Client::ll_lookupx(Inode
*parent
, const char *name
, Inode
**out
,
10952 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
10953 const UserPerm
& perms
)
10955 std::lock_guard
lock(client_lock
);
10956 vinodeno_t vparent
= _get_vino(parent
);
10957 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
10958 tout(cct
) << "ll_lookupx" << std::endl
;
10959 tout(cct
) << name
<< std::endl
;
10965 if (!fuse_default_permissions
) {
10966 r
= may_lookup(parent
, perms
);
10971 string
dname(name
);
10974 unsigned mask
= statx_to_mask(flags
, want
);
10975 r
= _lookup(parent
, dname
, mask
, &in
, perms
);
10981 fill_statx(in
, mask
, stx
);
10985 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
10986 << " -> " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
10987 tout(cct
) << stx
->stx_ino
<< std::endl
;
10992 int Client::ll_walk(const char* name
, Inode
**out
, struct ceph_statx
*stx
,
10993 unsigned int want
, unsigned int flags
, const UserPerm
& perms
)
10995 std::lock_guard
lock(client_lock
);
11000 filepath
fp(name
, 0);
11003 unsigned mask
= statx_to_mask(flags
, want
);
11005 ldout(cct
, 3) << __func__
<< " " << name
<< dendl
;
11006 tout(cct
) << __func__
<< std::endl
;
11007 tout(cct
) << name
<< std::endl
;
11009 rc
= path_walk(fp
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
11011 /* zero out mask, just in case... */
11018 fill_statx(in
, mask
, stx
);
11025 void Client::_ll_get(Inode
*in
)
11027 if (in
->ll_ref
== 0) {
11029 if (in
->is_dir() && !in
->dentries
.empty()) {
11030 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
11031 in
->get_first_parent()->get(); // pin dentry
11033 if (in
->snapid
!= CEPH_NOSNAP
)
11034 ll_snap_ref
[in
->snapid
]++;
11037 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " -> " << in
->ll_ref
<< dendl
;
11040 int Client::_ll_put(Inode
*in
, uint64_t num
)
11043 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " " << num
<< " -> " << in
->ll_ref
<< dendl
;
11044 if (in
->ll_ref
== 0) {
11045 if (in
->is_dir() && !in
->dentries
.empty()) {
11046 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
11047 in
->get_first_parent()->put(); // unpin dentry
11049 if (in
->snapid
!= CEPH_NOSNAP
) {
11050 auto p
= ll_snap_ref
.find(in
->snapid
);
11051 ceph_assert(p
!= ll_snap_ref
.end());
11052 ceph_assert(p
->second
> 0);
11053 if (--p
->second
== 0)
11054 ll_snap_ref
.erase(p
);
11063 void Client::_ll_drop_pins()
11065 ldout(cct
, 10) << __func__
<< dendl
;
11066 std::set
<InodeRef
> to_be_put
; //this set will be deconstructed item by item when exit
11067 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
11068 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
11069 it
!= inode_map
.end();
11071 Inode
*in
= it
->second
;
11075 to_be_put
.insert(in
);
11076 _ll_put(in
, in
->ll_ref
);
11081 bool Client::_ll_forget(Inode
*in
, uint64_t count
)
11083 inodeno_t ino
= in
->ino
;
11085 ldout(cct
, 8) << __func__
<< " " << ino
<< " " << count
<< dendl
;
11086 tout(cct
) << __func__
<< std::endl
;
11087 tout(cct
) << ino
.val
<< std::endl
;
11088 tout(cct
) << count
<< std::endl
;
11090 // Ignore forget if we're no longer mounted
11094 if (ino
== 1) return true; // ignore forget on root.
11097 if (in
->ll_ref
< count
) {
11098 ldout(cct
, 1) << "WARNING: ll_forget on " << ino
<< " " << count
11099 << ", which only has ll_ref=" << in
->ll_ref
<< dendl
;
11100 _ll_put(in
, in
->ll_ref
);
11103 if (_ll_put(in
, count
) == 0)
11110 bool Client::ll_forget(Inode
*in
, uint64_t count
)
11112 std::lock_guard
lock(client_lock
);
11113 return _ll_forget(in
, count
);
11116 bool Client::ll_put(Inode
*in
)
11118 /* ll_forget already takes the lock */
11119 return ll_forget(in
, 1);
11122 int Client::ll_get_snap_ref(snapid_t snap
)
11124 std::lock_guard
lock(client_lock
);
11125 auto p
= ll_snap_ref
.find(snap
);
11126 if (p
!= ll_snap_ref
.end())
11131 snapid_t
Client::ll_get_snapid(Inode
*in
)
11133 std::lock_guard
lock(client_lock
);
11137 Inode
*Client::ll_get_inode(ino_t ino
)
11139 std::lock_guard
lock(client_lock
);
11144 vinodeno_t vino
= _map_faked_ino(ino
);
11145 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11146 if (p
== inode_map
.end())
11148 Inode
*in
= p
->second
;
11153 Inode
*Client::ll_get_inode(vinodeno_t vino
)
11155 std::lock_guard
lock(client_lock
);
11160 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11161 if (p
== inode_map
.end())
11163 Inode
*in
= p
->second
;
11168 int Client::_ll_getattr(Inode
*in
, int caps
, const UserPerm
& perms
)
11170 vinodeno_t vino
= _get_vino(in
);
11172 ldout(cct
, 8) << __func__
<< " " << vino
<< dendl
;
11173 tout(cct
) << __func__
<< std::endl
;
11174 tout(cct
) << vino
.ino
.val
<< std::endl
;
11176 if (vino
.snapid
< CEPH_NOSNAP
)
11179 return _getattr(in
, caps
, perms
);
11182 int Client::ll_getattr(Inode
*in
, struct stat
*attr
, const UserPerm
& perms
)
11184 std::lock_guard
lock(client_lock
);
11189 int res
= _ll_getattr(in
, CEPH_STAT_CAP_INODE_ALL
, perms
);
11192 fill_stat(in
, attr
);
11193 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11197 int Client::ll_getattrx(Inode
*in
, struct ceph_statx
*stx
, unsigned int want
,
11198 unsigned int flags
, const UserPerm
& perms
)
11200 std::lock_guard
lock(client_lock
);
11206 unsigned mask
= statx_to_mask(flags
, want
);
11208 if (mask
&& !in
->caps_issued_mask(mask
, true))
11209 res
= _ll_getattr(in
, mask
, perms
);
11212 fill_statx(in
, mask
, stx
);
11213 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11217 int Client::_ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
11218 const UserPerm
& perms
, InodeRef
*inp
)
11220 vinodeno_t vino
= _get_vino(in
);
11222 ldout(cct
, 8) << __func__
<< " " << vino
<< " mask " << hex
<< mask
<< dec
11224 tout(cct
) << __func__
<< std::endl
;
11225 tout(cct
) << vino
.ino
.val
<< std::endl
;
11226 tout(cct
) << stx
->stx_mode
<< std::endl
;
11227 tout(cct
) << stx
->stx_uid
<< std::endl
;
11228 tout(cct
) << stx
->stx_gid
<< std::endl
;
11229 tout(cct
) << stx
->stx_size
<< std::endl
;
11230 tout(cct
) << stx
->stx_mtime
<< std::endl
;
11231 tout(cct
) << stx
->stx_atime
<< std::endl
;
11232 tout(cct
) << stx
->stx_btime
<< std::endl
;
11233 tout(cct
) << mask
<< std::endl
;
11235 if (!fuse_default_permissions
) {
11236 int res
= may_setattr(in
, stx
, mask
, perms
);
11241 mask
&= ~(CEPH_SETATTR_MTIME_NOW
| CEPH_SETATTR_ATIME_NOW
);
11243 return __setattrx(in
, stx
, mask
, perms
, inp
);
11246 int Client::ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
11247 const UserPerm
& perms
)
11249 std::lock_guard
lock(client_lock
);
11254 InodeRef
target(in
);
11255 int res
= _ll_setattrx(in
, stx
, mask
, perms
, &target
);
11257 ceph_assert(in
== target
.get());
11258 fill_statx(in
, in
->caps_issued(), stx
);
11261 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11265 int Client::ll_setattr(Inode
*in
, struct stat
*attr
, int mask
,
11266 const UserPerm
& perms
)
11268 struct ceph_statx stx
;
11269 stat_to_statx(attr
, &stx
);
11271 std::lock_guard
lock(client_lock
);
11276 InodeRef
target(in
);
11277 int res
= _ll_setattrx(in
, &stx
, mask
, perms
, &target
);
11279 ceph_assert(in
== target
.get());
11280 fill_stat(in
, attr
);
11283 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11291 int Client::getxattr(const char *path
, const char *name
, void *value
, size_t size
,
11292 const UserPerm
& perms
)
11294 std::lock_guard
lock(client_lock
);
11300 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11303 return _getxattr(in
, name
, value
, size
, perms
);
11306 int Client::lgetxattr(const char *path
, const char *name
, void *value
, size_t size
,
11307 const UserPerm
& perms
)
11309 std::lock_guard
lock(client_lock
);
11315 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11318 return _getxattr(in
, name
, value
, size
, perms
);
11321 int Client::fgetxattr(int fd
, const char *name
, void *value
, size_t size
,
11322 const UserPerm
& perms
)
11324 std::lock_guard
lock(client_lock
);
11329 Fh
*f
= get_filehandle(fd
);
11332 return _getxattr(f
->inode
, name
, value
, size
, perms
);
11335 int Client::listxattr(const char *path
, char *list
, size_t size
,
11336 const UserPerm
& perms
)
11338 std::lock_guard
lock(client_lock
);
11344 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11347 return Client::_listxattr(in
.get(), list
, size
, perms
);
11350 int Client::llistxattr(const char *path
, char *list
, size_t size
,
11351 const UserPerm
& perms
)
11353 std::lock_guard
lock(client_lock
);
11359 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11362 return Client::_listxattr(in
.get(), list
, size
, perms
);
11365 int Client::flistxattr(int fd
, char *list
, size_t size
, const UserPerm
& perms
)
11367 std::lock_guard
lock(client_lock
);
11372 Fh
*f
= get_filehandle(fd
);
11375 return Client::_listxattr(f
->inode
.get(), list
, size
, perms
);
11378 int Client::removexattr(const char *path
, const char *name
,
11379 const UserPerm
& perms
)
11381 std::lock_guard
lock(client_lock
);
11387 int r
= Client::path_walk(path
, &in
, perms
, true);
11390 return _removexattr(in
, name
, perms
);
11393 int Client::lremovexattr(const char *path
, const char *name
,
11394 const UserPerm
& perms
)
11396 std::lock_guard
lock(client_lock
);
11402 int r
= Client::path_walk(path
, &in
, perms
, false);
11405 return _removexattr(in
, name
, perms
);
11408 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
11410 std::lock_guard
lock(client_lock
);
11415 Fh
*f
= get_filehandle(fd
);
11418 return _removexattr(f
->inode
, name
, perms
);
11421 int Client::setxattr(const char *path
, const char *name
, const void *value
,
11422 size_t size
, int flags
, const UserPerm
& perms
)
11424 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11426 std::lock_guard
lock(client_lock
);
11432 int r
= Client::path_walk(path
, &in
, perms
, true);
11435 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11438 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
11439 size_t size
, int flags
, const UserPerm
& perms
)
11441 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11443 std::lock_guard
lock(client_lock
);
11449 int r
= Client::path_walk(path
, &in
, perms
, false);
11452 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11455 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
11456 int flags
, const UserPerm
& perms
)
11458 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11460 std::lock_guard
lock(client_lock
);
11465 Fh
*f
= get_filehandle(fd
);
11468 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
11471 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
11472 const UserPerm
& perms
)
11476 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11480 // Do a force getattr to get the latest quota before returning
11481 // a value to userspace.
11483 if (vxattr
->flags
& VXATTR_RSTAT
) {
11484 flags
|= CEPH_STAT_RSTAT
;
11486 if (vxattr
->flags
& VXATTR_DIRSTAT
) {
11487 flags
|= CEPH_CAP_FILE_SHARED
;
11489 r
= _getattr(in
, flags
, perms
, true);
11491 // Error from getattr!
11495 // call pointer-to-member function
11497 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
11498 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
11504 if (r
> (int)size
) {
11506 } else if (r
> 0) {
11507 memcpy(value
, buf
, r
);
11513 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
11518 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11522 if (in
->xattrs
.count(n
)) {
11523 r
= in
->xattrs
[n
].length();
11524 if (r
> 0 && size
!= 0) {
11525 if (size
>= (unsigned)r
)
11526 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
11533 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
11537 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
11538 const UserPerm
& perms
)
11540 if (cct
->_conf
->client_permissions
) {
11541 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
11545 return _getxattr(in
.get(), name
, value
, size
, perms
);
11548 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
11549 size_t size
, const UserPerm
& perms
)
11551 std::lock_guard
lock(client_lock
);
11556 vinodeno_t vino
= _get_vino(in
);
11558 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11559 tout(cct
) << __func__
<< std::endl
;
11560 tout(cct
) << vino
.ino
.val
<< std::endl
;
11561 tout(cct
) << name
<< std::endl
;
11563 if (!fuse_default_permissions
) {
11564 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
11569 return _getxattr(in
, name
, value
, size
, perms
);
11572 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
11573 const UserPerm
& perms
)
11575 bool len_only
= (size
== 0);
11576 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11582 for (const auto& p
: in
->xattrs
) {
11583 size_t this_len
= p
.first
.length() + 1;
11588 if (this_len
> size
) {
11593 memcpy(name
, p
.first
.c_str(), this_len
);
11598 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
11602 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
11603 const UserPerm
& perms
)
11605 std::lock_guard
lock(client_lock
);
11610 vinodeno_t vino
= _get_vino(in
);
11612 ldout(cct
, 3) << __func__
<< " " << vino
<< " size " << size
<< dendl
;
11613 tout(cct
) << __func__
<< std::endl
;
11614 tout(cct
) << vino
.ino
.val
<< std::endl
;
11615 tout(cct
) << size
<< std::endl
;
11617 return _listxattr(in
, names
, size
, perms
);
11620 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
11621 size_t size
, int flags
, const UserPerm
& perms
)
11624 int xattr_flags
= 0;
11626 xattr_flags
|= CEPH_XATTR_REMOVE
;
11627 if (flags
& XATTR_CREATE
)
11628 xattr_flags
|= CEPH_XATTR_CREATE
;
11629 if (flags
& XATTR_REPLACE
)
11630 xattr_flags
|= CEPH_XATTR_REPLACE
;
11632 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
11634 in
->make_nosnap_relative_path(path
);
11635 req
->set_filepath(path
);
11636 req
->set_string2(name
);
11637 req
->set_inode(in
);
11638 req
->head
.args
.setxattr
.flags
= xattr_flags
;
11641 assert (value
|| size
== 0);
11642 bl
.append((const char*)value
, size
);
11645 int res
= make_request(req
, perms
);
11648 ldout(cct
, 3) << __func__
<< "(" << in
->ino
<< ", \"" << name
<< "\") = " <<
11653 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
11654 size_t size
, int flags
, const UserPerm
& perms
)
11656 if (in
->snapid
!= CEPH_NOSNAP
) {
11662 } else if (value
== NULL
) {
11666 bool posix_acl_xattr
= false;
11667 if (acl_type
== POSIX_ACL
)
11668 posix_acl_xattr
= !strncmp(name
, "system.", 7);
11670 if (strncmp(name
, "user.", 5) &&
11671 strncmp(name
, "security.", 9) &&
11672 strncmp(name
, "trusted.", 8) &&
11673 strncmp(name
, "ceph.", 5) &&
11675 return -EOPNOTSUPP
;
11677 bool check_realm
= false;
11679 if (posix_acl_xattr
) {
11680 if (!strcmp(name
, ACL_EA_ACCESS
)) {
11681 mode_t new_mode
= in
->mode
;
11683 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
11690 if (new_mode
!= in
->mode
) {
11691 struct ceph_statx stx
;
11692 stx
.stx_mode
= new_mode
;
11693 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, NULL
);
11698 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
11700 if (!S_ISDIR(in
->mode
))
11702 int ret
= posix_acl_check(value
, size
);
11711 return -EOPNOTSUPP
;
11714 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11716 if (vxattr
->readonly
)
11717 return -EOPNOTSUPP
;
11718 if (vxattr
->name
.compare(0, 10, "ceph.quota") == 0 && value
)
11719 check_realm
= true;
11723 int ret
= _do_setxattr(in
, name
, value
, size
, flags
, perms
);
11724 if (ret
>= 0 && check_realm
) {
11725 // check if snaprealm was created for quota inode
11726 if (in
->quota
.is_enable() &&
11727 !(in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
))
11734 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
11735 size_t size
, int flags
, const UserPerm
& perms
)
11737 if (cct
->_conf
->client_permissions
) {
11738 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11742 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
11745 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
11748 if (name
== "layout") {
11749 string::iterator begin
= value
.begin();
11750 string::iterator end
= value
.end();
11751 keys_and_values
<string::iterator
> p
; // create instance of parser
11752 std::map
<string
, string
> m
; // map to receive results
11753 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
11758 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
11759 if (q
->first
== "pool") {
11764 } else if (name
== "layout.pool") {
11768 if (tmp
.length()) {
11771 pool
= boost::lexical_cast
<unsigned>(tmp
);
11772 if (!osdmap
->have_pg_pool(pool
))
11774 } catch (boost::bad_lexical_cast
const&) {
11775 pool
= osdmap
->lookup_pg_pool_name(tmp
);
11785 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
11787 // For setting pool of layout, MetaRequest need osdmap epoch.
11788 // There is a race which create a new data pool but client and mds both don't have.
11789 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11790 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
11791 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
11792 string
rest(strstr(name
, "layout"));
11793 string
v((const char*)value
, size
);
11794 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
11795 return _setxattr_check_data_pool(rest
, v
, &o
);
11798 if (r
== -ENOENT
) {
11800 objecter
->wait_for_latest_osdmap(&ctx
);
11806 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
11807 size_t size
, int flags
, const UserPerm
& perms
)
11809 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11811 std::lock_guard
lock(client_lock
);
11816 vinodeno_t vino
= _get_vino(in
);
11818 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11819 tout(cct
) << __func__
<< std::endl
;
11820 tout(cct
) << vino
.ino
.val
<< std::endl
;
11821 tout(cct
) << name
<< std::endl
;
11823 if (!fuse_default_permissions
) {
11824 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11828 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11831 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11833 if (in
->snapid
!= CEPH_NOSNAP
) {
11837 // same xattrs supported by kernel client
11838 if (strncmp(name
, "user.", 5) &&
11839 strncmp(name
, "system.", 7) &&
11840 strncmp(name
, "security.", 9) &&
11841 strncmp(name
, "trusted.", 8) &&
11842 strncmp(name
, "ceph.", 5))
11843 return -EOPNOTSUPP
;
11845 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11846 if (vxattr
&& vxattr
->readonly
)
11847 return -EOPNOTSUPP
;
11849 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
11851 in
->make_nosnap_relative_path(path
);
11852 req
->set_filepath(path
);
11853 req
->set_filepath2(name
);
11854 req
->set_inode(in
);
11856 int res
= make_request(req
, perms
);
11859 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
11863 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
11865 if (cct
->_conf
->client_permissions
) {
11866 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11870 return _removexattr(in
.get(), name
, perms
);
11873 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11875 std::lock_guard
lock(client_lock
);
11880 vinodeno_t vino
= _get_vino(in
);
11882 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
11883 tout(cct
) << "ll_removexattr" << std::endl
;
11884 tout(cct
) << vino
.ino
.val
<< std::endl
;
11885 tout(cct
) << name
<< std::endl
;
11887 if (!fuse_default_permissions
) {
11888 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11893 return _removexattr(in
, name
, perms
);
11896 bool Client::_vxattrcb_quota_exists(Inode
*in
)
11898 return in
->quota
.is_enable() &&
11899 (in
->snapid
!= CEPH_NOSNAP
||
11900 (in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
));
11902 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
11904 return snprintf(val
, size
,
11905 "max_bytes=%lld max_files=%lld",
11906 (long long int)in
->quota
.max_bytes
,
11907 (long long int)in
->quota
.max_files
);
11909 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
11911 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
11913 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
11915 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
11918 bool Client::_vxattrcb_layout_exists(Inode
*in
)
11920 return in
->layout
!= file_layout_t();
11922 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
11924 int r
= snprintf(val
, size
,
11925 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11926 (unsigned long long)in
->layout
.stripe_unit
,
11927 (unsigned long long)in
->layout
.stripe_count
,
11928 (unsigned long long)in
->layout
.object_size
);
11929 objecter
->with_osdmap([&](const OSDMap
& o
) {
11930 if (o
.have_pg_pool(in
->layout
.pool_id
))
11931 r
+= snprintf(val
+ r
, size
- r
, "%s",
11932 o
.get_pool_name(in
->layout
.pool_id
).c_str());
11934 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
11935 (uint64_t)in
->layout
.pool_id
);
11937 if (in
->layout
.pool_ns
.length())
11938 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
11939 in
->layout
.pool_ns
.c_str());
11942 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
11944 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_unit
);
11946 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
11948 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_count
);
11950 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
11952 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.object_size
);
11954 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
11957 objecter
->with_osdmap([&](const OSDMap
& o
) {
11958 if (o
.have_pg_pool(in
->layout
.pool_id
))
11959 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
11960 in
->layout
.pool_id
).c_str());
11962 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
11966 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
11968 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
11970 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
11972 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
11974 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
11976 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nfiles
);
11978 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
11980 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nsubdirs
);
11982 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
11984 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
11986 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
11988 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rfiles
);
11990 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
11992 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsubdirs
);
11994 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
11996 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rbytes
);
11998 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
12000 return snprintf(val
, size
, "%ld.%09ld", (long)in
->rstat
.rctime
.sec(),
12001 (long)in
->rstat
.rctime
.nsec());
12003 bool Client::_vxattrcb_dir_pin_exists(Inode
*in
)
12005 return in
->dir_pin
!= -ENODATA
;
12007 size_t Client::_vxattrcb_dir_pin(Inode
*in
, char *val
, size_t size
)
12009 return snprintf(val
, size
, "%ld", (long)in
->dir_pin
);
12012 bool Client::_vxattrcb_snap_btime_exists(Inode
*in
)
12014 return !in
->snap_btime
.is_zero();
12017 size_t Client::_vxattrcb_snap_btime(Inode
*in
, char *val
, size_t size
)
12019 return snprintf(val
, size
, "%llu.%09lu",
12020 (long long unsigned)in
->snap_btime
.sec(),
12021 (long unsigned)in
->snap_btime
.nsec());
12024 size_t Client::_vxattrcb_cluster_fsid(Inode
*in
, char *val
, size_t size
)
12026 return snprintf(val
, size
, "%s", monclient
->get_fsid().to_string().c_str());
12029 size_t Client::_vxattrcb_client_id(Inode
*in
, char *val
, size_t size
)
12031 auto name
= messenger
->get_myname();
12032 return snprintf(val
, size
, "%s%ld", name
.type_str(), name
.num());
12035 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
12036 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
12038 #define XATTR_NAME_CEPH(_type, _name, _flags) \
12040 name: CEPH_XATTR_NAME(_type, _name), \
12041 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12046 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
12048 name: CEPH_XATTR_NAME2(_type, _name, _field), \
12049 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
12051 exists_cb: &Client::_vxattrcb_layout_exists, \
12054 #define XATTR_QUOTA_FIELD(_type, _name) \
12056 name: CEPH_XATTR_NAME(_type, _name), \
12057 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12059 exists_cb: &Client::_vxattrcb_quota_exists, \
12063 const Client::VXattr
Client::_dir_vxattrs
[] = {
12065 name
: "ceph.dir.layout",
12066 getxattr_cb
: &Client::_vxattrcb_layout
,
12068 exists_cb
: &Client::_vxattrcb_layout_exists
,
12071 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_unit
),
12072 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_count
),
12073 XATTR_LAYOUT_FIELD(dir
, layout
, object_size
),
12074 XATTR_LAYOUT_FIELD(dir
, layout
, pool
),
12075 XATTR_LAYOUT_FIELD(dir
, layout
, pool_namespace
),
12076 XATTR_NAME_CEPH(dir
, entries
, VXATTR_DIRSTAT
),
12077 XATTR_NAME_CEPH(dir
, files
, VXATTR_DIRSTAT
),
12078 XATTR_NAME_CEPH(dir
, subdirs
, VXATTR_DIRSTAT
),
12079 XATTR_NAME_CEPH(dir
, rentries
, VXATTR_RSTAT
),
12080 XATTR_NAME_CEPH(dir
, rfiles
, VXATTR_RSTAT
),
12081 XATTR_NAME_CEPH(dir
, rsubdirs
, VXATTR_RSTAT
),
12082 XATTR_NAME_CEPH(dir
, rbytes
, VXATTR_RSTAT
),
12083 XATTR_NAME_CEPH(dir
, rctime
, VXATTR_RSTAT
),
12085 name
: "ceph.quota",
12086 getxattr_cb
: &Client::_vxattrcb_quota
,
12088 exists_cb
: &Client::_vxattrcb_quota_exists
,
12091 XATTR_QUOTA_FIELD(quota
, max_bytes
),
12092 XATTR_QUOTA_FIELD(quota
, max_files
),
12094 name
: "ceph.dir.pin",
12095 getxattr_cb
: &Client::_vxattrcb_dir_pin
,
12097 exists_cb
: &Client::_vxattrcb_dir_pin_exists
,
12101 name
: "ceph.snap.btime",
12102 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
12104 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
12107 { name
: "" } /* Required table terminator */
12110 const Client::VXattr
Client::_file_vxattrs
[] = {
12112 name
: "ceph.file.layout",
12113 getxattr_cb
: &Client::_vxattrcb_layout
,
12115 exists_cb
: &Client::_vxattrcb_layout_exists
,
12118 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
12119 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
12120 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
12121 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
12122 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
12124 name
: "ceph.snap.btime",
12125 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
12127 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
12130 { name
: "" } /* Required table terminator */
12133 const Client::VXattr
Client::_common_vxattrs
[] = {
12135 name
: "ceph.cluster_fsid",
12136 getxattr_cb
: &Client::_vxattrcb_cluster_fsid
,
12138 exists_cb
: nullptr,
12142 name
: "ceph.client_id",
12143 getxattr_cb
: &Client::_vxattrcb_client_id
,
12145 exists_cb
: nullptr,
12148 { name
: "" } /* Required table terminator */
12151 const Client::VXattr
*Client::_get_vxattrs(Inode
*in
)
12154 return _dir_vxattrs
;
12155 else if (in
->is_file())
12156 return _file_vxattrs
;
12160 const Client::VXattr
*Client::_match_vxattr(Inode
*in
, const char *name
)
12162 if (strncmp(name
, "ceph.", 5) == 0) {
12163 const VXattr
*vxattr
= _get_vxattrs(in
);
12165 while (!vxattr
->name
.empty()) {
12166 if (vxattr
->name
== name
)
12172 // for common vxattrs
12173 vxattr
= _common_vxattrs
;
12174 while (!vxattr
->name
.empty()) {
12175 if (vxattr
->name
== name
)
12184 int Client::ll_readlink(Inode
*in
, char *buf
, size_t buflen
, const UserPerm
& perms
)
12186 std::lock_guard
lock(client_lock
);
12191 vinodeno_t vino
= _get_vino(in
);
12193 ldout(cct
, 3) << "ll_readlink " << vino
<< dendl
;
12194 tout(cct
) << "ll_readlink" << std::endl
;
12195 tout(cct
) << vino
.ino
.val
<< std::endl
;
12197 for (auto dn
: in
->dentries
) {
12201 int r
= _readlink(in
, buf
, buflen
); // FIXME: no permission checking!
12202 ldout(cct
, 3) << "ll_readlink " << vino
<< " = " << r
<< dendl
;
12206 int Client::_mknod(Inode
*dir
, const char *name
, mode_t mode
, dev_t rdev
,
12207 const UserPerm
& perms
, InodeRef
*inp
)
12209 ldout(cct
, 8) << "_mknod(" << dir
->ino
<< " " << name
<< ", 0" << oct
12210 << mode
<< dec
<< ", " << rdev
<< ", uid " << perms
.uid()
12211 << ", gid " << perms
.gid() << ")" << dendl
;
12213 if (strlen(name
) > NAME_MAX
)
12214 return -ENAMETOOLONG
;
12216 if (dir
->snapid
!= CEPH_NOSNAP
) {
12219 if (is_quota_files_exceeded(dir
, perms
)) {
12223 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_MKNOD
);
12226 dir
->make_nosnap_relative_path(path
);
12227 path
.push_dentry(name
);
12228 req
->set_filepath(path
);
12229 req
->set_inode(dir
);
12230 req
->head
.args
.mknod
.rdev
= rdev
;
12231 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12232 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12234 bufferlist xattrs_bl
;
12235 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12238 req
->head
.args
.mknod
.mode
= mode
;
12239 if (xattrs_bl
.length() > 0)
12240 req
->set_data(xattrs_bl
);
12243 res
= get_or_create(dir
, name
, &de
);
12246 req
->set_dentry(de
);
12248 res
= make_request(req
, perms
, inp
);
12252 ldout(cct
, 8) << "mknod(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12260 int Client::ll_mknod(Inode
*parent
, const char *name
, mode_t mode
,
12261 dev_t rdev
, struct stat
*attr
, Inode
**out
,
12262 const UserPerm
& perms
)
12264 std::lock_guard
lock(client_lock
);
12269 vinodeno_t vparent
= _get_vino(parent
);
12271 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
<< dendl
;
12272 tout(cct
) << "ll_mknod" << std::endl
;
12273 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12274 tout(cct
) << name
<< std::endl
;
12275 tout(cct
) << mode
<< std::endl
;
12276 tout(cct
) << rdev
<< std::endl
;
12278 if (!fuse_default_permissions
) {
12279 int r
= may_create(parent
, perms
);
12285 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12287 fill_stat(in
, attr
);
12290 tout(cct
) << attr
->st_ino
<< std::endl
;
12291 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
12292 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12297 int Client::ll_mknodx(Inode
*parent
, const char *name
, mode_t mode
,
12298 dev_t rdev
, Inode
**out
,
12299 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12300 const UserPerm
& perms
)
12302 unsigned caps
= statx_to_mask(flags
, want
);
12303 std::lock_guard
lock(client_lock
);
12308 vinodeno_t vparent
= _get_vino(parent
);
12310 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
<< dendl
;
12311 tout(cct
) << "ll_mknodx" << std::endl
;
12312 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12313 tout(cct
) << name
<< std::endl
;
12314 tout(cct
) << mode
<< std::endl
;
12315 tout(cct
) << rdev
<< std::endl
;
12317 if (!fuse_default_permissions
) {
12318 int r
= may_create(parent
, perms
);
12324 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12326 fill_statx(in
, caps
, stx
);
12329 tout(cct
) << stx
->stx_ino
<< std::endl
;
12330 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
12331 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12336 int Client::_create(Inode
*dir
, const char *name
, int flags
, mode_t mode
,
12337 InodeRef
*inp
, Fh
**fhp
, int stripe_unit
, int stripe_count
,
12338 int object_size
, const char *data_pool
, bool *created
,
12339 const UserPerm
& perms
)
12341 ldout(cct
, 8) << "_create(" << dir
->ino
<< " " << name
<< ", 0" << oct
<<
12342 mode
<< dec
<< ")" << dendl
;
12344 if (strlen(name
) > NAME_MAX
)
12345 return -ENAMETOOLONG
;
12346 if (dir
->snapid
!= CEPH_NOSNAP
) {
12349 if (is_quota_files_exceeded(dir
, perms
)) {
12353 // use normalized flags to generate cmode
12354 int cflags
= ceph_flags_sys2wire(flags
);
12355 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
12356 cflags
|= CEPH_O_LAZY
;
12358 int cmode
= ceph_flags_to_mode(cflags
);
12360 int64_t pool_id
= -1;
12361 if (data_pool
&& *data_pool
) {
12362 pool_id
= objecter
->with_osdmap(
12363 std::mem_fn(&OSDMap::lookup_pg_pool_name
), data_pool
);
12366 if (pool_id
> 0xffffffffll
)
12367 return -ERANGE
; // bummer!
12370 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_CREATE
);
12373 dir
->make_nosnap_relative_path(path
);
12374 path
.push_dentry(name
);
12375 req
->set_filepath(path
);
12376 req
->set_inode(dir
);
12377 req
->head
.args
.open
.flags
= cflags
| CEPH_O_CREAT
;
12379 req
->head
.args
.open
.stripe_unit
= stripe_unit
;
12380 req
->head
.args
.open
.stripe_count
= stripe_count
;
12381 req
->head
.args
.open
.object_size
= object_size
;
12382 if (cct
->_conf
->client_debug_getattr_caps
)
12383 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
12385 req
->head
.args
.open
.mask
= 0;
12386 req
->head
.args
.open
.pool
= pool_id
;
12387 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12388 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12391 bufferlist xattrs_bl
;
12392 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12395 req
->head
.args
.open
.mode
= mode
;
12396 if (xattrs_bl
.length() > 0)
12397 req
->set_data(xattrs_bl
);
12400 res
= get_or_create(dir
, name
, &de
);
12403 req
->set_dentry(de
);
12405 res
= make_request(req
, perms
, inp
, created
);
12410 /* If the caller passed a value in fhp, do the open */
12412 (*inp
)->get_open_ref(cmode
);
12413 *fhp
= _create_fh(inp
->get(), flags
, cmode
, perms
);
12419 ldout(cct
, 8) << "create(" << path
<< ", 0" << oct
<< mode
<< dec
12420 << " layout " << stripe_unit
12421 << ' ' << stripe_count
12422 << ' ' << object_size
12423 <<") = " << res
<< dendl
;
12432 int Client::_mkdir(Inode
*dir
, const char *name
, mode_t mode
, const UserPerm
& perm
,
12435 ldout(cct
, 8) << "_mkdir(" << dir
->ino
<< " " << name
<< ", 0" << oct
12436 << mode
<< dec
<< ", uid " << perm
.uid()
12437 << ", gid " << perm
.gid() << ")" << dendl
;
12439 if (strlen(name
) > NAME_MAX
)
12440 return -ENAMETOOLONG
;
12442 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12445 if (is_quota_files_exceeded(dir
, perm
)) {
12448 MetaRequest
*req
= new MetaRequest(dir
->snapid
== CEPH_SNAPDIR
?
12449 CEPH_MDS_OP_MKSNAP
: CEPH_MDS_OP_MKDIR
);
12452 dir
->make_nosnap_relative_path(path
);
12453 path
.push_dentry(name
);
12454 req
->set_filepath(path
);
12455 req
->set_inode(dir
);
12456 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12457 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12460 bufferlist xattrs_bl
;
12461 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perm
);
12464 req
->head
.args
.mkdir
.mode
= mode
;
12465 if (xattrs_bl
.length() > 0)
12466 req
->set_data(xattrs_bl
);
12469 res
= get_or_create(dir
, name
, &de
);
12472 req
->set_dentry(de
);
12474 ldout(cct
, 10) << "_mkdir: making request" << dendl
;
12475 res
= make_request(req
, perm
, inp
);
12476 ldout(cct
, 10) << "_mkdir result is " << res
<< dendl
;
12480 ldout(cct
, 8) << "_mkdir(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12488 int Client::ll_mkdir(Inode
*parent
, const char *name
, mode_t mode
,
12489 struct stat
*attr
, Inode
**out
, const UserPerm
& perm
)
12491 std::lock_guard
lock(client_lock
);
12496 vinodeno_t vparent
= _get_vino(parent
);
12498 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
<< dendl
;
12499 tout(cct
) << "ll_mkdir" << std::endl
;
12500 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12501 tout(cct
) << name
<< std::endl
;
12502 tout(cct
) << mode
<< std::endl
;
12504 if (!fuse_default_permissions
) {
12505 int r
= may_create(parent
, perm
);
12511 int r
= _mkdir(parent
, name
, mode
, perm
, &in
);
12513 fill_stat(in
, attr
);
12516 tout(cct
) << attr
->st_ino
<< std::endl
;
12517 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
12518 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12523 int Client::ll_mkdirx(Inode
*parent
, const char *name
, mode_t mode
, Inode
**out
,
12524 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12525 const UserPerm
& perms
)
12527 std::lock_guard
lock(client_lock
);
12532 vinodeno_t vparent
= _get_vino(parent
);
12534 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
<< dendl
;
12535 tout(cct
) << "ll_mkdirx" << std::endl
;
12536 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12537 tout(cct
) << name
<< std::endl
;
12538 tout(cct
) << mode
<< std::endl
;
12540 if (!fuse_default_permissions
) {
12541 int r
= may_create(parent
, perms
);
12547 int r
= _mkdir(parent
, name
, mode
, perms
, &in
);
12549 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12555 tout(cct
) << stx
->stx_ino
<< std::endl
;
12556 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
12557 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12562 int Client::_symlink(Inode
*dir
, const char *name
, const char *target
,
12563 const UserPerm
& perms
, InodeRef
*inp
)
12565 ldout(cct
, 8) << "_symlink(" << dir
->ino
<< " " << name
<< ", " << target
12566 << ", uid " << perms
.uid() << ", gid " << perms
.gid() << ")"
12569 if (strlen(name
) > NAME_MAX
)
12570 return -ENAMETOOLONG
;
12572 if (dir
->snapid
!= CEPH_NOSNAP
) {
12575 if (is_quota_files_exceeded(dir
, perms
)) {
12579 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SYMLINK
);
12582 dir
->make_nosnap_relative_path(path
);
12583 path
.push_dentry(name
);
12584 req
->set_filepath(path
);
12585 req
->set_inode(dir
);
12586 req
->set_string2(target
);
12587 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12588 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12591 int res
= get_or_create(dir
, name
, &de
);
12594 req
->set_dentry(de
);
12596 res
= make_request(req
, perms
, inp
);
12599 ldout(cct
, 8) << "_symlink(\"" << path
<< "\", \"" << target
<< "\") = " <<
12608 int Client::ll_symlink(Inode
*parent
, const char *name
, const char *value
,
12609 struct stat
*attr
, Inode
**out
, const UserPerm
& perms
)
12611 std::lock_guard
lock(client_lock
);
12616 vinodeno_t vparent
= _get_vino(parent
);
12618 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
<< " -> " << value
12620 tout(cct
) << "ll_symlink" << std::endl
;
12621 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12622 tout(cct
) << name
<< std::endl
;
12623 tout(cct
) << value
<< std::endl
;
12625 if (!fuse_default_permissions
) {
12626 int r
= may_create(parent
, perms
);
12632 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12634 fill_stat(in
, attr
);
12637 tout(cct
) << attr
->st_ino
<< std::endl
;
12638 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
12639 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12644 int Client::ll_symlinkx(Inode
*parent
, const char *name
, const char *value
,
12645 Inode
**out
, struct ceph_statx
*stx
, unsigned want
,
12646 unsigned flags
, const UserPerm
& perms
)
12648 std::lock_guard
lock(client_lock
);
12653 vinodeno_t vparent
= _get_vino(parent
);
12655 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
<< " -> " << value
12657 tout(cct
) << "ll_symlinkx" << std::endl
;
12658 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12659 tout(cct
) << name
<< std::endl
;
12660 tout(cct
) << value
<< std::endl
;
12662 if (!fuse_default_permissions
) {
12663 int r
= may_create(parent
, perms
);
12669 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12671 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12674 tout(cct
) << stx
->stx_ino
<< std::endl
;
12675 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
12676 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12681 int Client::_unlink(Inode
*dir
, const char *name
, const UserPerm
& perm
)
12683 ldout(cct
, 8) << "_unlink(" << dir
->ino
<< " " << name
12684 << " uid " << perm
.uid() << " gid " << perm
.gid()
12687 if (dir
->snapid
!= CEPH_NOSNAP
) {
12691 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_UNLINK
);
12694 dir
->make_nosnap_relative_path(path
);
12695 path
.push_dentry(name
);
12696 req
->set_filepath(path
);
12702 int res
= get_or_create(dir
, name
, &de
);
12705 req
->set_dentry(de
);
12706 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12707 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12709 res
= _lookup(dir
, name
, 0, &otherin
, perm
);
12713 in
= otherin
.get();
12714 req
->set_other_inode(in
);
12715 in
->break_all_delegs();
12716 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12718 req
->set_inode(dir
);
12720 res
= make_request(req
, perm
);
12723 ldout(cct
, 8) << "unlink(" << path
<< ") = " << res
<< dendl
;
12731 int Client::ll_unlink(Inode
*in
, const char *name
, const UserPerm
& perm
)
12733 std::lock_guard
lock(client_lock
);
12738 vinodeno_t vino
= _get_vino(in
);
12740 ldout(cct
, 3) << "ll_unlink " << vino
<< " " << name
<< dendl
;
12741 tout(cct
) << "ll_unlink" << std::endl
;
12742 tout(cct
) << vino
.ino
.val
<< std::endl
;
12743 tout(cct
) << name
<< std::endl
;
12745 if (!fuse_default_permissions
) {
12746 int r
= may_delete(in
, name
, perm
);
12750 return _unlink(in
, name
, perm
);
12753 int Client::_rmdir(Inode
*dir
, const char *name
, const UserPerm
& perms
)
12755 ldout(cct
, 8) << "_rmdir(" << dir
->ino
<< " " << name
<< " uid "
12756 << perms
.uid() << " gid " << perms
.gid() << ")" << dendl
;
12758 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12762 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_RMSNAP
: CEPH_MDS_OP_RMDIR
;
12763 MetaRequest
*req
= new MetaRequest(op
);
12765 dir
->make_nosnap_relative_path(path
);
12766 path
.push_dentry(name
);
12767 req
->set_filepath(path
);
12768 req
->set_inode(dir
);
12770 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12771 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12772 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12777 int res
= get_or_create(dir
, name
, &de
);
12780 if (op
== CEPH_MDS_OP_RMDIR
)
12781 req
->set_dentry(de
);
12785 res
= _lookup(dir
, name
, 0, &in
, perms
);
12789 if (op
== CEPH_MDS_OP_RMSNAP
) {
12790 unlink(de
, true, true);
12793 req
->set_other_inode(in
.get());
12795 res
= make_request(req
, perms
);
12798 ldout(cct
, 8) << "rmdir(" << path
<< ") = " << res
<< dendl
;
12806 int Client::ll_rmdir(Inode
*in
, const char *name
, const UserPerm
& perms
)
12808 std::lock_guard
lock(client_lock
);
12813 vinodeno_t vino
= _get_vino(in
);
12815 ldout(cct
, 3) << "ll_rmdir " << vino
<< " " << name
<< dendl
;
12816 tout(cct
) << "ll_rmdir" << std::endl
;
12817 tout(cct
) << vino
.ino
.val
<< std::endl
;
12818 tout(cct
) << name
<< std::endl
;
12820 if (!fuse_default_permissions
) {
12821 int r
= may_delete(in
, name
, perms
);
12826 return _rmdir(in
, name
, perms
);
12829 int Client::_rename(Inode
*fromdir
, const char *fromname
, Inode
*todir
, const char *toname
, const UserPerm
& perm
)
12831 ldout(cct
, 8) << "_rename(" << fromdir
->ino
<< " " << fromname
<< " to "
12832 << todir
->ino
<< " " << toname
12833 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")"
12836 if (fromdir
->snapid
!= todir
->snapid
)
12839 int op
= CEPH_MDS_OP_RENAME
;
12840 if (fromdir
->snapid
!= CEPH_NOSNAP
) {
12841 if (fromdir
== todir
&& fromdir
->snapid
== CEPH_SNAPDIR
)
12842 op
= CEPH_MDS_OP_RENAMESNAP
;
12848 MetaRequest
*req
= new MetaRequest(op
);
12851 fromdir
->make_nosnap_relative_path(from
);
12852 from
.push_dentry(fromname
);
12854 todir
->make_nosnap_relative_path(to
);
12855 to
.push_dentry(toname
);
12856 req
->set_filepath(to
);
12857 req
->set_filepath2(from
);
12860 int res
= get_or_create(fromdir
, fromname
, &oldde
);
12864 res
= get_or_create(todir
, toname
, &de
);
12868 if (op
== CEPH_MDS_OP_RENAME
) {
12869 req
->set_old_dentry(oldde
);
12870 req
->old_dentry_drop
= CEPH_CAP_FILE_SHARED
;
12871 req
->old_dentry_unless
= CEPH_CAP_FILE_EXCL
;
12873 req
->set_dentry(de
);
12874 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12875 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12877 InodeRef oldin
, otherin
;
12878 Inode
*fromdir_root
= nullptr;
12879 Inode
*todir_root
= nullptr;
12881 bool quota_check
= false;
12882 if (fromdir
!= todir
) {
12884 fromdir
->quota
.is_enable() ? fromdir
: get_quota_root(fromdir
, perm
);
12886 todir
->quota
.is_enable() ? todir
: get_quota_root(todir
, perm
);
12888 if (todir_root
->quota
.is_enable() && fromdir_root
!= todir_root
) {
12889 // use CEPH_STAT_RSTAT mask to force send getattr or lookup request
12890 // to auth MDS to get latest rstat for todir_root and source dir
12891 // even if their dentry caches and inode caps are satisfied.
12892 res
= _getattr(todir_root
, CEPH_STAT_RSTAT
, perm
, true);
12896 quota_check
= true;
12897 if (oldde
->inode
&& oldde
->inode
->is_dir()) {
12898 mask
|= CEPH_STAT_RSTAT
;
12903 res
= _lookup(fromdir
, fromname
, mask
, &oldin
, perm
);
12907 Inode
*oldinode
= oldin
.get();
12908 oldinode
->break_all_delegs();
12909 req
->set_old_inode(oldinode
);
12910 req
->old_inode_drop
= CEPH_CAP_LINK_SHARED
;
12913 int64_t old_bytes
, old_files
;
12914 if (oldinode
->is_dir()) {
12915 old_bytes
= oldinode
->rstat
.rbytes
;
12916 old_files
= oldinode
->rstat
.rsize();
12918 old_bytes
= oldinode
->size
;
12922 bool quota_exceed
= false;
12923 if (todir_root
&& todir_root
->quota
.max_bytes
&&
12924 (old_bytes
+ todir_root
->rstat
.rbytes
) >= todir_root
->quota
.max_bytes
) {
12925 ldout(cct
, 10) << "_rename (" << oldinode
->ino
<< " bytes="
12926 << old_bytes
<< ") to (" << todir
->ino
12927 << ") will exceed quota on " << *todir_root
<< dendl
;
12928 quota_exceed
= true;
12931 if (todir_root
&& todir_root
->quota
.max_files
&&
12932 (old_files
+ todir_root
->rstat
.rsize()) >= todir_root
->quota
.max_files
) {
12933 ldout(cct
, 10) << "_rename (" << oldinode
->ino
<< " files="
12934 << old_files
<< ") to (" << todir
->ino
12935 << ") will exceed quota on " << *todir_root
<< dendl
;
12936 quota_exceed
= true;
12939 if (quota_exceed
) {
12940 res
= (oldinode
->is_dir()) ? -EXDEV
: -EDQUOT
;
12945 res
= _lookup(todir
, toname
, 0, &otherin
, perm
);
12949 Inode
*in
= otherin
.get();
12950 req
->set_other_inode(in
);
12951 in
->break_all_delegs();
12953 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12961 req
->set_inode(todir
);
12963 // renamesnap reply contains no tracedn, so we need to invalidate
12965 unlink(oldde
, true, true);
12966 unlink(de
, true, true);
12968 req
->set_inode(todir
);
12971 res
= make_request(req
, perm
, &target
);
12972 ldout(cct
, 10) << "rename result is " << res
<< dendl
;
12974 // renamed item from our cache
12977 ldout(cct
, 8) << "_rename(" << from
<< ", " << to
<< ") = " << res
<< dendl
;
12985 int Client::ll_rename(Inode
*parent
, const char *name
, Inode
*newparent
,
12986 const char *newname
, const UserPerm
& perm
)
12988 std::lock_guard
lock(client_lock
);
12993 vinodeno_t vparent
= _get_vino(parent
);
12994 vinodeno_t vnewparent
= _get_vino(newparent
);
12996 ldout(cct
, 3) << "ll_rename " << vparent
<< " " << name
<< " to "
12997 << vnewparent
<< " " << newname
<< dendl
;
12998 tout(cct
) << "ll_rename" << std::endl
;
12999 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13000 tout(cct
) << name
<< std::endl
;
13001 tout(cct
) << vnewparent
.ino
.val
<< std::endl
;
13002 tout(cct
) << newname
<< std::endl
;
13004 if (!fuse_default_permissions
) {
13005 int r
= may_delete(parent
, name
, perm
);
13008 r
= may_delete(newparent
, newname
, perm
);
13009 if (r
< 0 && r
!= -ENOENT
)
13013 return _rename(parent
, name
, newparent
, newname
, perm
);
13016 int Client::_link(Inode
*in
, Inode
*dir
, const char *newname
, const UserPerm
& perm
, InodeRef
*inp
)
13018 ldout(cct
, 8) << "_link(" << in
->ino
<< " to " << dir
->ino
<< " " << newname
13019 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")" << dendl
;
13021 if (strlen(newname
) > NAME_MAX
)
13022 return -ENAMETOOLONG
;
13024 if (in
->snapid
!= CEPH_NOSNAP
|| dir
->snapid
!= CEPH_NOSNAP
) {
13027 if (is_quota_files_exceeded(dir
, perm
)) {
13031 in
->break_all_delegs();
13032 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LINK
);
13034 filepath
path(newname
, dir
->ino
);
13035 req
->set_filepath(path
);
13036 filepath
existing(in
->ino
);
13037 req
->set_filepath2(existing
);
13039 req
->set_inode(dir
);
13040 req
->inode_drop
= CEPH_CAP_FILE_SHARED
;
13041 req
->inode_unless
= CEPH_CAP_FILE_EXCL
;
13044 int res
= get_or_create(dir
, newname
, &de
);
13047 req
->set_dentry(de
);
13049 res
= make_request(req
, perm
, inp
);
13050 ldout(cct
, 10) << "link result is " << res
<< dendl
;
13053 ldout(cct
, 8) << "link(" << existing
<< ", " << path
<< ") = " << res
<< dendl
;
13061 int Client::ll_link(Inode
*in
, Inode
*newparent
, const char *newname
,
13062 const UserPerm
& perm
)
13064 std::lock_guard
lock(client_lock
);
13069 vinodeno_t vino
= _get_vino(in
);
13070 vinodeno_t vnewparent
= _get_vino(newparent
);
13072 ldout(cct
, 3) << "ll_link " << vino
<< " to " << vnewparent
<< " " <<
13074 tout(cct
) << "ll_link" << std::endl
;
13075 tout(cct
) << vino
.ino
.val
<< std::endl
;
13076 tout(cct
) << vnewparent
<< std::endl
;
13077 tout(cct
) << newname
<< std::endl
;
13081 if (!fuse_default_permissions
) {
13082 if (S_ISDIR(in
->mode
))
13085 int r
= may_hardlink(in
, perm
);
13089 r
= may_create(newparent
, perm
);
13094 return _link(in
, newparent
, newname
, perm
, &target
);
13097 int Client::ll_num_osds(void)
13099 std::lock_guard
lock(client_lock
);
13100 return objecter
->with_osdmap(std::mem_fn(&OSDMap::get_num_osds
));
13103 int Client::ll_osdaddr(int osd
, uint32_t *addr
)
13105 std::lock_guard
lock(client_lock
);
13108 bool exists
= objecter
->with_osdmap([&](const OSDMap
& o
) {
13109 if (!o
.exists(osd
))
13111 g
= o
.get_addrs(osd
).front();
13116 uint32_t nb_addr
= (g
.in4_addr()).sin_addr
.s_addr
;
13117 *addr
= ntohl(nb_addr
);
13121 uint32_t Client::ll_stripe_unit(Inode
*in
)
13123 std::lock_guard
lock(client_lock
);
13124 return in
->layout
.stripe_unit
;
13127 uint64_t Client::ll_snap_seq(Inode
*in
)
13129 std::lock_guard
lock(client_lock
);
13130 return in
->snaprealm
->seq
;
13133 int Client::ll_file_layout(Inode
*in
, file_layout_t
*layout
)
13135 std::lock_guard
lock(client_lock
);
13136 *layout
= in
->layout
;
13140 int Client::ll_file_layout(Fh
*fh
, file_layout_t
*layout
)
13142 return ll_file_layout(fh
->inode
.get(), layout
);
13145 /* Currently we cannot take advantage of redundancy in reads, since we
13146 would have to go through all possible placement groups (a
13147 potentially quite large number determined by a hash), and use CRUSH
13148 to calculate the appropriate set of OSDs for each placement group,
13149 then index into that. An array with one entry per OSD is much more
13150 tractable and works for demonstration purposes. */
13152 int Client::ll_get_stripe_osd(Inode
*in
, uint64_t blockno
,
13153 file_layout_t
* layout
)
13155 std::lock_guard
lock(client_lock
);
13157 inodeno_t ino
= in
->ino
;
13158 uint32_t object_size
= layout
->object_size
;
13159 uint32_t su
= layout
->stripe_unit
;
13160 uint32_t stripe_count
= layout
->stripe_count
;
13161 uint64_t stripes_per_object
= object_size
/ su
;
13162 uint64_t stripeno
= 0, stripepos
= 0;
13165 stripeno
= blockno
/ stripe_count
; // which horizontal stripe (Y)
13166 stripepos
= blockno
% stripe_count
; // which object in the object set (X)
13168 uint64_t objectsetno
= stripeno
/ stripes_per_object
; // which object set
13169 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
; // object id
13171 object_t oid
= file_object_t(ino
, objectno
);
13172 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13173 ceph_object_layout olayout
=
13174 o
.file_to_object_layout(oid
, *layout
);
13175 pg_t pg
= (pg_t
)olayout
.ol_pgid
;
13178 o
.pg_to_acting_osds(pg
, &osds
, &primary
);
13183 /* Return the offset of the block, internal to the object */
13185 uint64_t Client::ll_get_internal_offset(Inode
*in
, uint64_t blockno
)
13187 std::lock_guard
lock(client_lock
);
13188 file_layout_t
*layout
=&(in
->layout
);
13189 uint32_t object_size
= layout
->object_size
;
13190 uint32_t su
= layout
->stripe_unit
;
13191 uint64_t stripes_per_object
= object_size
/ su
;
13193 return (blockno
% stripes_per_object
) * su
;
13196 int Client::ll_opendir(Inode
*in
, int flags
, dir_result_t
** dirpp
,
13197 const UserPerm
& perms
)
13199 std::lock_guard
lock(client_lock
);
13204 vinodeno_t vino
= _get_vino(in
);
13206 ldout(cct
, 3) << "ll_opendir " << vino
<< dendl
;
13207 tout(cct
) << "ll_opendir" << std::endl
;
13208 tout(cct
) << vino
.ino
.val
<< std::endl
;
13210 if (!fuse_default_permissions
) {
13211 int r
= may_open(in
, flags
, perms
);
13216 int r
= _opendir(in
, dirpp
, perms
);
13217 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
13219 ldout(cct
, 3) << "ll_opendir " << vino
<< " = " << r
<< " (" << *dirpp
<< ")"
13224 int Client::ll_releasedir(dir_result_t
*dirp
)
13226 std::lock_guard
lock(client_lock
);
13227 ldout(cct
, 3) << "ll_releasedir " << dirp
<< dendl
;
13228 tout(cct
) << "ll_releasedir" << std::endl
;
13229 tout(cct
) << (unsigned long)dirp
<< std::endl
;
13238 int Client::ll_fsyncdir(dir_result_t
*dirp
)
13240 std::lock_guard
lock(client_lock
);
13241 ldout(cct
, 3) << "ll_fsyncdir " << dirp
<< dendl
;
13242 tout(cct
) << "ll_fsyncdir" << std::endl
;
13243 tout(cct
) << (unsigned long)dirp
<< std::endl
;
13248 return _fsync(dirp
->inode
.get(), false);
13251 int Client::ll_open(Inode
*in
, int flags
, Fh
**fhp
, const UserPerm
& perms
)
13253 ceph_assert(!(flags
& O_CREAT
));
13255 std::lock_guard
lock(client_lock
);
13260 vinodeno_t vino
= _get_vino(in
);
13262 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) << dendl
;
13263 tout(cct
) << "ll_open" << std::endl
;
13264 tout(cct
) << vino
.ino
.val
<< std::endl
;
13265 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13268 if (!fuse_default_permissions
) {
13269 r
= may_open(in
, flags
, perms
);
13274 r
= _open(in
, flags
, 0, fhp
/* may be NULL */, perms
);
13277 Fh
*fhptr
= fhp
? *fhp
: NULL
;
13279 ll_unclosed_fh_set
.insert(fhptr
);
13281 tout(cct
) << (unsigned long)fhptr
<< std::endl
;
13282 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) <<
13283 " = " << r
<< " (" << fhptr
<< ")" << dendl
;
13287 int Client::_ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13288 int flags
, InodeRef
*in
, int caps
, Fh
**fhp
,
13289 const UserPerm
& perms
)
13293 vinodeno_t vparent
= _get_vino(parent
);
13295 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13296 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << ", uid " << perms
.uid()
13297 << ", gid " << perms
.gid() << dendl
;
13298 tout(cct
) << "ll_create" << std::endl
;
13299 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13300 tout(cct
) << name
<< std::endl
;
13301 tout(cct
) << mode
<< std::endl
;
13302 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13304 bool created
= false;
13305 int r
= _lookup(parent
, name
, caps
, in
, perms
);
13307 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
13310 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
13311 if (!fuse_default_permissions
) {
13312 r
= may_create(parent
, perms
);
13316 r
= _create(parent
, name
, flags
, mode
, in
, fhp
, 0, 0, 0, NULL
, &created
,
13327 ldout(cct
, 20) << "_ll_create created = " << created
<< dendl
;
13329 if (!fuse_default_permissions
) {
13330 r
= may_open(in
->get(), flags
, perms
);
13333 int release_r
= _release_fh(*fhp
);
13334 ceph_assert(release_r
== 0); // during create, no async data ops should have happened
13339 if (*fhp
== NULL
) {
13340 r
= _open(in
->get(), flags
, mode
, fhp
, perms
);
13348 ll_unclosed_fh_set
.insert(*fhp
);
13353 Inode
*inode
= in
->get();
13354 if (use_faked_inos())
13355 ino
= inode
->faked_ino
;
13360 tout(cct
) << (unsigned long)*fhp
<< std::endl
;
13361 tout(cct
) << ino
<< std::endl
;
13362 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13363 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << " = " << r
<< " (" <<
13364 *fhp
<< " " << hex
<< ino
<< dec
<< ")" << dendl
;
13369 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13370 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
13371 const UserPerm
& perms
)
13373 std::lock_guard
lock(client_lock
);
13379 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
13384 // passing an Inode in outp requires an additional ref
13389 fill_stat(in
, attr
);
13397 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
13398 int oflags
, Inode
**outp
, Fh
**fhp
,
13399 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
13400 const UserPerm
& perms
)
13402 unsigned caps
= statx_to_mask(lflags
, want
);
13403 std::lock_guard
lock(client_lock
);
13409 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
13413 // passing an Inode in outp requires an additional ref
13418 fill_statx(in
, caps
, stx
);
13427 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
13429 std::lock_guard
lock(client_lock
);
13430 tout(cct
) << "ll_lseek" << std::endl
;
13431 tout(cct
) << offset
<< std::endl
;
13432 tout(cct
) << whence
<< std::endl
;
13437 return _lseek(fh
, offset
, whence
);
13440 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
13442 std::lock_guard
lock(client_lock
);
13443 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
13444 tout(cct
) << "ll_read" << std::endl
;
13445 tout(cct
) << (unsigned long)fh
<< std::endl
;
13446 tout(cct
) << off
<< std::endl
;
13447 tout(cct
) << len
<< std::endl
;
13452 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13453 len
= std::min(len
, (loff_t
)INT_MAX
);
13454 int r
= _read(fh
, off
, len
, bl
);
13455 ldout(cct
, 3) << "ll_read " << fh
<< " " << off
<< "~" << len
<< " = " << r
13460 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
13464 file_layout_t
* layout
)
13466 std::lock_guard
lock(client_lock
);
13471 vinodeno_t vino
= _get_vino(in
);
13472 object_t oid
= file_object_t(vino
.ino
, blockid
);
13473 C_SaferCond onfinish
;
13476 objecter
->read(oid
,
13477 object_locator_t(layout
->pool_id
),
13482 CEPH_OSD_FLAG_READ
,
13485 client_lock
.unlock();
13486 int r
= onfinish
.wait();
13487 client_lock
.lock();
13490 bl
.begin().copy(bl
.length(), buf
);
13497 /* It appears that the OSD doesn't return success unless the entire
13498 buffer was written, return the write length on success. */
13500 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
13501 char* buf
, uint64_t offset
,
13502 uint64_t length
, file_layout_t
* layout
,
13503 uint64_t snapseq
, uint32_t sync
)
13505 vinodeno_t vino
= ll_get_vino(in
);
13507 std::unique_ptr
<C_SaferCond
> onsafe
= nullptr;
13512 if (true || sync
) {
13513 /* if write is stable, the epilogue is waiting on
13515 onsafe
.reset(new C_SaferCond("Client::ll_write_block flock"));
13517 object_t oid
= file_object_t(vino
.ino
, blockid
);
13518 SnapContext fakesnap
;
13519 ceph::bufferlist bl
;
13521 bl
.push_back(buffer::copy(buf
, length
));
13524 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
13527 fakesnap
.seq
= snapseq
;
13529 /* lock just in time */
13530 client_lock
.lock();
13532 client_lock
.unlock();
13536 objecter
->write(oid
,
13537 object_locator_t(layout
->pool_id
),
13542 ceph::real_clock::now(),
13546 client_lock
.unlock();
13547 if (nullptr != onsafe
) {
13548 r
= onsafe
->wait();
13558 int Client::ll_commit_blocks(Inode
*in
,
13562 std::lock_guard
lock(client_lock
);
13564 BarrierContext *bctx;
13565 vinodeno_t vino = _get_vino(in);
13566 uint64_t ino = vino.ino;
13568 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13569 << offset << " to " << length << dendl;
13575 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13576 if (p != barriers.end()) {
13577 barrier_interval civ(offset, offset + length);
13578 p->second->commit_barrier(civ);
13584 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
13586 std::lock_guard
lock(client_lock
);
13587 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
13588 "~" << len
<< dendl
;
13589 tout(cct
) << "ll_write" << std::endl
;
13590 tout(cct
) << (unsigned long)fh
<< std::endl
;
13591 tout(cct
) << off
<< std::endl
;
13592 tout(cct
) << len
<< std::endl
;
13597 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13598 len
= std::min(len
, (loff_t
)INT_MAX
);
13599 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
13600 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
13605 int64_t Client::ll_writev(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13607 std::lock_guard
lock(client_lock
);
13610 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, true, false);
13613 int64_t Client::ll_readv(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13615 std::lock_guard
lock(client_lock
);
13618 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, false, false);
13621 int Client::ll_flush(Fh
*fh
)
13623 std::lock_guard
lock(client_lock
);
13624 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13625 tout(cct
) << "ll_flush" << std::endl
;
13626 tout(cct
) << (unsigned long)fh
<< std::endl
;
13634 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
13636 std::lock_guard
lock(client_lock
);
13637 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13638 tout(cct
) << "ll_fsync" << std::endl
;
13639 tout(cct
) << (unsigned long)fh
<< std::endl
;
13644 int r
= _fsync(fh
, syncdataonly
);
13646 // If we're returning an error, clear it from the FH
13647 fh
->take_async_err();
13652 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
13654 std::lock_guard
lock(client_lock
);
13655 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
13656 tout(cct
) << "ll_sync_inode" << std::endl
;
13657 tout(cct
) << (unsigned long)in
<< std::endl
;
13662 return _fsync(in
, syncdataonly
);
13665 #ifdef FALLOC_FL_PUNCH_HOLE
13667 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13669 if (offset
< 0 || length
<= 0)
13672 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
13673 return -EOPNOTSUPP
;
13675 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
13676 return -EOPNOTSUPP
;
13678 Inode
*in
= fh
->inode
.get();
13680 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
13681 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
13685 if (in
->snapid
!= CEPH_NOSNAP
)
13688 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
13691 uint64_t size
= offset
+ length
;
13692 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
13694 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
)) {
13699 int r
= get_caps(fh
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
13703 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
13704 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
13705 if (in
->inline_version
< CEPH_INLINE_NONE
&&
13706 (have
& CEPH_CAP_FILE_BUFFER
)) {
13708 auto inline_iter
= in
->inline_data
.cbegin();
13709 int len
= in
->inline_data
.length();
13710 if (offset
< len
) {
13712 inline_iter
.copy(offset
, bl
);
13714 if (offset
+ size
> len
)
13715 size
= len
- offset
;
13717 bl
.append_zero(size
);
13718 if (offset
+ size
< len
) {
13719 inline_iter
+= size
;
13720 inline_iter
.copy(len
- offset
- size
, bl
);
13722 in
->inline_data
= bl
;
13723 in
->inline_version
++;
13725 in
->mtime
= in
->ctime
= ceph_clock_now();
13727 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13729 if (in
->inline_version
< CEPH_INLINE_NONE
) {
13730 onuninline
.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
13731 uninline_data(in
, onuninline
.get());
13734 C_SaferCond
onfinish("Client::_punch_hole flock");
13736 unsafe_sync_write
++;
13737 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
13739 _invalidate_inode_cache(in
, offset
, length
);
13740 filer
->zero(in
->ino
, &in
->layout
,
13741 in
->snaprealm
->get_snap_context(),
13743 ceph::real_clock::now(),
13744 0, true, &onfinish
);
13745 in
->mtime
= in
->ctime
= ceph_clock_now();
13747 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13749 client_lock
.unlock();
13751 client_lock
.lock();
13752 _sync_write_commit(in
);
13754 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
13755 uint64_t size
= offset
+ length
;
13756 if (size
> in
->size
) {
13758 in
->mtime
= in
->ctime
= ceph_clock_now();
13760 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13762 if (is_quota_bytes_approaching(in
, fh
->actor_perms
)) {
13763 check_caps(in
, CHECK_CAPS_NODELAY
);
13764 } else if (is_max_size_approaching(in
)) {
13770 if (nullptr != onuninline
) {
13771 client_lock
.unlock();
13772 int ret
= onuninline
->wait();
13773 client_lock
.lock();
13775 if (ret
>= 0 || ret
== -ECANCELED
) {
13776 in
->inline_data
.clear();
13777 in
->inline_version
= CEPH_INLINE_NONE
;
13778 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13784 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
13789 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13791 return -EOPNOTSUPP
;
13797 int Client::ll_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13799 std::lock_guard
lock(client_lock
);
13800 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13801 tout(cct
) << __func__
<< " " << mode
<< " " << offset
<< " " << length
<< std::endl
;
13802 tout(cct
) << (unsigned long)fh
<< std::endl
;
13807 return _fallocate(fh
, mode
, offset
, length
);
13810 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
13812 std::lock_guard
lock(client_lock
);
13813 tout(cct
) << __func__
<< " " << " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
13818 Fh
*fh
= get_filehandle(fd
);
13821 #if defined(__linux__) && defined(O_PATH)
13822 if (fh
->flags
& O_PATH
)
13825 return _fallocate(fh
, mode
, offset
, length
);
13828 int Client::ll_release(Fh
*fh
)
13830 std::lock_guard
lock(client_lock
);
13835 ldout(cct
, 3) << __func__
<< " (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
13837 tout(cct
) << __func__
<< " (fh)" << std::endl
;
13838 tout(cct
) << (unsigned long)fh
<< std::endl
;
13840 if (ll_unclosed_fh_set
.count(fh
))
13841 ll_unclosed_fh_set
.erase(fh
);
13842 return _release_fh(fh
);
13845 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
13847 std::lock_guard
lock(client_lock
);
13849 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
13850 tout(cct
) << "ll_getk (fh)" << (unsigned long)fh
<< std::endl
;
13855 return _getlk(fh
, fl
, owner
);
13858 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
13860 std::lock_guard
lock(client_lock
);
13862 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13863 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13868 return _setlk(fh
, fl
, owner
, sleep
);
13871 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
13873 std::lock_guard
lock(client_lock
);
13875 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13876 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13881 return _flock(fh
, cmd
, owner
);
13884 int Client::set_deleg_timeout(uint32_t timeout
)
13886 std::lock_guard
lock(client_lock
);
13889 * The whole point is to prevent blacklisting so we must time out the
13890 * delegation before the session autoclose timeout kicks in.
13892 if (timeout
>= mdsmap
->get_session_autoclose())
13895 deleg_timeout
= timeout
;
13899 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
13903 std::lock_guard
lock(client_lock
);
13908 Inode
*inode
= fh
->inode
.get();
13911 case CEPH_DELEGATION_NONE
:
13912 inode
->unset_deleg(fh
);
13917 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
13918 } catch (std::bad_alloc
&) {
13926 class C_Client_RequestInterrupt
: public Context
{
13931 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
13934 void finish(int r
) override
{
13935 std::lock_guard
l(client
->client_lock
);
13936 ceph_assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
13937 client
->_interrupt_filelock(req
);
13938 client
->put_request(req
);
13942 void Client::ll_interrupt(void *d
)
13944 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
13945 ldout(cct
, 3) << __func__
<< " tid " << req
->get_tid() << dendl
;
13946 tout(cct
) << __func__
<< " tid " << req
->get_tid() << std::endl
;
13947 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
13950 // =========================================
13953 // expose file layouts
13955 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
13956 const UserPerm
& perms
)
13958 std::lock_guard
lock(client_lock
);
13963 filepath
path(relpath
);
13965 int r
= path_walk(path
, &in
, perms
);
13971 ldout(cct
, 3) << __func__
<< "(" << relpath
<< ") = 0" << dendl
;
13975 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
13977 std::lock_guard
lock(client_lock
);
13982 Fh
*f
= get_filehandle(fd
);
13985 Inode
*in
= f
->inode
.get();
13989 ldout(cct
, 3) << __func__
<< "(" << fd
<< ") = 0" << dendl
;
13993 int64_t Client::get_default_pool_id()
13995 std::lock_guard
lock(client_lock
);
14000 /* first data pool is the default */
14001 return mdsmap
->get_first_data_pool();
14006 int64_t Client::get_pool_id(const char *pool_name
)
14008 std::lock_guard
lock(client_lock
);
14013 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
14017 string
Client::get_pool_name(int64_t pool
)
14019 std::lock_guard
lock(client_lock
);
14024 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
14025 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
14029 int Client::get_pool_replication(int64_t pool
)
14031 std::lock_guard
lock(client_lock
);
14036 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
14037 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -ENOENT
;
14041 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
14043 std::lock_guard
lock(client_lock
);
14048 Fh
*f
= get_filehandle(fd
);
14051 Inode
*in
= f
->inode
.get();
14053 vector
<ObjectExtent
> extents
;
14054 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
14055 ceph_assert(extents
.size() == 1);
14057 objecter
->with_osdmap([&](const OSDMap
& o
) {
14058 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
14059 o
.pg_to_acting_osds(pg
, osds
);
14066 * Return the remainder of the extent (stripe unit)
14068 * If length = 1 is passed to Striper::file_to_extents we get a single
14069 * extent back, but its length is one so we still need to compute the length
14070 * to the end of the stripe unit.
14072 * If length = su then we may get 1 or 2 objects back in the extents vector
14073 * which would have to be examined. Even then, the offsets are local to the
14074 * object, so matching up to the file offset is extra work.
14076 * It seems simpler to stick with length = 1 and manually compute the
14080 uint64_t su
= in
->layout
.stripe_unit
;
14081 *len
= su
- (off
% su
);
14087 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
14089 std::lock_guard
lock(client_lock
);
14096 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14097 return o
.crush
->get_full_location_ordered(id
, path
);
14101 int Client::get_file_stripe_address(int fd
, loff_t offset
,
14102 vector
<entity_addr_t
>& address
)
14104 std::lock_guard
lock(client_lock
);
14109 Fh
*f
= get_filehandle(fd
);
14112 Inode
*in
= f
->inode
.get();
14115 vector
<ObjectExtent
> extents
;
14116 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
14117 in
->truncate_size
, extents
);
14118 ceph_assert(extents
.size() == 1);
14120 // now we have the object and its 'layout'
14121 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14122 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
14124 o
.pg_to_acting_osds(pg
, osds
);
14127 for (unsigned i
= 0; i
< osds
.size(); i
++) {
14128 entity_addr_t addr
= o
.get_addrs(osds
[i
]).front();
14129 address
.push_back(addr
);
14135 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
14137 std::lock_guard
lock(client_lock
);
14142 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14143 if (!o
.exists(osd
))
14146 addr
= o
.get_addrs(osd
).front();
14151 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
14152 loff_t length
, loff_t offset
)
14154 std::lock_guard
lock(client_lock
);
14159 Fh
*f
= get_filehandle(fd
);
14162 Inode
*in
= f
->inode
.get();
14164 // map to a list of extents
14165 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
14167 ldout(cct
, 3) << __func__
<< "(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
14172 /* find an osd with the same ip. -ENXIO if none. */
14173 int Client::get_local_osd()
14175 std::lock_guard
lock(client_lock
);
14180 objecter
->with_osdmap([this](const OSDMap
& o
) {
14181 if (o
.get_epoch() != local_osd_epoch
) {
14182 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddrs().front());
14183 local_osd_epoch
= o
.get_epoch();
14194 // ===============================
14196 void Client::ms_handle_connect(Connection
*con
)
14198 ldout(cct
, 10) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14201 bool Client::ms_handle_reset(Connection
*con
)
14203 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14207 void Client::ms_handle_remote_reset(Connection
*con
)
14209 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14210 std::lock_guard
l(client_lock
);
14211 switch (con
->get_peer_type()) {
14212 case CEPH_ENTITY_TYPE_MDS
:
14214 // kludge to figure out which mds this is; fixme with a Connection* state
14215 mds_rank_t mds
= MDS_RANK_NONE
;
14216 MetaSession
*s
= NULL
;
14217 for (auto &p
: mds_sessions
) {
14218 if (mdsmap
->get_addrs(p
.first
) == con
->get_peer_addrs()) {
14224 assert (s
!= NULL
);
14225 switch (s
->state
) {
14226 case MetaSession::STATE_CLOSING
:
14227 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
14228 _closed_mds_session(s
);
14231 case MetaSession::STATE_OPENING
:
14233 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
14234 list
<Context
*> waiters
;
14235 waiters
.swap(s
->waiting_for_open
);
14236 _closed_mds_session(s
);
14237 MetaSession
*news
= _get_or_open_mds_session(mds
);
14238 news
->waiting_for_open
.swap(waiters
);
14242 case MetaSession::STATE_OPEN
:
14244 objecter
->maybe_request_map(); /* to check if we are blacklisted */
14245 if (cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
14246 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
14247 _closed_mds_session(s
);
14249 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
14250 s
->state
= MetaSession::STATE_STALE
;
14255 case MetaSession::STATE_NEW
:
14256 case MetaSession::STATE_CLOSED
:
14266 bool Client::ms_handle_refused(Connection
*con
)
14268 ldout(cct
, 1) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14272 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
14274 Inode
*quota_in
= root_ancestor
;
14275 SnapRealm
*realm
= in
->snaprealm
;
14277 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
14278 if (realm
->ino
!= in
->ino
) {
14279 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
14280 if (p
== inode_map
.end())
14283 if (p
->second
->quota
.is_enable()) {
14284 quota_in
= p
->second
;
14288 realm
= realm
->pparent
;
14290 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
14295 * Traverse quota ancestors of the Inode, return true
14296 * if any of them passes the passed function
14298 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
14299 std::function
<bool (const Inode
&in
)> test
)
14302 ceph_assert(in
!= NULL
);
14307 if (in
== root_ancestor
) {
14308 // We're done traversing, drop out
14311 // Continue up the tree
14312 in
= get_quota_root(in
, perms
);
14319 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
14321 return check_quota_condition(in
, perms
,
14322 [](const Inode
&in
) {
14323 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
14327 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
14328 const UserPerm
& perms
)
14330 return check_quota_condition(in
, perms
,
14331 [&new_bytes
](const Inode
&in
) {
14332 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
14333 > in
.quota
.max_bytes
;
14337 bool Client::is_quota_bytes_approaching(Inode
*in
, const UserPerm
& perms
)
14339 ceph_assert(in
->size
>= in
->reported_size
);
14340 const uint64_t size
= in
->size
- in
->reported_size
;
14341 return check_quota_condition(in
, perms
,
14342 [&size
](const Inode
&in
) {
14343 if (in
.quota
.max_bytes
) {
14344 if (in
.rstat
.rbytes
>= in
.quota
.max_bytes
) {
14348 const uint64_t space
= in
.quota
.max_bytes
- in
.rstat
.rbytes
;
14349 return (space
>> 4) < size
;
14363 int Client::check_pool_perm(Inode
*in
, int need
)
14365 if (!cct
->_conf
->client_check_pool_perm
)
14368 /* Only need to do this for regular files */
14369 if (!in
->is_file())
14372 int64_t pool_id
= in
->layout
.pool_id
;
14373 std::string pool_ns
= in
->layout
.pool_ns
;
14374 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
14377 auto it
= pool_perms
.find(perm_key
);
14378 if (it
== pool_perms
.end())
14380 if (it
->second
== POOL_CHECKING
) {
14381 // avoid concurrent checkings
14382 wait_on_list(waiting_for_pool_perm
);
14385 ceph_assert(have
& POOL_CHECKED
);
14391 if (in
->snapid
!= CEPH_NOSNAP
) {
14392 // pool permission check needs to write to the first object. But for snapshot,
14393 // head of the first object may have alread been deleted. To avoid creating
14394 // orphan object, skip the check for now.
14398 pool_perms
[perm_key
] = POOL_CHECKING
;
14401 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
14402 object_t oid
= oid_buf
;
14404 SnapContext nullsnapc
;
14406 C_SaferCond rd_cond
;
14407 ObjectOperation rd_op
;
14408 rd_op
.stat(NULL
, (ceph::real_time
*)nullptr, NULL
);
14410 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
14411 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
14413 C_SaferCond wr_cond
;
14414 ObjectOperation wr_op
;
14415 wr_op
.create(true);
14417 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
14418 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
14420 client_lock
.unlock();
14421 int rd_ret
= rd_cond
.wait();
14422 int wr_ret
= wr_cond
.wait();
14423 client_lock
.lock();
14425 bool errored
= false;
14427 if (rd_ret
== 0 || rd_ret
== -ENOENT
)
14429 else if (rd_ret
!= -EPERM
) {
14430 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14431 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14435 if (wr_ret
== 0 || wr_ret
== -EEXIST
)
14436 have
|= POOL_WRITE
;
14437 else if (wr_ret
!= -EPERM
) {
14438 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14439 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14444 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
14445 // Raise EIO because actual error code might be misleading for
14446 // userspace filesystem user.
14447 pool_perms
.erase(perm_key
);
14448 signal_cond_list(waiting_for_pool_perm
);
14452 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
14453 signal_cond_list(waiting_for_pool_perm
);
14456 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
14457 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14458 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
14461 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
14462 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14463 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
14470 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
14472 if (acl_type
== POSIX_ACL
) {
14473 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14474 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
14476 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
14482 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
14484 if (acl_type
== NO_ACL
)
14487 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
14491 if (acl_type
== POSIX_ACL
) {
14492 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14493 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
14494 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
14495 r
= posix_acl_access_chmod(acl
, mode
);
14498 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
14504 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
14508 int Client::_posix_acl_create(Inode
*dir
, mode_t
*mode
, bufferlist
& xattrs_bl
,
14509 const UserPerm
& perms
)
14511 if (acl_type
== NO_ACL
)
14514 if (S_ISLNK(*mode
))
14517 int r
= _getattr(dir
, CEPH_STAT_CAP_XATTR
, perms
, dir
->xattr_version
== 0);
14521 if (acl_type
== POSIX_ACL
) {
14522 if (dir
->xattrs
.count(ACL_EA_DEFAULT
)) {
14523 map
<string
, bufferptr
> xattrs
;
14525 const bufferptr
& default_acl
= dir
->xattrs
[ACL_EA_DEFAULT
];
14526 bufferptr
acl(default_acl
.c_str(), default_acl
.length());
14527 r
= posix_acl_inherit_mode(acl
, mode
);
14532 r
= posix_acl_equiv_mode(acl
.c_str(), acl
.length(), mode
);
14536 xattrs
[ACL_EA_ACCESS
] = acl
;
14539 if (S_ISDIR(*mode
))
14540 xattrs
[ACL_EA_DEFAULT
] = dir
->xattrs
[ACL_EA_DEFAULT
];
14544 encode(xattrs
, xattrs_bl
);
14547 *mode
&= ~umask_cb(callback_handle
);
14552 ldout(cct
, 10) << __func__
<< " dir ino " << dir
->ino
<< " result=" << r
<< dendl
;
14556 void Client::set_filer_flags(int flags
)
14558 std::lock_guard
l(client_lock
);
14559 ceph_assert(flags
== 0 ||
14560 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14561 objecter
->add_global_op_flags(flags
);
14564 void Client::clear_filer_flags(int flags
)
14566 std::lock_guard
l(client_lock
);
14567 ceph_assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14568 objecter
->clear_global_op_flag(flags
);
14571 // called before mount
14572 void Client::set_uuid(const std::string
& uuid
)
14574 std::lock_guard
l(client_lock
);
14575 assert(initialized
);
14576 assert(!uuid
.empty());
14578 metadata
["uuid"] = uuid
;
14582 // called before mount. 0 means infinite
14583 void Client::set_session_timeout(unsigned timeout
)
14585 std::lock_guard
l(client_lock
);
14586 assert(initialized
);
14588 metadata
["timeout"] = stringify(timeout
);
14591 // called before mount
14592 int Client::start_reclaim(const std::string
& uuid
, unsigned flags
,
14593 const std::string
& fs_name
)
14595 std::lock_guard
l(client_lock
);
14603 auto it
= metadata
.find("uuid");
14604 if (it
!= metadata
.end() && it
->second
== uuid
)
14608 int r
= subscribe_mdsmap(fs_name
);
14610 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
14614 if (metadata
.empty())
14615 populate_metadata("");
14617 while (mdsmap
->get_epoch() == 0)
14618 wait_on_list(waiting_for_mdsmap
);
14621 for (unsigned mds
= 0; mds
< mdsmap
->get_num_in_mds(); ) {
14622 if (!mdsmap
->is_up(mds
)) {
14623 ldout(cct
, 10) << "mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
14624 wait_on_list(waiting_for_mdsmap
);
14628 MetaSession
*session
;
14629 if (!have_open_session(mds
)) {
14630 session
= _get_or_open_mds_session(mds
);
14631 if (session
->state
== MetaSession::STATE_REJECTED
)
14633 if (session
->state
!= MetaSession::STATE_OPENING
) {
14637 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
14638 wait_on_context_list(session
->waiting_for_open
);
14642 session
= &mds_sessions
.at(mds
);
14643 if (!session
->mds_features
.test(CEPHFS_FEATURE_RECLAIM_CLIENT
))
14644 return -EOPNOTSUPP
;
14646 if (session
->reclaim_state
== MetaSession::RECLAIM_NULL
||
14647 session
->reclaim_state
== MetaSession::RECLAIMING
) {
14648 session
->reclaim_state
= MetaSession::RECLAIMING
;
14649 auto m
= make_message
<MClientReclaim
>(uuid
, flags
);
14650 session
->con
->send_message2(std::move(m
));
14651 wait_on_list(waiting_for_reclaim
);
14652 } else if (session
->reclaim_state
== MetaSession::RECLAIM_FAIL
) {
14653 return reclaim_errno
? : -ENOTRECOVERABLE
;
14659 // didn't find target session in any mds
14660 if (reclaim_target_addrs
.empty()) {
14661 if (flags
& CEPH_RECLAIM_RESET
)
14663 return -ENOTRECOVERABLE
;
14666 if (flags
& CEPH_RECLAIM_RESET
)
14669 // use blacklist to check if target session was killed
14670 // (config option mds_session_blacklist_on_evict needs to be true)
14672 if (!objecter
->wait_for_map(reclaim_osd_epoch
, &cond
)) {
14673 ldout(cct
, 10) << __func__
<< ": waiting for OSD epoch " << reclaim_osd_epoch
<< dendl
;
14674 client_lock
.unlock();
14676 client_lock
.lock();
14679 bool blacklisted
= objecter
->with_osdmap(
14680 [this](const OSDMap
&osd_map
) -> bool {
14681 return osd_map
.is_blacklisted(reclaim_target_addrs
);
14684 return -ENOTRECOVERABLE
;
14686 metadata
["reclaiming_uuid"] = uuid
;
14690 void Client::finish_reclaim()
14692 auto it
= metadata
.find("reclaiming_uuid");
14693 if (it
== metadata
.end()) {
14694 for (auto &p
: mds_sessions
)
14695 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
14699 for (auto &p
: mds_sessions
) {
14700 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
14701 auto m
= make_message
<MClientReclaim
>("", MClientReclaim::FLAG_FINISH
);
14702 p
.second
.con
->send_message2(std::move(m
));
14705 metadata
["uuid"] = it
->second
;
14706 metadata
.erase(it
);
14709 void Client::handle_client_reclaim_reply(const MConstRef
<MClientReclaimReply
>& reply
)
14711 mds_rank_t from
= mds_rank_t(reply
->get_source().num());
14712 ldout(cct
, 10) << __func__
<< " " << *reply
<< " from mds." << from
<< dendl
;
14714 MetaSession
*session
= _get_mds_session(from
, reply
->get_connection().get());
14716 ldout(cct
, 10) << " discarding reclaim reply from sessionless mds." << from
<< dendl
;
14720 if (reply
->get_result() >= 0) {
14721 session
->reclaim_state
= MetaSession::RECLAIM_OK
;
14722 if (reply
->get_epoch() > reclaim_osd_epoch
)
14723 reclaim_osd_epoch
= reply
->get_epoch();
14724 if (!reply
->get_addrs().empty())
14725 reclaim_target_addrs
= reply
->get_addrs();
14727 session
->reclaim_state
= MetaSession::RECLAIM_FAIL
;
14728 reclaim_errno
= reply
->get_result();
14731 signal_cond_list(waiting_for_reclaim
);
14735 * This is included in cap release messages, to cause
14736 * the MDS to wait until this OSD map epoch. It is necessary
14737 * in corner cases where we cancel RADOS ops, so that
14738 * nobody else tries to do IO to the same objects in
14739 * the same epoch as the cancelled ops.
14741 void Client::set_cap_epoch_barrier(epoch_t e
)
14743 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
14744 cap_epoch_barrier
= e
;
14747 const char** Client::get_tracked_conf_keys() const
14749 static const char* keys
[] = {
14750 "client_cache_size",
14751 "client_cache_mid",
14753 "client_deleg_timeout",
14754 "client_deleg_break_on_open",
14760 void Client::handle_conf_change(const ConfigProxy
& conf
,
14761 const std::set
<std::string
> &changed
)
14763 std::lock_guard
lock(client_lock
);
14765 if (changed
.count("client_cache_mid")) {
14766 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
14768 if (changed
.count("client_acl_type")) {
14770 if (cct
->_conf
->client_acl_type
== "posix_acl")
14771 acl_type
= POSIX_ACL
;
14775 void intrusive_ptr_add_ref(Inode
*in
)
14780 void intrusive_ptr_release(Inode
*in
)
14782 in
->client
->put_inode(in
);
14785 mds_rank_t
Client::_get_random_up_mds() const
14787 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
14789 std::set
<mds_rank_t
> up
;
14790 mdsmap
->get_up_mds_set(up
);
14793 return MDS_RANK_NONE
;
14794 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
14795 for (int n
= rand() % up
.size(); n
; n
--)
14801 StandaloneClient::StandaloneClient(Messenger
*m
, MonClient
*mc
)
14802 : Client(m
, mc
, new Objecter(m
->cct
, m
, mc
, nullptr))
14804 monclient
->set_messenger(m
);
14805 objecter
->set_client_incarnation(0);
14808 StandaloneClient::~StandaloneClient()
14811 objecter
= nullptr;
14814 int StandaloneClient::init()
14819 client_lock
.lock();
14820 ceph_assert(!is_initialized());
14822 messenger
->add_dispatcher_tail(objecter
);
14823 messenger
->add_dispatcher_tail(this);
14825 monclient
->set_want_keys(CEPH_ENTITY_TYPE_MDS
| CEPH_ENTITY_TYPE_OSD
);
14826 int r
= monclient
->init();
14828 // need to do cleanup because we're in an intermediate init state
14830 client_lock
.unlock();
14831 objecter
->shutdown();
14832 objectcacher
->stop();
14833 monclient
->shutdown();
14838 client_lock
.unlock();
14844 void StandaloneClient::shutdown()
14846 Client::shutdown();
14847 objecter
->shutdown();
14848 monclient
->shutdown();