1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <sys/types.h>
23 #include <sys/param.h>
27 #include <sys/utsname.h>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/fusion/include/std_pair.hpp>
34 #include "common/async/waiter.h"
36 #if defined(__FreeBSD__) || defined(_WIN32)
37 #define XATTR_CREATE 0x1
38 #define XATTR_REPLACE 0x2
40 #include <sys/xattr.h>
43 #if defined(__linux__)
44 #include <linux/falloc.h>
47 #include <sys/statvfs.h>
49 #include "common/config.h"
50 #include "common/version.h"
51 #include "common/async/blocked_completion.h"
53 #include "mon/MonClient.h"
55 #include "messages/MClientCaps.h"
56 #include "messages/MClientLease.h"
57 #include "messages/MClientQuota.h"
58 #include "messages/MClientReclaim.h"
59 #include "messages/MClientReclaimReply.h"
60 #include "messages/MClientReconnect.h"
61 #include "messages/MClientReply.h"
62 #include "messages/MClientRequest.h"
63 #include "messages/MClientRequestForward.h"
64 #include "messages/MClientSession.h"
65 #include "messages/MClientSnap.h"
66 #include "messages/MClientMetrics.h"
67 #include "messages/MCommandReply.h"
68 #include "messages/MFSMap.h"
69 #include "messages/MFSMapUser.h"
70 #include "messages/MMDSMap.h"
71 #include "messages/MOSDMap.h"
73 #include "mds/flock.h"
74 #include "mds/cephfs_features.h"
75 #include "osd/OSDMap.h"
76 #include "osdc/Filer.h"
78 #include "common/Cond.h"
79 #include "common/perf_counters.h"
80 #include "common/admin_socket.h"
81 #include "common/errno.h"
82 #include "include/str_list.h"
84 #define dout_subsys ceph_subsys_client
86 #include "include/lru.h"
87 #include "include/compat.h"
88 #include "include/stringify.h"
89 #include "include/random.h"
94 #include "Delegation.h"
96 #include "ClientSnapRealm.h"
98 #include "MetaSession.h"
99 #include "MetaRequest.h"
100 #include "ObjecterWriteback.h"
101 #include "posix_acl.h"
103 #include "include/ceph_assert.h"
104 #include "include/stat.h"
106 #include "include/cephfs/ceph_ll_client.h"
108 #if HAVE_GETGROUPLIST
115 #define dout_prefix *_dout << "client." << whoami << " "
117 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
119 // FreeBSD fails to define this
123 // Darwin fails to define this
132 // Windows doesn't define those values. While the Posix compatibility layer
133 // doesn't support those values, the Windows native functions do provide
134 // similar flags. Special care should be taken if we're going to use those
135 // flags in ceph-dokan. The current values are no-ops, while propagating
136 // them to the rest of the code might cause the Windows functions to reject
139 #define O_NOFOLLOW 0x0
146 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
148 using namespace TOPNSPC::common
;
150 namespace bs
= boost::system
;
151 namespace ca
= ceph::async
;
153 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
155 Client
*client
= static_cast<Client
*>(p
);
156 client
->flush_set_callback(oset
);
162 Client::CommandHook::CommandHook(Client
*client
) :
167 int Client::CommandHook::call(
168 std::string_view command
,
169 const cmdmap_t
& cmdmap
,
174 f
->open_object_section("result");
176 std::scoped_lock l
{m_client
->client_lock
};
177 if (command
== "mds_requests")
178 m_client
->dump_mds_requests(f
);
179 else if (command
== "mds_sessions") {
180 bool cap_dump
= false;
181 cmd_getval(cmdmap
, "cap_dump", cap_dump
);
182 m_client
->dump_mds_sessions(f
, cap_dump
);
183 } else if (command
== "dump_cache")
184 m_client
->dump_cache(f
);
185 else if (command
== "kick_stale_sessions")
186 m_client
->_kick_stale_sessions();
187 else if (command
== "status")
188 m_client
->dump_status(f
);
190 ceph_abort_msg("bad command registered");
199 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
200 : inode(in
), offset(0), next_offset(2),
201 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
205 void Client::_reset_faked_inos()
208 free_faked_inos
.clear();
209 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
210 last_used_faked_ino
= 0;
211 last_used_faked_root
= 0;
213 // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
214 // Windows structures, including Dokan ones, are using 64B identifiers.
215 _use_faked_inos
= false;
217 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
221 void Client::_assign_faked_ino(Inode
*in
)
223 if (0 == last_used_faked_ino
)
224 last_used_faked_ino
= last_used_faked_ino
+ 2048; // start(1024)~2048 reserved for _assign_faked_root
225 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
226 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
227 last_used_faked_ino
= 2048;
228 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
230 ceph_assert(it
!= free_faked_inos
.end());
231 if (last_used_faked_ino
< it
.get_start()) {
232 ceph_assert(it
.get_len() > 0);
233 last_used_faked_ino
= it
.get_start();
235 ++last_used_faked_ino
;
236 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
238 in
->faked_ino
= last_used_faked_ino
;
239 free_faked_inos
.erase(in
->faked_ino
);
240 faked_ino_map
[in
->faked_ino
] = in
->vino();
244 * In the faked mode, if you export multiple subdirectories,
245 * you will see that the inode numbers of the exported subdirectories
246 * are the same. so we distinguish the mount point by reserving
247 * the "fake ids" between "1024~2048" and combining the last
248 * 10bits(0x3ff) of the "root inodes".
250 void Client::_assign_faked_root(Inode
*in
)
252 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
253 if (it
== free_faked_inos
.end() && last_used_faked_root
> 0) {
254 last_used_faked_root
= 0;
255 it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
257 assert(it
!= free_faked_inos
.end());
258 vinodeno_t inode_info
= in
->vino();
259 uint64_t inode_num
= (uint64_t)inode_info
.ino
;
260 ldout(cct
, 10) << "inode_num " << inode_num
<< "inode_num & 0x3ff=" << (inode_num
& 0x3ff)<< dendl
;
261 last_used_faked_root
= it
.get_start() + (inode_num
& 0x3ff); // 0x3ff mask and get_start will not exceed 2048
262 assert(it
.get_start() + it
.get_len() > last_used_faked_root
);
264 in
->faked_ino
= last_used_faked_root
;
265 free_faked_inos
.erase(in
->faked_ino
);
266 faked_ino_map
[in
->faked_ino
] = in
->vino();
269 void Client::_release_faked_ino(Inode
*in
)
271 free_faked_inos
.insert(in
->faked_ino
);
272 faked_ino_map
.erase(in
->faked_ino
);
275 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
280 else if (faked_ino_map
.count(ino
))
281 vino
= faked_ino_map
[ino
];
283 vino
= vinodeno_t(0, CEPH_NOSNAP
);
284 ldout(cct
, 10) << __func__
<< " " << ino
<< " -> " << vino
<< dendl
;
288 vinodeno_t
Client::map_faked_ino(ino_t ino
)
290 std::scoped_lock
lock(client_lock
);
291 return _map_faked_ino(ino
);
296 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
297 : Dispatcher(m
->cct
->get()),
298 timer(m
->cct
, timer_lock
, false),
302 whoami(mc
->get_global_id()),
303 mount_state(CLIENT_UNMOUNTED
, "Client::mountstate_lock"),
304 initialize_state(CLIENT_NEW
, "Client::initstate_lock"),
305 cct_deleter
{m
->cct
, [](CephContext
*p
) {p
->put();}},
306 async_ino_invalidator(m
->cct
),
307 async_dentry_invalidator(m
->cct
),
308 interrupt_finisher(m
->cct
),
309 remount_finisher(m
->cct
),
310 async_ino_releasor(m
->cct
),
311 objecter_finisher(m
->cct
),
312 m_command_hook(this),
317 user_id
= cct
->_conf
->client_mount_uid
;
318 group_id
= cct
->_conf
->client_mount_gid
;
319 fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
320 "fuse_default_permissions");
322 if (cct
->_conf
->client_acl_type
== "posix_acl")
323 acl_type
= POSIX_ACL
;
325 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
328 free_fd_set
.insert(10, 1<<30);
330 mdsmap
.reset(new MDSMap
);
333 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
335 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
336 client_flush_set_callback
, // all commit callback
338 cct
->_conf
->client_oc_size
,
339 cct
->_conf
->client_oc_max_objects
,
340 cct
->_conf
->client_oc_max_dirty
,
341 cct
->_conf
->client_oc_target_dirty
,
342 cct
->_conf
->client_oc_max_dirty_age
,
349 ceph_assert(ceph_mutex_is_not_locked(client_lock
));
351 // If the task is crashed or aborted and doesn't
352 // get any chance to run the umount and shutdown.
354 std::scoped_lock l
{client_lock
};
355 tick_thread_stopped
= true;
356 upkeep_cond
.notify_one();
359 if (upkeeper
.joinable())
362 // It is necessary to hold client_lock, because any inode destruction
363 // may call into ObjectCacher, which asserts that it's lock (which is
364 // client_lock) is held.
365 std::scoped_lock l
{client_lock
};
369 void Client::tear_down_cache()
372 for (auto &[fd
, fh
] : fd_map
) {
373 ldout(cct
, 1) << __func__
<< " forcing close of fh " << fd
<< " ino " << fh
->inode
->ino
<< dendl
;
378 while (!opened_dirs
.empty()) {
379 dir_result_t
*dirp
= *opened_dirs
.begin();
380 ldout(cct
, 1) << __func__
<< " forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
389 ceph_assert(lru
.lru_get_size() == 0);
392 ceph_assert(inode_map
.size() <= 1 + root_parents
.size());
393 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
397 while (!root_parents
.empty())
398 root_parents
.erase(root_parents
.begin());
403 ceph_assert(inode_map
.empty());
406 inodeno_t
Client::get_root_ino()
408 std::scoped_lock
l(client_lock
);
409 if (use_faked_inos())
410 return root
->faked_ino
;
415 Inode
*Client::get_root()
417 std::scoped_lock
l(client_lock
);
425 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
428 in
->make_long_path(path
);
429 ldout(cct
, 1) << "dump_inode: "
430 << (disconnected
? "DISCONNECTED ":"")
431 << "inode " << in
->ino
433 << " ref " << in
->get_num_ref()
434 << " " << *in
<< dendl
;
437 f
->open_object_section("inode");
438 f
->dump_stream("path") << path
;
440 f
->dump_int("disconnected", 1);
447 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
448 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
449 it
!= in
->dir
->dentries
.end();
451 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
453 f
->open_object_section("dentry");
457 if (it
->second
->inode
)
458 dump_inode(f
, it
->second
->inode
.get(), did
, false);
463 void Client::dump_cache(Formatter
*f
)
467 ldout(cct
, 1) << __func__
<< dendl
;
470 f
->open_array_section("cache");
473 dump_inode(f
, root
, did
, true);
475 // make a second pass to catch anything disconnected
476 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
477 it
!= inode_map
.end();
479 if (did
.count(it
->second
))
481 dump_inode(f
, it
->second
, did
, true);
488 void Client::dump_status(Formatter
*f
)
490 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
492 ldout(cct
, 1) << __func__
<< dendl
;
494 const epoch_t osd_epoch
495 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
498 f
->open_object_section("metadata");
499 for (const auto& kv
: metadata
)
500 f
->dump_string(kv
.first
.c_str(), kv
.second
);
503 f
->dump_int("dentry_count", lru
.lru_get_size());
504 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
505 f
->dump_int("id", get_nodeid().v
);
506 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
507 f
->dump_object("inst", inst
);
508 f
->dump_object("addr", inst
.addr
);
509 f
->dump_stream("inst_str") << inst
.name
<< " " << inst
.addr
.get_legacy_str();
510 f
->dump_string("addr_str", inst
.addr
.get_legacy_str());
511 f
->dump_int("inode_count", inode_map
.size());
512 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
513 f
->dump_int("osd_epoch", osd_epoch
);
514 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
515 f
->dump_bool("blocklisted", blocklisted
);
516 f
->dump_string("fs_name", mdsmap
->get_fs_name());
520 void Client::_pre_init()
524 objecter_finisher
.start();
525 filer
.reset(new Filer(objecter
, &objecter_finisher
));
526 objecter
->enable_blocklist_events();
528 objectcacher
->start();
533 RWRef_t
iref_writer(initialize_state
, CLIENT_INITIALIZING
, false);
534 ceph_assert(iref_writer
.is_first_writer());
538 std::scoped_lock l
{client_lock
};
539 messenger
->add_dispatcher_tail(this);
542 iref_writer
.update_state(CLIENT_INITIALIZED
);
546 void Client::_finish_init()
549 std::scoped_lock l
{client_lock
};
551 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
552 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
553 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
554 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
555 plb
.add_time_avg(l_c_read
, "rdlat", "Latency of a file data read operation");
556 plb
.add_time_avg(l_c_fsync
, "fsync", "Latency of a file sync operation");
557 logger
.reset(plb
.create_perf_counters());
558 cct
->get_perfcounters_collection()->add(logger
.get());
561 cct
->_conf
.add_observer(this);
563 AdminSocket
* admin_socket
= cct
->get_admin_socket();
564 int ret
= admin_socket
->register_command("mds_requests",
566 "show in-progress mds requests");
568 lderr(cct
) << "error registering admin socket command: "
569 << cpp_strerror(-ret
) << dendl
;
571 ret
= admin_socket
->register_command("mds_sessions "
572 "name=cap_dump,type=CephBool,req=false",
574 "show mds session state");
576 lderr(cct
) << "error registering admin socket command: "
577 << cpp_strerror(-ret
) << dendl
;
579 ret
= admin_socket
->register_command("dump_cache",
581 "show in-memory metadata cache contents");
583 lderr(cct
) << "error registering admin socket command: "
584 << cpp_strerror(-ret
) << dendl
;
586 ret
= admin_socket
->register_command("kick_stale_sessions",
588 "kick sessions that were remote reset");
590 lderr(cct
) << "error registering admin socket command: "
591 << cpp_strerror(-ret
) << dendl
;
593 ret
= admin_socket
->register_command("status",
595 "show overall client status");
597 lderr(cct
) << "error registering admin socket command: "
598 << cpp_strerror(-ret
) << dendl
;
602 void Client::shutdown()
604 ldout(cct
, 1) << __func__
<< dendl
;
606 // If we were not mounted, but were being used for sending
607 // MDS commands, we may have sessions that need closing.
609 std::scoped_lock l
{client_lock
};
611 // To make sure the tick thread will be stopped before
612 // destructing the Client, just in case like the _mount()
613 // failed but didn't get a chance to stop the tick
615 tick_thread_stopped
= true;
616 upkeep_cond
.notify_one();
620 cct
->_conf
.remove_observer(this);
622 cct
->get_admin_socket()->unregister_commands(&m_command_hook
);
624 if (ino_invalidate_cb
) {
625 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
626 async_ino_invalidator
.wait_for_empty();
627 async_ino_invalidator
.stop();
630 if (dentry_invalidate_cb
) {
631 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
632 async_dentry_invalidator
.wait_for_empty();
633 async_dentry_invalidator
.stop();
636 if (switch_interrupt_cb
) {
637 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
638 interrupt_finisher
.wait_for_empty();
639 interrupt_finisher
.stop();
643 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
644 remount_finisher
.wait_for_empty();
645 remount_finisher
.stop();
648 if (ino_release_cb
) {
649 ldout(cct
, 10) << "shutdown stopping inode release finisher" << dendl
;
650 async_ino_releasor
.wait_for_empty();
651 async_ino_releasor
.stop();
654 objectcacher
->stop(); // outside of client_lock! this does a join.
657 * We are shutting down the client.
659 * Just declare the state to CLIENT_NEW to block and fail any
660 * new incoming "reader" and then try to wait for all the in-flight
661 * "readers" to finish.
663 RWRef_t
iref_writer(initialize_state
, CLIENT_NEW
, false);
664 if (!iref_writer
.is_first_writer())
666 iref_writer
.wait_readers_done();
669 std::scoped_lock
l(timer_lock
);
673 objecter_finisher
.wait_for_empty();
674 objecter_finisher
.stop();
677 cct
->get_perfcounters_collection()->remove(logger
.get());
683 // ===================
684 // metadata cache stuff
686 void Client::trim_cache(bool trim_kernel_dcache
)
688 uint64_t max
= cct
->_conf
->client_cache_size
;
689 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
691 while (lru
.lru_get_size() != last
) {
692 last
= lru
.lru_get_size();
694 if (!is_unmounting() && lru
.lru_get_size() <= max
) break;
697 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
704 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
705 _invalidate_kernel_dcache();
708 if (lru
.lru_get_size() == 0 && root
&& root
->get_num_ref() == 0 && inode_map
.size() == 1 + root_parents
.size()) {
709 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
713 while (!root_parents
.empty())
714 root_parents
.erase(root_parents
.begin());
720 void Client::trim_cache_for_reconnect(MetaSession
*s
)
722 mds_rank_t mds
= s
->mds_num
;
723 ldout(cct
, 20) << __func__
<< " mds." << mds
<< dendl
;
726 list
<Dentry
*> skipped
;
727 while (lru
.lru_get_size() > 0) {
728 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
732 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
733 dn
->dir
->parent_inode
->caps
.count(mds
)) {
737 skipped
.push_back(dn
);
740 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
741 lru
.lru_insert_mid(*p
);
743 ldout(cct
, 20) << __func__
<< " mds." << mds
744 << " trimmed " << trimmed
<< " dentries" << dendl
;
746 if (s
->caps
.size() > 0)
747 _invalidate_kernel_dcache();
750 void Client::trim_dentry(Dentry
*dn
)
752 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
754 << std::hex
<< dn
->dir
->parent_inode
->ino
<< std::dec
757 Inode
*diri
= dn
->dir
->parent_inode
;
758 clear_dir_complete_and_ordered(diri
, true);
760 unlink(dn
, false, false); // drop dir, drop dentry
764 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
765 uint64_t truncate_seq
, uint64_t truncate_size
)
767 uint64_t prior_size
= in
->size
;
769 if (truncate_seq
> in
->truncate_seq
||
770 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
771 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
773 in
->reported_size
= size
;
774 if (truncate_seq
!= in
->truncate_seq
) {
775 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
776 << truncate_seq
<< dendl
;
777 in
->truncate_seq
= truncate_seq
;
778 in
->oset
.truncate_seq
= truncate_seq
;
780 // truncate cached file data
781 if (prior_size
> size
) {
782 _invalidate_inode_cache(in
, truncate_size
, prior_size
- truncate_size
);
786 // truncate inline data
787 if (in
->inline_version
< CEPH_INLINE_NONE
) {
788 uint32_t len
= in
->inline_data
.length();
790 in
->inline_data
.splice(size
, len
- size
);
793 if (truncate_seq
>= in
->truncate_seq
&&
794 in
->truncate_size
!= truncate_size
) {
796 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
797 << truncate_size
<< dendl
;
798 in
->truncate_size
= truncate_size
;
799 in
->oset
.truncate_size
= truncate_size
;
801 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
806 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
807 utime_t ctime
, utime_t mtime
, utime_t atime
)
809 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
810 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
812 if (time_warp_seq
> in
->time_warp_seq
)
813 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
814 << " is higher than local time_warp_seq "
815 << in
->time_warp_seq
<< dendl
;
818 // be careful with size, mtime, atime
819 if (issued
& (CEPH_CAP_FILE_EXCL
|
821 CEPH_CAP_FILE_BUFFER
|
823 CEPH_CAP_XATTR_EXCL
)) {
824 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
825 if (ctime
> in
->ctime
)
827 if (time_warp_seq
> in
->time_warp_seq
) {
828 //the mds updated times, so take those!
831 in
->time_warp_seq
= time_warp_seq
;
832 } else if (time_warp_seq
== in
->time_warp_seq
) {
834 if (mtime
> in
->mtime
)
836 if (atime
> in
->atime
)
838 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
839 //ignore mds values as we have a higher seq
842 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
843 if (time_warp_seq
>= in
->time_warp_seq
) {
847 in
->time_warp_seq
= time_warp_seq
;
851 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
852 << time_warp_seq
<< " is lower than local time_warp_seq "
858 void Client::_fragmap_remove_non_leaves(Inode
*in
)
860 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
861 if (!in
->dirfragtree
.is_leaf(p
->first
))
862 in
->fragmap
.erase(p
++);
867 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
869 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
870 if (p
->second
== mds
)
871 in
->fragmap
.erase(p
++);
876 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
877 MetaSession
*session
,
878 const UserPerm
& request_perms
)
881 bool was_new
= false;
882 if (inode_map
.count(st
->vino
)) {
883 in
= inode_map
[st
->vino
];
884 ldout(cct
, 12) << __func__
<< " had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
886 in
= new Inode(this, st
->vino
, &st
->layout
);
887 inode_map
[st
->vino
] = in
;
889 if (use_faked_inos())
890 _assign_faked_ino(in
);
894 if (use_faked_inos())
895 _assign_faked_root(root
);
898 } else if (is_mounting()) {
899 root_parents
[root_ancestor
] = in
;
904 in
->ino
= st
->vino
.ino
;
905 in
->snapid
= st
->vino
.snapid
;
906 in
->mode
= st
->mode
& S_IFMT
;
911 if (in
->is_symlink())
912 in
->symlink
= st
->symlink
;
914 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
915 bool new_version
= false;
916 if (in
->version
== 0 ||
917 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
918 (in
->version
& ~1) < st
->version
))
922 in
->caps_issued(&issued
);
923 issued
|= in
->caps_dirty();
924 int new_issued
= ~issued
& (int)st
->cap
.caps
;
926 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
927 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
931 in
->btime
= st
->btime
;
932 in
->snap_btime
= st
->snap_btime
;
933 in
->snap_metadata
= st
->snap_metadata
;
936 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
937 !(issued
& CEPH_CAP_LINK_EXCL
)) {
938 in
->nlink
= st
->nlink
;
941 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
942 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
943 st
->ctime
, st
->mtime
, st
->atime
);
947 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
948 in
->layout
= st
->layout
;
949 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
953 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
954 in
->dirstat
= st
->dirstat
;
956 // dir_layout/rstat/quota are not tracked by capability, update them only if
957 // the inode stat is from auth mds
958 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
959 in
->dir_layout
= st
->dir_layout
;
960 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
961 in
->rstat
= st
->rstat
;
962 in
->quota
= st
->quota
;
963 in
->dir_pin
= st
->dir_pin
;
965 // move me if/when version reflects fragtree changes.
966 if (in
->dirfragtree
!= st
->dirfragtree
) {
967 in
->dirfragtree
= st
->dirfragtree
;
968 _fragmap_remove_non_leaves(in
);
972 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
973 st
->xattrbl
.length() &&
974 st
->xattr_version
> in
->xattr_version
) {
975 auto p
= st
->xattrbl
.cbegin();
976 decode(in
->xattrs
, p
);
977 in
->xattr_version
= st
->xattr_version
;
980 if (st
->inline_version
> in
->inline_version
) {
981 in
->inline_data
= st
->inline_data
;
982 in
->inline_version
= st
->inline_version
;
985 /* always take a newer change attr */
986 if (st
->change_attr
> in
->change_attr
)
987 in
->change_attr
= st
->change_attr
;
989 if (st
->version
> in
->version
)
990 in
->version
= st
->version
;
993 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
996 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
998 if (in
->snapid
== CEPH_NOSNAP
) {
999 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.wanted
,
1000 st
->cap
.seq
, st
->cap
.mseq
, inodeno_t(st
->cap
.realm
),
1001 st
->cap
.flags
, request_perms
);
1002 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
1003 in
->max_size
= st
->max_size
;
1004 in
->rstat
= st
->rstat
;
1007 // setting I_COMPLETE needs to happen after adding the cap
1009 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
1010 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
1011 in
->dirstat
.nfiles
== 0 &&
1012 in
->dirstat
.nsubdirs
== 0) {
1013 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
1014 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
1016 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
1017 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
1018 in
->dir
->readdir_cache
.clear();
1019 for (const auto& p
: in
->dir
->dentries
) {
1020 unlink(p
.second
, true, true); // keep dir, keep dentry
1022 if (in
->dir
->dentries
.empty())
1027 in
->snap_caps
|= st
->cap
.caps
;
1030 in
->fscrypt
= st
->fscrypt
;
1036 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
1038 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
1039 Inode
*in
, utime_t from
, MetaSession
*session
,
1043 if (dir
->dentries
.count(dname
))
1044 dn
= dir
->dentries
[dname
];
1046 ldout(cct
, 12) << __func__
<< " '" << dname
<< "' vino " << in
->vino()
1047 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
1050 if (dn
&& dn
->inode
) {
1051 if (dn
->inode
->vino() == in
->vino()) {
1053 ldout(cct
, 12) << " had dentry " << dname
1054 << " with correct vino " << dn
->inode
->vino()
1057 ldout(cct
, 12) << " had dentry " << dname
1058 << " with WRONG vino " << dn
->inode
->vino()
1060 unlink(dn
, true, true); // keep dir, keep dentry
1064 if (!dn
|| !dn
->inode
) {
1065 InodeRef
tmp_ref(in
);
1067 if (old_dentry
->dir
!= dir
) {
1068 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
1069 clear_dir_complete_and_ordered(old_diri
, false);
1071 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
1073 Inode
*diri
= dir
->parent_inode
;
1074 clear_dir_complete_and_ordered(diri
, false);
1075 dn
= link(dir
, dname
, in
, dn
);
1078 update_dentry_lease(dn
, dlease
, from
, session
);
1082 void Client::update_dentry_lease(Dentry
*dn
, LeaseStat
*dlease
, utime_t from
, MetaSession
*session
)
1084 utime_t dttl
= from
;
1085 dttl
+= (float)dlease
->duration_ms
/ 1000.0;
1087 ldout(cct
, 15) << __func__
<< " " << *dn
<< " " << *dlease
<< " from " << from
<< dendl
;
1091 if (dlease
->mask
& CEPH_LEASE_VALID
) {
1092 if (dttl
> dn
->lease_ttl
) {
1093 ldout(cct
, 10) << "got dentry lease on " << dn
->name
1094 << " dur " << dlease
->duration_ms
<< "ms ttl " << dttl
<< dendl
;
1095 dn
->lease_ttl
= dttl
;
1096 dn
->lease_mds
= session
->mds_num
;
1097 dn
->lease_seq
= dlease
->seq
;
1098 dn
->lease_gen
= session
->cap_gen
;
1101 dn
->cap_shared_gen
= dn
->dir
->parent_inode
->shared_gen
;
1102 if (dlease
->mask
& CEPH_LEASE_PRIMARY_LINK
)
1104 dn
->alternate_name
= std::move(dlease
->alternate_name
);
1109 * update MDS location cache for a single inode
1111 void Client::update_dir_dist(Inode
*in
, DirStat
*dst
)
1114 ldout(cct
, 20) << "got dirfrag map for " << in
->ino
<< " frag " << dst
->frag
<< " to mds " << dst
->auth
<< dendl
;
1115 if (dst
->auth
>= 0) {
1116 in
->fragmap
[dst
->frag
] = dst
->auth
;
1118 in
->fragmap
.erase(dst
->frag
);
1120 if (!in
->dirfragtree
.is_leaf(dst
->frag
)) {
1121 in
->dirfragtree
.force_to_leaf(cct
, dst
->frag
);
1122 _fragmap_remove_non_leaves(in
);
1126 in
->dir_replicated
= !dst
->dist
.empty();
1127 if (!dst
->dist
.empty())
1128 in
->frag_repmap
[dst
->frag
].assign(dst
->dist
.begin(), dst
->dist
.end()) ;
1130 in
->frag_repmap
.erase(dst
->frag
);
1133 void Client::clear_dir_complete_and_ordered(Inode
*diri
, bool complete
)
1136 diri
->dir_release_count
++;
1138 diri
->dir_ordered_count
++;
1139 if (diri
->flags
& I_COMPLETE
) {
1141 ldout(cct
, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
1142 diri
->flags
&= ~(I_COMPLETE
| I_DIR_ORDERED
);
1144 if (diri
->flags
& I_DIR_ORDERED
) {
1145 ldout(cct
, 10) << " clearing I_DIR_ORDERED on " << *diri
<< dendl
;
1146 diri
->flags
&= ~I_DIR_ORDERED
;
1150 diri
->dir
->readdir_cache
.clear();
1155 * insert results from readdir or lssnap into the metadata cache.
1157 void Client::insert_readdir_results(MetaRequest
*request
, MetaSession
*session
, Inode
*diri
) {
1159 auto& reply
= request
->reply
;
1160 ConnectionRef con
= request
->reply
->get_connection();
1162 if(session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1163 features
= (uint64_t)-1;
1166 features
= con
->get_features();
1169 dir_result_t
*dirp
= request
->dirp
;
1172 // the extra buffer list is only set for readdir and lssnap replies
1173 auto p
= reply
->get_extra_bl().cbegin();
1176 if (request
->head
.op
== CEPH_MDS_OP_LSSNAP
) {
1178 diri
= open_snapdir(diri
);
1181 // only open dir if we're actually adding stuff to it!
1182 Dir
*dir
= diri
->open_dir();
1186 DirStat
dst(p
, features
);
1192 bool end
= ((unsigned)flags
& CEPH_READDIR_FRAG_END
);
1193 bool hash_order
= ((unsigned)flags
& CEPH_READDIR_HASH_ORDER
);
1195 frag_t fg
= (unsigned)request
->head
.args
.readdir
.frag
;
1196 unsigned readdir_offset
= dirp
->next_offset
;
1197 string readdir_start
= dirp
->last_name
;
1198 ceph_assert(!readdir_start
.empty() || readdir_offset
== 2);
1200 unsigned last_hash
= 0;
1202 if (!readdir_start
.empty()) {
1203 last_hash
= ceph_frag_value(diri
->hash_dentry_name(readdir_start
));
1204 } else if (flags
& CEPH_READDIR_OFFSET_HASH
) {
1205 /* mds understands offset_hash */
1206 last_hash
= (unsigned)request
->head
.args
.readdir
.offset_hash
;
1210 if (fg
!= dst
.frag
) {
1211 ldout(cct
, 10) << "insert_trace got new frag " << fg
<< " -> " << dst
.frag
<< dendl
;
1215 readdir_start
.clear();
1216 dirp
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
, false);
1220 ldout(cct
, 10) << __func__
<< " " << numdn
<< " readdir items, end=" << end
1221 << ", hash_order=" << hash_order
1222 << ", readdir_start " << readdir_start
1223 << ", last_hash " << last_hash
1224 << ", next_offset " << readdir_offset
<< dendl
;
1226 if (diri
->snapid
!= CEPH_SNAPDIR
&&
1227 fg
.is_leftmost() && readdir_offset
== 2 &&
1228 !(hash_order
&& last_hash
)) {
1229 dirp
->release_count
= diri
->dir_release_count
;
1230 dirp
->ordered_count
= diri
->dir_ordered_count
;
1231 dirp
->start_shared_gen
= diri
->shared_gen
;
1232 dirp
->cache_index
= 0;
1235 dirp
->buffer_frag
= fg
;
1237 _readdir_drop_dirp_buffer(dirp
);
1238 dirp
->buffer
.reserve(numdn
);
1242 for (unsigned i
=0; i
<numdn
; i
++) {
1244 dlease
.decode(p
, features
);
1245 InodeStat
ist(p
, features
);
1247 ldout(cct
, 15) << "" << i
<< ": '" << dname
<< "'" << dendl
;
1249 Inode
*in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1252 if (diri
->dir
->dentries
.count(dname
)) {
1253 Dentry
*olddn
= diri
->dir
->dentries
[dname
];
1254 if (olddn
->inode
!= in
) {
1255 // replace incorrect dentry
1256 unlink(olddn
, true, true); // keep dir, dentry
1257 dn
= link(dir
, dname
, in
, olddn
);
1258 ceph_assert(dn
== olddn
);
1266 dn
= link(dir
, dname
, in
, NULL
);
1268 dn
->alternate_name
= std::move(dlease
.alternate_name
);
1270 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1272 unsigned hash
= ceph_frag_value(diri
->hash_dentry_name(dname
));
1273 if (hash
!= last_hash
)
1276 dn
->offset
= dir_result_t::make_fpos(hash
, readdir_offset
++, true);
1278 dn
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
++, false);
1280 // add to readdir cache
1281 if (dirp
->release_count
== diri
->dir_release_count
&&
1282 dirp
->ordered_count
== diri
->dir_ordered_count
&&
1283 dirp
->start_shared_gen
== diri
->shared_gen
) {
1284 if (dirp
->cache_index
== dir
->readdir_cache
.size()) {
1286 ceph_assert(!dirp
->inode
->is_complete_and_ordered());
1287 dir
->readdir_cache
.reserve(dirp
->cache_index
+ numdn
);
1289 dir
->readdir_cache
.push_back(dn
);
1290 } else if (dirp
->cache_index
< dir
->readdir_cache
.size()) {
1291 if (dirp
->inode
->is_complete_and_ordered())
1292 ceph_assert(dir
->readdir_cache
[dirp
->cache_index
] == dn
);
1294 dir
->readdir_cache
[dirp
->cache_index
] = dn
;
1296 ceph_abort_msg("unexpected readdir buffer idx");
1298 dirp
->cache_index
++;
1300 // add to cached result list
1301 dirp
->buffer
.push_back(dir_result_t::dentry(dn
->offset
, dname
, dn
->alternate_name
, in
));
1302 ldout(cct
, 15) << __func__
<< " " << hex
<< dn
->offset
<< dec
<< ": '" << dname
<< "' -> " << in
->ino
<< dendl
;
1306 dirp
->last_name
= dname
;
1308 dirp
->next_offset
= 2;
1310 dirp
->next_offset
= readdir_offset
;
1312 if (dir
->is_empty())
1319 * insert a trace from a MDS reply into the cache.
1321 Inode
* Client::insert_trace(MetaRequest
*request
, MetaSession
*session
)
1323 auto& reply
= request
->reply
;
1324 int op
= request
->get_op();
1326 ldout(cct
, 10) << "insert_trace from " << request
->sent_stamp
<< " mds." << session
->mds_num
1327 << " is_target=" << (int)reply
->head
.is_target
1328 << " is_dentry=" << (int)reply
->head
.is_dentry
1331 auto p
= reply
->get_trace_bl().cbegin();
1332 if (request
->got_unsafe
) {
1333 ldout(cct
, 10) << "insert_trace -- already got unsafe; ignoring" << dendl
;
1334 ceph_assert(p
.end());
1339 ldout(cct
, 10) << "insert_trace -- no trace" << dendl
;
1341 Dentry
*d
= request
->dentry();
1343 Inode
*diri
= d
->dir
->parent_inode
;
1344 clear_dir_complete_and_ordered(diri
, true);
1347 if (d
&& reply
->get_result() == 0) {
1348 if (op
== CEPH_MDS_OP_RENAME
) {
1350 Dentry
*od
= request
->old_dentry();
1351 ldout(cct
, 10) << " unlinking rename src dn " << od
<< " for traceless reply" << dendl
;
1353 unlink(od
, true, true); // keep dir, dentry
1354 } else if (op
== CEPH_MDS_OP_RMDIR
||
1355 op
== CEPH_MDS_OP_UNLINK
) {
1357 ldout(cct
, 10) << " unlinking unlink/rmdir dn " << d
<< " for traceless reply" << dendl
;
1358 unlink(d
, true, true); // keep dir, dentry
1364 ConnectionRef con
= request
->reply
->get_connection();
1366 if (session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1367 features
= (uint64_t)-1;
1370 features
= con
->get_features();
1372 ldout(cct
, 10) << " features 0x" << hex
<< features
<< dec
<< dendl
;
1375 SnapRealm
*realm
= NULL
;
1376 if (reply
->snapbl
.length())
1377 update_snap_trace(reply
->snapbl
, &realm
);
1379 ldout(cct
, 10) << " hrm "
1380 << " is_target=" << (int)reply
->head
.is_target
1381 << " is_dentry=" << (int)reply
->head
.is_dentry
1390 if (reply
->head
.is_dentry
) {
1391 dirst
.decode(p
, features
);
1392 dst
.decode(p
, features
);
1394 dlease
.decode(p
, features
);
1398 if (reply
->head
.is_target
) {
1399 ist
.decode(p
, features
);
1400 if (cct
->_conf
->client_debug_getattr_caps
) {
1401 unsigned wanted
= 0;
1402 if (op
== CEPH_MDS_OP_GETATTR
|| op
== CEPH_MDS_OP_LOOKUP
)
1403 wanted
= request
->head
.args
.getattr
.mask
;
1404 else if (op
== CEPH_MDS_OP_OPEN
|| op
== CEPH_MDS_OP_CREATE
)
1405 wanted
= request
->head
.args
.open
.mask
;
1407 if ((wanted
& CEPH_CAP_XATTR_SHARED
) &&
1408 !(ist
.xattr_version
> 0 && ist
.xattrbl
.length() > 0))
1409 ceph_abort_msg("MDS reply does not contain xattrs");
1412 in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1417 if (reply
->head
.is_dentry
) {
1418 diri
= add_update_inode(&dirst
, request
->sent_stamp
, session
,
1420 update_dir_dist(diri
, &dst
); // dir stat info is attached to ..
1423 Dir
*dir
= diri
->open_dir();
1424 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
,
1425 (op
== CEPH_MDS_OP_RENAME
) ? request
->old_dentry() : NULL
);
1428 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1429 dn
= diri
->dir
->dentries
[dname
];
1431 clear_dir_complete_and_ordered(diri
, false);
1432 unlink(dn
, true, true); // keep dir, dentry
1435 if (dlease
.duration_ms
> 0) {
1437 Dir
*dir
= diri
->open_dir();
1438 dn
= link(dir
, dname
, NULL
, NULL
);
1440 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1443 } else if (op
== CEPH_MDS_OP_LOOKUPSNAP
||
1444 op
== CEPH_MDS_OP_MKSNAP
) {
1445 ldout(cct
, 10) << " faking snap lookup weirdness" << dendl
;
1446 // fake it for snap lookup
1447 vinodeno_t vino
= ist
.vino
;
1448 vino
.snapid
= CEPH_SNAPDIR
;
1449 ceph_assert(inode_map
.count(vino
));
1450 diri
= inode_map
[vino
];
1452 string dname
= request
->path
.last_dentry();
1455 dlease
.duration_ms
= 0;
1458 Dir
*dir
= diri
->open_dir();
1459 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
);
1461 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1462 Dentry
*dn
= diri
->dir
->dentries
[dname
];
1464 unlink(dn
, true, true); // keep dir, dentry
1470 if (op
== CEPH_MDS_OP_READDIR
||
1471 op
== CEPH_MDS_OP_LSSNAP
) {
1472 insert_readdir_results(request
, session
, in
);
1473 } else if (op
== CEPH_MDS_OP_LOOKUPNAME
) {
1474 // hack: return parent inode instead
1478 if (request
->dentry() == NULL
&& in
!= request
->inode()) {
1479 // pin the target inode if its parent dentry is not pinned
1480 request
->set_other_inode(in
);
1485 put_snap_realm(realm
);
1487 request
->target
= in
;
1493 mds_rank_t
Client::choose_target_mds(MetaRequest
*req
, Inode
** phash_diri
)
1495 mds_rank_t mds
= MDS_RANK_NONE
;
1497 bool is_hash
= false;
1502 if (req
->resend_mds
>= 0) {
1503 mds
= req
->resend_mds
;
1504 req
->resend_mds
= -1;
1505 ldout(cct
, 10) << __func__
<< " resend_mds specified as mds." << mds
<< dendl
;
1509 if (cct
->_conf
->client_use_random_mds
)
1515 ldout(cct
, 20) << __func__
<< " starting with req->inode " << *in
<< dendl
;
1516 if (req
->path
.depth()) {
1517 hash
= in
->hash_dentry_name(req
->path
[0]);
1518 ldout(cct
, 20) << __func__
<< " inode dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1519 << " on " << req
->path
[0]
1520 << " => " << hash
<< dendl
;
1525 in
= de
->inode
.get();
1526 ldout(cct
, 20) << __func__
<< " starting with req->dentry inode " << *in
<< dendl
;
1528 in
= de
->dir
->parent_inode
;
1529 hash
= in
->hash_dentry_name(de
->name
);
1530 ldout(cct
, 20) << __func__
<< " dentry dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1531 << " on " << de
->name
1532 << " => " << hash
<< dendl
;
1537 if (in
->snapid
!= CEPH_NOSNAP
) {
1538 ldout(cct
, 10) << __func__
<< " " << *in
<< " is snapped, using nonsnap parent" << dendl
;
1539 while (in
->snapid
!= CEPH_NOSNAP
) {
1540 if (in
->snapid
== CEPH_SNAPDIR
)
1541 in
= in
->snapdir_parent
.get();
1542 else if (!in
->dentries
.empty())
1543 /* In most cases there will only be one dentry, so getting it
1544 * will be the correct action. If there are multiple hard links,
1545 * I think the MDS should be able to redirect as needed*/
1546 in
= in
->get_first_parent()->dir
->parent_inode
;
1548 ldout(cct
, 10) << "got unlinked inode, can't look at parent" << dendl
;
1555 ldout(cct
, 20) << __func__
<< " " << *in
<< " is_hash=" << is_hash
1556 << " hash=" << hash
<< dendl
;
1558 if (is_hash
&& S_ISDIR(in
->mode
) && (!in
->fragmap
.empty() || !in
->frag_repmap
.empty())) {
1559 frag_t fg
= in
->dirfragtree
[hash
];
1560 if (!req
->auth_is_best()) {
1561 auto repmapit
= in
->frag_repmap
.find(fg
);
1562 if (repmapit
!= in
->frag_repmap
.end()) {
1563 auto& repmap
= repmapit
->second
;
1564 auto r
= ceph::util::generate_random_number
<uint64_t>(0, repmap
.size()-1);
1567 } else if (in
->fragmap
.count(fg
)) {
1568 mds
= in
->fragmap
[fg
];
1571 } else if (in
->auth_cap
) {
1572 req
->send_to_auth
= true;
1573 mds
= in
->auth_cap
->session
->mds_num
;
1576 ldout(cct
, 10) << __func__
<< " from dirfragtree hash" << dendl
;
1581 if (in
->auth_cap
&& req
->auth_is_best()) {
1582 mds
= in
->auth_cap
->session
->mds_num
;
1583 } else if (!in
->caps
.empty()) {
1584 mds
= in
->caps
.begin()->second
.session
->mds_num
;
1588 ldout(cct
, 10) << __func__
<< " from caps on inode " << *in
<< dendl
;
1595 mds
= _get_random_up_mds();
1596 ldout(cct
, 10) << "did not get mds through better means, so chose random mds " << mds
<< dendl
;
1600 ldout(cct
, 20) << "mds is " << mds
<< dendl
;
1604 void Client::connect_mds_targets(mds_rank_t mds
)
1606 ldout(cct
, 10) << __func__
<< " for mds." << mds
<< dendl
;
1607 ceph_assert(mds_sessions
.count(mds
));
1608 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1609 for (const auto &rank
: info
.export_targets
) {
1610 if (mds_sessions
.count(rank
) == 0 &&
1611 mdsmap
->is_clientreplay_or_active_or_stopping(rank
)) {
1612 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1613 << " export target mds." << rank
<< dendl
;
1614 _open_mds_session(rank
);
1619 void Client::dump_mds_sessions(Formatter
*f
, bool cap_dump
)
1621 f
->dump_int("id", get_nodeid().v
);
1622 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
1623 f
->dump_object("inst", inst
);
1624 f
->dump_stream("inst_str") << inst
;
1625 f
->dump_stream("addr_str") << inst
.addr
;
1626 f
->open_array_section("sessions");
1627 for (const auto &p
: mds_sessions
) {
1628 f
->open_object_section("session");
1629 p
.second
.dump(f
, cap_dump
);
1633 f
->dump_int("mdsmap_epoch", mdsmap
->get_epoch());
1636 void Client::dump_mds_requests(Formatter
*f
)
1638 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
1639 p
!= mds_requests
.end();
1641 f
->open_object_section("request");
1647 int Client::verify_reply_trace(int r
, MetaSession
*session
,
1648 MetaRequest
*request
, const MConstRef
<MClientReply
>& reply
,
1649 InodeRef
*ptarget
, bool *pcreated
,
1650 const UserPerm
& perms
)
1652 // check whether this request actually did the create, and set created flag
1653 bufferlist extra_bl
;
1654 inodeno_t created_ino
;
1655 bool got_created_ino
= false;
1656 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
;
1658 extra_bl
= reply
->get_extra_bl();
1659 if (extra_bl
.length() >= 8) {
1660 if (session
->mds_features
.test(CEPHFS_FEATURE_DELEG_INO
)) {
1661 struct openc_response_t ocres
;
1663 decode(ocres
, extra_bl
);
1664 created_ino
= ocres
.created_ino
;
1666 * The userland cephfs client doesn't have a way to do an async create
1667 * (yet), so just discard delegated_inos for now. Eventually we should
1668 * store them and use them in create calls, even if they are synchronous,
1669 * if only for testing purposes.
1671 ldout(cct
, 10) << "delegated_inos: " << ocres
.delegated_inos
<< dendl
;
1673 // u64 containing number of created ino
1674 decode(created_ino
, extra_bl
);
1676 ldout(cct
, 10) << "make_request created ino " << created_ino
<< dendl
;
1677 got_created_ino
= true;
1681 *pcreated
= got_created_ino
;
1683 if (request
->target
) {
1684 *ptarget
= request
->target
;
1685 ldout(cct
, 20) << "make_request target is " << *ptarget
->get() << dendl
;
1687 if (got_created_ino
&& (p
= inode_map
.find(vinodeno_t(created_ino
, CEPH_NOSNAP
))) != inode_map
.end()) {
1688 (*ptarget
) = p
->second
;
1689 ldout(cct
, 20) << "make_request created, target is " << *ptarget
->get() << dendl
;
1691 // we got a traceless reply, and need to look up what we just
1692 // created. for now, do this by name. someday, do this by the
1693 // ino... which we know! FIXME.
1695 Dentry
*d
= request
->dentry();
1698 ldout(cct
, 10) << "make_request got traceless reply, looking up #"
1699 << d
->dir
->parent_inode
->ino
<< "/" << d
->name
1700 << " got_ino " << got_created_ino
1701 << " ino " << created_ino
1703 r
= _do_lookup(d
->dir
->parent_inode
, d
->name
, request
->regetattr_mask
,
1706 // if the dentry is not linked, just do our best. see #5021.
1707 ceph_abort_msg("how did this happen? i want logs!");
1710 Inode
*in
= request
->inode();
1711 ldout(cct
, 10) << "make_request got traceless reply, forcing getattr on #"
1712 << in
->ino
<< dendl
;
1713 r
= _getattr(in
, request
->regetattr_mask
, perms
, true);
1717 // verify ino returned in reply and trace_dist are the same
1718 if (got_created_ino
&&
1719 created_ino
.val
!= target
->ino
.val
) {
1720 ldout(cct
, 5) << "create got ino " << created_ino
<< " but then failed on lookup; EINTR?" << dendl
;
1724 ptarget
->swap(target
);
1736 * Blocking helper to make an MDS request.
1738 * If the ptarget flag is set, behavior changes slightly: the caller
1739 * expects to get a pointer to the inode we are creating or operating
1740 * on. As a result, we will follow up any traceless mutation reply
1741 * with a getattr or lookup to transparently handle a traceless reply
1742 * from the MDS (as when the MDS restarts and the client has to replay
1745 * @param request the MetaRequest to execute
1746 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1747 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1748 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1749 * @param use_mds [optional] prefer a specific mds (-1 for default)
1750 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1752 int Client::make_request(MetaRequest
*request
,
1753 const UserPerm
& perms
,
1754 InodeRef
*ptarget
, bool *pcreated
,
1760 // assign a unique tid
1761 ceph_tid_t tid
= ++last_tid
;
1762 request
->set_tid(tid
);
1765 request
->op_stamp
= ceph_clock_now();
1768 mds_requests
[tid
] = request
->get();
1769 if (oldest_tid
== 0 && request
->get_op() != CEPH_MDS_OP_SETFILELOCK
)
1772 request
->set_caller_perms(perms
);
1774 if (cct
->_conf
->client_inject_fixed_oldest_tid
) {
1775 ldout(cct
, 20) << __func__
<< " injecting fixed oldest_client_tid(1)" << dendl
;
1776 request
->set_oldest_client_tid(1);
1778 request
->set_oldest_client_tid(oldest_tid
);
1783 request
->resend_mds
= use_mds
;
1785 MetaSession
*session
= NULL
;
1787 if (request
->aborted())
1791 request
->abort(-CEPHFS_EBLOCKLISTED
);
1796 ceph::condition_variable caller_cond
;
1797 request
->caller_cond
= &caller_cond
;
1800 Inode
*hash_diri
= NULL
;
1801 mds_rank_t mds
= choose_target_mds(request
, &hash_diri
);
1802 int mds_state
= (mds
== MDS_RANK_NONE
) ? MDSMap::STATE_NULL
: mdsmap
->get_state(mds
);
1803 if (mds_state
!= MDSMap::STATE_ACTIVE
&& mds_state
!= MDSMap::STATE_STOPPING
) {
1804 if (mds_state
== MDSMap::STATE_NULL
&& mds
>= mdsmap
->get_max_mds()) {
1806 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, remove it from fragmap" << dendl
;
1807 _fragmap_remove_stopped_mds(hash_diri
, mds
);
1809 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, trying a random mds" << dendl
;
1810 request
->resend_mds
= _get_random_up_mds();
1813 ldout(cct
, 10) << " target mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
1814 wait_on_list(waiting_for_mdsmap
);
1820 if (!have_open_session(mds
)) {
1821 session
= _get_or_open_mds_session(mds
);
1822 if (session
->state
== MetaSession::STATE_REJECTED
) {
1823 request
->abort(-CEPHFS_EPERM
);
1827 if (session
->state
== MetaSession::STATE_OPENING
) {
1828 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
1829 wait_on_context_list(session
->waiting_for_open
);
1833 if (!have_open_session(mds
))
1836 session
= &mds_sessions
.at(mds
);
1840 send_request(request
, session
);
1843 ldout(cct
, 20) << "awaiting reply|forward|kick on " << &caller_cond
<< dendl
;
1844 request
->kick
= false;
1845 std::unique_lock l
{client_lock
, std::adopt_lock
};
1846 caller_cond
.wait(l
, [request
] {
1847 return (request
->reply
|| // reply
1848 request
->resend_mds
>= 0 || // forward
1852 request
->caller_cond
= nullptr;
1854 // did we get a reply?
1859 if (!request
->reply
) {
1860 ceph_assert(request
->aborted());
1861 ceph_assert(!request
->got_unsafe
);
1862 r
= request
->get_abort_code();
1863 request
->item
.remove_myself();
1864 unregister_request(request
);
1865 put_request(request
);
1870 auto reply
= std::move(request
->reply
);
1871 r
= reply
->get_result();
1873 request
->success
= true;
1875 // kick dispatcher (we've got it!)
1876 ceph_assert(request
->dispatch_cond
);
1877 request
->dispatch_cond
->notify_all();
1878 ldout(cct
, 20) << "sendrecv kickback on tid " << tid
<< " " << request
->dispatch_cond
<< dendl
;
1879 request
->dispatch_cond
= 0;
1881 if (r
>= 0 && ptarget
)
1882 r
= verify_reply_trace(r
, session
, request
, reply
, ptarget
, pcreated
, perms
);
1885 *pdirbl
= reply
->get_extra_bl();
1888 utime_t lat
= ceph_clock_now();
1889 lat
-= request
->sent_stamp
;
1890 ldout(cct
, 20) << "lat " << lat
<< dendl
;
1891 logger
->tinc(l_c_lat
, lat
);
1892 logger
->tinc(l_c_reply
, lat
);
1894 put_request(request
);
1898 void Client::unregister_request(MetaRequest
*req
)
1900 mds_requests
.erase(req
->tid
);
1901 if (req
->tid
== oldest_tid
) {
1902 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
1904 if (p
== mds_requests
.end()) {
1908 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
1909 oldest_tid
= p
->first
;
1918 void Client::put_request(MetaRequest
*request
)
1920 if (request
->_put()) {
1922 if (request
->success
)
1923 op
= request
->get_op();
1925 request
->take_other_inode(&other_in
);
1929 (op
== CEPH_MDS_OP_RMDIR
||
1930 op
== CEPH_MDS_OP_RENAME
||
1931 op
== CEPH_MDS_OP_RMSNAP
)) {
1932 _try_to_trim_inode(other_in
.get(), false);
1937 int Client::encode_inode_release(Inode
*in
, MetaRequest
*req
,
1938 mds_rank_t mds
, int drop
,
1939 int unless
, int force
)
1941 ldout(cct
, 20) << __func__
<< " enter(in:" << *in
<< ", req:" << req
1942 << " mds:" << mds
<< ", drop:" << ccap_string(drop
) << ", unless:" << ccap_string(unless
)
1943 << ", force:" << force
<< ")" << dendl
;
1945 auto it
= in
->caps
.find(mds
);
1946 if (it
!= in
->caps
.end()) {
1947 Cap
&cap
= it
->second
;
1948 drop
&= ~(in
->dirty_caps
| get_caps_used(in
));
1949 if ((drop
& cap
.issued
) &&
1950 !(unless
& cap
.issued
)) {
1951 ldout(cct
, 25) << "dropping caps " << ccap_string(drop
) << dendl
;
1952 cap
.issued
&= ~drop
;
1953 cap
.implemented
&= ~drop
;
1959 cap
.wanted
= in
->caps_wanted();
1960 if (&cap
== in
->auth_cap
&&
1961 !(cap
.wanted
& CEPH_CAP_ANY_FILE_WR
)) {
1962 in
->requested_max_size
= 0;
1963 ldout(cct
, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl
;
1965 ceph_mds_request_release rel
;
1967 rel
.cap_id
= cap
.cap_id
;
1969 rel
.issue_seq
= cap
.issue_seq
;
1970 rel
.mseq
= cap
.mseq
;
1971 rel
.caps
= cap
.implemented
;
1972 rel
.wanted
= cap
.wanted
;
1975 req
->cap_releases
.push_back(MClientRequest::Release(rel
,""));
1978 ldout(cct
, 25) << __func__
<< " exit(in:" << *in
<< ") released:"
1979 << released
<< dendl
;
1983 void Client::encode_dentry_release(Dentry
*dn
, MetaRequest
*req
,
1984 mds_rank_t mds
, int drop
, int unless
)
1986 ldout(cct
, 20) << __func__
<< " enter(dn:"
1987 << dn
<< ")" << dendl
;
1990 released
= encode_inode_release(dn
->dir
->parent_inode
, req
,
1991 mds
, drop
, unless
, 1);
1992 if (released
&& dn
->lease_mds
== mds
) {
1993 ldout(cct
, 25) << "preemptively releasing dn to mds" << dendl
;
1994 auto& rel
= req
->cap_releases
.back();
1995 rel
.item
.dname_len
= dn
->name
.length();
1996 rel
.item
.dname_seq
= dn
->lease_seq
;
1997 rel
.dname
= dn
->name
;
2000 ldout(cct
, 25) << __func__
<< " exit(dn:"
2001 << dn
<< ")" << dendl
;
2006 * This requires the MClientRequest *request member to be set.
2007 * It will error out horribly without one.
2008 * Additionally, if you set any *drop member, you'd better have
2009 * set the corresponding dentry!
2011 void Client::encode_cap_releases(MetaRequest
*req
, mds_rank_t mds
)
2013 ldout(cct
, 20) << __func__
<< " enter (req: "
2014 << req
<< ", mds: " << mds
<< ")" << dendl
;
2015 if (req
->inode_drop
&& req
->inode())
2016 encode_inode_release(req
->inode(), req
,
2017 mds
, req
->inode_drop
,
2020 if (req
->old_inode_drop
&& req
->old_inode())
2021 encode_inode_release(req
->old_inode(), req
,
2022 mds
, req
->old_inode_drop
,
2023 req
->old_inode_unless
);
2024 if (req
->other_inode_drop
&& req
->other_inode())
2025 encode_inode_release(req
->other_inode(), req
,
2026 mds
, req
->other_inode_drop
,
2027 req
->other_inode_unless
);
2029 if (req
->dentry_drop
&& req
->dentry())
2030 encode_dentry_release(req
->dentry(), req
,
2031 mds
, req
->dentry_drop
,
2032 req
->dentry_unless
);
2034 if (req
->old_dentry_drop
&& req
->old_dentry())
2035 encode_dentry_release(req
->old_dentry(), req
,
2036 mds
, req
->old_dentry_drop
,
2037 req
->old_dentry_unless
);
2038 ldout(cct
, 25) << __func__
<< " exit (req: "
2039 << req
<< ", mds " << mds
<<dendl
;
2042 bool Client::have_open_session(mds_rank_t mds
)
2044 const auto &it
= mds_sessions
.find(mds
);
2045 return it
!= mds_sessions
.end() &&
2046 (it
->second
.state
== MetaSession::STATE_OPEN
||
2047 it
->second
.state
== MetaSession::STATE_STALE
);
2050 MetaSession
*Client::_get_mds_session(mds_rank_t mds
, Connection
*con
)
2052 const auto &it
= mds_sessions
.find(mds
);
2053 if (it
== mds_sessions
.end() || it
->second
.con
!= con
) {
2060 MetaSession
*Client::_get_or_open_mds_session(mds_rank_t mds
)
2062 auto it
= mds_sessions
.find(mds
);
2063 return it
== mds_sessions
.end() ? _open_mds_session(mds
) : &it
->second
;
2067 * Populate a map of strings with client-identifying metadata,
2068 * such as the hostname. Call this once at initialization.
2070 void Client::populate_metadata(const std::string
&mount_root
)
2074 // TODO: move this to compat.h
2076 DWORD hostname_sz
= 64;
2077 GetComputerNameA(hostname
, &hostname_sz
);
2078 metadata
["hostname"] = hostname
;
2083 metadata
["hostname"] = u
.nodename
;
2084 ldout(cct
, 20) << __func__
<< " read hostname '" << u
.nodename
<< "'" << dendl
;
2086 ldout(cct
, 1) << __func__
<< " failed to read hostname (" << cpp_strerror(r
) << ")" << dendl
;
2090 metadata
["pid"] = stringify(getpid());
2092 // Ceph entity id (the '0' in "client.0")
2093 metadata
["entity_id"] = cct
->_conf
->name
.get_id();
2095 // Our mount position
2096 if (!mount_root
.empty()) {
2097 metadata
["root"] = mount_root
;
2101 metadata
["ceph_version"] = pretty_version_to_str();
2102 metadata
["ceph_sha1"] = git_version_to_str();
2104 // Apply any metadata from the user's configured overrides
2105 std::vector
<std::string
> tokens
;
2106 get_str_vec(cct
->_conf
->client_metadata
, ",", tokens
);
2107 for (const auto &i
: tokens
) {
2108 auto eqpos
= i
.find("=");
2109 // Throw out anything that isn't of the form "<str>=<str>"
2110 if (eqpos
== 0 || eqpos
== std::string::npos
|| eqpos
== i
.size()) {
2111 lderr(cct
) << "Invalid metadata keyval pair: '" << i
<< "'" << dendl
;
2114 metadata
[i
.substr(0, eqpos
)] = i
.substr(eqpos
+ 1);
2119 * Optionally add or override client metadata fields.
2121 void Client::update_metadata(std::string
const &k
, std::string
const &v
)
2123 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
2124 ceph_assert(iref_reader
.is_state_satisfied());
2126 std::scoped_lock
l(client_lock
);
2128 auto it
= metadata
.find(k
);
2129 if (it
!= metadata
.end()) {
2130 ldout(cct
, 1) << __func__
<< " warning, overriding metadata field '" << k
2131 << "' from '" << it
->second
<< "' to '" << v
<< "'" << dendl
;
2137 MetaSession
*Client::_open_mds_session(mds_rank_t mds
)
2139 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
2140 auto addrs
= mdsmap
->get_addrs(mds
);
2141 auto em
= mds_sessions
.emplace(std::piecewise_construct
,
2142 std::forward_as_tuple(mds
),
2143 std::forward_as_tuple(mds
, messenger
->connect_to_mds(addrs
), addrs
));
2144 ceph_assert(em
.second
); /* not already present */
2145 MetaSession
*session
= &em
.first
->second
;
2147 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_OPEN
);
2148 m
->metadata
= metadata
;
2149 m
->supported_features
= feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED
);
2150 m
->metric_spec
= feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL
);
2151 session
->con
->send_message2(std::move(m
));
2155 void Client::_close_mds_session(MetaSession
*s
)
2157 ldout(cct
, 2) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2158 s
->state
= MetaSession::STATE_CLOSING
;
2159 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2162 void Client::_closed_mds_session(MetaSession
*s
, int err
, bool rejected
)
2164 ldout(cct
, 5) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2165 if (rejected
&& s
->state
!= MetaSession::STATE_CLOSING
)
2166 s
->state
= MetaSession::STATE_REJECTED
;
2168 s
->state
= MetaSession::STATE_CLOSED
;
2169 s
->con
->mark_down();
2170 signal_context_list(s
->waiting_for_open
);
2171 mount_cond
.notify_all();
2172 remove_session_caps(s
, err
);
2173 kick_requests_closed(s
);
2174 mds_ranks_closing
.erase(s
->mds_num
);
2175 if (s
->state
== MetaSession::STATE_CLOSED
)
2176 mds_sessions
.erase(s
->mds_num
);
2179 void Client::handle_client_session(const MConstRef
<MClientSession
>& m
)
2181 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2182 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
2184 std::scoped_lock
cl(client_lock
);
2185 MetaSession
*session
= _get_mds_session(from
, m
->get_connection().get());
2187 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2191 switch (m
->get_op()) {
2192 case CEPH_SESSION_OPEN
:
2194 feature_bitset_t
missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED
);
2195 missing_features
-= m
->supported_features
;
2196 if (!missing_features
.empty()) {
2197 lderr(cct
) << "mds." << from
<< " lacks required features '"
2198 << missing_features
<< "', closing session " << dendl
;
2199 _close_mds_session(session
);
2200 _closed_mds_session(session
, -CEPHFS_EPERM
, true);
2203 session
->mds_features
= std::move(m
->supported_features
);
2205 renew_caps(session
);
2206 session
->state
= MetaSession::STATE_OPEN
;
2207 if (is_unmounting())
2208 mount_cond
.notify_all();
2210 connect_mds_targets(from
);
2211 signal_context_list(session
->waiting_for_open
);
2215 case CEPH_SESSION_CLOSE
:
2216 _closed_mds_session(session
);
2219 case CEPH_SESSION_RENEWCAPS
:
2220 if (session
->cap_renew_seq
== m
->get_seq()) {
2221 bool was_stale
= ceph_clock_now() >= session
->cap_ttl
;
2223 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2225 wake_up_session_caps(session
, false);
2229 case CEPH_SESSION_STALE
:
2230 // invalidate session caps/leases
2232 session
->cap_ttl
= ceph_clock_now();
2233 session
->cap_ttl
-= 1;
2234 renew_caps(session
);
2237 case CEPH_SESSION_RECALL_STATE
:
2239 * Call the renew caps and flush cap releases just before
2240 * triming the caps in case the tick() won't get a chance
2241 * to run them, which could cause the client to be blocklisted
2242 * and MDS daemons trying to recall the caps again and
2245 * In most cases it will do nothing, and the new cap releases
2246 * added by trim_caps() followed will be deferred flushing
2249 renew_and_flush_cap_releases();
2250 trim_caps(session
, m
->get_max_caps());
2253 case CEPH_SESSION_FLUSHMSG
:
2254 /* flush cap release */
2255 if (auto& m
= session
->release
; m
) {
2256 session
->con
->send_message2(std::move(m
));
2258 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2261 case CEPH_SESSION_FORCE_RO
:
2262 force_session_readonly(session
);
2265 case CEPH_SESSION_REJECT
:
2267 std::string_view error_str
;
2268 auto it
= m
->metadata
.find("error_string");
2269 if (it
!= m
->metadata
.end())
2270 error_str
= it
->second
;
2272 error_str
= "unknown error";
2273 lderr(cct
) << "mds." << from
<< " rejected us (" << error_str
<< ")" << dendl
;
2275 _closed_mds_session(session
, -CEPHFS_EPERM
, true);
2284 bool Client::_any_stale_sessions() const
2286 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
2288 for (const auto &p
: mds_sessions
) {
2289 if (p
.second
.state
== MetaSession::STATE_STALE
) {
2297 void Client::_kick_stale_sessions()
2299 ldout(cct
, 1) << __func__
<< dendl
;
2301 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
2302 MetaSession
&s
= it
->second
;
2303 if (s
.state
== MetaSession::STATE_REJECTED
) {
2304 mds_sessions
.erase(it
++);
2308 if (s
.state
== MetaSession::STATE_STALE
)
2309 _closed_mds_session(&s
);
2313 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2314 bool drop_cap_releases
)
2317 mds_rank_t mds
= session
->mds_num
;
2318 ldout(cct
, 10) << __func__
<< " rebuilding request " << request
->get_tid()
2319 << " for mds." << mds
<< dendl
;
2320 auto r
= build_client_request(request
);
2321 if (request
->dentry()) {
2322 r
->set_dentry_wanted();
2324 if (request
->got_unsafe
) {
2325 r
->set_replayed_op();
2326 if (request
->target
)
2327 r
->head
.ino
= request
->target
->ino
;
2329 encode_cap_releases(request
, mds
);
2330 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2331 request
->cap_releases
.clear();
2333 r
->releases
.swap(request
->cap_releases
);
2335 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2336 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2337 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2338 r
->set_osdmap_epoch(o
.get_epoch());
2342 if (request
->mds
== -1) {
2343 request
->sent_stamp
= ceph_clock_now();
2344 ldout(cct
, 20) << __func__
<< " set sent_stamp to " << request
->sent_stamp
<< dendl
;
2348 Inode
*in
= request
->inode();
2350 auto it
= in
->caps
.find(mds
);
2351 if (it
!= in
->caps
.end()) {
2352 request
->sent_on_mseq
= it
->second
.mseq
;
2356 session
->requests
.push_back(&request
->item
);
2358 ldout(cct
, 10) << __func__
<< " " << *r
<< " to mds." << mds
<< dendl
;
2359 session
->con
->send_message2(std::move(r
));
2362 ref_t
<MClientRequest
> Client::build_client_request(MetaRequest
*request
)
2364 auto req
= make_message
<MClientRequest
>(request
->get_op());
2365 req
->set_tid(request
->tid
);
2366 req
->set_stamp(request
->op_stamp
);
2367 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2369 // if the filepath's haven't been set, set them!
2370 if (request
->path
.empty()) {
2371 Inode
*in
= request
->inode();
2372 Dentry
*de
= request
->dentry();
2374 in
->make_nosnap_relative_path(request
->path
);
2377 de
->inode
->make_nosnap_relative_path(request
->path
);
2379 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2380 request
->path
.push_dentry(de
->name
);
2382 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2383 << " No path, inode, or appropriately-endowed dentry given!"
2385 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2386 << " No path, inode, or dentry given!"
2389 req
->set_filepath(request
->get_filepath());
2390 req
->set_filepath2(request
->get_filepath2());
2391 req
->set_alternate_name(request
->alternate_name
);
2392 req
->set_data(request
->data
);
2393 req
->set_retry_attempt(request
->retry_attempt
++);
2394 req
->head
.num_fwd
= request
->num_fwd
;
2396 int gid_count
= request
->perms
.get_gids(&_gids
);
2397 req
->set_gid_list(gid_count
, _gids
);
2403 void Client::handle_client_request_forward(const MConstRef
<MClientRequestForward
>& fwd
)
2405 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2407 std::scoped_lock
cl(client_lock
);
2408 MetaSession
*session
= _get_mds_session(mds
, fwd
->get_connection().get());
2412 ceph_tid_t tid
= fwd
->get_tid();
2414 if (mds_requests
.count(tid
) == 0) {
2415 ldout(cct
, 10) << __func__
<< " no pending request on tid " << tid
<< dendl
;
2419 MetaRequest
*request
= mds_requests
[tid
];
2420 ceph_assert(request
);
2422 // reset retry counter
2423 request
->retry_attempt
= 0;
2425 // request not forwarded, or dest mds has no session.
2427 ldout(cct
, 10) << __func__
<< " tid " << tid
2428 << " fwd " << fwd
->get_num_fwd()
2429 << " to mds." << fwd
->get_dest_mds()
2430 << ", resending to " << fwd
->get_dest_mds()
2434 request
->item
.remove_myself();
2435 request
->num_fwd
= fwd
->get_num_fwd();
2436 request
->resend_mds
= fwd
->get_dest_mds();
2437 request
->caller_cond
->notify_all();
2440 bool Client::is_dir_operation(MetaRequest
*req
)
2442 int op
= req
->get_op();
2443 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2444 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2445 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2446 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
2451 void Client::handle_client_reply(const MConstRef
<MClientReply
>& reply
)
2453 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2455 std::scoped_lock
cl(client_lock
);
2456 MetaSession
*session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2461 ceph_tid_t tid
= reply
->get_tid();
2462 bool is_safe
= reply
->is_safe();
2464 if (mds_requests
.count(tid
) == 0) {
2465 lderr(cct
) << __func__
<< " no pending request on tid " << tid
2466 << " safe is:" << is_safe
<< dendl
;
2469 MetaRequest
*request
= mds_requests
.at(tid
);
2471 ldout(cct
, 20) << __func__
<< " got a reply. Safe:" << is_safe
2472 << " tid " << tid
<< dendl
;
2474 if (request
->got_unsafe
&& !is_safe
) {
2475 //duplicate response
2476 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2477 << mds_num
<< " safe:" << is_safe
<< dendl
;
2481 if (-CEPHFS_ESTALE
== reply
->get_result()) { // see if we can get to proper MDS
2482 ldout(cct
, 20) << "got ESTALE on tid " << request
->tid
2483 << " from mds." << request
->mds
<< dendl
;
2484 request
->send_to_auth
= true;
2485 request
->resend_mds
= choose_target_mds(request
);
2486 Inode
*in
= request
->inode();
2487 std::map
<mds_rank_t
, Cap
>::const_iterator it
;
2488 if (request
->resend_mds
>= 0 &&
2489 request
->resend_mds
== request
->mds
&&
2491 (it
= in
->caps
.find(request
->resend_mds
)) != in
->caps
.end() ||
2492 request
->sent_on_mseq
== it
->second
.mseq
)) {
2493 ldout(cct
, 20) << "have to return ESTALE" << dendl
;
2495 request
->caller_cond
->notify_all();
2500 ceph_assert(!request
->reply
);
2501 request
->reply
= reply
;
2502 insert_trace(request
, session
);
2504 // Handle unsafe reply
2506 request
->got_unsafe
= true;
2507 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2508 if (is_dir_operation(request
)) {
2509 Inode
*dir
= request
->inode();
2511 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2513 if (request
->target
) {
2514 InodeRef
&in
= request
->target
;
2515 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2519 // Only signal the caller once (on the first reply):
2520 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2521 if (!is_safe
|| !request
->got_unsafe
) {
2522 ceph::condition_variable cond
;
2523 request
->dispatch_cond
= &cond
;
2526 ldout(cct
, 20) << __func__
<< " signalling caller " << (void*)request
->caller_cond
<< dendl
;
2527 request
->caller_cond
->notify_all();
2529 // wake for kick back
2530 std::unique_lock l
{client_lock
, std::adopt_lock
};
2531 cond
.wait(l
, [tid
, request
, &cond
, this] {
2532 if (request
->dispatch_cond
) {
2533 ldout(cct
, 20) << "handle_client_reply awaiting kickback on tid "
2534 << tid
<< " " << &cond
<< dendl
;
2536 return !request
->dispatch_cond
;
2542 // the filesystem change is committed to disk
2543 // we're done, clean up
2544 if (request
->got_unsafe
) {
2545 request
->unsafe_item
.remove_myself();
2546 request
->unsafe_dir_item
.remove_myself();
2547 request
->unsafe_target_item
.remove_myself();
2548 signal_cond_list(request
->waitfor_safe
);
2550 request
->item
.remove_myself();
2551 unregister_request(request
);
2553 if (is_unmounting())
2554 mount_cond
.notify_all();
2557 void Client::_handle_full_flag(int64_t pool
)
2559 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2560 << "on " << pool
<< dendl
;
2561 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2562 // to do this rather than blocking, because otherwise when we fill up we
2563 // potentially lock caps forever on files with dirty pages, and we need
2564 // to be able to release those caps to the MDS so that it can delete files
2565 // and free up space.
2566 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-CEPHFS_ENOSPC
, pool
);
2568 // For all inodes with layouts in this pool and a pending flush write op
2569 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2570 // from ObjectCacher so that it doesn't re-issue the write in response to
2571 // the ENOSPC error.
2572 // Fortunately since we're cancelling everything in a given pool, we don't
2573 // need to know which ops belong to which ObjectSet, we can just blow all
2574 // the un-flushed cached data away and mark any dirty inodes' async_err
2575 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2576 // affecting this pool, and all the objectsets we're purging were also
2578 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2579 i
!= inode_map
.end(); ++i
)
2581 Inode
*inode
= i
->second
;
2582 if (inode
->oset
.dirty_or_tx
2583 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2584 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2585 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2586 objectcacher
->purge_set(&inode
->oset
);
2587 inode
->set_async_err(-CEPHFS_ENOSPC
);
2591 if (cancelled_epoch
!= (epoch_t
)-1) {
2592 set_cap_epoch_barrier(cancelled_epoch
);
2596 void Client::handle_osd_map(const MConstRef
<MOSDMap
>& m
)
2598 std::set
<entity_addr_t
> new_blocklists
;
2600 std::scoped_lock
cl(client_lock
);
2601 objecter
->consume_blocklist_events(&new_blocklists
);
2603 const auto myaddrs
= messenger
->get_myaddrs();
2604 bool new_blocklist
= false;
2605 bool prenautilus
= objecter
->with_osdmap(
2606 [&](const OSDMap
& o
) {
2607 return o
.require_osd_release
< ceph_release_t::nautilus
;
2610 for (auto a
: myaddrs
.v
) {
2611 // blocklist entries are always TYPE_ANY for nautilus+
2612 a
.set_type(entity_addr_t::TYPE_ANY
);
2613 if (new_blocklists
.count(a
)) {
2614 new_blocklist
= true;
2618 // ...except pre-nautilus, they were TYPE_LEGACY
2619 a
.set_type(entity_addr_t::TYPE_LEGACY
);
2620 if (new_blocklists
.count(a
)) {
2621 new_blocklist
= true;
2627 if (new_blocklist
) {
2628 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2629 return o
.get_epoch();
2631 lderr(cct
) << "I was blocklisted at osd epoch " << epoch
<< dendl
;
2634 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED
);
2636 // Since we know all our OSD ops will fail, cancel them all preemtively,
2637 // so that on an unhealthy cluster we can umount promptly even if e.g.
2638 // some PGs were inaccessible.
2639 objecter
->op_cancel_writes(-CEPHFS_EBLOCKLISTED
);
2644 // Handle case where we were blocklisted but no longer are
2645 blocklisted
= objecter
->with_osdmap([myaddrs
](const OSDMap
&o
){
2646 return o
.is_blocklisted(myaddrs
);});
2649 // Always subscribe to next osdmap for blocklisted client
2650 // until this client is not blocklisted.
2652 objecter
->maybe_request_map();
2655 if (objecter
->osdmap_full_flag()) {
2656 _handle_full_flag(-1);
2658 // Accumulate local list of full pools so that I can drop
2659 // the objecter lock before re-entering objecter in
2661 std::vector
<int64_t> full_pools
;
2663 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2664 for (const auto& kv
: o
.get_pools()) {
2665 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2666 full_pools
.push_back(kv
.first
);
2671 for (auto p
: full_pools
)
2672 _handle_full_flag(p
);
2674 // Subscribe to subsequent maps to watch for the full flag going
2675 // away. For the global full flag objecter does this for us, but
2676 // it pays no attention to the per-pool full flag so in this branch
2677 // we do it ourselves.
2678 if (!full_pools
.empty()) {
2679 objecter
->maybe_request_map();
2685 // ------------------------
2686 // incoming messages
2689 bool Client::ms_dispatch2(const MessageRef
&m
)
2691 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
2692 if (!iref_reader
.is_state_satisfied()) {
2693 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2697 switch (m
->get_type()) {
2698 // mounting and mds sessions
2699 case CEPH_MSG_MDS_MAP
:
2700 handle_mds_map(ref_cast
<MMDSMap
>(m
));
2702 case CEPH_MSG_FS_MAP
:
2703 handle_fs_map(ref_cast
<MFSMap
>(m
));
2705 case CEPH_MSG_FS_MAP_USER
:
2706 handle_fs_map_user(ref_cast
<MFSMapUser
>(m
));
2708 case CEPH_MSG_CLIENT_SESSION
:
2709 handle_client_session(ref_cast
<MClientSession
>(m
));
2712 case CEPH_MSG_OSD_MAP
:
2713 handle_osd_map(ref_cast
<MOSDMap
>(m
));
2717 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2718 handle_client_request_forward(ref_cast
<MClientRequestForward
>(m
));
2720 case CEPH_MSG_CLIENT_REPLY
:
2721 handle_client_reply(ref_cast
<MClientReply
>(m
));
2725 case CEPH_MSG_CLIENT_RECLAIM_REPLY
:
2726 handle_client_reclaim_reply(ref_cast
<MClientReclaimReply
>(m
));
2729 case CEPH_MSG_CLIENT_SNAP
:
2730 handle_snap(ref_cast
<MClientSnap
>(m
));
2732 case CEPH_MSG_CLIENT_CAPS
:
2733 handle_caps(ref_cast
<MClientCaps
>(m
));
2735 case CEPH_MSG_CLIENT_LEASE
:
2736 handle_lease(ref_cast
<MClientLease
>(m
));
2738 case MSG_COMMAND_REPLY
:
2739 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2740 handle_command_reply(ref_cast
<MCommandReply
>(m
));
2745 case CEPH_MSG_CLIENT_QUOTA
:
2746 handle_quota(ref_cast
<MClientQuota
>(m
));
2754 std::scoped_lock
cl(client_lock
);
2755 if (is_unmounting()) {
2756 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2757 << "+" << inode_map
.size() << dendl
;
2758 uint64_t size
= lru
.lru_get_size() + inode_map
.size();
2760 if (size
> lru
.lru_get_size() + inode_map
.size()) {
2761 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2762 mount_cond
.notify_all();
2764 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2765 << "+" << inode_map
.size() << dendl
;
2772 void Client::handle_fs_map(const MConstRef
<MFSMap
>& m
)
2774 std::scoped_lock
cl(client_lock
);
2775 fsmap
.reset(new FSMap(m
->get_fsmap()));
2777 signal_cond_list(waiting_for_fsmap
);
2779 monclient
->sub_got("fsmap", fsmap
->get_epoch());
2782 void Client::handle_fs_map_user(const MConstRef
<MFSMapUser
>& m
)
2784 std::scoped_lock
cl(client_lock
);
2785 fsmap_user
.reset(new FSMapUser
);
2786 *fsmap_user
= m
->get_fsmap();
2788 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
2789 signal_cond_list(waiting_for_fsmap
);
2792 // Cancel all the commands for missing or laggy GIDs
2793 void Client::cancel_commands(const MDSMap
& newmap
)
2795 std::vector
<ceph_tid_t
> cancel_ops
;
2797 std::scoped_lock
cmd_lock(command_lock
);
2798 auto &commands
= command_table
.get_commands();
2799 for (const auto &[tid
, op
] : commands
) {
2800 const mds_gid_t op_mds_gid
= op
.mds_gid
;
2801 if (newmap
.is_dne_gid(op_mds_gid
) || newmap
.is_laggy_gid(op_mds_gid
)) {
2802 ldout(cct
, 1) << __func__
<< ": cancelling command op " << tid
<< dendl
;
2803 cancel_ops
.push_back(tid
);
2805 std::ostringstream ss
;
2806 ss
<< "MDS " << op_mds_gid
<< " went away";
2807 *(op
.outs
) = ss
.str();
2810 * No need to make the con->mark_down under
2811 * client_lock here, because the con will
2814 op
.con
->mark_down();
2816 op
.on_finish
->complete(-CEPHFS_ETIMEDOUT
);
2820 for (const auto &tid
: cancel_ops
)
2821 command_table
.erase(tid
);
2824 void Client::handle_mds_map(const MConstRef
<MMDSMap
>& m
)
2826 std::unique_lock
cl(client_lock
);
2827 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
2828 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch()
2829 << " is identical to or older than our "
2830 << mdsmap
->get_epoch() << dendl
;
2835 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch() << dendl
;
2836 std::unique_ptr
<MDSMap
> _mdsmap(new MDSMap
);
2837 _mdsmap
->decode(m
->get_encoded());
2838 cancel_commands(*_mdsmap
.get());
2841 _mdsmap
.swap(mdsmap
);
2844 for (auto p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ) {
2845 mds_rank_t mds
= p
->first
;
2846 MetaSession
*session
= &p
->second
;
2849 int oldstate
= _mdsmap
->get_state(mds
);
2850 int newstate
= mdsmap
->get_state(mds
);
2851 if (!mdsmap
->is_up(mds
)) {
2852 session
->con
->mark_down();
2853 } else if (mdsmap
->get_addrs(mds
) != session
->addrs
) {
2854 auto old_inc
= _mdsmap
->get_incarnation(mds
);
2855 auto new_inc
= mdsmap
->get_incarnation(mds
);
2856 if (old_inc
!= new_inc
) {
2857 ldout(cct
, 1) << "mds incarnation changed from "
2858 << old_inc
<< " to " << new_inc
<< dendl
;
2859 oldstate
= MDSMap::STATE_NULL
;
2861 session
->con
->mark_down();
2862 session
->addrs
= mdsmap
->get_addrs(mds
);
2863 // When new MDS starts to take over, notify kernel to trim unused entries
2864 // in its dcache/icache. Hopefully, the kernel will release some unused
2865 // inodes before the new MDS enters reconnect state.
2866 trim_cache_for_reconnect(session
);
2867 } else if (oldstate
== newstate
)
2868 continue; // no change
2870 session
->mds_state
= newstate
;
2871 if (newstate
== MDSMap::STATE_RECONNECT
) {
2872 session
->con
= messenger
->connect_to_mds(session
->addrs
);
2873 send_reconnect(session
);
2874 } else if (newstate
> MDSMap::STATE_RECONNECT
) {
2875 if (oldstate
< MDSMap::STATE_RECONNECT
) {
2876 ldout(cct
, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl
;
2877 _closed_mds_session(session
);
2880 if (newstate
>= MDSMap::STATE_ACTIVE
) {
2881 if (oldstate
< MDSMap::STATE_ACTIVE
) {
2882 // kick new requests
2883 kick_requests(session
);
2884 kick_flushing_caps(session
);
2885 signal_context_list(session
->waiting_for_open
);
2886 wake_up_session_caps(session
, true);
2888 connect_mds_targets(mds
);
2890 } else if (newstate
== MDSMap::STATE_NULL
&&
2891 mds
>= mdsmap
->get_max_mds()) {
2892 _closed_mds_session(session
);
2896 // kick any waiting threads
2897 signal_cond_list(waiting_for_mdsmap
);
2899 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
2902 void Client::send_reconnect(MetaSession
*session
)
2904 mds_rank_t mds
= session
->mds_num
;
2905 ldout(cct
, 10) << __func__
<< " to mds." << mds
<< dendl
;
2907 // trim unused caps to reduce MDS's cache rejoin time
2908 trim_cache_for_reconnect(session
);
2910 session
->readonly
= false;
2912 session
->release
.reset();
2914 // reset my cap seq number
2916 //connect to the mds' offload targets
2917 connect_mds_targets(mds
);
2918 //make sure unsafe requests get saved
2919 resend_unsafe_requests(session
);
2921 early_kick_flushing_caps(session
);
2923 auto m
= make_message
<MClientReconnect
>();
2924 bool allow_multi
= session
->mds_features
.test(CEPHFS_FEATURE_MULTI_RECONNECT
);
2926 // i have an open session.
2927 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
2928 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
2929 p
!= inode_map
.end();
2931 Inode
*in
= p
->second
;
2932 auto it
= in
->caps
.find(mds
);
2933 if (it
!= in
->caps
.end()) {
2935 m
->get_approx_size() >=
2936 static_cast<size_t>((std::numeric_limits
<int>::max() >> 1))) {
2938 session
->con
->send_message2(std::move(m
));
2940 m
= make_message
<MClientReconnect
>();
2943 Cap
&cap
= it
->second
;
2944 ldout(cct
, 10) << " caps on " << p
->first
2945 << " " << ccap_string(cap
.issued
)
2946 << " wants " << ccap_string(in
->caps_wanted())
2949 in
->make_short_path(path
);
2950 ldout(cct
, 10) << " path " << path
<< dendl
;
2953 _encode_filelocks(in
, flockbl
);
2955 cap
.seq
= 0; // reset seq.
2956 cap
.issue_seq
= 0; // reset seq.
2957 cap
.mseq
= 0; // reset seq.
2958 // cap gen should catch up with session cap_gen
2959 if (cap
.gen
< session
->cap_gen
) {
2960 cap
.gen
= session
->cap_gen
;
2961 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
2963 cap
.issued
= cap
.implemented
;
2965 snapid_t snap_follows
= 0;
2966 if (!in
->cap_snaps
.empty())
2967 snap_follows
= in
->cap_snaps
.begin()->first
;
2969 m
->add_cap(p
->first
.ino
,
2971 path
.get_ino(), path
.get_path(), // ino
2972 in
->caps_wanted(), // wanted
2973 cap
.issued
, // issued
2978 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
2979 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
2980 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
2981 did_snaprealm
.insert(in
->snaprealm
->ino
);
2987 m
->set_encoding_version(0); // use connection features to choose encoding
2988 session
->con
->send_message2(std::move(m
));
2990 mount_cond
.notify_all();
2992 if (session
->reclaim_state
== MetaSession::RECLAIMING
)
2993 signal_cond_list(waiting_for_reclaim
);
2997 void Client::kick_requests(MetaSession
*session
)
2999 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
3000 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3001 p
!= mds_requests
.end();
3003 MetaRequest
*req
= p
->second
;
3004 if (req
->got_unsafe
)
3006 if (req
->aborted()) {
3007 if (req
->caller_cond
) {
3009 req
->caller_cond
->notify_all();
3013 if (req
->retry_attempt
> 0)
3014 continue; // new requests only
3015 if (req
->mds
== session
->mds_num
) {
3016 send_request(p
->second
, session
);
3021 void Client::resend_unsafe_requests(MetaSession
*session
)
3023 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
3026 send_request(*iter
, session
);
3028 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3029 // process completed requests in clientreplay stage.
3030 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3031 p
!= mds_requests
.end();
3033 MetaRequest
*req
= p
->second
;
3034 if (req
->got_unsafe
)
3038 if (req
->retry_attempt
== 0)
3039 continue; // old requests only
3040 if (req
->mds
== session
->mds_num
)
3041 send_request(req
, session
, true);
3045 void Client::wait_unsafe_requests()
3047 list
<MetaRequest
*> last_unsafe_reqs
;
3048 for (const auto &p
: mds_sessions
) {
3049 const MetaSession
&s
= p
.second
;
3050 if (!s
.unsafe_requests
.empty()) {
3051 MetaRequest
*req
= s
.unsafe_requests
.back();
3053 last_unsafe_reqs
.push_back(req
);
3057 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
3058 p
!= last_unsafe_reqs
.end();
3060 MetaRequest
*req
= *p
;
3061 if (req
->unsafe_item
.is_on_list())
3062 wait_on_list(req
->waitfor_safe
);
3067 void Client::kick_requests_closed(MetaSession
*session
)
3069 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
3070 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3071 p
!= mds_requests
.end(); ) {
3072 MetaRequest
*req
= p
->second
;
3074 if (req
->mds
== session
->mds_num
) {
3075 if (req
->caller_cond
) {
3077 req
->caller_cond
->notify_all();
3079 req
->item
.remove_myself();
3080 if (req
->got_unsafe
) {
3081 lderr(cct
) << __func__
<< " removing unsafe request " << req
->get_tid() << dendl
;
3082 req
->unsafe_item
.remove_myself();
3083 if (is_dir_operation(req
)) {
3084 Inode
*dir
= req
->inode();
3086 dir
->set_async_err(-CEPHFS_EIO
);
3087 lderr(cct
) << "kick_requests_closed drop req of inode(dir) : "
3088 << dir
->ino
<< " " << req
->get_tid() << dendl
;
3089 req
->unsafe_dir_item
.remove_myself();
3092 InodeRef
&in
= req
->target
;
3093 in
->set_async_err(-CEPHFS_EIO
);
3094 lderr(cct
) << "kick_requests_closed drop req of inode : "
3095 << in
->ino
<< " " << req
->get_tid() << dendl
;
3096 req
->unsafe_target_item
.remove_myself();
3098 signal_cond_list(req
->waitfor_safe
);
3099 unregister_request(req
);
3103 ceph_assert(session
->requests
.empty());
3104 ceph_assert(session
->unsafe_requests
.empty());
3114 void Client::got_mds_push(MetaSession
*s
)
3117 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
3118 if (s
->state
== MetaSession::STATE_CLOSING
) {
3119 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
3123 void Client::handle_lease(const MConstRef
<MClientLease
>& m
)
3125 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
3127 ceph_assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
3128 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
3130 std::scoped_lock
cl(client_lock
);
3131 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
3136 got_mds_push(session
);
3138 ceph_seq_t seq
= m
->get_seq();
3141 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
3142 if (inode_map
.count(vino
) == 0) {
3143 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
3146 in
= inode_map
[vino
];
3148 if (m
->get_mask() & CEPH_LEASE_VALID
) {
3149 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
3150 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
3153 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
3154 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
3160 auto reply
= make_message
<MClientLease
>(CEPH_MDS_LEASE_RELEASE
, seq
,
3161 m
->get_mask(), m
->get_ino(),
3162 m
->get_first(), m
->get_last(), m
->dname
);
3163 m
->get_connection()->send_message2(std::move(reply
));
3167 void Client::_put_inode(Inode
*in
, int n
)
3169 ldout(cct
, 10) << __func__
<< " on " << *in
<< " n = " << n
<< dendl
;
3171 int left
= in
->_put(n
);
3174 remove_all_caps(in
);
3176 ldout(cct
, 10) << __func__
<< " deleting " << *in
<< dendl
;
3177 bool unclean
= objectcacher
->release_set(&in
->oset
);
3178 ceph_assert(!unclean
);
3179 inode_map
.erase(in
->vino());
3180 if (use_faked_inos())
3181 _release_faked_ino(in
);
3186 while (!root_parents
.empty())
3187 root_parents
.erase(root_parents
.begin());
3194 void Client::delay_put_inodes(bool wakeup
)
3196 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
3198 std::map
<Inode
*,int> release
;
3200 std::scoped_lock
dl(delay_i_lock
);
3201 release
.swap(delay_i_release
);
3204 if (release
.empty())
3207 for (auto &[in
, cnt
] : release
)
3208 _put_inode(in
, cnt
);
3211 mount_cond
.notify_all();
3214 void Client::put_inode(Inode
*in
, int n
)
3216 ldout(cct
, 20) << __func__
<< " on " << *in
<< " n = " << n
<< dendl
;
3218 std::scoped_lock
dl(delay_i_lock
);
3219 delay_i_release
[in
] += n
;
3222 void Client::close_dir(Dir
*dir
)
3224 Inode
*in
= dir
->parent_inode
;
3225 ldout(cct
, 15) << __func__
<< " dir " << dir
<< " on " << in
<< dendl
;
3226 ceph_assert(dir
->is_empty());
3227 ceph_assert(in
->dir
== dir
);
3228 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
3229 if (!in
->dentries
.empty())
3230 in
->get_first_parent()->put(); // unpin dentry
3234 put_inode(in
); // unpin inode
3238 * Don't call this with in==NULL, use get_or_create for that
3239 * leave dn set to default NULL unless you're trying to add
3240 * a new inode to a pre-created Dentry
3242 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
3245 // create a new Dentry
3246 dn
= new Dentry(dir
, name
);
3248 lru
.lru_insert_mid(dn
); // mid or top?
3250 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3251 << " dn " << dn
<< " (new dn)" << dendl
;
3253 ceph_assert(!dn
->inode
);
3254 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3255 << " dn " << dn
<< " (old dn)" << dendl
;
3258 if (in
) { // link to inode
3260 // only one parent for directories!
3261 if (in
->is_dir() && !in
->dentries
.empty()) {
3262 tmp_ref
= in
; // prevent unlink below from freeing the inode.
3263 Dentry
*olddn
= in
->get_first_parent();
3264 ceph_assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3265 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3266 clear_dir_complete_and_ordered(old_diri
, true);
3267 unlink(olddn
, true, true); // keep dir, dentry
3272 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3278 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
3280 InodeRef
in(dn
->inode
);
3281 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3282 << " inode " << dn
->inode
<< dendl
;
3284 // unlink from inode
3288 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3294 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3304 if (dir
->is_empty() && !keepdir
)
3310 * For asynchronous flushes, check for errors from the IO and
3311 * update the inode if necessary
3313 class C_Client_FlushComplete
: public Context
{
3318 C_Client_FlushComplete(Client
*c
, Inode
*in
) : client(c
), inode(in
) { }
3319 void finish(int r
) override
{
3320 ceph_assert(ceph_mutex_is_locked_by_me(client
->client_lock
));
3322 client_t
const whoami
= client
->whoami
; // For the benefit of ldout prefix
3323 ldout(client
->cct
, 1) << "I/O error from flush on inode " << inode
3324 << " 0x" << std::hex
<< inode
->ino
<< std::dec
3325 << ": " << r
<< "(" << cpp_strerror(r
) << ")" << dendl
;
3326 inode
->set_async_err(r
);
3336 void Client::get_cap_ref(Inode
*in
, int cap
)
3338 if ((cap
& CEPH_CAP_FILE_BUFFER
) &&
3339 in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] == 0) {
3340 ldout(cct
, 5) << __func__
<< " got first FILE_BUFFER ref on " << *in
<< dendl
;
3343 if ((cap
& CEPH_CAP_FILE_CACHE
) &&
3344 in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3345 ldout(cct
, 5) << __func__
<< " got first FILE_CACHE ref on " << *in
<< dendl
;
3348 in
->get_cap_ref(cap
);
3351 void Client::put_cap_ref(Inode
*in
, int cap
)
3353 int last
= in
->put_cap_ref(cap
);
3356 int drop
= last
& ~in
->caps_issued();
3357 if (in
->snapid
== CEPH_NOSNAP
) {
3358 if ((last
& (CEPH_CAP_FILE_WR
| CEPH_CAP_FILE_BUFFER
)) &&
3359 !in
->cap_snaps
.empty() &&
3360 in
->cap_snaps
.rbegin()->second
.writing
) {
3361 ldout(cct
, 10) << __func__
<< " finishing pending cap_snap on " << *in
<< dendl
;
3362 in
->cap_snaps
.rbegin()->second
.writing
= 0;
3363 finish_cap_snap(in
, in
->cap_snaps
.rbegin()->second
, get_caps_used(in
));
3364 signal_cond_list(in
->waitfor_caps
); // wake up blocked sync writers
3366 if (last
& CEPH_CAP_FILE_BUFFER
) {
3367 for (auto &p
: in
->cap_snaps
)
3368 p
.second
.dirty_data
= 0;
3369 signal_cond_list(in
->waitfor_commit
);
3370 ldout(cct
, 5) << __func__
<< " dropped last FILE_BUFFER ref on " << *in
<< dendl
;
3374 if (last
& CEPH_CAP_FILE_CACHE
) {
3375 ldout(cct
, 5) << __func__
<< " dropped last FILE_CACHE ref on " << *in
<< dendl
;
3381 put_inode(in
, put_nref
);
3385 // get caps for a given file handle -- the inode should have @need caps
3386 // issued by the mds and @want caps not revoked (or not under revocation).
3387 // this routine blocks till the cap requirement is satisfied. also account
3388 // (track) for capability hit when required (when cap requirement succeedes).
3389 int Client::get_caps(Fh
*fh
, int need
, int want
, int *phave
, loff_t endoff
)
3391 Inode
*in
= fh
->inode
.get();
3393 int r
= check_pool_perm(in
, need
);
3398 int file_wanted
= in
->caps_file_wanted();
3399 if ((file_wanted
& need
) != need
) {
3400 ldout(cct
, 10) << "get_caps " << *in
<< " need " << ccap_string(need
)
3401 << " file_wanted " << ccap_string(file_wanted
) << ", EBADF "
3403 return -CEPHFS_EBADF
;
3406 if ((fh
->mode
& CEPH_FILE_MODE_WR
) && fh
->gen
!= fd_gen
)
3407 return -CEPHFS_EBADF
;
3409 if ((in
->flags
& I_ERROR_FILELOCK
) && fh
->has_any_filelocks())
3413 int have
= in
->caps_issued(&implemented
);
3415 bool waitfor_caps
= false;
3416 bool waitfor_commit
= false;
3418 if (have
& need
& CEPH_CAP_FILE_WR
) {
3420 if ((endoff
>= (loff_t
)in
->max_size
||
3421 endoff
> (loff_t
)(in
->size
<< 1)) &&
3422 endoff
> (loff_t
)in
->wanted_max_size
) {
3423 ldout(cct
, 10) << "wanted_max_size " << in
->wanted_max_size
<< " -> " << endoff
<< dendl
;
3424 in
->wanted_max_size
= endoff
;
3426 if (in
->wanted_max_size
> in
->max_size
&&
3427 in
->wanted_max_size
> in
->requested_max_size
)
3431 if (endoff
>= 0 && endoff
> (loff_t
)in
->max_size
) {
3432 ldout(cct
, 10) << "waiting on max_size, endoff " << endoff
<< " max_size " << in
->max_size
<< " on " << *in
<< dendl
;
3433 waitfor_caps
= true;
3435 if (!in
->cap_snaps
.empty()) {
3436 if (in
->cap_snaps
.rbegin()->second
.writing
) {
3437 ldout(cct
, 10) << "waiting on cap_snap write to complete" << dendl
;
3438 waitfor_caps
= true;
3440 for (auto &p
: in
->cap_snaps
) {
3441 if (p
.second
.dirty_data
) {
3442 waitfor_commit
= true;
3446 if (waitfor_commit
) {
3447 _flush(in
, new C_Client_FlushComplete(this, in
));
3448 ldout(cct
, 10) << "waiting for WRBUFFER to get dropped" << dendl
;
3453 if (!waitfor_caps
&& !waitfor_commit
) {
3454 if ((have
& need
) == need
) {
3455 int revoking
= implemented
& ~have
;
3456 ldout(cct
, 10) << "get_caps " << *in
<< " have " << ccap_string(have
)
3457 << " need " << ccap_string(need
) << " want " << ccap_string(want
)
3458 << " revoking " << ccap_string(revoking
)
3460 if ((revoking
& want
) == 0) {
3461 *phave
= need
| (have
& want
);
3462 in
->get_cap_ref(need
);
3467 ldout(cct
, 10) << "waiting for caps " << *in
<< " need " << ccap_string(need
) << " want " << ccap_string(want
) << dendl
;
3468 waitfor_caps
= true;
3471 if ((need
& CEPH_CAP_FILE_WR
) && in
->auth_cap
&&
3472 in
->auth_cap
->session
->readonly
)
3473 return -CEPHFS_EROFS
;
3475 if (in
->flags
& I_CAP_DROPPED
) {
3476 int mds_wanted
= in
->caps_mds_wanted();
3477 if ((mds_wanted
& need
) != need
) {
3478 int ret
= _renew_caps(in
);
3483 if (!(file_wanted
& ~mds_wanted
))
3484 in
->flags
&= ~I_CAP_DROPPED
;
3488 wait_on_list(in
->waitfor_caps
);
3489 else if (waitfor_commit
)
3490 wait_on_list(in
->waitfor_commit
);
3494 int Client::get_caps_used(Inode
*in
)
3496 unsigned used
= in
->caps_used();
3497 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3498 !objectcacher
->set_is_empty(&in
->oset
))
3499 used
|= CEPH_CAP_FILE_CACHE
;
3503 void Client::cap_delay_requeue(Inode
*in
)
3505 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3506 in
->hold_caps_until
= ceph_clock_now();
3507 in
->hold_caps_until
+= cct
->_conf
->client_caps_release_delay
;
3508 delayed_list
.push_back(&in
->delay_cap_item
);
3511 void Client::send_cap(Inode
*in
, MetaSession
*session
, Cap
*cap
,
3512 int flags
, int used
, int want
, int retain
,
3513 int flush
, ceph_tid_t flush_tid
)
3515 int held
= cap
->issued
| cap
->implemented
;
3516 int revoking
= cap
->implemented
& ~cap
->issued
;
3517 retain
&= ~revoking
;
3518 int dropping
= cap
->issued
& ~retain
;
3519 int op
= CEPH_CAP_OP_UPDATE
;
3521 ldout(cct
, 10) << __func__
<< " " << *in
3522 << " mds." << session
->mds_num
<< " seq " << cap
->seq
3523 << " used " << ccap_string(used
)
3524 << " want " << ccap_string(want
)
3525 << " flush " << ccap_string(flush
)
3526 << " retain " << ccap_string(retain
)
3527 << " held "<< ccap_string(held
)
3528 << " revoking " << ccap_string(revoking
)
3529 << " dropping " << ccap_string(dropping
)
3532 if (cct
->_conf
->client_inject_release_failure
&& revoking
) {
3533 const int would_have_issued
= cap
->issued
& retain
;
3534 const int would_have_implemented
= cap
->implemented
& (cap
->issued
| used
);
3536 // - tell the server we think issued is whatever they issued plus whatever we implemented
3537 // - leave what we have implemented in place
3538 ldout(cct
, 20) << __func__
<< " injecting failure to release caps" << dendl
;
3539 cap
->issued
= cap
->issued
| cap
->implemented
;
3541 // Make an exception for revoking xattr caps: we are injecting
3542 // failure to release other caps, but allow xattr because client
3543 // will block on xattr ops if it can't release these to MDS (#9800)
3544 const int xattr_mask
= CEPH_CAP_XATTR_SHARED
| CEPH_CAP_XATTR_EXCL
;
3545 cap
->issued
^= xattr_mask
& revoking
;
3546 cap
->implemented
^= xattr_mask
& revoking
;
3548 ldout(cct
, 20) << __func__
<< " issued " << ccap_string(cap
->issued
) << " vs " << ccap_string(would_have_issued
) << dendl
;
3549 ldout(cct
, 20) << __func__
<< " implemented " << ccap_string(cap
->implemented
) << " vs " << ccap_string(would_have_implemented
) << dendl
;
3552 cap
->issued
&= retain
;
3553 cap
->implemented
&= cap
->issued
| used
;
3556 snapid_t follows
= 0;
3559 follows
= in
->snaprealm
->get_snap_context().seq
;
3561 auto m
= make_message
<MClientCaps
>(op
,
3564 cap
->cap_id
, cap
->seq
,
3570 m
->caller_uid
= in
->cap_dirtier_uid
;
3571 m
->caller_gid
= in
->cap_dirtier_gid
;
3573 m
->head
.issue_seq
= cap
->issue_seq
;
3574 m
->set_tid(flush_tid
);
3576 m
->head
.uid
= in
->uid
;
3577 m
->head
.gid
= in
->gid
;
3578 m
->head
.mode
= in
->mode
;
3580 m
->head
.nlink
= in
->nlink
;
3582 if (flush
& CEPH_CAP_XATTR_EXCL
) {
3583 encode(in
->xattrs
, m
->xattrbl
);
3584 m
->head
.xattr_version
= in
->xattr_version
;
3588 m
->max_size
= in
->max_size
;
3589 m
->truncate_seq
= in
->truncate_seq
;
3590 m
->truncate_size
= in
->truncate_size
;
3591 m
->mtime
= in
->mtime
;
3592 m
->atime
= in
->atime
;
3593 m
->ctime
= in
->ctime
;
3594 m
->btime
= in
->btime
;
3595 m
->time_warp_seq
= in
->time_warp_seq
;
3596 m
->change_attr
= in
->change_attr
;
3598 if (!(flags
& MClientCaps::FLAG_PENDING_CAPSNAP
) &&
3599 !in
->cap_snaps
.empty() &&
3600 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3601 flags
|= MClientCaps::FLAG_PENDING_CAPSNAP
;
3604 if (flush
& CEPH_CAP_FILE_WR
) {
3605 m
->inline_version
= in
->inline_version
;
3606 m
->inline_data
= in
->inline_data
;
3609 in
->reported_size
= in
->size
;
3610 m
->set_snap_follows(follows
);
3612 if (cap
== in
->auth_cap
) {
3613 if (want
& CEPH_CAP_ANY_FILE_WR
) {
3614 m
->set_max_size(in
->wanted_max_size
);
3615 in
->requested_max_size
= in
->wanted_max_size
;
3616 ldout(cct
, 15) << "auth cap, requesting max_size " << in
->requested_max_size
<< dendl
;
3618 in
->requested_max_size
= 0;
3619 ldout(cct
, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl
;
3623 if (!session
->flushing_caps_tids
.empty())
3624 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3626 session
->con
->send_message2(std::move(m
));
3629 static bool is_max_size_approaching(Inode
*in
)
3631 /* mds will adjust max size according to the reported size */
3632 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3634 if (in
->size
>= in
->max_size
)
3636 /* half of previous max_size increment has been used */
3637 if (in
->max_size
> in
->reported_size
&&
3638 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3643 static int adjust_caps_used_for_lazyio(int used
, int issued
, int implemented
)
3645 if (!(used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
)))
3647 if (!(implemented
& CEPH_CAP_FILE_LAZYIO
))
3650 if (issued
& CEPH_CAP_FILE_LAZYIO
) {
3651 if (!(issued
& CEPH_CAP_FILE_CACHE
)) {
3652 used
&= ~CEPH_CAP_FILE_CACHE
;
3653 used
|= CEPH_CAP_FILE_LAZYIO
;
3655 if (!(issued
& CEPH_CAP_FILE_BUFFER
)) {
3656 used
&= ~CEPH_CAP_FILE_BUFFER
;
3657 used
|= CEPH_CAP_FILE_LAZYIO
;
3660 if (!(implemented
& CEPH_CAP_FILE_CACHE
)) {
3661 used
&= ~CEPH_CAP_FILE_CACHE
;
3662 used
|= CEPH_CAP_FILE_LAZYIO
;
3664 if (!(implemented
& CEPH_CAP_FILE_BUFFER
)) {
3665 used
&= ~CEPH_CAP_FILE_BUFFER
;
3666 used
|= CEPH_CAP_FILE_LAZYIO
;
3675 * Examine currently used and wanted versus held caps. Release, flush or ack
3676 * revoked caps to the MDS as appropriate.
3678 * @param in the inode to check
3679 * @param flags flags to apply to cap check
3681 void Client::check_caps(Inode
*in
, unsigned flags
)
3683 unsigned wanted
= in
->caps_wanted();
3684 unsigned used
= get_caps_used(in
);
3688 int issued
= in
->caps_issued(&implemented
);
3689 int revoking
= implemented
& ~issued
;
3691 int orig_used
= used
;
3692 used
= adjust_caps_used_for_lazyio(used
, issued
, implemented
);
3694 int retain
= wanted
| used
| CEPH_CAP_PIN
;
3695 if (!is_unmounting() && in
->nlink
> 0) {
3697 retain
|= CEPH_CAP_ANY
;
3698 } else if (in
->is_dir() &&
3699 (issued
& CEPH_CAP_FILE_SHARED
) &&
3700 (in
->flags
& I_COMPLETE
)) {
3701 // we do this here because we don't want to drop to Fs (and then
3702 // drop the Fs if we do a create!) if that alone makes us send lookups
3703 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3704 wanted
= CEPH_CAP_ANY_SHARED
| CEPH_CAP_FILE_EXCL
;
3707 retain
|= CEPH_CAP_ANY_SHARED
;
3708 // keep RD only if we didn't have the file open RW,
3709 // because then the mds would revoke it anyway to
3710 // journal max_size=0.
3711 if (in
->max_size
== 0)
3712 retain
|= CEPH_CAP_ANY_RD
;
3716 ldout(cct
, 10) << __func__
<< " on " << *in
3717 << " wanted " << ccap_string(wanted
)
3718 << " used " << ccap_string(used
)
3719 << " issued " << ccap_string(issued
)
3720 << " revoking " << ccap_string(revoking
)
3721 << " flags=" << flags
3724 if (in
->snapid
!= CEPH_NOSNAP
)
3725 return; //snap caps last forever, can't write
3727 if (in
->caps
.empty())
3728 return; // guard if at end of func
3730 if (!(orig_used
& CEPH_CAP_FILE_BUFFER
) &&
3731 (revoking
& used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
3733 used
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
3737 for (auto &p
: in
->caps
) {
3738 mds_rank_t mds
= p
.first
;
3739 Cap
&cap
= p
.second
;
3741 MetaSession
*session
= &mds_sessions
.at(mds
);
3744 if (in
->auth_cap
&& &cap
!= in
->auth_cap
)
3745 cap_used
&= ~in
->auth_cap
->issued
;
3747 revoking
= cap
.implemented
& ~cap
.issued
;
3749 ldout(cct
, 10) << " cap mds." << mds
3750 << " issued " << ccap_string(cap
.issued
)
3751 << " implemented " << ccap_string(cap
.implemented
)
3752 << " revoking " << ccap_string(revoking
) << dendl
;
3754 if (in
->wanted_max_size
> in
->max_size
&&
3755 in
->wanted_max_size
> in
->requested_max_size
&&
3756 &cap
== in
->auth_cap
)
3759 /* approaching file_max? */
3760 if ((cap
.issued
& CEPH_CAP_FILE_WR
) &&
3761 &cap
== in
->auth_cap
&&
3762 is_max_size_approaching(in
)) {
3763 ldout(cct
, 10) << "size " << in
->size
<< " approaching max_size " << in
->max_size
3764 << ", reported " << in
->reported_size
<< dendl
;
3768 /* completed revocation? */
3769 if (revoking
&& (revoking
& cap_used
) == 0) {
3770 ldout(cct
, 10) << "completed revocation of " << ccap_string(cap
.implemented
& ~cap
.issued
) << dendl
;
3774 /* want more caps from mds? */
3775 if (wanted
& ~(cap
.wanted
| cap
.issued
))
3778 if (!revoking
&& is_unmounting() && (cap_used
== 0))
3781 if ((cap
.issued
& ~retain
) == 0 && // and we don't have anything we wouldn't like
3782 !in
->dirty_caps
) // and we have no dirty caps
3785 if (!(flags
& CHECK_CAPS_NODELAY
)) {
3786 ldout(cct
, 10) << "delaying cap release" << dendl
;
3787 cap_delay_requeue(in
);
3792 if (&cap
== in
->auth_cap
) {
3793 if (in
->flags
& I_KICK_FLUSH
) {
3794 ldout(cct
, 20) << " reflushing caps (check_caps) on " << *in
3795 << " to mds." << mds
<< dendl
;
3796 kick_flushing_caps(in
, session
);
3798 if (!in
->cap_snaps
.empty() &&
3799 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3805 ceph_tid_t flush_tid
;
3806 if (in
->auth_cap
== &cap
&& in
->dirty_caps
) {
3807 flushing
= mark_caps_flushing(in
, &flush_tid
);
3808 if (flags
& CHECK_CAPS_SYNCHRONOUS
)
3809 msg_flags
|= MClientCaps::FLAG_SYNC
;
3815 send_cap(in
, session
, &cap
, msg_flags
, cap_used
, wanted
, retain
,
3816 flushing
, flush_tid
);
3821 void Client::queue_cap_snap(Inode
*in
, SnapContext
& old_snapc
)
3823 int used
= get_caps_used(in
);
3824 int dirty
= in
->caps_dirty();
3825 ldout(cct
, 10) << __func__
<< " " << *in
<< " snapc " << old_snapc
<< " used " << ccap_string(used
) << dendl
;
3827 if (in
->cap_snaps
.size() &&
3828 in
->cap_snaps
.rbegin()->second
.writing
) {
3829 ldout(cct
, 10) << __func__
<< " already have pending cap_snap on " << *in
<< dendl
;
3831 } else if (in
->caps_dirty() ||
3832 (used
& CEPH_CAP_FILE_WR
) ||
3833 (dirty
& CEPH_CAP_ANY_WR
)) {
3834 const auto &capsnapem
= in
->cap_snaps
.emplace(std::piecewise_construct
, std::make_tuple(old_snapc
.seq
), std::make_tuple(in
));
3835 ceph_assert(capsnapem
.second
); /* element inserted */
3836 CapSnap
&capsnap
= capsnapem
.first
->second
;
3837 capsnap
.context
= old_snapc
;
3838 capsnap
.issued
= in
->caps_issued();
3839 capsnap
.dirty
= in
->caps_dirty();
3841 capsnap
.dirty_data
= (used
& CEPH_CAP_FILE_BUFFER
);
3843 capsnap
.uid
= in
->uid
;
3844 capsnap
.gid
= in
->gid
;
3845 capsnap
.mode
= in
->mode
;
3846 capsnap
.btime
= in
->btime
;
3847 capsnap
.xattrs
= in
->xattrs
;
3848 capsnap
.xattr_version
= in
->xattr_version
;
3849 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3850 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3852 if (used
& CEPH_CAP_FILE_WR
) {
3853 ldout(cct
, 10) << __func__
<< " WR used on " << *in
<< dendl
;
3854 capsnap
.writing
= 1;
3856 finish_cap_snap(in
, capsnap
, used
);
3859 ldout(cct
, 10) << __func__
<< " not dirty|writing on " << *in
<< dendl
;
3863 void Client::finish_cap_snap(Inode
*in
, CapSnap
&capsnap
, int used
)
3865 ldout(cct
, 10) << __func__
<< " " << *in
<< " capsnap " << (void *)&capsnap
<< " used " << ccap_string(used
) << dendl
;
3866 capsnap
.size
= in
->size
;
3867 capsnap
.mtime
= in
->mtime
;
3868 capsnap
.atime
= in
->atime
;
3869 capsnap
.ctime
= in
->ctime
;
3870 capsnap
.time_warp_seq
= in
->time_warp_seq
;
3871 capsnap
.change_attr
= in
->change_attr
;
3872 capsnap
.dirty
|= in
->caps_dirty();
3874 /* Only reset it if it wasn't set before */
3875 if (capsnap
.cap_dirtier_uid
== -1) {
3876 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3877 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3880 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3881 capsnap
.inline_data
= in
->inline_data
;
3882 capsnap
.inline_version
= in
->inline_version
;
3885 if (used
& CEPH_CAP_FILE_BUFFER
) {
3886 capsnap
.writing
= 1;
3887 ldout(cct
, 10) << __func__
<< " " << *in
<< " cap_snap " << &capsnap
<< " used " << used
3888 << " WRBUFFER, delaying" << dendl
;
3890 capsnap
.dirty_data
= 0;
3895 void Client::send_flush_snap(Inode
*in
, MetaSession
*session
,
3896 snapid_t follows
, CapSnap
& capsnap
)
3898 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_FLUSHSNAP
,
3899 in
->ino
, in
->snaprealm
->ino
, 0,
3900 in
->auth_cap
->mseq
, cap_epoch_barrier
);
3901 m
->caller_uid
= capsnap
.cap_dirtier_uid
;
3902 m
->caller_gid
= capsnap
.cap_dirtier_gid
;
3904 m
->set_client_tid(capsnap
.flush_tid
);
3905 m
->head
.snap_follows
= follows
;
3907 m
->head
.caps
= capsnap
.issued
;
3908 m
->head
.dirty
= capsnap
.dirty
;
3910 m
->head
.uid
= capsnap
.uid
;
3911 m
->head
.gid
= capsnap
.gid
;
3912 m
->head
.mode
= capsnap
.mode
;
3913 m
->btime
= capsnap
.btime
;
3915 m
->size
= capsnap
.size
;
3917 m
->head
.xattr_version
= capsnap
.xattr_version
;
3918 encode(capsnap
.xattrs
, m
->xattrbl
);
3920 m
->ctime
= capsnap
.ctime
;
3921 m
->btime
= capsnap
.btime
;
3922 m
->mtime
= capsnap
.mtime
;
3923 m
->atime
= capsnap
.atime
;
3924 m
->time_warp_seq
= capsnap
.time_warp_seq
;
3925 m
->change_attr
= capsnap
.change_attr
;
3927 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3928 m
->inline_version
= in
->inline_version
;
3929 m
->inline_data
= in
->inline_data
;
3932 ceph_assert(!session
->flushing_caps_tids
.empty());
3933 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3935 session
->con
->send_message2(std::move(m
));
3938 void Client::flush_snaps(Inode
*in
)
3940 ldout(cct
, 10) << "flush_snaps on " << *in
<< dendl
;
3941 ceph_assert(in
->cap_snaps
.size());
3944 ceph_assert(in
->auth_cap
);
3945 MetaSession
*session
= in
->auth_cap
->session
;
3947 for (auto &p
: in
->cap_snaps
) {
3948 CapSnap
&capsnap
= p
.second
;
3949 // only do new flush
3950 if (capsnap
.flush_tid
> 0)
3953 ldout(cct
, 10) << "flush_snaps mds." << session
->mds_num
3954 << " follows " << p
.first
3955 << " size " << capsnap
.size
3956 << " mtime " << capsnap
.mtime
3957 << " dirty_data=" << capsnap
.dirty_data
3958 << " writing=" << capsnap
.writing
3959 << " on " << *in
<< dendl
;
3960 if (capsnap
.dirty_data
|| capsnap
.writing
)
3963 capsnap
.flush_tid
= ++last_flush_tid
;
3964 session
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
3965 in
->flushing_cap_tids
[capsnap
.flush_tid
] = 0;
3966 if (!in
->flushing_cap_item
.is_on_list())
3967 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
3969 send_flush_snap(in
, session
, p
.first
, capsnap
);
3973 void Client::wait_on_list(list
<ceph::condition_variable
*>& ls
)
3975 ceph::condition_variable cond
;
3976 ls
.push_back(&cond
);
3977 std::unique_lock l
{client_lock
, std::adopt_lock
};
3983 void Client::signal_cond_list(list
<ceph::condition_variable
*>& ls
)
3985 for (auto cond
: ls
) {
3990 void Client::wait_on_context_list(list
<Context
*>& ls
)
3992 ceph::condition_variable cond
;
3995 ls
.push_back(new C_Cond(cond
, &done
, &r
));
3996 std::unique_lock l
{client_lock
, std::adopt_lock
};
3997 cond
.wait(l
, [&done
] { return done
;});
4001 void Client::signal_context_list(list
<Context
*>& ls
)
4003 while (!ls
.empty()) {
4004 ls
.front()->complete(0);
4009 void Client::wake_up_session_caps(MetaSession
*s
, bool reconnect
)
4011 for (const auto &cap
: s
->caps
) {
4012 auto &in
= cap
->inode
;
4014 in
.requested_max_size
= 0;
4015 in
.wanted_max_size
= 0;
4017 if (cap
->gen
< s
->cap_gen
) {
4018 // mds did not re-issue stale cap.
4019 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
4020 // make sure mds knows what we want.
4021 if (in
.caps_file_wanted() & ~cap
->wanted
)
4022 in
.flags
|= I_CAP_DROPPED
;
4025 signal_cond_list(in
.waitfor_caps
);
4030 // flush dirty data (from objectcache)
4032 class C_Client_CacheInvalidate
: public Context
{
4036 int64_t offset
, length
;
4038 C_Client_CacheInvalidate(Client
*c
, Inode
*in
, int64_t off
, int64_t len
) :
4039 client(c
), offset(off
), length(len
) {
4040 if (client
->use_faked_inos())
4041 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4045 void finish(int r
) override
{
4046 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
4047 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4048 client
->_async_invalidate(ino
, offset
, length
);
4052 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
4054 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4055 if (!mref_reader
.is_state_satisfied())
4058 ldout(cct
, 10) << __func__
<< " " << ino
<< " " << off
<< "~" << len
<< dendl
;
4059 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
4062 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
4064 if (ino_invalidate_cb
)
4065 // we queue the invalidate, which calls the callback and decrements the ref
4066 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
4069 void Client::_invalidate_inode_cache(Inode
*in
)
4071 ldout(cct
, 10) << __func__
<< " " << *in
<< dendl
;
4073 // invalidate our userspace inode cache
4074 if (cct
->_conf
->client_oc
) {
4075 objectcacher
->release_set(&in
->oset
);
4076 if (!objectcacher
->set_is_empty(&in
->oset
))
4077 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
4080 _schedule_invalidate_callback(in
, 0, 0);
4083 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
4085 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
4087 // invalidate our userspace inode cache
4088 if (cct
->_conf
->client_oc
) {
4089 vector
<ObjectExtent
> ls
;
4090 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
4091 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
4094 _schedule_invalidate_callback(in
, off
, len
);
4097 bool Client::_release(Inode
*in
)
4099 ldout(cct
, 20) << "_release " << *in
<< dendl
;
4100 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
4101 _invalidate_inode_cache(in
);
4107 bool Client::_flush(Inode
*in
, Context
*onfinish
)
4109 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
4111 if (!in
->oset
.dirty_or_tx
) {
4112 ldout(cct
, 10) << " nothing to flush" << dendl
;
4113 onfinish
->complete(0);
4117 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
4118 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
4119 objectcacher
->purge_set(&in
->oset
);
4121 onfinish
->complete(-CEPHFS_ENOSPC
);
4126 return objectcacher
->flush_set(&in
->oset
, onfinish
);
4129 void Client::_flush_range(Inode
*in
, int64_t offset
, uint64_t size
)
4131 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
4132 if (!in
->oset
.dirty_or_tx
) {
4133 ldout(cct
, 10) << " nothing to flush" << dendl
;
4137 C_SaferCond
onflush("Client::_flush_range flock");
4138 bool ret
= objectcacher
->file_flush(&in
->oset
, &in
->layout
, in
->snaprealm
->get_snap_context(),
4139 offset
, size
, &onflush
);
4142 client_lock
.unlock();
4148 void Client::flush_set_callback(ObjectCacher::ObjectSet
*oset
)
4150 // std::scoped_lock l(client_lock);
4151 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
)); // will be called via dispatch() -> objecter -> ...
4152 Inode
*in
= static_cast<Inode
*>(oset
->parent
);
4157 void Client::_flushed(Inode
*in
)
4159 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
4161 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
4166 // checks common to add_update_cap, handle_cap_grant
4167 void Client::check_cap_issue(Inode
*in
, unsigned issued
)
4169 unsigned had
= in
->caps_issued();
4171 if ((issued
& CEPH_CAP_FILE_CACHE
) &&
4172 !(had
& CEPH_CAP_FILE_CACHE
))
4175 if ((issued
& CEPH_CAP_FILE_SHARED
) !=
4176 (had
& CEPH_CAP_FILE_SHARED
)) {
4177 if (issued
& CEPH_CAP_FILE_SHARED
)
4180 clear_dir_complete_and_ordered(in
, true);
4184 void Client::add_update_cap(Inode
*in
, MetaSession
*mds_session
, uint64_t cap_id
,
4185 unsigned issued
, unsigned wanted
, unsigned seq
, unsigned mseq
,
4186 inodeno_t realm
, int flags
, const UserPerm
& cap_perms
)
4188 if (!in
->is_any_caps()) {
4189 ceph_assert(in
->snaprealm
== 0);
4190 in
->snaprealm
= get_snap_realm(realm
);
4191 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4192 ldout(cct
, 15) << __func__
<< " first one, opened snaprealm " << in
->snaprealm
<< dendl
;
4194 ceph_assert(in
->snaprealm
);
4195 if ((flags
& CEPH_CAP_FLAG_AUTH
) &&
4196 realm
!= inodeno_t(-1) && in
->snaprealm
->ino
!= realm
) {
4197 in
->snaprealm_item
.remove_myself();
4198 auto oldrealm
= in
->snaprealm
;
4199 in
->snaprealm
= get_snap_realm(realm
);
4200 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4201 put_snap_realm(oldrealm
);
4205 mds_rank_t mds
= mds_session
->mds_num
;
4206 const auto &capem
= in
->caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
), std::forward_as_tuple(*in
, mds_session
));
4207 Cap
&cap
= capem
.first
->second
;
4208 if (!capem
.second
) {
4209 if (cap
.gen
< mds_session
->cap_gen
)
4210 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
4213 * auth mds of the inode changed. we received the cap export
4214 * message, but still haven't received the cap import message.
4215 * handle_cap_export() updated the new auth MDS' cap.
4217 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4218 * a message that was send before the cap import message. So
4219 * don't remove caps.
4221 if (ceph_seq_cmp(seq
, cap
.seq
) <= 0) {
4222 if (&cap
!= in
->auth_cap
)
4223 ldout(cct
, 0) << "WARNING: " << "inode " << *in
<< " caps on mds." << mds
<< " != auth_cap." << dendl
;
4225 ceph_assert(cap
.cap_id
== cap_id
);
4228 issued
|= cap
.issued
;
4229 flags
|= CEPH_CAP_FLAG_AUTH
;
4235 check_cap_issue(in
, issued
);
4237 if (flags
& CEPH_CAP_FLAG_AUTH
) {
4238 if (in
->auth_cap
!= &cap
&&
4239 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
4240 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
4241 ldout(cct
, 10) << __func__
<< " changing auth cap: "
4242 << "add myself to new auth MDS' flushing caps list" << dendl
;
4243 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
4245 in
->auth_cap
= &cap
;
4249 unsigned old_caps
= cap
.issued
;
4250 cap
.cap_id
= cap_id
;
4251 cap
.issued
= issued
;
4252 cap
.implemented
|= issued
;
4253 if (ceph_seq_cmp(mseq
, cap
.mseq
) > 0)
4254 cap
.wanted
= wanted
;
4256 cap
.wanted
|= wanted
;
4258 cap
.issue_seq
= seq
;
4260 cap
.gen
= mds_session
->cap_gen
;
4261 cap
.latest_perms
= cap_perms
;
4262 ldout(cct
, 10) << __func__
<< " issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
.issued
)
4263 << " from mds." << mds
4267 if ((issued
& ~old_caps
) && in
->auth_cap
== &cap
) {
4268 // non-auth MDS is revoking the newly grant caps ?
4269 for (auto &p
: in
->caps
) {
4270 if (&p
.second
== &cap
)
4272 if (p
.second
.implemented
& ~p
.second
.issued
& issued
) {
4273 check_caps(in
, CHECK_CAPS_NODELAY
);
4279 if (issued
& ~old_caps
)
4280 signal_cond_list(in
->waitfor_caps
);
4283 void Client::remove_cap(Cap
*cap
, bool queue_release
)
4285 auto &in
= cap
->inode
;
4286 MetaSession
*session
= cap
->session
;
4287 mds_rank_t mds
= cap
->session
->mds_num
;
4289 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " on " << in
<< dendl
;
4291 if (queue_release
) {
4292 session
->enqueue_cap_release(
4303 if (in
.auth_cap
== cap
) {
4304 if (in
.flushing_cap_item
.is_on_list()) {
4305 ldout(cct
, 10) << " removing myself from flushing_cap list" << dendl
;
4306 in
.flushing_cap_item
.remove_myself();
4310 size_t n
= in
.caps
.erase(mds
);
4311 ceph_assert(n
== 1);
4314 if (!in
.is_any_caps()) {
4315 ldout(cct
, 15) << __func__
<< " last one, closing snaprealm " << in
.snaprealm
<< dendl
;
4316 in
.snaprealm_item
.remove_myself();
4317 put_snap_realm(in
.snaprealm
);
4322 void Client::remove_all_caps(Inode
*in
)
4324 while (!in
->caps
.empty())
4325 remove_cap(&in
->caps
.begin()->second
, true);
4328 void Client::remove_session_caps(MetaSession
*s
, int err
)
4330 ldout(cct
, 10) << __func__
<< " mds." << s
->mds_num
<< dendl
;
4332 while (s
->caps
.size()) {
4333 Cap
*cap
= *s
->caps
.begin();
4334 InodeRef
in(&cap
->inode
);
4335 bool dirty_caps
= false;
4336 if (in
->auth_cap
== cap
) {
4337 dirty_caps
= in
->dirty_caps
| in
->flushing_caps
;
4338 in
->wanted_max_size
= 0;
4339 in
->requested_max_size
= 0;
4340 if (in
->has_any_filelocks())
4341 in
->flags
|= I_ERROR_FILELOCK
;
4343 auto caps
= cap
->implemented
;
4344 if (cap
->wanted
| cap
->issued
)
4345 in
->flags
|= I_CAP_DROPPED
;
4346 remove_cap(cap
, false);
4347 in
->cap_snaps
.clear();
4349 lderr(cct
) << __func__
<< " still has dirty|flushing caps on " << *in
<< dendl
;
4350 if (in
->flushing_caps
) {
4351 num_flushing_caps
--;
4352 in
->flushing_cap_tids
.clear();
4354 in
->flushing_caps
= 0;
4355 in
->mark_caps_clean();
4356 put_inode(in
.get());
4358 caps
&= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
;
4359 if (caps
&& !in
->caps_issued_mask(caps
, true)) {
4360 if (err
== -CEPHFS_EBLOCKLISTED
) {
4361 if (in
->oset
.dirty_or_tx
) {
4362 lderr(cct
) << __func__
<< " still has dirty data on " << *in
<< dendl
;
4363 in
->set_async_err(err
);
4365 objectcacher
->purge_set(&in
->oset
);
4367 objectcacher
->release_set(&in
->oset
);
4369 _schedule_invalidate_callback(in
.get(), 0, 0);
4372 signal_cond_list(in
->waitfor_caps
);
4374 s
->flushing_caps_tids
.clear();
4375 sync_cond
.notify_all();
4378 int Client::_do_remount(bool retry_on_error
)
4380 uint64_t max_retries
= cct
->_conf
.get_val
<uint64_t>("mds_max_retries_on_remount_failure");
4383 int r
= remount_cb(callback_handle
);
4385 retries_on_invalidate
= 0;
4388 client_t whoami
= get_nodeid();
4391 "failed to remount (to trim kernel dentries): "
4392 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4395 "failed to remount (to trim kernel dentries): "
4396 "return code = " << r
<< dendl
;
4399 (cct
->_conf
.get_val
<bool>("client_die_on_failed_remount") ||
4400 cct
->_conf
.get_val
<bool>("client_die_on_failed_dentry_invalidate")) &&
4401 !(retry_on_error
&& (++retries_on_invalidate
< max_retries
));
4402 if (should_abort
&& !is_unmounting()) {
4403 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4410 class C_Client_Remount
: public Context
{
4414 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4415 void finish(int r
) override
{
4416 ceph_assert(r
== 0);
4417 client
->_do_remount(true);
4421 void Client::_invalidate_kernel_dcache()
4423 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4424 if (!mref_reader
.is_state_satisfied())
4427 if (can_invalidate_dentries
) {
4428 if (dentry_invalidate_cb
&& root
->dir
) {
4429 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4430 p
!= root
->dir
->dentries
.end();
4432 if (p
->second
->inode
)
4433 _schedule_invalidate_dentry_callback(p
->second
, false);
4436 } else if (remount_cb
) {
4438 // when remounting a file system, linux kernel trims all unused dentries in the fs
4439 remount_finisher
.queue(new C_Client_Remount(this));
4443 void Client::_trim_negative_child_dentries(InodeRef
& in
)
4449 if (dir
&& dir
->dentries
.size() == dir
->num_null_dentries
) {
4450 for (auto p
= dir
->dentries
.begin(); p
!= dir
->dentries
.end(); ) {
4451 Dentry
*dn
= p
->second
;
4453 ceph_assert(!dn
->inode
);
4454 if (dn
->lru_is_expireable())
4455 unlink(dn
, true, false); // keep dir, drop dentry
4457 if (dir
->dentries
.empty()) {
4462 if (in
->flags
& I_SNAPDIR_OPEN
) {
4463 InodeRef snapdir
= open_snapdir(in
.get());
4464 _trim_negative_child_dentries(snapdir
);
4468 class C_Client_CacheRelease
: public Context
{
4473 C_Client_CacheRelease(Client
*c
, Inode
*in
) :
4475 if (client
->use_faked_inos())
4476 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4480 void finish(int r
) override
{
4481 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4482 client
->_async_inode_release(ino
);
4486 void Client::_async_inode_release(vinodeno_t ino
)
4488 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4489 if (!mref_reader
.is_state_satisfied())
4492 ldout(cct
, 10) << __func__
<< " " << ino
<< dendl
;
4493 ino_release_cb(callback_handle
, ino
);
4496 void Client::_schedule_ino_release_callback(Inode
*in
) {
4499 // we queue the invalidate, which calls the callback and decrements the ref
4500 async_ino_releasor
.queue(new C_Client_CacheRelease(this, in
));
4503 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4505 mds_rank_t mds
= s
->mds_num
;
4506 size_t caps_size
= s
->caps
.size();
4507 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4508 << " caps " << caps_size
<< dendl
;
4510 uint64_t trimmed
= 0;
4511 auto p
= s
->caps
.begin();
4512 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4513 * looking at from getting deleted during traversal. */
4514 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4516 InodeRef
in(&cap
->inode
);
4518 // Increment p early because it will be invalidated if cap
4519 // is deleted inside remove_cap
4522 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4523 int mine
= cap
->issued
| cap
->implemented
;
4524 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4525 // disposable non-auth cap
4526 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4527 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4528 cap
= (remove_cap(cap
, true), nullptr);
4532 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4533 _trim_negative_child_dentries(in
);
4535 auto q
= in
->dentries
.begin();
4536 while (q
!= in
->dentries
.end()) {
4539 if (dn
->lru_is_expireable()) {
4540 if (can_invalidate_dentries
&&
4541 dn
->dir
->parent_inode
->ino
== MDS_INO_ROOT
) {
4542 // Only issue one of these per DN for inodes in root: handle
4543 // others more efficiently by calling for root-child DNs at
4544 // the end of this function.
4545 _schedule_invalidate_dentry_callback(dn
, true);
4547 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4550 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4554 if (in
->ll_ref
== 1 && in
->ino
!= MDS_INO_ROOT
) {
4555 _schedule_ino_release_callback(in
.get());
4557 if (all
&& in
->ino
!= MDS_INO_ROOT
) {
4558 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4563 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4564 for (const auto &dn
: to_trim
) {
4569 caps_size
= s
->caps
.size();
4570 if (caps_size
> (size_t)max
)
4571 _invalidate_kernel_dcache();
4574 void Client::force_session_readonly(MetaSession
*s
)
4577 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4578 auto &in
= (*p
)->inode
;
4579 if (in
.caps_wanted() & CEPH_CAP_FILE_WR
)
4580 signal_cond_list(in
.waitfor_caps
);
4584 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4586 MetaSession
*session
= in
->auth_cap
->session
;
4588 int flushing
= in
->dirty_caps
;
4589 ceph_assert(flushing
);
4591 ceph_tid_t flush_tid
= ++last_flush_tid
;
4592 in
->flushing_cap_tids
[flush_tid
] = flushing
;
4594 if (!in
->flushing_caps
) {
4595 ldout(cct
, 10) << __func__
<< " " << ccap_string(flushing
) << " " << *in
<< dendl
;
4596 num_flushing_caps
++;
4598 ldout(cct
, 10) << __func__
<< " (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4601 in
->flushing_caps
|= flushing
;
4602 in
->mark_caps_clean();
4604 if (!in
->flushing_cap_item
.is_on_list())
4605 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4606 session
->flushing_caps_tids
.insert(flush_tid
);
4612 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4614 for (auto &p
: in
->cap_snaps
) {
4615 CapSnap
&capsnap
= p
.second
;
4616 if (capsnap
.flush_tid
> 0) {
4617 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4618 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4621 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4622 it
!= in
->flushing_cap_tids
.end();
4624 old_s
->flushing_caps_tids
.erase(it
->first
);
4625 new_s
->flushing_caps_tids
.insert(it
->first
);
4627 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4631 * Flush all caps back to the MDS. Because the callers generally wait on the
4632 * result of this function (syncfs and umount cases), we set
4633 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4635 void Client::flush_caps_sync()
4637 ldout(cct
, 10) << __func__
<< dendl
;
4638 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
4640 unsigned flags
= CHECK_CAPS_NODELAY
;
4644 delayed_list
.pop_front();
4645 if (p
.end() && dirty_list
.empty())
4646 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4647 check_caps(in
, flags
);
4651 p
= dirty_list
.begin();
4653 unsigned flags
= CHECK_CAPS_NODELAY
;
4658 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4659 check_caps(in
, flags
);
4663 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4665 while (in
->flushing_caps
) {
4666 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4667 ceph_assert(it
!= in
->flushing_cap_tids
.end());
4668 if (it
->first
> want
)
4670 ldout(cct
, 10) << __func__
<< " on " << *in
<< " flushing "
4671 << ccap_string(it
->second
) << " want " << want
4672 << " last " << it
->first
<< dendl
;
4673 wait_on_list(in
->waitfor_caps
);
4677 void Client::wait_sync_caps(ceph_tid_t want
)
4680 ldout(cct
, 10) << __func__
<< " want " << want
<< " (last is " << last_flush_tid
<< ", "
4681 << num_flushing_caps
<< " total flushing)" << dendl
;
4682 for (auto &p
: mds_sessions
) {
4683 MetaSession
*s
= &p
.second
;
4684 if (s
->flushing_caps_tids
.empty())
4686 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4687 if (oldest_tid
<= want
) {
4688 ldout(cct
, 10) << " waiting on mds." << p
.first
<< " tid " << oldest_tid
4689 << " (want " << want
<< ")" << dendl
;
4690 std::unique_lock l
{client_lock
, std::adopt_lock
};
4698 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4700 in
->flags
&= ~I_KICK_FLUSH
;
4702 Cap
*cap
= in
->auth_cap
;
4703 ceph_assert(cap
->session
== session
);
4705 ceph_tid_t last_snap_flush
= 0;
4706 for (auto p
= in
->flushing_cap_tids
.rbegin();
4707 p
!= in
->flushing_cap_tids
.rend();
4710 last_snap_flush
= p
->first
;
4715 int wanted
= in
->caps_wanted();
4716 int used
= get_caps_used(in
) | in
->caps_dirty();
4717 auto it
= in
->cap_snaps
.begin();
4718 for (auto& p
: in
->flushing_cap_tids
) {
4720 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4721 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
4724 ceph_assert(it
!= in
->cap_snaps
.end());
4725 ceph_assert(it
->second
.flush_tid
== p
.first
);
4726 send_flush_snap(in
, session
, it
->first
, it
->second
);
4732 void Client::kick_flushing_caps(MetaSession
*session
)
4734 mds_rank_t mds
= session
->mds_num
;
4735 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4737 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4739 if (in
->flags
& I_KICK_FLUSH
) {
4740 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4741 kick_flushing_caps(in
, session
);
4746 void Client::early_kick_flushing_caps(MetaSession
*session
)
4748 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4750 Cap
*cap
= in
->auth_cap
;
4753 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4754 // stage. This guarantees that MDS processes the cap flush message before issuing
4755 // the flushing caps to other client.
4756 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
4757 in
->flags
|= I_KICK_FLUSH
;
4761 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4762 << " to mds." << session
->mds_num
<< dendl
;
4763 // send_reconnect() also will reset these sequence numbers. make sure
4764 // sequence numbers in cap flush message match later reconnect message.
4768 cap
->issued
= cap
->implemented
;
4770 kick_flushing_caps(in
, session
);
4774 void SnapRealm::build_snap_context()
4776 set
<snapid_t
> snaps
;
4777 snapid_t max_seq
= seq
;
4779 // start with prior_parents?
4780 for (unsigned i
=0; i
<prior_parent_snaps
.size(); i
++)
4781 snaps
.insert(prior_parent_snaps
[i
]);
4783 // current parent's snaps
4785 const SnapContext
& psnapc
= pparent
->get_snap_context();
4786 for (unsigned i
=0; i
<psnapc
.snaps
.size(); i
++)
4787 if (psnapc
.snaps
[i
] >= parent_since
)
4788 snaps
.insert(psnapc
.snaps
[i
]);
4789 if (psnapc
.seq
> max_seq
)
4790 max_seq
= psnapc
.seq
;
4794 for (unsigned i
=0; i
<my_snaps
.size(); i
++)
4795 snaps
.insert(my_snaps
[i
]);
4798 cached_snap_context
.seq
= max_seq
;
4799 cached_snap_context
.snaps
.resize(0);
4800 cached_snap_context
.snaps
.reserve(snaps
.size());
4801 for (set
<snapid_t
>::reverse_iterator p
= snaps
.rbegin(); p
!= snaps
.rend(); ++p
)
4802 cached_snap_context
.snaps
.push_back(*p
);
4805 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
4810 while (!q
.empty()) {
4814 ldout(cct
, 10) << __func__
<< " " << *realm
<< dendl
;
4815 realm
->invalidate_cache();
4817 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4818 p
!= realm
->pchildren
.end();
4824 SnapRealm
*Client::get_snap_realm(inodeno_t r
)
4826 SnapRealm
*realm
= snap_realms
[r
];
4828 snap_realms
[r
] = realm
= new SnapRealm(r
);
4829 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4834 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4836 if (snap_realms
.count(r
) == 0) {
4837 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
4840 SnapRealm
*realm
= snap_realms
[r
];
4841 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4846 void Client::put_snap_realm(SnapRealm
*realm
)
4848 ldout(cct
, 20) << __func__
<< " " << realm
->ino
<< " " << realm
4849 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
4850 if (--realm
->nref
== 0) {
4851 snap_realms
.erase(realm
->ino
);
4852 if (realm
->pparent
) {
4853 realm
->pparent
->pchildren
.erase(realm
);
4854 put_snap_realm(realm
->pparent
);
4860 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
4862 if (realm
->parent
!= parent
) {
4863 ldout(cct
, 10) << __func__
<< " " << *realm
4864 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
4865 realm
->parent
= parent
;
4866 if (realm
->pparent
) {
4867 realm
->pparent
->pchildren
.erase(realm
);
4868 put_snap_realm(realm
->pparent
);
4870 realm
->pparent
= get_snap_realm(parent
);
4871 realm
->pparent
->pchildren
.insert(realm
);
4877 static bool has_new_snaps(const SnapContext
& old_snapc
,
4878 const SnapContext
& new_snapc
)
4880 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
4884 void Client::update_snap_trace(const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
4886 SnapRealm
*first_realm
= NULL
;
4887 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
4889 map
<SnapRealm
*, SnapContext
> dirty_realms
;
4891 auto p
= bl
.cbegin();
4895 SnapRealm
*realm
= get_snap_realm(info
.ino());
4897 bool invalidate
= false;
4899 if (info
.seq() > realm
->seq
) {
4900 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
4904 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4905 // flush me + children
4908 while (!q
.empty()) {
4909 SnapRealm
*realm
= q
.front();
4912 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4913 p
!= realm
->pchildren
.end();
4917 if (dirty_realms
.count(realm
) == 0) {
4919 dirty_realms
[realm
] = realm
->get_snap_context();
4925 realm
->seq
= info
.seq();
4926 realm
->created
= info
.created();
4927 realm
->parent_since
= info
.parent_since();
4928 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
4929 realm
->my_snaps
= info
.my_snaps
;
4933 // _always_ verify parent
4934 if (adjust_realm_parent(realm
, info
.parent()))
4938 invalidate_snaprealm_and_children(realm
);
4939 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
4940 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
4942 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
4943 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
4947 first_realm
= realm
;
4949 put_snap_realm(realm
);
4952 for (auto &[realm
, snapc
] : dirty_realms
) {
4953 // if there are new snaps ?
4954 if (has_new_snaps(snapc
, realm
->get_snap_context())) {
4955 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
4956 for (auto&& in
: realm
->inodes_with_caps
) {
4957 queue_cap_snap(in
, snapc
);
4960 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
4962 put_snap_realm(realm
);
4966 *realm_ret
= first_realm
;
4968 put_snap_realm(first_realm
);
4971 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
4973 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
4974 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4976 std::scoped_lock
cl(client_lock
);
4977 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4982 got_mds_push(session
);
4984 map
<Inode
*, SnapContext
> to_move
;
4985 SnapRealm
*realm
= 0;
4987 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
4988 ceph_assert(m
->head
.split
);
4990 auto p
= m
->bl
.cbegin();
4992 ceph_assert(info
.ino() == m
->head
.split
);
4994 // flush, then move, ino's.
4995 realm
= get_snap_realm(info
.ino());
4996 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
4997 for (auto& ino
: m
->split_inos
) {
4998 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
4999 if (inode_map
.count(vino
)) {
5000 Inode
*in
= inode_map
[vino
];
5001 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
5003 if (in
->snaprealm
->created
> info
.created()) {
5004 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
5005 << *in
->snaprealm
<< dendl
;
5008 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
5011 in
->snaprealm_item
.remove_myself();
5012 to_move
[in
] = in
->snaprealm
->get_snap_context();
5013 put_snap_realm(in
->snaprealm
);
5017 // move child snaprealms, too
5018 for (auto& child_realm
: m
->split_realms
) {
5019 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
5020 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
5023 adjust_realm_parent(child
, realm
->ino
);
5024 put_snap_realm(child
);
5028 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
5031 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
5032 Inode
*in
= p
->first
;
5033 in
->snaprealm
= realm
;
5034 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
5036 // queue for snap writeback
5037 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
5038 queue_cap_snap(in
, p
->second
);
5040 put_snap_realm(realm
);
5044 void Client::handle_quota(const MConstRef
<MClientQuota
>& m
)
5046 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5048 std::scoped_lock
cl(client_lock
);
5049 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
5054 got_mds_push(session
);
5056 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << mds
<< dendl
;
5058 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
5059 if (inode_map
.count(vino
)) {
5061 in
= inode_map
[vino
];
5064 in
->quota
= m
->quota
;
5065 in
->rstat
= m
->rstat
;
5070 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
5072 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5074 std::scoped_lock
cl(client_lock
);
5075 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
5080 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
5081 // Pause RADOS operations until we see the required epoch
5082 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
5085 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
5086 // Record the barrier so that we will transmit it to MDS when releasing
5087 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
5090 got_mds_push(session
);
5093 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
5094 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
5097 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
5098 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
5099 session
->enqueue_cap_release(
5106 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
5109 // in case the mds is waiting on e.g. a revocation
5110 flush_cap_releases();
5114 switch (m
->get_op()) {
5115 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
, in
, m
);
5116 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
, in
, m
);
5117 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
, in
, m
);
5120 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
5121 Cap
&cap
= in
->caps
.at(mds
);
5123 switch (m
->get_op()) {
5124 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
, in
, m
);
5125 case CEPH_CAP_OP_IMPORT
:
5126 case CEPH_CAP_OP_REVOKE
:
5127 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
, in
, &cap
, m
);
5128 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
, in
, &cap
, m
);
5131 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
5136 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5138 mds_rank_t mds
= session
->mds_num
;
5140 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5141 << " IMPORT from mds." << mds
<< dendl
;
5143 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
5146 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
5148 cap_perms
= cap
->latest_perms
;
5152 SnapRealm
*realm
= NULL
;
5153 update_snap_trace(m
->snapbl
, &realm
);
5155 int issued
= m
->get_caps();
5156 int wanted
= m
->get_wanted();
5157 add_update_cap(in
, session
, m
->get_cap_id(),
5158 issued
, wanted
, m
->get_seq(), m
->get_mseq(),
5159 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
5161 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
5162 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
5166 put_snap_realm(realm
);
5168 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
5169 if (!(wanted
& CEPH_CAP_ANY_FILE_WR
) ||
5170 in
->requested_max_size
> m
->get_max_size()) {
5171 in
->requested_max_size
= 0;
5172 ldout(cct
, 15) << "reset requested_max_size after cap import" << dendl
;
5174 // reflush any/all caps (if we are now the auth_cap)
5175 kick_flushing_caps(in
, session
);
5179 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5181 mds_rank_t mds
= session
->mds_num
;
5183 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5184 << " EXPORT from mds." << mds
<< dendl
;
5186 auto it
= in
->caps
.find(mds
);
5187 if (it
!= in
->caps
.end()) {
5188 Cap
&cap
= it
->second
;
5189 if (cap
.cap_id
== m
->get_cap_id()) {
5190 if (m
->peer
.cap_id
) {
5191 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
5192 MetaSession
*tsession
= _get_or_open_mds_session(peer_mds
);
5193 auto it
= in
->caps
.find(peer_mds
);
5194 if (it
!= in
->caps
.end()) {
5195 Cap
&tcap
= it
->second
;
5196 if (tcap
.cap_id
== m
->peer
.cap_id
&&
5197 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
5198 tcap
.cap_id
= m
->peer
.cap_id
;
5199 tcap
.seq
= m
->peer
.seq
- 1;
5200 tcap
.issue_seq
= tcap
.seq
;
5201 tcap
.issued
|= cap
.issued
;
5202 tcap
.implemented
|= cap
.issued
;
5203 if (&cap
== in
->auth_cap
)
5204 in
->auth_cap
= &tcap
;
5205 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
5206 adjust_session_flushing_caps(in
, session
, tsession
);
5209 add_update_cap(in
, tsession
, m
->peer
.cap_id
, cap
.issued
, 0,
5210 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
5211 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
5215 if (cap
.wanted
| cap
.issued
)
5216 in
->flags
|= I_CAP_DROPPED
;
5219 remove_cap(&cap
, false);
5224 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5226 mds_rank_t mds
= session
->mds_num
;
5227 ceph_assert(in
->caps
.count(mds
));
5229 ldout(cct
, 10) << __func__
<< " on ino " << *in
5230 << " size " << in
->size
<< " -> " << m
->get_size()
5234 in
->caps_issued(&issued
);
5235 issued
|= in
->caps_dirty();
5236 update_inode_file_size(in
, issued
, m
->get_size(),
5237 m
->get_truncate_seq(), m
->get_truncate_size());
5240 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5242 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5243 int dirty
= m
->get_dirty();
5247 auto it
= in
->flushing_cap_tids
.begin();
5248 if (it
->first
< flush_ack_tid
) {
5249 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
5250 << " got unexpected flush ack tid " << flush_ack_tid
5251 << " expected is " << it
->first
<< dendl
;
5253 for (; it
!= in
->flushing_cap_tids
.end(); ) {
5259 if (it
->first
== flush_ack_tid
)
5260 cleaned
= it
->second
;
5261 if (it
->first
<= flush_ack_tid
) {
5262 session
->flushing_caps_tids
.erase(it
->first
);
5263 in
->flushing_cap_tids
.erase(it
++);
5267 cleaned
&= ~it
->second
;
5273 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
5274 << " cleaned " << ccap_string(cleaned
) << " on " << *in
5275 << " with " << ccap_string(dirty
) << dendl
;
5278 signal_cond_list(in
->waitfor_caps
);
5279 if (session
->flushing_caps_tids
.empty() ||
5280 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5281 sync_cond
.notify_all();
5285 in
->cap_dirtier_uid
= -1;
5286 in
->cap_dirtier_gid
= -1;
5290 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5292 if (in
->flushing_caps
) {
5293 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5294 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5295 in
->flushing_caps
&= ~cleaned
;
5296 if (in
->flushing_caps
== 0) {
5297 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5298 num_flushing_caps
--;
5299 if (in
->flushing_cap_tids
.empty())
5300 in
->flushing_cap_item
.remove_myself();
5302 if (!in
->caps_dirty())
5309 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5311 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5312 mds_rank_t mds
= session
->mds_num
;
5313 ceph_assert(in
->caps
.count(mds
));
5314 snapid_t follows
= m
->get_snap_follows();
5316 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5317 auto& capsnap
= it
->second
;
5318 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5319 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
5321 InodeRef
tmp_ref(in
);
5322 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5323 << " on " << *in
<< dendl
;
5324 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5325 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5326 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5327 in
->flushing_cap_item
.remove_myself();
5328 in
->cap_snaps
.erase(it
);
5330 signal_cond_list(in
->waitfor_caps
);
5331 if (session
->flushing_caps_tids
.empty() ||
5332 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5333 sync_cond
.notify_all();
5336 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5337 << " on " << *in
<< dendl
;
5338 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5342 class C_Client_DentryInvalidate
: public Context
{
5349 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5350 client(c
), name(dn
->name
) {
5351 if (client
->use_faked_inos()) {
5352 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
5354 ino
.ino
= dn
->inode
->faked_ino
;
5356 dirino
= dn
->dir
->parent_inode
->vino();
5358 ino
= dn
->inode
->vino();
5361 ino
.ino
= inodeno_t();
5363 void finish(int r
) override
{
5364 // _async_dentry_invalidate is responsible for its own locking
5365 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
5366 client
->_async_dentry_invalidate(dirino
, ino
, name
);
5370 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
5372 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
5373 if (!mref_reader
.is_state_satisfied())
5376 ldout(cct
, 10) << __func__
<< " '" << name
<< "' ino " << ino
5377 << " in dir " << dirino
<< dendl
;
5378 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
.c_str(), name
.length());
5381 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
5383 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
5384 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
5387 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5389 int ref
= in
->get_num_ref();
5390 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5392 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5393 for (auto p
= in
->dir
->dentries
.begin();
5394 p
!= in
->dir
->dentries
.end(); ) {
5395 Dentry
*dn
= p
->second
;
5397 /* rmsnap removes whole subtree, need trim inodes recursively.
5398 * we don't need to invalidate dentries recursively. because
5399 * invalidating a directory dentry effectively invalidate
5401 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5402 _try_to_trim_inode(dn
->inode
.get(), false);
5404 if (dn
->lru_is_expireable())
5405 unlink(dn
, true, false); // keep dir, drop dentry
5407 if (in
->dir
->dentries
.empty()) {
5413 if (ref
> 0 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5414 InodeRef snapdir
= open_snapdir(in
);
5415 _try_to_trim_inode(snapdir
.get(), false);
5420 auto q
= in
->dentries
.begin();
5421 while (q
!= in
->dentries
.end()) {
5424 if( in
->ll_ref
> 0 && sched_inval
) {
5425 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5426 // so in->dentries doesn't always reflect the state of kernel's dcache.
5427 _schedule_invalidate_dentry_callback(dn
, true);
5429 unlink(dn
, true, true);
5434 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5436 mds_rank_t mds
= session
->mds_num
;
5437 int used
= get_caps_used(in
);
5438 int wanted
= in
->caps_wanted();
5440 const unsigned new_caps
= m
->get_caps();
5441 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5442 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5443 << " mds." << mds
<< " seq " << m
->get_seq()
5444 << " caps now " << ccap_string(new_caps
)
5445 << " was " << ccap_string(cap
->issued
)
5446 << (was_stale
? " (stale)" : "") << dendl
;
5449 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5450 cap
->seq
= m
->get_seq();
5451 cap
->gen
= session
->cap_gen
;
5453 check_cap_issue(in
, new_caps
);
5457 in
->caps_issued(&issued
);
5458 issued
|= in
->caps_dirty();
5460 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5461 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5462 in
->mode
= m
->head
.mode
;
5463 in
->uid
= m
->head
.uid
;
5464 in
->gid
= m
->head
.gid
;
5465 in
->btime
= m
->btime
;
5467 bool deleted_inode
= false;
5468 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5469 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5470 in
->nlink
= m
->head
.nlink
;
5471 if (in
->nlink
== 0 &&
5472 (new_caps
& (CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
)))
5473 deleted_inode
= true;
5475 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5476 m
->xattrbl
.length() &&
5477 m
->head
.xattr_version
> in
->xattr_version
) {
5478 auto p
= m
->xattrbl
.cbegin();
5479 decode(in
->xattrs
, p
);
5480 in
->xattr_version
= m
->head
.xattr_version
;
5483 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5484 in
->dirstat
.nfiles
= m
->get_nfiles();
5485 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5488 if (new_caps
& CEPH_CAP_ANY_RD
) {
5489 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5490 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5493 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5494 in
->layout
= m
->get_layout();
5495 update_inode_file_size(in
, issued
, m
->get_size(),
5496 m
->get_truncate_seq(), m
->get_truncate_size());
5499 if (m
->inline_version
> in
->inline_version
) {
5500 in
->inline_data
= m
->inline_data
;
5501 in
->inline_version
= m
->inline_version
;
5504 /* always take a newer change attr */
5505 if (m
->get_change_attr() > in
->change_attr
)
5506 in
->change_attr
= m
->get_change_attr();
5509 if (cap
== in
->auth_cap
&&
5510 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5511 (m
->get_max_size() != in
->max_size
)) {
5512 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5513 in
->max_size
= m
->get_max_size();
5514 if (in
->max_size
> in
->wanted_max_size
) {
5515 in
->wanted_max_size
= 0;
5516 in
->requested_max_size
= 0;
5521 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5522 (wanted
& ~(cap
->wanted
| new_caps
))) {
5523 // If mds is importing cap, prior cap messages that update 'wanted'
5524 // may get dropped by mds (migrate seq mismatch).
5526 // We don't send cap message to update 'wanted' if what we want are
5527 // already issued. If mds revokes caps, cap message that releases caps
5528 // also tells mds what we want. But if caps got revoked by mds forcedly
5529 // (session stale). We may haven't told mds what we want.
5535 auto revoked
= cap
->issued
& ~new_caps
;
5537 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5538 cap
->issued
= new_caps
;
5539 cap
->implemented
|= new_caps
;
5541 // recall delegations if we're losing caps necessary for them
5542 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5543 in
->recall_deleg(false);
5544 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5545 in
->recall_deleg(true);
5547 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5548 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5549 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5550 // waitin' for flush
5551 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5555 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5558 } else if (cap
->issued
== new_caps
) {
5559 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5561 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5562 cap
->issued
= new_caps
;
5563 cap
->implemented
|= new_caps
;
5565 if (cap
== in
->auth_cap
) {
5566 // non-auth MDS is revoking the newly grant caps ?
5567 for (const auto &p
: in
->caps
) {
5568 if (&p
.second
== cap
)
5570 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5583 signal_cond_list(in
->waitfor_caps
);
5585 // may drop inode's last ref
5587 _try_to_trim_inode(in
, true);
5590 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5592 if (perms
.uid() == 0)
5595 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5596 int ret
= _posix_acl_permission(in
, perms
, want
);
5597 if (ret
!= -CEPHFS_EAGAIN
)
5601 // check permissions before doing anything else
5602 if (!in
->check_mode(perms
, want
))
5603 return -CEPHFS_EACCES
;
5607 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5608 const UserPerm
& perms
)
5610 int r
= _getattr_for_perm(in
, perms
);
5615 if (strncmp(name
, "system.", 7) == 0) {
5616 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5619 r
= inode_permission(in
, perms
, want
);
5622 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5626 ostream
& operator<<(ostream
&out
, const UserPerm
& perm
) {
5627 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5631 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5632 const UserPerm
& perms
)
5634 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5635 int r
= _getattr_for_perm(in
, perms
);
5639 if (mask
& CEPH_SETATTR_SIZE
) {
5640 r
= inode_permission(in
, perms
, MAY_WRITE
);
5646 if (mask
& CEPH_SETATTR_UID
) {
5647 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5650 if (mask
& CEPH_SETATTR_GID
) {
5651 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5652 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5656 if (mask
& CEPH_SETATTR_MODE
) {
5657 if (perms
.uid() != 0 && perms
.uid() != in
->uid
)
5660 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5661 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5662 stx
->stx_mode
&= ~S_ISGID
;
5665 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5666 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5667 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5668 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5669 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5670 check_mask
|= CEPH_SETATTR_MTIME
;
5671 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5672 check_mask
|= CEPH_SETATTR_ATIME
;
5673 if (check_mask
& mask
) {
5676 r
= inode_permission(in
, perms
, MAY_WRITE
);
5684 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5688 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5690 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5693 if ((flags
& O_ACCMODE
) == O_WRONLY
)
5695 else if ((flags
& O_ACCMODE
) == O_RDWR
)
5696 want
= MAY_READ
| MAY_WRITE
;
5697 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
5699 if (flags
& O_TRUNC
)
5703 switch (in
->mode
& S_IFMT
) {
5708 if (want
& MAY_WRITE
) {
5715 r
= _getattr_for_perm(in
, perms
);
5719 r
= inode_permission(in
, perms
, want
);
5721 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5725 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
5727 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5728 int r
= _getattr_for_perm(dir
, perms
);
5732 r
= inode_permission(dir
, perms
, MAY_EXEC
);
5734 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5738 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
5740 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5741 int r
= _getattr_for_perm(dir
, perms
);
5745 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5747 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5751 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
5753 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
5754 int r
= _getattr_for_perm(dir
, perms
);
5758 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5762 /* 'name == NULL' means rmsnap w/o permission checks */
5763 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
5765 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
5768 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
5772 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5776 int Client::may_delete(const char *relpath
, const UserPerm
& perms
) {
5777 ldout(cct
, 20) << __func__
<< " " << relpath
<< "; " << perms
<< dendl
;
5779 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
5780 if (!mref_reader
.is_state_satisfied())
5783 filepath
path(relpath
);
5784 string name
= path
.last_dentry();
5788 std::scoped_lock
lock(client_lock
);
5789 int r
= path_walk(path
, &dir
, perms
);
5792 if (cct
->_conf
->client_permissions
) {
5793 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
5801 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
5803 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5804 int r
= _getattr_for_perm(in
, perms
);
5808 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
5814 if (!S_ISREG(in
->mode
))
5817 if (in
->mode
& S_ISUID
)
5820 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
5823 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
5825 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5829 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
5831 int mask
= CEPH_STAT_CAP_MODE
;
5833 if (acl_type
!= NO_ACL
) {
5834 mask
|= CEPH_STAT_CAP_XATTR
;
5835 force
= in
->xattr_version
== 0;
5837 return _getattr(in
, mask
, perms
, force
);
5840 vinodeno_t
Client::_get_vino(Inode
*in
)
5842 /* The caller must hold the client lock */
5843 return vinodeno_t(in
->ino
, in
->snapid
);
5847 * Resolve an MDS spec to a list of MDS daemon GIDs.
5849 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5850 * It may be '*' in which case it matches all GIDs.
5852 * If no error is returned, the `targets` vector will be populated with at least
5855 int Client::resolve_mds(
5856 const std::string
&mds_spec
,
5857 std::vector
<mds_gid_t
> *targets
)
5860 ceph_assert(targets
!= nullptr);
5863 CachedStackStringStream css
;
5864 int role_r
= fsmap
->parse_role(mds_spec
, &role
, *css
);
5866 // We got a role, resolve it to a GID
5867 auto& info
= fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
);
5868 ldout(cct
, 10) << __func__
<< ": resolved " << mds_spec
<< " to role '"
5869 << role
<< "' aka " << info
.human_name() << dendl
;
5870 targets
->push_back(info
.global_id
);
5874 std::string strtol_err
;
5875 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
5876 if (strtol_err
.empty()) {
5877 // It is a possible GID
5878 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
5879 if (fsmap
->gid_exists(mds_gid
)) {
5880 auto& info
= fsmap
->get_info_gid(mds_gid
);
5881 ldout(cct
, 10) << __func__
<< ": validated gid " << mds_gid
<< " aka "
5882 << info
.human_name() << dendl
;
5883 targets
->push_back(mds_gid
);
5886 lderr(cct
) << __func__
<< ": gid " << mds_gid
<< " not in MDS map"
5888 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5889 return -CEPHFS_ENOENT
;
5891 } else if (mds_spec
== "*") {
5892 // It is a wildcard: use all MDSs
5893 const auto& mds_info
= fsmap
->get_mds_info();
5895 ldout(cct
, 10) << __func__
<< ": resolving `*' to all MDS daemons" << dendl
;
5896 if (mds_info
.empty()) {
5897 lderr(cct
) << __func__
<< ": no MDS daemons found" << dendl
;
5898 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5899 return -CEPHFS_ENOENT
;
5902 for (const auto& [gid
, info
] : mds_info
) {
5903 ldout(cct
, 10) << __func__
<< ": appending " << info
.human_name() << " to targets" << dendl
;
5904 targets
->push_back(gid
);
5908 // It did not parse as an integer, it is not a wildcard, it must be a name
5909 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
5911 lderr(cct
) << __func__
<< ": no MDS daemons found by name `" << mds_spec
<< "'" << dendl
;
5912 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5913 return -CEPHFS_ENOENT
;
5915 auto& info
= fsmap
->get_info_gid(mds_gid
);
5916 ldout(cct
, 10) << __func__
<< ": resolved name '" << mds_spec
5917 << "' to " << info
.human_name() << dendl
;
5918 targets
->push_back(mds_gid
);
5926 * Authenticate with mon and establish global ID
5928 int Client::authenticate()
5930 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
5932 if (monclient
->is_authenticated()) {
5936 client_lock
.unlock();
5937 int r
= monclient
->authenticate(cct
->_conf
->client_mount_timeout
);
5943 whoami
= monclient
->get_global_id();
5944 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
5949 int Client::fetch_fsmap(bool user
)
5951 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
5953 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5954 // rather than MDSMap because no one MDSMap contains all the daemons, and
5955 // a `tell` can address any daemon.
5956 version_t fsmap_latest
;
5959 client_lock
.unlock();
5960 std::tie(fsmap_latest
, std::ignore
) =
5961 monclient
->get_version("fsmap", ca::use_blocked
[ec
]);
5963 } while (ec
== bs::errc::resource_unavailable_try_again
);
5966 lderr(cct
) << "Failed to learn FSMap version: " << ec
<< dendl
;
5967 return ceph::from_error_code(ec
);
5970 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
5973 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
5974 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5975 monclient
->renew_subs();
5976 wait_on_list(waiting_for_fsmap
);
5978 ceph_assert(fsmap_user
);
5979 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
5981 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
5982 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5983 monclient
->renew_subs();
5984 wait_on_list(waiting_for_fsmap
);
5987 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
5989 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
5990 << fsmap_latest
<< dendl
;
5996 * @mds_spec one of ID, rank, GID, "*"
5999 int Client::mds_command(
6000 const std::string
&mds_spec
,
6001 const vector
<string
>& cmd
,
6002 const bufferlist
& inbl
,
6007 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
6008 if (!iref_reader
.is_state_satisfied())
6009 return -CEPHFS_ENOTCONN
;
6011 std::unique_lock
cl(client_lock
);
6019 r
= fetch_fsmap(false);
6024 // Look up MDS target(s) of the command
6025 std::vector
<mds_gid_t
> targets
;
6026 r
= resolve_mds(mds_spec
, &targets
);
6031 // If daemons are laggy, we won't send them commands. If all
6032 // are laggy then we fail.
6033 std::vector
<mds_gid_t
> non_laggy
;
6034 for (const auto& gid
: targets
) {
6035 const auto info
= fsmap
->get_info_gid(gid
);
6036 if (!info
.laggy()) {
6037 non_laggy
.push_back(gid
);
6040 if (non_laggy
.size() == 0) {
6041 *outs
= "All targeted MDS daemons are laggy";
6042 return -CEPHFS_ENOENT
;
6045 if (metadata
.empty()) {
6046 // We are called on an unmounted client, so metadata
6047 // won't be initialized yet.
6048 populate_metadata("");
6051 // Send commands to targets
6052 C_GatherBuilder
gather(cct
, onfinish
);
6053 for (const auto& target_gid
: non_laggy
) {
6054 const auto info
= fsmap
->get_info_gid(target_gid
);
6056 // Open a connection to the target MDS
6057 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
6061 std::scoped_lock
cmd_lock(command_lock
);
6062 // Generate MDSCommandOp state
6063 auto &op
= command_table
.start_command();
6065 op
.on_finish
= gather
.new_sub();
6070 op
.mds_gid
= target_gid
;
6073 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
6074 << " tid=" << op
.tid
<< cmd
<< dendl
;
6076 // Construct and send MCommand
6077 MessageRef m
= op
.get_message(monclient
->get_fsid());
6078 conn
->send_message2(std::move(m
));
6087 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
6089 ceph_tid_t
const tid
= m
->get_tid();
6091 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
6093 std::scoped_lock
cmd_lock(command_lock
);
6094 if (!command_table
.exists(tid
)) {
6095 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
6099 auto &op
= command_table
.get_command(tid
);
6101 *op
.outbl
= m
->get_data();
6108 op
.on_finish
->complete(m
->r
);
6111 command_table
.erase(tid
);
6114 // -------------------
6117 int Client::subscribe_mdsmap(const std::string
&fs_name
)
6119 int r
= authenticate();
6121 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
6125 std::string resolved_fs_name
;
6126 if (fs_name
.empty()) {
6127 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_fs");
6128 if (resolved_fs_name
.empty())
6129 // Try the backwards compatibility fs name option
6130 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
6132 resolved_fs_name
= fs_name
;
6135 std::string want
= "mdsmap";
6136 if (!resolved_fs_name
.empty()) {
6137 r
= fetch_fsmap(true);
6140 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
6141 if (fscid
== FS_CLUSTER_ID_NONE
) {
6142 return -CEPHFS_ENOENT
;
6145 std::ostringstream oss
;
6146 oss
<< want
<< "." << fscid
;
6149 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
6151 monclient
->sub_want(want
, 0, 0);
6152 monclient
->renew_subs();
6157 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
6158 bool require_mds
, const std::string
&fs_name
)
6160 ceph_assert(is_initialized());
6163 * To make sure that the _unmount() must wait until the mount()
6166 RWRef_t
mref_writer(mount_state
, CLIENT_MOUNTING
, false);
6167 if (!mref_writer
.is_first_writer()) // already mounting or mounted
6170 std::unique_lock
cl(client_lock
);
6172 int r
= subscribe_mdsmap(fs_name
);
6174 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
6178 start_tick_thread(); // start tick thread
6182 auto availability
= mdsmap
->is_cluster_available();
6183 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
6185 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
6186 return CEPH_FUSE_NO_MDS_UP
;
6187 } else if (availability
== MDSMap::AVAILABLE
) {
6188 // Continue to mount
6190 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
6191 // Else, wait. MDSMonitor will update the map to bring
6192 // us to a conclusion eventually.
6193 wait_on_list(waiting_for_mdsmap
);
6195 // Unexpected value!
6201 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
6203 filepath
fp(CEPH_INO_ROOT
);
6204 if (!mount_root
.empty()) {
6205 fp
= filepath(mount_root
.c_str());
6208 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6209 req
->set_filepath(fp
);
6210 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
6211 int res
= make_request(req
, perms
);
6213 if (res
== -CEPHFS_EACCES
&& root
) {
6214 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
6230 if (!cct
->_conf
->client_trace
.empty()) {
6231 traceout
.open(cct
->_conf
->client_trace
.c_str());
6232 if (traceout
.is_open()) {
6233 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6235 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6240 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6241 ldout(cct, 3) << "op: struct stat st;" << dendl;
6242 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6243 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6244 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6245 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6246 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6247 ldout(cct, 3) << "op: int fd;" << dendl;
6250 mref_writer
.update_state(CLIENT_MOUNTED
);
6256 void Client::_close_sessions()
6258 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
6259 if (it
->second
.state
== MetaSession::STATE_REJECTED
)
6260 mds_sessions
.erase(it
++);
6265 while (!mds_sessions
.empty()) {
6266 // send session closes!
6267 for (auto &p
: mds_sessions
) {
6268 if (p
.second
.state
!= MetaSession::STATE_CLOSING
) {
6269 _close_mds_session(&p
.second
);
6270 mds_ranks_closing
.insert(p
.first
);
6274 // wait for sessions to close
6275 double timo
= cct
->_conf
.get_val
<std::chrono::seconds
>("client_shutdown_timeout").count();
6276 ldout(cct
, 2) << "waiting for " << mds_ranks_closing
.size() << " mds session(s) to close (timeout: "
6277 << timo
<< "s)" << dendl
;
6278 std::unique_lock l
{client_lock
, std::adopt_lock
};
6281 } else if (!mount_cond
.wait_for(l
, ceph::make_timespan(timo
), [this] { return mds_ranks_closing
.empty(); })) {
6282 ldout(cct
, 1) << mds_ranks_closing
.size() << " mds(s) did not respond to session close -- timing out." << dendl
;
6283 while (!mds_ranks_closing
.empty()) {
6284 auto session
= mds_sessions
.at(*mds_ranks_closing
.begin());
6285 // this prunes entry from mds_sessions and mds_ranks_closing
6286 _closed_mds_session(&session
, -CEPHFS_ETIMEDOUT
);
6290 mds_ranks_closing
.clear();
6295 void Client::flush_mdlog_sync()
6297 if (mds_requests
.empty())
6299 for (auto &p
: mds_sessions
) {
6300 flush_mdlog(&p
.second
);
6304 void Client::flush_mdlog(MetaSession
*session
)
6306 // Only send this to Luminous or newer MDS daemons, older daemons
6307 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6308 const uint64_t features
= session
->con
->get_features();
6309 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
6310 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
6311 session
->con
->send_message2(std::move(m
));
6316 void Client::_abort_mds_sessions(int err
)
6318 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
6319 auto req
= p
->second
;
6321 // unsafe requests will be removed during close session below.
6322 if (req
->got_unsafe
)
6326 if (req
->caller_cond
) {
6328 req
->caller_cond
->notify_all();
6332 // Process aborts on any requests that were on this waitlist.
6333 // Any requests that were on a waiting_for_open session waitlist
6334 // will get kicked during close session below.
6335 signal_cond_list(waiting_for_mdsmap
);
6337 // Force-close all sessions
6338 while(!mds_sessions
.empty()) {
6339 auto& session
= mds_sessions
.begin()->second
;
6340 _closed_mds_session(&session
, err
);
6344 void Client::_unmount(bool abort
)
6347 * We are unmounting the client.
6349 * Just declare the state to STATE_UNMOUNTING to block and fail
6350 * any new comming "reader" and then try to wait all the in-flight
6351 * "readers" to finish.
6353 RWRef_t
mref_writer(mount_state
, CLIENT_UNMOUNTING
, false);
6354 if (!mref_writer
.is_first_writer())
6356 mref_writer
.wait_readers_done();
6358 std::unique_lock lock
{client_lock
};
6360 if (abort
|| blocklisted
) {
6361 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blocklisted)") << dendl
;
6363 ldout(cct
, 2) << "unmounting" << dendl
;
6369 mount_aborted
= true;
6370 // Abort all mds sessions
6371 _abort_mds_sessions(-CEPHFS_ENOTCONN
);
6373 objecter
->op_cancel_writes(-CEPHFS_ENOTCONN
);
6375 // flush the mdlog for pending requests, if any
6379 mount_cond
.wait(lock
, [this] {
6380 if (!mds_requests
.empty()) {
6381 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests"
6384 return mds_requests
.empty();
6389 // clean up any unclosed files
6390 while (!fd_map
.empty()) {
6391 Fh
*fh
= fd_map
.begin()->second
;
6392 fd_map
.erase(fd_map
.begin());
6393 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6397 while (!ll_unclosed_fh_set
.empty()) {
6398 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6400 ll_unclosed_fh_set
.erase(fh
);
6401 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6405 while (!opened_dirs
.empty()) {
6406 dir_result_t
*dirp
= *opened_dirs
.begin();
6407 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6413 if (cct
->_conf
->client_oc
) {
6414 // flush/release all buffered data
6415 std::list
<InodeRef
> anchor
;
6416 for (auto& p
: inode_map
) {
6417 Inode
*in
= p
.second
;
6419 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6423 // prevent inode from getting freed
6424 anchor
.emplace_back(in
);
6426 if (abort
|| blocklisted
) {
6427 objectcacher
->purge_set(&in
->oset
);
6428 } else if (!in
->caps
.empty()) {
6430 _flush(in
, new C_Client_FlushComplete(this, in
));
6435 if (abort
|| blocklisted
) {
6436 for (auto p
= dirty_list
.begin(); !p
.end(); ) {
6439 if (in
->dirty_caps
) {
6440 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6441 in
->mark_caps_clean();
6447 wait_sync_caps(last_flush_tid
);
6455 while (lru
.lru_get_size() > 0 ||
6456 !inode_map
.empty()) {
6457 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6458 << "+" << inode_map
.size() << " items"
6459 << ", waiting (for caps to release?)"
6462 if (auto r
= mount_cond
.wait_for(lock
, ceph::make_timespan(5));
6463 r
== std::cv_status::timeout
) {
6467 ceph_assert(lru
.lru_get_size() == 0);
6468 ceph_assert(inode_map
.empty());
6471 if (!cct
->_conf
->client_trace
.empty()) {
6472 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6476 // stop the tick thread
6477 tick_thread_stopped
= true;
6478 upkeep_cond
.notify_one();
6482 mref_writer
.update_state(CLIENT_UNMOUNTED
);
6484 ldout(cct
, 2) << "unmounted." << dendl
;
6487 void Client::unmount()
6492 void Client::abort_conn()
6497 void Client::flush_cap_releases()
6499 uint64_t nr_caps
= 0;
6501 // send any cap releases
6502 for (auto &p
: mds_sessions
) {
6503 auto &session
= p
.second
;
6504 if (session
.release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
6506 nr_caps
+= session
.release
->caps
.size();
6507 if (cct
->_conf
->client_inject_release_failure
) {
6508 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
6510 session
.con
->send_message2(std::move(session
.release
));
6512 session
.release
.reset();
6517 dec_pinned_icaps(nr_caps
);
6521 void Client::renew_and_flush_cap_releases()
6523 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6525 if (!mount_aborted
&& mdsmap
->get_epoch()) {
6527 utime_t el
= ceph_clock_now() - last_cap_renew
;
6528 if (unlikely(el
> mdsmap
->get_session_timeout() / 3.0))
6531 flush_cap_releases();
6537 ldout(cct
, 20) << "tick" << dendl
;
6539 utime_t now
= ceph_clock_now();
6542 * If the mount() is not finished
6544 if (is_mounting() && !mds_requests
.empty()) {
6545 MetaRequest
*req
= mds_requests
.begin()->second
;
6547 if (req
->op_stamp
+ cct
->_conf
->client_mount_timeout
< now
) {
6548 req
->abort(-CEPHFS_ETIMEDOUT
);
6549 if (req
->caller_cond
) {
6551 req
->caller_cond
->notify_all();
6553 signal_cond_list(waiting_for_mdsmap
);
6554 for (auto &p
: mds_sessions
) {
6555 signal_context_list(p
.second
.waiting_for_open
);
6560 renew_and_flush_cap_releases();
6563 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6567 if (!mount_aborted
&& in
->hold_caps_until
> now
)
6569 delayed_list
.pop_front();
6571 check_caps(in
, CHECK_CAPS_NODELAY
);
6575 collect_and_send_metrics();
6577 delay_put_inodes(is_unmounting());
6580 if (blocklisted
&& (is_mounted() || is_unmounting()) &&
6581 last_auto_reconnect
+ 30 * 60 < now
&&
6582 cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
6583 messenger
->client_reset();
6584 fd_gen
++; // invalidate open files
6585 blocklisted
= false;
6586 _kick_stale_sessions();
6587 last_auto_reconnect
= now
;
6591 void Client::start_tick_thread()
6593 upkeeper
= std::thread([this]() {
6594 using time
= ceph::coarse_mono_time
;
6595 using sec
= std::chrono::seconds
;
6597 auto last_tick
= time::min();
6599 std::unique_lock
cl(client_lock
);
6600 while (!tick_thread_stopped
) {
6601 auto now
= clock::now();
6602 auto since
= now
- last_tick
;
6604 auto t_interval
= clock::duration(cct
->_conf
.get_val
<sec
>("client_tick_interval"));
6605 auto d_interval
= clock::duration(cct
->_conf
.get_val
<sec
>("client_debug_inject_tick_delay"));
6607 auto interval
= std::max(t_interval
, d_interval
);
6608 if (likely(since
>= interval
*.90)) {
6610 last_tick
= clock::now();
6615 ldout(cct
, 20) << "upkeep thread waiting interval " << interval
<< dendl
;
6616 if (!tick_thread_stopped
)
6617 upkeep_cond
.wait_for(cl
, interval
);
6622 void Client::collect_and_send_metrics() {
6623 ldout(cct
, 20) << __func__
<< dendl
;
6625 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6627 // right now, we only track and send global metrics. its sufficient
6628 // to send these metrics to MDS rank0.
6629 collect_and_send_global_metrics();
6632 void Client::collect_and_send_global_metrics() {
6633 ldout(cct
, 20) << __func__
<< dendl
;
6634 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6636 if (!have_open_session((mds_rank_t
)0)) {
6637 ldout(cct
, 5) << __func__
<< ": no session with rank=0 -- not sending metric"
6641 auto session
= _get_or_open_mds_session((mds_rank_t
)0);
6642 if (!session
->mds_features
.test(CEPHFS_FEATURE_METRIC_COLLECT
)) {
6643 ldout(cct
, 5) << __func__
<< ": rank=0 does not support metrics" << dendl
;
6647 ClientMetricMessage metric
;
6648 std::vector
<ClientMetricMessage
> message
;
6651 metric
= ClientMetricMessage(ReadLatencyPayload(logger
->tget(l_c_read
)));
6652 message
.push_back(metric
);
6655 metric
= ClientMetricMessage(WriteLatencyPayload(logger
->tget(l_c_wrlat
)));
6656 message
.push_back(metric
);
6659 metric
= ClientMetricMessage(MetadataLatencyPayload(logger
->tget(l_c_lat
)));
6660 message
.push_back(metric
);
6662 // cap hit ratio -- nr_caps is unused right now
6663 auto [cap_hits
, cap_misses
] = get_cap_hit_rates();
6664 metric
= ClientMetricMessage(CapInfoPayload(cap_hits
, cap_misses
, 0));
6665 message
.push_back(metric
);
6667 // dentry lease hit ratio
6668 auto [dlease_hits
, dlease_misses
, nr
] = get_dlease_hit_rates();
6669 metric
= ClientMetricMessage(DentryLeasePayload(dlease_hits
, dlease_misses
, nr
));
6670 message
.push_back(metric
);
6674 auto [opened_files
, total_inodes
] = get_opened_files_rates();
6675 metric
= ClientMetricMessage(OpenedFilesPayload(opened_files
, total_inodes
));
6677 message
.push_back(metric
);
6681 auto [pinned_icaps
, total_inodes
] = get_pinned_icaps_rates();
6682 metric
= ClientMetricMessage(PinnedIcapsPayload(pinned_icaps
, total_inodes
));
6684 message
.push_back(metric
);
6688 auto [opened_inodes
, total_inodes
] = get_opened_inodes_rates();
6689 metric
= ClientMetricMessage(OpenedInodesPayload(opened_inodes
, total_inodes
));
6691 message
.push_back(metric
);
6693 session
->con
->send_message2(make_message
<MClientMetrics
>(std::move(message
)));
6696 void Client::renew_caps()
6698 ldout(cct
, 10) << "renew_caps()" << dendl
;
6699 last_cap_renew
= ceph_clock_now();
6701 for (auto &p
: mds_sessions
) {
6702 ldout(cct
, 15) << "renew_caps requesting from mds." << p
.first
<< dendl
;
6703 if (mdsmap
->get_state(p
.first
) >= MDSMap::STATE_REJOIN
)
6704 renew_caps(&p
.second
);
6708 void Client::renew_caps(MetaSession
*session
)
6710 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
6711 session
->last_cap_renew_request
= ceph_clock_now();
6712 uint64_t seq
= ++session
->cap_renew_seq
;
6713 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
6717 // ===============================================================
6718 // high level (POSIXy) interface
6720 int Client::_do_lookup(Inode
*dir
, const string
& name
, int mask
,
6721 InodeRef
*target
, const UserPerm
& perms
)
6723 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_LOOKUPSNAP
: CEPH_MDS_OP_LOOKUP
;
6724 MetaRequest
*req
= new MetaRequest(op
);
6726 dir
->make_nosnap_relative_path(path
);
6727 path
.push_dentry(name
);
6728 req
->set_filepath(path
);
6729 req
->set_inode(dir
);
6730 if (cct
->_conf
->client_debug_getattr_caps
&& op
== CEPH_MDS_OP_LOOKUP
)
6731 mask
|= DEBUG_GETATTR_CAPS
;
6732 req
->head
.args
.getattr
.mask
= mask
;
6734 ldout(cct
, 10) << __func__
<< " on " << path
<< dendl
;
6736 int r
= make_request(req
, perms
, target
);
6737 ldout(cct
, 10) << __func__
<< " res is " << r
<< dendl
;
6741 bool Client::_dentry_valid(const Dentry
*dn
)
6743 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6745 // is dn lease valid?
6746 utime_t now
= ceph_clock_now();
6747 if (dn
->lease_mds
>= 0 && dn
->lease_ttl
> now
&&
6748 mds_sessions
.count(dn
->lease_mds
)) {
6749 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6750 if (s
.cap_ttl
> now
&& s
.cap_gen
== dn
->lease_gen
) {
6755 ldout(cct
, 20) << " bad lease, cap_ttl " << s
.cap_ttl
<< ", cap_gen " << s
.cap_gen
6756 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
6763 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
6764 const UserPerm
& perms
, std::string
* alternate_name
)
6768 bool did_lookup_request
= false;
6769 // can only request shared caps
6770 mask
&= CEPH_CAP_ANY_SHARED
| CEPH_STAT_RSTAT
;
6772 if (dname
== "..") {
6773 if (dir
->dentries
.empty()) {
6774 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
6775 filepath
path(dir
->ino
);
6776 req
->set_filepath(path
);
6779 int r
= make_request(req
, perms
, &tmptarget
, NULL
, rand() % mdsmap
->get_num_in_mds());
6782 *target
= std::move(tmptarget
);
6783 ldout(cct
, 8) << __func__
<< " found target " << (*target
)->ino
<< dendl
;
6789 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
6798 if (!dir
->is_dir()) {
6799 r
= -CEPHFS_ENOTDIR
;
6803 if (dname
.length() > NAME_MAX
) {
6804 r
= -CEPHFS_ENAMETOOLONG
;
6808 if (dname
== cct
->_conf
->client_snapdir
&&
6809 dir
->snapid
== CEPH_NOSNAP
) {
6810 *target
= open_snapdir(dir
);
6816 dir
->dir
->dentries
.count(dname
)) {
6817 dn
= dir
->dir
->dentries
[dname
];
6819 ldout(cct
, 20) << __func__
<< " have " << *dn
<< " from mds." << dn
->lease_mds
6820 << " ttl " << dn
->lease_ttl
<< " seq " << dn
->lease_seq
<< dendl
;
6822 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
6823 if (_dentry_valid(dn
)) {
6824 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6825 // make trim_caps() behave.
6826 dir
->try_touch_cap(dn
->lease_mds
);
6830 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
6831 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
6832 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
6834 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
6835 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for "
6836 << *dir
<< " dn '" << dname
<< "'" << dendl
;
6837 return -CEPHFS_ENOENT
;
6841 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
6844 // can we conclude ENOENT locally?
6845 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
6846 (dir
->flags
& I_COMPLETE
)) {
6847 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
6848 return -CEPHFS_ENOENT
;
6852 if (did_lookup_request
) {
6856 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
6857 did_lookup_request
= true;
6859 /* complete lookup to get dentry for alternate_name */
6867 *target
= dn
->inode
;
6869 *alternate_name
= dn
->alternate_name
;
6878 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
6880 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
6884 int Client::get_or_create(Inode
*dir
, const char* name
,
6885 Dentry
**pdn
, bool expect_null
)
6888 ldout(cct
, 20) << __func__
<< " " << *dir
<< " name " << name
<< dendl
;
6890 if (dir
->dir
->dentries
.count(name
)) {
6891 Dentry
*dn
= dir
->dir
->dentries
[name
];
6892 if (_dentry_valid(dn
)) {
6894 return -CEPHFS_EEXIST
;
6898 // otherwise link up a new one
6899 *pdn
= link(dir
->dir
, name
, NULL
, NULL
);
6906 int Client::walk(std::string_view path
, walk_dentry_result
* wdr
, const UserPerm
& perms
, bool followsym
)
6908 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
6909 if (!mref_reader
.is_state_satisfied())
6910 return -CEPHFS_ENOTCONN
;
6912 ldout(cct
, 10) << __func__
<< ": " << path
<< dendl
;
6914 std::scoped_lock
lock(client_lock
);
6916 return path_walk(path
, wdr
, perms
, followsym
);
6919 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
6920 const UserPerm
& perms
, bool followsym
, int mask
)
6922 walk_dentry_result wdr
;
6923 int rc
= path_walk(origpath
, &wdr
, perms
, followsym
, mask
);
6924 *end
= std::move(wdr
.in
);
6928 int Client::path_walk(const filepath
& origpath
, walk_dentry_result
* result
, const UserPerm
& perms
, bool followsym
, int mask
)
6930 filepath path
= origpath
;
6932 std::string alternate_name
;
6933 if (origpath
.absolute())
6939 ldout(cct
, 10) << __func__
<< " " << path
<< dendl
;
6944 while (i
< path
.depth() && cur
) {
6946 const string
&dname
= path
[i
];
6947 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
6948 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
6950 if (cct
->_conf
->client_permissions
) {
6951 int r
= may_lookup(cur
.get(), perms
);
6954 caps
= CEPH_CAP_AUTH_SHARED
;
6957 /* Get extra requested caps on the last component */
6958 if (i
== (path
.depth() - 1))
6960 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
, &alternate_name
);
6963 // only follow trailing symlink if followsym. always follow
6964 // 'directory' symlinks.
6965 if (next
&& next
->is_symlink()) {
6967 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
6968 if (symlinks
> MAXSYMLINKS
) {
6969 return -CEPHFS_ELOOP
;
6972 if (i
< path
.depth() - 1) {
6974 // replace consumed components of path with symlink dir target
6975 filepath
resolved(next
->symlink
.c_str());
6976 resolved
.append(path
.postfixpath(i
+ 1));
6979 if (next
->symlink
[0] == '/') {
6983 } else if (followsym
) {
6984 if (next
->symlink
[0] == '/') {
6985 path
= next
->symlink
.c_str();
6990 filepath
more(next
->symlink
.c_str());
6991 // we need to remove the symlink component from off of the path
6992 // before adding the target that the symlink points to. remain
6993 // at the same position in the path.
7004 return -CEPHFS_ENOENT
;
7006 result
->in
= std::move(cur
);
7007 result
->alternate_name
= std::move(alternate_name
);
7015 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
, std::string alternate_name
)
7017 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7018 if (!mref_reader
.is_state_satisfied())
7019 return -CEPHFS_ENOTCONN
;
7021 tout(cct
) << "link" << std::endl
;
7022 tout(cct
) << relexisting
<< std::endl
;
7023 tout(cct
) << relpath
<< std::endl
;
7025 filepath
existing(relexisting
);
7029 std::scoped_lock
lock(client_lock
);
7030 int r
= path_walk(existing
, &in
, perm
, true);
7033 if (std::string(relpath
) == "/") {
7037 filepath
path(relpath
);
7038 string name
= path
.last_dentry();
7041 r
= path_walk(path
, &dir
, perm
, true);
7044 if (cct
->_conf
->client_permissions
) {
7045 if (S_ISDIR(in
->mode
)) {
7049 r
= may_hardlink(in
.get(), perm
);
7052 r
= may_create(dir
.get(), perm
);
7056 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
, std::move(alternate_name
));
7060 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
7062 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7063 if (!mref_reader
.is_state_satisfied())
7064 return -CEPHFS_ENOTCONN
;
7066 tout(cct
) << __func__
<< std::endl
;
7067 tout(cct
) << relpath
<< std::endl
;
7069 if (std::string(relpath
) == "/")
7070 return -CEPHFS_EISDIR
;
7072 filepath
path(relpath
);
7073 string name
= path
.last_dentry();
7077 std::scoped_lock
lock(client_lock
);
7078 int r
= path_walk(path
, &dir
, perm
);
7081 if (cct
->_conf
->client_permissions
) {
7082 r
= may_delete(dir
.get(), name
.c_str(), perm
);
7086 return _unlink(dir
.get(), name
.c_str(), perm
);
7089 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
, std::string alternate_name
)
7091 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7092 if (!mref_reader
.is_state_satisfied())
7093 return -CEPHFS_ENOTCONN
;
7095 tout(cct
) << __func__
<< std::endl
;
7096 tout(cct
) << relfrom
<< std::endl
;
7097 tout(cct
) << relto
<< std::endl
;
7099 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
7100 return -CEPHFS_EBUSY
;
7102 filepath
from(relfrom
);
7104 string fromname
= from
.last_dentry();
7106 string toname
= to
.last_dentry();
7109 InodeRef fromdir
, todir
;
7111 std::scoped_lock
lock(client_lock
);
7112 int r
= path_walk(from
, &fromdir
, perm
);
7115 r
= path_walk(to
, &todir
, perm
);
7119 if (cct
->_conf
->client_permissions
) {
7120 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
7123 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
7124 if (r
< 0 && r
!= -CEPHFS_ENOENT
)
7127 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
, std::move(alternate_name
));
7134 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
, std::string alternate_name
)
7136 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7137 if (!mref_reader
.is_state_satisfied())
7138 return -CEPHFS_ENOTCONN
;
7140 tout(cct
) << __func__
<< std::endl
;
7141 tout(cct
) << relpath
<< std::endl
;
7142 tout(cct
) << mode
<< std::endl
;
7143 ldout(cct
, 10) << __func__
<< ": " << relpath
<< dendl
;
7145 if (std::string(relpath
) == "/")
7146 return -CEPHFS_EEXIST
;
7148 filepath
path(relpath
);
7149 string name
= path
.last_dentry();
7153 std::scoped_lock
lock(client_lock
);
7154 int r
= path_walk(path
, &dir
, perm
);
7157 if (cct
->_conf
->client_permissions
) {
7158 r
= may_create(dir
.get(), perm
);
7162 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
, 0, {}, std::move(alternate_name
));
7165 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7167 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7168 if (!mref_reader
.is_state_satisfied())
7169 return -CEPHFS_ENOTCONN
;
7171 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
7172 tout(cct
) << __func__
<< std::endl
;
7173 tout(cct
) << relpath
<< std::endl
;
7174 tout(cct
) << mode
<< std::endl
;
7176 //get through existing parts of path
7177 filepath
path(relpath
);
7179 int r
= 0, caps
= 0;
7182 std::scoped_lock
lock(client_lock
);
7184 for (i
=0; i
<path
.depth(); ++i
) {
7185 if (cct
->_conf
->client_permissions
) {
7186 r
= may_lookup(cur
.get(), perms
);
7189 caps
= CEPH_CAP_AUTH_SHARED
;
7191 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
7196 if (r
!=-CEPHFS_ENOENT
) return r
;
7197 ldout(cct
, 20) << __func__
<< " got through " << i
<< " directories on path " << relpath
<< dendl
;
7198 //make new directory at each level
7199 for (; i
<path
.depth(); ++i
) {
7200 if (cct
->_conf
->client_permissions
) {
7201 r
= may_create(cur
.get(), perms
);
7206 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
7208 //check proper creation/existence
7209 if(-CEPHFS_EEXIST
== r
&& i
< path
.depth() - 1) {
7210 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
7214 //move to new dir and continue
7216 ldout(cct
, 20) << __func__
<< ": successfully created directory "
7217 << filepath(cur
->ino
).get_path() << dendl
;
7222 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
7224 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7225 if (!mref_reader
.is_state_satisfied())
7226 return -CEPHFS_ENOTCONN
;
7228 tout(cct
) << __func__
<< std::endl
;
7229 tout(cct
) << relpath
<< std::endl
;
7231 if (std::string(relpath
) == "/")
7232 return -CEPHFS_EBUSY
;
7234 filepath
path(relpath
);
7235 string name
= path
.last_dentry();
7239 std::scoped_lock
lock(client_lock
);
7240 int r
= path_walk(path
, &dir
, perms
);
7243 if (cct
->_conf
->client_permissions
) {
7244 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
7248 return _rmdir(dir
.get(), name
.c_str(), perms
);
7251 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
7253 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7254 if (!mref_reader
.is_state_satisfied())
7255 return -CEPHFS_ENOTCONN
;
7257 tout(cct
) << __func__
<< std::endl
;
7258 tout(cct
) << relpath
<< std::endl
;
7259 tout(cct
) << mode
<< std::endl
;
7260 tout(cct
) << rdev
<< std::endl
;
7262 if (std::string(relpath
) == "/")
7263 return -CEPHFS_EEXIST
;
7265 filepath
path(relpath
);
7266 string name
= path
.last_dentry();
7270 std::scoped_lock
lock(client_lock
);
7271 int r
= path_walk(path
, &dir
, perms
);
7274 if (cct
->_conf
->client_permissions
) {
7275 int r
= may_create(dir
.get(), perms
);
7279 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
7284 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
, std::string alternate_name
)
7286 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7287 if (!mref_reader
.is_state_satisfied())
7288 return -CEPHFS_ENOTCONN
;
7290 tout(cct
) << __func__
<< std::endl
;
7291 tout(cct
) << target
<< std::endl
;
7292 tout(cct
) << relpath
<< std::endl
;
7294 if (std::string(relpath
) == "/")
7295 return -CEPHFS_EEXIST
;
7297 filepath
path(relpath
);
7298 string name
= path
.last_dentry();
7302 std::scoped_lock
lock(client_lock
);
7303 int r
= path_walk(path
, &dir
, perms
);
7306 if (cct
->_conf
->client_permissions
) {
7307 int r
= may_create(dir
.get(), perms
);
7311 return _symlink(dir
.get(), name
.c_str(), target
, perms
, std::move(alternate_name
));
7314 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
7316 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7317 if (!mref_reader
.is_state_satisfied())
7318 return -CEPHFS_ENOTCONN
;
7320 tout(cct
) << __func__
<< std::endl
;
7321 tout(cct
) << relpath
<< std::endl
;
7323 filepath
path(relpath
);
7326 std::scoped_lock
lock(client_lock
);
7327 int r
= path_walk(path
, &in
, perms
, false);
7331 return _readlink(in
.get(), buf
, size
);
7334 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
7336 if (!in
->is_symlink())
7337 return -CEPHFS_EINVAL
;
7339 // copy into buf (at most size bytes)
7340 int r
= in
->symlink
.length();
7343 memcpy(buf
, in
->symlink
.c_str(), r
);
7350 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
7352 bool yes
= in
->caps_issued_mask(mask
, true);
7354 ldout(cct
, 10) << __func__
<< " mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
7358 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
7360 in
->make_nosnap_relative_path(path
);
7361 req
->set_filepath(path
);
7363 req
->head
.args
.getattr
.mask
= mask
;
7365 int res
= make_request(req
, perms
);
7366 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
7370 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7371 const UserPerm
& perms
, InodeRef
*inp
)
7373 int issued
= in
->caps_issued();
7375 ldout(cct
, 10) << __func__
<< " mask " << mask
<< " issued " <<
7376 ccap_string(issued
) << dendl
;
7378 if (in
->snapid
!= CEPH_NOSNAP
) {
7379 return -CEPHFS_EROFS
;
7381 if ((mask
& CEPH_SETATTR_SIZE
) &&
7382 (uint64_t)stx
->stx_size
> in
->size
&&
7383 is_quota_bytes_exceeded(in
, (uint64_t)stx
->stx_size
- in
->size
,
7385 return -CEPHFS_EDQUOT
;
7388 // make the change locally?
7389 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
7390 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
7391 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
7392 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
7393 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
7396 * This works because we implicitly flush the caps as part of the
7397 * request, so the cap update check will happen with the writeback
7398 * cap context, and then the setattr check will happen with the
7401 * In reality this pattern is likely pretty rare (different users
7402 * setattr'ing the same file). If that turns out not to be the
7403 * case later, we can build a more complex pipelined cap writeback
7407 mask
|= CEPH_SETATTR_CTIME
;
7412 // caller just needs us to bump the ctime
7413 in
->ctime
= ceph_clock_now();
7414 in
->cap_dirtier_uid
= perms
.uid();
7415 in
->cap_dirtier_gid
= perms
.gid();
7416 if (issued
& CEPH_CAP_AUTH_EXCL
)
7417 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7418 else if (issued
& CEPH_CAP_FILE_EXCL
)
7419 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7420 else if (issued
& CEPH_CAP_XATTR_EXCL
)
7421 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
7423 mask
|= CEPH_SETATTR_CTIME
;
7426 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7427 bool kill_sguid
= mask
& (CEPH_SETATTR_SIZE
|CEPH_SETATTR_KILL_SGUID
);
7429 mask
&= ~CEPH_SETATTR_KILL_SGUID
;
7431 if (mask
& CEPH_SETATTR_UID
) {
7432 in
->ctime
= ceph_clock_now();
7433 in
->cap_dirtier_uid
= perms
.uid();
7434 in
->cap_dirtier_gid
= perms
.gid();
7435 in
->uid
= stx
->stx_uid
;
7436 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7437 mask
&= ~CEPH_SETATTR_UID
;
7439 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7441 if (mask
& CEPH_SETATTR_GID
) {
7442 in
->ctime
= ceph_clock_now();
7443 in
->cap_dirtier_uid
= perms
.uid();
7444 in
->cap_dirtier_gid
= perms
.gid();
7445 in
->gid
= stx
->stx_gid
;
7446 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7447 mask
&= ~CEPH_SETATTR_GID
;
7449 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7452 if (mask
& CEPH_SETATTR_MODE
) {
7453 in
->ctime
= ceph_clock_now();
7454 in
->cap_dirtier_uid
= perms
.uid();
7455 in
->cap_dirtier_gid
= perms
.gid();
7456 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
7457 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7458 mask
&= ~CEPH_SETATTR_MODE
;
7459 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7460 } else if (kill_sguid
&& S_ISREG(in
->mode
) && (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
7461 /* Must squash the any setuid/setgid bits with an ownership change */
7462 in
->mode
&= ~(S_ISUID
|S_ISGID
);
7463 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7466 if (mask
& CEPH_SETATTR_BTIME
) {
7467 in
->ctime
= ceph_clock_now();
7468 in
->cap_dirtier_uid
= perms
.uid();
7469 in
->cap_dirtier_gid
= perms
.gid();
7470 in
->btime
= utime_t(stx
->stx_btime
);
7471 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7472 mask
&= ~CEPH_SETATTR_BTIME
;
7473 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
7475 } else if (mask
& CEPH_SETATTR_SIZE
) {
7476 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7477 mask
|= CEPH_SETATTR_KILL_SGUID
;
7480 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
7481 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
)) {
7482 if (mask
& CEPH_SETATTR_MTIME
)
7483 in
->mtime
= utime_t(stx
->stx_mtime
);
7484 if (mask
& CEPH_SETATTR_ATIME
)
7485 in
->atime
= utime_t(stx
->stx_atime
);
7486 in
->ctime
= ceph_clock_now();
7487 in
->cap_dirtier_uid
= perms
.uid();
7488 in
->cap_dirtier_gid
= perms
.gid();
7489 in
->time_warp_seq
++;
7490 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7491 mask
&= ~(CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
);
7500 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
7504 in
->make_nosnap_relative_path(path
);
7505 req
->set_filepath(path
);
7508 if (mask
& CEPH_SETATTR_KILL_SGUID
) {
7509 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7511 if (mask
& CEPH_SETATTR_MODE
) {
7512 req
->head
.args
.setattr
.mode
= stx
->stx_mode
;
7513 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7514 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7516 if (mask
& CEPH_SETATTR_UID
) {
7517 req
->head
.args
.setattr
.uid
= stx
->stx_uid
;
7518 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7519 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7521 if (mask
& CEPH_SETATTR_GID
) {
7522 req
->head
.args
.setattr
.gid
= stx
->stx_gid
;
7523 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7524 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7526 if (mask
& CEPH_SETATTR_BTIME
) {
7527 req
->head
.args
.setattr
.btime
= utime_t(stx
->stx_btime
);
7528 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7530 if (mask
& CEPH_SETATTR_MTIME
) {
7531 req
->head
.args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
7532 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7535 if (mask
& CEPH_SETATTR_ATIME
) {
7536 req
->head
.args
.setattr
.atime
= utime_t(stx
->stx_atime
);
7537 req
->inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
7540 if (mask
& CEPH_SETATTR_SIZE
) {
7541 if ((uint64_t)stx
->stx_size
< mdsmap
->get_max_filesize()) {
7542 req
->head
.args
.setattr
.size
= stx
->stx_size
;
7543 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
7546 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
7547 return -CEPHFS_EFBIG
;
7549 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7552 req
->head
.args
.setattr
.mask
= mask
;
7554 req
->regetattr_mask
= mask
;
7556 int res
= make_request(req
, perms
, inp
);
7557 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
7561 /* Note that we only care about attrs that setattr cares about */
7562 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
7564 stx
->stx_size
= st
->st_size
;
7565 stx
->stx_mode
= st
->st_mode
;
7566 stx
->stx_uid
= st
->st_uid
;
7567 stx
->stx_gid
= st
->st_gid
;
7569 stx
->stx_mtime
= st
->st_mtimespec
;
7570 stx
->stx_atime
= st
->st_atimespec
;
7572 stx
->stx_mtime
.tv_sec
= st
->st_mtime
;
7573 stx
->stx_atime
.tv_sec
= st
->st_atime
;
7575 stx
->stx_mtime
= st
->st_mtim
;
7576 stx
->stx_atime
= st
->st_atim
;
7580 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7581 const UserPerm
& perms
, InodeRef
*inp
)
7583 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
7586 if (mask
& CEPH_SETATTR_MODE
)
7587 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
7591 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
7592 const UserPerm
& perms
)
7594 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
7595 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
7596 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
7597 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
7598 if (cct
->_conf
->client_permissions
) {
7599 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
7603 return __setattrx(in
.get(), stx
, mask
, perms
);
7606 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
7607 const UserPerm
& perms
)
7609 struct ceph_statx stx
;
7611 stat_to_statx(attr
, &stx
);
7612 mask
&= ~CEPH_SETATTR_BTIME
;
7614 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
7615 mask
&= ~CEPH_SETATTR_UID
;
7617 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
7618 mask
&= ~CEPH_SETATTR_GID
;
7621 return _setattrx(in
, &stx
, mask
, perms
);
7624 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
7625 const UserPerm
& perms
)
7627 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7628 if (!mref_reader
.is_state_satisfied())
7629 return -CEPHFS_ENOTCONN
;
7631 tout(cct
) << __func__
<< std::endl
;
7632 tout(cct
) << relpath
<< std::endl
;
7633 tout(cct
) << mask
<< std::endl
;
7635 filepath
path(relpath
);
7638 std::scoped_lock
lock(client_lock
);
7639 int r
= path_walk(path
, &in
, perms
);
7642 return _setattr(in
, attr
, mask
, perms
);
7645 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
7646 const UserPerm
& perms
, int flags
)
7648 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7649 if (!mref_reader
.is_state_satisfied())
7650 return -CEPHFS_ENOTCONN
;
7652 tout(cct
) << __func__
<< std::endl
;
7653 tout(cct
) << relpath
<< std::endl
;
7654 tout(cct
) << mask
<< std::endl
;
7656 filepath
path(relpath
);
7659 std::scoped_lock
lock(client_lock
);
7660 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
7663 return _setattrx(in
, stx
, mask
, perms
);
7666 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
7668 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7669 if (!mref_reader
.is_state_satisfied())
7670 return -CEPHFS_ENOTCONN
;
7672 tout(cct
) << __func__
<< std::endl
;
7673 tout(cct
) << fd
<< std::endl
;
7674 tout(cct
) << mask
<< std::endl
;
7676 std::scoped_lock
lock(client_lock
);
7677 Fh
*f
= get_filehandle(fd
);
7679 return -CEPHFS_EBADF
;
7680 #if defined(__linux__) && defined(O_PATH)
7681 if (f
->flags
& O_PATH
)
7682 return -CEPHFS_EBADF
;
7684 return _setattr(f
->inode
, attr
, mask
, perms
);
7687 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
7689 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7690 if (!mref_reader
.is_state_satisfied())
7691 return -CEPHFS_ENOTCONN
;
7693 tout(cct
) << __func__
<< std::endl
;
7694 tout(cct
) << fd
<< std::endl
;
7695 tout(cct
) << mask
<< std::endl
;
7697 std::scoped_lock
lock(client_lock
);
7698 Fh
*f
= get_filehandle(fd
);
7700 return -CEPHFS_EBADF
;
7701 #if defined(__linux__) && defined(O_PATH)
7702 if (f
->flags
& O_PATH
)
7703 return -CEPHFS_EBADF
;
7705 return _setattrx(f
->inode
, stx
, mask
, perms
);
7708 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
7709 frag_info_t
*dirstat
, int mask
)
7711 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7712 if (!mref_reader
.is_state_satisfied())
7713 return -CEPHFS_ENOTCONN
;
7715 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7716 tout(cct
) << "stat" << std::endl
;
7717 tout(cct
) << relpath
<< std::endl
;
7719 filepath
path(relpath
);
7722 std::scoped_lock
lock(client_lock
);
7723 int r
= path_walk(path
, &in
, perms
, true, mask
);
7726 r
= _getattr(in
, mask
, perms
);
7728 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7731 fill_stat(in
, stbuf
, dirstat
);
7732 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7736 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
7740 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7741 if (flags
& AT_NO_ATTR_SYNC
)
7744 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7745 mask
|= CEPH_CAP_PIN
;
7746 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7747 mask
|= CEPH_CAP_AUTH_SHARED
;
7748 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7749 mask
|= CEPH_CAP_LINK_SHARED
;
7750 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
7751 mask
|= CEPH_CAP_FILE_SHARED
;
7752 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
7753 mask
|= CEPH_CAP_XATTR_SHARED
;
7758 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
7759 const UserPerm
& perms
,
7760 unsigned int want
, unsigned int flags
)
7762 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7763 if (!mref_reader
.is_state_satisfied())
7764 return -CEPHFS_ENOTCONN
;
7766 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " want " << want
<< ")" << dendl
;
7767 tout(cct
) << "statx" << std::endl
;
7768 tout(cct
) << relpath
<< std::endl
;
7770 filepath
path(relpath
);
7773 unsigned mask
= statx_to_mask(flags
, want
);
7775 std::scoped_lock
lock(client_lock
);
7776 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
7780 r
= _getattr(in
, mask
, perms
);
7782 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7786 fill_statx(in
, mask
, stx
);
7787 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << stx
->stx_mask
<< ")" << dendl
;
7791 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
7792 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
7794 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7795 if (!mref_reader
.is_state_satisfied())
7796 return -CEPHFS_ENOTCONN
;
7798 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7799 tout(cct
) << __func__
<< std::endl
;
7800 tout(cct
) << relpath
<< std::endl
;
7802 filepath
path(relpath
);
7805 std::scoped_lock
lock(client_lock
);
7806 // don't follow symlinks
7807 int r
= path_walk(path
, &in
, perms
, false, mask
);
7810 r
= _getattr(in
, mask
, perms
);
7812 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7815 fill_stat(in
, stbuf
, dirstat
);
7816 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7820 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
7822 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7823 << " mode 0" << oct
<< in
->mode
<< dec
7824 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7825 memset(st
, 0, sizeof(struct stat
));
7826 if (use_faked_inos())
7827 st
->st_ino
= in
->faked_ino
;
7829 st
->st_ino
= in
->ino
;
7830 st
->st_dev
= in
->snapid
;
7831 st
->st_mode
= in
->mode
;
7832 st
->st_rdev
= in
->rdev
;
7834 switch (in
->nlink
) {
7836 st
->st_nlink
= 0; /* dir is unlinked */
7839 st
->st_nlink
= 1 /* parent dentry */
7841 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7847 st
->st_nlink
= in
->nlink
;
7849 st
->st_uid
= in
->uid
;
7850 st
->st_gid
= in
->gid
;
7851 if (in
->ctime
> in
->mtime
) {
7852 stat_set_ctime_sec(st
, in
->ctime
.sec());
7853 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
7855 stat_set_ctime_sec(st
, in
->mtime
.sec());
7856 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
7858 stat_set_atime_sec(st
, in
->atime
.sec());
7859 stat_set_atime_nsec(st
, in
->atime
.nsec());
7860 stat_set_mtime_sec(st
, in
->mtime
.sec());
7861 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
7863 if (cct
->_conf
->client_dirsize_rbytes
)
7864 st
->st_size
= in
->rstat
.rbytes
;
7866 st
->st_size
= in
->dirstat
.size();
7867 // The Windows "stat" structure provides just a subset of the fields that are
7868 // available on Linux.
7873 st
->st_size
= in
->size
;
7875 st
->st_blocks
= (in
->size
+ 511) >> 9;
7879 st
->st_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7883 *dirstat
= in
->dirstat
;
7887 return in
->caps_issued();
7890 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
7892 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7893 << " mode 0" << oct
<< in
->mode
<< dec
7894 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7895 memset(stx
, 0, sizeof(struct ceph_statx
));
7898 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7899 * so that all bits are set.
7904 /* These are always considered to be available */
7905 stx
->stx_dev
= in
->snapid
;
7906 stx
->stx_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7908 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7909 stx
->stx_mode
= S_IFMT
& in
->mode
;
7910 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (ino_t
)in
->ino
;
7911 stx
->stx_rdev
= in
->rdev
;
7912 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
7914 if (mask
& CEPH_CAP_AUTH_SHARED
) {
7915 stx
->stx_uid
= in
->uid
;
7916 stx
->stx_gid
= in
->gid
;
7917 stx
->stx_mode
= in
->mode
;
7918 in
->btime
.to_timespec(&stx
->stx_btime
);
7919 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
7922 if (mask
& CEPH_CAP_LINK_SHARED
) {
7924 switch (in
->nlink
) {
7926 stx
->stx_nlink
= 0; /* dir is unlinked */
7929 stx
->stx_nlink
= 1 /* parent dentry */
7931 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7937 stx
->stx_nlink
= in
->nlink
;
7939 stx
->stx_mask
|= CEPH_STATX_NLINK
;
7942 if (mask
& CEPH_CAP_FILE_SHARED
) {
7944 in
->atime
.to_timespec(&stx
->stx_atime
);
7945 in
->mtime
.to_timespec(&stx
->stx_mtime
);
7948 if (cct
->_conf
->client_dirsize_rbytes
)
7949 stx
->stx_size
= in
->rstat
.rbytes
;
7951 stx
->stx_size
= in
->dirstat
.size();
7952 stx
->stx_blocks
= 1;
7954 stx
->stx_size
= in
->size
;
7955 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
7957 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
7958 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
7961 /* Change time and change_attr both require all shared caps to view */
7962 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
7963 stx
->stx_version
= in
->change_attr
;
7964 if (in
->ctime
> in
->mtime
)
7965 in
->ctime
.to_timespec(&stx
->stx_ctime
);
7967 in
->mtime
.to_timespec(&stx
->stx_ctime
);
7968 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
7973 void Client::touch_dn(Dentry
*dn
)
7978 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7980 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7981 if (!mref_reader
.is_state_satisfied())
7982 return -CEPHFS_ENOTCONN
;
7984 tout(cct
) << __func__
<< std::endl
;
7985 tout(cct
) << relpath
<< std::endl
;
7986 tout(cct
) << mode
<< std::endl
;
7988 filepath
path(relpath
);
7991 std::scoped_lock
lock(client_lock
);
7992 int r
= path_walk(path
, &in
, perms
);
7996 attr
.st_mode
= mode
;
7997 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
8000 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
8002 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8003 if (!mref_reader
.is_state_satisfied())
8004 return -CEPHFS_ENOTCONN
;
8006 tout(cct
) << __func__
<< std::endl
;
8007 tout(cct
) << fd
<< std::endl
;
8008 tout(cct
) << mode
<< std::endl
;
8010 std::scoped_lock
lock(client_lock
);
8011 Fh
*f
= get_filehandle(fd
);
8013 return -CEPHFS_EBADF
;
8014 #if defined(__linux__) && defined(O_PATH)
8015 if (f
->flags
& O_PATH
)
8016 return -CEPHFS_EBADF
;
8019 attr
.st_mode
= mode
;
8020 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
8023 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
8025 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8026 if (!mref_reader
.is_state_satisfied())
8027 return -CEPHFS_ENOTCONN
;
8029 tout(cct
) << __func__
<< std::endl
;
8030 tout(cct
) << relpath
<< std::endl
;
8031 tout(cct
) << mode
<< std::endl
;
8033 filepath
path(relpath
);
8036 std::scoped_lock
lock(client_lock
);
8037 // don't follow symlinks
8038 int r
= path_walk(path
, &in
, perms
, false);
8042 attr
.st_mode
= mode
;
8043 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
8046 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8047 const UserPerm
& perms
)
8049 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8050 if (!mref_reader
.is_state_satisfied())
8051 return -CEPHFS_ENOTCONN
;
8053 tout(cct
) << __func__
<< std::endl
;
8054 tout(cct
) << relpath
<< std::endl
;
8055 tout(cct
) << new_uid
<< std::endl
;
8056 tout(cct
) << new_gid
<< std::endl
;
8058 filepath
path(relpath
);
8061 std::scoped_lock
lock(client_lock
);
8062 int r
= path_walk(path
, &in
, perms
);
8066 attr
.st_uid
= new_uid
;
8067 attr
.st_gid
= new_gid
;
8068 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
8071 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
8073 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8074 if (!mref_reader
.is_state_satisfied())
8075 return -CEPHFS_ENOTCONN
;
8077 tout(cct
) << __func__
<< std::endl
;
8078 tout(cct
) << fd
<< std::endl
;
8079 tout(cct
) << new_uid
<< std::endl
;
8080 tout(cct
) << new_gid
<< std::endl
;
8082 std::scoped_lock
lock(client_lock
);
8083 Fh
*f
= get_filehandle(fd
);
8085 return -CEPHFS_EBADF
;
8086 #if defined(__linux__) && defined(O_PATH)
8087 if (f
->flags
& O_PATH
)
8088 return -CEPHFS_EBADF
;
8091 attr
.st_uid
= new_uid
;
8092 attr
.st_gid
= new_gid
;
8094 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
8095 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
8096 return _setattr(f
->inode
, &attr
, mask
, perms
);
8099 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8100 const UserPerm
& perms
)
8102 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8103 if (!mref_reader
.is_state_satisfied())
8104 return -CEPHFS_ENOTCONN
;
8106 tout(cct
) << __func__
<< std::endl
;
8107 tout(cct
) << relpath
<< std::endl
;
8108 tout(cct
) << new_uid
<< std::endl
;
8109 tout(cct
) << new_gid
<< std::endl
;
8111 filepath
path(relpath
);
8114 std::scoped_lock
lock(client_lock
);
8115 // don't follow symlinks
8116 int r
= path_walk(path
, &in
, perms
, false);
8120 attr
.st_uid
= new_uid
;
8121 attr
.st_gid
= new_gid
;
8123 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
8124 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
8125 return _setattr(in
, &attr
, mask
, perms
);
8128 static void attr_set_atime_and_mtime(struct stat
*attr
,
8129 const utime_t
&atime
,
8130 const utime_t
&mtime
)
8132 stat_set_atime_sec(attr
, atime
.tv
.tv_sec
);
8133 stat_set_atime_nsec(attr
, atime
.tv
.tv_nsec
);
8134 stat_set_mtime_sec(attr
, mtime
.tv
.tv_sec
);
8135 stat_set_mtime_nsec(attr
, mtime
.tv
.tv_nsec
);
8138 // for [l]utime() invoke the timeval variant as the timespec
8139 // variant are not yet implemented. for futime[s](), invoke
8140 // the timespec variant.
8141 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
8142 const UserPerm
& perms
)
8144 struct timeval tv
[2];
8145 tv
[0].tv_sec
= buf
->actime
;
8147 tv
[1].tv_sec
= buf
->modtime
;
8150 return utimes(relpath
, tv
, perms
);
8153 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
8154 const UserPerm
& perms
)
8156 struct timeval tv
[2];
8157 tv
[0].tv_sec
= buf
->actime
;
8159 tv
[1].tv_sec
= buf
->modtime
;
8162 return lutimes(relpath
, tv
, perms
);
8165 int Client::futime(int fd
, struct utimbuf
*buf
, const UserPerm
& perms
)
8167 struct timespec ts
[2];
8168 ts
[0].tv_sec
= buf
->actime
;
8170 ts
[1].tv_sec
= buf
->modtime
;
8173 return futimens(fd
, ts
, perms
);
8176 int Client::utimes(const char *relpath
, struct timeval times
[2],
8177 const UserPerm
& perms
)
8179 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8180 if (!mref_reader
.is_state_satisfied())
8181 return -CEPHFS_ENOTCONN
;
8183 tout(cct
) << __func__
<< std::endl
;
8184 tout(cct
) << relpath
<< std::endl
;
8185 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
8187 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
8190 filepath
path(relpath
);
8193 std::scoped_lock
lock(client_lock
);
8194 int r
= path_walk(path
, &in
, perms
);
8198 utime_t
atime(times
[0]);
8199 utime_t
mtime(times
[1]);
8201 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8202 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8205 int Client::lutimes(const char *relpath
, struct timeval times
[2],
8206 const UserPerm
& perms
)
8208 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8209 if (!mref_reader
.is_state_satisfied())
8210 return -CEPHFS_ENOTCONN
;
8212 tout(cct
) << __func__
<< std::endl
;
8213 tout(cct
) << relpath
<< std::endl
;
8214 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
8216 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
8219 filepath
path(relpath
);
8222 std::scoped_lock
lock(client_lock
);
8223 int r
= path_walk(path
, &in
, perms
, false);
8227 utime_t
atime(times
[0]);
8228 utime_t
mtime(times
[1]);
8230 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8231 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8234 int Client::futimes(int fd
, struct timeval times
[2], const UserPerm
& perms
)
8236 struct timespec ts
[2];
8237 ts
[0].tv_sec
= times
[0].tv_sec
;
8238 ts
[0].tv_nsec
= times
[0].tv_usec
* 1000;
8239 ts
[1].tv_sec
= times
[1].tv_sec
;
8240 ts
[1].tv_nsec
= times
[1].tv_usec
* 1000;
8242 return futimens(fd
, ts
, perms
);
8245 int Client::futimens(int fd
, struct timespec times
[2], const UserPerm
& perms
)
8247 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8248 if (!mref_reader
.is_state_satisfied())
8249 return -CEPHFS_ENOTCONN
;
8251 tout(cct
) << __func__
<< std::endl
;
8252 tout(cct
) << fd
<< std::endl
;
8253 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
8255 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
8258 std::scoped_lock
lock(client_lock
);
8259 Fh
*f
= get_filehandle(fd
);
8261 return -CEPHFS_EBADF
;
8262 #if defined(__linux__) && defined(O_PATH)
8263 if (f
->flags
& O_PATH
)
8264 return -CEPHFS_EBADF
;
8267 utime_t
atime(times
[0]);
8268 utime_t
mtime(times
[1]);
8270 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8271 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8274 int Client::flock(int fd
, int operation
, uint64_t owner
)
8276 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8277 if (!mref_reader
.is_state_satisfied())
8278 return -CEPHFS_ENOTCONN
;
8280 tout(cct
) << __func__
<< std::endl
;
8281 tout(cct
) << fd
<< std::endl
;
8282 tout(cct
) << operation
<< std::endl
;
8283 tout(cct
) << owner
<< std::endl
;
8285 std::scoped_lock
lock(client_lock
);
8286 Fh
*f
= get_filehandle(fd
);
8288 return -CEPHFS_EBADF
;
8290 return _flock(f
, operation
, owner
);
8293 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
8295 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8296 if (!mref_reader
.is_state_satisfied())
8297 return -CEPHFS_ENOTCONN
;
8299 tout(cct
) << __func__
<< std::endl
;
8300 tout(cct
) << relpath
<< std::endl
;
8302 filepath
path(relpath
);
8305 std::scoped_lock
lock(client_lock
);
8306 int r
= path_walk(path
, &in
, perms
, true);
8309 if (cct
->_conf
->client_permissions
) {
8310 int r
= may_open(in
.get(), O_RDONLY
, perms
);
8314 r
= _opendir(in
.get(), dirpp
, perms
);
8315 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8316 if (r
!= -CEPHFS_ENOTDIR
)
8317 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
8321 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
8324 return -CEPHFS_ENOTDIR
;
8325 *dirpp
= new dir_result_t(in
, perms
);
8326 opened_dirs
.insert(*dirpp
);
8327 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
8332 int Client::closedir(dir_result_t
*dir
)
8334 tout(cct
) << __func__
<< std::endl
;
8335 tout(cct
) << (uintptr_t)dir
<< std::endl
;
8337 ldout(cct
, 3) << __func__
<< "(" << dir
<< ") = 0" << dendl
;
8338 std::scoped_lock
lock(client_lock
);
8343 void Client::_closedir(dir_result_t
*dirp
)
8345 ldout(cct
, 10) << __func__
<< "(" << dirp
<< ")" << dendl
;
8348 ldout(cct
, 10) << __func__
<< " detaching inode " << dirp
->inode
<< dendl
;
8349 dirp
->inode
.reset();
8351 _readdir_drop_dirp_buffer(dirp
);
8352 opened_dirs
.erase(dirp
);
8356 void Client::rewinddir(dir_result_t
*dirp
)
8358 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ")" << dendl
;
8360 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8361 if (!mref_reader
.is_state_satisfied())
8364 std::scoped_lock
lock(client_lock
);
8365 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
8366 _readdir_drop_dirp_buffer(d
);
8370 loff_t
Client::telldir(dir_result_t
*dirp
)
8372 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
8373 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ") = " << d
->offset
<< dendl
;
8377 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
8379 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ", " << offset
<< ")" << dendl
;
8381 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8382 if (!mref_reader
.is_state_satisfied())
8385 std::scoped_lock
lock(client_lock
);
8387 if (offset
== dirp
->offset
)
8390 if (offset
> dirp
->offset
)
8391 dirp
->release_count
= 0; // bump if we do a forward seek
8393 dirp
->ordered_count
= 0; // disable filling readdir cache
8395 if (dirp
->hash_order()) {
8396 if (dirp
->offset
> offset
) {
8397 _readdir_drop_dirp_buffer(dirp
);
8402 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
8403 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
8404 _readdir_drop_dirp_buffer(dirp
);
8409 dirp
->offset
= offset
;
8414 // ino_t d_ino; /* inode number */
8415 // off_t d_off; /* offset to the next dirent */
8416 // unsigned short d_reclen; /* length of this record */
8417 // unsigned char d_type; /* type of file */
8418 // char d_name[256]; /* filename */
8420 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
8422 strncpy(de
->d_name
, name
, 255);
8423 de
->d_name
[255] = '\0';
8424 #if !defined(__CYGWIN__) && !(defined(_WIN32))
8426 #if !defined(__APPLE__) && !defined(__FreeBSD__)
8427 de
->d_off
= next_off
;
8430 de
->d_type
= IFTODT(type
);
8431 ldout(cct
, 10) << __func__
<< " '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
8432 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
8436 void Client::_readdir_next_frag(dir_result_t
*dirp
)
8438 frag_t fg
= dirp
->buffer_frag
;
8440 if (fg
.is_rightmost()) {
8441 ldout(cct
, 10) << __func__
<< " advance from " << fg
<< " to END" << dendl
;
8448 ldout(cct
, 10) << __func__
<< " advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
8450 if (dirp
->hash_order()) {
8452 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
8453 if (dirp
->offset
< new_offset
) // don't decrease offset
8454 dirp
->offset
= new_offset
;
8456 dirp
->last_name
.clear();
8457 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8458 _readdir_rechoose_frag(dirp
);
8462 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
8464 ceph_assert(dirp
->inode
);
8466 if (dirp
->hash_order())
8469 frag_t cur
= frag_t(dirp
->offset_high());
8470 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
8472 ldout(cct
, 10) << __func__
<< " frag " << cur
<< " maps to " << fg
<< dendl
;
8473 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8474 dirp
->last_name
.clear();
8475 dirp
->next_offset
= 2;
8479 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
8481 ldout(cct
, 10) << __func__
<< " " << dirp
<< dendl
;
8482 dirp
->buffer
.clear();
8485 int Client::_readdir_get_frag(dir_result_t
*dirp
)
8488 ceph_assert(dirp
->inode
);
8490 // get the current frag.
8492 if (dirp
->hash_order())
8493 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
8495 fg
= frag_t(dirp
->offset_high());
8497 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
8498 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
8500 int op
= CEPH_MDS_OP_READDIR
;
8501 if (dirp
->inode
&& dirp
->inode
->snapid
== CEPH_SNAPDIR
)
8502 op
= CEPH_MDS_OP_LSSNAP
;
8504 InodeRef
& diri
= dirp
->inode
;
8506 MetaRequest
*req
= new MetaRequest(op
);
8508 diri
->make_nosnap_relative_path(path
);
8509 req
->set_filepath(path
);
8510 req
->set_inode(diri
.get());
8511 req
->head
.args
.readdir
.frag
= fg
;
8512 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
8513 if (dirp
->last_name
.length()) {
8514 req
->path2
.set_path(dirp
->last_name
);
8515 } else if (dirp
->hash_order()) {
8516 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
8521 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
8523 if (res
== -CEPHFS_EAGAIN
) {
8524 ldout(cct
, 10) << __func__
<< " got EAGAIN, retrying" << dendl
;
8525 _readdir_rechoose_frag(dirp
);
8526 return _readdir_get_frag(dirp
);
8530 ldout(cct
, 10) << __func__
<< " " << dirp
<< " got frag " << dirp
->buffer_frag
8531 << " size " << dirp
->buffer
.size() << dendl
;
8533 ldout(cct
, 10) << __func__
<< " got error " << res
<< ", setting end flag" << dendl
;
8540 struct dentry_off_lt
{
8541 bool operator()(const Dentry
* dn
, int64_t off
) const {
8542 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
8546 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
8547 int caps
, bool getref
)
8549 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
8550 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
8551 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
8553 Dir
*dir
= dirp
->inode
->dir
;
8556 ldout(cct
, 10) << " dir is empty" << dendl
;
8561 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
8562 dir
->readdir_cache
.end(),
8563 dirp
->offset
, dentry_off_lt());
8568 if (!dirp
->inode
->is_complete_and_ordered())
8569 return -CEPHFS_EAGAIN
;
8570 if (pd
== dir
->readdir_cache
.end())
8573 if (dn
->inode
== NULL
) {
8574 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
8578 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
8579 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
8584 int idx
= pd
- dir
->readdir_cache
.begin();
8585 if (dn
->inode
->is_dir()) {
8586 mask
|= CEPH_STAT_RSTAT
;
8588 int r
= _getattr(dn
->inode
, mask
, dirp
->perms
);
8592 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
8593 pd
= dir
->readdir_cache
.begin() + idx
;
8594 if (pd
>= dir
->readdir_cache
.end() || *pd
!= dn
)
8595 return -CEPHFS_EAGAIN
;
8597 struct ceph_statx stx
;
8599 fill_statx(dn
->inode
, caps
, &stx
);
8601 uint64_t next_off
= dn
->offset
+ 1;
8602 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8604 if (pd
== dir
->readdir_cache
.end())
8605 next_off
= dir_result_t::END
;
8609 in
= dn
->inode
.get();
8613 dn_name
= dn
->name
; // fill in name while we have lock
8615 client_lock
.unlock();
8616 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
8618 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
8619 << " = " << r
<< dendl
;
8624 dirp
->offset
= next_off
;
8626 dirp
->next_offset
= 2;
8628 dirp
->next_offset
= dirp
->offset_low();
8629 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
8630 dirp
->release_count
= 0; // last_name no longer match cache index
8635 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
8640 int Client::readdir_r_cb(dir_result_t
*d
, add_dirent_cb_t cb
, void *p
,
8641 unsigned want
, unsigned flags
, bool getref
)
8643 int caps
= statx_to_mask(flags
, want
);
8645 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8646 if (!mref_reader
.is_state_satisfied())
8647 return -CEPHFS_ENOTCONN
;
8649 std::unique_lock
cl(client_lock
);
8651 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
8653 ldout(cct
, 10) << __func__
<< " " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
8654 << dec
<< " at_end=" << dirp
->at_end()
8655 << " hash_order=" << dirp
->hash_order() << dendl
;
8658 struct ceph_statx stx
;
8659 memset(&de
, 0, sizeof(de
));
8660 memset(&stx
, 0, sizeof(stx
));
8662 InodeRef
& diri
= dirp
->inode
;
8667 if (dirp
->offset
== 0) {
8668 ldout(cct
, 15) << " including ." << dendl
;
8669 ceph_assert(diri
->dentries
.size() < 2); // can't have multiple hard-links to a dir
8670 uint64_t next_off
= 1;
8673 r
= _getattr(diri
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
8677 fill_statx(diri
, caps
, &stx
);
8678 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
8680 Inode
*inode
= NULL
;
8687 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8692 dirp
->offset
= next_off
;
8696 if (dirp
->offset
== 1) {
8697 ldout(cct
, 15) << " including .." << dendl
;
8698 uint64_t next_off
= 2;
8700 if (diri
->dentries
.empty())
8703 in
= diri
->get_first_parent()->dir
->parent_inode
;
8706 r
= _getattr(in
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
8710 fill_statx(in
, caps
, &stx
);
8711 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
8713 Inode
*inode
= NULL
;
8720 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8725 dirp
->offset
= next_off
;
8730 // can we read from our cache?
8731 ldout(cct
, 10) << "offset " << hex
<< dirp
->offset
<< dec
8732 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
8733 << dirp
->inode
->is_complete_and_ordered()
8734 << " issued " << ccap_string(dirp
->inode
->caps_issued())
8736 if (dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
8737 dirp
->inode
->is_complete_and_ordered() &&
8738 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
8739 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
8740 if (err
!= -CEPHFS_EAGAIN
)
8748 bool check_caps
= true;
8749 if (!dirp
->is_cached()) {
8750 int r
= _readdir_get_frag(dirp
);
8753 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
8754 // different than the requested one. (our dirfragtree was outdated)
8757 frag_t fg
= dirp
->buffer_frag
;
8759 ldout(cct
, 10) << "frag " << fg
<< " buffer size " << dirp
->buffer
.size()
8760 << " offset " << hex
<< dirp
->offset
<< dendl
;
8762 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
8763 dirp
->offset
, dir_result_t::dentry_off_lt());
8764 it
!= dirp
->buffer
.end();
8766 dir_result_t::dentry
&entry
= *it
;
8768 uint64_t next_off
= entry
.offset
+ 1;
8773 if(entry
.inode
->is_dir()){
8774 mask
|= CEPH_STAT_RSTAT
;
8776 r
= _getattr(entry
.inode
, mask
, dirp
->perms
);
8781 fill_statx(entry
.inode
, caps
, &stx
);
8782 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8784 Inode
*inode
= NULL
;
8786 inode
= entry
.inode
.get();
8791 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
8794 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
8795 << " = " << r
<< dendl
;
8799 dirp
->offset
= next_off
;
8804 if (dirp
->next_offset
> 2) {
8805 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
8806 _readdir_drop_dirp_buffer(dirp
);
8810 if (!fg
.is_rightmost()) {
8812 _readdir_next_frag(dirp
);
8816 if (diri
->shared_gen
== dirp
->start_shared_gen
&&
8817 diri
->dir_release_count
== dirp
->release_count
) {
8818 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
8819 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
8821 ceph_assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
8822 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
8824 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
8826 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
8827 diri
->flags
|= I_COMPLETE
;
8839 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
8841 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
8848 * 1 if we got a dirent
8849 * 0 for end of directory
8853 struct single_readdir
{
8855 struct ceph_statx
*stx
;
8860 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
8861 struct ceph_statx
*stx
, off_t off
,
8864 single_readdir
*c
= static_cast<single_readdir
*>(p
);
8867 return -1; // already filled this dirent
8877 struct dirent
*Client::readdir(dir_result_t
*d
)
8887 // our callback fills the dirent and sets sr.full=true on first
8888 // call, and returns -1 the second time around.
8889 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
8891 errno
= -ret
; // this sucks.
8892 return (dirent
*) NULL
;
8897 return (dirent
*) NULL
;
8900 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
8901 struct ceph_statx
*stx
, unsigned want
,
8902 unsigned flags
, Inode
**out
)
8910 // our callback fills the dirent and sets sr.full=true on first
8911 // call, and returns -1 the second time around.
8912 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
8924 struct getdents_result
{
8931 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
8932 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8934 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
8940 dlen
= strlen(de
->d_name
) + 1;
8942 if (c
->pos
+ dlen
> c
->buflen
)
8943 return -1; // doesn't fit
8946 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
8948 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
8954 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
8959 gr
.fullent
= fullent
;
8962 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
8964 if (r
< 0) { // some error
8965 if (r
== -1) { // buffer ran out of space
8966 if (gr
.pos
) { // but we got some entries already!
8968 } // or we need a larger buffer
8969 return -CEPHFS_ERANGE
;
8970 } else { // actual error, return it
8979 struct getdir_result
{
8980 list
<string
> *contents
;
8984 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8986 getdir_result
*r
= static_cast<getdir_result
*>(p
);
8988 r
->contents
->push_back(de
->d_name
);
8993 int Client::getdir(const char *relpath
, list
<string
>& contents
,
8994 const UserPerm
& perms
)
8996 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
8997 tout(cct
) << "getdir" << std::endl
;
8998 tout(cct
) << relpath
<< std::endl
;
9001 int r
= opendir(relpath
, &d
, perms
);
9006 gr
.contents
= &contents
;
9008 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
9018 /****** file i/o **********/
9019 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
9020 mode_t mode
, int stripe_unit
, int stripe_count
,
9021 int object_size
, const char *data_pool
, std::string alternate_name
)
9023 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9024 if (!mref_reader
.is_state_satisfied())
9025 return -CEPHFS_ENOTCONN
;
9027 int cflags
= ceph_flags_sys2wire(flags
);
9029 ldout(cct
, 3) << "open enter(" << relpath
<< ", " << cflags
<< "," << mode
<< ")" << dendl
;
9030 tout(cct
) << "open" << std::endl
;
9031 tout(cct
) << relpath
<< std::endl
;
9032 tout(cct
) << cflags
<< std::endl
;
9036 #if defined(__linux__) && defined(O_PATH)
9037 /* When the O_PATH is being specified, others flags than O_DIRECTORY
9038 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
9039 * in kernel (fs/open.c). */
9041 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
9044 filepath
path(relpath
);
9046 bool created
= false;
9047 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
9048 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
9049 int mask
= ceph_caps_for_mode(ceph_flags_to_mode(cflags
));
9051 std::scoped_lock
lock(client_lock
);
9052 int r
= path_walk(path
, &in
, perms
, followsym
, mask
);
9054 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
9055 return -CEPHFS_EEXIST
;
9057 #if defined(__linux__) && defined(O_PATH)
9058 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
9060 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
9062 return -CEPHFS_ELOOP
;
9064 if (r
== -CEPHFS_ENOENT
&& (flags
& O_CREAT
)) {
9065 filepath dirpath
= path
;
9066 string dname
= dirpath
.last_dentry();
9067 dirpath
.pop_dentry();
9069 r
= path_walk(dirpath
, &dir
, perms
, true,
9070 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0);
9073 if (cct
->_conf
->client_permissions
) {
9074 r
= may_create(dir
.get(), perms
);
9078 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
9079 stripe_count
, object_size
, data_pool
, &created
, perms
,
9080 std::move(alternate_name
));
9086 // posix says we can only check permissions of existing files
9087 if (cct
->_conf
->client_permissions
) {
9088 r
= may_open(in
.get(), flags
, perms
);
9095 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
9097 // allocate a integer file descriptor
9100 ceph_assert(fd_map
.count(r
) == 0);
9105 tout(cct
) << r
<< std::endl
;
9106 ldout(cct
, 3) << "open exit(" << path
<< ", " << cflags
<< ") = " << r
<< dendl
;
9110 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
9111 const UserPerm
& perms
)
9113 ldout(cct
, 3) << __func__
<< " enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
9115 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9116 if (!mref_reader
.is_state_satisfied())
9117 return -CEPHFS_ENOTCONN
;
9119 std::scoped_lock
lock(client_lock
);
9120 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
9122 req
->set_filepath(path
);
9124 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
9126 sprintf(f
, "%u", h
);
9127 filepath
path2(dirino
);
9128 path2
.push_dentry(string(f
));
9129 req
->set_filepath2(path2
);
9131 int r
= make_request(req
, perms
, NULL
, NULL
,
9132 rand() % mdsmap
->get_num_in_mds());
9133 ldout(cct
, 3) << __func__
<< " exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
9139 * Load inode into local cache.
9141 * If inode pointer is non-NULL, and take a reference on
9142 * the resulting Inode object in one operation, so that caller
9143 * can safely assume inode will still be there after return.
9145 int Client::_lookup_vino(vinodeno_t vino
, const UserPerm
& perms
, Inode
**inode
)
9147 ldout(cct
, 8) << __func__
<< " enter(" << vino
<< ")" << dendl
;
9149 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9150 if (!mref_reader
.is_state_satisfied())
9151 return -CEPHFS_ENOTCONN
;
9153 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPINO
);
9154 filepath
path(vino
.ino
);
9155 req
->set_filepath(path
);
9158 * The MDS expects either a "real" snapid here or 0. The special value
9159 * carveouts for the snapid are all at the end of the range so we can
9160 * just look for any snapid below this value.
9162 if (vino
.snapid
< CEPH_NOSNAP
)
9163 req
->head
.args
.lookupino
.snapid
= vino
.snapid
;
9165 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
9166 if (r
== 0 && inode
!= NULL
) {
9167 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
9168 ceph_assert(p
!= inode_map
.end());
9172 ldout(cct
, 8) << __func__
<< " exit(" << vino
<< ") = " << r
<< dendl
;
9176 int Client::lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
9178 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
9179 std::scoped_lock
lock(client_lock
);
9180 return _lookup_vino(vino
, perms
, inode
);
9184 * Find the parent inode of `ino` and insert it into
9185 * our cache. Conditionally also set `parent` to a referenced
9186 * Inode* if caller provides non-NULL value.
9188 int Client::_lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
9190 ldout(cct
, 8) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
9192 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
9193 filepath
path(ino
->ino
);
9194 req
->set_filepath(path
);
9197 int r
= make_request(req
, perms
, &target
, NULL
, rand() % mdsmap
->get_num_in_mds());
9198 // Give caller a reference to the parent ino if they provided a pointer.
9199 if (parent
!= NULL
) {
9201 *parent
= target
.get();
9203 ldout(cct
, 8) << __func__
<< " found parent " << (*parent
)->ino
<< dendl
;
9208 ldout(cct
, 8) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
9213 * Populate the parent dentry for `ino`, provided it is
9214 * a child of `parent`.
9216 int Client::_lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
9218 ceph_assert(parent
->is_dir());
9219 ldout(cct
, 3) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
9221 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9222 if (!mref_reader
.is_state_satisfied())
9223 return -CEPHFS_ENOTCONN
;
9225 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
9226 req
->set_filepath2(filepath(parent
->ino
));
9227 req
->set_filepath(filepath(ino
->ino
));
9228 req
->set_inode(ino
);
9230 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
9231 ldout(cct
, 3) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
9235 int Client::lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
9237 std::scoped_lock
lock(client_lock
);
9238 return _lookup_name(ino
, parent
, perms
);
9241 Fh
*Client::_create_fh(Inode
*in
, int flags
, int cmode
, const UserPerm
& perms
)
9244 Fh
*f
= new Fh(in
, flags
, cmode
, fd_gen
, perms
);
9246 ldout(cct
, 10) << __func__
<< " " << in
->ino
<< " mode " << cmode
<< dendl
;
9248 if (in
->snapid
!= CEPH_NOSNAP
) {
9249 in
->snap_cap_refs
++;
9250 ldout(cct
, 5) << "open success, fh is " << f
<< " combined IMMUTABLE SNAP caps "
9251 << ccap_string(in
->caps_issued()) << dendl
;
9254 const auto& conf
= cct
->_conf
;
9255 f
->readahead
.set_trigger_requests(1);
9256 f
->readahead
.set_min_readahead_size(conf
->client_readahead_min
);
9257 uint64_t max_readahead
= Readahead::NO_LIMIT
;
9258 if (conf
->client_readahead_max_bytes
) {
9259 max_readahead
= std::min(max_readahead
, (uint64_t)conf
->client_readahead_max_bytes
);
9261 if (conf
->client_readahead_max_periods
) {
9262 max_readahead
= std::min(max_readahead
, in
->layout
.get_period()*(uint64_t)conf
->client_readahead_max_periods
);
9264 f
->readahead
.set_max_readahead_size(max_readahead
);
9265 vector
<uint64_t> alignments
;
9266 alignments
.push_back(in
->layout
.get_period());
9267 alignments
.push_back(in
->layout
.stripe_unit
);
9268 f
->readahead
.set_alignments(alignments
);
9273 int Client::_release_fh(Fh
*f
)
9275 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
9276 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
9277 Inode
*in
= f
->inode
.get();
9278 ldout(cct
, 8) << __func__
<< " " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
9282 if (in
->snapid
== CEPH_NOSNAP
) {
9283 if (in
->put_open_ref(f
->mode
)) {
9284 _flush(in
, new C_Client_FlushComplete(this, in
));
9288 ceph_assert(in
->snap_cap_refs
> 0);
9289 in
->snap_cap_refs
--;
9292 _release_filelocks(f
);
9294 // Finally, read any async err (i.e. from flushes)
9295 int err
= f
->take_async_err();
9297 ldout(cct
, 1) << __func__
<< " " << f
<< " on inode " << *in
<< " caught async_err = "
9298 << cpp_strerror(err
) << dendl
;
9300 ldout(cct
, 10) << __func__
<< " " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
9308 void Client::_put_fh(Fh
*f
)
9310 int left
= f
->put();
9316 int Client::_open(Inode
*in
, int flags
, mode_t mode
, Fh
**fhp
,
9317 const UserPerm
& perms
)
9319 if (in
->snapid
!= CEPH_NOSNAP
&&
9320 (flags
& (O_WRONLY
| O_RDWR
| O_CREAT
| O_TRUNC
| O_APPEND
))) {
9321 return -CEPHFS_EROFS
;
9324 // use normalized flags to generate cmode
9325 int cflags
= ceph_flags_sys2wire(flags
);
9326 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
9327 cflags
|= CEPH_O_LAZY
;
9329 int cmode
= ceph_flags_to_mode(cflags
);
9330 int want
= ceph_caps_for_mode(cmode
);
9333 in
->get_open_ref(cmode
); // make note of pending open, since it effects _wanted_ caps.
9335 if ((flags
& O_TRUNC
) == 0 && in
->caps_issued_mask(want
)) {
9337 check_caps(in
, CHECK_CAPS_NODELAY
);
9340 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
9342 in
->make_nosnap_relative_path(path
);
9343 req
->set_filepath(path
);
9344 req
->head
.args
.open
.flags
= cflags
& ~CEPH_O_CREAT
;
9345 req
->head
.args
.open
.mode
= mode
;
9346 req
->head
.args
.open
.pool
= -1;
9347 if (cct
->_conf
->client_debug_getattr_caps
)
9348 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
9350 req
->head
.args
.open
.mask
= 0;
9351 req
->head
.args
.open
.old_size
= in
->size
; // for O_TRUNC
9353 result
= make_request(req
, perms
);
9356 * NFS expects that delegations will be broken on a conflicting open,
9357 * not just when there is actual conflicting access to the file. SMB leases
9358 * and oplocks also have similar semantics.
9360 * Ensure that clients that have delegations enabled will wait on minimal
9361 * caps during open, just to ensure that other clients holding delegations
9362 * return theirs first.
9364 if (deleg_timeout
&& result
== 0) {
9367 if (cmode
& CEPH_FILE_MODE_WR
)
9368 need
|= CEPH_CAP_FILE_WR
;
9369 if (cmode
& CEPH_FILE_MODE_RD
)
9370 need
|= CEPH_CAP_FILE_RD
;
9372 Fh
fh(in
, flags
, cmode
, fd_gen
, perms
);
9373 result
= get_caps(&fh
, need
, want
, &have
, -1);
9375 ldout(cct
, 8) << "Unable to get caps after open of inode " << *in
<<
9376 " . Denying open: " <<
9377 cpp_strerror(result
) << dendl
;
9379 put_cap_ref(in
, need
);
9387 *fhp
= _create_fh(in
, flags
, cmode
, perms
);
9389 in
->put_open_ref(cmode
);
9397 int Client::_renew_caps(Inode
*in
)
9399 int wanted
= in
->caps_file_wanted();
9400 if (in
->is_any_caps() &&
9401 ((wanted
& CEPH_CAP_ANY_WR
) == 0 || in
->auth_cap
)) {
9402 check_caps(in
, CHECK_CAPS_NODELAY
);
9407 if ((wanted
& CEPH_CAP_FILE_RD
) && (wanted
& CEPH_CAP_FILE_WR
))
9409 else if (wanted
& CEPH_CAP_FILE_RD
)
9411 else if (wanted
& CEPH_CAP_FILE_WR
)
9414 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
9416 in
->make_nosnap_relative_path(path
);
9417 req
->set_filepath(path
);
9418 req
->head
.args
.open
.flags
= flags
;
9419 req
->head
.args
.open
.pool
= -1;
9420 if (cct
->_conf
->client_debug_getattr_caps
)
9421 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
9423 req
->head
.args
.open
.mask
= 0;
9426 // duplicate in case Cap goes away; not sure if that race is a concern?
9427 const UserPerm
*pperm
= in
->get_best_perms();
9431 int ret
= make_request(req
, perms
);
// Close file descriptor @fd: look up its Fh under client_lock and
// release it via _release_fh(), which also surfaces any async I/O error.
// NOTE(review): several original lines (9436, 9440, 9444, 9447,
// 9450-9451, 9453+) are missing from this chunk, including the EBADF
// guard's condition and the final return -- comments below cover only
// what is visible.
9435 int Client::close(int fd
)
// Refuse the call unless the client is at least in the MOUNTING state.
9437 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9438 if (!mref_reader
.is_state_satisfied())
9439 return -CEPHFS_ENOTCONN
;
9441 ldout(cct
, 3) << "close enter(" << fd
<< ")" << dendl
;
// Record the operation and its fd in the client trace log.
9442 tout(cct
) << "close" << std::endl
;
9443 tout(cct
) << fd
<< std::endl
;
// All fd-table / Fh manipulation happens under the global client lock.
9445 std::scoped_lock
lock(client_lock
);
9446 Fh
*fh
= get_filehandle(fd
);
// Reached when get_filehandle() found no Fh for this fd (the guard
// line itself is not visible in this chunk).
9448 return -CEPHFS_EBADF
;
9449 int err
= _release_fh(fh
);
9452 ldout(cct
, 3) << "close exit(" << fd
<< ")" << dendl
;
9460 loff_t
Client::lseek(int fd
, loff_t offset
, int whence
)
9462 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9463 if (!mref_reader
.is_state_satisfied())
9464 return -CEPHFS_ENOTCONN
;
9466 tout(cct
) << "lseek" << std::endl
;
9467 tout(cct
) << fd
<< std::endl
;
9468 tout(cct
) << offset
<< std::endl
;
9469 tout(cct
) << whence
<< std::endl
;
9471 std::scoped_lock
lock(client_lock
);
9472 Fh
*f
= get_filehandle(fd
);
9474 return -CEPHFS_EBADF
;
9475 #if defined(__linux__) && defined(O_PATH)
9476 if (f
->flags
& O_PATH
)
9477 return -CEPHFS_EBADF
;
9479 return _lseek(f
, offset
, whence
);
9482 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
9484 Inode
*in
= f
->inode
.get();
9485 bool whence_check
= false;
9490 whence_check
= true;
9495 whence_check
= true;
9501 whence_check
= true;
9507 int r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9518 pos
= f
->pos
+ offset
;
9522 pos
= in
->size
+ offset
;
9527 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
9528 return -CEPHFS_ENXIO
;
9535 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
9536 return -CEPHFS_ENXIO
;
9542 ldout(cct
, 1) << __func__
<< ": invalid whence value " << whence
<< dendl
;
9543 return -CEPHFS_EINVAL
;
9547 return -CEPHFS_EINVAL
;
9552 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
9557 void Client::lock_fh_pos(Fh
*f
)
9559 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
9561 if (f
->pos_locked
|| !f
->pos_waiters
.empty()) {
9562 ceph::condition_variable cond
;
9563 f
->pos_waiters
.push_back(&cond
);
9564 ldout(cct
, 10) << __func__
<< " BLOCKING on " << f
<< dendl
;
9565 std::unique_lock l
{client_lock
, std::adopt_lock
};
9566 cond
.wait(l
, [f
, me
=&cond
] {
9567 return !f
->pos_locked
&& f
->pos_waiters
.front() == me
;
9570 ldout(cct
, 10) << __func__
<< " UNBLOCKING on " << f
<< dendl
;
9571 ceph_assert(f
->pos_waiters
.front() == &cond
);
9572 f
->pos_waiters
.pop_front();
9575 f
->pos_locked
= true;
// Release the file-position lock on @f and wake the longest-waiting
// blocked locker, if any (see lock_fh_pos() for the wait side).
// NOTE(review): the notify call and closing braces (original lines
// 9587+) are missing from this chunk.
9578 void Client::unlock_fh_pos(Fh
*f
)
// Caller must already hold client_lock.
9580 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
9582 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
9583 f
->pos_locked
= false;
9584 if (!f
->pos_waiters
.empty()) {
9585 // only wake up the oldest waiter
9586 auto cond
= f
->pos_waiters
.front();
9591 int Client::uninline_data(Inode
*in
, Context
*onfinish
)
9593 if (!in
->inline_data
.length()) {
9594 onfinish
->complete(0);
9599 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (long long unsigned)in
->ino
);
9600 object_t oid
= oid_buf
;
9602 ObjectOperation create_ops
;
9603 create_ops
.create(false);
9605 objecter
->mutate(oid
,
9606 OSDMap::file_to_object_locator(in
->layout
),
9608 in
->snaprealm
->get_snap_context(),
9609 ceph::real_clock::now(),
9613 bufferlist inline_version_bl
;
9614 encode(in
->inline_version
, inline_version_bl
);
9616 ObjectOperation uninline_ops
;
9617 uninline_ops
.cmpxattr("inline_version",
9618 CEPH_OSD_CMPXATTR_OP_GT
,
9619 CEPH_OSD_CMPXATTR_MODE_U64
,
9621 bufferlist inline_data
= in
->inline_data
;
9622 uninline_ops
.write(0, inline_data
, in
->truncate_size
, in
->truncate_seq
);
9623 uninline_ops
.setxattr("inline_version", stringify(in
->inline_version
));
9625 objecter
->mutate(oid
,
9626 OSDMap::file_to_object_locator(in
->layout
),
9628 in
->snaprealm
->get_snap_context(),
9629 ceph::real_clock::now(),
9638 // blocking osd interface
9640 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
9642 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9643 if (!mref_reader
.is_state_satisfied())
9644 return -CEPHFS_ENOTCONN
;
9646 tout(cct
) << "read" << std::endl
;
9647 tout(cct
) << fd
<< std::endl
;
9648 tout(cct
) << size
<< std::endl
;
9649 tout(cct
) << offset
<< std::endl
;
9651 std::unique_lock
lock(client_lock
);
9652 Fh
*f
= get_filehandle(fd
);
9654 return -CEPHFS_EBADF
;
9655 #if defined(__linux__) && defined(O_PATH)
9656 if (f
->flags
& O_PATH
)
9657 return -CEPHFS_EBADF
;
9660 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9661 size
= std::min(size
, (loff_t
)INT_MAX
);
9662 int r
= _read(f
, offset
, size
, &bl
);
9663 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9666 bl
.begin().copy(bl
.length(), buf
);
9672 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
9675 return -CEPHFS_EINVAL
;
9676 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
9679 int64_t Client::_read(Fh
*f
, int64_t offset
, uint64_t size
, bufferlist
*bl
)
9681 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
9684 bool movepos
= false;
9685 std::unique_ptr
<C_SaferCond
> onuninline
;
9687 const auto& conf
= cct
->_conf
;
9688 Inode
*in
= f
->inode
.get();
9690 utime_t start
= ceph_clock_now();
9692 if ((f
->mode
& CEPH_FILE_MODE_RD
) == 0)
9693 return -CEPHFS_EBADF
;
9694 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9701 loff_t start_pos
= offset
;
9703 if (in
->inline_version
== 0) {
9704 auto r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9709 ceph_assert(in
->inline_version
> 0);
9713 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9714 want
= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
;
9716 want
= CEPH_CAP_FILE_CACHE
;
9718 auto r
= get_caps(f
, CEPH_CAP_FILE_RD
, want
, &have
, -1);
9724 if (f
->flags
& O_DIRECT
)
9725 have
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
9727 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9728 if (!(have
& CEPH_CAP_FILE_CACHE
)) {
9729 onuninline
.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9730 uninline_data(in
, onuninline
.get());
9732 uint32_t len
= in
->inline_data
.length();
9733 uint64_t endoff
= offset
+ size
;
9734 if (endoff
> in
->size
)
9738 if (endoff
<= len
) {
9739 bl
->substr_of(in
->inline_data
, offset
, endoff
- offset
);
9741 bl
->substr_of(in
->inline_data
, offset
, len
- offset
);
9742 bl
->append_zero(endoff
- len
);
9744 rc
= endoff
- offset
;
9745 } else if ((uint64_t)offset
< endoff
) {
9746 bl
->append_zero(endoff
- offset
);
9747 rc
= endoff
- offset
;
9755 if (!conf
->client_debug_force_sync_read
&&
9757 (have
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
9759 if (f
->flags
& O_RSYNC
) {
9760 _flush_range(in
, offset
, size
);
9762 rc
= _read_async(f
, offset
, size
, bl
);
9766 if (f
->flags
& O_DIRECT
)
9767 _flush_range(in
, offset
, size
);
9769 bool checkeof
= false;
9770 rc
= _read_sync(f
, offset
, size
, bl
, &checkeof
);
9777 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9781 auto r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9789 if ((uint64_t)offset
< in
->size
)
9795 ceph_assert(rc
>= 0);
9798 f
->pos
= start_pos
+ rc
;
9801 lat
= ceph_clock_now();
9803 logger
->tinc(l_c_read
, lat
);
9809 client_lock
.unlock();
9810 int ret
= onuninline
->wait();
9812 if (ret
>= 0 || ret
== -CEPHFS_ECANCELED
) {
9813 in
->inline_data
.clear();
9814 in
->inline_version
= CEPH_INLINE_NONE
;
9815 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9821 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
// Completion context for a background readahead issued by _read_async();
// it pins the pending-readahead count on the Fh for the lifetime of the
// request.
// NOTE(review): the member-initializer list (original lines 9830-9831)
// is missing from this chunk.
9829 Client::C_Readahead::C_Readahead(Client
*c
, Fh
*f
) :
// Count this readahead as in flight until the context is destroyed.
9832 f
->readahead
.inc_pending();
// The readahead is no longer pending once its completion context dies.
9835 Client::C_Readahead::~C_Readahead() {
9836 f
->readahead
.dec_pending();
// Called when the readahead I/O completes: log it and drop the cap
// references taken when the readahead was started (see _read_async()).
9840 void Client::C_Readahead::finish(int r
) {
9841 lgeneric_subdout(client
->cct
, client
, 20) << "client." << client
->get_nodeid() << " " << "C_Readahead on " << f
->inode
<< dendl
;
// Release the Fr/Fc cap refs acquired before the readahead was issued.
9842 client
->put_cap_ref(f
->inode
.get(), CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9845 int Client::_read_async(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
)
9847 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
9849 const auto& conf
= cct
->_conf
;
9850 Inode
*in
= f
->inode
.get();
9852 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9854 // trim read based on file size?
9855 if (off
>= in
->size
)
9859 if (off
+ len
> in
->size
) {
9860 len
= in
->size
- off
;
9863 ldout(cct
, 10) << " min_bytes=" << f
->readahead
.get_min_readahead_size()
9864 << " max_bytes=" << f
->readahead
.get_max_readahead_size()
9865 << " max_periods=" << conf
->client_readahead_max_periods
<< dendl
;
9867 // read (and possibly block)
9869 C_SaferCond
onfinish("Client::_read_async flock");
9870 r
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9871 off
, len
, bl
, 0, &onfinish
);
9873 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9874 client_lock
.unlock();
9875 r
= onfinish
.wait();
9877 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9880 if(f
->readahead
.get_min_readahead_size() > 0) {
9881 pair
<uint64_t, uint64_t> readahead_extent
= f
->readahead
.update(off
, len
, in
->size
);
9882 if (readahead_extent
.second
> 0) {
9883 ldout(cct
, 20) << "readahead " << readahead_extent
.first
<< "~" << readahead_extent
.second
9884 << " (caller wants " << off
<< "~" << len
<< ")" << dendl
;
9885 Context
*onfinish2
= new C_Readahead(this, f
);
9886 int r2
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9887 readahead_extent
.first
, readahead_extent
.second
,
9888 NULL
, 0, onfinish2
);
9890 ldout(cct
, 20) << "readahead initiated, c " << onfinish2
<< dendl
;
9891 get_cap_ref(in
, CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9893 ldout(cct
, 20) << "readahead was no-op, already cached" << dendl
;
9902 int Client::_read_sync(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
,
9905 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
9907 Inode
*in
= f
->inode
.get();
9912 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9914 // 0 success, 1 continue and < 0 error happen.
9915 auto wait_and_copy
= [&](C_SaferCond
&onfinish
, bufferlist
&tbl
, int wanted
) {
9916 int r
= onfinish
.wait();
9918 // if we get ENOENT from OSD, assume 0 bytes returned
9919 if (r
== -CEPHFS_ENOENT
)
9930 bl
->claim_append(tbl
);
9933 if (r
>= 0 && r
< wanted
) {
9934 if (pos
< in
->size
) {
9935 // zero up to known EOF
9936 int64_t some
= in
->size
- pos
;
9939 auto z
= buffer::ptr_node::create(some
);
9941 bl
->push_back(std::move(z
));
9956 C_SaferCond
onfinish("Client::_read_sync flock");
9960 filer
->read_trunc(in
->ino
, &in
->layout
, in
->snapid
,
9962 in
->truncate_size
, in
->truncate_seq
,
9964 client_lock
.unlock();
9965 int r
= wait_and_copy(onfinish
, tbl
, wanted
);
// Blocking write of @size bytes from @buf at file offset @offset on @fd.
// NOTE(review): the EBADF guard's condition, the #endif, and the final
// return (original lines 9976, 9980, 9985, 9988, 9993, 9998+) are
// missing from this chunk; commentary covers only the visible lines.
9975 int Client::write(int fd
, const char *buf
, loff_t size
, loff_t offset
)
// Refuse the call unless the client is at least in the MOUNTING state.
9977 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9978 if (!mref_reader
.is_state_satisfied())
9979 return -CEPHFS_ENOTCONN
;
// Record the operation and its arguments in the client trace log.
9981 tout(cct
) << "write" << std::endl
;
9982 tout(cct
) << fd
<< std::endl
;
9983 tout(cct
) << size
<< std::endl
;
9984 tout(cct
) << offset
<< std::endl
;
9986 std::scoped_lock
lock(client_lock
);
9987 Fh
*fh
= get_filehandle(fd
);
// Reached when get_filehandle() found no Fh for this fd (the guard
// line itself is not visible in this chunk).
9989 return -CEPHFS_EBADF
;
// O_PATH descriptors carry no I/O rights on Linux.
9990 #if defined(__linux__) && defined(O_PATH)
9991 if (fh
->flags
& O_PATH
)
9992 return -CEPHFS_EBADF
;
9994 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9995 size
= std::min(size
, (loff_t
)INT_MAX
);
// Delegate to the buffer-based _write (no iovec, not a pwritev call).
9996 int r
= _write(fh
, offset
, size
, buf
, NULL
, false);
9997 ldout(cct
, 3) << "write(" << fd
<< ", \"...\", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
10001 int Client::pwritev(int fd
, const struct iovec
*iov
, int iovcnt
, int64_t offset
)
10004 return -CEPHFS_EINVAL
;
10005 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, true);
10008 int64_t Client::_preadv_pwritev_locked(Fh
*fh
, const struct iovec
*iov
,
10009 unsigned iovcnt
, int64_t offset
, bool write
,
10010 bool clamp_to_int
, std::unique_lock
<ceph::mutex
> &cl
)
10012 #if defined(__linux__) && defined(O_PATH)
10013 if (fh
->flags
& O_PATH
)
10014 return -CEPHFS_EBADF
;
10016 loff_t totallen
= 0;
10017 for (unsigned i
= 0; i
< iovcnt
; i
++) {
10018 totallen
+= iov
[i
].iov_len
;
10022 * Some of the API functions take 64-bit size values, but only return
10023 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10024 * we don't do I/Os larger than the values we can return.
10026 if (clamp_to_int
) {
10027 totallen
= std::min(totallen
, (loff_t
)INT_MAX
);
10030 int64_t w
= _write(fh
, offset
, totallen
, NULL
, iov
, iovcnt
);
10031 ldout(cct
, 3) << "pwritev(" << fh
<< ", \"...\", " << totallen
<< ", " << offset
<< ") = " << w
<< dendl
;
10035 int64_t r
= _read(fh
, offset
, totallen
, &bl
);
10036 ldout(cct
, 3) << "preadv(" << fh
<< ", " << offset
<< ") = " << r
<< dendl
;
10041 auto iter
= bl
.cbegin();
10042 for (unsigned j
= 0, resid
= r
; j
< iovcnt
&& resid
> 0; j
++) {
10044 * This piece of code aims to handle the case that bufferlist
10045 * does not have enough data to fill in the iov
10047 const auto round_size
= std::min
<unsigned>(resid
, iov
[j
].iov_len
);
10048 iter
.copy(round_size
, reinterpret_cast<char*>(iov
[j
].iov_base
));
10049 resid
-= round_size
;
10050 /* iter is self-updating */
10057 int Client::_preadv_pwritev(int fd
, const struct iovec
*iov
, unsigned iovcnt
, int64_t offset
, bool write
)
10059 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10060 if (!mref_reader
.is_state_satisfied())
10061 return -CEPHFS_ENOTCONN
;
10063 tout(cct
) << fd
<< std::endl
;
10064 tout(cct
) << offset
<< std::endl
;
10066 std::unique_lock
cl(client_lock
);
10067 Fh
*fh
= get_filehandle(fd
);
10069 return -CEPHFS_EBADF
;
10070 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, offset
, write
, true, cl
);
10073 int64_t Client::_write(Fh
*f
, int64_t offset
, uint64_t size
, const char *buf
,
10074 const struct iovec
*iov
, int iovcnt
)
10076 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10080 if ((uint64_t)(offset
+size
) > mdsmap
->get_max_filesize()) //too large!
10081 return -CEPHFS_EFBIG
;
10083 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10084 Inode
*in
= f
->inode
.get();
10086 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
10087 return -CEPHFS_ENOSPC
;
10090 ceph_assert(in
->snapid
== CEPH_NOSNAP
);
10092 // was Fh opened as writeable?
10093 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
10094 return -CEPHFS_EBADF
;
10096 // use/adjust fd pos?
10100 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10101 * change out from under us.
10103 if (f
->flags
& O_APPEND
) {
10104 auto r
= _lseek(f
, 0, SEEK_END
);
10111 fpos
= offset
+size
;
10116 uint64_t endoff
= offset
+ size
;
10117 if (endoff
> in
->size
&& is_quota_bytes_exceeded(in
, endoff
- in
->size
,
10119 return -CEPHFS_EDQUOT
;
10122 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10124 ldout(cct
, 10) << "cur file size is " << in
->size
<< dendl
;
10127 utime_t start
= ceph_clock_now();
10129 if (in
->inline_version
== 0) {
10130 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
10133 ceph_assert(in
->inline_version
> 0);
10136 // copy into fresh buffer (since our write may be resub, async)
10140 bl
.append(buf
, size
);
10142 for (int i
= 0; i
< iovcnt
; i
++) {
10143 if (iov
[i
].iov_len
> 0) {
10144 bl
.append((const char *)iov
[i
].iov_base
, iov
[i
].iov_len
);
10150 uint64_t totalwritten
;
10152 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
10153 want
= CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
;
10155 want
= CEPH_CAP_FILE_BUFFER
;
10156 int r
= get_caps(f
, CEPH_CAP_FILE_WR
|CEPH_CAP_AUTH_SHARED
, want
, &have
, endoff
);
10160 /* clear the setuid/setgid bits, if any */
10161 if (unlikely(in
->mode
& (S_ISUID
|S_ISGID
)) && size
> 0) {
10162 struct ceph_statx stx
= { 0 };
10164 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
10165 r
= __setattrx(in
, &stx
, CEPH_SETATTR_KILL_SGUID
, f
->actor_perms
);
10169 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
10172 if (f
->flags
& O_DIRECT
)
10173 have
&= ~(CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
);
10175 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
10177 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
10179 if (in
->inline_version
< CEPH_INLINE_NONE
) {
10180 if (endoff
> cct
->_conf
->client_max_inline_size
||
10181 endoff
> CEPH_INLINE_MAX_SIZE
||
10182 !(have
& CEPH_CAP_FILE_BUFFER
)) {
10183 onuninline
.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10184 uninline_data(in
, onuninline
.get());
10186 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10188 uint32_t len
= in
->inline_data
.length();
10191 in
->inline_data
.begin(endoff
).copy(len
- endoff
, bl
); // XXX
10194 in
->inline_data
.splice(offset
, len
- offset
);
10195 else if (offset
> len
)
10196 in
->inline_data
.append_zero(offset
- len
);
10198 in
->inline_data
.append(bl
);
10199 in
->inline_version
++;
10201 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10207 if (cct
->_conf
->client_oc
&&
10208 (have
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
))) {
10209 // do buffered write
10210 if (!in
->oset
.dirty_or_tx
)
10211 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
10213 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10215 // async, caching, non-blocking.
10216 r
= objectcacher
->file_write(&in
->oset
, &in
->layout
,
10217 in
->snaprealm
->get_snap_context(),
10218 offset
, size
, bl
, ceph::real_clock::now(),
10220 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10225 // flush cached write if O_SYNC is set on file fh
10226 // O_DSYNC == O_SYNC on linux < 2.6.33
10227 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10228 if ((f
->flags
& O_SYNC
) || (f
->flags
& O_DSYNC
)) {
10229 _flush_range(in
, offset
, size
);
10232 if (f
->flags
& O_DIRECT
)
10233 _flush_range(in
, offset
, size
);
10235 // simple, non-atomic sync write
10236 C_SaferCond
onfinish("Client::_write flock");
10237 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10239 filer
->write_trunc(in
->ino
, &in
->layout
, in
->snaprealm
->get_snap_context(),
10240 offset
, size
, bl
, ceph::real_clock::now(), 0,
10241 in
->truncate_size
, in
->truncate_seq
,
10243 client_lock
.unlock();
10244 r
= onfinish
.wait();
10245 client_lock
.lock();
10246 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10251 // if we get here, write was successful, update client metadata
10254 lat
= ceph_clock_now();
10256 logger
->tinc(l_c_wrlat
, lat
);
10263 totalwritten
= size
;
10264 r
= (int64_t)totalwritten
;
10267 if (totalwritten
+ offset
> in
->size
) {
10268 in
->size
= totalwritten
+ offset
;
10269 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
10271 if (is_quota_bytes_approaching(in
, f
->actor_perms
)) {
10272 check_caps(in
, CHECK_CAPS_NODELAY
);
10273 } else if (is_max_size_approaching(in
)) {
10277 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", extending file size" << dendl
;
10279 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", leaving file size at " << in
->size
<< dendl
;
10283 in
->mtime
= in
->ctime
= ceph_clock_now();
10285 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
10289 if (nullptr != onuninline
) {
10290 client_lock
.unlock();
10291 int uninline_ret
= onuninline
->wait();
10292 client_lock
.lock();
10294 if (uninline_ret
>= 0 || uninline_ret
== -CEPHFS_ECANCELED
) {
10295 in
->inline_data
.clear();
10296 in
->inline_version
= CEPH_INLINE_NONE
;
10297 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
10303 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
10307 int Client::_flush(Fh
*f
)
10309 Inode
*in
= f
->inode
.get();
10310 int err
= f
->take_async_err();
10312 ldout(cct
, 1) << __func__
<< ": " << f
<< " on inode " << *in
<< " caught async_err = "
10313 << cpp_strerror(err
) << dendl
;
10315 ldout(cct
, 10) << __func__
<< ": " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
10321 int Client::truncate(const char *relpath
, loff_t length
, const UserPerm
& perms
)
10323 struct ceph_statx stx
;
10324 stx
.stx_size
= length
;
10325 return setattrx(relpath
, &stx
, CEPH_SETATTR_SIZE
, perms
);
10328 int Client::ftruncate(int fd
, loff_t length
, const UserPerm
& perms
)
10330 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10331 if (!mref_reader
.is_state_satisfied())
10332 return -CEPHFS_ENOTCONN
;
10334 tout(cct
) << __func__
<< std::endl
;
10335 tout(cct
) << fd
<< std::endl
;
10336 tout(cct
) << length
<< std::endl
;
10338 std::scoped_lock
lock(client_lock
);
10339 Fh
*f
= get_filehandle(fd
);
10341 return -CEPHFS_EBADF
;
10342 #if defined(__linux__) && defined(O_PATH)
10343 if (f
->flags
& O_PATH
)
10344 return -CEPHFS_EBADF
;
10346 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
10347 return -CEPHFS_EBADF
;
10349 attr
.st_size
= length
;
10350 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_SIZE
, perms
);
10353 int Client::fsync(int fd
, bool syncdataonly
)
10355 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10356 if (!mref_reader
.is_state_satisfied())
10357 return -CEPHFS_ENOTCONN
;
10359 tout(cct
) << "fsync" << std::endl
;
10360 tout(cct
) << fd
<< std::endl
;
10361 tout(cct
) << syncdataonly
<< std::endl
;
10363 std::scoped_lock
lock(client_lock
);
10364 Fh
*f
= get_filehandle(fd
);
10366 return -CEPHFS_EBADF
;
10367 #if defined(__linux__) && defined(O_PATH)
10368 if (f
->flags
& O_PATH
)
10369 return -CEPHFS_EBADF
;
10371 int r
= _fsync(f
, syncdataonly
);
10373 // The IOs in this fsync were okay, but maybe something happened
10374 // in the background that we shoudl be reporting?
10375 r
= f
->take_async_err();
10376 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
10377 << ") = 0, async_err = " << r
<< dendl
;
10379 // Assume that an error we encountered during fsync, even reported
10380 // synchronously, would also have applied the error to the Fh, and we
10381 // should clear it here to avoid returning the same error again on next
10383 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
<< ") = "
10385 f
->take_async_err();
10390 int Client::_fsync(Inode
*in
, bool syncdataonly
)
10392 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10395 std::unique_ptr
<C_SaferCond
> object_cacher_completion
= nullptr;
10396 ceph_tid_t flush_tid
= 0;
10399 utime_t start
= ceph_clock_now();
10401 ldout(cct
, 8) << "_fsync on " << *in
<< " " << (syncdataonly
? "(dataonly)":"(data+metadata)") << dendl
;
10403 if (cct
->_conf
->client_oc
) {
10404 object_cacher_completion
.reset(new C_SaferCond("Client::_fsync::lock"));
10405 tmp_ref
= in
; // take a reference; C_SaferCond doesn't and _flush won't either
10406 _flush(in
, object_cacher_completion
.get());
10407 ldout(cct
, 15) << "using return-valued form of _fsync" << dendl
;
10410 if (!syncdataonly
&& in
->dirty_caps
) {
10411 check_caps(in
, CHECK_CAPS_NODELAY
|CHECK_CAPS_SYNCHRONOUS
);
10412 if (in
->flushing_caps
)
10413 flush_tid
= last_flush_tid
;
10414 } else ldout(cct
, 10) << "no metadata needs to commit" << dendl
;
10416 if (!syncdataonly
&& !in
->unsafe_ops
.empty()) {
10417 flush_mdlog_sync();
10419 MetaRequest
*req
= in
->unsafe_ops
.back();
10420 ldout(cct
, 15) << "waiting on unsafe requests, last tid " << req
->get_tid() << dendl
;
10423 wait_on_list(req
->waitfor_safe
);
10427 if (nullptr != object_cacher_completion
) { // wait on a real reply instead of guessing
10428 client_lock
.unlock();
10429 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
10430 r
= object_cacher_completion
->wait();
10431 client_lock
.lock();
10432 ldout(cct
, 15) << "got " << r
<< " from flush writeback" << dendl
;
10434 // FIXME: this can starve
10435 while (in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] > 0) {
10436 ldout(cct
, 10) << "ino " << in
->ino
<< " has " << in
->cap_refs
[CEPH_CAP_FILE_BUFFER
]
10437 << " uncommitted, waiting" << dendl
;
10438 wait_on_list(in
->waitfor_commit
);
10444 wait_sync_caps(in
, flush_tid
);
10446 ldout(cct
, 10) << "ino " << in
->ino
<< " has no uncommitted writes" << dendl
;
10448 ldout(cct
, 8) << "ino " << in
->ino
<< " failed to commit to disk! "
10449 << cpp_strerror(-r
) << dendl
;
10452 lat
= ceph_clock_now();
10454 logger
->tinc(l_c_fsync
, lat
);
10459 int Client::_fsync(Fh
*f
, bool syncdataonly
)
10461 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
10462 return _fsync(f
->inode
.get(), syncdataonly
);
// stat() on an open file descriptor: refresh the inode's attributes via
// _getattr() and fill @stbuf from the cached inode.
// NOTE(review): the EBADF guard's condition, the error check after
// _getattr(), and the final return (original lines 10466, 10470, 10473,
// 10476, 10479-10480, 10483+) are missing from this chunk.
10465 int Client::fstat(int fd
, struct stat
*stbuf
, const UserPerm
& perms
, int mask
)
// Refuse the call unless the client is at least in the MOUNTING state.
10467 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10468 if (!mref_reader
.is_state_satisfied())
10469 return -CEPHFS_ENOTCONN
;
// Record the operation (mask in hex) and the fd in the trace log.
10471 tout(cct
) << "fstat mask " << hex
<< mask
<< dec
<< std::endl
;
10472 tout(cct
) << fd
<< std::endl
;
10474 std::scoped_lock
lock(client_lock
);
10475 Fh
*f
= get_filehandle(fd
);
// Reached when get_filehandle() found no Fh for this fd (the guard
// line itself is not visible in this chunk).
10477 return -CEPHFS_EBADF
;
// Pull fresh attributes for the requested mask from the MDS.
10478 int r
= _getattr(f
->inode
, mask
, perms
);
// Translate the cached inode into the caller's struct stat.
10481 fill_stat(f
->inode
, stbuf
, NULL
);
10482 ldout(cct
, 5) << "fstat(" << fd
<< ", " << stbuf
<< ") = " << r
<< dendl
;
10486 int Client::fstatx(int fd
, struct ceph_statx
*stx
, const UserPerm
& perms
,
10487 unsigned int want
, unsigned int flags
)
10489 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10490 if (!mref_reader
.is_state_satisfied())
10491 return -CEPHFS_ENOTCONN
;
10493 tout(cct
) << "fstatx flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
10494 tout(cct
) << fd
<< std::endl
;
10496 std::scoped_lock
lock(client_lock
);
10497 Fh
*f
= get_filehandle(fd
);
10499 return -CEPHFS_EBADF
;
10501 unsigned mask
= statx_to_mask(flags
, want
);
10504 if (mask
&& !f
->inode
->caps_issued_mask(mask
, true)) {
10505 r
= _getattr(f
->inode
, mask
, perms
);
10507 ldout(cct
, 3) << "fstatx exit on error!" << dendl
;
10512 fill_statx(f
->inode
, mask
, stx
);
10513 ldout(cct
, 3) << "fstatx(" << fd
<< ", " << stx
<< ") = " << r
<< dendl
;
10517 // not written yet, but i want to link!
10519 int Client::chdir(const char *relpath
, std::string
&new_cwd
,
10520 const UserPerm
& perms
)
10522 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10523 if (!mref_reader
.is_state_satisfied())
10524 return -CEPHFS_ENOTCONN
;
10526 tout(cct
) << "chdir" << std::endl
;
10527 tout(cct
) << relpath
<< std::endl
;
10529 filepath
path(relpath
);
10532 std::scoped_lock
lock(client_lock
);
10533 int r
= path_walk(path
, &in
, perms
);
10537 if (!(in
.get()->is_dir()))
10538 return -CEPHFS_ENOTDIR
;
10542 ldout(cct
, 3) << "chdir(" << relpath
<< ") cwd now " << cwd
->ino
<< dendl
;
10544 _getcwd(new_cwd
, perms
);
10548 void Client::_getcwd(string
& dir
, const UserPerm
& perms
)
10551 ldout(cct
, 10) << __func__
<< " " << *cwd
<< dendl
;
10553 Inode
*in
= cwd
.get();
10554 while (in
!= root
) {
10555 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
10557 // A cwd or ancester is unlinked
10558 if (in
->dentries
.empty()) {
10562 Dentry
*dn
= in
->get_first_parent();
10567 ldout(cct
, 10) << __func__
<< " looking up parent for " << *in
<< dendl
;
10568 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
10569 filepath
path(in
->ino
);
10570 req
->set_filepath(path
);
10571 req
->set_inode(in
);
10572 int res
= make_request(req
, perms
);
10581 path
.push_front_dentry(dn
->name
);
10582 in
= dn
->dir
->parent_inode
;
10585 dir
+= path
.get_path();
// Return the client's current working directory path in @dir.
// NOTE(review): original lines 10589, 10592-10593, 10595 and 10597+ are
// missing from this chunk, including the body of the early-return state
// check and the closing brace.
10588 void Client::getcwd(string
& dir
, const UserPerm
& perms
)
// Only meaningful once the client has begun mounting.
10590 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10591 if (!mref_reader
.is_state_satisfied())
// Delegate to _getcwd() with client_lock held.
10594 std::scoped_lock
l(client_lock
);
10596 _getcwd(dir
, perms
);
10599 int Client::statfs(const char *path
, struct statvfs
*stbuf
,
10600 const UserPerm
& perms
)
10602 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10603 if (!mref_reader
.is_state_satisfied())
10604 return -CEPHFS_ENOTCONN
;
10606 tout(cct
) << __func__
<< std::endl
;
10607 unsigned long int total_files_on_fs
;
10612 std::unique_lock
lock(client_lock
);
10613 const vector
<int64_t> &data_pools
= mdsmap
->get_data_pools();
10614 if (data_pools
.size() == 1) {
10615 objecter
->get_fs_stats(stats
, data_pools
[0], &cond
);
10617 objecter
->get_fs_stats(stats
, boost::optional
<int64_t>(), &cond
);
10621 int rval
= cond
.wait();
10625 total_files_on_fs
= root
->rstat
.rfiles
+ root
->rstat
.rsubdirs
;
10628 ldout(cct
, 1) << "underlying call to statfs returned error: "
10629 << cpp_strerror(rval
)
10634 memset(stbuf
, 0, sizeof(*stbuf
));
10637 * we're going to set a block size of 4MB so we can represent larger
10638 * FSes without overflowing. Additionally convert the space
10639 * measurements from KB to bytes while making them in terms of
10640 * blocks. We use 4MB only because it is big enough, and because it
10641 * actually *is* the (ceph) default block size.
10643 const int CEPH_BLOCK_SHIFT
= 22;
10644 stbuf
->f_frsize
= 1 << CEPH_BLOCK_SHIFT
;
10645 stbuf
->f_bsize
= 1 << CEPH_BLOCK_SHIFT
;
10646 stbuf
->f_files
= total_files_on_fs
;
10647 stbuf
->f_ffree
= -1;
10648 stbuf
->f_favail
= -1;
10649 stbuf
->f_fsid
= -1; // ??
10650 stbuf
->f_flag
= 0; // ??
10651 stbuf
->f_namemax
= NAME_MAX
;
10653 // Usually quota_root will == root_ancestor, but if the mount root has no
10654 // quota but we can see a parent of it that does have a quota, we'll
10655 // respect that one instead.
10656 ceph_assert(root
!= nullptr);
10657 Inode
*quota_root
= root
->quota
.is_enable() ? root
: get_quota_root(root
, perms
);
10659 // get_quota_root should always give us something
10660 // because client quotas are always enabled
10661 ceph_assert(quota_root
!= nullptr);
10663 if (quota_root
&& cct
->_conf
->client_quota_df
&& quota_root
->quota
.max_bytes
) {
10665 // Skip the getattr if any sessions are stale, as we don't want to
10666 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10668 if (!_any_stale_sessions()) {
10669 int r
= _getattr(quota_root
, 0, perms
, true);
10671 // Ignore return value: error getting latest inode metadata is not a good
10672 // reason to break "df".
10673 lderr(cct
) << "Error in getattr on quota root 0x"
10674 << std::hex
<< quota_root
->ino
<< std::dec
10675 << " statfs result may be outdated" << dendl
;
10679 // Special case: if there is a size quota set on the Inode acting
10680 // as the root for this client mount, then report the quota status
10681 // as the filesystem statistics.
10682 const fsblkcnt_t total
= quota_root
->quota
.max_bytes
>> CEPH_BLOCK_SHIFT
;
10683 const fsblkcnt_t used
= quota_root
->rstat
.rbytes
>> CEPH_BLOCK_SHIFT
;
10684 // It is possible for a quota to be exceeded: arithmetic here must
10685 // handle case where used > total.
10686 const fsblkcnt_t free
= total
> used
? total
- used
: 0;
10688 stbuf
->f_blocks
= total
;
10689 stbuf
->f_bfree
= free
;
10690 stbuf
->f_bavail
= free
;
10692 // General case: report the cluster statistics returned from RADOS. Because
10693 // multiple pools may be used without one filesystem namespace via
10694 // layouts, this is the most correct thing we can do.
10695 stbuf
->f_blocks
= stats
.kb
>> (CEPH_BLOCK_SHIFT
- 10);
10696 stbuf
->f_bfree
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10697 stbuf
->f_bavail
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10703 int Client::_do_filelock(Inode
*in
, Fh
*fh
, int lock_type
, int op
, int sleep
,
10704 struct flock
*fl
, uint64_t owner
, bool removing
)
10706 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
10707 << (lock_type
== CEPH_LOCK_FCNTL
? " fcntl" : " flock")
10708 << " type " << fl
->l_type
<< " owner " << owner
10709 << " " << fl
->l_start
<< "~" << fl
->l_len
<< dendl
;
10711 if (in
->flags
& I_ERROR_FILELOCK
)
10712 return -CEPHFS_EIO
;
10715 if (F_RDLCK
== fl
->l_type
)
10716 lock_cmd
= CEPH_LOCK_SHARED
;
10717 else if (F_WRLCK
== fl
->l_type
)
10718 lock_cmd
= CEPH_LOCK_EXCL
;
10719 else if (F_UNLCK
== fl
->l_type
)
10720 lock_cmd
= CEPH_LOCK_UNLOCK
;
10722 return -CEPHFS_EIO
;
10724 if (op
!= CEPH_MDS_OP_SETFILELOCK
|| lock_cmd
== CEPH_LOCK_UNLOCK
)
10728 * Set the most significant bit, so that MDS knows the 'owner'
10729 * is sufficient to identify the owner of lock. (old code uses
10730 * both 'owner' and 'pid')
10732 owner
|= (1ULL << 63);
10734 MetaRequest
*req
= new MetaRequest(op
);
10736 in
->make_nosnap_relative_path(path
);
10737 req
->set_filepath(path
);
10738 req
->set_inode(in
);
10740 req
->head
.args
.filelock_change
.rule
= lock_type
;
10741 req
->head
.args
.filelock_change
.type
= lock_cmd
;
10742 req
->head
.args
.filelock_change
.owner
= owner
;
10743 req
->head
.args
.filelock_change
.pid
= fl
->l_pid
;
10744 req
->head
.args
.filelock_change
.start
= fl
->l_start
;
10745 req
->head
.args
.filelock_change
.length
= fl
->l_len
;
10746 req
->head
.args
.filelock_change
.wait
= sleep
;
10751 if (sleep
&& switch_interrupt_cb
) {
10752 // enable interrupt
10753 switch_interrupt_cb(callback_handle
, req
->get());
10754 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10755 // disable interrupt
10756 switch_interrupt_cb(callback_handle
, NULL
);
10757 if (ret
== 0 && req
->aborted()) {
10758 // effect of this lock request has been revoked by the 'lock intr' request
10759 ret
= req
->get_abort_code();
10763 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10767 if (op
== CEPH_MDS_OP_GETFILELOCK
) {
10768 ceph_filelock filelock
;
10769 auto p
= bl
.cbegin();
10770 decode(filelock
, p
);
10772 if (CEPH_LOCK_SHARED
== filelock
.type
)
10773 fl
->l_type
= F_RDLCK
;
10774 else if (CEPH_LOCK_EXCL
== filelock
.type
)
10775 fl
->l_type
= F_WRLCK
;
10777 fl
->l_type
= F_UNLCK
;
10779 fl
->l_whence
= SEEK_SET
;
10780 fl
->l_start
= filelock
.start
;
10781 fl
->l_len
= filelock
.length
;
10782 fl
->l_pid
= filelock
.pid
;
10783 } else if (op
== CEPH_MDS_OP_SETFILELOCK
) {
10784 ceph_lock_state_t
*lock_state
;
10785 if (lock_type
== CEPH_LOCK_FCNTL
) {
10786 if (!in
->fcntl_locks
)
10787 in
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10788 lock_state
= in
->fcntl_locks
.get();
10789 } else if (lock_type
== CEPH_LOCK_FLOCK
) {
10790 if (!in
->flock_locks
)
10791 in
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10792 lock_state
= in
->flock_locks
.get();
10795 return -CEPHFS_EINVAL
;
10797 _update_lock_state(fl
, owner
, lock_state
);
10800 if (lock_type
== CEPH_LOCK_FCNTL
) {
10801 if (!fh
->fcntl_locks
)
10802 fh
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10803 lock_state
= fh
->fcntl_locks
.get();
10805 if (!fh
->flock_locks
)
10806 fh
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10807 lock_state
= fh
->flock_locks
.get();
10809 _update_lock_state(fl
, owner
, lock_state
);
10817 int Client::_interrupt_filelock(MetaRequest
*req
)
10819 // Set abort code, but do not kick. The abort code prevents the request
10820 // from being re-sent.
10821 req
->abort(-CEPHFS_EINTR
);
10823 return 0; // haven't sent the request
10825 Inode
*in
= req
->inode();
10828 if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
10829 lock_type
= CEPH_LOCK_FLOCK_INTR
;
10830 else if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
10831 lock_type
= CEPH_LOCK_FCNTL_INTR
;
10834 return -CEPHFS_EINVAL
;
10837 MetaRequest
*intr_req
= new MetaRequest(CEPH_MDS_OP_SETFILELOCK
);
10839 in
->make_nosnap_relative_path(path
);
10840 intr_req
->set_filepath(path
);
10841 intr_req
->set_inode(in
);
10842 intr_req
->head
.args
.filelock_change
= req
->head
.args
.filelock_change
;
10843 intr_req
->head
.args
.filelock_change
.rule
= lock_type
;
10844 intr_req
->head
.args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
10846 UserPerm
perms(req
->get_uid(), req
->get_gid());
10847 return make_request(intr_req
, perms
, NULL
, NULL
, -1);
10850 void Client::_encode_filelocks(Inode
*in
, bufferlist
& bl
)
10852 if (!in
->fcntl_locks
&& !in
->flock_locks
)
10855 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
10856 encode(nr_fcntl_locks
, bl
);
10857 if (nr_fcntl_locks
) {
10858 auto &lock_state
= in
->fcntl_locks
;
10859 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10860 p
!= lock_state
->held_locks
.end();
10862 encode(p
->second
, bl
);
10865 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
10866 encode(nr_flock_locks
, bl
);
10867 if (nr_flock_locks
) {
10868 auto &lock_state
= in
->flock_locks
;
10869 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10870 p
!= lock_state
->held_locks
.end();
10872 encode(p
->second
, bl
);
10875 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< ", " << nr_fcntl_locks
10876 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
10879 void Client::_release_filelocks(Fh
*fh
)
10881 if (!fh
->fcntl_locks
&& !fh
->flock_locks
)
10884 Inode
*in
= fh
->inode
.get();
10885 ldout(cct
, 10) << __func__
<< " " << fh
<< " ino " << in
->ino
<< dendl
;
10887 list
<ceph_filelock
> activated_locks
;
10889 list
<pair
<int, ceph_filelock
> > to_release
;
10891 if (fh
->fcntl_locks
) {
10892 auto &lock_state
= fh
->fcntl_locks
;
10893 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
10895 if (in
->flags
& I_ERROR_FILELOCK
) {
10896 lock_state
->remove_lock(q
->second
, activated_locks
);
10898 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FCNTL
, q
->second
));
10901 lock_state
.reset();
10903 if (fh
->flock_locks
) {
10904 auto &lock_state
= fh
->flock_locks
;
10905 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
10907 if (in
->flags
& I_ERROR_FILELOCK
) {
10908 lock_state
->remove_lock(q
->second
, activated_locks
);
10910 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FLOCK
, q
->second
));
10913 lock_state
.reset();
10916 if ((in
->flags
& I_ERROR_FILELOCK
) && !in
->has_any_filelocks())
10917 in
->flags
&= ~I_ERROR_FILELOCK
;
10919 if (to_release
.empty())
10923 memset(&fl
, 0, sizeof(fl
));
10924 fl
.l_whence
= SEEK_SET
;
10925 fl
.l_type
= F_UNLCK
;
10927 for (list
<pair
<int, ceph_filelock
> >::iterator p
= to_release
.begin();
10928 p
!= to_release
.end();
10930 fl
.l_start
= p
->second
.start
;
10931 fl
.l_len
= p
->second
.length
;
10932 fl
.l_pid
= p
->second
.pid
;
10933 _do_filelock(in
, fh
, p
->first
, CEPH_MDS_OP_SETFILELOCK
, 0, &fl
,
10934 p
->second
.owner
, true);
// Mirror the result of a successful SETFILELOCK request into the local
// ceph_lock_state_t (per-inode or per-Fh lock bookkeeping).
// NOTE(review): this view of the file is missing interior lines (branch
// keywords/braces were dropped by extraction); comments below only state
// what the visible statements show.
10938 void Client::_update_lock_state(struct flock
*fl
, uint64_t owner
,
10939 ceph_lock_state_t
*lock_state
)
// Translate the POSIX lock type into the Ceph lock command.
10942 if (F_RDLCK
== fl
->l_type
)
10943 lock_cmd
= CEPH_LOCK_SHARED
;
10944 else if (F_WRLCK
== fl
->l_type
)
10945 lock_cmd
= CEPH_LOCK_EXCL
;
// NOTE(review): stray double semicolon below (';;') — a harmless empty
// statement; left untouched because this is a comment-only pass.
10947 lock_cmd
= CEPH_LOCK_UNLOCK
;;
// Build a ceph_filelock record describing the lock being applied/removed.
10949 ceph_filelock filelock
;
10950 filelock
.start
= fl
->l_start
;
10951 filelock
.length
= fl
->l_len
;
10952 filelock
.client
= 0;
10953 // see comment in _do_filelock()
10954 filelock
.owner
= owner
| (1ULL << 63);
10955 filelock
.pid
= fl
->l_pid
;
10956 filelock
.type
= lock_cmd
;
// Unlock removes the record; otherwise add it to the local state.
10958 if (filelock
.type
== CEPH_LOCK_UNLOCK
) {
10959 list
<ceph_filelock
> activated_locks
;
10960 lock_state
->remove_lock(filelock
, activated_locks
);
10962 bool r
= lock_state
->add_lock(filelock
, false, false, NULL
);
// Query (F_GETLK-style) the lock state for this file handle: issues a
// CEPH_MDS_OP_GETFILELOCK via _do_filelock with sleep=0 (non-blocking).
// On success _do_filelock fills *fl with the conflicting lock, if any.
10967 int Client::_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
10969 Inode
*in
= fh
->inode
.get();
10970 ldout(cct
, 10) << "_getlk " << fh
<< " ino " << in
->ino
<< dendl
;
10971 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_GETFILELOCK
, 0, fl
, owner
);
// Set/clear an fcntl (POSIX) record lock: issues CEPH_MDS_OP_SETFILELOCK
// via _do_filelock. 'sleep' != 0 means block until the lock is granted
// (F_SETLKW semantics); 'owner' identifies the lock owner to the MDS.
10975 int Client::_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
10977 Inode
*in
= fh
->inode
.get();
10978 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< dendl
;
10979 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_SETFILELOCK
, sleep
, fl
, owner
);
10980 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
// BSD flock(2) implementation: translates the flock cmd into a whole-file
// CEPH_LOCK_FLOCK request sent via _do_filelock. LOCK_NB clears 'sleep'
// so the request does not block when the lock cannot be granted.
// NOTE(review): the cmd-to-l_type translation lines (F_RDLCK/F_WRLCK/
// F_UNLCK selection) are missing from this extracted view.
10984 int Client::_flock(Fh
*fh
, int cmd
, uint64_t owner
)
10986 Inode
*in
= fh
->inode
.get();
10987 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< dendl
;
// Non-blocking when LOCK_NB is set.
10989 int sleep
= !(cmd
& LOCK_NB
);
// Unrecognized cmd values are rejected.
11004 return -CEPHFS_EINVAL
;
// Whole-file lock: zeroed flock record with l_whence = SEEK_SET
// (l_start/l_len stay 0, i.e. the entire file).
11008 memset(&fl
, 0, sizeof(fl
));
11010 fl
.l_whence
= SEEK_SET
;
11012 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
, sleep
, &fl
, owner
);
11013 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
// Look up the snapshot id and metadata for 'path'. Returns
// -CEPHFS_ENOTCONN if not mounted, -CEPHFS_EINVAL if the path does not
// resolve to a snapshot inode (snapid == CEPH_NOSNAP), or the path_walk
// error; on success fills *snap_info.
11017 int Client::get_snap_info(const char *path
, const UserPerm
&perms
, SnapInfo
*snap_info
) {
// Reject the call unless the client is at least mounting.
11018 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11019 if (!mref_reader
.is_state_satisfied()) {
11020 return -CEPHFS_ENOTCONN
;
11023 std::unique_lock
locker(client_lock
);
// Follow symlinks (follow=true) to reach the target inode.
11025 int r
= Client::path_walk(path
, &in
, perms
, true);
// Only snapshot inodes are valid here.
11030 if (in
->snapid
== CEPH_NOSNAP
) {
11031 return -CEPHFS_EINVAL
;
11034 snap_info
->id
= in
->snapid
;
11035 snap_info
->metadata
= in
->snap_metadata
;
// Low-level statfs entry point: thin wrapper over Client::statfs(), which
// does its own locking, so no extra synchronization is needed here. The
// 'in' parameter is not used by the visible code.
11039 int Client::ll_statfs(Inode
*in
, struct statvfs
*stbuf
, const UserPerm
& perms
)
11041 /* Since the only thing this does is wrap a call to statfs, and
11042 statfs takes a lock, it doesn't seem we have a need to split it
11044 return statfs(0, stbuf
, perms
);
11047 void Client::ll_register_callbacks(struct ceph_client_callback_args
*args
)
11051 std::scoped_lock
l(client_lock
);
11052 ldout(cct
, 10) << __func__
<< " cb " << args
->handle
11053 << " invalidate_ino_cb " << args
->ino_cb
11054 << " invalidate_dentry_cb " << args
->dentry_cb
11055 << " switch_interrupt_cb " << args
->switch_intr_cb
11056 << " remount_cb " << args
->remount_cb
11058 callback_handle
= args
->handle
;
11059 if (args
->ino_cb
) {
11060 ino_invalidate_cb
= args
->ino_cb
;
11061 async_ino_invalidator
.start();
11063 if (args
->dentry_cb
) {
11064 dentry_invalidate_cb
= args
->dentry_cb
;
11065 async_dentry_invalidator
.start();
11067 if (args
->switch_intr_cb
) {
11068 switch_interrupt_cb
= args
->switch_intr_cb
;
11069 interrupt_finisher
.start();
11071 if (args
->remount_cb
) {
11072 remount_cb
= args
->remount_cb
;
11073 remount_finisher
.start();
11075 if (args
->ino_release_cb
) {
11076 ino_release_cb
= args
->ino_release_cb
;
11077 async_ino_releasor
.start();
11079 if (args
->umask_cb
)
11080 umask_cb
= args
->umask_cb
;
// Verify the dentry-invalidation strategy registered by the client shim:
// either a dentry_invalidate_cb (when invalidation is supported) or a
// remount_cb fallback, asserting the required callback is present.
// Requires the client to be initialized.
11083 int Client::test_dentry_handling(bool can_invalidate
)
11087 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
11088 if (!iref_reader
.is_state_satisfied())
11089 return -CEPHFS_ENOTCONN
;
// Remember the capability for later cache-trimming decisions.
11091 can_invalidate_dentries
= can_invalidate
;
11093 if (can_invalidate_dentries
) {
11094 ceph_assert(dentry_invalidate_cb
);
11095 ldout(cct
, 1) << "using dentry_invalidate_cb" << dendl
;
// Fallback path: a test remount proves the remount_cb works.
11098 ceph_assert(remount_cb
);
11099 ldout(cct
, 1) << "using remount_cb" << dendl
;
11100 r
= _do_remount(false);
11106 int Client::_sync_fs()
11108 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
11110 ldout(cct
, 10) << __func__
<< dendl
;
11113 std::unique_ptr
<C_SaferCond
> cond
= nullptr;
11114 if (cct
->_conf
->client_oc
) {
11115 cond
.reset(new C_SaferCond("Client::_sync_fs:lock"));
11116 objectcacher
->flush_all(cond
.get());
11121 ceph_tid_t flush_tid
= last_flush_tid
;
11123 // wait for unsafe mds requests
11124 wait_unsafe_requests();
11126 wait_sync_caps(flush_tid
);
11128 if (nullptr != cond
) {
11129 client_lock
.unlock();
11130 ldout(cct
, 15) << __func__
<< " waiting on data to flush" << dendl
;
11132 ldout(cct
, 15) << __func__
<< " flush finished" << dendl
;
11133 client_lock
.lock();
// Public entry point for syncing the filesystem: checks mount state,
// takes the client lock, then delegates to _sync_fs() (call site is on a
// line dropped from this extracted view).
11139 int Client::sync_fs()
11141 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11142 if (!mref_reader
.is_state_satisfied())
11143 return -CEPHFS_ENOTCONN
;
11145 std::scoped_lock
l(client_lock
);
// Drop all clean data cached by the object cacher; returns the value of
// ObjectCacher::release_all() (bytes/objects retained, per its contract —
// confirm against the ObjectCacher API).
11150 int64_t Client::drop_caches()
11152 std::scoped_lock
l(client_lock
);
11153 return objectcacher
->release_all();
11156 int Client::_lazyio(Fh
*fh
, int enable
)
11158 Inode
*in
= fh
->inode
.get();
11159 ldout(cct
, 20) << __func__
<< " " << *in
<< " " << !!enable
<< dendl
;
11161 if (!!(fh
->mode
& CEPH_FILE_MODE_LAZY
) == !!enable
)
11164 int orig_mode
= fh
->mode
;
11166 fh
->mode
|= CEPH_FILE_MODE_LAZY
;
11167 in
->get_open_ref(fh
->mode
);
11168 in
->put_open_ref(orig_mode
);
11169 check_caps(in
, CHECK_CAPS_NODELAY
);
11171 fh
->mode
&= ~CEPH_FILE_MODE_LAZY
;
11172 in
->get_open_ref(fh
->mode
);
11173 in
->put_open_ref(orig_mode
);
// fd-based wrapper for _lazyio(): toggles CEPH_FILE_MODE_LAZY on the open
// file. Returns -CEPHFS_EBADF when fd does not map to an open handle.
11180 int Client::lazyio(int fd
, int enable
)
11182 std::scoped_lock
l(client_lock
);
11183 Fh
*f
= get_filehandle(fd
);
11185 return -CEPHFS_EBADF
;
11187 return _lazyio(f
, enable
);
// Low-level (Fh-based) wrapper for _lazyio(): logs/traces the call, takes
// the client lock, and delegates.
11190 int Client::ll_lazyio(Fh
*fh
, int enable
)
11192 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << !!enable
<< dendl
;
11193 tout(cct
) << __func__
<< std::endl
;
11195 std::scoped_lock
lock(client_lock
);
11196 return _lazyio(fh
, enable
);
11199 int Client::lazyio_propagate(int fd
, loff_t offset
, size_t count
)
11201 std::scoped_lock
l(client_lock
);
11202 ldout(cct
, 3) << "op: client->lazyio_propagate(" << fd
11203 << ", " << offset
<< ", " << count
<< ")" << dendl
;
11205 Fh
*f
= get_filehandle(fd
);
11207 return -CEPHFS_EBADF
;
11215 int Client::lazyio_synchronize(int fd
, loff_t offset
, size_t count
)
11217 std::scoped_lock
l(client_lock
);
11218 ldout(cct
, 3) << "op: client->lazyio_synchronize(" << fd
11219 << ", " << offset
<< ", " << count
<< ")" << dendl
;
11221 Fh
*f
= get_filehandle(fd
);
11223 return -CEPHFS_EBADF
;
11224 Inode
*in
= f
->inode
.get();
11227 if (_release(in
)) {
11228 int r
=_getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
11236 // =============================
11239 int Client::mksnap(const char *relpath
, const char *name
, const UserPerm
& perm
,
11240 mode_t mode
, const std::map
<std::string
, std::string
> &metadata
)
11242 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11243 if (!mref_reader
.is_state_satisfied())
11244 return -CEPHFS_ENOTCONN
;
11246 std::scoped_lock
l(client_lock
);
11248 filepath
path(relpath
);
11250 int r
= path_walk(path
, &in
, perm
);
11253 if (cct
->_conf
->client_permissions
) {
11254 r
= may_create(in
.get(), perm
);
11258 Inode
*snapdir
= open_snapdir(in
.get());
11259 return _mkdir(snapdir
, name
, mode
, perm
, nullptr, metadata
);
11262 int Client::rmsnap(const char *relpath
, const char *name
, const UserPerm
& perms
, bool check_perms
)
11264 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11265 if (!mref_reader
.is_state_satisfied())
11266 return -CEPHFS_ENOTCONN
;
11268 std::scoped_lock
l(client_lock
);
11270 filepath
path(relpath
);
11272 int r
= path_walk(path
, &in
, perms
);
11275 Inode
*snapdir
= open_snapdir(in
.get());
11276 if (cct
->_conf
->client_permissions
) {
11277 r
= may_delete(snapdir
, check_perms
? name
: NULL
, perms
);
11281 return _rmdir(snapdir
, name
, perms
);
11284 // =============================
11287 int Client::get_caps_issued(int fd
)
11289 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11290 if (!mref_reader
.is_state_satisfied())
11291 return -CEPHFS_ENOTCONN
;
11293 std::scoped_lock
lock(client_lock
);
11295 Fh
*f
= get_filehandle(fd
);
11297 return -CEPHFS_EBADF
;
11299 return f
->inode
->caps_issued();
11302 int Client::get_caps_issued(const char *path
, const UserPerm
& perms
)
11304 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11305 if (!mref_reader
.is_state_satisfied())
11306 return -CEPHFS_ENOTCONN
;
11308 std::scoped_lock
lock(client_lock
);
11312 int r
= path_walk(p
, &in
, perms
, true);
11315 return in
->caps_issued();
11318 // =========================================
11321 Inode
*Client::open_snapdir(Inode
*diri
)
11324 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
11325 if (!inode_map
.count(vino
)) {
11326 in
= new Inode(this, vino
, &diri
->layout
);
11328 in
->ino
= diri
->ino
;
11329 in
->snapid
= CEPH_SNAPDIR
;
11330 in
->mode
= diri
->mode
;
11331 in
->uid
= diri
->uid
;
11332 in
->gid
= diri
->gid
;
11334 in
->mtime
= diri
->mtime
;
11335 in
->ctime
= diri
->ctime
;
11336 in
->btime
= diri
->btime
;
11337 in
->atime
= diri
->atime
;
11338 in
->size
= diri
->size
;
11339 in
->change_attr
= diri
->change_attr
;
11341 in
->dirfragtree
.clear();
11342 in
->snapdir_parent
= diri
;
11343 diri
->flags
|= I_SNAPDIR_OPEN
;
11344 inode_map
[vino
] = in
;
11345 if (use_faked_inos())
11346 _assign_faked_ino(in
);
11347 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
11349 in
= inode_map
[vino
];
11350 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
11355 int Client::ll_lookup(Inode
*parent
, const char *name
, struct stat
*attr
,
11356 Inode
**out
, const UserPerm
& perms
)
11358 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11359 if (!mref_reader
.is_state_satisfied())
11360 return -CEPHFS_ENOTCONN
;
11362 vinodeno_t vparent
= _get_vino(parent
);
11363 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
11364 tout(cct
) << __func__
<< std::endl
;
11365 tout(cct
) << name
<< std::endl
;
11367 std::scoped_lock
lock(client_lock
);
11370 if (!fuse_default_permissions
) {
11371 if (strcmp(name
, ".") && strcmp(name
, "..")) {
11372 r
= may_lookup(parent
, perms
);
11378 string
dname(name
);
11381 r
= _lookup(parent
, dname
, CEPH_STAT_CAP_INODE_ALL
, &in
, perms
);
11388 fill_stat(in
, attr
);
11392 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
11393 << " -> " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
11394 tout(cct
) << attr
->st_ino
<< std::endl
;
11399 int Client::ll_lookup_vino(
11401 const UserPerm
& perms
,
11404 ceph_assert(inode
!= NULL
);
11405 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11406 if (!mref_reader
.is_state_satisfied())
11407 return -CEPHFS_ENOTCONN
;
11409 std::scoped_lock
lock(client_lock
);
11410 ldout(cct
, 3) << __func__
<< " " << vino
<< dendl
;
11412 // Check the cache first
11413 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11414 if (p
!= inode_map
.end()) {
11415 *inode
= p
->second
;
11420 uint64_t snapid
= vino
.snapid
;
11422 // for snapdir, find the non-snapped dir inode
11423 if (snapid
== CEPH_SNAPDIR
)
11424 vino
.snapid
= CEPH_NOSNAP
;
11426 int r
= _lookup_vino(vino
, perms
, inode
);
11429 ceph_assert(*inode
!= NULL
);
11431 if (snapid
== CEPH_SNAPDIR
) {
11432 Inode
*tmp
= *inode
;
11434 // open the snapdir and put the inode ref
11435 *inode
= open_snapdir(tmp
);
11436 _ll_forget(tmp
, 1);
// Look up an inode by plain ino: wraps the ino in a vinodeno_t with
// CEPH_NOSNAP (the live, non-snapshot view) and defers to
// ll_lookup_vino().
11442 int Client::ll_lookup_inode(
11443 struct inodeno_t ino
,
11444 const UserPerm
& perms
,
11447 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
11448 return ll_lookup_vino(vino
, perms
, inode
);
11451 int Client::ll_lookupx(Inode
*parent
, const char *name
, Inode
**out
,
11452 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
11453 const UserPerm
& perms
)
11455 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11456 if (!mref_reader
.is_state_satisfied())
11457 return -CEPHFS_ENOTCONN
;
11459 vinodeno_t vparent
= _get_vino(parent
);
11460 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
11461 tout(cct
) << "ll_lookupx" << std::endl
;
11462 tout(cct
) << name
<< std::endl
;
11464 std::scoped_lock
lock(client_lock
);
11467 if (!fuse_default_permissions
) {
11468 r
= may_lookup(parent
, perms
);
11473 string
dname(name
);
11476 unsigned mask
= statx_to_mask(flags
, want
);
11477 r
= _lookup(parent
, dname
, mask
, &in
, perms
);
11483 fill_statx(in
, mask
, stx
);
11487 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
11488 << " -> " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
11489 tout(cct
) << stx
->stx_ino
<< std::endl
;
11494 int Client::ll_walk(const char* name
, Inode
**out
, struct ceph_statx
*stx
,
11495 unsigned int want
, unsigned int flags
, const UserPerm
& perms
)
11497 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11498 if (!mref_reader
.is_state_satisfied())
11499 return -CEPHFS_ENOTCONN
;
11501 filepath
fp(name
, 0);
11504 unsigned mask
= statx_to_mask(flags
, want
);
11506 ldout(cct
, 3) << __func__
<< " " << name
<< dendl
;
11507 tout(cct
) << __func__
<< std::endl
;
11508 tout(cct
) << name
<< std::endl
;
11510 std::scoped_lock
lock(client_lock
);
11511 rc
= path_walk(fp
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
11513 /* zero out mask, just in case... */
11520 fill_statx(in
, mask
, stx
);
11527 void Client::_ll_get(Inode
*in
)
11529 if (in
->ll_ref
== 0) {
11531 if (in
->is_dir() && !in
->dentries
.empty()) {
11532 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
11533 in
->get_first_parent()->get(); // pin dentry
11535 if (in
->snapid
!= CEPH_NOSNAP
)
11536 ll_snap_ref
[in
->snapid
]++;
11539 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " -> " << in
->ll_ref
<< dendl
;
11542 int Client::_ll_put(Inode
*in
, uint64_t num
)
11545 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " " << num
<< " -> " << in
->ll_ref
<< dendl
;
11546 if (in
->ll_ref
== 0) {
11547 if (in
->is_dir() && !in
->dentries
.empty()) {
11548 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
11549 in
->get_first_parent()->put(); // unpin dentry
11551 if (in
->snapid
!= CEPH_NOSNAP
) {
11552 auto p
= ll_snap_ref
.find(in
->snapid
);
11553 ceph_assert(p
!= ll_snap_ref
.end());
11554 ceph_assert(p
->second
> 0);
11555 if (--p
->second
== 0)
11556 ll_snap_ref
.erase(p
);
11565 void Client::_ll_drop_pins()
11567 ldout(cct
, 10) << __func__
<< dendl
;
11568 std::set
<InodeRef
> to_be_put
; //this set will be deconstructed item by item when exit
11569 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
11570 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
11571 it
!= inode_map
.end();
11573 Inode
*in
= it
->second
;
11577 to_be_put
.insert(in
);
11578 _ll_put(in
, in
->ll_ref
);
11583 bool Client::_ll_forget(Inode
*in
, uint64_t count
)
11585 inodeno_t ino
= in
->ino
;
11587 ldout(cct
, 8) << __func__
<< " " << ino
<< " " << count
<< dendl
;
11588 tout(cct
) << __func__
<< std::endl
;
11589 tout(cct
) << ino
.val
<< std::endl
;
11590 tout(cct
) << count
<< std::endl
;
11592 // Ignore forget if we're no longer mounted
11593 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11594 if (!mref_reader
.is_state_satisfied())
11597 if (ino
== 1) return true; // ignore forget on root.
11600 if (in
->ll_ref
< count
) {
11601 ldout(cct
, 1) << "WARNING: ll_forget on " << ino
<< " " << count
11602 << ", which only has ll_ref=" << in
->ll_ref
<< dendl
;
11603 _ll_put(in
, in
->ll_ref
);
11606 if (_ll_put(in
, count
) == 0)
// Locked wrapper around _ll_forget(): drops 'count' low-level references
// on 'in' under the client lock.
11613 bool Client::ll_forget(Inode
*in
, uint64_t count
)
11615 std::scoped_lock
lock(client_lock
);
11616 return _ll_forget(in
, count
);
// Drop a single low-level reference; equivalent to ll_forget(in, 1).
11619 bool Client::ll_put(Inode
*in
)
11621 /* ll_forget already takes the lock */
11622 return ll_forget(in
, 1);
// Return the low-level reference count held against snapshot 'snap'
// (tracked in ll_snap_ref; the found-entry return line is missing from
// this extracted view — presumably p->second, then 0 when absent).
11625 int Client::ll_get_snap_ref(snapid_t snap
)
11627 std::scoped_lock
lock(client_lock
);
11628 auto p
= ll_snap_ref
.find(snap
);
11629 if (p
!= ll_snap_ref
.end())
// Return the snapshot id of 'in' under the client lock (the return
// statement itself is on a line dropped from this extracted view).
11634 snapid_t
Client::ll_get_snapid(Inode
*in
)
11636 std::scoped_lock
lock(client_lock
);
11640 Inode
*Client::ll_get_inode(ino_t ino
)
11642 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11643 if (!mref_reader
.is_state_satisfied())
11646 std::scoped_lock
lock(client_lock
);
11648 vinodeno_t vino
= _map_faked_ino(ino
);
11649 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11650 if (p
== inode_map
.end())
11652 Inode
*in
= p
->second
;
11657 Inode
*Client::ll_get_inode(vinodeno_t vino
)
11659 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11660 if (!mref_reader
.is_state_satisfied())
11663 std::scoped_lock
lock(client_lock
);
11665 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11666 if (p
== inode_map
.end())
11668 Inode
*in
= p
->second
;
11673 int Client::_ll_getattr(Inode
*in
, int caps
, const UserPerm
& perms
)
11675 vinodeno_t vino
= _get_vino(in
);
11677 ldout(cct
, 8) << __func__
<< " " << vino
<< dendl
;
11678 tout(cct
) << __func__
<< std::endl
;
11679 tout(cct
) << vino
.ino
.val
<< std::endl
;
11681 if (vino
.snapid
< CEPH_NOSNAP
)
11684 return _getattr(in
, caps
, perms
);
// Low-level getattr: refreshes inode attributes via _ll_getattr with the
// full CEPH_STAT_CAP_INODE_ALL mask and, on success, fills the POSIX
// struct stat from the inode. Requires an active mount.
11687 int Client::ll_getattr(Inode
*in
, struct stat
*attr
, const UserPerm
& perms
)
11689 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11690 if (!mref_reader
.is_state_satisfied())
11691 return -CEPHFS_ENOTCONN
;
11693 std::scoped_lock
lock(client_lock
);
11695 int res
= _ll_getattr(in
, CEPH_STAT_CAP_INODE_ALL
, perms
);
// Translate inode state into the caller's struct stat.
11698 fill_stat(in
, attr
);
11699 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11703 int Client::ll_getattrx(Inode
*in
, struct ceph_statx
*stx
, unsigned int want
,
11704 unsigned int flags
, const UserPerm
& perms
)
11706 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11707 if (!mref_reader
.is_state_satisfied())
11708 return -CEPHFS_ENOTCONN
;
11710 std::scoped_lock
lock(client_lock
);
11713 unsigned mask
= statx_to_mask(flags
, want
);
11715 if (mask
&& !in
->caps_issued_mask(mask
, true))
11716 res
= _ll_getattr(in
, mask
, perms
);
11719 fill_statx(in
, mask
, stx
);
11720 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11724 int Client::_ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
11725 const UserPerm
& perms
, InodeRef
*inp
)
11727 vinodeno_t vino
= _get_vino(in
);
11729 ldout(cct
, 8) << __func__
<< " " << vino
<< " mask " << hex
<< mask
<< dec
11731 tout(cct
) << __func__
<< std::endl
;
11732 tout(cct
) << vino
.ino
.val
<< std::endl
;
11733 tout(cct
) << stx
->stx_mode
<< std::endl
;
11734 tout(cct
) << stx
->stx_uid
<< std::endl
;
11735 tout(cct
) << stx
->stx_gid
<< std::endl
;
11736 tout(cct
) << stx
->stx_size
<< std::endl
;
11737 tout(cct
) << stx
->stx_mtime
<< std::endl
;
11738 tout(cct
) << stx
->stx_atime
<< std::endl
;
11739 tout(cct
) << stx
->stx_btime
<< std::endl
;
11740 tout(cct
) << mask
<< std::endl
;
11742 if (!fuse_default_permissions
) {
11743 int res
= may_setattr(in
, stx
, mask
, perms
);
11748 mask
&= ~(CEPH_SETATTR_MTIME_NOW
| CEPH_SETATTR_ATIME_NOW
);
11750 return __setattrx(in
, stx
, mask
, perms
, inp
);
11753 int Client::ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
11754 const UserPerm
& perms
)
11756 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11757 if (!mref_reader
.is_state_satisfied())
11758 return -CEPHFS_ENOTCONN
;
11760 std::scoped_lock
lock(client_lock
);
11762 InodeRef
target(in
);
11763 int res
= _ll_setattrx(in
, stx
, mask
, perms
, &target
);
11765 ceph_assert(in
== target
.get());
11766 fill_statx(in
, in
->caps_issued(), stx
);
11769 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11773 int Client::ll_setattr(Inode
*in
, struct stat
*attr
, int mask
,
11774 const UserPerm
& perms
)
11776 struct ceph_statx stx
;
11777 stat_to_statx(attr
, &stx
);
11779 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11780 if (!mref_reader
.is_state_satisfied())
11781 return -CEPHFS_ENOTCONN
;
11783 std::scoped_lock
lock(client_lock
);
11785 InodeRef
target(in
);
11786 int res
= _ll_setattrx(in
, &stx
, mask
, perms
, &target
);
11788 ceph_assert(in
== target
.get());
11789 fill_stat(in
, attr
);
11792 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
// getxattr(2) equivalent: resolve 'path' (following symlinks, requesting
// xattr caps during the walk) and read extended attribute 'name' into
// 'value' via _getxattr. Requires an active mount.
11800 int Client::getxattr(const char *path
, const char *name
, void *value
, size_t size
,
11801 const UserPerm
& perms
)
11803 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11804 if (!mref_reader
.is_state_satisfied())
11805 return -CEPHFS_ENOTCONN
;
11807 std::scoped_lock
lock(client_lock
);
// follow=true: symlinks are dereferenced (contrast lgetxattr).
11810 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11813 return _getxattr(in
, name
, value
, size
, perms
);
// lgetxattr(2) equivalent: same as getxattr() but does NOT follow a
// trailing symlink (path_walk follow=false), so the xattr is read from
// the link itself.
11816 int Client::lgetxattr(const char *path
, const char *name
, void *value
, size_t size
,
11817 const UserPerm
& perms
)
11819 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11820 if (!mref_reader
.is_state_satisfied())
11821 return -CEPHFS_ENOTCONN
;
11823 std::scoped_lock
lock(client_lock
);
// follow=false: operate on the symlink, not its target.
11826 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11829 return _getxattr(in
, name
, value
, size
, perms
);
// fgetxattr(2) equivalent: read extended attribute 'name' from the inode
// behind open file descriptor 'fd'. Returns -CEPHFS_EBADF for an unknown
// fd; requires an active mount.
11832 int Client::fgetxattr(int fd
, const char *name
, void *value
, size_t size
,
11833 const UserPerm
& perms
)
11835 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11836 if (!mref_reader
.is_state_satisfied())
11837 return -CEPHFS_ENOTCONN
;
11839 std::scoped_lock
lock(client_lock
);
11841 Fh
*f
= get_filehandle(fd
);
11843 return -CEPHFS_EBADF
;
11844 return _getxattr(f
->inode
, name
, value
, size
, perms
);
11847 int Client::listxattr(const char *path
, char *list
, size_t size
,
11848 const UserPerm
& perms
)
11850 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11851 if (!mref_reader
.is_state_satisfied())
11852 return -CEPHFS_ENOTCONN
;
11854 std::scoped_lock
lock(client_lock
);
11857 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11860 return Client::_listxattr(in
.get(), list
, size
, perms
);
11863 int Client::llistxattr(const char *path
, char *list
, size_t size
,
11864 const UserPerm
& perms
)
11866 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11867 if (!mref_reader
.is_state_satisfied())
11868 return -CEPHFS_ENOTCONN
;
11870 std::scoped_lock
lock(client_lock
);
11873 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11876 return Client::_listxattr(in
.get(), list
, size
, perms
);
// flistxattr(2) equivalent: list extended attribute names of the inode
// behind open file descriptor 'fd' into 'list'. Returns -CEPHFS_EBADF for
// an unknown fd; requires an active mount.
11879 int Client::flistxattr(int fd
, char *list
, size_t size
, const UserPerm
& perms
)
11881 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11882 if (!mref_reader
.is_state_satisfied())
11883 return -CEPHFS_ENOTCONN
;
11885 std::scoped_lock
lock(client_lock
);
11887 Fh
*f
= get_filehandle(fd
);
11889 return -CEPHFS_EBADF
;
11890 return Client::_listxattr(f
->inode
.get(), list
, size
, perms
);
11893 int Client::removexattr(const char *path
, const char *name
,
11894 const UserPerm
& perms
)
11896 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11897 if (!mref_reader
.is_state_satisfied())
11898 return -CEPHFS_ENOTCONN
;
11900 std::scoped_lock
lock(client_lock
);
11903 int r
= Client::path_walk(path
, &in
, perms
, true);
11906 return _removexattr(in
, name
, perms
);
11909 int Client::lremovexattr(const char *path
, const char *name
,
11910 const UserPerm
& perms
)
11912 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11913 if (!mref_reader
.is_state_satisfied())
11914 return -CEPHFS_ENOTCONN
;
11916 std::scoped_lock
lock(client_lock
);
11919 int r
= Client::path_walk(path
, &in
, perms
, false);
11922 return _removexattr(in
, name
, perms
);
11925 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
11927 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11928 if (!mref_reader
.is_state_satisfied())
11929 return -CEPHFS_ENOTCONN
;
11931 std::scoped_lock
lock(client_lock
);
11933 Fh
*f
= get_filehandle(fd
);
11935 return -CEPHFS_EBADF
;
11936 return _removexattr(f
->inode
, name
, perms
);
11939 int Client::setxattr(const char *path
, const char *name
, const void *value
,
11940 size_t size
, int flags
, const UserPerm
& perms
)
11942 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11943 if (!mref_reader
.is_state_satisfied())
11944 return -CEPHFS_ENOTCONN
;
11946 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11948 std::scoped_lock
lock(client_lock
);
11951 int r
= Client::path_walk(path
, &in
, perms
, true);
11954 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11957 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
11958 size_t size
, int flags
, const UserPerm
& perms
)
11960 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11961 if (!mref_reader
.is_state_satisfied())
11962 return -CEPHFS_ENOTCONN
;
11964 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11966 std::scoped_lock
lock(client_lock
);
11969 int r
= Client::path_walk(path
, &in
, perms
, false);
11972 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11975 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
11976 int flags
, const UserPerm
& perms
)
11978 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11979 if (!mref_reader
.is_state_satisfied())
11980 return -CEPHFS_ENOTCONN
;
11982 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11984 std::scoped_lock
lock(client_lock
);
11986 Fh
*f
= get_filehandle(fd
);
11988 return -CEPHFS_EBADF
;
11989 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
11992 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
11993 const UserPerm
& perms
)
11997 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11999 r
= -CEPHFS_ENODATA
;
12001 // Do a force getattr to get the latest quota before returning
12002 // a value to userspace.
12004 if (vxattr
->flags
& VXATTR_RSTAT
) {
12005 flags
|= CEPH_STAT_RSTAT
;
12007 if (vxattr
->flags
& VXATTR_DIRSTAT
) {
12008 flags
|= CEPH_CAP_FILE_SHARED
;
12010 r
= _getattr(in
, flags
| CEPH_STAT_CAP_XATTR
, perms
, true);
12012 // Error from getattr!
12016 // call pointer-to-member function
12018 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
12019 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
12021 r
= -CEPHFS_ENODATA
;
12025 if (r
> (int)size
) {
12026 r
= -CEPHFS_ERANGE
;
12027 } else if (r
> 0) {
12028 memcpy(value
, buf
, r
);
12034 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
12035 r
= -CEPHFS_EOPNOTSUPP
;
12039 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
12042 r
= -CEPHFS_ENODATA
;
12043 if (in
->xattrs
.count(n
)) {
12044 r
= in
->xattrs
[n
].length();
12045 if (r
> 0 && size
!= 0) {
12046 if (size
>= (unsigned)r
)
12047 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
12049 r
= -CEPHFS_ERANGE
;
12054 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
12058 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
12059 const UserPerm
& perms
)
12061 if (cct
->_conf
->client_permissions
) {
12062 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
12066 return _getxattr(in
.get(), name
, value
, size
, perms
);
12069 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
12070 size_t size
, const UserPerm
& perms
)
12072 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12073 if (!mref_reader
.is_state_satisfied())
12074 return -CEPHFS_ENOTCONN
;
12076 vinodeno_t vino
= _get_vino(in
);
12078 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
12079 tout(cct
) << __func__
<< std::endl
;
12080 tout(cct
) << vino
.ino
.val
<< std::endl
;
12081 tout(cct
) << name
<< std::endl
;
12083 std::scoped_lock
lock(client_lock
);
12084 if (!fuse_default_permissions
) {
12085 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
12090 return _getxattr(in
, name
, value
, size
, perms
);
12093 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
12094 const UserPerm
& perms
)
12096 bool len_only
= (size
== 0);
12097 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
12103 for ([[maybe_unused
]] const auto &[xattr_name
, xattr_value_bl
] : in
->xattrs
) {
12104 if (xattr_name
.rfind("ceph.", 0) == 0) {
12108 size_t this_len
= xattr_name
.length() + 1;
12113 if (this_len
> size
) {
12114 r
= -CEPHFS_ERANGE
;
12118 memcpy(name
, xattr_name
.c_str(), this_len
);
12123 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
12127 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
12128 const UserPerm
& perms
)
12130 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12131 if (!mref_reader
.is_state_satisfied())
12132 return -CEPHFS_ENOTCONN
;
12134 vinodeno_t vino
= _get_vino(in
);
12136 ldout(cct
, 3) << __func__
<< " " << vino
<< " size " << size
<< dendl
;
12137 tout(cct
) << __func__
<< std::endl
;
12138 tout(cct
) << vino
.ino
.val
<< std::endl
;
12139 tout(cct
) << size
<< std::endl
;
12141 std::scoped_lock
lock(client_lock
);
12142 return _listxattr(in
, names
, size
, perms
);
12145 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
12146 size_t size
, int flags
, const UserPerm
& perms
)
12149 int xattr_flags
= 0;
12151 xattr_flags
|= CEPH_XATTR_REMOVE
;
12152 if (flags
& XATTR_CREATE
)
12153 xattr_flags
|= CEPH_XATTR_CREATE
;
12154 if (flags
& XATTR_REPLACE
)
12155 xattr_flags
|= CEPH_XATTR_REPLACE
;
12157 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
12159 in
->make_nosnap_relative_path(path
);
12160 req
->set_filepath(path
);
12161 req
->set_string2(name
);
12162 req
->set_inode(in
);
12163 req
->head
.args
.setxattr
.flags
= xattr_flags
;
12166 assert (value
|| size
== 0);
12167 bl
.append((const char*)value
, size
);
12170 int res
= make_request(req
, perms
);
12173 ldout(cct
, 3) << __func__
<< "(" << in
->ino
<< ", \"" << name
<< "\") = " <<
12178 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
12179 size_t size
, int flags
, const UserPerm
& perms
)
12181 if (in
->snapid
!= CEPH_NOSNAP
) {
12182 return -CEPHFS_EROFS
;
12187 } else if (value
== NULL
) {
12188 return -CEPHFS_EINVAL
;
12191 bool posix_acl_xattr
= false;
12192 if (acl_type
== POSIX_ACL
)
12193 posix_acl_xattr
= !strncmp(name
, "system.", 7);
12195 if (strncmp(name
, "user.", 5) &&
12196 strncmp(name
, "security.", 9) &&
12197 strncmp(name
, "trusted.", 8) &&
12198 strncmp(name
, "ceph.", 5) &&
12200 return -CEPHFS_EOPNOTSUPP
;
12202 bool check_realm
= false;
12204 if (posix_acl_xattr
) {
12205 if (!strcmp(name
, ACL_EA_ACCESS
)) {
12206 mode_t new_mode
= in
->mode
;
12208 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
12215 if (new_mode
!= in
->mode
) {
12216 struct ceph_statx stx
;
12217 stx
.stx_mode
= new_mode
;
12218 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, NULL
);
12223 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
12225 if (!S_ISDIR(in
->mode
))
12226 return -CEPHFS_EACCES
;
12227 int ret
= posix_acl_check(value
, size
);
12229 return -CEPHFS_EINVAL
;
12236 return -CEPHFS_EOPNOTSUPP
;
12239 const VXattr
*vxattr
= _match_vxattr(in
, name
);
12241 if (vxattr
->readonly
)
12242 return -CEPHFS_EOPNOTSUPP
;
12243 if (vxattr
->name
.compare(0, 10, "ceph.quota") == 0 && value
)
12244 check_realm
= true;
12248 int ret
= _do_setxattr(in
, name
, value
, size
, flags
, perms
);
12249 if (ret
>= 0 && check_realm
) {
12250 // check if snaprealm was created for quota inode
12251 if (in
->quota
.is_enable() &&
12252 !(in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
))
12253 ret
= -CEPHFS_EOPNOTSUPP
;
12259 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
12260 size_t size
, int flags
, const UserPerm
& perms
)
12262 if (cct
->_conf
->client_permissions
) {
12263 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
12267 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
12270 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
12273 if (name
== "layout") {
12274 string::iterator begin
= value
.begin();
12275 string::iterator end
= value
.end();
12276 keys_and_values
<string::iterator
> p
; // create instance of parser
12277 std::map
<string
, string
> m
; // map to receive results
12278 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
12279 return -CEPHFS_EINVAL
;
12282 return -CEPHFS_EINVAL
;
12283 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
12284 if (q
->first
== "pool") {
12289 } else if (name
== "layout.pool") {
12293 if (tmp
.length()) {
12296 pool
= boost::lexical_cast
<unsigned>(tmp
);
12297 if (!osdmap
->have_pg_pool(pool
))
12298 return -CEPHFS_ENOENT
;
12299 } catch (boost::bad_lexical_cast
const&) {
12300 pool
= osdmap
->lookup_pg_pool_name(tmp
);
12302 return -CEPHFS_ENOENT
;
12310 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
12312 // For setting pool of layout, MetaRequest need osdmap epoch.
12313 // There is a race which create a new data pool but client and mds both don't have.
12314 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
12315 ldout(cct
, 15) << __func__
<< ": name = " << name
<< dendl
;
12316 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
12317 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
12318 string
rest(strstr(name
, "layout"));
12319 string
v((const char*)value
, size
);
12320 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
12321 return _setxattr_check_data_pool(rest
, v
, &o
);
12324 if (r
== -CEPHFS_ENOENT
) {
12326 ldout(cct
, 20) << __func__
<< ": waiting for latest osdmap" << dendl
;
12327 objecter
->wait_for_latest_osdmap(ca::use_blocked
[ec
]);
12328 ldout(cct
, 20) << __func__
<< ": got latest osdmap: " << ec
<< dendl
;
12333 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
12334 size_t size
, int flags
, const UserPerm
& perms
)
12336 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12337 if (!mref_reader
.is_state_satisfied())
12338 return -CEPHFS_ENOTCONN
;
12340 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12342 vinodeno_t vino
= _get_vino(in
);
12344 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
12345 tout(cct
) << __func__
<< std::endl
;
12346 tout(cct
) << vino
.ino
.val
<< std::endl
;
12347 tout(cct
) << name
<< std::endl
;
12349 std::scoped_lock
lock(client_lock
);
12350 if (!fuse_default_permissions
) {
12351 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
12355 return _setxattr(in
, name
, value
, size
, flags
, perms
);
12358 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
12360 if (in
->snapid
!= CEPH_NOSNAP
) {
12361 return -CEPHFS_EROFS
;
12364 // same xattrs supported by kernel client
12365 if (strncmp(name
, "user.", 5) &&
12366 strncmp(name
, "system.", 7) &&
12367 strncmp(name
, "security.", 9) &&
12368 strncmp(name
, "trusted.", 8) &&
12369 strncmp(name
, "ceph.", 5))
12370 return -CEPHFS_EOPNOTSUPP
;
12372 const VXattr
*vxattr
= _match_vxattr(in
, name
);
12373 if (vxattr
&& vxattr
->readonly
)
12374 return -CEPHFS_EOPNOTSUPP
;
12376 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
12378 in
->make_nosnap_relative_path(path
);
12379 req
->set_filepath(path
);
12380 req
->set_filepath2(name
);
12381 req
->set_inode(in
);
12383 int res
= make_request(req
, perms
);
12386 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
12390 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
12392 if (cct
->_conf
->client_permissions
) {
12393 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
12397 return _removexattr(in
.get(), name
, perms
);
12400 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
12402 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12403 if (!mref_reader
.is_state_satisfied())
12404 return -CEPHFS_ENOTCONN
;
12406 vinodeno_t vino
= _get_vino(in
);
12408 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
12409 tout(cct
) << "ll_removexattr" << std::endl
;
12410 tout(cct
) << vino
.ino
.val
<< std::endl
;
12411 tout(cct
) << name
<< std::endl
;
12413 std::scoped_lock
lock(client_lock
);
12414 if (!fuse_default_permissions
) {
12415 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
12420 return _removexattr(in
, name
, perms
);
12423 bool Client::_vxattrcb_quota_exists(Inode
*in
)
12425 return in
->quota
.is_enable() &&
12426 (in
->snapid
!= CEPH_NOSNAP
||
12427 (in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
));
12429 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
12431 return snprintf(val
, size
,
12432 "max_bytes=%lld max_files=%lld",
12433 (long long int)in
->quota
.max_bytes
,
12434 (long long int)in
->quota
.max_files
);
12436 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
12438 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
12440 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
12442 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
12445 bool Client::_vxattrcb_layout_exists(Inode
*in
)
12447 return in
->layout
!= file_layout_t();
12449 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
12451 int r
= snprintf(val
, size
,
12452 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
12453 (unsigned long long)in
->layout
.stripe_unit
,
12454 (unsigned long long)in
->layout
.stripe_count
,
12455 (unsigned long long)in
->layout
.object_size
);
12456 objecter
->with_osdmap([&](const OSDMap
& o
) {
12457 if (o
.have_pg_pool(in
->layout
.pool_id
))
12458 r
+= snprintf(val
+ r
, size
- r
, "%s",
12459 o
.get_pool_name(in
->layout
.pool_id
).c_str());
12461 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
12462 (uint64_t)in
->layout
.pool_id
);
12464 if (in
->layout
.pool_ns
.length())
12465 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
12466 in
->layout
.pool_ns
.c_str());
12469 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
12471 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_unit
);
12473 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
12475 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_count
);
12477 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
12479 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.object_size
);
12481 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
12484 objecter
->with_osdmap([&](const OSDMap
& o
) {
12485 if (o
.have_pg_pool(in
->layout
.pool_id
))
12486 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
12487 in
->layout
.pool_id
).c_str());
12489 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
12493 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
12495 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
12497 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
12499 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
12501 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
12503 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nfiles
);
12505 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
12507 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nsubdirs
);
12509 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
12511 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
12513 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
12515 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rfiles
);
12517 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
12519 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsubdirs
);
12521 size_t Client::_vxattrcb_dir_rsnaps(Inode
*in
, char *val
, size_t size
)
12523 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsnaps
);
12525 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
12527 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rbytes
);
12529 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
12531 return snprintf(val
, size
, "%ld.%09ld", (long)in
->rstat
.rctime
.sec(),
12532 (long)in
->rstat
.rctime
.nsec());
12534 bool Client::_vxattrcb_dir_pin_exists(Inode
*in
)
12536 return in
->dir_pin
!= -CEPHFS_ENODATA
;
12538 size_t Client::_vxattrcb_dir_pin(Inode
*in
, char *val
, size_t size
)
12540 return snprintf(val
, size
, "%ld", (long)in
->dir_pin
);
12543 bool Client::_vxattrcb_snap_btime_exists(Inode
*in
)
12545 return !in
->snap_btime
.is_zero();
12548 size_t Client::_vxattrcb_snap_btime(Inode
*in
, char *val
, size_t size
)
12550 return snprintf(val
, size
, "%llu.%09lu",
12551 (long long unsigned)in
->snap_btime
.sec(),
12552 (long unsigned)in
->snap_btime
.nsec());
12555 bool Client::_vxattrcb_mirror_info_exists(Inode
*in
)
12557 // checking one of the xattrs would suffice
12558 return in
->xattrs
.count("ceph.mirror.info.cluster_id") != 0;
12561 size_t Client::_vxattrcb_mirror_info(Inode
*in
, char *val
, size_t size
)
12563 return snprintf(val
, size
, "cluster_id=%.*s fs_id=%.*s",
12564 in
->xattrs
["ceph.mirror.info.cluster_id"].length(),
12565 in
->xattrs
["ceph.mirror.info.cluster_id"].c_str(),
12566 in
->xattrs
["ceph.mirror.info.fs_id"].length(),
12567 in
->xattrs
["ceph.mirror.info.fs_id"].c_str());
12570 size_t Client::_vxattrcb_cluster_fsid(Inode
*in
, char *val
, size_t size
)
12572 return snprintf(val
, size
, "%s", monclient
->get_fsid().to_string().c_str());
12575 size_t Client::_vxattrcb_client_id(Inode
*in
, char *val
, size_t size
)
12577 auto name
= messenger
->get_myname();
12578 return snprintf(val
, size
, "%s%ld", name
.type_str(), name
.num());
// Helpers for building the VXattr tables below. CEPH_XATTR_NAME composes
// "ceph.<type>.<name>"; the X-macros expand to a full VXattr initializer.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

#define XATTR_NAME_CEPH(_type, _name, _flags)                 \
{                                                              \
  name: CEPH_XATTR_NAME(_type, _name),                         \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,     \
  readonly: true,                                              \
  exists_cb: NULL,                                             \
  flags: _flags,                                               \
}
#define XATTR_LAYOUT_FIELD(_type, _name, _field)               \
{                                                              \
  name: CEPH_XATTR_NAME2(_type, _name, _field),                \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,    \
  readonly: false,                                             \
  exists_cb: &Client::_vxattrcb_layout_exists,                 \
  flags: 0,                                                    \
}
#define XATTR_QUOTA_FIELD(_type, _name)                        \
{                                                              \
  name: CEPH_XATTR_NAME(_type, _name),                         \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,     \
  readonly: false,                                             \
  exists_cb: &Client::_vxattrcb_quota_exists,                  \
  flags: 0,                                                    \
}
12609 const Client::VXattr
Client::_dir_vxattrs
[] = {
12611 name
: "ceph.dir.layout",
12612 getxattr_cb
: &Client::_vxattrcb_layout
,
12614 exists_cb
: &Client::_vxattrcb_layout_exists
,
12617 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_unit
),
12618 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_count
),
12619 XATTR_LAYOUT_FIELD(dir
, layout
, object_size
),
12620 XATTR_LAYOUT_FIELD(dir
, layout
, pool
),
12621 XATTR_LAYOUT_FIELD(dir
, layout
, pool_namespace
),
12622 XATTR_NAME_CEPH(dir
, entries
, VXATTR_DIRSTAT
),
12623 XATTR_NAME_CEPH(dir
, files
, VXATTR_DIRSTAT
),
12624 XATTR_NAME_CEPH(dir
, subdirs
, VXATTR_DIRSTAT
),
12625 XATTR_NAME_CEPH(dir
, rentries
, VXATTR_RSTAT
),
12626 XATTR_NAME_CEPH(dir
, rfiles
, VXATTR_RSTAT
),
12627 XATTR_NAME_CEPH(dir
, rsubdirs
, VXATTR_RSTAT
),
12628 XATTR_NAME_CEPH(dir
, rsnaps
, VXATTR_RSTAT
),
12629 XATTR_NAME_CEPH(dir
, rbytes
, VXATTR_RSTAT
),
12630 XATTR_NAME_CEPH(dir
, rctime
, VXATTR_RSTAT
),
12632 name
: "ceph.quota",
12633 getxattr_cb
: &Client::_vxattrcb_quota
,
12635 exists_cb
: &Client::_vxattrcb_quota_exists
,
12638 XATTR_QUOTA_FIELD(quota
, max_bytes
),
12639 XATTR_QUOTA_FIELD(quota
, max_files
),
12641 name
: "ceph.dir.pin",
12642 getxattr_cb
: &Client::_vxattrcb_dir_pin
,
12644 exists_cb
: &Client::_vxattrcb_dir_pin_exists
,
12648 name
: "ceph.snap.btime",
12649 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
12651 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
12655 name
: "ceph.mirror.info",
12656 getxattr_cb
: &Client::_vxattrcb_mirror_info
,
12658 exists_cb
: &Client::_vxattrcb_mirror_info_exists
,
12661 { name
: "" } /* Required table terminator */
12664 const Client::VXattr
Client::_file_vxattrs
[] = {
12666 name
: "ceph.file.layout",
12667 getxattr_cb
: &Client::_vxattrcb_layout
,
12669 exists_cb
: &Client::_vxattrcb_layout_exists
,
12672 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
12673 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
12674 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
12675 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
12676 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
12678 name
: "ceph.snap.btime",
12679 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
12681 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
12684 { name
: "" } /* Required table terminator */
12687 const Client::VXattr
Client::_common_vxattrs
[] = {
12689 name
: "ceph.cluster_fsid",
12690 getxattr_cb
: &Client::_vxattrcb_cluster_fsid
,
12692 exists_cb
: nullptr,
12696 name
: "ceph.client_id",
12697 getxattr_cb
: &Client::_vxattrcb_client_id
,
12699 exists_cb
: nullptr,
12702 { name
: "" } /* Required table terminator */
12705 const Client::VXattr
*Client::_get_vxattrs(Inode
*in
)
12708 return _dir_vxattrs
;
12709 else if (in
->is_file())
12710 return _file_vxattrs
;
12714 const Client::VXattr
*Client::_match_vxattr(Inode
*in
, const char *name
)
12716 if (strncmp(name
, "ceph.", 5) == 0) {
12717 const VXattr
*vxattr
= _get_vxattrs(in
);
12719 while (!vxattr
->name
.empty()) {
12720 if (vxattr
->name
== name
)
12726 // for common vxattrs
12727 vxattr
= _common_vxattrs
;
12728 while (!vxattr
->name
.empty()) {
12729 if (vxattr
->name
== name
)
12738 int Client::ll_readlink(Inode
*in
, char *buf
, size_t buflen
, const UserPerm
& perms
)
12740 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12741 if (!mref_reader
.is_state_satisfied())
12742 return -CEPHFS_ENOTCONN
;
12744 vinodeno_t vino
= _get_vino(in
);
12746 ldout(cct
, 3) << "ll_readlink " << vino
<< dendl
;
12747 tout(cct
) << "ll_readlink" << std::endl
;
12748 tout(cct
) << vino
.ino
.val
<< std::endl
;
12750 std::scoped_lock
lock(client_lock
);
12751 for (auto dn
: in
->dentries
) {
12755 int r
= _readlink(in
, buf
, buflen
); // FIXME: no permission checking!
12756 ldout(cct
, 3) << "ll_readlink " << vino
<< " = " << r
<< dendl
;
12760 int Client::_mknod(Inode
*dir
, const char *name
, mode_t mode
, dev_t rdev
,
12761 const UserPerm
& perms
, InodeRef
*inp
)
12763 ldout(cct
, 8) << "_mknod(" << dir
->ino
<< " " << name
<< ", 0" << oct
12764 << mode
<< dec
<< ", " << rdev
<< ", uid " << perms
.uid()
12765 << ", gid " << perms
.gid() << ")" << dendl
;
12767 if (strlen(name
) > NAME_MAX
)
12768 return -CEPHFS_ENAMETOOLONG
;
12770 if (dir
->snapid
!= CEPH_NOSNAP
) {
12771 return -CEPHFS_EROFS
;
12773 if (is_quota_files_exceeded(dir
, perms
)) {
12774 return -CEPHFS_EDQUOT
;
12777 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_MKNOD
);
12780 dir
->make_nosnap_relative_path(path
);
12781 path
.push_dentry(name
);
12782 req
->set_filepath(path
);
12783 req
->set_inode(dir
);
12784 req
->head
.args
.mknod
.rdev
= rdev
;
12785 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12786 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12788 bufferlist xattrs_bl
;
12789 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12792 req
->head
.args
.mknod
.mode
= mode
;
12793 if (xattrs_bl
.length() > 0)
12794 req
->set_data(xattrs_bl
);
12797 res
= get_or_create(dir
, name
, &de
);
12800 req
->set_dentry(de
);
12802 res
= make_request(req
, perms
, inp
);
12806 ldout(cct
, 8) << "mknod(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12814 int Client::ll_mknod(Inode
*parent
, const char *name
, mode_t mode
,
12815 dev_t rdev
, struct stat
*attr
, Inode
**out
,
12816 const UserPerm
& perms
)
12818 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12819 if (!mref_reader
.is_state_satisfied())
12820 return -CEPHFS_ENOTCONN
;
12822 vinodeno_t vparent
= _get_vino(parent
);
12824 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
<< dendl
;
12825 tout(cct
) << "ll_mknod" << std::endl
;
12826 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12827 tout(cct
) << name
<< std::endl
;
12828 tout(cct
) << mode
<< std::endl
;
12829 tout(cct
) << rdev
<< std::endl
;
12831 std::scoped_lock
lock(client_lock
);
12832 if (!fuse_default_permissions
) {
12833 int r
= may_create(parent
, perms
);
12839 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12841 fill_stat(in
, attr
);
12844 tout(cct
) << attr
->st_ino
<< std::endl
;
12845 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
12846 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12851 int Client::ll_mknodx(Inode
*parent
, const char *name
, mode_t mode
,
12852 dev_t rdev
, Inode
**out
,
12853 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12854 const UserPerm
& perms
)
12856 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12857 if (!mref_reader
.is_state_satisfied())
12858 return -CEPHFS_ENOTCONN
;
12860 unsigned caps
= statx_to_mask(flags
, want
);
12862 vinodeno_t vparent
= _get_vino(parent
);
12864 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
<< dendl
;
12865 tout(cct
) << "ll_mknodx" << std::endl
;
12866 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12867 tout(cct
) << name
<< std::endl
;
12868 tout(cct
) << mode
<< std::endl
;
12869 tout(cct
) << rdev
<< std::endl
;
12871 std::scoped_lock
lock(client_lock
);
12873 if (!fuse_default_permissions
) {
12874 int r
= may_create(parent
, perms
);
12880 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12882 fill_statx(in
, caps
, stx
);
12885 tout(cct
) << stx
->stx_ino
<< std::endl
;
12886 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
12887 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12892 int Client::_create(Inode
*dir
, const char *name
, int flags
, mode_t mode
,
12893 InodeRef
*inp
, Fh
**fhp
, int stripe_unit
, int stripe_count
,
12894 int object_size
, const char *data_pool
, bool *created
,
12895 const UserPerm
& perms
, std::string alternate_name
)
12897 ldout(cct
, 8) << "_create(" << dir
->ino
<< " " << name
<< ", 0" << oct
<<
12898 mode
<< dec
<< ")" << dendl
;
12900 if (strlen(name
) > NAME_MAX
)
12901 return -CEPHFS_ENAMETOOLONG
;
12902 if (dir
->snapid
!= CEPH_NOSNAP
) {
12903 return -CEPHFS_EROFS
;
12905 if (is_quota_files_exceeded(dir
, perms
)) {
12906 return -CEPHFS_EDQUOT
;
12909 // use normalized flags to generate cmode
12910 int cflags
= ceph_flags_sys2wire(flags
);
12911 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
12912 cflags
|= CEPH_O_LAZY
;
12914 int cmode
= ceph_flags_to_mode(cflags
);
12916 int64_t pool_id
= -1;
12917 if (data_pool
&& *data_pool
) {
12918 pool_id
= objecter
->with_osdmap(
12919 std::mem_fn(&OSDMap::lookup_pg_pool_name
), data_pool
);
12921 return -CEPHFS_EINVAL
;
12922 if (pool_id
> 0xffffffffll
)
12923 return -CEPHFS_ERANGE
; // bummer!
12926 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_CREATE
);
12929 dir
->make_nosnap_relative_path(path
);
12930 path
.push_dentry(name
);
12931 req
->set_filepath(path
);
12932 req
->set_alternate_name(std::move(alternate_name
));
12933 req
->set_inode(dir
);
12934 req
->head
.args
.open
.flags
= cflags
| CEPH_O_CREAT
;
12936 req
->head
.args
.open
.stripe_unit
= stripe_unit
;
12937 req
->head
.args
.open
.stripe_count
= stripe_count
;
12938 req
->head
.args
.open
.object_size
= object_size
;
12939 if (cct
->_conf
->client_debug_getattr_caps
)
12940 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
12942 req
->head
.args
.open
.mask
= 0;
12943 req
->head
.args
.open
.pool
= pool_id
;
12944 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12945 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12948 bufferlist xattrs_bl
;
12949 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12952 req
->head
.args
.open
.mode
= mode
;
12953 if (xattrs_bl
.length() > 0)
12954 req
->set_data(xattrs_bl
);
12957 res
= get_or_create(dir
, name
, &de
);
12960 req
->set_dentry(de
);
12962 res
= make_request(req
, perms
, inp
, created
);
12967 /* If the caller passed a value in fhp, do the open */
12969 (*inp
)->get_open_ref(cmode
);
12970 *fhp
= _create_fh(inp
->get(), flags
, cmode
, perms
);
12976 ldout(cct
, 8) << "create(" << path
<< ", 0" << oct
<< mode
<< dec
12977 << " layout " << stripe_unit
12978 << ' ' << stripe_count
12979 << ' ' << object_size
12980 <<") = " << res
<< dendl
;
12988 int Client::_mkdir(Inode
*dir
, const char *name
, mode_t mode
, const UserPerm
& perm
,
12989 InodeRef
*inp
, const std::map
<std::string
, std::string
> &metadata
,
12990 std::string alternate_name
)
12992 ldout(cct
, 8) << "_mkdir(" << dir
->ino
<< " " << name
<< ", 0" << oct
12993 << mode
<< dec
<< ", uid " << perm
.uid()
12994 << ", gid " << perm
.gid() << ")" << dendl
;
12996 if (strlen(name
) > NAME_MAX
)
12997 return -CEPHFS_ENAMETOOLONG
;
12999 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
13000 return -CEPHFS_EROFS
;
13002 if (is_quota_files_exceeded(dir
, perm
)) {
13003 return -CEPHFS_EDQUOT
;
13006 bool is_snap_op
= dir
->snapid
== CEPH_SNAPDIR
;
13007 MetaRequest
*req
= new MetaRequest(is_snap_op
?
13008 CEPH_MDS_OP_MKSNAP
: CEPH_MDS_OP_MKDIR
);
13011 dir
->make_nosnap_relative_path(path
);
13012 path
.push_dentry(name
);
13013 req
->set_filepath(path
);
13014 req
->set_inode(dir
);
13015 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13016 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13017 req
->set_alternate_name(std::move(alternate_name
));
13021 int res
= _posix_acl_create(dir
, &mode
, bl
, perm
);
13024 req
->head
.args
.mkdir
.mode
= mode
;
13026 SnapPayload payload
;
13027 // clear the bufferlist that may have been populated by the call
13028 // to _posix_acl_create(). MDS mksnap does not make use of it.
13029 // So, reuse it to pass metadata payload.
13031 payload
.metadata
= metadata
;
13032 encode(payload
, bl
);
13034 if (bl
.length() > 0) {
13039 res
= get_or_create(dir
, name
, &de
);
13042 req
->set_dentry(de
);
13044 ldout(cct
, 10) << "_mkdir: making request" << dendl
;
13045 res
= make_request(req
, perm
, inp
);
13046 ldout(cct
, 10) << "_mkdir result is " << res
<< dendl
;
13050 ldout(cct
, 8) << "_mkdir(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
13058 int Client::ll_mkdir(Inode
*parent
, const char *name
, mode_t mode
,
13059 struct stat
*attr
, Inode
**out
, const UserPerm
& perm
)
13061 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13062 if (!mref_reader
.is_state_satisfied())
13063 return -CEPHFS_ENOTCONN
;
13065 vinodeno_t vparent
= _get_vino(parent
);
13067 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
<< dendl
;
13068 tout(cct
) << "ll_mkdir" << std::endl
;
13069 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13070 tout(cct
) << name
<< std::endl
;
13071 tout(cct
) << mode
<< std::endl
;
13073 std::scoped_lock
lock(client_lock
);
13075 if (!fuse_default_permissions
) {
13076 int r
= may_create(parent
, perm
);
13082 int r
= _mkdir(parent
, name
, mode
, perm
, &in
);
13084 fill_stat(in
, attr
);
13087 tout(cct
) << attr
->st_ino
<< std::endl
;
13088 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
13089 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
13094 int Client::ll_mkdirx(Inode
*parent
, const char *name
, mode_t mode
, Inode
**out
,
13095 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
13096 const UserPerm
& perms
)
13098 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13099 if (!mref_reader
.is_state_satisfied())
13100 return -CEPHFS_ENOTCONN
;
13102 vinodeno_t vparent
= _get_vino(parent
);
13104 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
<< dendl
;
13105 tout(cct
) << "ll_mkdirx" << std::endl
;
13106 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13107 tout(cct
) << name
<< std::endl
;
13108 tout(cct
) << mode
<< std::endl
;
13110 std::scoped_lock
lock(client_lock
);
13112 if (!fuse_default_permissions
) {
13113 int r
= may_create(parent
, perms
);
13119 int r
= _mkdir(parent
, name
, mode
, perms
, &in
);
13121 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
13127 tout(cct
) << stx
->stx_ino
<< std::endl
;
13128 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
13129 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
13134 int Client::_symlink(Inode
*dir
, const char *name
, const char *target
,
13135 const UserPerm
& perms
, std::string alternate_name
, InodeRef
*inp
)
13137 ldout(cct
, 8) << "_symlink(" << dir
->ino
<< " " << name
<< ", " << target
13138 << ", uid " << perms
.uid() << ", gid " << perms
.gid() << ")"
13141 if (strlen(name
) > NAME_MAX
)
13142 return -CEPHFS_ENAMETOOLONG
;
13144 if (dir
->snapid
!= CEPH_NOSNAP
) {
13145 return -CEPHFS_EROFS
;
13147 if (is_quota_files_exceeded(dir
, perms
)) {
13148 return -CEPHFS_EDQUOT
;
13151 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SYMLINK
);
13154 dir
->make_nosnap_relative_path(path
);
13155 path
.push_dentry(name
);
13156 req
->set_filepath(path
);
13157 req
->set_alternate_name(std::move(alternate_name
));
13158 req
->set_inode(dir
);
13159 req
->set_string2(target
);
13160 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13161 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13164 int res
= get_or_create(dir
, name
, &de
);
13167 req
->set_dentry(de
);
13169 res
= make_request(req
, perms
, inp
);
13172 ldout(cct
, 8) << "_symlink(\"" << path
<< "\", \"" << target
<< "\") = " <<
13181 int Client::ll_symlink(Inode
*parent
, const char *name
, const char *value
,
13182 struct stat
*attr
, Inode
**out
, const UserPerm
& perms
)
13184 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13185 if (!mref_reader
.is_state_satisfied())
13186 return -CEPHFS_ENOTCONN
;
13188 vinodeno_t vparent
= _get_vino(parent
);
13190 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
<< " -> " << value
13192 tout(cct
) << "ll_symlink" << std::endl
;
13193 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13194 tout(cct
) << name
<< std::endl
;
13195 tout(cct
) << value
<< std::endl
;
13197 std::scoped_lock
lock(client_lock
);
13199 if (!fuse_default_permissions
) {
13200 int r
= may_create(parent
, perms
);
13206 int r
= _symlink(parent
, name
, value
, perms
, "", &in
);
13208 fill_stat(in
, attr
);
13211 tout(cct
) << attr
->st_ino
<< std::endl
;
13212 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
13213 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
13218 int Client::ll_symlinkx(Inode
*parent
, const char *name
, const char *value
,
13219 Inode
**out
, struct ceph_statx
*stx
, unsigned want
,
13220 unsigned flags
, const UserPerm
& perms
)
13222 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13223 if (!mref_reader
.is_state_satisfied())
13224 return -CEPHFS_ENOTCONN
;
13226 vinodeno_t vparent
= _get_vino(parent
);
13228 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
<< " -> " << value
13230 tout(cct
) << "ll_symlinkx" << std::endl
;
13231 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13232 tout(cct
) << name
<< std::endl
;
13233 tout(cct
) << value
<< std::endl
;
13235 std::scoped_lock
lock(client_lock
);
13237 if (!fuse_default_permissions
) {
13238 int r
= may_create(parent
, perms
);
13244 int r
= _symlink(parent
, name
, value
, perms
, "", &in
);
13246 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
13249 tout(cct
) << stx
->stx_ino
<< std::endl
;
13250 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
13251 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
13256 int Client::_unlink(Inode
*dir
, const char *name
, const UserPerm
& perm
)
13258 ldout(cct
, 8) << "_unlink(" << dir
->ino
<< " " << name
13259 << " uid " << perm
.uid() << " gid " << perm
.gid()
13262 if (dir
->snapid
!= CEPH_NOSNAP
) {
13263 return -CEPHFS_EROFS
;
13266 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_UNLINK
);
13269 dir
->make_nosnap_relative_path(path
);
13270 path
.push_dentry(name
);
13271 req
->set_filepath(path
);
13277 int res
= get_or_create(dir
, name
, &de
);
13280 req
->set_dentry(de
);
13281 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13282 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13284 res
= _lookup(dir
, name
, 0, &otherin
, perm
);
13288 in
= otherin
.get();
13289 req
->set_other_inode(in
);
13290 in
->break_all_delegs();
13291 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
13293 req
->set_inode(dir
);
13295 res
= make_request(req
, perm
);
13298 ldout(cct
, 8) << "unlink(" << path
<< ") = " << res
<< dendl
;
13306 int Client::ll_unlink(Inode
*in
, const char *name
, const UserPerm
& perm
)
13308 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13309 if (!mref_reader
.is_state_satisfied())
13310 return -CEPHFS_ENOTCONN
;
13312 vinodeno_t vino
= _get_vino(in
);
13314 ldout(cct
, 3) << "ll_unlink " << vino
<< " " << name
<< dendl
;
13315 tout(cct
) << "ll_unlink" << std::endl
;
13316 tout(cct
) << vino
.ino
.val
<< std::endl
;
13317 tout(cct
) << name
<< std::endl
;
13319 std::scoped_lock
lock(client_lock
);
13321 if (!fuse_default_permissions
) {
13322 int r
= may_delete(in
, name
, perm
);
13326 return _unlink(in
, name
, perm
);
13329 int Client::_rmdir(Inode
*dir
, const char *name
, const UserPerm
& perms
)
13331 ldout(cct
, 8) << "_rmdir(" << dir
->ino
<< " " << name
<< " uid "
13332 << perms
.uid() << " gid " << perms
.gid() << ")" << dendl
;
13334 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
13335 return -CEPHFS_EROFS
;
13338 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_RMSNAP
: CEPH_MDS_OP_RMDIR
;
13339 MetaRequest
*req
= new MetaRequest(op
);
13341 dir
->make_nosnap_relative_path(path
);
13342 path
.push_dentry(name
);
13343 req
->set_filepath(path
);
13344 req
->set_inode(dir
);
13346 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13347 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13348 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
13353 int res
= get_or_create(dir
, name
, &de
);
13356 if (op
== CEPH_MDS_OP_RMDIR
)
13357 req
->set_dentry(de
);
13361 res
= _lookup(dir
, name
, 0, &in
, perms
);
13365 if (op
== CEPH_MDS_OP_RMSNAP
) {
13366 unlink(de
, true, true);
13369 req
->set_other_inode(in
.get());
13371 res
= make_request(req
, perms
);
13374 ldout(cct
, 8) << "rmdir(" << path
<< ") = " << res
<< dendl
;
13382 int Client::ll_rmdir(Inode
*in
, const char *name
, const UserPerm
& perms
)
13384 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13385 if (!mref_reader
.is_state_satisfied())
13386 return -CEPHFS_ENOTCONN
;
13388 vinodeno_t vino
= _get_vino(in
);
13390 ldout(cct
, 3) << "ll_rmdir " << vino
<< " " << name
<< dendl
;
13391 tout(cct
) << "ll_rmdir" << std::endl
;
13392 tout(cct
) << vino
.ino
.val
<< std::endl
;
13393 tout(cct
) << name
<< std::endl
;
13395 std::scoped_lock
lock(client_lock
);
13397 if (!fuse_default_permissions
) {
13398 int r
= may_delete(in
, name
, perms
);
13403 return _rmdir(in
, name
, perms
);
13406 int Client::_rename(Inode
*fromdir
, const char *fromname
, Inode
*todir
, const char *toname
, const UserPerm
& perm
, std::string alternate_name
)
13408 ldout(cct
, 8) << "_rename(" << fromdir
->ino
<< " " << fromname
<< " to "
13409 << todir
->ino
<< " " << toname
13410 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")"
13413 if (fromdir
->snapid
!= todir
->snapid
)
13414 return -CEPHFS_EXDEV
;
13416 int op
= CEPH_MDS_OP_RENAME
;
13417 if (fromdir
->snapid
!= CEPH_NOSNAP
) {
13418 if (fromdir
== todir
&& fromdir
->snapid
== CEPH_SNAPDIR
)
13419 op
= CEPH_MDS_OP_RENAMESNAP
;
13421 return -CEPHFS_EROFS
;
13423 if (fromdir
!= todir
) {
13424 Inode
*fromdir_root
=
13425 fromdir
->quota
.is_enable() ? fromdir
: get_quota_root(fromdir
, perm
);
13426 Inode
*todir_root
=
13427 todir
->quota
.is_enable() ? todir
: get_quota_root(todir
, perm
);
13428 if (fromdir_root
!= todir_root
) {
13429 return -CEPHFS_EXDEV
;
13434 MetaRequest
*req
= new MetaRequest(op
);
13437 fromdir
->make_nosnap_relative_path(from
);
13438 from
.push_dentry(fromname
);
13440 todir
->make_nosnap_relative_path(to
);
13441 to
.push_dentry(toname
);
13442 req
->set_filepath(to
);
13443 req
->set_filepath2(from
);
13444 req
->set_alternate_name(std::move(alternate_name
));
13447 int res
= get_or_create(fromdir
, fromname
, &oldde
);
13451 res
= get_or_create(todir
, toname
, &de
);
13455 if (op
== CEPH_MDS_OP_RENAME
) {
13456 req
->set_old_dentry(oldde
);
13457 req
->old_dentry_drop
= CEPH_CAP_FILE_SHARED
;
13458 req
->old_dentry_unless
= CEPH_CAP_FILE_EXCL
;
13460 req
->set_dentry(de
);
13461 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13462 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13464 InodeRef oldin
, otherin
;
13465 res
= _lookup(fromdir
, fromname
, 0, &oldin
, perm
);
13469 Inode
*oldinode
= oldin
.get();
13470 oldinode
->break_all_delegs();
13471 req
->set_old_inode(oldinode
);
13472 req
->old_inode_drop
= CEPH_CAP_LINK_SHARED
;
13474 res
= _lookup(todir
, toname
, 0, &otherin
, perm
);
13478 Inode
*in
= otherin
.get();
13479 req
->set_other_inode(in
);
13480 in
->break_all_delegs();
13482 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
13484 case -CEPHFS_ENOENT
:
13490 req
->set_inode(todir
);
13492 // renamesnap reply contains no tracedn, so we need to invalidate
13494 unlink(oldde
, true, true);
13495 unlink(de
, true, true);
13497 req
->set_inode(todir
);
13500 res
= make_request(req
, perm
, &target
);
13501 ldout(cct
, 10) << "rename result is " << res
<< dendl
;
13503 // renamed item from our cache
13506 ldout(cct
, 8) << "_rename(" << from
<< ", " << to
<< ") = " << res
<< dendl
;
13514 int Client::ll_rename(Inode
*parent
, const char *name
, Inode
*newparent
,
13515 const char *newname
, const UserPerm
& perm
)
13517 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13518 if (!mref_reader
.is_state_satisfied())
13519 return -CEPHFS_ENOTCONN
;
13521 vinodeno_t vparent
= _get_vino(parent
);
13522 vinodeno_t vnewparent
= _get_vino(newparent
);
13524 ldout(cct
, 3) << "ll_rename " << vparent
<< " " << name
<< " to "
13525 << vnewparent
<< " " << newname
<< dendl
;
13526 tout(cct
) << "ll_rename" << std::endl
;
13527 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13528 tout(cct
) << name
<< std::endl
;
13529 tout(cct
) << vnewparent
.ino
.val
<< std::endl
;
13530 tout(cct
) << newname
<< std::endl
;
13532 std::scoped_lock
lock(client_lock
);
13534 if (!fuse_default_permissions
) {
13535 int r
= may_delete(parent
, name
, perm
);
13538 r
= may_delete(newparent
, newname
, perm
);
13539 if (r
< 0 && r
!= -CEPHFS_ENOENT
)
13543 return _rename(parent
, name
, newparent
, newname
, perm
, "");
13546 int Client::_link(Inode
*in
, Inode
*dir
, const char *newname
, const UserPerm
& perm
, std::string alternate_name
, InodeRef
*inp
)
13548 ldout(cct
, 8) << "_link(" << in
->ino
<< " to " << dir
->ino
<< " " << newname
13549 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")" << dendl
;
13551 if (strlen(newname
) > NAME_MAX
)
13552 return -CEPHFS_ENAMETOOLONG
;
13554 if (in
->snapid
!= CEPH_NOSNAP
|| dir
->snapid
!= CEPH_NOSNAP
) {
13555 return -CEPHFS_EROFS
;
13557 if (is_quota_files_exceeded(dir
, perm
)) {
13558 return -CEPHFS_EDQUOT
;
13561 in
->break_all_delegs();
13562 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LINK
);
13564 filepath
path(newname
, dir
->ino
);
13565 req
->set_filepath(path
);
13566 req
->set_alternate_name(std::move(alternate_name
));
13567 filepath
existing(in
->ino
);
13568 req
->set_filepath2(existing
);
13570 req
->set_inode(dir
);
13571 req
->inode_drop
= CEPH_CAP_FILE_SHARED
;
13572 req
->inode_unless
= CEPH_CAP_FILE_EXCL
;
13575 int res
= get_or_create(dir
, newname
, &de
);
13578 req
->set_dentry(de
);
13580 res
= make_request(req
, perm
, inp
);
13581 ldout(cct
, 10) << "link result is " << res
<< dendl
;
13584 ldout(cct
, 8) << "link(" << existing
<< ", " << path
<< ") = " << res
<< dendl
;
13592 int Client::ll_link(Inode
*in
, Inode
*newparent
, const char *newname
,
13593 const UserPerm
& perm
)
13595 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13596 if (!mref_reader
.is_state_satisfied())
13597 return -CEPHFS_ENOTCONN
;
13599 vinodeno_t vino
= _get_vino(in
);
13600 vinodeno_t vnewparent
= _get_vino(newparent
);
13602 ldout(cct
, 3) << "ll_link " << vino
<< " to " << vnewparent
<< " " <<
13604 tout(cct
) << "ll_link" << std::endl
;
13605 tout(cct
) << vino
.ino
.val
<< std::endl
;
13606 tout(cct
) << vnewparent
<< std::endl
;
13607 tout(cct
) << newname
<< std::endl
;
13611 std::scoped_lock
lock(client_lock
);
13613 if (!fuse_default_permissions
) {
13614 if (S_ISDIR(in
->mode
))
13615 return -CEPHFS_EPERM
;
13617 int r
= may_hardlink(in
, perm
);
13621 r
= may_create(newparent
, perm
);
13626 return _link(in
, newparent
, newname
, perm
, "", &target
);
13629 int Client::ll_num_osds(void)
13631 std::scoped_lock
lock(client_lock
);
13632 return objecter
->with_osdmap(std::mem_fn(&OSDMap::get_num_osds
));
13635 int Client::ll_osdaddr(int osd
, uint32_t *addr
)
13637 std::scoped_lock
lock(client_lock
);
13640 bool exists
= objecter
->with_osdmap([&](const OSDMap
& o
) {
13641 if (!o
.exists(osd
))
13643 g
= o
.get_addrs(osd
).front();
13648 uint32_t nb_addr
= (g
.in4_addr()).sin_addr
.s_addr
;
13649 *addr
= ntohl(nb_addr
);
13653 uint32_t Client::ll_stripe_unit(Inode
*in
)
13655 std::scoped_lock
lock(client_lock
);
13656 return in
->layout
.stripe_unit
;
13659 uint64_t Client::ll_snap_seq(Inode
*in
)
13661 std::scoped_lock
lock(client_lock
);
13662 return in
->snaprealm
->seq
;
13665 int Client::ll_file_layout(Inode
*in
, file_layout_t
*layout
)
13667 std::scoped_lock
lock(client_lock
);
13668 *layout
= in
->layout
;
13672 int Client::ll_file_layout(Fh
*fh
, file_layout_t
*layout
)
13674 return ll_file_layout(fh
->inode
.get(), layout
);
13677 /* Currently we cannot take advantage of redundancy in reads, since we
13678 would have to go through all possible placement groups (a
13679 potentially quite large number determined by a hash), and use CRUSH
13680 to calculate the appropriate set of OSDs for each placement group,
13681 then index into that. An array with one entry per OSD is much more
13682 tractable and works for demonstration purposes. */
13684 int Client::ll_get_stripe_osd(Inode
*in
, uint64_t blockno
,
13685 file_layout_t
* layout
)
13687 std::scoped_lock
lock(client_lock
);
13689 inodeno_t ino
= in
->ino
;
13690 uint32_t object_size
= layout
->object_size
;
13691 uint32_t su
= layout
->stripe_unit
;
13692 uint32_t stripe_count
= layout
->stripe_count
;
13693 uint64_t stripes_per_object
= object_size
/ su
;
13694 uint64_t stripeno
= 0, stripepos
= 0;
13697 stripeno
= blockno
/ stripe_count
; // which horizontal stripe (Y)
13698 stripepos
= blockno
% stripe_count
; // which object in the object set (X)
13700 uint64_t objectsetno
= stripeno
/ stripes_per_object
; // which object set
13701 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
; // object id
13703 object_t oid
= file_object_t(ino
, objectno
);
13704 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13705 ceph_object_layout olayout
=
13706 o
.file_to_object_layout(oid
, *layout
);
13707 pg_t pg
= (pg_t
)olayout
.ol_pgid
;
13710 o
.pg_to_acting_osds(pg
, &osds
, &primary
);
13715 /* Return the offset of the block, internal to the object */
13717 uint64_t Client::ll_get_internal_offset(Inode
*in
, uint64_t blockno
)
13719 std::scoped_lock
lock(client_lock
);
13720 file_layout_t
*layout
=&(in
->layout
);
13721 uint32_t object_size
= layout
->object_size
;
13722 uint32_t su
= layout
->stripe_unit
;
13723 uint64_t stripes_per_object
= object_size
/ su
;
13725 return (blockno
% stripes_per_object
) * su
;
13728 int Client::ll_opendir(Inode
*in
, int flags
, dir_result_t
** dirpp
,
13729 const UserPerm
& perms
)
13731 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13732 if (!mref_reader
.is_state_satisfied())
13733 return -CEPHFS_ENOTCONN
;
13735 vinodeno_t vino
= _get_vino(in
);
13737 ldout(cct
, 3) << "ll_opendir " << vino
<< dendl
;
13738 tout(cct
) << "ll_opendir" << std::endl
;
13739 tout(cct
) << vino
.ino
.val
<< std::endl
;
13741 std::scoped_lock
lock(client_lock
);
13743 if (!fuse_default_permissions
) {
13744 int r
= may_open(in
, flags
, perms
);
13749 int r
= _opendir(in
, dirpp
, perms
);
13750 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
13752 ldout(cct
, 3) << "ll_opendir " << vino
<< " = " << r
<< " (" << *dirpp
<< ")"
13757 int Client::ll_releasedir(dir_result_t
*dirp
)
13759 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13760 if (!mref_reader
.is_state_satisfied())
13761 return -CEPHFS_ENOTCONN
;
13763 ldout(cct
, 3) << "ll_releasedir " << dirp
<< dendl
;
13764 tout(cct
) << "ll_releasedir" << std::endl
;
13765 tout(cct
) << (uintptr_t)dirp
<< std::endl
;
13767 std::scoped_lock
lock(client_lock
);
13773 int Client::ll_fsyncdir(dir_result_t
*dirp
)
13775 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13776 if (!mref_reader
.is_state_satisfied())
13777 return -CEPHFS_ENOTCONN
;
13779 ldout(cct
, 3) << "ll_fsyncdir " << dirp
<< dendl
;
13780 tout(cct
) << "ll_fsyncdir" << std::endl
;
13781 tout(cct
) << (uintptr_t)dirp
<< std::endl
;
13783 std::scoped_lock
lock(client_lock
);
13784 return _fsync(dirp
->inode
.get(), false);
13787 int Client::ll_open(Inode
*in
, int flags
, Fh
**fhp
, const UserPerm
& perms
)
13789 ceph_assert(!(flags
& O_CREAT
));
13791 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13792 if (!mref_reader
.is_state_satisfied())
13793 return -CEPHFS_ENOTCONN
;
13795 vinodeno_t vino
= _get_vino(in
);
13797 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) << dendl
;
13798 tout(cct
) << "ll_open" << std::endl
;
13799 tout(cct
) << vino
.ino
.val
<< std::endl
;
13800 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13802 std::scoped_lock
lock(client_lock
);
13805 if (!fuse_default_permissions
) {
13806 r
= may_open(in
, flags
, perms
);
13811 r
= _open(in
, flags
, 0, fhp
/* may be NULL */, perms
);
13814 Fh
*fhptr
= fhp
? *fhp
: NULL
;
13816 ll_unclosed_fh_set
.insert(fhptr
);
13818 tout(cct
) << (uintptr_t)fhptr
<< std::endl
;
13819 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) <<
13820 " = " << r
<< " (" << fhptr
<< ")" << dendl
;
13824 int Client::_ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13825 int flags
, InodeRef
*in
, int caps
, Fh
**fhp
,
13826 const UserPerm
& perms
)
13830 vinodeno_t vparent
= _get_vino(parent
);
13832 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13833 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << ", uid " << perms
.uid()
13834 << ", gid " << perms
.gid() << dendl
;
13835 tout(cct
) << "ll_create" << std::endl
;
13836 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13837 tout(cct
) << name
<< std::endl
;
13838 tout(cct
) << mode
<< std::endl
;
13839 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13841 bool created
= false;
13842 int r
= _lookup(parent
, name
, caps
, in
, perms
);
13844 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
13845 return -CEPHFS_EEXIST
;
13847 if (r
== -CEPHFS_ENOENT
&& (flags
& O_CREAT
)) {
13848 if (!fuse_default_permissions
) {
13849 r
= may_create(parent
, perms
);
13853 r
= _create(parent
, name
, flags
, mode
, in
, fhp
, 0, 0, 0, NULL
, &created
,
13864 ldout(cct
, 20) << "_ll_create created = " << created
<< dendl
;
13866 if (!fuse_default_permissions
) {
13867 r
= may_open(in
->get(), flags
, perms
);
13870 int release_r
= _release_fh(*fhp
);
13871 ceph_assert(release_r
== 0); // during create, no async data ops should have happened
13876 if (*fhp
== NULL
) {
13877 r
= _open(in
->get(), flags
, mode
, fhp
, perms
);
13885 ll_unclosed_fh_set
.insert(*fhp
);
13890 Inode
*inode
= in
->get();
13891 if (use_faked_inos())
13892 ino
= inode
->faked_ino
;
13897 tout(cct
) << (uintptr_t)*fhp
<< std::endl
;
13898 tout(cct
) << ino
<< std::endl
;
13899 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13900 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << " = " << r
<< " (" <<
13901 *fhp
<< " " << hex
<< ino
<< dec
<< ")" << dendl
;
13906 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13907 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
13908 const UserPerm
& perms
)
13910 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13911 if (!mref_reader
.is_state_satisfied())
13912 return -CEPHFS_ENOTCONN
;
13914 std::scoped_lock
lock(client_lock
);
13917 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
13922 // passing an Inode in outp requires an additional ref
13927 fill_stat(in
, attr
);
13935 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
13936 int oflags
, Inode
**outp
, Fh
**fhp
,
13937 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
13938 const UserPerm
& perms
)
13940 unsigned caps
= statx_to_mask(lflags
, want
);
13941 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13942 if (!mref_reader
.is_state_satisfied())
13943 return -CEPHFS_ENOTCONN
;
13945 std::scoped_lock
lock(client_lock
);
13948 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
13952 // passing an Inode in outp requires an additional ref
13957 fill_statx(in
, caps
, stx
);
13966 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
13968 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13969 if (!mref_reader
.is_state_satisfied())
13970 return -CEPHFS_ENOTCONN
;
13972 tout(cct
) << "ll_lseek" << std::endl
;
13973 tout(cct
) << offset
<< std::endl
;
13974 tout(cct
) << whence
<< std::endl
;
13976 std::scoped_lock
lock(client_lock
);
13977 return _lseek(fh
, offset
, whence
);
13980 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
13982 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13983 if (!mref_reader
.is_state_satisfied())
13984 return -CEPHFS_ENOTCONN
;
13986 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
13987 tout(cct
) << "ll_read" << std::endl
;
13988 tout(cct
) << (uintptr_t)fh
<< std::endl
;
13989 tout(cct
) << off
<< std::endl
;
13990 tout(cct
) << len
<< std::endl
;
13992 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13993 len
= std::min(len
, (loff_t
)INT_MAX
);
13994 std::scoped_lock
lock(client_lock
);
13996 int r
= _read(fh
, off
, len
, bl
);
13997 ldout(cct
, 3) << "ll_read " << fh
<< " " << off
<< "~" << len
<< " = " << r
14002 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
14006 file_layout_t
* layout
)
14008 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14009 if (!mref_reader
.is_state_satisfied())
14010 return -CEPHFS_ENOTCONN
;
14012 vinodeno_t vino
= _get_vino(in
);
14013 object_t oid
= file_object_t(vino
.ino
, blockid
);
14014 C_SaferCond onfinish
;
14017 objecter
->read(oid
,
14018 object_locator_t(layout
->pool_id
),
14023 CEPH_OSD_FLAG_READ
,
14026 int r
= onfinish
.wait();
14028 bl
.begin().copy(bl
.length(), buf
);
14035 /* It appears that the OSD doesn't return success unless the entire
14036 buffer was written, return the write length on success. */
14038 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
14039 char* buf
, uint64_t offset
,
14040 uint64_t length
, file_layout_t
* layout
,
14041 uint64_t snapseq
, uint32_t sync
)
14043 vinodeno_t vino
= ll_get_vino(in
);
14045 std::unique_ptr
<C_SaferCond
> onsafe
= nullptr;
14047 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14048 if (!mref_reader
.is_state_satisfied())
14049 return -CEPHFS_ENOTCONN
;
14052 return -CEPHFS_EINVAL
;
14054 if (true || sync
) {
14055 /* if write is stable, the epilogue is waiting on
14057 onsafe
.reset(new C_SaferCond("Client::ll_write_block flock"));
14059 object_t oid
= file_object_t(vino
.ino
, blockid
);
14060 SnapContext fakesnap
;
14061 ceph::bufferlist bl
;
14063 bl
.push_back(buffer::copy(buf
, length
));
14066 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
14069 fakesnap
.seq
= snapseq
;
14071 /* lock just in time */
14072 objecter
->write(oid
,
14073 object_locator_t(layout
->pool_id
),
14078 ceph::real_clock::now(),
14082 if (nullptr != onsafe
) {
14083 r
= onsafe
->wait();
14093 int Client::ll_commit_blocks(Inode
*in
,
14098 BarrierContext *bctx;
14099 vinodeno_t vino = _get_vino(in);
14100 uint64_t ino = vino.ino;
14102 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14103 << offset << " to " << length << dendl;
14106 return -CEPHFS_EINVAL;
14109 std::scoped_lock lock(client_lock);
14110 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14111 if (p != barriers.end()) {
14112 barrier_interval civ(offset, offset + length);
14113 p->second->commit_barrier(civ);
14119 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
14121 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
14122 "~" << len
<< dendl
;
14123 tout(cct
) << "ll_write" << std::endl
;
14124 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14125 tout(cct
) << off
<< std::endl
;
14126 tout(cct
) << len
<< std::endl
;
14128 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14129 if (!mref_reader
.is_state_satisfied())
14130 return -CEPHFS_ENOTCONN
;
14132 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14133 len
= std::min(len
, (loff_t
)INT_MAX
);
14134 std::scoped_lock
lock(client_lock
);
14136 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
14137 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
14142 int64_t Client::ll_writev(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
14144 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14145 if (!mref_reader
.is_state_satisfied())
14146 return -CEPHFS_ENOTCONN
;
14148 std::unique_lock
cl(client_lock
);
14149 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, true, false, cl
);
14152 int64_t Client::ll_readv(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
14154 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14155 if (!mref_reader
.is_state_satisfied())
14156 return -CEPHFS_ENOTCONN
;
14158 std::unique_lock
cl(client_lock
);
14159 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, false, false, cl
);
14162 int Client::ll_flush(Fh
*fh
)
14164 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14165 if (!mref_reader
.is_state_satisfied())
14166 return -CEPHFS_ENOTCONN
;
14168 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14169 tout(cct
) << "ll_flush" << std::endl
;
14170 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14172 std::scoped_lock
lock(client_lock
);
14176 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
14178 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14179 if (!mref_reader
.is_state_satisfied())
14180 return -CEPHFS_ENOTCONN
;
14182 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14183 tout(cct
) << "ll_fsync" << std::endl
;
14184 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14186 std::scoped_lock
lock(client_lock
);
14187 int r
= _fsync(fh
, syncdataonly
);
14189 // If we're returning an error, clear it from the FH
14190 fh
->take_async_err();
14195 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
14197 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14198 if (!mref_reader
.is_state_satisfied())
14199 return -CEPHFS_ENOTCONN
;
14201 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
14202 tout(cct
) << "ll_sync_inode" << std::endl
;
14203 tout(cct
) << (uintptr_t)in
<< std::endl
;
14205 std::scoped_lock
lock(client_lock
);
14206 return _fsync(in
, syncdataonly
);
14209 #ifdef FALLOC_FL_PUNCH_HOLE
14211 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
14213 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
14215 if (offset
< 0 || length
<= 0)
14216 return -CEPHFS_EINVAL
;
14218 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
14219 return -CEPHFS_EOPNOTSUPP
;
14221 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
14222 return -CEPHFS_EOPNOTSUPP
;
14224 Inode
*in
= fh
->inode
.get();
14226 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
14227 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
14228 return -CEPHFS_ENOSPC
;
14231 if (in
->snapid
!= CEPH_NOSNAP
)
14232 return -CEPHFS_EROFS
;
14234 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
14235 return -CEPHFS_EBADF
;
14237 uint64_t size
= offset
+ length
;
14238 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
14240 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
)) {
14241 return -CEPHFS_EDQUOT
;
14245 int r
= get_caps(fh
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
14249 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
14250 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
14251 if (in
->inline_version
< CEPH_INLINE_NONE
&&
14252 (have
& CEPH_CAP_FILE_BUFFER
)) {
14254 auto inline_iter
= in
->inline_data
.cbegin();
14255 int len
= in
->inline_data
.length();
14256 if (offset
< len
) {
14258 inline_iter
.copy(offset
, bl
);
14260 if (offset
+ size
> len
)
14261 size
= len
- offset
;
14263 bl
.append_zero(size
);
14264 if (offset
+ size
< len
) {
14265 inline_iter
+= size
;
14266 inline_iter
.copy(len
- offset
- size
, bl
);
14268 in
->inline_data
= bl
;
14269 in
->inline_version
++;
14271 in
->mtime
= in
->ctime
= ceph_clock_now();
14273 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14275 if (in
->inline_version
< CEPH_INLINE_NONE
) {
14276 onuninline
.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14277 uninline_data(in
, onuninline
.get());
14280 C_SaferCond
onfinish("Client::_punch_hole flock");
14282 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
14284 _invalidate_inode_cache(in
, offset
, length
);
14285 filer
->zero(in
->ino
, &in
->layout
,
14286 in
->snaprealm
->get_snap_context(),
14288 ceph::real_clock::now(),
14289 0, true, &onfinish
);
14290 in
->mtime
= in
->ctime
= ceph_clock_now();
14292 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14294 client_lock
.unlock();
14296 client_lock
.lock();
14297 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
14299 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
14300 uint64_t size
= offset
+ length
;
14301 if (size
> in
->size
) {
14303 in
->mtime
= in
->ctime
= ceph_clock_now();
14305 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14307 if (is_quota_bytes_approaching(in
, fh
->actor_perms
)) {
14308 check_caps(in
, CHECK_CAPS_NODELAY
);
14309 } else if (is_max_size_approaching(in
)) {
14315 if (nullptr != onuninline
) {
14316 client_lock
.unlock();
14317 int ret
= onuninline
->wait();
14318 client_lock
.lock();
14320 if (ret
>= 0 || ret
== -CEPHFS_ECANCELED
) {
14321 in
->inline_data
.clear();
14322 in
->inline_version
= CEPH_INLINE_NONE
;
14323 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14329 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
// Fallback stub for platforms that lack fallocate()/FALLOC_FL_PUNCH_HOLE
// support (presumably the #else arm of an #ifdef not visible in this chunk
// — TODO confirm against the full file): always reports "not supported".
14334 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
14336 return -CEPHFS_EOPNOTSUPP
;
14342 int Client::ll_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
14344 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14345 if (!mref_reader
.is_state_satisfied())
14346 return -CEPHFS_ENOTCONN
;
14348 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14349 tout(cct
) << __func__
<< " " << mode
<< " " << offset
<< " " << length
<< std::endl
;
14350 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14352 std::scoped_lock
lock(client_lock
);
14353 return _fallocate(fh
, mode
, offset
, length
);
14356 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
14358 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14359 if (!mref_reader
.is_state_satisfied())
14360 return -CEPHFS_ENOTCONN
;
14362 tout(cct
) << __func__
<< " " << " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
14364 std::scoped_lock
lock(client_lock
);
14365 Fh
*fh
= get_filehandle(fd
);
14367 return -CEPHFS_EBADF
;
14368 #if defined(__linux__) && defined(O_PATH)
14369 if (fh
->flags
& O_PATH
)
14370 return -CEPHFS_EBADF
;
14372 return _fallocate(fh
, mode
, offset
, length
);
14375 int Client::ll_release(Fh
*fh
)
14377 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14378 if (!mref_reader
.is_state_satisfied())
14379 return -CEPHFS_ENOTCONN
;
14381 ldout(cct
, 3) << __func__
<< " (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
14383 tout(cct
) << __func__
<< " (fh)" << std::endl
;
14384 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14386 std::scoped_lock
lock(client_lock
);
14388 if (ll_unclosed_fh_set
.count(fh
))
14389 ll_unclosed_fh_set
.erase(fh
);
14390 return _release_fh(fh
);
14393 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
14395 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14396 if (!mref_reader
.is_state_satisfied())
14397 return -CEPHFS_ENOTCONN
;
14399 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
14400 tout(cct
) << "ll_getk (fh)" << (uintptr_t)fh
<< std::endl
;
14402 std::scoped_lock
lock(client_lock
);
14403 return _getlk(fh
, fl
, owner
);
14406 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
14408 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14409 if (!mref_reader
.is_state_satisfied())
14410 return -CEPHFS_ENOTCONN
;
14412 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
14413 tout(cct
) << __func__
<< " (fh)" << (uintptr_t)fh
<< std::endl
;
14415 std::scoped_lock
lock(client_lock
);
14416 return _setlk(fh
, fl
, owner
, sleep
);
14419 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
14421 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14422 if (!mref_reader
.is_state_satisfied())
14423 return -CEPHFS_ENOTCONN
;
14425 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
14426 tout(cct
) << __func__
<< " (fh)" << (uintptr_t)fh
<< std::endl
;
14428 std::scoped_lock
lock(client_lock
);
14429 return _flock(fh
, cmd
, owner
);
14432 int Client::set_deleg_timeout(uint32_t timeout
)
14434 std::scoped_lock
lock(client_lock
);
14437 * The whole point is to prevent blocklisting so we must time out the
14438 * delegation before the session autoclose timeout kicks in.
14440 if (timeout
>= mdsmap
->get_session_autoclose())
14441 return -CEPHFS_EINVAL
;
14443 deleg_timeout
= timeout
;
14447 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
14449 int ret
= -CEPHFS_EINVAL
;
14451 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14452 if (!mref_reader
.is_state_satisfied())
14453 return -CEPHFS_ENOTCONN
;
14455 std::scoped_lock
lock(client_lock
);
14457 Inode
*inode
= fh
->inode
.get();
14460 case CEPH_DELEGATION_NONE
:
14461 inode
->unset_deleg(fh
);
14466 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
14467 } catch (std::bad_alloc
&) {
14468 ret
= -CEPHFS_ENOMEM
;
14475 class C_Client_RequestInterrupt
: public Context
{
14480 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
14483 void finish(int r
) override
{
14484 std::scoped_lock
l(client
->client_lock
);
14485 ceph_assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
14486 client
->_interrupt_filelock(req
);
14487 client
->put_request(req
);
14491 void Client::ll_interrupt(void *d
)
14493 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
14494 ldout(cct
, 3) << __func__
<< " tid " << req
->get_tid() << dendl
;
14495 tout(cct
) << __func__
<< " tid " << req
->get_tid() << std::endl
;
14496 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
14499 // =========================================
14502 // expose file layouts
14504 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
14505 const UserPerm
& perms
)
14507 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14508 if (!mref_reader
.is_state_satisfied())
14509 return -CEPHFS_ENOTCONN
;
14511 std::scoped_lock
lock(client_lock
);
14513 filepath
path(relpath
);
14515 int r
= path_walk(path
, &in
, perms
);
14521 ldout(cct
, 3) << __func__
<< "(" << relpath
<< ") = 0" << dendl
;
14525 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
14527 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14528 if (!mref_reader
.is_state_satisfied())
14529 return -CEPHFS_ENOTCONN
;
14531 std::scoped_lock
lock(client_lock
);
14533 Fh
*f
= get_filehandle(fd
);
14535 return -CEPHFS_EBADF
;
14536 Inode
*in
= f
->inode
.get();
14540 ldout(cct
, 3) << __func__
<< "(" << fd
<< ") = 0" << dendl
;
14544 int64_t Client::get_default_pool_id()
14546 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14547 if (!mref_reader
.is_state_satisfied())
14548 return -CEPHFS_ENOTCONN
;
14550 std::scoped_lock
lock(client_lock
);
14552 /* first data pool is the default */
14553 return mdsmap
->get_first_data_pool();
14558 int64_t Client::get_pool_id(const char *pool_name
)
14560 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14561 if (!mref_reader
.is_state_satisfied())
14562 return -CEPHFS_ENOTCONN
;
14564 std::scoped_lock
lock(client_lock
);
14566 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
14570 string
Client::get_pool_name(int64_t pool
)
14572 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14573 if (!mref_reader
.is_state_satisfied())
14576 std::scoped_lock
lock(client_lock
);
14578 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
14579 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
14583 int Client::get_pool_replication(int64_t pool
)
14585 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14586 if (!mref_reader
.is_state_satisfied())
14587 return -CEPHFS_ENOTCONN
;
14589 std::scoped_lock
lock(client_lock
);
14591 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
14592 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -CEPHFS_ENOENT
;
14596 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
14598 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14599 if (!mref_reader
.is_state_satisfied())
14600 return -CEPHFS_ENOTCONN
;
14602 std::scoped_lock
lock(client_lock
);
14604 Fh
*f
= get_filehandle(fd
);
14606 return -CEPHFS_EBADF
;
14607 Inode
*in
= f
->inode
.get();
14609 vector
<ObjectExtent
> extents
;
14610 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
14611 ceph_assert(extents
.size() == 1);
14613 objecter
->with_osdmap([&](const OSDMap
& o
) {
14614 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
14615 o
.pg_to_acting_osds(pg
, osds
);
14619 return -CEPHFS_EINVAL
;
14622 * Return the remainder of the extent (stripe unit)
14624 * If length = 1 is passed to Striper::file_to_extents we get a single
14625 * extent back, but its length is one so we still need to compute the length
14626 * to the end of the stripe unit.
14628 * If length = su then we may get 1 or 2 objects back in the extents vector
14629 * which would have to be examined. Even then, the offsets are local to the
14630 * object, so matching up to the file offset is extra work.
14632 * It seems simpler to stick with length = 1 and manually compute the
14636 uint64_t su
= in
->layout
.stripe_unit
;
14637 *len
= su
- (off
% su
);
14643 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
14645 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14646 if (!mref_reader
.is_state_satisfied())
14647 return -CEPHFS_ENOTCONN
;
14649 std::scoped_lock
lock(client_lock
);
14652 return -CEPHFS_EINVAL
;
14653 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14654 return o
.crush
->get_full_location_ordered(id
, path
);
14658 int Client::get_file_stripe_address(int fd
, loff_t offset
,
14659 vector
<entity_addr_t
>& address
)
14661 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14662 if (!mref_reader
.is_state_satisfied())
14663 return -CEPHFS_ENOTCONN
;
14665 std::scoped_lock
lock(client_lock
);
14667 Fh
*f
= get_filehandle(fd
);
14669 return -CEPHFS_EBADF
;
14670 Inode
*in
= f
->inode
.get();
14673 vector
<ObjectExtent
> extents
;
14674 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
14675 in
->truncate_size
, extents
);
14676 ceph_assert(extents
.size() == 1);
14678 // now we have the object and its 'layout'
14679 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14680 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
14682 o
.pg_to_acting_osds(pg
, osds
);
14684 return -CEPHFS_EINVAL
;
14685 for (unsigned i
= 0; i
< osds
.size(); i
++) {
14686 entity_addr_t addr
= o
.get_addrs(osds
[i
]).front();
14687 address
.push_back(addr
);
14693 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
14695 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14696 if (!mref_reader
.is_state_satisfied())
14697 return -CEPHFS_ENOTCONN
;
14699 std::scoped_lock
lock(client_lock
);
14701 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14702 if (!o
.exists(osd
))
14703 return -CEPHFS_ENOENT
;
14705 addr
= o
.get_addrs(osd
).front();
14710 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
14711 loff_t length
, loff_t offset
)
14713 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14714 if (!mref_reader
.is_state_satisfied())
14715 return -CEPHFS_ENOTCONN
;
14717 std::scoped_lock
lock(client_lock
);
14719 Fh
*f
= get_filehandle(fd
);
14721 return -CEPHFS_EBADF
;
14722 Inode
*in
= f
->inode
.get();
14724 // map to a list of extents
14725 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
14727 ldout(cct
, 3) << __func__
<< "(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
14732 /* find an osd with the same ip. -CEPHFS_ENXIO if none. */
14733 int Client::get_local_osd()
14735 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14736 if (!mref_reader
.is_state_satisfied())
14737 return -CEPHFS_ENOTCONN
;
14739 std::scoped_lock
lock(client_lock
);
14741 objecter
->with_osdmap([this](const OSDMap
& o
) {
14742 if (o
.get_epoch() != local_osd_epoch
) {
14743 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddrs().front());
14744 local_osd_epoch
= o
.get_epoch();
14755 // ===============================
14757 void Client::ms_handle_connect(Connection
*con
)
14759 ldout(cct
, 10) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14762 bool Client::ms_handle_reset(Connection
*con
)
14764 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14768 void Client::ms_handle_remote_reset(Connection
*con
)
14770 std::scoped_lock
lock(client_lock
);
14771 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14772 switch (con
->get_peer_type()) {
14773 case CEPH_ENTITY_TYPE_MDS
:
14775 // kludge to figure out which mds this is; fixme with a Connection* state
14776 mds_rank_t mds
= MDS_RANK_NONE
;
14777 MetaSession
*s
= NULL
;
14778 for (auto &p
: mds_sessions
) {
14779 if (mdsmap
->get_addrs(p
.first
) == con
->get_peer_addrs()) {
14785 assert (s
!= NULL
);
14786 switch (s
->state
) {
14787 case MetaSession::STATE_CLOSING
:
14788 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
14789 _closed_mds_session(s
);
14792 case MetaSession::STATE_OPENING
:
14794 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
14795 list
<Context
*> waiters
;
14796 waiters
.swap(s
->waiting_for_open
);
14797 _closed_mds_session(s
);
14798 MetaSession
*news
= _get_or_open_mds_session(mds
);
14799 news
->waiting_for_open
.swap(waiters
);
14803 case MetaSession::STATE_OPEN
:
14805 objecter
->maybe_request_map(); /* to check if we are blocklisted */
14806 if (cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
14807 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
14808 _closed_mds_session(s
);
14810 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
14811 s
->state
= MetaSession::STATE_STALE
;
14816 case MetaSession::STATE_NEW
:
14817 case MetaSession::STATE_CLOSED
:
14827 bool Client::ms_handle_refused(Connection
*con
)
14829 ldout(cct
, 1) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14833 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
14835 Inode
*quota_in
= root_ancestor
;
14836 SnapRealm
*realm
= in
->snaprealm
;
14838 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
14839 if (realm
->ino
!= in
->ino
) {
14840 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
14841 if (p
== inode_map
.end())
14844 if (p
->second
->quota
.is_enable()) {
14845 quota_in
= p
->second
;
14849 realm
= realm
->pparent
;
14851 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
14856 * Traverse quota ancestors of the Inode, return true
14857 * if any of them passes the passed function
14859 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
14860 std::function
<bool (const Inode
&in
)> test
)
14863 ceph_assert(in
!= NULL
);
14868 if (in
== root_ancestor
) {
14869 // We're done traversing, drop out
14872 // Continue up the tree
14873 in
= get_quota_root(in
, perms
);
14880 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
14882 return check_quota_condition(in
, perms
,
14883 [](const Inode
&in
) {
14884 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
14888 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
14889 const UserPerm
& perms
)
14891 return check_quota_condition(in
, perms
,
14892 [&new_bytes
](const Inode
&in
) {
14893 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
14894 > in
.quota
.max_bytes
;
14898 bool Client::is_quota_bytes_approaching(Inode
*in
, const UserPerm
& perms
)
14900 ceph_assert(in
->size
>= in
->reported_size
);
14901 const uint64_t size
= in
->size
- in
->reported_size
;
14902 return check_quota_condition(in
, perms
,
14903 [&size
](const Inode
&in
) {
14904 if (in
.quota
.max_bytes
) {
14905 if (in
.rstat
.rbytes
>= in
.quota
.max_bytes
) {
14909 const uint64_t space
= in
.quota
.max_bytes
- in
.rstat
.rbytes
;
14910 return (space
>> 4) < size
;
14924 int Client::check_pool_perm(Inode
*in
, int need
)
14926 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
14928 if (!cct
->_conf
->client_check_pool_perm
)
14931 /* Only need to do this for regular files */
14932 if (!in
->is_file())
14935 int64_t pool_id
= in
->layout
.pool_id
;
14936 std::string pool_ns
= in
->layout
.pool_ns
;
14937 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
14940 auto it
= pool_perms
.find(perm_key
);
14941 if (it
== pool_perms
.end())
14943 if (it
->second
== POOL_CHECKING
) {
14944 // avoid concurrent checkings
14945 wait_on_list(waiting_for_pool_perm
);
14948 ceph_assert(have
& POOL_CHECKED
);
14954 if (in
->snapid
!= CEPH_NOSNAP
) {
14955 // pool permission check needs to write to the first object. But for snapshot,
14956 // head of the first object may have alread been deleted. To avoid creating
14957 // orphan object, skip the check for now.
14961 pool_perms
[perm_key
] = POOL_CHECKING
;
14964 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
14965 object_t oid
= oid_buf
;
14967 SnapContext nullsnapc
;
14969 C_SaferCond rd_cond
;
14970 ObjectOperation rd_op
;
14971 rd_op
.stat(nullptr, nullptr, nullptr);
14973 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
14974 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
14976 C_SaferCond wr_cond
;
14977 ObjectOperation wr_op
;
14978 wr_op
.create(true);
14980 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
14981 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
14983 client_lock
.unlock();
14984 int rd_ret
= rd_cond
.wait();
14985 int wr_ret
= wr_cond
.wait();
14986 client_lock
.lock();
14988 bool errored
= false;
14990 if (rd_ret
== 0 || rd_ret
== -CEPHFS_ENOENT
)
14992 else if (rd_ret
!= -CEPHFS_EPERM
) {
14993 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14994 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14998 if (wr_ret
== 0 || wr_ret
== -CEPHFS_EEXIST
)
14999 have
|= POOL_WRITE
;
15000 else if (wr_ret
!= -CEPHFS_EPERM
) {
15001 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15002 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
15007 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
15008 // Raise EIO because actual error code might be misleading for
15009 // userspace filesystem user.
15010 pool_perms
.erase(perm_key
);
15011 signal_cond_list(waiting_for_pool_perm
);
15012 return -CEPHFS_EIO
;
15015 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
15016 signal_cond_list(waiting_for_pool_perm
);
15019 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
15020 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15021 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
15022 return -CEPHFS_EPERM
;
15024 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
15025 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15026 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
15027 return -CEPHFS_EPERM
;
15033 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
15035 if (acl_type
== POSIX_ACL
) {
15036 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
15037 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
15039 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
15042 return -CEPHFS_EAGAIN
;
15045 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
15047 if (acl_type
== NO_ACL
)
15050 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
15054 if (acl_type
== POSIX_ACL
) {
15055 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
15056 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
15057 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
15058 r
= posix_acl_access_chmod(acl
, mode
);
15061 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
15067 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
15071 int Client::_posix_acl_create(Inode
*dir
, mode_t
*mode
, bufferlist
& xattrs_bl
,
15072 const UserPerm
& perms
)
15074 if (acl_type
== NO_ACL
)
15077 if (S_ISLNK(*mode
))
15080 int r
= _getattr(dir
, CEPH_STAT_CAP_XATTR
, perms
, dir
->xattr_version
== 0);
15084 if (acl_type
== POSIX_ACL
) {
15085 if (dir
->xattrs
.count(ACL_EA_DEFAULT
)) {
15086 map
<string
, bufferptr
> xattrs
;
15088 const bufferptr
& default_acl
= dir
->xattrs
[ACL_EA_DEFAULT
];
15089 bufferptr
acl(default_acl
.c_str(), default_acl
.length());
15090 r
= posix_acl_inherit_mode(acl
, mode
);
15095 r
= posix_acl_equiv_mode(acl
.c_str(), acl
.length(), mode
);
15099 xattrs
[ACL_EA_ACCESS
] = acl
;
15102 if (S_ISDIR(*mode
))
15103 xattrs
[ACL_EA_DEFAULT
] = dir
->xattrs
[ACL_EA_DEFAULT
];
15107 encode(xattrs
, xattrs_bl
);
15110 *mode
&= ~umask_cb(callback_handle
);
15115 ldout(cct
, 10) << __func__
<< " dir ino " << dir
->ino
<< " result=" << r
<< dendl
;
15119 void Client::set_filer_flags(int flags
)
15121 std::scoped_lock
l(client_lock
);
15122 ceph_assert(flags
== 0 ||
15123 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
15124 objecter
->add_global_op_flags(flags
);
15127 void Client::clear_filer_flags(int flags
)
15129 std::scoped_lock
l(client_lock
);
15130 ceph_assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
15131 objecter
->clear_global_op_flag(flags
);
15134 // called before mount
15135 void Client::set_uuid(const std::string
& uuid
)
15137 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
15138 ceph_assert(iref_reader
.is_state_satisfied());
15140 std::scoped_lock
l(client_lock
);
15141 assert(!uuid
.empty());
15143 metadata
["uuid"] = uuid
;
15147 // called before mount. 0 means infinite
15148 void Client::set_session_timeout(unsigned timeout
)
15150 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
15151 ceph_assert(iref_reader
.is_state_satisfied());
15153 std::scoped_lock
l(client_lock
);
15155 metadata
["timeout"] = stringify(timeout
);
15158 // called before mount
15159 int Client::start_reclaim(const std::string
& uuid
, unsigned flags
,
15160 const std::string
& fs_name
)
15162 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
15163 if (!iref_reader
.is_state_satisfied())
15164 return -CEPHFS_ENOTCONN
;
15167 return -CEPHFS_EINVAL
;
15169 std::unique_lock
l(client_lock
);
15171 auto it
= metadata
.find("uuid");
15172 if (it
!= metadata
.end() && it
->second
== uuid
)
15173 return -CEPHFS_EINVAL
;
15176 int r
= subscribe_mdsmap(fs_name
);
15178 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
15182 if (metadata
.empty())
15183 populate_metadata("");
15185 while (mdsmap
->get_epoch() == 0)
15186 wait_on_list(waiting_for_mdsmap
);
15189 for (unsigned mds
= 0; mds
< mdsmap
->get_num_in_mds(); ) {
15190 if (!mdsmap
->is_up(mds
)) {
15191 ldout(cct
, 10) << "mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
15192 wait_on_list(waiting_for_mdsmap
);
15196 MetaSession
*session
;
15197 if (!have_open_session(mds
)) {
15198 session
= _get_or_open_mds_session(mds
);
15199 if (session
->state
== MetaSession::STATE_REJECTED
)
15200 return -CEPHFS_EPERM
;
15201 if (session
->state
!= MetaSession::STATE_OPENING
) {
15203 return -CEPHFS_EINVAL
;
15205 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
15206 wait_on_context_list(session
->waiting_for_open
);
15210 session
= &mds_sessions
.at(mds
);
15211 if (!session
->mds_features
.test(CEPHFS_FEATURE_RECLAIM_CLIENT
))
15212 return -CEPHFS_EOPNOTSUPP
;
15214 if (session
->reclaim_state
== MetaSession::RECLAIM_NULL
||
15215 session
->reclaim_state
== MetaSession::RECLAIMING
) {
15216 session
->reclaim_state
= MetaSession::RECLAIMING
;
15217 auto m
= make_message
<MClientReclaim
>(uuid
, flags
);
15218 session
->con
->send_message2(std::move(m
));
15219 wait_on_list(waiting_for_reclaim
);
15220 } else if (session
->reclaim_state
== MetaSession::RECLAIM_FAIL
) {
15221 return reclaim_errno
? : -CEPHFS_ENOTRECOVERABLE
;
15227 // didn't find target session in any mds
15228 if (reclaim_target_addrs
.empty()) {
15229 if (flags
& CEPH_RECLAIM_RESET
)
15230 return -CEPHFS_ENOENT
;
15231 return -CEPHFS_ENOTRECOVERABLE
;
15234 if (flags
& CEPH_RECLAIM_RESET
)
15237 // use blocklist to check if target session was killed
15238 // (config option mds_session_blocklist_on_evict needs to be true)
15239 ldout(cct
, 10) << __func__
<< ": waiting for OSD epoch " << reclaim_osd_epoch
<< dendl
;
15242 objecter
->wait_for_map(reclaim_osd_epoch
, ca::use_blocked
[ec
]);
15246 return ceph::from_error_code(ec
);
15248 bool blocklisted
= objecter
->with_osdmap(
15249 [this](const OSDMap
&osd_map
) -> bool {
15250 return osd_map
.is_blocklisted(reclaim_target_addrs
);
15253 return -CEPHFS_ENOTRECOVERABLE
;
15255 metadata
["reclaiming_uuid"] = uuid
;
15259 void Client::finish_reclaim()
15261 auto it
= metadata
.find("reclaiming_uuid");
15262 if (it
== metadata
.end()) {
15263 for (auto &p
: mds_sessions
)
15264 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
15268 for (auto &p
: mds_sessions
) {
15269 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
15270 auto m
= make_message
<MClientReclaim
>("", MClientReclaim::FLAG_FINISH
);
15271 p
.second
.con
->send_message2(std::move(m
));
15274 metadata
["uuid"] = it
->second
;
15275 metadata
.erase(it
);
15278 void Client::handle_client_reclaim_reply(const MConstRef
<MClientReclaimReply
>& reply
)
15280 mds_rank_t from
= mds_rank_t(reply
->get_source().num());
15281 ldout(cct
, 10) << __func__
<< " " << *reply
<< " from mds." << from
<< dendl
;
15283 std::scoped_lock
cl(client_lock
);
15284 MetaSession
*session
= _get_mds_session(from
, reply
->get_connection().get());
15286 ldout(cct
, 10) << " discarding reclaim reply from sessionless mds." << from
<< dendl
;
15290 if (reply
->get_result() >= 0) {
15291 session
->reclaim_state
= MetaSession::RECLAIM_OK
;
15292 if (reply
->get_epoch() > reclaim_osd_epoch
)
15293 reclaim_osd_epoch
= reply
->get_epoch();
15294 if (!reply
->get_addrs().empty())
15295 reclaim_target_addrs
= reply
->get_addrs();
15297 session
->reclaim_state
= MetaSession::RECLAIM_FAIL
;
15298 reclaim_errno
= reply
->get_result();
15301 signal_cond_list(waiting_for_reclaim
);
15305 * This is included in cap release messages, to cause
15306 * the MDS to wait until this OSD map epoch. It is necessary
15307 * in corner cases where we cancel RADOS ops, so that
15308 * nobody else tries to do IO to the same objects in
15309 * the same epoch as the cancelled ops.
15311 void Client::set_cap_epoch_barrier(epoch_t e
)
15313 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
15314 cap_epoch_barrier
= e
;
15317 const char** Client::get_tracked_conf_keys() const
15319 static const char* keys
[] = {
15320 "client_cache_size",
15321 "client_cache_mid",
15323 "client_deleg_timeout",
15324 "client_deleg_break_on_open",
15326 "client_oc_max_objects",
15327 "client_oc_max_dirty",
15328 "client_oc_target_dirty",
15329 "client_oc_max_dirty_age",
15335 void Client::handle_conf_change(const ConfigProxy
& conf
,
15336 const std::set
<std::string
> &changed
)
15338 std::scoped_lock
lock(client_lock
);
15340 if (changed
.count("client_cache_mid")) {
15341 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
15343 if (changed
.count("client_acl_type")) {
15345 if (cct
->_conf
->client_acl_type
== "posix_acl")
15346 acl_type
= POSIX_ACL
;
15348 if (changed
.count("client_oc_size")) {
15349 objectcacher
->set_max_size(cct
->_conf
->client_oc_size
);
15351 if (changed
.count("client_oc_max_objects")) {
15352 objectcacher
->set_max_objects(cct
->_conf
->client_oc_max_objects
);
15354 if (changed
.count("client_oc_max_dirty")) {
15355 objectcacher
->set_max_dirty(cct
->_conf
->client_oc_max_dirty
);
15357 if (changed
.count("client_oc_target_dirty")) {
15358 objectcacher
->set_target_dirty(cct
->_conf
->client_oc_target_dirty
);
15360 if (changed
.count("client_oc_max_dirty_age")) {
15361 objectcacher
->set_max_dirty_age(cct
->_conf
->client_oc_max_dirty_age
);
15365 void intrusive_ptr_add_ref(Inode
*in
)
15370 void intrusive_ptr_release(Inode
*in
)
15372 in
->client
->put_inode(in
);
15375 mds_rank_t
Client::_get_random_up_mds() const
15377 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
15379 std::set
<mds_rank_t
> up
;
15380 mdsmap
->get_up_mds_set(up
);
15383 return MDS_RANK_NONE
;
15384 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
15385 for (int n
= rand() % up
.size(); n
; n
--)
15391 StandaloneClient::StandaloneClient(Messenger
*m
, MonClient
*mc
,
15392 boost::asio::io_context
& ictx
)
15393 : Client(m
, mc
, new Objecter(m
->cct
, m
, mc
, ictx
))
15395 monclient
->set_messenger(m
);
15396 objecter
->set_client_incarnation(0);
15399 StandaloneClient::~StandaloneClient()
15402 objecter
= nullptr;
15405 int StandaloneClient::init()
15407 RWRef_t
iref_writer(initialize_state
, CLIENT_INITIALIZING
, false);
15408 ceph_assert(iref_writer
.is_first_writer());
15413 client_lock
.lock();
15415 messenger
->add_dispatcher_tail(objecter
);
15416 messenger
->add_dispatcher_tail(this);
15418 monclient
->set_want_keys(CEPH_ENTITY_TYPE_MDS
| CEPH_ENTITY_TYPE_OSD
);
15419 int r
= monclient
->init();
15421 // need to do cleanup because we're in an intermediate init state
15423 std::scoped_lock
l(timer_lock
);
15427 client_lock
.unlock();
15428 objecter
->shutdown();
15429 objectcacher
->stop();
15430 monclient
->shutdown();
15435 client_lock
.unlock();
15437 iref_writer
.update_state(CLIENT_INITIALIZED
);
15442 void StandaloneClient::shutdown()
15444 Client::shutdown();
15445 objecter
->shutdown();
15446 monclient
->shutdown();