1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
18 #include <sys/types.h>
23 #include <sys/param.h>
27 #include <sys/utsname.h>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/fusion/include/std_pair.hpp>
34 #include "common/async/waiter.h"
36 #if defined(__FreeBSD__) || defined(_WIN32)
37 #define XATTR_CREATE 0x1
38 #define XATTR_REPLACE 0x2
40 #include <sys/xattr.h>
43 #if defined(__linux__)
44 #include <linux/falloc.h>
47 #include <sys/statvfs.h>
49 #include "common/config.h"
50 #include "common/version.h"
51 #include "common/async/blocked_completion.h"
53 #include "mon/MonClient.h"
55 #include "messages/MClientCaps.h"
56 #include "messages/MClientLease.h"
57 #include "messages/MClientQuota.h"
58 #include "messages/MClientReclaim.h"
59 #include "messages/MClientReclaimReply.h"
60 #include "messages/MClientReconnect.h"
61 #include "messages/MClientReply.h"
62 #include "messages/MClientRequest.h"
63 #include "messages/MClientRequestForward.h"
64 #include "messages/MClientSession.h"
65 #include "messages/MClientSnap.h"
66 #include "messages/MClientMetrics.h"
67 #include "messages/MCommandReply.h"
68 #include "messages/MFSMap.h"
69 #include "messages/MFSMapUser.h"
70 #include "messages/MMDSMap.h"
71 #include "messages/MOSDMap.h"
73 #include "mds/flock.h"
74 #include "mds/cephfs_features.h"
75 #include "osd/OSDMap.h"
76 #include "osdc/Filer.h"
78 #include "common/Cond.h"
79 #include "common/perf_counters.h"
80 #include "common/admin_socket.h"
81 #include "common/errno.h"
82 #include "include/str_list.h"
84 #define dout_subsys ceph_subsys_client
86 #include "include/lru.h"
87 #include "include/compat.h"
88 #include "include/stringify.h"
89 #include "include/random.h"
94 #include "Delegation.h"
96 #include "ClientSnapRealm.h"
98 #include "MetaSession.h"
99 #include "MetaRequest.h"
100 #include "ObjecterWriteback.h"
101 #include "posix_acl.h"
103 #include "include/ceph_assert.h"
104 #include "include/stat.h"
106 #include "include/cephfs/ceph_ll_client.h"
108 #if HAVE_GETGROUPLIST
115 #define dout_prefix *_dout << "client." << whoami << " "
117 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
119 // FreeBSD fails to define this
123 // Darwin fails to define this
132 // Windows doesn't define those values. While the Posix compatibilty layer
133 // doesn't support those values, the Windows native functions do provide
134 // similar flags. Special care should be taken if we're going to use those
135 // flags in ceph-dokan. The current values are no-ops, while propagating
136 // them to the rest of the code might cause the Windows functions to reject
139 #define O_NOFOLLOW 0x0
146 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
149 #define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
160 using namespace TOPNSPC::common
;
162 namespace bs
= boost::system
;
163 namespace ca
= ceph::async
;
165 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
167 Client
*client
= static_cast<Client
*>(p
);
168 client
->flush_set_callback(oset
);
171 bool Client::is_reserved_vino(vinodeno_t
&vino
) {
172 if (MDS_IS_PRIVATE_INO(vino
.ino
)) {
173 ldout(cct
, -1) << __func__
<< " attempt to access reserved inode number " << vino
<< dendl
;
182 Client::CommandHook::CommandHook(Client
*client
) :
187 int Client::CommandHook::call(
188 std::string_view command
,
189 const cmdmap_t
& cmdmap
,
194 f
->open_object_section("result");
196 std::scoped_lock l
{m_client
->client_lock
};
197 if (command
== "mds_requests")
198 m_client
->dump_mds_requests(f
);
199 else if (command
== "mds_sessions") {
200 bool cap_dump
= false;
201 cmd_getval(cmdmap
, "cap_dump", cap_dump
);
202 m_client
->dump_mds_sessions(f
, cap_dump
);
203 } else if (command
== "dump_cache")
204 m_client
->dump_cache(f
);
205 else if (command
== "kick_stale_sessions")
206 m_client
->_kick_stale_sessions();
207 else if (command
== "status")
208 m_client
->dump_status(f
);
210 ceph_abort_msg("bad command registered");
219 int Client::get_fd_inode(int fd
, InodeRef
*in
) {
221 if (fd
== CEPHFS_AT_FDCWD
) {
224 Fh
*f
= get_filehandle(fd
);
234 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
235 : inode(in
), offset(0), next_offset(2),
236 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
240 void Client::_reset_faked_inos()
243 free_faked_inos
.clear();
244 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
245 last_used_faked_ino
= 0;
246 last_used_faked_root
= 0;
248 // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
249 // Windows structures, including Dokan ones, are using 64B identifiers.
250 _use_faked_inos
= false;
252 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
256 void Client::_assign_faked_ino(Inode
*in
)
258 if (0 == last_used_faked_ino
)
259 last_used_faked_ino
= last_used_faked_ino
+ 2048; // start(1024)~2048 reserved for _assign_faked_root
260 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
261 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
262 last_used_faked_ino
= 2048;
263 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
265 ceph_assert(it
!= free_faked_inos
.end());
266 if (last_used_faked_ino
< it
.get_start()) {
267 ceph_assert(it
.get_len() > 0);
268 last_used_faked_ino
= it
.get_start();
270 ++last_used_faked_ino
;
271 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
273 in
->faked_ino
= last_used_faked_ino
;
274 free_faked_inos
.erase(in
->faked_ino
);
275 faked_ino_map
[in
->faked_ino
] = in
->vino();
279 * In the faked mode, if you export multiple subdirectories,
280 * you will see that the inode numbers of the exported subdirectories
281 * are the same. so we distinguish the mount point by reserving
282 * the "fake ids" between "1024~2048" and combining the last
283 * 10bits(0x3ff) of the "root inodes".
285 void Client::_assign_faked_root(Inode
*in
)
287 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
288 if (it
== free_faked_inos
.end() && last_used_faked_root
> 0) {
289 last_used_faked_root
= 0;
290 it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
292 ceph_assert(it
!= free_faked_inos
.end());
293 vinodeno_t inode_info
= in
->vino();
294 uint64_t inode_num
= (uint64_t)inode_info
.ino
;
295 ldout(cct
, 10) << "inode_num " << inode_num
<< "inode_num & 0x3ff=" << (inode_num
& 0x3ff)<< dendl
;
296 last_used_faked_root
= it
.get_start() + (inode_num
& 0x3ff); // 0x3ff mask and get_start will not exceed 2048
297 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_root
);
299 in
->faked_ino
= last_used_faked_root
;
300 free_faked_inos
.erase(in
->faked_ino
);
301 faked_ino_map
[in
->faked_ino
] = in
->vino();
304 void Client::_release_faked_ino(Inode
*in
)
306 free_faked_inos
.insert(in
->faked_ino
);
307 faked_ino_map
.erase(in
->faked_ino
);
310 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
315 else if (faked_ino_map
.count(ino
))
316 vino
= faked_ino_map
[ino
];
318 vino
= vinodeno_t(0, CEPH_NOSNAP
);
319 ldout(cct
, 10) << __func__
<< " " << ino
<< " -> " << vino
<< dendl
;
323 vinodeno_t
Client::map_faked_ino(ino_t ino
)
325 std::scoped_lock
lock(client_lock
);
326 return _map_faked_ino(ino
);
331 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
332 : Dispatcher(m
->cct
->get()),
333 timer(m
->cct
, timer_lock
, false),
337 whoami(mc
->get_global_id()),
338 mount_state(CLIENT_UNMOUNTED
, "Client::mountstate_lock"),
339 initialize_state(CLIENT_NEW
, "Client::initstate_lock"),
340 cct_deleter
{m
->cct
, [](CephContext
*p
) {p
->put();}},
341 async_ino_invalidator(m
->cct
),
342 async_dentry_invalidator(m
->cct
),
343 interrupt_finisher(m
->cct
),
344 remount_finisher(m
->cct
),
345 async_ino_releasor(m
->cct
),
346 objecter_finisher(m
->cct
),
347 m_command_hook(this),
352 user_id
= cct
->_conf
->client_mount_uid
;
353 group_id
= cct
->_conf
->client_mount_gid
;
354 fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
355 "fuse_default_permissions");
357 if (cct
->_conf
->client_acl_type
== "posix_acl")
358 acl_type
= POSIX_ACL
;
360 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
363 free_fd_set
.insert(10, 1<<30);
365 mdsmap
.reset(new MDSMap
);
368 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
370 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
371 client_flush_set_callback
, // all commit callback
373 cct
->_conf
->client_oc_size
,
374 cct
->_conf
->client_oc_max_objects
,
375 cct
->_conf
->client_oc_max_dirty
,
376 cct
->_conf
->client_oc_target_dirty
,
377 cct
->_conf
->client_oc_max_dirty_age
,
384 ceph_assert(ceph_mutex_is_not_locked(client_lock
));
386 // If the task is crashed or aborted and doesn't
387 // get any chance to run the umount and shutdow.
389 std::scoped_lock l
{client_lock
};
390 tick_thread_stopped
= true;
391 upkeep_cond
.notify_one();
394 if (upkeeper
.joinable())
397 // It is necessary to hold client_lock, because any inode destruction
398 // may call into ObjectCacher, which asserts that it's lock (which is
399 // client_lock) is held.
400 std::scoped_lock l
{client_lock
};
404 void Client::tear_down_cache()
407 for (auto &[fd
, fh
] : fd_map
) {
408 ldout(cct
, 1) << __func__
<< " forcing close of fh " << fd
<< " ino " << fh
->inode
->ino
<< dendl
;
413 while (!opened_dirs
.empty()) {
414 dir_result_t
*dirp
= *opened_dirs
.begin();
415 ldout(cct
, 1) << __func__
<< " forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
424 ceph_assert(lru
.lru_get_size() == 0);
427 ceph_assert(inode_map
.size() <= 1 + root_parents
.size());
428 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
432 ceph_assert(inode_map
.empty());
435 inodeno_t
Client::get_root_ino()
437 std::scoped_lock
l(client_lock
);
438 if (use_faked_inos())
439 return root
->faked_ino
;
444 Inode
*Client::get_root()
446 std::scoped_lock
l(client_lock
);
454 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
457 in
->make_long_path(path
);
458 ldout(cct
, 1) << "dump_inode: "
459 << (disconnected
? "DISCONNECTED ":"")
460 << "inode " << in
->ino
462 << " ref " << in
->get_nref()
463 << " " << *in
<< dendl
;
466 f
->open_object_section("inode");
467 f
->dump_stream("path") << path
;
469 f
->dump_int("disconnected", 1);
476 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
477 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
478 it
!= in
->dir
->dentries
.end();
480 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
482 f
->open_object_section("dentry");
486 if (it
->second
->inode
)
487 dump_inode(f
, it
->second
->inode
.get(), did
, false);
492 void Client::dump_cache(Formatter
*f
)
496 ldout(cct
, 1) << __func__
<< dendl
;
499 f
->open_array_section("cache");
502 dump_inode(f
, root
.get(), did
, true);
504 // make a second pass to catch anything disconnected
505 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
506 it
!= inode_map
.end();
508 if (did
.count(it
->second
))
510 dump_inode(f
, it
->second
, did
, true);
517 void Client::dump_status(Formatter
*f
)
519 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
521 ldout(cct
, 1) << __func__
<< dendl
;
523 const epoch_t osd_epoch
524 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
527 f
->open_object_section("metadata");
528 for (const auto& kv
: metadata
)
529 f
->dump_string(kv
.first
.c_str(), kv
.second
);
532 f
->dump_int("dentry_count", lru
.lru_get_size());
533 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
534 f
->dump_int("id", get_nodeid().v
);
535 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
536 f
->dump_object("inst", inst
);
537 f
->dump_object("addr", inst
.addr
);
538 f
->dump_stream("inst_str") << inst
.name
<< " " << inst
.addr
.get_legacy_str();
539 f
->dump_string("addr_str", inst
.addr
.get_legacy_str());
540 f
->dump_int("inode_count", inode_map
.size());
541 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
542 f
->dump_int("osd_epoch", osd_epoch
);
543 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
544 f
->dump_bool("blocklisted", blocklisted
);
545 f
->dump_string("fs_name", mdsmap
->get_fs_name());
549 void Client::_pre_init()
553 objecter_finisher
.start();
554 filer
.reset(new Filer(objecter
, &objecter_finisher
));
555 objecter
->enable_blocklist_events();
557 objectcacher
->start();
562 RWRef_t
iref_writer(initialize_state
, CLIENT_INITIALIZING
, false);
563 ceph_assert(iref_writer
.is_first_writer());
567 std::scoped_lock l
{client_lock
};
568 messenger
->add_dispatcher_tail(this);
571 iref_writer
.update_state(CLIENT_INITIALIZED
);
575 void Client::_finish_init()
578 std::scoped_lock l
{client_lock
};
580 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
581 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
582 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
583 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
584 plb
.add_time_avg(l_c_read
, "rdlat", "Latency of a file data read operation");
585 plb
.add_time_avg(l_c_fsync
, "fsync", "Latency of a file sync operation");
586 logger
.reset(plb
.create_perf_counters());
587 cct
->get_perfcounters_collection()->add(logger
.get());
590 cct
->_conf
.add_observer(this);
592 AdminSocket
* admin_socket
= cct
->get_admin_socket();
593 int ret
= admin_socket
->register_command("mds_requests",
595 "show in-progress mds requests");
597 lderr(cct
) << "error registering admin socket command: "
598 << cpp_strerror(-ret
) << dendl
;
600 ret
= admin_socket
->register_command("mds_sessions "
601 "name=cap_dump,type=CephBool,req=false",
603 "show mds session state");
605 lderr(cct
) << "error registering admin socket command: "
606 << cpp_strerror(-ret
) << dendl
;
608 ret
= admin_socket
->register_command("dump_cache",
610 "show in-memory metadata cache contents");
612 lderr(cct
) << "error registering admin socket command: "
613 << cpp_strerror(-ret
) << dendl
;
615 ret
= admin_socket
->register_command("kick_stale_sessions",
617 "kick sessions that were remote reset");
619 lderr(cct
) << "error registering admin socket command: "
620 << cpp_strerror(-ret
) << dendl
;
622 ret
= admin_socket
->register_command("status",
624 "show overall client status");
626 lderr(cct
) << "error registering admin socket command: "
627 << cpp_strerror(-ret
) << dendl
;
631 void Client::shutdown()
633 ldout(cct
, 1) << __func__
<< dendl
;
635 // If we were not mounted, but were being used for sending
636 // MDS commands, we may have sessions that need closing.
638 std::scoped_lock l
{client_lock
};
640 // To make sure the tick thread will be stoppped before
641 // destructing the Client, just in case like the _mount()
642 // failed but didn't not get a chance to stop the tick
644 tick_thread_stopped
= true;
645 upkeep_cond
.notify_one();
649 cct
->_conf
.remove_observer(this);
651 cct
->get_admin_socket()->unregister_commands(&m_command_hook
);
653 if (ino_invalidate_cb
) {
654 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
655 async_ino_invalidator
.wait_for_empty();
656 async_ino_invalidator
.stop();
659 if (dentry_invalidate_cb
) {
660 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
661 async_dentry_invalidator
.wait_for_empty();
662 async_dentry_invalidator
.stop();
665 if (switch_interrupt_cb
) {
666 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
667 interrupt_finisher
.wait_for_empty();
668 interrupt_finisher
.stop();
672 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
673 remount_finisher
.wait_for_empty();
674 remount_finisher
.stop();
677 if (ino_release_cb
) {
678 ldout(cct
, 10) << "shutdown stopping inode release finisher" << dendl
;
679 async_ino_releasor
.wait_for_empty();
680 async_ino_releasor
.stop();
683 objectcacher
->stop(); // outside of client_lock! this does a join.
686 * We are shuting down the client.
688 * Just declare the state to CLIENT_NEW to block and fail any
689 * new comming "reader" and then try to wait all the in-flight
690 * "readers" to finish.
692 RWRef_t
iref_writer(initialize_state
, CLIENT_NEW
, false);
693 if (!iref_writer
.is_first_writer())
695 iref_writer
.wait_readers_done();
698 std::scoped_lock
l(timer_lock
);
702 objecter_finisher
.wait_for_empty();
703 objecter_finisher
.stop();
706 cct
->get_perfcounters_collection()->remove(logger
.get());
712 // ===================
713 // metadata cache stuff
715 void Client::trim_cache(bool trim_kernel_dcache
)
717 uint64_t max
= cct
->_conf
->client_cache_size
;
718 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
720 while (lru
.lru_get_size() != last
) {
721 last
= lru
.lru_get_size();
723 if (!is_unmounting() && lru
.lru_get_size() <= max
) break;
726 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
733 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
734 _invalidate_kernel_dcache();
737 if (lru
.lru_get_size() == 0 && root
&& root
->get_nref() == 1 && inode_map
.size() == 1 + root_parents
.size()) {
738 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
743 void Client::trim_cache_for_reconnect(MetaSession
*s
)
745 mds_rank_t mds
= s
->mds_num
;
746 ldout(cct
, 20) << __func__
<< " mds." << mds
<< dendl
;
749 list
<Dentry
*> skipped
;
750 while (lru
.lru_get_size() > 0) {
751 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
755 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
756 dn
->dir
->parent_inode
->caps
.count(mds
)) {
760 skipped
.push_back(dn
);
763 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
764 lru
.lru_insert_mid(*p
);
766 ldout(cct
, 20) << __func__
<< " mds." << mds
767 << " trimmed " << trimmed
<< " dentries" << dendl
;
769 if (s
->caps
.size() > 0)
770 _invalidate_kernel_dcache();
773 void Client::trim_dentry(Dentry
*dn
)
775 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
777 << std::hex
<< dn
->dir
->parent_inode
->ino
<< std::dec
780 Inode
*diri
= dn
->dir
->parent_inode
;
781 clear_dir_complete_and_ordered(diri
, true);
783 unlink(dn
, false, false); // drop dir, drop dentry
787 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
788 uint64_t truncate_seq
, uint64_t truncate_size
)
790 uint64_t prior_size
= in
->size
;
792 if (truncate_seq
> in
->truncate_seq
||
793 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
794 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
796 in
->reported_size
= size
;
797 if (truncate_seq
!= in
->truncate_seq
) {
798 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
799 << truncate_seq
<< dendl
;
800 in
->truncate_seq
= truncate_seq
;
801 in
->oset
.truncate_seq
= truncate_seq
;
803 // truncate cached file data
804 if (prior_size
> size
) {
805 _invalidate_inode_cache(in
, truncate_size
, prior_size
- truncate_size
);
809 // truncate inline data
810 if (in
->inline_version
< CEPH_INLINE_NONE
) {
811 uint32_t len
= in
->inline_data
.length();
813 in
->inline_data
.splice(size
, len
- size
);
816 if (truncate_seq
>= in
->truncate_seq
&&
817 in
->truncate_size
!= truncate_size
) {
819 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
820 << truncate_size
<< dendl
;
821 in
->truncate_size
= truncate_size
;
822 in
->oset
.truncate_size
= truncate_size
;
824 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
829 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
830 utime_t ctime
, utime_t mtime
, utime_t atime
)
832 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
833 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
835 if (time_warp_seq
> in
->time_warp_seq
)
836 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
837 << " is higher than local time_warp_seq "
838 << in
->time_warp_seq
<< dendl
;
841 // be careful with size, mtime, atime
842 if (issued
& (CEPH_CAP_FILE_EXCL
|
844 CEPH_CAP_FILE_BUFFER
|
846 CEPH_CAP_XATTR_EXCL
)) {
847 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
848 if (ctime
> in
->ctime
)
850 if (time_warp_seq
> in
->time_warp_seq
) {
851 //the mds updated times, so take those!
854 in
->time_warp_seq
= time_warp_seq
;
855 } else if (time_warp_seq
== in
->time_warp_seq
) {
857 if (mtime
> in
->mtime
)
859 if (atime
> in
->atime
)
861 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
862 //ignore mds values as we have a higher seq
865 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
866 if (time_warp_seq
>= in
->time_warp_seq
) {
870 in
->time_warp_seq
= time_warp_seq
;
874 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
875 << time_warp_seq
<< " is lower than local time_warp_seq "
881 void Client::_fragmap_remove_non_leaves(Inode
*in
)
883 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
884 if (!in
->dirfragtree
.is_leaf(p
->first
))
885 in
->fragmap
.erase(p
++);
890 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
892 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
893 if (p
->second
== mds
)
894 in
->fragmap
.erase(p
++);
899 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
900 MetaSession
*session
,
901 const UserPerm
& request_perms
)
904 bool was_new
= false;
905 if (inode_map
.count(st
->vino
)) {
906 in
= inode_map
[st
->vino
];
907 ldout(cct
, 12) << __func__
<< " had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
909 in
= new Inode(this, st
->vino
, &st
->layout
);
910 inode_map
[st
->vino
] = in
;
912 if (use_faked_inos())
913 _assign_faked_ino(in
);
917 if (use_faked_inos())
918 _assign_faked_root(root
.get());
921 } else if (is_mounting()) {
922 root_parents
[root_ancestor
] = in
;
927 in
->ino
= st
->vino
.ino
;
928 in
->snapid
= st
->vino
.snapid
;
929 in
->mode
= st
->mode
& S_IFMT
;
934 if (in
->is_symlink())
935 in
->symlink
= st
->symlink
;
937 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
938 bool new_version
= false;
939 if (in
->version
== 0 ||
940 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
941 (in
->version
& ~1) < st
->version
))
945 in
->caps_issued(&issued
);
946 issued
|= in
->caps_dirty();
947 int new_issued
= ~issued
& (int)st
->cap
.caps
;
949 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
950 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
954 in
->btime
= st
->btime
;
955 in
->snap_btime
= st
->snap_btime
;
956 in
->snap_metadata
= st
->snap_metadata
;
959 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
960 !(issued
& CEPH_CAP_LINK_EXCL
)) {
961 in
->nlink
= st
->nlink
;
964 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
965 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
966 st
->ctime
, st
->mtime
, st
->atime
);
970 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
971 in
->layout
= st
->layout
;
972 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
976 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
977 in
->dirstat
= st
->dirstat
;
979 // dir_layout/rstat/quota are not tracked by capability, update them only if
980 // the inode stat is from auth mds
981 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
982 in
->dir_layout
= st
->dir_layout
;
983 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
984 in
->rstat
= st
->rstat
;
985 in
->quota
= st
->quota
;
986 in
->dir_pin
= st
->dir_pin
;
988 // move me if/when version reflects fragtree changes.
989 if (in
->dirfragtree
!= st
->dirfragtree
) {
990 in
->dirfragtree
= st
->dirfragtree
;
991 _fragmap_remove_non_leaves(in
);
995 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
996 st
->xattrbl
.length() &&
997 st
->xattr_version
> in
->xattr_version
) {
998 auto p
= st
->xattrbl
.cbegin();
999 decode(in
->xattrs
, p
);
1000 in
->xattr_version
= st
->xattr_version
;
1003 if (st
->inline_version
> in
->inline_version
) {
1004 in
->inline_data
= st
->inline_data
;
1005 in
->inline_version
= st
->inline_version
;
1008 /* always take a newer change attr */
1009 if (st
->change_attr
> in
->change_attr
)
1010 in
->change_attr
= st
->change_attr
;
1012 if (st
->version
> in
->version
)
1013 in
->version
= st
->version
;
1016 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
1019 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
1021 if (in
->snapid
== CEPH_NOSNAP
) {
1022 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.wanted
,
1023 st
->cap
.seq
, st
->cap
.mseq
, inodeno_t(st
->cap
.realm
),
1024 st
->cap
.flags
, request_perms
);
1025 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
1026 in
->max_size
= st
->max_size
;
1027 in
->rstat
= st
->rstat
;
1030 // setting I_COMPLETE needs to happen after adding the cap
1032 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
1033 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
1034 in
->dirstat
.nfiles
== 0 &&
1035 in
->dirstat
.nsubdirs
== 0) {
1036 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
1037 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
1039 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
1040 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
1041 in
->dir
->readdir_cache
.clear();
1042 for (const auto& p
: in
->dir
->dentries
) {
1043 unlink(p
.second
, true, true); // keep dir, keep dentry
1045 if (in
->dir
->dentries
.empty())
1050 in
->snap_caps
|= st
->cap
.caps
;
1053 in
->fscrypt
= st
->fscrypt
;
1059 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
1061 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
1062 Inode
*in
, utime_t from
, MetaSession
*session
,
1066 if (dir
->dentries
.count(dname
))
1067 dn
= dir
->dentries
[dname
];
1069 ldout(cct
, 12) << __func__
<< " '" << dname
<< "' vino " << in
->vino()
1070 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
1073 if (dn
&& dn
->inode
) {
1074 if (dn
->inode
->vino() == in
->vino()) {
1076 ldout(cct
, 12) << " had dentry " << dname
1077 << " with correct vino " << dn
->inode
->vino()
1080 ldout(cct
, 12) << " had dentry " << dname
1081 << " with WRONG vino " << dn
->inode
->vino()
1083 unlink(dn
, true, true); // keep dir, keep dentry
1087 if (!dn
|| !dn
->inode
) {
1088 InodeRef
tmp_ref(in
);
1090 if (old_dentry
->dir
!= dir
) {
1091 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
1092 clear_dir_complete_and_ordered(old_diri
, false);
1094 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
1096 Inode
*diri
= dir
->parent_inode
;
1097 clear_dir_complete_and_ordered(diri
, false);
1098 dn
= link(dir
, dname
, in
, dn
);
1101 update_dentry_lease(dn
, dlease
, from
, session
);
1105 void Client::update_dentry_lease(Dentry
*dn
, LeaseStat
*dlease
, utime_t from
, MetaSession
*session
)
1107 utime_t dttl
= from
;
1108 dttl
+= (float)dlease
->duration_ms
/ 1000.0;
1110 ldout(cct
, 15) << __func__
<< " " << *dn
<< " " << *dlease
<< " from " << from
<< dendl
;
1114 if (dlease
->mask
& CEPH_LEASE_VALID
) {
1115 if (dttl
> dn
->lease_ttl
) {
1116 ldout(cct
, 10) << "got dentry lease on " << dn
->name
1117 << " dur " << dlease
->duration_ms
<< "ms ttl " << dttl
<< dendl
;
1118 dn
->lease_ttl
= dttl
;
1119 dn
->lease_mds
= session
->mds_num
;
1120 dn
->lease_seq
= dlease
->seq
;
1121 dn
->lease_gen
= session
->cap_gen
;
1124 dn
->cap_shared_gen
= dn
->dir
->parent_inode
->shared_gen
;
1125 if (dlease
->mask
& CEPH_LEASE_PRIMARY_LINK
)
1127 dn
->alternate_name
= std::move(dlease
->alternate_name
);
1132 * update MDS location cache for a single inode
1134 void Client::update_dir_dist(Inode
*in
, DirStat
*dst
, mds_rank_t from
)
1137 ldout(cct
, 20) << "got dirfrag map for " << in
->ino
<< " frag " << dst
->frag
<< " to mds " << dst
->auth
<< dendl
;
1138 if (dst
->auth
>= 0) {
1139 in
->fragmap
[dst
->frag
] = dst
->auth
;
1141 in
->fragmap
.erase(dst
->frag
);
1143 if (!in
->dirfragtree
.is_leaf(dst
->frag
)) {
1144 in
->dirfragtree
.force_to_leaf(cct
, dst
->frag
);
1145 _fragmap_remove_non_leaves(in
);
1148 // replicated, only update from auth mds reply
1149 if (from
== dst
->auth
) {
1150 in
->dir_replicated
= !dst
->dist
.empty();
1151 if (!dst
->dist
.empty())
1152 in
->frag_repmap
[dst
->frag
].assign(dst
->dist
.begin(), dst
->dist
.end()) ;
1154 in
->frag_repmap
.erase(dst
->frag
);
1158 void Client::clear_dir_complete_and_ordered(Inode
*diri
, bool complete
)
1161 diri
->dir_release_count
++;
1163 diri
->dir_ordered_count
++;
1164 if (diri
->flags
& I_COMPLETE
) {
1166 ldout(cct
, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
1167 diri
->flags
&= ~(I_COMPLETE
| I_DIR_ORDERED
);
1169 if (diri
->flags
& I_DIR_ORDERED
) {
1170 ldout(cct
, 10) << " clearing I_DIR_ORDERED on " << *diri
<< dendl
;
1171 diri
->flags
&= ~I_DIR_ORDERED
;
1175 diri
->dir
->readdir_cache
.clear();
1180 * insert results from readdir or lssnap into the metadata cache.
1182 void Client::insert_readdir_results(MetaRequest
*request
, MetaSession
*session
, Inode
*diri
) {
1184 auto& reply
= request
->reply
;
1185 ConnectionRef con
= request
->reply
->get_connection();
1187 if(session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1188 features
= (uint64_t)-1;
1191 features
= con
->get_features();
1194 dir_result_t
*dirp
= request
->dirp
;
1197 // the extra buffer list is only set for readdir and lssnap replies
1198 auto p
= reply
->get_extra_bl().cbegin();
1201 if (request
->head
.op
== CEPH_MDS_OP_LSSNAP
) {
1203 diri
= open_snapdir(diri
);
1206 // only open dir if we're actually adding stuff to it!
1207 Dir
*dir
= diri
->open_dir();
1211 DirStat
dst(p
, features
);
1217 bool end
= ((unsigned)flags
& CEPH_READDIR_FRAG_END
);
1218 bool hash_order
= ((unsigned)flags
& CEPH_READDIR_HASH_ORDER
);
1220 frag_t fg
= (unsigned)request
->head
.args
.readdir
.frag
;
1221 unsigned readdir_offset
= dirp
->next_offset
;
1222 string readdir_start
= dirp
->last_name
;
1223 ceph_assert(!readdir_start
.empty() || readdir_offset
== 2);
1225 unsigned last_hash
= 0;
1227 if (!readdir_start
.empty()) {
1228 last_hash
= ceph_frag_value(diri
->hash_dentry_name(readdir_start
));
1229 } else if (flags
& CEPH_READDIR_OFFSET_HASH
) {
1230 /* mds understands offset_hash */
1231 last_hash
= (unsigned)request
->head
.args
.readdir
.offset_hash
;
1235 if (fg
!= dst
.frag
) {
1236 ldout(cct
, 10) << "insert_trace got new frag " << fg
<< " -> " << dst
.frag
<< dendl
;
1240 readdir_start
.clear();
1241 dirp
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
, false);
1245 ldout(cct
, 10) << __func__
<< " " << numdn
<< " readdir items, end=" << end
1246 << ", hash_order=" << hash_order
1247 << ", readdir_start " << readdir_start
1248 << ", last_hash " << last_hash
1249 << ", next_offset " << readdir_offset
<< dendl
;
1251 if (diri
->snapid
!= CEPH_SNAPDIR
&&
1252 fg
.is_leftmost() && readdir_offset
== 2 &&
1253 !(hash_order
&& last_hash
)) {
1254 dirp
->release_count
= diri
->dir_release_count
;
1255 dirp
->ordered_count
= diri
->dir_ordered_count
;
1256 dirp
->start_shared_gen
= diri
->shared_gen
;
1257 dirp
->cache_index
= 0;
1260 dirp
->buffer_frag
= fg
;
1262 _readdir_drop_dirp_buffer(dirp
);
1263 dirp
->buffer
.reserve(numdn
);
1267 for (unsigned i
=0; i
<numdn
; i
++) {
1269 dlease
.decode(p
, features
);
1270 InodeStat
ist(p
, features
);
1272 ldout(cct
, 15) << "" << i
<< ": '" << dname
<< "'" << dendl
;
1274 Inode
*in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1277 if (diri
->dir
->dentries
.count(dname
)) {
1278 Dentry
*olddn
= diri
->dir
->dentries
[dname
];
1279 if (olddn
->inode
!= in
) {
1280 // replace incorrect dentry
1281 unlink(olddn
, true, true); // keep dir, dentry
1282 dn
= link(dir
, dname
, in
, olddn
);
1283 ceph_assert(dn
== olddn
);
1291 dn
= link(dir
, dname
, in
, NULL
);
1293 dn
->alternate_name
= std::move(dlease
.alternate_name
);
1295 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1297 unsigned hash
= ceph_frag_value(diri
->hash_dentry_name(dname
));
1298 if (hash
!= last_hash
)
1301 dn
->offset
= dir_result_t::make_fpos(hash
, readdir_offset
++, true);
1303 dn
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
++, false);
1305 // add to readdir cache
1306 if (dirp
->release_count
== diri
->dir_release_count
&&
1307 dirp
->ordered_count
== diri
->dir_ordered_count
&&
1308 dirp
->start_shared_gen
== diri
->shared_gen
) {
1309 if (dirp
->cache_index
== dir
->readdir_cache
.size()) {
1311 ceph_assert(!dirp
->inode
->is_complete_and_ordered());
1312 dir
->readdir_cache
.reserve(dirp
->cache_index
+ numdn
);
1314 dir
->readdir_cache
.push_back(dn
);
1315 } else if (dirp
->cache_index
< dir
->readdir_cache
.size()) {
1316 if (dirp
->inode
->is_complete_and_ordered())
1317 ceph_assert(dir
->readdir_cache
[dirp
->cache_index
] == dn
);
1319 dir
->readdir_cache
[dirp
->cache_index
] = dn
;
1321 ceph_abort_msg("unexpected readdir buffer idx");
1323 dirp
->cache_index
++;
1325 // add to cached result list
1326 dirp
->buffer
.push_back(dir_result_t::dentry(dn
->offset
, dname
, dn
->alternate_name
, in
));
1327 ldout(cct
, 15) << __func__
<< " " << hex
<< dn
->offset
<< dec
<< ": '" << dname
<< "' -> " << in
->ino
<< dendl
;
1331 dirp
->last_name
= dname
;
1333 dirp
->next_offset
= 2;
1335 dirp
->next_offset
= readdir_offset
;
1337 if (dir
->is_empty())
1344 * insert a trace from a MDS reply into the cache.
1346 Inode
* Client::insert_trace(MetaRequest
*request
, MetaSession
*session
)
1348 auto& reply
= request
->reply
;
1349 int op
= request
->get_op();
1351 ldout(cct
, 10) << "insert_trace from " << request
->sent_stamp
<< " mds." << session
->mds_num
1352 << " is_target=" << (int)reply
->head
.is_target
1353 << " is_dentry=" << (int)reply
->head
.is_dentry
1356 auto p
= reply
->get_trace_bl().cbegin();
1357 if (request
->got_unsafe
) {
1358 ldout(cct
, 10) << "insert_trace -- already got unsafe; ignoring" << dendl
;
1359 ceph_assert(p
.end());
1364 ldout(cct
, 10) << "insert_trace -- no trace" << dendl
;
1366 Dentry
*d
= request
->dentry();
1368 Inode
*diri
= d
->dir
->parent_inode
;
1369 clear_dir_complete_and_ordered(diri
, true);
1372 if (d
&& reply
->get_result() == 0) {
1373 if (op
== CEPH_MDS_OP_RENAME
) {
1375 Dentry
*od
= request
->old_dentry();
1376 ldout(cct
, 10) << " unlinking rename src dn " << od
<< " for traceless reply" << dendl
;
1378 unlink(od
, true, true); // keep dir, dentry
1379 } else if (op
== CEPH_MDS_OP_RMDIR
||
1380 op
== CEPH_MDS_OP_UNLINK
) {
1382 ldout(cct
, 10) << " unlinking unlink/rmdir dn " << d
<< " for traceless reply" << dendl
;
1383 unlink(d
, true, true); // keep dir, dentry
1389 ConnectionRef con
= request
->reply
->get_connection();
1391 if (session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1392 features
= (uint64_t)-1;
1395 features
= con
->get_features();
1397 ldout(cct
, 10) << " features 0x" << hex
<< features
<< dec
<< dendl
;
1400 SnapRealm
*realm
= NULL
;
1401 if (reply
->snapbl
.length())
1402 update_snap_trace(reply
->snapbl
, &realm
);
1404 ldout(cct
, 10) << " hrm "
1405 << " is_target=" << (int)reply
->head
.is_target
1406 << " is_dentry=" << (int)reply
->head
.is_dentry
1415 if (reply
->head
.is_dentry
) {
1416 dirst
.decode(p
, features
);
1417 dst
.decode(p
, features
);
1419 dlease
.decode(p
, features
);
1423 if (reply
->head
.is_target
) {
1424 ist
.decode(p
, features
);
1425 if (cct
->_conf
->client_debug_getattr_caps
) {
1426 unsigned wanted
= 0;
1427 if (op
== CEPH_MDS_OP_GETATTR
|| op
== CEPH_MDS_OP_LOOKUP
)
1428 wanted
= request
->head
.args
.getattr
.mask
;
1429 else if (op
== CEPH_MDS_OP_OPEN
|| op
== CEPH_MDS_OP_CREATE
)
1430 wanted
= request
->head
.args
.open
.mask
;
1432 if ((wanted
& CEPH_CAP_XATTR_SHARED
) &&
1433 !(ist
.xattr_version
> 0 && ist
.xattrbl
.length() > 0))
1434 ceph_abort_msg("MDS reply does not contain xattrs");
1437 in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1442 if (reply
->head
.is_dentry
) {
1443 diri
= add_update_inode(&dirst
, request
->sent_stamp
, session
,
1445 mds_rank_t from_mds
= mds_rank_t(reply
->get_source().num());
1446 update_dir_dist(diri
, &dst
, from_mds
); // dir stat info is attached to ..
1449 Dir
*dir
= diri
->open_dir();
1450 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
,
1451 (op
== CEPH_MDS_OP_RENAME
) ? request
->old_dentry() : NULL
);
1454 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1455 dn
= diri
->dir
->dentries
[dname
];
1457 clear_dir_complete_and_ordered(diri
, false);
1458 unlink(dn
, true, true); // keep dir, dentry
1461 if (dlease
.duration_ms
> 0) {
1463 Dir
*dir
= diri
->open_dir();
1464 dn
= link(dir
, dname
, NULL
, NULL
);
1466 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1469 } else if (op
== CEPH_MDS_OP_LOOKUPSNAP
||
1470 op
== CEPH_MDS_OP_MKSNAP
) {
1471 ldout(cct
, 10) << " faking snap lookup weirdness" << dendl
;
1472 // fake it for snap lookup
1473 vinodeno_t vino
= ist
.vino
;
1474 vino
.snapid
= CEPH_SNAPDIR
;
1475 ceph_assert(inode_map
.count(vino
));
1476 diri
= inode_map
[vino
];
1478 string dname
= request
->path
.last_dentry();
1481 dlease
.duration_ms
= 0;
1484 Dir
*dir
= diri
->open_dir();
1485 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
);
1487 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1488 Dentry
*dn
= diri
->dir
->dentries
[dname
];
1490 unlink(dn
, true, true); // keep dir, dentry
1496 if (op
== CEPH_MDS_OP_READDIR
||
1497 op
== CEPH_MDS_OP_LSSNAP
) {
1498 insert_readdir_results(request
, session
, in
);
1499 } else if (op
== CEPH_MDS_OP_LOOKUPNAME
) {
1500 // hack: return parent inode instead
1504 if (request
->dentry() == NULL
&& in
!= request
->inode()) {
1505 // pin the target inode if its parent dentry is not pinned
1506 request
->set_other_inode(in
);
1511 put_snap_realm(realm
);
1513 request
->target
= in
;
1519 mds_rank_t
Client::choose_target_mds(MetaRequest
*req
, Inode
** phash_diri
)
1521 mds_rank_t mds
= MDS_RANK_NONE
;
1523 bool is_hash
= false;
1528 if (req
->resend_mds
>= 0) {
1529 mds
= req
->resend_mds
;
1530 req
->resend_mds
= -1;
1531 ldout(cct
, 10) << __func__
<< " resend_mds specified as mds." << mds
<< dendl
;
1535 if (cct
->_conf
->client_use_random_mds
)
1541 ldout(cct
, 20) << __func__
<< " starting with req->inode " << *in
<< dendl
;
1542 if (req
->path
.depth()) {
1543 hash
= in
->hash_dentry_name(req
->path
[0]);
1544 ldout(cct
, 20) << __func__
<< " inode dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1545 << " on " << req
->path
[0]
1546 << " => " << hash
<< dendl
;
1551 in
= de
->inode
.get();
1552 ldout(cct
, 20) << __func__
<< " starting with req->dentry inode " << *in
<< dendl
;
1554 in
= de
->dir
->parent_inode
;
1555 hash
= in
->hash_dentry_name(de
->name
);
1556 ldout(cct
, 20) << __func__
<< " dentry dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1557 << " on " << de
->name
1558 << " => " << hash
<< dendl
;
1563 if (in
->snapid
!= CEPH_NOSNAP
) {
1564 ldout(cct
, 10) << __func__
<< " " << *in
<< " is snapped, using nonsnap parent" << dendl
;
1565 while (in
->snapid
!= CEPH_NOSNAP
) {
1566 if (in
->snapid
== CEPH_SNAPDIR
)
1567 in
= in
->snapdir_parent
.get();
1568 else if (!in
->dentries
.empty())
1569 /* In most cases there will only be one dentry, so getting it
1570 * will be the correct action. If there are multiple hard links,
1571 * I think the MDS should be able to redirect as needed*/
1572 in
= in
->get_first_parent()->dir
->parent_inode
;
1574 ldout(cct
, 10) << "got unlinked inode, can't look at parent" << dendl
;
1581 ldout(cct
, 20) << __func__
<< " " << *in
<< " is_hash=" << is_hash
1582 << " hash=" << hash
<< dendl
;
1584 if (is_hash
&& S_ISDIR(in
->mode
) && (!in
->fragmap
.empty() || !in
->frag_repmap
.empty())) {
1585 frag_t fg
= in
->dirfragtree
[hash
];
1586 if (!req
->auth_is_best()) {
1587 auto repmapit
= in
->frag_repmap
.find(fg
);
1588 if (repmapit
!= in
->frag_repmap
.end()) {
1589 auto& repmap
= repmapit
->second
;
1590 auto r
= ceph::util::generate_random_number
<uint64_t>(0, repmap
.size()-1);
1593 } else if (in
->fragmap
.count(fg
)) {
1594 mds
= in
->fragmap
[fg
];
1597 } else if (in
->auth_cap
) {
1598 req
->send_to_auth
= true;
1599 mds
= in
->auth_cap
->session
->mds_num
;
1602 ldout(cct
, 10) << __func__
<< " from dirfragtree hash" << dendl
;
1607 if (in
->auth_cap
&& req
->auth_is_best()) {
1608 mds
= in
->auth_cap
->session
->mds_num
;
1609 } else if (!in
->caps
.empty()) {
1610 mds
= in
->caps
.begin()->second
.session
->mds_num
;
1614 ldout(cct
, 10) << __func__
<< " from caps on inode " << *in
<< dendl
;
1621 mds
= _get_random_up_mds();
1622 ldout(cct
, 10) << "did not get mds through better means, so chose random mds " << mds
<< dendl
;
1626 ldout(cct
, 20) << "mds is " << mds
<< dendl
;
1630 void Client::connect_mds_targets(mds_rank_t mds
)
1632 ldout(cct
, 10) << __func__
<< " for mds." << mds
<< dendl
;
1633 ceph_assert(mds_sessions
.count(mds
));
1634 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1635 for (const auto &rank
: info
.export_targets
) {
1636 if (mds_sessions
.count(rank
) == 0 &&
1637 mdsmap
->is_clientreplay_or_active_or_stopping(rank
)) {
1638 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1639 << " export target mds." << rank
<< dendl
;
1640 _open_mds_session(rank
);
1645 void Client::dump_mds_sessions(Formatter
*f
, bool cap_dump
)
1647 f
->dump_int("id", get_nodeid().v
);
1648 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
1649 f
->dump_object("inst", inst
);
1650 f
->dump_stream("inst_str") << inst
;
1651 f
->dump_stream("addr_str") << inst
.addr
;
1652 f
->open_array_section("sessions");
1653 for (const auto &p
: mds_sessions
) {
1654 f
->open_object_section("session");
1655 p
.second
->dump(f
, cap_dump
);
1659 f
->dump_int("mdsmap_epoch", mdsmap
->get_epoch());
1662 void Client::dump_mds_requests(Formatter
*f
)
1664 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
1665 p
!= mds_requests
.end();
1667 f
->open_object_section("request");
1673 int Client::verify_reply_trace(int r
, MetaSession
*session
,
1674 MetaRequest
*request
, const MConstRef
<MClientReply
>& reply
,
1675 InodeRef
*ptarget
, bool *pcreated
,
1676 const UserPerm
& perms
)
1678 // check whether this request actually did the create, and set created flag
1679 bufferlist extra_bl
;
1680 inodeno_t created_ino
;
1681 bool got_created_ino
= false;
1682 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
;
1684 extra_bl
= reply
->get_extra_bl();
1685 if (extra_bl
.length() >= 8) {
1686 if (session
->mds_features
.test(CEPHFS_FEATURE_DELEG_INO
)) {
1687 struct openc_response_t ocres
;
1689 decode(ocres
, extra_bl
);
1690 created_ino
= ocres
.created_ino
;
1692 * The userland cephfs client doesn't have a way to do an async create
1693 * (yet), so just discard delegated_inos for now. Eventually we should
1694 * store them and use them in create calls, even if they are synchronous,
1695 * if only for testing purposes.
1697 ldout(cct
, 10) << "delegated_inos: " << ocres
.delegated_inos
<< dendl
;
1699 // u64 containing number of created ino
1700 decode(created_ino
, extra_bl
);
1702 ldout(cct
, 10) << "make_request created ino " << created_ino
<< dendl
;
1703 got_created_ino
= true;
1707 *pcreated
= got_created_ino
;
1709 if (request
->target
) {
1710 *ptarget
= request
->target
;
1711 ldout(cct
, 20) << "make_request target is " << *ptarget
->get() << dendl
;
1713 if (got_created_ino
&& (p
= inode_map
.find(vinodeno_t(created_ino
, CEPH_NOSNAP
))) != inode_map
.end()) {
1714 (*ptarget
) = p
->second
;
1715 ldout(cct
, 20) << "make_request created, target is " << *ptarget
->get() << dendl
;
1717 // we got a traceless reply, and need to look up what we just
1718 // created. for now, do this by name. someday, do this by the
1719 // ino... which we know! FIXME.
1721 Dentry
*d
= request
->dentry();
1724 ldout(cct
, 10) << "make_request got traceless reply, looking up #"
1725 << d
->dir
->parent_inode
->ino
<< "/" << d
->name
1726 << " got_ino " << got_created_ino
1727 << " ino " << created_ino
1729 r
= _do_lookup(d
->dir
->parent_inode
, d
->name
, request
->regetattr_mask
,
1732 // if the dentry is not linked, just do our best. see #5021.
1733 ceph_abort_msg("how did this happen? i want logs!");
1736 Inode
*in
= request
->inode();
1737 ldout(cct
, 10) << "make_request got traceless reply, forcing getattr on #"
1738 << in
->ino
<< dendl
;
1739 r
= _getattr(in
, request
->regetattr_mask
, perms
, true);
1743 // verify ino returned in reply and trace_dist are the same
1744 if (got_created_ino
&&
1745 created_ino
.val
!= target
->ino
.val
) {
1746 ldout(cct
, 5) << "create got ino " << created_ino
<< " but then failed on lookup; EINTR?" << dendl
;
1750 ptarget
->swap(target
);
1762 * Blocking helper to make an MDS request.
1764 * If the ptarget flag is set, behavior changes slightly: the caller
1765 * expects to get a pointer to the inode we are creating or operating
1766 * on. As a result, we will follow up any traceless mutation reply
1767 * with a getattr or lookup to transparently handle a traceless reply
1768 * from the MDS (as when the MDS restarts and the client has to replay
1771 * @param request the MetaRequest to execute
1772 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1773 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1774 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1775 * @param use_mds [optional] prefer a specific mds (-1 for default)
1776 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1778 int Client::make_request(MetaRequest
*request
,
1779 const UserPerm
& perms
,
1780 InodeRef
*ptarget
, bool *pcreated
,
1786 // assign a unique tid
1787 ceph_tid_t tid
= ++last_tid
;
1788 request
->set_tid(tid
);
1791 request
->op_stamp
= ceph_clock_now();
1794 mds_requests
[tid
] = request
->get();
1795 if (oldest_tid
== 0 && request
->get_op() != CEPH_MDS_OP_SETFILELOCK
)
1798 request
->set_caller_perms(perms
);
1800 if (cct
->_conf
->client_inject_fixed_oldest_tid
) {
1801 ldout(cct
, 20) << __func__
<< " injecting fixed oldest_client_tid(1)" << dendl
;
1802 request
->set_oldest_client_tid(1);
1804 request
->set_oldest_client_tid(oldest_tid
);
1809 request
->resend_mds
= use_mds
;
1811 MetaSessionRef session
= NULL
;
1813 if (request
->aborted())
1817 request
->abort(-CEPHFS_EBLOCKLISTED
);
1822 ceph::condition_variable caller_cond
;
1823 request
->caller_cond
= &caller_cond
;
1826 Inode
*hash_diri
= NULL
;
1827 mds_rank_t mds
= choose_target_mds(request
, &hash_diri
);
1828 int mds_state
= (mds
== MDS_RANK_NONE
) ? MDSMap::STATE_NULL
: mdsmap
->get_state(mds
);
1829 if (mds_state
!= MDSMap::STATE_ACTIVE
&& mds_state
!= MDSMap::STATE_STOPPING
) {
1830 if (mds_state
== MDSMap::STATE_NULL
&& mds
>= mdsmap
->get_max_mds()) {
1832 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, remove it from fragmap" << dendl
;
1833 _fragmap_remove_stopped_mds(hash_diri
, mds
);
1835 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, trying a random mds" << dendl
;
1836 request
->resend_mds
= _get_random_up_mds();
1839 ldout(cct
, 10) << " target mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
1840 wait_on_list(waiting_for_mdsmap
);
1846 if (!have_open_session(mds
)) {
1847 session
= _get_or_open_mds_session(mds
);
1848 if (session
->state
== MetaSession::STATE_REJECTED
) {
1849 request
->abort(-CEPHFS_EPERM
);
1853 if (session
->state
== MetaSession::STATE_OPENING
) {
1854 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
1855 wait_on_context_list(session
->waiting_for_open
);
1859 if (!have_open_session(mds
))
1862 session
= mds_sessions
.at(mds
);
1866 send_request(request
, session
.get());
1869 ldout(cct
, 20) << "awaiting reply|forward|kick on " << &caller_cond
<< dendl
;
1870 request
->kick
= false;
1871 std::unique_lock l
{client_lock
, std::adopt_lock
};
1872 caller_cond
.wait(l
, [request
] {
1873 return (request
->reply
|| // reply
1874 request
->resend_mds
>= 0 || // forward
1878 request
->caller_cond
= nullptr;
1880 // did we get a reply?
1885 if (!request
->reply
) {
1886 ceph_assert(request
->aborted());
1887 ceph_assert(!request
->got_unsafe
);
1888 r
= request
->get_abort_code();
1889 request
->item
.remove_myself();
1890 unregister_request(request
);
1891 put_request(request
);
1896 auto reply
= std::move(request
->reply
);
1897 r
= reply
->get_result();
1899 request
->success
= true;
1901 // kick dispatcher (we've got it!)
1902 ceph_assert(request
->dispatch_cond
);
1903 request
->dispatch_cond
->notify_all();
1904 ldout(cct
, 20) << "sendrecv kickback on tid " << tid
<< " " << request
->dispatch_cond
<< dendl
;
1905 request
->dispatch_cond
= 0;
1907 if (r
>= 0 && ptarget
)
1908 r
= verify_reply_trace(r
, session
.get(), request
, reply
, ptarget
, pcreated
, perms
);
1911 *pdirbl
= reply
->get_extra_bl();
1914 utime_t lat
= ceph_clock_now();
1915 lat
-= request
->sent_stamp
;
1916 ldout(cct
, 20) << "lat " << lat
<< dendl
;
1917 logger
->tinc(l_c_lat
, lat
);
1918 logger
->tinc(l_c_reply
, lat
);
1920 put_request(request
);
1924 void Client::unregister_request(MetaRequest
*req
)
1926 mds_requests
.erase(req
->tid
);
1927 if (req
->tid
== oldest_tid
) {
1928 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
1930 if (p
== mds_requests
.end()) {
1934 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
1935 oldest_tid
= p
->first
;
1944 void Client::put_request(MetaRequest
*request
)
1946 if (request
->_put()) {
1948 if (request
->success
)
1949 op
= request
->get_op();
1951 request
->take_other_inode(&other_in
);
1955 (op
== CEPH_MDS_OP_RMDIR
||
1956 op
== CEPH_MDS_OP_RENAME
||
1957 op
== CEPH_MDS_OP_RMSNAP
)) {
1958 _try_to_trim_inode(other_in
.get(), false);
1963 int Client::encode_inode_release(Inode
*in
, MetaRequest
*req
,
1964 mds_rank_t mds
, int drop
,
1965 int unless
, int force
)
1967 ldout(cct
, 20) << __func__
<< " enter(in:" << *in
<< ", req:" << req
1968 << " mds:" << mds
<< ", drop:" << ccap_string(drop
) << ", unless:" << ccap_string(unless
)
1969 << ", force:" << force
<< ")" << dendl
;
1971 auto it
= in
->caps
.find(mds
);
1972 if (it
!= in
->caps
.end()) {
1973 Cap
&cap
= it
->second
;
1974 drop
&= ~(in
->dirty_caps
| get_caps_used(in
));
1975 if ((drop
& cap
.issued
) &&
1976 !(unless
& cap
.issued
)) {
1977 ldout(cct
, 25) << "dropping caps " << ccap_string(drop
) << dendl
;
1978 cap
.issued
&= ~drop
;
1979 cap
.implemented
&= ~drop
;
1985 cap
.wanted
= in
->caps_wanted();
1986 if (&cap
== in
->auth_cap
&&
1987 !(cap
.wanted
& CEPH_CAP_ANY_FILE_WR
)) {
1988 in
->requested_max_size
= 0;
1989 ldout(cct
, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl
;
1991 ceph_mds_request_release rel
;
1993 rel
.cap_id
= cap
.cap_id
;
1995 rel
.issue_seq
= cap
.issue_seq
;
1996 rel
.mseq
= cap
.mseq
;
1997 rel
.caps
= cap
.implemented
;
1998 rel
.wanted
= cap
.wanted
;
2001 req
->cap_releases
.push_back(MClientRequest::Release(rel
,""));
2004 ldout(cct
, 25) << __func__
<< " exit(in:" << *in
<< ") released:"
2005 << released
<< dendl
;
2009 void Client::encode_dentry_release(Dentry
*dn
, MetaRequest
*req
,
2010 mds_rank_t mds
, int drop
, int unless
)
2012 ldout(cct
, 20) << __func__
<< " enter(dn:"
2013 << dn
<< ")" << dendl
;
2016 released
= encode_inode_release(dn
->dir
->parent_inode
, req
,
2017 mds
, drop
, unless
, 1);
2018 if (released
&& dn
->lease_mds
== mds
) {
2019 ldout(cct
, 25) << "preemptively releasing dn to mds" << dendl
;
2020 auto& rel
= req
->cap_releases
.back();
2021 rel
.item
.dname_len
= dn
->name
.length();
2022 rel
.item
.dname_seq
= dn
->lease_seq
;
2023 rel
.dname
= dn
->name
;
2026 ldout(cct
, 25) << __func__
<< " exit(dn:"
2027 << dn
<< ")" << dendl
;
2032 * This requires the MClientRequest *request member to be set.
2033 * It will error out horribly without one.
2034 * Additionally, if you set any *drop member, you'd better have
2035 * set the corresponding dentry!
2037 void Client::encode_cap_releases(MetaRequest
*req
, mds_rank_t mds
)
2039 ldout(cct
, 20) << __func__
<< " enter (req: "
2040 << req
<< ", mds: " << mds
<< ")" << dendl
;
2041 if (req
->inode_drop
&& req
->inode())
2042 encode_inode_release(req
->inode(), req
,
2043 mds
, req
->inode_drop
,
2046 if (req
->old_inode_drop
&& req
->old_inode())
2047 encode_inode_release(req
->old_inode(), req
,
2048 mds
, req
->old_inode_drop
,
2049 req
->old_inode_unless
);
2050 if (req
->other_inode_drop
&& req
->other_inode())
2051 encode_inode_release(req
->other_inode(), req
,
2052 mds
, req
->other_inode_drop
,
2053 req
->other_inode_unless
);
2055 if (req
->dentry_drop
&& req
->dentry())
2056 encode_dentry_release(req
->dentry(), req
,
2057 mds
, req
->dentry_drop
,
2058 req
->dentry_unless
);
2060 if (req
->old_dentry_drop
&& req
->old_dentry())
2061 encode_dentry_release(req
->old_dentry(), req
,
2062 mds
, req
->old_dentry_drop
,
2063 req
->old_dentry_unless
);
2064 ldout(cct
, 25) << __func__
<< " exit (req: "
2065 << req
<< ", mds " << mds
<<dendl
;
2068 bool Client::have_open_session(mds_rank_t mds
)
2070 const auto &it
= mds_sessions
.find(mds
);
2071 return it
!= mds_sessions
.end() &&
2072 (it
->second
->state
== MetaSession::STATE_OPEN
||
2073 it
->second
->state
== MetaSession::STATE_STALE
);
2076 MetaSessionRef
Client::_get_mds_session(mds_rank_t mds
, Connection
*con
)
2078 const auto &it
= mds_sessions
.find(mds
);
2079 if (it
== mds_sessions
.end() || it
->second
->con
!= con
) {
2086 MetaSessionRef
Client::_get_or_open_mds_session(mds_rank_t mds
)
2088 auto it
= mds_sessions
.find(mds
);
2089 return it
== mds_sessions
.end() ? _open_mds_session(mds
) : it
->second
;
2093 * Populate a map of strings with client-identifying metadata,
2094 * such as the hostname. Call this once at initialization.
2096 void Client::populate_metadata(const std::string
&mount_root
)
2100 // TODO: move this to compat.h
2102 DWORD hostname_sz
= 64;
2103 GetComputerNameA(hostname
, &hostname_sz
);
2104 metadata
["hostname"] = hostname
;
2109 metadata
["hostname"] = u
.nodename
;
2110 ldout(cct
, 20) << __func__
<< " read hostname '" << u
.nodename
<< "'" << dendl
;
2112 ldout(cct
, 1) << __func__
<< " failed to read hostname (" << cpp_strerror(r
) << ")" << dendl
;
2116 metadata
["pid"] = stringify(getpid());
2118 // Ceph entity id (the '0' in "client.0")
2119 metadata
["entity_id"] = cct
->_conf
->name
.get_id();
2121 // Our mount position
2122 if (!mount_root
.empty()) {
2123 metadata
["root"] = mount_root
;
2127 metadata
["ceph_version"] = pretty_version_to_str();
2128 metadata
["ceph_sha1"] = git_version_to_str();
2130 // Apply any metadata from the user's configured overrides
2131 std::vector
<std::string
> tokens
;
2132 get_str_vec(cct
->_conf
->client_metadata
, ",", tokens
);
2133 for (const auto &i
: tokens
) {
2134 auto eqpos
= i
.find("=");
2135 // Throw out anything that isn't of the form "<str>=<str>"
2136 if (eqpos
== 0 || eqpos
== std::string::npos
|| eqpos
== i
.size()) {
2137 lderr(cct
) << "Invalid metadata keyval pair: '" << i
<< "'" << dendl
;
2140 metadata
[i
.substr(0, eqpos
)] = i
.substr(eqpos
+ 1);
2145 * Optionally add or override client metadata fields.
2147 void Client::update_metadata(std::string
const &k
, std::string
const &v
)
2149 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
2150 ceph_assert(iref_reader
.is_state_satisfied());
2152 std::scoped_lock
l(client_lock
);
2154 auto it
= metadata
.find(k
);
2155 if (it
!= metadata
.end()) {
2156 ldout(cct
, 1) << __func__
<< " warning, overriding metadata field '" << k
2157 << "' from '" << it
->second
<< "' to '" << v
<< "'" << dendl
;
2163 MetaSessionRef
Client::_open_mds_session(mds_rank_t mds
)
2165 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
2166 auto addrs
= mdsmap
->get_addrs(mds
);
2167 auto em
= mds_sessions
.emplace(std::piecewise_construct
,
2168 std::forward_as_tuple(mds
),
2169 std::forward_as_tuple(new MetaSession(mds
, messenger
->connect_to_mds(addrs
), addrs
)));
2170 ceph_assert(em
.second
); /* not already present */
2171 auto session
= em
.first
->second
;
2173 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_OPEN
);
2174 m
->metadata
= metadata
;
2175 m
->supported_features
= feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED
);
2176 m
->metric_spec
= feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL
);
2177 session
->con
->send_message2(std::move(m
));
2181 void Client::_close_mds_session(MetaSession
*s
)
2183 ldout(cct
, 2) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2184 s
->state
= MetaSession::STATE_CLOSING
;
2185 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2188 void Client::_closed_mds_session(MetaSession
*s
, int err
, bool rejected
)
2190 ldout(cct
, 5) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2191 if (rejected
&& s
->state
!= MetaSession::STATE_CLOSING
)
2192 s
->state
= MetaSession::STATE_REJECTED
;
2194 s
->state
= MetaSession::STATE_CLOSED
;
2195 s
->con
->mark_down();
2196 signal_context_list(s
->waiting_for_open
);
2197 mount_cond
.notify_all();
2198 remove_session_caps(s
, err
);
2199 kick_requests_closed(s
);
2200 mds_ranks_closing
.erase(s
->mds_num
);
2201 if (s
->state
== MetaSession::STATE_CLOSED
)
2202 mds_sessions
.erase(s
->mds_num
);
2205 void Client::handle_client_session(const MConstRef
<MClientSession
>& m
)
2207 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2208 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
2210 std::scoped_lock
cl(client_lock
);
2211 auto session
= _get_mds_session(from
, m
->get_connection().get());
2213 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2217 switch (m
->get_op()) {
2218 case CEPH_SESSION_OPEN
:
2220 feature_bitset_t
missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED
);
2221 missing_features
-= m
->supported_features
;
2222 if (!missing_features
.empty()) {
2223 lderr(cct
) << "mds." << from
<< " lacks required features '"
2224 << missing_features
<< "', closing session " << dendl
;
2225 _close_mds_session(session
.get());
2226 _closed_mds_session(session
.get(), -CEPHFS_EPERM
, true);
2229 session
->mds_features
= std::move(m
->supported_features
);
2231 renew_caps(session
.get());
2232 session
->state
= MetaSession::STATE_OPEN
;
2233 if (is_unmounting())
2234 mount_cond
.notify_all();
2236 connect_mds_targets(from
);
2237 signal_context_list(session
->waiting_for_open
);
2241 case CEPH_SESSION_CLOSE
:
2242 _closed_mds_session(session
.get());
2245 case CEPH_SESSION_RENEWCAPS
:
2246 if (session
->cap_renew_seq
== m
->get_seq()) {
2247 bool was_stale
= ceph_clock_now() >= session
->cap_ttl
;
2249 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2251 wake_up_session_caps(session
.get(), false);
2255 case CEPH_SESSION_STALE
:
2256 // invalidate session caps/leases
2258 session
->cap_ttl
= ceph_clock_now();
2259 session
->cap_ttl
-= 1;
2260 renew_caps(session
.get());
2263 case CEPH_SESSION_RECALL_STATE
:
2265 * Call the renew caps and flush cap releases just before
2266 * triming the caps in case the tick() won't get a chance
2267 * to run them, which could cause the client to be blocklisted
2268 * and MDS daemons trying to recall the caps again and
2271 * In most cases it will do nothing, and the new cap releases
2272 * added by trim_caps() followed will be deferred flushing
2275 renew_and_flush_cap_releases();
2276 trim_caps(session
.get(), m
->get_max_caps());
2279 case CEPH_SESSION_FLUSHMSG
:
2280 /* flush cap release */
2281 if (auto& m
= session
->release
; m
) {
2282 session
->con
->send_message2(std::move(m
));
2284 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2287 case CEPH_SESSION_FORCE_RO
:
2288 force_session_readonly(session
.get());
2291 case CEPH_SESSION_REJECT
:
2293 std::string_view error_str
;
2294 auto it
= m
->metadata
.find("error_string");
2295 if (it
!= m
->metadata
.end())
2296 error_str
= it
->second
;
2298 error_str
= "unknown error";
2299 lderr(cct
) << "mds." << from
<< " rejected us (" << error_str
<< ")" << dendl
;
2301 _closed_mds_session(session
.get(), -CEPHFS_EPERM
, true);
2310 bool Client::_any_stale_sessions() const
2312 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
2314 for (const auto &p
: mds_sessions
) {
2315 if (p
.second
->state
== MetaSession::STATE_STALE
) {
2323 void Client::_kick_stale_sessions()
2325 ldout(cct
, 1) << __func__
<< dendl
;
2327 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
2328 auto s
= it
->second
;
2329 if (s
->state
== MetaSession::STATE_REJECTED
) {
2330 mds_sessions
.erase(it
->first
);
2333 if (s
->state
== MetaSession::STATE_STALE
)
2334 _closed_mds_session(s
.get());
2338 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2339 bool drop_cap_releases
)
2342 mds_rank_t mds
= session
->mds_num
;
2343 ldout(cct
, 10) << __func__
<< " rebuilding request " << request
->get_tid()
2344 << " for mds." << mds
<< dendl
;
2345 auto r
= build_client_request(request
);
2346 if (request
->dentry()) {
2347 r
->set_dentry_wanted();
2349 if (request
->got_unsafe
) {
2350 r
->set_replayed_op();
2351 if (request
->target
)
2352 r
->head
.ino
= request
->target
->ino
;
2354 encode_cap_releases(request
, mds
);
2355 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2356 request
->cap_releases
.clear();
2358 r
->releases
.swap(request
->cap_releases
);
2360 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2361 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2362 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2363 r
->set_osdmap_epoch(o
.get_epoch());
2367 if (request
->mds
== -1) {
2368 request
->sent_stamp
= ceph_clock_now();
2369 ldout(cct
, 20) << __func__
<< " set sent_stamp to " << request
->sent_stamp
<< dendl
;
2373 Inode
*in
= request
->inode();
2375 auto it
= in
->caps
.find(mds
);
2376 if (it
!= in
->caps
.end()) {
2377 request
->sent_on_mseq
= it
->second
.mseq
;
2381 session
->requests
.push_back(&request
->item
);
2383 ldout(cct
, 10) << __func__
<< " " << *r
<< " to mds." << mds
<< dendl
;
2384 session
->con
->send_message2(std::move(r
));
2387 ref_t
<MClientRequest
> Client::build_client_request(MetaRequest
*request
)
2389 auto req
= make_message
<MClientRequest
>(request
->get_op());
2390 req
->set_tid(request
->tid
);
2391 req
->set_stamp(request
->op_stamp
);
2392 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2394 // if the filepath's haven't been set, set them!
2395 if (request
->path
.empty()) {
2396 Inode
*in
= request
->inode();
2397 Dentry
*de
= request
->dentry();
2399 in
->make_nosnap_relative_path(request
->path
);
2402 de
->inode
->make_nosnap_relative_path(request
->path
);
2404 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2405 request
->path
.push_dentry(de
->name
);
2407 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2408 << " No path, inode, or appropriately-endowed dentry given!"
2410 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2411 << " No path, inode, or dentry given!"
2414 req
->set_filepath(request
->get_filepath());
2415 req
->set_filepath2(request
->get_filepath2());
2416 req
->set_alternate_name(request
->alternate_name
);
2417 req
->set_data(request
->data
);
2418 req
->set_retry_attempt(request
->retry_attempt
++);
2419 req
->head
.num_fwd
= request
->num_fwd
;
2421 int gid_count
= request
->perms
.get_gids(&_gids
);
2422 req
->set_gid_list(gid_count
, _gids
);
2428 void Client::handle_client_request_forward(const MConstRef
<MClientRequestForward
>& fwd
)
2430 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2432 std::scoped_lock
cl(client_lock
);
2433 auto session
= _get_mds_session(mds
, fwd
->get_connection().get());
2437 ceph_tid_t tid
= fwd
->get_tid();
2439 if (mds_requests
.count(tid
) == 0) {
2440 ldout(cct
, 10) << __func__
<< " no pending request on tid " << tid
<< dendl
;
2444 MetaRequest
*request
= mds_requests
[tid
];
2445 ceph_assert(request
);
2447 // reset retry counter
2448 request
->retry_attempt
= 0;
2450 // request not forwarded, or dest mds has no session.
2452 ldout(cct
, 10) << __func__
<< " tid " << tid
2453 << " fwd " << fwd
->get_num_fwd()
2454 << " to mds." << fwd
->get_dest_mds()
2455 << ", resending to " << fwd
->get_dest_mds()
2459 request
->item
.remove_myself();
2460 request
->num_fwd
= fwd
->get_num_fwd();
2461 request
->resend_mds
= fwd
->get_dest_mds();
2462 request
->caller_cond
->notify_all();
2465 bool Client::is_dir_operation(MetaRequest
*req
)
2467 int op
= req
->get_op();
2468 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2469 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2470 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2471 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
2476 void Client::handle_client_reply(const MConstRef
<MClientReply
>& reply
)
2478 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2480 std::scoped_lock
cl(client_lock
);
2481 auto session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2486 ceph_tid_t tid
= reply
->get_tid();
2487 bool is_safe
= reply
->is_safe();
2489 if (mds_requests
.count(tid
) == 0) {
2490 lderr(cct
) << __func__
<< " no pending request on tid " << tid
2491 << " safe is:" << is_safe
<< dendl
;
2494 MetaRequest
*request
= mds_requests
.at(tid
);
2496 ldout(cct
, 20) << __func__
<< " got a reply. Safe:" << is_safe
2497 << " tid " << tid
<< dendl
;
2499 if (request
->got_unsafe
&& !is_safe
) {
2500 //duplicate response
2501 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2502 << mds_num
<< " safe:" << is_safe
<< dendl
;
2506 if (-CEPHFS_ESTALE
== reply
->get_result()) { // see if we can get to proper MDS
2507 ldout(cct
, 20) << "got ESTALE on tid " << request
->tid
2508 << " from mds." << request
->mds
<< dendl
;
2509 request
->send_to_auth
= true;
2510 request
->resend_mds
= choose_target_mds(request
);
2511 Inode
*in
= request
->inode();
2512 std::map
<mds_rank_t
, Cap
>::const_iterator it
;
2513 if (request
->resend_mds
>= 0 &&
2514 request
->resend_mds
== request
->mds
&&
2516 (it
= in
->caps
.find(request
->resend_mds
)) != in
->caps
.end() ||
2517 request
->sent_on_mseq
== it
->second
.mseq
)) {
2518 ldout(cct
, 20) << "have to return ESTALE" << dendl
;
2520 request
->caller_cond
->notify_all();
2525 ceph_assert(!request
->reply
);
2526 request
->reply
= reply
;
2527 insert_trace(request
, session
.get());
2529 // Handle unsafe reply
2531 request
->got_unsafe
= true;
2532 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2533 if (is_dir_operation(request
)) {
2534 Inode
*dir
= request
->inode();
2536 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2538 if (request
->target
) {
2539 InodeRef
&in
= request
->target
;
2540 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2544 // Only signal the caller once (on the first reply):
2545 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2546 if (!is_safe
|| !request
->got_unsafe
) {
2547 ceph::condition_variable cond
;
2548 request
->dispatch_cond
= &cond
;
2551 ldout(cct
, 20) << __func__
<< " signalling caller " << (void*)request
->caller_cond
<< dendl
;
2552 request
->caller_cond
->notify_all();
2554 // wake for kick back
2555 std::unique_lock l
{client_lock
, std::adopt_lock
};
2556 cond
.wait(l
, [tid
, request
, &cond
, this] {
2557 if (request
->dispatch_cond
) {
2558 ldout(cct
, 20) << "handle_client_reply awaiting kickback on tid "
2559 << tid
<< " " << &cond
<< dendl
;
2561 return !request
->dispatch_cond
;
2567 // the filesystem change is committed to disk
2568 // we're done, clean up
2569 if (request
->got_unsafe
) {
2570 request
->unsafe_item
.remove_myself();
2571 request
->unsafe_dir_item
.remove_myself();
2572 request
->unsafe_target_item
.remove_myself();
2573 signal_cond_list(request
->waitfor_safe
);
2575 request
->item
.remove_myself();
2576 unregister_request(request
);
2578 if (is_unmounting())
2579 mount_cond
.notify_all();
2582 void Client::_handle_full_flag(int64_t pool
)
2584 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2585 << "on " << pool
<< dendl
;
2586 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2587 // to do this rather than blocking, because otherwise when we fill up we
2588 // potentially lock caps forever on files with dirty pages, and we need
2589 // to be able to release those caps to the MDS so that it can delete files
2590 // and free up space.
2591 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-CEPHFS_ENOSPC
, pool
);
2593 // For all inodes with layouts in this pool and a pending flush write op
2594 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2595 // from ObjectCacher so that it doesn't re-issue the write in response to
2596 // the ENOSPC error.
2597 // Fortunately since we're cancelling everything in a given pool, we don't
2598 // need to know which ops belong to which ObjectSet, we can just blow all
2599 // the un-flushed cached data away and mark any dirty inodes' async_err
2600 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2601 // affecting this pool, and all the objectsets we're purging were also
2603 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2604 i
!= inode_map
.end(); ++i
)
2606 Inode
*inode
= i
->second
;
2607 if (inode
->oset
.dirty_or_tx
2608 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2609 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2610 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2611 objectcacher
->purge_set(&inode
->oset
);
2612 inode
->set_async_err(-CEPHFS_ENOSPC
);
2616 if (cancelled_epoch
!= (epoch_t
)-1) {
2617 set_cap_epoch_barrier(cancelled_epoch
);
2621 void Client::handle_osd_map(const MConstRef
<MOSDMap
>& m
)
2623 std::set
<entity_addr_t
> new_blocklists
;
2625 std::scoped_lock
cl(client_lock
);
2626 objecter
->consume_blocklist_events(&new_blocklists
);
2628 const auto myaddrs
= messenger
->get_myaddrs();
2629 bool new_blocklist
= false;
2630 bool prenautilus
= objecter
->with_osdmap(
2631 [&](const OSDMap
& o
) {
2632 return o
.require_osd_release
< ceph_release_t::nautilus
;
2635 for (auto a
: myaddrs
.v
) {
2636 // blocklist entries are always TYPE_ANY for nautilus+
2637 a
.set_type(entity_addr_t::TYPE_ANY
);
2638 if (new_blocklists
.count(a
)) {
2639 new_blocklist
= true;
2643 // ...except pre-nautilus, they were TYPE_LEGACY
2644 a
.set_type(entity_addr_t::TYPE_LEGACY
);
2645 if (new_blocklists
.count(a
)) {
2646 new_blocklist
= true;
2652 if (new_blocklist
) {
2653 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2654 return o
.get_epoch();
2656 lderr(cct
) << "I was blocklisted at osd epoch " << epoch
<< dendl
;
2659 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED
);
2661 // Since we know all our OSD ops will fail, cancel them all preemtively,
2662 // so that on an unhealthy cluster we can umount promptly even if e.g.
2663 // some PGs were inaccessible.
2664 objecter
->op_cancel_writes(-CEPHFS_EBLOCKLISTED
);
2669 // Handle case where we were blocklisted but no longer are
2670 blocklisted
= objecter
->with_osdmap([myaddrs
](const OSDMap
&o
){
2671 return o
.is_blocklisted(myaddrs
);});
2674 // Always subscribe to next osdmap for blocklisted client
2675 // until this client is not blocklisted.
2677 objecter
->maybe_request_map();
2680 if (objecter
->osdmap_full_flag()) {
2681 _handle_full_flag(-1);
2683 // Accumulate local list of full pools so that I can drop
2684 // the objecter lock before re-entering objecter in
2686 std::vector
<int64_t> full_pools
;
2688 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2689 for (const auto& kv
: o
.get_pools()) {
2690 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2691 full_pools
.push_back(kv
.first
);
2696 for (auto p
: full_pools
)
2697 _handle_full_flag(p
);
2699 // Subscribe to subsequent maps to watch for the full flag going
2700 // away. For the global full flag objecter does this for us, but
2701 // it pays no attention to the per-pool full flag so in this branch
2702 // we do it ourselves.
2703 if (!full_pools
.empty()) {
2704 objecter
->maybe_request_map();
2710 // ------------------------
2711 // incoming messages
2714 bool Client::ms_dispatch2(const MessageRef
&m
)
2716 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
2717 if (!iref_reader
.is_state_satisfied()) {
2718 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2722 switch (m
->get_type()) {
2723 // mounting and mds sessions
2724 case CEPH_MSG_MDS_MAP
:
2725 handle_mds_map(ref_cast
<MMDSMap
>(m
));
2727 case CEPH_MSG_FS_MAP
:
2728 handle_fs_map(ref_cast
<MFSMap
>(m
));
2730 case CEPH_MSG_FS_MAP_USER
:
2731 handle_fs_map_user(ref_cast
<MFSMapUser
>(m
));
2733 case CEPH_MSG_CLIENT_SESSION
:
2734 handle_client_session(ref_cast
<MClientSession
>(m
));
2737 case CEPH_MSG_OSD_MAP
:
2738 handle_osd_map(ref_cast
<MOSDMap
>(m
));
2742 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2743 handle_client_request_forward(ref_cast
<MClientRequestForward
>(m
));
2745 case CEPH_MSG_CLIENT_REPLY
:
2746 handle_client_reply(ref_cast
<MClientReply
>(m
));
2750 case CEPH_MSG_CLIENT_RECLAIM_REPLY
:
2751 handle_client_reclaim_reply(ref_cast
<MClientReclaimReply
>(m
));
2754 case CEPH_MSG_CLIENT_SNAP
:
2755 handle_snap(ref_cast
<MClientSnap
>(m
));
2757 case CEPH_MSG_CLIENT_CAPS
:
2758 handle_caps(ref_cast
<MClientCaps
>(m
));
2760 case CEPH_MSG_CLIENT_LEASE
:
2761 handle_lease(ref_cast
<MClientLease
>(m
));
2763 case MSG_COMMAND_REPLY
:
2764 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2765 handle_command_reply(ref_cast
<MCommandReply
>(m
));
2770 case CEPH_MSG_CLIENT_QUOTA
:
2771 handle_quota(ref_cast
<MClientQuota
>(m
));
2779 std::scoped_lock
cl(client_lock
);
2780 if (is_unmounting()) {
2781 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2782 << "+" << inode_map
.size() << dendl
;
2783 uint64_t size
= lru
.lru_get_size() + inode_map
.size();
2785 if (size
> lru
.lru_get_size() + inode_map
.size()) {
2786 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2787 mount_cond
.notify_all();
2789 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2790 << "+" << inode_map
.size() << dendl
;
// Incoming FSMap from the monitor: replace the cached map with a copy of
// the one in the message, wake any threads blocked on waiting_for_fsmap,
// and acknowledge the "fsmap" subscription at the new map's epoch.
2797 void Client::handle_fs_map(const MConstRef
<MFSMap
>& m
)
2799 std::scoped_lock
cl(client_lock
);
// Replace the cached map wholesale with a copy of the message's map.
2800 fsmap
.reset(new FSMap(m
->get_fsmap()));
// Wake any waiters (e.g. mount paths) blocked waiting for an fsmap.
2802 signal_cond_list(waiting_for_fsmap
);
// Ack the subscription so the monitor can send us subsequent epochs.
2804 monclient
->sub_got("fsmap", fsmap
->get_epoch());
// Incoming user-visible FSMap (MFSMapUser) from the monitor: replace the
// cached copy, ack the "fsmap.user" subscription at its epoch, and wake
// any threads blocked waiting for an fsmap.
2807 void Client::handle_fs_map_user(const MConstRef
<MFSMapUser
>& m
)
2809 std::scoped_lock
cl(client_lock
);
// Rebuild the cached map object, then copy the message's map into it.
2810 fsmap_user
.reset(new FSMapUser
);
2811 *fsmap_user
= m
->get_fsmap();
2813 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
2814 signal_cond_list(waiting_for_fsmap
);
// Cancel every in-flight MDS command whose target GID has vanished from
// (or is laggy in) the new MDSMap: its reply will never arrive.  Each
// cancelled op's waiter is completed with -CEPHFS_ETIMEDOUT.
2817 // Cancel all the commands for missing or laggy GIDs
2818 void Client::cancel_commands(const MDSMap
& newmap
)
// Collect tids to drop first; erasing entries while iterating the command
// table below would invalidate the iteration.
2820 std::vector
<ceph_tid_t
> cancel_ops
;
2822 std::scoped_lock
cmd_lock(command_lock
);
2823 auto &commands
= command_table
.get_commands();
2824 for (const auto &[tid
, op
] : commands
) {
2825 const mds_gid_t op_mds_gid
= op
.mds_gid
;
// A GID that no longer exists, or is laggy, will never answer this op.
2826 if (newmap
.is_dne_gid(op_mds_gid
) || newmap
.is_laggy_gid(op_mds_gid
)) {
2827 ldout(cct
, 1) << __func__
<< ": cancelling command op " << tid
<< dendl
;
2828 cancel_ops
.push_back(tid
);
// Report the cancellation reason through the command's output string.
2830 std::ostringstream ss
;
2831 ss
<< "MDS " << op_mds_gid
<< " went away";
2832 *(op
.outs
) = ss
.str();
2835 * No need to make the con->mark_down under
2836 * client_lock here, because the con will
2839 op
.con
->mark_down();
// Complete the waiter with a timeout-style error.
2841 op
.on_finish
->complete(-CEPHFS_ETIMEDOUT
);
// Now actually remove the cancelled entries from the command table.
2845 for (const auto &tid
: cancel_ops
)
2846 command_table
.erase(tid
);
2849 void Client::handle_mds_map(const MConstRef
<MMDSMap
>& m
)
2851 std::unique_lock
cl(client_lock
);
2852 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
2853 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch()
2854 << " is identical to or older than our "
2855 << mdsmap
->get_epoch() << dendl
;
2860 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch() << dendl
;
2861 std::unique_ptr
<MDSMap
> _mdsmap(new MDSMap
);
2862 _mdsmap
->decode(m
->get_encoded());
2863 cancel_commands(*_mdsmap
.get());
2866 _mdsmap
.swap(mdsmap
);
2869 for (auto p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ) {
2870 mds_rank_t mds
= p
->first
;
2871 MetaSessionRef session
= p
->second
;
2874 int oldstate
= _mdsmap
->get_state(mds
);
2875 int newstate
= mdsmap
->get_state(mds
);
2876 if (!mdsmap
->is_up(mds
)) {
2877 session
->con
->mark_down();
2878 } else if (mdsmap
->get_addrs(mds
) != session
->addrs
) {
2879 auto old_inc
= _mdsmap
->get_incarnation(mds
);
2880 auto new_inc
= mdsmap
->get_incarnation(mds
);
2881 if (old_inc
!= new_inc
) {
2882 ldout(cct
, 1) << "mds incarnation changed from "
2883 << old_inc
<< " to " << new_inc
<< dendl
;
2884 oldstate
= MDSMap::STATE_NULL
;
2886 session
->con
->mark_down();
2887 session
->addrs
= mdsmap
->get_addrs(mds
);
2888 // When new MDS starts to take over, notify kernel to trim unused entries
2889 // in its dcache/icache. Hopefully, the kernel will release some unused
2890 // inodes before the new MDS enters reconnect state.
2891 trim_cache_for_reconnect(session
.get());
2892 } else if (oldstate
== newstate
)
2893 continue; // no change
2895 session
->mds_state
= newstate
;
2896 if (newstate
== MDSMap::STATE_RECONNECT
) {
2897 session
->con
= messenger
->connect_to_mds(session
->addrs
);
2898 send_reconnect(session
.get());
2899 } else if (newstate
> MDSMap::STATE_RECONNECT
) {
2900 if (oldstate
< MDSMap::STATE_RECONNECT
) {
2901 ldout(cct
, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl
;
2902 _closed_mds_session(session
.get());
2905 if (newstate
>= MDSMap::STATE_ACTIVE
) {
2906 if (oldstate
< MDSMap::STATE_ACTIVE
) {
2907 // kick new requests
2908 kick_requests(session
.get());
2909 kick_flushing_caps(session
.get());
2910 signal_context_list(session
->waiting_for_open
);
2911 wake_up_session_caps(session
.get(), true);
2913 connect_mds_targets(mds
);
2915 } else if (newstate
== MDSMap::STATE_NULL
&&
2916 mds
>= mdsmap
->get_max_mds()) {
2917 _closed_mds_session(session
.get());
2921 // kick any waiting threads
2922 signal_cond_list(waiting_for_mdsmap
);
2924 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
2927 void Client::send_reconnect(MetaSession
*session
)
2929 mds_rank_t mds
= session
->mds_num
;
2930 ldout(cct
, 10) << __func__
<< " to mds." << mds
<< dendl
;
2932 // trim unused caps to reduce MDS's cache rejoin time
2933 trim_cache_for_reconnect(session
);
2935 session
->readonly
= false;
2937 session
->release
.reset();
2939 // reset my cap seq number
2941 //connect to the mds' offload targets
2942 connect_mds_targets(mds
);
2943 //make sure unsafe requests get saved
2944 resend_unsafe_requests(session
);
2946 early_kick_flushing_caps(session
);
2948 auto m
= make_message
<MClientReconnect
>();
2949 bool allow_multi
= session
->mds_features
.test(CEPHFS_FEATURE_MULTI_RECONNECT
);
2951 // i have an open session.
2952 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
2953 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
2954 p
!= inode_map
.end();
2956 Inode
*in
= p
->second
;
2957 auto it
= in
->caps
.find(mds
);
2958 if (it
!= in
->caps
.end()) {
2960 m
->get_approx_size() >=
2961 static_cast<size_t>((std::numeric_limits
<int>::max() >> 1))) {
2963 session
->con
->send_message2(std::move(m
));
2965 m
= make_message
<MClientReconnect
>();
2968 Cap
&cap
= it
->second
;
2969 ldout(cct
, 10) << " caps on " << p
->first
2970 << " " << ccap_string(cap
.issued
)
2971 << " wants " << ccap_string(in
->caps_wanted())
2974 in
->make_short_path(path
);
2975 ldout(cct
, 10) << " path " << path
<< dendl
;
2978 _encode_filelocks(in
, flockbl
);
2980 cap
.seq
= 0; // reset seq.
2981 cap
.issue_seq
= 0; // reset seq.
2982 cap
.mseq
= 0; // reset seq.
2983 // cap gen should catch up with session cap_gen
2984 if (cap
.gen
< session
->cap_gen
) {
2985 cap
.gen
= session
->cap_gen
;
2986 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
2988 cap
.issued
= cap
.implemented
;
2990 snapid_t snap_follows
= 0;
2991 if (!in
->cap_snaps
.empty())
2992 snap_follows
= in
->cap_snaps
.begin()->first
;
2994 m
->add_cap(p
->first
.ino
,
2996 path
.get_ino(), path
.get_path(), // ino
2997 in
->caps_wanted(), // wanted
2998 cap
.issued
, // issued
3003 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
3004 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
3005 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
3006 did_snaprealm
.insert(in
->snaprealm
->ino
);
3012 m
->set_encoding_version(0); // use connection features to choose encoding
3013 session
->con
->send_message2(std::move(m
));
3015 mount_cond
.notify_all();
3017 if (session
->reclaim_state
== MetaSession::RECLAIMING
)
3018 signal_cond_list(waiting_for_reclaim
);
3022 void Client::kick_requests(MetaSession
*session
)
3024 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
3025 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3026 p
!= mds_requests
.end();
3028 MetaRequest
*req
= p
->second
;
3029 if (req
->got_unsafe
)
3031 if (req
->aborted()) {
3032 if (req
->caller_cond
) {
3034 req
->caller_cond
->notify_all();
3038 if (req
->retry_attempt
> 0)
3039 continue; // new requests only
3040 if (req
->mds
== session
->mds_num
) {
3041 send_request(p
->second
, session
);
3046 void Client::resend_unsafe_requests(MetaSession
*session
)
3048 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
3051 send_request(*iter
, session
);
3053 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3054 // process completed requests in clientreplay stage.
3055 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3056 p
!= mds_requests
.end();
3058 MetaRequest
*req
= p
->second
;
3059 if (req
->got_unsafe
)
3063 if (req
->retry_attempt
== 0)
3064 continue; // old requests only
3065 if (req
->mds
== session
->mds_num
)
3066 send_request(req
, session
, true);
3070 void Client::wait_unsafe_requests()
3072 list
<MetaRequest
*> last_unsafe_reqs
;
3073 for (const auto &p
: mds_sessions
) {
3074 const auto s
= p
.second
;
3075 if (!s
->unsafe_requests
.empty()) {
3076 MetaRequest
*req
= s
->unsafe_requests
.back();
3078 last_unsafe_reqs
.push_back(req
);
3082 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
3083 p
!= last_unsafe_reqs
.end();
3085 MetaRequest
*req
= *p
;
3086 if (req
->unsafe_item
.is_on_list())
3087 wait_on_list(req
->waitfor_safe
);
3092 void Client::kick_requests_closed(MetaSession
*session
)
3094 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
3095 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3096 p
!= mds_requests
.end(); ) {
3097 MetaRequest
*req
= p
->second
;
3099 if (req
->mds
== session
->mds_num
) {
3100 if (req
->caller_cond
) {
3102 req
->caller_cond
->notify_all();
3104 req
->item
.remove_myself();
3105 if (req
->got_unsafe
) {
3106 lderr(cct
) << __func__
<< " removing unsafe request " << req
->get_tid() << dendl
;
3107 req
->unsafe_item
.remove_myself();
3108 if (is_dir_operation(req
)) {
3109 Inode
*dir
= req
->inode();
3111 dir
->set_async_err(-CEPHFS_EIO
);
3112 lderr(cct
) << "kick_requests_closed drop req of inode(dir) : "
3113 << dir
->ino
<< " " << req
->get_tid() << dendl
;
3114 req
->unsafe_dir_item
.remove_myself();
3117 InodeRef
&in
= req
->target
;
3118 in
->set_async_err(-CEPHFS_EIO
);
3119 lderr(cct
) << "kick_requests_closed drop req of inode : "
3120 << in
->ino
<< " " << req
->get_tid() << dendl
;
3121 req
->unsafe_target_item
.remove_myself();
3123 signal_cond_list(req
->waitfor_safe
);
3124 unregister_request(req
);
3128 ceph_assert(session
->requests
.empty());
3129 ceph_assert(session
->unsafe_requests
.empty());
// Called when an MDS pushes an unsolicited message on this session.  Logs
// the session's current sequence number; if we are in the middle of
// closing the session, the MDS evidently has not processed our close yet,
// so re-send CEPH_SESSION_REQUEST_CLOSE with the current seq.
3139 void Client::got_mds_push(MetaSession
*s
)
3142 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
3143 if (s
->state
== MetaSession::STATE_CLOSING
) {
3144 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
3148 void Client::handle_lease(const MConstRef
<MClientLease
>& m
)
3150 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
3152 ceph_assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
3153 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
3155 std::scoped_lock
cl(client_lock
);
3156 auto session
= _get_mds_session(mds
, m
->get_connection().get());
3161 got_mds_push(session
.get());
3163 ceph_seq_t seq
= m
->get_seq();
3166 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
3167 if (inode_map
.count(vino
) == 0) {
3168 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
3171 in
= inode_map
[vino
];
3173 if (m
->get_mask() & CEPH_LEASE_VALID
) {
3174 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
3175 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
3178 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
3179 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
3185 auto reply
= make_message
<MClientLease
>(CEPH_MDS_LEASE_RELEASE
, seq
,
3186 m
->get_mask(), m
->get_ino(),
3187 m
->get_first(), m
->get_last(), m
->dname
);
3188 m
->get_connection()->send_message2(std::move(reply
));
3192 void Client::_put_inode(Inode
*in
, int n
)
3194 ldout(cct
, 10) << __func__
<< " on " << *in
<< " n = " << n
<< dendl
;
3196 int left
= in
->get_nref();
3197 ceph_assert(left
>= n
+ 1);
3200 if (left
== 1) { // the last one will be held by the inode_map
3202 remove_all_caps(in
);
3204 ldout(cct
, 10) << __func__
<< " deleting " << *in
<< dendl
;
3205 bool unclean
= objectcacher
->release_set(&in
->oset
);
3206 ceph_assert(!unclean
);
3207 inode_map
.erase(in
->vino());
3208 if (use_faked_inos())
3209 _release_faked_ino(in
);
3211 if (root
== nullptr) {
3213 while (!root_parents
.empty())
3214 root_parents
.erase(root_parents
.begin());
// Apply all inode refcount drops previously queued by put_inode().
// Precondition: caller holds client_lock (asserted below), since
// _put_inode() may tear down inode state that is protected by it.
3221 void Client::delay_put_inodes(bool wakeup
)
3223 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
// Steal the pending-release map under delay_i_lock; swapping keeps the
// critical section minimal and leaves delay_i_release empty for new work.
3225 std::map
<Inode
*,int> release
;
3227 std::scoped_lock
dl(delay_i_lock
);
3228 release
.swap(delay_i_release
);
// Nothing was queued: nothing to do.
3231 if (release
.empty())
// Drop each inode's accumulated reference count.
3234 for (auto &[in
, cnt
] : release
)
3235 _put_inode(in
, cnt
);
// wakeup: presumably gates this notification of mount_cond waiters (such
// as unmount) -- confirm against the full source.
3238 mount_cond
.notify_all();
// Queue a refcount drop of n on *in, to be applied later by
// delay_put_inodes().  Only delay_i_lock is taken here, so this is
// presumably safe to call from contexts that do not hold client_lock --
// confirm against the full source.
3241 void Client::put_inode(Inode
*in
, int n
)
3243 ldout(cct
, 20) << __func__
<< " on " << *in
<< " n = " << n
<< dendl
;
3245 std::scoped_lock
dl(delay_i_lock
);
3246 delay_i_release
[in
] += n
;
3249 void Client::close_dir(Dir
*dir
)
3251 Inode
*in
= dir
->parent_inode
;
3252 ldout(cct
, 15) << __func__
<< " dir " << dir
<< " on " << in
<< dendl
;
3253 ceph_assert(dir
->is_empty());
3254 ceph_assert(in
->dir
== dir
);
3255 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
3256 if (!in
->dentries
.empty())
3257 in
->get_first_parent()->put(); // unpin dentry
3261 put_inode(in
); // unpin inode
3265 * Don't call this with in==NULL, use get_or_create for that
3266 * leave dn set to default NULL unless you're trying to add
3267 * a new inode to a pre-created Dentry
3269 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
3272 // create a new Dentry
3273 dn
= new Dentry(dir
, name
);
3275 lru
.lru_insert_mid(dn
); // mid or top?
3277 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3278 << " dn " << dn
<< " (new dn)" << dendl
;
3280 ceph_assert(!dn
->inode
);
3281 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3282 << " dn " << dn
<< " (old dn)" << dendl
;
3285 if (in
) { // link to inode
3287 // only one parent for directories!
3288 if (in
->is_dir() && !in
->dentries
.empty()) {
3289 tmp_ref
= in
; // prevent unlink below from freeing the inode.
3290 Dentry
*olddn
= in
->get_first_parent();
3291 ceph_assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3292 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3293 clear_dir_complete_and_ordered(old_diri
, true);
3294 unlink(olddn
, true, true); // keep dir, dentry
3299 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3305 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
3307 InodeRef
in(dn
->inode
);
3308 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3309 << " inode " << dn
->inode
<< dendl
;
3311 // unlink from inode
3315 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3321 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3331 if (dir
->is_empty() && !keepdir
)
3337 * For asynchronous flushes, check for errors from the IO and
3338 * update the inode if necessary
3340 class C_Client_FlushComplete
: public Context
{
3345 C_Client_FlushComplete(Client
*c
, Inode
*in
) : client(c
), inode(in
) { }
3346 void finish(int r
) override
{
3347 ceph_assert(ceph_mutex_is_locked_by_me(client
->client_lock
));
3349 client_t
const whoami
= client
->whoami
; // For the benefit of ldout prefix
3350 ldout(client
->cct
, 1) << "I/O error from flush on inode " << inode
3351 << " 0x" << std::hex
<< inode
->ino
<< std::dec
3352 << ": " << r
<< "(" << cpp_strerror(r
) << ")" << dendl
;
3353 inode
->set_async_err(r
);
3363 void Client::get_cap_ref(Inode
*in
, int cap
)
3365 if ((cap
& CEPH_CAP_FILE_BUFFER
) &&
3366 in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] == 0) {
3367 ldout(cct
, 5) << __func__
<< " got first FILE_BUFFER ref on " << *in
<< dendl
;
3370 if ((cap
& CEPH_CAP_FILE_CACHE
) &&
3371 in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3372 ldout(cct
, 5) << __func__
<< " got first FILE_CACHE ref on " << *in
<< dendl
;
3375 in
->get_cap_ref(cap
);
3378 void Client::put_cap_ref(Inode
*in
, int cap
)
3380 int last
= in
->put_cap_ref(cap
);
3383 int drop
= last
& ~in
->caps_issued();
3384 if (in
->snapid
== CEPH_NOSNAP
) {
3385 if ((last
& (CEPH_CAP_FILE_WR
| CEPH_CAP_FILE_BUFFER
)) &&
3386 !in
->cap_snaps
.empty() &&
3387 in
->cap_snaps
.rbegin()->second
.writing
) {
3388 ldout(cct
, 10) << __func__
<< " finishing pending cap_snap on " << *in
<< dendl
;
3389 in
->cap_snaps
.rbegin()->second
.writing
= 0;
3390 finish_cap_snap(in
, in
->cap_snaps
.rbegin()->second
, get_caps_used(in
));
3391 signal_cond_list(in
->waitfor_caps
); // wake up blocked sync writers
3393 if (last
& CEPH_CAP_FILE_BUFFER
) {
3394 for (auto &p
: in
->cap_snaps
)
3395 p
.second
.dirty_data
= 0;
3396 signal_cond_list(in
->waitfor_commit
);
3397 ldout(cct
, 5) << __func__
<< " dropped last FILE_BUFFER ref on " << *in
<< dendl
;
3401 if (last
& CEPH_CAP_FILE_CACHE
) {
3402 ldout(cct
, 5) << __func__
<< " dropped last FILE_CACHE ref on " << *in
<< dendl
;
3408 put_inode(in
, put_nref
);
3412 // get caps for a given file handle -- the inode should have @need caps
3413 // issued by the mds and @want caps not revoked (or not under revocation).
3414 // this routine blocks till the cap requirement is satisfied. also account
3415 // (track) for capability hit when required (when cap requirement succeedes).
3416 int Client::get_caps(Fh
*fh
, int need
, int want
, int *phave
, loff_t endoff
)
3418 Inode
*in
= fh
->inode
.get();
3420 int r
= check_pool_perm(in
, need
);
3425 int file_wanted
= in
->caps_file_wanted();
3426 if ((file_wanted
& need
) != need
) {
3427 ldout(cct
, 10) << "get_caps " << *in
<< " need " << ccap_string(need
)
3428 << " file_wanted " << ccap_string(file_wanted
) << ", EBADF "
3430 return -CEPHFS_EBADF
;
3433 if ((fh
->mode
& CEPH_FILE_MODE_WR
) && fh
->gen
!= fd_gen
)
3434 return -CEPHFS_EBADF
;
3436 if ((in
->flags
& I_ERROR_FILELOCK
) && fh
->has_any_filelocks())
3440 int have
= in
->caps_issued(&implemented
);
3442 bool waitfor_caps
= false;
3443 bool waitfor_commit
= false;
3445 if (have
& need
& CEPH_CAP_FILE_WR
) {
3447 if ((endoff
>= (loff_t
)in
->max_size
||
3448 endoff
> (loff_t
)(in
->size
<< 1)) &&
3449 endoff
> (loff_t
)in
->wanted_max_size
) {
3450 ldout(cct
, 10) << "wanted_max_size " << in
->wanted_max_size
<< " -> " << endoff
<< dendl
;
3451 in
->wanted_max_size
= endoff
;
3453 if (in
->wanted_max_size
> in
->max_size
&&
3454 in
->wanted_max_size
> in
->requested_max_size
)
3458 if (endoff
>= 0 && endoff
> (loff_t
)in
->max_size
) {
3459 ldout(cct
, 10) << "waiting on max_size, endoff " << endoff
<< " max_size " << in
->max_size
<< " on " << *in
<< dendl
;
3460 waitfor_caps
= true;
3462 if (!in
->cap_snaps
.empty()) {
3463 if (in
->cap_snaps
.rbegin()->second
.writing
) {
3464 ldout(cct
, 10) << "waiting on cap_snap write to complete" << dendl
;
3465 waitfor_caps
= true;
3467 for (auto &p
: in
->cap_snaps
) {
3468 if (p
.second
.dirty_data
) {
3469 waitfor_commit
= true;
3473 if (waitfor_commit
) {
3474 _flush(in
, new C_Client_FlushComplete(this, in
));
3475 ldout(cct
, 10) << "waiting for WRBUFFER to get dropped" << dendl
;
3480 if (!waitfor_caps
&& !waitfor_commit
) {
3481 if ((have
& need
) == need
) {
3482 int revoking
= implemented
& ~have
;
3483 ldout(cct
, 10) << "get_caps " << *in
<< " have " << ccap_string(have
)
3484 << " need " << ccap_string(need
) << " want " << ccap_string(want
)
3485 << " revoking " << ccap_string(revoking
)
3487 if ((revoking
& want
) == 0) {
3488 *phave
= need
| (have
& want
);
3489 in
->get_cap_ref(need
);
3494 ldout(cct
, 10) << "waiting for caps " << *in
<< " need " << ccap_string(need
) << " want " << ccap_string(want
) << dendl
;
3495 waitfor_caps
= true;
3498 if ((need
& CEPH_CAP_FILE_WR
) && in
->auth_cap
&&
3499 in
->auth_cap
->session
->readonly
)
3500 return -CEPHFS_EROFS
;
3502 if (in
->flags
& I_CAP_DROPPED
) {
3503 int mds_wanted
= in
->caps_mds_wanted();
3504 if ((mds_wanted
& need
) != need
) {
3505 int ret
= _renew_caps(in
);
3510 if (!(file_wanted
& ~mds_wanted
))
3511 in
->flags
&= ~I_CAP_DROPPED
;
3515 wait_on_list(in
->waitfor_caps
);
3516 else if (waitfor_commit
)
3517 wait_on_list(in
->waitfor_commit
);
3521 int Client::get_caps_used(Inode
*in
)
3523 unsigned used
= in
->caps_used();
3524 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3525 !objectcacher
->set_is_empty(&in
->oset
))
3526 used
|= CEPH_CAP_FILE_CACHE
;
3530 void Client::cap_delay_requeue(Inode
*in
)
3532 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3533 in
->hold_caps_until
= ceph_clock_now();
3534 in
->hold_caps_until
+= cct
->_conf
->client_caps_release_delay
;
3535 delayed_list
.push_back(&in
->delay_cap_item
);
3538 void Client::send_cap(Inode
*in
, MetaSession
*session
, Cap
*cap
,
3539 int flags
, int used
, int want
, int retain
,
3540 int flush
, ceph_tid_t flush_tid
)
3542 int held
= cap
->issued
| cap
->implemented
;
3543 int revoking
= cap
->implemented
& ~cap
->issued
;
3544 retain
&= ~revoking
;
3545 int dropping
= cap
->issued
& ~retain
;
3546 int op
= CEPH_CAP_OP_UPDATE
;
3548 ldout(cct
, 10) << __func__
<< " " << *in
3549 << " mds." << session
->mds_num
<< " seq " << cap
->seq
3550 << " used " << ccap_string(used
)
3551 << " want " << ccap_string(want
)
3552 << " flush " << ccap_string(flush
)
3553 << " retain " << ccap_string(retain
)
3554 << " held "<< ccap_string(held
)
3555 << " revoking " << ccap_string(revoking
)
3556 << " dropping " << ccap_string(dropping
)
3559 if (cct
->_conf
->client_inject_release_failure
&& revoking
) {
3560 const int would_have_issued
= cap
->issued
& retain
;
3561 const int would_have_implemented
= cap
->implemented
& (cap
->issued
| used
);
3563 // - tell the server we think issued is whatever they issued plus whatever we implemented
3564 // - leave what we have implemented in place
3565 ldout(cct
, 20) << __func__
<< " injecting failure to release caps" << dendl
;
3566 cap
->issued
= cap
->issued
| cap
->implemented
;
3568 // Make an exception for revoking xattr caps: we are injecting
3569 // failure to release other caps, but allow xattr because client
3570 // will block on xattr ops if it can't release these to MDS (#9800)
3571 const int xattr_mask
= CEPH_CAP_XATTR_SHARED
| CEPH_CAP_XATTR_EXCL
;
3572 cap
->issued
^= xattr_mask
& revoking
;
3573 cap
->implemented
^= xattr_mask
& revoking
;
3575 ldout(cct
, 20) << __func__
<< " issued " << ccap_string(cap
->issued
) << " vs " << ccap_string(would_have_issued
) << dendl
;
3576 ldout(cct
, 20) << __func__
<< " implemented " << ccap_string(cap
->implemented
) << " vs " << ccap_string(would_have_implemented
) << dendl
;
3579 cap
->issued
&= retain
;
3580 cap
->implemented
&= cap
->issued
| used
;
3583 snapid_t follows
= 0;
3586 follows
= in
->snaprealm
->get_snap_context().seq
;
3588 auto m
= make_message
<MClientCaps
>(op
,
3591 cap
->cap_id
, cap
->seq
,
3597 m
->caller_uid
= in
->cap_dirtier_uid
;
3598 m
->caller_gid
= in
->cap_dirtier_gid
;
3600 m
->head
.issue_seq
= cap
->issue_seq
;
3601 m
->set_tid(flush_tid
);
3603 m
->head
.uid
= in
->uid
;
3604 m
->head
.gid
= in
->gid
;
3605 m
->head
.mode
= in
->mode
;
3607 m
->head
.nlink
= in
->nlink
;
3609 if (flush
& CEPH_CAP_XATTR_EXCL
) {
3610 encode(in
->xattrs
, m
->xattrbl
);
3611 m
->head
.xattr_version
= in
->xattr_version
;
3615 m
->max_size
= in
->max_size
;
3616 m
->truncate_seq
= in
->truncate_seq
;
3617 m
->truncate_size
= in
->truncate_size
;
3618 m
->mtime
= in
->mtime
;
3619 m
->atime
= in
->atime
;
3620 m
->ctime
= in
->ctime
;
3621 m
->btime
= in
->btime
;
3622 m
->time_warp_seq
= in
->time_warp_seq
;
3623 m
->change_attr
= in
->change_attr
;
3625 if (!(flags
& MClientCaps::FLAG_PENDING_CAPSNAP
) &&
3626 !in
->cap_snaps
.empty() &&
3627 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3628 flags
|= MClientCaps::FLAG_PENDING_CAPSNAP
;
3631 if (flush
& CEPH_CAP_FILE_WR
) {
3632 m
->inline_version
= in
->inline_version
;
3633 m
->inline_data
= in
->inline_data
;
3636 in
->reported_size
= in
->size
;
3637 m
->set_snap_follows(follows
);
3639 if (cap
== in
->auth_cap
) {
3640 if (want
& CEPH_CAP_ANY_FILE_WR
) {
3641 m
->set_max_size(in
->wanted_max_size
);
3642 in
->requested_max_size
= in
->wanted_max_size
;
3643 ldout(cct
, 15) << "auth cap, requesting max_size " << in
->requested_max_size
<< dendl
;
3645 in
->requested_max_size
= 0;
3646 ldout(cct
, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl
;
3650 if (!session
->flushing_caps_tids
.empty())
3651 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3653 session
->con
->send_message2(std::move(m
));
3656 static bool is_max_size_approaching(Inode
*in
)
3658 /* mds will adjust max size according to the reported size */
3659 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3661 if (in
->size
>= in
->max_size
)
3663 /* half of previous max_size increment has been used */
3664 if (in
->max_size
> in
->reported_size
&&
3665 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3670 static int adjust_caps_used_for_lazyio(int used
, int issued
, int implemented
)
3672 if (!(used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
)))
3674 if (!(implemented
& CEPH_CAP_FILE_LAZYIO
))
3677 if (issued
& CEPH_CAP_FILE_LAZYIO
) {
3678 if (!(issued
& CEPH_CAP_FILE_CACHE
)) {
3679 used
&= ~CEPH_CAP_FILE_CACHE
;
3680 used
|= CEPH_CAP_FILE_LAZYIO
;
3682 if (!(issued
& CEPH_CAP_FILE_BUFFER
)) {
3683 used
&= ~CEPH_CAP_FILE_BUFFER
;
3684 used
|= CEPH_CAP_FILE_LAZYIO
;
3687 if (!(implemented
& CEPH_CAP_FILE_CACHE
)) {
3688 used
&= ~CEPH_CAP_FILE_CACHE
;
3689 used
|= CEPH_CAP_FILE_LAZYIO
;
3691 if (!(implemented
& CEPH_CAP_FILE_BUFFER
)) {
3692 used
&= ~CEPH_CAP_FILE_BUFFER
;
3693 used
|= CEPH_CAP_FILE_LAZYIO
;
3702 * Examine currently used and wanted versus held caps. Release, flush or ack
3703 * revoked caps to the MDS as appropriate.
3705 * @param in the inode to check
3706 * @param flags flags to apply to cap check
3708 void Client::check_caps(Inode
*in
, unsigned flags
)
3710 unsigned wanted
= in
->caps_wanted();
3711 unsigned used
= get_caps_used(in
);
3715 int issued
= in
->caps_issued(&implemented
);
3716 int revoking
= implemented
& ~issued
;
3718 int orig_used
= used
;
3719 used
= adjust_caps_used_for_lazyio(used
, issued
, implemented
);
3721 int retain
= wanted
| used
| CEPH_CAP_PIN
;
3722 if (!is_unmounting() && in
->nlink
> 0) {
3724 retain
|= CEPH_CAP_ANY
;
3725 } else if (in
->is_dir() &&
3726 (issued
& CEPH_CAP_FILE_SHARED
) &&
3727 (in
->flags
& I_COMPLETE
)) {
3728 // we do this here because we don't want to drop to Fs (and then
3729 // drop the Fs if we do a create!) if that alone makes us send lookups
3730 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3731 wanted
= CEPH_CAP_ANY_SHARED
| CEPH_CAP_FILE_EXCL
;
3734 retain
|= CEPH_CAP_ANY_SHARED
;
3735 // keep RD only if we didn't have the file open RW,
3736 // because then the mds would revoke it anyway to
3737 // journal max_size=0.
3738 if (in
->max_size
== 0)
3739 retain
|= CEPH_CAP_ANY_RD
;
3743 ldout(cct
, 10) << __func__
<< " on " << *in
3744 << " wanted " << ccap_string(wanted
)
3745 << " used " << ccap_string(used
)
3746 << " issued " << ccap_string(issued
)
3747 << " revoking " << ccap_string(revoking
)
3748 << " flags=" << flags
3751 if (in
->snapid
!= CEPH_NOSNAP
)
3752 return; //snap caps last forever, can't write
3754 if (in
->caps
.empty())
3755 return; // guard if at end of func
3757 if (!(orig_used
& CEPH_CAP_FILE_BUFFER
) &&
3758 (revoking
& used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
3760 used
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
3763 for (auto &[mds
, cap
] : in
->caps
) {
3764 auto session
= mds_sessions
.at(mds
);
3767 if (in
->auth_cap
&& &cap
!= in
->auth_cap
)
3768 cap_used
&= ~in
->auth_cap
->issued
;
3770 revoking
= cap
.implemented
& ~cap
.issued
;
3772 ldout(cct
, 10) << " cap mds." << mds
3773 << " issued " << ccap_string(cap
.issued
)
3774 << " implemented " << ccap_string(cap
.implemented
)
3775 << " revoking " << ccap_string(revoking
) << dendl
;
3777 if (in
->wanted_max_size
> in
->max_size
&&
3778 in
->wanted_max_size
> in
->requested_max_size
&&
3779 &cap
== in
->auth_cap
)
3782 /* approaching file_max? */
3783 if ((cap
.issued
& CEPH_CAP_FILE_WR
) &&
3784 &cap
== in
->auth_cap
&&
3785 is_max_size_approaching(in
)) {
3786 ldout(cct
, 10) << "size " << in
->size
<< " approaching max_size " << in
->max_size
3787 << ", reported " << in
->reported_size
<< dendl
;
3791 /* completed revocation? */
3792 if (revoking
&& (revoking
& cap_used
) == 0) {
3793 ldout(cct
, 10) << "completed revocation of " << ccap_string(cap
.implemented
& ~cap
.issued
) << dendl
;
3797 /* want more caps from mds? */
3798 if (wanted
& ~(cap
.wanted
| cap
.issued
))
3801 if (!revoking
&& is_unmounting() && (cap_used
== 0))
3804 if ((cap
.issued
& ~retain
) == 0 && // and we don't have anything we wouldn't like
3805 !in
->dirty_caps
) // and we have no dirty caps
3808 if (!(flags
& CHECK_CAPS_NODELAY
)) {
3809 ldout(cct
, 10) << "delaying cap release" << dendl
;
3810 cap_delay_requeue(in
);
3815 if (&cap
== in
->auth_cap
) {
3816 if (in
->flags
& I_KICK_FLUSH
) {
3817 ldout(cct
, 20) << " reflushing caps (check_caps) on " << *in
3818 << " to mds." << mds
<< dendl
;
3819 kick_flushing_caps(in
, session
.get());
3821 if (!in
->cap_snaps
.empty() &&
3822 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3828 ceph_tid_t flush_tid
;
3829 if (in
->auth_cap
== &cap
&& in
->dirty_caps
) {
3830 flushing
= mark_caps_flushing(in
, &flush_tid
);
3831 if (flags
& CHECK_CAPS_SYNCHRONOUS
)
3832 msg_flags
|= MClientCaps::FLAG_SYNC
;
3838 in
->delay_cap_item
.remove_myself();
3839 send_cap(in
, session
.get(), &cap
, msg_flags
, cap_used
, wanted
, retain
,
3840 flushing
, flush_tid
);
3845 void Client::queue_cap_snap(Inode
*in
, SnapContext
& old_snapc
)
3847 int used
= get_caps_used(in
);
3848 int dirty
= in
->caps_dirty();
3849 ldout(cct
, 10) << __func__
<< " " << *in
<< " snapc " << old_snapc
<< " used " << ccap_string(used
) << dendl
;
3851 if (in
->cap_snaps
.size() &&
3852 in
->cap_snaps
.rbegin()->second
.writing
) {
3853 ldout(cct
, 10) << __func__
<< " already have pending cap_snap on " << *in
<< dendl
;
3855 } else if (in
->caps_dirty() ||
3856 (used
& CEPH_CAP_FILE_WR
) ||
3857 (dirty
& CEPH_CAP_ANY_WR
)) {
3858 const auto &capsnapem
= in
->cap_snaps
.emplace(std::piecewise_construct
, std::make_tuple(old_snapc
.seq
), std::make_tuple(in
));
3859 ceph_assert(capsnapem
.second
); /* element inserted */
3860 CapSnap
&capsnap
= capsnapem
.first
->second
;
3861 capsnap
.context
= old_snapc
;
3862 capsnap
.issued
= in
->caps_issued();
3863 capsnap
.dirty
= in
->caps_dirty();
3865 capsnap
.dirty_data
= (used
& CEPH_CAP_FILE_BUFFER
);
3867 capsnap
.uid
= in
->uid
;
3868 capsnap
.gid
= in
->gid
;
3869 capsnap
.mode
= in
->mode
;
3870 capsnap
.btime
= in
->btime
;
3871 capsnap
.xattrs
= in
->xattrs
;
3872 capsnap
.xattr_version
= in
->xattr_version
;
3873 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3874 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3876 if (used
& CEPH_CAP_FILE_WR
) {
3877 ldout(cct
, 10) << __func__
<< " WR used on " << *in
<< dendl
;
3878 capsnap
.writing
= 1;
3880 finish_cap_snap(in
, capsnap
, used
);
3883 ldout(cct
, 10) << __func__
<< " not dirty|writing on " << *in
<< dendl
;
3887 void Client::finish_cap_snap(Inode
*in
, CapSnap
&capsnap
, int used
)
3889 ldout(cct
, 10) << __func__
<< " " << *in
<< " capsnap " << (void *)&capsnap
<< " used " << ccap_string(used
) << dendl
;
3890 capsnap
.size
= in
->size
;
3891 capsnap
.mtime
= in
->mtime
;
3892 capsnap
.atime
= in
->atime
;
3893 capsnap
.ctime
= in
->ctime
;
3894 capsnap
.time_warp_seq
= in
->time_warp_seq
;
3895 capsnap
.change_attr
= in
->change_attr
;
3896 capsnap
.dirty
|= in
->caps_dirty();
3898 /* Only reset it if it wasn't set before */
3899 if (capsnap
.cap_dirtier_uid
== -1) {
3900 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3901 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3904 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3905 capsnap
.inline_data
= in
->inline_data
;
3906 capsnap
.inline_version
= in
->inline_version
;
3909 if (used
& CEPH_CAP_FILE_BUFFER
) {
3910 capsnap
.writing
= 1;
3911 ldout(cct
, 10) << __func__
<< " " << *in
<< " cap_snap " << &capsnap
<< " used " << used
3912 << " WRBUFFER, delaying" << dendl
;
3914 capsnap
.dirty_data
= 0;
3919 void Client::send_flush_snap(Inode
*in
, MetaSession
*session
,
3920 snapid_t follows
, CapSnap
& capsnap
)
3922 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_FLUSHSNAP
,
3923 in
->ino
, in
->snaprealm
->ino
, 0,
3924 in
->auth_cap
->mseq
, cap_epoch_barrier
);
3925 m
->caller_uid
= capsnap
.cap_dirtier_uid
;
3926 m
->caller_gid
= capsnap
.cap_dirtier_gid
;
3928 m
->set_client_tid(capsnap
.flush_tid
);
3929 m
->head
.snap_follows
= follows
;
3931 m
->head
.caps
= capsnap
.issued
;
3932 m
->head
.dirty
= capsnap
.dirty
;
3934 m
->head
.uid
= capsnap
.uid
;
3935 m
->head
.gid
= capsnap
.gid
;
3936 m
->head
.mode
= capsnap
.mode
;
3937 m
->btime
= capsnap
.btime
;
3939 m
->size
= capsnap
.size
;
3941 m
->head
.xattr_version
= capsnap
.xattr_version
;
3942 encode(capsnap
.xattrs
, m
->xattrbl
);
3944 m
->ctime
= capsnap
.ctime
;
3945 m
->btime
= capsnap
.btime
;
3946 m
->mtime
= capsnap
.mtime
;
3947 m
->atime
= capsnap
.atime
;
3948 m
->time_warp_seq
= capsnap
.time_warp_seq
;
3949 m
->change_attr
= capsnap
.change_attr
;
3951 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3952 m
->inline_version
= in
->inline_version
;
3953 m
->inline_data
= in
->inline_data
;
3956 ceph_assert(!session
->flushing_caps_tids
.empty());
3957 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3959 session
->con
->send_message2(std::move(m
));
3962 void Client::flush_snaps(Inode
*in
)
3964 ldout(cct
, 10) << "flush_snaps on " << *in
<< dendl
;
3965 ceph_assert(in
->cap_snaps
.size());
3968 ceph_assert(in
->auth_cap
);
3969 MetaSession
*session
= in
->auth_cap
->session
;
3971 for (auto &p
: in
->cap_snaps
) {
3972 CapSnap
&capsnap
= p
.second
;
3973 // only do new flush
3974 if (capsnap
.flush_tid
> 0)
3977 ldout(cct
, 10) << "flush_snaps mds." << session
->mds_num
3978 << " follows " << p
.first
3979 << " size " << capsnap
.size
3980 << " mtime " << capsnap
.mtime
3981 << " dirty_data=" << capsnap
.dirty_data
3982 << " writing=" << capsnap
.writing
3983 << " on " << *in
<< dendl
;
3984 if (capsnap
.dirty_data
|| capsnap
.writing
)
3987 capsnap
.flush_tid
= ++last_flush_tid
;
3988 session
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
3989 in
->flushing_cap_tids
[capsnap
.flush_tid
] = 0;
3990 if (!in
->flushing_cap_item
.is_on_list())
3991 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
3993 send_flush_snap(in
, session
, p
.first
, capsnap
);
3997 void Client::wait_on_list(list
<ceph::condition_variable
*>& ls
)
3999 ceph::condition_variable cond
;
4000 ls
.push_back(&cond
);
4001 std::unique_lock l
{client_lock
, std::adopt_lock
};
4007 void Client::signal_cond_list(list
<ceph::condition_variable
*>& ls
)
4009 for (auto cond
: ls
) {
4014 void Client::wait_on_context_list(list
<Context
*>& ls
)
4016 ceph::condition_variable cond
;
4019 ls
.push_back(new C_Cond(cond
, &done
, &r
));
4020 std::unique_lock l
{client_lock
, std::adopt_lock
};
4021 cond
.wait(l
, [&done
] { return done
;});
4025 void Client::signal_context_list(list
<Context
*>& ls
)
4027 while (!ls
.empty()) {
4028 ls
.front()->complete(0);
4033 void Client::wake_up_session_caps(MetaSession
*s
, bool reconnect
)
4035 for (const auto &cap
: s
->caps
) {
4036 auto &in
= cap
->inode
;
4038 in
.requested_max_size
= 0;
4039 in
.wanted_max_size
= 0;
4041 if (cap
->gen
< s
->cap_gen
) {
4042 // mds did not re-issue stale cap.
4043 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
4044 // make sure mds knows what we want.
4045 if (in
.caps_file_wanted() & ~cap
->wanted
)
4046 in
.flags
|= I_CAP_DROPPED
;
4049 signal_cond_list(in
.waitfor_caps
);
4054 // flush dirty data (from objectcache)
4056 class C_Client_CacheInvalidate
: public Context
{
4060 int64_t offset
, length
;
4062 C_Client_CacheInvalidate(Client
*c
, Inode
*in
, int64_t off
, int64_t len
) :
4063 client(c
), offset(off
), length(len
) {
4064 if (client
->use_faked_inos())
4065 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4069 void finish(int r
) override
{
4070 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
4071 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4072 client
->_async_invalidate(ino
, offset
, length
);
4076 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
4078 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4079 if (!mref_reader
.is_state_satisfied())
4082 ldout(cct
, 10) << __func__
<< " " << ino
<< " " << off
<< "~" << len
<< dendl
;
4083 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
4086 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
4088 if (ino_invalidate_cb
)
4089 // we queue the invalidate, which calls the callback and decrements the ref
4090 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
4093 void Client::_invalidate_inode_cache(Inode
*in
)
4095 ldout(cct
, 10) << __func__
<< " " << *in
<< dendl
;
4097 // invalidate our userspace inode cache
4098 if (cct
->_conf
->client_oc
) {
4099 objectcacher
->release_set(&in
->oset
);
4100 if (!objectcacher
->set_is_empty(&in
->oset
))
4101 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
4104 _schedule_invalidate_callback(in
, 0, 0);
4107 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
4109 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
4111 // invalidate our userspace inode cache
4112 if (cct
->_conf
->client_oc
) {
4113 vector
<ObjectExtent
> ls
;
4114 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
4115 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
4118 _schedule_invalidate_callback(in
, off
, len
);
4121 bool Client::_release(Inode
*in
)
4123 ldout(cct
, 20) << "_release " << *in
<< dendl
;
4124 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
4125 _invalidate_inode_cache(in
);
4131 bool Client::_flush(Inode
*in
, Context
*onfinish
)
4133 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
4135 if (!in
->oset
.dirty_or_tx
) {
4136 ldout(cct
, 10) << " nothing to flush" << dendl
;
4137 onfinish
->complete(0);
4141 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
4142 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
4143 objectcacher
->purge_set(&in
->oset
);
4145 onfinish
->complete(-CEPHFS_ENOSPC
);
4150 return objectcacher
->flush_set(&in
->oset
, onfinish
);
4153 void Client::_flush_range(Inode
*in
, int64_t offset
, uint64_t size
)
4155 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
4156 if (!in
->oset
.dirty_or_tx
) {
4157 ldout(cct
, 10) << " nothing to flush" << dendl
;
4161 C_SaferCond
onflush("Client::_flush_range flock");
4162 bool ret
= objectcacher
->file_flush(&in
->oset
, &in
->layout
, in
->snaprealm
->get_snap_context(),
4163 offset
, size
, &onflush
);
4166 client_lock
.unlock();
4172 void Client::flush_set_callback(ObjectCacher::ObjectSet
*oset
)
4174 // std::scoped_lock l(client_lock);
4175 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
)); // will be called via dispatch() -> objecter -> ...
4176 Inode
*in
= static_cast<Inode
*>(oset
->parent
);
4181 void Client::_flushed(Inode
*in
)
4183 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
4185 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
4190 // checks common to add_update_cap, handle_cap_grant
4191 void Client::check_cap_issue(Inode
*in
, unsigned issued
)
4193 unsigned had
= in
->caps_issued();
4195 if ((issued
& CEPH_CAP_FILE_CACHE
) &&
4196 !(had
& CEPH_CAP_FILE_CACHE
))
4199 if ((issued
& CEPH_CAP_FILE_SHARED
) !=
4200 (had
& CEPH_CAP_FILE_SHARED
)) {
4201 if (issued
& CEPH_CAP_FILE_SHARED
)
4204 clear_dir_complete_and_ordered(in
, true);
4208 void Client::add_update_cap(Inode
*in
, MetaSession
*mds_session
, uint64_t cap_id
,
4209 unsigned issued
, unsigned wanted
, unsigned seq
, unsigned mseq
,
4210 inodeno_t realm
, int flags
, const UserPerm
& cap_perms
)
4212 if (!in
->is_any_caps()) {
4213 ceph_assert(in
->snaprealm
== 0);
4214 in
->snaprealm
= get_snap_realm(realm
);
4215 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4216 ldout(cct
, 15) << __func__
<< " first one, opened snaprealm " << in
->snaprealm
<< dendl
;
4218 ceph_assert(in
->snaprealm
);
4219 if ((flags
& CEPH_CAP_FLAG_AUTH
) &&
4220 realm
!= inodeno_t(-1) && in
->snaprealm
->ino
!= realm
) {
4221 in
->snaprealm_item
.remove_myself();
4222 auto oldrealm
= in
->snaprealm
;
4223 in
->snaprealm
= get_snap_realm(realm
);
4224 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4225 put_snap_realm(oldrealm
);
4229 mds_rank_t mds
= mds_session
->mds_num
;
4230 const auto &capem
= in
->caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
), std::forward_as_tuple(*in
, mds_session
));
4231 Cap
&cap
= capem
.first
->second
;
4232 if (!capem
.second
) {
4233 if (cap
.gen
< mds_session
->cap_gen
)
4234 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
4237 * auth mds of the inode changed. we received the cap export
4238 * message, but still haven't received the cap import message.
4239 * handle_cap_export() updated the new auth MDS' cap.
4241 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4242 * a message that was send before the cap import message. So
4243 * don't remove caps.
4245 if (ceph_seq_cmp(seq
, cap
.seq
) <= 0) {
4246 if (&cap
!= in
->auth_cap
)
4247 ldout(cct
, 0) << "WARNING: " << "inode " << *in
<< " caps on mds." << mds
<< " != auth_cap." << dendl
;
4249 ceph_assert(cap
.cap_id
== cap_id
);
4252 issued
|= cap
.issued
;
4253 flags
|= CEPH_CAP_FLAG_AUTH
;
4259 check_cap_issue(in
, issued
);
4261 if (flags
& CEPH_CAP_FLAG_AUTH
) {
4262 if (in
->auth_cap
!= &cap
&&
4263 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
4264 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
4265 ldout(cct
, 10) << __func__
<< " changing auth cap: "
4266 << "add myself to new auth MDS' flushing caps list" << dendl
;
4267 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
4269 in
->auth_cap
= &cap
;
4273 unsigned old_caps
= cap
.issued
;
4274 cap
.cap_id
= cap_id
;
4275 cap
.issued
= issued
;
4276 cap
.implemented
|= issued
;
4277 if (ceph_seq_cmp(mseq
, cap
.mseq
) > 0)
4278 cap
.wanted
= wanted
;
4280 cap
.wanted
|= wanted
;
4282 cap
.issue_seq
= seq
;
4284 cap
.gen
= mds_session
->cap_gen
;
4285 cap
.latest_perms
= cap_perms
;
4286 ldout(cct
, 10) << __func__
<< " issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
.issued
)
4287 << " from mds." << mds
4291 if ((issued
& ~old_caps
) && in
->auth_cap
== &cap
) {
4292 // non-auth MDS is revoking the newly grant caps ?
4293 for (auto &p
: in
->caps
) {
4294 if (&p
.second
== &cap
)
4296 if (p
.second
.implemented
& ~p
.second
.issued
& issued
) {
4297 check_caps(in
, CHECK_CAPS_NODELAY
);
4303 if (issued
& ~old_caps
)
4304 signal_cond_list(in
->waitfor_caps
);
4307 void Client::remove_cap(Cap
*cap
, bool queue_release
)
4309 auto &in
= cap
->inode
;
4310 MetaSession
*session
= cap
->session
;
4311 mds_rank_t mds
= cap
->session
->mds_num
;
4313 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " on " << in
<< dendl
;
4315 if (queue_release
) {
4316 session
->enqueue_cap_release(
4327 if (in
.auth_cap
== cap
) {
4328 if (in
.flushing_cap_item
.is_on_list()) {
4329 ldout(cct
, 10) << " removing myself from flushing_cap list" << dendl
;
4330 in
.flushing_cap_item
.remove_myself();
4334 size_t n
= in
.caps
.erase(mds
);
4335 ceph_assert(n
== 1);
4338 if (!in
.is_any_caps()) {
4339 ldout(cct
, 15) << __func__
<< " last one, closing snaprealm " << in
.snaprealm
<< dendl
;
4340 in
.snaprealm_item
.remove_myself();
4341 put_snap_realm(in
.snaprealm
);
4346 void Client::remove_all_caps(Inode
*in
)
4348 while (!in
->caps
.empty())
4349 remove_cap(&in
->caps
.begin()->second
, true);
4352 void Client::remove_session_caps(MetaSession
*s
, int err
)
4354 ldout(cct
, 10) << __func__
<< " mds." << s
->mds_num
<< dendl
;
4356 while (s
->caps
.size()) {
4357 Cap
*cap
= *s
->caps
.begin();
4358 InodeRef
in(&cap
->inode
);
4359 bool dirty_caps
= false;
4360 if (in
->auth_cap
== cap
) {
4361 dirty_caps
= in
->dirty_caps
| in
->flushing_caps
;
4362 in
->wanted_max_size
= 0;
4363 in
->requested_max_size
= 0;
4364 if (in
->has_any_filelocks())
4365 in
->flags
|= I_ERROR_FILELOCK
;
4367 auto caps
= cap
->implemented
;
4368 if (cap
->wanted
| cap
->issued
)
4369 in
->flags
|= I_CAP_DROPPED
;
4370 remove_cap(cap
, false);
4371 in
->cap_snaps
.clear();
4373 lderr(cct
) << __func__
<< " still has dirty|flushing caps on " << *in
<< dendl
;
4374 if (in
->flushing_caps
) {
4375 num_flushing_caps
--;
4376 in
->flushing_cap_tids
.clear();
4378 in
->flushing_caps
= 0;
4379 in
->mark_caps_clean();
4380 put_inode(in
.get());
4382 caps
&= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
;
4383 if (caps
&& !in
->caps_issued_mask(caps
, true)) {
4384 if (err
== -CEPHFS_EBLOCKLISTED
) {
4385 if (in
->oset
.dirty_or_tx
) {
4386 lderr(cct
) << __func__
<< " still has dirty data on " << *in
<< dendl
;
4387 in
->set_async_err(err
);
4389 objectcacher
->purge_set(&in
->oset
);
4391 objectcacher
->release_set(&in
->oset
);
4393 _schedule_invalidate_callback(in
.get(), 0, 0);
4396 signal_cond_list(in
->waitfor_caps
);
4398 s
->flushing_caps_tids
.clear();
4399 sync_cond
.notify_all();
4402 std::pair
<int, bool> Client::_do_remount(bool retry_on_error
)
4404 uint64_t max_retries
= cct
->_conf
.get_val
<uint64_t>("mds_max_retries_on_remount_failure");
4405 bool abort_on_failure
= false;
4408 int r
= remount_cb(callback_handle
);
4410 retries_on_invalidate
= 0;
4413 client_t whoami
= get_nodeid();
4416 "failed to remount (to trim kernel dentries): "
4417 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4420 "failed to remount (to trim kernel dentries): "
4421 "return code = " << r
<< dendl
;
4424 (cct
->_conf
.get_val
<bool>("client_die_on_failed_remount") ||
4425 cct
->_conf
.get_val
<bool>("client_die_on_failed_dentry_invalidate")) &&
4426 !(retry_on_error
&& (++retries_on_invalidate
< max_retries
));
4427 if (should_abort
&& !is_unmounting()) {
4428 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4429 abort_on_failure
= true;
4432 return std::make_pair(r
, abort_on_failure
);
4435 class C_Client_Remount
: public Context
{
4439 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4440 void finish(int r
) override
{
4441 ceph_assert(r
== 0);
4442 client
->_do_remount(true);
4446 void Client::_invalidate_kernel_dcache()
4448 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4449 if (!mref_reader
.is_state_satisfied())
4452 if (can_invalidate_dentries
) {
4453 if (dentry_invalidate_cb
&& root
->dir
) {
4454 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4455 p
!= root
->dir
->dentries
.end();
4457 if (p
->second
->inode
)
4458 _schedule_invalidate_dentry_callback(p
->second
, false);
4461 } else if (remount_cb
) {
4463 // when remounting a file system, linux kernel trims all unused dentries in the fs
4464 remount_finisher
.queue(new C_Client_Remount(this));
4468 void Client::_trim_negative_child_dentries(InodeRef
& in
)
4474 if (dir
&& dir
->dentries
.size() == dir
->num_null_dentries
) {
4475 for (auto p
= dir
->dentries
.begin(); p
!= dir
->dentries
.end(); ) {
4476 Dentry
*dn
= p
->second
;
4478 ceph_assert(!dn
->inode
);
4479 if (dn
->lru_is_expireable())
4480 unlink(dn
, true, false); // keep dir, drop dentry
4482 if (dir
->dentries
.empty()) {
4487 if (in
->flags
& I_SNAPDIR_OPEN
) {
4488 InodeRef snapdir
= open_snapdir(in
.get());
4489 _trim_negative_child_dentries(snapdir
);
4493 class C_Client_CacheRelease
: public Context
{
4498 C_Client_CacheRelease(Client
*c
, Inode
*in
) :
4500 if (client
->use_faked_inos())
4501 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4505 void finish(int r
) override
{
4506 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4507 client
->_async_inode_release(ino
);
4511 void Client::_async_inode_release(vinodeno_t ino
)
4513 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4514 if (!mref_reader
.is_state_satisfied())
4517 ldout(cct
, 10) << __func__
<< " " << ino
<< dendl
;
4518 ino_release_cb(callback_handle
, ino
);
4521 void Client::_schedule_ino_release_callback(Inode
*in
) {
4524 // we queue the invalidate, which calls the callback and decrements the ref
4525 async_ino_releasor
.queue(new C_Client_CacheRelease(this, in
));
4528 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4530 mds_rank_t mds
= s
->mds_num
;
4531 size_t caps_size
= s
->caps
.size();
4532 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4533 << " caps " << caps_size
<< dendl
;
4535 uint64_t trimmed
= 0;
4536 auto p
= s
->caps
.begin();
4537 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4538 * looking at from getting deleted during traversal. */
4539 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4541 InodeRef
in(&cap
->inode
);
4543 // Increment p early because it will be invalidated if cap
4544 // is deleted inside remove_cap
4547 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4548 int mine
= cap
->issued
| cap
->implemented
;
4549 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4550 // disposable non-auth cap
4551 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4552 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4553 cap
= (remove_cap(cap
, true), nullptr);
4557 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4558 _trim_negative_child_dentries(in
);
4560 auto q
= in
->dentries
.begin();
4561 while (q
!= in
->dentries
.end()) {
4564 if (dn
->lru_is_expireable()) {
4565 if (can_invalidate_dentries
&&
4566 dn
->dir
->parent_inode
->ino
== CEPH_INO_ROOT
) {
4567 // Only issue one of these per DN for inodes in root: handle
4568 // others more efficiently by calling for root-child DNs at
4569 // the end of this function.
4570 _schedule_invalidate_dentry_callback(dn
, true);
4572 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4575 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4579 if (in
->ll_ref
== 1 && in
->ino
!= CEPH_INO_ROOT
) {
4580 _schedule_ino_release_callback(in
.get());
4582 if (all
&& in
->ino
!= CEPH_INO_ROOT
) {
4583 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4588 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4589 for (const auto &dn
: to_trim
) {
4594 caps_size
= s
->caps
.size();
4595 if (caps_size
> (size_t)max
)
4596 _invalidate_kernel_dcache();
4599 void Client::force_session_readonly(MetaSession
*s
)
4602 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4603 auto &in
= (*p
)->inode
;
4604 if (in
.caps_wanted() & CEPH_CAP_FILE_WR
)
4605 signal_cond_list(in
.waitfor_caps
);
4609 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4611 MetaSession
*session
= in
->auth_cap
->session
;
4613 int flushing
= in
->dirty_caps
;
4614 ceph_assert(flushing
);
4616 ceph_tid_t flush_tid
= ++last_flush_tid
;
4617 in
->flushing_cap_tids
[flush_tid
] = flushing
;
4619 if (!in
->flushing_caps
) {
4620 ldout(cct
, 10) << __func__
<< " " << ccap_string(flushing
) << " " << *in
<< dendl
;
4621 num_flushing_caps
++;
4623 ldout(cct
, 10) << __func__
<< " (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4626 in
->flushing_caps
|= flushing
;
4627 in
->mark_caps_clean();
4629 if (!in
->flushing_cap_item
.is_on_list())
4630 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4631 session
->flushing_caps_tids
.insert(flush_tid
);
4637 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4639 for (auto &p
: in
->cap_snaps
) {
4640 CapSnap
&capsnap
= p
.second
;
4641 if (capsnap
.flush_tid
> 0) {
4642 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4643 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4646 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4647 it
!= in
->flushing_cap_tids
.end();
4649 old_s
->flushing_caps_tids
.erase(it
->first
);
4650 new_s
->flushing_caps_tids
.insert(it
->first
);
4652 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4656 * Flush all the dirty caps back to the MDS. Because the callers
4657 * generally wait on the result of this function (syncfs and umount
4658 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4660 void Client::flush_caps_sync()
4662 ldout(cct
, 10) << __func__
<< dendl
;
4663 for (auto &q
: mds_sessions
) {
4665 xlist
<Inode
*>::iterator p
= s
->dirty_list
.begin();
4667 unsigned flags
= CHECK_CAPS_NODELAY
;
4672 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4673 check_caps(in
, flags
);
4678 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4680 while (in
->flushing_caps
) {
4681 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4682 ceph_assert(it
!= in
->flushing_cap_tids
.end());
4683 if (it
->first
> want
)
4685 ldout(cct
, 10) << __func__
<< " on " << *in
<< " flushing "
4686 << ccap_string(it
->second
) << " want " << want
4687 << " last " << it
->first
<< dendl
;
4688 wait_on_list(in
->waitfor_caps
);
4692 void Client::wait_sync_caps(ceph_tid_t want
)
4695 ldout(cct
, 10) << __func__
<< " want " << want
<< " (last is " << last_flush_tid
<< ", "
4696 << num_flushing_caps
<< " total flushing)" << dendl
;
4697 for (auto &p
: mds_sessions
) {
4699 if (s
->flushing_caps_tids
.empty())
4701 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4702 if (oldest_tid
<= want
) {
4703 ldout(cct
, 10) << " waiting on mds." << p
.first
<< " tid " << oldest_tid
4704 << " (want " << want
<< ")" << dendl
;
4705 std::unique_lock l
{client_lock
, std::adopt_lock
};
4713 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4715 in
->flags
&= ~I_KICK_FLUSH
;
4717 Cap
*cap
= in
->auth_cap
;
4718 ceph_assert(cap
->session
== session
);
4720 ceph_tid_t last_snap_flush
= 0;
4721 for (auto p
= in
->flushing_cap_tids
.rbegin();
4722 p
!= in
->flushing_cap_tids
.rend();
4725 last_snap_flush
= p
->first
;
4730 int wanted
= in
->caps_wanted();
4731 int used
= get_caps_used(in
) | in
->caps_dirty();
4732 auto it
= in
->cap_snaps
.begin();
4733 for (auto& p
: in
->flushing_cap_tids
) {
4735 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4736 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
4739 ceph_assert(it
!= in
->cap_snaps
.end());
4740 ceph_assert(it
->second
.flush_tid
== p
.first
);
4741 send_flush_snap(in
, session
, it
->first
, it
->second
);
4747 void Client::kick_flushing_caps(MetaSession
*session
)
4749 mds_rank_t mds
= session
->mds_num
;
4750 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4752 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4754 if (in
->flags
& I_KICK_FLUSH
) {
4755 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4756 kick_flushing_caps(in
, session
);
4761 void Client::early_kick_flushing_caps(MetaSession
*session
)
4763 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4765 Cap
*cap
= in
->auth_cap
;
4768 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4769 // stage. This guarantees that MDS processes the cap flush message before issuing
4770 // the flushing caps to other client.
4771 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
4772 in
->flags
|= I_KICK_FLUSH
;
4776 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4777 << " to mds." << session
->mds_num
<< dendl
;
4778 // send_reconnect() also will reset these sequence numbers. make sure
4779 // sequence numbers in cap flush message match later reconnect message.
4783 cap
->issued
= cap
->implemented
;
4785 kick_flushing_caps(in
, session
);
4789 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
4794 while (!q
.empty()) {
4798 ldout(cct
, 10) << __func__
<< " " << *realm
<< dendl
;
4799 realm
->invalidate_cache();
4801 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4802 p
!= realm
->pchildren
.end();
4808 SnapRealm
*Client::get_snap_realm(inodeno_t r
)
4810 SnapRealm
*realm
= snap_realms
[r
];
4812 snap_realms
[r
] = realm
= new SnapRealm(r
);
4813 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4818 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4820 if (snap_realms
.count(r
) == 0) {
4821 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
4824 SnapRealm
*realm
= snap_realms
[r
];
4825 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4830 void Client::put_snap_realm(SnapRealm
*realm
)
4832 ldout(cct
, 20) << __func__
<< " " << realm
->ino
<< " " << realm
4833 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
4834 if (--realm
->nref
== 0) {
4835 snap_realms
.erase(realm
->ino
);
4836 if (realm
->pparent
) {
4837 realm
->pparent
->pchildren
.erase(realm
);
4838 put_snap_realm(realm
->pparent
);
4844 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
4846 if (realm
->parent
!= parent
) {
4847 ldout(cct
, 10) << __func__
<< " " << *realm
4848 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
4849 realm
->parent
= parent
;
4850 if (realm
->pparent
) {
4851 realm
->pparent
->pchildren
.erase(realm
);
4852 put_snap_realm(realm
->pparent
);
4854 realm
->pparent
= get_snap_realm(parent
);
4855 realm
->pparent
->pchildren
.insert(realm
);
4861 static bool has_new_snaps(const SnapContext
& old_snapc
,
4862 const SnapContext
& new_snapc
)
4864 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
4868 void Client::update_snap_trace(const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
4870 SnapRealm
*first_realm
= NULL
;
4871 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
4873 map
<SnapRealm
*, SnapContext
> dirty_realms
;
4875 auto p
= bl
.cbegin();
4879 SnapRealm
*realm
= get_snap_realm(info
.ino());
4881 bool invalidate
= false;
4883 if (info
.seq() > realm
->seq
) {
4884 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
4888 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4889 // flush me + children
4892 while (!q
.empty()) {
4893 SnapRealm
*realm
= q
.front();
4896 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4897 p
!= realm
->pchildren
.end();
4901 if (dirty_realms
.count(realm
) == 0) {
4903 dirty_realms
[realm
] = realm
->get_snap_context();
4909 realm
->seq
= info
.seq();
4910 realm
->created
= info
.created();
4911 realm
->parent_since
= info
.parent_since();
4912 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
4913 realm
->my_snaps
= info
.my_snaps
;
4917 // _always_ verify parent
4918 if (adjust_realm_parent(realm
, info
.parent()))
4922 invalidate_snaprealm_and_children(realm
);
4923 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
4924 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
4926 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
4927 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
4931 first_realm
= realm
;
4933 put_snap_realm(realm
);
4936 for (auto &[realm
, snapc
] : dirty_realms
) {
4937 // if there are new snaps ?
4938 if (has_new_snaps(snapc
, realm
->get_snap_context())) {
4939 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
4940 for (auto&& in
: realm
->inodes_with_caps
) {
4941 queue_cap_snap(in
, snapc
);
4944 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
4946 put_snap_realm(realm
);
4950 *realm_ret
= first_realm
;
4952 put_snap_realm(first_realm
);
4955 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
4957 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
4958 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4960 std::scoped_lock
cl(client_lock
);
4961 auto session
= _get_mds_session(mds
, m
->get_connection().get());
4966 got_mds_push(session
.get());
4968 map
<Inode
*, SnapContext
> to_move
;
4969 SnapRealm
*realm
= 0;
4971 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
4972 ceph_assert(m
->head
.split
);
4974 auto p
= m
->bl
.cbegin();
4976 ceph_assert(info
.ino() == m
->head
.split
);
4978 // flush, then move, ino's.
4979 realm
= get_snap_realm(info
.ino());
4980 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
4981 for (auto& ino
: m
->split_inos
) {
4982 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
4983 if (inode_map
.count(vino
)) {
4984 Inode
*in
= inode_map
[vino
];
4985 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
4987 if (in
->snaprealm
->created
> info
.created()) {
4988 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
4989 << *in
->snaprealm
<< dendl
;
4992 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
4995 in
->snaprealm_item
.remove_myself();
4996 to_move
[in
] = in
->snaprealm
->get_snap_context();
4997 put_snap_realm(in
->snaprealm
);
5001 // move child snaprealms, too
5002 for (auto& child_realm
: m
->split_realms
) {
5003 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
5004 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
5007 adjust_realm_parent(child
, realm
->ino
);
5008 put_snap_realm(child
);
5012 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
5015 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
5016 Inode
*in
= p
->first
;
5017 in
->snaprealm
= realm
;
5018 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
5020 // queue for snap writeback
5021 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
5022 queue_cap_snap(in
, p
->second
);
5024 put_snap_realm(realm
);
5028 void Client::handle_quota(const MConstRef
<MClientQuota
>& m
)
5030 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5032 std::scoped_lock
cl(client_lock
);
5033 auto session
= _get_mds_session(mds
, m
->get_connection().get());
5038 got_mds_push(session
.get());
5040 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << mds
<< dendl
;
5042 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
5043 if (inode_map
.count(vino
)) {
5045 in
= inode_map
[vino
];
5048 in
->quota
= m
->quota
;
5049 in
->rstat
= m
->rstat
;
5054 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
5056 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5058 std::scoped_lock
cl(client_lock
);
5059 auto session
= _get_mds_session(mds
, m
->get_connection().get());
5064 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
5065 // Pause RADOS operations until we see the required epoch
5066 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
5069 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
5070 // Record the barrier so that we will transmit it to MDS when releasing
5071 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
5074 got_mds_push(session
.get());
5077 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
5078 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
5081 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
5082 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
5083 session
->enqueue_cap_release(
5090 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
5093 // in case the mds is waiting on e.g. a revocation
5094 flush_cap_releases();
5098 switch (m
->get_op()) {
5099 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
.get(), in
, m
);
5100 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
.get(), in
, m
);
5101 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
.get(), in
, m
);
5104 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
5105 Cap
&cap
= in
->caps
.at(mds
);
5107 switch (m
->get_op()) {
5108 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
.get(), in
, m
);
5109 case CEPH_CAP_OP_IMPORT
:
5110 case CEPH_CAP_OP_REVOKE
:
5111 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
.get(), in
, &cap
, m
);
5112 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
.get(), in
, &cap
, m
);
5115 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
5120 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5122 mds_rank_t mds
= session
->mds_num
;
5124 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5125 << " IMPORT from mds." << mds
<< dendl
;
5127 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
5130 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
5132 cap_perms
= cap
->latest_perms
;
5136 SnapRealm
*realm
= NULL
;
5137 update_snap_trace(m
->snapbl
, &realm
);
5139 int issued
= m
->get_caps();
5140 int wanted
= m
->get_wanted();
5141 add_update_cap(in
, session
, m
->get_cap_id(),
5142 issued
, wanted
, m
->get_seq(), m
->get_mseq(),
5143 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
5145 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
5146 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
5150 put_snap_realm(realm
);
5152 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
5153 if (!(wanted
& CEPH_CAP_ANY_FILE_WR
) ||
5154 in
->requested_max_size
> m
->get_max_size()) {
5155 in
->requested_max_size
= 0;
5156 ldout(cct
, 15) << "reset requested_max_size after cap import" << dendl
;
5158 // reflush any/all caps (if we are now the auth_cap)
5159 kick_flushing_caps(in
, session
);
5163 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5165 mds_rank_t mds
= session
->mds_num
;
5167 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5168 << " EXPORT from mds." << mds
<< dendl
;
5170 auto it
= in
->caps
.find(mds
);
5171 if (it
!= in
->caps
.end()) {
5172 Cap
&cap
= it
->second
;
5173 if (cap
.cap_id
== m
->get_cap_id()) {
5174 if (m
->peer
.cap_id
) {
5175 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
5176 auto tsession
= _get_or_open_mds_session(peer_mds
);
5177 auto it
= in
->caps
.find(peer_mds
);
5178 if (it
!= in
->caps
.end()) {
5179 Cap
&tcap
= it
->second
;
5180 if (tcap
.cap_id
== m
->peer
.cap_id
&&
5181 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
5182 tcap
.cap_id
= m
->peer
.cap_id
;
5183 tcap
.seq
= m
->peer
.seq
- 1;
5184 tcap
.issue_seq
= tcap
.seq
;
5185 tcap
.issued
|= cap
.issued
;
5186 tcap
.implemented
|= cap
.issued
;
5187 if (&cap
== in
->auth_cap
)
5188 in
->auth_cap
= &tcap
;
5189 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
5190 adjust_session_flushing_caps(in
, session
, tsession
.get());
5193 add_update_cap(in
, tsession
.get(), m
->peer
.cap_id
, cap
.issued
, 0,
5194 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
5195 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
5199 if (cap
.wanted
| cap
.issued
)
5200 in
->flags
|= I_CAP_DROPPED
;
5203 remove_cap(&cap
, false);
5208 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5210 mds_rank_t mds
= session
->mds_num
;
5211 ceph_assert(in
->caps
.count(mds
));
5213 ldout(cct
, 10) << __func__
<< " on ino " << *in
5214 << " size " << in
->size
<< " -> " << m
->get_size()
5218 in
->caps_issued(&issued
);
5219 issued
|= in
->caps_dirty();
5220 update_inode_file_size(in
, issued
, m
->get_size(),
5221 m
->get_truncate_seq(), m
->get_truncate_size());
5224 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5226 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5227 int dirty
= m
->get_dirty();
5231 auto it
= in
->flushing_cap_tids
.begin();
5232 if (it
->first
< flush_ack_tid
) {
5233 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
5234 << " got unexpected flush ack tid " << flush_ack_tid
5235 << " expected is " << it
->first
<< dendl
;
5237 for (; it
!= in
->flushing_cap_tids
.end(); ) {
5243 if (it
->first
== flush_ack_tid
)
5244 cleaned
= it
->second
;
5245 if (it
->first
<= flush_ack_tid
) {
5246 session
->flushing_caps_tids
.erase(it
->first
);
5247 in
->flushing_cap_tids
.erase(it
++);
5251 cleaned
&= ~it
->second
;
5257 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
5258 << " cleaned " << ccap_string(cleaned
) << " on " << *in
5259 << " with " << ccap_string(dirty
) << dendl
;
5262 signal_cond_list(in
->waitfor_caps
);
5263 if (session
->flushing_caps_tids
.empty() ||
5264 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5265 sync_cond
.notify_all();
5269 in
->cap_dirtier_uid
= -1;
5270 in
->cap_dirtier_gid
= -1;
5274 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5276 if (in
->flushing_caps
) {
5277 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5278 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5279 in
->flushing_caps
&= ~cleaned
;
5280 if (in
->flushing_caps
== 0) {
5281 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5282 num_flushing_caps
--;
5283 if (in
->flushing_cap_tids
.empty())
5284 in
->flushing_cap_item
.remove_myself();
5286 if (!in
->caps_dirty())
5293 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5295 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5296 mds_rank_t mds
= session
->mds_num
;
5297 ceph_assert(in
->caps
.count(mds
));
5298 snapid_t follows
= m
->get_snap_follows();
5300 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5301 auto& capsnap
= it
->second
;
5302 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5303 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
5305 InodeRef
tmp_ref(in
);
5306 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5307 << " on " << *in
<< dendl
;
5308 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5309 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5310 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5311 in
->flushing_cap_item
.remove_myself();
5312 in
->cap_snaps
.erase(it
);
5314 signal_cond_list(in
->waitfor_caps
);
5315 if (session
->flushing_caps_tids
.empty() ||
5316 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5317 sync_cond
.notify_all();
5320 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5321 << " on " << *in
<< dendl
;
5322 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5326 class C_Client_DentryInvalidate
: public Context
{
5333 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5334 client(c
), name(dn
->name
) {
5335 if (client
->use_faked_inos()) {
5336 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
5338 ino
.ino
= dn
->inode
->faked_ino
;
5340 dirino
= dn
->dir
->parent_inode
->vino();
5342 ino
= dn
->inode
->vino();
5345 ino
.ino
= inodeno_t();
5347 void finish(int r
) override
{
5348 // _async_dentry_invalidate is responsible for its own locking
5349 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
5350 client
->_async_dentry_invalidate(dirino
, ino
, name
);
5354 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
5356 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
5357 if (!mref_reader
.is_state_satisfied())
5360 ldout(cct
, 10) << __func__
<< " '" << name
<< "' ino " << ino
5361 << " in dir " << dirino
<< dendl
;
5362 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
.c_str(), name
.length());
5365 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
5367 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
5368 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
5371 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5373 int ref
= in
->get_nref();
5374 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5376 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5377 for (auto p
= in
->dir
->dentries
.begin();
5378 p
!= in
->dir
->dentries
.end(); ) {
5379 Dentry
*dn
= p
->second
;
5381 /* rmsnap removes whole subtree, need trim inodes recursively.
5382 * we don't need to invalidate dentries recursively. because
5383 * invalidating a directory dentry effectively invalidate
5385 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5386 _try_to_trim_inode(dn
->inode
.get(), false);
5388 if (dn
->lru_is_expireable())
5389 unlink(dn
, true, false); // keep dir, drop dentry
5391 if (in
->dir
->dentries
.empty()) {
5397 if (ref
> 1 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5398 InodeRef snapdir
= open_snapdir(in
);
5399 _try_to_trim_inode(snapdir
.get(), false);
5404 auto q
= in
->dentries
.begin();
5405 while (q
!= in
->dentries
.end()) {
5408 if( in
->ll_ref
> 0 && sched_inval
) {
5409 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5410 // so in->dentries doesn't always reflect the state of kernel's dcache.
5411 _schedule_invalidate_dentry_callback(dn
, true);
5413 unlink(dn
, true, true);
5418 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5420 mds_rank_t mds
= session
->mds_num
;
5421 int used
= get_caps_used(in
);
5422 int wanted
= in
->caps_wanted();
5425 const unsigned new_caps
= m
->get_caps();
5426 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5427 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5428 << " mds." << mds
<< " seq " << m
->get_seq()
5429 << " caps now " << ccap_string(new_caps
)
5430 << " was " << ccap_string(cap
->issued
)
5431 << (was_stale
? " (stale)" : "") << dendl
;
5434 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5435 cap
->seq
= m
->get_seq();
5436 cap
->gen
= session
->cap_gen
;
5438 check_cap_issue(in
, new_caps
);
5442 in
->caps_issued(&issued
);
5443 issued
|= in
->caps_dirty();
5445 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5446 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5447 in
->mode
= m
->head
.mode
;
5448 in
->uid
= m
->head
.uid
;
5449 in
->gid
= m
->head
.gid
;
5450 in
->btime
= m
->btime
;
5452 bool deleted_inode
= false;
5453 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5454 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5455 in
->nlink
= m
->head
.nlink
;
5457 deleted_inode
= true;
5459 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5460 m
->xattrbl
.length() &&
5461 m
->head
.xattr_version
> in
->xattr_version
) {
5462 auto p
= m
->xattrbl
.cbegin();
5463 decode(in
->xattrs
, p
);
5464 in
->xattr_version
= m
->head
.xattr_version
;
5467 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5468 in
->dirstat
.nfiles
= m
->get_nfiles();
5469 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5472 if (new_caps
& CEPH_CAP_ANY_RD
) {
5473 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5474 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5477 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5478 in
->layout
= m
->get_layout();
5479 update_inode_file_size(in
, issued
, m
->get_size(),
5480 m
->get_truncate_seq(), m
->get_truncate_size());
5483 if (m
->inline_version
> in
->inline_version
) {
5484 in
->inline_data
= m
->inline_data
;
5485 in
->inline_version
= m
->inline_version
;
5488 /* always take a newer change attr */
5489 if (m
->get_change_attr() > in
->change_attr
)
5490 in
->change_attr
= m
->get_change_attr();
5493 if (cap
== in
->auth_cap
&&
5494 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5495 (m
->get_max_size() != in
->max_size
)) {
5496 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5497 in
->max_size
= m
->get_max_size();
5498 if (in
->max_size
> in
->wanted_max_size
) {
5499 in
->wanted_max_size
= 0;
5500 in
->requested_max_size
= 0;
5505 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5506 (wanted
& ~(cap
->wanted
| new_caps
))) {
5507 // If mds is importing cap, prior cap messages that update 'wanted'
5508 // may get dropped by mds (migrate seq mismatch).
5510 // We don't send cap message to update 'wanted' if what we want are
5511 // already issued. If mds revokes caps, cap message that releases caps
5512 // also tells mds what we want. But if caps got revoked by mds forcedly
5513 // (session stale). We may haven't told mds what we want.
5519 auto revoked
= cap
->issued
& ~new_caps
;
5521 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5522 cap
->issued
= new_caps
;
5523 cap
->implemented
|= new_caps
;
5525 // recall delegations if we're losing caps necessary for them
5526 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5527 in
->recall_deleg(false);
5528 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5529 in
->recall_deleg(true);
5531 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5532 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5533 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5534 // waitin' for flush
5535 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5538 flags
= CHECK_CAPS_NODELAY
;
5541 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5543 flags
= CHECK_CAPS_NODELAY
;
5545 } else if (cap
->issued
== new_caps
) {
5546 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5548 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5549 cap
->issued
= new_caps
;
5550 cap
->implemented
|= new_caps
;
5552 if (cap
== in
->auth_cap
) {
5553 // non-auth MDS is revoking the newly grant caps ?
5554 for (const auto &p
: in
->caps
) {
5555 if (&p
.second
== cap
)
5557 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5566 check_caps(in
, flags
);
5570 signal_cond_list(in
->waitfor_caps
);
5572 // may drop inode's last ref
5574 _try_to_trim_inode(in
, true);
5577 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5579 if (perms
.uid() == 0) {
5580 // Executable are overridable when there is at least one exec bit set
5581 if((want
& MAY_EXEC
) && !(in
->mode
& S_IXUGO
))
5582 return -CEPHFS_EACCES
;
5586 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5587 int ret
= _posix_acl_permission(in
, perms
, want
);
5588 if (ret
!= -CEPHFS_EAGAIN
)
5592 // check permissions before doing anything else
5593 if (!in
->check_mode(perms
, want
))
5594 return -CEPHFS_EACCES
;
5598 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5599 const UserPerm
& perms
)
5601 int r
= _getattr_for_perm(in
, perms
);
5606 if (strncmp(name
, "system.", 7) == 0) {
5607 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5610 r
= inode_permission(in
, perms
, want
);
5613 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5617 std::ostream
& operator<<(std::ostream
&out
, const UserPerm
& perm
) {
5618 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5622 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5623 const UserPerm
& perms
)
5625 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5626 int r
= _getattr_for_perm(in
, perms
);
5630 if (mask
& CEPH_SETATTR_SIZE
) {
5631 r
= inode_permission(in
, perms
, MAY_WRITE
);
5637 if (mask
& CEPH_SETATTR_UID
) {
5638 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5641 if (mask
& CEPH_SETATTR_GID
) {
5642 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5643 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5647 if (mask
& CEPH_SETATTR_MODE
) {
5648 if (perms
.uid() != 0 && perms
.uid() != in
->uid
)
5651 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5652 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5653 stx
->stx_mode
&= ~S_ISGID
;
5656 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5657 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5658 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5659 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5660 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5661 check_mask
|= CEPH_SETATTR_MTIME
;
5662 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5663 check_mask
|= CEPH_SETATTR_ATIME
;
5664 if (check_mask
& mask
) {
5667 r
= inode_permission(in
, perms
, MAY_WRITE
);
5675 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5679 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5681 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5684 if ((flags
& O_ACCMODE
) == O_WRONLY
)
5686 else if ((flags
& O_ACCMODE
) == O_RDWR
)
5687 want
= MAY_READ
| MAY_WRITE
;
5688 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
5690 if (flags
& O_TRUNC
)
5694 switch (in
->mode
& S_IFMT
) {
5699 if (want
& MAY_WRITE
) {
5706 r
= _getattr_for_perm(in
, perms
);
5710 r
= inode_permission(in
, perms
, want
);
5712 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5716 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
5718 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5719 int r
= _getattr_for_perm(dir
, perms
);
5723 r
= inode_permission(dir
, perms
, MAY_EXEC
);
5725 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5729 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
5731 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5732 int r
= _getattr_for_perm(dir
, perms
);
5736 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5738 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5742 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
5744 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
5745 int r
= _getattr_for_perm(dir
, perms
);
5749 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5753 /* 'name == NULL' means rmsnap w/o permission checks */
5754 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
5756 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
5759 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
5763 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5767 int Client::may_delete(const char *relpath
, const UserPerm
& perms
) {
5768 ldout(cct
, 20) << __func__
<< " " << relpath
<< "; " << perms
<< dendl
;
5770 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
5771 if (!mref_reader
.is_state_satisfied())
5774 filepath
path(relpath
);
5775 string name
= path
.last_dentry();
5779 std::scoped_lock
lock(client_lock
);
5780 int r
= path_walk(path
, &dir
, perms
);
5783 if (cct
->_conf
->client_permissions
) {
5784 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
5792 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
5794 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5795 int r
= _getattr_for_perm(in
, perms
);
5799 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
5805 if (!S_ISREG(in
->mode
))
5808 if (in
->mode
& S_ISUID
)
5811 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
5814 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
5816 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5820 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
5822 int mask
= CEPH_STAT_CAP_MODE
;
5824 if (acl_type
!= NO_ACL
) {
5825 mask
|= CEPH_STAT_CAP_XATTR
;
5826 force
= in
->xattr_version
== 0;
5828 return _getattr(in
, mask
, perms
, force
);
5831 vinodeno_t
Client::_get_vino(Inode
*in
)
5833 /* The caller must hold the client lock */
5834 return vinodeno_t(in
->ino
, in
->snapid
);
5838 * Resolve an MDS spec to a list of MDS daemon GIDs.
5840 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5841 * It may be '*' in which case it matches all GIDs.
5843 * If no error is returned, the `targets` vector will be populated with at least
5846 int Client::resolve_mds(
5847 const std::string
&mds_spec
,
5848 std::vector
<mds_gid_t
> *targets
)
5851 ceph_assert(targets
!= nullptr);
5854 CachedStackStringStream css
;
5855 int role_r
= fsmap
->parse_role(mds_spec
, &role
, *css
);
5857 // We got a role, resolve it to a GID
5858 auto& info
= fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
);
5859 ldout(cct
, 10) << __func__
<< ": resolved " << mds_spec
<< " to role '"
5860 << role
<< "' aka " << info
.human_name() << dendl
;
5861 targets
->push_back(info
.global_id
);
5865 std::string strtol_err
;
5866 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
5867 if (strtol_err
.empty()) {
5868 // It is a possible GID
5869 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
5870 if (fsmap
->gid_exists(mds_gid
)) {
5871 auto& info
= fsmap
->get_info_gid(mds_gid
);
5872 ldout(cct
, 10) << __func__
<< ": validated gid " << mds_gid
<< " aka "
5873 << info
.human_name() << dendl
;
5874 targets
->push_back(mds_gid
);
5877 lderr(cct
) << __func__
<< ": gid " << mds_gid
<< " not in MDS map"
5879 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5880 return -CEPHFS_ENOENT
;
5882 } else if (mds_spec
== "*") {
5883 // It is a wildcard: use all MDSs
5884 const auto& mds_info
= fsmap
->get_mds_info();
5886 ldout(cct
, 10) << __func__
<< ": resolving `*' to all MDS daemons" << dendl
;
5887 if (mds_info
.empty()) {
5888 lderr(cct
) << __func__
<< ": no MDS daemons found" << dendl
;
5889 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5890 return -CEPHFS_ENOENT
;
5893 for (const auto& [gid
, info
] : mds_info
) {
5894 ldout(cct
, 10) << __func__
<< ": appending " << info
.human_name() << " to targets" << dendl
;
5895 targets
->push_back(gid
);
5899 // It did not parse as an integer, it is not a wildcard, it must be a name
5900 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
5902 lderr(cct
) << __func__
<< ": no MDS daemons found by name `" << mds_spec
<< "'" << dendl
;
5903 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5904 return -CEPHFS_ENOENT
;
5906 auto& info
= fsmap
->get_info_gid(mds_gid
);
5907 ldout(cct
, 10) << __func__
<< ": resolved name '" << mds_spec
5908 << "' to " << info
.human_name() << dendl
;
5909 targets
->push_back(mds_gid
);
5917 * Authenticate with mon and establish global ID
5919 int Client::authenticate()
5921 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
5923 if (monclient
->is_authenticated()) {
5927 client_lock
.unlock();
5928 int r
= monclient
->authenticate(cct
->_conf
->client_mount_timeout
);
5934 whoami
= monclient
->get_global_id();
5935 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
5940 int Client::fetch_fsmap(bool user
)
5942 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
5944 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5945 // rather than MDSMap because no one MDSMap contains all the daemons, and
5946 // a `tell` can address any daemon.
5947 version_t fsmap_latest
;
5950 client_lock
.unlock();
5951 std::tie(fsmap_latest
, std::ignore
) =
5952 monclient
->get_version("fsmap", ca::use_blocked
[ec
]);
5954 } while (ec
== bs::errc::resource_unavailable_try_again
);
5957 lderr(cct
) << "Failed to learn FSMap version: " << ec
<< dendl
;
5958 return ceph::from_error_code(ec
);
5961 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
5964 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
5965 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5966 monclient
->renew_subs();
5967 wait_on_list(waiting_for_fsmap
);
5969 ceph_assert(fsmap_user
);
5970 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
5972 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
5973 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5974 monclient
->renew_subs();
5975 wait_on_list(waiting_for_fsmap
);
5978 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
5980 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
5981 << fsmap_latest
<< dendl
;
5987 * @mds_spec one of ID, rank, GID, "*"
5990 int Client::mds_command(
5991 const std::string
&mds_spec
,
5992 const vector
<string
>& cmd
,
5993 const bufferlist
& inbl
,
5998 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
5999 if (!iref_reader
.is_state_satisfied())
6000 return -CEPHFS_ENOTCONN
;
6002 std::unique_lock
cl(client_lock
);
6010 r
= fetch_fsmap(false);
6015 // Look up MDS target(s) of the command
6016 std::vector
<mds_gid_t
> targets
;
6017 r
= resolve_mds(mds_spec
, &targets
);
6022 // If daemons are laggy, we won't send them commands. If all
6023 // are laggy then we fail.
6024 std::vector
<mds_gid_t
> non_laggy
;
6025 for (const auto& gid
: targets
) {
6026 const auto info
= fsmap
->get_info_gid(gid
);
6027 if (!info
.laggy()) {
6028 non_laggy
.push_back(gid
);
6031 if (non_laggy
.size() == 0) {
6032 *outs
= "All targeted MDS daemons are laggy";
6033 return -CEPHFS_ENOENT
;
6036 if (metadata
.empty()) {
6037 // We are called on an unmounted client, so metadata
6038 // won't be initialized yet.
6039 populate_metadata("");
6042 // Send commands to targets
6043 C_GatherBuilder
gather(cct
, onfinish
);
6044 for (const auto& target_gid
: non_laggy
) {
6045 const auto info
= fsmap
->get_info_gid(target_gid
);
6047 // Open a connection to the target MDS
6048 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
6052 std::scoped_lock
cmd_lock(command_lock
);
6053 // Generate MDSCommandOp state
6054 auto &op
= command_table
.start_command();
6056 op
.on_finish
= gather
.new_sub();
6061 op
.mds_gid
= target_gid
;
6064 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
6065 << " tid=" << op
.tid
<< cmd
<< dendl
;
6067 // Construct and send MCommand
6068 MessageRef m
= op
.get_message(monclient
->get_fsid());
6069 conn
->send_message2(std::move(m
));
6078 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
6080 ceph_tid_t
const tid
= m
->get_tid();
6082 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
6084 std::scoped_lock
cmd_lock(command_lock
);
6085 if (!command_table
.exists(tid
)) {
6086 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
6090 auto &op
= command_table
.get_command(tid
);
6092 *op
.outbl
= m
->get_data();
6099 op
.on_finish
->complete(m
->r
);
6102 command_table
.erase(tid
);
6105 // -------------------
6108 int Client::subscribe_mdsmap(const std::string
&fs_name
)
6110 int r
= authenticate();
6112 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
6116 std::string resolved_fs_name
;
6117 if (fs_name
.empty()) {
6118 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_fs");
6119 if (resolved_fs_name
.empty())
6120 // Try the backwards compatibility fs name option
6121 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
6123 resolved_fs_name
= fs_name
;
6126 std::string want
= "mdsmap";
6127 if (!resolved_fs_name
.empty()) {
6128 r
= fetch_fsmap(true);
6131 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
6132 if (fscid
== FS_CLUSTER_ID_NONE
) {
6133 return -CEPHFS_ENOENT
;
6136 std::ostringstream oss
;
6137 oss
<< want
<< "." << fscid
;
6140 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
6142 monclient
->sub_want(want
, 0, 0);
6143 monclient
->renew_subs();
6148 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
6149 bool require_mds
, const std::string
&fs_name
)
6151 ceph_assert(is_initialized());
6154 * To make sure that the _unmount() must wait until the mount()
6157 RWRef_t
mref_writer(mount_state
, CLIENT_MOUNTING
, false);
6158 if (!mref_writer
.is_first_writer()) // already mounting or mounted
6161 std::unique_lock
cl(client_lock
);
6163 int r
= subscribe_mdsmap(fs_name
);
6165 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
6169 start_tick_thread(); // start tick thread
6173 auto availability
= mdsmap
->is_cluster_available();
6174 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
6176 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
6177 return CEPH_FUSE_NO_MDS_UP
;
6178 } else if (availability
== MDSMap::AVAILABLE
) {
6179 // Continue to mount
6181 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
6182 // Else, wait. MDSMonitor will update the map to bring
6183 // us to a conclusion eventually.
6184 wait_on_list(waiting_for_mdsmap
);
6186 // Unexpected value!
6192 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
6194 filepath
fp(CEPH_INO_ROOT
);
6195 if (!mount_root
.empty()) {
6196 fp
= filepath(mount_root
.c_str());
6199 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6200 req
->set_filepath(fp
);
6201 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
6202 int res
= make_request(req
, perms
);
6204 if (res
== -CEPHFS_EACCES
&& root
) {
6205 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
6218 _ll_get(root
.get());
6221 if (!cct
->_conf
->client_trace
.empty()) {
6222 traceout
.open(cct
->_conf
->client_trace
.c_str());
6223 if (traceout
.is_open()) {
6224 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6226 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6231 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6232 ldout(cct, 3) << "op: struct stat st;" << dendl;
6233 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6234 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6235 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6236 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6237 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6238 ldout(cct, 3) << "op: int fd;" << dendl;
6241 mref_writer
.update_state(CLIENT_MOUNTED
);
6247 void Client::_close_sessions()
6249 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
6250 if (it
->second
->state
== MetaSession::STATE_REJECTED
)
6251 mds_sessions
.erase(it
++);
6256 while (!mds_sessions
.empty()) {
6257 // send session closes!
6258 for (auto &p
: mds_sessions
) {
6259 if (p
.second
->state
!= MetaSession::STATE_CLOSING
) {
6260 _close_mds_session(p
.second
.get());
6261 mds_ranks_closing
.insert(p
.first
);
6265 // wait for sessions to close
6266 double timo
= cct
->_conf
.get_val
<std::chrono::seconds
>("client_shutdown_timeout").count();
6267 ldout(cct
, 2) << "waiting for " << mds_ranks_closing
.size() << " mds session(s) to close (timeout: "
6268 << timo
<< "s)" << dendl
;
6269 std::unique_lock l
{client_lock
, std::adopt_lock
};
6272 } else if (!mount_cond
.wait_for(l
, ceph::make_timespan(timo
), [this] { return mds_ranks_closing
.empty(); })) {
6273 ldout(cct
, 1) << mds_ranks_closing
.size() << " mds(s) did not respond to session close -- timing out." << dendl
;
6274 while (!mds_ranks_closing
.empty()) {
6275 auto session
= mds_sessions
.at(*mds_ranks_closing
.begin());
6276 // this prunes entry from mds_sessions and mds_ranks_closing
6277 _closed_mds_session(session
.get(), -CEPHFS_ETIMEDOUT
);
6281 mds_ranks_closing
.clear();
6286 void Client::flush_mdlog_sync(Inode
*in
)
6288 if (in
->unsafe_ops
.empty()) {
6292 std::set
<mds_rank_t
> anchor
;
6293 for (auto &&p
: in
->unsafe_ops
) {
6294 anchor
.emplace(p
->mds
);
6297 anchor
.emplace(in
->auth_cap
->session
->mds_num
);
6300 for (auto &rank
: anchor
) {
6301 auto session
= &mds_sessions
.at(rank
);
6302 flush_mdlog(session
->get());
6306 void Client::flush_mdlog_sync()
6308 if (mds_requests
.empty())
6310 for (auto &p
: mds_sessions
) {
6311 flush_mdlog(p
.second
.get());
6315 void Client::flush_mdlog(MetaSession
*session
)
6317 // Only send this to Luminous or newer MDS daemons, older daemons
6318 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6319 const uint64_t features
= session
->con
->get_features();
6320 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
6321 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
6322 session
->con
->send_message2(std::move(m
));
6327 void Client::_abort_mds_sessions(int err
)
6329 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
6330 auto req
= p
->second
;
6332 // unsafe requests will be removed during close session below.
6333 if (req
->got_unsafe
)
6337 if (req
->caller_cond
) {
6339 req
->caller_cond
->notify_all();
6343 // Process aborts on any requests that were on this waitlist.
6344 // Any requests that were on a waiting_for_open session waitlist
6345 // will get kicked during close session below.
6346 signal_cond_list(waiting_for_mdsmap
);
6348 // Force-close all sessions
6349 while(!mds_sessions
.empty()) {
6350 auto session
= mds_sessions
.begin()->second
;
6351 _closed_mds_session(session
.get(), err
);
6355 void Client::_unmount(bool abort
)
6358 * We are unmounting the client.
6360 * Just declare the state to STATE_UNMOUNTING to block and fail
6361 * any new comming "reader" and then try to wait all the in-flight
6362 * "readers" to finish.
6364 RWRef_t
mref_writer(mount_state
, CLIENT_UNMOUNTING
, false);
6365 if (!mref_writer
.is_first_writer())
6367 mref_writer
.wait_readers_done();
6369 std::unique_lock lock
{client_lock
};
6371 if (abort
|| blocklisted
) {
6372 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blocklisted)") << dendl
;
6374 ldout(cct
, 2) << "unmounting" << dendl
;
6380 mount_aborted
= true;
6381 // Abort all mds sessions
6382 _abort_mds_sessions(-CEPHFS_ENOTCONN
);
6384 objecter
->op_cancel_writes(-CEPHFS_ENOTCONN
);
6386 // flush the mdlog for pending requests, if any
6390 mount_cond
.wait(lock
, [this] {
6391 if (!mds_requests
.empty()) {
6392 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests"
6395 return mds_requests
.empty();
6401 // clean up any unclosed files
6402 while (!fd_map
.empty()) {
6403 Fh
*fh
= fd_map
.begin()->second
;
6404 fd_map
.erase(fd_map
.begin());
6405 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6409 while (!ll_unclosed_fh_set
.empty()) {
6410 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6412 ll_unclosed_fh_set
.erase(fh
);
6413 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6417 while (!opened_dirs
.empty()) {
6418 dir_result_t
*dirp
= *opened_dirs
.begin();
6419 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6425 if (cct
->_conf
->client_oc
) {
6426 // flush/release all buffered data
6427 std::list
<InodeRef
> anchor
;
6428 for (auto& p
: inode_map
) {
6429 Inode
*in
= p
.second
;
6431 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6435 // prevent inode from getting freed
6436 anchor
.emplace_back(in
);
6438 if (abort
|| blocklisted
) {
6439 objectcacher
->purge_set(&in
->oset
);
6440 } else if (!in
->caps
.empty()) {
6442 _flush(in
, new C_Client_FlushComplete(this, in
));
6447 if (abort
|| blocklisted
) {
6448 for (auto &q
: mds_sessions
) {
6450 for (auto p
= s
->dirty_list
.begin(); !p
.end(); ) {
6453 if (in
->dirty_caps
) {
6454 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6455 in
->mark_caps_clean();
6462 wait_sync_caps(last_flush_tid
);
6470 while (lru
.lru_get_size() > 0 ||
6471 !inode_map
.empty()) {
6472 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6473 << "+" << inode_map
.size() << " items"
6474 << ", waiting (for caps to release?)"
6477 if (auto r
= mount_cond
.wait_for(lock
, ceph::make_timespan(5));
6478 r
== std::cv_status::timeout
) {
6482 ceph_assert(lru
.lru_get_size() == 0);
6483 ceph_assert(inode_map
.empty());
6486 if (!cct
->_conf
->client_trace
.empty()) {
6487 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6491 // stop the tick thread
6492 tick_thread_stopped
= true;
6493 upkeep_cond
.notify_one();
6497 mref_writer
.update_state(CLIENT_UNMOUNTED
);
6499 ldout(cct
, 2) << "unmounted." << dendl
;
6502 void Client::unmount()
6507 void Client::abort_conn()
6512 void Client::flush_cap_releases()
6514 uint64_t nr_caps
= 0;
6516 // send any cap releases
6517 for (auto &p
: mds_sessions
) {
6518 auto session
= p
.second
;
6519 if (session
->release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
6521 nr_caps
+= session
->release
->caps
.size();
6522 if (cct
->_conf
->client_inject_release_failure
) {
6523 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
6525 session
->con
->send_message2(std::move(session
->release
));
6527 session
->release
.reset();
6532 dec_pinned_icaps(nr_caps
);
6536 void Client::renew_and_flush_cap_releases()
6538 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6540 if (!mount_aborted
&& mdsmap
->get_epoch()) {
6542 utime_t el
= ceph_clock_now() - last_cap_renew
;
6543 if (unlikely(el
> mdsmap
->get_session_timeout() / 3.0))
6546 flush_cap_releases();
6552 ldout(cct
, 20) << "tick" << dendl
;
6554 utime_t now
= ceph_clock_now();
6557 * If the mount() is not finished
6559 if (is_mounting() && !mds_requests
.empty()) {
6560 MetaRequest
*req
= mds_requests
.begin()->second
;
6562 if (req
->op_stamp
+ cct
->_conf
->client_mount_timeout
< now
) {
6563 req
->abort(-CEPHFS_ETIMEDOUT
);
6564 if (req
->caller_cond
) {
6566 req
->caller_cond
->notify_all();
6568 signal_cond_list(waiting_for_mdsmap
);
6569 for (auto &p
: mds_sessions
) {
6570 signal_context_list(p
.second
->waiting_for_open
);
6575 renew_and_flush_cap_releases();
6578 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6582 if (!mount_aborted
&& in
->hold_caps_until
> now
)
6584 delayed_list
.pop_front();
6586 check_caps(in
, CHECK_CAPS_NODELAY
);
6590 collect_and_send_metrics();
6592 delay_put_inodes(is_unmounting());
6595 if (blocklisted
&& (is_mounted() || is_unmounting()) &&
6596 last_auto_reconnect
+ 30 * 60 < now
&&
6597 cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
6598 messenger
->client_reset();
6599 fd_gen
++; // invalidate open files
6600 blocklisted
= false;
6601 _kick_stale_sessions();
6602 last_auto_reconnect
= now
;
6606 void Client::start_tick_thread()
6608 upkeeper
= std::thread([this]() {
6609 using time
= ceph::coarse_mono_time
;
6610 using sec
= std::chrono::seconds
;
6612 auto last_tick
= time::min();
6614 std::unique_lock
cl(client_lock
);
6615 while (!tick_thread_stopped
) {
6616 auto now
= clock::now();
6617 auto since
= now
- last_tick
;
6619 auto t_interval
= clock::duration(cct
->_conf
.get_val
<sec
>("client_tick_interval"));
6620 auto d_interval
= clock::duration(cct
->_conf
.get_val
<sec
>("client_debug_inject_tick_delay"));
6622 auto interval
= std::max(t_interval
, d_interval
);
6623 if (likely(since
>= interval
*.90)) {
6625 last_tick
= clock::now();
6630 ldout(cct
, 20) << "upkeep thread waiting interval " << interval
<< dendl
;
6631 if (!tick_thread_stopped
)
6632 upkeep_cond
.wait_for(cl
, interval
);
6637 void Client::collect_and_send_metrics() {
6638 ldout(cct
, 20) << __func__
<< dendl
;
6640 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6642 // right now, we only track and send global metrics. its sufficient
6643 // to send these metrics to MDS rank0.
6644 collect_and_send_global_metrics();
6647 void Client::collect_and_send_global_metrics() {
6648 ldout(cct
, 20) << __func__
<< dendl
;
6649 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6651 if (!have_open_session((mds_rank_t
)0)) {
6652 ldout(cct
, 5) << __func__
<< ": no session with rank=0 -- not sending metric"
6656 auto session
= _get_or_open_mds_session((mds_rank_t
)0);
6657 if (!session
->mds_features
.test(CEPHFS_FEATURE_METRIC_COLLECT
)) {
6658 ldout(cct
, 5) << __func__
<< ": rank=0 does not support metrics" << dendl
;
6662 ClientMetricMessage metric
;
6663 std::vector
<ClientMetricMessage
> message
;
6666 metric
= ClientMetricMessage(ReadLatencyPayload(logger
->tget(l_c_read
)));
6667 message
.push_back(metric
);
6670 metric
= ClientMetricMessage(WriteLatencyPayload(logger
->tget(l_c_wrlat
)));
6671 message
.push_back(metric
);
6674 metric
= ClientMetricMessage(MetadataLatencyPayload(logger
->tget(l_c_lat
)));
6675 message
.push_back(metric
);
6677 // cap hit ratio -- nr_caps is unused right now
6678 auto [cap_hits
, cap_misses
] = get_cap_hit_rates();
6679 metric
= ClientMetricMessage(CapInfoPayload(cap_hits
, cap_misses
, 0));
6680 message
.push_back(metric
);
6682 // dentry lease hit ratio
6683 auto [dlease_hits
, dlease_misses
, nr
] = get_dlease_hit_rates();
6684 metric
= ClientMetricMessage(DentryLeasePayload(dlease_hits
, dlease_misses
, nr
));
6685 message
.push_back(metric
);
6689 auto [opened_files
, total_inodes
] = get_opened_files_rates();
6690 metric
= ClientMetricMessage(OpenedFilesPayload(opened_files
, total_inodes
));
6692 message
.push_back(metric
);
6696 auto [pinned_icaps
, total_inodes
] = get_pinned_icaps_rates();
6697 metric
= ClientMetricMessage(PinnedIcapsPayload(pinned_icaps
, total_inodes
));
6699 message
.push_back(metric
);
6703 auto [opened_inodes
, total_inodes
] = get_opened_inodes_rates();
6704 metric
= ClientMetricMessage(OpenedInodesPayload(opened_inodes
, total_inodes
));
6706 message
.push_back(metric
);
6709 metric
= ClientMetricMessage(ReadIoSizesPayload(total_read_ops
,
6711 message
.push_back(metric
);
6714 metric
= ClientMetricMessage(WriteIoSizesPayload(total_write_ops
,
6716 message
.push_back(metric
);
6718 session
->con
->send_message2(make_message
<MClientMetrics
>(std::move(message
)));
6721 void Client::renew_caps()
6723 ldout(cct
, 10) << "renew_caps()" << dendl
;
6724 last_cap_renew
= ceph_clock_now();
6726 for (auto &p
: mds_sessions
) {
6727 ldout(cct
, 15) << "renew_caps requesting from mds." << p
.first
<< dendl
;
6728 if (mdsmap
->get_state(p
.first
) >= MDSMap::STATE_REJOIN
)
6729 renew_caps(p
.second
.get());
6733 void Client::renew_caps(MetaSession
*session
)
6735 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
6736 session
->last_cap_renew_request
= ceph_clock_now();
6737 uint64_t seq
= ++session
->cap_renew_seq
;
6738 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
6742 // ===============================================================
6743 // high level (POSIXy) interface
// Issue a LOOKUP (or LOOKUPSNAP inside a snapdir) request to the MDS for
// one name under 'dir', storing the resulting inode in *target.
int Client::_do_lookup(Inode *dir, const string& name, int mask,
                       InodeRef *target, const UserPerm& perms)
{
  // inside the snapshot dir we must use the snapshot-specific lookup op
  int opcode = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP
                                           : CEPH_MDS_OP_LOOKUP;
  MetaRequest *request = new MetaRequest(opcode);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  request->set_filepath(path);
  request->set_inode(dir);

  if (cct->_conf->client_debug_getattr_caps && opcode == CEPH_MDS_OP_LOOKUP)
    mask |= DEBUG_GETATTR_CAPS;
  request->head.args.getattr.mask = mask;

  ldout(cct, 10) << __func__ << " on " << path << dendl;

  int r = make_request(request, perms, target);
  ldout(cct, 10) << __func__ << " res is " << r << dendl;
  return r;
}
6766 bool Client::_dentry_valid(const Dentry
*dn
)
6768 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6770 // is dn lease valid?
6771 utime_t now
= ceph_clock_now();
6772 if (dn
->lease_mds
>= 0 && dn
->lease_ttl
> now
&&
6773 mds_sessions
.count(dn
->lease_mds
)) {
6774 auto s
= mds_sessions
.at(dn
->lease_mds
);
6775 if (s
->cap_ttl
> now
&& s
->cap_gen
== dn
->lease_gen
) {
6780 ldout(cct
, 20) << " bad lease, cap_ttl " << s
->cap_ttl
<< ", cap_gen " << s
->cap_gen
6781 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
6788 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
6789 const UserPerm
& perms
, std::string
* alternate_name
)
6793 bool did_lookup_request
= false;
6794 // can only request shared caps
6795 mask
&= CEPH_CAP_ANY_SHARED
| CEPH_STAT_RSTAT
;
6797 if (dname
== "..") {
6798 if (dir
->dentries
.empty()) {
6799 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
6800 filepath
path(dir
->ino
);
6801 req
->set_filepath(path
);
6804 int r
= make_request(req
, perms
, &tmptarget
, NULL
, rand() % mdsmap
->get_num_in_mds());
6807 *target
= std::move(tmptarget
);
6808 ldout(cct
, 8) << __func__
<< " found target " << (*target
)->ino
<< dendl
;
6814 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
6823 if (!dir
->is_dir()) {
6824 r
= -CEPHFS_ENOTDIR
;
6828 if (dname
.length() > NAME_MAX
) {
6829 r
= -CEPHFS_ENAMETOOLONG
;
6833 if (dname
== cct
->_conf
->client_snapdir
&&
6834 dir
->snapid
== CEPH_NOSNAP
) {
6835 *target
= open_snapdir(dir
);
6841 dir
->dir
->dentries
.count(dname
)) {
6842 dn
= dir
->dir
->dentries
[dname
];
6844 ldout(cct
, 20) << __func__
<< " have " << *dn
<< " from mds." << dn
->lease_mds
6845 << " ttl " << dn
->lease_ttl
<< " seq " << dn
->lease_seq
<< dendl
;
6847 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
6848 if (_dentry_valid(dn
)) {
6849 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6850 // make trim_caps() behave.
6851 dir
->try_touch_cap(dn
->lease_mds
);
6855 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
6856 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
6857 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
6859 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
6860 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for "
6861 << *dir
<< " dn '" << dname
<< "'" << dendl
;
6862 return -CEPHFS_ENOENT
;
6866 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
6869 // can we conclude ENOENT locally?
6870 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
6871 (dir
->flags
& I_COMPLETE
)) {
6872 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
6873 return -CEPHFS_ENOENT
;
6877 if (did_lookup_request
) {
6881 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
6882 did_lookup_request
= true;
6884 /* complete lookup to get dentry for alternate_name */
6892 *target
= dn
->inode
;
6894 *alternate_name
= dn
->alternate_name
;
6903 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
6905 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
6909 int Client::get_or_create(Inode
*dir
, const char* name
,
6910 Dentry
**pdn
, bool expect_null
)
6913 ldout(cct
, 20) << __func__
<< " " << *dir
<< " name " << name
<< dendl
;
6915 if (dir
->dir
->dentries
.count(name
)) {
6916 Dentry
*dn
= dir
->dir
->dentries
[name
];
6917 if (_dentry_valid(dn
)) {
6919 return -CEPHFS_EEXIST
;
6923 // otherwise link up a new one
6924 *pdn
= link(dir
->dir
, name
, NULL
, NULL
);
6931 int Client::walk(std::string_view path
, walk_dentry_result
* wdr
, const UserPerm
& perms
, bool followsym
)
6933 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
6934 if (!mref_reader
.is_state_satisfied())
6935 return -CEPHFS_ENOTCONN
;
6937 ldout(cct
, 10) << __func__
<< ": " << path
<< dendl
;
6939 std::scoped_lock
lock(client_lock
);
6941 return path_walk(path
, wdr
, perms
, followsym
);
6944 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
6945 const UserPerm
& perms
, bool followsym
, int mask
, InodeRef dirinode
)
6947 walk_dentry_result wdr
;
6948 int rc
= path_walk(origpath
, &wdr
, perms
, followsym
, mask
, dirinode
);
6949 *end
= std::move(wdr
.in
);
6953 int Client::path_walk(const filepath
& origpath
, walk_dentry_result
* result
, const UserPerm
& perms
,
6954 bool followsym
, int mask
, InodeRef dirinode
)
6956 filepath path
= origpath
;
6958 std::string alternate_name
;
6959 if (origpath
.absolute())
6968 ldout(cct
, 20) << __func__
<< " cur=" << *cur
<< dendl
;
6969 ldout(cct
, 10) << __func__
<< " " << path
<< dendl
;
6974 while (i
< path
.depth() && cur
) {
6976 const string
&dname
= path
[i
];
6977 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
6978 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
6980 if (cct
->_conf
->client_permissions
) {
6981 int r
= may_lookup(cur
.get(), perms
);
6984 caps
= CEPH_CAP_AUTH_SHARED
;
6987 /* Get extra requested caps on the last component */
6988 if (i
== (path
.depth() - 1))
6990 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
, &alternate_name
);
6993 // only follow trailing symlink if followsym. always follow
6994 // 'directory' symlinks.
6995 if (next
&& next
->is_symlink()) {
6997 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
6998 if (symlinks
> MAXSYMLINKS
) {
6999 return -CEPHFS_ELOOP
;
7002 if (i
< path
.depth() - 1) {
7004 // replace consumed components of path with symlink dir target
7005 filepath
resolved(next
->symlink
.c_str());
7006 resolved
.append(path
.postfixpath(i
+ 1));
7009 if (next
->symlink
[0] == '/') {
7013 } else if (followsym
) {
7014 if (next
->symlink
[0] == '/') {
7015 path
= next
->symlink
.c_str();
7020 filepath
more(next
->symlink
.c_str());
7021 // we need to remove the symlink component from off of the path
7022 // before adding the target that the symlink points to. remain
7023 // at the same position in the path.
7034 return -CEPHFS_ENOENT
;
7036 result
->in
= std::move(cur
);
7037 result
->alternate_name
= std::move(alternate_name
);
7045 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
, std::string alternate_name
)
7047 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7048 if (!mref_reader
.is_state_satisfied())
7049 return -CEPHFS_ENOTCONN
;
7051 tout(cct
) << "link" << std::endl
;
7052 tout(cct
) << relexisting
<< std::endl
;
7053 tout(cct
) << relpath
<< std::endl
;
7055 filepath
existing(relexisting
);
7059 std::scoped_lock
lock(client_lock
);
7060 int r
= path_walk(existing
, &in
, perm
, true);
7063 if (std::string(relpath
) == "/") {
7067 filepath
path(relpath
);
7068 string name
= path
.last_dentry();
7071 r
= path_walk(path
, &dir
, perm
, true);
7074 if (cct
->_conf
->client_permissions
) {
7075 if (S_ISDIR(in
->mode
)) {
7079 r
= may_hardlink(in
.get(), perm
);
7082 r
= may_create(dir
.get(), perm
);
7086 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
, std::move(alternate_name
));
7090 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
7092 return unlinkat(CEPHFS_AT_FDCWD
, relpath
, 0, perm
);
7095 int Client::unlinkat(int dirfd
, const char *relpath
, int flags
, const UserPerm
& perm
)
7097 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7098 if (!mref_reader
.is_state_satisfied()) {
7099 return -CEPHFS_ENOTCONN
;
7102 tout(cct
) << __func__
<< std::endl
;
7103 tout(cct
) << dirfd
<< std::endl
;
7104 tout(cct
) << relpath
<< std::endl
;
7105 tout(cct
) << flags
<< std::endl
;
7107 if (std::string(relpath
) == "/") {
7108 return flags
& AT_REMOVEDIR
? -CEPHFS_EBUSY
: -CEPHFS_EISDIR
;
7111 filepath
path(relpath
);
7112 string name
= path
.last_dentry();
7116 std::scoped_lock
lock(client_lock
);
7119 int r
= get_fd_inode(dirfd
, &dirinode
);
7124 r
= path_walk(path
, &dir
, perm
, true, 0, dirinode
);
7128 if (cct
->_conf
->client_permissions
) {
7129 r
= may_delete(dir
.get(), name
.c_str(), perm
);
7134 if (flags
& AT_REMOVEDIR
) {
7135 r
= _rmdir(dir
.get(), name
.c_str(), perm
);
7137 r
= _unlink(dir
.get(), name
.c_str(), perm
);
7142 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
, std::string alternate_name
)
7144 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7145 if (!mref_reader
.is_state_satisfied())
7146 return -CEPHFS_ENOTCONN
;
7148 tout(cct
) << __func__
<< std::endl
;
7149 tout(cct
) << relfrom
<< std::endl
;
7150 tout(cct
) << relto
<< std::endl
;
7152 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
7153 return -CEPHFS_EBUSY
;
7155 filepath
from(relfrom
);
7157 string fromname
= from
.last_dentry();
7159 string toname
= to
.last_dentry();
7162 InodeRef fromdir
, todir
;
7164 std::scoped_lock
lock(client_lock
);
7165 int r
= path_walk(from
, &fromdir
, perm
);
7168 r
= path_walk(to
, &todir
, perm
);
7172 if (cct
->_conf
->client_permissions
) {
7173 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
7176 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
7177 if (r
< 0 && r
!= -CEPHFS_ENOENT
)
7180 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
, std::move(alternate_name
));
7187 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
, std::string alternate_name
)
7189 return mkdirat(CEPHFS_AT_FDCWD
, relpath
, mode
, perm
, alternate_name
);
7192 int Client::mkdirat(int dirfd
, const char *relpath
, mode_t mode
, const UserPerm
& perm
,
7193 std::string alternate_name
)
7195 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7196 if (!mref_reader
.is_state_satisfied())
7197 return -CEPHFS_ENOTCONN
;
7199 tout(cct
) << __func__
<< std::endl
;
7200 tout(cct
) << dirfd
<< std::endl
;
7201 tout(cct
) << relpath
<< std::endl
;
7202 tout(cct
) << mode
<< std::endl
;
7203 ldout(cct
, 10) << __func__
<< ": " << relpath
<< dendl
;
7205 if (std::string(relpath
) == "/") {
7206 return -CEPHFS_EEXIST
;
7209 filepath
path(relpath
);
7210 string name
= path
.last_dentry();
7214 std::scoped_lock
lock(client_lock
);
7217 int r
= get_fd_inode(dirfd
, &dirinode
);
7222 r
= path_walk(path
, &dir
, perm
, true, 0, dirinode
);
7226 if (cct
->_conf
->client_permissions
) {
7227 r
= may_create(dir
.get(), perm
);
7232 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
, 0, {}, std::move(alternate_name
));
7235 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7237 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7238 if (!mref_reader
.is_state_satisfied())
7239 return -CEPHFS_ENOTCONN
;
7241 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
7242 tout(cct
) << __func__
<< std::endl
;
7243 tout(cct
) << relpath
<< std::endl
;
7244 tout(cct
) << mode
<< std::endl
;
7246 //get through existing parts of path
7247 filepath
path(relpath
);
7249 int r
= 0, caps
= 0;
7252 std::scoped_lock
lock(client_lock
);
7254 for (i
=0; i
<path
.depth(); ++i
) {
7255 if (cct
->_conf
->client_permissions
) {
7256 r
= may_lookup(cur
.get(), perms
);
7259 caps
= CEPH_CAP_AUTH_SHARED
;
7261 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
7266 if (r
!=-CEPHFS_ENOENT
) return r
;
7267 ldout(cct
, 20) << __func__
<< " got through " << i
<< " directories on path " << relpath
<< dendl
;
7268 //make new directory at each level
7269 for (; i
<path
.depth(); ++i
) {
7270 if (cct
->_conf
->client_permissions
) {
7271 r
= may_create(cur
.get(), perms
);
7276 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
7278 //check proper creation/existence
7279 if(-CEPHFS_EEXIST
== r
&& i
< path
.depth() - 1) {
7280 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
7284 //move to new dir and continue
7286 ldout(cct
, 20) << __func__
<< ": successfully created directory "
7287 << filepath(cur
->ino
).get_path() << dendl
;
7292 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
7294 return unlinkat(CEPHFS_AT_FDCWD
, relpath
, AT_REMOVEDIR
, perms
);
7297 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
7299 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7300 if (!mref_reader
.is_state_satisfied())
7301 return -CEPHFS_ENOTCONN
;
7303 tout(cct
) << __func__
<< std::endl
;
7304 tout(cct
) << relpath
<< std::endl
;
7305 tout(cct
) << mode
<< std::endl
;
7306 tout(cct
) << rdev
<< std::endl
;
7308 if (std::string(relpath
) == "/")
7309 return -CEPHFS_EEXIST
;
7311 filepath
path(relpath
);
7312 string name
= path
.last_dentry();
7316 std::scoped_lock
lock(client_lock
);
7317 int r
= path_walk(path
, &dir
, perms
);
7320 if (cct
->_conf
->client_permissions
) {
7321 int r
= may_create(dir
.get(), perms
);
7325 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
7330 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
, std::string alternate_name
)
7332 return symlinkat(target
, CEPHFS_AT_FDCWD
, relpath
, perms
, alternate_name
);
7335 int Client::symlinkat(const char *target
, int dirfd
, const char *relpath
, const UserPerm
& perms
,
7336 std::string alternate_name
)
7338 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7339 if (!mref_reader
.is_state_satisfied()) {
7340 return -CEPHFS_ENOTCONN
;
7343 tout(cct
) << __func__
<< std::endl
;
7344 tout(cct
) << target
<< std::endl
;
7345 tout(cct
) << dirfd
<< std::endl
;
7346 tout(cct
) << relpath
<< std::endl
;
7348 if (std::string(relpath
) == "/") {
7349 return -CEPHFS_EEXIST
;
7352 filepath
path(relpath
);
7353 string name
= path
.last_dentry();
7357 std::scoped_lock
lock(client_lock
);
7360 int r
= get_fd_inode(dirfd
, &dirinode
);
7364 r
= path_walk(path
, &dir
, perms
, true, 0, dirinode
);
7368 if (cct
->_conf
->client_permissions
) {
7369 int r
= may_create(dir
.get(), perms
);
7374 return _symlink(dir
.get(), name
.c_str(), target
, perms
, std::move(alternate_name
));
7377 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
7379 return readlinkat(CEPHFS_AT_FDCWD
, relpath
, buf
, size
, perms
);
7382 int Client::readlinkat(int dirfd
, const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
) {
7383 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7384 if (!mref_reader
.is_state_satisfied()) {
7385 return -CEPHFS_ENOTCONN
;
7388 tout(cct
) << __func__
<< std::endl
;
7389 tout(cct
) << dirfd
<< std::endl
;
7390 tout(cct
) << relpath
<< std::endl
;
7393 std::scoped_lock
lock(client_lock
);
7394 int r
= get_fd_inode(dirfd
, &dirinode
);
7400 filepath
path(relpath
);
7401 r
= path_walk(path
, &in
, perms
, false, 0, dirinode
);
7406 return _readlink(in
.get(), buf
, size
);
7409 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
7411 if (!in
->is_symlink())
7412 return -CEPHFS_EINVAL
;
7414 // copy into buf (at most size bytes)
7415 int r
= in
->symlink
.length();
7418 memcpy(buf
, in
->symlink
.c_str(), r
);
7425 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
7427 bool yes
= in
->caps_issued_mask(mask
, true);
7429 ldout(cct
, 10) << __func__
<< " mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
7433 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
7435 in
->make_nosnap_relative_path(path
);
7436 req
->set_filepath(path
);
7438 req
->head
.args
.getattr
.mask
= mask
;
7440 int res
= make_request(req
, perms
);
7441 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
7445 int Client::_getvxattr(
7447 const UserPerm
& perms
,
7448 const char *xattr_name
,
7453 if (!xattr_name
|| strlen(xattr_name
) <= 0 || strlen(xattr_name
) > 255) {
7454 return -CEPHFS_ENODATA
;
7457 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETVXATTR
);
7459 in
->make_nosnap_relative_path(path
);
7460 req
->set_filepath(path
);
7462 req
->set_string2(xattr_name
);
7465 int res
= make_request(req
, perms
, nullptr, nullptr, rank
, &bl
);
7466 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
7473 auto p
= bl
.cbegin();
7479 ssize_t len
= buf
.length();
7481 res
= len
; // refer to man getxattr(2) for output buffer size == 0
7485 res
= -CEPHFS_ERANGE
; // insufficient output buffer space
7487 memcpy(value
, buf
.c_str(), len
);
7493 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7494 const UserPerm
& perms
, InodeRef
*inp
)
7496 int issued
= in
->caps_issued();
7497 union ceph_mds_request_args args
;
7498 bool kill_sguid
= false;
7501 ldout(cct
, 10) << __func__
<< " mask " << mask
<< " issued " <<
7502 ccap_string(issued
) << dendl
;
7504 if (in
->snapid
!= CEPH_NOSNAP
) {
7505 return -CEPHFS_EROFS
;
7507 if ((mask
& CEPH_SETATTR_SIZE
) &&
7508 (uint64_t)stx
->stx_size
> in
->size
&&
7509 is_quota_bytes_exceeded(in
, (uint64_t)stx
->stx_size
- in
->size
,
7511 return -CEPHFS_EDQUOT
;
7514 memset(&args
, 0, sizeof(args
));
7516 // make the change locally?
7517 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
7518 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
7519 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
7520 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
7521 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
7524 * This works because we implicitly flush the caps as part of the
7525 * request, so the cap update check will happen with the writeback
7526 * cap context, and then the setattr check will happen with the
7529 * In reality this pattern is likely pretty rare (different users
7530 * setattr'ing the same file). If that turns out not to be the
7531 * case later, we can build a more complex pipelined cap writeback
7534 mask
|= CEPH_SETATTR_CTIME
;
7538 // caller just needs us to bump the ctime
7539 in
->ctime
= ceph_clock_now();
7540 in
->cap_dirtier_uid
= perms
.uid();
7541 in
->cap_dirtier_gid
= perms
.gid();
7542 if (issued
& CEPH_CAP_AUTH_EXCL
)
7543 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7544 else if (issued
& CEPH_CAP_FILE_EXCL
)
7545 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7546 else if (issued
& CEPH_CAP_XATTR_EXCL
)
7547 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
7549 mask
|= CEPH_SETATTR_CTIME
;
7552 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7553 kill_sguid
= mask
& (CEPH_SETATTR_SIZE
|CEPH_SETATTR_KILL_SGUID
);
7555 mask
&= ~CEPH_SETATTR_KILL_SGUID
;
7556 } else if (mask
& CEPH_SETATTR_SIZE
) {
7557 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7558 mask
|= CEPH_SETATTR_KILL_SGUID
;
7559 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7562 if (mask
& CEPH_SETATTR_UID
) {
7563 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7565 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7566 in
->ctime
= ceph_clock_now();
7567 in
->cap_dirtier_uid
= perms
.uid();
7568 in
->cap_dirtier_gid
= perms
.gid();
7569 in
->uid
= stx
->stx_uid
;
7570 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7571 mask
&= ~CEPH_SETATTR_UID
;
7573 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7574 in
->uid
!= stx
->stx_uid
) {
7575 args
.setattr
.uid
= stx
->stx_uid
;
7576 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7578 mask
&= ~CEPH_SETATTR_UID
;
7582 if (mask
& CEPH_SETATTR_GID
) {
7583 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7585 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7586 in
->ctime
= ceph_clock_now();
7587 in
->cap_dirtier_uid
= perms
.uid();
7588 in
->cap_dirtier_gid
= perms
.gid();
7589 in
->gid
= stx
->stx_gid
;
7590 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7591 mask
&= ~CEPH_SETATTR_GID
;
7593 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7594 in
->gid
!= stx
->stx_gid
) {
7595 args
.setattr
.gid
= stx
->stx_gid
;
7596 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7598 mask
&= ~CEPH_SETATTR_GID
;
7602 if (mask
& CEPH_SETATTR_MODE
) {
7603 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7605 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7606 in
->ctime
= ceph_clock_now();
7607 in
->cap_dirtier_uid
= perms
.uid();
7608 in
->cap_dirtier_gid
= perms
.gid();
7609 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
7610 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7611 mask
&= ~CEPH_SETATTR_MODE
;
7612 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7613 in
->mode
!= stx
->stx_mode
) {
7614 args
.setattr
.mode
= stx
->stx_mode
;
7615 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7617 mask
&= ~CEPH_SETATTR_MODE
;
7619 } else if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
) &&
7620 kill_sguid
&& S_ISREG(in
->mode
) &&
7621 (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
7622 /* Must squash the any setuid/setgid bits with an ownership change */
7623 in
->mode
&= ~(S_ISUID
|S_ISGID
);
7624 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7627 if (mask
& CEPH_SETATTR_BTIME
) {
7628 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
7630 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7631 in
->ctime
= ceph_clock_now();
7632 in
->cap_dirtier_uid
= perms
.uid();
7633 in
->cap_dirtier_gid
= perms
.gid();
7634 in
->btime
= utime_t(stx
->stx_btime
);
7635 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7636 mask
&= ~CEPH_SETATTR_BTIME
;
7637 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7638 in
->btime
!= utime_t(stx
->stx_btime
)) {
7639 args
.setattr
.btime
= utime_t(stx
->stx_btime
);
7640 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7642 mask
&= ~CEPH_SETATTR_BTIME
;
7646 if (mask
& CEPH_SETATTR_SIZE
) {
7647 if ((uint64_t)stx
->stx_size
>= mdsmap
->get_max_filesize()) {
7649 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
7650 return -CEPHFS_EFBIG
;
7653 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
7654 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
) &&
7655 !(mask
& CEPH_SETATTR_KILL_SGUID
) &&
7656 stx
->stx_size
>= in
->size
) {
7657 if (stx
->stx_size
> in
->size
) {
7658 in
->size
= in
->reported_size
= stx
->stx_size
;
7659 in
->cap_dirtier_uid
= perms
.uid();
7660 in
->cap_dirtier_gid
= perms
.gid();
7661 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7662 mask
&= ~(CEPH_SETATTR_SIZE
);
7663 mask
|= CEPH_SETATTR_MTIME
;
7665 // ignore it when size doesn't change
7666 mask
&= ~(CEPH_SETATTR_SIZE
);
7669 args
.setattr
.size
= stx
->stx_size
;
7670 inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7675 if (mask
& CEPH_SETATTR_MTIME
) {
7676 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
7677 in
->mtime
= utime_t(stx
->stx_mtime
);
7678 in
->ctime
= ceph_clock_now();
7679 in
->cap_dirtier_uid
= perms
.uid();
7680 in
->cap_dirtier_gid
= perms
.gid();
7681 in
->time_warp_seq
++;
7682 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7683 mask
&= ~CEPH_SETATTR_MTIME
;
7684 } else if (in
->caps_issued_mask(CEPH_CAP_FILE_WR
) &&
7685 utime_t(stx
->stx_mtime
) > in
->mtime
) {
7686 in
->mtime
= utime_t(stx
->stx_mtime
);
7687 in
->ctime
= ceph_clock_now();
7688 in
->cap_dirtier_uid
= perms
.uid();
7689 in
->cap_dirtier_gid
= perms
.gid();
7690 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
7691 mask
&= ~CEPH_SETATTR_MTIME
;
7692 } else if (!in
->caps_issued_mask(CEPH_CAP_FILE_SHARED
) ||
7693 in
->mtime
!= utime_t(stx
->stx_mtime
)) {
7694 args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
7695 inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7698 mask
&= ~CEPH_SETATTR_MTIME
;
7702 if (mask
& CEPH_SETATTR_ATIME
) {
7703 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
7704 in
->atime
= utime_t(stx
->stx_atime
);
7705 in
->ctime
= ceph_clock_now();
7706 in
->cap_dirtier_uid
= perms
.uid();
7707 in
->cap_dirtier_gid
= perms
.gid();
7708 in
->time_warp_seq
++;
7709 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7710 mask
&= ~CEPH_SETATTR_ATIME
;
7711 } else if (in
->caps_issued_mask(CEPH_CAP_FILE_WR
) &&
7712 utime_t(stx
->stx_atime
) > in
->atime
) {
7713 in
->atime
= utime_t(stx
->stx_atime
);
7714 in
->ctime
= ceph_clock_now();
7715 in
->cap_dirtier_uid
= perms
.uid();
7716 in
->cap_dirtier_gid
= perms
.gid();
7717 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
7718 mask
&= ~CEPH_SETATTR_ATIME
;
7719 } else if (!in
->caps_issued_mask(CEPH_CAP_FILE_SHARED
) ||
7720 in
->atime
!= utime_t(stx
->stx_atime
)) {
7721 args
.setattr
.atime
= utime_t(stx
->stx_atime
);
7722 inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
7725 mask
&= ~CEPH_SETATTR_ATIME
;
7734 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
7738 in
->make_nosnap_relative_path(path
);
7739 req
->set_filepath(path
);
7742 req
->head
.args
= args
;
7743 req
->inode_drop
= inode_drop
;
7744 req
->head
.args
.setattr
.mask
= mask
;
7745 req
->regetattr_mask
= mask
;
7747 int res
= make_request(req
, perms
, inp
);
7748 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
7752 /* Note that we only care about attrs that setattr cares about */
7753 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
7755 stx
->stx_size
= st
->st_size
;
7756 stx
->stx_mode
= st
->st_mode
;
7757 stx
->stx_uid
= st
->st_uid
;
7758 stx
->stx_gid
= st
->st_gid
;
7760 stx
->stx_mtime
= st
->st_mtimespec
;
7761 stx
->stx_atime
= st
->st_atimespec
;
7763 stx
->stx_mtime
.tv_sec
= st
->st_mtime
;
7764 stx
->stx_atime
.tv_sec
= st
->st_atime
;
7766 stx
->stx_mtime
= st
->st_mtim
;
7767 stx
->stx_atime
= st
->st_atim
;
7771 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7772 const UserPerm
& perms
, InodeRef
*inp
)
7774 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
7777 if (mask
& CEPH_SETATTR_MODE
)
7778 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
7782 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
7783 const UserPerm
& perms
)
7785 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
7786 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
7787 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
7788 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
7789 if (cct
->_conf
->client_permissions
) {
7790 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
7794 return __setattrx(in
.get(), stx
, mask
, perms
);
7797 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
7798 const UserPerm
& perms
)
7800 struct ceph_statx stx
;
7802 stat_to_statx(attr
, &stx
);
7803 mask
&= ~CEPH_SETATTR_BTIME
;
7805 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
7806 mask
&= ~CEPH_SETATTR_UID
;
7808 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
7809 mask
&= ~CEPH_SETATTR_GID
;
7812 return _setattrx(in
, &stx
, mask
, perms
);
7815 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
7816 const UserPerm
& perms
)
7818 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7819 if (!mref_reader
.is_state_satisfied())
7820 return -CEPHFS_ENOTCONN
;
7822 tout(cct
) << __func__
<< std::endl
;
7823 tout(cct
) << relpath
<< std::endl
;
7824 tout(cct
) << mask
<< std::endl
;
7826 filepath
path(relpath
);
7829 std::scoped_lock
lock(client_lock
);
7830 int r
= path_walk(path
, &in
, perms
);
7833 return _setattr(in
, attr
, mask
, perms
);
7836 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
7837 const UserPerm
& perms
, int flags
)
7839 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7840 if (!mref_reader
.is_state_satisfied())
7841 return -CEPHFS_ENOTCONN
;
7843 tout(cct
) << __func__
<< std::endl
;
7844 tout(cct
) << relpath
<< std::endl
;
7845 tout(cct
) << mask
<< std::endl
;
7847 filepath
path(relpath
);
7850 std::scoped_lock
lock(client_lock
);
7851 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
7854 return _setattrx(in
, stx
, mask
, perms
);
7857 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
7859 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7860 if (!mref_reader
.is_state_satisfied())
7861 return -CEPHFS_ENOTCONN
;
7863 tout(cct
) << __func__
<< std::endl
;
7864 tout(cct
) << fd
<< std::endl
;
7865 tout(cct
) << mask
<< std::endl
;
7867 std::scoped_lock
lock(client_lock
);
7868 Fh
*f
= get_filehandle(fd
);
7870 return -CEPHFS_EBADF
;
7871 #if defined(__linux__) && defined(O_PATH)
7872 if (f
->flags
& O_PATH
)
7873 return -CEPHFS_EBADF
;
7875 return _setattr(f
->inode
, attr
, mask
, perms
);
7878 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
7880 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7881 if (!mref_reader
.is_state_satisfied())
7882 return -CEPHFS_ENOTCONN
;
7884 tout(cct
) << __func__
<< std::endl
;
7885 tout(cct
) << fd
<< std::endl
;
7886 tout(cct
) << mask
<< std::endl
;
7888 std::scoped_lock
lock(client_lock
);
7889 Fh
*f
= get_filehandle(fd
);
7891 return -CEPHFS_EBADF
;
7892 #if defined(__linux__) && defined(O_PATH)
7893 if (f
->flags
& O_PATH
)
7894 return -CEPHFS_EBADF
;
7896 return _setattrx(f
->inode
, stx
, mask
, perms
);
7899 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
7900 frag_info_t
*dirstat
, int mask
)
7902 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7903 if (!mref_reader
.is_state_satisfied())
7904 return -CEPHFS_ENOTCONN
;
7906 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7907 tout(cct
) << "stat" << std::endl
;
7908 tout(cct
) << relpath
<< std::endl
;
7910 filepath
path(relpath
);
7913 std::scoped_lock
lock(client_lock
);
7914 int r
= path_walk(path
, &in
, perms
, true, mask
);
7917 r
= _getattr(in
, mask
, perms
);
7919 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7922 fill_stat(in
, stbuf
, dirstat
);
7923 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7927 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
7931 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7932 if (flags
& AT_NO_ATTR_SYNC
)
7935 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7936 mask
|= CEPH_CAP_PIN
;
7937 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7938 mask
|= CEPH_CAP_AUTH_SHARED
;
7939 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7940 mask
|= CEPH_CAP_LINK_SHARED
;
7941 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
7942 mask
|= CEPH_CAP_FILE_SHARED
;
7943 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
7944 mask
|= CEPH_CAP_XATTR_SHARED
;
7949 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
7950 const UserPerm
& perms
,
7951 unsigned int want
, unsigned int flags
)
7953 return statxat(CEPHFS_AT_FDCWD
, relpath
, stx
, perms
, want
, flags
);
7956 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
7957 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
7959 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7960 if (!mref_reader
.is_state_satisfied())
7961 return -CEPHFS_ENOTCONN
;
7963 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7964 tout(cct
) << __func__
<< std::endl
;
7965 tout(cct
) << relpath
<< std::endl
;
7967 filepath
path(relpath
);
7970 std::scoped_lock
lock(client_lock
);
7971 // don't follow symlinks
7972 int r
= path_walk(path
, &in
, perms
, false, mask
);
7975 r
= _getattr(in
, mask
, perms
);
7977 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7980 fill_stat(in
, stbuf
, dirstat
);
7981 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7985 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
7987 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7988 << " mode 0" << oct
<< in
->mode
<< dec
7989 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7990 memset(st
, 0, sizeof(struct stat
));
7991 if (use_faked_inos())
7992 st
->st_ino
= in
->faked_ino
;
7994 st
->st_ino
= in
->ino
;
7995 st
->st_dev
= in
->snapid
;
7996 st
->st_mode
= in
->mode
;
7997 st
->st_rdev
= in
->rdev
;
7999 switch (in
->nlink
) {
8001 st
->st_nlink
= 0; /* dir is unlinked */
8004 st
->st_nlink
= 1 /* parent dentry */
8006 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
8012 st
->st_nlink
= in
->nlink
;
8014 st
->st_uid
= in
->uid
;
8015 st
->st_gid
= in
->gid
;
8016 if (in
->ctime
> in
->mtime
) {
8017 stat_set_ctime_sec(st
, in
->ctime
.sec());
8018 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
8020 stat_set_ctime_sec(st
, in
->mtime
.sec());
8021 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
8023 stat_set_atime_sec(st
, in
->atime
.sec());
8024 stat_set_atime_nsec(st
, in
->atime
.nsec());
8025 stat_set_mtime_sec(st
, in
->mtime
.sec());
8026 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
8028 if (cct
->_conf
->client_dirsize_rbytes
)
8029 st
->st_size
= in
->rstat
.rbytes
;
8031 st
->st_size
= in
->dirstat
.size();
8032 // The Windows "stat" structure provides just a subset of the fields that are
8033 // available on Linux.
8038 st
->st_size
= in
->size
;
8040 st
->st_blocks
= (in
->size
+ 511) >> 9;
8044 st
->st_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
8048 *dirstat
= in
->dirstat
;
8052 return in
->caps_issued();
8055 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
8057 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
8058 << " mode 0" << oct
<< in
->mode
<< dec
8059 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
8060 memset(stx
, 0, sizeof(struct ceph_statx
));
8063 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
8064 * so that all bits are set.
8069 /* These are always considered to be available */
8070 stx
->stx_dev
= in
->snapid
;
8071 stx
->stx_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
8073 /* Type bits are always set, even when CEPH_STATX_MODE is not */
8074 stx
->stx_mode
= S_IFMT
& in
->mode
;
8075 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (ino_t
)in
->ino
;
8076 stx
->stx_rdev
= in
->rdev
;
8077 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
8079 if (mask
& CEPH_CAP_AUTH_SHARED
) {
8080 stx
->stx_uid
= in
->uid
;
8081 stx
->stx_gid
= in
->gid
;
8082 stx
->stx_mode
= in
->mode
;
8083 in
->btime
.to_timespec(&stx
->stx_btime
);
8084 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
8087 if (mask
& CEPH_CAP_LINK_SHARED
) {
8089 switch (in
->nlink
) {
8091 stx
->stx_nlink
= 0; /* dir is unlinked */
8094 stx
->stx_nlink
= 1 /* parent dentry */
8096 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
8102 stx
->stx_nlink
= in
->nlink
;
8104 stx
->stx_mask
|= CEPH_STATX_NLINK
;
8107 if (mask
& CEPH_CAP_FILE_SHARED
) {
8109 in
->atime
.to_timespec(&stx
->stx_atime
);
8110 in
->mtime
.to_timespec(&stx
->stx_mtime
);
8113 if (cct
->_conf
->client_dirsize_rbytes
)
8114 stx
->stx_size
= in
->rstat
.rbytes
;
8116 stx
->stx_size
= in
->dirstat
.size();
8117 stx
->stx_blocks
= 1;
8119 stx
->stx_size
= in
->size
;
8120 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
8122 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
8123 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
8126 /* Change time and change_attr both require all shared caps to view */
8127 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
8128 stx
->stx_version
= in
->change_attr
;
8129 if (in
->ctime
> in
->mtime
)
8130 in
->ctime
.to_timespec(&stx
->stx_ctime
);
8132 in
->mtime
.to_timespec(&stx
->stx_ctime
);
8133 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
8138 void Client::touch_dn(Dentry
*dn
)
8143 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
8145 return chmodat(CEPHFS_AT_FDCWD
, relpath
, mode
, 0, perms
);
8148 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
8150 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8151 if (!mref_reader
.is_state_satisfied())
8152 return -CEPHFS_ENOTCONN
;
8154 tout(cct
) << __func__
<< std::endl
;
8155 tout(cct
) << fd
<< std::endl
;
8156 tout(cct
) << mode
<< std::endl
;
8158 std::scoped_lock
lock(client_lock
);
8159 Fh
*f
= get_filehandle(fd
);
8161 return -CEPHFS_EBADF
;
8162 #if defined(__linux__) && defined(O_PATH)
8163 if (f
->flags
& O_PATH
)
8164 return -CEPHFS_EBADF
;
8167 attr
.st_mode
= mode
;
8168 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
8171 int Client::chmodat(int dirfd
, const char *relpath
, mode_t mode
, int flags
,
8172 const UserPerm
& perms
) {
8173 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8174 if (!mref_reader
.is_state_satisfied()) {
8175 return -CEPHFS_ENOTCONN
;
8178 tout(cct
) << __func__
<< std::endl
;
8179 tout(cct
) << dirfd
<< std::endl
;
8180 tout(cct
) << relpath
<< std::endl
;
8181 tout(cct
) << mode
<< std::endl
;
8182 tout(cct
) << flags
<< std::endl
;
8184 filepath
path(relpath
);
8188 std::scoped_lock
lock(client_lock
);
8189 int r
= get_fd_inode(dirfd
, &dirinode
);
8194 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8199 attr
.st_mode
= mode
;
8200 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
8203 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
8205 return chmodat(CEPHFS_AT_FDCWD
, relpath
, mode
, AT_SYMLINK_NOFOLLOW
, perms
);
8208 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8209 const UserPerm
& perms
)
8211 return chownat(CEPHFS_AT_FDCWD
, relpath
, new_uid
, new_gid
, 0, perms
);
8214 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
8216 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8217 if (!mref_reader
.is_state_satisfied())
8218 return -CEPHFS_ENOTCONN
;
8220 tout(cct
) << __func__
<< std::endl
;
8221 tout(cct
) << fd
<< std::endl
;
8222 tout(cct
) << new_uid
<< std::endl
;
8223 tout(cct
) << new_gid
<< std::endl
;
8225 std::scoped_lock
lock(client_lock
);
8226 Fh
*f
= get_filehandle(fd
);
8228 return -CEPHFS_EBADF
;
8229 #if defined(__linux__) && defined(O_PATH)
8230 if (f
->flags
& O_PATH
)
8231 return -CEPHFS_EBADF
;
8234 attr
.st_uid
= new_uid
;
8235 attr
.st_gid
= new_gid
;
8237 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
8238 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
8239 return _setattr(f
->inode
, &attr
, mask
, perms
);
8242 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8243 const UserPerm
& perms
)
8245 return chownat(CEPHFS_AT_FDCWD
, relpath
, new_uid
, new_gid
, AT_SYMLINK_NOFOLLOW
, perms
);
8248 int Client::chownat(int dirfd
, const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8249 int flags
, const UserPerm
& perms
) {
8250 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8251 if (!mref_reader
.is_state_satisfied()) {
8252 return -CEPHFS_ENOTCONN
;
8255 tout(cct
) << __func__
<< std::endl
;
8256 tout(cct
) << dirfd
<< std::endl
;
8257 tout(cct
) << relpath
<< std::endl
;
8258 tout(cct
) << new_uid
<< std::endl
;
8259 tout(cct
) << new_gid
<< std::endl
;
8260 tout(cct
) << flags
<< std::endl
;
8262 filepath
path(relpath
);
8266 std::scoped_lock
lock(client_lock
);
8267 int r
= get_fd_inode(dirfd
, &dirinode
);
8272 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8277 attr
.st_uid
= new_uid
;
8278 attr
.st_gid
= new_gid
;
8279 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
8282 static void attr_set_atime_and_mtime(struct stat
*attr
,
8283 const utime_t
&atime
,
8284 const utime_t
&mtime
)
8286 stat_set_atime_sec(attr
, atime
.tv
.tv_sec
);
8287 stat_set_atime_nsec(attr
, atime
.tv
.tv_nsec
);
8288 stat_set_mtime_sec(attr
, mtime
.tv
.tv_sec
);
8289 stat_set_mtime_nsec(attr
, mtime
.tv
.tv_nsec
);
8292 // for [l]utime() invoke the timeval variant as the timespec
8293 // variant are not yet implemented. for futime[s](), invoke
8294 // the timespec variant.
8295 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
8296 const UserPerm
& perms
)
8298 struct timeval tv
[2];
8299 tv
[0].tv_sec
= buf
->actime
;
8301 tv
[1].tv_sec
= buf
->modtime
;
8304 return utimes(relpath
, tv
, perms
);
8307 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
8308 const UserPerm
& perms
)
8310 struct timeval tv
[2];
8311 tv
[0].tv_sec
= buf
->actime
;
8313 tv
[1].tv_sec
= buf
->modtime
;
8316 return lutimes(relpath
, tv
, perms
);
8319 int Client::futime(int fd
, struct utimbuf
*buf
, const UserPerm
& perms
)
8321 struct timespec ts
[2];
8322 ts
[0].tv_sec
= buf
->actime
;
8324 ts
[1].tv_sec
= buf
->modtime
;
8327 return futimens(fd
, ts
, perms
);
8330 int Client::utimes(const char *relpath
, struct timeval times
[2],
8331 const UserPerm
& perms
)
8333 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8334 if (!mref_reader
.is_state_satisfied())
8335 return -CEPHFS_ENOTCONN
;
8337 tout(cct
) << __func__
<< std::endl
;
8338 tout(cct
) << relpath
<< std::endl
;
8339 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
8341 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
8344 filepath
path(relpath
);
8347 std::scoped_lock
lock(client_lock
);
8348 int r
= path_walk(path
, &in
, perms
);
8352 utime_t
atime(times
[0]);
8353 utime_t
mtime(times
[1]);
8355 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8356 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8359 int Client::lutimes(const char *relpath
, struct timeval times
[2],
8360 const UserPerm
& perms
)
8362 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8363 if (!mref_reader
.is_state_satisfied())
8364 return -CEPHFS_ENOTCONN
;
8366 tout(cct
) << __func__
<< std::endl
;
8367 tout(cct
) << relpath
<< std::endl
;
8368 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
8370 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
8373 filepath
path(relpath
);
8376 std::scoped_lock
lock(client_lock
);
8377 int r
= path_walk(path
, &in
, perms
, false);
8381 utime_t
atime(times
[0]);
8382 utime_t
mtime(times
[1]);
8384 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8385 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8388 int Client::futimes(int fd
, struct timeval times
[2], const UserPerm
& perms
)
8390 struct timespec ts
[2];
8391 ts
[0].tv_sec
= times
[0].tv_sec
;
8392 ts
[0].tv_nsec
= times
[0].tv_usec
* 1000;
8393 ts
[1].tv_sec
= times
[1].tv_sec
;
8394 ts
[1].tv_nsec
= times
[1].tv_usec
* 1000;
8396 return futimens(fd
, ts
, perms
);
8399 int Client::futimens(int fd
, struct timespec times
[2], const UserPerm
& perms
)
8401 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8402 if (!mref_reader
.is_state_satisfied())
8403 return -CEPHFS_ENOTCONN
;
8405 tout(cct
) << __func__
<< std::endl
;
8406 tout(cct
) << fd
<< std::endl
;
8407 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
8409 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
8412 std::scoped_lock
lock(client_lock
);
8413 Fh
*f
= get_filehandle(fd
);
8415 return -CEPHFS_EBADF
;
8416 #if defined(__linux__) && defined(O_PATH)
8417 if (f
->flags
& O_PATH
)
8418 return -CEPHFS_EBADF
;
8421 utime_t
atime(times
[0]);
8422 utime_t
mtime(times
[1]);
8424 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8425 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8428 int Client::utimensat(int dirfd
, const char *relpath
, struct timespec times
[2], int flags
,
8429 const UserPerm
& perms
) {
8430 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8431 if (!mref_reader
.is_state_satisfied()) {
8432 return -CEPHFS_ENOTCONN
;
8435 tout(cct
) << __func__
<< std::endl
;
8436 tout(cct
) << dirfd
<< std::endl
;
8437 tout(cct
) << relpath
<< std::endl
;
8438 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
8440 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
8442 tout(cct
) << flags
<< std::endl
;
8444 filepath
path(relpath
);
8448 std::scoped_lock
lock(client_lock
);
8449 int r
= get_fd_inode(dirfd
, &dirinode
);
8454 #if defined(__linux__) && defined(O_PATH)
8455 if (flags
& O_PATH
) {
8456 return -CEPHFS_EBADF
;
8460 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8465 utime_t
atime(times
[0]);
8466 utime_t
mtime(times
[1]);
8468 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8469 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8472 int Client::flock(int fd
, int operation
, uint64_t owner
)
8474 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8475 if (!mref_reader
.is_state_satisfied())
8476 return -CEPHFS_ENOTCONN
;
8478 tout(cct
) << __func__
<< std::endl
;
8479 tout(cct
) << fd
<< std::endl
;
8480 tout(cct
) << operation
<< std::endl
;
8481 tout(cct
) << owner
<< std::endl
;
8483 std::scoped_lock
lock(client_lock
);
8484 Fh
*f
= get_filehandle(fd
);
8486 return -CEPHFS_EBADF
;
8488 return _flock(f
, operation
, owner
);
8491 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
8493 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8494 if (!mref_reader
.is_state_satisfied())
8495 return -CEPHFS_ENOTCONN
;
8497 tout(cct
) << __func__
<< std::endl
;
8498 tout(cct
) << relpath
<< std::endl
;
8500 filepath
path(relpath
);
8503 std::scoped_lock
lock(client_lock
);
8504 int r
= path_walk(path
, &in
, perms
, true);
8507 if (cct
->_conf
->client_permissions
) {
8508 int r
= may_open(in
.get(), O_RDONLY
, perms
);
8512 r
= _opendir(in
.get(), dirpp
, perms
);
8513 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8514 if (r
!= -CEPHFS_ENOTDIR
)
8515 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
8519 int Client::fdopendir(int dirfd
, dir_result_t
**dirpp
, const UserPerm
&perms
) {
8520 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8521 if (!mref_reader
.is_state_satisfied()) {
8522 return -CEPHFS_ENOTCONN
;
8525 tout(cct
) << __func__
<< std::endl
;
8526 tout(cct
) << dirfd
<< std::endl
;
8529 std::scoped_lock
locker(client_lock
);
8530 int r
= get_fd_inode(dirfd
, &dirinode
);
8535 if (cct
->_conf
->client_permissions
) {
8536 r
= may_open(dirinode
.get(), O_RDONLY
, perms
);
8541 r
= _opendir(dirinode
.get(), dirpp
, perms
);
8542 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8543 if (r
!= -CEPHFS_ENOTDIR
) {
8544 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
8549 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
8552 return -CEPHFS_ENOTDIR
;
8553 *dirpp
= new dir_result_t(in
, perms
);
8554 opened_dirs
.insert(*dirpp
);
8555 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
8560 int Client::closedir(dir_result_t
*dir
)
8562 tout(cct
) << __func__
<< std::endl
;
8563 tout(cct
) << (uintptr_t)dir
<< std::endl
;
8565 ldout(cct
, 3) << __func__
<< "(" << dir
<< ") = 0" << dendl
;
8566 std::scoped_lock
lock(client_lock
);
8571 void Client::_closedir(dir_result_t
*dirp
)
8573 ldout(cct
, 10) << __func__
<< "(" << dirp
<< ")" << dendl
;
8576 ldout(cct
, 10) << __func__
<< " detaching inode " << dirp
->inode
<< dendl
;
8577 dirp
->inode
.reset();
8579 _readdir_drop_dirp_buffer(dirp
);
8580 opened_dirs
.erase(dirp
);
8584 void Client::rewinddir(dir_result_t
*dirp
)
8586 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ")" << dendl
;
8588 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8589 if (!mref_reader
.is_state_satisfied())
8592 std::scoped_lock
lock(client_lock
);
8593 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
8594 _readdir_drop_dirp_buffer(d
);
8598 loff_t
Client::telldir(dir_result_t
*dirp
)
8600 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
8601 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ") = " << d
->offset
<< dendl
;
8605 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
8607 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ", " << offset
<< ")" << dendl
;
8609 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8610 if (!mref_reader
.is_state_satisfied())
8613 std::scoped_lock
lock(client_lock
);
8615 if (offset
== dirp
->offset
)
8618 if (offset
> dirp
->offset
)
8619 dirp
->release_count
= 0; // bump if we do a forward seek
8621 dirp
->ordered_count
= 0; // disable filling readdir cache
8623 if (dirp
->hash_order()) {
8624 if (dirp
->offset
> offset
) {
8625 _readdir_drop_dirp_buffer(dirp
);
8630 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
8631 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
8632 _readdir_drop_dirp_buffer(dirp
);
8637 dirp
->offset
= offset
;
8642 // ino_t d_ino; /* inode number */
8643 // off_t d_off; /* offset to the next dirent */
8644 // unsigned short d_reclen; /* length of this record */
8645 // unsigned char d_type; /* type of file */
8646 // char d_name[256]; /* filename */
8648 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
8650 strncpy(de
->d_name
, name
, 255);
8651 de
->d_name
[255] = '\0';
8652 #if !defined(__CYGWIN__) && !(defined(_WIN32))
8654 #if !defined(__APPLE__) && !defined(__FreeBSD__)
8655 de
->d_off
= next_off
;
8658 de
->d_type
= IFTODT(type
);
8659 ldout(cct
, 10) << __func__
<< " '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
8660 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
8664 void Client::_readdir_next_frag(dir_result_t
*dirp
)
8666 frag_t fg
= dirp
->buffer_frag
;
8668 if (fg
.is_rightmost()) {
8669 ldout(cct
, 10) << __func__
<< " advance from " << fg
<< " to END" << dendl
;
8676 ldout(cct
, 10) << __func__
<< " advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
8678 if (dirp
->hash_order()) {
8680 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
8681 if (dirp
->offset
< new_offset
) // don't decrease offset
8682 dirp
->offset
= new_offset
;
8684 dirp
->last_name
.clear();
8685 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8686 _readdir_rechoose_frag(dirp
);
8690 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
8692 ceph_assert(dirp
->inode
);
8694 if (dirp
->hash_order())
8697 frag_t cur
= frag_t(dirp
->offset_high());
8698 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
8700 ldout(cct
, 10) << __func__
<< " frag " << cur
<< " maps to " << fg
<< dendl
;
8701 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8702 dirp
->last_name
.clear();
8703 dirp
->next_offset
= 2;
8707 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
8709 ldout(cct
, 10) << __func__
<< " " << dirp
<< dendl
;
8710 dirp
->buffer
.clear();
8713 int Client::_readdir_get_frag(dir_result_t
*dirp
)
8716 ceph_assert(dirp
->inode
);
8718 // get the current frag.
8720 if (dirp
->hash_order())
8721 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
8723 fg
= frag_t(dirp
->offset_high());
8725 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
8726 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
8728 int op
= CEPH_MDS_OP_READDIR
;
8729 if (dirp
->inode
&& dirp
->inode
->snapid
== CEPH_SNAPDIR
)
8730 op
= CEPH_MDS_OP_LSSNAP
;
8732 InodeRef
& diri
= dirp
->inode
;
8734 MetaRequest
*req
= new MetaRequest(op
);
8736 diri
->make_nosnap_relative_path(path
);
8737 req
->set_filepath(path
);
8738 req
->set_inode(diri
.get());
8739 req
->head
.args
.readdir
.frag
= fg
;
8740 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
8741 if (dirp
->last_name
.length()) {
8742 req
->path2
.set_path(dirp
->last_name
);
8743 } else if (dirp
->hash_order()) {
8744 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
8749 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
8751 if (res
== -CEPHFS_EAGAIN
) {
8752 ldout(cct
, 10) << __func__
<< " got EAGAIN, retrying" << dendl
;
8753 _readdir_rechoose_frag(dirp
);
8754 return _readdir_get_frag(dirp
);
8758 ldout(cct
, 10) << __func__
<< " " << dirp
<< " got frag " << dirp
->buffer_frag
8759 << " size " << dirp
->buffer
.size() << dendl
;
8761 ldout(cct
, 10) << __func__
<< " got error " << res
<< ", setting end flag" << dendl
;
8768 struct dentry_off_lt
{
8769 bool operator()(const Dentry
* dn
, int64_t off
) const {
8770 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
8774 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
8775 int caps
, bool getref
)
8777 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
8778 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
8779 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
8781 Dir
*dir
= dirp
->inode
->dir
;
8784 ldout(cct
, 10) << " dir is empty" << dendl
;
8789 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
8790 dir
->readdir_cache
.end(),
8791 dirp
->offset
, dentry_off_lt());
8796 if (!dirp
->inode
->is_complete_and_ordered())
8797 return -CEPHFS_EAGAIN
;
8798 if (pd
== dir
->readdir_cache
.end())
8801 if (dn
->inode
== NULL
) {
8802 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
8806 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
8807 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
8812 int idx
= pd
- dir
->readdir_cache
.begin();
8813 if (dn
->inode
->is_dir()) {
8814 mask
|= CEPH_STAT_RSTAT
;
8816 int r
= _getattr(dn
->inode
, mask
, dirp
->perms
);
8820 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
8821 pd
= dir
->readdir_cache
.begin() + idx
;
8822 if (pd
>= dir
->readdir_cache
.end() || *pd
!= dn
)
8823 return -CEPHFS_EAGAIN
;
8825 struct ceph_statx stx
;
8827 fill_statx(dn
->inode
, caps
, &stx
);
8829 uint64_t next_off
= dn
->offset
+ 1;
8830 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8832 if (pd
== dir
->readdir_cache
.end())
8833 next_off
= dir_result_t::END
;
8837 in
= dn
->inode
.get();
8841 dn_name
= dn
->name
; // fill in name while we have lock
8843 client_lock
.unlock();
8844 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
8846 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
8847 << " = " << r
<< dendl
;
8852 dirp
->offset
= next_off
;
8854 dirp
->next_offset
= 2;
8856 dirp
->next_offset
= dirp
->offset_low();
8857 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
8858 dirp
->release_count
= 0; // last_name no longer match cache index
8863 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
8868 int Client::readdir_r_cb(dir_result_t
*d
, add_dirent_cb_t cb
, void *p
,
8869 unsigned want
, unsigned flags
, bool getref
)
8871 int caps
= statx_to_mask(flags
, want
);
8873 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8874 if (!mref_reader
.is_state_satisfied())
8875 return -CEPHFS_ENOTCONN
;
8877 std::unique_lock
cl(client_lock
);
8879 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
8881 ldout(cct
, 10) << __func__
<< " " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
8882 << dec
<< " at_end=" << dirp
->at_end()
8883 << " hash_order=" << dirp
->hash_order() << dendl
;
8886 struct ceph_statx stx
;
8887 memset(&de
, 0, sizeof(de
));
8888 memset(&stx
, 0, sizeof(stx
));
8890 InodeRef
& diri
= dirp
->inode
;
8895 if (dirp
->offset
== 0) {
8896 ldout(cct
, 15) << " including ." << dendl
;
8897 ceph_assert(diri
->dentries
.size() < 2); // can't have multiple hard-links to a dir
8898 uint64_t next_off
= 1;
8901 r
= _getattr(diri
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
8905 fill_statx(diri
, caps
, &stx
);
8906 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
8908 Inode
*inode
= NULL
;
8915 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8920 dirp
->offset
= next_off
;
8924 if (dirp
->offset
== 1) {
8925 ldout(cct
, 15) << " including .." << dendl
;
8926 uint64_t next_off
= 2;
8928 if (diri
->dentries
.empty())
8931 in
= diri
->get_first_parent()->dir
->parent_inode
;
8934 r
= _getattr(in
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
8938 fill_statx(in
, caps
, &stx
);
8939 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
8941 Inode
*inode
= NULL
;
8948 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8953 dirp
->offset
= next_off
;
8958 // can we read from our cache?
8959 ldout(cct
, 10) << "offset " << hex
<< dirp
->offset
<< dec
8960 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
8961 << dirp
->inode
->is_complete_and_ordered()
8962 << " issued " << ccap_string(dirp
->inode
->caps_issued())
8964 if (dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
8965 dirp
->inode
->is_complete_and_ordered() &&
8966 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
8967 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
8968 if (err
!= -CEPHFS_EAGAIN
)
8976 bool check_caps
= true;
8977 if (!dirp
->is_cached()) {
8978 int r
= _readdir_get_frag(dirp
);
8981 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
8982 // different than the requested one. (our dirfragtree was outdated)
8985 frag_t fg
= dirp
->buffer_frag
;
8987 ldout(cct
, 10) << "frag " << fg
<< " buffer size " << dirp
->buffer
.size()
8988 << " offset " << hex
<< dirp
->offset
<< dendl
;
8990 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
8991 dirp
->offset
, dir_result_t::dentry_off_lt());
8992 it
!= dirp
->buffer
.end();
8994 dir_result_t::dentry
&entry
= *it
;
8996 uint64_t next_off
= entry
.offset
+ 1;
9001 if(entry
.inode
->is_dir()){
9002 mask
|= CEPH_STAT_RSTAT
;
9004 r
= _getattr(entry
.inode
, mask
, dirp
->perms
);
9009 fill_statx(entry
.inode
, caps
, &stx
);
9010 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
9012 Inode
*inode
= NULL
;
9014 inode
= entry
.inode
.get();
9019 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
9022 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
9023 << " = " << r
<< dendl
;
9027 dirp
->offset
= next_off
;
9032 if (dirp
->next_offset
> 2) {
9033 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
9034 _readdir_drop_dirp_buffer(dirp
);
9038 if (!fg
.is_rightmost()) {
9040 _readdir_next_frag(dirp
);
9044 if (diri
->shared_gen
== dirp
->start_shared_gen
&&
9045 diri
->dir_release_count
== dirp
->release_count
) {
9046 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
9047 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
9049 ceph_assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
9050 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
9052 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
9054 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
9055 diri
->flags
|= I_COMPLETE
;
9067 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
9069 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
9076 * 1 if we got a dirent
9077 * 0 for end of directory
9081 struct single_readdir
{
9083 struct ceph_statx
*stx
;
9088 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
9089 struct ceph_statx
*stx
, off_t off
,
9092 single_readdir
*c
= static_cast<single_readdir
*>(p
);
9095 return -1; // already filled this dirent
9105 struct dirent
*Client::readdir(dir_result_t
*d
)
9115 // our callback fills the dirent and sets sr.full=true on first
9116 // call, and returns -1 the second time around.
9117 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
9119 errno
= -ret
; // this sucks.
9120 return (dirent
*) NULL
;
9125 return (dirent
*) NULL
;
9128 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
9129 struct ceph_statx
*stx
, unsigned want
,
9130 unsigned flags
, Inode
**out
)
9138 // our callback fills the dirent and sets sr.full=true on first
9139 // call, and returns -1 the second time around.
9140 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
9152 struct getdents_result
{
9159 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
9160 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
9162 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
9168 dlen
= strlen(de
->d_name
) + 1;
9170 if (c
->pos
+ dlen
> c
->buflen
)
9171 return -1; // doesn't fit
9174 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
9176 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
9182 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
9187 gr
.fullent
= fullent
;
9190 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
9192 if (r
< 0) { // some error
9193 if (r
== -1) { // buffer ran out of space
9194 if (gr
.pos
) { // but we got some entries already!
9196 } // or we need a larger buffer
9197 return -CEPHFS_ERANGE
;
9198 } else { // actual error, return it
9207 struct getdir_result
{
9208 list
<string
> *contents
;
9212 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
9214 getdir_result
*r
= static_cast<getdir_result
*>(p
);
9216 r
->contents
->push_back(de
->d_name
);
9221 int Client::getdir(const char *relpath
, list
<string
>& contents
,
9222 const UserPerm
& perms
)
9224 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
9225 tout(cct
) << "getdir" << std::endl
;
9226 tout(cct
) << relpath
<< std::endl
;
9229 int r
= opendir(relpath
, &d
, perms
);
9234 gr
.contents
= &contents
;
9236 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
9246 /****** file i/o **********/
9248 // common parts for open and openat. call with client_lock locked.
9249 int Client::create_and_open(int dirfd
, const char *relpath
, int flags
,
9250 const UserPerm
& perms
, mode_t mode
, int stripe_unit
,
9251 int stripe_count
, int object_size
, const char *data_pool
,
9252 std::string alternate_name
) {
9253 ceph_assert(ceph_mutex_is_locked(client_lock
));
9254 int cflags
= ceph_flags_sys2wire(flags
);
9255 tout(cct
) << cflags
<< std::endl
;
9259 #if defined(__linux__) && defined(O_PATH)
9260 /* When the O_PATH is being specified, others flags than O_DIRECTORY
9261 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
9262 * in kernel (fs/open.c). */
9264 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
9267 filepath
path(relpath
);
9269 bool created
= false;
9270 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
9271 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
9272 int mask
= ceph_caps_for_mode(ceph_flags_to_mode(cflags
));
9274 InodeRef dirinode
= nullptr;
9275 int r
= get_fd_inode(dirfd
, &dirinode
);
9280 r
= path_walk(path
, &in
, perms
, followsym
, mask
, dirinode
);
9281 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
9282 return -CEPHFS_EEXIST
;
9284 #if defined(__linux__) && defined(O_PATH)
9285 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
9287 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
9289 return -CEPHFS_ELOOP
;
9291 if (r
== -CEPHFS_ENOENT
&& (flags
& O_CREAT
)) {
9292 filepath dirpath
= path
;
9293 string dname
= dirpath
.last_dentry();
9294 dirpath
.pop_dentry();
9296 r
= path_walk(dirpath
, &dir
, perms
, true,
9297 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0, dirinode
);
9301 if (cct
->_conf
->client_permissions
) {
9302 r
= may_create(dir
.get(), perms
);
9306 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
9307 stripe_count
, object_size
, data_pool
, &created
, perms
,
9308 std::move(alternate_name
));
9314 // posix says we can only check permissions of existing files
9315 if (cct
->_conf
->client_permissions
) {
9316 r
= may_open(in
.get(), flags
, perms
);
9323 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
9325 // allocate a integer file descriptor
9328 ceph_assert(fd_map
.count(r
) == 0);
9336 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
9337 mode_t mode
, int stripe_unit
, int stripe_count
,
9338 int object_size
, const char *data_pool
, std::string alternate_name
)
9340 return openat(CEPHFS_AT_FDCWD
, relpath
, flags
, perms
, mode
, stripe_unit
,
9341 stripe_count
, object_size
, data_pool
, alternate_name
);
9344 int Client::openat(int dirfd
, const char *relpath
, int flags
, const UserPerm
& perms
,
9345 mode_t mode
, int stripe_unit
, int stripe_count
, int object_size
,
9346 const char *data_pool
, std::string alternate_name
) {
9347 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9348 if (!mref_reader
.is_state_satisfied()) {
9349 return -CEPHFS_ENOTCONN
;
9352 ldout(cct
, 3) << "openat enter(" << relpath
<< ")" << dendl
;
9353 tout(cct
) << dirfd
<< std::endl
;
9354 tout(cct
) << relpath
<< std::endl
;
9355 tout(cct
) << flags
<< std::endl
;
9356 tout(cct
) << mode
<< std::endl
;
9358 std::scoped_lock
locker(client_lock
);
9359 int r
= create_and_open(dirfd
, relpath
, flags
, perms
, mode
, stripe_unit
, stripe_count
,
9360 object_size
, data_pool
, alternate_name
);
9362 tout(cct
) << r
<< std::endl
;
9363 ldout(cct
, 3) << "openat exit(" << relpath
<< ")" << dendl
;
9367 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
9368 const UserPerm
& perms
)
9370 ldout(cct
, 3) << __func__
<< " enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
9372 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9373 if (!mref_reader
.is_state_satisfied())
9374 return -CEPHFS_ENOTCONN
;
9376 std::scoped_lock
lock(client_lock
);
9377 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
9379 req
->set_filepath(path
);
9381 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
9383 sprintf(f
, "%u", h
);
9384 filepath
path2(dirino
);
9385 path2
.push_dentry(string(f
));
9386 req
->set_filepath2(path2
);
9388 int r
= make_request(req
, perms
, NULL
, NULL
,
9389 rand() % mdsmap
->get_num_in_mds());
9390 ldout(cct
, 3) << __func__
<< " exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
/**
 * Load inode into local cache.
 *
 * If inode pointer is non-NULL, and take a reference on
 * the resulting Inode object in one operation, so that caller
 * can safely assume inode will still be there after return.
 */
int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;

  // Requires at least the MOUNTING state.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Reserved vinos (e.g. special snapdir encodings) are never valid targets.
  if (is_reserved_vino(vino))
    return -CEPHFS_ESTALE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(vino.ino);
  req->set_filepath(path);

  /*
   * The MDS expects either a "real" snapid here or 0. The special value
   * carveouts for the snapid are all at the end of the range so we can
   * just look for any snapid below this value.
   */
  if (vino.snapid < CEPH_NOSNAP)
    req->head.args.lookupino.snapid = vino.snapid;

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    // On success the reply handler has inserted the inode into inode_map;
    // hand the caller a referenced pointer.
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    // NOTE(review): reconstructed from fragmented source — upstream takes a
    // reference here (_ll_get) before returning the inode; confirm.
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
  return r;
}
// Public wrapper for _lookup_vino(): look up a head (non-snapshot) inode by
// inode number, taking the client lock.
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  vinodeno_t vino(ino, CEPH_NOSNAP);
  std::scoped_lock lock(client_lock);
  return _lookup_vino(vino, perms, inode);
}
/**
 * Find the parent inode of `ino` and insert it into
 * our cache. Conditionally also set `parent` to a referenced
 * Inode* if caller provides non-NULL value.
 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      // NOTE(review): reconstructed from fragmented source — upstream pins the
      // returned parent with _ll_get() and NULLs *parent on error; confirm.
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
/**
 * Populate the parent dentry for `ino`, provided it is
 * a child of `parent`.
 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  // Requires at least the MOUNTING state.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // LOOKUPNAME: filepath2 identifies the parent dir, filepath the child;
  // the reply links the dentry in our cache.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
// Public wrapper for _lookup_name(), taking the client lock.
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::scoped_lock lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
// Allocate and initialize a new file handle (Fh) for an already-opened inode.
// Sets up readahead limits from config and the file layout. For snapshot
// inodes, takes an extra snap_cap_refs reference (snaps are immutable, so the
// handle shares the inode's static caps).
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  ceph_assert(in);
  Fh *f = new Fh(in, flags, cmode, fd_gen, perms);

  ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
		  << ccap_string(in->caps_issued()) << dendl;
  }

  const auto& conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  // Max readahead is the tighter of the byte limit and the period-count limit;
  // unlimited if neither is configured.
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // Align readahead to the layout period and stripe unit.
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
// Release a file handle: drop the open ref (flushing dirty data for head
// inodes), release any file locks held through the handle, and return any
// asynchronous error (e.g. from background flushes) recorded on the handle.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  if (in->snapid == CEPH_NOSNAP) {
    // Head inode: dropping the last open ref in this mode triggers a flush
    // of buffered data.
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // Snapshot inode: just drop the snap cap reference taken in _create_fh.
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
// Drop one reference on the file handle; delete it when the count hits zero.
void Client::_put_fh(Fh *f)
{
  int left = f->put();
  if (!left) {
    delete f;
  }
}
// Core open path: take an open ref on the inode, issue an MDS OPEN request
// unless we already hold sufficient caps, honor delegation semantics, and on
// success hand back a new Fh via *fhp.
// Returns 0 (or positive) on success, negative CEPHFS error on failure.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  // Snapshots are read-only: refuse any write-flavored open on a snap inode.
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -CEPHFS_EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // We already have the caps this open mode needs; just nudge the MDS about
    // our (possibly changed) wanted set.
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      // Use a temporary stack Fh just to wait on the caps.
      Fh fh(in, flags, cmode, fd_gen, perms);
      result = get_caps(&fh, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
			  " . Denying open: " <<
			  cpp_strerror(result) << dendl;
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // NOTE(review): reconstructed from fragmented source — success/error
  // epilogue follows upstream: create the Fh on success, otherwise drop the
  // open ref taken above; confirm against upstream Client.cc.
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
// Re-establish file caps for an inode after e.g. session recovery. If we
// still hold any caps (and either want no write caps or have an auth cap),
// a cap check suffices; otherwise replay an MDS OPEN in the wanted mode.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // Map the wanted cap bits back to open flags for the replayed OPEN.
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;

  int ret = make_request(req, perms);
  return ret;
}
// Internal close: look up the fd, release its file handle, and retire the fd.
// Returns the handle's pending async error (0 if none), or -CEPHFS_EBADF for
// an unknown fd.
int Client::_close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -CEPHFS_EBADF;
  int err = _release_fh(fh);
  // NOTE(review): reconstructed from fragmented source — upstream removes the
  // fd from fd_map and returns it to the allocator here; confirm.
  fd_map.erase(fd);
  put_fd(fd);
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
// Public close(): check mount state, take the client lock, delegate to _close.
int Client::close(int fd) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);
  return _close(fd);
}
// Public lseek(): validate mount state and fd, reject O_PATH handles on
// Linux, then delegate to _lseek.
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles permit no I/O-related operations.
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  return _lseek(f, offset, whence);
}
// Compute and set the new file position for an lseek. SEEK_END and the
// Linux SEEK_DATA/SEEK_HOLE extensions need a fresh size, so they trigger a
// getattr first. SEEK_DATA/SEEK_HOLE are implemented coarsely: the whole
// file is treated as one data extent (no holes reported within [0, size)).
// NOTE(review): reconstructed from fragmented source — the switch skeleton
// below follows upstream Client.cc; confirm case bodies.
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  bool whence_check = false;
  loff_t pos = -1;

  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;
#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif
#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    // These modes depend on the current file size; refresh it.
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -CEPHFS_ENXIO;
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -CEPHFS_ENXIO;
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -CEPHFS_EINVAL;
  }

  if (pos < 0) {
    return -CEPHFS_EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
// Acquire the per-handle file-position lock. If another thread holds it (or
// older waiters are queued), enqueue a condition variable and block until we
// are the front waiter and the lock is free. FIFO order guarantees fairness.
// Caller must hold client_lock; the wait temporarily releases it via the
// adopt_lock unique_lock.
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    std::unique_lock l{client_lock, std::adopt_lock};
    // Wake only when the lock is free AND we are the oldest waiter.
    cond.wait(l, [f, me=&cond] {
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    // NOTE(review): reconstructed from fragmented source — upstream releases
    // the adopted lock here (l.release()) so client_lock stays held; confirm.
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
// Release the per-handle file-position lock and wake the oldest waiter, if
// any. Must be called with client_lock held.
void Client::unlock_fh_pos(Fh *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 10) << __func__ << " " << f << dendl;
  f->pos_locked = false;
  if (!f->pos_waiters.empty()) {
    // only wake up the oldest waiter
    auto cond = f->pos_waiters.front();
    cond->notify_one();
  }
}
// Migrate MDS-inline file data out to the first RADOS object. Issues two
// mutations: (1) create the object if absent, (2) guarded by a cmpxattr on
// "inline_version" (only proceed if ours is newer), write the inline bytes
// and record the version as an xattr. `onfinish` completes with the second
// mutation. Completes immediately with 0 if there is no inline data.
// NOTE(review): reconstructed from fragmented source — mutate() argument
// lists (op, flags, completion) follow upstream; confirm.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // Guard: only write if the object's recorded inline_version is older.
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
// blocking osd interface

// Public read(): validate state and fd, clamp the request to INT_MAX (the
// return type is int), perform the read into a bufferlist, then copy out to
// the caller's buffer outside the client lock.
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  std::unique_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // Copy to the user buffer without holding the client lock.
    lock.unlock();
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }
  return r;
}
// Vectored positional read; rejects a negative iov count, then delegates to
// the shared preadv/pwritev path with write=false.
int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
{
  if (iovcnt < 0)
    return -CEPHFS_EINVAL;
  return _preadv_pwritev(fd, iov, iovcnt, offset, false);
}
// Core read path. Handles: implicit file-position reads (offset < 0), inline
// data served straight from the inode (or kicked out via uninline_data when
// we lack Fc), cached/async reads through the object cacher when caps allow,
// and synchronous OSD reads otherwise (retrying on a short read if the file
// grew). Returns bytes read or a negative CEPHFS error.
// NOTE(review): reconstructed from fragmented source — label/goto layout
// (retry/success/done) follows upstream Client.cc; confirm.
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t rc = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -CEPHFS_EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    // Read at the current file position; serialize position updates.
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    // Inline state unknown; fetch it from the MDS.
    auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      rc = r;
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  {
    auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
    if (r < 0) {
      rc = r;
      goto done;
    }
  }
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // Can't serve inline data without Fc; push it out to RADOS first.
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Serve the read directly from the inline blob, zero-filling the gap
      // between the inline length and EOF.
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
        rc = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
        rc = endoff - offset;
      } else {
        rc = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    // Cached path. O_RSYNC demands dirty data be flushed before reading.
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    rc = _read_async(f, offset, size, bl);
    if (rc < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    rc = _read_sync(f, offset, size, bl, &checkeof);
    if (rc < 0)
      goto done;
    if (checkeof) {
      // eof? short read. Drop caps, refresh the size, and retry if the file
      // is actually longer than what we read.
      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;

      auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0) {
        rc = r;
        goto done;
      }

      // eof? short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(rc >= 0);
  update_read_io_size(bl->length());
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + rc;
  }

  lat = ceph_clock_now();
  lat -= start;
  if (logger)
    logger->tinc(l_c_read, lat);

done:
  // done!
  if (onuninline) {
    // Wait for the uninline mutation outside the client lock.
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();
    if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      rc = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return rc;
}
// Readahead completion context: pins the Fh and counts itself as a pending
// readahead for the handle's Readahead state machine.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
// Undo the constructor: drop the pending-readahead count and the Fh pin.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
// Called when the readahead OSD read completes: release the cap refs taken
// when the readahead was issued and account the bytes read.
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
  if (r > 0) {
    client->update_read_io_size(r);
  }
}
// Read through the object cacher (possibly blocking on a cache miss), then
// opportunistically kick off readahead for the following extent. The caller
// holds CEPH_CAP_FILE_CACHE; readahead takes its own RD|CACHE refs which are
// released by C_Readahead::finish.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // Cache miss: wait for the OSD read outside the client lock, holding a
    // CACHE ref so the pages can't be released underneath us.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    update_read_io_size(bl->length());
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      // NULL bl: we only want the extent pulled into the cache.
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
// Synchronous (uncached) read straight from the OSDs via the Filer. Loops
// until the request is satisfied; on a short read inside the known file size
// it zero-fills up to EOF, and sets *checkeof so the caller can re-check the
// size and retry. Returns bytes delivered or a negative error.
// NOTE(review): reconstructed from fragmented source — loop skeleton and the
// wait_and_copy return convention follow upstream Client.cc; confirm.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // 0 success, 1 continue and < 0 error happen.
  auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
    int r = onfinish.wait();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -CEPHFS_ENOENT)
      r = 0;
    if (r < 0)
      return r;

    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	auto z = buffer::ptr_node::create(some);
	z->zero();
	bl->push_back(std::move(z));
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return 0;
      }

      *checkeof = true;
      return 0;
    }
    return 1;
  };

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      &onfinish);
    client_lock.unlock();
    int r = wait_and_copy(onfinish, tbl, wanted);
    client_lock.lock();
    if (!r)
      return read;
    if (r < 0)
      return r;
  }
  return read;
}
// Public write(): validate state and fd, clamp the request to INT_MAX (the
// return type is int), and delegate to _write with a flat buffer (no iov).
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _write(fh, offset, size, buf, NULL, false);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}
// Vectored positional write; rejects a negative iov count, then delegates to
// the shared preadv/pwritev path with write=true.
int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
{
  if (iovcnt < 0)
    return -CEPHFS_EINVAL;
  return _preadv_pwritev(fd, iov, iovcnt, offset, true);
}
// Shared implementation for preadv/pwritev with client_lock already held.
// Sums the iov lengths, optionally clamps the total to INT_MAX (for the
// int-returning public APIs), then either writes the iov directly or reads
// into a bufferlist and scatters it back into the iov segments.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
				   unsigned iovcnt, int64_t offset,
				   bool write, bool clamp_to_int)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Copy out to the user's iovecs without holding the client lock.
    client_lock.unlock();
    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist
       * does not have enough data to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    client_lock.lock();
    return r;
  }
}
// fd-level entry for preadv/pwritev: validate mount state, resolve the fd,
// take the client lock, then run the locked implementation (clamped to int).
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  std::scoped_lock cl(client_lock);
  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -CEPHFS_EBADF;
  return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
}
// Core write path. Validates size/pool/mode, handles O_APPEND and implicit
// file-position writes, enforces quota, copies the data into a private
// bufferlist (the write may be resubmitted asynchronously), acquires Fw (and
// As for the setuid/setgid clearing), updates or evicts inline data, then
// writes either through the object cacher (buffered) or synchronously via
// the Filer. On success updates size/mtime/ctime, marks Fw dirty, and
// returns the number of bytes written.
// NOTE(review): reconstructed from fragmented source — the success/done
// label layout and cap-ref pairing follow upstream Client.cc; confirm.
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	               const struct iovec *iov, int iovcnt)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -CEPHFS_EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -CEPHFS_ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    // Remember the position to restore after the write completes.
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota before doing any work
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // Write would outgrow the inline limits (or we lack Fb): push the
      // inline data out to RADOS and fall through to the normal write.
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Splice the new bytes into the inline blob in-place.
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  update_write_io_size(size);
  // time
  lat = ceph_clock_now();
  lat -= start;
  if (logger)
    logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    // Restore the implicit file position past the written range.
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
// Surface any asynchronous error (e.g. from background writeback) recorded
// on the handle; used by flush-style entry points. Does not force new I/O.
int Client::_flush(Fh *f)
{
  Inode *in = f->inode.get();
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  return err;
}
// Path-based truncate, implemented as a size-only setattrx.
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
// fd-based truncate: validate state, fd, and write mode, then apply a
// size-only setattr on the handle's inode.
int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  // POSIX: ftruncate requires a handle opened for writing.
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
// Public fsync(): run _fsync on the handle, then fold in (and clear) any
// async error recorded on the Fh so it is reported exactly once.
int Client::fsync(int fd, bool syncdataonly)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we shoudl be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
                  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
                  << r << dendl;
    f->take_async_err();
  }
  return r;
}
// Inode-level fsync: flush buffered data (through the object cacher when
// enabled), and unless syncdataonly, also flush dirty caps and wait for any
// unsafe MDS requests to become safe. Returns the writeback result.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync(in);

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    // NOTE(review): reconstructed from fragmented source — upstream pins the
    // request across the wait (req->get()/put_request); confirm.
    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  if (logger)
    logger->tinc(l_c_fsync, lat);

  return r;
}
10730 int Client::_fsync(Fh
*f
, bool syncdataonly
)
10732 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
10733 return _fsync(f
->inode
.get(), syncdataonly
);
10736 int Client::fstat(int fd
, struct stat
*stbuf
, const UserPerm
& perms
, int mask
)
10738 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10739 if (!mref_reader
.is_state_satisfied())
10740 return -CEPHFS_ENOTCONN
;
10742 tout(cct
) << "fstat mask " << hex
<< mask
<< dec
<< std::endl
;
10743 tout(cct
) << fd
<< std::endl
;
10745 std::scoped_lock
lock(client_lock
);
10746 Fh
*f
= get_filehandle(fd
);
10748 return -CEPHFS_EBADF
;
10749 int r
= _getattr(f
->inode
, mask
, perms
);
10752 fill_stat(f
->inode
, stbuf
, NULL
);
10753 ldout(cct
, 5) << "fstat(" << fd
<< ", " << stbuf
<< ") = " << r
<< dendl
;
10757 int Client::fstatx(int fd
, struct ceph_statx
*stx
, const UserPerm
& perms
,
10758 unsigned int want
, unsigned int flags
)
10760 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10761 if (!mref_reader
.is_state_satisfied())
10762 return -CEPHFS_ENOTCONN
;
10764 tout(cct
) << "fstatx flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
10765 tout(cct
) << fd
<< std::endl
;
10767 std::scoped_lock
lock(client_lock
);
10768 Fh
*f
= get_filehandle(fd
);
10770 return -CEPHFS_EBADF
;
10772 unsigned mask
= statx_to_mask(flags
, want
);
10776 r
= _getattr(f
->inode
, mask
, perms
);
10778 ldout(cct
, 3) << "fstatx exit on error!" << dendl
;
10783 fill_statx(f
->inode
, mask
, stx
);
10784 ldout(cct
, 3) << "fstatx(" << fd
<< ", " << stx
<< ") = " << r
<< dendl
;
10788 int Client::statxat(int dirfd
, const char *relpath
,
10789 struct ceph_statx
*stx
, const UserPerm
& perms
,
10790 unsigned int want
, unsigned int flags
) {
10791 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10792 if (!mref_reader
.is_state_satisfied()) {
10793 return -CEPHFS_ENOTCONN
;
10796 tout(cct
) << __func__
<< " flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
10797 tout(cct
) << dirfd
<< std::endl
;
10798 tout(cct
) << relpath
<< std::endl
;
10800 unsigned mask
= statx_to_mask(flags
, want
);
10803 std::scoped_lock
lock(client_lock
);
10804 int r
= get_fd_inode(dirfd
, &dirinode
);
10810 filepath
path(relpath
);
10811 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
, dirinode
);
10815 r
= _getattr(in
, mask
, perms
);
10817 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
10821 fill_statx(in
, mask
, stx
);
10822 ldout(cct
, 3) << __func__
<< " dirfd" << dirfd
<< ", r= " << r
<< dendl
;
10826 // not written yet, but i want to link!
10828 int Client::chdir(const char *relpath
, std::string
&new_cwd
,
10829 const UserPerm
& perms
)
10831 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10832 if (!mref_reader
.is_state_satisfied())
10833 return -CEPHFS_ENOTCONN
;
10835 tout(cct
) << "chdir" << std::endl
;
10836 tout(cct
) << relpath
<< std::endl
;
10838 filepath
path(relpath
);
10841 std::scoped_lock
lock(client_lock
);
10842 int r
= path_walk(path
, &in
, perms
);
10846 if (!(in
.get()->is_dir()))
10847 return -CEPHFS_ENOTDIR
;
10851 ldout(cct
, 3) << "chdir(" << relpath
<< ") cwd now " << cwd
->ino
<< dendl
;
10853 _getcwd(new_cwd
, perms
);
10857 void Client::_getcwd(string
& dir
, const UserPerm
& perms
)
10860 ldout(cct
, 10) << __func__
<< " " << *cwd
<< dendl
;
10862 Inode
*in
= cwd
.get();
10863 while (in
!= root
.get()) {
10864 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
10866 // A cwd or ancester is unlinked
10867 if (in
->dentries
.empty()) {
10871 Dentry
*dn
= in
->get_first_parent();
10876 ldout(cct
, 10) << __func__
<< " looking up parent for " << *in
<< dendl
;
10877 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
10878 filepath
path(in
->ino
);
10879 req
->set_filepath(path
);
10880 req
->set_inode(in
);
10881 int res
= make_request(req
, perms
);
10890 path
.push_front_dentry(dn
->name
);
10891 in
= dn
->dir
->parent_inode
;
10894 dir
+= path
.get_path();
10897 void Client::getcwd(string
& dir
, const UserPerm
& perms
)
10899 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10900 if (!mref_reader
.is_state_satisfied())
10903 std::scoped_lock
l(client_lock
);
10905 _getcwd(dir
, perms
);
10908 int Client::statfs(const char *path
, struct statvfs
*stbuf
,
10909 const UserPerm
& perms
)
10911 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10912 if (!mref_reader
.is_state_satisfied())
10913 return -CEPHFS_ENOTCONN
;
10915 tout(cct
) << __func__
<< std::endl
;
10916 unsigned long int total_files_on_fs
;
10921 std::unique_lock
lock(client_lock
);
10922 const vector
<int64_t> &data_pools
= mdsmap
->get_data_pools();
10923 if (data_pools
.size() == 1) {
10924 objecter
->get_fs_stats(stats
, data_pools
[0], &cond
);
10926 objecter
->get_fs_stats(stats
, std::optional
<int64_t>(), &cond
);
10930 int rval
= cond
.wait();
10934 total_files_on_fs
= root
->rstat
.rfiles
+ root
->rstat
.rsubdirs
;
10937 ldout(cct
, 1) << "underlying call to statfs returned error: "
10938 << cpp_strerror(rval
)
10943 memset(stbuf
, 0, sizeof(*stbuf
));
10946 * we're going to set a block size of 4MB so we can represent larger
10947 * FSes without overflowing. Additionally convert the space
10948 * measurements from KB to bytes while making them in terms of
10949 * blocks. We use 4MB only because it is big enough, and because it
10950 * actually *is* the (ceph) default block size.
10952 const int CEPH_BLOCK_SHIFT
= 22;
10953 stbuf
->f_frsize
= 1 << CEPH_BLOCK_SHIFT
;
10954 stbuf
->f_bsize
= 1 << CEPH_BLOCK_SHIFT
;
10955 stbuf
->f_files
= total_files_on_fs
;
10956 stbuf
->f_ffree
= -1;
10957 stbuf
->f_favail
= -1;
10958 stbuf
->f_fsid
= -1; // ??
10959 stbuf
->f_flag
= 0; // ??
10960 stbuf
->f_namemax
= NAME_MAX
;
10962 // Usually quota_root will == root_ancestor, but if the mount root has no
10963 // quota but we can see a parent of it that does have a quota, we'll
10964 // respect that one instead.
10965 ceph_assert(root
!= nullptr);
10966 InodeRef quota_root
= root
->quota
.is_enable() ? root
: get_quota_root(root
.get(), perms
);
10968 // get_quota_root should always give us something
10969 // because client quotas are always enabled
10970 ceph_assert(quota_root
!= nullptr);
10972 if (quota_root
&& cct
->_conf
->client_quota_df
&& quota_root
->quota
.max_bytes
) {
10974 // Skip the getattr if any sessions are stale, as we don't want to
10975 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10977 if (!_any_stale_sessions()) {
10978 int r
= _getattr(quota_root
, 0, perms
, true);
10980 // Ignore return value: error getting latest inode metadata is not a good
10981 // reason to break "df".
10982 lderr(cct
) << "Error in getattr on quota root 0x"
10983 << std::hex
<< quota_root
->ino
<< std::dec
10984 << " statfs result may be outdated" << dendl
;
10988 // Special case: if there is a size quota set on the Inode acting
10989 // as the root for this client mount, then report the quota status
10990 // as the filesystem statistics.
10991 const fsblkcnt_t total
= quota_root
->quota
.max_bytes
>> CEPH_BLOCK_SHIFT
;
10992 const fsblkcnt_t used
= quota_root
->rstat
.rbytes
>> CEPH_BLOCK_SHIFT
;
10993 // It is possible for a quota to be exceeded: arithmetic here must
10994 // handle case where used > total.
10995 const fsblkcnt_t free
= total
> used
? total
- used
: 0;
10997 stbuf
->f_blocks
= total
;
10998 stbuf
->f_bfree
= free
;
10999 stbuf
->f_bavail
= free
;
11001 // General case: report the cluster statistics returned from RADOS. Because
11002 // multiple pools may be used without one filesystem namespace via
11003 // layouts, this is the most correct thing we can do.
11004 stbuf
->f_blocks
= stats
.kb
>> (CEPH_BLOCK_SHIFT
- 10);
11005 stbuf
->f_bfree
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
11006 stbuf
->f_bavail
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
11012 int Client::_do_filelock(Inode
*in
, Fh
*fh
, int lock_type
, int op
, int sleep
,
11013 struct flock
*fl
, uint64_t owner
, bool removing
)
11015 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
11016 << (lock_type
== CEPH_LOCK_FCNTL
? " fcntl" : " flock")
11017 << " type " << fl
->l_type
<< " owner " << owner
11018 << " " << fl
->l_start
<< "~" << fl
->l_len
<< dendl
;
11020 if (in
->flags
& I_ERROR_FILELOCK
)
11021 return -CEPHFS_EIO
;
11024 if (F_RDLCK
== fl
->l_type
)
11025 lock_cmd
= CEPH_LOCK_SHARED
;
11026 else if (F_WRLCK
== fl
->l_type
)
11027 lock_cmd
= CEPH_LOCK_EXCL
;
11028 else if (F_UNLCK
== fl
->l_type
)
11029 lock_cmd
= CEPH_LOCK_UNLOCK
;
11031 return -CEPHFS_EIO
;
11033 if (op
!= CEPH_MDS_OP_SETFILELOCK
|| lock_cmd
== CEPH_LOCK_UNLOCK
)
11037 * Set the most significant bit, so that MDS knows the 'owner'
11038 * is sufficient to identify the owner of lock. (old code uses
11039 * both 'owner' and 'pid')
11041 owner
|= (1ULL << 63);
11043 MetaRequest
*req
= new MetaRequest(op
);
11045 in
->make_nosnap_relative_path(path
);
11046 req
->set_filepath(path
);
11047 req
->set_inode(in
);
11049 req
->head
.args
.filelock_change
.rule
= lock_type
;
11050 req
->head
.args
.filelock_change
.type
= lock_cmd
;
11051 req
->head
.args
.filelock_change
.owner
= owner
;
11052 req
->head
.args
.filelock_change
.pid
= fl
->l_pid
;
11053 req
->head
.args
.filelock_change
.start
= fl
->l_start
;
11054 req
->head
.args
.filelock_change
.length
= fl
->l_len
;
11055 req
->head
.args
.filelock_change
.wait
= sleep
;
11060 if (sleep
&& switch_interrupt_cb
) {
11061 // enable interrupt
11062 switch_interrupt_cb(callback_handle
, req
->get());
11063 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
11064 // disable interrupt
11065 switch_interrupt_cb(callback_handle
, NULL
);
11066 if (ret
== 0 && req
->aborted()) {
11067 // effect of this lock request has been revoked by the 'lock intr' request
11068 ret
= req
->get_abort_code();
11072 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
11076 if (op
== CEPH_MDS_OP_GETFILELOCK
) {
11077 ceph_filelock filelock
;
11078 auto p
= bl
.cbegin();
11079 decode(filelock
, p
);
11081 if (CEPH_LOCK_SHARED
== filelock
.type
)
11082 fl
->l_type
= F_RDLCK
;
11083 else if (CEPH_LOCK_EXCL
== filelock
.type
)
11084 fl
->l_type
= F_WRLCK
;
11086 fl
->l_type
= F_UNLCK
;
11088 fl
->l_whence
= SEEK_SET
;
11089 fl
->l_start
= filelock
.start
;
11090 fl
->l_len
= filelock
.length
;
11091 fl
->l_pid
= filelock
.pid
;
11092 } else if (op
== CEPH_MDS_OP_SETFILELOCK
) {
11093 ceph_lock_state_t
*lock_state
;
11094 if (lock_type
== CEPH_LOCK_FCNTL
) {
11095 if (!in
->fcntl_locks
)
11096 in
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
11097 lock_state
= in
->fcntl_locks
.get();
11098 } else if (lock_type
== CEPH_LOCK_FLOCK
) {
11099 if (!in
->flock_locks
)
11100 in
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
11101 lock_state
= in
->flock_locks
.get();
11104 return -CEPHFS_EINVAL
;
11106 _update_lock_state(fl
, owner
, lock_state
);
11109 if (lock_type
== CEPH_LOCK_FCNTL
) {
11110 if (!fh
->fcntl_locks
)
11111 fh
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
11112 lock_state
= fh
->fcntl_locks
.get();
11114 if (!fh
->flock_locks
)
11115 fh
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
11116 lock_state
= fh
->flock_locks
.get();
11118 _update_lock_state(fl
, owner
, lock_state
);
11126 int Client::_interrupt_filelock(MetaRequest
*req
)
11128 // Set abort code, but do not kick. The abort code prevents the request
11129 // from being re-sent.
11130 req
->abort(-CEPHFS_EINTR
);
11132 return 0; // haven't sent the request
11134 Inode
*in
= req
->inode();
11137 if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
11138 lock_type
= CEPH_LOCK_FLOCK_INTR
;
11139 else if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
11140 lock_type
= CEPH_LOCK_FCNTL_INTR
;
11143 return -CEPHFS_EINVAL
;
11146 MetaRequest
*intr_req
= new MetaRequest(CEPH_MDS_OP_SETFILELOCK
);
11148 in
->make_nosnap_relative_path(path
);
11149 intr_req
->set_filepath(path
);
11150 intr_req
->set_inode(in
);
11151 intr_req
->head
.args
.filelock_change
= req
->head
.args
.filelock_change
;
11152 intr_req
->head
.args
.filelock_change
.rule
= lock_type
;
11153 intr_req
->head
.args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
11155 UserPerm
perms(req
->get_uid(), req
->get_gid());
11156 return make_request(intr_req
, perms
, NULL
, NULL
, -1);
11159 void Client::_encode_filelocks(Inode
*in
, bufferlist
& bl
)
11161 if (!in
->fcntl_locks
&& !in
->flock_locks
)
11164 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
11165 encode(nr_fcntl_locks
, bl
);
11166 if (nr_fcntl_locks
) {
11167 auto &lock_state
= in
->fcntl_locks
;
11168 for(auto p
= lock_state
->held_locks
.begin();
11169 p
!= lock_state
->held_locks
.end();
11171 encode(p
->second
, bl
);
11174 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
11175 encode(nr_flock_locks
, bl
);
11176 if (nr_flock_locks
) {
11177 auto &lock_state
= in
->flock_locks
;
11178 for(auto p
= lock_state
->held_locks
.begin();
11179 p
!= lock_state
->held_locks
.end();
11181 encode(p
->second
, bl
);
11184 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< ", " << nr_fcntl_locks
11185 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
11188 void Client::_release_filelocks(Fh
*fh
)
11190 if (!fh
->fcntl_locks
&& !fh
->flock_locks
)
11193 Inode
*in
= fh
->inode
.get();
11194 ldout(cct
, 10) << __func__
<< " " << fh
<< " ino " << in
->ino
<< dendl
;
11196 list
<ceph_filelock
> activated_locks
;
11198 list
<pair
<int, ceph_filelock
> > to_release
;
11200 if (fh
->fcntl_locks
) {
11201 auto &lock_state
= fh
->fcntl_locks
;
11202 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
11204 if (in
->flags
& I_ERROR_FILELOCK
) {
11205 lock_state
->remove_lock(q
->second
, activated_locks
);
11207 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FCNTL
, q
->second
));
11210 lock_state
.reset();
11212 if (fh
->flock_locks
) {
11213 auto &lock_state
= fh
->flock_locks
;
11214 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
11216 if (in
->flags
& I_ERROR_FILELOCK
) {
11217 lock_state
->remove_lock(q
->second
, activated_locks
);
11219 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FLOCK
, q
->second
));
11222 lock_state
.reset();
11225 if ((in
->flags
& I_ERROR_FILELOCK
) && !in
->has_any_filelocks())
11226 in
->flags
&= ~I_ERROR_FILELOCK
;
11228 if (to_release
.empty())
11232 memset(&fl
, 0, sizeof(fl
));
11233 fl
.l_whence
= SEEK_SET
;
11234 fl
.l_type
= F_UNLCK
;
11236 for (list
<pair
<int, ceph_filelock
> >::iterator p
= to_release
.begin();
11237 p
!= to_release
.end();
11239 fl
.l_start
= p
->second
.start
;
11240 fl
.l_len
= p
->second
.length
;
11241 fl
.l_pid
= p
->second
.pid
;
11242 _do_filelock(in
, fh
, p
->first
, CEPH_MDS_OP_SETFILELOCK
, 0, &fl
,
11243 p
->second
.owner
, true);
11247 void Client::_update_lock_state(struct flock
*fl
, uint64_t owner
,
11248 ceph_lock_state_t
*lock_state
)
11251 if (F_RDLCK
== fl
->l_type
)
11252 lock_cmd
= CEPH_LOCK_SHARED
;
11253 else if (F_WRLCK
== fl
->l_type
)
11254 lock_cmd
= CEPH_LOCK_EXCL
;
11256 lock_cmd
= CEPH_LOCK_UNLOCK
;;
11258 ceph_filelock filelock
;
11259 filelock
.start
= fl
->l_start
;
11260 filelock
.length
= fl
->l_len
;
11261 filelock
.client
= 0;
11262 // see comment in _do_filelock()
11263 filelock
.owner
= owner
| (1ULL << 63);
11264 filelock
.pid
= fl
->l_pid
;
11265 filelock
.type
= lock_cmd
;
11267 if (filelock
.type
== CEPH_LOCK_UNLOCK
) {
11268 list
<ceph_filelock
> activated_locks
;
11269 lock_state
->remove_lock(filelock
, activated_locks
);
11271 bool r
= lock_state
->add_lock(filelock
, false, false, NULL
);
11276 int Client::_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
11278 Inode
*in
= fh
->inode
.get();
11279 ldout(cct
, 10) << "_getlk " << fh
<< " ino " << in
->ino
<< dendl
;
11280 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_GETFILELOCK
, 0, fl
, owner
);
11284 int Client::_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
11286 Inode
*in
= fh
->inode
.get();
11287 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< dendl
;
11288 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_SETFILELOCK
, sleep
, fl
, owner
);
11289 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
11293 int Client::_flock(Fh
*fh
, int cmd
, uint64_t owner
)
11295 Inode
*in
= fh
->inode
.get();
11296 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< dendl
;
11298 int sleep
= !(cmd
& LOCK_NB
);
11313 return -CEPHFS_EINVAL
;
11317 memset(&fl
, 0, sizeof(fl
));
11319 fl
.l_whence
= SEEK_SET
;
11321 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
, sleep
, &fl
, owner
);
11322 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
11326 int Client::get_snap_info(const char *path
, const UserPerm
&perms
, SnapInfo
*snap_info
) {
11327 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11328 if (!mref_reader
.is_state_satisfied()) {
11329 return -CEPHFS_ENOTCONN
;
11332 std::scoped_lock
lock(client_lock
);
11334 int r
= Client::path_walk(path
, &in
, perms
, true);
11339 if (in
->snapid
== CEPH_NOSNAP
) {
11340 return -CEPHFS_EINVAL
;
11343 snap_info
->id
= in
->snapid
;
11344 snap_info
->metadata
= in
->snap_metadata
;
11348 int Client::ll_statfs(Inode
*in
, struct statvfs
*stbuf
, const UserPerm
& perms
)
11350 /* Since the only thing this does is wrap a call to statfs, and
11351 statfs takes a lock, it doesn't seem we have a need to split it
11353 return statfs(0, stbuf
, perms
);
11356 void Client::_ll_register_callbacks(struct ceph_client_callback_args
*args
)
11361 ldout(cct
, 10) << __func__
<< " cb " << args
->handle
11362 << " invalidate_ino_cb " << args
->ino_cb
11363 << " invalidate_dentry_cb " << args
->dentry_cb
11364 << " switch_interrupt_cb " << args
->switch_intr_cb
11365 << " remount_cb " << args
->remount_cb
11367 callback_handle
= args
->handle
;
11368 if (args
->ino_cb
) {
11369 ino_invalidate_cb
= args
->ino_cb
;
11370 async_ino_invalidator
.start();
11372 if (args
->dentry_cb
) {
11373 dentry_invalidate_cb
= args
->dentry_cb
;
11374 async_dentry_invalidator
.start();
11376 if (args
->switch_intr_cb
) {
11377 switch_interrupt_cb
= args
->switch_intr_cb
;
11378 interrupt_finisher
.start();
11380 if (args
->remount_cb
) {
11381 remount_cb
= args
->remount_cb
;
11382 remount_finisher
.start();
11384 if (args
->ino_release_cb
) {
11385 ino_release_cb
= args
->ino_release_cb
;
11386 async_ino_releasor
.start();
11388 if (args
->umask_cb
)
11389 umask_cb
= args
->umask_cb
;
11392 // This is deprecated, use ll_register_callbacks2() instead.
11393 void Client::ll_register_callbacks(struct ceph_client_callback_args
*args
)
11395 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11397 _ll_register_callbacks(args
);
11400 int Client::ll_register_callbacks2(struct ceph_client_callback_args
*args
)
11402 if (is_mounting() || is_mounted() || is_unmounting())
11403 return -CEPHFS_EBUSY
;
11405 _ll_register_callbacks(args
);
11409 std::pair
<int, bool> Client::test_dentry_handling(bool can_invalidate
)
11411 std::pair
<int, bool> r(0, false);
11413 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
11414 if (!iref_reader
.is_state_satisfied())
11415 return std::make_pair(-CEPHFS_ENOTCONN
, false);
11417 can_invalidate_dentries
= can_invalidate
;
11419 if (can_invalidate_dentries
) {
11420 ceph_assert(dentry_invalidate_cb
);
11421 ldout(cct
, 1) << "using dentry_invalidate_cb" << dendl
;
11423 ceph_assert(remount_cb
);
11424 ldout(cct
, 1) << "using remount_cb" << dendl
;
11425 r
= _do_remount(false);
11431 int Client::_sync_fs()
11433 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
11435 ldout(cct
, 10) << __func__
<< dendl
;
11438 std::unique_ptr
<C_SaferCond
> cond
= nullptr;
11439 if (cct
->_conf
->client_oc
) {
11440 cond
.reset(new C_SaferCond("Client::_sync_fs:lock"));
11441 objectcacher
->flush_all(cond
.get());
11446 ceph_tid_t flush_tid
= last_flush_tid
;
11448 // wait for unsafe mds requests
11449 wait_unsafe_requests();
11451 wait_sync_caps(flush_tid
);
11453 if (nullptr != cond
) {
11454 client_lock
.unlock();
11455 ldout(cct
, 15) << __func__
<< " waiting on data to flush" << dendl
;
11457 ldout(cct
, 15) << __func__
<< " flush finished" << dendl
;
11458 client_lock
.lock();
11464 int Client::sync_fs()
11466 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11467 if (!mref_reader
.is_state_satisfied())
11468 return -CEPHFS_ENOTCONN
;
11470 std::scoped_lock
l(client_lock
);
11475 int64_t Client::drop_caches()
11477 std::scoped_lock
l(client_lock
);
11478 return objectcacher
->release_all();
11481 int Client::_lazyio(Fh
*fh
, int enable
)
11483 Inode
*in
= fh
->inode
.get();
11484 ldout(cct
, 20) << __func__
<< " " << *in
<< " " << !!enable
<< dendl
;
11486 if (!!(fh
->mode
& CEPH_FILE_MODE_LAZY
) == !!enable
)
11489 int orig_mode
= fh
->mode
;
11491 fh
->mode
|= CEPH_FILE_MODE_LAZY
;
11492 in
->get_open_ref(fh
->mode
);
11493 in
->put_open_ref(orig_mode
);
11494 check_caps(in
, CHECK_CAPS_NODELAY
);
11496 fh
->mode
&= ~CEPH_FILE_MODE_LAZY
;
11497 in
->get_open_ref(fh
->mode
);
11498 in
->put_open_ref(orig_mode
);
11505 int Client::lazyio(int fd
, int enable
)
11507 std::scoped_lock
l(client_lock
);
11508 Fh
*f
= get_filehandle(fd
);
11510 return -CEPHFS_EBADF
;
11512 return _lazyio(f
, enable
);
11515 int Client::ll_lazyio(Fh
*fh
, int enable
)
11517 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << !!enable
<< dendl
;
11518 tout(cct
) << __func__
<< std::endl
;
11520 std::scoped_lock
lock(client_lock
);
11521 return _lazyio(fh
, enable
);
11524 int Client::lazyio_propagate(int fd
, loff_t offset
, size_t count
)
11526 std::scoped_lock
l(client_lock
);
11527 ldout(cct
, 3) << "op: client->lazyio_propagate(" << fd
11528 << ", " << offset
<< ", " << count
<< ")" << dendl
;
11530 Fh
*f
= get_filehandle(fd
);
11532 return -CEPHFS_EBADF
;
11540 int Client::lazyio_synchronize(int fd
, loff_t offset
, size_t count
)
11542 std::scoped_lock
l(client_lock
);
11543 ldout(cct
, 3) << "op: client->lazyio_synchronize(" << fd
11544 << ", " << offset
<< ", " << count
<< ")" << dendl
;
11546 Fh
*f
= get_filehandle(fd
);
11548 return -CEPHFS_EBADF
;
11549 Inode
*in
= f
->inode
.get();
11552 if (_release(in
)) {
11553 int r
=_getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
11561 // =============================
11564 int Client::mksnap(const char *relpath
, const char *name
, const UserPerm
& perm
,
11565 mode_t mode
, const std::map
<std::string
, std::string
> &metadata
)
11567 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11568 if (!mref_reader
.is_state_satisfied())
11569 return -CEPHFS_ENOTCONN
;
11571 std::scoped_lock
l(client_lock
);
11573 filepath
path(relpath
);
11575 int r
= path_walk(path
, &in
, perm
);
11578 if (cct
->_conf
->client_permissions
) {
11579 r
= may_create(in
.get(), perm
);
11583 Inode
*snapdir
= open_snapdir(in
.get());
11584 return _mkdir(snapdir
, name
, mode
, perm
, nullptr, metadata
);
11587 int Client::rmsnap(const char *relpath
, const char *name
, const UserPerm
& perms
, bool check_perms
)
11589 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11590 if (!mref_reader
.is_state_satisfied())
11591 return -CEPHFS_ENOTCONN
;
11593 std::scoped_lock
l(client_lock
);
11595 filepath
path(relpath
);
11597 int r
= path_walk(path
, &in
, perms
);
11600 Inode
*snapdir
= open_snapdir(in
.get());
11601 if (cct
->_conf
->client_permissions
) {
11602 r
= may_delete(snapdir
, check_perms
? name
: NULL
, perms
);
11606 return _rmdir(snapdir
, name
, perms
);
11609 // =============================
11612 int Client::get_caps_issued(int fd
)
11614 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11615 if (!mref_reader
.is_state_satisfied())
11616 return -CEPHFS_ENOTCONN
;
11618 std::scoped_lock
lock(client_lock
);
11620 Fh
*f
= get_filehandle(fd
);
11622 return -CEPHFS_EBADF
;
11624 return f
->inode
->caps_issued();
11627 int Client::get_caps_issued(const char *path
, const UserPerm
& perms
)
11629 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11630 if (!mref_reader
.is_state_satisfied())
11631 return -CEPHFS_ENOTCONN
;
11633 std::scoped_lock
lock(client_lock
);
11637 int r
= path_walk(p
, &in
, perms
, true);
11640 return in
->caps_issued();
11643 // =========================================
11646 Inode
*Client::open_snapdir(Inode
*diri
)
11649 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
11650 if (!inode_map
.count(vino
)) {
11651 in
= new Inode(this, vino
, &diri
->layout
);
11653 in
->ino
= diri
->ino
;
11654 in
->snapid
= CEPH_SNAPDIR
;
11655 in
->mode
= diri
->mode
;
11656 in
->uid
= diri
->uid
;
11657 in
->gid
= diri
->gid
;
11659 in
->mtime
= diri
->mtime
;
11660 in
->ctime
= diri
->ctime
;
11661 in
->btime
= diri
->btime
;
11662 in
->atime
= diri
->atime
;
11663 in
->size
= diri
->size
;
11664 in
->change_attr
= diri
->change_attr
;
11666 in
->dirfragtree
.clear();
11667 in
->snapdir_parent
= diri
;
11668 diri
->flags
|= I_SNAPDIR_OPEN
;
11669 inode_map
[vino
] = in
;
11670 if (use_faked_inos())
11671 _assign_faked_ino(in
);
11672 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
11674 in
= inode_map
[vino
];
11675 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
11680 int Client::ll_lookup(Inode
*parent
, const char *name
, struct stat
*attr
,
11681 Inode
**out
, const UserPerm
& perms
)
11683 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11684 if (!mref_reader
.is_state_satisfied())
11685 return -CEPHFS_ENOTCONN
;
11687 vinodeno_t vparent
= _get_vino(parent
);
11688 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
11689 tout(cct
) << __func__
<< std::endl
;
11690 tout(cct
) << name
<< std::endl
;
11692 std::scoped_lock
lock(client_lock
);
11695 if (!fuse_default_permissions
) {
11696 if (strcmp(name
, ".") && strcmp(name
, "..")) {
11697 r
= may_lookup(parent
, perms
);
11703 string
dname(name
);
11706 r
= _lookup(parent
, dname
, CEPH_STAT_CAP_INODE_ALL
, &in
, perms
);
11713 fill_stat(in
, attr
);
11717 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
11718 << " -> " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
11719 tout(cct
) << attr
->st_ino
<< std::endl
;
11724 int Client::ll_lookup_vino(
11726 const UserPerm
& perms
,
11729 ceph_assert(inode
!= NULL
);
11730 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11731 if (!mref_reader
.is_state_satisfied())
11732 return -CEPHFS_ENOTCONN
;
11734 if (is_reserved_vino(vino
))
11735 return -CEPHFS_ESTALE
;
11737 std::scoped_lock
lock(client_lock
);
11738 ldout(cct
, 3) << __func__
<< " " << vino
<< dendl
;
11740 // Check the cache first
11741 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11742 if (p
!= inode_map
.end()) {
11743 *inode
= p
->second
;
11748 uint64_t snapid
= vino
.snapid
;
11750 // for snapdir, find the non-snapped dir inode
11751 if (snapid
== CEPH_SNAPDIR
)
11752 vino
.snapid
= CEPH_NOSNAP
;
11754 int r
= _lookup_vino(vino
, perms
, inode
);
11757 ceph_assert(*inode
!= NULL
);
11759 if (snapid
== CEPH_SNAPDIR
) {
11760 Inode
*tmp
= *inode
;
11762 // open the snapdir and put the inode ref
11763 *inode
= open_snapdir(tmp
);
11764 _ll_forget(tmp
, 1);
11770 int Client::ll_lookup_inode(
11771 struct inodeno_t ino
,
11772 const UserPerm
& perms
,
11775 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
11776 return ll_lookup_vino(vino
, perms
, inode
);
11779 int Client::ll_lookupx(Inode
*parent
, const char *name
, Inode
**out
,
11780 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
11781 const UserPerm
& perms
)
11783 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11784 if (!mref_reader
.is_state_satisfied())
11785 return -CEPHFS_ENOTCONN
;
11787 vinodeno_t vparent
= _get_vino(parent
);
11788 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
11789 tout(cct
) << "ll_lookupx" << std::endl
;
11790 tout(cct
) << name
<< std::endl
;
11792 std::scoped_lock
lock(client_lock
);
11795 if (!fuse_default_permissions
) {
11796 r
= may_lookup(parent
, perms
);
11801 string
dname(name
);
11804 unsigned mask
= statx_to_mask(flags
, want
);
11805 r
= _lookup(parent
, dname
, mask
, &in
, perms
);
11811 fill_statx(in
, mask
, stx
);
11815 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
11816 << " -> " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
11817 tout(cct
) << stx
->stx_ino
<< std::endl
;
11822 int Client::ll_walk(const char* name
, Inode
**out
, struct ceph_statx
*stx
,
11823 unsigned int want
, unsigned int flags
, const UserPerm
& perms
)
11825 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11826 if (!mref_reader
.is_state_satisfied())
11827 return -CEPHFS_ENOTCONN
;
11829 filepath
fp(name
, 0);
11832 unsigned mask
= statx_to_mask(flags
, want
);
11834 ldout(cct
, 3) << __func__
<< " " << name
<< dendl
;
11835 tout(cct
) << __func__
<< std::endl
;
11836 tout(cct
) << name
<< std::endl
;
11838 std::scoped_lock
lock(client_lock
);
11839 rc
= path_walk(fp
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
11841 /* zero out mask, just in case... */
11848 fill_statx(in
, mask
, stx
);
11855 void Client::_ll_get(Inode
*in
)
11857 if (in
->ll_ref
== 0) {
11859 if (in
->is_dir() && !in
->dentries
.empty()) {
11860 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
11861 in
->get_first_parent()->get(); // pin dentry
11863 if (in
->snapid
!= CEPH_NOSNAP
)
11864 ll_snap_ref
[in
->snapid
]++;
11867 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " -> " << in
->ll_ref
<< dendl
;
11870 int Client::_ll_put(Inode
*in
, uint64_t num
)
11873 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " " << num
<< " -> " << in
->ll_ref
<< dendl
;
11874 if (in
->ll_ref
== 0) {
11875 if (in
->is_dir() && !in
->dentries
.empty()) {
11876 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
11877 in
->get_first_parent()->put(); // unpin dentry
11879 if (in
->snapid
!= CEPH_NOSNAP
) {
11880 auto p
= ll_snap_ref
.find(in
->snapid
);
11881 ceph_assert(p
!= ll_snap_ref
.end());
11882 ceph_assert(p
->second
> 0);
11883 if (--p
->second
== 0)
11884 ll_snap_ref
.erase(p
);
11893 void Client::_ll_drop_pins()
11895 ldout(cct
, 10) << __func__
<< dendl
;
11896 std::set
<InodeRef
> to_be_put
; //this set will be deconstructed item by item when exit
11897 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
11898 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
11899 it
!= inode_map
.end();
11901 Inode
*in
= it
->second
;
11905 to_be_put
.insert(in
);
11906 _ll_put(in
, in
->ll_ref
);
// _ll_forget: drop `count` ll references in response to a FUSE "forget".
// Returns true when the inode is gone from the cache (caller must not touch
// it again).  Forgets are ignored when unmounted and on the root inode.
// NOTE(review): lossy extraction; some return/brace lines are missing.
11911 bool Client::_ll_forget(Inode
*in
, uint64_t count
)
11913 inodeno_t ino
= in
->ino
;
11915 ldout(cct
, 8) << __func__
<< " " << ino
<< " " << count
<< dendl
;
11916 tout(cct
) << __func__
<< std::endl
;
11917 tout(cct
) << ino
.val
<< std::endl
;
11918 tout(cct
) << count
<< std::endl
;
11920 // Ignore forget if we're no longer mounted
11921 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11922 if (!mref_reader
.is_state_satisfied())
11925 if (ino
== 1) return true; // ignore forget on root.
// Defensive: a forget for more refs than we hold indicates a kernel/client
// accounting mismatch; warn and drop everything we do hold.
11928 if (in
->ll_ref
< count
) {
11929 ldout(cct
, 1) << "WARNING: ll_forget on " << ino
<< " " << count
11930 << ", which only has ll_ref=" << in
->ll_ref
<< dendl
;
11931 _ll_put(in
, in
->ll_ref
);
11934 if (_ll_put(in
, count
) == 0)
11941 bool Client::ll_forget(Inode
*in
, uint64_t count
)
11943 std::scoped_lock
lock(client_lock
);
11944 return _ll_forget(in
, count
);
11947 bool Client::ll_put(Inode
*in
)
11949 /* ll_forget already takes the lock */
11950 return ll_forget(in
, 1);
// ll_get_snap_ref: report how many ll handles currently reference `snap`.
// NOTE(review): lossy extraction; the return statements are missing here.
11953 int Client::ll_get_snap_ref(snapid_t snap
)
11955 std::scoped_lock
lock(client_lock
);
11956 auto p
= ll_snap_ref
.find(snap
);
11957 if (p
!= ll_snap_ref
.end())
// ll_get_snapid: return the snapshot id of an inode (under client_lock).
// NOTE(review): lossy extraction; the return statement is missing here.
11962 snapid_t
Client::ll_get_snapid(Inode
*in
)
11964 std::scoped_lock
lock(client_lock
);
// ll_get_inode(ino_t): look up a cached inode by (possibly faked) inode
// number and hand out a new ll reference.  Requires at least MOUNTING state.
// NOTE(review): lossy extraction; NULL/ref-taking return paths are missing.
11968 Inode
*Client::ll_get_inode(ino_t ino
)
11970 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11971 if (!mref_reader
.is_state_satisfied())
11974 std::scoped_lock
lock(client_lock
);
// Translate the externally visible ino back to the internal vinodeno_t.
11976 vinodeno_t vino
= _map_faked_ino(ino
);
11977 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11978 if (p
== inode_map
.end())
11980 Inode
*in
= p
->second
;
// ll_get_inode(vinodeno_t): as above but keyed directly by (ino, snapid);
// reserved vinos are rejected.  NOTE(review): lossy extraction; NULL and
// ref-taking return paths are missing here.
11985 Inode
*Client::ll_get_inode(vinodeno_t vino
)
11987 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11988 if (!mref_reader
.is_state_satisfied())
11991 if (is_reserved_vino(vino
))
11994 std::scoped_lock
lock(client_lock
);
11996 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11997 if (p
== inode_map
.end())
11999 Inode
*in
= p
->second
;
// _ll_getattr: internal getattr; snapshot inodes are immutable so a snapid
// below CEPH_NOSNAP short-circuits (original body for that branch missing
// from this lossy extraction) before falling through to _getattr().
12004 int Client::_ll_getattr(Inode
*in
, int caps
, const UserPerm
& perms
)
12006 vinodeno_t vino
= _get_vino(in
);
12008 ldout(cct
, 8) << __func__
<< " " << vino
<< dendl
;
12009 tout(cct
) << __func__
<< std::endl
;
12010 tout(cct
) << vino
.ino
.val
<< std::endl
;
12012 if (vino
.snapid
< CEPH_NOSNAP
)
12015 return _getattr(in
, caps
, perms
);
// ll_getattr: public stat; fetch all inode caps then fill `attr`.
// NOTE(review): lossy extraction; success check before fill_stat and the
// final return are missing here.
12018 int Client::ll_getattr(Inode
*in
, struct stat
*attr
, const UserPerm
& perms
)
12020 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12021 if (!mref_reader
.is_state_satisfied())
12022 return -CEPHFS_ENOTCONN
;
12024 std::scoped_lock
lock(client_lock
);
12026 int res
= _ll_getattr(in
, CEPH_STAT_CAP_INODE_ALL
, perms
);
12029 fill_stat(in
, attr
);
12030 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
// ll_getattrx: statx variant; only round-trips to the MDS when the requested
// mask is not already covered by issued caps.
// NOTE(review): lossy extraction; res initialization and return are missing.
12034 int Client::ll_getattrx(Inode
*in
, struct ceph_statx
*stx
, unsigned int want
,
12035 unsigned int flags
, const UserPerm
& perms
)
12037 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12038 if (!mref_reader
.is_state_satisfied())
12039 return -CEPHFS_ENOTCONN
;
12041 std::scoped_lock
lock(client_lock
);
12044 unsigned mask
= statx_to_mask(flags
, want
);
// Only issue a getattr if caps don't already satisfy the mask.
12046 if (mask
&& !in
->caps_issued_mask(mask
, true))
12047 res
= _ll_getattr(in
, mask
, perms
);
12050 fill_statx(in
, mask
, stx
);
12051 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
// _ll_setattrx: internal setattr from a ceph_statx; performs the permission
// check (unless FUSE handles permissions), strips the *_NOW time flags, then
// delegates to __setattrx().  NOTE(review): lossy extraction; early-return
// lines are missing here.
12055 int Client::_ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
12056 const UserPerm
& perms
, InodeRef
*inp
)
12058 vinodeno_t vino
= _get_vino(in
);
12060 ldout(cct
, 8) << __func__
<< " " << vino
<< " mask " << hex
<< mask
<< dec
12062 tout(cct
) << __func__
<< std::endl
;
12063 tout(cct
) << vino
.ino
.val
<< std::endl
;
12064 tout(cct
) << stx
->stx_mode
<< std::endl
;
12065 tout(cct
) << stx
->stx_uid
<< std::endl
;
12066 tout(cct
) << stx
->stx_gid
<< std::endl
;
12067 tout(cct
) << stx
->stx_size
<< std::endl
;
12068 tout(cct
) << stx
->stx_mtime
<< std::endl
;
12069 tout(cct
) << stx
->stx_atime
<< std::endl
;
12070 tout(cct
) << stx
->stx_btime
<< std::endl
;
12071 tout(cct
) << mask
<< std::endl
;
12073 if (!fuse_default_permissions
) {
12074 int res
= may_setattr(in
, stx
, mask
, perms
);
// The "set to now" flags are resolved locally; don't send them to the MDS.
12079 mask
&= ~(CEPH_SETATTR_MTIME_NOW
| CEPH_SETATTR_ATIME_NOW
);
12081 return __setattrx(in
, stx
, mask
, perms
, inp
);
// ll_setattrx: public statx-based setattr; asserts the operation resolved to
// the same inode it started with.  NOTE(review): lossy extraction; success
// check and return are missing here.
12084 int Client::ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
12085 const UserPerm
& perms
)
12087 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12088 if (!mref_reader
.is_state_satisfied())
12089 return -CEPHFS_ENOTCONN
;
12091 std::scoped_lock
lock(client_lock
);
12093 InodeRef
target(in
);
12094 int res
= _ll_setattrx(in
, stx
, mask
, perms
, &target
);
12096 ceph_assert(in
== target
.get());
12097 fill_statx(in
, in
->caps_issued(), stx
);
12100 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
// ll_setattr: legacy struct-stat setattr; converts to ceph_statx and reuses
// the statx path.  NOTE(review): lossy extraction; success check and return
// are missing here.
12104 int Client::ll_setattr(Inode
*in
, struct stat
*attr
, int mask
,
12105 const UserPerm
& perms
)
12107 struct ceph_statx stx
;
12108 stat_to_statx(attr
, &stx
);
12110 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12111 if (!mref_reader
.is_state_satisfied())
12112 return -CEPHFS_ENOTCONN
;
12114 std::scoped_lock
lock(client_lock
);
12116 InodeRef
target(in
);
12117 int res
= _ll_setattrx(in
, &stx
, mask
, perms
, &target
);
12119 ceph_assert(in
== target
.get());
12120 fill_stat(in
, attr
);
12123 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
// getxattr: path-based getxattr, following symlinks (path_walk follow=true).
// NOTE(review): lossy extraction; InodeRef declaration and the path_walk
// error return are missing here.
12131 int Client::getxattr(const char *path
, const char *name
, void *value
, size_t size
,
12132 const UserPerm
& perms
)
12134 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12135 if (!mref_reader
.is_state_satisfied())
12136 return -CEPHFS_ENOTCONN
;
12138 std::scoped_lock
lock(client_lock
);
12141 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
12144 return _getxattr(in
, name
, value
, size
, perms
);
// lgetxattr: like getxattr but does NOT follow a trailing symlink
// (path_walk follow=false).  NOTE(review): lossy extraction; InodeRef
// declaration and error return are missing here.
12147 int Client::lgetxattr(const char *path
, const char *name
, void *value
, size_t size
,
12148 const UserPerm
& perms
)
12150 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12151 if (!mref_reader
.is_state_satisfied())
12152 return -CEPHFS_ENOTCONN
;
12154 std::scoped_lock
lock(client_lock
);
12157 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
12160 return _getxattr(in
, name
, value
, size
, perms
);
// fgetxattr: fd-based getxattr; EBADF when the fd is not an open handle.
// NOTE(review): lossy extraction; the null-check line before EBADF is missing.
12163 int Client::fgetxattr(int fd
, const char *name
, void *value
, size_t size
,
12164 const UserPerm
& perms
)
12166 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12167 if (!mref_reader
.is_state_satisfied())
12168 return -CEPHFS_ENOTCONN
;
12170 std::scoped_lock
lock(client_lock
);
12172 Fh
*f
= get_filehandle(fd
);
12174 return -CEPHFS_EBADF
;
12175 return _getxattr(f
->inode
, name
, value
, size
, perms
);
// listxattr: path-based xattr enumeration, following symlinks.
// NOTE(review): lossy extraction; InodeRef declaration and error return
// are missing here.
12178 int Client::listxattr(const char *path
, char *list
, size_t size
,
12179 const UserPerm
& perms
)
12181 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12182 if (!mref_reader
.is_state_satisfied())
12183 return -CEPHFS_ENOTCONN
;
12185 std::scoped_lock
lock(client_lock
);
12188 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
12191 return Client::_listxattr(in
.get(), list
, size
, perms
);
// llistxattr: like listxattr but does not follow a trailing symlink.
// NOTE(review): lossy extraction; InodeRef declaration and error return
// are missing here.
12194 int Client::llistxattr(const char *path
, char *list
, size_t size
,
12195 const UserPerm
& perms
)
12197 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12198 if (!mref_reader
.is_state_satisfied())
12199 return -CEPHFS_ENOTCONN
;
12201 std::scoped_lock
lock(client_lock
);
12204 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
12207 return Client::_listxattr(in
.get(), list
, size
, perms
);
// flistxattr: fd-based xattr enumeration; EBADF on a bad handle.
// NOTE(review): lossy extraction; the null-check line is missing here.
12210 int Client::flistxattr(int fd
, char *list
, size_t size
, const UserPerm
& perms
)
12212 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12213 if (!mref_reader
.is_state_satisfied())
12214 return -CEPHFS_ENOTCONN
;
12216 std::scoped_lock
lock(client_lock
);
12218 Fh
*f
= get_filehandle(fd
);
12220 return -CEPHFS_EBADF
;
12221 return Client::_listxattr(f
->inode
.get(), list
, size
, perms
);
// removexattr: path-based removal, following symlinks.
// NOTE(review): lossy extraction; InodeRef declaration and error return
// are missing here.
12224 int Client::removexattr(const char *path
, const char *name
,
12225 const UserPerm
& perms
)
12227 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12228 if (!mref_reader
.is_state_satisfied())
12229 return -CEPHFS_ENOTCONN
;
12231 std::scoped_lock
lock(client_lock
);
12234 int r
= Client::path_walk(path
, &in
, perms
, true);
12237 return _removexattr(in
, name
, perms
);
// lremovexattr: removal without following a trailing symlink.
// NOTE(review): lossy extraction; InodeRef declaration and error return
// are missing here.
12240 int Client::lremovexattr(const char *path
, const char *name
,
12241 const UserPerm
& perms
)
12243 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12244 if (!mref_reader
.is_state_satisfied())
12245 return -CEPHFS_ENOTCONN
;
12247 std::scoped_lock
lock(client_lock
);
12250 int r
= Client::path_walk(path
, &in
, perms
, false);
12253 return _removexattr(in
, name
, perms
);
// fremovexattr: fd-based removal; EBADF on a bad handle.
// NOTE(review): lossy extraction; the null-check line is missing here.
12256 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
12258 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12259 if (!mref_reader
.is_state_satisfied())
12260 return -CEPHFS_ENOTCONN
;
12262 std::scoped_lock
lock(client_lock
);
12264 Fh
*f
= get_filehandle(fd
);
12266 return -CEPHFS_EBADF
;
12267 return _removexattr(f
->inode
, name
, perms
);
// setxattr: path-based set, following symlinks.  Layout-pool xattrs may need
// a newer osdmap, so that wait happens before client_lock is taken.
// NOTE(review): lossy extraction; InodeRef declaration and error return
// are missing here.
12270 int Client::setxattr(const char *path
, const char *name
, const void *value
,
12271 size_t size
, int flags
, const UserPerm
& perms
)
12273 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12274 if (!mref_reader
.is_state_satisfied())
12275 return -CEPHFS_ENOTCONN
;
12277 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12279 std::scoped_lock
lock(client_lock
);
12282 int r
= Client::path_walk(path
, &in
, perms
, true);
12285 return _setxattr(in
, name
, value
, size
, flags
, perms
);
// lsetxattr: set without following a trailing symlink; otherwise identical
// to setxattr above.  NOTE(review): lossy extraction; InodeRef declaration
// and error return are missing here.
12288 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
12289 size_t size
, int flags
, const UserPerm
& perms
)
12291 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12292 if (!mref_reader
.is_state_satisfied())
12293 return -CEPHFS_ENOTCONN
;
12295 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12297 std::scoped_lock
lock(client_lock
);
12300 int r
= Client::path_walk(path
, &in
, perms
, false);
12303 return _setxattr(in
, name
, value
, size
, flags
, perms
);
// fsetxattr: fd-based set; EBADF on a bad handle.  Waits for osdmap before
// locking, same as the path variants.
// NOTE(review): lossy extraction; the null-check line is missing here.
12306 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
12307 int flags
, const UserPerm
& perms
)
12309 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12310 if (!mref_reader
.is_state_satisfied())
12311 return -CEPHFS_ENOTCONN
;
12313 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12315 std::scoped_lock
lock(client_lock
);
12317 Fh
*f
= get_filehandle(fd
);
12319 return -CEPHFS_EBADF
;
12320 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
// _getxattr: core getxattr.  Order of resolution: (1) virtual "ceph.*"
// xattrs handled by vxattr callbacks (forcing a getattr with RSTAT/DIRSTAT
// caps when the vxattr needs them); (2) MDS-backed "ceph.*" via _getvxattr;
// (3) ACLs rejected when ACLs are disabled; (4) the regular xattr map.
// size==0 probes the required length; too-small buffers get -ERANGE.
// NOTE(review): lossy extraction; declarations (r, flags, buf, n) and
// several control-flow lines are missing between fragments.
12323 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
12324 const UserPerm
& perms
)
12327 const VXattr
*vxattr
= nullptr;
12329 vxattr
= _match_vxattr(in
, name
);
12331 r
= -CEPHFS_ENODATA
;
12333 // Do a force getattr to get the latest quota before returning
12334 // a value to userspace.
12336 if (vxattr
->flags
& VXATTR_RSTAT
) {
12337 flags
|= CEPH_STAT_RSTAT
;
12339 if (vxattr
->flags
& VXATTR_DIRSTAT
) {
12340 flags
|= CEPH_CAP_FILE_SHARED
;
12342 r
= _getattr(in
, flags
| CEPH_STAT_CAP_XATTR
, perms
, true);
12344 // Error from getattr!
12348 // call pointer-to-member function
12350 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
12351 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
12353 r
= -CEPHFS_ENODATA
;
12357 if (r
> (int)size
) {
12358 r
= -CEPHFS_ERANGE
;
12359 } else if (r
> 0) {
12360 memcpy(value
, buf
, r
);
// Unmatched ceph.* names fall through to the MDS-side vxattr lookup.
12366 if (!strncmp(name
, "ceph.", 5)) {
12367 r
= _getvxattr(in
, perms
, name
, size
, value
, MDS_RANK_NONE
);
12371 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
12372 r
= -CEPHFS_EOPNOTSUPP
;
// Refresh cached xattrs from the MDS only if we've never seen a version.
12376 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
12379 r
= -CEPHFS_ENODATA
;
12380 if (in
->xattrs
.count(n
)) {
12381 r
= in
->xattrs
[n
].length();
12382 if (r
> 0 && size
!= 0) {
12383 if (size
>= (unsigned)r
)
12384 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
12386 r
= -CEPHFS_ERANGE
;
12391 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
// _getxattr(InodeRef&): convenience overload that enforces MAY_READ when
// client-side permission checking is enabled, then forwards.
// NOTE(review): lossy extraction; the permission-failure return is missing.
12395 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
12396 const UserPerm
& perms
)
12398 if (cct
->_conf
->client_permissions
) {
12399 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
12403 return _getxattr(in
.get(), name
, value
, size
, perms
);
// ll_getxattr: low-level getxattr entry point; permission check only when
// FUSE isn't enforcing permissions itself.
// NOTE(review): lossy extraction; the permission-failure return is missing.
12406 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
12407 size_t size
, const UserPerm
& perms
)
12409 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12410 if (!mref_reader
.is_state_satisfied())
12411 return -CEPHFS_ENOTCONN
;
12413 vinodeno_t vino
= _get_vino(in
);
12415 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
12416 tout(cct
) << __func__
<< std::endl
;
12417 tout(cct
) << vino
.ino
.val
<< std::endl
;
12418 tout(cct
) << name
<< std::endl
;
12420 std::scoped_lock
lock(client_lock
);
12421 if (!fuse_default_permissions
) {
12422 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
12427 return _getxattr(in
, name
, value
, size
, perms
);
// _listxattr: emit the NUL-separated xattr name list, skipping "ceph.*"
// virtual names; size==0 only measures the required length.
// NOTE(review): lossy extraction; r accounting and early-exit lines are
// missing between fragments.
12430 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
12431 const UserPerm
& perms
)
12433 bool len_only
= (size
== 0);
12434 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
12440 for ([[maybe_unused
]] const auto &[xattr_name
, xattr_value_bl
] : in
->xattrs
) {
// ceph.* names are virtual and never listed.
12441 if (xattr_name
.rfind("ceph.", 0) == 0) {
12445 size_t this_len
= xattr_name
.length() + 1;
12450 if (this_len
> size
) {
12451 r
= -CEPHFS_ERANGE
;
12455 memcpy(name
, xattr_name
.c_str(), this_len
);
12460 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
// ll_listxattr: low-level listxattr wrapper — trace, lock, delegate.
12464 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
12465 const UserPerm
& perms
)
12467 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12468 if (!mref_reader
.is_state_satisfied())
12469 return -CEPHFS_ENOTCONN
;
12471 vinodeno_t vino
= _get_vino(in
);
12473 ldout(cct
, 3) << __func__
<< " " << vino
<< " size " << size
<< dendl
;
12474 tout(cct
) << __func__
<< std::endl
;
12475 tout(cct
) << vino
.ino
.val
<< std::endl
;
12476 tout(cct
) << size
<< std::endl
;
12478 std::scoped_lock
lock(client_lock
);
12479 return _listxattr(in
, names
, size
, perms
);
// _do_setxattr: build and send the CEPH_MDS_OP_SETXATTR MetaRequest;
// translates POSIX XATTR_CREATE/XATTR_REPLACE into CEPH_XATTR_* flags.
// NOTE(review): lossy extraction; filepath/bufferlist declarations and the
// value==NULL remove branch are missing between fragments.
12482 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
12483 size_t size
, int flags
, const UserPerm
& perms
)
12486 int xattr_flags
= 0;
12488 xattr_flags
|= CEPH_XATTR_REMOVE
;
12489 if (flags
& XATTR_CREATE
)
12490 xattr_flags
|= CEPH_XATTR_CREATE
;
12491 if (flags
& XATTR_REPLACE
)
12492 xattr_flags
|= CEPH_XATTR_REPLACE
;
12494 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
12496 in
->make_nosnap_relative_path(path
);
12497 req
->set_filepath(path
);
12498 req
->set_string2(name
);
12499 req
->set_inode(in
);
12500 req
->head
.args
.setxattr
.flags
= xattr_flags
;
12503 ceph_assert(value
|| size
== 0);
12504 bl
.append((const char*)value
, size
);
12507 int res
= make_request(req
, perms
);
12510 ldout(cct
, 3) << __func__
<< "(" << in
->ino
<< ", \"" << name
<< "\") = " <<
// _setxattr: core setxattr.  Rejects snapshots (EROFS) and unknown
// namespaces; handles POSIX ACL xattrs locally (possibly as a chmod for an
// equivalent-mode access ACL); read-only vxattrs are EOPNOTSUPP; setting a
// ceph.quota.* vxattr additionally verifies a snaprealm was created.
// NOTE(review): lossy extraction; several branch bodies and early returns
// are missing between fragments.
12515 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
12516 size_t size
, int flags
, const UserPerm
& perms
)
12518 if (in
->snapid
!= CEPH_NOSNAP
) {
12519 return -CEPHFS_EROFS
;
12524 } else if (value
== NULL
) {
12525 return -CEPHFS_EINVAL
;
12528 bool posix_acl_xattr
= false;
12529 if (acl_type
== POSIX_ACL
)
12530 posix_acl_xattr
= !strncmp(name
, "system.", 7);
// Only the namespaces the kernel client supports are accepted.
12532 if (strncmp(name
, "user.", 5) &&
12533 strncmp(name
, "security.", 9) &&
12534 strncmp(name
, "trusted.", 8) &&
12535 strncmp(name
, "ceph.", 5) &&
12537 return -CEPHFS_EOPNOTSUPP
;
12539 bool check_realm
= false;
12541 if (posix_acl_xattr
) {
12542 if (!strcmp(name
, ACL_EA_ACCESS
)) {
12543 mode_t new_mode
= in
->mode
;
12545 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
// An access ACL equivalent to a plain mode becomes a chmod.
12552 if (new_mode
!= in
->mode
) {
12553 struct ceph_statx stx
;
12554 stx
.stx_mode
= new_mode
;
12555 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, NULL
);
12560 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
12562 if (!S_ISDIR(in
->mode
))
12563 return -CEPHFS_EACCES
;
12564 int ret
= posix_acl_check(value
, size
);
12566 return -CEPHFS_EINVAL
;
12573 return -CEPHFS_EOPNOTSUPP
;
12576 const VXattr
*vxattr
= _match_vxattr(in
, name
);
12578 if (vxattr
->readonly
)
12579 return -CEPHFS_EOPNOTSUPP
;
12580 if (vxattr
->name
.compare(0, 10, "ceph.quota") == 0 && value
)
12581 check_realm
= true;
12585 int ret
= _do_setxattr(in
, name
, value
, size
, flags
, perms
);
12586 if (ret
>= 0 && check_realm
) {
12587 // check if snaprealm was created for quota inode
12588 if (in
->quota
.is_enable() &&
12589 !(in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
))
12590 ret
= -CEPHFS_EOPNOTSUPP
;
// _setxattr(InodeRef&): enforce MAY_WRITE when client-side permissions are
// on, then forward.  NOTE(review): lossy extraction; the permission-failure
// return is missing here.
12596 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
12597 size_t size
, int flags
, const UserPerm
& perms
)
12599 if (cct
->_conf
->client_permissions
) {
12600 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
12604 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
// _setxattr_check_data_pool: validate the pool named in a "layout" or
// "layout.pool" xattr value against the given osdmap; -ENOENT when the pool
// is unknown (caller may then wait for a newer map).  Uses a Boost.Spirit
// key=value parser for the composite "layout" form.
// NOTE(review): lossy extraction; try-block, tmp declaration and success
// returns are missing between fragments.
12607 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
12610 if (name
== "layout") {
12611 string::iterator begin
= value
.begin();
12612 string::iterator end
= value
.end();
12613 keys_and_values
<string::iterator
> p
; // create instance of parser
12614 std::map
<string
, string
> m
; // map to receive results
12615 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
12616 return -CEPHFS_EINVAL
;
12619 return -CEPHFS_EINVAL
;
12620 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
12621 if (q
->first
== "pool") {
12626 } else if (name
== "layout.pool") {
12630 if (tmp
.length()) {
// Numeric pool ids are tried first; fall back to a name lookup.
12633 pool
= boost::lexical_cast
<unsigned>(tmp
);
12634 if (!osdmap
->have_pg_pool(pool
))
12635 return -CEPHFS_ENOENT
;
12636 } catch (boost::bad_lexical_cast
const&) {
12637 pool
= osdmap
->lookup_pg_pool_name(tmp
);
12639 return -CEPHFS_ENOENT
;
// _setxattr_maybe_wait_for_osdmap: when a layout/pool xattr names a pool the
// current osdmap doesn't have, fetch the latest osdmap (blocking) so the MDS
// and client agree on the epoch before the setxattr request is sent.
// NOTE(review): lossy extraction; the error_code declaration and closing of
// the lambda are missing between fragments.
12647 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
12649 // For setting pool of layout, MetaRequest need osdmap epoch.
12650 // There is a race which create a new data pool but client and mds both don't have.
12651 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
12652 ldout(cct
, 15) << __func__
<< ": name = " << name
<< dendl
;
12653 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
12654 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
12655 string
rest(strstr(name
, "layout"));
12656 string
v((const char*)value
, size
);
12657 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
12658 return _setxattr_check_data_pool(rest
, v
, &o
);
12661 if (r
== -CEPHFS_ENOENT
) {
12663 ldout(cct
, 20) << __func__
<< ": waiting for latest osdmap" << dendl
;
12664 objecter
->wait_for_latest_osdmap(ca::use_blocked
[ec
]);
12665 ldout(cct
, 20) << __func__
<< ": got latest osdmap: " << ec
<< dendl
;
// ll_setxattr: low-level setxattr; waits for osdmap (layout pools) before
// taking client_lock, then permission-checks unless FUSE does it.
// NOTE(review): lossy extraction; the permission-failure return is missing.
12670 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
12671 size_t size
, int flags
, const UserPerm
& perms
)
12673 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12674 if (!mref_reader
.is_state_satisfied())
12675 return -CEPHFS_ENOTCONN
;
12677 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12679 vinodeno_t vino
= _get_vino(in
);
12681 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
12682 tout(cct
) << __func__
<< std::endl
;
12683 tout(cct
) << vino
.ino
.val
<< std::endl
;
12684 tout(cct
) << name
<< std::endl
;
12686 std::scoped_lock
lock(client_lock
);
12687 if (!fuse_default_permissions
) {
12688 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
12692 return _setxattr(in
, name
, value
, size
, flags
, perms
);
// _removexattr: core removal.  Snapshots are read-only (EROFS); only the
// kernel-supported namespaces are accepted; read-only vxattrs are rejected;
// otherwise issues CEPH_MDS_OP_RMXATTR.
// NOTE(review): lossy extraction; filepath declaration and final return
// are missing between fragments.
12695 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
12697 if (in
->snapid
!= CEPH_NOSNAP
) {
12698 return -CEPHFS_EROFS
;
12701 // same xattrs supported by kernel client
12702 if (strncmp(name
, "user.", 5) &&
12703 strncmp(name
, "system.", 7) &&
12704 strncmp(name
, "security.", 9) &&
12705 strncmp(name
, "trusted.", 8) &&
12706 strncmp(name
, "ceph.", 5))
12707 return -CEPHFS_EOPNOTSUPP
;
12709 const VXattr
*vxattr
= _match_vxattr(in
, name
);
12710 if (vxattr
&& vxattr
->readonly
)
12711 return -CEPHFS_EOPNOTSUPP
;
12713 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
12715 in
->make_nosnap_relative_path(path
);
12716 req
->set_filepath(path
);
12717 req
->set_filepath2(name
);
12718 req
->set_inode(in
);
12720 int res
= make_request(req
, perms
);
12723 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
// _removexattr(InodeRef&): enforce MAY_WRITE when client-side permissions
// are on, then forward.  NOTE(review): lossy extraction; the
// permission-failure return is missing here.
12727 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
12729 if (cct
->_conf
->client_permissions
) {
12730 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
12734 return _removexattr(in
.get(), name
, perms
);
// ll_removexattr: low-level removexattr entry point.
// NOTE(review): lossy extraction; the permission-failure return is missing.
12737 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
12739 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12740 if (!mref_reader
.is_state_satisfied())
12741 return -CEPHFS_ENOTCONN
;
12743 vinodeno_t vino
= _get_vino(in
);
12745 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
12746 tout(cct
) << "ll_removexattr" << std::endl
;
12747 tout(cct
) << vino
.ino
.val
<< std::endl
;
12748 tout(cct
) << name
<< std::endl
;
12750 std::scoped_lock
lock(client_lock
);
12751 if (!fuse_default_permissions
) {
12752 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
12757 return _removexattr(in
, name
, perms
);
12760 bool Client::_vxattrcb_quota_exists(Inode
*in
)
12762 return in
->quota
.is_enable() &&
12763 (in
->snapid
!= CEPH_NOSNAP
||
12764 (in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
));
12766 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
12768 return snprintf(val
, size
,
12769 "max_bytes=%lld max_files=%lld",
12770 (long long int)in
->quota
.max_bytes
,
12771 (long long int)in
->quota
.max_files
);
12773 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
12775 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
12777 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
12779 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
12782 bool Client::_vxattrcb_layout_exists(Inode
*in
)
12784 return in
->layout
!= file_layout_t();
// Virtual xattr "ceph.{file,dir}.layout": render stripe parameters plus the
// pool, resolved to a name via the osdmap when possible (raw id otherwise),
// and an optional pool_namespace.  NOTE(review): lossy extraction; the
// else-branch plumbing and final return are missing between fragments.
12786 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
12788 int r
= snprintf(val
, size
,
12789 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
12790 (unsigned long long)in
->layout
.stripe_unit
,
12791 (unsigned long long)in
->layout
.stripe_count
,
12792 (unsigned long long)in
->layout
.object_size
);
12793 objecter
->with_osdmap([&](const OSDMap
& o
) {
12794 if (o
.have_pg_pool(in
->layout
.pool_id
))
12795 r
+= snprintf(val
+ r
, size
- r
, "%s",
12796 o
.get_pool_name(in
->layout
.pool_id
).c_str());
12798 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
12799 (uint64_t)in
->layout
.pool_id
);
12801 if (in
->layout
.pool_ns
.length())
12802 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
12803 in
->layout
.pool_ns
.c_str());
12806 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
12808 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_unit
);
12810 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
12812 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_count
);
12814 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
12816 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.object_size
);
// Virtual xattr "ceph.*.layout.pool": pool name when the osdmap knows the
// pool, otherwise the numeric pool id.  NOTE(review): lossy extraction;
// the r declaration, else keyword and final return are missing here.
12818 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
12821 objecter
->with_osdmap([&](const OSDMap
& o
) {
12822 if (o
.have_pg_pool(in
->layout
.pool_id
))
12823 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
12824 in
->layout
.pool_id
).c_str());
12826 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
12830 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
12832 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
12834 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
12836 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
12838 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
12840 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nfiles
);
12842 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
12844 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nsubdirs
);
12846 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
12848 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
12850 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
12852 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rfiles
);
12854 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
12856 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsubdirs
);
12858 size_t Client::_vxattrcb_dir_rsnaps(Inode
*in
, char *val
, size_t size
)
12860 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsnaps
);
12862 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
12864 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rbytes
);
12866 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
12868 return snprintf(val
, size
, "%ld.%09ld", (long)in
->rstat
.rctime
.sec(),
12869 (long)in
->rstat
.rctime
.nsec());
12871 bool Client::_vxattrcb_dir_pin_exists(Inode
*in
)
12873 return in
->dir_pin
!= -CEPHFS_ENODATA
;
12875 size_t Client::_vxattrcb_dir_pin(Inode
*in
, char *val
, size_t size
)
12877 return snprintf(val
, size
, "%ld", (long)in
->dir_pin
);
12880 bool Client::_vxattrcb_snap_btime_exists(Inode
*in
)
12882 return !in
->snap_btime
.is_zero();
12885 size_t Client::_vxattrcb_snap_btime(Inode
*in
, char *val
, size_t size
)
12887 return snprintf(val
, size
, "%llu.%09lu",
12888 (long long unsigned)in
->snap_btime
.sec(),
12889 (long unsigned)in
->snap_btime
.nsec());
// Virtual xattr "ceph.caps": currently issued caps as "<string>/0x<mask>".
// NOTE(review): lossy extraction; the `issued` declaration is missing here.
12892 size_t Client::_vxattrcb_caps(Inode
*in
, char *val
, size_t size
)
12896 in
->caps_issued(&issued
);
12897 return snprintf(val
, size
, "%s/0x%x", ccap_string(issued
).c_str(), issued
);
12900 bool Client::_vxattrcb_mirror_info_exists(Inode
*in
)
12902 // checking one of the xattrs would suffice
12903 return in
->xattrs
.count("ceph.mirror.info.cluster_id") != 0;
12906 size_t Client::_vxattrcb_mirror_info(Inode
*in
, char *val
, size_t size
)
12908 return snprintf(val
, size
, "cluster_id=%.*s fs_id=%.*s",
12909 in
->xattrs
["ceph.mirror.info.cluster_id"].length(),
12910 in
->xattrs
["ceph.mirror.info.cluster_id"].c_str(),
12911 in
->xattrs
["ceph.mirror.info.fs_id"].length(),
12912 in
->xattrs
["ceph.mirror.info.fs_id"].c_str());
12915 size_t Client::_vxattrcb_cluster_fsid(Inode
*in
, char *val
, size_t size
)
12917 return snprintf(val
, size
, "%s", monclient
->get_fsid().to_string().c_str());
12920 size_t Client::_vxattrcb_client_id(Inode
*in
, char *val
, size_t size
)
12922 auto name
= messenger
->get_myname();
12923 return snprintf(val
, size
, "%s%" PRId64
, name
.type_str(), name
.num());
// Helper macros for building the VXattr tables below.  CEPH_XATTR_NAME(2)
// stringify "ceph.<type>.<name>[.<name2>]"; the XATTR_*_FIELD macros bind a
// name to its _vxattrcb_* getter (and, for layout/quota fields, the matching
// exists callback).  NOTE(review): lossy extraction; some continuation
// lines of these macro definitions are missing.
12926 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
12927 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
12929 #define XATTR_NAME_CEPH(_type, _name, _flags) \
12931 name: CEPH_XATTR_NAME(_type, _name), \
12932 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12937 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
12939 name: CEPH_XATTR_NAME2(_type, _name, _field), \
12940 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
12942 exists_cb: &Client::_vxattrcb_layout_exists, \
12945 #define XATTR_QUOTA_FIELD(_type, _name) \
12947 name: CEPH_XATTR_NAME(_type, _name), \
12948 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12950 exists_cb: &Client::_vxattrcb_quota_exists, \
// Directory virtual-xattr table: layout (+ per-field entries), dirstat and
// rstat counters, quota (+ per-field entries), export pin, snapshot btime,
// mirror info and caps; terminated by an empty name.
// NOTE(review): lossy extraction; braces and flag initializers are missing
// between fragments.
12954 const Client::VXattr
Client::_dir_vxattrs
[] = {
12956 name
: "ceph.dir.layout",
12957 getxattr_cb
: &Client::_vxattrcb_layout
,
12959 exists_cb
: &Client::_vxattrcb_layout_exists
,
12963 // Delete the following dir layout field definitions for release "S"
12964 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_unit
),
12965 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_count
),
12966 XATTR_LAYOUT_FIELD(dir
, layout
, object_size
),
12967 XATTR_LAYOUT_FIELD(dir
, layout
, pool
),
12968 XATTR_LAYOUT_FIELD(dir
, layout
, pool_namespace
),
12969 XATTR_NAME_CEPH(dir
, entries
, VXATTR_DIRSTAT
),
12970 XATTR_NAME_CEPH(dir
, files
, VXATTR_DIRSTAT
),
12971 XATTR_NAME_CEPH(dir
, subdirs
, VXATTR_DIRSTAT
),
12972 XATTR_NAME_CEPH(dir
, rentries
, VXATTR_RSTAT
),
12973 XATTR_NAME_CEPH(dir
, rfiles
, VXATTR_RSTAT
),
12974 XATTR_NAME_CEPH(dir
, rsubdirs
, VXATTR_RSTAT
),
12975 XATTR_NAME_CEPH(dir
, rsnaps
, VXATTR_RSTAT
),
12976 XATTR_NAME_CEPH(dir
, rbytes
, VXATTR_RSTAT
),
12977 XATTR_NAME_CEPH(dir
, rctime
, VXATTR_RSTAT
),
12979 name
: "ceph.quota",
12980 getxattr_cb
: &Client::_vxattrcb_quota
,
12982 exists_cb
: &Client::_vxattrcb_quota_exists
,
12985 XATTR_QUOTA_FIELD(quota
, max_bytes
),
12986 XATTR_QUOTA_FIELD(quota
, max_files
),
12988 // Delete the following dir pin field definitions for release "S"
12990 name
: "ceph.dir.pin",
12991 getxattr_cb
: &Client::_vxattrcb_dir_pin
,
12993 exists_cb
: &Client::_vxattrcb_dir_pin_exists
,
12997 name
: "ceph.snap.btime",
12998 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
13000 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
13004 name
: "ceph.mirror.info",
13005 getxattr_cb
: &Client::_vxattrcb_mirror_info
,
13007 exists_cb
: &Client::_vxattrcb_mirror_info_exists
,
13012 getxattr_cb
: &Client::_vxattrcb_caps
,
13017 { name
: "" } /* Required table terminator */
// Regular-file virtual-xattr table: layout (+ per-field entries), snapshot
// btime and caps; terminated by an empty name.
// NOTE(review): lossy extraction; braces and flag initializers are missing.
13020 const Client::VXattr
Client::_file_vxattrs
[] = {
13022 name
: "ceph.file.layout",
13023 getxattr_cb
: &Client::_vxattrcb_layout
,
13025 exists_cb
: &Client::_vxattrcb_layout_exists
,
13028 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
13029 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
13030 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
13031 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
13032 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
13034 name
: "ceph.snap.btime",
13035 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
13037 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
13042 getxattr_cb
: &Client::_vxattrcb_caps
,
13047 { name
: "" } /* Required table terminator */
// Virtual xattrs valid on any inode type (no exists callback — always
// present); terminated by an empty name.
// NOTE(review): lossy extraction; braces and flag initializers are missing.
13050 const Client::VXattr
Client::_common_vxattrs
[] = {
13052 name
: "ceph.cluster_fsid",
13053 getxattr_cb
: &Client::_vxattrcb_cluster_fsid
,
13055 exists_cb
: nullptr,
13059 name
: "ceph.client_id",
13060 getxattr_cb
: &Client::_vxattrcb_client_id
,
13062 exists_cb
: nullptr,
13065 { name
: "" } /* Required table terminator */
// _get_vxattrs: choose the type-specific vxattr table for an inode.
// NOTE(review): lossy extraction; the is_dir() check and fallback return
// are missing here.
13068 const Client::VXattr
*Client::_get_vxattrs(Inode
*in
)
13071 return _dir_vxattrs
;
13072 else if (in
->is_file())
13073 return _file_vxattrs
;
// _match_vxattr: linear scan of the type-specific table then the common
// table for an exact "ceph.*" name match; non-ceph names never match.
// NOTE(review): lossy extraction; the match/miss return statements are
// missing between fragments.
13077 const Client::VXattr
*Client::_match_vxattr(Inode
*in
, const char *name
)
13079 if (strncmp(name
, "ceph.", 5) == 0) {
13080 const VXattr
*vxattr
= _get_vxattrs(in
);
13082 while (!vxattr
->name
.empty()) {
13083 if (vxattr
->name
== name
)
13089 // for common vxattrs
13090 vxattr
= _common_vxattrs
;
13091 while (!vxattr
->name
.empty()) {
13092 if (vxattr
->name
== name
)
// ll_readlink: read a symlink target into `buf`.  Iterates the inode's
// dentries (original loop body — lease touching — missing from this lossy
// extraction) before delegating to _readlink.
13101 int Client::ll_readlink(Inode
*in
, char *buf
, size_t buflen
, const UserPerm
& perms
)
13103 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13104 if (!mref_reader
.is_state_satisfied())
13105 return -CEPHFS_ENOTCONN
;
13107 vinodeno_t vino
= _get_vino(in
);
13109 ldout(cct
, 3) << "ll_readlink " << vino
<< dendl
;
13110 tout(cct
) << "ll_readlink" << std::endl
;
13111 tout(cct
) << vino
.ino
.val
<< std::endl
;
13113 std::scoped_lock
lock(client_lock
);
13114 for (auto dn
: in
->dentries
) {
13118 int r
= _readlink(in
, buf
, buflen
); // FIXME: no permission checking!
13119 ldout(cct
, 3) << "ll_readlink " << vino
<< " = " << r
<< dendl
;
13123 int Client::_mknod(Inode
*dir
, const char *name
, mode_t mode
, dev_t rdev
,
13124 const UserPerm
& perms
, InodeRef
*inp
)
13126 ldout(cct
, 8) << "_mknod(" << dir
->ino
<< " " << name
<< ", 0" << oct
13127 << mode
<< dec
<< ", " << rdev
<< ", uid " << perms
.uid()
13128 << ", gid " << perms
.gid() << ")" << dendl
;
13130 if (strlen(name
) > NAME_MAX
)
13131 return -CEPHFS_ENAMETOOLONG
;
13133 if (dir
->snapid
!= CEPH_NOSNAP
) {
13134 return -CEPHFS_EROFS
;
13136 if (is_quota_files_exceeded(dir
, perms
)) {
13137 return -CEPHFS_EDQUOT
;
13140 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_MKNOD
);
13143 dir
->make_nosnap_relative_path(path
);
13144 path
.push_dentry(name
);
13145 req
->set_filepath(path
);
13146 req
->set_inode(dir
);
13147 req
->head
.args
.mknod
.rdev
= rdev
;
13148 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13149 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13151 bufferlist xattrs_bl
;
13152 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
13155 req
->head
.args
.mknod
.mode
= mode
;
13156 if (xattrs_bl
.length() > 0)
13157 req
->set_data(xattrs_bl
);
13160 res
= get_or_create(dir
, name
, &de
);
13163 req
->set_dentry(de
);
13165 res
= make_request(req
, perms
, inp
);
13169 ldout(cct
, 8) << "mknod(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
13177 int Client::ll_mknod(Inode
*parent
, const char *name
, mode_t mode
,
13178 dev_t rdev
, struct stat
*attr
, Inode
**out
,
13179 const UserPerm
& perms
)
13181 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13182 if (!mref_reader
.is_state_satisfied())
13183 return -CEPHFS_ENOTCONN
;
13185 vinodeno_t vparent
= _get_vino(parent
);
13187 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
<< dendl
;
13188 tout(cct
) << "ll_mknod" << std::endl
;
13189 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13190 tout(cct
) << name
<< std::endl
;
13191 tout(cct
) << mode
<< std::endl
;
13192 tout(cct
) << rdev
<< std::endl
;
13194 std::scoped_lock
lock(client_lock
);
13195 if (!fuse_default_permissions
) {
13196 int r
= may_create(parent
, perms
);
13202 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
13204 fill_stat(in
, attr
);
13207 tout(cct
) << attr
->st_ino
<< std::endl
;
13208 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
13209 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
13214 int Client::ll_mknodx(Inode
*parent
, const char *name
, mode_t mode
,
13215 dev_t rdev
, Inode
**out
,
13216 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
13217 const UserPerm
& perms
)
13219 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13220 if (!mref_reader
.is_state_satisfied())
13221 return -CEPHFS_ENOTCONN
;
13223 unsigned caps
= statx_to_mask(flags
, want
);
13225 vinodeno_t vparent
= _get_vino(parent
);
13227 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
<< dendl
;
13228 tout(cct
) << "ll_mknodx" << std::endl
;
13229 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13230 tout(cct
) << name
<< std::endl
;
13231 tout(cct
) << mode
<< std::endl
;
13232 tout(cct
) << rdev
<< std::endl
;
13234 std::scoped_lock
lock(client_lock
);
13236 if (!fuse_default_permissions
) {
13237 int r
= may_create(parent
, perms
);
13243 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
13245 fill_statx(in
, caps
, stx
);
13248 tout(cct
) << stx
->stx_ino
<< std::endl
;
13249 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
13250 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
13255 int Client::_create(Inode
*dir
, const char *name
, int flags
, mode_t mode
,
13256 InodeRef
*inp
, Fh
**fhp
, int stripe_unit
, int stripe_count
,
13257 int object_size
, const char *data_pool
, bool *created
,
13258 const UserPerm
& perms
, std::string alternate_name
)
13260 ldout(cct
, 8) << "_create(" << dir
->ino
<< " " << name
<< ", 0" << oct
<<
13261 mode
<< dec
<< ")" << dendl
;
13263 if (strlen(name
) > NAME_MAX
)
13264 return -CEPHFS_ENAMETOOLONG
;
13265 if (dir
->snapid
!= CEPH_NOSNAP
) {
13266 return -CEPHFS_EROFS
;
13268 if (is_quota_files_exceeded(dir
, perms
)) {
13269 return -CEPHFS_EDQUOT
;
13272 // use normalized flags to generate cmode
13273 int cflags
= ceph_flags_sys2wire(flags
);
13274 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
13275 cflags
|= CEPH_O_LAZY
;
13277 int cmode
= ceph_flags_to_mode(cflags
);
13279 int64_t pool_id
= -1;
13280 if (data_pool
&& *data_pool
) {
13281 pool_id
= objecter
->with_osdmap(
13282 std::mem_fn(&OSDMap::lookup_pg_pool_name
), data_pool
);
13284 return -CEPHFS_EINVAL
;
13285 if (pool_id
> 0xffffffffll
)
13286 return -CEPHFS_ERANGE
; // bummer!
13289 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_CREATE
);
13292 dir
->make_nosnap_relative_path(path
);
13293 path
.push_dentry(name
);
13294 req
->set_filepath(path
);
13295 req
->set_alternate_name(std::move(alternate_name
));
13296 req
->set_inode(dir
);
13297 req
->head
.args
.open
.flags
= cflags
| CEPH_O_CREAT
;
13299 req
->head
.args
.open
.stripe_unit
= stripe_unit
;
13300 req
->head
.args
.open
.stripe_count
= stripe_count
;
13301 req
->head
.args
.open
.object_size
= object_size
;
13302 if (cct
->_conf
->client_debug_getattr_caps
)
13303 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
13305 req
->head
.args
.open
.mask
= 0;
13306 req
->head
.args
.open
.pool
= pool_id
;
13307 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13308 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13311 bufferlist xattrs_bl
;
13312 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
13315 req
->head
.args
.open
.mode
= mode
;
13316 if (xattrs_bl
.length() > 0)
13317 req
->set_data(xattrs_bl
);
13320 res
= get_or_create(dir
, name
, &de
);
13323 req
->set_dentry(de
);
13325 res
= make_request(req
, perms
, inp
, created
);
13330 /* If the caller passed a value in fhp, do the open */
13332 (*inp
)->get_open_ref(cmode
);
13333 *fhp
= _create_fh(inp
->get(), flags
, cmode
, perms
);
13339 ldout(cct
, 8) << "create(" << path
<< ", 0" << oct
<< mode
<< dec
13340 << " layout " << stripe_unit
13341 << ' ' << stripe_count
13342 << ' ' << object_size
13343 <<") = " << res
<< dendl
;
13351 int Client::_mkdir(Inode
*dir
, const char *name
, mode_t mode
, const UserPerm
& perm
,
13352 InodeRef
*inp
, const std::map
<std::string
, std::string
> &metadata
,
13353 std::string alternate_name
)
13355 ldout(cct
, 8) << "_mkdir(" << dir
->ino
<< " " << name
<< ", 0" << oct
13356 << mode
<< dec
<< ", uid " << perm
.uid()
13357 << ", gid " << perm
.gid() << ")" << dendl
;
13359 if (strlen(name
) > NAME_MAX
)
13360 return -CEPHFS_ENAMETOOLONG
;
13362 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
13363 return -CEPHFS_EROFS
;
13365 if (is_quota_files_exceeded(dir
, perm
)) {
13366 return -CEPHFS_EDQUOT
;
13369 bool is_snap_op
= dir
->snapid
== CEPH_SNAPDIR
;
13370 MetaRequest
*req
= new MetaRequest(is_snap_op
?
13371 CEPH_MDS_OP_MKSNAP
: CEPH_MDS_OP_MKDIR
);
13374 dir
->make_nosnap_relative_path(path
);
13375 path
.push_dentry(name
);
13376 req
->set_filepath(path
);
13377 req
->set_inode(dir
);
13378 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13379 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13380 req
->set_alternate_name(std::move(alternate_name
));
13384 int res
= _posix_acl_create(dir
, &mode
, bl
, perm
);
13387 req
->head
.args
.mkdir
.mode
= mode
;
13389 SnapPayload payload
;
13390 // clear the bufferlist that may have been populated by the call
13391 // to _posix_acl_create(). MDS mksnap does not make use of it.
13392 // So, reuse it to pass metadata payload.
13394 payload
.metadata
= metadata
;
13395 encode(payload
, bl
);
13397 if (bl
.length() > 0) {
13402 res
= get_or_create(dir
, name
, &de
);
13405 req
->set_dentry(de
);
13407 ldout(cct
, 10) << "_mkdir: making request" << dendl
;
13408 res
= make_request(req
, perm
, inp
);
13409 ldout(cct
, 10) << "_mkdir result is " << res
<< dendl
;
13413 ldout(cct
, 8) << "_mkdir(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
13421 int Client::ll_mkdir(Inode
*parent
, const char *name
, mode_t mode
,
13422 struct stat
*attr
, Inode
**out
, const UserPerm
& perm
)
13424 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13425 if (!mref_reader
.is_state_satisfied())
13426 return -CEPHFS_ENOTCONN
;
13428 vinodeno_t vparent
= _get_vino(parent
);
13430 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
<< dendl
;
13431 tout(cct
) << "ll_mkdir" << std::endl
;
13432 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13433 tout(cct
) << name
<< std::endl
;
13434 tout(cct
) << mode
<< std::endl
;
13436 std::scoped_lock
lock(client_lock
);
13438 if (!fuse_default_permissions
) {
13439 int r
= may_create(parent
, perm
);
13445 int r
= _mkdir(parent
, name
, mode
, perm
, &in
);
13447 fill_stat(in
, attr
);
13450 tout(cct
) << attr
->st_ino
<< std::endl
;
13451 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
13452 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
13457 int Client::ll_mkdirx(Inode
*parent
, const char *name
, mode_t mode
, Inode
**out
,
13458 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
13459 const UserPerm
& perms
)
13461 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13462 if (!mref_reader
.is_state_satisfied())
13463 return -CEPHFS_ENOTCONN
;
13465 vinodeno_t vparent
= _get_vino(parent
);
13467 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
<< dendl
;
13468 tout(cct
) << "ll_mkdirx" << std::endl
;
13469 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13470 tout(cct
) << name
<< std::endl
;
13471 tout(cct
) << mode
<< std::endl
;
13473 std::scoped_lock
lock(client_lock
);
13475 if (!fuse_default_permissions
) {
13476 int r
= may_create(parent
, perms
);
13482 int r
= _mkdir(parent
, name
, mode
, perms
, &in
);
13484 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
13490 tout(cct
) << stx
->stx_ino
<< std::endl
;
13491 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
13492 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
13497 int Client::_symlink(Inode
*dir
, const char *name
, const char *target
,
13498 const UserPerm
& perms
, std::string alternate_name
, InodeRef
*inp
)
13500 ldout(cct
, 8) << "_symlink(" << dir
->ino
<< " " << name
<< ", " << target
13501 << ", uid " << perms
.uid() << ", gid " << perms
.gid() << ")"
13504 if (strlen(name
) > NAME_MAX
)
13505 return -CEPHFS_ENAMETOOLONG
;
13507 if (dir
->snapid
!= CEPH_NOSNAP
) {
13508 return -CEPHFS_EROFS
;
13510 if (is_quota_files_exceeded(dir
, perms
)) {
13511 return -CEPHFS_EDQUOT
;
13514 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SYMLINK
);
13517 dir
->make_nosnap_relative_path(path
);
13518 path
.push_dentry(name
);
13519 req
->set_filepath(path
);
13520 req
->set_alternate_name(std::move(alternate_name
));
13521 req
->set_inode(dir
);
13522 req
->set_string2(target
);
13523 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13524 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13527 int res
= get_or_create(dir
, name
, &de
);
13530 req
->set_dentry(de
);
13532 res
= make_request(req
, perms
, inp
);
13535 ldout(cct
, 8) << "_symlink(\"" << path
<< "\", \"" << target
<< "\") = " <<
13544 int Client::ll_symlink(Inode
*parent
, const char *name
, const char *value
,
13545 struct stat
*attr
, Inode
**out
, const UserPerm
& perms
)
13547 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13548 if (!mref_reader
.is_state_satisfied())
13549 return -CEPHFS_ENOTCONN
;
13551 vinodeno_t vparent
= _get_vino(parent
);
13553 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
<< " -> " << value
13555 tout(cct
) << "ll_symlink" << std::endl
;
13556 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13557 tout(cct
) << name
<< std::endl
;
13558 tout(cct
) << value
<< std::endl
;
13560 std::scoped_lock
lock(client_lock
);
13562 if (!fuse_default_permissions
) {
13563 int r
= may_create(parent
, perms
);
13569 int r
= _symlink(parent
, name
, value
, perms
, "", &in
);
13571 fill_stat(in
, attr
);
13574 tout(cct
) << attr
->st_ino
<< std::endl
;
13575 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
13576 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
13581 int Client::ll_symlinkx(Inode
*parent
, const char *name
, const char *value
,
13582 Inode
**out
, struct ceph_statx
*stx
, unsigned want
,
13583 unsigned flags
, const UserPerm
& perms
)
13585 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13586 if (!mref_reader
.is_state_satisfied())
13587 return -CEPHFS_ENOTCONN
;
13589 vinodeno_t vparent
= _get_vino(parent
);
13591 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
<< " -> " << value
13593 tout(cct
) << "ll_symlinkx" << std::endl
;
13594 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13595 tout(cct
) << name
<< std::endl
;
13596 tout(cct
) << value
<< std::endl
;
13598 std::scoped_lock
lock(client_lock
);
13600 if (!fuse_default_permissions
) {
13601 int r
= may_create(parent
, perms
);
13607 int r
= _symlink(parent
, name
, value
, perms
, "", &in
);
13609 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
13612 tout(cct
) << stx
->stx_ino
<< std::endl
;
13613 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
13614 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
13619 int Client::_unlink(Inode
*dir
, const char *name
, const UserPerm
& perm
)
13621 ldout(cct
, 8) << "_unlink(" << dir
->ino
<< " " << name
13622 << " uid " << perm
.uid() << " gid " << perm
.gid()
13625 if (dir
->snapid
!= CEPH_NOSNAP
) {
13626 return -CEPHFS_EROFS
;
13629 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_UNLINK
);
13632 dir
->make_nosnap_relative_path(path
);
13633 path
.push_dentry(name
);
13634 req
->set_filepath(path
);
13640 int res
= get_or_create(dir
, name
, &de
);
13643 req
->set_dentry(de
);
13644 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13645 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13647 res
= _lookup(dir
, name
, 0, &otherin
, perm
);
13651 in
= otherin
.get();
13652 req
->set_other_inode(in
);
13653 in
->break_all_delegs();
13654 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
13656 req
->set_inode(dir
);
13658 res
= make_request(req
, perm
);
13661 ldout(cct
, 8) << "unlink(" << path
<< ") = " << res
<< dendl
;
13669 int Client::ll_unlink(Inode
*in
, const char *name
, const UserPerm
& perm
)
13671 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13672 if (!mref_reader
.is_state_satisfied())
13673 return -CEPHFS_ENOTCONN
;
13675 vinodeno_t vino
= _get_vino(in
);
13677 ldout(cct
, 3) << "ll_unlink " << vino
<< " " << name
<< dendl
;
13678 tout(cct
) << "ll_unlink" << std::endl
;
13679 tout(cct
) << vino
.ino
.val
<< std::endl
;
13680 tout(cct
) << name
<< std::endl
;
13682 std::scoped_lock
lock(client_lock
);
13684 if (!fuse_default_permissions
) {
13685 int r
= may_delete(in
, name
, perm
);
13689 return _unlink(in
, name
, perm
);
13692 int Client::_rmdir(Inode
*dir
, const char *name
, const UserPerm
& perms
)
13694 ldout(cct
, 8) << "_rmdir(" << dir
->ino
<< " " << name
<< " uid "
13695 << perms
.uid() << " gid " << perms
.gid() << ")" << dendl
;
13697 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
13698 return -CEPHFS_EROFS
;
13701 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_RMSNAP
: CEPH_MDS_OP_RMDIR
;
13702 MetaRequest
*req
= new MetaRequest(op
);
13704 dir
->make_nosnap_relative_path(path
);
13705 path
.push_dentry(name
);
13706 req
->set_filepath(path
);
13707 req
->set_inode(dir
);
13709 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13710 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13711 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
13716 int res
= get_or_create(dir
, name
, &de
);
13719 if (op
== CEPH_MDS_OP_RMDIR
)
13720 req
->set_dentry(de
);
13724 res
= _lookup(dir
, name
, 0, &in
, perms
);
13728 if (op
== CEPH_MDS_OP_RMSNAP
) {
13729 unlink(de
, true, true);
13732 req
->set_other_inode(in
.get());
13734 res
= make_request(req
, perms
);
13737 ldout(cct
, 8) << "rmdir(" << path
<< ") = " << res
<< dendl
;
13745 int Client::ll_rmdir(Inode
*in
, const char *name
, const UserPerm
& perms
)
13747 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13748 if (!mref_reader
.is_state_satisfied())
13749 return -CEPHFS_ENOTCONN
;
13751 vinodeno_t vino
= _get_vino(in
);
13753 ldout(cct
, 3) << "ll_rmdir " << vino
<< " " << name
<< dendl
;
13754 tout(cct
) << "ll_rmdir" << std::endl
;
13755 tout(cct
) << vino
.ino
.val
<< std::endl
;
13756 tout(cct
) << name
<< std::endl
;
13758 std::scoped_lock
lock(client_lock
);
13760 if (!fuse_default_permissions
) {
13761 int r
= may_delete(in
, name
, perms
);
13766 return _rmdir(in
, name
, perms
);
13769 int Client::_rename(Inode
*fromdir
, const char *fromname
, Inode
*todir
, const char *toname
, const UserPerm
& perm
, std::string alternate_name
)
13771 ldout(cct
, 8) << "_rename(" << fromdir
->ino
<< " " << fromname
<< " to "
13772 << todir
->ino
<< " " << toname
13773 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")"
13776 if (fromdir
->snapid
!= todir
->snapid
)
13777 return -CEPHFS_EXDEV
;
13779 int op
= CEPH_MDS_OP_RENAME
;
13780 if (fromdir
->snapid
!= CEPH_NOSNAP
) {
13781 if (fromdir
== todir
&& fromdir
->snapid
== CEPH_SNAPDIR
)
13782 op
= CEPH_MDS_OP_RENAMESNAP
;
13784 return -CEPHFS_EROFS
;
13786 if (fromdir
!= todir
) {
13787 Inode
*fromdir_root
=
13788 fromdir
->quota
.is_enable() ? fromdir
: get_quota_root(fromdir
, perm
);
13789 Inode
*todir_root
=
13790 todir
->quota
.is_enable() ? todir
: get_quota_root(todir
, perm
);
13791 if (fromdir_root
!= todir_root
) {
13792 return -CEPHFS_EXDEV
;
13797 MetaRequest
*req
= new MetaRequest(op
);
13800 fromdir
->make_nosnap_relative_path(from
);
13801 from
.push_dentry(fromname
);
13803 todir
->make_nosnap_relative_path(to
);
13804 to
.push_dentry(toname
);
13805 req
->set_filepath(to
);
13806 req
->set_filepath2(from
);
13807 req
->set_alternate_name(std::move(alternate_name
));
13810 int res
= get_or_create(fromdir
, fromname
, &oldde
);
13814 res
= get_or_create(todir
, toname
, &de
);
13818 if (op
== CEPH_MDS_OP_RENAME
) {
13819 req
->set_old_dentry(oldde
);
13820 req
->old_dentry_drop
= CEPH_CAP_FILE_SHARED
;
13821 req
->old_dentry_unless
= CEPH_CAP_FILE_EXCL
;
13823 req
->set_dentry(de
);
13824 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13825 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13827 InodeRef oldin
, otherin
;
13828 res
= _lookup(fromdir
, fromname
, 0, &oldin
, perm
);
13832 Inode
*oldinode
= oldin
.get();
13833 oldinode
->break_all_delegs();
13834 req
->set_old_inode(oldinode
);
13835 req
->old_inode_drop
= CEPH_CAP_LINK_SHARED
;
13837 res
= _lookup(todir
, toname
, 0, &otherin
, perm
);
13841 Inode
*in
= otherin
.get();
13842 req
->set_other_inode(in
);
13843 in
->break_all_delegs();
13845 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
13847 case -CEPHFS_ENOENT
:
13853 req
->set_inode(todir
);
13855 // renamesnap reply contains no tracedn, so we need to invalidate
13857 unlink(oldde
, true, true);
13858 unlink(de
, true, true);
13860 req
->set_inode(todir
);
13863 res
= make_request(req
, perm
, &target
);
13864 ldout(cct
, 10) << "rename result is " << res
<< dendl
;
13866 // renamed item from our cache
13869 ldout(cct
, 8) << "_rename(" << from
<< ", " << to
<< ") = " << res
<< dendl
;
13877 int Client::ll_rename(Inode
*parent
, const char *name
, Inode
*newparent
,
13878 const char *newname
, const UserPerm
& perm
)
13880 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13881 if (!mref_reader
.is_state_satisfied())
13882 return -CEPHFS_ENOTCONN
;
13884 vinodeno_t vparent
= _get_vino(parent
);
13885 vinodeno_t vnewparent
= _get_vino(newparent
);
13887 ldout(cct
, 3) << "ll_rename " << vparent
<< " " << name
<< " to "
13888 << vnewparent
<< " " << newname
<< dendl
;
13889 tout(cct
) << "ll_rename" << std::endl
;
13890 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13891 tout(cct
) << name
<< std::endl
;
13892 tout(cct
) << vnewparent
.ino
.val
<< std::endl
;
13893 tout(cct
) << newname
<< std::endl
;
13895 std::scoped_lock
lock(client_lock
);
13897 if (!fuse_default_permissions
) {
13898 int r
= may_delete(parent
, name
, perm
);
13901 r
= may_delete(newparent
, newname
, perm
);
13902 if (r
< 0 && r
!= -CEPHFS_ENOENT
)
13906 return _rename(parent
, name
, newparent
, newname
, perm
, "");
13909 int Client::_link(Inode
*in
, Inode
*dir
, const char *newname
, const UserPerm
& perm
, std::string alternate_name
, InodeRef
*inp
)
13911 ldout(cct
, 8) << "_link(" << in
->ino
<< " to " << dir
->ino
<< " " << newname
13912 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")" << dendl
;
13914 if (strlen(newname
) > NAME_MAX
)
13915 return -CEPHFS_ENAMETOOLONG
;
13917 if (in
->snapid
!= CEPH_NOSNAP
|| dir
->snapid
!= CEPH_NOSNAP
) {
13918 return -CEPHFS_EROFS
;
13920 if (is_quota_files_exceeded(dir
, perm
)) {
13921 return -CEPHFS_EDQUOT
;
13924 in
->break_all_delegs();
13925 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LINK
);
13927 filepath
path(newname
, dir
->ino
);
13928 req
->set_filepath(path
);
13929 req
->set_alternate_name(std::move(alternate_name
));
13930 filepath
existing(in
->ino
);
13931 req
->set_filepath2(existing
);
13933 req
->set_inode(dir
);
13934 req
->inode_drop
= CEPH_CAP_FILE_SHARED
;
13935 req
->inode_unless
= CEPH_CAP_FILE_EXCL
;
13938 int res
= get_or_create(dir
, newname
, &de
);
13941 req
->set_dentry(de
);
13943 res
= make_request(req
, perm
, inp
);
13944 ldout(cct
, 10) << "link result is " << res
<< dendl
;
13947 ldout(cct
, 8) << "link(" << existing
<< ", " << path
<< ") = " << res
<< dendl
;
13955 int Client::ll_link(Inode
*in
, Inode
*newparent
, const char *newname
,
13956 const UserPerm
& perm
)
13958 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13959 if (!mref_reader
.is_state_satisfied())
13960 return -CEPHFS_ENOTCONN
;
13962 vinodeno_t vino
= _get_vino(in
);
13963 vinodeno_t vnewparent
= _get_vino(newparent
);
13965 ldout(cct
, 3) << "ll_link " << vino
<< " to " << vnewparent
<< " " <<
13967 tout(cct
) << "ll_link" << std::endl
;
13968 tout(cct
) << vino
.ino
.val
<< std::endl
;
13969 tout(cct
) << vnewparent
<< std::endl
;
13970 tout(cct
) << newname
<< std::endl
;
13974 std::scoped_lock
lock(client_lock
);
13976 if (!fuse_default_permissions
) {
13977 if (S_ISDIR(in
->mode
))
13978 return -CEPHFS_EPERM
;
13980 int r
= may_hardlink(in
, perm
);
13984 r
= may_create(newparent
, perm
);
13989 return _link(in
, newparent
, newname
, perm
, "", &target
);
13992 int Client::ll_num_osds(void)
13994 std::scoped_lock
lock(client_lock
);
13995 return objecter
->with_osdmap(std::mem_fn(&OSDMap::get_num_osds
));
13998 int Client::ll_osdaddr(int osd
, uint32_t *addr
)
14000 std::scoped_lock
lock(client_lock
);
14003 bool exists
= objecter
->with_osdmap([&](const OSDMap
& o
) {
14004 if (!o
.exists(osd
))
14006 g
= o
.get_addrs(osd
).front();
14011 uint32_t nb_addr
= (g
.in4_addr()).sin_addr
.s_addr
;
14012 *addr
= ntohl(nb_addr
);
14016 uint32_t Client::ll_stripe_unit(Inode
*in
)
14018 std::scoped_lock
lock(client_lock
);
14019 return in
->layout
.stripe_unit
;
14022 uint64_t Client::ll_snap_seq(Inode
*in
)
14024 std::scoped_lock
lock(client_lock
);
14025 return in
->snaprealm
->seq
;
14028 int Client::ll_file_layout(Inode
*in
, file_layout_t
*layout
)
14030 std::scoped_lock
lock(client_lock
);
14031 *layout
= in
->layout
;
14035 int Client::ll_file_layout(Fh
*fh
, file_layout_t
*layout
)
14037 return ll_file_layout(fh
->inode
.get(), layout
);
14040 /* Currently we cannot take advantage of redundancy in reads, since we
14041 would have to go through all possible placement groups (a
14042 potentially quite large number determined by a hash), and use CRUSH
14043 to calculate the appropriate set of OSDs for each placement group,
14044 then index into that. An array with one entry per OSD is much more
14045 tractable and works for demonstration purposes. */
14047 int Client::ll_get_stripe_osd(Inode
*in
, uint64_t blockno
,
14048 file_layout_t
* layout
)
14050 std::scoped_lock
lock(client_lock
);
14052 inodeno_t ino
= in
->ino
;
14053 uint32_t object_size
= layout
->object_size
;
14054 uint32_t su
= layout
->stripe_unit
;
14055 uint32_t stripe_count
= layout
->stripe_count
;
14056 uint64_t stripes_per_object
= object_size
/ su
;
14057 uint64_t stripeno
= 0, stripepos
= 0;
14060 stripeno
= blockno
/ stripe_count
; // which horizontal stripe (Y)
14061 stripepos
= blockno
% stripe_count
; // which object in the object set (X)
14063 uint64_t objectsetno
= stripeno
/ stripes_per_object
; // which object set
14064 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
; // object id
14066 object_t oid
= file_object_t(ino
, objectno
);
14067 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14068 ceph_object_layout olayout
=
14069 o
.file_to_object_layout(oid
, *layout
);
14070 pg_t pg
= (pg_t
)olayout
.ol_pgid
;
14073 o
.pg_to_acting_osds(pg
, &osds
, &primary
);
14078 /* Return the offset of the block, internal to the object */
14080 uint64_t Client::ll_get_internal_offset(Inode
*in
, uint64_t blockno
)
14082 std::scoped_lock
lock(client_lock
);
14083 file_layout_t
*layout
=&(in
->layout
);
14084 uint32_t object_size
= layout
->object_size
;
14085 uint32_t su
= layout
->stripe_unit
;
14086 uint64_t stripes_per_object
= object_size
/ su
;
14088 return (blockno
% stripes_per_object
) * su
;
14091 int Client::ll_opendir(Inode
*in
, int flags
, dir_result_t
** dirpp
,
14092 const UserPerm
& perms
)
14094 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14095 if (!mref_reader
.is_state_satisfied())
14096 return -CEPHFS_ENOTCONN
;
14098 vinodeno_t vino
= _get_vino(in
);
14100 ldout(cct
, 3) << "ll_opendir " << vino
<< dendl
;
14101 tout(cct
) << "ll_opendir" << std::endl
;
14102 tout(cct
) << vino
.ino
.val
<< std::endl
;
14104 std::scoped_lock
lock(client_lock
);
14106 if (!fuse_default_permissions
) {
14107 int r
= may_open(in
, flags
, perms
);
14112 int r
= _opendir(in
, dirpp
, perms
);
14113 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
14115 ldout(cct
, 3) << "ll_opendir " << vino
<< " = " << r
<< " (" << *dirpp
<< ")"
14120 int Client::ll_releasedir(dir_result_t
*dirp
)
14122 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14123 if (!mref_reader
.is_state_satisfied())
14124 return -CEPHFS_ENOTCONN
;
14126 ldout(cct
, 3) << "ll_releasedir " << dirp
<< dendl
;
14127 tout(cct
) << "ll_releasedir" << std::endl
;
14128 tout(cct
) << (uintptr_t)dirp
<< std::endl
;
14130 std::scoped_lock
lock(client_lock
);
14136 int Client::ll_fsyncdir(dir_result_t
*dirp
)
14138 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14139 if (!mref_reader
.is_state_satisfied())
14140 return -CEPHFS_ENOTCONN
;
14142 ldout(cct
, 3) << "ll_fsyncdir " << dirp
<< dendl
;
14143 tout(cct
) << "ll_fsyncdir" << std::endl
;
14144 tout(cct
) << (uintptr_t)dirp
<< std::endl
;
14146 std::scoped_lock
lock(client_lock
);
14147 return _fsync(dirp
->inode
.get(), false);
14150 int Client::ll_open(Inode
*in
, int flags
, Fh
**fhp
, const UserPerm
& perms
)
14152 ceph_assert(!(flags
& O_CREAT
));
14154 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14155 if (!mref_reader
.is_state_satisfied())
14156 return -CEPHFS_ENOTCONN
;
14158 vinodeno_t vino
= _get_vino(in
);
14160 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) << dendl
;
14161 tout(cct
) << "ll_open" << std::endl
;
14162 tout(cct
) << vino
.ino
.val
<< std::endl
;
14163 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
14165 std::scoped_lock
lock(client_lock
);
14168 if (!fuse_default_permissions
) {
14169 r
= may_open(in
, flags
, perms
);
14174 r
= _open(in
, flags
, 0, fhp
/* may be NULL */, perms
);
14177 Fh
*fhptr
= fhp
? *fhp
: NULL
;
14179 ll_unclosed_fh_set
.insert(fhptr
);
14181 tout(cct
) << (uintptr_t)fhptr
<< std::endl
;
14182 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) <<
14183 " = " << r
<< " (" << fhptr
<< ")" << dendl
;
14187 int Client::_ll_create(Inode
*parent
, const char *name
, mode_t mode
,
14188 int flags
, InodeRef
*in
, int caps
, Fh
**fhp
,
14189 const UserPerm
& perms
)
14193 vinodeno_t vparent
= _get_vino(parent
);
14195 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
14196 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << ", uid " << perms
.uid()
14197 << ", gid " << perms
.gid() << dendl
;
14198 tout(cct
) << "ll_create" << std::endl
;
14199 tout(cct
) << vparent
.ino
.val
<< std::endl
;
14200 tout(cct
) << name
<< std::endl
;
14201 tout(cct
) << mode
<< std::endl
;
14202 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
14204 bool created
= false;
14205 int r
= _lookup(parent
, name
, caps
, in
, perms
);
14207 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
14208 return -CEPHFS_EEXIST
;
14210 if (r
== -CEPHFS_ENOENT
&& (flags
& O_CREAT
)) {
14211 if (!fuse_default_permissions
) {
14212 r
= may_create(parent
, perms
);
14216 r
= _create(parent
, name
, flags
, mode
, in
, fhp
, 0, 0, 0, NULL
, &created
,
14227 ldout(cct
, 20) << "_ll_create created = " << created
<< dendl
;
14229 if (!fuse_default_permissions
) {
14230 r
= may_open(in
->get(), flags
, perms
);
14233 int release_r
= _release_fh(*fhp
);
14234 ceph_assert(release_r
== 0); // during create, no async data ops should have happened
14239 if (*fhp
== NULL
) {
14240 r
= _open(in
->get(), flags
, mode
, fhp
, perms
);
14248 ll_unclosed_fh_set
.insert(*fhp
);
14253 Inode
*inode
= in
->get();
14254 if (use_faked_inos())
14255 ino
= inode
->faked_ino
;
14260 tout(cct
) << (uintptr_t)*fhp
<< std::endl
;
14261 tout(cct
) << ino
<< std::endl
;
14262 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
14263 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << " = " << r
<< " (" <<
14264 *fhp
<< " " << hex
<< ino
<< dec
<< ")" << dendl
;
14269 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
14270 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
14271 const UserPerm
& perms
)
14273 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14274 if (!mref_reader
.is_state_satisfied())
14275 return -CEPHFS_ENOTCONN
;
14277 std::scoped_lock
lock(client_lock
);
14280 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
14285 // passing an Inode in outp requires an additional ref
14290 fill_stat(in
, attr
);
14298 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
14299 int oflags
, Inode
**outp
, Fh
**fhp
,
14300 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
14301 const UserPerm
& perms
)
14303 unsigned caps
= statx_to_mask(lflags
, want
);
14304 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14305 if (!mref_reader
.is_state_satisfied())
14306 return -CEPHFS_ENOTCONN
;
14308 std::scoped_lock
lock(client_lock
);
14311 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
14315 // passing an Inode in outp requires an additional ref
14320 fill_statx(in
, caps
, stx
);
14329 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
14331 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14332 if (!mref_reader
.is_state_satisfied())
14333 return -CEPHFS_ENOTCONN
;
14335 tout(cct
) << "ll_lseek" << std::endl
;
14336 tout(cct
) << offset
<< std::endl
;
14337 tout(cct
) << whence
<< std::endl
;
14339 std::scoped_lock
lock(client_lock
);
14340 return _lseek(fh
, offset
, whence
);
14343 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
14345 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14346 if (!mref_reader
.is_state_satisfied())
14347 return -CEPHFS_ENOTCONN
;
14349 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
14350 tout(cct
) << "ll_read" << std::endl
;
14351 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14352 tout(cct
) << off
<< std::endl
;
14353 tout(cct
) << len
<< std::endl
;
14355 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14356 len
= std::min(len
, (loff_t
)INT_MAX
);
14357 std::scoped_lock
lock(client_lock
);
14359 int r
= _read(fh
, off
, len
, bl
);
14360 ldout(cct
, 3) << "ll_read " << fh
<< " " << off
<< "~" << len
<< " = " << r
14365 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
14369 file_layout_t
* layout
)
14371 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14372 if (!mref_reader
.is_state_satisfied())
14373 return -CEPHFS_ENOTCONN
;
14375 vinodeno_t vino
= _get_vino(in
);
14376 object_t oid
= file_object_t(vino
.ino
, blockid
);
14377 C_SaferCond onfinish
;
14380 objecter
->read(oid
,
14381 object_locator_t(layout
->pool_id
),
14386 CEPH_OSD_FLAG_READ
,
14389 int r
= onfinish
.wait();
14391 bl
.begin().copy(bl
.length(), buf
);
14398 /* It appears that the OSD doesn't return success unless the entire
14399 buffer was written, return the write length on success. */
14401 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
14402 char* buf
, uint64_t offset
,
14403 uint64_t length
, file_layout_t
* layout
,
14404 uint64_t snapseq
, uint32_t sync
)
14406 vinodeno_t vino
= ll_get_vino(in
);
14408 std::unique_ptr
<C_SaferCond
> onsafe
= nullptr;
14410 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14411 if (!mref_reader
.is_state_satisfied())
14412 return -CEPHFS_ENOTCONN
;
14415 return -CEPHFS_EINVAL
;
14417 if (true || sync
) {
14418 /* if write is stable, the epilogue is waiting on
14420 onsafe
.reset(new C_SaferCond("Client::ll_write_block flock"));
14422 object_t oid
= file_object_t(vino
.ino
, blockid
);
14423 SnapContext fakesnap
;
14424 ceph::bufferlist bl
;
14426 bl
.push_back(buffer::copy(buf
, length
));
14429 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
14432 fakesnap
.seq
= snapseq
;
14434 /* lock just in time */
14435 objecter
->write(oid
,
14436 object_locator_t(layout
->pool_id
),
14441 ceph::real_clock::now(),
14445 if (nullptr != onsafe
) {
14446 r
= onsafe
->wait();
14456 int Client::ll_commit_blocks(Inode
*in
,
14461 BarrierContext *bctx;
14462 vinodeno_t vino = _get_vino(in);
14463 uint64_t ino = vino.ino;
14465 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14466 << offset << " to " << length << dendl;
14469 return -CEPHFS_EINVAL;
14472 std::scoped_lock lock(client_lock);
14473 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14474 if (p != barriers.end()) {
14475 barrier_interval civ(offset, offset + length);
14476 p->second->commit_barrier(civ);
14482 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
14484 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
14485 "~" << len
<< dendl
;
14486 tout(cct
) << "ll_write" << std::endl
;
14487 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14488 tout(cct
) << off
<< std::endl
;
14489 tout(cct
) << len
<< std::endl
;
14491 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14492 if (!mref_reader
.is_state_satisfied())
14493 return -CEPHFS_ENOTCONN
;
14495 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14496 len
= std::min(len
, (loff_t
)INT_MAX
);
14497 std::scoped_lock
lock(client_lock
);
14499 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
14500 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
14505 int64_t Client::ll_writev(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
14507 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14508 if (!mref_reader
.is_state_satisfied())
14509 return -CEPHFS_ENOTCONN
;
14511 std::scoped_lock
cl(client_lock
);
14512 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, true, false);
14515 int64_t Client::ll_readv(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
14517 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14518 if (!mref_reader
.is_state_satisfied())
14519 return -CEPHFS_ENOTCONN
;
14521 std::scoped_lock
cl(client_lock
);
14522 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, false, false);
14525 int Client::ll_flush(Fh
*fh
)
14527 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14528 if (!mref_reader
.is_state_satisfied())
14529 return -CEPHFS_ENOTCONN
;
14531 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14532 tout(cct
) << "ll_flush" << std::endl
;
14533 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14535 std::scoped_lock
lock(client_lock
);
14539 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
14541 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14542 if (!mref_reader
.is_state_satisfied())
14543 return -CEPHFS_ENOTCONN
;
14545 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14546 tout(cct
) << "ll_fsync" << std::endl
;
14547 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14549 std::scoped_lock
lock(client_lock
);
14550 int r
= _fsync(fh
, syncdataonly
);
14552 // If we're returning an error, clear it from the FH
14553 fh
->take_async_err();
14558 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
14560 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14561 if (!mref_reader
.is_state_satisfied())
14562 return -CEPHFS_ENOTCONN
;
14564 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
14565 tout(cct
) << "ll_sync_inode" << std::endl
;
14566 tout(cct
) << (uintptr_t)in
<< std::endl
;
14568 std::scoped_lock
lock(client_lock
);
14569 return _fsync(in
, syncdataonly
);
14572 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
14574 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
14576 if (offset
< 0 || length
<= 0)
14577 return -CEPHFS_EINVAL
;
14579 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
14580 return -CEPHFS_EOPNOTSUPP
;
14582 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
14583 return -CEPHFS_EOPNOTSUPP
;
14585 Inode
*in
= fh
->inode
.get();
14587 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
14588 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
14589 return -CEPHFS_ENOSPC
;
14592 if (in
->snapid
!= CEPH_NOSNAP
)
14593 return -CEPHFS_EROFS
;
14595 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
14596 return -CEPHFS_EBADF
;
14598 uint64_t size
= offset
+ length
;
14599 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
14601 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
)) {
14602 return -CEPHFS_EDQUOT
;
14606 int r
= get_caps(fh
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
14610 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
14611 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
14612 if (in
->inline_version
< CEPH_INLINE_NONE
&&
14613 (have
& CEPH_CAP_FILE_BUFFER
)) {
14615 auto inline_iter
= in
->inline_data
.cbegin();
14616 int len
= in
->inline_data
.length();
14617 if (offset
< len
) {
14619 inline_iter
.copy(offset
, bl
);
14621 if (offset
+ size
> len
)
14622 size
= len
- offset
;
14624 bl
.append_zero(size
);
14625 if (offset
+ size
< len
) {
14626 inline_iter
+= size
;
14627 inline_iter
.copy(len
- offset
- size
, bl
);
14629 in
->inline_data
= bl
;
14630 in
->inline_version
++;
14632 in
->mtime
= in
->ctime
= ceph_clock_now();
14634 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14636 if (in
->inline_version
< CEPH_INLINE_NONE
) {
14637 onuninline
.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14638 uninline_data(in
, onuninline
.get());
14641 C_SaferCond
onfinish("Client::_punch_hole flock");
14643 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
14645 _invalidate_inode_cache(in
, offset
, length
);
14646 filer
->zero(in
->ino
, &in
->layout
,
14647 in
->snaprealm
->get_snap_context(),
14649 ceph::real_clock::now(),
14650 0, true, &onfinish
);
14651 in
->mtime
= in
->ctime
= ceph_clock_now();
14653 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14655 client_lock
.unlock();
14657 client_lock
.lock();
14658 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
14660 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
14661 uint64_t size
= offset
+ length
;
14662 if (size
> in
->size
) {
14664 in
->mtime
= in
->ctime
= ceph_clock_now();
14666 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14668 if (is_quota_bytes_approaching(in
, fh
->actor_perms
)) {
14669 check_caps(in
, CHECK_CAPS_NODELAY
);
14670 } else if (is_max_size_approaching(in
)) {
14676 if (nullptr != onuninline
) {
14677 client_lock
.unlock();
14678 int ret
= onuninline
->wait();
14679 client_lock
.lock();
14681 if (ret
>= 0 || ret
== -CEPHFS_ECANCELED
) {
14682 in
->inline_data
.clear();
14683 in
->inline_version
= CEPH_INLINE_NONE
;
14684 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14690 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
14694 int Client::ll_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
14696 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14697 if (!mref_reader
.is_state_satisfied())
14698 return -CEPHFS_ENOTCONN
;
14700 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14701 tout(cct
) << __func__
<< " " << mode
<< " " << offset
<< " " << length
<< std::endl
;
14702 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14704 std::scoped_lock
lock(client_lock
);
14705 return _fallocate(fh
, mode
, offset
, length
);
14708 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
14710 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14711 if (!mref_reader
.is_state_satisfied())
14712 return -CEPHFS_ENOTCONN
;
14714 tout(cct
) << __func__
<< " " << " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
14716 std::scoped_lock
lock(client_lock
);
14717 Fh
*fh
= get_filehandle(fd
);
14719 return -CEPHFS_EBADF
;
14720 #if defined(__linux__) && defined(O_PATH)
14721 if (fh
->flags
& O_PATH
)
14722 return -CEPHFS_EBADF
;
14724 return _fallocate(fh
, mode
, offset
, length
);
14727 int Client::ll_release(Fh
*fh
)
14729 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14730 if (!mref_reader
.is_state_satisfied())
14731 return -CEPHFS_ENOTCONN
;
14733 ldout(cct
, 3) << __func__
<< " (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
14735 tout(cct
) << __func__
<< " (fh)" << std::endl
;
14736 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14738 std::scoped_lock
lock(client_lock
);
14740 if (ll_unclosed_fh_set
.count(fh
))
14741 ll_unclosed_fh_set
.erase(fh
);
14742 return _release_fh(fh
);
14745 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
14747 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14748 if (!mref_reader
.is_state_satisfied())
14749 return -CEPHFS_ENOTCONN
;
14751 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
14752 tout(cct
) << "ll_getk (fh)" << (uintptr_t)fh
<< std::endl
;
14754 std::scoped_lock
lock(client_lock
);
14755 return _getlk(fh
, fl
, owner
);
14758 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
14760 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14761 if (!mref_reader
.is_state_satisfied())
14762 return -CEPHFS_ENOTCONN
;
14764 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
14765 tout(cct
) << __func__
<< " (fh)" << (uintptr_t)fh
<< std::endl
;
14767 std::scoped_lock
lock(client_lock
);
14768 return _setlk(fh
, fl
, owner
, sleep
);
14771 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
14773 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14774 if (!mref_reader
.is_state_satisfied())
14775 return -CEPHFS_ENOTCONN
;
14777 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
14778 tout(cct
) << __func__
<< " (fh)" << (uintptr_t)fh
<< std::endl
;
14780 std::scoped_lock
lock(client_lock
);
14781 return _flock(fh
, cmd
, owner
);
14784 int Client::set_deleg_timeout(uint32_t timeout
)
14786 std::scoped_lock
lock(client_lock
);
14789 * The whole point is to prevent blocklisting so we must time out the
14790 * delegation before the session autoclose timeout kicks in.
14792 if (timeout
>= mdsmap
->get_session_autoclose())
14793 return -CEPHFS_EINVAL
;
14795 deleg_timeout
= timeout
;
14799 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
14801 int ret
= -CEPHFS_EINVAL
;
14803 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14804 if (!mref_reader
.is_state_satisfied())
14805 return -CEPHFS_ENOTCONN
;
14807 std::scoped_lock
lock(client_lock
);
14809 Inode
*inode
= fh
->inode
.get();
14812 case CEPH_DELEGATION_NONE
:
14813 inode
->unset_deleg(fh
);
14818 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
14819 } catch (std::bad_alloc
&) {
14820 ret
= -CEPHFS_ENOMEM
;
14827 class C_Client_RequestInterrupt
: public Context
{
14832 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
14835 void finish(int r
) override
{
14836 std::scoped_lock
l(client
->client_lock
);
14837 ceph_assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
14838 client
->_interrupt_filelock(req
);
14839 client
->put_request(req
);
14843 void Client::ll_interrupt(void *d
)
14845 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
14846 ldout(cct
, 3) << __func__
<< " tid " << req
->get_tid() << dendl
;
14847 tout(cct
) << __func__
<< " tid " << req
->get_tid() << std::endl
;
14848 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
14851 // =========================================
14854 // expose file layouts
14856 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
14857 const UserPerm
& perms
)
14859 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14860 if (!mref_reader
.is_state_satisfied())
14861 return -CEPHFS_ENOTCONN
;
14863 std::scoped_lock
lock(client_lock
);
14865 filepath
path(relpath
);
14867 int r
= path_walk(path
, &in
, perms
);
14873 ldout(cct
, 3) << __func__
<< "(" << relpath
<< ") = 0" << dendl
;
14877 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
14879 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14880 if (!mref_reader
.is_state_satisfied())
14881 return -CEPHFS_ENOTCONN
;
14883 std::scoped_lock
lock(client_lock
);
14885 Fh
*f
= get_filehandle(fd
);
14887 return -CEPHFS_EBADF
;
14888 Inode
*in
= f
->inode
.get();
14892 ldout(cct
, 3) << __func__
<< "(" << fd
<< ") = 0" << dendl
;
14896 int64_t Client::get_default_pool_id()
14898 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14899 if (!mref_reader
.is_state_satisfied())
14900 return -CEPHFS_ENOTCONN
;
14902 std::scoped_lock
lock(client_lock
);
14904 /* first data pool is the default */
14905 return mdsmap
->get_first_data_pool();
14910 int64_t Client::get_pool_id(const char *pool_name
)
14912 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14913 if (!mref_reader
.is_state_satisfied())
14914 return -CEPHFS_ENOTCONN
;
14916 std::scoped_lock
lock(client_lock
);
14918 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
14922 string
Client::get_pool_name(int64_t pool
)
14924 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14925 if (!mref_reader
.is_state_satisfied())
14928 std::scoped_lock
lock(client_lock
);
14930 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
14931 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
14935 int Client::get_pool_replication(int64_t pool
)
14937 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14938 if (!mref_reader
.is_state_satisfied())
14939 return -CEPHFS_ENOTCONN
;
14941 std::scoped_lock
lock(client_lock
);
14943 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
14944 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -CEPHFS_ENOENT
;
14948 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
14950 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14951 if (!mref_reader
.is_state_satisfied())
14952 return -CEPHFS_ENOTCONN
;
14954 std::scoped_lock
lock(client_lock
);
14956 Fh
*f
= get_filehandle(fd
);
14958 return -CEPHFS_EBADF
;
14959 Inode
*in
= f
->inode
.get();
14961 vector
<ObjectExtent
> extents
;
14962 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
14963 ceph_assert(extents
.size() == 1);
14965 objecter
->with_osdmap([&](const OSDMap
& o
) {
14966 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
14967 o
.pg_to_acting_osds(pg
, osds
);
14971 return -CEPHFS_EINVAL
;
14974 * Return the remainder of the extent (stripe unit)
14976 * If length = 1 is passed to Striper::file_to_extents we get a single
14977 * extent back, but its length is one so we still need to compute the length
14978 * to the end of the stripe unit.
14980 * If length = su then we may get 1 or 2 objects back in the extents vector
14981 * which would have to be examined. Even then, the offsets are local to the
14982 * object, so matching up to the file offset is extra work.
14984 * It seems simpler to stick with length = 1 and manually compute the
14988 uint64_t su
= in
->layout
.stripe_unit
;
14989 *len
= su
- (off
% su
);
14995 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
14997 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14998 if (!mref_reader
.is_state_satisfied())
14999 return -CEPHFS_ENOTCONN
;
15001 std::scoped_lock
lock(client_lock
);
15004 return -CEPHFS_EINVAL
;
15005 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15006 return o
.crush
->get_full_location_ordered(id
, path
);
15010 int Client::get_file_stripe_address(int fd
, loff_t offset
,
15011 vector
<entity_addr_t
>& address
)
15013 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15014 if (!mref_reader
.is_state_satisfied())
15015 return -CEPHFS_ENOTCONN
;
15017 std::scoped_lock
lock(client_lock
);
15019 Fh
*f
= get_filehandle(fd
);
15021 return -CEPHFS_EBADF
;
15022 Inode
*in
= f
->inode
.get();
15025 vector
<ObjectExtent
> extents
;
15026 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
15027 in
->truncate_size
, extents
);
15028 ceph_assert(extents
.size() == 1);
15030 // now we have the object and its 'layout'
15031 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15032 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
15034 o
.pg_to_acting_osds(pg
, osds
);
15036 return -CEPHFS_EINVAL
;
15037 for (unsigned i
= 0; i
< osds
.size(); i
++) {
15038 entity_addr_t addr
= o
.get_addrs(osds
[i
]).front();
15039 address
.push_back(addr
);
15045 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
15047 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15048 if (!mref_reader
.is_state_satisfied())
15049 return -CEPHFS_ENOTCONN
;
15051 std::scoped_lock
lock(client_lock
);
15053 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15054 if (!o
.exists(osd
))
15055 return -CEPHFS_ENOENT
;
15057 addr
= o
.get_addrs(osd
).front();
15062 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
15063 loff_t length
, loff_t offset
)
15065 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15066 if (!mref_reader
.is_state_satisfied())
15067 return -CEPHFS_ENOTCONN
;
15069 std::scoped_lock
lock(client_lock
);
15071 Fh
*f
= get_filehandle(fd
);
15073 return -CEPHFS_EBADF
;
15074 Inode
*in
= f
->inode
.get();
15076 // map to a list of extents
15077 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
15079 ldout(cct
, 3) << __func__
<< "(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
15084 /* find an osd with the same ip. -CEPHFS_ENXIO if none. */
15085 int Client::get_local_osd()
15087 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15088 if (!mref_reader
.is_state_satisfied())
15089 return -CEPHFS_ENOTCONN
;
15091 std::scoped_lock
lock(client_lock
);
15093 objecter
->with_osdmap([this](const OSDMap
& o
) {
15094 if (o
.get_epoch() != local_osd_epoch
) {
15095 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddrs().front());
15096 local_osd_epoch
= o
.get_epoch();
15107 // ===============================
15109 void Client::ms_handle_connect(Connection
*con
)
15111 ldout(cct
, 10) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15114 bool Client::ms_handle_reset(Connection
*con
)
15116 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15120 void Client::ms_handle_remote_reset(Connection
*con
)
15122 std::scoped_lock
lock(client_lock
);
15123 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15124 switch (con
->get_peer_type()) {
15125 case CEPH_ENTITY_TYPE_MDS
:
15127 // kludge to figure out which mds this is; fixme with a Connection* state
15128 mds_rank_t mds
= MDS_RANK_NONE
;
15129 MetaSessionRef s
= NULL
;
15130 for (auto &p
: mds_sessions
) {
15131 if (mdsmap
->have_inst(p
.first
) && mdsmap
->get_addrs(p
.first
) == con
->get_peer_addrs()) {
15137 ceph_assert(s
!= NULL
);
15138 switch (s
->state
) {
15139 case MetaSession::STATE_CLOSING
:
15140 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
15141 _closed_mds_session(s
.get());
15144 case MetaSession::STATE_OPENING
:
15146 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
15147 list
<Context
*> waiters
;
15148 waiters
.swap(s
->waiting_for_open
);
15149 _closed_mds_session(s
.get());
15150 auto news
= _get_or_open_mds_session(mds
);
15151 news
->waiting_for_open
.swap(waiters
);
15155 case MetaSession::STATE_OPEN
:
15157 objecter
->maybe_request_map(); /* to check if we are blocklisted */
15158 if (cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
15159 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
15160 _closed_mds_session(s
.get());
15162 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
15163 s
->state
= MetaSession::STATE_STALE
;
15168 case MetaSession::STATE_NEW
:
15169 case MetaSession::STATE_CLOSED
:
15179 bool Client::ms_handle_refused(Connection
*con
)
15181 ldout(cct
, 1) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15185 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
15187 Inode
*quota_in
= root_ancestor
;
15188 SnapRealm
*realm
= in
->snaprealm
;
15190 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
15191 if (realm
->ino
!= in
->ino
) {
15192 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
15193 if (p
== inode_map
.end())
15196 if (p
->second
->quota
.is_enable()) {
15197 quota_in
= p
->second
;
15201 realm
= realm
->pparent
;
15203 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
15208 * Traverse quota ancestors of the Inode, return true
15209 * if any of them passes the passed function
15211 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
15212 std::function
<bool (const Inode
&in
)> test
)
15215 ceph_assert(in
!= NULL
);
15220 if (in
== root_ancestor
) {
15221 // We're done traversing, drop out
15224 // Continue up the tree
15225 in
= get_quota_root(in
, perms
);
15232 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
15234 return check_quota_condition(in
, perms
,
15235 [](const Inode
&in
) {
15236 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
15240 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
15241 const UserPerm
& perms
)
15243 return check_quota_condition(in
, perms
,
15244 [&new_bytes
](const Inode
&in
) {
15245 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
15246 > in
.quota
.max_bytes
;
15250 bool Client::is_quota_bytes_approaching(Inode
*in
, const UserPerm
& perms
)
15252 ceph_assert(in
->size
>= in
->reported_size
);
15253 const uint64_t size
= in
->size
- in
->reported_size
;
15254 return check_quota_condition(in
, perms
,
15255 [&size
](const Inode
&in
) {
15256 if (in
.quota
.max_bytes
) {
15257 if (in
.rstat
.rbytes
>= in
.quota
.max_bytes
) {
15261 const uint64_t space
= in
.quota
.max_bytes
- in
.rstat
.rbytes
;
15262 return (space
>> 4) < size
;
15276 int Client::check_pool_perm(Inode
*in
, int need
)
15278 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
15280 if (!cct
->_conf
->client_check_pool_perm
)
15283 /* Only need to do this for regular files */
15284 if (!in
->is_file())
15287 int64_t pool_id
= in
->layout
.pool_id
;
15288 std::string pool_ns
= in
->layout
.pool_ns
;
15289 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
15292 auto it
= pool_perms
.find(perm_key
);
15293 if (it
== pool_perms
.end())
15295 if (it
->second
== POOL_CHECKING
) {
15296 // avoid concurrent checkings
15297 wait_on_list(waiting_for_pool_perm
);
15300 ceph_assert(have
& POOL_CHECKED
);
15306 if (in
->snapid
!= CEPH_NOSNAP
) {
15307 // pool permission check needs to write to the first object. But for snapshot,
15308 // head of the first object may have already been deleted. To avoid creating
15309 // orphan object, skip the check for now.
15313 pool_perms
[perm_key
] = POOL_CHECKING
;
15316 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
15317 object_t oid
= oid_buf
;
15319 SnapContext nullsnapc
;
15321 C_SaferCond rd_cond
;
15322 ObjectOperation rd_op
;
15323 rd_op
.stat(nullptr, nullptr, nullptr);
15325 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
15326 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
15328 C_SaferCond wr_cond
;
15329 ObjectOperation wr_op
;
15330 wr_op
.create(true);
15332 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
15333 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
15335 client_lock
.unlock();
15336 int rd_ret
= rd_cond
.wait();
15337 int wr_ret
= wr_cond
.wait();
15338 client_lock
.lock();
15340 bool errored
= false;
15342 if (rd_ret
== 0 || rd_ret
== -CEPHFS_ENOENT
)
15344 else if (rd_ret
!= -CEPHFS_EPERM
) {
15345 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15346 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
15350 if (wr_ret
== 0 || wr_ret
== -CEPHFS_EEXIST
)
15351 have
|= POOL_WRITE
;
15352 else if (wr_ret
!= -CEPHFS_EPERM
) {
15353 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15354 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
15359 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
15360 // Raise EIO because actual error code might be misleading for
15361 // userspace filesystem user.
15362 pool_perms
.erase(perm_key
);
15363 signal_cond_list(waiting_for_pool_perm
);
15364 return -CEPHFS_EIO
;
15367 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
15368 signal_cond_list(waiting_for_pool_perm
);
15371 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
15372 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15373 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
15374 return -CEPHFS_EPERM
;
15376 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
15377 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15378 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
15379 return -CEPHFS_EPERM
;
15385 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
15387 if (acl_type
== POSIX_ACL
) {
15388 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
15389 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
15391 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
15394 return -CEPHFS_EAGAIN
;
15397 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
15399 if (acl_type
== NO_ACL
)
15402 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
15406 if (acl_type
== POSIX_ACL
) {
15407 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
15408 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
15409 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
15410 r
= posix_acl_access_chmod(acl
, mode
);
15413 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
15419 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
15423 int Client::_posix_acl_create(Inode
*dir
, mode_t
*mode
, bufferlist
& xattrs_bl
,
15424 const UserPerm
& perms
)
15426 if (acl_type
== NO_ACL
)
15429 if (S_ISLNK(*mode
))
15432 int r
= _getattr(dir
, CEPH_STAT_CAP_XATTR
, perms
, dir
->xattr_version
== 0);
15436 if (acl_type
== POSIX_ACL
) {
15437 if (dir
->xattrs
.count(ACL_EA_DEFAULT
)) {
15438 map
<string
, bufferptr
> xattrs
;
15440 const bufferptr
& default_acl
= dir
->xattrs
[ACL_EA_DEFAULT
];
15441 bufferptr
acl(default_acl
.c_str(), default_acl
.length());
15442 r
= posix_acl_inherit_mode(acl
, mode
);
15447 r
= posix_acl_equiv_mode(acl
.c_str(), acl
.length(), mode
);
15451 xattrs
[ACL_EA_ACCESS
] = acl
;
15454 if (S_ISDIR(*mode
))
15455 xattrs
[ACL_EA_DEFAULT
] = dir
->xattrs
[ACL_EA_DEFAULT
];
15459 encode(xattrs
, xattrs_bl
);
15462 *mode
&= ~umask_cb(callback_handle
);
15467 ldout(cct
, 10) << __func__
<< " dir ino " << dir
->ino
<< " result=" << r
<< dendl
;
15471 void Client::set_filer_flags(int flags
)
15473 std::scoped_lock
l(client_lock
);
15474 ceph_assert(flags
== 0 ||
15475 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
15476 objecter
->add_global_op_flags(flags
);
15479 void Client::clear_filer_flags(int flags
)
15481 std::scoped_lock
l(client_lock
);
15482 ceph_assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
15483 objecter
->clear_global_op_flag(flags
);
15486 // called before mount
15487 void Client::set_uuid(const std::string
& uuid
)
15489 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
15490 ceph_assert(iref_reader
.is_state_satisfied());
15492 std::scoped_lock
l(client_lock
);
15493 ceph_assert(!uuid
.empty());
15495 metadata
["uuid"] = uuid
;
15499 // called before mount. 0 means infinite
15500 void Client::set_session_timeout(unsigned timeout
)
15502 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
15503 ceph_assert(iref_reader
.is_state_satisfied());
15505 std::scoped_lock
l(client_lock
);
15507 metadata
["timeout"] = stringify(timeout
);
15510 // called before mount
15511 int Client::start_reclaim(const std::string
& uuid
, unsigned flags
,
15512 const std::string
& fs_name
)
15514 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
15515 if (!iref_reader
.is_state_satisfied())
15516 return -CEPHFS_ENOTCONN
;
15519 return -CEPHFS_EINVAL
;
15521 std::unique_lock
l(client_lock
);
15523 auto it
= metadata
.find("uuid");
15524 if (it
!= metadata
.end() && it
->second
== uuid
)
15525 return -CEPHFS_EINVAL
;
15528 int r
= subscribe_mdsmap(fs_name
);
15530 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
15534 if (metadata
.empty())
15535 populate_metadata("");
15537 while (mdsmap
->get_epoch() == 0)
15538 wait_on_list(waiting_for_mdsmap
);
15541 for (unsigned mds
= 0; mds
< mdsmap
->get_num_in_mds(); ) {
15542 if (!mdsmap
->is_up(mds
)) {
15543 ldout(cct
, 10) << "mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
15544 wait_on_list(waiting_for_mdsmap
);
15548 MetaSessionRef session
;
15549 if (!have_open_session(mds
)) {
15550 session
= _get_or_open_mds_session(mds
);
15551 if (session
->state
== MetaSession::STATE_REJECTED
)
15552 return -CEPHFS_EPERM
;
15553 if (session
->state
!= MetaSession::STATE_OPENING
) {
15555 return -CEPHFS_EINVAL
;
15557 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
15558 wait_on_context_list(session
->waiting_for_open
);
15562 session
= mds_sessions
.at(mds
);
15563 if (!session
->mds_features
.test(CEPHFS_FEATURE_RECLAIM_CLIENT
))
15564 return -CEPHFS_EOPNOTSUPP
;
15566 if (session
->reclaim_state
== MetaSession::RECLAIM_NULL
||
15567 session
->reclaim_state
== MetaSession::RECLAIMING
) {
15568 session
->reclaim_state
= MetaSession::RECLAIMING
;
15569 auto m
= make_message
<MClientReclaim
>(uuid
, flags
);
15570 session
->con
->send_message2(std::move(m
));
15571 wait_on_list(waiting_for_reclaim
);
15572 } else if (session
->reclaim_state
== MetaSession::RECLAIM_FAIL
) {
15573 return reclaim_errno
? : -CEPHFS_ENOTRECOVERABLE
;
15579 // didn't find target session in any mds
15580 if (reclaim_target_addrs
.empty()) {
15581 if (flags
& CEPH_RECLAIM_RESET
)
15582 return -CEPHFS_ENOENT
;
15583 return -CEPHFS_ENOTRECOVERABLE
;
15586 if (flags
& CEPH_RECLAIM_RESET
)
15589 // use blocklist to check if target session was killed
15590 // (config option mds_session_blocklist_on_evict needs to be true)
15591 ldout(cct
, 10) << __func__
<< ": waiting for OSD epoch " << reclaim_osd_epoch
<< dendl
;
15594 objecter
->wait_for_map(reclaim_osd_epoch
, ca::use_blocked
[ec
]);
15598 return ceph::from_error_code(ec
);
15600 bool blocklisted
= objecter
->with_osdmap(
15601 [this](const OSDMap
&osd_map
) -> bool {
15602 return osd_map
.is_blocklisted(reclaim_target_addrs
);
15605 return -CEPHFS_ENOTRECOVERABLE
;
15607 metadata
["reclaiming_uuid"] = uuid
;
15611 void Client::finish_reclaim()
15613 auto it
= metadata
.find("reclaiming_uuid");
15614 if (it
== metadata
.end()) {
15615 for (auto &p
: mds_sessions
)
15616 p
.second
->reclaim_state
= MetaSession::RECLAIM_NULL
;
15620 for (auto &p
: mds_sessions
) {
15621 p
.second
->reclaim_state
= MetaSession::RECLAIM_NULL
;
15622 auto m
= make_message
<MClientReclaim
>("", MClientReclaim::FLAG_FINISH
);
15623 p
.second
->con
->send_message2(std::move(m
));
15626 metadata
["uuid"] = it
->second
;
15627 metadata
.erase(it
);
15630 void Client::handle_client_reclaim_reply(const MConstRef
<MClientReclaimReply
>& reply
)
15632 mds_rank_t from
= mds_rank_t(reply
->get_source().num());
15633 ldout(cct
, 10) << __func__
<< " " << *reply
<< " from mds." << from
<< dendl
;
15635 std::scoped_lock
cl(client_lock
);
15636 auto session
= _get_mds_session(from
, reply
->get_connection().get());
15638 ldout(cct
, 10) << " discarding reclaim reply from sessionless mds." << from
<< dendl
;
15642 if (reply
->get_result() >= 0) {
15643 session
->reclaim_state
= MetaSession::RECLAIM_OK
;
15644 if (reply
->get_epoch() > reclaim_osd_epoch
)
15645 reclaim_osd_epoch
= reply
->get_epoch();
15646 if (!reply
->get_addrs().empty())
15647 reclaim_target_addrs
= reply
->get_addrs();
15649 session
->reclaim_state
= MetaSession::RECLAIM_FAIL
;
15650 reclaim_errno
= reply
->get_result();
15653 signal_cond_list(waiting_for_reclaim
);
15657 * This is included in cap release messages, to cause
15658 * the MDS to wait until this OSD map epoch. It is necessary
15659 * in corner cases where we cancel RADOS ops, so that
15660 * nobody else tries to do IO to the same objects in
15661 * the same epoch as the cancelled ops.
15663 void Client::set_cap_epoch_barrier(epoch_t e
)
15665 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
15666 cap_epoch_barrier
= e
;
15669 const char** Client::get_tracked_conf_keys() const
15671 static const char* keys
[] = {
15672 "client_cache_size",
15673 "client_cache_mid",
15675 "client_deleg_timeout",
15676 "client_deleg_break_on_open",
15678 "client_oc_max_objects",
15679 "client_oc_max_dirty",
15680 "client_oc_target_dirty",
15681 "client_oc_max_dirty_age",
15687 void Client::handle_conf_change(const ConfigProxy
& conf
,
15688 const std::set
<std::string
> &changed
)
15690 std::scoped_lock
lock(client_lock
);
15692 if (changed
.count("client_cache_mid")) {
15693 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
15695 if (changed
.count("client_acl_type")) {
15697 if (cct
->_conf
->client_acl_type
== "posix_acl")
15698 acl_type
= POSIX_ACL
;
15700 if (changed
.count("client_oc_size")) {
15701 objectcacher
->set_max_size(cct
->_conf
->client_oc_size
);
15703 if (changed
.count("client_oc_max_objects")) {
15704 objectcacher
->set_max_objects(cct
->_conf
->client_oc_max_objects
);
15706 if (changed
.count("client_oc_max_dirty")) {
15707 objectcacher
->set_max_dirty(cct
->_conf
->client_oc_max_dirty
);
15709 if (changed
.count("client_oc_target_dirty")) {
15710 objectcacher
->set_target_dirty(cct
->_conf
->client_oc_target_dirty
);
15712 if (changed
.count("client_oc_max_dirty_age")) {
15713 objectcacher
->set_max_dirty_age(cct
->_conf
->client_oc_max_dirty_age
);
15717 void intrusive_ptr_add_ref(Inode
*in
)
15722 void intrusive_ptr_release(Inode
*in
)
15724 in
->client
->put_inode(in
);
15727 mds_rank_t
Client::_get_random_up_mds() const
15729 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
15731 std::set
<mds_rank_t
> up
;
15732 mdsmap
->get_up_mds_set(up
);
15735 return MDS_RANK_NONE
;
15736 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
15737 for (int n
= rand() % up
.size(); n
; n
--)
15743 StandaloneClient::StandaloneClient(Messenger
*m
, MonClient
*mc
,
15744 boost::asio::io_context
& ictx
)
15745 : Client(m
, mc
, new Objecter(m
->cct
, m
, mc
, ictx
))
15747 monclient
->set_messenger(m
);
15748 objecter
->set_client_incarnation(0);
15751 StandaloneClient::~StandaloneClient()
15754 objecter
= nullptr;
15757 int StandaloneClient::init()
15759 RWRef_t
iref_writer(initialize_state
, CLIENT_INITIALIZING
, false);
15760 ceph_assert(iref_writer
.is_first_writer());
15765 client_lock
.lock();
15767 messenger
->add_dispatcher_tail(objecter
);
15768 messenger
->add_dispatcher_tail(this);
15770 monclient
->set_want_keys(CEPH_ENTITY_TYPE_MDS
| CEPH_ENTITY_TYPE_OSD
);
15771 int r
= monclient
->init();
15773 // need to do cleanup because we're in an intermediate init state
15775 std::scoped_lock
l(timer_lock
);
15779 client_lock
.unlock();
15780 objecter
->shutdown();
15781 objectcacher
->stop();
15782 monclient
->shutdown();
15787 client_lock
.unlock();
15789 iref_writer
.update_state(CLIENT_INITIALIZED
);
15794 void StandaloneClient::shutdown()
15796 Client::shutdown();
15797 objecter
->shutdown();
15798 monclient
->shutdown();