1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
18 #include <sys/types.h>
23 #include <sys/param.h>
27 #include <sys/utsname.h>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/fusion/include/std_pair.hpp>
34 #include "common/async/waiter.h"
36 #if defined(__FreeBSD__) || defined(_WIN32)
37 #define XATTR_CREATE 0x1
38 #define XATTR_REPLACE 0x2
40 #include <sys/xattr.h>
43 #if defined(__linux__)
44 #include <linux/falloc.h>
47 #include <sys/statvfs.h>
49 #include "common/config.h"
50 #include "common/version.h"
51 #include "common/async/blocked_completion.h"
53 #include "mon/MonClient.h"
55 #include "messages/MClientCaps.h"
56 #include "messages/MClientLease.h"
57 #include "messages/MClientQuota.h"
58 #include "messages/MClientReclaim.h"
59 #include "messages/MClientReclaimReply.h"
60 #include "messages/MClientReconnect.h"
61 #include "messages/MClientReply.h"
62 #include "messages/MClientRequest.h"
63 #include "messages/MClientRequestForward.h"
64 #include "messages/MClientSession.h"
65 #include "messages/MClientSnap.h"
66 #include "messages/MClientMetrics.h"
67 #include "messages/MCommandReply.h"
68 #include "messages/MFSMap.h"
69 #include "messages/MFSMapUser.h"
70 #include "messages/MMDSMap.h"
71 #include "messages/MOSDMap.h"
73 #include "mds/flock.h"
74 #include "mds/cephfs_features.h"
75 #include "osd/OSDMap.h"
76 #include "osdc/Filer.h"
78 #include "common/Cond.h"
79 #include "common/perf_counters.h"
80 #include "common/admin_socket.h"
81 #include "common/errno.h"
82 #include "include/str_list.h"
84 #define dout_subsys ceph_subsys_client
86 #include "include/lru.h"
87 #include "include/compat.h"
88 #include "include/stringify.h"
89 #include "include/random.h"
94 #include "Delegation.h"
96 #include "ClientSnapRealm.h"
98 #include "MetaSession.h"
99 #include "MetaRequest.h"
100 #include "ObjecterWriteback.h"
101 #include "posix_acl.h"
103 #include "include/ceph_assert.h"
104 #include "include/stat.h"
106 #include "include/cephfs/ceph_ll_client.h"
108 #if HAVE_GETGROUPLIST
115 #define dout_prefix *_dout << "client." << whoami << " "
117 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
119 // FreeBSD fails to define this
123 // Darwin fails to define this
132 // Windows doesn't define those values. While the Posix compatibilty layer
133 // doesn't support those values, the Windows native functions do provide
134 // similar flags. Special care should be taken if we're going to use those
135 // flags in ceph-dokan. The current values are no-ops, while propagating
136 // them to the rest of the code might cause the Windows functions to reject
139 #define O_NOFOLLOW 0x0
146 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
149 #define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
160 using namespace TOPNSPC::common
;
162 namespace bs
= boost::system
;
163 namespace ca
= ceph::async
;
165 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
167 Client
*client
= static_cast<Client
*>(p
);
168 client
->flush_set_callback(oset
);
171 bool Client::is_reserved_vino(vinodeno_t
&vino
) {
172 if (MDS_IS_PRIVATE_INO(vino
.ino
)) {
173 ldout(cct
, -1) << __func__
<< " attempt to access reserved inode number " << vino
<< dendl
;
// running average and standard deviation -- presented in
// Donald Knuth's TAoCP, Volume II.
//
// Fold `value` (the `count`-th sample) into the running mean `old_avg`
// and return the updated mean. The first sample *is* the mean.
double calc_average(double old_avg, double value, uint64_t count) {
  double new_avg;
  if (count == 1) {
    new_avg = value;
  } else {
    // avg(n) = avg(n-1) + (x(n) - avg(n-1)) / n
    new_avg = old_avg + ((value - old_avg) / count);
  }
  return new_avg;
}
// Update the running sum of squared deviations (Knuth/Welford recurrence):
//   S(n) = S(n-1) + (x - mean(n-1)) * (x - mean(n))
// Variance is then S(n) / (n - 1). The first sample contributes zero.
double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean,
                   double value, uint64_t count) {
  double new_sq_sum;
  if (count == 1) {
    new_sq_sum = 0.0;
  } else {
    new_sq_sum = old_sq_sum + (value - old_mean)*(value - new_mean);
  }
  return new_sq_sum;
}
206 Client::CommandHook::CommandHook(Client
*client
) :
211 int Client::CommandHook::call(
212 std::string_view command
,
213 const cmdmap_t
& cmdmap
,
219 f
->open_object_section("result");
221 std::scoped_lock l
{m_client
->client_lock
};
222 if (command
== "mds_requests")
223 m_client
->dump_mds_requests(f
);
224 else if (command
== "mds_sessions") {
225 bool cap_dump
= false;
226 cmd_getval(cmdmap
, "cap_dump", cap_dump
);
227 m_client
->dump_mds_sessions(f
, cap_dump
);
228 } else if (command
== "dump_cache")
229 m_client
->dump_cache(f
);
230 else if (command
== "kick_stale_sessions")
231 m_client
->_kick_stale_sessions();
232 else if (command
== "status")
233 m_client
->dump_status(f
);
235 ceph_abort_msg("bad command registered");
244 int Client::get_fd_inode(int fd
, InodeRef
*in
) {
246 if (fd
== CEPHFS_AT_FDCWD
) {
249 Fh
*f
= get_filehandle(fd
);
259 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
260 : inode(in
), offset(0), next_offset(2),
261 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
265 void Client::_reset_faked_inos()
268 free_faked_inos
.clear();
269 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
270 last_used_faked_ino
= 0;
271 last_used_faked_root
= 0;
273 // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
274 // Windows structures, including Dokan ones, are using 64B identifiers.
275 _use_faked_inos
= false;
277 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
281 void Client::_assign_faked_ino(Inode
*in
)
283 if (0 == last_used_faked_ino
)
284 last_used_faked_ino
= last_used_faked_ino
+ 2048; // start(1024)~2048 reserved for _assign_faked_root
285 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
286 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
287 last_used_faked_ino
= 2048;
288 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
290 ceph_assert(it
!= free_faked_inos
.end());
291 if (last_used_faked_ino
< it
.get_start()) {
292 ceph_assert(it
.get_len() > 0);
293 last_used_faked_ino
= it
.get_start();
295 ++last_used_faked_ino
;
296 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
298 in
->faked_ino
= last_used_faked_ino
;
299 free_faked_inos
.erase(in
->faked_ino
);
300 faked_ino_map
[in
->faked_ino
] = in
->vino();
304 * In the faked mode, if you export multiple subdirectories,
305 * you will see that the inode numbers of the exported subdirectories
306 * are the same. so we distinguish the mount point by reserving
307 * the "fake ids" between "1024~2048" and combining the last
308 * 10bits(0x3ff) of the "root inodes".
310 void Client::_assign_faked_root(Inode
*in
)
312 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
313 if (it
== free_faked_inos
.end() && last_used_faked_root
> 0) {
314 last_used_faked_root
= 0;
315 it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
317 ceph_assert(it
!= free_faked_inos
.end());
318 vinodeno_t inode_info
= in
->vino();
319 uint64_t inode_num
= (uint64_t)inode_info
.ino
;
320 ldout(cct
, 10) << "inode_num " << inode_num
<< "inode_num & 0x3ff=" << (inode_num
& 0x3ff)<< dendl
;
321 last_used_faked_root
= it
.get_start() + (inode_num
& 0x3ff); // 0x3ff mask and get_start will not exceed 2048
322 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_root
);
324 in
->faked_ino
= last_used_faked_root
;
325 free_faked_inos
.erase(in
->faked_ino
);
326 faked_ino_map
[in
->faked_ino
] = in
->vino();
329 void Client::_release_faked_ino(Inode
*in
)
331 free_faked_inos
.insert(in
->faked_ino
);
332 faked_ino_map
.erase(in
->faked_ino
);
335 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
340 else if (faked_ino_map
.count(ino
))
341 vino
= faked_ino_map
[ino
];
343 vino
= vinodeno_t(0, CEPH_NOSNAP
);
344 ldout(cct
, 10) << __func__
<< " " << ino
<< " -> " << vino
<< dendl
;
348 vinodeno_t
Client::map_faked_ino(ino_t ino
)
350 std::scoped_lock
lock(client_lock
);
351 return _map_faked_ino(ino
);
356 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
357 : Dispatcher(m
->cct
->get()),
358 timer(m
->cct
, timer_lock
, false),
362 whoami(mc
->get_global_id()),
363 mount_state(CLIENT_UNMOUNTED
, "Client::mountstate_lock"),
364 initialize_state(CLIENT_NEW
, "Client::initstate_lock"),
365 cct_deleter
{m
->cct
, [](CephContext
*p
) {p
->put();}},
366 async_ino_invalidator(m
->cct
),
367 async_dentry_invalidator(m
->cct
),
368 interrupt_finisher(m
->cct
),
369 remount_finisher(m
->cct
),
370 async_ino_releasor(m
->cct
),
371 objecter_finisher(m
->cct
),
372 m_command_hook(this),
377 user_id
= cct
->_conf
->client_mount_uid
;
378 group_id
= cct
->_conf
->client_mount_gid
;
379 fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
380 "fuse_default_permissions");
382 _collect_and_send_global_metrics
= cct
->_conf
.get_val
<bool>(
383 "client_collect_and_send_global_metrics");
385 mount_timeout
= cct
->_conf
.get_val
<std::chrono::seconds
>(
386 "client_mount_timeout");
388 caps_release_delay
= cct
->_conf
.get_val
<std::chrono::seconds
>(
389 "client_caps_release_delay");
391 if (cct
->_conf
->client_acl_type
== "posix_acl")
392 acl_type
= POSIX_ACL
;
394 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
397 free_fd_set
.insert(10, 1<<30);
399 mdsmap
.reset(new MDSMap
);
402 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
404 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
405 client_flush_set_callback
, // all commit callback
407 cct
->_conf
->client_oc_size
,
408 cct
->_conf
->client_oc_max_objects
,
409 cct
->_conf
->client_oc_max_dirty
,
410 cct
->_conf
->client_oc_target_dirty
,
411 cct
->_conf
->client_oc_max_dirty_age
,
418 ceph_assert(ceph_mutex_is_not_locked(client_lock
));
420 // If the task is crashed or aborted and doesn't
421 // get any chance to run the umount and shutdow.
423 std::scoped_lock l
{client_lock
};
424 tick_thread_stopped
= true;
425 upkeep_cond
.notify_one();
428 if (upkeeper
.joinable())
431 // It is necessary to hold client_lock, because any inode destruction
432 // may call into ObjectCacher, which asserts that it's lock (which is
433 // client_lock) is held.
434 std::scoped_lock l
{client_lock
};
438 void Client::tear_down_cache()
441 for (auto &[fd
, fh
] : fd_map
) {
442 ldout(cct
, 1) << __func__
<< " forcing close of fh " << fd
<< " ino " << fh
->inode
->ino
<< dendl
;
447 while (!opened_dirs
.empty()) {
448 dir_result_t
*dirp
= *opened_dirs
.begin();
449 ldout(cct
, 1) << __func__
<< " forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
458 ceph_assert(lru
.lru_get_size() == 0);
461 ceph_assert(inode_map
.size() <= 1 + root_parents
.size());
462 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
466 ceph_assert(inode_map
.empty());
469 inodeno_t
Client::get_root_ino()
471 std::scoped_lock
l(client_lock
);
472 if (use_faked_inos())
473 return root
->faked_ino
;
478 Inode
*Client::get_root()
480 std::scoped_lock
l(client_lock
);
488 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
491 in
->make_long_path(path
);
492 ldout(cct
, 1) << "dump_inode: "
493 << (disconnected
? "DISCONNECTED ":"")
494 << "inode " << in
->ino
496 << " ref " << in
->get_nref()
497 << " " << *in
<< dendl
;
500 f
->open_object_section("inode");
501 f
->dump_stream("path") << path
;
503 f
->dump_int("disconnected", 1);
510 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
511 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
512 it
!= in
->dir
->dentries
.end();
514 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
516 f
->open_object_section("dentry");
520 if (it
->second
->inode
)
521 dump_inode(f
, it
->second
->inode
.get(), did
, false);
526 void Client::dump_cache(Formatter
*f
)
530 ldout(cct
, 1) << __func__
<< dendl
;
533 f
->open_array_section("cache");
536 dump_inode(f
, root
.get(), did
, true);
538 // make a second pass to catch anything disconnected
539 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
540 it
!= inode_map
.end();
542 if (did
.count(it
->second
))
544 dump_inode(f
, it
->second
, did
, true);
551 void Client::dump_status(Formatter
*f
)
553 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
555 ldout(cct
, 1) << __func__
<< dendl
;
557 const epoch_t osd_epoch
558 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
561 f
->open_object_section("metadata");
562 for (const auto& kv
: metadata
)
563 f
->dump_string(kv
.first
.c_str(), kv
.second
);
566 f
->dump_int("dentry_count", lru
.lru_get_size());
567 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
568 f
->dump_int("id", get_nodeid().v
);
569 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
570 f
->dump_object("inst", inst
);
571 f
->dump_object("addr", inst
.addr
);
572 f
->dump_stream("inst_str") << inst
.name
<< " " << inst
.addr
.get_legacy_str();
573 f
->dump_string("addr_str", inst
.addr
.get_legacy_str());
574 f
->dump_int("inode_count", inode_map
.size());
575 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
576 f
->dump_int("osd_epoch", osd_epoch
);
577 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
578 f
->dump_bool("blocklisted", blocklisted
);
579 f
->dump_string("fs_name", mdsmap
->get_fs_name());
583 void Client::_pre_init()
587 objecter_finisher
.start();
588 filer
.reset(new Filer(objecter
, &objecter_finisher
));
590 objectcacher
->start();
595 RWRef_t
iref_writer(initialize_state
, CLIENT_INITIALIZING
, false);
596 ceph_assert(iref_writer
.is_first_writer());
600 std::scoped_lock l
{client_lock
};
601 messenger
->add_dispatcher_tail(this);
604 iref_writer
.update_state(CLIENT_INITIALIZED
);
608 void Client::_finish_init()
611 std::scoped_lock l
{client_lock
};
613 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
614 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
615 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
616 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
617 plb
.add_time_avg(l_c_read
, "rdlat", "Latency of a file data read operation");
618 plb
.add_time_avg(l_c_fsync
, "fsync", "Latency of a file sync operation");
619 // average, standard deviation mds/r/w/ latencies
620 plb
.add_time(l_c_md_avg
, "mdavg", "Average latency for processing metadata requests");
621 plb
.add_u64(l_c_md_sqsum
, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests");
622 plb
.add_u64(l_c_md_ops
, "mdops", "Total metadata IO operations");
623 plb
.add_time(l_c_rd_avg
, "readavg", "Average latency for processing read requests");
624 plb
.add_u64(l_c_rd_sqsum
, "readsqsum", "Sum of squares ((to calculate variability/stdev) for read requests");
625 plb
.add_u64(l_c_rd_ops
, "rdops", "Total read IO operations");
626 plb
.add_time(l_c_wr_avg
, "writeavg", "Average latency for processing write requests");
627 plb
.add_u64(l_c_wr_sqsum
, "writesqsum", "Sum of squares ((to calculate variability/stdev) for write requests");
628 plb
.add_u64(l_c_wr_ops
, "rdops", "Total write IO operations");
629 logger
.reset(plb
.create_perf_counters());
630 cct
->get_perfcounters_collection()->add(logger
.get());
633 cct
->_conf
.add_observer(this);
635 AdminSocket
* admin_socket
= cct
->get_admin_socket();
636 int ret
= admin_socket
->register_command("mds_requests",
638 "show in-progress mds requests");
640 lderr(cct
) << "error registering admin socket command: "
641 << cpp_strerror(-ret
) << dendl
;
643 ret
= admin_socket
->register_command("mds_sessions "
644 "name=cap_dump,type=CephBool,req=false",
646 "show mds session state");
648 lderr(cct
) << "error registering admin socket command: "
649 << cpp_strerror(-ret
) << dendl
;
651 ret
= admin_socket
->register_command("dump_cache",
653 "show in-memory metadata cache contents");
655 lderr(cct
) << "error registering admin socket command: "
656 << cpp_strerror(-ret
) << dendl
;
658 ret
= admin_socket
->register_command("kick_stale_sessions",
660 "kick sessions that were remote reset");
662 lderr(cct
) << "error registering admin socket command: "
663 << cpp_strerror(-ret
) << dendl
;
665 ret
= admin_socket
->register_command("status",
667 "show overall client status");
669 lderr(cct
) << "error registering admin socket command: "
670 << cpp_strerror(-ret
) << dendl
;
674 void Client::shutdown()
676 ldout(cct
, 1) << __func__
<< dendl
;
678 // If we were not mounted, but were being used for sending
679 // MDS commands, we may have sessions that need closing.
681 std::scoped_lock l
{client_lock
};
683 // To make sure the tick thread will be stoppped before
684 // destructing the Client, just in case like the _mount()
685 // failed but didn't not get a chance to stop the tick
687 tick_thread_stopped
= true;
688 upkeep_cond
.notify_one();
692 cct
->_conf
.remove_observer(this);
694 cct
->get_admin_socket()->unregister_commands(&m_command_hook
);
696 if (ino_invalidate_cb
) {
697 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
698 async_ino_invalidator
.wait_for_empty();
699 async_ino_invalidator
.stop();
702 if (dentry_invalidate_cb
) {
703 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
704 async_dentry_invalidator
.wait_for_empty();
705 async_dentry_invalidator
.stop();
708 if (switch_interrupt_cb
) {
709 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
710 interrupt_finisher
.wait_for_empty();
711 interrupt_finisher
.stop();
715 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
716 remount_finisher
.wait_for_empty();
717 remount_finisher
.stop();
720 if (ino_release_cb
) {
721 ldout(cct
, 10) << "shutdown stopping inode release finisher" << dendl
;
722 async_ino_releasor
.wait_for_empty();
723 async_ino_releasor
.stop();
726 objectcacher
->stop(); // outside of client_lock! this does a join.
729 * We are shuting down the client.
731 * Just declare the state to CLIENT_NEW to block and fail any
732 * new comming "reader" and then try to wait all the in-flight
733 * "readers" to finish.
735 RWRef_t
iref_writer(initialize_state
, CLIENT_NEW
, false);
736 if (!iref_writer
.is_first_writer())
738 iref_writer
.wait_readers_done();
741 std::scoped_lock
l(timer_lock
);
745 objecter_finisher
.wait_for_empty();
746 objecter_finisher
.stop();
749 cct
->get_perfcounters_collection()->remove(logger
.get());
754 void Client::update_io_stat_metadata(utime_t latency
) {
755 auto lat_nsec
= latency
.to_nsec();
756 // old values are used to compute new ones
757 auto o_avg
= logger
->tget(l_c_md_avg
).to_nsec();
758 auto o_sqsum
= logger
->get(l_c_md_sqsum
);
760 auto n_avg
= calc_average(o_avg
, lat_nsec
, nr_metadata_request
);
761 auto n_sqsum
= calc_sq_sum(o_sqsum
, o_avg
, n_avg
, lat_nsec
,
762 nr_metadata_request
);
764 logger
->tinc(l_c_lat
, latency
);
765 logger
->tinc(l_c_reply
, latency
);
768 avg
.set_from_double(n_avg
/ 1000000000);
769 logger
->tset(l_c_md_avg
, avg
);
770 logger
->set(l_c_md_sqsum
, n_sqsum
);
771 logger
->set(l_c_md_ops
, nr_metadata_request
);
774 void Client::update_io_stat_read(utime_t latency
) {
775 auto lat_nsec
= latency
.to_nsec();
776 // old values are used to compute new ones
777 auto o_avg
= logger
->tget(l_c_rd_avg
).to_nsec();
778 auto o_sqsum
= logger
->get(l_c_rd_sqsum
);
780 auto n_avg
= calc_average(o_avg
, lat_nsec
, nr_read_request
);
781 auto n_sqsum
= calc_sq_sum(o_sqsum
, o_avg
, n_avg
, lat_nsec
,
784 logger
->tinc(l_c_read
, latency
);
787 avg
.set_from_double(n_avg
/ 1000000000);
788 logger
->tset(l_c_rd_avg
, avg
);
789 logger
->set(l_c_rd_sqsum
, n_sqsum
);
790 logger
->set(l_c_rd_ops
, nr_read_request
);
793 void Client::update_io_stat_write(utime_t latency
) {
794 auto lat_nsec
= latency
.to_nsec();
795 // old values are used to compute new ones
796 auto o_avg
= logger
->tget(l_c_wr_avg
).to_nsec();
797 auto o_sqsum
= logger
->get(l_c_wr_sqsum
);
799 auto n_avg
= calc_average(o_avg
, lat_nsec
, nr_write_request
);
800 auto n_sqsum
= calc_sq_sum(o_sqsum
, o_avg
, n_avg
, lat_nsec
,
803 logger
->tinc(l_c_wrlat
, latency
);
806 avg
.set_from_double(n_avg
/ 1000000000);
807 logger
->tset(l_c_wr_avg
, avg
);
808 logger
->set(l_c_wr_sqsum
, n_sqsum
);
809 logger
->set(l_c_wr_ops
, nr_write_request
);
812 // ===================
813 // metadata cache stuff
815 void Client::trim_cache(bool trim_kernel_dcache
)
817 uint64_t max
= cct
->_conf
->client_cache_size
;
818 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
820 while (lru
.lru_get_size() != last
) {
821 last
= lru
.lru_get_size();
823 if (!is_unmounting() && lru
.lru_get_size() <= max
) break;
826 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
833 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
834 _invalidate_kernel_dcache();
837 if (lru
.lru_get_size() == 0 && root
&& root
->get_nref() == 1 && inode_map
.size() == 1 + root_parents
.size()) {
838 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
843 void Client::trim_cache_for_reconnect(MetaSession
*s
)
845 mds_rank_t mds
= s
->mds_num
;
846 ldout(cct
, 20) << __func__
<< " mds." << mds
<< dendl
;
849 list
<Dentry
*> skipped
;
850 while (lru
.lru_get_size() > 0) {
851 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
855 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
856 dn
->dir
->parent_inode
->caps
.count(mds
)) {
860 skipped
.push_back(dn
);
863 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
864 lru
.lru_insert_mid(*p
);
866 ldout(cct
, 20) << __func__
<< " mds." << mds
867 << " trimmed " << trimmed
<< " dentries" << dendl
;
869 if (s
->caps
.size() > 0)
870 _invalidate_kernel_dcache();
873 void Client::trim_dentry(Dentry
*dn
)
875 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
877 << std::hex
<< dn
->dir
->parent_inode
->ino
<< std::dec
880 Inode
*diri
= dn
->dir
->parent_inode
;
881 clear_dir_complete_and_ordered(diri
, true);
883 unlink(dn
, false, false); // drop dir, drop dentry
887 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
888 uint64_t truncate_seq
, uint64_t truncate_size
)
890 uint64_t prior_size
= in
->size
;
892 if (truncate_seq
> in
->truncate_seq
||
893 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
894 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
896 in
->reported_size
= size
;
897 if (truncate_seq
!= in
->truncate_seq
) {
898 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
899 << truncate_seq
<< dendl
;
900 in
->truncate_seq
= truncate_seq
;
901 in
->oset
.truncate_seq
= truncate_seq
;
903 // truncate cached file data
904 if (prior_size
> size
) {
905 _invalidate_inode_cache(in
, truncate_size
, prior_size
- truncate_size
);
909 // truncate inline data
910 if (in
->inline_version
< CEPH_INLINE_NONE
) {
911 uint32_t len
= in
->inline_data
.length();
913 in
->inline_data
.splice(size
, len
- size
);
916 if (truncate_seq
>= in
->truncate_seq
&&
917 in
->truncate_size
!= truncate_size
) {
919 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
920 << truncate_size
<< dendl
;
921 in
->truncate_size
= truncate_size
;
922 in
->oset
.truncate_size
= truncate_size
;
924 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
929 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
930 utime_t ctime
, utime_t mtime
, utime_t atime
)
932 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
933 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
935 if (time_warp_seq
> in
->time_warp_seq
)
936 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
937 << " is higher than local time_warp_seq "
938 << in
->time_warp_seq
<< dendl
;
941 // be careful with size, mtime, atime
942 if (issued
& (CEPH_CAP_FILE_EXCL
|
944 CEPH_CAP_FILE_BUFFER
|
946 CEPH_CAP_XATTR_EXCL
)) {
947 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
948 if (ctime
> in
->ctime
)
950 if (time_warp_seq
> in
->time_warp_seq
) {
951 //the mds updated times, so take those!
954 in
->time_warp_seq
= time_warp_seq
;
955 } else if (time_warp_seq
== in
->time_warp_seq
) {
957 if (mtime
> in
->mtime
)
959 if (atime
> in
->atime
)
961 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
962 //ignore mds values as we have a higher seq
965 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
966 if (time_warp_seq
>= in
->time_warp_seq
) {
970 in
->time_warp_seq
= time_warp_seq
;
974 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
975 << time_warp_seq
<< " is lower than local time_warp_seq "
981 void Client::_fragmap_remove_non_leaves(Inode
*in
)
983 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
984 if (!in
->dirfragtree
.is_leaf(p
->first
))
985 in
->fragmap
.erase(p
++);
990 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
992 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
993 if (p
->second
== mds
)
994 in
->fragmap
.erase(p
++);
999 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
1000 MetaSession
*session
,
1001 const UserPerm
& request_perms
)
1004 bool was_new
= false;
1005 if (inode_map
.count(st
->vino
)) {
1006 in
= inode_map
[st
->vino
];
1007 ldout(cct
, 12) << __func__
<< " had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
1009 in
= new Inode(this, st
->vino
, &st
->layout
);
1010 inode_map
[st
->vino
] = in
;
1012 if (use_faked_inos())
1013 _assign_faked_ino(in
);
1017 if (use_faked_inos())
1018 _assign_faked_root(root
.get());
1021 } else if (is_mounting()) {
1022 root_parents
[root_ancestor
] = in
;
1027 in
->ino
= st
->vino
.ino
;
1028 in
->snapid
= st
->vino
.snapid
;
1029 in
->mode
= st
->mode
& S_IFMT
;
1033 in
->rdev
= st
->rdev
;
1034 if (in
->is_symlink())
1035 in
->symlink
= st
->symlink
;
1037 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1038 bool new_version
= false;
1039 if (in
->version
== 0 ||
1040 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
1041 (in
->version
& ~1) < st
->version
))
1045 in
->caps_issued(&issued
);
1046 issued
|= in
->caps_dirty();
1047 int new_issued
= ~issued
& (int)st
->cap
.caps
;
1049 bool need_snapdir_attr_refresh
= false;
1050 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
1051 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
1052 in
->mode
= st
->mode
;
1055 in
->btime
= st
->btime
;
1056 in
->snap_btime
= st
->snap_btime
;
1057 in
->snap_metadata
= st
->snap_metadata
;
1058 need_snapdir_attr_refresh
= true;
1061 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
1062 !(issued
& CEPH_CAP_LINK_EXCL
)) {
1063 in
->nlink
= st
->nlink
;
1066 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
1067 need_snapdir_attr_refresh
= true;
1068 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
1069 st
->ctime
, st
->mtime
, st
->atime
);
1073 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
1074 in
->layout
= st
->layout
;
1075 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
1079 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
1080 in
->dirstat
= st
->dirstat
;
1082 // dir_layout/rstat/quota are not tracked by capability, update them only if
1083 // the inode stat is from auth mds
1084 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
1085 in
->dir_layout
= st
->dir_layout
;
1086 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
1087 in
->rstat
= st
->rstat
;
1088 in
->quota
= st
->quota
;
1089 in
->dir_pin
= st
->dir_pin
;
1091 // move me if/when version reflects fragtree changes.
1092 if (in
->dirfragtree
!= st
->dirfragtree
) {
1093 in
->dirfragtree
= st
->dirfragtree
;
1094 _fragmap_remove_non_leaves(in
);
1098 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
1099 st
->xattrbl
.length() &&
1100 st
->xattr_version
> in
->xattr_version
) {
1101 auto p
= st
->xattrbl
.cbegin();
1102 decode(in
->xattrs
, p
);
1103 in
->xattr_version
= st
->xattr_version
;
1104 need_snapdir_attr_refresh
= true;
1107 if (st
->inline_version
> in
->inline_version
) {
1108 in
->inline_data
= st
->inline_data
;
1109 in
->inline_version
= st
->inline_version
;
1112 /* always take a newer change attr */
1113 ldout(cct
, 12) << __func__
<< " client inode change_attr: " << in
->change_attr
<< " , mds inodestat change_attr: " << st
->change_attr
<< dendl
;
1114 if (st
->change_attr
> in
->change_attr
)
1115 in
->change_attr
= st
->change_attr
;
1117 if (st
->version
> in
->version
)
1118 in
->version
= st
->version
;
1121 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
1124 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
1126 if (in
->snapid
== CEPH_NOSNAP
) {
1127 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.wanted
,
1128 st
->cap
.seq
, st
->cap
.mseq
, inodeno_t(st
->cap
.realm
),
1129 st
->cap
.flags
, request_perms
);
1130 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
1131 in
->max_size
= st
->max_size
;
1132 in
->rstat
= st
->rstat
;
1135 // setting I_COMPLETE needs to happen after adding the cap
1137 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
1138 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
1139 in
->dirstat
.nfiles
== 0 &&
1140 in
->dirstat
.nsubdirs
== 0) {
1141 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
1142 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
1144 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
1145 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
1146 in
->dir
->readdir_cache
.clear();
1147 for (const auto& p
: in
->dir
->dentries
) {
1148 unlink(p
.second
, true, true); // keep dir, keep dentry
1150 if (in
->dir
->dentries
.empty())
1155 in
->snap_caps
|= st
->cap
.caps
;
1158 in
->fscrypt
= st
->fscrypt
;
1159 if (need_snapdir_attr_refresh
&& in
->is_dir() && in
->snapid
== CEPH_NOSNAP
) {
1160 vinodeno_t
vino(in
->ino
, CEPH_SNAPDIR
);
1161 if (inode_map
.count(vino
)) {
1162 refresh_snapdir_attrs(inode_map
[vino
], in
);
1171 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
1173 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
1174 Inode
*in
, utime_t from
, MetaSession
*session
,
1178 if (dir
->dentries
.count(dname
))
1179 dn
= dir
->dentries
[dname
];
1181 ldout(cct
, 12) << __func__
<< " '" << dname
<< "' vino " << in
->vino()
1182 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
1185 if (dn
&& dn
->inode
) {
1186 if (dn
->inode
->vino() == in
->vino()) {
1188 ldout(cct
, 12) << " had dentry " << dname
1189 << " with correct vino " << dn
->inode
->vino()
1192 ldout(cct
, 12) << " had dentry " << dname
1193 << " with WRONG vino " << dn
->inode
->vino()
1195 unlink(dn
, true, true); // keep dir, keep dentry
1199 if (!dn
|| !dn
->inode
) {
1200 InodeRef
tmp_ref(in
);
1202 if (old_dentry
->dir
!= dir
) {
1203 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
1204 clear_dir_complete_and_ordered(old_diri
, false);
1206 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
1208 Inode
*diri
= dir
->parent_inode
;
1209 clear_dir_complete_and_ordered(diri
, false);
1210 dn
= link(dir
, dname
, in
, dn
);
1213 update_dentry_lease(dn
, dlease
, from
, session
);
// Apply a dentry lease from an MDS reply to a cached dentry: extend the
// lease TTL/seq/gen when the MDS granted a longer one, and refresh the
// shared_gen snapshot used to validate the dentry against its directory.
// NOTE(review): excerpt from a mangled dump — braces, an assertion, and the
// body of the CEPH_LEASE_PRIMARY_LINK branch are elided and NOT restored.
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
  // Lease expiry = request send time + duration granted by the MDS.
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;
  ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;
  if (dlease->mask & CEPH_LEASE_VALID) {
    // Only ever extend the lease; never shorten an existing longer one.
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
		     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
  // Record the directory's shared_gen so the dentry can later be checked
  // for staleness against the parent's cap generation.
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
  if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
  dn->alternate_name = std::move(dlease->alternate_name);
/*
 * update MDS location cache for a single inode
 */
// Record which MDS rank is authoritative for one dirfrag of this inode,
// and (when the reply came from the auth MDS) the frag's replica set.
// NOTE(review): mangled excerpt — the `else` keywords/braces between the
// paired statements below are elided and NOT restored.
void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from)
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
    // (else branch) negative auth means "unknown": drop any stale mapping.
    in->fragmap.erase(dst->frag);
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    // The MDS reported a frag our tree doesn't consider a leaf: force it,
    // then purge fragmap entries for frags that are no longer leaves.
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  // replicated, only update from auth mds reply
  if (from == dst->auth) {
    in->dir_replicated = !dst->dist.empty();
    if (!dst->dist.empty())
      in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end()) ;
      // (else branch) no replicas reported: forget the old replica set.
      in->frag_repmap.erase(dst->frag);
// Invalidate a directory's cached readdir state: bump the release/order
// generation counters, drop the I_COMPLETE / I_DIR_ORDERED flags, and clear
// the readdir cache.  `complete` selects how much is invalidated — TODO
// confirm, the guarding conditionals are elided in this excerpt.
// NOTE(review): mangled excerpt — braces and surrounding conditions elided.
void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
  diri->dir_release_count++;
  diri->dir_ordered_count++;
  if (diri->flags & I_COMPLETE) {
    ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
    diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
  if (diri->flags & I_DIR_ORDERED) {
    ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
    diri->flags &= ~I_DIR_ORDERED;
  // Cached readdir results are keyed to the old generation; drop them.
  diri->dir->readdir_cache.clear();
/*
 * insert results from readdir or lssnap into the metadata cache.
 */
// Decode the extra buffer of a READDIR/LSSNAP reply and merge every
// (dentry, inode) pair into the local cache, maintaining the shared
// readdir cache and the dir_result_t buffer used by the readdir caller.
// NOTE(review): mangled excerpt — braces, several declarations (flags,
// numdn, dname, dn, dlease), and some else-arms are elided and NOT restored.
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  // Newer MDSs use the versioned reply encoding; signal "all features".
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
    // (else) fall back to the feature bits negotiated on the connection.
    features = con->get_features();
  dir_result_t *dirp = request->dirp;
  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (request->head.op == CEPH_MDS_OP_LSSNAP) {
    // lssnap results live under the .snap pseudo-directory.
    diri = open_snapdir(diri);
  // only open dir if we're actually adding stuff to it!
  Dir *dir = diri->open_dir();
  DirStat dst(p, features);
  bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
  bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
  frag_t fg = (unsigned)request->head.args.readdir.frag;
  unsigned readdir_offset = dirp->next_offset;
  string readdir_start = dirp->last_name;
  // offset 2 is the first real entry ("." and ".." occupy 0/1).
  ceph_assert(!readdir_start.empty() || readdir_offset == 2);
  unsigned last_hash = 0;
  if (!readdir_start.empty()) {
    last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
  } else if (flags & CEPH_READDIR_OFFSET_HASH) {
    /* mds understands offset_hash */
    last_hash = (unsigned)request->head.args.readdir.offset_hash;
  if (fg != dst.frag) {
    // The MDS answered for a different (re-split) frag; restart position.
    ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
    readdir_start.clear();
    dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
  ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		 << ", hash_order=" << hash_order
		 << ", readdir_start " << readdir_start
		 << ", last_hash " << last_hash
		 << ", next_offset " << readdir_offset << dendl;
  // If this is the very start of a plain (non-snap) readdir, snapshot the
  // directory's generation counters so we can tell later whether the
  // cached listing is still coherent.
  if (diri->snapid != CEPH_SNAPDIR &&
      fg.is_leftmost() && readdir_offset == 2 &&
      !(hash_order && last_hash)) {
    dirp->release_count = diri->dir_release_count;
    dirp->ordered_count = diri->dir_ordered_count;
    dirp->start_shared_gen = diri->shared_gen;
    dirp->cache_index = 0;
  dirp->buffer_frag = fg;
  _readdir_drop_dirp_buffer(dirp);
  dirp->buffer.reserve(numdn);
  for (unsigned i=0; i<numdn; i++) {
    // Each entry: dentry lease + inode stat, then link into the cache.
    dlease.decode(p, features);
    InodeStat ist(p, features);
    ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
    Inode *in = add_update_inode(&ist, request->sent_stamp, session,
    if (diri->dir->dentries.count(dname)) {
      Dentry *olddn = diri->dir->dentries[dname];
      if (olddn->inode != in) {
	// replace incorrect dentry
	unlink(olddn, true, true);  // keep dir, dentry
	dn = link(dir, dname, in, olddn);
	ceph_assert(dn == olddn);
      // (else path) no existing dentry: create a fresh link.
      dn = link(dir, dname, in, NULL);
    dn->alternate_name = std::move(dlease.alternate_name);
    update_dentry_lease(dn, &dlease, request->sent_stamp, session);
    // Entry offset encodes either the name hash (hash_order) or the frag.
    unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
    if (hash != last_hash)
      dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
    // add to readdir cache
    if (dirp->release_count == diri->dir_release_count &&
	dirp->ordered_count == diri->dir_ordered_count &&
	dirp->start_shared_gen == diri->shared_gen) {
      if (dirp->cache_index == dir->readdir_cache.size()) {
	// Appending at the tail of the shared cache.
	ceph_assert(!dirp->inode->is_complete_and_ordered());
	dir->readdir_cache.reserve(dirp->cache_index + numdn);
	dir->readdir_cache.push_back(dn);
      } else if (dirp->cache_index < dir->readdir_cache.size()) {
	// Overwriting an existing slot (re-listing a cached frag).
	if (dirp->inode->is_complete_and_ordered())
	  ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	dir->readdir_cache[dirp->cache_index] = dn;
	ceph_abort_msg("unexpected readdir buffer idx");
      dirp->cache_index++;
    // add to cached result list
    dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
    ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
  dirp->last_name = dname;
  // At frag end restart the offset; otherwise continue from where we are.
  dirp->next_offset = 2;
  dirp->next_offset = readdir_offset;
  if (dir->is_empty())
/*
 * insert a trace from a MDS reply into the cache.
 */
// Decode the (dirstat, dentry-lease, inodestat) trace attached to an MDS
// reply, update the cached inodes/dentries it describes, and record the
// request's target inode.  Handles the traceless-reply and snap-lookup
// special cases.
// NOTE(review): mangled excerpt — braces, several declarations (dirst, dst,
// dlease, ist, in, dn, dname), and some else-arms are elided; the body below
// preserves only the lines visible in the dump.
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
  auto& reply = request->reply;
  int op = request->get_op();
  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // An unsafe reply was already applied; the safe reply carries no trace.
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    // Traceless reply: we must invalidate cached state ourselves.
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;
    Dentry *d = request->dentry();
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
  ConnectionRef con = request->reply->get_connection();
  // Select the decode feature set (same logic as insert_readdir_results).
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
    features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;
  // Apply any snap realm updates piggybacked on the reply.
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);
  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    dlease.decode(p, features);
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // Debug check: a getattr/lookup/open that wanted xattrs must get them.
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;
      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    in = add_update_inode(&ist, request->sent_stamp, session,
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
    mds_rank_t from_mds = mds_rank_t(reply->get_source().num());
    update_dir_dist(diri, &dst, from_mds);  // dir stat info is attached to ..
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
			  (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
      // (alternate path) reply had a dentry but no target inode: the name
      // is a negative entry now — drop any cached dentry for it.
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	clear_dir_complete_and_ordered(diri, false);
	unlink(dn, true, true);  // keep dir, dentry
      if (dlease.duration_ms > 0) {
	// Cache the negative dentry with its lease.
	Dir *dir = diri->open_dir();
	dn = link(dir, dname, NULL, NULL);
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];
    string dname = request->path.last_dentry();
    dlease.duration_ms = 0;
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	unlink(dn, true, true);  // keep dir, dentry
  if (op == CEPH_MDS_OP_READDIR ||
      op == CEPH_MDS_OP_LSSNAP) {
    insert_readdir_results(request, session, in);
  } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
    // hack: return parent inode instead
    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    put_snap_realm(realm);
  request->target = in;
// Pick the MDS rank to send a request to: an explicit resend target if set,
// otherwise derived from the request's inode/dentry (walking out of snapped
// inodes), the dirfrag hash map, held capabilities, and finally a random
// up MDS.  *phash_diri receives the directory used for frag hashing —
// TODO confirm, the assignment is in lines elided from this excerpt.
// NOTE(review): mangled excerpt — braces, several declarations (in, de,
// hash, issued), gotos/labels, and the random-mds branch body are elided.
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
  mds_rank_t mds = MDS_RANK_NONE;
  bool is_hash = false;
  // Explicit resend target (set on forward) wins outright.
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
  if (cct->_conf->client_use_random_mds)
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      // Hash the first path component within the request's inode.
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
      // (else path) dentry without inode: hash the name in its parent dir.
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
  if (in->snapid != CEPH_NOSNAP) {
    // Requests are routed via the live (non-snap) ancestor.
    ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
    while (in->snapid != CEPH_NOSNAP) {
      if (in->snapid == CEPH_SNAPDIR)
	in = in->snapdir_parent.get();
      else if (!in->dentries.empty())
        /* In most cases there will only be one dentry, so getting it
         * will be the correct action. If there are multiple hard links,
         * I think the MDS should be able to redirect as needed*/
	in = in->get_first_parent()->dir->parent_inode;
        ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
  ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
	   << " hash=" << hash << dendl;
  if (req->get_op() == CEPH_MDS_OP_GETATTR)
    issued = req->inode()->caps_issued();
  if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
    frag_t fg = in->dirfragtree[hash];
    if (!req->auth_is_best(issued)) {
      // Non-auth is acceptable: pick a random replica of this frag.
      auto repmapit = in->frag_repmap.find(fg);
      if (repmapit != in->frag_repmap.end()) {
	auto& repmap = repmapit->second;
	auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
    } else if (in->fragmap.count(fg)) {
      mds = in->fragmap[fg];
    } else if (in->auth_cap) {
      req->send_to_auth = true;
      mds = in->auth_cap->session->mds_num;
    ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
  if (in->auth_cap && req->auth_is_best(issued)) {
    mds = in->auth_cap->session->mds_num;
  } else if (!in->caps.empty()) {
    // Any cap-issuing MDS will do when auth isn't required.
    mds = in->caps.begin()->second.session->mds_num;
  ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;
  // Fallback: no routing information at all.
  mds = _get_random_up_mds();
  ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  ldout(cct, 20) << "mds is " << mds << dendl;
1746 void Client::connect_mds_targets(mds_rank_t mds
)
1748 ldout(cct
, 10) << __func__
<< " for mds." << mds
<< dendl
;
1749 ceph_assert(mds_sessions
.count(mds
));
1750 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1751 for (const auto &rank
: info
.export_targets
) {
1752 if (mds_sessions
.count(rank
) == 0 &&
1753 mdsmap
->is_clientreplay_or_active_or_stopping(rank
)) {
1754 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1755 << " export target mds." << rank
<< dendl
;
1756 _open_mds_session(rank
);
// Emit this client's identity and all MDS sessions to a Formatter (admin
// socket "mds_sessions" output); cap_dump forwards per-session cap detail.
// NOTE(review): mangled excerpt — braces and the close_section() calls that
// terminate each object/array are elided and NOT restored.
void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second->dump(f, cap_dump);
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
// Emit every in-flight MDS request to a Formatter (admin socket output).
// NOTE(review): mangled excerpt — the loop increment, the per-request dump
// call, and the section close are elided and NOT restored.
void Client::dump_mds_requests(Formatter *f)
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
    f->open_object_section("request");
// After a successful reply, make sure the caller gets the target inode:
// read the created-ino from the reply's extra payload, use the request's
// trace target when present, and otherwise recover from a traceless reply
// by re-looking-up the dentry (or forcing a getattr on the inode).
// Returns the possibly-updated result code.
// NOTE(review): mangled excerpt — braces, the `target` InodeRef declaration,
// several conditionals, and the final return are elided and NOT restored.
int Client::verify_reply_trace(int r, MetaSession *session,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;
  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      // Newer MDS encodes an openc_response_t (created ino + delegated set).
      struct openc_response_t ocres;
      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  *pcreated = got_created_ino;
  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      // The created inode is already cached; hand it back directly.
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know!  FIXME.
      Dentry *d = request->dentry();
	ldout(cct, 10) << "make_request got traceless reply, looking up #"
		       << d->dir->parent_inode->ino << "/" << d->name
		       << " got_ino " << got_created_ino
		       << " ino " << created_ino
	r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen?  i want logs!");
	// (else path) no dentry on the request: refetch the inode's attrs.
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
      // verify ino returned in reply and trace_dist are the same
      if (got_created_ino &&
	  created_ino.val != target->ino.val) {
	ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
      ptarget->swap(target);
/*
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
// NOTE(review): mangled excerpt — the use_mds/pdirbl parameters, the retry
// loop scaffolding, braces, and the final return are elided and NOT restored.
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 size_t feature_needed)
  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);
  request->op_stamp = ceph_clock_now();
  request->created = ceph::coarse_mono_clock::now();
  // Register the request (holding a ref) so replies/forwards can find it.
  mds_requests[tid] = request->get();
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
  request->set_caller_perms(perms);
  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
    // (else) tell the MDS the real oldest tid so it can trim completed ops.
    request->set_oldest_client_tid(oldest_tid);
  request->resend_mds = use_mds;
  MetaSessionRef session = NULL;
  if (request->aborted())
    request->abort(-CEPHFS_EBLOCKLISTED);
  // Condvar this caller sleeps on until reply/forward/kick.
  ceph::condition_variable caller_cond;
  request->caller_cond = &caller_cond;
  // Choose an MDS and make sure it can take requests.
  Inode *hash_diri = NULL;
  mds_rank_t mds = choose_target_mds(request, &hash_diri);
  int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
  if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
    if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
      // Rank no longer exists: purge it from the frag map and retry random.
      ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
      _fragmap_remove_stopped_mds(hash_diri, mds);
      ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
      request->resend_mds = _get_random_up_mds();
      // (else) rank exists but isn't usable yet: wait for a new mdsmap.
      ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
  // Open (or wait for) a session with the chosen MDS.
  if (!have_open_session(mds)) {
    session = _get_or_open_mds_session(mds);
    if (session->state == MetaSession::STATE_REJECTED) {
      request->abort(-CEPHFS_EPERM);
    if (session->state == MetaSession::STATE_OPENING) {
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
    if (!have_open_session(mds))
    session = mds_sessions.at(mds);
  // Bail out early if the session lacks a feature this request requires.
  if (feature_needed != ULONG_MAX && !session->mds_features.test(feature_needed)) {
    request->abort(-CEPHFS_EOPNOTSUPP);
  send_request(request, session.get());
  // wait for signal
  ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
  request->kick = false;
  // Adopt client_lock into a unique_lock so the condvar can release it.
  std::unique_lock l{client_lock, std::adopt_lock};
  caller_cond.wait(l, [request] {
    return (request->reply ||	   // reply
	    request->resend_mds >= 0 || // forward
  request->caller_cond = nullptr;
  // did we get a reply?
  if (!request->reply) {
    // No reply means the request was aborted before completion.
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
  auto reply = std::move(request->reply);
  r = reply->get_result();
    request->success = true;
  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;
  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms);
    *pdirbl = reply->get_extra_bl();
  // Record latency metrics for this metadata op.
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  ++nr_metadata_request;
  update_io_stat_metadata(lat);
  put_request(request);
// Remove a request from the in-flight map and, when it was the oldest,
// advance oldest_tid to the next request the MDS should retain state for
// (SETFILELOCK ops are skipped — their tids are tracked separately).
// NOTE(review): mangled excerpt — braces and the loop that advances the
// iterator past SETFILELOCK entries are elided and NOT restored.
void Client::unregister_request(MetaRequest *req)
  mds_requests.erase(req->tid);
  if (req->tid == oldest_tid) {
    map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
      if (p == mds_requests.end()) {
      if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
	oldest_tid = p->first;
// Drop one reference on a MetaRequest; on the last ref, release its "other"
// inode and, for successful rmdir/rename/rmsnap, try to trim that inode
// from the cache (its last link may just have gone away).
// NOTE(review): mangled excerpt — braces, the `op`/`other_in` declarations,
// and parts of the trimming condition are elided and NOT restored.
void Client::put_request(MetaRequest *request)
  if (request->_put()) {
    if (request->success)
      op = request->get_op();
    request->take_other_inode(&other_in);
	(op == CEPH_MDS_OP_RMDIR ||
	 op == CEPH_MDS_OP_RENAME ||
	 op == CEPH_MDS_OP_RMSNAP)) {
      _try_to_trim_inode(other_in.get(), false);
// If we hold a cap for `in` from `mds`, optionally strip the `drop` bits
// (unless `unless` bits are issued) and append a cap release record to the
// request so the MDS can process the op without revoking caps first.
// Returns whether a release was encoded — TODO confirm, the `released`
// declaration and return are in lines elided from this excerpt.
// NOTE(review): mangled excerpt — braces, the force-release branch, and the
// return statement are elided and NOT restored.
int Client::encode_inode_release(Inode *in, MetaRequest *req,
				 mds_rank_t mds, int drop,
				 int unless, int force)
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
		 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
		 << ", force:" << force << ")" << dendl;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    // Never drop caps that are dirty or actively in use.
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
	!(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      cap.wanted = in->caps_wanted();
      if (&cap == in->auth_cap &&
	  !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
	in->requested_max_size = 0;
	ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      // Build the wire-format release record for this cap.
      ceph_mds_request_release rel;
      rel.cap_id = cap.cap_id;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
		 << released << dendl;
// Encode a release of the parent directory's caps (forced) and, if a cap
// release was emitted and we hold this dentry's lease from the same MDS,
// also attach the dentry name/seq so the lease is released preemptively.
// NOTE(review): mangled excerpt — braces and the `released` declaration are
// elided and NOT restored.
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
				   mds_rank_t mds, int drop, int unless)
  ldout(cct, 20) << __func__ << " enter(dn:"
		 << dn << ")" << dendl;
  released = encode_inode_release(dn->dir->parent_inode, req,
				  mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    // Piggyback the dentry lease release on the cap release just appended.
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  ldout(cct, 25) << __func__ << " exit(dn:"
		 << dn << ")" << dendl;
/*
 * This requires the MClientRequest *request member to be set.
 * It will error out horribly without one.
 * Additionally, if you set any *drop member, you'd better have
 * set the corresponding dentry!
 */
// Walk every *_drop field on the request and encode the matching inode or
// dentry cap releases for the target MDS.
// NOTE(review): mangled excerpt — braces and the `req->inode_unless`
// argument of the first call are elided and NOT restored.
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
  ldout(cct, 20) << __func__ << " enter (req: "
		 << req << ", mds: " << mds << ")" << dendl;
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
			 mds, req->inode_drop,
  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
			 mds, req->old_inode_drop,
			 req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
			 mds, req->other_inode_drop,
			 req->other_inode_unless);
  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
			  mds, req->dentry_drop,
			  req->dentry_unless);
  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
			  mds, req->old_dentry_drop,
			  req->old_dentry_unless);
  ldout(cct, 25) << __func__ << " exit (req: "
		 << req << ", mds " << mds << dendl;
2192 bool Client::have_open_session(mds_rank_t mds
)
2194 const auto &it
= mds_sessions
.find(mds
);
2195 return it
!= mds_sessions
.end() &&
2196 (it
->second
->state
== MetaSession::STATE_OPEN
||
2197 it
->second
->state
== MetaSession::STATE_STALE
);
// Look up the session for `mds`, validating that it is bound to the given
// Connection (guards against messages arriving on a stale connection).
// NOTE(review): mangled excerpt — both return statements (null on mismatch,
// the session otherwise) are elided and NOT restored.
MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con)
  const auto &it = mds_sessions.find(mds);
  if (it == mds_sessions.end() || it->second->con != con) {
2210 MetaSessionRef
Client::_get_or_open_mds_session(mds_rank_t mds
)
2212 auto it
= mds_sessions
.find(mds
);
2213 return it
== mds_sessions
.end() ? _open_mds_session(mds
) : it
->second
;
/**
 * Populate a map of strings with client-identifying metadata,
 * such as the hostname.  Call this once at initialization.
 */
// NOTE(review): mangled excerpt — the #ifdef _WIN32/#else preprocessor
// lines, the uname() call and `u`/`r`/`hostname` declarations, braces, and
// several `continue`s are elided and NOT restored.
void Client::populate_metadata(const std::string &mount_root)
  // TODO: move this to compat.h
  // (Windows path) read the computer name via the Win32 API.
  DWORD hostname_sz = 64;
  GetComputerNameA(hostname, &hostname_sz);
  metadata["hostname"] = hostname;
    // (POSIX path) hostname comes from uname(2)'s nodename.
    metadata["hostname"] = u.nodename;
    ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
    // (error path) log but continue — hostname metadata is best-effort.
    ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
  metadata["pid"] = stringify(getpid());
  // Ceph entity id (the '0' in "client.0")
  metadata["entity_id"] = cct->_conf->name.get_id();
  // Our mount position
  if (!mount_root.empty()) {
    metadata["root"] = mount_root;
  // Ceph version string
  metadata["ceph_version"] = pretty_version_to_str();
  metadata["ceph_sha1"] = git_version_to_str();
  // Apply any metadata from the user's configured overrides
  std::vector<std::string> tokens;
  get_str_vec(cct->_conf->client_metadata, ",", tokens);
  for (const auto &i : tokens) {
    auto eqpos = i.find("=");
    // Throw out anything that isn't of the form "<str>=<str>"
    if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
      lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
    metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
/**
 * Optionally add or override client metadata fields.
 */
// Thread-safe setter for one client metadata key; warns when an existing
// value is being replaced.  The assignment itself is in a line elided from
// this excerpt.
void Client::update_metadata(std::string const &k, std::string const &v)
  // Must only be called on an initialized client.
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  ceph_assert(iref_reader.is_state_satisfied());
  std::scoped_lock l(client_lock);
  auto it = metadata.find(k);
  if (it != metadata.end()) {
    ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
		  << "' from '" << it->second << "' to '" << v << "'" << dendl;
2287 MetaSessionRef
Client::_open_mds_session(mds_rank_t mds
)
2289 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
2290 auto addrs
= mdsmap
->get_addrs(mds
);
2291 auto em
= mds_sessions
.emplace(std::piecewise_construct
,
2292 std::forward_as_tuple(mds
),
2293 std::forward_as_tuple(new MetaSession(mds
, messenger
->connect_to_mds(addrs
), addrs
)));
2294 ceph_assert(em
.second
); /* not already present */
2295 auto session
= em
.first
->second
;
2297 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_OPEN
);
2298 m
->metadata
= metadata
;
2299 m
->supported_features
= feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED
);
2300 m
->metric_spec
= feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL
);
2301 session
->con
->send_message2(std::move(m
));
2305 void Client::_close_mds_session(MetaSession
*s
)
2307 ldout(cct
, 2) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2308 s
->state
= MetaSession::STATE_CLOSING
;
2309 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2312 void Client::_closed_mds_session(MetaSession
*s
, int err
, bool rejected
)
2314 ldout(cct
, 5) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2315 if (rejected
&& s
->state
!= MetaSession::STATE_CLOSING
)
2316 s
->state
= MetaSession::STATE_REJECTED
;
2318 s
->state
= MetaSession::STATE_CLOSED
;
2319 s
->con
->mark_down();
2320 signal_context_list(s
->waiting_for_open
);
2321 mount_cond
.notify_all();
2322 remove_session_caps(s
, err
);
2323 kick_requests_closed(s
);
2324 mds_ranks_closing
.erase(s
->mds_num
);
2325 if (s
->state
== MetaSession::STATE_CLOSED
)
2326 mds_sessions
.erase(s
->mds_num
);
2329 void Client::handle_client_session(const MConstRef
<MClientSession
>& m
)
2331 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2332 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
2334 std::scoped_lock
cl(client_lock
);
2335 auto session
= _get_mds_session(from
, m
->get_connection().get());
2337 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2341 switch (m
->get_op()) {
2342 case CEPH_SESSION_OPEN
:
2344 if (session
->state
== MetaSession::STATE_OPEN
) {
2345 ldout(cct
, 10) << "mds." << from
<< " already opened, ignore it"
2350 * The connection maybe broken and the session in client side
2351 * has been reinitialized, need to update the seq anyway.
2353 if (!session
->seq
&& m
->get_seq())
2354 session
->seq
= m
->get_seq();
2356 feature_bitset_t
missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED
);
2357 missing_features
-= m
->supported_features
;
2358 if (!missing_features
.empty()) {
2359 lderr(cct
) << "mds." << from
<< " lacks required features '"
2360 << missing_features
<< "', closing session " << dendl
;
2361 _close_mds_session(session
.get());
2362 _closed_mds_session(session
.get(), -CEPHFS_EPERM
, true);
2365 session
->mds_features
= std::move(m
->supported_features
);
2366 session
->mds_metric_flags
= std::move(m
->metric_spec
.metric_flags
);
2368 renew_caps(session
.get());
2369 session
->state
= MetaSession::STATE_OPEN
;
2370 if (is_unmounting())
2371 mount_cond
.notify_all();
2373 connect_mds_targets(from
);
2374 signal_context_list(session
->waiting_for_open
);
2378 case CEPH_SESSION_CLOSE
:
2379 _closed_mds_session(session
.get());
2382 case CEPH_SESSION_RENEWCAPS
:
2383 if (session
->cap_renew_seq
== m
->get_seq()) {
2384 bool was_stale
= ceph_clock_now() >= session
->cap_ttl
;
2386 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2388 wake_up_session_caps(session
.get(), false);
2392 case CEPH_SESSION_STALE
:
2393 // invalidate session caps/leases
2395 session
->cap_ttl
= ceph_clock_now();
2396 session
->cap_ttl
-= 1;
2397 renew_caps(session
.get());
2400 case CEPH_SESSION_RECALL_STATE
:
2402 * Call the renew caps and flush cap releases just before
2403 * triming the caps in case the tick() won't get a chance
2404 * to run them, which could cause the client to be blocklisted
2405 * and MDS daemons trying to recall the caps again and
2408 * In most cases it will do nothing, and the new cap releases
2409 * added by trim_caps() followed will be deferred flushing
2412 renew_and_flush_cap_releases();
2413 trim_caps(session
.get(), m
->get_max_caps());
2416 case CEPH_SESSION_FLUSHMSG
:
2417 /* flush cap release */
2418 if (auto& m
= session
->release
; m
) {
2419 session
->con
->send_message2(std::move(m
));
2421 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2424 case CEPH_SESSION_FORCE_RO
:
2425 force_session_readonly(session
.get());
2428 case CEPH_SESSION_REJECT
:
2430 std::string_view error_str
;
2431 auto it
= m
->metadata
.find("error_string");
2432 if (it
!= m
->metadata
.end())
2433 error_str
= it
->second
;
2435 error_str
= "unknown error";
2436 lderr(cct
) << "mds." << from
<< " rejected us (" << error_str
<< ")" << dendl
;
2438 _closed_mds_session(session
.get(), -CEPHFS_EPERM
, true);
2447 bool Client::_any_stale_sessions() const
2449 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
2451 for (const auto &p
: mds_sessions
) {
2452 if (p
.second
->state
== MetaSession::STATE_STALE
) {
2460 void Client::_kick_stale_sessions()
2462 ldout(cct
, 1) << __func__
<< dendl
;
2464 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
2465 auto s
= it
->second
;
2466 if (s
->state
== MetaSession::STATE_REJECTED
) {
2467 mds_sessions
.erase(it
->first
);
2470 if (s
->state
== MetaSession::STATE_STALE
)
2471 _closed_mds_session(s
.get());
2475 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2476 bool drop_cap_releases
)
2479 mds_rank_t mds
= session
->mds_num
;
2480 ldout(cct
, 10) << __func__
<< " rebuilding request " << request
->get_tid()
2481 << " for mds." << mds
<< dendl
;
2482 auto r
= build_client_request(request
);
2483 if (request
->dentry()) {
2484 r
->set_dentry_wanted();
2486 if (request
->got_unsafe
) {
2487 r
->set_replayed_op();
2488 if (request
->target
)
2489 r
->head
.ino
= request
->target
->ino
;
2491 encode_cap_releases(request
, mds
);
2492 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2493 request
->cap_releases
.clear();
2495 r
->releases
.swap(request
->cap_releases
);
2497 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2498 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2499 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2500 r
->set_osdmap_epoch(o
.get_epoch());
2504 if (request
->mds
== -1) {
2505 request
->sent_stamp
= ceph_clock_now();
2506 ldout(cct
, 20) << __func__
<< " set sent_stamp to " << request
->sent_stamp
<< dendl
;
2510 Inode
*in
= request
->inode();
2512 auto it
= in
->caps
.find(mds
);
2513 if (it
!= in
->caps
.end()) {
2514 request
->sent_on_mseq
= it
->second
.mseq
;
2518 session
->requests
.push_back(&request
->item
);
2520 ldout(cct
, 10) << __func__
<< " " << *r
<< " to mds." << mds
<< dendl
;
2521 session
->con
->send_message2(std::move(r
));
2524 ref_t
<MClientRequest
> Client::build_client_request(MetaRequest
*request
)
2526 auto req
= make_message
<MClientRequest
>(request
->get_op());
2527 req
->set_tid(request
->tid
);
2528 req
->set_stamp(request
->op_stamp
);
2529 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2531 // if the filepath's haven't been set, set them!
2532 if (request
->path
.empty()) {
2533 Inode
*in
= request
->inode();
2534 Dentry
*de
= request
->dentry();
2536 in
->make_nosnap_relative_path(request
->path
);
2539 de
->inode
->make_nosnap_relative_path(request
->path
);
2541 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2542 request
->path
.push_dentry(de
->name
);
2544 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2545 << " No path, inode, or appropriately-endowed dentry given!"
2547 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2548 << " No path, inode, or dentry given!"
2551 req
->set_filepath(request
->get_filepath());
2552 req
->set_filepath2(request
->get_filepath2());
2553 req
->set_alternate_name(request
->alternate_name
);
2554 req
->set_data(request
->data
);
2555 req
->set_retry_attempt(request
->retry_attempt
++);
2556 req
->head
.num_fwd
= request
->num_fwd
;
2558 int gid_count
= request
->perms
.get_gids(&_gids
);
2559 req
->set_gid_list(gid_count
, _gids
);
2565 void Client::handle_client_request_forward(const MConstRef
<MClientRequestForward
>& fwd
)
2567 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2569 std::scoped_lock
cl(client_lock
);
2570 auto session
= _get_mds_session(mds
, fwd
->get_connection().get());
2574 ceph_tid_t tid
= fwd
->get_tid();
2576 if (mds_requests
.count(tid
) == 0) {
2577 ldout(cct
, 10) << __func__
<< " no pending request on tid " << tid
<< dendl
;
2581 MetaRequest
*request
= mds_requests
[tid
];
2582 ceph_assert(request
);
2585 * The type of 'num_fwd' in ceph 'MClientRequestForward'
2586 * is 'int32_t', while in 'ceph_mds_request_head' the
2587 * type is '__u8'. So in case the request bounces between
2588 * MDSes exceeding 256 times, the client will get stuck.
2590 * In this case it's ususally a bug in MDS and continue
2591 * bouncing the request makes no sense.
2593 * In future this could be fixed in ceph code, so avoid
2594 * using the hardcode here.
2596 int max_fwd
= sizeof(((struct ceph_mds_request_head
*)0)->num_fwd
);
2597 max_fwd
= 1 << (max_fwd
* CHAR_BIT
) - 1;
2598 auto num_fwd
= fwd
->get_num_fwd();
2599 if (num_fwd
<= request
->num_fwd
|| num_fwd
>= max_fwd
) {
2600 if (request
->num_fwd
>= max_fwd
|| num_fwd
>= max_fwd
) {
2601 request
->abort(-EMULTIHOP
);
2602 request
->caller_cond
->notify_all();
2603 ldout(cct
, 1) << __func__
<< " tid " << tid
<< " seq overflow"
2604 << ", abort it" << dendl
;
2606 ldout(cct
, 10) << __func__
<< " tid " << tid
2607 << " old fwd seq " << fwd
->get_num_fwd()
2608 << " <= req fwd " << request
->num_fwd
2609 << ", ignore it" << dendl
;
2614 // reset retry counter
2615 request
->retry_attempt
= 0;
2617 // request not forwarded, or dest mds has no session.
2619 ldout(cct
, 10) << __func__
<< " tid " << tid
2620 << " fwd " << fwd
->get_num_fwd()
2621 << " to mds." << fwd
->get_dest_mds()
2622 << ", resending to " << fwd
->get_dest_mds()
2626 request
->item
.remove_myself();
2627 request
->num_fwd
= num_fwd
;
2628 request
->resend_mds
= fwd
->get_dest_mds();
2629 request
->caller_cond
->notify_all();
2632 bool Client::is_dir_operation(MetaRequest
*req
)
2634 int op
= req
->get_op();
2635 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2636 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2637 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2638 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
2643 void Client::handle_client_reply(const MConstRef
<MClientReply
>& reply
)
2645 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2647 std::scoped_lock
cl(client_lock
);
2648 auto session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2653 ceph_tid_t tid
= reply
->get_tid();
2654 bool is_safe
= reply
->is_safe();
2656 if (mds_requests
.count(tid
) == 0) {
2657 lderr(cct
) << __func__
<< " no pending request on tid " << tid
2658 << " safe is:" << is_safe
<< dendl
;
2661 MetaRequest
*request
= mds_requests
.at(tid
);
2663 ldout(cct
, 20) << __func__
<< " got a reply. Safe:" << is_safe
2664 << " tid " << tid
<< dendl
;
2666 if (request
->got_unsafe
&& !is_safe
) {
2667 //duplicate response
2668 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2669 << mds_num
<< " safe:" << is_safe
<< dendl
;
2673 ceph_assert(!request
->reply
);
2674 request
->reply
= reply
;
2675 insert_trace(request
, session
.get());
2677 // Handle unsafe reply
2679 request
->got_unsafe
= true;
2680 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2681 if (is_dir_operation(request
)) {
2682 Inode
*dir
= request
->inode();
2684 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2686 if (request
->target
) {
2687 InodeRef
&in
= request
->target
;
2688 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2692 // Only signal the caller once (on the first reply):
2693 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2694 if (!is_safe
|| !request
->got_unsafe
) {
2695 ceph::condition_variable cond
;
2696 request
->dispatch_cond
= &cond
;
2699 ldout(cct
, 20) << __func__
<< " signalling caller " << (void*)request
->caller_cond
<< dendl
;
2700 request
->caller_cond
->notify_all();
2702 // wake for kick back
2703 std::unique_lock l
{client_lock
, std::adopt_lock
};
2704 cond
.wait(l
, [tid
, request
, &cond
, this] {
2705 if (request
->dispatch_cond
) {
2706 ldout(cct
, 20) << "handle_client_reply awaiting kickback on tid "
2707 << tid
<< " " << &cond
<< dendl
;
2709 return !request
->dispatch_cond
;
2715 // the filesystem change is committed to disk
2716 // we're done, clean up
2717 if (request
->got_unsafe
) {
2718 request
->unsafe_item
.remove_myself();
2719 request
->unsafe_dir_item
.remove_myself();
2720 request
->unsafe_target_item
.remove_myself();
2721 signal_cond_list(request
->waitfor_safe
);
2723 request
->item
.remove_myself();
2724 unregister_request(request
);
2726 if (is_unmounting())
2727 mount_cond
.notify_all();
2730 void Client::_handle_full_flag(int64_t pool
)
2732 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2733 << "on " << pool
<< dendl
;
2734 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2735 // to do this rather than blocking, because otherwise when we fill up we
2736 // potentially lock caps forever on files with dirty pages, and we need
2737 // to be able to release those caps to the MDS so that it can delete files
2738 // and free up space.
2739 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-CEPHFS_ENOSPC
, pool
);
2741 // For all inodes with layouts in this pool and a pending flush write op
2742 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2743 // from ObjectCacher so that it doesn't re-issue the write in response to
2744 // the ENOSPC error.
2745 // Fortunately since we're cancelling everything in a given pool, we don't
2746 // need to know which ops belong to which ObjectSet, we can just blow all
2747 // the un-flushed cached data away and mark any dirty inodes' async_err
2748 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2749 // affecting this pool, and all the objectsets we're purging were also
2751 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2752 i
!= inode_map
.end(); ++i
)
2754 Inode
*inode
= i
->second
;
2755 if (inode
->oset
.dirty_or_tx
2756 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2757 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2758 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2759 objectcacher
->purge_set(&inode
->oset
);
2760 inode
->set_async_err(-CEPHFS_ENOSPC
);
2764 if (cancelled_epoch
!= (epoch_t
)-1) {
2765 set_cap_epoch_barrier(cancelled_epoch
);
2769 void Client::handle_osd_map(const MConstRef
<MOSDMap
>& m
)
2771 std::scoped_lock
cl(client_lock
);
2773 const auto myaddrs
= messenger
->get_myaddrs();
2774 bool new_blocklist
= objecter
->with_osdmap(
2775 [&](const OSDMap
& o
) {
2776 return o
.is_blocklisted(myaddrs
);
2779 if (new_blocklist
&& !blocklisted
) {
2780 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2781 return o
.get_epoch();
2783 lderr(cct
) << "I was blocklisted at osd epoch " << epoch
<< dendl
;
2786 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED
);
2788 // Since we know all our OSD ops will fail, cancel them all preemtively,
2789 // so that on an unhealthy cluster we can umount promptly even if e.g.
2790 // some PGs were inaccessible.
2791 objecter
->op_cancel_writes(-CEPHFS_EBLOCKLISTED
);
2796 // Handle case where we were blocklisted but no longer are
2797 blocklisted
= objecter
->with_osdmap([myaddrs
](const OSDMap
&o
){
2798 return o
.is_blocklisted(myaddrs
);});
2801 // Always subscribe to next osdmap for blocklisted client
2802 // until this client is not blocklisted.
2804 objecter
->maybe_request_map();
2807 if (objecter
->osdmap_full_flag()) {
2808 _handle_full_flag(-1);
2810 // Accumulate local list of full pools so that I can drop
2811 // the objecter lock before re-entering objecter in
2813 std::vector
<int64_t> full_pools
;
2815 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2816 for (const auto& kv
: o
.get_pools()) {
2817 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2818 full_pools
.push_back(kv
.first
);
2823 for (auto p
: full_pools
)
2824 _handle_full_flag(p
);
2826 // Subscribe to subsequent maps to watch for the full flag going
2827 // away. For the global full flag objecter does this for us, but
2828 // it pays no attention to the per-pool full flag so in this branch
2829 // we do it ourselves.
2830 if (!full_pools
.empty()) {
2831 objecter
->maybe_request_map();
2837 // ------------------------
2838 // incoming messages
2841 bool Client::ms_dispatch2(const MessageRef
&m
)
2843 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
2844 if (!iref_reader
.is_state_satisfied()) {
2845 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2849 switch (m
->get_type()) {
2850 // mounting and mds sessions
2851 case CEPH_MSG_MDS_MAP
:
2852 handle_mds_map(ref_cast
<MMDSMap
>(m
));
2854 case CEPH_MSG_FS_MAP
:
2855 handle_fs_map(ref_cast
<MFSMap
>(m
));
2857 case CEPH_MSG_FS_MAP_USER
:
2858 handle_fs_map_user(ref_cast
<MFSMapUser
>(m
));
2860 case CEPH_MSG_CLIENT_SESSION
:
2861 handle_client_session(ref_cast
<MClientSession
>(m
));
2864 case CEPH_MSG_OSD_MAP
:
2865 handle_osd_map(ref_cast
<MOSDMap
>(m
));
2869 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2870 handle_client_request_forward(ref_cast
<MClientRequestForward
>(m
));
2872 case CEPH_MSG_CLIENT_REPLY
:
2873 handle_client_reply(ref_cast
<MClientReply
>(m
));
2877 case CEPH_MSG_CLIENT_RECLAIM_REPLY
:
2878 handle_client_reclaim_reply(ref_cast
<MClientReclaimReply
>(m
));
2881 case CEPH_MSG_CLIENT_SNAP
:
2882 handle_snap(ref_cast
<MClientSnap
>(m
));
2884 case CEPH_MSG_CLIENT_CAPS
:
2885 handle_caps(ref_cast
<MClientCaps
>(m
));
2887 case CEPH_MSG_CLIENT_LEASE
:
2888 handle_lease(ref_cast
<MClientLease
>(m
));
2890 case MSG_COMMAND_REPLY
:
2891 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2892 handle_command_reply(ref_cast
<MCommandReply
>(m
));
2897 case CEPH_MSG_CLIENT_QUOTA
:
2898 handle_quota(ref_cast
<MClientQuota
>(m
));
2906 std::scoped_lock
cl(client_lock
);
2907 if (is_unmounting()) {
2908 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2909 << "+" << inode_map
.size() << dendl
;
2910 uint64_t size
= lru
.lru_get_size() + inode_map
.size();
2912 if (size
> lru
.lru_get_size() + inode_map
.size()) {
2913 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2914 mount_cond
.notify_all();
2916 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2917 << "+" << inode_map
.size() << dendl
;
2924 void Client::handle_fs_map(const MConstRef
<MFSMap
>& m
)
2926 std::scoped_lock
cl(client_lock
);
2927 fsmap
.reset(new FSMap(m
->get_fsmap()));
2929 signal_cond_list(waiting_for_fsmap
);
2931 monclient
->sub_got("fsmap", fsmap
->get_epoch());
2934 void Client::handle_fs_map_user(const MConstRef
<MFSMapUser
>& m
)
2936 std::scoped_lock
cl(client_lock
);
2937 fsmap_user
.reset(new FSMapUser
);
2938 *fsmap_user
= m
->get_fsmap();
2940 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
2941 signal_cond_list(waiting_for_fsmap
);
2944 // Cancel all the commands for missing or laggy GIDs
2945 void Client::cancel_commands(const MDSMap
& newmap
)
2947 std::vector
<ceph_tid_t
> cancel_ops
;
2949 std::scoped_lock
cmd_lock(command_lock
);
2950 auto &commands
= command_table
.get_commands();
2951 for (const auto &[tid
, op
] : commands
) {
2952 const mds_gid_t op_mds_gid
= op
.mds_gid
;
2953 if (newmap
.is_dne_gid(op_mds_gid
) || newmap
.is_laggy_gid(op_mds_gid
)) {
2954 ldout(cct
, 1) << __func__
<< ": cancelling command op " << tid
<< dendl
;
2955 cancel_ops
.push_back(tid
);
2957 std::ostringstream ss
;
2958 ss
<< "MDS " << op_mds_gid
<< " went away";
2959 *(op
.outs
) = ss
.str();
2962 * No need to make the con->mark_down under
2963 * client_lock here, because the con will
2966 op
.con
->mark_down();
2968 op
.on_finish
->complete(-CEPHFS_ETIMEDOUT
);
2972 for (const auto &tid
: cancel_ops
)
2973 command_table
.erase(tid
);
2976 void Client::handle_mds_map(const MConstRef
<MMDSMap
>& m
)
2978 std::unique_lock
cl(client_lock
);
2979 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
2980 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch()
2981 << " is identical to or older than our "
2982 << mdsmap
->get_epoch() << dendl
;
2987 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch() << dendl
;
2988 std::unique_ptr
<MDSMap
> _mdsmap(new MDSMap
);
2989 _mdsmap
->decode(m
->get_encoded());
2990 cancel_commands(*_mdsmap
.get());
2993 _mdsmap
.swap(mdsmap
);
2996 for (auto p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ) {
2997 mds_rank_t mds
= p
->first
;
2998 MetaSessionRef session
= p
->second
;
3001 int oldstate
= _mdsmap
->get_state(mds
);
3002 int newstate
= mdsmap
->get_state(mds
);
3003 if (!mdsmap
->is_up(mds
)) {
3004 session
->con
->mark_down();
3005 } else if (mdsmap
->get_addrs(mds
) != session
->addrs
) {
3006 auto old_inc
= _mdsmap
->get_incarnation(mds
);
3007 auto new_inc
= mdsmap
->get_incarnation(mds
);
3008 if (old_inc
!= new_inc
) {
3009 ldout(cct
, 1) << "mds incarnation changed from "
3010 << old_inc
<< " to " << new_inc
<< dendl
;
3011 oldstate
= MDSMap::STATE_NULL
;
3013 session
->con
->mark_down();
3014 session
->addrs
= mdsmap
->get_addrs(mds
);
3015 // When new MDS starts to take over, notify kernel to trim unused entries
3016 // in its dcache/icache. Hopefully, the kernel will release some unused
3017 // inodes before the new MDS enters reconnect state.
3018 trim_cache_for_reconnect(session
.get());
3019 } else if (oldstate
== newstate
)
3020 continue; // no change
3022 session
->mds_state
= newstate
;
3023 if (newstate
== MDSMap::STATE_RECONNECT
) {
3024 session
->con
= messenger
->connect_to_mds(session
->addrs
);
3025 send_reconnect(session
.get());
3026 } else if (newstate
> MDSMap::STATE_RECONNECT
) {
3027 if (oldstate
< MDSMap::STATE_RECONNECT
) {
3028 ldout(cct
, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl
;
3029 _closed_mds_session(session
.get());
3032 if (newstate
>= MDSMap::STATE_ACTIVE
) {
3033 if (oldstate
< MDSMap::STATE_ACTIVE
) {
3034 // kick new requests
3035 kick_requests(session
.get());
3036 kick_flushing_caps(session
.get());
3037 signal_context_list(session
->waiting_for_open
);
3038 wake_up_session_caps(session
.get(), true);
3040 connect_mds_targets(mds
);
3042 } else if (newstate
== MDSMap::STATE_NULL
&&
3043 mds
>= mdsmap
->get_max_mds()) {
3044 _closed_mds_session(session
.get());
3048 // kick any waiting threads
3049 signal_cond_list(waiting_for_mdsmap
);
3051 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
3054 void Client::send_reconnect(MetaSession
*session
)
3056 mds_rank_t mds
= session
->mds_num
;
3057 ldout(cct
, 10) << __func__
<< " to mds." << mds
<< dendl
;
3059 // trim unused caps to reduce MDS's cache rejoin time
3060 trim_cache_for_reconnect(session
);
3062 session
->readonly
= false;
3064 session
->release
.reset();
3066 // reset my cap seq number
3068 //connect to the mds' offload targets
3069 connect_mds_targets(mds
);
3070 //make sure unsafe requests get saved
3071 resend_unsafe_requests(session
);
3073 early_kick_flushing_caps(session
);
3075 auto m
= make_message
<MClientReconnect
>();
3076 bool allow_multi
= session
->mds_features
.test(CEPHFS_FEATURE_MULTI_RECONNECT
);
3078 // i have an open session.
3079 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
3080 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
3081 p
!= inode_map
.end();
3083 Inode
*in
= p
->second
;
3084 auto it
= in
->caps
.find(mds
);
3085 if (it
!= in
->caps
.end()) {
3087 m
->get_approx_size() >=
3088 static_cast<size_t>((std::numeric_limits
<int>::max() >> 1))) {
3090 session
->con
->send_message2(std::move(m
));
3092 m
= make_message
<MClientReconnect
>();
3095 Cap
&cap
= it
->second
;
3096 ldout(cct
, 10) << " caps on " << p
->first
3097 << " " << ccap_string(cap
.issued
)
3098 << " wants " << ccap_string(in
->caps_wanted())
3101 in
->make_short_path(path
);
3102 ldout(cct
, 10) << " path " << path
<< dendl
;
3105 _encode_filelocks(in
, flockbl
);
3107 cap
.seq
= 0; // reset seq.
3108 cap
.issue_seq
= 0; // reset seq.
3109 cap
.mseq
= 0; // reset seq.
3110 // cap gen should catch up with session cap_gen
3111 if (cap
.gen
< session
->cap_gen
) {
3112 cap
.gen
= session
->cap_gen
;
3113 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
3115 cap
.issued
= cap
.implemented
;
3117 snapid_t snap_follows
= 0;
3118 if (!in
->cap_snaps
.empty())
3119 snap_follows
= in
->cap_snaps
.begin()->first
;
3121 m
->add_cap(p
->first
.ino
,
3123 path
.get_ino(), path
.get_path(), // ino
3124 in
->caps_wanted(), // wanted
3125 cap
.issued
, // issued
3130 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
3131 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
3132 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
3133 did_snaprealm
.insert(in
->snaprealm
->ino
);
3139 m
->set_encoding_version(0); // use connection features to choose encoding
3140 session
->con
->send_message2(std::move(m
));
3142 mount_cond
.notify_all();
3144 if (session
->reclaim_state
== MetaSession::RECLAIMING
)
3145 signal_cond_list(waiting_for_reclaim
);
3149 void Client::kick_requests(MetaSession
*session
)
3151 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
3152 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3153 p
!= mds_requests
.end();
3155 MetaRequest
*req
= p
->second
;
3156 if (req
->got_unsafe
)
3158 if (req
->aborted()) {
3159 if (req
->caller_cond
) {
3161 req
->caller_cond
->notify_all();
3165 if (req
->retry_attempt
> 0)
3166 continue; // new requests only
3167 if (req
->mds
== session
->mds_num
) {
3168 send_request(p
->second
, session
);
3173 void Client::resend_unsafe_requests(MetaSession
*session
)
3175 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
3178 send_request(*iter
, session
);
3180 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3181 // process completed requests in clientreplay stage.
3182 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3183 p
!= mds_requests
.end();
3185 MetaRequest
*req
= p
->second
;
3186 if (req
->got_unsafe
)
3190 if (req
->retry_attempt
== 0)
3191 continue; // old requests only
3192 if (req
->mds
== session
->mds_num
)
3193 send_request(req
, session
, true);
3197 void Client::wait_unsafe_requests()
3199 list
<MetaRequest
*> last_unsafe_reqs
;
3200 for (const auto &p
: mds_sessions
) {
3201 const auto s
= p
.second
;
3202 if (!s
->unsafe_requests
.empty()) {
3203 MetaRequest
*req
= s
->unsafe_requests
.back();
3205 last_unsafe_reqs
.push_back(req
);
3209 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
3210 p
!= last_unsafe_reqs
.end();
3212 MetaRequest
*req
= *p
;
3213 if (req
->unsafe_item
.is_on_list())
3214 wait_on_list(req
->waitfor_safe
);
3219 void Client::kick_requests_closed(MetaSession
*session
)
3221 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
3222 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3223 p
!= mds_requests
.end(); ) {
3224 MetaRequest
*req
= p
->second
;
3226 if (req
->mds
== session
->mds_num
) {
3227 if (req
->caller_cond
) {
3229 req
->caller_cond
->notify_all();
3231 req
->item
.remove_myself();
3232 if (req
->got_unsafe
) {
3233 lderr(cct
) << __func__
<< " removing unsafe request " << req
->get_tid() << dendl
;
3234 req
->unsafe_item
.remove_myself();
3235 if (is_dir_operation(req
)) {
3236 Inode
*dir
= req
->inode();
3238 dir
->set_async_err(-CEPHFS_EIO
);
3239 lderr(cct
) << "kick_requests_closed drop req of inode(dir) : "
3240 << dir
->ino
<< " " << req
->get_tid() << dendl
;
3241 req
->unsafe_dir_item
.remove_myself();
3244 InodeRef
&in
= req
->target
;
3245 in
->set_async_err(-CEPHFS_EIO
);
3246 lderr(cct
) << "kick_requests_closed drop req of inode : "
3247 << in
->ino
<< " " << req
->get_tid() << dendl
;
3248 req
->unsafe_target_item
.remove_myself();
3250 signal_cond_list(req
->waitfor_safe
);
3251 unregister_request(req
);
3255 ceph_assert(session
->requests
.empty());
3256 ceph_assert(session
->unsafe_requests
.empty());
3266 void Client::got_mds_push(MetaSession
*s
)
3269 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
3270 if (s
->state
== MetaSession::STATE_CLOSING
) {
3271 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
3275 void Client::handle_lease(const MConstRef
<MClientLease
>& m
)
3277 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
3279 ceph_assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
3280 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
3282 std::scoped_lock
cl(client_lock
);
3283 auto session
= _get_mds_session(mds
, m
->get_connection().get());
3288 got_mds_push(session
.get());
3290 ceph_seq_t seq
= m
->get_seq();
3293 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
3294 if (inode_map
.count(vino
) == 0) {
3295 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
3298 in
= inode_map
[vino
];
3300 if (m
->get_mask() & CEPH_LEASE_VALID
) {
3301 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
3302 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
3305 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
3306 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
3312 auto reply
= make_message
<MClientLease
>(CEPH_MDS_LEASE_RELEASE
, seq
,
3313 m
->get_mask(), m
->get_ino(),
3314 m
->get_first(), m
->get_last(), m
->dname
);
3315 m
->get_connection()->send_message2(std::move(reply
));
3319 void Client::_put_inode(Inode
*in
, int n
)
3321 ldout(cct
, 10) << __func__
<< " on " << *in
<< " n = " << n
<< dendl
;
3323 int left
= in
->get_nref();
3324 ceph_assert(left
>= n
+ 1);
3327 if (left
== 1) { // the last one will be held by the inode_map
3329 remove_all_caps(in
);
3331 ldout(cct
, 10) << __func__
<< " deleting " << *in
<< dendl
;
3332 bool unclean
= objectcacher
->release_set(&in
->oset
);
3333 ceph_assert(!unclean
);
3334 inode_map
.erase(in
->vino());
3335 if (use_faked_inos())
3336 _release_faked_ino(in
);
3338 if (root
== nullptr) {
3340 while (!root_parents
.empty())
3341 root_parents
.erase(root_parents
.begin());
3348 void Client::delay_put_inodes(bool wakeup
)
3350 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
3352 std::map
<Inode
*,int> release
;
3354 std::scoped_lock
dl(delay_i_lock
);
3355 release
.swap(delay_i_release
);
3358 if (release
.empty())
3361 for (auto &[in
, cnt
] : release
)
3362 _put_inode(in
, cnt
);
3365 mount_cond
.notify_all();
3368 void Client::put_inode(Inode
*in
, int n
)
3370 ldout(cct
, 20) << __func__
<< " on " << *in
<< " n = " << n
<< dendl
;
3372 std::scoped_lock
dl(delay_i_lock
);
3373 delay_i_release
[in
] += n
;
3376 void Client::close_dir(Dir
*dir
)
3378 Inode
*in
= dir
->parent_inode
;
3379 ldout(cct
, 15) << __func__
<< " dir " << dir
<< " on " << in
<< dendl
;
3380 ceph_assert(dir
->is_empty());
3381 ceph_assert(in
->dir
== dir
);
3382 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
3383 if (!in
->dentries
.empty())
3384 in
->get_first_parent()->put(); // unpin dentry
3388 put_inode(in
); // unpin inode
3392 * Don't call this with in==NULL, use get_or_create for that
3393 * leave dn set to default NULL unless you're trying to add
3394 * a new inode to a pre-created Dentry
3396 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
3399 // create a new Dentry
3400 dn
= new Dentry(dir
, name
);
3402 lru
.lru_insert_mid(dn
); // mid or top?
3404 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3405 << " dn " << dn
<< " (new dn)" << dendl
;
3407 ceph_assert(!dn
->inode
);
3408 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3409 << " dn " << dn
<< " (old dn)" << dendl
;
3412 if (in
) { // link to inode
3414 // only one parent for directories!
3415 if (in
->is_dir() && !in
->dentries
.empty()) {
3416 tmp_ref
= in
; // prevent unlink below from freeing the inode.
3417 Dentry
*olddn
= in
->get_first_parent();
3418 ceph_assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3419 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3420 clear_dir_complete_and_ordered(old_diri
, true);
3421 unlink(olddn
, true, true); // keep dir, dentry
3426 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3432 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
3434 InodeRef
in(dn
->inode
);
3435 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3436 << " inode " << dn
->inode
<< dendl
;
3438 // unlink from inode
3442 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3448 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3458 if (dir
->is_empty() && !keepdir
)
3464 * For asynchronous flushes, check for errors from the IO and
3465 * update the inode if necessary
3467 class C_Client_FlushComplete
: public Context
{
3472 C_Client_FlushComplete(Client
*c
, Inode
*in
) : client(c
), inode(in
) { }
3473 void finish(int r
) override
{
3474 ceph_assert(ceph_mutex_is_locked_by_me(client
->client_lock
));
3476 client_t
const whoami
= client
->whoami
; // For the benefit of ldout prefix
3477 ldout(client
->cct
, 1) << "I/O error from flush on inode " << inode
3478 << " 0x" << std::hex
<< inode
->ino
<< std::dec
3479 << ": " << r
<< "(" << cpp_strerror(r
) << ")" << dendl
;
3480 inode
->set_async_err(r
);
3490 void Client::get_cap_ref(Inode
*in
, int cap
)
3492 if ((cap
& CEPH_CAP_FILE_BUFFER
) &&
3493 in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] == 0) {
3494 ldout(cct
, 5) << __func__
<< " got first FILE_BUFFER ref on " << *in
<< dendl
;
3497 if ((cap
& CEPH_CAP_FILE_CACHE
) &&
3498 in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3499 ldout(cct
, 5) << __func__
<< " got first FILE_CACHE ref on " << *in
<< dendl
;
3502 in
->get_cap_ref(cap
);
3505 void Client::put_cap_ref(Inode
*in
, int cap
)
3507 int last
= in
->put_cap_ref(cap
);
3510 int drop
= last
& ~in
->caps_issued();
3511 if (in
->snapid
== CEPH_NOSNAP
) {
3512 if ((last
& (CEPH_CAP_FILE_WR
| CEPH_CAP_FILE_BUFFER
)) &&
3513 !in
->cap_snaps
.empty() &&
3514 in
->cap_snaps
.rbegin()->second
.writing
) {
3515 ldout(cct
, 10) << __func__
<< " finishing pending cap_snap on " << *in
<< dendl
;
3516 in
->cap_snaps
.rbegin()->second
.writing
= 0;
3517 finish_cap_snap(in
, in
->cap_snaps
.rbegin()->second
, get_caps_used(in
));
3518 signal_cond_list(in
->waitfor_caps
); // wake up blocked sync writers
3520 if (last
& CEPH_CAP_FILE_BUFFER
) {
3521 for (auto &p
: in
->cap_snaps
)
3522 p
.second
.dirty_data
= 0;
3523 signal_cond_list(in
->waitfor_commit
);
3524 ldout(cct
, 5) << __func__
<< " dropped last FILE_BUFFER ref on " << *in
<< dendl
;
3528 if (last
& CEPH_CAP_FILE_CACHE
) {
3529 ldout(cct
, 5) << __func__
<< " dropped last FILE_CACHE ref on " << *in
<< dendl
;
3535 put_inode(in
, put_nref
);
3539 // get caps for a given file handle -- the inode should have @need caps
3540 // issued by the mds and @want caps not revoked (or not under revocation).
3541 // this routine blocks till the cap requirement is satisfied. also account
3542 // (track) for capability hit when required (when cap requirement succeedes).
3543 int Client::get_caps(Fh
*fh
, int need
, int want
, int *phave
, loff_t endoff
)
3545 Inode
*in
= fh
->inode
.get();
3547 int r
= check_pool_perm(in
, need
);
3552 int file_wanted
= in
->caps_file_wanted();
3553 if ((file_wanted
& need
) != need
) {
3554 ldout(cct
, 10) << "get_caps " << *in
<< " need " << ccap_string(need
)
3555 << " file_wanted " << ccap_string(file_wanted
) << ", EBADF "
3557 return -CEPHFS_EBADF
;
3560 if ((fh
->mode
& CEPH_FILE_MODE_WR
) && fh
->gen
!= fd_gen
)
3561 return -CEPHFS_EBADF
;
3563 if ((in
->flags
& I_ERROR_FILELOCK
) && fh
->has_any_filelocks())
3567 int have
= in
->caps_issued(&implemented
);
3569 bool waitfor_caps
= false;
3570 bool waitfor_commit
= false;
3572 if (have
& need
& CEPH_CAP_FILE_WR
) {
3574 if ((endoff
>= (loff_t
)in
->max_size
||
3575 endoff
> (loff_t
)(in
->size
<< 1)) &&
3576 endoff
> (loff_t
)in
->wanted_max_size
) {
3577 ldout(cct
, 10) << "wanted_max_size " << in
->wanted_max_size
<< " -> " << endoff
<< dendl
;
3578 in
->wanted_max_size
= endoff
;
3580 if (in
->wanted_max_size
> in
->max_size
&&
3581 in
->wanted_max_size
> in
->requested_max_size
)
3585 if (endoff
>= 0 && endoff
> (loff_t
)in
->max_size
) {
3586 ldout(cct
, 10) << "waiting on max_size, endoff " << endoff
<< " max_size " << in
->max_size
<< " on " << *in
<< dendl
;
3587 waitfor_caps
= true;
3589 if (!in
->cap_snaps
.empty()) {
3590 if (in
->cap_snaps
.rbegin()->second
.writing
) {
3591 ldout(cct
, 10) << "waiting on cap_snap write to complete" << dendl
;
3592 waitfor_caps
= true;
3594 for (auto &p
: in
->cap_snaps
) {
3595 if (p
.second
.dirty_data
) {
3596 waitfor_commit
= true;
3600 if (waitfor_commit
) {
3601 _flush(in
, new C_Client_FlushComplete(this, in
));
3602 ldout(cct
, 10) << "waiting for WRBUFFER to get dropped" << dendl
;
3607 if (!waitfor_caps
&& !waitfor_commit
) {
3608 if ((have
& need
) == need
) {
3609 int revoking
= implemented
& ~have
;
3610 ldout(cct
, 10) << "get_caps " << *in
<< " have " << ccap_string(have
)
3611 << " need " << ccap_string(need
) << " want " << ccap_string(want
)
3612 << " revoking " << ccap_string(revoking
)
3614 if ((revoking
& want
) == 0) {
3615 *phave
= need
| (have
& want
);
3616 in
->get_cap_ref(need
);
3621 ldout(cct
, 10) << "waiting for caps " << *in
<< " need " << ccap_string(need
) << " want " << ccap_string(want
) << dendl
;
3622 waitfor_caps
= true;
3625 if ((need
& CEPH_CAP_FILE_WR
) && in
->auth_cap
&&
3626 in
->auth_cap
->session
->readonly
)
3627 return -CEPHFS_EROFS
;
3629 if (in
->flags
& I_CAP_DROPPED
) {
3630 int mds_wanted
= in
->caps_mds_wanted();
3631 if ((mds_wanted
& need
) != need
) {
3632 int ret
= _renew_caps(in
);
3637 if (!(file_wanted
& ~mds_wanted
))
3638 in
->flags
&= ~I_CAP_DROPPED
;
3642 wait_on_list(in
->waitfor_caps
);
3643 else if (waitfor_commit
)
3644 wait_on_list(in
->waitfor_commit
);
3648 int Client::get_caps_used(Inode
*in
)
3650 unsigned used
= in
->caps_used();
3651 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3652 !objectcacher
->set_is_empty(&in
->oset
))
3653 used
|= CEPH_CAP_FILE_CACHE
;
3657 void Client::cap_delay_requeue(Inode
*in
)
3659 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3661 in
->hold_caps_until
= ceph::coarse_mono_clock::now() + caps_release_delay
;
3662 delayed_list
.push_back(&in
->delay_cap_item
);
3665 void Client::send_cap(Inode
*in
, MetaSession
*session
, Cap
*cap
,
3666 int flags
, int used
, int want
, int retain
,
3667 int flush
, ceph_tid_t flush_tid
)
3669 int held
= cap
->issued
| cap
->implemented
;
3670 int revoking
= cap
->implemented
& ~cap
->issued
;
3671 retain
&= ~revoking
;
3672 int dropping
= cap
->issued
& ~retain
;
3673 int op
= CEPH_CAP_OP_UPDATE
;
3675 ldout(cct
, 10) << __func__
<< " " << *in
3676 << " mds." << session
->mds_num
<< " seq " << cap
->seq
3677 << " used " << ccap_string(used
)
3678 << " want " << ccap_string(want
)
3679 << " flush " << ccap_string(flush
)
3680 << " retain " << ccap_string(retain
)
3681 << " held "<< ccap_string(held
)
3682 << " revoking " << ccap_string(revoking
)
3683 << " dropping " << ccap_string(dropping
)
3686 if (cct
->_conf
->client_inject_release_failure
&& revoking
) {
3687 const int would_have_issued
= cap
->issued
& retain
;
3688 const int would_have_implemented
= cap
->implemented
& (cap
->issued
| used
);
3690 // - tell the server we think issued is whatever they issued plus whatever we implemented
3691 // - leave what we have implemented in place
3692 ldout(cct
, 20) << __func__
<< " injecting failure to release caps" << dendl
;
3693 cap
->issued
= cap
->issued
| cap
->implemented
;
3695 // Make an exception for revoking xattr caps: we are injecting
3696 // failure to release other caps, but allow xattr because client
3697 // will block on xattr ops if it can't release these to MDS (#9800)
3698 const int xattr_mask
= CEPH_CAP_XATTR_SHARED
| CEPH_CAP_XATTR_EXCL
;
3699 cap
->issued
^= xattr_mask
& revoking
;
3700 cap
->implemented
^= xattr_mask
& revoking
;
3702 ldout(cct
, 20) << __func__
<< " issued " << ccap_string(cap
->issued
) << " vs " << ccap_string(would_have_issued
) << dendl
;
3703 ldout(cct
, 20) << __func__
<< " implemented " << ccap_string(cap
->implemented
) << " vs " << ccap_string(would_have_implemented
) << dendl
;
3706 cap
->issued
&= retain
;
3707 cap
->implemented
&= cap
->issued
| used
;
3710 snapid_t follows
= 0;
3713 follows
= in
->snaprealm
->get_snap_context().seq
;
3715 auto m
= make_message
<MClientCaps
>(op
,
3718 cap
->cap_id
, cap
->seq
,
3724 m
->caller_uid
= in
->cap_dirtier_uid
;
3725 m
->caller_gid
= in
->cap_dirtier_gid
;
3727 m
->head
.issue_seq
= cap
->issue_seq
;
3728 m
->set_tid(flush_tid
);
3730 m
->head
.uid
= in
->uid
;
3731 m
->head
.gid
= in
->gid
;
3732 m
->head
.mode
= in
->mode
;
3734 m
->head
.nlink
= in
->nlink
;
3736 if (flush
& CEPH_CAP_XATTR_EXCL
) {
3737 encode(in
->xattrs
, m
->xattrbl
);
3738 m
->head
.xattr_version
= in
->xattr_version
;
3742 m
->max_size
= in
->max_size
;
3743 m
->truncate_seq
= in
->truncate_seq
;
3744 m
->truncate_size
= in
->truncate_size
;
3745 m
->mtime
= in
->mtime
;
3746 m
->atime
= in
->atime
;
3747 m
->ctime
= in
->ctime
;
3748 m
->btime
= in
->btime
;
3749 m
->time_warp_seq
= in
->time_warp_seq
;
3750 m
->change_attr
= in
->change_attr
;
3752 if (!(flags
& MClientCaps::FLAG_PENDING_CAPSNAP
) &&
3753 !in
->cap_snaps
.empty() &&
3754 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3755 flags
|= MClientCaps::FLAG_PENDING_CAPSNAP
;
3758 if (flush
& CEPH_CAP_FILE_WR
) {
3759 m
->inline_version
= in
->inline_version
;
3760 m
->inline_data
= in
->inline_data
;
3763 in
->reported_size
= in
->size
;
3764 m
->set_snap_follows(follows
);
3766 if (cap
== in
->auth_cap
) {
3767 if (want
& CEPH_CAP_ANY_FILE_WR
) {
3768 m
->set_max_size(in
->wanted_max_size
);
3769 in
->requested_max_size
= in
->wanted_max_size
;
3770 ldout(cct
, 15) << "auth cap, requesting max_size " << in
->requested_max_size
<< dendl
;
3772 in
->requested_max_size
= 0;
3773 ldout(cct
, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl
;
3777 if (!session
->flushing_caps_tids
.empty())
3778 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3780 session
->con
->send_message2(std::move(m
));
3783 static bool is_max_size_approaching(Inode
*in
)
3785 /* mds will adjust max size according to the reported size */
3786 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3788 if (in
->size
>= in
->max_size
)
3790 /* half of previous max_size increment has been used */
3791 if (in
->max_size
> in
->reported_size
&&
3792 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3797 static int adjust_caps_used_for_lazyio(int used
, int issued
, int implemented
)
3799 if (!(used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
)))
3801 if (!(implemented
& CEPH_CAP_FILE_LAZYIO
))
3804 if (issued
& CEPH_CAP_FILE_LAZYIO
) {
3805 if (!(issued
& CEPH_CAP_FILE_CACHE
)) {
3806 used
&= ~CEPH_CAP_FILE_CACHE
;
3807 used
|= CEPH_CAP_FILE_LAZYIO
;
3809 if (!(issued
& CEPH_CAP_FILE_BUFFER
)) {
3810 used
&= ~CEPH_CAP_FILE_BUFFER
;
3811 used
|= CEPH_CAP_FILE_LAZYIO
;
3814 if (!(implemented
& CEPH_CAP_FILE_CACHE
)) {
3815 used
&= ~CEPH_CAP_FILE_CACHE
;
3816 used
|= CEPH_CAP_FILE_LAZYIO
;
3818 if (!(implemented
& CEPH_CAP_FILE_BUFFER
)) {
3819 used
&= ~CEPH_CAP_FILE_BUFFER
;
3820 used
|= CEPH_CAP_FILE_LAZYIO
;
3829 * Examine currently used and wanted versus held caps. Release, flush or ack
3830 * revoked caps to the MDS as appropriate.
3832 * @param in the inode to check
3833 * @param flags flags to apply to cap check
3835 void Client::check_caps(Inode
*in
, unsigned flags
)
3837 unsigned wanted
= in
->caps_wanted();
3838 unsigned used
= get_caps_used(in
);
3842 int issued
= in
->caps_issued(&implemented
);
3843 int revoking
= implemented
& ~issued
;
3845 int orig_used
= used
;
3846 used
= adjust_caps_used_for_lazyio(used
, issued
, implemented
);
3848 int retain
= wanted
| used
| CEPH_CAP_PIN
;
3849 if (!is_unmounting() && in
->nlink
> 0) {
3851 retain
|= CEPH_CAP_ANY
;
3852 } else if (in
->is_dir() &&
3853 (issued
& CEPH_CAP_FILE_SHARED
) &&
3854 (in
->flags
& I_COMPLETE
)) {
3855 // we do this here because we don't want to drop to Fs (and then
3856 // drop the Fs if we do a create!) if that alone makes us send lookups
3857 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3858 wanted
= CEPH_CAP_ANY_SHARED
| CEPH_CAP_FILE_EXCL
;
3861 retain
|= CEPH_CAP_ANY_SHARED
;
3862 // keep RD only if we didn't have the file open RW,
3863 // because then the mds would revoke it anyway to
3864 // journal max_size=0.
3865 if (in
->max_size
== 0)
3866 retain
|= CEPH_CAP_ANY_RD
;
3870 ldout(cct
, 10) << __func__
<< " on " << *in
3871 << " wanted " << ccap_string(wanted
)
3872 << " used " << ccap_string(used
)
3873 << " issued " << ccap_string(issued
)
3874 << " revoking " << ccap_string(revoking
)
3875 << " flags=" << flags
3878 if (in
->snapid
!= CEPH_NOSNAP
)
3879 return; //snap caps last forever, can't write
3881 if (in
->caps
.empty())
3882 return; // guard if at end of func
3884 if (!(orig_used
& CEPH_CAP_FILE_BUFFER
) &&
3885 (revoking
& used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
3887 used
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
3890 for (auto &[mds
, cap
] : in
->caps
) {
3891 auto session
= mds_sessions
.at(mds
);
3894 if (in
->auth_cap
&& &cap
!= in
->auth_cap
)
3895 cap_used
&= ~in
->auth_cap
->issued
;
3897 revoking
= cap
.implemented
& ~cap
.issued
;
3899 ldout(cct
, 10) << " cap mds." << mds
3900 << " issued " << ccap_string(cap
.issued
)
3901 << " implemented " << ccap_string(cap
.implemented
)
3902 << " revoking " << ccap_string(revoking
) << dendl
;
3904 if (in
->wanted_max_size
> in
->max_size
&&
3905 in
->wanted_max_size
> in
->requested_max_size
&&
3906 &cap
== in
->auth_cap
)
3909 /* approaching file_max? */
3910 if ((cap
.issued
& CEPH_CAP_FILE_WR
) &&
3911 &cap
== in
->auth_cap
&&
3912 is_max_size_approaching(in
)) {
3913 ldout(cct
, 10) << "size " << in
->size
<< " approaching max_size " << in
->max_size
3914 << ", reported " << in
->reported_size
<< dendl
;
3918 /* completed revocation? */
3919 if (revoking
&& (revoking
& cap_used
) == 0) {
3920 ldout(cct
, 10) << "completed revocation of " << ccap_string(cap
.implemented
& ~cap
.issued
) << dendl
;
3924 /* want more caps from mds? */
3925 if (wanted
& ~(cap
.wanted
| cap
.issued
))
3928 if (!revoking
&& is_unmounting() && (cap_used
== 0))
3931 if ((cap
.issued
& ~retain
) == 0 && // and we don't have anything we wouldn't like
3932 !in
->dirty_caps
) // and we have no dirty caps
3935 if (!(flags
& CHECK_CAPS_NODELAY
)) {
3936 ldout(cct
, 10) << "delaying cap release" << dendl
;
3937 cap_delay_requeue(in
);
3942 if (&cap
== in
->auth_cap
) {
3943 if (in
->flags
& I_KICK_FLUSH
) {
3944 ldout(cct
, 20) << " reflushing caps (check_caps) on " << *in
3945 << " to mds." << mds
<< dendl
;
3946 kick_flushing_caps(in
, session
.get());
3948 if (!in
->cap_snaps
.empty() &&
3949 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3955 ceph_tid_t flush_tid
;
3956 if (in
->auth_cap
== &cap
&& in
->dirty_caps
) {
3957 flushing
= mark_caps_flushing(in
, &flush_tid
);
3958 if (flags
& CHECK_CAPS_SYNCHRONOUS
)
3959 msg_flags
|= MClientCaps::FLAG_SYNC
;
3965 in
->delay_cap_item
.remove_myself();
3966 send_cap(in
, session
.get(), &cap
, msg_flags
, cap_used
, wanted
, retain
,
3967 flushing
, flush_tid
);
3972 void Client::queue_cap_snap(Inode
*in
, SnapContext
& old_snapc
)
3974 int used
= get_caps_used(in
);
3975 int dirty
= in
->caps_dirty();
3976 ldout(cct
, 10) << __func__
<< " " << *in
<< " snapc " << old_snapc
<< " used " << ccap_string(used
) << dendl
;
3978 if (in
->cap_snaps
.size() &&
3979 in
->cap_snaps
.rbegin()->second
.writing
) {
3980 ldout(cct
, 10) << __func__
<< " already have pending cap_snap on " << *in
<< dendl
;
3982 } else if (in
->caps_dirty() ||
3983 (used
& CEPH_CAP_FILE_WR
) ||
3984 (dirty
& CEPH_CAP_ANY_WR
)) {
3985 const auto &capsnapem
= in
->cap_snaps
.emplace(std::piecewise_construct
, std::make_tuple(old_snapc
.seq
), std::make_tuple(in
));
3986 ceph_assert(capsnapem
.second
); /* element inserted */
3987 CapSnap
&capsnap
= capsnapem
.first
->second
;
3988 capsnap
.context
= old_snapc
;
3989 capsnap
.issued
= in
->caps_issued();
3990 capsnap
.dirty
= in
->caps_dirty();
3992 capsnap
.dirty_data
= (used
& CEPH_CAP_FILE_BUFFER
);
3994 capsnap
.uid
= in
->uid
;
3995 capsnap
.gid
= in
->gid
;
3996 capsnap
.mode
= in
->mode
;
3997 capsnap
.btime
= in
->btime
;
3998 capsnap
.xattrs
= in
->xattrs
;
3999 capsnap
.xattr_version
= in
->xattr_version
;
4000 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
4001 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
4003 if (used
& CEPH_CAP_FILE_WR
) {
4004 ldout(cct
, 10) << __func__
<< " WR used on " << *in
<< dendl
;
4005 capsnap
.writing
= 1;
4007 finish_cap_snap(in
, capsnap
, used
);
4010 ldout(cct
, 10) << __func__
<< " not dirty|writing on " << *in
<< dendl
;
4014 void Client::finish_cap_snap(Inode
*in
, CapSnap
&capsnap
, int used
)
4016 ldout(cct
, 10) << __func__
<< " " << *in
<< " capsnap " << (void *)&capsnap
<< " used " << ccap_string(used
) << dendl
;
4017 capsnap
.size
= in
->size
;
4018 capsnap
.mtime
= in
->mtime
;
4019 capsnap
.atime
= in
->atime
;
4020 capsnap
.ctime
= in
->ctime
;
4021 capsnap
.time_warp_seq
= in
->time_warp_seq
;
4022 capsnap
.change_attr
= in
->change_attr
;
4023 capsnap
.dirty
|= in
->caps_dirty();
4025 /* Only reset it if it wasn't set before */
4026 if (capsnap
.cap_dirtier_uid
== -1) {
4027 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
4028 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
4031 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
4032 capsnap
.inline_data
= in
->inline_data
;
4033 capsnap
.inline_version
= in
->inline_version
;
4036 if (used
& CEPH_CAP_FILE_BUFFER
) {
4037 capsnap
.writing
= 1;
4038 ldout(cct
, 10) << __func__
<< " " << *in
<< " cap_snap " << &capsnap
<< " used " << used
4039 << " WRBUFFER, delaying" << dendl
;
4041 capsnap
.dirty_data
= 0;
4046 void Client::send_flush_snap(Inode
*in
, MetaSession
*session
,
4047 snapid_t follows
, CapSnap
& capsnap
)
4049 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_FLUSHSNAP
,
4050 in
->ino
, in
->snaprealm
->ino
, 0,
4051 in
->auth_cap
->mseq
, cap_epoch_barrier
);
4052 m
->caller_uid
= capsnap
.cap_dirtier_uid
;
4053 m
->caller_gid
= capsnap
.cap_dirtier_gid
;
4055 m
->set_client_tid(capsnap
.flush_tid
);
4056 m
->head
.snap_follows
= follows
;
4058 m
->head
.caps
= capsnap
.issued
;
4059 m
->head
.dirty
= capsnap
.dirty
;
4061 m
->head
.uid
= capsnap
.uid
;
4062 m
->head
.gid
= capsnap
.gid
;
4063 m
->head
.mode
= capsnap
.mode
;
4064 m
->btime
= capsnap
.btime
;
4066 m
->size
= capsnap
.size
;
4068 m
->head
.xattr_version
= capsnap
.xattr_version
;
4069 encode(capsnap
.xattrs
, m
->xattrbl
);
4071 m
->ctime
= capsnap
.ctime
;
4072 m
->btime
= capsnap
.btime
;
4073 m
->mtime
= capsnap
.mtime
;
4074 m
->atime
= capsnap
.atime
;
4075 m
->time_warp_seq
= capsnap
.time_warp_seq
;
4076 m
->change_attr
= capsnap
.change_attr
;
4078 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
4079 m
->inline_version
= in
->inline_version
;
4080 m
->inline_data
= in
->inline_data
;
4083 ceph_assert(!session
->flushing_caps_tids
.empty());
4084 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
4086 session
->con
->send_message2(std::move(m
));
4089 void Client::flush_snaps(Inode
*in
)
4091 ldout(cct
, 10) << "flush_snaps on " << *in
<< dendl
;
4092 ceph_assert(in
->cap_snaps
.size());
4095 ceph_assert(in
->auth_cap
);
4096 MetaSession
*session
= in
->auth_cap
->session
;
4098 for (auto &p
: in
->cap_snaps
) {
4099 CapSnap
&capsnap
= p
.second
;
4100 // only do new flush
4101 if (capsnap
.flush_tid
> 0)
4104 ldout(cct
, 10) << "flush_snaps mds." << session
->mds_num
4105 << " follows " << p
.first
4106 << " size " << capsnap
.size
4107 << " mtime " << capsnap
.mtime
4108 << " dirty_data=" << capsnap
.dirty_data
4109 << " writing=" << capsnap
.writing
4110 << " on " << *in
<< dendl
;
4111 if (capsnap
.dirty_data
|| capsnap
.writing
)
4114 capsnap
.flush_tid
= ++last_flush_tid
;
4115 session
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4116 in
->flushing_cap_tids
[capsnap
.flush_tid
] = 0;
4117 if (!in
->flushing_cap_item
.is_on_list())
4118 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4120 send_flush_snap(in
, session
, p
.first
, capsnap
);
4124 void Client::wait_on_list(list
<ceph::condition_variable
*>& ls
)
4126 ceph::condition_variable cond
;
4127 ls
.push_back(&cond
);
4128 std::unique_lock l
{client_lock
, std::adopt_lock
};
4134 void Client::signal_cond_list(list
<ceph::condition_variable
*>& ls
)
4136 for (auto cond
: ls
) {
4141 void Client::wait_on_context_list(list
<Context
*>& ls
)
4143 ceph::condition_variable cond
;
4146 ls
.push_back(new C_Cond(cond
, &done
, &r
));
4147 std::unique_lock l
{client_lock
, std::adopt_lock
};
4148 cond
.wait(l
, [&done
] { return done
;});
4152 void Client::signal_context_list(list
<Context
*>& ls
)
4154 while (!ls
.empty()) {
4155 ls
.front()->complete(0);
4160 void Client::wake_up_session_caps(MetaSession
*s
, bool reconnect
)
4162 for (const auto &cap
: s
->caps
) {
4163 auto &in
= cap
->inode
;
4165 in
.requested_max_size
= 0;
4166 in
.wanted_max_size
= 0;
4168 if (cap
->gen
< s
->cap_gen
) {
4169 // mds did not re-issue stale cap.
4170 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
4171 // make sure mds knows what we want.
4172 if (in
.caps_file_wanted() & ~cap
->wanted
)
4173 in
.flags
|= I_CAP_DROPPED
;
4176 signal_cond_list(in
.waitfor_caps
);
4181 // flush dirty data (from objectcache)
4183 class C_Client_CacheInvalidate
: public Context
{
4187 int64_t offset
, length
;
4189 C_Client_CacheInvalidate(Client
*c
, Inode
*in
, int64_t off
, int64_t len
) :
4190 client(c
), offset(off
), length(len
) {
4191 if (client
->use_faked_inos())
4192 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4196 void finish(int r
) override
{
4197 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
4198 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4199 client
->_async_invalidate(ino
, offset
, length
);
4203 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
4205 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4206 if (!mref_reader
.is_state_satisfied())
4209 ldout(cct
, 10) << __func__
<< " " << ino
<< " " << off
<< "~" << len
<< dendl
;
4210 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
4213 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
4215 if (ino_invalidate_cb
)
4216 // we queue the invalidate, which calls the callback and decrements the ref
4217 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
4220 void Client::_invalidate_inode_cache(Inode
*in
)
4222 ldout(cct
, 10) << __func__
<< " " << *in
<< dendl
;
4224 // invalidate our userspace inode cache
4225 if (cct
->_conf
->client_oc
) {
4226 objectcacher
->release_set(&in
->oset
);
4227 if (!objectcacher
->set_is_empty(&in
->oset
))
4228 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
4231 _schedule_invalidate_callback(in
, 0, 0);
4234 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
4236 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
4238 // invalidate our userspace inode cache
4239 if (cct
->_conf
->client_oc
) {
4240 vector
<ObjectExtent
> ls
;
4241 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
4242 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
4245 _schedule_invalidate_callback(in
, off
, len
);
4248 bool Client::_release(Inode
*in
)
4250 ldout(cct
, 20) << "_release " << *in
<< dendl
;
4251 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
4252 _invalidate_inode_cache(in
);
4258 bool Client::_flush(Inode
*in
, Context
*onfinish
)
4260 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
4262 if (!in
->oset
.dirty_or_tx
) {
4263 ldout(cct
, 10) << " nothing to flush" << dendl
;
4264 onfinish
->complete(0);
4268 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
4269 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
4270 objectcacher
->purge_set(&in
->oset
);
4272 onfinish
->complete(-CEPHFS_ENOSPC
);
4277 return objectcacher
->flush_set(&in
->oset
, onfinish
);
4280 void Client::_flush_range(Inode
*in
, int64_t offset
, uint64_t size
)
4282 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
4283 if (!in
->oset
.dirty_or_tx
) {
4284 ldout(cct
, 10) << " nothing to flush" << dendl
;
4288 C_SaferCond
onflush("Client::_flush_range flock");
4289 bool ret
= objectcacher
->file_flush(&in
->oset
, &in
->layout
, in
->snaprealm
->get_snap_context(),
4290 offset
, size
, &onflush
);
4293 client_lock
.unlock();
4299 void Client::flush_set_callback(ObjectCacher::ObjectSet
*oset
)
4301 // std::scoped_lock l(client_lock);
4302 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
)); // will be called via dispatch() -> objecter -> ...
4303 Inode
*in
= static_cast<Inode
*>(oset
->parent
);
4308 void Client::_flushed(Inode
*in
)
4310 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
4312 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
4317 // checks common to add_update_cap, handle_cap_grant
4318 void Client::check_cap_issue(Inode
*in
, unsigned issued
)
4320 unsigned had
= in
->caps_issued();
4322 if ((issued
& CEPH_CAP_FILE_CACHE
) &&
4323 !(had
& CEPH_CAP_FILE_CACHE
))
4326 if ((issued
& CEPH_CAP_FILE_SHARED
) !=
4327 (had
& CEPH_CAP_FILE_SHARED
)) {
4328 if (issued
& CEPH_CAP_FILE_SHARED
)
4331 clear_dir_complete_and_ordered(in
, true);
4335 void Client::add_update_cap(Inode
*in
, MetaSession
*mds_session
, uint64_t cap_id
,
4336 unsigned issued
, unsigned wanted
, unsigned seq
, unsigned mseq
,
4337 inodeno_t realm
, int flags
, const UserPerm
& cap_perms
)
4339 if (!in
->is_any_caps()) {
4340 ceph_assert(in
->snaprealm
== 0);
4341 in
->snaprealm
= get_snap_realm(realm
);
4342 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4343 ldout(cct
, 15) << __func__
<< " first one, opened snaprealm " << in
->snaprealm
<< dendl
;
4345 ceph_assert(in
->snaprealm
);
4346 if ((flags
& CEPH_CAP_FLAG_AUTH
) &&
4347 realm
!= inodeno_t(-1) && in
->snaprealm
->ino
!= realm
) {
4348 in
->snaprealm_item
.remove_myself();
4349 auto oldrealm
= in
->snaprealm
;
4350 in
->snaprealm
= get_snap_realm(realm
);
4351 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4352 put_snap_realm(oldrealm
);
4356 mds_rank_t mds
= mds_session
->mds_num
;
4357 const auto &capem
= in
->caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
), std::forward_as_tuple(*in
, mds_session
));
4358 Cap
&cap
= capem
.first
->second
;
4359 if (!capem
.second
) {
4360 if (cap
.gen
< mds_session
->cap_gen
)
4361 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
4364 * auth mds of the inode changed. we received the cap export
4365 * message, but still haven't received the cap import message.
4366 * handle_cap_export() updated the new auth MDS' cap.
4368 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4369 * a message that was send before the cap import message. So
4370 * don't remove caps.
4372 if (ceph_seq_cmp(seq
, cap
.seq
) <= 0) {
4373 if (&cap
!= in
->auth_cap
)
4374 ldout(cct
, 0) << "WARNING: " << "inode " << *in
<< " caps on mds." << mds
<< " != auth_cap." << dendl
;
4376 ceph_assert(cap
.cap_id
== cap_id
);
4379 issued
|= cap
.issued
;
4380 flags
|= CEPH_CAP_FLAG_AUTH
;
4386 check_cap_issue(in
, issued
);
4388 if (flags
& CEPH_CAP_FLAG_AUTH
) {
4389 if (in
->auth_cap
!= &cap
&&
4390 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
4391 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
4392 ldout(cct
, 10) << __func__
<< " changing auth cap: "
4393 << "add myself to new auth MDS' flushing caps list" << dendl
;
4394 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
4396 in
->auth_cap
= &cap
;
4400 unsigned old_caps
= cap
.issued
;
4401 cap
.cap_id
= cap_id
;
4402 cap
.issued
= issued
;
4403 cap
.implemented
|= issued
;
4404 if (ceph_seq_cmp(mseq
, cap
.mseq
) > 0)
4405 cap
.wanted
= wanted
;
4407 cap
.wanted
|= wanted
;
4409 cap
.issue_seq
= seq
;
4411 cap
.gen
= mds_session
->cap_gen
;
4412 cap
.latest_perms
= cap_perms
;
4413 ldout(cct
, 10) << __func__
<< " issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
.issued
)
4414 << " from mds." << mds
4418 if ((issued
& ~old_caps
) && in
->auth_cap
== &cap
) {
4419 // non-auth MDS is revoking the newly grant caps ?
4420 for (auto &p
: in
->caps
) {
4421 if (&p
.second
== &cap
)
4423 if (p
.second
.implemented
& ~p
.second
.issued
& issued
) {
4424 check_caps(in
, CHECK_CAPS_NODELAY
);
4430 if (issued
& ~old_caps
)
4431 signal_cond_list(in
->waitfor_caps
);
4434 void Client::remove_cap(Cap
*cap
, bool queue_release
)
4436 auto &in
= cap
->inode
;
4437 MetaSession
*session
= cap
->session
;
4438 mds_rank_t mds
= cap
->session
->mds_num
;
4440 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " on " << in
<< dendl
;
4442 if (queue_release
) {
4443 session
->enqueue_cap_release(
4454 if (in
.auth_cap
== cap
) {
4455 if (in
.flushing_cap_item
.is_on_list()) {
4456 ldout(cct
, 10) << " removing myself from flushing_cap list" << dendl
;
4457 in
.flushing_cap_item
.remove_myself();
4461 size_t n
= in
.caps
.erase(mds
);
4462 ceph_assert(n
== 1);
4465 if (!in
.is_any_caps()) {
4466 ldout(cct
, 15) << __func__
<< " last one, closing snaprealm " << in
.snaprealm
<< dendl
;
4467 in
.snaprealm_item
.remove_myself();
4468 put_snap_realm(in
.snaprealm
);
4473 void Client::remove_all_caps(Inode
*in
)
4475 while (!in
->caps
.empty())
4476 remove_cap(&in
->caps
.begin()->second
, true);
4479 void Client::remove_session_caps(MetaSession
*s
, int err
)
4481 ldout(cct
, 10) << __func__
<< " mds." << s
->mds_num
<< dendl
;
4483 while (s
->caps
.size()) {
4484 Cap
*cap
= *s
->caps
.begin();
4485 InodeRef
in(&cap
->inode
);
4486 bool dirty_caps
= false;
4487 if (in
->auth_cap
== cap
) {
4488 dirty_caps
= in
->dirty_caps
| in
->flushing_caps
;
4489 in
->wanted_max_size
= 0;
4490 in
->requested_max_size
= 0;
4491 if (in
->has_any_filelocks())
4492 in
->flags
|= I_ERROR_FILELOCK
;
4494 auto caps
= cap
->implemented
;
4495 if (cap
->wanted
| cap
->issued
)
4496 in
->flags
|= I_CAP_DROPPED
;
4497 remove_cap(cap
, false);
4498 in
->cap_snaps
.clear();
4500 lderr(cct
) << __func__
<< " still has dirty|flushing caps on " << *in
<< dendl
;
4501 if (in
->flushing_caps
) {
4502 num_flushing_caps
--;
4503 in
->flushing_cap_tids
.clear();
4505 in
->flushing_caps
= 0;
4506 in
->mark_caps_clean();
4507 put_inode(in
.get());
4509 caps
&= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
;
4510 if (caps
&& !in
->caps_issued_mask(caps
, true)) {
4511 if (err
== -CEPHFS_EBLOCKLISTED
) {
4512 if (in
->oset
.dirty_or_tx
) {
4513 lderr(cct
) << __func__
<< " still has dirty data on " << *in
<< dendl
;
4514 in
->set_async_err(err
);
4516 objectcacher
->purge_set(&in
->oset
);
4518 objectcacher
->release_set(&in
->oset
);
4520 _schedule_invalidate_callback(in
.get(), 0, 0);
4523 signal_cond_list(in
->waitfor_caps
);
4525 s
->flushing_caps_tids
.clear();
4526 sync_cond
.notify_all();
4529 std::pair
<int, bool> Client::_do_remount(bool retry_on_error
)
4531 uint64_t max_retries
= cct
->_conf
.get_val
<uint64_t>("client_max_retries_on_remount_failure");
4532 bool abort_on_failure
= false;
4535 int r
= remount_cb(callback_handle
);
4537 retries_on_invalidate
= 0;
4540 client_t whoami
= get_nodeid();
4543 "failed to remount (to trim kernel dentries): "
4544 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4547 "failed to remount (to trim kernel dentries): "
4548 "return code = " << r
<< dendl
;
4551 (cct
->_conf
.get_val
<bool>("client_die_on_failed_remount") ||
4552 cct
->_conf
.get_val
<bool>("client_die_on_failed_dentry_invalidate")) &&
4553 !(retry_on_error
&& (++retries_on_invalidate
< max_retries
));
4554 if (should_abort
&& !is_unmounting()) {
4555 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4556 abort_on_failure
= true;
4559 return std::make_pair(r
, abort_on_failure
);
4562 class C_Client_Remount
: public Context
{
4566 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4567 void finish(int r
) override
{
4568 ceph_assert(r
== 0);
4569 auto result
= client
->_do_remount(true);
4570 if (result
.second
) {
4576 void Client::_invalidate_kernel_dcache()
4578 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4579 if (!mref_reader
.is_state_satisfied())
4582 if (can_invalidate_dentries
) {
4583 if (dentry_invalidate_cb
&& root
->dir
) {
4584 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4585 p
!= root
->dir
->dentries
.end();
4587 if (p
->second
->inode
)
4588 _schedule_invalidate_dentry_callback(p
->second
, false);
4591 } else if (remount_cb
) {
4593 // when remounting a file system, linux kernel trims all unused dentries in the fs
4594 remount_finisher
.queue(new C_Client_Remount(this));
4598 void Client::_trim_negative_child_dentries(InodeRef
& in
)
4604 if (dir
&& dir
->dentries
.size() == dir
->num_null_dentries
) {
4605 for (auto p
= dir
->dentries
.begin(); p
!= dir
->dentries
.end(); ) {
4606 Dentry
*dn
= p
->second
;
4608 ceph_assert(!dn
->inode
);
4609 if (dn
->lru_is_expireable())
4610 unlink(dn
, true, false); // keep dir, drop dentry
4612 if (dir
->dentries
.empty()) {
4617 if (in
->flags
& I_SNAPDIR_OPEN
) {
4618 InodeRef snapdir
= open_snapdir(in
.get());
4619 _trim_negative_child_dentries(snapdir
);
4623 class C_Client_CacheRelease
: public Context
{
4628 C_Client_CacheRelease(Client
*c
, Inode
*in
) :
4630 if (client
->use_faked_inos())
4631 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4635 void finish(int r
) override
{
4636 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4637 client
->_async_inode_release(ino
);
4641 void Client::_async_inode_release(vinodeno_t ino
)
4643 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4644 if (!mref_reader
.is_state_satisfied())
4647 ldout(cct
, 10) << __func__
<< " " << ino
<< dendl
;
4648 ino_release_cb(callback_handle
, ino
);
4651 void Client::_schedule_ino_release_callback(Inode
*in
) {
4654 // we queue the invalidate, which calls the callback and decrements the ref
4655 async_ino_releasor
.queue(new C_Client_CacheRelease(this, in
));
4658 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4660 mds_rank_t mds
= s
->mds_num
;
4661 size_t caps_size
= s
->caps
.size();
4662 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4663 << " caps " << caps_size
<< dendl
;
4665 uint64_t trimmed
= 0;
4666 auto p
= s
->caps
.begin();
4667 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4668 * looking at from getting deleted during traversal. */
4669 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4671 InodeRef
in(&cap
->inode
);
4673 // Increment p early because it will be invalidated if cap
4674 // is deleted inside remove_cap
4677 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4678 int mine
= cap
->issued
| cap
->implemented
;
4679 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4680 // disposable non-auth cap
4681 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4682 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4683 cap
= (remove_cap(cap
, true), nullptr);
4687 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4688 _trim_negative_child_dentries(in
);
4690 auto q
= in
->dentries
.begin();
4691 while (q
!= in
->dentries
.end()) {
4694 if (dn
->lru_is_expireable()) {
4695 if (can_invalidate_dentries
&&
4696 dn
->dir
->parent_inode
->ino
== CEPH_INO_ROOT
) {
4697 // Only issue one of these per DN for inodes in root: handle
4698 // others more efficiently by calling for root-child DNs at
4699 // the end of this function.
4700 _schedule_invalidate_dentry_callback(dn
, true);
4702 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4705 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4709 if (in
->ll_ref
== 1 && in
->ino
!= CEPH_INO_ROOT
) {
4710 _schedule_ino_release_callback(in
.get());
4712 if (all
&& in
->ino
!= CEPH_INO_ROOT
) {
4713 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4718 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4719 for (const auto &dn
: to_trim
) {
4724 caps_size
= s
->caps
.size();
4725 if (caps_size
> (size_t)max
)
4726 _invalidate_kernel_dcache();
4729 void Client::force_session_readonly(MetaSession
*s
)
4732 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4733 auto &in
= (*p
)->inode
;
4734 if (in
.caps_wanted() & CEPH_CAP_FILE_WR
)
4735 signal_cond_list(in
.waitfor_caps
);
4739 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4741 MetaSession
*session
= in
->auth_cap
->session
;
4743 int flushing
= in
->dirty_caps
;
4744 ceph_assert(flushing
);
4746 ceph_tid_t flush_tid
= ++last_flush_tid
;
4747 in
->flushing_cap_tids
[flush_tid
] = flushing
;
4749 if (!in
->flushing_caps
) {
4750 ldout(cct
, 10) << __func__
<< " " << ccap_string(flushing
) << " " << *in
<< dendl
;
4751 num_flushing_caps
++;
4753 ldout(cct
, 10) << __func__
<< " (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4756 in
->flushing_caps
|= flushing
;
4757 in
->mark_caps_clean();
4759 if (!in
->flushing_cap_item
.is_on_list())
4760 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4761 session
->flushing_caps_tids
.insert(flush_tid
);
4767 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4769 for (auto &p
: in
->cap_snaps
) {
4770 CapSnap
&capsnap
= p
.second
;
4771 if (capsnap
.flush_tid
> 0) {
4772 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4773 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4776 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4777 it
!= in
->flushing_cap_tids
.end();
4779 old_s
->flushing_caps_tids
.erase(it
->first
);
4780 new_s
->flushing_caps_tids
.insert(it
->first
);
4782 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4786 * Flush all the dirty caps back to the MDS. Because the callers
4787 * generally wait on the result of this function (syncfs and umount
4788 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4790 void Client::flush_caps_sync()
4792 ldout(cct
, 10) << __func__
<< dendl
;
4793 for (auto &q
: mds_sessions
) {
4795 xlist
<Inode
*>::iterator p
= s
->dirty_list
.begin();
4797 unsigned flags
= CHECK_CAPS_NODELAY
;
4802 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4803 check_caps(in
, flags
);
4808 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4810 while (in
->flushing_caps
) {
4811 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4812 ceph_assert(it
!= in
->flushing_cap_tids
.end());
4813 if (it
->first
> want
)
4815 ldout(cct
, 10) << __func__
<< " on " << *in
<< " flushing "
4816 << ccap_string(it
->second
) << " want " << want
4817 << " last " << it
->first
<< dendl
;
4818 wait_on_list(in
->waitfor_caps
);
4822 void Client::wait_sync_caps(ceph_tid_t want
)
4825 ldout(cct
, 10) << __func__
<< " want " << want
<< " (last is " << last_flush_tid
<< ", "
4826 << num_flushing_caps
<< " total flushing)" << dendl
;
4827 for (auto &p
: mds_sessions
) {
4829 if (s
->flushing_caps_tids
.empty())
4831 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4832 if (oldest_tid
<= want
) {
4833 ldout(cct
, 10) << " waiting on mds." << p
.first
<< " tid " << oldest_tid
4834 << " (want " << want
<< ")" << dendl
;
4835 std::unique_lock l
{client_lock
, std::adopt_lock
};
4843 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4845 in
->flags
&= ~I_KICK_FLUSH
;
4847 Cap
*cap
= in
->auth_cap
;
4848 ceph_assert(cap
->session
== session
);
4850 ceph_tid_t last_snap_flush
= 0;
4851 for (auto p
= in
->flushing_cap_tids
.rbegin();
4852 p
!= in
->flushing_cap_tids
.rend();
4855 last_snap_flush
= p
->first
;
4860 int wanted
= in
->caps_wanted();
4861 int used
= get_caps_used(in
) | in
->caps_dirty();
4862 auto it
= in
->cap_snaps
.begin();
4863 for (auto& p
: in
->flushing_cap_tids
) {
4865 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4866 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
4869 ceph_assert(it
!= in
->cap_snaps
.end());
4870 ceph_assert(it
->second
.flush_tid
== p
.first
);
4871 send_flush_snap(in
, session
, it
->first
, it
->second
);
4877 void Client::kick_flushing_caps(MetaSession
*session
)
4879 mds_rank_t mds
= session
->mds_num
;
4880 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4882 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4884 if (in
->flags
& I_KICK_FLUSH
) {
4885 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4886 kick_flushing_caps(in
, session
);
4891 void Client::early_kick_flushing_caps(MetaSession
*session
)
4893 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4895 Cap
*cap
= in
->auth_cap
;
4898 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4899 // stage. This guarantees that MDS processes the cap flush message before issuing
4900 // the flushing caps to other client.
4901 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
4902 in
->flags
|= I_KICK_FLUSH
;
4906 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4907 << " to mds." << session
->mds_num
<< dendl
;
4908 // send_reconnect() also will reset these sequence numbers. make sure
4909 // sequence numbers in cap flush message match later reconnect message.
4913 cap
->issued
= cap
->implemented
;
4915 kick_flushing_caps(in
, session
);
4919 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
4924 while (!q
.empty()) {
4928 ldout(cct
, 10) << __func__
<< " " << *realm
<< dendl
;
4929 realm
->invalidate_cache();
4931 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4932 p
!= realm
->pchildren
.end();
4938 SnapRealm
*Client::get_snap_realm(inodeno_t r
)
4940 SnapRealm
*realm
= snap_realms
[r
];
4942 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< ", nref was "
4943 << (realm
? realm
->nref
: 0) << dendl
;
4945 snap_realms
[r
] = realm
= new SnapRealm(r
);
4947 // Do not release the global snaprealm until unmounting.
4948 if (r
== CEPH_INO_GLOBAL_SNAPREALM
)
4953 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< ", nref now is "
4954 << realm
->nref
<< dendl
;
4958 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4960 if (snap_realms
.count(r
) == 0) {
4961 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
4964 SnapRealm
*realm
= snap_realms
[r
];
4965 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4970 void Client::put_snap_realm(SnapRealm
*realm
)
4972 ldout(cct
, 20) << __func__
<< " " << realm
->ino
<< " " << realm
4973 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
4974 if (--realm
->nref
== 0) {
4975 snap_realms
.erase(realm
->ino
);
4976 if (realm
->pparent
) {
4977 realm
->pparent
->pchildren
.erase(realm
);
4978 put_snap_realm(realm
->pparent
);
4984 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
4986 if (realm
->parent
!= parent
) {
4987 ldout(cct
, 10) << __func__
<< " " << *realm
4988 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
4989 realm
->parent
= parent
;
4990 if (realm
->pparent
) {
4991 realm
->pparent
->pchildren
.erase(realm
);
4992 put_snap_realm(realm
->pparent
);
4994 realm
->pparent
= get_snap_realm(parent
);
4995 realm
->pparent
->pchildren
.insert(realm
);
5001 static bool has_new_snaps(const SnapContext
& old_snapc
,
5002 const SnapContext
& new_snapc
)
5004 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
5008 void Client::update_snap_trace(const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
5010 SnapRealm
*first_realm
= NULL
;
5011 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
5013 map
<SnapRealm
*, SnapContext
> dirty_realms
;
5015 auto p
= bl
.cbegin();
5019 SnapRealm
*realm
= get_snap_realm(info
.ino());
5021 bool invalidate
= false;
5023 if (info
.seq() > realm
->seq
) {
5024 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
5028 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
5029 // flush me + children
5032 while (!q
.empty()) {
5033 SnapRealm
*realm
= q
.front();
5036 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
5037 p
!= realm
->pchildren
.end();
5041 if (dirty_realms
.count(realm
) == 0) {
5043 dirty_realms
[realm
] = realm
->get_snap_context();
5049 realm
->seq
= info
.seq();
5050 realm
->created
= info
.created();
5051 realm
->parent_since
= info
.parent_since();
5052 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
5053 realm
->my_snaps
= info
.my_snaps
;
5057 // _always_ verify parent
5058 if (adjust_realm_parent(realm
, info
.parent()))
5062 invalidate_snaprealm_and_children(realm
);
5063 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
5064 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
5066 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
5067 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
5071 first_realm
= realm
;
5073 put_snap_realm(realm
);
5076 for (auto &[realm
, snapc
] : dirty_realms
) {
5077 // if there are new snaps ?
5078 if (has_new_snaps(snapc
, realm
->get_snap_context())) {
5079 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
5080 for (auto&& in
: realm
->inodes_with_caps
) {
5081 queue_cap_snap(in
, snapc
);
5084 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
5086 put_snap_realm(realm
);
5090 *realm_ret
= first_realm
;
5092 put_snap_realm(first_realm
);
5095 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
5097 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
5098 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5100 std::scoped_lock
cl(client_lock
);
5101 auto session
= _get_mds_session(mds
, m
->get_connection().get());
5106 got_mds_push(session
.get());
5108 map
<Inode
*, SnapContext
> to_move
;
5109 SnapRealm
*realm
= 0;
5111 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
5112 ceph_assert(m
->head
.split
);
5114 auto p
= m
->bl
.cbegin();
5116 ceph_assert(info
.ino() == m
->head
.split
);
5118 // flush, then move, ino's.
5119 realm
= get_snap_realm(info
.ino());
5120 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
5121 for (auto& ino
: m
->split_inos
) {
5122 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
5123 if (inode_map
.count(vino
)) {
5124 Inode
*in
= inode_map
[vino
];
5125 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
5127 if (in
->snaprealm
->created
> info
.created()) {
5128 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
5129 << *in
->snaprealm
<< dendl
;
5132 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
5135 in
->snaprealm_item
.remove_myself();
5136 to_move
[in
] = in
->snaprealm
->get_snap_context();
5137 put_snap_realm(in
->snaprealm
);
5141 // move child snaprealms, too
5142 for (auto& child_realm
: m
->split_realms
) {
5143 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
5144 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
5147 adjust_realm_parent(child
, realm
->ino
);
5148 put_snap_realm(child
);
5152 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
5155 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
5156 Inode
*in
= p
->first
;
5157 in
->snaprealm
= realm
;
5158 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
5160 // queue for snap writeback
5161 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
5162 queue_cap_snap(in
, p
->second
);
5164 put_snap_realm(realm
);
5168 void Client::handle_quota(const MConstRef
<MClientQuota
>& m
)
5170 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5172 std::scoped_lock
cl(client_lock
);
5173 auto session
= _get_mds_session(mds
, m
->get_connection().get());
5178 got_mds_push(session
.get());
5180 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << mds
<< dendl
;
5182 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
5183 if (inode_map
.count(vino
)) {
5185 in
= inode_map
[vino
];
5188 in
->quota
= m
->quota
;
5189 in
->rstat
= m
->rstat
;
5194 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
5196 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5198 std::scoped_lock
cl(client_lock
);
5199 auto session
= _get_mds_session(mds
, m
->get_connection().get());
5204 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
5205 // Pause RADOS operations until we see the required epoch
5206 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
5209 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
5210 // Record the barrier so that we will transmit it to MDS when releasing
5211 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
5214 got_mds_push(session
.get());
5217 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
5218 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
5221 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
5222 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
5223 session
->enqueue_cap_release(
5230 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
5233 // in case the mds is waiting on e.g. a revocation
5234 flush_cap_releases();
5238 switch (m
->get_op()) {
5239 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
.get(), in
, m
);
5240 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
.get(), in
, m
);
5241 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
.get(), in
, m
);
5244 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
5245 Cap
&cap
= in
->caps
.at(mds
);
5247 switch (m
->get_op()) {
5248 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
.get(), in
, m
);
5249 case CEPH_CAP_OP_IMPORT
:
5250 case CEPH_CAP_OP_REVOKE
:
5251 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
.get(), in
, &cap
, m
);
5252 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
.get(), in
, &cap
, m
);
5255 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
5260 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5262 mds_rank_t mds
= session
->mds_num
;
5264 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5265 << " IMPORT from mds." << mds
<< dendl
;
5267 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
5270 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
5272 cap_perms
= cap
->latest_perms
;
5276 SnapRealm
*realm
= NULL
;
5277 update_snap_trace(m
->snapbl
, &realm
);
5279 int issued
= m
->get_caps();
5280 int wanted
= m
->get_wanted();
5281 add_update_cap(in
, session
, m
->get_cap_id(),
5282 issued
, wanted
, m
->get_seq(), m
->get_mseq(),
5283 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
5285 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
5286 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
5290 put_snap_realm(realm
);
5292 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
5293 if (!(wanted
& CEPH_CAP_ANY_FILE_WR
) ||
5294 in
->requested_max_size
> m
->get_max_size()) {
5295 in
->requested_max_size
= 0;
5296 ldout(cct
, 15) << "reset requested_max_size after cap import" << dendl
;
5298 // reflush any/all caps (if we are now the auth_cap)
5299 kick_flushing_caps(in
, session
);
5303 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5305 mds_rank_t mds
= session
->mds_num
;
5307 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5308 << " EXPORT from mds." << mds
<< dendl
;
5310 auto it
= in
->caps
.find(mds
);
5311 if (it
!= in
->caps
.end()) {
5312 Cap
&cap
= it
->second
;
5313 if (cap
.cap_id
== m
->get_cap_id()) {
5314 if (m
->peer
.cap_id
) {
5315 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
5316 auto tsession
= _get_or_open_mds_session(peer_mds
);
5317 auto it
= in
->caps
.find(peer_mds
);
5318 if (it
!= in
->caps
.end()) {
5319 Cap
&tcap
= it
->second
;
5320 if (tcap
.cap_id
== m
->peer
.cap_id
&&
5321 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
5322 tcap
.cap_id
= m
->peer
.cap_id
;
5323 tcap
.seq
= m
->peer
.seq
- 1;
5324 tcap
.issue_seq
= tcap
.seq
;
5325 tcap
.issued
|= cap
.issued
;
5326 tcap
.implemented
|= cap
.issued
;
5327 if (&cap
== in
->auth_cap
)
5328 in
->auth_cap
= &tcap
;
5329 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
5330 adjust_session_flushing_caps(in
, session
, tsession
.get());
5333 add_update_cap(in
, tsession
.get(), m
->peer
.cap_id
, cap
.issued
, 0,
5334 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
5335 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
5339 if (cap
.wanted
| cap
.issued
)
5340 in
->flags
|= I_CAP_DROPPED
;
5343 remove_cap(&cap
, false);
5348 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5350 mds_rank_t mds
= session
->mds_num
;
5351 ceph_assert(in
->caps
.count(mds
));
5353 ldout(cct
, 10) << __func__
<< " on ino " << *in
5354 << " size " << in
->size
<< " -> " << m
->get_size()
5358 in
->caps_issued(&issued
);
5359 issued
|= in
->caps_dirty();
5360 update_inode_file_size(in
, issued
, m
->get_size(),
5361 m
->get_truncate_seq(), m
->get_truncate_size());
5364 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5366 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5367 int dirty
= m
->get_dirty();
5371 auto it
= in
->flushing_cap_tids
.begin();
5372 if (it
->first
< flush_ack_tid
) {
5373 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
5374 << " got unexpected flush ack tid " << flush_ack_tid
5375 << " expected is " << it
->first
<< dendl
;
5377 for (; it
!= in
->flushing_cap_tids
.end(); ) {
5383 if (it
->first
== flush_ack_tid
)
5384 cleaned
= it
->second
;
5385 if (it
->first
<= flush_ack_tid
) {
5386 session
->flushing_caps_tids
.erase(it
->first
);
5387 in
->flushing_cap_tids
.erase(it
++);
5391 cleaned
&= ~it
->second
;
5397 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
5398 << " cleaned " << ccap_string(cleaned
) << " on " << *in
5399 << " with " << ccap_string(dirty
) << dendl
;
5402 signal_cond_list(in
->waitfor_caps
);
5403 if (session
->flushing_caps_tids
.empty() ||
5404 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5405 sync_cond
.notify_all();
5409 in
->cap_dirtier_uid
= -1;
5410 in
->cap_dirtier_gid
= -1;
5414 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5416 if (in
->flushing_caps
) {
5417 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5418 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5419 in
->flushing_caps
&= ~cleaned
;
5420 if (in
->flushing_caps
== 0) {
5421 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5422 num_flushing_caps
--;
5423 if (in
->flushing_cap_tids
.empty())
5424 in
->flushing_cap_item
.remove_myself();
5426 if (!in
->caps_dirty())
5433 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5435 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5436 mds_rank_t mds
= session
->mds_num
;
5437 ceph_assert(in
->caps
.count(mds
));
5438 snapid_t follows
= m
->get_snap_follows();
5440 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5441 auto& capsnap
= it
->second
;
5442 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5443 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
5445 InodeRef
tmp_ref(in
);
5446 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5447 << " on " << *in
<< dendl
;
5448 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5449 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5450 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5451 in
->flushing_cap_item
.remove_myself();
5452 in
->cap_snaps
.erase(it
);
5454 signal_cond_list(in
->waitfor_caps
);
5455 if (session
->flushing_caps_tids
.empty() ||
5456 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5457 sync_cond
.notify_all();
5460 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5461 << " on " << *in
<< dendl
;
5462 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5466 class C_Client_DentryInvalidate
: public Context
{
5473 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5474 client(c
), name(dn
->name
) {
5475 if (client
->use_faked_inos()) {
5476 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
5478 ino
.ino
= dn
->inode
->faked_ino
;
5480 dirino
= dn
->dir
->parent_inode
->vino();
5482 ino
= dn
->inode
->vino();
5485 ino
.ino
= inodeno_t();
5487 void finish(int r
) override
{
5488 // _async_dentry_invalidate is responsible for its own locking
5489 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
5490 client
->_async_dentry_invalidate(dirino
, ino
, name
);
5494 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
5496 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
5497 if (!mref_reader
.is_state_satisfied())
5500 ldout(cct
, 10) << __func__
<< " '" << name
<< "' ino " << ino
5501 << " in dir " << dirino
<< dendl
;
5502 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
.c_str(), name
.length());
5505 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
5507 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
5508 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
5511 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5513 int ref
= in
->get_nref();
5514 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5516 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5517 for (auto p
= in
->dir
->dentries
.begin();
5518 p
!= in
->dir
->dentries
.end(); ) {
5519 Dentry
*dn
= p
->second
;
5521 /* rmsnap removes whole subtree, need trim inodes recursively.
5522 * we don't need to invalidate dentries recursively. because
5523 * invalidating a directory dentry effectively invalidate
5525 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5526 _try_to_trim_inode(dn
->inode
.get(), false);
5528 if (dn
->lru_is_expireable())
5529 unlink(dn
, true, false); // keep dir, drop dentry
5531 if (in
->dir
->dentries
.empty()) {
5537 if (ref
> 1 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5538 InodeRef snapdir
= open_snapdir(in
);
5539 _try_to_trim_inode(snapdir
.get(), false);
5544 auto q
= in
->dentries
.begin();
5545 while (q
!= in
->dentries
.end()) {
5548 if( in
->ll_ref
> 0 && sched_inval
) {
5549 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5550 // so in->dentries doesn't always reflect the state of kernel's dcache.
5551 _schedule_invalidate_dentry_callback(dn
, true);
5553 unlink(dn
, true, true);
5558 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5560 mds_rank_t mds
= session
->mds_num
;
5561 int used
= get_caps_used(in
);
5562 int wanted
= in
->caps_wanted();
5565 const unsigned new_caps
= m
->get_caps();
5566 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5567 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5568 << " mds." << mds
<< " seq " << m
->get_seq()
5569 << " caps now " << ccap_string(new_caps
)
5570 << " was " << ccap_string(cap
->issued
)
5571 << (was_stale
? " (stale)" : "") << dendl
;
5574 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5575 cap
->seq
= m
->get_seq();
5576 cap
->gen
= session
->cap_gen
;
5578 check_cap_issue(in
, new_caps
);
5582 in
->caps_issued(&issued
);
5583 issued
|= in
->caps_dirty();
5585 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5586 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5587 in
->mode
= m
->head
.mode
;
5588 in
->uid
= m
->head
.uid
;
5589 in
->gid
= m
->head
.gid
;
5590 in
->btime
= m
->btime
;
5592 bool deleted_inode
= false;
5593 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5594 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5595 in
->nlink
= m
->head
.nlink
;
5597 deleted_inode
= true;
5599 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5600 m
->xattrbl
.length() &&
5601 m
->head
.xattr_version
> in
->xattr_version
) {
5602 auto p
= m
->xattrbl
.cbegin();
5603 decode(in
->xattrs
, p
);
5604 in
->xattr_version
= m
->head
.xattr_version
;
5607 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5608 in
->dirstat
.nfiles
= m
->get_nfiles();
5609 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5612 if (new_caps
& CEPH_CAP_ANY_RD
) {
5613 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5614 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5617 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5618 in
->layout
= m
->get_layout();
5619 update_inode_file_size(in
, issued
, m
->get_size(),
5620 m
->get_truncate_seq(), m
->get_truncate_size());
5623 if (m
->inline_version
> in
->inline_version
) {
5624 in
->inline_data
= m
->inline_data
;
5625 in
->inline_version
= m
->inline_version
;
5628 /* always take a newer change attr */
5629 if (m
->get_change_attr() > in
->change_attr
)
5630 in
->change_attr
= m
->get_change_attr();
5633 if (cap
== in
->auth_cap
&&
5634 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5635 (m
->get_max_size() != in
->max_size
)) {
5636 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5637 in
->max_size
= m
->get_max_size();
5638 if (in
->max_size
> in
->wanted_max_size
) {
5639 in
->wanted_max_size
= 0;
5640 in
->requested_max_size
= 0;
5645 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5646 (wanted
& ~(cap
->wanted
| new_caps
))) {
5647 // If mds is importing cap, prior cap messages that update 'wanted'
5648 // may get dropped by mds (migrate seq mismatch).
5650 // We don't send cap message to update 'wanted' if what we want are
5651 // already issued. If mds revokes caps, cap message that releases caps
5652 // also tells mds what we want. But if caps got revoked by mds forcedly
5653 // (session stale). We may haven't told mds what we want.
5659 auto revoked
= cap
->issued
& ~new_caps
;
5661 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5662 cap
->issued
= new_caps
;
5663 cap
->implemented
|= new_caps
;
5665 // recall delegations if we're losing caps necessary for them
5666 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5667 in
->recall_deleg(false);
5668 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5669 in
->recall_deleg(true);
5671 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5672 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5673 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5674 // waitin' for flush
5675 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5678 flags
= CHECK_CAPS_NODELAY
;
5681 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5683 flags
= CHECK_CAPS_NODELAY
;
5685 } else if (cap
->issued
== new_caps
) {
5686 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5688 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5689 cap
->issued
= new_caps
;
5690 cap
->implemented
|= new_caps
;
5692 if (cap
== in
->auth_cap
) {
5693 // non-auth MDS is revoking the newly grant caps ?
5694 for (const auto &p
: in
->caps
) {
5695 if (&p
.second
== cap
)
5697 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5706 check_caps(in
, flags
);
5710 signal_cond_list(in
->waitfor_caps
);
5712 // may drop inode's last ref
5714 _try_to_trim_inode(in
, true);
5717 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5719 if (perms
.uid() == 0) {
5720 // For directories, DACs are overridable.
5721 // For files, Read/write DACs are always overridable but executable DACs are
5722 // overridable when there is at least one exec bit set
5723 if(!S_ISDIR(in
->mode
) && (want
& MAY_EXEC
) && !(in
->mode
& S_IXUGO
))
5724 return -CEPHFS_EACCES
;
5728 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5729 int ret
= _posix_acl_permission(in
, perms
, want
);
5730 if (ret
!= -CEPHFS_EAGAIN
)
5734 // check permissions before doing anything else
5735 if (!in
->check_mode(perms
, want
))
5736 return -CEPHFS_EACCES
;
5740 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5741 const UserPerm
& perms
)
5743 int r
= _getattr_for_perm(in
, perms
);
5748 if (strncmp(name
, "system.", 7) == 0) {
5749 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5752 r
= inode_permission(in
, perms
, want
);
5755 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5759 std::ostream
& operator<<(std::ostream
&out
, const UserPerm
& perm
) {
5760 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5764 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5765 const UserPerm
& perms
)
5767 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5768 int r
= _getattr_for_perm(in
, perms
);
5772 if (mask
& CEPH_SETATTR_SIZE
) {
5773 r
= inode_permission(in
, perms
, MAY_WRITE
);
5779 if (mask
& CEPH_SETATTR_UID
) {
5780 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5783 if (mask
& CEPH_SETATTR_GID
) {
5784 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5785 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5789 if (mask
& CEPH_SETATTR_MODE
) {
5790 if (perms
.uid() != 0 && perms
.uid() != in
->uid
)
5793 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5794 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5795 stx
->stx_mode
&= ~S_ISGID
;
5798 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5799 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5800 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5801 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5802 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5803 check_mask
|= CEPH_SETATTR_MTIME
;
5804 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5805 check_mask
|= CEPH_SETATTR_ATIME
;
5806 if (check_mask
& mask
) {
5809 r
= inode_permission(in
, perms
, MAY_WRITE
);
5817 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5821 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5823 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5826 if ((flags
& O_ACCMODE
) == O_WRONLY
)
5828 else if ((flags
& O_ACCMODE
) == O_RDWR
)
5829 want
= MAY_READ
| MAY_WRITE
;
5830 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
5832 if (flags
& O_TRUNC
)
5836 switch (in
->mode
& S_IFMT
) {
5841 if (want
& MAY_WRITE
) {
5848 r
= _getattr_for_perm(in
, perms
);
5852 r
= inode_permission(in
, perms
, want
);
5854 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5858 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
5860 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5861 int r
= _getattr_for_perm(dir
, perms
);
5865 r
= inode_permission(dir
, perms
, MAY_EXEC
);
5867 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5871 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
5873 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5874 int r
= _getattr_for_perm(dir
, perms
);
5878 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5880 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5884 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
5886 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
5887 int r
= _getattr_for_perm(dir
, perms
);
5891 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5895 /* 'name == NULL' means rmsnap w/o permission checks */
5896 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
5898 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
5901 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
5905 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5909 int Client::may_delete(const char *relpath
, const UserPerm
& perms
) {
5910 ldout(cct
, 20) << __func__
<< " " << relpath
<< "; " << perms
<< dendl
;
5912 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
5913 if (!mref_reader
.is_state_satisfied())
5916 filepath
path(relpath
);
5917 string name
= path
.last_dentry();
5921 std::scoped_lock
lock(client_lock
);
5922 int r
= path_walk(path
, &dir
, perms
);
5925 if (cct
->_conf
->client_permissions
) {
5926 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
5934 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
5936 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5937 int r
= _getattr_for_perm(in
, perms
);
5941 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
5947 if (!S_ISREG(in
->mode
))
5950 if (in
->mode
& S_ISUID
)
5953 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
5956 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
5958 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5962 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
5964 int mask
= CEPH_STAT_CAP_MODE
;
5966 if (acl_type
!= NO_ACL
) {
5967 mask
|= CEPH_STAT_CAP_XATTR
;
5968 force
= in
->xattr_version
== 0;
5970 return _getattr(in
, mask
, perms
, force
);
5973 vinodeno_t
Client::_get_vino(Inode
*in
)
5975 /* The caller must hold the client lock */
5976 return vinodeno_t(in
->ino
, in
->snapid
);
5980 * Resolve an MDS spec to a list of MDS daemon GIDs.
5982 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5983 * It may be '*' in which case it matches all GIDs.
5985 * If no error is returned, the `targets` vector will be populated with at least
5988 int Client::resolve_mds(
5989 const std::string
&mds_spec
,
5990 std::vector
<mds_gid_t
> *targets
)
5993 ceph_assert(targets
!= nullptr);
5996 CachedStackStringStream css
;
5997 int role_r
= fsmap
->parse_role(mds_spec
, &role
, *css
);
5999 // We got a role, resolve it to a GID
6000 auto& info
= fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
);
6001 ldout(cct
, 10) << __func__
<< ": resolved " << mds_spec
<< " to role '"
6002 << role
<< "' aka " << info
.human_name() << dendl
;
6003 targets
->push_back(info
.global_id
);
6007 std::string strtol_err
;
6008 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
6009 if (strtol_err
.empty()) {
6010 // It is a possible GID
6011 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
6012 if (fsmap
->gid_exists(mds_gid
)) {
6013 auto& info
= fsmap
->get_info_gid(mds_gid
);
6014 ldout(cct
, 10) << __func__
<< ": validated gid " << mds_gid
<< " aka "
6015 << info
.human_name() << dendl
;
6016 targets
->push_back(mds_gid
);
6019 lderr(cct
) << __func__
<< ": gid " << mds_gid
<< " not in MDS map"
6021 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
6022 return -CEPHFS_ENOENT
;
6024 } else if (mds_spec
== "*") {
6025 // It is a wildcard: use all MDSs
6026 const auto& mds_info
= fsmap
->get_mds_info();
6028 ldout(cct
, 10) << __func__
<< ": resolving `*' to all MDS daemons" << dendl
;
6029 if (mds_info
.empty()) {
6030 lderr(cct
) << __func__
<< ": no MDS daemons found" << dendl
;
6031 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
6032 return -CEPHFS_ENOENT
;
6035 for (const auto& [gid
, info
] : mds_info
) {
6036 ldout(cct
, 10) << __func__
<< ": appending " << info
.human_name() << " to targets" << dendl
;
6037 targets
->push_back(gid
);
6041 // It did not parse as an integer, it is not a wildcard, it must be a name
6042 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
6044 lderr(cct
) << __func__
<< ": no MDS daemons found by name `" << mds_spec
<< "'" << dendl
;
6045 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
6046 return -CEPHFS_ENOENT
;
6048 auto& info
= fsmap
->get_info_gid(mds_gid
);
6049 ldout(cct
, 10) << __func__
<< ": resolved name '" << mds_spec
6050 << "' to " << info
.human_name() << dendl
;
6051 targets
->push_back(mds_gid
);
6059 * Authenticate with mon and establish global ID
6061 int Client::authenticate()
6063 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6065 if (monclient
->is_authenticated()) {
6069 client_lock
.unlock();
6070 int r
= monclient
->authenticate(std::chrono::duration
<double>(mount_timeout
).count());
6076 whoami
= monclient
->get_global_id();
6077 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
6082 int Client::fetch_fsmap(bool user
)
6084 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6086 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
6087 // rather than MDSMap because no one MDSMap contains all the daemons, and
6088 // a `tell` can address any daemon.
6089 version_t fsmap_latest
;
6092 client_lock
.unlock();
6093 std::tie(fsmap_latest
, std::ignore
) =
6094 monclient
->get_version("fsmap", ca::use_blocked
[ec
]);
6096 } while (ec
== bs::errc::resource_unavailable_try_again
);
6099 lderr(cct
) << "Failed to learn FSMap version: " << ec
<< dendl
;
6100 return ceph::from_error_code(ec
);
6103 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
6106 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
6107 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
6108 monclient
->renew_subs();
6109 wait_on_list(waiting_for_fsmap
);
6111 ceph_assert(fsmap_user
);
6112 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
6114 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
6115 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
6116 monclient
->renew_subs();
6117 wait_on_list(waiting_for_fsmap
);
6120 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
6122 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
6123 << fsmap_latest
<< dendl
;
6129 * @mds_spec one of ID, rank, GID, "*"
6132 int Client::mds_command(
6133 const std::string
&mds_spec
,
6134 const vector
<string
>& cmd
,
6135 const bufferlist
& inbl
,
6140 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
6141 if (!iref_reader
.is_state_satisfied())
6142 return -CEPHFS_ENOTCONN
;
6144 std::unique_lock
cl(client_lock
);
6152 r
= fetch_fsmap(false);
6157 // Look up MDS target(s) of the command
6158 std::vector
<mds_gid_t
> targets
;
6159 r
= resolve_mds(mds_spec
, &targets
);
6164 // If daemons are laggy, we won't send them commands. If all
6165 // are laggy then we fail.
6166 std::vector
<mds_gid_t
> non_laggy
;
6167 for (const auto& gid
: targets
) {
6168 const auto info
= fsmap
->get_info_gid(gid
);
6169 if (!info
.laggy()) {
6170 non_laggy
.push_back(gid
);
6173 if (non_laggy
.size() == 0) {
6174 *outs
= "All targeted MDS daemons are laggy";
6175 return -CEPHFS_ENOENT
;
6178 if (metadata
.empty()) {
6179 // We are called on an unmounted client, so metadata
6180 // won't be initialized yet.
6181 populate_metadata("");
6184 // Send commands to targets
6185 C_GatherBuilder
gather(cct
, onfinish
);
6186 for (const auto& target_gid
: non_laggy
) {
6187 const auto info
= fsmap
->get_info_gid(target_gid
);
6189 // Open a connection to the target MDS
6190 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
6194 std::scoped_lock
cmd_lock(command_lock
);
6195 // Generate MDSCommandOp state
6196 auto &op
= command_table
.start_command();
6198 op
.on_finish
= gather
.new_sub();
6203 op
.mds_gid
= target_gid
;
6206 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
6207 << " tid=" << op
.tid
<< cmd
<< dendl
;
6209 // Construct and send MCommand
6210 MessageRef m
= op
.get_message(monclient
->get_fsid());
6211 conn
->send_message2(std::move(m
));
6220 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
6222 ceph_tid_t
const tid
= m
->get_tid();
6224 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
6226 std::scoped_lock
cmd_lock(command_lock
);
6227 if (!command_table
.exists(tid
)) {
6228 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
6232 auto &op
= command_table
.get_command(tid
);
6234 *op
.outbl
= m
->get_data();
6241 op
.on_finish
->complete(m
->r
);
6244 command_table
.erase(tid
);
6247 // -------------------
6250 int Client::subscribe_mdsmap(const std::string
&fs_name
)
6252 int r
= authenticate();
6254 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
6258 std::string resolved_fs_name
;
6259 if (fs_name
.empty()) {
6260 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_fs");
6261 if (resolved_fs_name
.empty())
6262 // Try the backwards compatibility fs name option
6263 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
6265 resolved_fs_name
= fs_name
;
6268 std::string want
= "mdsmap";
6269 if (!resolved_fs_name
.empty()) {
6270 r
= fetch_fsmap(true);
6273 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
6274 if (fscid
== FS_CLUSTER_ID_NONE
) {
6275 return -CEPHFS_ENOENT
;
6278 std::ostringstream oss
;
6279 oss
<< want
<< "." << fscid
;
6282 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
6284 monclient
->sub_want(want
, 0, 0);
6285 monclient
->renew_subs();
6290 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
6291 bool require_mds
, const std::string
&fs_name
)
6293 ceph_assert(is_initialized());
6296 * To make sure that the _unmount() must wait until the mount()
6299 RWRef_t
mref_writer(mount_state
, CLIENT_MOUNTING
, false);
6300 if (!mref_writer
.is_first_writer()) // already mounting or mounted
6303 std::unique_lock
cl(client_lock
);
6305 int r
= subscribe_mdsmap(fs_name
);
6307 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
6311 start_tick_thread(); // start tick thread
6315 auto availability
= mdsmap
->is_cluster_available();
6316 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
6318 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
6319 return CEPH_FUSE_NO_MDS_UP
;
6320 } else if (availability
== MDSMap::AVAILABLE
) {
6321 // Continue to mount
6323 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
6324 // Else, wait. MDSMonitor will update the map to bring
6325 // us to a conclusion eventually.
6326 wait_on_list(waiting_for_mdsmap
);
6328 // Unexpected value!
6334 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
6336 filepath
fp(CEPH_INO_ROOT
);
6337 if (!mount_root
.empty()) {
6338 fp
= filepath(mount_root
.c_str());
6341 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6342 req
->set_filepath(fp
);
6343 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
6344 int res
= make_request(req
, perms
);
6346 if (res
== -CEPHFS_EACCES
&& root
) {
6347 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
6360 _ll_get(root
.get());
6363 if (!cct
->_conf
->client_trace
.empty()) {
6364 traceout
.open(cct
->_conf
->client_trace
.c_str());
6365 if (traceout
.is_open()) {
6366 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6368 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6373 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6374 ldout(cct, 3) << "op: struct stat st;" << dendl;
6375 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6376 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6377 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6378 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6379 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6380 ldout(cct, 3) << "op: int fd;" << dendl;
6383 mref_writer
.update_state(CLIENT_MOUNTED
);
6389 void Client::_close_sessions()
6391 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
6392 if (it
->second
->state
== MetaSession::STATE_REJECTED
)
6393 mds_sessions
.erase(it
++);
6398 while (!mds_sessions
.empty()) {
6399 // send session closes!
6400 for (auto &p
: mds_sessions
) {
6401 if (p
.second
->state
!= MetaSession::STATE_CLOSING
) {
6402 _close_mds_session(p
.second
.get());
6403 mds_ranks_closing
.insert(p
.first
);
6407 // wait for sessions to close
6408 double timo
= cct
->_conf
.get_val
<std::chrono::seconds
>("client_shutdown_timeout").count();
6409 ldout(cct
, 2) << "waiting for " << mds_ranks_closing
.size() << " mds session(s) to close (timeout: "
6410 << timo
<< "s)" << dendl
;
6411 std::unique_lock l
{client_lock
, std::adopt_lock
};
6414 } else if (!mount_cond
.wait_for(l
, ceph::make_timespan(timo
), [this] { return mds_ranks_closing
.empty(); })) {
6415 ldout(cct
, 1) << mds_ranks_closing
.size() << " mds(s) did not respond to session close -- timing out." << dendl
;
6416 while (!mds_ranks_closing
.empty()) {
6417 auto session
= mds_sessions
.at(*mds_ranks_closing
.begin());
6418 // this prunes entry from mds_sessions and mds_ranks_closing
6419 _closed_mds_session(session
.get(), -CEPHFS_ETIMEDOUT
);
6423 mds_ranks_closing
.clear();
6428 void Client::flush_mdlog_sync(Inode
*in
)
6430 if (in
->unsafe_ops
.empty()) {
6434 std::set
<mds_rank_t
> anchor
;
6435 for (auto &&p
: in
->unsafe_ops
) {
6436 anchor
.emplace(p
->mds
);
6439 anchor
.emplace(in
->auth_cap
->session
->mds_num
);
6442 for (auto &rank
: anchor
) {
6443 auto session
= &mds_sessions
.at(rank
);
6444 flush_mdlog(session
->get());
6448 void Client::flush_mdlog_sync()
6450 if (mds_requests
.empty())
6452 for (auto &p
: mds_sessions
) {
6453 flush_mdlog(p
.second
.get());
6457 void Client::flush_mdlog(MetaSession
*session
)
6459 // Only send this to Luminous or newer MDS daemons, older daemons
6460 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6461 const uint64_t features
= session
->con
->get_features();
6462 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
6463 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
6464 session
->con
->send_message2(std::move(m
));
6469 void Client::_abort_mds_sessions(int err
)
6471 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
6472 auto req
= p
->second
;
6474 // unsafe requests will be removed during close session below.
6475 if (req
->got_unsafe
)
6479 if (req
->caller_cond
) {
6481 req
->caller_cond
->notify_all();
6485 // Process aborts on any requests that were on this waitlist.
6486 // Any requests that were on a waiting_for_open session waitlist
6487 // will get kicked during close session below.
6488 signal_cond_list(waiting_for_mdsmap
);
6490 // Force-close all sessions
6491 while(!mds_sessions
.empty()) {
6492 auto session
= mds_sessions
.begin()->second
;
6493 _closed_mds_session(session
.get(), err
);
6497 void Client::_unmount(bool abort
)
6500 * We are unmounting the client.
6502 * Just declare the state to STATE_UNMOUNTING to block and fail
6503 * any new comming "reader" and then try to wait all the in-flight
6504 * "readers" to finish.
6506 RWRef_t
mref_writer(mount_state
, CLIENT_UNMOUNTING
, false);
6507 if (!mref_writer
.is_first_writer())
6509 mref_writer
.wait_readers_done();
6511 std::unique_lock lock
{client_lock
};
6513 if (abort
|| blocklisted
) {
6514 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blocklisted)") << dendl
;
6516 ldout(cct
, 2) << "unmounting" << dendl
;
6522 mount_aborted
= true;
6523 // Abort all mds sessions
6524 _abort_mds_sessions(-CEPHFS_ENOTCONN
);
6526 objecter
->op_cancel_writes(-CEPHFS_ENOTCONN
);
6528 // flush the mdlog for pending requests, if any
6532 mount_cond
.wait(lock
, [this] {
6533 if (!mds_requests
.empty()) {
6534 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests"
6537 return mds_requests
.empty();
6543 // clean up any unclosed files
6544 while (!fd_map
.empty()) {
6545 Fh
*fh
= fd_map
.begin()->second
;
6546 fd_map
.erase(fd_map
.begin());
6547 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6551 while (!ll_unclosed_fh_set
.empty()) {
6552 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6554 ll_unclosed_fh_set
.erase(fh
);
6555 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6559 while (!opened_dirs
.empty()) {
6560 dir_result_t
*dirp
= *opened_dirs
.begin();
6561 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6567 if (cct
->_conf
->client_oc
) {
6568 // flush/release all buffered data
6569 std::list
<InodeRef
> anchor
;
6570 for (auto& p
: inode_map
) {
6571 Inode
*in
= p
.second
;
6573 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6577 // prevent inode from getting freed
6578 anchor
.emplace_back(in
);
6580 if (abort
|| blocklisted
) {
6581 objectcacher
->purge_set(&in
->oset
);
6582 } else if (!in
->caps
.empty()) {
6584 _flush(in
, new C_Client_FlushComplete(this, in
));
6589 if (abort
|| blocklisted
) {
6590 for (auto &q
: mds_sessions
) {
6592 for (auto p
= s
->dirty_list
.begin(); !p
.end(); ) {
6595 if (in
->dirty_caps
) {
6596 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6597 in
->mark_caps_clean();
6604 wait_sync_caps(last_flush_tid
);
6612 while (lru
.lru_get_size() > 0 ||
6613 !inode_map
.empty()) {
6614 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6615 << "+" << inode_map
.size() << " items"
6616 << ", waiting (for caps to release?)"
6619 if (auto r
= mount_cond
.wait_for(lock
, ceph::make_timespan(5));
6620 r
== std::cv_status::timeout
) {
6624 ceph_assert(lru
.lru_get_size() == 0);
6625 ceph_assert(inode_map
.empty());
6628 if (!cct
->_conf
->client_trace
.empty()) {
6629 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6633 // stop the tick thread
6634 tick_thread_stopped
= true;
6635 upkeep_cond
.notify_one();
6639 // release the global snapshot realm
6640 SnapRealm
*global_realm
= snap_realms
[CEPH_INO_GLOBAL_SNAPREALM
];
6642 ceph_assert(global_realm
->nref
== 1);
6643 put_snap_realm(global_realm
);
6646 mref_writer
.update_state(CLIENT_UNMOUNTED
);
6649 * Stop the remount_queue before clearing the mountpoint memory
6650 * to avoid possible use-after-free bug.
6653 ldout(cct
, 10) << "unmount stopping remount finisher" << dendl
;
6654 remount_finisher
.wait_for_empty();
6655 remount_finisher
.stop();
6656 remount_cb
= nullptr;
6659 ldout(cct
, 2) << "unmounted." << dendl
;
6662 void Client::unmount()
6667 void Client::abort_conn()
6672 void Client::flush_cap_releases()
6674 uint64_t nr_caps
= 0;
6676 // send any cap releases
6677 for (auto &p
: mds_sessions
) {
6678 auto session
= p
.second
;
6679 if (session
->release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
6681 nr_caps
+= session
->release
->caps
.size();
6682 if (cct
->_conf
->client_inject_release_failure
) {
6683 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
6685 session
->con
->send_message2(std::move(session
->release
));
6687 session
->release
.reset();
6692 dec_pinned_icaps(nr_caps
);
6696 void Client::renew_and_flush_cap_releases()
6698 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6700 if (!mount_aborted
&& mdsmap
->get_epoch()) {
6702 auto el
= ceph::coarse_mono_clock::now() - last_cap_renew
;
6703 if (unlikely(utime_t(el
) > mdsmap
->get_session_timeout() / 3.0))
6706 flush_cap_releases();
6712 ldout(cct
, 20) << "tick" << dendl
;
6714 auto now
= ceph::coarse_mono_clock::now();
6717 * If the mount() is not finished
6719 if (is_mounting() && !mds_requests
.empty()) {
6720 MetaRequest
*req
= mds_requests
.begin()->second
;
6722 if (req
->created
+ mount_timeout
< now
) {
6723 req
->abort(-CEPHFS_ETIMEDOUT
);
6724 if (req
->caller_cond
) {
6726 req
->caller_cond
->notify_all();
6728 signal_cond_list(waiting_for_mdsmap
);
6729 for (auto &p
: mds_sessions
) {
6730 signal_context_list(p
.second
->waiting_for_open
);
6735 renew_and_flush_cap_releases();
6738 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6742 if (!mount_aborted
&& in
->hold_caps_until
> now
)
6744 delayed_list
.pop_front();
6746 check_caps(in
, CHECK_CAPS_NODELAY
);
6750 collect_and_send_metrics();
6752 delay_put_inodes(is_unmounting());
6755 if (blocklisted
&& (is_mounted() || is_unmounting()) &&
6756 last_auto_reconnect
+ std::chrono::seconds(30 * 60) < now
&&
6757 cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
6758 messenger
->client_reset();
6759 fd_gen
++; // invalidate open files
6760 blocklisted
= false;
6761 _kick_stale_sessions();
6762 last_auto_reconnect
= now
;
6766 void Client::start_tick_thread()
6768 upkeeper
= std::thread([this]() {
6769 using time
= ceph::coarse_mono_time
;
6770 using sec
= std::chrono::seconds
;
6772 auto last_tick
= time::min();
6774 std::unique_lock
cl(client_lock
);
6775 while (!tick_thread_stopped
) {
6776 auto now
= clock::now();
6777 auto since
= now
- last_tick
;
6779 auto t_interval
= clock::duration(cct
->_conf
.get_val
<sec
>("client_tick_interval"));
6780 auto d_interval
= clock::duration(cct
->_conf
.get_val
<sec
>("client_debug_inject_tick_delay"));
6782 auto interval
= std::max(t_interval
, d_interval
);
6783 if (likely(since
>= interval
*.90)) {
6785 last_tick
= clock::now();
6790 ldout(cct
, 20) << "upkeep thread waiting interval " << interval
<< dendl
;
6791 if (!tick_thread_stopped
)
6792 upkeep_cond
.wait_for(cl
, interval
);
6797 void Client::collect_and_send_metrics() {
6798 ldout(cct
, 20) << __func__
<< dendl
;
6800 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6802 // right now, we only track and send global metrics. its sufficient
6803 // to send these metrics to MDS rank0.
6804 collect_and_send_global_metrics();
6807 void Client::collect_and_send_global_metrics() {
6808 ldout(cct
, 20) << __func__
<< dendl
;
6809 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6811 if (!have_open_session((mds_rank_t
)0)) {
6812 ldout(cct
, 5) << __func__
<< ": no session with rank=0 -- not sending metric"
6816 auto session
= _get_or_open_mds_session((mds_rank_t
)0);
6817 if (!session
->mds_features
.test(CEPHFS_FEATURE_METRIC_COLLECT
)) {
6818 ldout(cct
, 5) << __func__
<< ": rank=0 does not support metrics" << dendl
;
6822 ClientMetricMessage metric
;
6823 std::vector
<ClientMetricMessage
> message
;
6826 if (_collect_and_send_global_metrics
||
6827 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_READ_LATENCY
)) {
6828 metric
= ClientMetricMessage(ReadLatencyPayload(logger
->tget(l_c_read
),
6829 logger
->tget(l_c_rd_avg
),
6830 logger
->get(l_c_rd_sqsum
),
6832 message
.push_back(metric
);
6836 if (_collect_and_send_global_metrics
||
6837 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_WRITE_LATENCY
)) {
6838 metric
= ClientMetricMessage(WriteLatencyPayload(logger
->tget(l_c_wrlat
),
6839 logger
->tget(l_c_wr_avg
),
6840 logger
->get(l_c_wr_sqsum
),
6842 message
.push_back(metric
);
6846 if (_collect_and_send_global_metrics
||
6847 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_METADATA_LATENCY
)) {
6848 metric
= ClientMetricMessage(MetadataLatencyPayload(logger
->tget(l_c_lat
),
6849 logger
->tget(l_c_md_avg
),
6850 logger
->get(l_c_md_sqsum
),
6851 nr_metadata_request
));
6852 message
.push_back(metric
);
6855 // cap hit ratio -- nr_caps is unused right now
6856 if (_collect_and_send_global_metrics
||
6857 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_CAP_INFO
)) {
6858 auto [cap_hits
, cap_misses
] = get_cap_hit_rates();
6859 metric
= ClientMetricMessage(CapInfoPayload(cap_hits
, cap_misses
, 0));
6860 message
.push_back(metric
);
6863 // dentry lease hit ratio
6864 if (_collect_and_send_global_metrics
||
6865 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_DENTRY_LEASE
)) {
6866 auto [dlease_hits
, dlease_misses
, nr
] = get_dlease_hit_rates();
6867 metric
= ClientMetricMessage(DentryLeasePayload(dlease_hits
, dlease_misses
, nr
));
6868 message
.push_back(metric
);
6872 if (_collect_and_send_global_metrics
||
6873 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_OPENED_FILES
)) {
6874 auto [opened_files
, total_inodes
] = get_opened_files_rates();
6875 metric
= ClientMetricMessage(OpenedFilesPayload(opened_files
, total_inodes
));
6876 message
.push_back(metric
);
6880 if (_collect_and_send_global_metrics
||
6881 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_PINNED_ICAPS
)) {
6882 auto [pinned_icaps
, total_inodes
] = get_pinned_icaps_rates();
6883 metric
= ClientMetricMessage(PinnedIcapsPayload(pinned_icaps
, total_inodes
));
6884 message
.push_back(metric
);
6888 if (_collect_and_send_global_metrics
||
6889 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_OPENED_INODES
)) {
6890 auto [opened_inodes
, total_inodes
] = get_opened_inodes_rates();
6891 metric
= ClientMetricMessage(OpenedInodesPayload(opened_inodes
, total_inodes
));
6892 message
.push_back(metric
);
6896 if (_collect_and_send_global_metrics
||
6897 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_READ_IO_SIZES
)) {
6898 metric
= ClientMetricMessage(ReadIoSizesPayload(total_read_ops
,
6900 message
.push_back(metric
);
6904 if (_collect_and_send_global_metrics
||
6905 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES
)) {
6906 metric
= ClientMetricMessage(WriteIoSizesPayload(total_write_ops
,
6908 message
.push_back(metric
);
6911 session
->con
->send_message2(make_message
<MClientMetrics
>(std::move(message
)));
6914 void Client::renew_caps()
6916 ldout(cct
, 10) << "renew_caps()" << dendl
;
6917 last_cap_renew
= ceph::coarse_mono_clock::now();
6919 for (auto &p
: mds_sessions
) {
6920 ldout(cct
, 15) << "renew_caps requesting from mds." << p
.first
<< dendl
;
6921 if (mdsmap
->get_state(p
.first
) >= MDSMap::STATE_REJOIN
)
6922 renew_caps(p
.second
.get());
6926 void Client::renew_caps(MetaSession
*session
)
6928 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
6929 session
->last_cap_renew_request
= ceph_clock_now();
6930 uint64_t seq
= ++session
->cap_renew_seq
;
6931 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
6935 // ===============================================================
6936 // high level (POSIXy) interface
6938 int Client::_do_lookup(Inode
*dir
, const string
& name
, int mask
,
6939 InodeRef
*target
, const UserPerm
& perms
)
6941 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_LOOKUPSNAP
: CEPH_MDS_OP_LOOKUP
;
6942 MetaRequest
*req
= new MetaRequest(op
);
6944 dir
->make_nosnap_relative_path(path
);
6945 path
.push_dentry(name
);
6946 req
->set_filepath(path
);
6947 req
->set_inode(dir
);
6948 if (cct
->_conf
->client_debug_getattr_caps
&& op
== CEPH_MDS_OP_LOOKUP
)
6949 mask
|= DEBUG_GETATTR_CAPS
;
6950 req
->head
.args
.getattr
.mask
= mask
;
6952 ldout(cct
, 10) << __func__
<< " on " << path
<< dendl
;
6954 int r
= make_request(req
, perms
, target
);
6955 ldout(cct
, 10) << __func__
<< " res is " << r
<< dendl
;
6959 bool Client::_dentry_valid(const Dentry
*dn
)
6961 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6963 // is dn lease valid?
6964 utime_t now
= ceph_clock_now();
6965 if (dn
->lease_mds
>= 0 && dn
->lease_ttl
> now
&&
6966 mds_sessions
.count(dn
->lease_mds
)) {
6967 auto s
= mds_sessions
.at(dn
->lease_mds
);
6968 if (s
->cap_ttl
> now
&& s
->cap_gen
== dn
->lease_gen
) {
6973 ldout(cct
, 20) << " bad lease, cap_ttl " << s
->cap_ttl
<< ", cap_gen " << s
->cap_gen
6974 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
6981 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
6982 const UserPerm
& perms
, std::string
* alternate_name
)
6986 bool did_lookup_request
= false;
6987 // can only request shared caps
6988 mask
&= CEPH_CAP_ANY_SHARED
| CEPH_STAT_RSTAT
;
6990 if (dname
== "..") {
6991 if (dir
->dentries
.empty()) {
6992 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
6993 filepath
path(dir
->ino
);
6994 req
->set_filepath(path
);
6997 int r
= make_request(req
, perms
, &tmptarget
, NULL
, rand() % mdsmap
->get_num_in_mds());
7000 *target
= std::move(tmptarget
);
7001 ldout(cct
, 8) << __func__
<< " found target " << (*target
)->ino
<< dendl
;
7007 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
7016 if (!dir
->is_dir()) {
7017 r
= -CEPHFS_ENOTDIR
;
7021 if (dname
.length() > NAME_MAX
) {
7022 r
= -CEPHFS_ENAMETOOLONG
;
7026 if (dname
== cct
->_conf
->client_snapdir
&&
7027 dir
->snapid
== CEPH_NOSNAP
) {
7028 *target
= open_snapdir(dir
);
7034 dir
->dir
->dentries
.count(dname
)) {
7035 dn
= dir
->dir
->dentries
[dname
];
7037 ldout(cct
, 20) << __func__
<< " have " << *dn
<< " from mds." << dn
->lease_mds
7038 << " ttl " << dn
->lease_ttl
<< " seq " << dn
->lease_seq
<< dendl
;
7040 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
7041 if (_dentry_valid(dn
)) {
7042 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
7043 // make trim_caps() behave.
7044 dir
->try_touch_cap(dn
->lease_mds
);
7048 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
7049 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
7050 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
7052 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
7053 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for "
7054 << *dir
<< " dn '" << dname
<< "'" << dendl
;
7055 return -CEPHFS_ENOENT
;
7059 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
7062 // can we conclude ENOENT locally?
7063 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
7064 (dir
->flags
& I_COMPLETE
)) {
7065 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
7066 return -CEPHFS_ENOENT
;
7070 if (did_lookup_request
) {
7074 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
7075 did_lookup_request
= true;
7077 /* complete lookup to get dentry for alternate_name */
7085 *target
= dn
->inode
;
7087 *alternate_name
= dn
->alternate_name
;
7096 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
7098 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
7102 int Client::get_or_create(Inode
*dir
, const char* name
,
7103 Dentry
**pdn
, bool expect_null
)
7106 ldout(cct
, 20) << __func__
<< " " << *dir
<< " name " << name
<< dendl
;
7108 if (dir
->dir
->dentries
.count(name
)) {
7109 Dentry
*dn
= dir
->dir
->dentries
[name
];
7110 if (_dentry_valid(dn
)) {
7112 return -CEPHFS_EEXIST
;
7116 // otherwise link up a new one
7117 *pdn
= link(dir
->dir
, name
, NULL
, NULL
);
7124 int Client::walk(std::string_view path
, walk_dentry_result
* wdr
, const UserPerm
& perms
, bool followsym
)
7126 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7127 if (!mref_reader
.is_state_satisfied())
7128 return -CEPHFS_ENOTCONN
;
7130 ldout(cct
, 10) << __func__
<< ": " << path
<< dendl
;
7132 std::scoped_lock
lock(client_lock
);
7134 return path_walk(path
, wdr
, perms
, followsym
);
7137 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
7138 const UserPerm
& perms
, bool followsym
, int mask
, InodeRef dirinode
)
7140 walk_dentry_result wdr
;
7141 int rc
= path_walk(origpath
, &wdr
, perms
, followsym
, mask
, dirinode
);
7142 *end
= std::move(wdr
.in
);
7146 int Client::path_walk(const filepath
& origpath
, walk_dentry_result
* result
, const UserPerm
& perms
,
7147 bool followsym
, int mask
, InodeRef dirinode
)
7149 filepath path
= origpath
;
7151 std::string alternate_name
;
7152 if (origpath
.absolute())
7161 ldout(cct
, 20) << __func__
<< " cur=" << *cur
<< dendl
;
7162 ldout(cct
, 10) << __func__
<< " " << path
<< dendl
;
7167 while (i
< path
.depth() && cur
) {
7169 const string
&dname
= path
[i
];
7170 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
7171 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
7173 if (cct
->_conf
->client_permissions
) {
7174 int r
= may_lookup(cur
.get(), perms
);
7177 caps
= CEPH_CAP_AUTH_SHARED
;
7180 /* Get extra requested caps on the last component */
7181 if (i
== (path
.depth() - 1))
7183 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
, &alternate_name
);
7186 // only follow trailing symlink if followsym. always follow
7187 // 'directory' symlinks.
7188 if (next
&& next
->is_symlink()) {
7190 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
7191 if (symlinks
> MAXSYMLINKS
) {
7192 return -CEPHFS_ELOOP
;
7195 if (i
< path
.depth() - 1) {
7197 // replace consumed components of path with symlink dir target
7198 filepath
resolved(next
->symlink
.c_str());
7199 resolved
.append(path
.postfixpath(i
+ 1));
7202 if (next
->symlink
[0] == '/') {
7206 } else if (followsym
) {
7207 if (next
->symlink
[0] == '/') {
7208 path
= next
->symlink
.c_str();
7213 filepath
more(next
->symlink
.c_str());
7214 // we need to remove the symlink component from off of the path
7215 // before adding the target that the symlink points to. remain
7216 // at the same position in the path.
7227 return -CEPHFS_ENOENT
;
7229 result
->in
= std::move(cur
);
7230 result
->alternate_name
= std::move(alternate_name
);
7238 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
, std::string alternate_name
)
7240 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7241 if (!mref_reader
.is_state_satisfied())
7242 return -CEPHFS_ENOTCONN
;
7244 tout(cct
) << "link" << std::endl
;
7245 tout(cct
) << relexisting
<< std::endl
;
7246 tout(cct
) << relpath
<< std::endl
;
7248 filepath
existing(relexisting
);
7252 std::scoped_lock
lock(client_lock
);
7253 int r
= path_walk(existing
, &in
, perm
, true);
7256 if (std::string(relpath
) == "/") {
7260 filepath
path(relpath
);
7261 string name
= path
.last_dentry();
7264 r
= path_walk(path
, &dir
, perm
, true);
7267 if (cct
->_conf
->client_permissions
) {
7268 if (S_ISDIR(in
->mode
)) {
7272 r
= may_hardlink(in
.get(), perm
);
7275 r
= may_create(dir
.get(), perm
);
7279 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
, std::move(alternate_name
));
7283 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
7285 return unlinkat(CEPHFS_AT_FDCWD
, relpath
, 0, perm
);
7288 int Client::unlinkat(int dirfd
, const char *relpath
, int flags
, const UserPerm
& perm
)
7290 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7291 if (!mref_reader
.is_state_satisfied()) {
7292 return -CEPHFS_ENOTCONN
;
7295 tout(cct
) << __func__
<< std::endl
;
7296 tout(cct
) << dirfd
<< std::endl
;
7297 tout(cct
) << relpath
<< std::endl
;
7298 tout(cct
) << flags
<< std::endl
;
7300 if (std::string(relpath
) == "/") {
7301 return flags
& AT_REMOVEDIR
? -CEPHFS_EBUSY
: -CEPHFS_EISDIR
;
7304 filepath
path(relpath
);
7305 string name
= path
.last_dentry();
7309 std::scoped_lock
lock(client_lock
);
7312 int r
= get_fd_inode(dirfd
, &dirinode
);
7317 r
= path_walk(path
, &dir
, perm
, true, 0, dirinode
);
7321 if (cct
->_conf
->client_permissions
) {
7322 r
= may_delete(dir
.get(), name
.c_str(), perm
);
7327 if (flags
& AT_REMOVEDIR
) {
7328 r
= _rmdir(dir
.get(), name
.c_str(), perm
);
7330 r
= _unlink(dir
.get(), name
.c_str(), perm
);
7335 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
, std::string alternate_name
)
7337 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7338 if (!mref_reader
.is_state_satisfied())
7339 return -CEPHFS_ENOTCONN
;
7341 tout(cct
) << __func__
<< std::endl
;
7342 tout(cct
) << relfrom
<< std::endl
;
7343 tout(cct
) << relto
<< std::endl
;
7345 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
7346 return -CEPHFS_EBUSY
;
7348 filepath
from(relfrom
);
7350 string fromname
= from
.last_dentry();
7352 string toname
= to
.last_dentry();
7355 InodeRef fromdir
, todir
;
7357 std::scoped_lock
lock(client_lock
);
7358 int r
= path_walk(from
, &fromdir
, perm
);
7361 r
= path_walk(to
, &todir
, perm
);
7365 if (cct
->_conf
->client_permissions
) {
7366 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
7369 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
7370 if (r
< 0 && r
!= -CEPHFS_ENOENT
)
7373 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
, std::move(alternate_name
));
7380 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
, std::string alternate_name
)
7382 return mkdirat(CEPHFS_AT_FDCWD
, relpath
, mode
, perm
, alternate_name
);
7385 int Client::mkdirat(int dirfd
, const char *relpath
, mode_t mode
, const UserPerm
& perm
,
7386 std::string alternate_name
)
7388 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7389 if (!mref_reader
.is_state_satisfied())
7390 return -CEPHFS_ENOTCONN
;
7392 tout(cct
) << __func__
<< std::endl
;
7393 tout(cct
) << dirfd
<< std::endl
;
7394 tout(cct
) << relpath
<< std::endl
;
7395 tout(cct
) << mode
<< std::endl
;
7396 ldout(cct
, 10) << __func__
<< ": " << relpath
<< dendl
;
7398 if (std::string(relpath
) == "/") {
7399 return -CEPHFS_EEXIST
;
7402 filepath
path(relpath
);
7403 string name
= path
.last_dentry();
7407 std::scoped_lock
lock(client_lock
);
7410 int r
= get_fd_inode(dirfd
, &dirinode
);
7415 r
= path_walk(path
, &dir
, perm
, true, 0, dirinode
);
7419 if (cct
->_conf
->client_permissions
) {
7420 r
= may_create(dir
.get(), perm
);
7425 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
, 0, {}, std::move(alternate_name
));
7428 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7430 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7431 if (!mref_reader
.is_state_satisfied())
7432 return -CEPHFS_ENOTCONN
;
7434 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
7435 tout(cct
) << __func__
<< std::endl
;
7436 tout(cct
) << relpath
<< std::endl
;
7437 tout(cct
) << mode
<< std::endl
;
7439 //get through existing parts of path
7440 filepath
path(relpath
);
7442 int r
= 0, caps
= 0;
7445 std::scoped_lock
lock(client_lock
);
7447 for (i
=0; i
<path
.depth(); ++i
) {
7448 if (cct
->_conf
->client_permissions
) {
7449 r
= may_lookup(cur
.get(), perms
);
7452 caps
= CEPH_CAP_AUTH_SHARED
;
7454 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
7459 if (r
!=-CEPHFS_ENOENT
) return r
;
7460 ldout(cct
, 20) << __func__
<< " got through " << i
<< " directories on path " << relpath
<< dendl
;
7461 //make new directory at each level
7462 for (; i
<path
.depth(); ++i
) {
7463 if (cct
->_conf
->client_permissions
) {
7464 r
= may_create(cur
.get(), perms
);
7469 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
7471 //check proper creation/existence
7472 if(-CEPHFS_EEXIST
== r
&& i
< path
.depth() - 1) {
7473 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
7477 //move to new dir and continue
7479 ldout(cct
, 20) << __func__
<< ": successfully created directory "
7480 << filepath(cur
->ino
).get_path() << dendl
;
7485 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
7487 return unlinkat(CEPHFS_AT_FDCWD
, relpath
, AT_REMOVEDIR
, perms
);
7490 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
7492 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7493 if (!mref_reader
.is_state_satisfied())
7494 return -CEPHFS_ENOTCONN
;
7496 tout(cct
) << __func__
<< std::endl
;
7497 tout(cct
) << relpath
<< std::endl
;
7498 tout(cct
) << mode
<< std::endl
;
7499 tout(cct
) << rdev
<< std::endl
;
7501 if (std::string(relpath
) == "/")
7502 return -CEPHFS_EEXIST
;
7504 filepath
path(relpath
);
7505 string name
= path
.last_dentry();
7509 std::scoped_lock
lock(client_lock
);
7510 int r
= path_walk(path
, &dir
, perms
);
7513 if (cct
->_conf
->client_permissions
) {
7514 int r
= may_create(dir
.get(), perms
);
7518 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
7523 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
, std::string alternate_name
)
7525 return symlinkat(target
, CEPHFS_AT_FDCWD
, relpath
, perms
, alternate_name
);
7528 int Client::symlinkat(const char *target
, int dirfd
, const char *relpath
, const UserPerm
& perms
,
7529 std::string alternate_name
)
7531 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7532 if (!mref_reader
.is_state_satisfied()) {
7533 return -CEPHFS_ENOTCONN
;
7536 tout(cct
) << __func__
<< std::endl
;
7537 tout(cct
) << target
<< std::endl
;
7538 tout(cct
) << dirfd
<< std::endl
;
7539 tout(cct
) << relpath
<< std::endl
;
7541 if (std::string(relpath
) == "/") {
7542 return -CEPHFS_EEXIST
;
7545 filepath
path(relpath
);
7546 string name
= path
.last_dentry();
7550 std::scoped_lock
lock(client_lock
);
7553 int r
= get_fd_inode(dirfd
, &dirinode
);
7557 r
= path_walk(path
, &dir
, perms
, true, 0, dirinode
);
7561 if (cct
->_conf
->client_permissions
) {
7562 int r
= may_create(dir
.get(), perms
);
7567 return _symlink(dir
.get(), name
.c_str(), target
, perms
, std::move(alternate_name
));
7570 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
7572 return readlinkat(CEPHFS_AT_FDCWD
, relpath
, buf
, size
, perms
);
7575 int Client::readlinkat(int dirfd
, const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
) {
7576 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7577 if (!mref_reader
.is_state_satisfied()) {
7578 return -CEPHFS_ENOTCONN
;
7581 tout(cct
) << __func__
<< std::endl
;
7582 tout(cct
) << dirfd
<< std::endl
;
7583 tout(cct
) << relpath
<< std::endl
;
7586 std::scoped_lock
lock(client_lock
);
7587 int r
= get_fd_inode(dirfd
, &dirinode
);
7593 filepath
path(relpath
);
7594 r
= path_walk(path
, &in
, perms
, false, 0, dirinode
);
7599 return _readlink(in
.get(), buf
, size
);
7602 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
7604 if (!in
->is_symlink())
7605 return -CEPHFS_EINVAL
;
7607 // copy into buf (at most size bytes)
7608 int r
= in
->symlink
.length();
7611 memcpy(buf
, in
->symlink
.c_str(), r
);
7618 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
7620 bool yes
= in
->caps_issued_mask(mask
, true);
7622 ldout(cct
, 10) << __func__
<< " mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
7626 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
7628 in
->make_nosnap_relative_path(path
);
7629 req
->set_filepath(path
);
7631 req
->head
.args
.getattr
.mask
= mask
;
7633 int res
= make_request(req
, perms
);
7634 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
7638 int Client::_getvxattr(
7640 const UserPerm
& perms
,
7641 const char *xattr_name
,
7646 if (!xattr_name
|| strlen(xattr_name
) <= 0 || strlen(xattr_name
) > 255) {
7647 return -CEPHFS_ENODATA
;
7650 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETVXATTR
);
7652 in
->make_nosnap_relative_path(path
);
7653 req
->set_filepath(path
);
7655 req
->set_string2(xattr_name
);
7658 int res
= make_request(req
, perms
, nullptr, nullptr, rank
, &bl
,
7659 CEPHFS_FEATURE_OP_GETVXATTR
);
7660 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
7663 if (res
== -CEPHFS_EOPNOTSUPP
) {
7664 return -CEPHFS_ENODATA
;
7670 auto p
= bl
.cbegin();
7676 ssize_t len
= buf
.length();
7678 res
= len
; // refer to man getxattr(2) for output buffer size == 0
7682 res
= -CEPHFS_ERANGE
; // insufficient output buffer space
7684 memcpy(value
, buf
.c_str(), len
);
7690 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7691 const UserPerm
& perms
, InodeRef
*inp
)
7693 int issued
= in
->caps_issued();
7694 union ceph_mds_request_args args
;
7695 bool kill_sguid
= false;
7698 ldout(cct
, 10) << __func__
<< " mask " << mask
<< " issued " <<
7699 ccap_string(issued
) << dendl
;
7701 if (in
->snapid
!= CEPH_NOSNAP
) {
7702 return -CEPHFS_EROFS
;
7704 if ((mask
& CEPH_SETATTR_SIZE
) &&
7705 (uint64_t)stx
->stx_size
> in
->size
&&
7706 is_quota_bytes_exceeded(in
, (uint64_t)stx
->stx_size
- in
->size
,
7708 return -CEPHFS_EDQUOT
;
7711 memset(&args
, 0, sizeof(args
));
7713 // make the change locally?
7714 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
7715 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
7716 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
7717 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
7718 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
7721 * This works because we implicitly flush the caps as part of the
7722 * request, so the cap update check will happen with the writeback
7723 * cap context, and then the setattr check will happen with the
7726 * In reality this pattern is likely pretty rare (different users
7727 * setattr'ing the same file). If that turns out not to be the
7728 * case later, we can build a more complex pipelined cap writeback
7731 mask
|= CEPH_SETATTR_CTIME
;
7735 // caller just needs us to bump the ctime
7736 in
->ctime
= ceph_clock_now();
7737 in
->cap_dirtier_uid
= perms
.uid();
7738 in
->cap_dirtier_gid
= perms
.gid();
7739 if (issued
& CEPH_CAP_AUTH_EXCL
)
7740 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7741 else if (issued
& CEPH_CAP_FILE_EXCL
)
7742 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7743 else if (issued
& CEPH_CAP_XATTR_EXCL
)
7744 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
7746 mask
|= CEPH_SETATTR_CTIME
;
7749 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7750 kill_sguid
= mask
& (CEPH_SETATTR_SIZE
|CEPH_SETATTR_KILL_SGUID
);
7752 mask
&= ~CEPH_SETATTR_KILL_SGUID
;
7753 } else if (mask
& CEPH_SETATTR_SIZE
) {
7754 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7755 mask
|= CEPH_SETATTR_KILL_SGUID
;
7756 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7759 if (mask
& CEPH_SETATTR_UID
) {
7760 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7762 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7763 in
->ctime
= ceph_clock_now();
7764 in
->cap_dirtier_uid
= perms
.uid();
7765 in
->cap_dirtier_gid
= perms
.gid();
7766 in
->uid
= stx
->stx_uid
;
7767 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7768 mask
&= ~CEPH_SETATTR_UID
;
7770 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7771 in
->uid
!= stx
->stx_uid
) {
7772 args
.setattr
.uid
= stx
->stx_uid
;
7773 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7775 mask
&= ~CEPH_SETATTR_UID
;
7779 if (mask
& CEPH_SETATTR_GID
) {
7780 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7782 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7783 in
->ctime
= ceph_clock_now();
7784 in
->cap_dirtier_uid
= perms
.uid();
7785 in
->cap_dirtier_gid
= perms
.gid();
7786 in
->gid
= stx
->stx_gid
;
7787 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7788 mask
&= ~CEPH_SETATTR_GID
;
7790 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7791 in
->gid
!= stx
->stx_gid
) {
7792 args
.setattr
.gid
= stx
->stx_gid
;
7793 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7795 mask
&= ~CEPH_SETATTR_GID
;
7799 if (mask
& CEPH_SETATTR_MODE
) {
7800 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7802 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7803 in
->ctime
= ceph_clock_now();
7804 in
->cap_dirtier_uid
= perms
.uid();
7805 in
->cap_dirtier_gid
= perms
.gid();
7806 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
7807 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7808 mask
&= ~CEPH_SETATTR_MODE
;
7809 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7810 in
->mode
!= stx
->stx_mode
) {
7811 args
.setattr
.mode
= stx
->stx_mode
;
7812 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7814 mask
&= ~CEPH_SETATTR_MODE
;
7816 } else if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
) &&
7817 kill_sguid
&& S_ISREG(in
->mode
) &&
7818 (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
7819 /* Must squash the any setuid/setgid bits with an ownership change */
7820 in
->mode
&= ~(S_ISUID
|S_ISGID
);
7821 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7824 if (mask
& CEPH_SETATTR_BTIME
) {
7825 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
7827 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7828 in
->ctime
= ceph_clock_now();
7829 in
->cap_dirtier_uid
= perms
.uid();
7830 in
->cap_dirtier_gid
= perms
.gid();
7831 in
->btime
= utime_t(stx
->stx_btime
);
7832 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7833 mask
&= ~CEPH_SETATTR_BTIME
;
7834 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7835 in
->btime
!= utime_t(stx
->stx_btime
)) {
7836 args
.setattr
.btime
= utime_t(stx
->stx_btime
);
7837 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7839 mask
&= ~CEPH_SETATTR_BTIME
;
7843 if (mask
& CEPH_SETATTR_SIZE
) {
7844 if ((uint64_t)stx
->stx_size
>= mdsmap
->get_max_filesize()) {
7846 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
7847 return -CEPHFS_EFBIG
;
7850 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
7851 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
) &&
7852 !(mask
& CEPH_SETATTR_KILL_SGUID
) &&
7853 stx
->stx_size
>= in
->size
) {
7854 if (stx
->stx_size
> in
->size
) {
7855 in
->size
= in
->reported_size
= stx
->stx_size
;
7856 in
->cap_dirtier_uid
= perms
.uid();
7857 in
->cap_dirtier_gid
= perms
.gid();
7858 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7859 mask
&= ~(CEPH_SETATTR_SIZE
);
7860 mask
|= CEPH_SETATTR_MTIME
;
7862 // ignore it when size doesn't change
7863 mask
&= ~(CEPH_SETATTR_SIZE
);
7866 args
.setattr
.size
= stx
->stx_size
;
7867 inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7872 if (mask
& CEPH_SETATTR_MTIME
) {
7873 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
7874 in
->mtime
= utime_t(stx
->stx_mtime
);
7875 in
->ctime
= ceph_clock_now();
7876 in
->cap_dirtier_uid
= perms
.uid();
7877 in
->cap_dirtier_gid
= perms
.gid();
7878 in
->time_warp_seq
++;
7879 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7880 mask
&= ~CEPH_SETATTR_MTIME
;
7881 } else if (in
->caps_issued_mask(CEPH_CAP_FILE_WR
) &&
7882 utime_t(stx
->stx_mtime
) > in
->mtime
) {
7883 in
->mtime
= utime_t(stx
->stx_mtime
);
7884 in
->ctime
= ceph_clock_now();
7885 in
->cap_dirtier_uid
= perms
.uid();
7886 in
->cap_dirtier_gid
= perms
.gid();
7887 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
7888 mask
&= ~CEPH_SETATTR_MTIME
;
7889 } else if (!in
->caps_issued_mask(CEPH_CAP_FILE_SHARED
) ||
7890 in
->mtime
!= utime_t(stx
->stx_mtime
)) {
7891 args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
7892 inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7895 mask
&= ~CEPH_SETATTR_MTIME
;
7899 if (mask
& CEPH_SETATTR_ATIME
) {
7900 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
7901 in
->atime
= utime_t(stx
->stx_atime
);
7902 in
->ctime
= ceph_clock_now();
7903 in
->cap_dirtier_uid
= perms
.uid();
7904 in
->cap_dirtier_gid
= perms
.gid();
7905 in
->time_warp_seq
++;
7906 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7907 mask
&= ~CEPH_SETATTR_ATIME
;
7908 } else if (in
->caps_issued_mask(CEPH_CAP_FILE_WR
) &&
7909 utime_t(stx
->stx_atime
) > in
->atime
) {
7910 in
->atime
= utime_t(stx
->stx_atime
);
7911 in
->ctime
= ceph_clock_now();
7912 in
->cap_dirtier_uid
= perms
.uid();
7913 in
->cap_dirtier_gid
= perms
.gid();
7914 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
7915 mask
&= ~CEPH_SETATTR_ATIME
;
7916 } else if (!in
->caps_issued_mask(CEPH_CAP_FILE_SHARED
) ||
7917 in
->atime
!= utime_t(stx
->stx_atime
)) {
7918 args
.setattr
.atime
= utime_t(stx
->stx_atime
);
7919 inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
7922 mask
&= ~CEPH_SETATTR_ATIME
;
7928 if (in
->is_dir() && in
->snapid
== CEPH_NOSNAP
) {
7929 vinodeno_t
vino(in
->ino
, CEPH_SNAPDIR
);
7930 if (inode_map
.count(vino
)) {
7931 refresh_snapdir_attrs(inode_map
[vino
], in
);
7937 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
7941 in
->make_nosnap_relative_path(path
);
7942 req
->set_filepath(path
);
7945 req
->head
.args
= args
;
7946 req
->inode_drop
= inode_drop
;
7947 req
->head
.args
.setattr
.mask
= mask
;
7948 req
->regetattr_mask
= mask
;
7950 int res
= make_request(req
, perms
, inp
);
7951 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
7955 /* Note that we only care about attrs that setattr cares about */
7956 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
7958 stx
->stx_size
= st
->st_size
;
7959 stx
->stx_mode
= st
->st_mode
;
7960 stx
->stx_uid
= st
->st_uid
;
7961 stx
->stx_gid
= st
->st_gid
;
7963 stx
->stx_mtime
= st
->st_mtimespec
;
7964 stx
->stx_atime
= st
->st_atimespec
;
7966 stx
->stx_mtime
.tv_sec
= st
->st_mtime
;
7967 stx
->stx_atime
.tv_sec
= st
->st_atime
;
7969 stx
->stx_mtime
= st
->st_mtim
;
7970 stx
->stx_atime
= st
->st_atim
;
7974 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7975 const UserPerm
& perms
, InodeRef
*inp
)
7977 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
7980 if (mask
& CEPH_SETATTR_MODE
)
7981 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
7985 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
7986 const UserPerm
& perms
)
7988 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
7989 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
7990 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
7991 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
7992 if (cct
->_conf
->client_permissions
) {
7993 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
7997 return __setattrx(in
.get(), stx
, mask
, perms
);
8000 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
8001 const UserPerm
& perms
)
8003 struct ceph_statx stx
;
8005 stat_to_statx(attr
, &stx
);
8006 mask
&= ~CEPH_SETATTR_BTIME
;
8008 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
8009 mask
&= ~CEPH_SETATTR_UID
;
8011 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
8012 mask
&= ~CEPH_SETATTR_GID
;
8015 return _setattrx(in
, &stx
, mask
, perms
);
8018 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
8019 const UserPerm
& perms
)
8021 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8022 if (!mref_reader
.is_state_satisfied())
8023 return -CEPHFS_ENOTCONN
;
8025 tout(cct
) << __func__
<< std::endl
;
8026 tout(cct
) << relpath
<< std::endl
;
8027 tout(cct
) << mask
<< std::endl
;
8029 filepath
path(relpath
);
8032 std::scoped_lock
lock(client_lock
);
8033 int r
= path_walk(path
, &in
, perms
);
8036 return _setattr(in
, attr
, mask
, perms
);
8039 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
8040 const UserPerm
& perms
, int flags
)
8042 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8043 if (!mref_reader
.is_state_satisfied())
8044 return -CEPHFS_ENOTCONN
;
8046 tout(cct
) << __func__
<< std::endl
;
8047 tout(cct
) << relpath
<< std::endl
;
8048 tout(cct
) << mask
<< std::endl
;
8050 filepath
path(relpath
);
8053 std::scoped_lock
lock(client_lock
);
8054 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
8057 return _setattrx(in
, stx
, mask
, perms
);
8060 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
8062 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8063 if (!mref_reader
.is_state_satisfied())
8064 return -CEPHFS_ENOTCONN
;
8066 tout(cct
) << __func__
<< std::endl
;
8067 tout(cct
) << fd
<< std::endl
;
8068 tout(cct
) << mask
<< std::endl
;
8070 std::scoped_lock
lock(client_lock
);
8071 Fh
*f
= get_filehandle(fd
);
8073 return -CEPHFS_EBADF
;
8074 #if defined(__linux__) && defined(O_PATH)
8075 if (f
->flags
& O_PATH
)
8076 return -CEPHFS_EBADF
;
8078 return _setattr(f
->inode
, attr
, mask
, perms
);
8081 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
8083 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8084 if (!mref_reader
.is_state_satisfied())
8085 return -CEPHFS_ENOTCONN
;
8087 tout(cct
) << __func__
<< std::endl
;
8088 tout(cct
) << fd
<< std::endl
;
8089 tout(cct
) << mask
<< std::endl
;
8091 std::scoped_lock
lock(client_lock
);
8092 Fh
*f
= get_filehandle(fd
);
8094 return -CEPHFS_EBADF
;
8095 #if defined(__linux__) && defined(O_PATH)
8096 if (f
->flags
& O_PATH
)
8097 return -CEPHFS_EBADF
;
8099 return _setattrx(f
->inode
, stx
, mask
, perms
);
8102 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
8103 frag_info_t
*dirstat
, int mask
)
8105 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8106 if (!mref_reader
.is_state_satisfied())
8107 return -CEPHFS_ENOTCONN
;
8109 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8110 tout(cct
) << "stat" << std::endl
;
8111 tout(cct
) << relpath
<< std::endl
;
8113 filepath
path(relpath
);
8116 std::scoped_lock
lock(client_lock
);
8117 int r
= path_walk(path
, &in
, perms
, true, mask
);
8120 r
= _getattr(in
, mask
, perms
);
8122 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
8125 fill_stat(in
, stbuf
, dirstat
);
8126 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8130 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
8134 /* The AT_STATX_FORCE_SYNC is always in higher priority than AT_STATX_DONT_SYNC. */
8135 if ((flags
& AT_STATX_SYNC_TYPE
) == AT_STATX_DONT_SYNC
)
8138 /* Always set PIN to distinguish from AT_STATX_DONT_SYNC case */
8139 mask
|= CEPH_CAP_PIN
;
8140 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
8141 mask
|= CEPH_CAP_AUTH_SHARED
;
8142 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
8143 mask
|= CEPH_CAP_LINK_SHARED
;
8144 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
8145 mask
|= CEPH_CAP_FILE_SHARED
;
8146 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
8147 mask
|= CEPH_CAP_XATTR_SHARED
;
8152 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
8153 const UserPerm
& perms
,
8154 unsigned int want
, unsigned int flags
)
8156 return statxat(CEPHFS_AT_FDCWD
, relpath
, stx
, perms
, want
, flags
);
8159 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
8160 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
8162 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8163 if (!mref_reader
.is_state_satisfied())
8164 return -CEPHFS_ENOTCONN
;
8166 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8167 tout(cct
) << __func__
<< std::endl
;
8168 tout(cct
) << relpath
<< std::endl
;
8170 filepath
path(relpath
);
8173 std::scoped_lock
lock(client_lock
);
8174 // don't follow symlinks
8175 int r
= path_walk(path
, &in
, perms
, false, mask
);
8178 r
= _getattr(in
, mask
, perms
);
8180 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
8183 fill_stat(in
, stbuf
, dirstat
);
8184 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8188 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
8190 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
8191 << " mode 0" << oct
<< in
->mode
<< dec
8192 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
8193 memset(st
, 0, sizeof(struct stat
));
8194 if (use_faked_inos())
8195 st
->st_ino
= in
->faked_ino
;
8197 st
->st_ino
= in
->ino
;
8198 st
->st_dev
= in
->snapid
;
8199 st
->st_mode
= in
->mode
;
8200 st
->st_rdev
= in
->rdev
;
8202 switch (in
->nlink
) {
8204 st
->st_nlink
= 0; /* dir is unlinked */
8207 st
->st_nlink
= 1 /* parent dentry */
8209 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
8215 st
->st_nlink
= in
->nlink
;
8217 st
->st_uid
= in
->uid
;
8218 st
->st_gid
= in
->gid
;
8219 if (in
->ctime
> in
->mtime
) {
8220 stat_set_ctime_sec(st
, in
->ctime
.sec());
8221 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
8223 stat_set_ctime_sec(st
, in
->mtime
.sec());
8224 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
8226 stat_set_atime_sec(st
, in
->atime
.sec());
8227 stat_set_atime_nsec(st
, in
->atime
.nsec());
8228 stat_set_mtime_sec(st
, in
->mtime
.sec());
8229 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
8231 if (cct
->_conf
->client_dirsize_rbytes
) {
8232 st
->st_size
= in
->rstat
.rbytes
;
8233 } else if (in
->snapid
== CEPH_SNAPDIR
) {
8234 SnapRealm
*realm
= get_snap_realm_maybe(in
->vino().ino
);
8236 st
->st_size
= realm
->my_snaps
.size();
8237 put_snap_realm(realm
);
8240 st
->st_size
= in
->dirstat
.size();
8242 // The Windows "stat" structure provides just a subset of the fields that are
8243 // available on Linux.
8248 st
->st_size
= in
->size
;
8250 st
->st_blocks
= (in
->size
+ 511) >> 9;
8254 st
->st_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
8258 *dirstat
= in
->dirstat
;
8262 return in
->caps_issued();
8265 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
8267 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
8268 << " mode 0" << oct
<< in
->mode
<< dec
8269 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< " change_attr " << in
->change_attr
<< dendl
;
8270 memset(stx
, 0, sizeof(struct ceph_statx
));
8273 * If mask is 0, then the caller set AT_STATX_DONT_SYNC. Reset the mask
8274 * so that all bits are set.
8279 /* These are always considered to be available */
8280 stx
->stx_dev
= in
->snapid
;
8281 stx
->stx_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
8283 /* Type bits are always set, even when CEPH_STATX_MODE is not */
8284 stx
->stx_mode
= S_IFMT
& in
->mode
;
8285 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (ino_t
)in
->ino
;
8286 stx
->stx_rdev
= in
->rdev
;
8287 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
8289 if (mask
& CEPH_CAP_AUTH_SHARED
) {
8290 stx
->stx_uid
= in
->uid
;
8291 stx
->stx_gid
= in
->gid
;
8292 stx
->stx_mode
= in
->mode
;
8293 in
->btime
.to_timespec(&stx
->stx_btime
);
8294 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
8297 if (mask
& CEPH_CAP_LINK_SHARED
) {
8299 switch (in
->nlink
) {
8301 stx
->stx_nlink
= 0; /* dir is unlinked */
8304 stx
->stx_nlink
= 1 /* parent dentry */
8306 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
8312 stx
->stx_nlink
= in
->nlink
;
8314 stx
->stx_mask
|= CEPH_STATX_NLINK
;
8317 if (mask
& CEPH_CAP_FILE_SHARED
) {
8319 in
->atime
.to_timespec(&stx
->stx_atime
);
8320 in
->mtime
.to_timespec(&stx
->stx_mtime
);
8323 if (cct
->_conf
->client_dirsize_rbytes
) {
8324 stx
->stx_size
= in
->rstat
.rbytes
;
8325 } else if (in
->snapid
== CEPH_SNAPDIR
) {
8326 SnapRealm
*realm
= get_snap_realm_maybe(in
->vino().ino
);
8328 stx
->stx_size
= realm
->my_snaps
.size();
8329 put_snap_realm(realm
);
8332 stx
->stx_size
= in
->dirstat
.size();
8334 stx
->stx_blocks
= 1;
8336 stx
->stx_size
= in
->size
;
8337 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
8339 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
8340 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
8343 /* Change time and change_attr both require all shared caps to view */
8344 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
8345 stx
->stx_version
= in
->change_attr
;
8346 if (in
->ctime
> in
->mtime
)
8347 in
->ctime
.to_timespec(&stx
->stx_ctime
);
8349 in
->mtime
.to_timespec(&stx
->stx_ctime
);
8350 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
8355 void Client::touch_dn(Dentry
*dn
)
8360 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
8362 return chmodat(CEPHFS_AT_FDCWD
, relpath
, mode
, 0, perms
);
8365 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
8367 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8368 if (!mref_reader
.is_state_satisfied())
8369 return -CEPHFS_ENOTCONN
;
8371 tout(cct
) << __func__
<< std::endl
;
8372 tout(cct
) << fd
<< std::endl
;
8373 tout(cct
) << mode
<< std::endl
;
8375 std::scoped_lock
lock(client_lock
);
8376 Fh
*f
= get_filehandle(fd
);
8378 return -CEPHFS_EBADF
;
8379 #if defined(__linux__) && defined(O_PATH)
8380 if (f
->flags
& O_PATH
)
8381 return -CEPHFS_EBADF
;
8384 attr
.st_mode
= mode
;
8385 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
8388 int Client::chmodat(int dirfd
, const char *relpath
, mode_t mode
, int flags
,
8389 const UserPerm
& perms
) {
8390 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8391 if (!mref_reader
.is_state_satisfied()) {
8392 return -CEPHFS_ENOTCONN
;
8395 tout(cct
) << __func__
<< std::endl
;
8396 tout(cct
) << dirfd
<< std::endl
;
8397 tout(cct
) << relpath
<< std::endl
;
8398 tout(cct
) << mode
<< std::endl
;
8399 tout(cct
) << flags
<< std::endl
;
8401 filepath
path(relpath
);
8405 std::scoped_lock
lock(client_lock
);
8406 int r
= get_fd_inode(dirfd
, &dirinode
);
8411 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8416 attr
.st_mode
= mode
;
8417 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
8420 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
8422 return chmodat(CEPHFS_AT_FDCWD
, relpath
, mode
, AT_SYMLINK_NOFOLLOW
, perms
);
8425 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8426 const UserPerm
& perms
)
8428 return chownat(CEPHFS_AT_FDCWD
, relpath
, new_uid
, new_gid
, 0, perms
);
8431 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
8433 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8434 if (!mref_reader
.is_state_satisfied())
8435 return -CEPHFS_ENOTCONN
;
8437 tout(cct
) << __func__
<< std::endl
;
8438 tout(cct
) << fd
<< std::endl
;
8439 tout(cct
) << new_uid
<< std::endl
;
8440 tout(cct
) << new_gid
<< std::endl
;
8442 std::scoped_lock
lock(client_lock
);
8443 Fh
*f
= get_filehandle(fd
);
8445 return -CEPHFS_EBADF
;
8446 #if defined(__linux__) && defined(O_PATH)
8447 if (f
->flags
& O_PATH
)
8448 return -CEPHFS_EBADF
;
8451 attr
.st_uid
= new_uid
;
8452 attr
.st_gid
= new_gid
;
8454 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
8455 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
8456 return _setattr(f
->inode
, &attr
, mask
, perms
);
8459 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8460 const UserPerm
& perms
)
8462 return chownat(CEPHFS_AT_FDCWD
, relpath
, new_uid
, new_gid
, AT_SYMLINK_NOFOLLOW
, perms
);
8465 int Client::chownat(int dirfd
, const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8466 int flags
, const UserPerm
& perms
) {
8467 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8468 if (!mref_reader
.is_state_satisfied()) {
8469 return -CEPHFS_ENOTCONN
;
8472 tout(cct
) << __func__
<< std::endl
;
8473 tout(cct
) << dirfd
<< std::endl
;
8474 tout(cct
) << relpath
<< std::endl
;
8475 tout(cct
) << new_uid
<< std::endl
;
8476 tout(cct
) << new_gid
<< std::endl
;
8477 tout(cct
) << flags
<< std::endl
;
8479 filepath
path(relpath
);
8483 std::scoped_lock
lock(client_lock
);
8484 int r
= get_fd_inode(dirfd
, &dirinode
);
8489 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8494 attr
.st_uid
= new_uid
;
8495 attr
.st_gid
= new_gid
;
8496 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
8499 static void attr_set_atime_and_mtime(struct stat
*attr
,
8500 const utime_t
&atime
,
8501 const utime_t
&mtime
)
8503 stat_set_atime_sec(attr
, atime
.tv
.tv_sec
);
8504 stat_set_atime_nsec(attr
, atime
.tv
.tv_nsec
);
8505 stat_set_mtime_sec(attr
, mtime
.tv
.tv_sec
);
8506 stat_set_mtime_nsec(attr
, mtime
.tv
.tv_nsec
);
8509 // for [l]utime() invoke the timeval variant as the timespec
8510 // variant are not yet implemented. for futime[s](), invoke
8511 // the timespec variant.
8512 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
8513 const UserPerm
& perms
)
8515 struct timeval tv
[2];
8516 tv
[0].tv_sec
= buf
->actime
;
8518 tv
[1].tv_sec
= buf
->modtime
;
8521 return utimes(relpath
, tv
, perms
);
8524 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
8525 const UserPerm
& perms
)
8527 struct timeval tv
[2];
8528 tv
[0].tv_sec
= buf
->actime
;
8530 tv
[1].tv_sec
= buf
->modtime
;
8533 return lutimes(relpath
, tv
, perms
);
8536 int Client::futime(int fd
, struct utimbuf
*buf
, const UserPerm
& perms
)
8538 struct timespec ts
[2];
8539 ts
[0].tv_sec
= buf
->actime
;
8541 ts
[1].tv_sec
= buf
->modtime
;
8544 return futimens(fd
, ts
, perms
);
8547 int Client::utimes(const char *relpath
, struct timeval times
[2],
8548 const UserPerm
& perms
)
8550 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8551 if (!mref_reader
.is_state_satisfied())
8552 return -CEPHFS_ENOTCONN
;
8554 tout(cct
) << __func__
<< std::endl
;
8555 tout(cct
) << relpath
<< std::endl
;
8556 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
8558 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
8561 filepath
path(relpath
);
8564 std::scoped_lock
lock(client_lock
);
8565 int r
= path_walk(path
, &in
, perms
);
8569 utime_t
atime(times
[0]);
8570 utime_t
mtime(times
[1]);
8572 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8573 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8576 int Client::lutimes(const char *relpath
, struct timeval times
[2],
8577 const UserPerm
& perms
)
8579 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8580 if (!mref_reader
.is_state_satisfied())
8581 return -CEPHFS_ENOTCONN
;
8583 tout(cct
) << __func__
<< std::endl
;
8584 tout(cct
) << relpath
<< std::endl
;
8585 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
8587 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
8590 filepath
path(relpath
);
8593 std::scoped_lock
lock(client_lock
);
8594 int r
= path_walk(path
, &in
, perms
, false);
8598 utime_t
atime(times
[0]);
8599 utime_t
mtime(times
[1]);
8601 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8602 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8605 int Client::futimes(int fd
, struct timeval times
[2], const UserPerm
& perms
)
8607 struct timespec ts
[2];
8608 ts
[0].tv_sec
= times
[0].tv_sec
;
8609 ts
[0].tv_nsec
= times
[0].tv_usec
* 1000;
8610 ts
[1].tv_sec
= times
[1].tv_sec
;
8611 ts
[1].tv_nsec
= times
[1].tv_usec
* 1000;
8613 return futimens(fd
, ts
, perms
);
8616 int Client::futimens(int fd
, struct timespec times
[2], const UserPerm
& perms
)
8618 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8619 if (!mref_reader
.is_state_satisfied())
8620 return -CEPHFS_ENOTCONN
;
8622 tout(cct
) << __func__
<< std::endl
;
8623 tout(cct
) << fd
<< std::endl
;
8624 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
8626 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
8629 std::scoped_lock
lock(client_lock
);
8630 Fh
*f
= get_filehandle(fd
);
8632 return -CEPHFS_EBADF
;
8633 #if defined(__linux__) && defined(O_PATH)
8634 if (f
->flags
& O_PATH
)
8635 return -CEPHFS_EBADF
;
8638 utime_t
atime(times
[0]);
8639 utime_t
mtime(times
[1]);
8641 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8642 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8645 int Client::utimensat(int dirfd
, const char *relpath
, struct timespec times
[2], int flags
,
8646 const UserPerm
& perms
) {
8647 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8648 if (!mref_reader
.is_state_satisfied()) {
8649 return -CEPHFS_ENOTCONN
;
8652 tout(cct
) << __func__
<< std::endl
;
8653 tout(cct
) << dirfd
<< std::endl
;
8654 tout(cct
) << relpath
<< std::endl
;
8655 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
8657 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
8659 tout(cct
) << flags
<< std::endl
;
8661 filepath
path(relpath
);
8665 std::scoped_lock
lock(client_lock
);
8666 int r
= get_fd_inode(dirfd
, &dirinode
);
8671 #if defined(__linux__) && defined(O_PATH)
8672 if (flags
& O_PATH
) {
8673 return -CEPHFS_EBADF
;
8677 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8682 utime_t
atime(times
[0]);
8683 utime_t
mtime(times
[1]);
8685 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8686 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8689 int Client::flock(int fd
, int operation
, uint64_t owner
)
8691 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8692 if (!mref_reader
.is_state_satisfied())
8693 return -CEPHFS_ENOTCONN
;
8695 tout(cct
) << __func__
<< std::endl
;
8696 tout(cct
) << fd
<< std::endl
;
8697 tout(cct
) << operation
<< std::endl
;
8698 tout(cct
) << owner
<< std::endl
;
8700 std::scoped_lock
lock(client_lock
);
8701 Fh
*f
= get_filehandle(fd
);
8703 return -CEPHFS_EBADF
;
8705 return _flock(f
, operation
, owner
);
8708 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
8710 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8711 if (!mref_reader
.is_state_satisfied())
8712 return -CEPHFS_ENOTCONN
;
8714 tout(cct
) << __func__
<< std::endl
;
8715 tout(cct
) << relpath
<< std::endl
;
8717 filepath
path(relpath
);
8720 std::scoped_lock
lock(client_lock
);
8721 int r
= path_walk(path
, &in
, perms
, true);
8724 if (cct
->_conf
->client_permissions
) {
8725 int r
= may_open(in
.get(), O_RDONLY
, perms
);
8729 r
= _opendir(in
.get(), dirpp
, perms
);
8730 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8731 if (r
!= -CEPHFS_ENOTDIR
)
8732 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
8736 int Client::fdopendir(int dirfd
, dir_result_t
**dirpp
, const UserPerm
&perms
) {
8737 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8738 if (!mref_reader
.is_state_satisfied()) {
8739 return -CEPHFS_ENOTCONN
;
8742 tout(cct
) << __func__
<< std::endl
;
8743 tout(cct
) << dirfd
<< std::endl
;
8746 std::scoped_lock
locker(client_lock
);
8747 int r
= get_fd_inode(dirfd
, &dirinode
);
8752 if (cct
->_conf
->client_permissions
) {
8753 r
= may_open(dirinode
.get(), O_RDONLY
, perms
);
8758 r
= _opendir(dirinode
.get(), dirpp
, perms
);
8759 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8760 if (r
!= -CEPHFS_ENOTDIR
) {
8761 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
8766 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
8769 return -CEPHFS_ENOTDIR
;
8770 *dirpp
= new dir_result_t(in
, perms
);
8771 opened_dirs
.insert(*dirpp
);
8772 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
8777 int Client::closedir(dir_result_t
*dir
)
8779 tout(cct
) << __func__
<< std::endl
;
8780 tout(cct
) << (uintptr_t)dir
<< std::endl
;
8782 ldout(cct
, 3) << __func__
<< "(" << dir
<< ") = 0" << dendl
;
8783 std::scoped_lock
lock(client_lock
);
8788 void Client::_closedir(dir_result_t
*dirp
)
8790 ldout(cct
, 10) << __func__
<< "(" << dirp
<< ")" << dendl
;
8793 ldout(cct
, 10) << __func__
<< " detaching inode " << dirp
->inode
<< dendl
;
8794 dirp
->inode
.reset();
8796 _readdir_drop_dirp_buffer(dirp
);
8797 opened_dirs
.erase(dirp
);
8801 void Client::rewinddir(dir_result_t
*dirp
)
8803 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ")" << dendl
;
8805 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8806 if (!mref_reader
.is_state_satisfied())
8809 std::scoped_lock
lock(client_lock
);
8810 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
8811 _readdir_drop_dirp_buffer(d
);
8815 loff_t
Client::telldir(dir_result_t
*dirp
)
8817 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
8818 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ") = " << d
->offset
<< dendl
;
8822 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
8824 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ", " << offset
<< ")" << dendl
;
8826 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8827 if (!mref_reader
.is_state_satisfied())
8830 std::scoped_lock
lock(client_lock
);
8832 if (offset
== dirp
->offset
)
8835 if (offset
> dirp
->offset
)
8836 dirp
->release_count
= 0; // bump if we do a forward seek
8838 dirp
->ordered_count
= 0; // disable filling readdir cache
8840 if (dirp
->hash_order()) {
8841 if (dirp
->offset
> offset
) {
8842 _readdir_drop_dirp_buffer(dirp
);
8847 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
8848 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
8849 _readdir_drop_dirp_buffer(dirp
);
8854 dirp
->offset
= offset
;
8859 // ino_t d_ino; /* inode number */
8860 // off_t d_off; /* offset to the next dirent */
8861 // unsigned short d_reclen; /* length of this record */
8862 // unsigned char d_type; /* type of file */
8863 // char d_name[256]; /* filename */
8865 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
8867 strncpy(de
->d_name
, name
, 255);
8868 de
->d_name
[255] = '\0';
8869 #if !defined(__CYGWIN__) && !(defined(_WIN32))
8871 #if !defined(__APPLE__) && !defined(__FreeBSD__)
8872 de
->d_off
= next_off
;
8875 de
->d_type
= IFTODT(type
);
8876 ldout(cct
, 10) << __func__
<< " '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
8877 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
8881 void Client::_readdir_next_frag(dir_result_t
*dirp
)
8883 frag_t fg
= dirp
->buffer_frag
;
8885 if (fg
.is_rightmost()) {
8886 ldout(cct
, 10) << __func__
<< " advance from " << fg
<< " to END" << dendl
;
8893 ldout(cct
, 10) << __func__
<< " advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
8895 if (dirp
->hash_order()) {
8897 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
8898 if (dirp
->offset
< new_offset
) // don't decrease offset
8899 dirp
->offset
= new_offset
;
8901 dirp
->last_name
.clear();
8902 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8903 _readdir_rechoose_frag(dirp
);
8907 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
8909 ceph_assert(dirp
->inode
);
8911 if (dirp
->hash_order())
8914 frag_t cur
= frag_t(dirp
->offset_high());
8915 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
8917 ldout(cct
, 10) << __func__
<< " frag " << cur
<< " maps to " << fg
<< dendl
;
8918 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8919 dirp
->last_name
.clear();
8920 dirp
->next_offset
= 2;
8924 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
8926 ldout(cct
, 10) << __func__
<< " " << dirp
<< dendl
;
8927 dirp
->buffer
.clear();
8930 int Client::_readdir_get_frag(dir_result_t
*dirp
)
8933 ceph_assert(dirp
->inode
);
8935 // get the current frag.
8937 if (dirp
->hash_order())
8938 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
8940 fg
= frag_t(dirp
->offset_high());
8942 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
8943 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
8945 int op
= CEPH_MDS_OP_READDIR
;
8946 if (dirp
->inode
&& dirp
->inode
->snapid
== CEPH_SNAPDIR
)
8947 op
= CEPH_MDS_OP_LSSNAP
;
8949 InodeRef
& diri
= dirp
->inode
;
8951 MetaRequest
*req
= new MetaRequest(op
);
8953 diri
->make_nosnap_relative_path(path
);
8954 req
->set_filepath(path
);
8955 req
->set_inode(diri
.get());
8956 req
->head
.args
.readdir
.frag
= fg
;
8957 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
8958 if (dirp
->last_name
.length()) {
8959 req
->path2
.set_path(dirp
->last_name
);
8960 } else if (dirp
->hash_order()) {
8961 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
8966 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
8968 if (res
== -CEPHFS_EAGAIN
) {
8969 ldout(cct
, 10) << __func__
<< " got EAGAIN, retrying" << dendl
;
8970 _readdir_rechoose_frag(dirp
);
8971 return _readdir_get_frag(dirp
);
8975 ldout(cct
, 10) << __func__
<< " " << dirp
<< " got frag " << dirp
->buffer_frag
8976 << " size " << dirp
->buffer
.size() << dendl
;
8978 ldout(cct
, 10) << __func__
<< " got error " << res
<< ", setting end flag" << dendl
;
8985 struct dentry_off_lt
{
8986 bool operator()(const Dentry
* dn
, int64_t off
) const {
8987 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
8991 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
8992 int caps
, bool getref
)
8994 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
8995 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
8996 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
8998 Dir
*dir
= dirp
->inode
->dir
;
9001 ldout(cct
, 10) << " dir is empty" << dendl
;
9006 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
9007 dir
->readdir_cache
.end(),
9008 dirp
->offset
, dentry_off_lt());
9013 if (!dirp
->inode
->is_complete_and_ordered())
9014 return -CEPHFS_EAGAIN
;
9015 if (pd
== dir
->readdir_cache
.end())
9018 if (dn
->inode
== NULL
) {
9019 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
9023 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
9024 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
9029 int idx
= pd
- dir
->readdir_cache
.begin();
9030 if (dn
->inode
->is_dir()) {
9031 mask
|= CEPH_STAT_RSTAT
;
9033 int r
= _getattr(dn
->inode
, mask
, dirp
->perms
);
9037 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
9038 pd
= dir
->readdir_cache
.begin() + idx
;
9039 if (pd
>= dir
->readdir_cache
.end() || *pd
!= dn
)
9040 return -CEPHFS_EAGAIN
;
9042 struct ceph_statx stx
;
9044 fill_statx(dn
->inode
, caps
, &stx
);
9046 uint64_t next_off
= dn
->offset
+ 1;
9047 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
9049 if (pd
== dir
->readdir_cache
.end())
9050 next_off
= dir_result_t::END
;
9054 in
= dn
->inode
.get();
9058 dn_name
= dn
->name
; // fill in name while we have lock
9060 client_lock
.unlock();
9061 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
9063 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
9064 << " = " << r
<< dendl
;
9069 dirp
->offset
= next_off
;
9071 dirp
->next_offset
= 2;
9073 dirp
->next_offset
= dirp
->offset_low();
9074 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
9075 dirp
->release_count
= 0; // last_name no longer match cache index
9080 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
9085 int Client::readdir_r_cb(dir_result_t
*d
, add_dirent_cb_t cb
, void *p
,
9086 unsigned want
, unsigned flags
, bool getref
)
9088 int caps
= statx_to_mask(flags
, want
);
9090 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9091 if (!mref_reader
.is_state_satisfied())
9092 return -CEPHFS_ENOTCONN
;
9094 std::unique_lock
cl(client_lock
);
9096 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
9098 ldout(cct
, 10) << __func__
<< " " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
9099 << dec
<< " at_end=" << dirp
->at_end()
9100 << " hash_order=" << dirp
->hash_order() << dendl
;
9103 struct ceph_statx stx
;
9104 memset(&de
, 0, sizeof(de
));
9105 memset(&stx
, 0, sizeof(stx
));
9107 InodeRef
& diri
= dirp
->inode
;
9112 if (dirp
->offset
== 0) {
9113 ldout(cct
, 15) << " including ." << dendl
;
9114 ceph_assert(diri
->dentries
.size() < 2); // can't have multiple hard-links to a dir
9115 uint64_t next_off
= 1;
9118 r
= _getattr(diri
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
9122 fill_statx(diri
, caps
, &stx
);
9123 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
9125 Inode
*inode
= NULL
;
9132 r
= cb(p
, &de
, &stx
, next_off
, inode
);
9137 dirp
->offset
= next_off
;
9141 if (dirp
->offset
== 1) {
9142 ldout(cct
, 15) << " including .." << dendl
;
9143 uint64_t next_off
= 2;
9145 if (diri
->dentries
.empty())
9148 in
= diri
->get_first_parent()->dir
->parent_inode
;
9151 r
= _getattr(in
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
9155 fill_statx(in
, caps
, &stx
);
9156 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
9158 Inode
*inode
= NULL
;
9165 r
= cb(p
, &de
, &stx
, next_off
, inode
);
9170 dirp
->offset
= next_off
;
9175 // can we read from our cache?
9176 ldout(cct
, 10) << "offset " << hex
<< dirp
->offset
<< dec
9177 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
9178 << dirp
->inode
->is_complete_and_ordered()
9179 << " issued " << ccap_string(dirp
->inode
->caps_issued())
9181 if (dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
9182 dirp
->inode
->is_complete_and_ordered() &&
9183 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
9184 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
9185 if (err
!= -CEPHFS_EAGAIN
)
9193 bool check_caps
= true;
9194 if (!dirp
->is_cached()) {
9195 int r
= _readdir_get_frag(dirp
);
9198 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
9199 // different than the requested one. (our dirfragtree was outdated)
9202 frag_t fg
= dirp
->buffer_frag
;
9204 ldout(cct
, 10) << "frag " << fg
<< " buffer size " << dirp
->buffer
.size()
9205 << " offset " << hex
<< dirp
->offset
<< dendl
;
9207 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
9208 dirp
->offset
, dir_result_t::dentry_off_lt());
9209 it
!= dirp
->buffer
.end();
9211 dir_result_t::dentry
&entry
= *it
;
9213 uint64_t next_off
= entry
.offset
+ 1;
9218 if(entry
.inode
->is_dir()){
9219 mask
|= CEPH_STAT_RSTAT
;
9221 r
= _getattr(entry
.inode
, mask
, dirp
->perms
);
9226 fill_statx(entry
.inode
, caps
, &stx
);
9227 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
9229 Inode
*inode
= NULL
;
9231 inode
= entry
.inode
.get();
9236 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
9239 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
9240 << " = " << r
<< dendl
;
9244 dirp
->offset
= next_off
;
9249 if (dirp
->next_offset
> 2) {
9250 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
9251 _readdir_drop_dirp_buffer(dirp
);
9255 if (!fg
.is_rightmost()) {
9257 _readdir_next_frag(dirp
);
9261 if (diri
->shared_gen
== dirp
->start_shared_gen
&&
9262 diri
->dir_release_count
== dirp
->release_count
) {
9263 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
9264 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
9266 ceph_assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
9267 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
9269 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
9271 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
9272 diri
->flags
|= I_COMPLETE
;
9284 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
9286 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
9293 * 1 if we got a dirent
9294 * 0 for end of directory
9298 struct single_readdir
{
9300 struct ceph_statx
*stx
;
9305 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
9306 struct ceph_statx
*stx
, off_t off
,
9309 single_readdir
*c
= static_cast<single_readdir
*>(p
);
9312 return -1; // already filled this dirent
9322 struct dirent
*Client::readdir(dir_result_t
*d
)
9332 // our callback fills the dirent and sets sr.full=true on first
9333 // call, and returns -1 the second time around.
9334 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
9336 errno
= -ret
; // this sucks.
9337 return (dirent
*) NULL
;
9342 return (dirent
*) NULL
;
9345 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
9346 struct ceph_statx
*stx
, unsigned want
,
9347 unsigned flags
, Inode
**out
)
9355 // our callback fills the dirent and sets sr.full=true on first
9356 // call, and returns -1 the second time around.
9357 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
// Accumulator for _getdents(): destination buffer, its size, the current
// write position, and whether full dirents or just names are packed.
struct getdents_result {
  char *buf;
  int buflen;
  int pos;
  bool fullent;
};
9376 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
9377 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
9379 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
9385 dlen
= strlen(de
->d_name
) + 1;
9387 if (c
->pos
+ dlen
> c
->buflen
)
9388 return -1; // doesn't fit
9391 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
9393 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
9399 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
9404 gr
.fullent
= fullent
;
9407 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
9409 if (r
< 0) { // some error
9410 if (r
== -1) { // buffer ran out of space
9411 if (gr
.pos
) { // but we got some entries already!
9413 } // or we need a larger buffer
9414 return -CEPHFS_ERANGE
;
9415 } else { // actual error, return it
9424 struct getdir_result
{
9425 list
<string
> *contents
;
9429 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
9431 getdir_result
*r
= static_cast<getdir_result
*>(p
);
9433 r
->contents
->push_back(de
->d_name
);
9438 int Client::getdir(const char *relpath
, list
<string
>& contents
,
9439 const UserPerm
& perms
)
9441 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
9442 tout(cct
) << "getdir" << std::endl
;
9443 tout(cct
) << relpath
<< std::endl
;
9446 int r
= opendir(relpath
, &d
, perms
);
9451 gr
.contents
= &contents
;
9453 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
9463 /****** file i/o **********/
9465 // common parts for open and openat. call with client_lock locked.
9466 int Client::create_and_open(int dirfd
, const char *relpath
, int flags
,
9467 const UserPerm
& perms
, mode_t mode
, int stripe_unit
,
9468 int stripe_count
, int object_size
, const char *data_pool
,
9469 std::string alternate_name
) {
9470 ceph_assert(ceph_mutex_is_locked(client_lock
));
9471 int cflags
= ceph_flags_sys2wire(flags
);
9472 tout(cct
) << cflags
<< std::endl
;
9476 #if defined(__linux__) && defined(O_PATH)
9477 /* When the O_PATH is being specified, others flags than O_DIRECTORY
9478 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
9479 * in kernel (fs/open.c). */
9481 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
9484 filepath
path(relpath
);
9486 bool created
= false;
9487 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
9488 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
9489 int mask
= ceph_caps_for_mode(ceph_flags_to_mode(cflags
));
9491 InodeRef dirinode
= nullptr;
9492 int r
= get_fd_inode(dirfd
, &dirinode
);
9497 r
= path_walk(path
, &in
, perms
, followsym
, mask
, dirinode
);
9498 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
9499 return -CEPHFS_EEXIST
;
9501 #if defined(__linux__) && defined(O_PATH)
9502 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
9504 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
9506 return -CEPHFS_ELOOP
;
9508 if (r
== -CEPHFS_ENOENT
&& (flags
& O_CREAT
)) {
9509 filepath dirpath
= path
;
9510 string dname
= dirpath
.last_dentry();
9511 dirpath
.pop_dentry();
9513 r
= path_walk(dirpath
, &dir
, perms
, true,
9514 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0, dirinode
);
9518 if (cct
->_conf
->client_permissions
) {
9519 r
= may_create(dir
.get(), perms
);
9523 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
9524 stripe_count
, object_size
, data_pool
, &created
, perms
,
9525 std::move(alternate_name
));
9531 // posix says we can only check permissions of existing files
9532 if (cct
->_conf
->client_permissions
) {
9533 r
= may_open(in
.get(), flags
, perms
);
9540 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
9542 // allocate a integer file descriptor
9545 ceph_assert(fd_map
.count(r
) == 0);
9553 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
9554 mode_t mode
, int stripe_unit
, int stripe_count
,
9555 int object_size
, const char *data_pool
, std::string alternate_name
)
9557 return openat(CEPHFS_AT_FDCWD
, relpath
, flags
, perms
, mode
, stripe_unit
,
9558 stripe_count
, object_size
, data_pool
, alternate_name
);
9561 int Client::openat(int dirfd
, const char *relpath
, int flags
, const UserPerm
& perms
,
9562 mode_t mode
, int stripe_unit
, int stripe_count
, int object_size
,
9563 const char *data_pool
, std::string alternate_name
) {
9564 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9565 if (!mref_reader
.is_state_satisfied()) {
9566 return -CEPHFS_ENOTCONN
;
9569 ldout(cct
, 3) << "openat enter(" << relpath
<< ")" << dendl
;
9570 tout(cct
) << dirfd
<< std::endl
;
9571 tout(cct
) << relpath
<< std::endl
;
9572 tout(cct
) << flags
<< std::endl
;
9573 tout(cct
) << mode
<< std::endl
;
9575 std::scoped_lock
locker(client_lock
);
9576 int r
= create_and_open(dirfd
, relpath
, flags
, perms
, mode
, stripe_unit
, stripe_count
,
9577 object_size
, data_pool
, alternate_name
);
9579 tout(cct
) << r
<< std::endl
;
9580 ldout(cct
, 3) << "openat exit(" << relpath
<< ")" << dendl
;
9584 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
9585 const UserPerm
& perms
)
9587 ldout(cct
, 3) << __func__
<< " enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
9589 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9590 if (!mref_reader
.is_state_satisfied())
9591 return -CEPHFS_ENOTCONN
;
9593 std::scoped_lock
lock(client_lock
);
9594 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
9596 req
->set_filepath(path
);
9598 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
9600 sprintf(f
, "%u", h
);
9601 filepath
path2(dirino
);
9602 path2
.push_dentry(string(f
));
9603 req
->set_filepath2(path2
);
9605 int r
= make_request(req
, perms
, NULL
, NULL
,
9606 rand() % mdsmap
->get_num_in_mds());
9607 ldout(cct
, 3) << __func__
<< " exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
9613 * Load inode into local cache.
9615 * If inode pointer is non-NULL, and take a reference on
9616 * the resulting Inode object in one operation, so that caller
9617 * can safely assume inode will still be there after return.
9619 int Client::_lookup_vino(vinodeno_t vino
, const UserPerm
& perms
, Inode
**inode
)
9621 ldout(cct
, 8) << __func__
<< " enter(" << vino
<< ")" << dendl
;
9623 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9624 if (!mref_reader
.is_state_satisfied())
9625 return -CEPHFS_ENOTCONN
;
9627 if (is_reserved_vino(vino
))
9628 return -CEPHFS_ESTALE
;
9630 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPINO
);
9631 filepath
path(vino
.ino
);
9632 req
->set_filepath(path
);
9635 * The MDS expects either a "real" snapid here or 0. The special value
9636 * carveouts for the snapid are all at the end of the range so we can
9637 * just look for any snapid below this value.
9639 if (vino
.snapid
< CEPH_NOSNAP
)
9640 req
->head
.args
.lookupino
.snapid
= vino
.snapid
;
9642 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
9643 if (r
== 0 && inode
!= NULL
) {
9644 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
9645 ceph_assert(p
!= inode_map
.end());
9649 ldout(cct
, 8) << __func__
<< " exit(" << vino
<< ") = " << r
<< dendl
;
9653 int Client::lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
9655 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
9656 std::scoped_lock
lock(client_lock
);
9657 return _lookup_vino(vino
, perms
, inode
);
9661 * Find the parent inode of `ino` and insert it into
9662 * our cache. Conditionally also set `parent` to a referenced
9663 * Inode* if caller provides non-NULL value.
9665 int Client::_lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
9667 ldout(cct
, 8) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
9669 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
9670 filepath
path(ino
->ino
);
9671 req
->set_filepath(path
);
9674 int r
= make_request(req
, perms
, &target
, NULL
, rand() % mdsmap
->get_num_in_mds());
9675 // Give caller a reference to the parent ino if they provided a pointer.
9676 if (parent
!= NULL
) {
9678 *parent
= target
.get();
9680 ldout(cct
, 8) << __func__
<< " found parent " << (*parent
)->ino
<< dendl
;
9685 ldout(cct
, 8) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
9690 * Populate the parent dentry for `ino`, provided it is
9691 * a child of `parent`.
9693 int Client::_lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
9695 ceph_assert(parent
->is_dir());
9696 ldout(cct
, 3) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
9698 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9699 if (!mref_reader
.is_state_satisfied())
9700 return -CEPHFS_ENOTCONN
;
9702 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
9703 req
->set_filepath2(filepath(parent
->ino
));
9704 req
->set_filepath(filepath(ino
->ino
));
9705 req
->set_inode(ino
);
9707 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
9708 ldout(cct
, 3) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
9712 int Client::lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
9714 std::scoped_lock
lock(client_lock
);
9715 return _lookup_name(ino
, parent
, perms
);
9718 Fh
*Client::_create_fh(Inode
*in
, int flags
, int cmode
, const UserPerm
& perms
)
9721 Fh
*f
= new Fh(in
, flags
, cmode
, fd_gen
, perms
);
9723 ldout(cct
, 10) << __func__
<< " " << in
->ino
<< " mode " << cmode
<< dendl
;
9725 if (in
->snapid
!= CEPH_NOSNAP
) {
9726 in
->snap_cap_refs
++;
9727 ldout(cct
, 5) << "open success, fh is " << f
<< " combined IMMUTABLE SNAP caps "
9728 << ccap_string(in
->caps_issued()) << dendl
;
9731 const auto& conf
= cct
->_conf
;
9732 f
->readahead
.set_trigger_requests(1);
9733 f
->readahead
.set_min_readahead_size(conf
->client_readahead_min
);
9734 uint64_t max_readahead
= Readahead::NO_LIMIT
;
9735 if (conf
->client_readahead_max_bytes
) {
9736 max_readahead
= std::min(max_readahead
, (uint64_t)conf
->client_readahead_max_bytes
);
9738 if (conf
->client_readahead_max_periods
) {
9739 max_readahead
= std::min(max_readahead
, in
->layout
.get_period()*(uint64_t)conf
->client_readahead_max_periods
);
9741 f
->readahead
.set_max_readahead_size(max_readahead
);
9742 vector
<uint64_t> alignments
;
9743 alignments
.push_back(in
->layout
.get_period());
9744 alignments
.push_back(in
->layout
.stripe_unit
);
9745 f
->readahead
.set_alignments(alignments
);
9750 int Client::_release_fh(Fh
*f
)
9752 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
9753 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
9754 Inode
*in
= f
->inode
.get();
9755 ldout(cct
, 8) << __func__
<< " " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
9759 if (in
->snapid
== CEPH_NOSNAP
) {
9760 if (in
->put_open_ref(f
->mode
)) {
9761 _flush(in
, new C_Client_FlushComplete(this, in
));
9765 ceph_assert(in
->snap_cap_refs
> 0);
9766 in
->snap_cap_refs
--;
9769 _release_filelocks(f
);
9771 // Finally, read any async err (i.e. from flushes)
9772 int err
= f
->take_async_err();
9774 ldout(cct
, 1) << __func__
<< " " << f
<< " on inode " << *in
<< " caught async_err = "
9775 << cpp_strerror(err
) << dendl
;
9777 ldout(cct
, 10) << __func__
<< " " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
9785 void Client::_put_fh(Fh
*f
)
9787 int left
= f
->put();
9793 int Client::_open(Inode
*in
, int flags
, mode_t mode
, Fh
**fhp
,
9794 const UserPerm
& perms
)
9796 if (in
->snapid
!= CEPH_NOSNAP
&&
9797 (flags
& (O_WRONLY
| O_RDWR
| O_CREAT
| O_TRUNC
| O_APPEND
))) {
9798 return -CEPHFS_EROFS
;
9801 // use normalized flags to generate cmode
9802 int cflags
= ceph_flags_sys2wire(flags
);
9803 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
9804 cflags
|= CEPH_O_LAZY
;
9806 int cmode
= ceph_flags_to_mode(cflags
);
9807 int want
= ceph_caps_for_mode(cmode
);
9810 in
->get_open_ref(cmode
); // make note of pending open, since it effects _wanted_ caps.
9812 if ((flags
& O_TRUNC
) == 0 && in
->caps_issued_mask(want
)) {
9814 check_caps(in
, CHECK_CAPS_NODELAY
);
9817 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
9819 in
->make_nosnap_relative_path(path
);
9820 req
->set_filepath(path
);
9821 req
->head
.args
.open
.flags
= cflags
& ~CEPH_O_CREAT
;
9822 req
->head
.args
.open
.mode
= mode
;
9823 req
->head
.args
.open
.pool
= -1;
9824 if (cct
->_conf
->client_debug_getattr_caps
)
9825 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
9827 req
->head
.args
.open
.mask
= 0;
9828 req
->head
.args
.open
.old_size
= in
->size
; // for O_TRUNC
9830 result
= make_request(req
, perms
);
9833 * NFS expects that delegations will be broken on a conflicting open,
9834 * not just when there is actual conflicting access to the file. SMB leases
9835 * and oplocks also have similar semantics.
9837 * Ensure that clients that have delegations enabled will wait on minimal
9838 * caps during open, just to ensure that other clients holding delegations
9839 * return theirs first.
9841 if (deleg_timeout
&& result
== 0) {
9844 if (cmode
& CEPH_FILE_MODE_WR
)
9845 need
|= CEPH_CAP_FILE_WR
;
9846 if (cmode
& CEPH_FILE_MODE_RD
)
9847 need
|= CEPH_CAP_FILE_RD
;
9849 Fh
fh(in
, flags
, cmode
, fd_gen
, perms
);
9850 result
= get_caps(&fh
, need
, want
, &have
, -1);
9852 ldout(cct
, 8) << "Unable to get caps after open of inode " << *in
<<
9853 " . Denying open: " <<
9854 cpp_strerror(result
) << dendl
;
9856 put_cap_ref(in
, need
);
9864 *fhp
= _create_fh(in
, flags
, cmode
, perms
);
9866 in
->put_open_ref(cmode
);
9874 int Client::_renew_caps(Inode
*in
)
9876 int wanted
= in
->caps_file_wanted();
9877 if (in
->is_any_caps() &&
9878 ((wanted
& CEPH_CAP_ANY_WR
) == 0 || in
->auth_cap
)) {
9879 check_caps(in
, CHECK_CAPS_NODELAY
);
9884 if ((wanted
& CEPH_CAP_FILE_RD
) && (wanted
& CEPH_CAP_FILE_WR
))
9886 else if (wanted
& CEPH_CAP_FILE_RD
)
9888 else if (wanted
& CEPH_CAP_FILE_WR
)
9891 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
9893 in
->make_nosnap_relative_path(path
);
9894 req
->set_filepath(path
);
9895 req
->head
.args
.open
.flags
= flags
;
9896 req
->head
.args
.open
.pool
= -1;
9897 if (cct
->_conf
->client_debug_getattr_caps
)
9898 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
9900 req
->head
.args
.open
.mask
= 0;
9903 // duplicate in case Cap goes away; not sure if that race is a concern?
9904 const UserPerm
*pperm
= in
->get_best_perms();
9908 int ret
= make_request(req
, perms
);
9912 int Client::_close(int fd
)
9914 ldout(cct
, 3) << "close enter(" << fd
<< ")" << dendl
;
9915 tout(cct
) << "close" << std::endl
;
9916 tout(cct
) << fd
<< std::endl
;
9918 Fh
*fh
= get_filehandle(fd
);
9920 return -CEPHFS_EBADF
;
9921 int err
= _release_fh(fh
);
9924 ldout(cct
, 3) << "close exit(" << fd
<< ")" << dendl
;
9928 int Client::close(int fd
) {
9929 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9930 if (!mref_reader
.is_state_satisfied())
9931 return -CEPHFS_ENOTCONN
;
9933 std::scoped_lock
lock(client_lock
);
9940 loff_t
Client::lseek(int fd
, loff_t offset
, int whence
)
9942 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9943 if (!mref_reader
.is_state_satisfied())
9944 return -CEPHFS_ENOTCONN
;
9946 tout(cct
) << "lseek" << std::endl
;
9947 tout(cct
) << fd
<< std::endl
;
9948 tout(cct
) << offset
<< std::endl
;
9949 tout(cct
) << whence
<< std::endl
;
9951 std::scoped_lock
lock(client_lock
);
9952 Fh
*f
= get_filehandle(fd
);
9954 return -CEPHFS_EBADF
;
9955 #if defined(__linux__) && defined(O_PATH)
9956 if (f
->flags
& O_PATH
)
9957 return -CEPHFS_EBADF
;
9959 return _lseek(f
, offset
, whence
);
9962 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
9964 Inode
*in
= f
->inode
.get();
9965 bool whence_check
= false;
9970 whence_check
= true;
9975 whence_check
= true;
9981 whence_check
= true;
9987 int r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9998 pos
= f
->pos
+ offset
;
10002 pos
= in
->size
+ offset
;
10007 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
10008 return -CEPHFS_ENXIO
;
10015 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
10016 return -CEPHFS_ENXIO
;
10022 ldout(cct
, 1) << __func__
<< ": invalid whence value " << whence
<< dendl
;
10023 return -CEPHFS_EINVAL
;
10027 return -CEPHFS_EINVAL
;
10032 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
10037 void Client::lock_fh_pos(Fh
*f
)
10039 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
10041 if (f
->pos_locked
|| !f
->pos_waiters
.empty()) {
10042 ceph::condition_variable cond
;
10043 f
->pos_waiters
.push_back(&cond
);
10044 ldout(cct
, 10) << __func__
<< " BLOCKING on " << f
<< dendl
;
10045 std::unique_lock l
{client_lock
, std::adopt_lock
};
10046 cond
.wait(l
, [f
, me
=&cond
] {
10047 return !f
->pos_locked
&& f
->pos_waiters
.front() == me
;
10050 ldout(cct
, 10) << __func__
<< " UNBLOCKING on " << f
<< dendl
;
10051 ceph_assert(f
->pos_waiters
.front() == &cond
);
10052 f
->pos_waiters
.pop_front();
10055 f
->pos_locked
= true;
10058 void Client::unlock_fh_pos(Fh
*f
)
10060 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10062 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
10063 f
->pos_locked
= false;
10064 if (!f
->pos_waiters
.empty()) {
10065 // only wake up the oldest waiter
10066 auto cond
= f
->pos_waiters
.front();
10067 cond
->notify_one();
10071 int Client::uninline_data(Inode
*in
, Context
*onfinish
)
10073 if (!in
->inline_data
.length()) {
10074 onfinish
->complete(0);
10079 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (long long unsigned)in
->ino
);
10080 object_t oid
= oid_buf
;
10082 ObjectOperation create_ops
;
10083 create_ops
.create(false);
10085 objecter
->mutate(oid
,
10086 OSDMap::file_to_object_locator(in
->layout
),
10088 in
->snaprealm
->get_snap_context(),
10089 ceph::real_clock::now(),
10093 bufferlist inline_version_bl
;
10094 encode(in
->inline_version
, inline_version_bl
);
10096 ObjectOperation uninline_ops
;
10097 uninline_ops
.cmpxattr("inline_version",
10098 CEPH_OSD_CMPXATTR_OP_GT
,
10099 CEPH_OSD_CMPXATTR_MODE_U64
,
10100 inline_version_bl
);
10101 bufferlist inline_data
= in
->inline_data
;
10102 uninline_ops
.write(0, inline_data
, in
->truncate_size
, in
->truncate_seq
);
10103 uninline_ops
.setxattr("inline_version", stringify(in
->inline_version
));
10105 objecter
->mutate(oid
,
10106 OSDMap::file_to_object_locator(in
->layout
),
10108 in
->snaprealm
->get_snap_context(),
10109 ceph::real_clock::now(),
10118 // blocking osd interface
10120 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
10122 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10123 if (!mref_reader
.is_state_satisfied())
10124 return -CEPHFS_ENOTCONN
;
10126 tout(cct
) << "read" << std::endl
;
10127 tout(cct
) << fd
<< std::endl
;
10128 tout(cct
) << size
<< std::endl
;
10129 tout(cct
) << offset
<< std::endl
;
10131 std::unique_lock
lock(client_lock
);
10132 Fh
*f
= get_filehandle(fd
);
10134 return -CEPHFS_EBADF
;
10135 #if defined(__linux__) && defined(O_PATH)
10136 if (f
->flags
& O_PATH
)
10137 return -CEPHFS_EBADF
;
10140 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10141 size
= std::min(size
, (loff_t
)INT_MAX
);
10142 int r
= _read(f
, offset
, size
, &bl
);
10143 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
10146 bl
.begin().copy(bl
.length(), buf
);
10152 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
10155 return -CEPHFS_EINVAL
;
10156 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
10159 int64_t Client::_read(Fh
*f
, int64_t offset
, uint64_t size
, bufferlist
*bl
)
10161 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10163 int want
, have
= 0;
10164 bool movepos
= false;
10166 const auto& conf
= cct
->_conf
;
10167 Inode
*in
= f
->inode
.get();
10169 utime_t start
= ceph_clock_now();
10171 if ((f
->mode
& CEPH_FILE_MODE_RD
) == 0)
10172 return -CEPHFS_EBADF
;
10173 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10180 loff_t start_pos
= offset
;
10182 if (in
->inline_version
== 0) {
10183 auto r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
10188 ceph_assert(in
->inline_version
> 0);
10192 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
10193 want
= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
;
10195 want
= CEPH_CAP_FILE_CACHE
;
10197 auto r
= get_caps(f
, CEPH_CAP_FILE_RD
, want
, &have
, -1);
10203 if (f
->flags
& O_DIRECT
)
10204 have
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
10206 if (in
->inline_version
< CEPH_INLINE_NONE
) {
10207 uint32_t len
= in
->inline_data
.length();
10208 uint64_t endoff
= offset
+ size
;
10209 if (endoff
> in
->size
)
10212 if (offset
< len
) {
10213 if (endoff
<= len
) {
10214 bl
->substr_of(in
->inline_data
, offset
, endoff
- offset
);
10216 bl
->substr_of(in
->inline_data
, offset
, len
- offset
);
10217 bl
->append_zero(endoff
- len
);
10219 rc
= endoff
- offset
;
10220 } else if ((uint64_t)offset
< endoff
) {
10221 bl
->append_zero(endoff
- offset
);
10222 rc
= endoff
- offset
;
10229 if (!conf
->client_debug_force_sync_read
&&
10231 (have
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
10233 if (f
->flags
& O_RSYNC
) {
10234 _flush_range(in
, offset
, size
);
10236 rc
= _read_async(f
, offset
, size
, bl
);
10240 if (f
->flags
& O_DIRECT
)
10241 _flush_range(in
, offset
, size
);
10243 bool checkeof
= false;
10244 rc
= _read_sync(f
, offset
, size
, bl
, &checkeof
);
10251 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
10255 auto r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
10262 // eof? short read.
10263 if ((uint64_t)offset
< in
->size
)
10269 ceph_assert(rc
>= 0);
10270 update_read_io_size(bl
->length());
10273 f
->pos
= start_pos
+ rc
;
10276 lat
= ceph_clock_now();
10280 update_io_stat_read(lat
);
10285 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
10293 Client::C_Readahead::C_Readahead(Client
*c
, Fh
*f
) :
10296 f
->readahead
.inc_pending();
10299 Client::C_Readahead::~C_Readahead() {
10300 f
->readahead
.dec_pending();
10301 client
->_put_fh(f
);
10304 void Client::C_Readahead::finish(int r
) {
10305 lgeneric_subdout(client
->cct
, client
, 20) << "client." << client
->get_nodeid() << " " << "C_Readahead on " << f
->inode
<< dendl
;
10306 client
->put_cap_ref(f
->inode
.get(), CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
10308 client
->update_read_io_size(r
);
10312 int Client::_read_async(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
)
10314 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10316 const auto& conf
= cct
->_conf
;
10317 Inode
*in
= f
->inode
.get();
10319 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
10321 // trim read based on file size?
10322 if (off
>= in
->size
)
10326 if (off
+ len
> in
->size
) {
10327 len
= in
->size
- off
;
10330 ldout(cct
, 10) << " min_bytes=" << f
->readahead
.get_min_readahead_size()
10331 << " max_bytes=" << f
->readahead
.get_max_readahead_size()
10332 << " max_periods=" << conf
->client_readahead_max_periods
<< dendl
;
10334 // read (and possibly block)
10336 C_SaferCond
onfinish("Client::_read_async flock");
10337 r
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
10338 off
, len
, bl
, 0, &onfinish
);
10340 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
10341 client_lock
.unlock();
10342 r
= onfinish
.wait();
10343 client_lock
.lock();
10344 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
10345 update_read_io_size(bl
->length());
10348 if(f
->readahead
.get_min_readahead_size() > 0) {
10349 pair
<uint64_t, uint64_t> readahead_extent
= f
->readahead
.update(off
, len
, in
->size
);
10350 if (readahead_extent
.second
> 0) {
10351 ldout(cct
, 20) << "readahead " << readahead_extent
.first
<< "~" << readahead_extent
.second
10352 << " (caller wants " << off
<< "~" << len
<< ")" << dendl
;
10353 Context
*onfinish2
= new C_Readahead(this, f
);
10354 int r2
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
10355 readahead_extent
.first
, readahead_extent
.second
,
10356 NULL
, 0, onfinish2
);
10358 ldout(cct
, 20) << "readahead initiated, c " << onfinish2
<< dendl
;
10359 get_cap_ref(in
, CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
10361 ldout(cct
, 20) << "readahead was no-op, already cached" << dendl
;
10370 int Client::_read_sync(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
,
10373 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10375 Inode
*in
= f
->inode
.get();
10376 uint64_t pos
= off
;
10380 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
10382 // 0 success, 1 continue and < 0 error happen.
10383 auto wait_and_copy
= [&](C_SaferCond
&onfinish
, bufferlist
&tbl
, int wanted
) {
10384 int r
= onfinish
.wait();
10386 // if we get ENOENT from OSD, assume 0 bytes returned
10387 if (r
== -CEPHFS_ENOENT
)
10392 if (tbl
.length()) {
10398 bl
->claim_append(tbl
);
10401 if (r
>= 0 && r
< wanted
) {
10402 if (pos
< in
->size
) {
10403 // zero up to known EOF
10404 int64_t some
= in
->size
- pos
;
10407 auto z
= buffer::ptr_node::create(some
);
10409 bl
->push_back(std::move(z
));
10424 C_SaferCond
onfinish("Client::_read_sync flock");
10428 filer
->read_trunc(in
->ino
, &in
->layout
, in
->snapid
,
10429 pos
, left
, &tbl
, 0,
10430 in
->truncate_size
, in
->truncate_seq
,
10432 client_lock
.unlock();
10433 int r
= wait_and_copy(onfinish
, tbl
, wanted
);
10434 client_lock
.lock();
10443 int Client::write(int fd
, const char *buf
, loff_t size
, loff_t offset
)
10445 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10446 if (!mref_reader
.is_state_satisfied())
10447 return -CEPHFS_ENOTCONN
;
10449 tout(cct
) << "write" << std::endl
;
10450 tout(cct
) << fd
<< std::endl
;
10451 tout(cct
) << size
<< std::endl
;
10452 tout(cct
) << offset
<< std::endl
;
10454 std::scoped_lock
lock(client_lock
);
10455 Fh
*fh
= get_filehandle(fd
);
10457 return -CEPHFS_EBADF
;
10458 #if defined(__linux__) && defined(O_PATH)
10459 if (fh
->flags
& O_PATH
)
10460 return -CEPHFS_EBADF
;
10462 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10463 size
= std::min(size
, (loff_t
)INT_MAX
);
10464 int r
= _write(fh
, offset
, size
, buf
, NULL
, false);
10465 ldout(cct
, 3) << "write(" << fd
<< ", \"...\", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
10469 int Client::pwritev(int fd
, const struct iovec
*iov
, int iovcnt
, int64_t offset
)
10472 return -CEPHFS_EINVAL
;
10473 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, true);
10476 int64_t Client::_preadv_pwritev_locked(Fh
*fh
, const struct iovec
*iov
,
10477 unsigned iovcnt
, int64_t offset
,
10478 bool write
, bool clamp_to_int
)
10480 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10482 #if defined(__linux__) && defined(O_PATH)
10483 if (fh
->flags
& O_PATH
)
10484 return -CEPHFS_EBADF
;
10486 loff_t totallen
= 0;
10487 for (unsigned i
= 0; i
< iovcnt
; i
++) {
10488 totallen
+= iov
[i
].iov_len
;
10492 * Some of the API functions take 64-bit size values, but only return
10493 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10494 * we don't do I/Os larger than the values we can return.
10496 if (clamp_to_int
) {
10497 totallen
= std::min(totallen
, (loff_t
)INT_MAX
);
10500 int64_t w
= _write(fh
, offset
, totallen
, NULL
, iov
, iovcnt
);
10501 ldout(cct
, 3) << "pwritev(" << fh
<< ", \"...\", " << totallen
<< ", " << offset
<< ") = " << w
<< dendl
;
10505 int64_t r
= _read(fh
, offset
, totallen
, &bl
);
10506 ldout(cct
, 3) << "preadv(" << fh
<< ", " << offset
<< ") = " << r
<< dendl
;
10510 client_lock
.unlock();
10511 auto iter
= bl
.cbegin();
10512 for (unsigned j
= 0, resid
= r
; j
< iovcnt
&& resid
> 0; j
++) {
10514 * This piece of code aims to handle the case that bufferlist
10515 * does not have enough data to fill in the iov
10517 const auto round_size
= std::min
<unsigned>(resid
, iov
[j
].iov_len
);
10518 iter
.copy(round_size
, reinterpret_cast<char*>(iov
[j
].iov_base
));
10519 resid
-= round_size
;
10520 /* iter is self-updating */
10522 client_lock
.lock();
10527 int Client::_preadv_pwritev(int fd
, const struct iovec
*iov
, unsigned iovcnt
, int64_t offset
, bool write
)
10529 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10530 if (!mref_reader
.is_state_satisfied())
10531 return -CEPHFS_ENOTCONN
;
10533 tout(cct
) << fd
<< std::endl
;
10534 tout(cct
) << offset
<< std::endl
;
10536 std::scoped_lock
cl(client_lock
);
10537 Fh
*fh
= get_filehandle(fd
);
10539 return -CEPHFS_EBADF
;
10540 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, offset
, write
, true);
10543 int64_t Client::_write(Fh
*f
, int64_t offset
, uint64_t size
, const char *buf
,
10544 const struct iovec
*iov
, int iovcnt
)
10546 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10549 Inode
*in
= f
->inode
.get();
10551 if ( (uint64_t)(offset
+size
) > mdsmap
->get_max_filesize() && //exceeds config
10552 (uint64_t)(offset
+size
) > in
->size
) { //exceeds filesize
10553 return -CEPHFS_EFBIG
;
10555 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10557 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
10558 return -CEPHFS_ENOSPC
;
10561 ceph_assert(in
->snapid
== CEPH_NOSNAP
);
10563 // was Fh opened as writeable?
10564 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
10565 return -CEPHFS_EBADF
;
10567 // use/adjust fd pos?
10571 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10572 * change out from under us.
10574 if (f
->flags
& O_APPEND
) {
10575 auto r
= _lseek(f
, 0, SEEK_END
);
10582 fpos
= offset
+size
;
10587 uint64_t endoff
= offset
+ size
;
10588 if (endoff
> in
->size
&& is_quota_bytes_exceeded(in
, endoff
- in
->size
,
10590 return -CEPHFS_EDQUOT
;
10593 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10595 ldout(cct
, 10) << "cur file size is " << in
->size
<< dendl
;
10598 utime_t start
= ceph_clock_now();
10600 if (in
->inline_version
== 0) {
10601 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
10604 ceph_assert(in
->inline_version
> 0);
10607 // copy into fresh buffer (since our write may be resub, async)
10611 bl
.append(buf
, size
);
10613 for (int i
= 0; i
< iovcnt
; i
++) {
10614 if (iov
[i
].iov_len
> 0) {
10615 bl
.append((const char *)iov
[i
].iov_base
, iov
[i
].iov_len
);
10621 uint64_t totalwritten
;
10623 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
10624 want
= CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
;
10626 want
= CEPH_CAP_FILE_BUFFER
;
10627 int r
= get_caps(f
, CEPH_CAP_FILE_WR
|CEPH_CAP_AUTH_SHARED
, want
, &have
, endoff
);
10631 /* clear the setuid/setgid bits, if any */
10632 if (unlikely(in
->mode
& (S_ISUID
|S_ISGID
)) && size
> 0) {
10633 struct ceph_statx stx
= { 0 };
10635 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
10636 r
= __setattrx(in
, &stx
, CEPH_SETATTR_KILL_SGUID
, f
->actor_perms
);
10640 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
10643 if (f
->flags
& O_DIRECT
)
10644 have
&= ~(CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
);
10646 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
10648 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
10650 if (in
->inline_version
< CEPH_INLINE_NONE
) {
10651 if (endoff
> cct
->_conf
->client_max_inline_size
||
10652 endoff
> CEPH_INLINE_MAX_SIZE
||
10653 !(have
& CEPH_CAP_FILE_BUFFER
)) {
10654 onuninline
.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10655 uninline_data(in
, onuninline
.get());
10657 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10659 uint32_t len
= in
->inline_data
.length();
10662 in
->inline_data
.begin(endoff
).copy(len
- endoff
, bl
); // XXX
10665 in
->inline_data
.splice(offset
, len
- offset
);
10666 else if (offset
> len
)
10667 in
->inline_data
.append_zero(offset
- len
);
10669 in
->inline_data
.append(bl
);
10670 in
->inline_version
++;
10672 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10678 if (cct
->_conf
->client_oc
&&
10679 (have
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
))) {
10680 // do buffered write
10681 if (!in
->oset
.dirty_or_tx
)
10682 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
10684 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10686 // async, caching, non-blocking.
10687 r
= objectcacher
->file_write(&in
->oset
, &in
->layout
,
10688 in
->snaprealm
->get_snap_context(),
10689 offset
, size
, bl
, ceph::real_clock::now(),
10691 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10696 // flush cached write if O_SYNC is set on file fh
10697 // O_DSYNC == O_SYNC on linux < 2.6.33
10698 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10699 if ((f
->flags
& O_SYNC
) || (f
->flags
& O_DSYNC
)) {
10700 _flush_range(in
, offset
, size
);
10703 if (f
->flags
& O_DIRECT
)
10704 _flush_range(in
, offset
, size
);
10706 // simple, non-atomic sync write
10707 C_SaferCond
onfinish("Client::_write flock");
10708 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10710 filer
->write_trunc(in
->ino
, &in
->layout
, in
->snaprealm
->get_snap_context(),
10711 offset
, size
, bl
, ceph::real_clock::now(), 0,
10712 in
->truncate_size
, in
->truncate_seq
,
10714 client_lock
.unlock();
10715 r
= onfinish
.wait();
10716 client_lock
.lock();
10717 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10722 // if we get here, write was successful, update client metadata
10724 update_write_io_size(size
);
10726 lat
= ceph_clock_now();
10729 ++nr_write_request
;
10730 update_io_stat_write(lat
);
10737 totalwritten
= size
;
10738 r
= (int64_t)totalwritten
;
10741 if (totalwritten
+ offset
> in
->size
) {
10742 in
->size
= totalwritten
+ offset
;
10743 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
10745 if (is_quota_bytes_approaching(in
, f
->actor_perms
)) {
10746 check_caps(in
, CHECK_CAPS_NODELAY
);
10747 } else if (is_max_size_approaching(in
)) {
10751 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", extending file size" << dendl
;
10753 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", leaving file size at " << in
->size
<< dendl
;
10757 in
->mtime
= in
->ctime
= ceph_clock_now();
10759 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
10763 if (nullptr != onuninline
) {
10764 client_lock
.unlock();
10765 int uninline_ret
= onuninline
->wait();
10766 client_lock
.lock();
10768 if (uninline_ret
>= 0 || uninline_ret
== -CEPHFS_ECANCELED
) {
10769 in
->inline_data
.clear();
10770 in
->inline_version
= CEPH_INLINE_NONE
;
10771 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
10777 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
10781 int Client::_flush(Fh
*f
)
10783 Inode
*in
= f
->inode
.get();
10784 int err
= f
->take_async_err();
10786 ldout(cct
, 1) << __func__
<< ": " << f
<< " on inode " << *in
<< " caught async_err = "
10787 << cpp_strerror(err
) << dendl
;
10789 ldout(cct
, 10) << __func__
<< ": " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
10795 int Client::truncate(const char *relpath
, loff_t length
, const UserPerm
& perms
)
10797 struct ceph_statx stx
;
10798 stx
.stx_size
= length
;
10799 return setattrx(relpath
, &stx
, CEPH_SETATTR_SIZE
, perms
);
10802 int Client::ftruncate(int fd
, loff_t length
, const UserPerm
& perms
)
10804 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10805 if (!mref_reader
.is_state_satisfied())
10806 return -CEPHFS_ENOTCONN
;
10808 tout(cct
) << __func__
<< std::endl
;
10809 tout(cct
) << fd
<< std::endl
;
10810 tout(cct
) << length
<< std::endl
;
10812 std::scoped_lock
lock(client_lock
);
10813 Fh
*f
= get_filehandle(fd
);
10815 return -CEPHFS_EBADF
;
10816 #if defined(__linux__) && defined(O_PATH)
10817 if (f
->flags
& O_PATH
)
10818 return -CEPHFS_EBADF
;
10820 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
10821 return -CEPHFS_EBADF
;
10823 attr
.st_size
= length
;
10824 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_SIZE
, perms
);
10827 int Client::fsync(int fd
, bool syncdataonly
)
10829 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10830 if (!mref_reader
.is_state_satisfied())
10831 return -CEPHFS_ENOTCONN
;
10833 tout(cct
) << "fsync" << std::endl
;
10834 tout(cct
) << fd
<< std::endl
;
10835 tout(cct
) << syncdataonly
<< std::endl
;
10837 std::scoped_lock
lock(client_lock
);
10838 Fh
*f
= get_filehandle(fd
);
10840 return -CEPHFS_EBADF
;
10841 #if defined(__linux__) && defined(O_PATH)
10842 if (f
->flags
& O_PATH
)
10843 return -CEPHFS_EBADF
;
10845 int r
= _fsync(f
, syncdataonly
);
10847 // The IOs in this fsync were okay, but maybe something happened
10848 // in the background that we shoudl be reporting?
10849 r
= f
->take_async_err();
10850 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
10851 << ") = 0, async_err = " << r
<< dendl
;
10853 // Assume that an error we encountered during fsync, even reported
10854 // synchronously, would also have applied the error to the Fh, and we
10855 // should clear it here to avoid returning the same error again on next
10857 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
<< ") = "
10859 f
->take_async_err();
10864 int Client::_fsync(Inode
*in
, bool syncdataonly
)
10866 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10869 std::unique_ptr
<C_SaferCond
> object_cacher_completion
= nullptr;
10870 ceph_tid_t flush_tid
= 0;
10873 utime_t start
= ceph_clock_now();
10875 ldout(cct
, 8) << "_fsync on " << *in
<< " " << (syncdataonly
? "(dataonly)":"(data+metadata)") << dendl
;
10877 if (cct
->_conf
->client_oc
) {
10878 object_cacher_completion
.reset(new C_SaferCond("Client::_fsync::lock"));
10879 tmp_ref
= in
; // take a reference; C_SaferCond doesn't and _flush won't either
10880 _flush(in
, object_cacher_completion
.get());
10881 ldout(cct
, 15) << "using return-valued form of _fsync" << dendl
;
10884 if (!syncdataonly
&& in
->dirty_caps
) {
10885 check_caps(in
, CHECK_CAPS_NODELAY
|CHECK_CAPS_SYNCHRONOUS
);
10886 if (in
->flushing_caps
)
10887 flush_tid
= last_flush_tid
;
10888 } else ldout(cct
, 10) << "no metadata needs to commit" << dendl
;
10890 if (!syncdataonly
&& !in
->unsafe_ops
.empty()) {
10891 flush_mdlog_sync(in
);
10893 MetaRequest
*req
= in
->unsafe_ops
.back();
10894 ldout(cct
, 15) << "waiting on unsafe requests, last tid " << req
->get_tid() << dendl
;
10897 wait_on_list(req
->waitfor_safe
);
10901 if (nullptr != object_cacher_completion
) { // wait on a real reply instead of guessing
10902 client_lock
.unlock();
10903 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
10904 r
= object_cacher_completion
->wait();
10905 client_lock
.lock();
10906 ldout(cct
, 15) << "got " << r
<< " from flush writeback" << dendl
;
10908 // FIXME: this can starve
10909 while (in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] > 0) {
10910 ldout(cct
, 10) << "ino " << in
->ino
<< " has " << in
->cap_refs
[CEPH_CAP_FILE_BUFFER
]
10911 << " uncommitted, waiting" << dendl
;
10912 wait_on_list(in
->waitfor_commit
);
10918 wait_sync_caps(in
, flush_tid
);
10920 ldout(cct
, 10) << "ino " << in
->ino
<< " has no uncommitted writes" << dendl
;
10922 ldout(cct
, 8) << "ino " << in
->ino
<< " failed to commit to disk! "
10923 << cpp_strerror(-r
) << dendl
;
10926 lat
= ceph_clock_now();
10928 logger
->tinc(l_c_fsync
, lat
);
10933 int Client::_fsync(Fh
*f
, bool syncdataonly
)
10935 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
10936 return _fsync(f
->inode
.get(), syncdataonly
);
10939 int Client::fstat(int fd
, struct stat
*stbuf
, const UserPerm
& perms
, int mask
)
10941 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10942 if (!mref_reader
.is_state_satisfied())
10943 return -CEPHFS_ENOTCONN
;
10945 tout(cct
) << "fstat mask " << hex
<< mask
<< dec
<< std::endl
;
10946 tout(cct
) << fd
<< std::endl
;
10948 std::scoped_lock
lock(client_lock
);
10949 Fh
*f
= get_filehandle(fd
);
10951 return -CEPHFS_EBADF
;
10952 int r
= _getattr(f
->inode
, mask
, perms
);
10955 fill_stat(f
->inode
, stbuf
, NULL
);
10956 ldout(cct
, 5) << "fstat(" << fd
<< ", " << stbuf
<< ") = " << r
<< dendl
;
10960 int Client::fstatx(int fd
, struct ceph_statx
*stx
, const UserPerm
& perms
,
10961 unsigned int want
, unsigned int flags
)
10963 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10964 if (!mref_reader
.is_state_satisfied())
10965 return -CEPHFS_ENOTCONN
;
10967 tout(cct
) << "fstatx flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
10968 tout(cct
) << fd
<< std::endl
;
10970 std::scoped_lock
lock(client_lock
);
10971 Fh
*f
= get_filehandle(fd
);
10973 return -CEPHFS_EBADF
;
10975 unsigned mask
= statx_to_mask(flags
, want
);
10979 r
= _getattr(f
->inode
, mask
, perms
);
10981 ldout(cct
, 3) << "fstatx exit on error!" << dendl
;
10986 fill_statx(f
->inode
, mask
, stx
);
10987 ldout(cct
, 3) << "fstatx(" << fd
<< ", " << stx
<< ") = " << r
<< dendl
;
10991 int Client::statxat(int dirfd
, const char *relpath
,
10992 struct ceph_statx
*stx
, const UserPerm
& perms
,
10993 unsigned int want
, unsigned int flags
) {
10994 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10995 if (!mref_reader
.is_state_satisfied()) {
10996 return -CEPHFS_ENOTCONN
;
10999 tout(cct
) << __func__
<< " flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
11000 tout(cct
) << dirfd
<< std::endl
;
11001 tout(cct
) << relpath
<< std::endl
;
11003 unsigned mask
= statx_to_mask(flags
, want
);
11006 std::scoped_lock
lock(client_lock
);
11007 int r
= get_fd_inode(dirfd
, &dirinode
);
11013 filepath
path(relpath
);
11014 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
, dirinode
);
11018 r
= _getattr(in
, mask
, perms
);
11020 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
11024 fill_statx(in
, mask
, stx
);
11025 ldout(cct
, 3) << __func__
<< " dirfd" << dirfd
<< ", r= " << r
<< dendl
;
11029 // not written yet, but i want to link!
11031 int Client::chdir(const char *relpath
, std::string
&new_cwd
,
11032 const UserPerm
& perms
)
11034 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11035 if (!mref_reader
.is_state_satisfied())
11036 return -CEPHFS_ENOTCONN
;
11038 tout(cct
) << "chdir" << std::endl
;
11039 tout(cct
) << relpath
<< std::endl
;
11041 filepath
path(relpath
);
11044 std::scoped_lock
lock(client_lock
);
11045 int r
= path_walk(path
, &in
, perms
);
11049 if (!(in
.get()->is_dir()))
11050 return -CEPHFS_ENOTDIR
;
11054 ldout(cct
, 3) << "chdir(" << relpath
<< ") cwd now " << cwd
->ino
<< dendl
;
11056 _getcwd(new_cwd
, perms
);
11060 void Client::_getcwd(string
& dir
, const UserPerm
& perms
)
11063 ldout(cct
, 10) << __func__
<< " " << *cwd
<< dendl
;
11065 Inode
*in
= cwd
.get();
11066 while (in
!= root
.get()) {
11067 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
11069 // A cwd or ancester is unlinked
11070 if (in
->dentries
.empty()) {
11074 Dentry
*dn
= in
->get_first_parent();
11079 ldout(cct
, 10) << __func__
<< " looking up parent for " << *in
<< dendl
;
11080 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
11081 filepath
path(in
->ino
);
11082 req
->set_filepath(path
);
11083 req
->set_inode(in
);
11084 int res
= make_request(req
, perms
);
11093 path
.push_front_dentry(dn
->name
);
11094 in
= dn
->dir
->parent_inode
;
11097 dir
+= path
.get_path();
11100 void Client::getcwd(string
& dir
, const UserPerm
& perms
)
11102 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11103 if (!mref_reader
.is_state_satisfied())
11106 std::scoped_lock
l(client_lock
);
11108 _getcwd(dir
, perms
);
11111 int Client::statfs(const char *path
, struct statvfs
*stbuf
,
11112 const UserPerm
& perms
)
11114 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11115 if (!mref_reader
.is_state_satisfied())
11116 return -CEPHFS_ENOTCONN
;
11118 tout(cct
) << __func__
<< std::endl
;
11119 unsigned long int total_files_on_fs
;
11124 std::unique_lock
lock(client_lock
);
11125 const vector
<int64_t> &data_pools
= mdsmap
->get_data_pools();
11126 if (data_pools
.size() == 1) {
11127 objecter
->get_fs_stats(stats
, data_pools
[0], &cond
);
11129 objecter
->get_fs_stats(stats
, std::optional
<int64_t>(), &cond
);
11133 int rval
= cond
.wait();
11137 total_files_on_fs
= root
->rstat
.rfiles
+ root
->rstat
.rsubdirs
;
11140 ldout(cct
, 1) << "underlying call to statfs returned error: "
11141 << cpp_strerror(rval
)
11146 memset(stbuf
, 0, sizeof(*stbuf
));
11149 * we're going to set a block size of 4MB so we can represent larger
11150 * FSes without overflowing. Additionally convert the space
11151 * measurements from KB to bytes while making them in terms of
11152 * blocks. We use 4MB only because it is big enough, and because it
11153 * actually *is* the (ceph) default block size.
11155 const int CEPH_BLOCK_SHIFT
= 22;
11156 stbuf
->f_frsize
= 1 << CEPH_BLOCK_SHIFT
;
11157 stbuf
->f_bsize
= 1 << CEPH_BLOCK_SHIFT
;
11158 stbuf
->f_files
= total_files_on_fs
;
11159 stbuf
->f_ffree
= -1;
11160 stbuf
->f_favail
= -1;
11161 stbuf
->f_fsid
= -1; // ??
11162 stbuf
->f_flag
= 0; // ??
11163 stbuf
->f_namemax
= NAME_MAX
;
11165 // Usually quota_root will == root_ancestor, but if the mount root has no
11166 // quota but we can see a parent of it that does have a quota, we'll
11167 // respect that one instead.
11168 ceph_assert(root
!= nullptr);
11169 InodeRef quota_root
= root
->quota
.is_enable() ? root
: get_quota_root(root
.get(), perms
);
11171 // get_quota_root should always give us something if client quotas are
11173 ceph_assert(cct
->_conf
.get_val
<bool>("client_quota") == false || quota_root
!= nullptr);
11175 if (quota_root
&& cct
->_conf
->client_quota_df
&& quota_root
->quota
.max_bytes
) {
11177 // Skip the getattr if any sessions are stale, as we don't want to
11178 // block `df` if this client has e.g. been evicted, or if the MDS cluster
11180 if (!_any_stale_sessions()) {
11181 int r
= _getattr(quota_root
, 0, perms
, true);
11183 // Ignore return value: error getting latest inode metadata is not a good
11184 // reason to break "df".
11185 lderr(cct
) << "Error in getattr on quota root 0x"
11186 << std::hex
<< quota_root
->ino
<< std::dec
11187 << " statfs result may be outdated" << dendl
;
11191 // Special case: if there is a size quota set on the Inode acting
11192 // as the root for this client mount, then report the quota status
11193 // as the filesystem statistics.
11194 const fsblkcnt_t total
= quota_root
->quota
.max_bytes
>> CEPH_BLOCK_SHIFT
;
11195 const fsblkcnt_t used
= quota_root
->rstat
.rbytes
>> CEPH_BLOCK_SHIFT
;
11196 // It is possible for a quota to be exceeded: arithmetic here must
11197 // handle case where used > total.
11198 const fsblkcnt_t free
= total
> used
? total
- used
: 0;
11200 stbuf
->f_blocks
= total
;
11201 stbuf
->f_bfree
= free
;
11202 stbuf
->f_bavail
= free
;
11204 // General case: report the cluster statistics returned from RADOS. Because
11205 // multiple pools may be used without one filesystem namespace via
11206 // layouts, this is the most correct thing we can do.
11207 stbuf
->f_blocks
= stats
.kb
>> (CEPH_BLOCK_SHIFT
- 10);
11208 stbuf
->f_bfree
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
11209 stbuf
->f_bavail
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
11215 int Client::_do_filelock(Inode
*in
, Fh
*fh
, int lock_type
, int op
, int sleep
,
11216 struct flock
*fl
, uint64_t owner
, bool removing
)
11218 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
11219 << (lock_type
== CEPH_LOCK_FCNTL
? " fcntl" : " flock")
11220 << " type " << fl
->l_type
<< " owner " << owner
11221 << " " << fl
->l_start
<< "~" << fl
->l_len
<< dendl
;
11223 if (in
->flags
& I_ERROR_FILELOCK
)
11224 return -CEPHFS_EIO
;
11227 if (F_RDLCK
== fl
->l_type
)
11228 lock_cmd
= CEPH_LOCK_SHARED
;
11229 else if (F_WRLCK
== fl
->l_type
)
11230 lock_cmd
= CEPH_LOCK_EXCL
;
11231 else if (F_UNLCK
== fl
->l_type
)
11232 lock_cmd
= CEPH_LOCK_UNLOCK
;
11234 return -CEPHFS_EIO
;
11236 if (op
!= CEPH_MDS_OP_SETFILELOCK
|| lock_cmd
== CEPH_LOCK_UNLOCK
)
11240 * Set the most significant bit, so that MDS knows the 'owner'
11241 * is sufficient to identify the owner of lock. (old code uses
11242 * both 'owner' and 'pid')
11244 owner
|= (1ULL << 63);
11246 MetaRequest
*req
= new MetaRequest(op
);
11248 in
->make_nosnap_relative_path(path
);
11249 req
->set_filepath(path
);
11250 req
->set_inode(in
);
11252 req
->head
.args
.filelock_change
.rule
= lock_type
;
11253 req
->head
.args
.filelock_change
.type
= lock_cmd
;
11254 req
->head
.args
.filelock_change
.owner
= owner
;
11255 req
->head
.args
.filelock_change
.pid
= fl
->l_pid
;
11256 req
->head
.args
.filelock_change
.start
= fl
->l_start
;
11257 req
->head
.args
.filelock_change
.length
= fl
->l_len
;
11258 req
->head
.args
.filelock_change
.wait
= sleep
;
11263 if (sleep
&& switch_interrupt_cb
) {
11264 // enable interrupt
11265 switch_interrupt_cb(callback_handle
, req
->get());
11266 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
11267 // disable interrupt
11268 switch_interrupt_cb(callback_handle
, NULL
);
11269 if (ret
== 0 && req
->aborted()) {
11270 // effect of this lock request has been revoked by the 'lock intr' request
11271 ret
= req
->get_abort_code();
11275 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
11279 if (op
== CEPH_MDS_OP_GETFILELOCK
) {
11280 ceph_filelock filelock
;
11281 auto p
= bl
.cbegin();
11282 decode(filelock
, p
);
11284 if (CEPH_LOCK_SHARED
== filelock
.type
)
11285 fl
->l_type
= F_RDLCK
;
11286 else if (CEPH_LOCK_EXCL
== filelock
.type
)
11287 fl
->l_type
= F_WRLCK
;
11289 fl
->l_type
= F_UNLCK
;
11291 fl
->l_whence
= SEEK_SET
;
11292 fl
->l_start
= filelock
.start
;
11293 fl
->l_len
= filelock
.length
;
11294 fl
->l_pid
= filelock
.pid
;
11295 } else if (op
== CEPH_MDS_OP_SETFILELOCK
) {
11296 ceph_lock_state_t
*lock_state
;
11297 if (lock_type
== CEPH_LOCK_FCNTL
) {
11298 if (!in
->fcntl_locks
)
11299 in
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
11300 lock_state
= in
->fcntl_locks
.get();
11301 } else if (lock_type
== CEPH_LOCK_FLOCK
) {
11302 if (!in
->flock_locks
)
11303 in
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
11304 lock_state
= in
->flock_locks
.get();
11307 return -CEPHFS_EINVAL
;
11309 _update_lock_state(fl
, owner
, lock_state
);
11312 if (lock_type
== CEPH_LOCK_FCNTL
) {
11313 if (!fh
->fcntl_locks
)
11314 fh
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
11315 lock_state
= fh
->fcntl_locks
.get();
11317 if (!fh
->flock_locks
)
11318 fh
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
11319 lock_state
= fh
->flock_locks
.get();
11321 _update_lock_state(fl
, owner
, lock_state
);
11329 int Client::_interrupt_filelock(MetaRequest
*req
)
11331 // Set abort code, but do not kick. The abort code prevents the request
11332 // from being re-sent.
11333 req
->abort(-CEPHFS_EINTR
);
11335 return 0; // haven't sent the request
11337 Inode
*in
= req
->inode();
11340 if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
11341 lock_type
= CEPH_LOCK_FLOCK_INTR
;
11342 else if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
11343 lock_type
= CEPH_LOCK_FCNTL_INTR
;
11346 return -CEPHFS_EINVAL
;
11349 MetaRequest
*intr_req
= new MetaRequest(CEPH_MDS_OP_SETFILELOCK
);
11351 in
->make_nosnap_relative_path(path
);
11352 intr_req
->set_filepath(path
);
11353 intr_req
->set_inode(in
);
11354 intr_req
->head
.args
.filelock_change
= req
->head
.args
.filelock_change
;
11355 intr_req
->head
.args
.filelock_change
.rule
= lock_type
;
11356 intr_req
->head
.args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
11358 UserPerm
perms(req
->get_uid(), req
->get_gid());
11359 return make_request(intr_req
, perms
, NULL
, NULL
, -1);
11362 void Client::_encode_filelocks(Inode
*in
, bufferlist
& bl
)
11364 if (!in
->fcntl_locks
&& !in
->flock_locks
)
11367 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
11368 encode(nr_fcntl_locks
, bl
);
11369 if (nr_fcntl_locks
) {
11370 auto &lock_state
= in
->fcntl_locks
;
11371 for(auto p
= lock_state
->held_locks
.begin();
11372 p
!= lock_state
->held_locks
.end();
11374 encode(p
->second
, bl
);
11377 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
11378 encode(nr_flock_locks
, bl
);
11379 if (nr_flock_locks
) {
11380 auto &lock_state
= in
->flock_locks
;
11381 for(auto p
= lock_state
->held_locks
.begin();
11382 p
!= lock_state
->held_locks
.end();
11384 encode(p
->second
, bl
);
11387 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< ", " << nr_fcntl_locks
11388 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
11391 void Client::_release_filelocks(Fh
*fh
)
11393 if (!fh
->fcntl_locks
&& !fh
->flock_locks
)
11396 Inode
*in
= fh
->inode
.get();
11397 ldout(cct
, 10) << __func__
<< " " << fh
<< " ino " << in
->ino
<< dendl
;
11399 list
<ceph_filelock
> activated_locks
;
11401 list
<pair
<int, ceph_filelock
> > to_release
;
11403 if (fh
->fcntl_locks
) {
11404 auto &lock_state
= fh
->fcntl_locks
;
11405 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
11407 if (in
->flags
& I_ERROR_FILELOCK
) {
11408 lock_state
->remove_lock(q
->second
, activated_locks
);
11410 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FCNTL
, q
->second
));
11413 lock_state
.reset();
11415 if (fh
->flock_locks
) {
11416 auto &lock_state
= fh
->flock_locks
;
11417 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
11419 if (in
->flags
& I_ERROR_FILELOCK
) {
11420 lock_state
->remove_lock(q
->second
, activated_locks
);
11422 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FLOCK
, q
->second
));
11425 lock_state
.reset();
11428 if ((in
->flags
& I_ERROR_FILELOCK
) && !in
->has_any_filelocks())
11429 in
->flags
&= ~I_ERROR_FILELOCK
;
11431 if (to_release
.empty())
11435 memset(&fl
, 0, sizeof(fl
));
11436 fl
.l_whence
= SEEK_SET
;
11437 fl
.l_type
= F_UNLCK
;
11439 for (list
<pair
<int, ceph_filelock
> >::iterator p
= to_release
.begin();
11440 p
!= to_release
.end();
11442 fl
.l_start
= p
->second
.start
;
11443 fl
.l_len
= p
->second
.length
;
11444 fl
.l_pid
= p
->second
.pid
;
11445 _do_filelock(in
, fh
, p
->first
, CEPH_MDS_OP_SETFILELOCK
, 0, &fl
,
11446 p
->second
.owner
, true);
11450 void Client::_update_lock_state(struct flock
*fl
, uint64_t owner
,
11451 ceph_lock_state_t
*lock_state
)
11454 if (F_RDLCK
== fl
->l_type
)
11455 lock_cmd
= CEPH_LOCK_SHARED
;
11456 else if (F_WRLCK
== fl
->l_type
)
11457 lock_cmd
= CEPH_LOCK_EXCL
;
11459 lock_cmd
= CEPH_LOCK_UNLOCK
;;
11461 ceph_filelock filelock
;
11462 filelock
.start
= fl
->l_start
;
11463 filelock
.length
= fl
->l_len
;
11464 filelock
.client
= 0;
11465 // see comment in _do_filelock()
11466 filelock
.owner
= owner
| (1ULL << 63);
11467 filelock
.pid
= fl
->l_pid
;
11468 filelock
.type
= lock_cmd
;
11470 if (filelock
.type
== CEPH_LOCK_UNLOCK
) {
11471 list
<ceph_filelock
> activated_locks
;
11472 lock_state
->remove_lock(filelock
, activated_locks
);
11474 bool r
= lock_state
->add_lock(filelock
, false, false, NULL
);
11479 int Client::_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
11481 Inode
*in
= fh
->inode
.get();
11482 ldout(cct
, 10) << "_getlk " << fh
<< " ino " << in
->ino
<< dendl
;
11483 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_GETFILELOCK
, 0, fl
, owner
);
11487 int Client::_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
11489 Inode
*in
= fh
->inode
.get();
11490 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< dendl
;
11491 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_SETFILELOCK
, sleep
, fl
, owner
);
11492 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
11496 int Client::_flock(Fh
*fh
, int cmd
, uint64_t owner
)
11498 Inode
*in
= fh
->inode
.get();
11499 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< dendl
;
11501 int sleep
= !(cmd
& LOCK_NB
);
11516 return -CEPHFS_EINVAL
;
11520 memset(&fl
, 0, sizeof(fl
));
11522 fl
.l_whence
= SEEK_SET
;
11524 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
, sleep
, &fl
, owner
);
11525 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
11529 int Client::get_snap_info(const char *path
, const UserPerm
&perms
, SnapInfo
*snap_info
) {
11530 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11531 if (!mref_reader
.is_state_satisfied()) {
11532 return -CEPHFS_ENOTCONN
;
11535 std::scoped_lock
lock(client_lock
);
11537 int r
= Client::path_walk(path
, &in
, perms
, true);
11542 if (in
->snapid
== CEPH_NOSNAP
) {
11543 return -CEPHFS_EINVAL
;
11546 snap_info
->id
= in
->snapid
;
11547 snap_info
->metadata
= in
->snap_metadata
;
11551 int Client::ll_statfs(Inode
*in
, struct statvfs
*stbuf
, const UserPerm
& perms
)
11553 /* Since the only thing this does is wrap a call to statfs, and
11554 statfs takes a lock, it doesn't seem we have a need to split it
11556 return statfs(0, stbuf
, perms
);
11559 void Client::_ll_register_callbacks(struct ceph_client_callback_args
*args
)
11564 ldout(cct
, 10) << __func__
<< " cb " << args
->handle
11565 << " invalidate_ino_cb " << args
->ino_cb
11566 << " invalidate_dentry_cb " << args
->dentry_cb
11567 << " switch_interrupt_cb " << args
->switch_intr_cb
11568 << " remount_cb " << args
->remount_cb
11570 callback_handle
= args
->handle
;
11571 if (args
->ino_cb
) {
11572 ino_invalidate_cb
= args
->ino_cb
;
11573 async_ino_invalidator
.start();
11575 if (args
->dentry_cb
) {
11576 dentry_invalidate_cb
= args
->dentry_cb
;
11577 async_dentry_invalidator
.start();
11579 if (args
->switch_intr_cb
) {
11580 switch_interrupt_cb
= args
->switch_intr_cb
;
11581 interrupt_finisher
.start();
11583 if (args
->remount_cb
) {
11584 remount_cb
= args
->remount_cb
;
11585 remount_finisher
.start();
11587 if (args
->ino_release_cb
) {
11588 ino_release_cb
= args
->ino_release_cb
;
11589 async_ino_releasor
.start();
11591 if (args
->umask_cb
)
11592 umask_cb
= args
->umask_cb
;
11595 // This is deprecated, use ll_register_callbacks2() instead.
11596 void Client::ll_register_callbacks(struct ceph_client_callback_args
*args
)
11598 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11600 _ll_register_callbacks(args
);
11603 int Client::ll_register_callbacks2(struct ceph_client_callback_args
*args
)
11605 if (is_mounting() || is_mounted() || is_unmounting())
11606 return -CEPHFS_EBUSY
;
11608 _ll_register_callbacks(args
);
11612 std::pair
<int, bool> Client::test_dentry_handling(bool can_invalidate
)
11614 std::pair
<int, bool> r(0, false);
11616 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
11617 if (!iref_reader
.is_state_satisfied())
11618 return std::make_pair(-CEPHFS_ENOTCONN
, false);
11620 can_invalidate_dentries
= can_invalidate
;
11623 * Force to use the old and slow method to invalidate the dcache
11624 * if the euid is non-root, or the remount may fail with return
11627 uid_t euid
= geteuid();
11628 ldout(cct
, 10) << "euid: " << euid
<< dendl
;
11630 can_invalidate_dentries
= true;
11633 if (can_invalidate_dentries
) {
11634 ceph_assert(dentry_invalidate_cb
);
11635 ldout(cct
, 1) << "using dentry_invalidate_cb" << dendl
;
11637 ceph_assert(remount_cb
);
11638 ldout(cct
, 1) << "using remount_cb" << dendl
;
11639 r
= _do_remount(false);
11645 int Client::_sync_fs()
11647 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
11649 ldout(cct
, 10) << __func__
<< dendl
;
11652 std::unique_ptr
<C_SaferCond
> cond
= nullptr;
11653 if (cct
->_conf
->client_oc
) {
11654 cond
.reset(new C_SaferCond("Client::_sync_fs:lock"));
11655 objectcacher
->flush_all(cond
.get());
11660 ceph_tid_t flush_tid
= last_flush_tid
;
11662 // wait for unsafe mds requests
11663 wait_unsafe_requests();
11665 wait_sync_caps(flush_tid
);
11667 if (nullptr != cond
) {
11668 client_lock
.unlock();
11669 ldout(cct
, 15) << __func__
<< " waiting on data to flush" << dendl
;
11671 ldout(cct
, 15) << __func__
<< " flush finished" << dendl
;
11672 client_lock
.lock();
11678 int Client::sync_fs()
11680 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11681 if (!mref_reader
.is_state_satisfied())
11682 return -CEPHFS_ENOTCONN
;
11684 std::scoped_lock
l(client_lock
);
11689 int64_t Client::drop_caches()
11691 std::scoped_lock
l(client_lock
);
11692 return objectcacher
->release_all();
11695 int Client::_lazyio(Fh
*fh
, int enable
)
11697 Inode
*in
= fh
->inode
.get();
11698 ldout(cct
, 20) << __func__
<< " " << *in
<< " " << !!enable
<< dendl
;
11700 if (!!(fh
->mode
& CEPH_FILE_MODE_LAZY
) == !!enable
)
11703 int orig_mode
= fh
->mode
;
11705 fh
->mode
|= CEPH_FILE_MODE_LAZY
;
11706 in
->get_open_ref(fh
->mode
);
11707 in
->put_open_ref(orig_mode
);
11708 check_caps(in
, CHECK_CAPS_NODELAY
);
11710 fh
->mode
&= ~CEPH_FILE_MODE_LAZY
;
11711 in
->get_open_ref(fh
->mode
);
11712 in
->put_open_ref(orig_mode
);
11719 int Client::lazyio(int fd
, int enable
)
11721 std::scoped_lock
l(client_lock
);
11722 Fh
*f
= get_filehandle(fd
);
11724 return -CEPHFS_EBADF
;
11726 return _lazyio(f
, enable
);
11729 int Client::ll_lazyio(Fh
*fh
, int enable
)
11731 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << !!enable
<< dendl
;
11732 tout(cct
) << __func__
<< std::endl
;
11734 std::scoped_lock
lock(client_lock
);
11735 return _lazyio(fh
, enable
);
11738 int Client::lazyio_propagate(int fd
, loff_t offset
, size_t count
)
11740 std::scoped_lock
l(client_lock
);
11741 ldout(cct
, 3) << "op: client->lazyio_propagate(" << fd
11742 << ", " << offset
<< ", " << count
<< ")" << dendl
;
11744 Fh
*f
= get_filehandle(fd
);
11746 return -CEPHFS_EBADF
;
11754 int Client::lazyio_synchronize(int fd
, loff_t offset
, size_t count
)
11756 std::scoped_lock
l(client_lock
);
11757 ldout(cct
, 3) << "op: client->lazyio_synchronize(" << fd
11758 << ", " << offset
<< ", " << count
<< ")" << dendl
;
11760 Fh
*f
= get_filehandle(fd
);
11762 return -CEPHFS_EBADF
;
11763 Inode
*in
= f
->inode
.get();
11766 if (_release(in
)) {
11767 int r
=_getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
11775 // =============================
11778 int Client::mksnap(const char *relpath
, const char *name
, const UserPerm
& perm
,
11779 mode_t mode
, const std::map
<std::string
, std::string
> &metadata
)
11781 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11782 if (!mref_reader
.is_state_satisfied())
11783 return -CEPHFS_ENOTCONN
;
11785 std::scoped_lock
l(client_lock
);
11787 filepath
path(relpath
);
11789 int r
= path_walk(path
, &in
, perm
);
11792 if (cct
->_conf
->client_permissions
) {
11793 r
= may_create(in
.get(), perm
);
11797 Inode
*snapdir
= open_snapdir(in
.get());
11798 return _mkdir(snapdir
, name
, mode
, perm
, nullptr, metadata
);
11801 int Client::rmsnap(const char *relpath
, const char *name
, const UserPerm
& perms
, bool check_perms
)
11803 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11804 if (!mref_reader
.is_state_satisfied())
11805 return -CEPHFS_ENOTCONN
;
11807 std::scoped_lock
l(client_lock
);
11809 filepath
path(relpath
);
11811 int r
= path_walk(path
, &in
, perms
);
11814 Inode
*snapdir
= open_snapdir(in
.get());
11815 if (cct
->_conf
->client_permissions
) {
11816 r
= may_delete(snapdir
, check_perms
? name
: NULL
, perms
);
11820 return _rmdir(snapdir
, name
, perms
);
11823 // =============================
11826 int Client::get_caps_issued(int fd
)
11828 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11829 if (!mref_reader
.is_state_satisfied())
11830 return -CEPHFS_ENOTCONN
;
11832 std::scoped_lock
lock(client_lock
);
11834 Fh
*f
= get_filehandle(fd
);
11836 return -CEPHFS_EBADF
;
11838 return f
->inode
->caps_issued();
11841 int Client::get_caps_issued(const char *path
, const UserPerm
& perms
)
11843 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11844 if (!mref_reader
.is_state_satisfied())
11845 return -CEPHFS_ENOTCONN
;
11847 std::scoped_lock
lock(client_lock
);
11851 int r
= path_walk(p
, &in
, perms
, true);
11854 return in
->caps_issued();
11857 // =========================================
11860 void Client::refresh_snapdir_attrs(Inode
*in
, Inode
*diri
) {
11861 ldout(cct
, 10) << __func__
<< ": snapdir inode=" << *in
11862 << ", inode=" << *diri
<< dendl
;
11863 in
->ino
= diri
->ino
;
11864 in
->snapid
= CEPH_SNAPDIR
;
11865 in
->mode
= diri
->mode
;
11866 in
->uid
= diri
->uid
;
11867 in
->gid
= diri
->gid
;
11869 in
->mtime
= diri
->mtime
;
11870 in
->ctime
= diri
->ctime
;
11871 in
->btime
= diri
->btime
;
11872 in
->atime
= diri
->atime
;
11873 in
->size
= diri
->size
;
11874 in
->change_attr
= diri
->change_attr
;
11876 in
->dirfragtree
.clear();
11877 in
->snapdir_parent
= diri
;
11878 // copy posix acls to snapshotted inode
11879 in
->xattrs
.clear();
11880 for (auto &[xattr_key
, xattr_value
] : diri
->xattrs
) {
11881 if (xattr_key
.rfind("system.", 0) == 0) {
11882 in
->xattrs
[xattr_key
] = xattr_value
;
11887 Inode
*Client::open_snapdir(Inode
*diri
)
11890 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
11891 if (!inode_map
.count(vino
)) {
11892 in
= new Inode(this, vino
, &diri
->layout
);
11893 refresh_snapdir_attrs(in
, diri
);
11894 diri
->flags
|= I_SNAPDIR_OPEN
;
11895 inode_map
[vino
] = in
;
11896 if (use_faked_inos())
11897 _assign_faked_ino(in
);
11898 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
11900 in
= inode_map
[vino
];
11901 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
11906 int Client::ll_lookup(Inode
*parent
, const char *name
, struct stat
*attr
,
11907 Inode
**out
, const UserPerm
& perms
)
11909 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11910 if (!mref_reader
.is_state_satisfied())
11911 return -CEPHFS_ENOTCONN
;
11913 vinodeno_t vparent
= _get_vino(parent
);
11914 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
11915 tout(cct
) << __func__
<< std::endl
;
11916 tout(cct
) << name
<< std::endl
;
11918 std::scoped_lock
lock(client_lock
);
11921 if (!fuse_default_permissions
) {
11922 if (strcmp(name
, ".") && strcmp(name
, "..")) {
11923 r
= may_lookup(parent
, perms
);
11929 string
dname(name
);
11932 r
= _lookup(parent
, dname
, CEPH_STAT_CAP_INODE_ALL
, &in
, perms
);
11939 fill_stat(in
, attr
);
11943 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
11944 << " -> " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
11945 tout(cct
) << attr
->st_ino
<< std::endl
;
11950 int Client::ll_lookup_vino(
11952 const UserPerm
& perms
,
11955 ceph_assert(inode
!= NULL
);
11956 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11957 if (!mref_reader
.is_state_satisfied())
11958 return -CEPHFS_ENOTCONN
;
11960 if (is_reserved_vino(vino
))
11961 return -CEPHFS_ESTALE
;
11963 std::scoped_lock
lock(client_lock
);
11964 ldout(cct
, 3) << __func__
<< " " << vino
<< dendl
;
11966 // Check the cache first
11967 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11968 if (p
!= inode_map
.end()) {
11969 *inode
= p
->second
;
11974 uint64_t snapid
= vino
.snapid
;
11976 // for snapdir, find the non-snapped dir inode
11977 if (snapid
== CEPH_SNAPDIR
)
11978 vino
.snapid
= CEPH_NOSNAP
;
11980 int r
= _lookup_vino(vino
, perms
, inode
);
11983 ceph_assert(*inode
!= NULL
);
11985 if (snapid
== CEPH_SNAPDIR
) {
11986 Inode
*tmp
= *inode
;
11988 // open the snapdir and put the inode ref
11989 *inode
= open_snapdir(tmp
);
11990 _ll_forget(tmp
, 1);
11996 int Client::ll_lookup_inode(
11997 struct inodeno_t ino
,
11998 const UserPerm
& perms
,
12001 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
12002 return ll_lookup_vino(vino
, perms
, inode
);
12005 int Client::ll_lookupx(Inode
*parent
, const char *name
, Inode
**out
,
12006 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12007 const UserPerm
& perms
)
12009 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12010 if (!mref_reader
.is_state_satisfied())
12011 return -CEPHFS_ENOTCONN
;
12013 vinodeno_t vparent
= _get_vino(parent
);
12014 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
12015 tout(cct
) << "ll_lookupx" << std::endl
;
12016 tout(cct
) << name
<< std::endl
;
12018 std::scoped_lock
lock(client_lock
);
12021 if (!fuse_default_permissions
) {
12022 r
= may_lookup(parent
, perms
);
12027 string
dname(name
);
12030 unsigned mask
= statx_to_mask(flags
, want
);
12031 r
= _lookup(parent
, dname
, mask
, &in
, perms
);
12037 fill_statx(in
, mask
, stx
);
12041 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
12042 << " -> " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12043 tout(cct
) << stx
->stx_ino
<< std::endl
;
12048 int Client::ll_walk(const char* name
, Inode
**out
, struct ceph_statx
*stx
,
12049 unsigned int want
, unsigned int flags
, const UserPerm
& perms
)
12051 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12052 if (!mref_reader
.is_state_satisfied())
12053 return -CEPHFS_ENOTCONN
;
12055 filepath
fp(name
, 0);
12058 unsigned mask
= statx_to_mask(flags
, want
);
12060 ldout(cct
, 3) << __func__
<< " " << name
<< dendl
;
12061 tout(cct
) << __func__
<< std::endl
;
12062 tout(cct
) << name
<< std::endl
;
12064 std::scoped_lock
lock(client_lock
);
12065 rc
= path_walk(fp
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
12067 /* zero out mask, just in case... */
12074 fill_statx(in
, mask
, stx
);
12081 void Client::_ll_get(Inode
*in
)
12083 if (in
->ll_ref
== 0) {
12085 if (in
->is_dir() && !in
->dentries
.empty()) {
12086 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
12087 in
->get_first_parent()->get(); // pin dentry
12089 if (in
->snapid
!= CEPH_NOSNAP
)
12090 ll_snap_ref
[in
->snapid
]++;
12093 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " -> " << in
->ll_ref
<< dendl
;
12096 int Client::_ll_put(Inode
*in
, uint64_t num
)
12099 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " " << num
<< " -> " << in
->ll_ref
<< dendl
;
12100 if (in
->ll_ref
== 0) {
12101 if (in
->is_dir() && !in
->dentries
.empty()) {
12102 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
12103 in
->get_first_parent()->put(); // unpin dentry
12105 if (in
->snapid
!= CEPH_NOSNAP
) {
12106 auto p
= ll_snap_ref
.find(in
->snapid
);
12107 ceph_assert(p
!= ll_snap_ref
.end());
12108 ceph_assert(p
->second
> 0);
12109 if (--p
->second
== 0)
12110 ll_snap_ref
.erase(p
);
12119 void Client::_ll_drop_pins()
12121 ldout(cct
, 10) << __func__
<< dendl
;
12122 std::set
<InodeRef
> to_be_put
; //this set will be deconstructed item by item when exit
12123 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
12124 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
12125 it
!= inode_map
.end();
12127 Inode
*in
= it
->second
;
12131 to_be_put
.insert(in
);
12132 _ll_put(in
, in
->ll_ref
);
12137 bool Client::_ll_forget(Inode
*in
, uint64_t count
)
12139 inodeno_t ino
= in
->ino
;
12141 ldout(cct
, 8) << __func__
<< " " << ino
<< " " << count
<< dendl
;
12142 tout(cct
) << __func__
<< std::endl
;
12143 tout(cct
) << ino
.val
<< std::endl
;
12144 tout(cct
) << count
<< std::endl
;
12146 // Ignore forget if we're no longer mounted
12147 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12148 if (!mref_reader
.is_state_satisfied())
12151 if (ino
== 1) return true; // ignore forget on root.
12154 if (in
->ll_ref
< count
) {
12155 ldout(cct
, 1) << "WARNING: ll_forget on " << ino
<< " " << count
12156 << ", which only has ll_ref=" << in
->ll_ref
<< dendl
;
12157 _ll_put(in
, in
->ll_ref
);
12160 if (_ll_put(in
, count
) == 0)
12167 bool Client::ll_forget(Inode
*in
, uint64_t count
)
12169 std::scoped_lock
lock(client_lock
);
12170 return _ll_forget(in
, count
);
12173 bool Client::ll_put(Inode
*in
)
12175 /* ll_forget already takes the lock */
12176 return ll_forget(in
, 1);
12179 int Client::ll_get_snap_ref(snapid_t snap
)
12181 std::scoped_lock
lock(client_lock
);
12182 auto p
= ll_snap_ref
.find(snap
);
12183 if (p
!= ll_snap_ref
.end())
12188 snapid_t
Client::ll_get_snapid(Inode
*in
)
12190 std::scoped_lock
lock(client_lock
);
12194 Inode
*Client::ll_get_inode(ino_t ino
)
12196 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12197 if (!mref_reader
.is_state_satisfied())
12200 std::scoped_lock
lock(client_lock
);
12202 vinodeno_t vino
= _map_faked_ino(ino
);
12203 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
12204 if (p
== inode_map
.end())
12206 Inode
*in
= p
->second
;
12211 Inode
*Client::ll_get_inode(vinodeno_t vino
)
12213 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12214 if (!mref_reader
.is_state_satisfied())
12217 if (is_reserved_vino(vino
))
12220 std::scoped_lock
lock(client_lock
);
12222 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
12223 if (p
== inode_map
.end())
12225 Inode
*in
= p
->second
;
12230 int Client::_ll_getattr(Inode
*in
, int caps
, const UserPerm
& perms
)
12232 vinodeno_t vino
= _get_vino(in
);
12234 ldout(cct
, 8) << __func__
<< " " << vino
<< dendl
;
12235 tout(cct
) << __func__
<< std::endl
;
12236 tout(cct
) << vino
.ino
.val
<< std::endl
;
12238 if (vino
.snapid
< CEPH_NOSNAP
)
12241 return _getattr(in
, caps
, perms
);
12244 int Client::ll_getattr(Inode
*in
, struct stat
*attr
, const UserPerm
& perms
)
12246 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12247 if (!mref_reader
.is_state_satisfied())
12248 return -CEPHFS_ENOTCONN
;
12250 std::scoped_lock
lock(client_lock
);
12252 int res
= _ll_getattr(in
, CEPH_STAT_CAP_INODE_ALL
, perms
);
12255 fill_stat(in
, attr
);
12256 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12260 int Client::ll_getattrx(Inode
*in
, struct ceph_statx
*stx
, unsigned int want
,
12261 unsigned int flags
, const UserPerm
& perms
)
12263 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12264 if (!mref_reader
.is_state_satisfied())
12265 return -CEPHFS_ENOTCONN
;
12267 std::scoped_lock
lock(client_lock
);
12270 unsigned mask
= statx_to_mask(flags
, want
);
12272 if (mask
&& !in
->caps_issued_mask(mask
, true))
12273 res
= _ll_getattr(in
, mask
, perms
);
12276 fill_statx(in
, mask
, stx
);
12277 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12281 int Client::_ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
12282 const UserPerm
& perms
, InodeRef
*inp
)
12284 vinodeno_t vino
= _get_vino(in
);
12286 ldout(cct
, 8) << __func__
<< " " << vino
<< " mask " << hex
<< mask
<< dec
12288 tout(cct
) << __func__
<< std::endl
;
12289 tout(cct
) << vino
.ino
.val
<< std::endl
;
12290 tout(cct
) << stx
->stx_mode
<< std::endl
;
12291 tout(cct
) << stx
->stx_uid
<< std::endl
;
12292 tout(cct
) << stx
->stx_gid
<< std::endl
;
12293 tout(cct
) << stx
->stx_size
<< std::endl
;
12294 tout(cct
) << stx
->stx_mtime
<< std::endl
;
12295 tout(cct
) << stx
->stx_atime
<< std::endl
;
12296 tout(cct
) << stx
->stx_btime
<< std::endl
;
12297 tout(cct
) << mask
<< std::endl
;
12299 if (!fuse_default_permissions
) {
12300 int res
= may_setattr(in
, stx
, mask
, perms
);
12305 mask
&= ~(CEPH_SETATTR_MTIME_NOW
| CEPH_SETATTR_ATIME_NOW
);
12307 return __setattrx(in
, stx
, mask
, perms
, inp
);
12310 int Client::ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
12311 const UserPerm
& perms
)
12313 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12314 if (!mref_reader
.is_state_satisfied())
12315 return -CEPHFS_ENOTCONN
;
12317 std::scoped_lock
lock(client_lock
);
12319 InodeRef
target(in
);
12320 int res
= _ll_setattrx(in
, stx
, mask
, perms
, &target
);
12322 ceph_assert(in
== target
.get());
12323 fill_statx(in
, in
->caps_issued(), stx
);
12326 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12330 int Client::ll_setattr(Inode
*in
, struct stat
*attr
, int mask
,
12331 const UserPerm
& perms
)
12333 struct ceph_statx stx
;
12334 stat_to_statx(attr
, &stx
);
12336 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12337 if (!mref_reader
.is_state_satisfied())
12338 return -CEPHFS_ENOTCONN
;
12340 std::scoped_lock
lock(client_lock
);
12342 InodeRef
target(in
);
12343 int res
= _ll_setattrx(in
, &stx
, mask
, perms
, &target
);
12345 ceph_assert(in
== target
.get());
12346 fill_stat(in
, attr
);
12349 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12357 int Client::getxattr(const char *path
, const char *name
, void *value
, size_t size
,
12358 const UserPerm
& perms
)
12360 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12361 if (!mref_reader
.is_state_satisfied())
12362 return -CEPHFS_ENOTCONN
;
12364 std::scoped_lock
lock(client_lock
);
12367 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
12370 return _getxattr(in
, name
, value
, size
, perms
);
12373 int Client::lgetxattr(const char *path
, const char *name
, void *value
, size_t size
,
12374 const UserPerm
& perms
)
12376 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12377 if (!mref_reader
.is_state_satisfied())
12378 return -CEPHFS_ENOTCONN
;
12380 std::scoped_lock
lock(client_lock
);
12383 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
12386 return _getxattr(in
, name
, value
, size
, perms
);
12389 int Client::fgetxattr(int fd
, const char *name
, void *value
, size_t size
,
12390 const UserPerm
& perms
)
12392 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12393 if (!mref_reader
.is_state_satisfied())
12394 return -CEPHFS_ENOTCONN
;
12396 std::scoped_lock
lock(client_lock
);
12398 Fh
*f
= get_filehandle(fd
);
12400 return -CEPHFS_EBADF
;
12401 return _getxattr(f
->inode
, name
, value
, size
, perms
);
12404 int Client::listxattr(const char *path
, char *list
, size_t size
,
12405 const UserPerm
& perms
)
12407 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12408 if (!mref_reader
.is_state_satisfied())
12409 return -CEPHFS_ENOTCONN
;
12411 std::scoped_lock
lock(client_lock
);
12414 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
12417 return Client::_listxattr(in
.get(), list
, size
, perms
);
12420 int Client::llistxattr(const char *path
, char *list
, size_t size
,
12421 const UserPerm
& perms
)
12423 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12424 if (!mref_reader
.is_state_satisfied())
12425 return -CEPHFS_ENOTCONN
;
12427 std::scoped_lock
lock(client_lock
);
12430 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
12433 return Client::_listxattr(in
.get(), list
, size
, perms
);
12436 int Client::flistxattr(int fd
, char *list
, size_t size
, const UserPerm
& perms
)
12438 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12439 if (!mref_reader
.is_state_satisfied())
12440 return -CEPHFS_ENOTCONN
;
12442 std::scoped_lock
lock(client_lock
);
12444 Fh
*f
= get_filehandle(fd
);
12446 return -CEPHFS_EBADF
;
12447 return Client::_listxattr(f
->inode
.get(), list
, size
, perms
);
12450 int Client::removexattr(const char *path
, const char *name
,
12451 const UserPerm
& perms
)
12453 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12454 if (!mref_reader
.is_state_satisfied())
12455 return -CEPHFS_ENOTCONN
;
12457 std::scoped_lock
lock(client_lock
);
12460 int r
= Client::path_walk(path
, &in
, perms
, true);
12463 return _removexattr(in
, name
, perms
);
12466 int Client::lremovexattr(const char *path
, const char *name
,
12467 const UserPerm
& perms
)
12469 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12470 if (!mref_reader
.is_state_satisfied())
12471 return -CEPHFS_ENOTCONN
;
12473 std::scoped_lock
lock(client_lock
);
12476 int r
= Client::path_walk(path
, &in
, perms
, false);
12479 return _removexattr(in
, name
, perms
);
12482 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
12484 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12485 if (!mref_reader
.is_state_satisfied())
12486 return -CEPHFS_ENOTCONN
;
12488 std::scoped_lock
lock(client_lock
);
12490 Fh
*f
= get_filehandle(fd
);
12492 return -CEPHFS_EBADF
;
12493 return _removexattr(f
->inode
, name
, perms
);
12496 int Client::setxattr(const char *path
, const char *name
, const void *value
,
12497 size_t size
, int flags
, const UserPerm
& perms
)
12499 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12500 if (!mref_reader
.is_state_satisfied())
12501 return -CEPHFS_ENOTCONN
;
12503 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12505 std::scoped_lock
lock(client_lock
);
12508 int r
= Client::path_walk(path
, &in
, perms
, true);
12511 return _setxattr(in
, name
, value
, size
, flags
, perms
);
12514 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
12515 size_t size
, int flags
, const UserPerm
& perms
)
12517 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12518 if (!mref_reader
.is_state_satisfied())
12519 return -CEPHFS_ENOTCONN
;
12521 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12523 std::scoped_lock
lock(client_lock
);
12526 int r
= Client::path_walk(path
, &in
, perms
, false);
12529 return _setxattr(in
, name
, value
, size
, flags
, perms
);
12532 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
12533 int flags
, const UserPerm
& perms
)
12535 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12536 if (!mref_reader
.is_state_satisfied())
12537 return -CEPHFS_ENOTCONN
;
12539 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12541 std::scoped_lock
lock(client_lock
);
12543 Fh
*f
= get_filehandle(fd
);
12545 return -CEPHFS_EBADF
;
12546 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
12549 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
12550 const UserPerm
& perms
)
12553 const VXattr
*vxattr
= nullptr;
12555 vxattr
= _match_vxattr(in
, name
);
12557 r
= -CEPHFS_ENODATA
;
12559 // Do a force getattr to get the latest quota before returning
12560 // a value to userspace.
12562 if (vxattr
->flags
& VXATTR_RSTAT
) {
12563 flags
|= CEPH_STAT_RSTAT
;
12565 if (vxattr
->flags
& VXATTR_DIRSTAT
) {
12566 flags
|= CEPH_CAP_FILE_SHARED
;
12568 r
= _getattr(in
, flags
| CEPH_STAT_CAP_XATTR
, perms
, true);
12570 // Error from getattr!
12574 // call pointer-to-member function
12576 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
12577 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
12579 r
= -CEPHFS_ENODATA
;
12583 if (r
> (int)size
) {
12584 r
= -CEPHFS_ERANGE
;
12585 } else if (r
> 0) {
12586 memcpy(value
, buf
, r
);
12592 if (!strncmp(name
, "ceph.", 5)) {
12593 r
= _getvxattr(in
, perms
, name
, size
, value
, MDS_RANK_NONE
);
12597 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
12598 r
= -CEPHFS_EOPNOTSUPP
;
12602 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
12605 r
= -CEPHFS_ENODATA
;
12606 if (in
->xattrs
.count(n
)) {
12607 r
= in
->xattrs
[n
].length();
12608 if (r
> 0 && size
!= 0) {
12609 if (size
>= (unsigned)r
)
12610 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
12612 r
= -CEPHFS_ERANGE
;
12617 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
12621 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
12622 const UserPerm
& perms
)
12624 if (cct
->_conf
->client_permissions
) {
12625 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
12629 return _getxattr(in
.get(), name
, value
, size
, perms
);
12632 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
12633 size_t size
, const UserPerm
& perms
)
12635 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12636 if (!mref_reader
.is_state_satisfied())
12637 return -CEPHFS_ENOTCONN
;
12639 vinodeno_t vino
= _get_vino(in
);
12641 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
12642 tout(cct
) << __func__
<< std::endl
;
12643 tout(cct
) << vino
.ino
.val
<< std::endl
;
12644 tout(cct
) << name
<< std::endl
;
12646 std::scoped_lock
lock(client_lock
);
12647 if (!fuse_default_permissions
) {
12648 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
12653 return _getxattr(in
, name
, value
, size
, perms
);
12656 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
12657 const UserPerm
& perms
)
12659 bool len_only
= (size
== 0);
12660 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
12666 for ([[maybe_unused
]] const auto &[xattr_name
, xattr_value_bl
] : in
->xattrs
) {
12667 if (xattr_name
.rfind("ceph.", 0) == 0) {
12671 size_t this_len
= xattr_name
.length() + 1;
12676 if (this_len
> size
) {
12677 r
= -CEPHFS_ERANGE
;
12681 memcpy(name
, xattr_name
.c_str(), this_len
);
12686 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
12690 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
12691 const UserPerm
& perms
)
12693 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12694 if (!mref_reader
.is_state_satisfied())
12695 return -CEPHFS_ENOTCONN
;
12697 vinodeno_t vino
= _get_vino(in
);
12699 ldout(cct
, 3) << __func__
<< " " << vino
<< " size " << size
<< dendl
;
12700 tout(cct
) << __func__
<< std::endl
;
12701 tout(cct
) << vino
.ino
.val
<< std::endl
;
12702 tout(cct
) << size
<< std::endl
;
12704 std::scoped_lock
lock(client_lock
);
12705 return _listxattr(in
, names
, size
, perms
);
12708 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
12709 size_t size
, int flags
, const UserPerm
& perms
)
12712 int xattr_flags
= 0;
12714 xattr_flags
|= CEPH_XATTR_REMOVE
;
12715 if (flags
& XATTR_CREATE
)
12716 xattr_flags
|= CEPH_XATTR_CREATE
;
12717 if (flags
& XATTR_REPLACE
)
12718 xattr_flags
|= CEPH_XATTR_REPLACE
;
12720 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
12722 in
->make_nosnap_relative_path(path
);
12723 req
->set_filepath(path
);
12724 req
->set_string2(name
);
12725 req
->set_inode(in
);
12726 req
->head
.args
.setxattr
.flags
= xattr_flags
;
12729 ceph_assert(value
|| size
== 0);
12730 bl
.append((const char*)value
, size
);
12733 int res
= make_request(req
, perms
);
12736 ldout(cct
, 3) << __func__
<< "(" << in
->ino
<< ", \"" << name
<< "\") = " <<
12741 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
12742 size_t size
, int flags
, const UserPerm
& perms
)
12744 if (in
->snapid
!= CEPH_NOSNAP
) {
12745 return -CEPHFS_EROFS
;
12750 } else if (value
== NULL
) {
12751 return -CEPHFS_EINVAL
;
12754 bool posix_acl_xattr
= false;
12755 if (acl_type
== POSIX_ACL
)
12756 posix_acl_xattr
= !strncmp(name
, "system.", 7);
12758 if (strncmp(name
, "user.", 5) &&
12759 strncmp(name
, "security.", 9) &&
12760 strncmp(name
, "trusted.", 8) &&
12761 strncmp(name
, "ceph.", 5) &&
12763 return -CEPHFS_EOPNOTSUPP
;
12765 bool check_realm
= false;
12767 if (posix_acl_xattr
) {
12768 if (!strcmp(name
, ACL_EA_ACCESS
)) {
12769 mode_t new_mode
= in
->mode
;
12771 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
12778 if (new_mode
!= in
->mode
) {
12779 struct ceph_statx stx
;
12780 stx
.stx_mode
= new_mode
;
12781 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, NULL
);
12786 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
12788 if (!S_ISDIR(in
->mode
))
12789 return -CEPHFS_EACCES
;
12790 int ret
= posix_acl_check(value
, size
);
12792 return -CEPHFS_EINVAL
;
12799 return -CEPHFS_EOPNOTSUPP
;
12802 const VXattr
*vxattr
= _match_vxattr(in
, name
);
12804 if (vxattr
->readonly
)
12805 return -CEPHFS_EOPNOTSUPP
;
12806 if (vxattr
->name
.compare(0, 10, "ceph.quota") == 0 && value
)
12807 check_realm
= true;
12811 int ret
= _do_setxattr(in
, name
, value
, size
, flags
, perms
);
12812 if (ret
>= 0 && check_realm
) {
12813 // check if snaprealm was created for quota inode
12814 if (in
->quota
.is_enable() &&
12815 !(in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
))
12816 ret
= -CEPHFS_EOPNOTSUPP
;
12822 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
12823 size_t size
, int flags
, const UserPerm
& perms
)
12825 if (cct
->_conf
->client_permissions
) {
12826 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
12830 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
12833 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
12836 if (name
== "layout") {
12837 string::iterator begin
= value
.begin();
12838 string::iterator end
= value
.end();
12839 keys_and_values
<string::iterator
> p
; // create instance of parser
12840 std::map
<string
, string
> m
; // map to receive results
12841 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
12842 return -CEPHFS_EINVAL
;
12845 return -CEPHFS_EINVAL
;
12846 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
12847 if (q
->first
== "pool") {
12852 } else if (name
== "layout.pool") {
12856 if (tmp
.length()) {
12859 pool
= boost::lexical_cast
<unsigned>(tmp
);
12860 if (!osdmap
->have_pg_pool(pool
))
12861 return -CEPHFS_ENOENT
;
12862 } catch (boost::bad_lexical_cast
const&) {
12863 pool
= osdmap
->lookup_pg_pool_name(tmp
);
12865 return -CEPHFS_ENOENT
;
12873 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
12875 // For setting pool of layout, MetaRequest need osdmap epoch.
12876 // There is a race which create a new data pool but client and mds both don't have.
12877 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
12878 ldout(cct
, 15) << __func__
<< ": name = " << name
<< dendl
;
12879 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
12880 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
12881 string
rest(strstr(name
, "layout"));
12882 string
v((const char*)value
, size
);
12883 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
12884 return _setxattr_check_data_pool(rest
, v
, &o
);
12887 if (r
== -CEPHFS_ENOENT
) {
12889 ldout(cct
, 20) << __func__
<< ": waiting for latest osdmap" << dendl
;
12890 objecter
->wait_for_latest_osdmap(ca::use_blocked
[ec
]);
12891 ldout(cct
, 20) << __func__
<< ": got latest osdmap: " << ec
<< dendl
;
12896 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
12897 size_t size
, int flags
, const UserPerm
& perms
)
12899 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12900 if (!mref_reader
.is_state_satisfied())
12901 return -CEPHFS_ENOTCONN
;
12903 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12905 vinodeno_t vino
= _get_vino(in
);
12907 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
12908 tout(cct
) << __func__
<< std::endl
;
12909 tout(cct
) << vino
.ino
.val
<< std::endl
;
12910 tout(cct
) << name
<< std::endl
;
12912 std::scoped_lock
lock(client_lock
);
12913 if (!fuse_default_permissions
) {
12914 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
12918 return _setxattr(in
, name
, value
, size
, flags
, perms
);
12921 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
12923 if (in
->snapid
!= CEPH_NOSNAP
) {
12924 return -CEPHFS_EROFS
;
12927 // same xattrs supported by kernel client
12928 if (strncmp(name
, "user.", 5) &&
12929 strncmp(name
, "system.", 7) &&
12930 strncmp(name
, "security.", 9) &&
12931 strncmp(name
, "trusted.", 8) &&
12932 strncmp(name
, "ceph.", 5))
12933 return -CEPHFS_EOPNOTSUPP
;
12935 const VXattr
*vxattr
= _match_vxattr(in
, name
);
12936 if (vxattr
&& vxattr
->readonly
)
12937 return -CEPHFS_EOPNOTSUPP
;
12939 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
12941 in
->make_nosnap_relative_path(path
);
12942 req
->set_filepath(path
);
12943 req
->set_filepath2(name
);
12944 req
->set_inode(in
);
12946 int res
= make_request(req
, perms
);
12949 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
12953 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
12955 if (cct
->_conf
->client_permissions
) {
12956 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
12960 return _removexattr(in
.get(), name
, perms
);
12963 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
12965 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12966 if (!mref_reader
.is_state_satisfied())
12967 return -CEPHFS_ENOTCONN
;
12969 vinodeno_t vino
= _get_vino(in
);
12971 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
12972 tout(cct
) << "ll_removexattr" << std::endl
;
12973 tout(cct
) << vino
.ino
.val
<< std::endl
;
12974 tout(cct
) << name
<< std::endl
;
12976 std::scoped_lock
lock(client_lock
);
12977 if (!fuse_default_permissions
) {
12978 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
12983 return _removexattr(in
, name
, perms
);
12986 bool Client::_vxattrcb_quota_exists(Inode
*in
)
12988 return in
->quota
.is_enable() &&
12989 (in
->snapid
!= CEPH_NOSNAP
||
12990 (in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
));
12992 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
12994 return snprintf(val
, size
,
12995 "max_bytes=%lld max_files=%lld",
12996 (long long int)in
->quota
.max_bytes
,
12997 (long long int)in
->quota
.max_files
);
12999 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
13001 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
13003 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
13005 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
13008 bool Client::_vxattrcb_layout_exists(Inode
*in
)
13010 return in
->layout
!= file_layout_t();
13012 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
13014 int r
= snprintf(val
, size
,
13015 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
13016 (unsigned long long)in
->layout
.stripe_unit
,
13017 (unsigned long long)in
->layout
.stripe_count
,
13018 (unsigned long long)in
->layout
.object_size
);
13019 objecter
->with_osdmap([&](const OSDMap
& o
) {
13020 if (o
.have_pg_pool(in
->layout
.pool_id
))
13021 r
+= snprintf(val
+ r
, size
- r
, "%s",
13022 o
.get_pool_name(in
->layout
.pool_id
).c_str());
13024 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
13025 (uint64_t)in
->layout
.pool_id
);
13027 if (in
->layout
.pool_ns
.length())
13028 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
13029 in
->layout
.pool_ns
.c_str());
13032 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
13034 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_unit
);
13036 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
13038 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_count
);
13040 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
13042 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.object_size
);
13044 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
13047 objecter
->with_osdmap([&](const OSDMap
& o
) {
13048 if (o
.have_pg_pool(in
->layout
.pool_id
))
13049 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
13050 in
->layout
.pool_id
).c_str());
13052 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
13056 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
13058 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
13060 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
13062 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
13064 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
13066 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nfiles
);
13068 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
13070 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nsubdirs
);
13072 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
13074 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
13076 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
13078 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rfiles
);
13080 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
13082 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsubdirs
);
13084 size_t Client::_vxattrcb_dir_rsnaps(Inode
*in
, char *val
, size_t size
)
13086 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsnaps
);
13088 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
13090 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rbytes
);
13092 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
13094 return snprintf(val
, size
, "%ld.%09ld", (long)in
->rstat
.rctime
.sec(),
13095 (long)in
->rstat
.rctime
.nsec());
13097 bool Client::_vxattrcb_dir_pin_exists(Inode
*in
)
13099 return in
->dir_pin
!= -CEPHFS_ENODATA
;
13101 size_t Client::_vxattrcb_dir_pin(Inode
*in
, char *val
, size_t size
)
13103 return snprintf(val
, size
, "%ld", (long)in
->dir_pin
);
13106 bool Client::_vxattrcb_snap_btime_exists(Inode
*in
)
13108 return !in
->snap_btime
.is_zero();
13111 size_t Client::_vxattrcb_snap_btime(Inode
*in
, char *val
, size_t size
)
13113 return snprintf(val
, size
, "%llu.%09lu",
13114 (long long unsigned)in
->snap_btime
.sec(),
13115 (long unsigned)in
->snap_btime
.nsec());
13118 size_t Client::_vxattrcb_caps(Inode
*in
, char *val
, size_t size
)
13122 in
->caps_issued(&issued
);
13123 return snprintf(val
, size
, "%s/0x%x", ccap_string(issued
).c_str(), issued
);
13126 bool Client::_vxattrcb_mirror_info_exists(Inode
*in
)
13128 // checking one of the xattrs would suffice
13129 return in
->xattrs
.count("ceph.mirror.info.cluster_id") != 0;
13132 size_t Client::_vxattrcb_mirror_info(Inode
*in
, char *val
, size_t size
)
13134 return snprintf(val
, size
, "cluster_id=%.*s fs_id=%.*s",
13135 in
->xattrs
["ceph.mirror.info.cluster_id"].length(),
13136 in
->xattrs
["ceph.mirror.info.cluster_id"].c_str(),
13137 in
->xattrs
["ceph.mirror.info.fs_id"].length(),
13138 in
->xattrs
["ceph.mirror.info.fs_id"].c_str());
13141 size_t Client::_vxattrcb_cluster_fsid(Inode
*in
, char *val
, size_t size
)
13143 return snprintf(val
, size
, "%s", monclient
->get_fsid().to_string().c_str());
13146 size_t Client::_vxattrcb_client_id(Inode
*in
, char *val
, size_t size
)
13148 auto name
= messenger
->get_myname();
13149 return snprintf(val
, size
, "%s%" PRId64
, name
.type_str(), name
.num());
13152 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
13153 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
13155 #define XATTR_NAME_CEPH(_type, _name, _flags) \
13157 name: CEPH_XATTR_NAME(_type, _name), \
13158 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13163 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
13165 name: CEPH_XATTR_NAME2(_type, _name, _field), \
13166 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
13168 exists_cb: &Client::_vxattrcb_layout_exists, \
13171 #define XATTR_QUOTA_FIELD(_type, _name) \
13173 name: CEPH_XATTR_NAME(_type, _name), \
13174 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13176 exists_cb: &Client::_vxattrcb_quota_exists, \
13180 const Client::VXattr
Client::_dir_vxattrs
[] = {
13182 name
: "ceph.dir.layout",
13183 getxattr_cb
: &Client::_vxattrcb_layout
,
13185 exists_cb
: &Client::_vxattrcb_layout_exists
,
13189 // Delete the following dir layout field definitions for release "S"
13190 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_unit
),
13191 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_count
),
13192 XATTR_LAYOUT_FIELD(dir
, layout
, object_size
),
13193 XATTR_LAYOUT_FIELD(dir
, layout
, pool
),
13194 XATTR_LAYOUT_FIELD(dir
, layout
, pool_namespace
),
13195 XATTR_NAME_CEPH(dir
, entries
, VXATTR_DIRSTAT
),
13196 XATTR_NAME_CEPH(dir
, files
, VXATTR_DIRSTAT
),
13197 XATTR_NAME_CEPH(dir
, subdirs
, VXATTR_DIRSTAT
),
13198 XATTR_NAME_CEPH(dir
, rentries
, VXATTR_RSTAT
),
13199 XATTR_NAME_CEPH(dir
, rfiles
, VXATTR_RSTAT
),
13200 XATTR_NAME_CEPH(dir
, rsubdirs
, VXATTR_RSTAT
),
13201 XATTR_NAME_CEPH(dir
, rsnaps
, VXATTR_RSTAT
),
13202 XATTR_NAME_CEPH(dir
, rbytes
, VXATTR_RSTAT
),
13203 XATTR_NAME_CEPH(dir
, rctime
, VXATTR_RSTAT
),
13205 name
: "ceph.quota",
13206 getxattr_cb
: &Client::_vxattrcb_quota
,
13208 exists_cb
: &Client::_vxattrcb_quota_exists
,
13211 XATTR_QUOTA_FIELD(quota
, max_bytes
),
13212 XATTR_QUOTA_FIELD(quota
, max_files
),
13214 // Delete the following dir pin field definitions for release "S"
13216 name
: "ceph.dir.pin",
13217 getxattr_cb
: &Client::_vxattrcb_dir_pin
,
13219 exists_cb
: &Client::_vxattrcb_dir_pin_exists
,
13223 name
: "ceph.snap.btime",
13224 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
13226 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
13230 name
: "ceph.mirror.info",
13231 getxattr_cb
: &Client::_vxattrcb_mirror_info
,
13233 exists_cb
: &Client::_vxattrcb_mirror_info_exists
,
13238 getxattr_cb
: &Client::_vxattrcb_caps
,
13243 { name
: "" } /* Required table terminator */
13246 const Client::VXattr
Client::_file_vxattrs
[] = {
13248 name
: "ceph.file.layout",
13249 getxattr_cb
: &Client::_vxattrcb_layout
,
13251 exists_cb
: &Client::_vxattrcb_layout_exists
,
13254 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
13255 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
13256 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
13257 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
13258 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
13260 name
: "ceph.snap.btime",
13261 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
13263 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
13268 getxattr_cb
: &Client::_vxattrcb_caps
,
13273 { name
: "" } /* Required table terminator */
13276 const Client::VXattr
Client::_common_vxattrs
[] = {
13278 name
: "ceph.cluster_fsid",
13279 getxattr_cb
: &Client::_vxattrcb_cluster_fsid
,
13281 exists_cb
: nullptr,
13285 name
: "ceph.client_id",
13286 getxattr_cb
: &Client::_vxattrcb_client_id
,
13288 exists_cb
: nullptr,
13291 { name
: "" } /* Required table terminator */
13294 const Client::VXattr
*Client::_get_vxattrs(Inode
*in
)
13297 return _dir_vxattrs
;
13298 else if (in
->is_file())
13299 return _file_vxattrs
;
13303 const Client::VXattr
*Client::_match_vxattr(Inode
*in
, const char *name
)
13305 if (strncmp(name
, "ceph.", 5) == 0) {
13306 const VXattr
*vxattr
= _get_vxattrs(in
);
13308 while (!vxattr
->name
.empty()) {
13309 if (vxattr
->name
== name
)
13315 // for common vxattrs
13316 vxattr
= _common_vxattrs
;
13317 while (!vxattr
->name
.empty()) {
13318 if (vxattr
->name
== name
)
13327 int Client::ll_readlink(Inode
*in
, char *buf
, size_t buflen
, const UserPerm
& perms
)
13329 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13330 if (!mref_reader
.is_state_satisfied())
13331 return -CEPHFS_ENOTCONN
;
13333 vinodeno_t vino
= _get_vino(in
);
13335 ldout(cct
, 3) << "ll_readlink " << vino
<< dendl
;
13336 tout(cct
) << "ll_readlink" << std::endl
;
13337 tout(cct
) << vino
.ino
.val
<< std::endl
;
13339 std::scoped_lock
lock(client_lock
);
13340 for (auto dn
: in
->dentries
) {
13344 int r
= _readlink(in
, buf
, buflen
); // FIXME: no permission checking!
13345 ldout(cct
, 3) << "ll_readlink " << vino
<< " = " << r
<< dendl
;
13349 int Client::_mknod(Inode
*dir
, const char *name
, mode_t mode
, dev_t rdev
,
13350 const UserPerm
& perms
, InodeRef
*inp
)
13352 ldout(cct
, 8) << "_mknod(" << dir
->ino
<< " " << name
<< ", 0" << oct
13353 << mode
<< dec
<< ", " << rdev
<< ", uid " << perms
.uid()
13354 << ", gid " << perms
.gid() << ")" << dendl
;
13356 if (strlen(name
) > NAME_MAX
)
13357 return -CEPHFS_ENAMETOOLONG
;
13359 if (dir
->snapid
!= CEPH_NOSNAP
) {
13360 return -CEPHFS_EROFS
;
13362 if (is_quota_files_exceeded(dir
, perms
)) {
13363 return -CEPHFS_EDQUOT
;
13366 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_MKNOD
);
13369 dir
->make_nosnap_relative_path(path
);
13370 path
.push_dentry(name
);
13371 req
->set_filepath(path
);
13372 req
->set_inode(dir
);
13373 req
->head
.args
.mknod
.rdev
= rdev
;
13374 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13375 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13377 bufferlist xattrs_bl
;
13378 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
13381 req
->head
.args
.mknod
.mode
= mode
;
13382 if (xattrs_bl
.length() > 0)
13383 req
->set_data(xattrs_bl
);
13386 res
= get_or_create(dir
, name
, &de
);
13389 req
->set_dentry(de
);
13391 res
= make_request(req
, perms
, inp
);
13395 ldout(cct
, 8) << "mknod(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
13403 int Client::ll_mknod(Inode
*parent
, const char *name
, mode_t mode
,
13404 dev_t rdev
, struct stat
*attr
, Inode
**out
,
13405 const UserPerm
& perms
)
13407 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13408 if (!mref_reader
.is_state_satisfied())
13409 return -CEPHFS_ENOTCONN
;
13411 vinodeno_t vparent
= _get_vino(parent
);
13413 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
<< dendl
;
13414 tout(cct
) << "ll_mknod" << std::endl
;
13415 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13416 tout(cct
) << name
<< std::endl
;
13417 tout(cct
) << mode
<< std::endl
;
13418 tout(cct
) << rdev
<< std::endl
;
13420 std::scoped_lock
lock(client_lock
);
13421 if (!fuse_default_permissions
) {
13422 int r
= may_create(parent
, perms
);
13428 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
13430 fill_stat(in
, attr
);
13433 tout(cct
) << attr
->st_ino
<< std::endl
;
13434 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
13435 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
13440 int Client::ll_mknodx(Inode
*parent
, const char *name
, mode_t mode
,
13441 dev_t rdev
, Inode
**out
,
13442 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
13443 const UserPerm
& perms
)
13445 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13446 if (!mref_reader
.is_state_satisfied())
13447 return -CEPHFS_ENOTCONN
;
13449 unsigned caps
= statx_to_mask(flags
, want
);
13451 vinodeno_t vparent
= _get_vino(parent
);
13453 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
<< dendl
;
13454 tout(cct
) << "ll_mknodx" << std::endl
;
13455 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13456 tout(cct
) << name
<< std::endl
;
13457 tout(cct
) << mode
<< std::endl
;
13458 tout(cct
) << rdev
<< std::endl
;
13460 std::scoped_lock
lock(client_lock
);
13462 if (!fuse_default_permissions
) {
13463 int r
= may_create(parent
, perms
);
13469 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
13471 fill_statx(in
, caps
, stx
);
13474 tout(cct
) << stx
->stx_ino
<< std::endl
;
13475 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
13476 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
13481 int Client::_create(Inode
*dir
, const char *name
, int flags
, mode_t mode
,
13482 InodeRef
*inp
, Fh
**fhp
, int stripe_unit
, int stripe_count
,
13483 int object_size
, const char *data_pool
, bool *created
,
13484 const UserPerm
& perms
, std::string alternate_name
)
13486 ldout(cct
, 8) << "_create(" << dir
->ino
<< " " << name
<< ", 0" << oct
<<
13487 mode
<< dec
<< ")" << dendl
;
13489 if (strlen(name
) > NAME_MAX
)
13490 return -CEPHFS_ENAMETOOLONG
;
13491 if (dir
->snapid
!= CEPH_NOSNAP
) {
13492 return -CEPHFS_EROFS
;
13494 if (is_quota_files_exceeded(dir
, perms
)) {
13495 return -CEPHFS_EDQUOT
;
13498 // use normalized flags to generate cmode
13499 int cflags
= ceph_flags_sys2wire(flags
);
13500 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
13501 cflags
|= CEPH_O_LAZY
;
13503 int cmode
= ceph_flags_to_mode(cflags
);
13505 int64_t pool_id
= -1;
13506 if (data_pool
&& *data_pool
) {
13507 pool_id
= objecter
->with_osdmap(
13508 std::mem_fn(&OSDMap::lookup_pg_pool_name
), data_pool
);
13510 return -CEPHFS_EINVAL
;
13511 if (pool_id
> 0xffffffffll
)
13512 return -CEPHFS_ERANGE
; // bummer!
13515 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_CREATE
);
13518 dir
->make_nosnap_relative_path(path
);
13519 path
.push_dentry(name
);
13520 req
->set_filepath(path
);
13521 req
->set_alternate_name(std::move(alternate_name
));
13522 req
->set_inode(dir
);
13523 req
->head
.args
.open
.flags
= cflags
| CEPH_O_CREAT
;
13525 req
->head
.args
.open
.stripe_unit
= stripe_unit
;
13526 req
->head
.args
.open
.stripe_count
= stripe_count
;
13527 req
->head
.args
.open
.object_size
= object_size
;
13528 if (cct
->_conf
->client_debug_getattr_caps
)
13529 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
13531 req
->head
.args
.open
.mask
= 0;
13532 req
->head
.args
.open
.pool
= pool_id
;
13533 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13534 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13537 bufferlist xattrs_bl
;
13538 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
13541 req
->head
.args
.open
.mode
= mode
;
13542 if (xattrs_bl
.length() > 0)
13543 req
->set_data(xattrs_bl
);
13546 res
= get_or_create(dir
, name
, &de
);
13549 req
->set_dentry(de
);
13551 res
= make_request(req
, perms
, inp
, created
);
13556 /* If the caller passed a value in fhp, do the open */
13558 (*inp
)->get_open_ref(cmode
);
13559 *fhp
= _create_fh(inp
->get(), flags
, cmode
, perms
);
13565 ldout(cct
, 8) << "create(" << path
<< ", 0" << oct
<< mode
<< dec
13566 << " layout " << stripe_unit
13567 << ' ' << stripe_count
13568 << ' ' << object_size
13569 <<") = " << res
<< dendl
;
13577 int Client::_mkdir(Inode
*dir
, const char *name
, mode_t mode
, const UserPerm
& perm
,
13578 InodeRef
*inp
, const std::map
<std::string
, std::string
> &metadata
,
13579 std::string alternate_name
)
13581 ldout(cct
, 8) << "_mkdir(" << dir
->ino
<< " " << name
<< ", 0" << oct
13582 << mode
<< dec
<< ", uid " << perm
.uid()
13583 << ", gid " << perm
.gid() << ")" << dendl
;
13585 if (strlen(name
) > NAME_MAX
)
13586 return -CEPHFS_ENAMETOOLONG
;
13588 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
13589 return -CEPHFS_EROFS
;
13591 if (is_quota_files_exceeded(dir
, perm
)) {
13592 return -CEPHFS_EDQUOT
;
13595 bool is_snap_op
= dir
->snapid
== CEPH_SNAPDIR
;
13596 MetaRequest
*req
= new MetaRequest(is_snap_op
?
13597 CEPH_MDS_OP_MKSNAP
: CEPH_MDS_OP_MKDIR
);
13600 dir
->make_nosnap_relative_path(path
);
13601 path
.push_dentry(name
);
13602 req
->set_filepath(path
);
13603 req
->set_inode(dir
);
13604 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13605 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13606 req
->set_alternate_name(std::move(alternate_name
));
13610 int res
= _posix_acl_create(dir
, &mode
, bl
, perm
);
13613 req
->head
.args
.mkdir
.mode
= mode
;
13615 SnapPayload payload
;
13616 // clear the bufferlist that may have been populated by the call
13617 // to _posix_acl_create(). MDS mksnap does not make use of it.
13618 // So, reuse it to pass metadata payload.
13620 payload
.metadata
= metadata
;
13621 encode(payload
, bl
);
13623 if (bl
.length() > 0) {
13628 res
= get_or_create(dir
, name
, &de
);
13631 req
->set_dentry(de
);
13633 ldout(cct
, 10) << "_mkdir: making request" << dendl
;
13634 res
= make_request(req
, perm
, inp
);
13635 ldout(cct
, 10) << "_mkdir result is " << res
<< dendl
;
13639 ldout(cct
, 8) << "_mkdir(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
13647 int Client::ll_mkdir(Inode
*parent
, const char *name
, mode_t mode
,
13648 struct stat
*attr
, Inode
**out
, const UserPerm
& perm
)
13650 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13651 if (!mref_reader
.is_state_satisfied())
13652 return -CEPHFS_ENOTCONN
;
13654 vinodeno_t vparent
= _get_vino(parent
);
13656 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
<< dendl
;
13657 tout(cct
) << "ll_mkdir" << std::endl
;
13658 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13659 tout(cct
) << name
<< std::endl
;
13660 tout(cct
) << mode
<< std::endl
;
13662 std::scoped_lock
lock(client_lock
);
13664 if (!fuse_default_permissions
) {
13665 int r
= may_create(parent
, perm
);
13671 int r
= _mkdir(parent
, name
, mode
, perm
, &in
);
13673 fill_stat(in
, attr
);
13676 tout(cct
) << attr
->st_ino
<< std::endl
;
13677 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
13678 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
13683 int Client::ll_mkdirx(Inode
*parent
, const char *name
, mode_t mode
, Inode
**out
,
13684 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
13685 const UserPerm
& perms
)
13687 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13688 if (!mref_reader
.is_state_satisfied())
13689 return -CEPHFS_ENOTCONN
;
13691 vinodeno_t vparent
= _get_vino(parent
);
13693 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
<< dendl
;
13694 tout(cct
) << "ll_mkdirx" << std::endl
;
13695 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13696 tout(cct
) << name
<< std::endl
;
13697 tout(cct
) << mode
<< std::endl
;
13699 std::scoped_lock
lock(client_lock
);
13701 if (!fuse_default_permissions
) {
13702 int r
= may_create(parent
, perms
);
13708 int r
= _mkdir(parent
, name
, mode
, perms
, &in
);
13710 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
13716 tout(cct
) << stx
->stx_ino
<< std::endl
;
13717 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
13718 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
13723 int Client::_symlink(Inode
*dir
, const char *name
, const char *target
,
13724 const UserPerm
& perms
, std::string alternate_name
, InodeRef
*inp
)
13726 ldout(cct
, 8) << "_symlink(" << dir
->ino
<< " " << name
<< ", " << target
13727 << ", uid " << perms
.uid() << ", gid " << perms
.gid() << ")"
13730 if (strlen(name
) > NAME_MAX
)
13731 return -CEPHFS_ENAMETOOLONG
;
13733 if (dir
->snapid
!= CEPH_NOSNAP
) {
13734 return -CEPHFS_EROFS
;
13736 if (is_quota_files_exceeded(dir
, perms
)) {
13737 return -CEPHFS_EDQUOT
;
13740 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SYMLINK
);
13743 dir
->make_nosnap_relative_path(path
);
13744 path
.push_dentry(name
);
13745 req
->set_filepath(path
);
13746 req
->set_alternate_name(std::move(alternate_name
));
13747 req
->set_inode(dir
);
13748 req
->set_string2(target
);
13749 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13750 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13753 int res
= get_or_create(dir
, name
, &de
);
13756 req
->set_dentry(de
);
13758 res
= make_request(req
, perms
, inp
);
13761 ldout(cct
, 8) << "_symlink(\"" << path
<< "\", \"" << target
<< "\") = " <<
13770 int Client::ll_symlink(Inode
*parent
, const char *name
, const char *value
,
13771 struct stat
*attr
, Inode
**out
, const UserPerm
& perms
)
13773 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13774 if (!mref_reader
.is_state_satisfied())
13775 return -CEPHFS_ENOTCONN
;
13777 vinodeno_t vparent
= _get_vino(parent
);
13779 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
<< " -> " << value
13781 tout(cct
) << "ll_symlink" << std::endl
;
13782 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13783 tout(cct
) << name
<< std::endl
;
13784 tout(cct
) << value
<< std::endl
;
13786 std::scoped_lock
lock(client_lock
);
13788 if (!fuse_default_permissions
) {
13789 int r
= may_create(parent
, perms
);
13795 int r
= _symlink(parent
, name
, value
, perms
, "", &in
);
13797 fill_stat(in
, attr
);
13800 tout(cct
) << attr
->st_ino
<< std::endl
;
13801 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
13802 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
13807 int Client::ll_symlinkx(Inode
*parent
, const char *name
, const char *value
,
13808 Inode
**out
, struct ceph_statx
*stx
, unsigned want
,
13809 unsigned flags
, const UserPerm
& perms
)
13811 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13812 if (!mref_reader
.is_state_satisfied())
13813 return -CEPHFS_ENOTCONN
;
13815 vinodeno_t vparent
= _get_vino(parent
);
13817 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
<< " -> " << value
13819 tout(cct
) << "ll_symlinkx" << std::endl
;
13820 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13821 tout(cct
) << name
<< std::endl
;
13822 tout(cct
) << value
<< std::endl
;
13824 std::scoped_lock
lock(client_lock
);
13826 if (!fuse_default_permissions
) {
13827 int r
= may_create(parent
, perms
);
13833 int r
= _symlink(parent
, name
, value
, perms
, "", &in
);
13835 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
13838 tout(cct
) << stx
->stx_ino
<< std::endl
;
13839 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
13840 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
13845 int Client::_unlink(Inode
*dir
, const char *name
, const UserPerm
& perm
)
13847 ldout(cct
, 8) << "_unlink(" << dir
->ino
<< " " << name
13848 << " uid " << perm
.uid() << " gid " << perm
.gid()
13851 if (dir
->snapid
!= CEPH_NOSNAP
) {
13852 return -CEPHFS_EROFS
;
13855 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_UNLINK
);
13858 dir
->make_nosnap_relative_path(path
);
13859 path
.push_dentry(name
);
13860 req
->set_filepath(path
);
13866 int res
= get_or_create(dir
, name
, &de
);
13869 req
->set_dentry(de
);
13870 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13871 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13873 res
= _lookup(dir
, name
, 0, &otherin
, perm
);
13877 in
= otherin
.get();
13878 req
->set_other_inode(in
);
13879 in
->break_all_delegs();
13880 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
13882 req
->set_inode(dir
);
13884 res
= make_request(req
, perm
);
13887 ldout(cct
, 8) << "unlink(" << path
<< ") = " << res
<< dendl
;
13895 int Client::ll_unlink(Inode
*in
, const char *name
, const UserPerm
& perm
)
13897 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13898 if (!mref_reader
.is_state_satisfied())
13899 return -CEPHFS_ENOTCONN
;
13901 vinodeno_t vino
= _get_vino(in
);
13903 ldout(cct
, 3) << "ll_unlink " << vino
<< " " << name
<< dendl
;
13904 tout(cct
) << "ll_unlink" << std::endl
;
13905 tout(cct
) << vino
.ino
.val
<< std::endl
;
13906 tout(cct
) << name
<< std::endl
;
13908 std::scoped_lock
lock(client_lock
);
13910 if (!fuse_default_permissions
) {
13911 int r
= may_delete(in
, name
, perm
);
13915 return _unlink(in
, name
, perm
);
13918 int Client::_rmdir(Inode
*dir
, const char *name
, const UserPerm
& perms
)
13920 ldout(cct
, 8) << "_rmdir(" << dir
->ino
<< " " << name
<< " uid "
13921 << perms
.uid() << " gid " << perms
.gid() << ")" << dendl
;
13923 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
13924 return -CEPHFS_EROFS
;
13927 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_RMSNAP
: CEPH_MDS_OP_RMDIR
;
13928 MetaRequest
*req
= new MetaRequest(op
);
13930 dir
->make_nosnap_relative_path(path
);
13931 path
.push_dentry(name
);
13932 req
->set_filepath(path
);
13933 req
->set_inode(dir
);
13935 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13936 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13937 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
13942 int res
= get_or_create(dir
, name
, &de
);
13945 if (op
== CEPH_MDS_OP_RMDIR
)
13946 req
->set_dentry(de
);
13950 res
= _lookup(dir
, name
, 0, &in
, perms
);
13954 if (op
== CEPH_MDS_OP_RMSNAP
) {
13955 unlink(de
, true, true);
13958 req
->set_other_inode(in
.get());
13960 res
= make_request(req
, perms
);
13963 ldout(cct
, 8) << "rmdir(" << path
<< ") = " << res
<< dendl
;
13971 int Client::ll_rmdir(Inode
*in
, const char *name
, const UserPerm
& perms
)
13973 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13974 if (!mref_reader
.is_state_satisfied())
13975 return -CEPHFS_ENOTCONN
;
13977 vinodeno_t vino
= _get_vino(in
);
13979 ldout(cct
, 3) << "ll_rmdir " << vino
<< " " << name
<< dendl
;
13980 tout(cct
) << "ll_rmdir" << std::endl
;
13981 tout(cct
) << vino
.ino
.val
<< std::endl
;
13982 tout(cct
) << name
<< std::endl
;
13984 std::scoped_lock
lock(client_lock
);
13986 if (!fuse_default_permissions
) {
13987 int r
= may_delete(in
, name
, perms
);
13992 return _rmdir(in
, name
, perms
);
13995 int Client::_rename(Inode
*fromdir
, const char *fromname
, Inode
*todir
, const char *toname
, const UserPerm
& perm
, std::string alternate_name
)
13997 ldout(cct
, 8) << "_rename(" << fromdir
->ino
<< " " << fromname
<< " to "
13998 << todir
->ino
<< " " << toname
13999 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")"
14002 if (fromdir
->snapid
!= todir
->snapid
)
14003 return -CEPHFS_EXDEV
;
14005 int op
= CEPH_MDS_OP_RENAME
;
14006 if (fromdir
->snapid
!= CEPH_NOSNAP
) {
14007 if (fromdir
== todir
&& fromdir
->snapid
== CEPH_SNAPDIR
)
14008 op
= CEPH_MDS_OP_RENAMESNAP
;
14010 return -CEPHFS_EROFS
;
14012 if (cct
->_conf
.get_val
<bool>("client_quota") && fromdir
!= todir
) {
14013 Inode
*fromdir_root
=
14014 fromdir
->quota
.is_enable() ? fromdir
: get_quota_root(fromdir
, perm
);
14015 Inode
*todir_root
=
14016 todir
->quota
.is_enable() ? todir
: get_quota_root(todir
, perm
);
14017 if (fromdir_root
!= todir_root
) {
14018 return -CEPHFS_EXDEV
;
14023 MetaRequest
*req
= new MetaRequest(op
);
14026 fromdir
->make_nosnap_relative_path(from
);
14027 from
.push_dentry(fromname
);
14029 todir
->make_nosnap_relative_path(to
);
14030 to
.push_dentry(toname
);
14031 req
->set_filepath(to
);
14032 req
->set_filepath2(from
);
14033 req
->set_alternate_name(std::move(alternate_name
));
14036 int res
= get_or_create(fromdir
, fromname
, &oldde
);
14040 res
= get_or_create(todir
, toname
, &de
);
14044 if (op
== CEPH_MDS_OP_RENAME
) {
14045 req
->set_old_dentry(oldde
);
14046 req
->old_dentry_drop
= CEPH_CAP_FILE_SHARED
;
14047 req
->old_dentry_unless
= CEPH_CAP_FILE_EXCL
;
14049 req
->set_dentry(de
);
14050 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
14051 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
14053 InodeRef oldin
, otherin
;
14054 res
= _lookup(fromdir
, fromname
, 0, &oldin
, perm
);
14058 Inode
*oldinode
= oldin
.get();
14059 oldinode
->break_all_delegs();
14060 req
->set_old_inode(oldinode
);
14061 req
->old_inode_drop
= CEPH_CAP_LINK_SHARED
;
14063 res
= _lookup(todir
, toname
, 0, &otherin
, perm
);
14067 Inode
*in
= otherin
.get();
14068 req
->set_other_inode(in
);
14069 in
->break_all_delegs();
14071 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
14073 case -CEPHFS_ENOENT
:
14079 req
->set_inode(todir
);
14081 // renamesnap reply contains no tracedn, so we need to invalidate
14083 unlink(oldde
, true, true);
14084 unlink(de
, true, true);
14086 req
->set_inode(todir
);
14089 res
= make_request(req
, perm
, &target
);
14090 ldout(cct
, 10) << "rename result is " << res
<< dendl
;
14092 // renamed item from our cache
14095 ldout(cct
, 8) << "_rename(" << from
<< ", " << to
<< ") = " << res
<< dendl
;
14103 int Client::ll_rename(Inode
*parent
, const char *name
, Inode
*newparent
,
14104 const char *newname
, const UserPerm
& perm
)
14106 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14107 if (!mref_reader
.is_state_satisfied())
14108 return -CEPHFS_ENOTCONN
;
14110 vinodeno_t vparent
= _get_vino(parent
);
14111 vinodeno_t vnewparent
= _get_vino(newparent
);
14113 ldout(cct
, 3) << "ll_rename " << vparent
<< " " << name
<< " to "
14114 << vnewparent
<< " " << newname
<< dendl
;
14115 tout(cct
) << "ll_rename" << std::endl
;
14116 tout(cct
) << vparent
.ino
.val
<< std::endl
;
14117 tout(cct
) << name
<< std::endl
;
14118 tout(cct
) << vnewparent
.ino
.val
<< std::endl
;
14119 tout(cct
) << newname
<< std::endl
;
14121 std::scoped_lock
lock(client_lock
);
14123 if (!fuse_default_permissions
) {
14124 int r
= may_delete(parent
, name
, perm
);
14127 r
= may_delete(newparent
, newname
, perm
);
14128 if (r
< 0 && r
!= -CEPHFS_ENOENT
)
14132 return _rename(parent
, name
, newparent
, newname
, perm
, "");
14135 int Client::_link(Inode
*in
, Inode
*dir
, const char *newname
, const UserPerm
& perm
, std::string alternate_name
, InodeRef
*inp
)
14137 ldout(cct
, 8) << "_link(" << in
->ino
<< " to " << dir
->ino
<< " " << newname
14138 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")" << dendl
;
14140 if (strlen(newname
) > NAME_MAX
)
14141 return -CEPHFS_ENAMETOOLONG
;
14143 if (in
->snapid
!= CEPH_NOSNAP
|| dir
->snapid
!= CEPH_NOSNAP
) {
14144 return -CEPHFS_EROFS
;
14146 if (is_quota_files_exceeded(dir
, perm
)) {
14147 return -CEPHFS_EDQUOT
;
14150 in
->break_all_delegs();
14151 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LINK
);
14153 filepath
path(newname
, dir
->ino
);
14154 req
->set_filepath(path
);
14155 req
->set_alternate_name(std::move(alternate_name
));
14156 filepath
existing(in
->ino
);
14157 req
->set_filepath2(existing
);
14159 req
->set_inode(dir
);
14160 req
->inode_drop
= CEPH_CAP_FILE_SHARED
;
14161 req
->inode_unless
= CEPH_CAP_FILE_EXCL
;
14164 int res
= get_or_create(dir
, newname
, &de
);
14167 req
->set_dentry(de
);
14169 res
= make_request(req
, perm
, inp
);
14170 ldout(cct
, 10) << "link result is " << res
<< dendl
;
14173 ldout(cct
, 8) << "link(" << existing
<< ", " << path
<< ") = " << res
<< dendl
;
14181 int Client::ll_link(Inode
*in
, Inode
*newparent
, const char *newname
,
14182 const UserPerm
& perm
)
14184 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14185 if (!mref_reader
.is_state_satisfied())
14186 return -CEPHFS_ENOTCONN
;
14188 vinodeno_t vino
= _get_vino(in
);
14189 vinodeno_t vnewparent
= _get_vino(newparent
);
14191 ldout(cct
, 3) << "ll_link " << vino
<< " to " << vnewparent
<< " " <<
14193 tout(cct
) << "ll_link" << std::endl
;
14194 tout(cct
) << vino
.ino
.val
<< std::endl
;
14195 tout(cct
) << vnewparent
<< std::endl
;
14196 tout(cct
) << newname
<< std::endl
;
14200 std::scoped_lock
lock(client_lock
);
14202 if (!fuse_default_permissions
) {
14203 if (S_ISDIR(in
->mode
))
14204 return -CEPHFS_EPERM
;
14206 int r
= may_hardlink(in
, perm
);
14210 r
= may_create(newparent
, perm
);
14215 return _link(in
, newparent
, newname
, perm
, "", &target
);
14218 int Client::ll_num_osds(void)
14220 std::scoped_lock
lock(client_lock
);
14221 return objecter
->with_osdmap(std::mem_fn(&OSDMap::get_num_osds
));
14224 int Client::ll_osdaddr(int osd
, uint32_t *addr
)
14226 std::scoped_lock
lock(client_lock
);
14229 bool exists
= objecter
->with_osdmap([&](const OSDMap
& o
) {
14230 if (!o
.exists(osd
))
14232 g
= o
.get_addrs(osd
).front();
14237 uint32_t nb_addr
= (g
.in4_addr()).sin_addr
.s_addr
;
14238 *addr
= ntohl(nb_addr
);
14242 uint32_t Client::ll_stripe_unit(Inode
*in
)
14244 std::scoped_lock
lock(client_lock
);
14245 return in
->layout
.stripe_unit
;
14248 uint64_t Client::ll_snap_seq(Inode
*in
)
14250 std::scoped_lock
lock(client_lock
);
14251 return in
->snaprealm
->seq
;
14254 int Client::ll_file_layout(Inode
*in
, file_layout_t
*layout
)
14256 std::scoped_lock
lock(client_lock
);
14257 *layout
= in
->layout
;
14261 int Client::ll_file_layout(Fh
*fh
, file_layout_t
*layout
)
14263 return ll_file_layout(fh
->inode
.get(), layout
);
14266 /* Currently we cannot take advantage of redundancy in reads, since we
14267 would have to go through all possible placement groups (a
14268 potentially quite large number determined by a hash), and use CRUSH
14269 to calculate the appropriate set of OSDs for each placement group,
14270 then index into that. An array with one entry per OSD is much more
14271 tractable and works for demonstration purposes. */
14273 int Client::ll_get_stripe_osd(Inode
*in
, uint64_t blockno
,
14274 file_layout_t
* layout
)
14276 std::scoped_lock
lock(client_lock
);
14278 inodeno_t ino
= in
->ino
;
14279 uint32_t object_size
= layout
->object_size
;
14280 uint32_t su
= layout
->stripe_unit
;
14281 uint32_t stripe_count
= layout
->stripe_count
;
14282 uint64_t stripes_per_object
= object_size
/ su
;
14283 uint64_t stripeno
= 0, stripepos
= 0;
14286 stripeno
= blockno
/ stripe_count
; // which horizontal stripe (Y)
14287 stripepos
= blockno
% stripe_count
; // which object in the object set (X)
14289 uint64_t objectsetno
= stripeno
/ stripes_per_object
; // which object set
14290 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
; // object id
14292 object_t oid
= file_object_t(ino
, objectno
);
14293 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14294 ceph_object_layout olayout
=
14295 o
.file_to_object_layout(oid
, *layout
);
14296 pg_t pg
= (pg_t
)olayout
.ol_pgid
;
14299 o
.pg_to_acting_osds(pg
, &osds
, &primary
);
14304 /* Return the offset of the block, internal to the object */
14306 uint64_t Client::ll_get_internal_offset(Inode
*in
, uint64_t blockno
)
14308 std::scoped_lock
lock(client_lock
);
14309 file_layout_t
*layout
=&(in
->layout
);
14310 uint32_t object_size
= layout
->object_size
;
14311 uint32_t su
= layout
->stripe_unit
;
14312 uint64_t stripes_per_object
= object_size
/ su
;
14314 return (blockno
% stripes_per_object
) * su
;
14317 int Client::ll_opendir(Inode
*in
, int flags
, dir_result_t
** dirpp
,
14318 const UserPerm
& perms
)
14320 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14321 if (!mref_reader
.is_state_satisfied())
14322 return -CEPHFS_ENOTCONN
;
14324 vinodeno_t vino
= _get_vino(in
);
14326 ldout(cct
, 3) << "ll_opendir " << vino
<< dendl
;
14327 tout(cct
) << "ll_opendir" << std::endl
;
14328 tout(cct
) << vino
.ino
.val
<< std::endl
;
14330 std::scoped_lock
lock(client_lock
);
14332 if (!fuse_default_permissions
) {
14333 int r
= may_open(in
, flags
, perms
);
14338 int r
= _opendir(in
, dirpp
, perms
);
14339 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
14341 ldout(cct
, 3) << "ll_opendir " << vino
<< " = " << r
<< " (" << *dirpp
<< ")"
14346 int Client::ll_releasedir(dir_result_t
*dirp
)
14348 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14349 if (!mref_reader
.is_state_satisfied())
14350 return -CEPHFS_ENOTCONN
;
14352 ldout(cct
, 3) << "ll_releasedir " << dirp
<< dendl
;
14353 tout(cct
) << "ll_releasedir" << std::endl
;
14354 tout(cct
) << (uintptr_t)dirp
<< std::endl
;
14356 std::scoped_lock
lock(client_lock
);
14362 int Client::ll_fsyncdir(dir_result_t
*dirp
)
14364 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14365 if (!mref_reader
.is_state_satisfied())
14366 return -CEPHFS_ENOTCONN
;
14368 ldout(cct
, 3) << "ll_fsyncdir " << dirp
<< dendl
;
14369 tout(cct
) << "ll_fsyncdir" << std::endl
;
14370 tout(cct
) << (uintptr_t)dirp
<< std::endl
;
14372 std::scoped_lock
lock(client_lock
);
14373 return _fsync(dirp
->inode
.get(), false);
14376 int Client::ll_open(Inode
*in
, int flags
, Fh
**fhp
, const UserPerm
& perms
)
14378 ceph_assert(!(flags
& O_CREAT
));
14380 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14381 if (!mref_reader
.is_state_satisfied())
14382 return -CEPHFS_ENOTCONN
;
14384 vinodeno_t vino
= _get_vino(in
);
14386 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) << dendl
;
14387 tout(cct
) << "ll_open" << std::endl
;
14388 tout(cct
) << vino
.ino
.val
<< std::endl
;
14389 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
14391 std::scoped_lock
lock(client_lock
);
14394 if (!fuse_default_permissions
) {
14395 r
= may_open(in
, flags
, perms
);
14400 r
= _open(in
, flags
, 0, fhp
/* may be NULL */, perms
);
14403 Fh
*fhptr
= fhp
? *fhp
: NULL
;
14405 ll_unclosed_fh_set
.insert(fhptr
);
14407 tout(cct
) << (uintptr_t)fhptr
<< std::endl
;
14408 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) <<
14409 " = " << r
<< " (" << fhptr
<< ")" << dendl
;
14413 int Client::_ll_create(Inode
*parent
, const char *name
, mode_t mode
,
14414 int flags
, InodeRef
*in
, int caps
, Fh
**fhp
,
14415 const UserPerm
& perms
)
14419 vinodeno_t vparent
= _get_vino(parent
);
14421 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
14422 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << ", uid " << perms
.uid()
14423 << ", gid " << perms
.gid() << dendl
;
14424 tout(cct
) << "ll_create" << std::endl
;
14425 tout(cct
) << vparent
.ino
.val
<< std::endl
;
14426 tout(cct
) << name
<< std::endl
;
14427 tout(cct
) << mode
<< std::endl
;
14428 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
14430 bool created
= false;
14431 int r
= _lookup(parent
, name
, caps
, in
, perms
);
14433 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
14434 return -CEPHFS_EEXIST
;
14436 if (r
== -CEPHFS_ENOENT
&& (flags
& O_CREAT
)) {
14437 if (!fuse_default_permissions
) {
14438 r
= may_create(parent
, perms
);
14442 r
= _create(parent
, name
, flags
, mode
, in
, fhp
, 0, 0, 0, NULL
, &created
,
14453 ldout(cct
, 20) << "_ll_create created = " << created
<< dendl
;
14455 if (!fuse_default_permissions
) {
14456 r
= may_open(in
->get(), flags
, perms
);
14459 int release_r
= _release_fh(*fhp
);
14460 ceph_assert(release_r
== 0); // during create, no async data ops should have happened
14465 if (*fhp
== NULL
) {
14466 r
= _open(in
->get(), flags
, mode
, fhp
, perms
);
14474 ll_unclosed_fh_set
.insert(*fhp
);
14479 Inode
*inode
= in
->get();
14480 if (use_faked_inos())
14481 ino
= inode
->faked_ino
;
14486 tout(cct
) << (uintptr_t)*fhp
<< std::endl
;
14487 tout(cct
) << ino
<< std::endl
;
14488 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
14489 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << " = " << r
<< " (" <<
14490 *fhp
<< " " << hex
<< ino
<< dec
<< ")" << dendl
;
14495 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
14496 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
14497 const UserPerm
& perms
)
14499 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14500 if (!mref_reader
.is_state_satisfied())
14501 return -CEPHFS_ENOTCONN
;
14503 std::scoped_lock
lock(client_lock
);
14506 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
14511 // passing an Inode in outp requires an additional ref
14516 fill_stat(in
, attr
);
14524 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
14525 int oflags
, Inode
**outp
, Fh
**fhp
,
14526 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
14527 const UserPerm
& perms
)
14529 unsigned caps
= statx_to_mask(lflags
, want
);
14530 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14531 if (!mref_reader
.is_state_satisfied())
14532 return -CEPHFS_ENOTCONN
;
14534 std::scoped_lock
lock(client_lock
);
14537 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
14541 // passing an Inode in outp requires an additional ref
14546 fill_statx(in
, caps
, stx
);
14555 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
14557 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14558 if (!mref_reader
.is_state_satisfied())
14559 return -CEPHFS_ENOTCONN
;
14561 tout(cct
) << "ll_lseek" << std::endl
;
14562 tout(cct
) << offset
<< std::endl
;
14563 tout(cct
) << whence
<< std::endl
;
14565 std::scoped_lock
lock(client_lock
);
14566 return _lseek(fh
, offset
, whence
);
14569 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
14571 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14572 if (!mref_reader
.is_state_satisfied())
14573 return -CEPHFS_ENOTCONN
;
14575 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
14576 tout(cct
) << "ll_read" << std::endl
;
14577 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14578 tout(cct
) << off
<< std::endl
;
14579 tout(cct
) << len
<< std::endl
;
14581 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14582 len
= std::min(len
, (loff_t
)INT_MAX
);
14583 std::scoped_lock
lock(client_lock
);
14585 int r
= _read(fh
, off
, len
, bl
);
14586 ldout(cct
, 3) << "ll_read " << fh
<< " " << off
<< "~" << len
<< " = " << r
14591 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
14595 file_layout_t
* layout
)
14597 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14598 if (!mref_reader
.is_state_satisfied())
14599 return -CEPHFS_ENOTCONN
;
14601 vinodeno_t vino
= _get_vino(in
);
14602 object_t oid
= file_object_t(vino
.ino
, blockid
);
14603 C_SaferCond onfinish
;
14606 objecter
->read(oid
,
14607 object_locator_t(layout
->pool_id
),
14612 CEPH_OSD_FLAG_READ
,
14615 int r
= onfinish
.wait();
14617 bl
.begin().copy(bl
.length(), buf
);
14624 /* It appears that the OSD doesn't return success unless the entire
14625 buffer was written, return the write length on success. */
14627 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
14628 char* buf
, uint64_t offset
,
14629 uint64_t length
, file_layout_t
* layout
,
14630 uint64_t snapseq
, uint32_t sync
)
14632 vinodeno_t vino
= ll_get_vino(in
);
14634 std::unique_ptr
<C_SaferCond
> onsafe
= nullptr;
14636 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14637 if (!mref_reader
.is_state_satisfied())
14638 return -CEPHFS_ENOTCONN
;
14641 return -CEPHFS_EINVAL
;
14643 if (true || sync
) {
14644 /* if write is stable, the epilogue is waiting on
14646 onsafe
.reset(new C_SaferCond("Client::ll_write_block flock"));
14648 object_t oid
= file_object_t(vino
.ino
, blockid
);
14649 SnapContext fakesnap
;
14650 ceph::bufferlist bl
;
14652 bl
.push_back(buffer::copy(buf
, length
));
14655 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
14658 fakesnap
.seq
= snapseq
;
14660 /* lock just in time */
14661 objecter
->write(oid
,
14662 object_locator_t(layout
->pool_id
),
14667 ceph::real_clock::now(),
14671 if (nullptr != onsafe
) {
14672 r
= onsafe
->wait();
14682 int Client::ll_commit_blocks(Inode
*in
,
14687 BarrierContext *bctx;
14688 vinodeno_t vino = _get_vino(in);
14689 uint64_t ino = vino.ino;
14691 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14692 << offset << " to " << length << dendl;
14695 return -CEPHFS_EINVAL;
14698 std::scoped_lock lock(client_lock);
14699 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14700 if (p != barriers.end()) {
14701 barrier_interval civ(offset, offset + length);
14702 p->second->commit_barrier(civ);
14708 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
14710 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
14711 "~" << len
<< dendl
;
14712 tout(cct
) << "ll_write" << std::endl
;
14713 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14714 tout(cct
) << off
<< std::endl
;
14715 tout(cct
) << len
<< std::endl
;
14717 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14718 if (!mref_reader
.is_state_satisfied())
14719 return -CEPHFS_ENOTCONN
;
14721 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14722 len
= std::min(len
, (loff_t
)INT_MAX
);
14723 std::scoped_lock
lock(client_lock
);
14725 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
14726 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
14731 int64_t Client::ll_writev(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
14733 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14734 if (!mref_reader
.is_state_satisfied())
14735 return -CEPHFS_ENOTCONN
;
14737 std::scoped_lock
cl(client_lock
);
14738 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, true, false);
14741 int64_t Client::ll_readv(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
14743 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14744 if (!mref_reader
.is_state_satisfied())
14745 return -CEPHFS_ENOTCONN
;
14747 std::scoped_lock
cl(client_lock
);
14748 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, false, false);
14751 int Client::ll_flush(Fh
*fh
)
14753 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14754 if (!mref_reader
.is_state_satisfied())
14755 return -CEPHFS_ENOTCONN
;
14757 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14758 tout(cct
) << "ll_flush" << std::endl
;
14759 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14761 std::scoped_lock
lock(client_lock
);
14765 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
14767 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14768 if (!mref_reader
.is_state_satisfied())
14769 return -CEPHFS_ENOTCONN
;
14771 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14772 tout(cct
) << "ll_fsync" << std::endl
;
14773 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14775 std::scoped_lock
lock(client_lock
);
14776 int r
= _fsync(fh
, syncdataonly
);
14778 // If we're returning an error, clear it from the FH
14779 fh
->take_async_err();
14784 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
14786 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14787 if (!mref_reader
.is_state_satisfied())
14788 return -CEPHFS_ENOTCONN
;
14790 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
14791 tout(cct
) << "ll_sync_inode" << std::endl
;
14792 tout(cct
) << (uintptr_t)in
<< std::endl
;
14794 std::scoped_lock
lock(client_lock
);
14795 return _fsync(in
, syncdataonly
);
14798 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
14800 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
14802 if (offset
< 0 || length
<= 0)
14803 return -CEPHFS_EINVAL
;
14805 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
14806 return -CEPHFS_EOPNOTSUPP
;
14808 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
14809 return -CEPHFS_EOPNOTSUPP
;
14811 Inode
*in
= fh
->inode
.get();
14813 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
14814 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
14815 return -CEPHFS_ENOSPC
;
14818 if (in
->snapid
!= CEPH_NOSNAP
)
14819 return -CEPHFS_EROFS
;
14821 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
14822 return -CEPHFS_EBADF
;
14824 uint64_t size
= offset
+ length
;
14825 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
14827 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
)) {
14828 return -CEPHFS_EDQUOT
;
14832 int r
= get_caps(fh
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
14836 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
14837 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
14838 if (in
->inline_version
< CEPH_INLINE_NONE
&&
14839 (have
& CEPH_CAP_FILE_BUFFER
)) {
14841 auto inline_iter
= in
->inline_data
.cbegin();
14842 int len
= in
->inline_data
.length();
14843 if (offset
< len
) {
14845 inline_iter
.copy(offset
, bl
);
14847 if (offset
+ size
> len
)
14848 size
= len
- offset
;
14850 bl
.append_zero(size
);
14851 if (offset
+ size
< len
) {
14852 inline_iter
+= size
;
14853 inline_iter
.copy(len
- offset
- size
, bl
);
14855 in
->inline_data
= bl
;
14856 in
->inline_version
++;
14858 in
->mtime
= in
->ctime
= ceph_clock_now();
14860 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14862 if (in
->inline_version
< CEPH_INLINE_NONE
) {
14863 onuninline
.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14864 uninline_data(in
, onuninline
.get());
14867 C_SaferCond
onfinish("Client::_punch_hole flock");
14869 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
14871 _invalidate_inode_cache(in
, offset
, length
);
14872 filer
->zero(in
->ino
, &in
->layout
,
14873 in
->snaprealm
->get_snap_context(),
14875 ceph::real_clock::now(),
14876 0, true, &onfinish
);
14877 in
->mtime
= in
->ctime
= ceph_clock_now();
14879 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14881 client_lock
.unlock();
14883 client_lock
.lock();
14884 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
14886 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
14887 uint64_t size
= offset
+ length
;
14888 if (size
> in
->size
) {
14890 in
->mtime
= in
->ctime
= ceph_clock_now();
14892 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14894 if (is_quota_bytes_approaching(in
, fh
->actor_perms
)) {
14895 check_caps(in
, CHECK_CAPS_NODELAY
);
14896 } else if (is_max_size_approaching(in
)) {
14902 if (nullptr != onuninline
) {
14903 client_lock
.unlock();
14904 int ret
= onuninline
->wait();
14905 client_lock
.lock();
14907 if (ret
>= 0 || ret
== -CEPHFS_ECANCELED
) {
14908 in
->inline_data
.clear();
14909 in
->inline_version
= CEPH_INLINE_NONE
;
14910 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14916 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
14920 int Client::ll_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
14922 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14923 if (!mref_reader
.is_state_satisfied())
14924 return -CEPHFS_ENOTCONN
;
14926 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14927 tout(cct
) << __func__
<< " " << mode
<< " " << offset
<< " " << length
<< std::endl
;
14928 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14930 std::scoped_lock
lock(client_lock
);
14931 return _fallocate(fh
, mode
, offset
, length
);
14934 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
14936 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14937 if (!mref_reader
.is_state_satisfied())
14938 return -CEPHFS_ENOTCONN
;
14940 tout(cct
) << __func__
<< " " << " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
14942 std::scoped_lock
lock(client_lock
);
14943 Fh
*fh
= get_filehandle(fd
);
14945 return -CEPHFS_EBADF
;
14946 #if defined(__linux__) && defined(O_PATH)
14947 if (fh
->flags
& O_PATH
)
14948 return -CEPHFS_EBADF
;
14950 return _fallocate(fh
, mode
, offset
, length
);
14953 int Client::ll_release(Fh
*fh
)
14955 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14956 if (!mref_reader
.is_state_satisfied())
14957 return -CEPHFS_ENOTCONN
;
14959 ldout(cct
, 3) << __func__
<< " (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
14961 tout(cct
) << __func__
<< " (fh)" << std::endl
;
14962 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14964 std::scoped_lock
lock(client_lock
);
14966 if (ll_unclosed_fh_set
.count(fh
))
14967 ll_unclosed_fh_set
.erase(fh
);
14968 return _release_fh(fh
);
14971 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
14973 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14974 if (!mref_reader
.is_state_satisfied())
14975 return -CEPHFS_ENOTCONN
;
14977 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
14978 tout(cct
) << "ll_getk (fh)" << (uintptr_t)fh
<< std::endl
;
14980 std::scoped_lock
lock(client_lock
);
14981 return _getlk(fh
, fl
, owner
);
14984 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
14986 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14987 if (!mref_reader
.is_state_satisfied())
14988 return -CEPHFS_ENOTCONN
;
14990 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
14991 tout(cct
) << __func__
<< " (fh)" << (uintptr_t)fh
<< std::endl
;
14993 std::scoped_lock
lock(client_lock
);
14994 return _setlk(fh
, fl
, owner
, sleep
);
14997 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
14999 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15000 if (!mref_reader
.is_state_satisfied())
15001 return -CEPHFS_ENOTCONN
;
15003 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
15004 tout(cct
) << __func__
<< " (fh)" << (uintptr_t)fh
<< std::endl
;
15006 std::scoped_lock
lock(client_lock
);
15007 return _flock(fh
, cmd
, owner
);
15010 int Client::set_deleg_timeout(uint32_t timeout
)
15012 std::scoped_lock
lock(client_lock
);
15015 * The whole point is to prevent blocklisting so we must time out the
15016 * delegation before the session autoclose timeout kicks in.
15018 if (timeout
>= mdsmap
->get_session_autoclose())
15019 return -CEPHFS_EINVAL
;
15021 deleg_timeout
= timeout
;
15025 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
15027 int ret
= -CEPHFS_EINVAL
;
15029 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15030 if (!mref_reader
.is_state_satisfied())
15031 return -CEPHFS_ENOTCONN
;
15033 std::scoped_lock
lock(client_lock
);
15035 Inode
*inode
= fh
->inode
.get();
15038 case CEPH_DELEGATION_NONE
:
15039 inode
->unset_deleg(fh
);
15044 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
15045 } catch (std::bad_alloc
&) {
15046 ret
= -CEPHFS_ENOMEM
;
15053 class C_Client_RequestInterrupt
: public Context
{
15058 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
15061 void finish(int r
) override
{
15062 std::scoped_lock
l(client
->client_lock
);
15063 ceph_assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
15064 client
->_interrupt_filelock(req
);
15065 client
->put_request(req
);
15069 void Client::ll_interrupt(void *d
)
15071 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
15072 ldout(cct
, 3) << __func__
<< " tid " << req
->get_tid() << dendl
;
15073 tout(cct
) << __func__
<< " tid " << req
->get_tid() << std::endl
;
15074 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
15077 // =========================================
15080 // expose file layouts
15082 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
15083 const UserPerm
& perms
)
15085 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15086 if (!mref_reader
.is_state_satisfied())
15087 return -CEPHFS_ENOTCONN
;
15089 std::scoped_lock
lock(client_lock
);
15091 filepath
path(relpath
);
15093 int r
= path_walk(path
, &in
, perms
);
15099 ldout(cct
, 3) << __func__
<< "(" << relpath
<< ") = 0" << dendl
;
15103 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
15105 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15106 if (!mref_reader
.is_state_satisfied())
15107 return -CEPHFS_ENOTCONN
;
15109 std::scoped_lock
lock(client_lock
);
15111 Fh
*f
= get_filehandle(fd
);
15113 return -CEPHFS_EBADF
;
15114 Inode
*in
= f
->inode
.get();
15118 ldout(cct
, 3) << __func__
<< "(" << fd
<< ") = 0" << dendl
;
15122 int64_t Client::get_default_pool_id()
15124 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15125 if (!mref_reader
.is_state_satisfied())
15126 return -CEPHFS_ENOTCONN
;
15128 std::scoped_lock
lock(client_lock
);
15130 /* first data pool is the default */
15131 return mdsmap
->get_first_data_pool();
15136 int64_t Client::get_pool_id(const char *pool_name
)
15138 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15139 if (!mref_reader
.is_state_satisfied())
15140 return -CEPHFS_ENOTCONN
;
15142 std::scoped_lock
lock(client_lock
);
15144 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
15148 string
Client::get_pool_name(int64_t pool
)
15150 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15151 if (!mref_reader
.is_state_satisfied())
15154 std::scoped_lock
lock(client_lock
);
15156 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
15157 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
15161 int Client::get_pool_replication(int64_t pool
)
15163 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15164 if (!mref_reader
.is_state_satisfied())
15165 return -CEPHFS_ENOTCONN
;
15167 std::scoped_lock
lock(client_lock
);
15169 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
15170 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -CEPHFS_ENOENT
;
15174 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
15176 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15177 if (!mref_reader
.is_state_satisfied())
15178 return -CEPHFS_ENOTCONN
;
15180 std::scoped_lock
lock(client_lock
);
15182 Fh
*f
= get_filehandle(fd
);
15184 return -CEPHFS_EBADF
;
15185 Inode
*in
= f
->inode
.get();
15187 vector
<ObjectExtent
> extents
;
15188 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
15189 ceph_assert(extents
.size() == 1);
15191 objecter
->with_osdmap([&](const OSDMap
& o
) {
15192 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
15193 o
.pg_to_acting_osds(pg
, osds
);
15197 return -CEPHFS_EINVAL
;
15200 * Return the remainder of the extent (stripe unit)
15202 * If length = 1 is passed to Striper::file_to_extents we get a single
15203 * extent back, but its length is one so we still need to compute the length
15204 * to the end of the stripe unit.
15206 * If length = su then we may get 1 or 2 objects back in the extents vector
15207 * which would have to be examined. Even then, the offsets are local to the
15208 * object, so matching up to the file offset is extra work.
15210 * It seems simpler to stick with length = 1 and manually compute the
15214 uint64_t su
= in
->layout
.stripe_unit
;
15215 *len
= su
- (off
% su
);
15221 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
15223 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15224 if (!mref_reader
.is_state_satisfied())
15225 return -CEPHFS_ENOTCONN
;
15227 std::scoped_lock
lock(client_lock
);
15230 return -CEPHFS_EINVAL
;
15231 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15232 return o
.crush
->get_full_location_ordered(id
, path
);
15236 int Client::get_file_stripe_address(int fd
, loff_t offset
,
15237 vector
<entity_addr_t
>& address
)
15239 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15240 if (!mref_reader
.is_state_satisfied())
15241 return -CEPHFS_ENOTCONN
;
15243 std::scoped_lock
lock(client_lock
);
15245 Fh
*f
= get_filehandle(fd
);
15247 return -CEPHFS_EBADF
;
15248 Inode
*in
= f
->inode
.get();
15251 vector
<ObjectExtent
> extents
;
15252 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
15253 in
->truncate_size
, extents
);
15254 ceph_assert(extents
.size() == 1);
15256 // now we have the object and its 'layout'
15257 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15258 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
15260 o
.pg_to_acting_osds(pg
, osds
);
15262 return -CEPHFS_EINVAL
;
15263 for (unsigned i
= 0; i
< osds
.size(); i
++) {
15264 entity_addr_t addr
= o
.get_addrs(osds
[i
]).front();
15265 address
.push_back(addr
);
15271 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
15273 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15274 if (!mref_reader
.is_state_satisfied())
15275 return -CEPHFS_ENOTCONN
;
15277 std::scoped_lock
lock(client_lock
);
15279 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15280 if (!o
.exists(osd
))
15281 return -CEPHFS_ENOENT
;
15283 addr
= o
.get_addrs(osd
).front();
15288 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
15289 loff_t length
, loff_t offset
)
15291 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15292 if (!mref_reader
.is_state_satisfied())
15293 return -CEPHFS_ENOTCONN
;
15295 std::scoped_lock
lock(client_lock
);
15297 Fh
*f
= get_filehandle(fd
);
15299 return -CEPHFS_EBADF
;
15300 Inode
*in
= f
->inode
.get();
15302 // map to a list of extents
15303 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
15305 ldout(cct
, 3) << __func__
<< "(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
15310 /* find an osd with the same ip. -CEPHFS_ENXIO if none. */
15311 int Client::get_local_osd()
15313 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15314 if (!mref_reader
.is_state_satisfied())
15315 return -CEPHFS_ENOTCONN
;
15317 std::scoped_lock
lock(client_lock
);
15319 objecter
->with_osdmap([this](const OSDMap
& o
) {
15320 if (o
.get_epoch() != local_osd_epoch
) {
15321 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddrs().front());
15322 local_osd_epoch
= o
.get_epoch();
15333 // ===============================
15335 void Client::ms_handle_connect(Connection
*con
)
15337 ldout(cct
, 10) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15340 bool Client::ms_handle_reset(Connection
*con
)
15342 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15346 void Client::ms_handle_remote_reset(Connection
*con
)
15348 std::scoped_lock
lock(client_lock
);
15349 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15350 switch (con
->get_peer_type()) {
15351 case CEPH_ENTITY_TYPE_MDS
:
15353 // kludge to figure out which mds this is; fixme with a Connection* state
15354 mds_rank_t mds
= MDS_RANK_NONE
;
15355 MetaSessionRef s
= NULL
;
15356 for (auto &p
: mds_sessions
) {
15357 if (mdsmap
->have_inst(p
.first
) && mdsmap
->get_addrs(p
.first
) == con
->get_peer_addrs()) {
15363 ceph_assert(s
!= NULL
);
15364 switch (s
->state
) {
15365 case MetaSession::STATE_CLOSING
:
15366 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
15367 _closed_mds_session(s
.get());
15370 case MetaSession::STATE_OPENING
:
15372 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
15373 list
<Context
*> waiters
;
15374 waiters
.swap(s
->waiting_for_open
);
15375 _closed_mds_session(s
.get());
15376 auto news
= _get_or_open_mds_session(mds
);
15377 news
->waiting_for_open
.swap(waiters
);
15381 case MetaSession::STATE_OPEN
:
15383 objecter
->maybe_request_map(); /* to check if we are blocklisted */
15384 if (cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
15385 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
15386 _closed_mds_session(s
.get());
15388 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
15389 s
->state
= MetaSession::STATE_STALE
;
15394 case MetaSession::STATE_NEW
:
15395 case MetaSession::STATE_CLOSED
:
15405 bool Client::ms_handle_refused(Connection
*con
)
15407 ldout(cct
, 1) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15411 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
15413 Inode
*quota_in
= root_ancestor
;
15414 SnapRealm
*realm
= in
->snaprealm
;
15416 if (!cct
->_conf
.get_val
<bool>("client_quota"))
15420 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
15421 if (realm
->ino
!= in
->ino
) {
15422 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
15423 if (p
== inode_map
.end())
15426 if (p
->second
->quota
.is_enable()) {
15427 quota_in
= p
->second
;
15431 realm
= realm
->pparent
;
15433 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
15438 * Traverse quota ancestors of the Inode, return true
15439 * if any of them passes the passed function
15441 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
15442 std::function
<bool (const Inode
&in
)> test
)
15444 if (!cct
->_conf
.get_val
<bool>("client_quota"))
15448 ceph_assert(in
!= NULL
);
15453 if (in
== root_ancestor
) {
15454 // We're done traversing, drop out
15457 // Continue up the tree
15458 in
= get_quota_root(in
, perms
);
15465 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
15467 return check_quota_condition(in
, perms
,
15468 [](const Inode
&in
) {
15469 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
15473 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
15474 const UserPerm
& perms
)
15476 return check_quota_condition(in
, perms
,
15477 [&new_bytes
](const Inode
&in
) {
15478 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
15479 > in
.quota
.max_bytes
;
15483 bool Client::is_quota_bytes_approaching(Inode
*in
, const UserPerm
& perms
)
15485 ceph_assert(in
->size
>= in
->reported_size
);
15486 const uint64_t size
= in
->size
- in
->reported_size
;
15487 return check_quota_condition(in
, perms
,
15488 [&size
](const Inode
&in
) {
15489 if (in
.quota
.max_bytes
) {
15490 if (in
.rstat
.rbytes
>= in
.quota
.max_bytes
) {
15494 const uint64_t space
= in
.quota
.max_bytes
- in
.rstat
.rbytes
;
15495 return (space
>> 4) < size
;
15509 int Client::check_pool_perm(Inode
*in
, int need
)
15511 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
15513 if (!cct
->_conf
->client_check_pool_perm
)
15516 /* Only need to do this for regular files */
15517 if (!in
->is_file())
15520 int64_t pool_id
= in
->layout
.pool_id
;
15521 std::string pool_ns
= in
->layout
.pool_ns
;
15522 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
15525 auto it
= pool_perms
.find(perm_key
);
15526 if (it
== pool_perms
.end())
15528 if (it
->second
== POOL_CHECKING
) {
15529 // avoid concurrent checkings
15530 wait_on_list(waiting_for_pool_perm
);
15533 ceph_assert(have
& POOL_CHECKED
);
15539 if (in
->snapid
!= CEPH_NOSNAP
) {
15540 // pool permission check needs to write to the first object. But for snapshot,
15541 // head of the first object may have already been deleted. To avoid creating
15542 // orphan object, skip the check for now.
15546 pool_perms
[perm_key
] = POOL_CHECKING
;
15549 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
15550 object_t oid
= oid_buf
;
15552 SnapContext nullsnapc
;
15554 C_SaferCond rd_cond
;
15555 ObjectOperation rd_op
;
15556 rd_op
.stat(nullptr, nullptr, nullptr);
15558 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
15559 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
15561 C_SaferCond wr_cond
;
15562 ObjectOperation wr_op
;
15563 wr_op
.create(true);
15565 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
15566 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
15568 client_lock
.unlock();
15569 int rd_ret
= rd_cond
.wait();
15570 int wr_ret
= wr_cond
.wait();
15571 client_lock
.lock();
15573 bool errored
= false;
15575 if (rd_ret
== 0 || rd_ret
== -CEPHFS_ENOENT
)
15577 else if (rd_ret
!= -CEPHFS_EPERM
) {
15578 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15579 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
15583 if (wr_ret
== 0 || wr_ret
== -CEPHFS_EEXIST
)
15584 have
|= POOL_WRITE
;
15585 else if (wr_ret
!= -CEPHFS_EPERM
) {
15586 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15587 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
15592 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
15593 // Raise EIO because actual error code might be misleading for
15594 // userspace filesystem user.
15595 pool_perms
.erase(perm_key
);
15596 signal_cond_list(waiting_for_pool_perm
);
15597 return -CEPHFS_EIO
;
15600 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
15601 signal_cond_list(waiting_for_pool_perm
);
15604 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
15605 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15606 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
15607 return -CEPHFS_EPERM
;
15609 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
15610 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15611 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
15612 return -CEPHFS_EPERM
;
15618 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
15620 if (acl_type
== POSIX_ACL
) {
15621 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
15622 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
15624 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
15627 return -CEPHFS_EAGAIN
;
15630 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
15632 if (acl_type
== NO_ACL
)
15635 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
15639 if (acl_type
== POSIX_ACL
) {
15640 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
15641 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
15642 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
15643 r
= posix_acl_access_chmod(acl
, mode
);
15646 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
15652 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
/**
 * Compute the xattrs a newly created inode should start with, based on
 * the parent directory's cached POSIX default ACL.
 *
 * @param dir       parent directory
 * @param mode      in/out: requested create mode; may be adjusted by the
 *                  inherited ACL, or by the umask callback when there is
 *                  no default ACL
 * @param xattrs_bl out: encoded map<string,bufferptr> of initial xattrs
 * @param perms     caller credentials
 * @return number of xattrs encoded (>= 0) on success, negative error
 *         from getattr / ACL helpers otherwise.  Symlinks never carry
 *         ACLs and return 0 immediately.
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // ACLs are never attached to symlinks.
  if (S_ISLNK(*mode))
    return 0;

  // Refresh the parent's cached xattrs (forced when never fetched).
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      // Work on a copy: posix_acl_inherit_mode() rewrites the buffer.
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// NOTE(review): r > 0 appears to mean the inherited ACL is not
	// fully representable by mode bits, so it must be stored as the
	// new inode's access ACL — confirm against posix_acl_equiv_mode().
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories additionally propagate the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // No default ACL on the parent: apply the process umask instead.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r
		 << dendl;
  return r;
}
15704 void Client::set_filer_flags(int flags
)
15706 std::scoped_lock
l(client_lock
);
15707 ceph_assert(flags
== 0 ||
15708 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
15709 objecter
->add_global_op_flags(flags
);
15712 void Client::clear_filer_flags(int flags
)
15714 std::scoped_lock
l(client_lock
);
15715 ceph_assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
15716 objecter
->clear_global_op_flag(flags
);
15719 // called before mount
15720 void Client::set_uuid(const std::string
& uuid
)
15722 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
15723 ceph_assert(iref_reader
.is_state_satisfied());
15725 std::scoped_lock
l(client_lock
);
15726 ceph_assert(!uuid
.empty());
15728 metadata
["uuid"] = uuid
;
15732 // called before mount. 0 means infinite
15733 void Client::set_session_timeout(unsigned timeout
)
15735 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
15736 ceph_assert(iref_reader
.is_state_satisfied());
15738 std::scoped_lock
l(client_lock
);
15740 metadata
["timeout"] = stringify(timeout
);
// called before mount
/**
 * Reclaim the sessions/caps of a dead client instance identified by
 * `uuid` (e.g. after a hard restart of a CephFS-backed service).
 *
 * Walks every in-MDS rank, opens a session if necessary, and sends an
 * MClientReclaim; handle_client_reclaim_reply() records the outcome and
 * wakes us.  On success the target's uuid is remembered under
 * "reclaiming_uuid" until finish_reclaim() adopts it.
 *
 * @param uuid    uuid of the dead client instance to reclaim
 * @param flags   CEPH_RECLAIM_* flags (RESET skips the blocklist check)
 * @param fs_name filesystem to subscribe the mdsmap for
 * @return 0 on success; -CEPHFS_ENOTCONN / EINVAL / EPERM /
 *         EOPNOTSUPP / ENOENT / ENOTRECOVERABLE on failure.
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  if (uuid.empty())
    return -CEPHFS_EINVAL;

  std::unique_lock l(client_lock);
  {
    // Refuse to reclaim our own identity.
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -CEPHFS_EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // Need a usable mdsmap before we can enumerate ranks.
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // Note: `mds` is only advanced once the rank's reclaim has finished;
  // waiting paths loop back and retry the same rank.
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSessionRef session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -CEPHFS_EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -CEPHFS_EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = mds_sessions.at(mds);
    // Reclaim needs explicit MDS support.
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -CEPHFS_EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      // GNU ?: extension — pass through the recorded errno if set.
      return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -CEPHFS_ENOENT;
    return -CEPHFS_ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blocklist to check if target session was killed
  // (config option mds_session_blocklist_on_evict needs to be true)
  ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
  bs::error_code ec;
  // Drop client_lock while blocking on the OSD map epoch.
  l.unlock();
  objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
  l.lock();

  if (ec)
    return ceph::from_error_code(ec);

  bool blocklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blocklisted(reclaim_target_addrs);
      });
  if (blocklisted)
    return -CEPHFS_ENOTRECOVERABLE;

  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
15844 void Client::finish_reclaim()
15846 auto it
= metadata
.find("reclaiming_uuid");
15847 if (it
== metadata
.end()) {
15848 for (auto &p
: mds_sessions
)
15849 p
.second
->reclaim_state
= MetaSession::RECLAIM_NULL
;
15853 for (auto &p
: mds_sessions
) {
15854 p
.second
->reclaim_state
= MetaSession::RECLAIM_NULL
;
15855 auto m
= make_message
<MClientReclaim
>("", MClientReclaim::FLAG_FINISH
);
15856 p
.second
->con
->send_message2(std::move(m
));
15859 metadata
["uuid"] = it
->second
;
15860 metadata
.erase(it
);
/**
 * Handle the MDS reply to an MClientReclaim: record the outcome on the
 * session and wake start_reclaim(), which blocks on waiting_for_reclaim.
 */
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    // Session may have been torn down while the reply was in flight.
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    // Track the OSD epoch the old session's ops are fenced at, and the
    // address the dead client lived at (for the blocklist check).
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
15890 * This is included in cap release messages, to cause
15891 * the MDS to wait until this OSD map epoch. It is necessary
15892 * in corner cases where we cancel RADOS ops, so that
15893 * nobody else tries to do IO to the same objects in
15894 * the same epoch as the cancelled ops.
15896 void Client::set_cap_epoch_barrier(epoch_t e
)
15898 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
15899 cap_epoch_barrier
= e
;
// Config-observer interface: the options this Client reacts to at
// runtime.  handle_conf_change() is invoked when any of these changes.
// NOTE(review): several entries were elided in this excerpt and were
// restored to mirror the keys handled in handle_conf_change() — verify
// the list matches, and keep the array nullptr-terminated.
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    "client_oc_size",
    "client_oc_max_objects",
    "client_oc_max_dirty",
    "client_oc_target_dirty",
    "client_oc_max_dirty_age",
    "client_caps_release_delay",
    "client_mount_timeout",
    "client_collect_and_send_global_metrics",
    nullptr
  };
  return keys;
}
/**
 * Config-observer callback: apply runtime changes for the options
 * listed in get_tracked_conf_keys().  Each handled key is applied
 * independently under client_lock.
 */
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set <std::string> &changed)
{
  std::scoped_lock lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    // Reset first so any value other than "posix_acl" disables ACLs.
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
  if (changed.count("client_oc_size")) {
    objectcacher->set_max_size(cct->_conf->client_oc_size);
  }
  if (changed.count("client_oc_max_objects")) {
    objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
  }
  if (changed.count("client_oc_max_dirty")) {
    objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
  }
  if (changed.count("client_oc_target_dirty")) {
    objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
  }
  if (changed.count("client_oc_max_dirty_age")) {
    objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
  }
  if (changed.count("client_collect_and_send_global_metrics")) {
    _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
      "client_collect_and_send_global_metrics");
  }
  if (changed.count("client_caps_release_delay")) {
    caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
      "client_caps_release_delay");
  }
  if (changed.count("client_mount_timeout")) {
    mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
      "client_mount_timeout");
  }
}
// boost::intrusive_ptr support for Inode: take a reference.
// NOTE(review): the body was elided in this excerpt; in upstream this
// bumps the inode refcount (iget) — confirm against Inode's interface.
void intrusive_ptr_add_ref(Inode *in)
{
  in->iget();
}
// boost::intrusive_ptr support for Inode: drop a reference.  The owning
// Client is responsible for tearing the inode down when the last
// reference goes away.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
15974 mds_rank_t
Client::_get_random_up_mds() const
15976 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
15978 std::set
<mds_rank_t
> up
;
15979 mdsmap
->get_up_mds_set(up
);
15982 return MDS_RANK_NONE
;
15983 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
15984 for (int n
= rand() % up
.size(); n
; n
--)
// Standalone client: constructs and owns its own Objecter (handed to
// the Client base), unlike embedders that share an existing one.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
				   boost::asio::io_context& ictx)
  : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
{
  monclient->set_messenger(m);
  // Fresh instance: not resuming a previous client incarnation.
  objecter->set_client_incarnation(0);
}
StandaloneClient::~StandaloneClient()
{
  // We created the objecter in our constructor, so we destroy it here.
  // NOTE(review): the delete line was elided in this excerpt — confirm
  // against upstream before relying on this reconstruction.
  delete objecter;
  objecter = nullptr;
}
/**
 * Bring a standalone client up: wire the objecter and ourselves into
 * the messenger, then initialize the mon client.  On monclient failure
 * we are in a half-initialized state and must unwind by hand (timer,
 * objecter, objectcacher, monclient) before returning the error.
 *
 * @return 0 on success, negative error from MonClient::init().
 */
int StandaloneClient::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  objectcacher->start();

  client_lock.lock();

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    {
      std::scoped_lock l(timer_lock);
      timer.shutdown();
    }

    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }

  client_lock.unlock();

  iref_writer.update_state(CLIENT_INITIALIZED);

  return 0;
}
// Tear down in dependency order: generic Client state first, then the
// objecter this subclass owns, then the mon client.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}